Files
SVSimServer/SVSim.BattleNode/Hosting/BattleNodeWebSocketHandler.cs
gamer147 9f11896f7b feat(battle-node): polite Socket.IO close on waiting-room timeout
The PvP waiting-room timeout path in BattleNodeWebSocketHandler used to
return immediately after RemovePending, leaving the parked first arriver
to learn about the disconnect via TCP teardown after Kestrel finished
draining the request. BestHTTP / socket.io-client log that as an abrupt
drop rather than a controlled disconnect.

New TryPoliteCloseAsync helper emits an EIO "1" (Close) text frame, then
runs the WebSocket close handshake with NormalClosure. Wrapped in
try/catch + Debug log — teardown races between the server-side close and
client disconnect are routine and not actionable. Uses a fresh 5s CTS so
ctx.RequestAborted being canceled doesn't skip the close.

Wired into both bail-out paths post-AcceptWebSocketAsync that previously
just returned:
- PvP waiting-room timeout / Park-Park race (the main case, per PLAN.md
  L104 (c))
- Unknown BattleType default case (same shape, log message already said
  "closing WS" but didn't actually close — opportunistic fix)

PvpWaitingRoomTimeout integration test tightened: now asserts the polite
"1" text frame arrives before the close handshake, not just that the WS
eventually closes by any means.

172 battle-node tests passing (unchanged from 172: the existing timeout test was
tightened in place, so no test was added or removed).

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-06-02 14:05:25 -04:00

260 lines
12 KiB
C#

using System.Globalization;
using System.Net.WebSockets;
using System.Text;
using Microsoft.AspNetCore.Http;
using Microsoft.Extensions.Logging;
using SVSim.BattleNode.Bridge;
using SVSim.BattleNode.Sessions;
using SVSim.BattleNode.Sessions.Participants;
using SVSim.BattleNode.Wire;
namespace SVSim.BattleNode.Hosting;
/// <summary>
/// Validates an incoming WebSocket upgrade request, accepts it, and hands the socket off to a
/// <see cref="BattleSession"/>. Singleton; no per-request state.
/// </summary>
/// <remarks>
/// <para>The validation chain — cheapest checks first, crypto only after both params are
/// present, WS accept only after the store lookup confirms the credentials match an outstanding
/// pending battle:</para>
/// <list type="number">
/// <item>Reject non-WS requests with 400 (someone hit <c>/socket.io/</c> via plain HTTP).</item>
/// <item>Read <c>BattleId</c> and encrypted <c>viewerId</c> from request headers, falling back
/// to query string; 400 if either is missing. The real client puts them on headers despite
/// BestHTTP's <c>AdditionalQueryParams</c> API name — see project README §Wire-format gotchas.</item>
/// <item>Decrypt the viewerId with <see cref="NodeCrypto.DecryptForNode"/>; reject with 401 on
/// parse/decrypt failure.</item>
/// <item>Look up the <see cref="PendingBattle"/> in the store (404 if absent) and verify the
/// decrypted viewer matches one the <see cref="Bridge.IMatchingBridge"/> registered (401 if it
/// matches neither P1 nor P2).</item>
/// <item>AcceptWebSocketAsync, then dispatch on the pending battle's <see cref="BattleType"/>.
/// Scripted and Bot battles remove the pending entry and run a <see cref="BattleSession"/>
/// against a bot participant. PvP battles pair the two connections through
/// <see cref="IWaitingRoom"/>; the connection that completes the pair removes the pending
/// entry and drives the session, while the parked connection holds its request open until
/// the session is marked finished.</item>
/// </list>
/// </remarks>
public sealed class BattleNodeWebSocketHandler
{
    private readonly IBattleSessionStore _store;
    private readonly IWaitingRoom _waitingRoom;
    private readonly BattleNodeOptions _options;
    private readonly ILogger<BattleNodeWebSocketHandler> _log;
    private readonly ILoggerFactory _loggerFactory;

    /// <summary>
    /// Captures the injected collaborators and creates this handler's own logger from
    /// <paramref name="loggerFactory"/>. The factory is kept so per-connection loggers for
    /// <see cref="RealParticipant"/> and <see cref="BattleSession"/> can be created later.
    /// </summary>
    /// <param name="store">Store of pending battles, looked up and removed by BattleId.</param>
    /// <param name="waitingRoom">Pairs the two connections of a PvP battle.</param>
    /// <param name="options">Supplies <c>WaitingRoomTimeout</c> for the PvP first arriver.</param>
    /// <param name="loggerFactory">Source of all loggers used by this handler.</param>
    public BattleNodeWebSocketHandler(
        IBattleSessionStore store,
        IWaitingRoom waitingRoom,
        BattleNodeOptions options,
        ILoggerFactory loggerFactory)
    {
        _store = store;
        _waitingRoom = waitingRoom;
        _options = options;
        _loggerFactory = loggerFactory;
        _log = loggerFactory.CreateLogger<BattleNodeWebSocketHandler>();
    }

    /// <summary>
    /// Endpoint entry point. On a validation failure sets the response status and returns
    /// without upgrading: 400 for protocol violations (not a WebSocket request, missing
    /// credentials), 401 for credential failures (decrypt/parse failure, viewer mismatch),
    /// 404 for an unknown BattleId. Otherwise upgrades to a WebSocket and stays in this call
    /// until the battle session for this connection has finished.
    /// </summary>
    /// <param name="ctx">The HTTP context of the upgrade request.</param>
    public async Task HandleAsync(HttpContext ctx)
    {
        // Log messages carry the diagnostic detail; the wire status code only gives the
        // client the class of failure.
        if (!ctx.WebSockets.IsWebSocketRequest)
        {
            ctx.Response.StatusCode = StatusCodes.Status400BadRequest;
            return;
        }

        // BestHTTP's SocketOptions.AdditionalQueryParams puts these on HTTP request HEADERS
        // for the WebSocket-only transport (not on the URL query string). Real clients
        // therefore send BattleId/viewerId as headers; the integration test sends them as
        // query params for convenience. Check headers first, fall back to query.
        var battleId = ReadCredential(ctx, "BattleId");
        var encryptedViewerId = ReadCredential(ctx, "viewerId");
        if (string.IsNullOrEmpty(battleId) || string.IsNullOrEmpty(encryptedViewerId))
        {
            _log.LogWarning("WS upgrade missing BattleId or viewerId (header or query).");
            ctx.Response.StatusCode = StatusCodes.Status400BadRequest;
            return;
        }

        if (!TryDecryptViewerId(encryptedViewerId, out var viewerId))
        {
            ctx.Response.StatusCode = StatusCodes.Status401Unauthorized;
            return;
        }

        var pending = _store.TryGetPending(battleId);
        if (pending is null)
        {
            _log.LogWarning(
                "WS upgrade for unknown BattleId={Bid} (decrypted viewerId={Vid}). " +
                "Bridge may not have minted this battle, or it was already consumed/expired.",
                battleId, viewerId);
            ctx.Response.StatusCode = StatusCodes.Status404NotFound;
            return;
        }

        var isP1 = viewerId == pending.P1.ViewerId;
        var isP2 = pending.P2 is not null && viewerId == pending.P2.ViewerId;
        if (!isP1 && !isP2)
        {
            _log.LogWarning(
                "WS upgrade viewer-id mismatch on BattleId={Bid}: bridge expected={P1}/{P2}, decrypted={Got}.",
                battleId, pending.P1.ViewerId, pending.P2?.ViewerId, viewerId);
            ctx.Response.StatusCode = StatusCodes.Status401Unauthorized;
            return;
        }

        // From here on the HTTP response has been upgraded; failures must close the socket
        // rather than set a status code.
        var ws = await ctx.WebSockets.AcceptWebSocketAsync();
        switch (pending.Type)
        {
            case BattleType.Scripted:
            {
                _store.RemovePending(battleId);
                var realParticipant = new RealParticipant(ws, viewerId, pending.P1.Context,
                    _loggerFactory.CreateLogger<RealParticipant>());
                var scriptedBot = new ScriptedBotParticipant();
                var session = new BattleSession(battleId, pending.Type, realParticipant, scriptedBot,
                    _loggerFactory.CreateLogger<BattleSession>());
                await session.RunAsync(ctx.RequestAborted);
                break;
            }

            case BattleType.Pvp:
            {
                // Pick this connection's MatchContext (P1's if isP1, P2's if isP2). isP2 is
                // only true when pending.P2 is non-null, so the null-forgiving access is safe.
                var selfCtx = isP1 ? pending.P1.Context : pending.P2!.Context;
                var self = new RealParticipant(ws, viewerId, selfCtx,
                    _loggerFactory.CreateLogger<RealParticipant>());
                await RunPvpAsync(ws, battleId, self, ctx.RequestAborted);
                break;
            }

            case BattleType.Bot:
            {
                // Phase 3: real (Real, NoOp) session. Bot's pending always has P2 == null
                // (per IMatchingBridge contract validation), so isP1 must be true here. The
                // earlier isP1/isP2 check has already rejected viewer mismatches.
                _store.RemovePending(battleId);
                var botReal = new RealParticipant(ws, viewerId, pending.P1.Context,
                    _loggerFactory.CreateLogger<RealParticipant>());
                var noopBot = new NoOpBotParticipant();
                var botSession = new BattleSession(battleId, BattleType.Bot, botReal, noopBot,
                    _loggerFactory.CreateLogger<BattleSession>());
                await botSession.RunAsync(ctx.RequestAborted);
                break;
            }

            default:
                _log.LogError("Unknown BattleType={Type} for BattleId={Bid}; closing WS", pending.Type, battleId);
                await TryPoliteCloseAsync(ws, $"unknown BattleType={pending.Type}", battleId);
                return;
        }
    }

    /// <summary>
    /// Decrypts the client-supplied viewerId and parses it as a <see cref="long"/>. Any
    /// exception from decryption or parsing is logged as a warning and reported as failure.
    /// </summary>
    /// <param name="encryptedViewerId">The encrypted viewerId credential as received.</param>
    /// <param name="viewerId">The decrypted viewer id on success; <c>0</c> on failure.</param>
    /// <returns><c>true</c> when the credential decrypted and parsed; otherwise <c>false</c>.</returns>
    private bool TryDecryptViewerId(string encryptedViewerId, out long viewerId)
    {
        try
        {
            var plain = NodeCrypto.DecryptForNode(encryptedViewerId);
            // Wire data, not user-facing text: parse culture-invariantly so the server's
            // locale can never change which credentials are accepted.
            viewerId = long.Parse(plain, NumberStyles.Integer, CultureInfo.InvariantCulture);
            return true;
        }
        catch (Exception ex)
        {
            _log.LogWarning(ex, "viewerId failed to decrypt (encryptedLen={Len})", encryptedViewerId.Length);
            viewerId = default;
            return false;
        }
    }

    /// <summary>
    /// Drives one side of a PvP battle after the WebSocket has been accepted. The connection
    /// that completes the pair removes the pending entry and runs the
    /// <see cref="BattleSession"/>; a connection that parks and is later paired only holds its
    /// request open until the session is marked finished. If parking yields no partner and a
    /// single retry of <c>Pair</c> also fails, the pending entry is removed and the socket is
    /// closed politely.
    /// </summary>
    /// <param name="ws">The accepted WebSocket for this connection.</param>
    /// <param name="battleId">The battle both connections are joining.</param>
    /// <param name="self">The participant wrapping this connection.</param>
    /// <param name="requestAborted">The request's abort token.</param>
    private async Task RunPvpAsync(
        WebSocket ws, string battleId, RealParticipant self, CancellationToken requestAborted)
    {
        // Non-null means someone is already parked for this battle: we are the SECOND arriver.
        var opponent = _waitingRoom.Pair(battleId, self);
        if (opponent is null)
        {
            // We are the FIRST arriver. Park; ParkAsync returns the second arriver
            // on pairing, null on timeout / cancellation / TryAdd race.
            var pairedWith = await _waitingRoom.ParkAsync(
                battleId, self, _options.WaitingRoomTimeout, requestAborted);
            if (pairedWith is not null)
            {
                // Normal first-arriver path: session is being constructed/driven by the
                // second arriver. Hold this HTTP request open until they signal completion.
                // Do NOT call self.RunAsync — the session already does.
                await self.AwaitSessionFinishedAsync(requestAborted);
                return;
            }

            // Either timeout (most common) or Park/Park race. Retry Pair once.
            opponent = _waitingRoom.Pair(battleId, self);
            if (opponent is null)
            {
                _log.LogWarning(
                    "PvP waiting-room timeout or race on BattleId={Bid}; first arriver disconnected.",
                    battleId);
                _store.RemovePending(battleId);
                await TryPoliteCloseAsync(ws, "waiting-room timeout", battleId);
                return;
            }

            // Retry succeeded — we're the de-facto second arriver now. Own the session.
        }

        _store.RemovePending(battleId);
        var session = new BattleSession(
            battleId, BattleType.Pvp, opponent, self,
            _loggerFactory.CreateLogger<BattleSession>());
        try
        {
            await session.RunAsync(requestAborted);
        }
        finally
        {
            // Always release the parked connection, even if the session threw.
            opponent.MarkSessionFinished();
        }
    }

    /// <summary>
    /// Reads a credential by name from the request headers, falling back to the query string
    /// when the header is absent or empty.
    /// </summary>
    /// <param name="ctx">The HTTP context of the upgrade request.</param>
    /// <param name="name">The header / query parameter name.</param>
    /// <returns>The credential value, or an empty string when neither source has it.</returns>
    private static string ReadCredential(HttpContext ctx, string name)
    {
        var header = ctx.Request.Headers[name].ToString();
        return !string.IsNullOrEmpty(header) ? header : ctx.Request.Query[name].ToString();
    }

    /// <summary>
    /// Emit an EIO <c>1</c> (Close) text frame, then run the WebSocket close handshake with
    /// <see cref="WebSocketCloseStatus.NormalClosure"/>. Without the EIO frame, BestHTTP /
    /// socket.io-client log the disconnect as an abrupt drop rather than a controlled
    /// disconnect; without the close handshake, the client only sees the TCP teardown after
    /// Kestrel finishes draining. Best-effort: any exception (already-torn-down socket,
    /// canceled token) is swallowed at Debug level since teardown races are routine.
    /// </summary>
    /// <param name="ws">The accepted WebSocket to close.</param>
    /// <param name="reason">Status description sent with the close frame and logged on failure.</param>
    /// <param name="battleId">Battle id, used only for the failure log.</param>
    private async Task TryPoliteCloseAsync(WebSocket ws, string reason, string battleId)
    {
        // Use a fresh, short timeout — ctx.RequestAborted may already be canceled by the
        // path that decided to bail out, which would skip the close immediately. The send
        // and the close handshake share this single 5-second budget.
        using var cts = new CancellationTokenSource(TimeSpan.FromSeconds(5));
        try
        {
            if (ws.State == WebSocketState.Open)
            {
                // Protocol digit, so format culture-invariantly.
                var packet = ((int)EngineIoPacketType.Close).ToString(CultureInfo.InvariantCulture);
                var bytes = Encoding.UTF8.GetBytes(packet);
                await ws.SendAsync(bytes, WebSocketMessageType.Text, endOfMessage: true, cts.Token);
            }

            // Re-check the state: the peer may have started its own close while we were sending.
            if (ws.State is WebSocketState.Open or WebSocketState.CloseReceived)
            {
                await ws.CloseAsync(WebSocketCloseStatus.NormalClosure, reason, cts.Token);
            }
        }
        catch (Exception ex)
        {
            _log.LogDebug(ex,
                "polite close failed on BattleId={Bid} (reason={Reason}); socket likely already torn down.",
                battleId, reason);
        }
    }
}