diff --git a/src/lib/api-client.ts b/src/lib/api-client.ts index e8fdd37..d81beea 100644 --- a/src/lib/api-client.ts +++ b/src/lib/api-client.ts @@ -378,4 +378,26 @@ export class EngineApiClient { getPort(): number { return this.port; } + + /** + * Cheap drift probe for the MCP client cache. The engine mints a fresh + * api-token on every launch (local_api_server.cpp::_generate_api_token) and can + * bind a different port (tool_net_thread.cpp::start increments 6550..6565 when + * the old socket lingers), so a client built before an engine restart holds + * dead credentials. Re-read the on-disk creds and report whether they no longer + * match this client's snapshot. + * + * Empty / unreadable creds (engine mid-write, or just closed) are treated as + * "no drift" so a transient read never thrashes the cache — the live request + * will fail and trigger a reconnect+retry if the client really is stale. + */ + async credentialsChanged(): Promise { + try { + const [port, token] = await Promise.all([getApiPort(), getApiToken()]); + if (!token) return false; + return port !== this.port || token !== this.token; + } catch { + return false; + } + } } diff --git a/src/mcp/reconnect.integration.test.ts b/src/mcp/reconnect.integration.test.ts new file mode 100644 index 0000000..9a23a44 --- /dev/null +++ b/src/mcp/reconnect.integration.test.ts @@ -0,0 +1,115 @@ +import { createServer, type Server } from "node:http"; +import type { AddressInfo } from "node:net"; + +import { afterAll, afterEach, beforeAll, describe, expect, it, vi } from "vitest"; + +/** + * End-to-end disconnect repro over REAL HTTP. + * + * A fake "engine" HTTP server stands in for Summer Engine: /api/health is + * unauthenticated (matches tool_net_thread.cpp::_validate_auth) and every other + * route requires `Authorization: Bearer `, returning 401 on a stale + * token — exactly what the real engine does after it mints a fresh api-token on + * relaunch. We rotate the token to simulate an engine restart mid-session and + * assert the MCP client transparently reconnects (no surfaced error). + * + * Only the credential-file readers (getApiPort/getApiToken) are stubbed so the + * test never touches the user's real ~/.summer; checkEngineHealth and the whole + * EngineApiClient request path run for real. + */ + +const engineState = { token: "token-A", port: 0 }; + +vi.mock("../lib/engine.js", async (importActual) => { + const actual = await importActual(); + return { + ...actual, + getApiPort: vi.fn(async () => engineState.port), + getApiToken: vi.fn(async () => engineState.token), + }; +}); + +import { getClient, resetClient } from "./server.js"; +import { withEngine } from "./tools/with-engine.js"; + +let server: Server; + +beforeAll(async () => { + server = createServer((req, res) => { + const send = (code: number, body: unknown) => { + res.writeHead(code, { "content-type": "application/json" }); + res.end(JSON.stringify(body)); + }; + + if (req.url?.startsWith("/api/health")) { + send(200, { + ok: true, + engine: "summer", + version: "1.0.0", + port: engineState.port, + instanceId: "inst-current", + }); + return; + } + + // Authenticated routes: reject anything but the CURRENT token (a stale token + // from before the "restart" gets a 401, just like the real engine). + const auth = req.headers["authorization"]; + if (auth !== `Bearer ${engineState.token}`) { + send(401, { ok: false, error: "unauthorized" }); + return; + } + send(200, { ok: true, scene: "res://main.tscn" }); + }); + + await new Promise((resolve) => server.listen(0, "127.0.0.1", resolve)); + engineState.port = (server.address() as AddressInfo).port; +}); + +afterAll(() => { + server.close(); +}); + +afterEach(() => { + resetClient(); +}); + +describe("MCP session — survives a transient engine restart end-to-end", () => { + it("binds the project, then transparently reconnects after the engine rotates its token", async () => { + // Session start: bind to the running engine on token-A. + const first = await withEngine(async (client) => client.getSceneState()); + expect(first.isError).toBeFalsy(); + expect(first.content[0].text).toContain("res://main.tscn"); + + // Engine restarts: brand-new random api-token. The cached client still holds + // token-A and would 401 on its next authenticated call. + engineState.token = "token-B"; + + // Next tool call must just work — getClient detects the credential drift and + // rebuilds before the request goes out. No "disconnected" error surfaced. + const afterRestart = await withEngine(async (client) => client.getSceneState()); + expect(afterRestart.isError).toBeFalsy(); + expect(afterRestart.content[0].text).toContain("res://main.tscn"); + }); + + it("recovers when the token rotates AFTER the drift check but before the request (mid-call restart)", async () => { + // Warm the cache on the current token. + resetClient(); + engineState.token = "token-C"; + await withEngine(async (client) => client.getSceneState()); + + // Simulate the restart landing in the window between getClient()'s drift + // probe and the actual HTTP call: the proactive check sees no drift, the + // request 401s, and the reset+retry path is what saves the call. + let rotated = false; + const res = await withEngine(async (client) => { + if (!rotated) { + rotated = true; + engineState.token = "token-D"; // engine relaunched mid-call + } + return client.getSceneState(); + }); + expect(res.isError).toBeFalsy(); + expect(res.content[0].text).toContain("res://main.tscn"); + }); +}); diff --git a/src/mcp/server.reconnect.test.ts b/src/mcp/server.reconnect.test.ts new file mode 100644 index 0000000..e6ec99d --- /dev/null +++ b/src/mcp/server.reconnect.test.ts @@ -0,0 +1,82 @@ +import { afterEach, beforeEach, describe, expect, it, vi } from "vitest"; + +/** + * Disconnect repro (the MCP/CLI "disconnecting from the project" class): + * + * The MCP server is a long-lived stdio process (it lives as long as Claude Code / + * Cursor / Codex keep it). It caches ONE EngineApiClient that snapshots the + * engine's api-port + api-token at first connect. But the engine mints a NEW + * random api-token on EVERY launch (local_api_server.cpp::_generate_api_token) + * and can bind a different port (tool_net_thread.cpp::start increments 6550..6565 + * when the old socket lingers). So any engine restart / reopen / crash-relaunch + * invalidates the cached credentials, and the stale client keeps hitting the old + * port with the old Bearer token -> 401 -> reads as "disconnected". + * + * getClient() must detect that the on-disk creds drifted and rebuild the client + * transparently, so a silent engine restart never surfaces as an error. + */ + +const state = { port: 6550, token: "tokenA" }; + +vi.mock("../lib/engine.js", () => ({ + getApiPort: vi.fn(async () => state.port), + getApiToken: vi.fn(async () => state.token), + checkEngineHealth: vi.fn(async () => ({ + ok: true, + engine: "summer", + version: "1.0.0", + port: state.port, + instanceId: "inst-1", + })), +})); + +import { getClient, resetClient } from "./server.js"; + +beforeEach(() => { + state.port = 6550; + state.token = "tokenA"; + resetClient(); +}); + +afterEach(() => { + resetClient(); + vi.clearAllMocks(); +}); + +describe("getClient — survives an engine restart (api-token rotation)", () => { + it("reuses the cached client while the on-disk creds are stable", async () => { + const c1 = await getClient(); + const c2 = await getClient(); + expect(c2).toBe(c1); + }); + + it("rebuilds transparently when the engine rotates its api-token (restart)", async () => { + const c1 = await getClient(); + expect(c1.getPort()).toBe(6550); + + // Engine relaunches: fresh random api-token, and the old port lingered so it + // bound the next one. + state.token = "tokenB"; + state.port = 6551; + + const c2 = await getClient(); + expect(c2).not.toBe(c1); // stale client must NOT be reused + expect(c2.getPort()).toBe(6551); + }); + + it("rebuilds when only the api-token rotates (same port reused)", async () => { + const c1 = await getClient(); + state.token = "tokenB"; // same 6550, new token + const c2 = await getClient(); + expect(c2).not.toBe(c1); + }); + + it("keeps the cached client when creds are briefly unreadable (engine mid-write)", async () => { + const c1 = await getClient(); + // Token file momentarily empty/locked while the engine rewrites it: must NOT + // thrash the cache — the live request will fail+retry if it is truly stale. + state.token = ""; + const c2 = await getClient(); + expect(c2).toBe(c1); + }); +}); diff --git a/src/mcp/server.ts b/src/mcp/server.ts index 9c1131f..72878af 100644 --- a/src/mcp/server.ts +++ b/src/mcp/server.ts @@ -40,7 +40,16 @@ let cachedClient: EngineApiClient | null = null; export async function getClient(): Promise { if (cachedClient) { - return cachedClient; + // The engine rotates its api-token (and can change ports) on every launch, so + // a cached client can outlive the engine instance it was built for. If the + // on-disk creds drifted, a silent engine restart happened — drop the stale + // client and reconnect transparently, instead of surfacing the resulting 401 + // as a "disconnected from the project" error. + if (await cachedClient.credentialsChanged()) { + cachedClient = null; + } else { + return cachedClient; + } } try { diff --git a/src/mcp/tools/with-engine.reconnect.test.ts b/src/mcp/tools/with-engine.reconnect.test.ts new file mode 100644 index 0000000..3008cf6 --- /dev/null +++ b/src/mcp/tools/with-engine.reconnect.test.ts @@ -0,0 +1,133 @@ +import { afterEach, describe, expect, it, vi } from "vitest"; + +/** + * Transient-disconnect recovery. The engine rotates its api-token on every launch + * and can move ports, so a restart that lands DURING a tool call shows up as a + * stale-token 401, an ECONNREFUSED on the old port, or a soft `not_connected` / + * `identity_mismatch` terminal state. withEngine must drop the cached client and + * retry ONCE so the drop heals itself instead of surfacing as "disconnected". + * + * It must NOT retry ambiguous/intentional failures (timed_out / content_mismatch + * / denied / canceled) — those could double-apply a mutation or mask user intent. + */ + +const mockGetClient = vi.fn(); +const mockResetClient = vi.fn(); + +vi.mock("../server.js", () => ({ + getClient: (...args: unknown[]) => mockGetClient(...args), + resetClient: (...args: unknown[]) => mockResetClient(...args), +})); + +vi.mock("../../lib/telemetry.js", () => ({ + recordMcpSession: vi.fn(), +})); + +import { withEngine } from "./with-engine.js"; + +afterEach(() => vi.clearAllMocks()); + +function resultText(res: { content: { text: string }[]; isError?: boolean }): string { + return res.content[0]?.text ?? ""; +} + +describe("withEngine — transparent reconnect on a transient drop", () => { + it("retries once after a stale-token 401 and returns the recovered result", async () => { + mockGetClient.mockResolvedValue({}); + let calls = 0; + const res = await withEngine(async () => { + calls += 1; + if (calls === 1) throw new Error("Engine API error 401: bad token"); + return { ok: true, value: 42 }; + }); + expect(calls).toBe(2); + expect(mockResetClient).toHaveBeenCalled(); + expect(res.isError).toBeFalsy(); + expect(resultText(res)).toContain("42"); + }); + + it("retries once after an ECONNREFUSED on the old port", async () => { + mockGetClient.mockResolvedValue({}); + let calls = 0; + const res = await withEngine(async () => { + calls += 1; + if (calls === 1) throw new Error("fetch failed"); + return { ok: true }; + }); + expect(calls).toBe(2); + expect(res.isError).toBeFalsy(); + }); + + it("retries once on a soft not_connected terminalState (nothing applied) then succeeds", async () => { + mockGetClient.mockResolvedValue({}); + let calls = 0; + const res = await withEngine(async () => { + calls += 1; + if (calls === 1) return { terminalState: "not_connected" }; + return { status: "ok", terminalState: "applied", results: [{ ok: true, op: "AddNode" }] }; + }); + expect(calls).toBe(2); + expect(mockResetClient).toHaveBeenCalled(); + expect(res.isError).toBeFalsy(); + }); + + it("retries once on identity_mismatch (wrong instance — never mutated) then succeeds", async () => { + mockGetClient.mockResolvedValue({}); + let calls = 0; + const res = await withEngine(async () => { + calls += 1; + if (calls === 1) return { terminalState: "identity_mismatch" }; + return { status: "ok", terminalState: "applied", results: [{ ok: true }] }; + }); + expect(calls).toBe(2); + expect(res.isError).toBeFalsy(); + }); + + it("does NOT retry an ambiguous timed_out (op may have landed) — surfaces it", async () => { + mockGetClient.mockResolvedValue({}); + let calls = 0; + const res = await withEngine(async () => { + calls += 1; + return { terminalState: "timed_out" }; + }); + expect(calls).toBe(1); // no double-submit + expect(res.isError).toBe(true); + expect(resultText(res)).toMatch(/tim/i); + }); + + it("does NOT retry a normal op failure (invalid value) — surfaces immediately", async () => { + mockGetClient.mockResolvedValue({}); + let calls = 0; + const res = await withEngine(async () => { + calls += 1; + return { results: [{ ok: false, op: "SetProp", error: "invalid value" }] }; + }); + expect(calls).toBe(1); + expect(res.isError).toBe(true); + expect(resultText(res)).toContain("invalid value"); + }); + + it("surfaces the disconnect after the retry also fails (engine truly down)", async () => { + mockGetClient.mockResolvedValue({}); + let calls = 0; + const res = await withEngine(async () => { + calls += 1; + throw new Error("Engine API error 401: still bad"); + }); + expect(calls).toBe(2); + expect(res.isError).toBe(true); + expect(resultText(res)).toMatch(/401/); + }); + + it("still resets the client on a non-retriable throw (preserves prior behavior)", async () => { + mockGetClient.mockResolvedValue({}); + let calls = 0; + const res = await withEngine(async () => { + calls += 1; + throw new Error("Engine API error 500: internal"); + }); + expect(calls).toBe(1); // a 500 may have mutated — do not retry + expect(mockResetClient).toHaveBeenCalled(); + expect(res.isError).toBe(true); + }); +}); diff --git a/src/mcp/tools/with-engine.ts b/src/mcp/tools/with-engine.ts index 238a0ef..2f8def5 100644 --- a/src/mcp/tools/with-engine.ts +++ b/src/mcp/tools/with-engine.ts @@ -71,6 +71,47 @@ export function extractOpError(result: unknown): string | null { return null; } +// Terminal states where the engine GUARANTEES the op never landed, so dropping +// the cached client and retrying once is safe (no double-mutation). Everything +// else — timed_out (may still be running), content_mismatch / denied / canceled +// (intentional) — must surface, never silently retry. +const RECONNECTABLE_TERMINAL_STATES: ReadonlySet = new Set([ + "not_connected", + "identity_mismatch", +]); + +function terminalStateOf(result: unknown): string | null { + if (!result || typeof result !== "object") return null; + const ts = (result as OpResult).terminalState; + return typeof ts === "string" && ts.length > 0 ? ts : null; +} + +/** + * A thrown transport error is safe to retry only when the engine provably never + * applied the op: a stale-token rejection (401/403, after the engine rotated its + * api-token on relaunch) or an unreachable/closed port (connection refused/reset, + * after the engine moved ports or is mid-restart). A timeout or any 5xx may have + * mutated, so those surface instead of risking a double-apply on retry. + * + * Exported for unit tests. + */ +export function isReconnectableThrow(err: unknown): boolean { + const m = (err instanceof Error ? err.message : String(err)).toLowerCase(); + if (m.includes("timed out") || m.includes("timeout") || m.includes("aborted")) { + return false; + } + return ( + m.includes("401") || + m.includes("403") || + m.includes("unauthorized") || + m.includes("econnrefused") || + m.includes("econnreset") || + m.includes("fetch failed") || + m.includes("not running") || + m.includes("not responding") + ); +} + function buildActionHint(message: string): string | null { const normalized = message.toLowerCase(); @@ -96,19 +137,46 @@ export async function withEngine( // No await, no throw, no quota gating. recordMcpSession(); - try { - const client = await getClient(); - const result = await fn(client); - const opError = extractOpError(result); - if (opError) { - const hint = buildActionHint(opError); - const message = hint ? `${opError}\n\nHint: ${hint}` : opError; - return { content: [{ type: "text", text: message }], isError: true }; + // The engine rotates its api-token and can move ports on every launch, so a + // restart that lands DURING a tool call shows up as a stale-token 401, an + // ECONNREFUSED on the old port, or a soft not_connected / identity_mismatch + // terminal state. Drop the cached client and retry ONCE so a transient drop + // heals itself (getClient reconnects with the fresh creds) instead of + // surfacing as a "disconnected" error. Only provably-not-applied failures are + // retried — see RECONNECTABLE_TERMINAL_STATES / isReconnectableThrow. + const MAX_ATTEMPTS = 2; + let lastError: unknown; + + for (let attempt = 1; attempt <= MAX_ATTEMPTS; attempt++) { + try { + const client = await getClient(); + const result = await fn(client); + const opError = extractOpError(result); + if (opError) { + const ts = terminalStateOf(result); + if (ts && RECONNECTABLE_TERMINAL_STATES.has(ts) && attempt < MAX_ATTEMPTS) { + resetClient(); + lastError = new Error(opError); + continue; + } + const hint = buildActionHint(opError); + const message = hint ? `${opError}\n\nHint: ${hint}` : opError; + return { content: [{ type: "text", text: message }], isError: true }; + } + return { content: [{ type: "text", text: JSON.stringify(result, null, 2) }] }; + } catch (err) { + // A thrown error means the cached client may be pointed at a dead/rotated + // engine — always drop it (prior behavior). Retry once only for + // connection-class throws that provably did not mutate. + resetClient(); + lastError = err; + if (attempt < MAX_ATTEMPTS && isReconnectableThrow(err)) { + continue; + } + break; } - return { content: [{ type: "text", text: JSON.stringify(result, null, 2) }] }; - } catch (err) { - resetClient(); - const msg = err instanceof Error ? err.message : String(err); - return { content: [{ type: "text", text: msg }], isError: true }; } + + const msg = lastError instanceof Error ? lastError.message : String(lastError); + return { content: [{ type: "text", text: msg }], isError: true }; }