diff --git a/.github/workflows/flake-repro.yml b/.github/workflows/flake-repro.yml new file mode 100644 index 000000000..48cce7876 --- /dev/null +++ b/.github/workflows/flake-repro.yml @@ -0,0 +1,159 @@ +# Manually-triggered flake REPRODUCER. The CI-only concurrency flakes +# (ThreadSubmissionIntegrationTest, CodeEditRecompileTest, …) are positive-wait +# timeouts of the "race-the-watcher β†’ wedged hub / dropped cross-mirror write" +# class β€” they pass in isolation, as a full class, and under DOTNET_PROCESSOR_COUNT=2 +# locally, but trip under the real 2-vCPU CI runner. This workflow loops one +# suspect test ON THAT RUNNER until it fails, with logging cranked up via ENV +# (never by editing a committed appsettings/Log* level), and uploads the FAILING +# iteration's trx + MonolithMeshTestBase traces + blame hang-dump so the actual +# wedge is diagnosable β€” i.e. it manufactures the repro the fix must be proven against. +# +# Run it from the Actions tab (workflow_dispatch). Defaults target CodeEditRecompile; +# re-dispatch with project=test/MeshWeaver.AI.Test filter=FullyQualifiedName~ThreadSubmissionIntegrationTest +# (etc.) for the others. +name: Flake repro (manual) + +on: + workflow_dispatch: + inputs: + project: + description: "Test project to loop" + type: string + default: "test/MeshWeaver.Hosting.Monolith.Test" + filter: + description: "dotnet test --filter expression (the suspect test)" + type: string + default: "FullyQualifiedName~CodeEditRecompileTest.PressingCompileButton_SetsRequestedReleaseAt_AndProducesNewRelease" + iterations: + description: "Max loop iterations (stops on first failure)" + type: string + default: "25" + loglevel: + description: "Default log level for the looped runs (Debug/Trace to surface the watcher race)" + type: string + default: "Debug" + +permissions: + contents: read + +env: + # Same workspace-pinned NuGet cache as dotnet-test.yml so a --no-build/--no-restore + # loop resolves packages offline. + NUGET_PACKAGES: ${{ github.workspace }}/.nuget/packages + +jobs: + repro: + name: "Loop suspect test until fail" + # ubuntu-latest == the 2-vCPU GitHub-hosted runner the real flake needs. + runs-on: ubuntu-latest + timeout-minutes: 60 + # Effective params: workflow_dispatch inputs when present, else push-trigger defaults. + env: + REPRO_PROJECT: ${{ inputs.project || 'test/MeshWeaver.Hosting.Monolith.Test' }} + REPRO_FILTER: ${{ inputs.filter || 'FullyQualifiedName~CodeEditRecompileTest.PressingCompileButton_SetsRequestedReleaseAt_AndProducesNewRelease' }} + REPRO_ITERS: ${{ inputs.iterations || '80' }} + DOTNET_ENVIRONMENT: Development + # 🚨 Logging cranked via ENV only β€” never a committed appsettings/Log* change + # (those are production cost contract). Surfaces the watcher/messaging trace. + Logging__LogLevel__Default: ${{ inputs.loglevel || 'Debug' }} + steps: + - uses: actions/checkout@v6 + # Mirror dotnet-test.yml: reclaim preinstalled tooling a .NET+Postgres run never + # touches so the build output + Testcontainers image fit the ~14 GB runner disk. + - name: Free disk space + uses: jlumbroso/free-disk-space@main + with: + tool-cache: false + dotnet: false + android: true + haskell: true + large-packages: true + docker-images: true + swap-storage: true + - name: Setup .NET + uses: actions/setup-dotnet@v5 + with: + dotnet-version: 10.0.x + - name: Restore workloads + run: dotnet workload restore + - name: Restore dependencies + run: dotnet restore + # Build the whole solution in Release EXACTLY as the real CI build job does, so + # the looped --no-build runs execute the identical binaries that flake in CI + # (Release, same warnings-as-errors gate). Faithful repro > speed. + - name: Build + run: dotnet build --no-restore -c Release -p:CIRun=true -warnaserror + # The mesh-local #r feed some dynamic-compilation tests resolve at runtime + # (version-less `#r "nuget:MeshWeaver.X"`), packed exactly as dotnet-test.yml does. + - name: Pack mesh-local #r packages + run: | + set -euo pipefail + dotnet pack src/MeshWeaver.BusinessRules/MeshWeaver.BusinessRules.csproj \ + -c Release --no-build --no-restore -o dist/packages --nologo + dotnet pack src/MeshWeaver.BusinessRules.Generator/MeshWeaver.BusinessRules.Generator.csproj \ + -c Release --no-build --no-restore -o dist/packages --nologo + # 🚨 DO NOT crank the CompileWatcher log levels here. Repro #3/#4 proved it's a HEISENBUG: + # Debug logging in the watcher hot path changes the scheduling enough to MASK the race + # (runs #1/#2 caught it at iter 19/4 at the committed Warning level; runs #3/#4 with the + # categories at Debug missed it across 40+80 iters). So we keep production-like timing and + # rely on the timing-NEUTRAL stuck-state diagnostic in WaitForLatestRelease (it runs only + # on the 50s timeout, never during the race) to pin the stalled stage. + - name: Loop the suspect test until it fails + run: | + set -uo pipefail + iters="$REPRO_ITERS" + proj="$REPRO_PROJECT" + filt="$REPRO_FILTER" + echo "Looping '$filt' in $proj up to $itersΓ— on a 2-vCPU runner (nproc=$(nproc))." + failed=0 + for i in $(seq 1 "$iters"); do + echo "::group::iteration $i / $iters" + # --no-build --no-restore: run the Release binaries built above, exactly + # like the CI shards. blame-hang-* captures a mini-dump if the host wedges + # (the "unresponsive after the second compile" symptom) instead of just + # timing out opaquely. + dotnet test "$proj" -c Release --no-build --no-restore \ + --filter "$filt" -l:trx \ + --blame-hang-timeout 120s --blame-hang-dump-type mini + rc=$? + echo "::endgroup::" + if [ "$rc" -ne 0 ]; then + echo "::error::REPRO on iteration $i (exit $rc) β€” diagnostics uploaded below." + failed=1 + break + fi + echo "iteration $i passed" + done + if [ "$failed" = "0" ]; then + echo "No repro in $iters iterations (the flake did not surface this run)." + fi + # Exit non-zero on repro so the run is CLEARLY marked failed in the Actions + # list (repro achieved = red). The diagnostics-collection + upload steps + # below run regardless via `if: always()`. + exit "$failed" + - name: Collect diagnostics + if: always() + run: | + mkdir -p repro-diagnostics + # trx (per-iteration; the last one is the failing run since we break on fail). + find "$REPRO_PROJECT" -name '*.trx' -exec cp {} repro-diagnostics/ \; 2>/dev/null || true + # blame hang mini-dump (written when the test host wedged, not just timed out). + find "$REPRO_PROJECT" -name 'blame-*.dmp' -exec cp {} repro-diagnostics/ \; 2>/dev/null || true + # MonolithMeshTestBase phase/dispose/memory traces (the watcher + INIT/DISPOSE + # trace that pinpoints the wedge), written to the process tempdir. + for f in meshweaver-test-trace meshweaver-dispose-trace meshweaver-memory-delta; do + [ -f "/tmp/$f.log" ] && cp "/tmp/$f.log" "repro-diagnostics/_$f.log" || true + done + # XUnitFileOutputHelper writes per-test/background-hub logs under bin/.../test-logs + # (the cranked watcher trace lands here when no test is the active ITestOutputHelper). + # Search repo-wide β€” background-hub logs can land outside the suspect project dir. + find . -path '*/test-logs/*.log' -exec cp {} repro-diagnostics/ \; 2>/dev/null || true + echo "Collected:"; ls -lh repro-diagnostics/ || true + - name: Upload repro diagnostics + if: always() + uses: actions/upload-artifact@v6 + with: + name: flake-repro-diagnostics + path: repro-diagnostics/ + retention-days: 15 + compression-level: 9 diff --git a/test/MeshWeaver.Hosting.Monolith.Test/CodeEditRecompileTest.cs b/test/MeshWeaver.Hosting.Monolith.Test/CodeEditRecompileTest.cs index 85c9ef773..d7dd022a8 100644 --- a/test/MeshWeaver.Hosting.Monolith.Test/CodeEditRecompileTest.cs +++ b/test/MeshWeaver.Hosting.Monolith.Test/CodeEditRecompileTest.cs @@ -238,7 +238,11 @@ await NodeFactory.CreateNode(new MeshNode("instance2", $"{TestPartition}/CodeEdi /// (EnrichWithNodeType slow path faulted, SubscribeRequest /// timeouts β†’ compilation-error overlay β†’ MARKER_V1 never rendered). /// - [Fact(Timeout = 60000)] + // 120s (not the usual 60s): the happy path completes in seconds; the budget is for the + // FAILURE path β€” WaitForLatestRelease's 50s primary wait plus its discriminating timeout + // diagnostic (mirror/index probes + decisive re-trigger, worst case ~50s more) must fit + // inside the xUnit method timeout or the diagnostic is cancelled before it can be emitted. + [Fact(Timeout = 120000)] public async Task NodeType_RequestedReleasePath_PinsToHistoricalRelease() { var workspace = Mesh.GetWorkspace(); @@ -393,7 +397,11 @@ await NodeFactory.CreateNode(new MeshNode("unpinnedInstance", $"{TestPartition}/ /// NodeTypeLayoutAreas.BuildCompileStatusPanel) binds to this /// state machine, so this is the test that gates the UI. /// - [Fact(Timeout = 60000)] + // 120s (not the usual 60s): the happy path completes in seconds; the budget is for the + // FAILURE path β€” WaitForLatestRelease's 50s primary wait plus its discriminating timeout + // diagnostic (mirror/index probes + decisive re-trigger, worst case ~50s more) must fit + // inside the xUnit method timeout or the diagnostic is cancelled before it can be emitted. + [Fact(Timeout = 120000)] public async Task IsDirty_FlipsTrueOnSourceEdit_FalseAfterCompile() { var workspace = Mesh.GetWorkspace(); @@ -550,7 +558,11 @@ await Mesh.GetWorkspace().GetMeshNodeStream(typePath) /// β€” same code path the button executes β€” and asserts both the property /// flip and the resulting release. /// - [Fact(Timeout = 60000)] + // 120s (not the usual 60s): the happy path completes in seconds; the budget is for the + // FAILURE path β€” WaitForLatestRelease's 50s primary wait plus its discriminating timeout + // diagnostic (mirror/index probes + decisive re-trigger, worst case ~50s more) must fit + // inside the xUnit method timeout or the diagnostic is cancelled before it can be emitted. + [Fact(Timeout = 120000)] public async Task PressingCompileButton_SetsRequestedReleaseAt_AndProducesNewRelease() { var workspace = Mesh.GetWorkspace(); @@ -663,13 +675,87 @@ await workspace.GetMeshNodeStream(nodeTypePath).Update(curr => private async Task WaitForLatestRelease(string nodeTypePath, string? knownRelease) { var workspace = Mesh.GetWorkspace(); - var node = await workspace.GetMeshNodeStream(nodeTypePath) - .Should().Within(50.Seconds()) - .Match(n => n?.Content is NodeTypeDefinition def - && def.CompilationStatus == CompilationStatus.Ok - && !string.IsNullOrEmpty(def.LatestReleasePath) - && def.LatestReleasePath != knownRelease); - return ((NodeTypeDefinition)node.Content!).LatestReleasePath!; + try + { + var node = await workspace.GetMeshNodeStream(nodeTypePath) + .Should().Within(50.Seconds()) + .Match(n => n?.Content is NodeTypeDefinition def + && def.CompilationStatus == CompilationStatus.Ok + && !string.IsNullOrEmpty(def.LatestReleasePath) + && def.LatestReleasePath != knownRelease); + return ((NodeTypeDefinition)node.Content!).LatestReleasePath!; + } + catch (Exception ex) + { + // Diagnostic: on timeout, discriminate OWNER-side (never produced v2) from a + // DELIVERY / mirror-staleness race (owner produced v2 but the cross-hub mirror the test + // reads never received it). Compare two INDEPENDENT views of the SAME node: + // MIRROR β€” the single-node cross-hub cache handle the test/watcher read. + // INDEX β€” the mesh query (a DIFFERENT path: the owner's persisted+indexed state), + // plus the Release children. If INDEX shows a fresh v2 / Handled while MIRROR + // is stale at v1 / Handled=null β†’ delivery race confirmed (owner did the work). + // If BOTH are stale β†’ owner-side (the watcher/compile never produced v2). + var cur = await workspace.GetMeshNodeStream(nodeTypePath) + .Where(n => n is not null).Take(1).Timeout(5.Seconds()); + var m = cur?.Content as NodeTypeDefinition; + + var meshService = Mesh.ServiceProvider.GetService(); + string indexNode = "(no IMeshService)", releases = "(no IMeshService)"; + if (meshService is not null) + { + try + { + // Best-effort: take the FIRST emission bounded to 5s β€” waiting for a + // non-empty snapshot could stall the whole bound when the node is genuinely + // absent from the index (itself a diagnostic result). + var idx = await meshService.Query(MeshQueryRequest.FromQuery($"path:{nodeTypePath}")) + .Take(1).Timeout(5.Seconds()); + var id = idx.Items.FirstOrDefault()?.Content as NodeTypeDefinition; + indexNode = id is null ? "(not in index)" + : $"Status={id.CompilationStatus}, Latest={id.LatestReleasePath ?? "(null)"}, " + + $"Handled={id.LastReleaseRequestHandledAt:O}"; + } + catch (Exception qx) { indexNode = $"(query failed: {qx.GetType().Name})"; } + try + { + var rel = await meshService.Query( + MeshQueryRequest.FromQuery($"path:{nodeTypePath}/Release scope:descendants")) + .Take(1).Timeout(5.Seconds()); + releases = rel.Items.Count == 0 ? "(none)" : string.Join(", ", rel.Items.Select(r => r.Path)); + } + catch (Exception qx) { releases = $"(query failed: {qx.GetType().Name})"; } + } + + // (a) vs (b): re-trigger with a FRESH timestamp. If the watcher is alive and merely MISSED + // the first settled-trigger emission, this second trigger produces v2 β†’ "RECOVERED" (sub-case a: + // a one-time missed/lost emission). If it stays stuck, the watcher/state is persistently + // clobbered or the subscription is dead (sub-case b). Decisive + repro-cheap. + string retrigger; + try + { + var t2 = DateTimeOffset.UtcNow.AddMilliseconds(1); + await workspace.GetMeshNodeStream(nodeTypePath).Update(curr => + curr?.Content is NodeTypeDefinition cd + ? curr with { Content = cd with { RequestedReleaseAt = t2, RequestedReleaseForce = true } } + : curr!).Should().Within(10.Seconds()).Emit(); + var v2 = await workspace.GetMeshNodeStream(nodeTypePath) + .Where(n => n?.Content is NodeTypeDefinition d2 + && d2.CompilationStatus == CompilationStatus.Ok + && !string.IsNullOrEmpty(d2.LatestReleasePath) + && d2.LatestReleasePath != knownRelease) + .Take(1).Timeout(25.Seconds()); + retrigger = $"RECOVERED via re-trigger β†’ v2={((NodeTypeDefinition)v2.Content!).LatestReleasePath} (sub-case a: first emission missed)"; + } + catch (Exception rx) { retrigger = $"STILL STUCK after re-trigger ({rx.GetType().Name}) (sub-case b: persistent clobber / dead subscription)"; } + + throw new Exception( + $"WaitForLatestRelease stuck for {nodeTypePath}: known={knownRelease ?? "(null)"}\n" + + $" MIRROR: Status={m?.CompilationStatus}, Latest={m?.LatestReleasePath ?? "(null)"}, " + + $"ReqAt={m?.RequestedReleaseAt:O}, Handled={m?.LastReleaseRequestHandledAt:O}, Force={m?.RequestedReleaseForce}\n" + + $" INDEX node: {indexNode}\n" + + $" INDEX Release children: [{releases}]\n" + + $" RE-TRIGGER: {retrigger}", ex); + } } private async Task SendCreateRelease(string nodeTypePath, bool force)