diff --git a/.github/workflows/flake-repro.yml b/.github/workflows/flake-repro.yml
new file mode 100644
index 000000000..48cce7876
--- /dev/null
+++ b/.github/workflows/flake-repro.yml
@@ -0,0 +1,159 @@
+# Manually-triggered flake REPRODUCER. The CI-only concurrency flakes
+# (ThreadSubmissionIntegrationTest, CodeEditRecompileTest, β¦) are positive-wait
+# timeouts of the "race-the-watcher β wedged hub / dropped cross-mirror write"
+# class β they pass in isolation, as a full class, and under DOTNET_PROCESSOR_COUNT=2
+# locally, but trip under the real 2-vCPU CI runner. This workflow loops one
+# suspect test ON THAT RUNNER until it fails, with logging cranked up via ENV
+# (never by editing a committed appsettings/Log* level), and uploads the FAILING
+# iteration's trx + MonolithMeshTestBase traces + blame hang-dump so the actual
+# wedge is diagnosable β i.e. it manufactures the repro the fix must be proven against.
+#
+# Run it from the Actions tab (workflow_dispatch). Defaults target CodeEditRecompile;
+# re-dispatch with project=test/MeshWeaver.AI.Test filter=FullyQualifiedName~ThreadSubmissionIntegrationTest
+# (etc.) for the others.
+name: Flake repro (manual)
+
+on:
+ workflow_dispatch:
+ inputs:
+ project:
+ description: "Test project to loop"
+ type: string
+ default: "test/MeshWeaver.Hosting.Monolith.Test"
+ filter:
+ description: "dotnet test --filter expression (the suspect test)"
+ type: string
+ default: "FullyQualifiedName~CodeEditRecompileTest.PressingCompileButton_SetsRequestedReleaseAt_AndProducesNewRelease"
+ iterations:
+ description: "Max loop iterations (stops on first failure)"
+ type: string
+ default: "25"
+ loglevel:
+ description: "Default log level for the looped runs (Debug/Trace to surface the watcher race)"
+ type: string
+ default: "Debug"
+
+permissions:
+ contents: read
+
+env:
+ # Same workspace-pinned NuGet cache as dotnet-test.yml so a --no-build/--no-restore
+ # loop resolves packages offline.
+ NUGET_PACKAGES: ${{ github.workspace }}/.nuget/packages
+
+jobs:
+ repro:
+ name: "Loop suspect test until fail"
+ # ubuntu-latest == the 2-vCPU GitHub-hosted runner the real flake needs.
+ runs-on: ubuntu-latest
+ timeout-minutes: 60
+ # Effective params: workflow_dispatch inputs when present, else push-trigger defaults.
+ env:
+ REPRO_PROJECT: ${{ inputs.project || 'test/MeshWeaver.Hosting.Monolith.Test' }}
+ REPRO_FILTER: ${{ inputs.filter || 'FullyQualifiedName~CodeEditRecompileTest.PressingCompileButton_SetsRequestedReleaseAt_AndProducesNewRelease' }}
+ REPRO_ITERS: ${{ inputs.iterations || '80' }}
+ DOTNET_ENVIRONMENT: Development
+ # π¨ Logging cranked via ENV only β never a committed appsettings/Log* change
+ # (those are production cost contract). Surfaces the watcher/messaging trace.
+ Logging__LogLevel__Default: ${{ inputs.loglevel || 'Debug' }}
+ steps:
+ - uses: actions/checkout@v6
+ # Mirror dotnet-test.yml: reclaim preinstalled tooling a .NET+Postgres run never
+ # touches so the build output + Testcontainers image fit the ~14 GB runner disk.
+ - name: Free disk space
+ uses: jlumbroso/free-disk-space@main
+ with:
+ tool-cache: false
+ dotnet: false
+ android: true
+ haskell: true
+ large-packages: true
+ docker-images: true
+ swap-storage: true
+ - name: Setup .NET
+ uses: actions/setup-dotnet@v5
+ with:
+ dotnet-version: 10.0.x
+ - name: Restore workloads
+ run: dotnet workload restore
+ - name: Restore dependencies
+ run: dotnet restore
+ # Build the whole solution in Release EXACTLY as the real CI build job does, so
+ # the looped --no-build runs execute the identical binaries that flake in CI
+ # (Release, same warnings-as-errors gate). Faithful repro > speed.
+ - name: Build
+ run: dotnet build --no-restore -c Release -p:CIRun=true -warnaserror
+ # The mesh-local #r feed some dynamic-compilation tests resolve at runtime
+ # (version-less `#r "nuget:MeshWeaver.X"`), packed exactly as dotnet-test.yml does.
+ - name: Pack mesh-local #r packages
+ run: |
+ set -euo pipefail
+ dotnet pack src/MeshWeaver.BusinessRules/MeshWeaver.BusinessRules.csproj \
+ -c Release --no-build --no-restore -o dist/packages --nologo
+ dotnet pack src/MeshWeaver.BusinessRules.Generator/MeshWeaver.BusinessRules.Generator.csproj \
+ -c Release --no-build --no-restore -o dist/packages --nologo
+ # π¨ DO NOT crank the CompileWatcher log levels here. Repro #3/#4 proved it's a HEISENBUG:
+ # Debug logging in the watcher hot path changes the scheduling enough to MASK the race
+ # (runs #1/#2 caught it at iter 19/4 at the committed Warning level; runs #3/#4 with the
+ # categories at Debug missed it across 40+80 iters). So we keep production-like timing and
+ # rely on the timing-NEUTRAL stuck-state diagnostic in WaitForLatestRelease (it runs only
+ # on the 50s timeout, never during the race) to pin the stalled stage.
+ - name: Loop the suspect test until it fails
+ run: |
+ set -uo pipefail
+ iters="$REPRO_ITERS"
+ proj="$REPRO_PROJECT"
+ filt="$REPRO_FILTER"
+ echo "Looping '$filt' in $proj up to $itersΓ on a 2-vCPU runner (nproc=$(nproc))."
+ failed=0
+ for i in $(seq 1 "$iters"); do
+ echo "::group::iteration $i / $iters"
+ # --no-build --no-restore: run the Release binaries built above, exactly
+ # like the CI shards. blame-hang-* captures a mini-dump if the host wedges
+ # (the "unresponsive after the second compile" symptom) instead of just
+ # timing out opaquely.
+ dotnet test "$proj" -c Release --no-build --no-restore \
+ --filter "$filt" -l:trx \
+ --blame-hang-timeout 120s --blame-hang-dump-type mini
+ rc=$?
+ echo "::endgroup::"
+ if [ "$rc" -ne 0 ]; then
+ echo "::error::REPRO on iteration $i (exit $rc) β diagnostics uploaded below."
+ failed=1
+ break
+ fi
+ echo "iteration $i passed"
+ done
+ if [ "$failed" = "0" ]; then
+ echo "No repro in $iters iterations (the flake did not surface this run)."
+ fi
+ # Exit non-zero on repro so the run is CLEARLY marked failed in the Actions
+ # list (repro achieved = red). The diagnostics-collection + upload steps
+ # below run regardless via `if: always()`.
+ exit "$failed"
+ - name: Collect diagnostics
+ if: always()
+ run: |
+ mkdir -p repro-diagnostics
+ # trx (per-iteration; the last one is the failing run since we break on fail).
+ find "$REPRO_PROJECT" -name '*.trx' -exec cp {} repro-diagnostics/ \; 2>/dev/null || true
+ # blame hang mini-dump (written when the test host wedged, not just timed out).
+ find "$REPRO_PROJECT" -name 'blame-*.dmp' -exec cp {} repro-diagnostics/ \; 2>/dev/null || true
+ # MonolithMeshTestBase phase/dispose/memory traces (the watcher + INIT/DISPOSE
+ # trace that pinpoints the wedge), written to the process tempdir.
+ for f in meshweaver-test-trace meshweaver-dispose-trace meshweaver-memory-delta; do
+ [ -f "/tmp/$f.log" ] && cp "/tmp/$f.log" "repro-diagnostics/_$f.log" || true
+ done
+ # XUnitFileOutputHelper writes per-test/background-hub logs under bin/.../test-logs
+ # (the cranked watcher trace lands here when no test is the active ITestOutputHelper).
+ # Search repo-wide β background-hub logs can land outside the suspect project dir.
+ find . -path '*/test-logs/*.log' -exec cp {} repro-diagnostics/ \; 2>/dev/null || true
+ echo "Collected:"; ls -lh repro-diagnostics/ || true
+ - name: Upload repro diagnostics
+ if: always()
+ uses: actions/upload-artifact@v6
+ with:
+ name: flake-repro-diagnostics
+ path: repro-diagnostics/
+ retention-days: 15
+ compression-level: 9
diff --git a/test/MeshWeaver.Hosting.Monolith.Test/CodeEditRecompileTest.cs b/test/MeshWeaver.Hosting.Monolith.Test/CodeEditRecompileTest.cs
index 85c9ef773..d7dd022a8 100644
--- a/test/MeshWeaver.Hosting.Monolith.Test/CodeEditRecompileTest.cs
+++ b/test/MeshWeaver.Hosting.Monolith.Test/CodeEditRecompileTest.cs
@@ -238,7 +238,11 @@ await NodeFactory.CreateNode(new MeshNode("instance2", $"{TestPartition}/CodeEdi
/// (EnrichWithNodeType slow path faulted, SubscribeRequest
/// timeouts β compilation-error overlay β MARKER_V1 never rendered).
///
- [Fact(Timeout = 60000)]
+ // 120s (not the usual 60s): the happy path completes in seconds; the budget is for the
+ // FAILURE path β WaitForLatestRelease's 50s primary wait plus its discriminating timeout
+ // diagnostic (mirror/index probes + decisive re-trigger, worst case ~50s more) must fit
+ // inside the xUnit method timeout or the diagnostic is cancelled before it can be emitted.
+ [Fact(Timeout = 120000)]
public async Task NodeType_RequestedReleasePath_PinsToHistoricalRelease()
{
var workspace = Mesh.GetWorkspace();
@@ -393,7 +397,11 @@ await NodeFactory.CreateNode(new MeshNode("unpinnedInstance", $"{TestPartition}/
/// NodeTypeLayoutAreas.BuildCompileStatusPanel) binds to this
/// state machine, so this is the test that gates the UI.
///
- [Fact(Timeout = 60000)]
+ // 120s (not the usual 60s): the happy path completes in seconds; the budget is for the
+ // FAILURE path β WaitForLatestRelease's 50s primary wait plus its discriminating timeout
+ // diagnostic (mirror/index probes + decisive re-trigger, worst case ~50s more) must fit
+ // inside the xUnit method timeout or the diagnostic is cancelled before it can be emitted.
+ [Fact(Timeout = 120000)]
public async Task IsDirty_FlipsTrueOnSourceEdit_FalseAfterCompile()
{
var workspace = Mesh.GetWorkspace();
@@ -550,7 +558,11 @@ await Mesh.GetWorkspace().GetMeshNodeStream(typePath)
/// β same code path the button executes β and asserts both the property
/// flip and the resulting release.
///
- [Fact(Timeout = 60000)]
+ // 120s (not the usual 60s): the happy path completes in seconds; the budget is for the
+ // FAILURE path β WaitForLatestRelease's 50s primary wait plus its discriminating timeout
+ // diagnostic (mirror/index probes + decisive re-trigger, worst case ~50s more) must fit
+ // inside the xUnit method timeout or the diagnostic is cancelled before it can be emitted.
+ [Fact(Timeout = 120000)]
public async Task PressingCompileButton_SetsRequestedReleaseAt_AndProducesNewRelease()
{
var workspace = Mesh.GetWorkspace();
@@ -663,13 +675,87 @@ await workspace.GetMeshNodeStream(nodeTypePath).Update(curr =>
private async Task WaitForLatestRelease(string nodeTypePath, string? knownRelease)
{
var workspace = Mesh.GetWorkspace();
- var node = await workspace.GetMeshNodeStream(nodeTypePath)
- .Should().Within(50.Seconds())
- .Match(n => n?.Content is NodeTypeDefinition def
- && def.CompilationStatus == CompilationStatus.Ok
- && !string.IsNullOrEmpty(def.LatestReleasePath)
- && def.LatestReleasePath != knownRelease);
- return ((NodeTypeDefinition)node.Content!).LatestReleasePath!;
+ try
+ {
+ var node = await workspace.GetMeshNodeStream(nodeTypePath)
+ .Should().Within(50.Seconds())
+ .Match(n => n?.Content is NodeTypeDefinition def
+ && def.CompilationStatus == CompilationStatus.Ok
+ && !string.IsNullOrEmpty(def.LatestReleasePath)
+ && def.LatestReleasePath != knownRelease);
+ return ((NodeTypeDefinition)node.Content!).LatestReleasePath!;
+ }
+ catch (Exception ex)
+ {
+ // Diagnostic: on timeout, discriminate OWNER-side (never produced v2) from a
+ // DELIVERY / mirror-staleness race (owner produced v2 but the cross-hub mirror the test
+ // reads never received it). Compare two INDEPENDENT views of the SAME node:
+ // MIRROR β the single-node cross-hub cache handle the test/watcher read.
+ // INDEX β the mesh query (a DIFFERENT path: the owner's persisted+indexed state),
+ // plus the Release children. If INDEX shows a fresh v2 / Handled while MIRROR
+ // is stale at v1 / Handled=null β delivery race confirmed (owner did the work).
+ // If BOTH are stale β owner-side (the watcher/compile never produced v2).
+ var cur = await workspace.GetMeshNodeStream(nodeTypePath)
+ .Where(n => n is not null).Take(1).Timeout(5.Seconds());
+ var m = cur?.Content as NodeTypeDefinition;
+
+ var meshService = Mesh.ServiceProvider.GetService();
+ string indexNode = "(no IMeshService)", releases = "(no IMeshService)";
+ if (meshService is not null)
+ {
+ try
+ {
+ // Best-effort: take the FIRST emission bounded to 5s β waiting for a
+ // non-empty snapshot could stall the whole bound when the node is genuinely
+ // absent from the index (itself a diagnostic result).
+ var idx = await meshService.Query(MeshQueryRequest.FromQuery($"path:{nodeTypePath}"))
+ .Take(1).Timeout(5.Seconds());
+ var id = idx.Items.FirstOrDefault()?.Content as NodeTypeDefinition;
+ indexNode = id is null ? "(not in index)"
+ : $"Status={id.CompilationStatus}, Latest={id.LatestReleasePath ?? "(null)"}, " +
+ $"Handled={id.LastReleaseRequestHandledAt:O}";
+ }
+ catch (Exception qx) { indexNode = $"(query failed: {qx.GetType().Name})"; }
+ try
+ {
+ var rel = await meshService.Query(
+ MeshQueryRequest.FromQuery($"path:{nodeTypePath}/Release scope:descendants"))
+ .Take(1).Timeout(5.Seconds());
+ releases = rel.Items.Count == 0 ? "(none)" : string.Join(", ", rel.Items.Select(r => r.Path));
+ }
+ catch (Exception qx) { releases = $"(query failed: {qx.GetType().Name})"; }
+ }
+
+ // (a) vs (b): re-trigger with a FRESH timestamp. If the watcher is alive and merely MISSED
+ // the first settled-trigger emission, this second trigger produces v2 β "RECOVERED" (sub-case a:
+ // a one-time missed/lost emission). If it stays stuck, the watcher/state is persistently
+ // clobbered or the subscription is dead (sub-case b). Decisive + repro-cheap.
+ string retrigger;
+ try
+ {
+ var t2 = DateTimeOffset.UtcNow.AddMilliseconds(1);
+ await workspace.GetMeshNodeStream(nodeTypePath).Update(curr =>
+ curr?.Content is NodeTypeDefinition cd
+ ? curr with { Content = cd with { RequestedReleaseAt = t2, RequestedReleaseForce = true } }
+ : curr!).Should().Within(10.Seconds()).Emit();
+ var v2 = await workspace.GetMeshNodeStream(nodeTypePath)
+ .Where(n => n?.Content is NodeTypeDefinition d2
+ && d2.CompilationStatus == CompilationStatus.Ok
+ && !string.IsNullOrEmpty(d2.LatestReleasePath)
+ && d2.LatestReleasePath != knownRelease)
+ .Take(1).Timeout(25.Seconds());
+ retrigger = $"RECOVERED via re-trigger β v2={((NodeTypeDefinition)v2.Content!).LatestReleasePath} (sub-case a: first emission missed)";
+ }
+ catch (Exception rx) { retrigger = $"STILL STUCK after re-trigger ({rx.GetType().Name}) (sub-case b: persistent clobber / dead subscription)"; }
+
+ throw new Exception(
+ $"WaitForLatestRelease stuck for {nodeTypePath}: known={knownRelease ?? "(null)"}\n" +
+ $" MIRROR: Status={m?.CompilationStatus}, Latest={m?.LatestReleasePath ?? "(null)"}, " +
+ $"ReqAt={m?.RequestedReleaseAt:O}, Handled={m?.LastReleaseRequestHandledAt:O}, Force={m?.RequestedReleaseForce}\n" +
+ $" INDEX node: {indexNode}\n" +
+ $" INDEX Release children: [{releases}]\n" +
+ $" RE-TRIGGER: {retrigger}", ex);
+ }
}
private async Task SendCreateRelease(string nodeTypePath, bool force)