Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
159 changes: 159 additions & 0 deletions .github/workflows/flake-repro.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
# Manually-triggered flake REPRODUCER. The CI-only concurrency flakes
# (ThreadSubmissionIntegrationTest, CodeEditRecompileTest, …) are positive-wait
# timeouts of the "race-the-watcher → wedged hub / dropped cross-mirror write"
# class — they pass in isolation, as a full class, and under DOTNET_PROCESSOR_COUNT=2
# locally, but trip under the real 2-vCPU CI runner. This workflow loops one
# suspect test ON THAT RUNNER until it fails, with logging cranked up via ENV
# (never by editing a committed appsettings/Log* level), and uploads the FAILING
# iteration's trx + MonolithMeshTestBase traces + blame hang-dump so the actual
# wedge is diagnosable — i.e. it manufactures the repro the fix must be proven against.
#
# Run it from the Actions tab (workflow_dispatch). Defaults target CodeEditRecompile;
# re-dispatch with project=test/MeshWeaver.AI.Test filter=FullyQualifiedName~ThreadSubmissionIntegrationTest
# (etc.) for the others.
name: Flake repro (manual)

on:
workflow_dispatch:
inputs:
project:
description: "Test project to loop"
type: string
default: "test/MeshWeaver.Hosting.Monolith.Test"
filter:
description: "dotnet test --filter expression (the suspect test)"
type: string
default: "FullyQualifiedName~CodeEditRecompileTest.PressingCompileButton_SetsRequestedReleaseAt_AndProducesNewRelease"
iterations:
description: "Max loop iterations (stops on first failure)"
type: string
default: "25"
loglevel:
description: "Default log level for the looped runs (Debug/Trace to surface the watcher race)"
type: string
default: "Debug"

permissions:
contents: read

env:
# Same workspace-pinned NuGet cache as dotnet-test.yml so a --no-build/--no-restore
# loop resolves packages offline.
NUGET_PACKAGES: ${{ github.workspace }}/.nuget/packages

jobs:
repro:
name: "Loop suspect test until fail"
# ubuntu-latest == the 2-vCPU GitHub-hosted runner the real flake needs.
runs-on: ubuntu-latest
timeout-minutes: 60
# Effective params: workflow_dispatch inputs when present, else push-trigger defaults.
env:
REPRO_PROJECT: ${{ inputs.project || 'test/MeshWeaver.Hosting.Monolith.Test' }}
REPRO_FILTER: ${{ inputs.filter || 'FullyQualifiedName~CodeEditRecompileTest.PressingCompileButton_SetsRequestedReleaseAt_AndProducesNewRelease' }}
REPRO_ITERS: ${{ inputs.iterations || '80' }}
DOTNET_ENVIRONMENT: Development
# 🚨 Logging cranked via ENV only — never a committed appsettings/Log* change
# (those are production cost contract). Surfaces the watcher/messaging trace.
Logging__LogLevel__Default: ${{ inputs.loglevel || 'Debug' }}
steps:
- uses: actions/checkout@v6
# Mirror dotnet-test.yml: reclaim preinstalled tooling a .NET+Postgres run never
# touches so the build output + Testcontainers image fit the ~14 GB runner disk.
- name: Free disk space
uses: jlumbroso/free-disk-space@main
with:
tool-cache: false
dotnet: false
android: true
haskell: true
large-packages: true
docker-images: true
swap-storage: true
- name: Setup .NET
uses: actions/setup-dotnet@v5
with:
dotnet-version: 10.0.x
- name: Restore workloads
run: dotnet workload restore
- name: Restore dependencies
run: dotnet restore
# Build the whole solution in Release EXACTLY as the real CI build job does, so
# the looped --no-build runs execute the identical binaries that flake in CI
# (Release, same warnings-as-errors gate). Faithful repro > speed.
- name: Build
run: dotnet build --no-restore -c Release -p:CIRun=true -warnaserror
# The mesh-local #r feed some dynamic-compilation tests resolve at runtime
# (version-less `#r "nuget:MeshWeaver.X"`), packed exactly as dotnet-test.yml does.
- name: Pack mesh-local #r packages
run: |
set -euo pipefail
dotnet pack src/MeshWeaver.BusinessRules/MeshWeaver.BusinessRules.csproj \
-c Release --no-build --no-restore -o dist/packages --nologo
dotnet pack src/MeshWeaver.BusinessRules.Generator/MeshWeaver.BusinessRules.Generator.csproj \
-c Release --no-build --no-restore -o dist/packages --nologo
# 🚨 DO NOT crank the CompileWatcher log levels here. Repro #3/#4 proved it's a HEISENBUG:
# Debug logging in the watcher hot path changes the scheduling enough to MASK the race
# (runs #1/#2 caught it at iter 19/4 at the committed Warning level; runs #3/#4 with the
# categories at Debug missed it across 40+80 iters). So we keep production-like timing and
# rely on the timing-NEUTRAL stuck-state diagnostic in WaitForLatestRelease (it runs only
# on the 50s timeout, never during the race) to pin the stalled stage.
- name: Loop the suspect test until it fails
run: |
set -uo pipefail
iters="$REPRO_ITERS"
proj="$REPRO_PROJECT"
filt="$REPRO_FILTER"
echo "Looping '$filt' in $proj up to $iters× on a 2-vCPU runner (nproc=$(nproc))."
failed=0
for i in $(seq 1 "$iters"); do
echo "::group::iteration $i / $iters"
# --no-build --no-restore: run the Release binaries built above, exactly
# like the CI shards. blame-hang-* captures a mini-dump if the host wedges
# (the "unresponsive after the second compile" symptom) instead of just
# timing out opaquely.
dotnet test "$proj" -c Release --no-build --no-restore \
--filter "$filt" -l:trx \
--blame-hang-timeout 120s --blame-hang-dump-type mini
rc=$?
echo "::endgroup::"
if [ "$rc" -ne 0 ]; then
echo "::error::REPRO on iteration $i (exit $rc) — diagnostics uploaded below."
failed=1
break
fi
echo "iteration $i passed"
done
if [ "$failed" = "0" ]; then
echo "No repro in $iters iterations (the flake did not surface this run)."
fi
# Exit non-zero on repro so the run is CLEARLY marked failed in the Actions
# list (repro achieved = red). The diagnostics-collection + upload steps
# below run regardless via `if: always()`.
exit "$failed"
- name: Collect diagnostics
if: always()
run: |
mkdir -p repro-diagnostics
# trx (per-iteration; the last one is the failing run since we break on fail).
find "$REPRO_PROJECT" -name '*.trx' -exec cp {} repro-diagnostics/ \; 2>/dev/null || true
# blame hang mini-dump (written when the test host wedged, not just timed out).
find "$REPRO_PROJECT" -name 'blame-*.dmp' -exec cp {} repro-diagnostics/ \; 2>/dev/null || true
# MonolithMeshTestBase phase/dispose/memory traces (the watcher + INIT/DISPOSE
# trace that pinpoints the wedge), written to the process tempdir.
for f in meshweaver-test-trace meshweaver-dispose-trace meshweaver-memory-delta; do
[ -f "/tmp/$f.log" ] && cp "/tmp/$f.log" "repro-diagnostics/_$f.log" || true
done
# XUnitFileOutputHelper writes per-test/background-hub logs under bin/.../test-logs
# (the cranked watcher trace lands here when no test is the active ITestOutputHelper).
# Search repo-wide — background-hub logs can land outside the suspect project dir.
find . -path '*/test-logs/*.log' -exec cp {} repro-diagnostics/ \; 2>/dev/null || true
echo "Collected:"; ls -lh repro-diagnostics/ || true
- name: Upload repro diagnostics
if: always()
uses: actions/upload-artifact@v6
with:
name: flake-repro-diagnostics
path: repro-diagnostics/
retention-days: 15
compression-level: 9
106 changes: 96 additions & 10 deletions test/MeshWeaver.Hosting.Monolith.Test/CodeEditRecompileTest.cs
Original file line number Diff line number Diff line change
Expand Up @@ -238,7 +238,11 @@ await NodeFactory.CreateNode(new MeshNode("instance2", $"{TestPartition}/CodeEdi
/// (<c>EnrichWithNodeType slow path faulted</c>, <c>SubscribeRequest</c>
/// timeouts → compilation-error overlay → MARKER_V1 never rendered).
/// </summary>
[Fact(Timeout = 60000)]
// 120s (not the usual 60s): the happy path completes in seconds; the budget is for the
// FAILURE path — WaitForLatestRelease's 50s primary wait plus its discriminating timeout
// diagnostic (mirror/index probes + decisive re-trigger, worst case ~50s more) must fit
// inside the xUnit method timeout or the diagnostic is cancelled before it can be emitted.
[Fact(Timeout = 120000)]
public async Task NodeType_RequestedReleasePath_PinsToHistoricalRelease()
{
var workspace = Mesh.GetWorkspace();
Expand Down Expand Up @@ -393,7 +397,11 @@ await NodeFactory.CreateNode(new MeshNode("unpinnedInstance", $"{TestPartition}/
/// <c>NodeTypeLayoutAreas.BuildCompileStatusPanel</c>) binds to this
/// state machine, so this is the test that gates the UI.
/// </summary>
[Fact(Timeout = 60000)]
// 120s (not the usual 60s): the happy path completes in seconds; the budget is for the
// FAILURE path — WaitForLatestRelease's 50s primary wait plus its discriminating timeout
// diagnostic (mirror/index probes + decisive re-trigger, worst case ~50s more) must fit
// inside the xUnit method timeout or the diagnostic is cancelled before it can be emitted.
[Fact(Timeout = 120000)]
public async Task IsDirty_FlipsTrueOnSourceEdit_FalseAfterCompile()
{
var workspace = Mesh.GetWorkspace();
Expand Down Expand Up @@ -550,7 +558,11 @@ await Mesh.GetWorkspace().GetMeshNodeStream(typePath)
/// — same code path the button executes — and asserts both the property
/// flip and the resulting release.
/// </summary>
[Fact(Timeout = 60000)]
// 120s (not the usual 60s): the happy path completes in seconds; the budget is for the
// FAILURE path — WaitForLatestRelease's 50s primary wait plus its discriminating timeout
// diagnostic (mirror/index probes + decisive re-trigger, worst case ~50s more) must fit
// inside the xUnit method timeout or the diagnostic is cancelled before it can be emitted.
[Fact(Timeout = 120000)]
public async Task PressingCompileButton_SetsRequestedReleaseAt_AndProducesNewRelease()
{
var workspace = Mesh.GetWorkspace();
Expand Down Expand Up @@ -663,13 +675,87 @@ await workspace.GetMeshNodeStream(nodeTypePath).Update(curr =>
private async Task<string> WaitForLatestRelease(string nodeTypePath, string? knownRelease)
{
var workspace = Mesh.GetWorkspace();
var node = await workspace.GetMeshNodeStream(nodeTypePath)
.Should().Within(50.Seconds())
.Match(n => n?.Content is NodeTypeDefinition def
&& def.CompilationStatus == CompilationStatus.Ok
&& !string.IsNullOrEmpty(def.LatestReleasePath)
&& def.LatestReleasePath != knownRelease);
return ((NodeTypeDefinition)node.Content!).LatestReleasePath!;
try
{
var node = await workspace.GetMeshNodeStream(nodeTypePath)
.Should().Within(50.Seconds())
.Match(n => n?.Content is NodeTypeDefinition def
&& def.CompilationStatus == CompilationStatus.Ok
&& !string.IsNullOrEmpty(def.LatestReleasePath)
&& def.LatestReleasePath != knownRelease);
return ((NodeTypeDefinition)node.Content!).LatestReleasePath!;
}
catch (Exception ex)
{
// Diagnostic: on timeout, discriminate OWNER-side (never produced v2) from a
// DELIVERY / mirror-staleness race (owner produced v2 but the cross-hub mirror the test
// reads never received it). Compare two INDEPENDENT views of the SAME node:
// MIRROR — the single-node cross-hub cache handle the test/watcher read.
// INDEX — the mesh query (a DIFFERENT path: the owner's persisted+indexed state),
// plus the Release children. If INDEX shows a fresh v2 / Handled while MIRROR
// is stale at v1 / Handled=null → delivery race confirmed (owner did the work).
// If BOTH are stale → owner-side (the watcher/compile never produced v2).
var cur = await workspace.GetMeshNodeStream(nodeTypePath)
.Where(n => n is not null).Take(1).Timeout(5.Seconds());
var m = cur?.Content as NodeTypeDefinition;

var meshService = Mesh.ServiceProvider.GetService<IMeshService>();
string indexNode = "(no IMeshService)", releases = "(no IMeshService)";
if (meshService is not null)
{
try
{
// Best-effort: take the FIRST emission bounded to 5s — waiting for a
// non-empty snapshot could stall the whole bound when the node is genuinely
// absent from the index (itself a diagnostic result).
var idx = await meshService.Query<MeshNode>(MeshQueryRequest.FromQuery($"path:{nodeTypePath}"))
.Take(1).Timeout(5.Seconds());
var id = idx.Items.FirstOrDefault()?.Content as NodeTypeDefinition;
indexNode = id is null ? "(not in index)"
: $"Status={id.CompilationStatus}, Latest={id.LatestReleasePath ?? "(null)"}, " +
$"Handled={id.LastReleaseRequestHandledAt:O}";
}
catch (Exception qx) { indexNode = $"(query failed: {qx.GetType().Name})"; }
try
{
var rel = await meshService.Query<MeshNode>(
MeshQueryRequest.FromQuery($"path:{nodeTypePath}/Release scope:descendants"))
.Take(1).Timeout(5.Seconds());
releases = rel.Items.Count == 0 ? "(none)" : string.Join(", ", rel.Items.Select(r => r.Path));
}
catch (Exception qx) { releases = $"(query failed: {qx.GetType().Name})"; }
}

// (a) vs (b): re-trigger with a FRESH timestamp. If the watcher is alive and merely MISSED
// the first settled-trigger emission, this second trigger produces v2 → "RECOVERED" (sub-case a:
// a one-time missed/lost emission). If it stays stuck, the watcher/state is persistently
// clobbered or the subscription is dead (sub-case b). Decisive + repro-cheap.
string retrigger;
try
{
var t2 = DateTimeOffset.UtcNow.AddMilliseconds(1);
await workspace.GetMeshNodeStream(nodeTypePath).Update(curr =>
curr?.Content is NodeTypeDefinition cd
? curr with { Content = cd with { RequestedReleaseAt = t2, RequestedReleaseForce = true } }
: curr!).Should().Within(10.Seconds()).Emit();
var v2 = await workspace.GetMeshNodeStream(nodeTypePath)
.Where(n => n?.Content is NodeTypeDefinition d2
&& d2.CompilationStatus == CompilationStatus.Ok
&& !string.IsNullOrEmpty(d2.LatestReleasePath)
&& d2.LatestReleasePath != knownRelease)
.Take(1).Timeout(25.Seconds());
retrigger = $"RECOVERED via re-trigger → v2={((NodeTypeDefinition)v2.Content!).LatestReleasePath} (sub-case a: first emission missed)";
}
catch (Exception rx) { retrigger = $"STILL STUCK after re-trigger ({rx.GetType().Name}) (sub-case b: persistent clobber / dead subscription)"; }
Comment thread
rbuergi marked this conversation as resolved.

throw new Exception(
$"WaitForLatestRelease stuck for {nodeTypePath}: known={knownRelease ?? "(null)"}\n" +
$" MIRROR: Status={m?.CompilationStatus}, Latest={m?.LatestReleasePath ?? "(null)"}, " +
$"ReqAt={m?.RequestedReleaseAt:O}, Handled={m?.LastReleaseRequestHandledAt:O}, Force={m?.RequestedReleaseForce}\n" +
$" INDEX node: {indexNode}\n" +
$" INDEX Release children: [{releases}]\n" +
$" RE-TRIGGER: {retrigger}", ex);
}
}

private async Task<CreateReleaseResponse> SendCreateRelease(string nodeTypePath, bool force)
Expand Down
Loading