diff --git a/CHANGELOG.md b/CHANGELOG.md
index f07a67f7..e6ffa9ac 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,140 @@
 # Changelog
 
+All notable changes to `@tangle-network/agent-eval` and its sibling `agent-eval-rpc` (Python) are documented here. The format roughly follows [Keep a Changelog](https://keepachangelog.com/en/1.1.0/); versions are locked across the npm + PyPI packages.
+
+---
+
+## [0.50.1] — 2026-05-27 — docs + examples
+
+### Added
+
+- `README.md` rewritten as a top-tier OSS landing page: table of contents, decision-packet output sample (annotated JSON), comparison matrix vs LangSmith / Braintrust / Phoenix, three customer journey cards.
+- `examples/selfimprove-quickstart/` — minimal closed-loop example with annotated stdout.
+- `examples/customer-feedback-loop/` — Customer A journey: multi-rater approve/reject corpus → `fromFeedbackTable` → `analyzeRuns`.
+- `examples/customer-otel-traces/` — Customer B journey: OTel spans → `fromOtelSpans` → `analyzeRuns`.
+- `docs/insight-report.md` — annotated walkthrough of every section of the decision packet.
+- `docs/customer-journeys.md` — three end-to-end journeys with code + expected output.
+
+### Changed
+
+- `docs/concepts.md` — updated mental model for the three top-level entries (`selfImprove`, `analyzeRuns`, intake adapters) and the layering rule.
+
+### Notes
+
+Docs-only patch. No code changes, no behavior changes, no API surface changes vs 0.50.0.
+
+---
+
+## [0.50.0] — 2026-05-27 — the decision packet
+
+### Added
+
+- **`analyzeRuns({ runs, ... }): InsightReport`** in `/contract`. Composes the substrate's statistical / calibration / clustering / Pareto primitives into one rigor packet. Sections populate based on what the input supports: distributional summary always, lift when baseline+candidate are present, judges when run records carry `judgeScores`, inter-rater agreement when `raterScores` are supplied, failure clusters when an `AnalystRegistry` is wired, contamination when canaries are passed, outcome correlation when a downstream signal is supplied.
+- **`InsightReport`** canonical decision-packet shape; reused by `selfImprove()` and emitted on the hosted wire as `EvalRunEvent.insightReport?`.
+- **Intake adapters** in `/contract`:
+  - `fromFeedbackTable({ ratings })` — multi-rater corpus → `RunRecord[] + raterScores`.
+  - `fromOtelSpans({ spans })` — OpenTelemetry spans → `RunRecord[]`, grouped by `tangle.runId` or `traceId`.
+- **`SelfImproveResult.insight: InsightReport`** — `selfImprove()` now returns the full decision packet alongside the existing ship/hold verdict.
+
+### Changed
+
+- `selfImprove()` internally calls `analyzeRuns()` on baseline + winner cells; consumers reading `.lift` continue to work unchanged, while `.insight.lift` now carries CI95 + p-value + Cohen's d + MDE + required-n.
+
+### Test coverage
+
+1427 / 1427 passing; 11 new integration tests covering lift detection paths, outcome correlation + linear reward model, canary contamination, multi-rater journey end-to-end, OTel journey end-to-end, recommendations shape, JSON-serialisability.
+
+---
+
+## [0.49.0] — 2026-05-27 — audit-fix sweep
+
+### Added
+
+- `src/adapters/otel.ts` — generic OTel→hosted bridge (`createOtelBridge` / `OtelBridge` / `OtelBridgeOptions`). Stringifies array-valued attributes instead of dropping them.
+- `src/contract/diff.ts` — `keyForCell` uses `JSON.stringify([scenarioId, rep])` (no separator collisions); `Number.isFinite` coercion on dimension deltas (no NaN propagating to dashboards).
+- `examples/hosted-ingest-server/server.ts` — `REFERENCE_RECEIVER_START=1|0` env var as the primary start signal; idempotency cache prunes on read with the wire-spec 24h TTL.
+
+### Changed
+
+- Python `TraceSpanEventOuter` exposes `tangle.*` pivots via field aliases (`tangle_run_id`, etc.) and round-trips through `model_dump(by_alias=True)`.
+- Python `_WireModel` emits a `UserWarning` when an extra field is the snake_case shadow of a declared camelCase field (cross-language drift guard).
+
+### Removed
+
+- `src/adapters/traceai.ts` — replaced by `src/adapters/otel.ts`. No back-compat shim.
+
+---
+
+## [0.48.0] — 2026-05-27 — substrate↔runtime layering fix + diffRuns + Python hosted parity
+
+### Added
+
+- `src/verdict.ts` — `DefaultVerdict` substrate primitive (moved DOWN from agent-runtime).
+- `src/contract/diff.ts` — `diffRuns` / `diffGenerations` / `diffRunBaselineToWinner` for v3-vs-v4 dashboard rendering, CI reporting, and any consumer comparing improvement-loop output.
+- `src/adapters/traceai.ts` — OTel→hosted bridge (renamed to `otel.ts` in 0.49.0).
+- `tests/hosted-roundtrip.test.ts` — proves wire-format binary compat between client and reference receiver.
+- Python `HostedClient` (`clients/python/src/agent_eval_rpc/hosted.py`) — TS↔Python wire-format parity with bearer auth, idempotency, and exponential backoff on 5xx/408/429.
+- `CLAUDE.md` repo-layering rule: agent-eval is the substrate; agent-runtime + agent-knowledge depend on it; the reverse is forbidden.
+
+### Changed
+
+- `src/campaign/gates/default-production-gate.ts` — `RunRecord` import from local `../../run-record` (was reaching up into agent-runtime).
+- `src/matrix/types.ts` — `DefaultVerdict` import from `../verdict` (was reaching up into agent-runtime).
+
+### Removed
+
+- `@tangle-network/agent-runtime` from `peerDependencies`, `devDependencies`, and `pnpm.minimumReleaseAgeExclude` (no upward deps from substrate).
+
+---
+
+## [0.47.0] — 2026-05-26 — Phase D hosted-tier substrate
+
+### Added
+
+- `src/hosted/` — wire-format types frozen at `HOSTED_WIRE_VERSION = '2026-05-26.v1'`, `createHostedClient` with bearer auth + idempotency + bounded retries.
+- `examples/hosted-ingest-server/` — reference receiver implementing the spec.
+- `docs/hosted-ingest-spec.md` — semver-locked wire spec.
+- `selfImprove({ hostedTenant })` — opt-in hosted ingest; failures logged, never fail the loop.
+
+---
+
+## [0.46.0] — `selfImprove()` LAND-tier helper
+
+`selfImprove({ scenarios, dispatch, judges, baselineSurface })` shipped in `/contract` as the one-shot wrapper around `runImprovementLoop`.
+
+---
+
+## [0.45.0] — distributed campaigns
+
+`/adapters/http` with `httpDispatch` + `runDispatchServer`; `cellPlacement` on `RunCampaignOptions` for cross-region fan-out.
+
+---
+
+## [0.44.0] — `/adapters/langchain`
+
+LangChain runnable → `Dispatch` adapter.
+
+---
+
+## [0.43.0] — edge-friendly storage
+
+`inMemoryCampaignStorage()` for Cloudflare Workers / edge / test environments.
+
+---
+
+## [0.42.0] — GEPA driver + legacy deletion
+
+### Added
+
+- `gepaDriver` reflective LLM mutation driver.
+- `campaignToRunRecords` adapter.
+
+### Removed
+
+- `runMultiShotOptimization` (top-level trajectory-optimizer) — replaced by `runImprovementLoop` + `gepaDriver` composition. The `/multishot` subpath (N-shot persona matrix) is unrelated and remains.
+
+---
+
 ## 0.34.0 — 2026-05-23
 
 ### Eval evolution-tracking — first-class `AgentProfile` + per-cell scorecard
diff --git a/README.md b/README.md
index f703a22c..cdaf106b 100644
--- a/README.md
+++ b/README.md
@@ -1,400 +1,304 @@
-# @tangle-network/agent-eval
-
-**Substrate for self-improving agents.** Trace what runs, verify the result,
-turn outcomes into preferences and rewards, mutate prompts and policies under
-anytime-valid evidence, and ship only when the improvement is decisive.
-
-```txt
-real product task
-  -> observe / act (your runtime)
-  -> trace + verifier pipeline (capture integrity)
-  -> RunRecord (canonical eval artifact)
-       -> judge calibration · paired stats · sequential α
-       -> preferences · verifiable rewards · process rewards
-       -> GEPA / reflective mutation · auto-research · active curriculum
-       -> release gate · replay · contamination probe · tournament rating
-  -> next iteration
-```
+# `@tangle-network/agent-eval`
 
-`agent-eval` does **not** own product state, credentials, UI, storage, model
-routing, browser drivers, sandbox policy, or deployment. Products own those.
-This package owns the loop that closes evaluation → preference → mutation →
-redeploy, with capture integrity and statistically rigorous evidence at every
-step.
+**Ship better agent prompts with statistical confidence.** One function call returns a decision packet: lift CI, judge calibration, contamination check, failure clusters, cost-quality Pareto, and a ranked action list. Same shape whether you've got a closed improvement loop or just production logs.
 
-It ships as a TypeScript library (npm) with a generated Python client (PyPI),
-both speaking the same wire protocol. MIT, self-hostable, no SaaS dependency.
+[![npm](https://img.shields.io/npm/v/@tangle-network/agent-eval.svg)](https://www.npmjs.com/package/@tangle-network/agent-eval)
+[![pypi](https://img.shields.io/pypi/v/agent-eval-rpc.svg)](https://pypi.org/project/agent-eval-rpc/)
+[![tests](https://github.com/tangle-network/agent-eval/actions/workflows/ci.yml/badge.svg)](https://github.com/tangle-network/agent-eval/actions/workflows/ci.yml)
+[![license: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](./LICENSE)
 
-## Install
+> TypeScript is first-class; the Python client (`agent-eval-rpc`) speaks the same wire protocol. Hosted-tier-friendly, MIT, self-hostable, no SaaS dependency.
 
-```sh
-pnpm add @tangle-network/agent-eval
-# or, from Python:
-pip install agent-eval-rpc
-```
+---
 
-## Quick Start — the control loop
+## Table of contents
 
-```ts
-import {
-  objectiveEval,
-  runAgentControlLoop,
-} from '@tangle-network/agent-eval/control'
+- [What you get back](#what-you-get-back-the-decision-packet)
+- [Quick start](#quick-start)
+  - [Closed loop — `selfImprove()`](#closed-loop--selfimprove)
+  - [Observed runs — `analyzeRuns()`](#observed-runs--analyzeruns)
+  - [Existing data — intake adapters](#existing-data--intake-adapters)
+- [How it compares](#how-it-compares)
+- [Customer journeys](#customer-journeys)
+- [Subpath entry points](#subpath-entry-points)
+- [Concepts + design](#concepts--design)
+- [Hosted tier](#hosted-tier)
+- [Install + run](#install--run)
+- [Stability + versioning](#stability--versioning)
+- [License](#license)
 
-const result = await runAgentControlLoop({
-  intent: task.prompt,
-  budget: { maxSteps: 8, maxWallMs: 180_000, maxCostUsd: 2 },
+---
 
-  observe() {
-    return product.readState(task.id)
-  },
+## What you get back: the decision packet
 
-  validate({ state }) {
-    return [
-      objectiveEval({
-        id: 'build-passes',
-        passed: state.build.exitCode === 0,
-        severity: 'critical',
-        metadata: state.build,
-      }),
-      objectiveEval({
-        id: 'preview-serves',
-        passed: state.preview.httpStatus === 200,
-        severity: 'critical',
-      }),
+Whether you call `selfImprove()` (closed loop) or `analyzeRuns()` (observed runs), the report has the same shape. Here's a real one, abridged:
+
+```jsonc
+{
+  "n": 80,                                            // runs analyzed
+  "composite": {                                       // distributional summary
+    "mean": 0.62, "p50": 0.65, "p95": 0.88, "stddev": 0.17,
+    "histogram": [/* 12 bins */]
+  },
+  "lift": {                                            // paired bootstrap
+    "baselineMean": 0.58, "candidateMean": 0.65,
+    "delta": 0.07,
+    "ci95": [0.04, 0.10],                              // 95% CI on the delta
+    "pValue": 0.0008,                                  // paired-t
+    "cohensD": 0.41,
+    "n": 40,
+    "mde": 0.06,                                       // min detectable effect at 80% power
+    "requiredN": 38                                    // n needed to detect observed delta
+  },
+  "judges": {                                          // per-judge calibration
+    "domain-expert": { "n": 80, "meanScore": 0.64 },
+    "helpfulness-llm": { "n": 80, "meanScore": 0.61 }
+  },
+  "interRater": {                                      // multi-rater agreement
+    "raters": 3, "jointlyRated": 80, "kappa": 0.71,
+    "disagreementCases": [/* top 20 ranked by spread */]
+  },
+  "costQuality": {                                     // cost-vs-quality
+    "cost": { "mean": 0.024, "p95": 0.041, /* ... */ },
+    "pareto": { /* ParetoFigureSpec the dashboard renders */ }
+  },
+  "failureClusters": {                                 // when an AnalystRegistry is wired
+    "totalFailures": 11,
+    "clusters": [
+      { "name": "off-topic-drift",  "share": 0.45, "exemplars": ["run-12", "run-19"] },
+      { "name": "over-confidence",  "share": 0.27, "exemplars": ["run-3"] },
+      { "name": "format-mismatch",  "share": 0.18, "exemplars": ["run-41"] }
     ]
   },
-
-  decide({ evals }) {
-    const failed = evals.filter((e) => !e.passed)
-    if (failed.length === 0) {
-      return { type: 'stop', pass: true, reason: 'all gates passed' }
-    }
-    return {
-      type: 'continue',
-      action: { type: 'repair', failed: failed.map((e) => e.id) },
-      reason: 'repair failed gates',
-    }
+  "contamination": { "leaks": 0, "holdoutAuditPassed": true },
+  "outcomeCorrelation": {                              // when downstream metric supplied
+    "metric": "engagement_rate", "n": 80,
+    "pearson": 0.72, "spearman": 0.69,
+    "rewardModel": { "intercept": 0.04, "slope": 1.93, "r2": 0.52 }
   },
-
-  act(action) {
-    return product.runAgentStep(task.id, action)
+  "release": {
+    "status": "pass",
+    "axes": [
+      { "name": "quality-lift",          "status": "pass" },
+      { "name": "contamination",         "status": "pass" },
+      { "name": "composite-distribution","status": "pass" }
+    ]
   },
-})
-
-await product.storeEvalResult(task.id, result)
+  "recommendations": [
+    { "priority": "critical", "kind": "ship",
+      "title": "Ship — lift 0.070 (95% CI 0.040..0.100)",
+      "detail": "Holdout lift exceeds threshold 0.02 with 95% bootstrap confidence (n=40, p=0.0008, d=0.41)." },
+    { "priority": "high", "kind": "investigate",
+      "title": "Top failure cluster: off-topic-drift (45% of failures)",
+      "detail": "11 runs failed. Drill into exemplars run-12 / run-19 to identify the pattern." }
+  ]
+}
 ```
 
-Same loop shape in production, replay, benchmark, and optimization. Swap the
-dependencies behind `observe()` and `act()`, never the eval contract.
+The `recommendations` array is the human-readable layer; everything above it is the evidence. Read the recommendations and act on them; the numbers are the proof.
 
-## Production loop — close the eval → prod → eval cycle
+---
 
-Static prompts decay. Yesterday's FTC rule flips today; yesterday's tool quirk
-becomes today's incident. The production agents that win are the ones that
-**continuously re-train against live failure modes**.
+## Quick start
 
-`runProductionLoop` is the orchestration layer that wires the existing eval
-substrate into a self-improvement cron:
+### Closed loop — `selfImprove()`
 
-```ts
-import {
-  runProductionLoop,
-  httpGithubClient,
-  FileSystemFeedbackTrajectoryStore,
-} from '@tangle-network/agent-eval'
-import { FileSystemTraceStore } from '@tangle-network/agent-eval/traces'
-
-const result = await runProductionLoop({
-  runId: `weekly-${new Date().toISOString().slice(0, 10)}`,
-  target: 'tax-agent',
-
-  // 1. Where production traces + feedback land. Wire the HTTP ingestion
-  //    endpoints (POST /v1/traces/ingest, POST /v1/feedback) from your
-  //    runtime; the same store reads them here.
-  traceStore: new FileSystemTraceStore({ dir: 'data/prod-traces' }),
-  feedbackStore: new FileSystemFeedbackTrajectoryStore({ dir: 'data/prod-feedback' }),
-
-  // 2. Cluster threshold: act on failure groups ≥ 20 runs or ≥ 5% of corpus.
-  cluster: { minClusterSize: 20, minSeverityRatio: 0.05, maxClustersPerCycle: 1 },
-
-  // 3. Evolve: seed = current prompt, gate against holdout scenarios.
-  evolve: {
-    baselinePrompt: currentSystemPrompt,
-    holdoutScenarios: productionShapeScenarios,
-    runner,                            // your agent driver
-    scorer,                            // calibrated judge or rubric
-    mutator,                           // GEPA-style or addendum-style mutator
-    gate: {
-      baselineKey: 'baseline',
-      minProductiveRuns: 5,
-      pairedDeltaThreshold: 0.03,      // require Nσ improvement on holdout
-      overfitGapThreshold: 0.10,
-    },
-  },
-
-  // 4. Ship: when the gate passes, open a PR with the new prompt.
-  ship: {
-    client: httpGithubClient({ token: process.env.GITHUB_TOKEN! }),
-    repo: { owner: 'tangle-network', name: 'tax-agent' },
-    branchPrefix: 'eval/auto-improve',
-    promptFilePath: 'prompts/tax-agent-system.txt',
-    reviewers: ['drew'],
-  },
+You have scenarios, a dispatch, judges, and want the loop to propose better prompts + tell you which to ship.
 
-  cron: { cadence: 'weekly' },         // surface-only; consumer schedules
+```ts
+import { selfImprove } from '@tangle-network/agent-eval/contract'
+
+const result = await selfImprove({
+  scenarios,                                // your scenario corpus
+  dispatch: async ({ scenario }) =>          // your agent — anything that returns an artifact
+    await myAgent.run(scenario),
+  judges: [myJudge],                         // any JudgeConfig — LLM, rule, ensemble
+  baselineSurface: { systemPrompt: currentPrompt },
 })
 
-console.log(result.decision)            // 'pr_opened' | 'gate_failed' | 'no_actionable_failures' | ...
-console.log(result.pullRequest?.prUrl)  // populated when a PR was opened
+result.gateDecision         // 'ship' | 'hold' | 'need_more_work' | ...
+result.lift                 // raw delta on holdout
+result.insight              // the full decision packet above
 ```
 
-The primitive runs **one cycle**. Schedule it with `workflow_dispatch` + cron in
-GitHub Actions. It is **idempotent + replayable**: same `runId` → same plan.
-Gate failures are fail-closed — a candidate that beats baseline on search but
-overfits on holdout never lands.
+### Observed runs — `analyzeRuns()`
 
-Full runnable demo (synthetic traces, no credentials) in
-[`examples/production-loop`](./examples/production-loop/README.md).
+You don't have a closed loop yet — you have observed runs (production traces, an approve/reject corpus, a CSV gold set). Same report shape, no agent invocation.
 
-## Self-improvement loop
+```ts
+import { analyzeRuns } from '@tangle-network/agent-eval/contract'
+
+const report = await analyzeRuns({
+  runs,                                     // RunRecord[]
+  outcomeSignal: {                          // optional — closes the loop on real outcomes
+    metric: 'engagement_rate',
+    valueByRunId: enrichedFromProd,
+  },
+  canaryScenarios,                          // optional — contamination probe
+  analyst: myAnalystRegistry,               // optional — AI-powered failure clustering
+})
 
-Eval doesn't end at "pass/fail." Outcomes become training signal, mutation
-proposals, and curriculum updates — all from the same `RunRecord` produced by
-the control loop.
+report.recommendations    // ranked actions
+report.failureClusters    // grouped failure modes
+report.outcomeCorrelation // judge↔outcome correlation + linear reward model
+```
+
+### Existing data — intake adapters
+
+You have data already. Don't reshape it — pipe it through an adapter.
 
 ```ts
-import { runEvalCampaign } from '@tangle-network/agent-eval'
 import {
-  extractPreferences,
-  extractVerifiableReward,
-  filterDeterministicallyRewarded,
-  offPolicyEstimateAll,
-  analyzeOptimizationResult,
-} from '@tangle-network/agent-eval/rl'
-
-// 1. Run a matrix of variants × scenarios with capture integrity by construction.
-const campaign = await runEvalCampaign({ variants, scenarios, run })
-
-// 2. Convert outcomes into RL signal.
-const rewards = extractVerifiableReward(campaign.runs)          // compile/test/schema
-const prefs   = extractPreferences(campaign.runs)               // (chosen, rejected) triples
-const clean   = filterDeterministicallyRewarded(rewards)        // judge-noise free
-
-// 3. Estimate a candidate policy's value without re-running.
-const ope = offPolicyEstimateAll(campaign.runs, candidatePolicy)  // IPS + SNIPS + DR
-
-// 4. Or close the loop end-to-end: score → reflect → mutate → re-run.
-const next = await analyzeOptimizationResult(campaign, { researcher })
+  fromFeedbackTable,
+  fromOtelSpans,
+  analyzeRuns,
+} from '@tangle-network/agent-eval/contract'
+
+// Multi-rater approve/reject (Obsidian tags, Sheets, CSV, Postgres).
+const { runs, raterScores } = fromFeedbackTable({
+  ratings: parseYourFeedbackTable(),         // Array<{ runId, rater, rating }>
+})
+await analyzeRuns({ runs, raterScores })
+
+// Production OTel traces — group by tangle.runId or traceId.
+const runs2 = fromOtelSpans({ spans: yourOtelStream })
+await analyzeRuns({ runs: runs2 })
 ```
 
-| Step | Primitive | Subpath |
-| --- | --- | --- |
-| Eval matrix with integrity | `runEvalCampaign` | `/` |
-| Deterministic re-judge / audit | `ReplayCache`, `createReplayFetch` | `/` |
-| Anytime-valid α across rolling looks | `pairedEvalueSequence` | `/reporting` |
-| Judge quality vs gold | `calibrateJudge` (κ, Pearson, MAE, bias probes) | `/` |
-| Continuous inter-rater agreement | `calibrateJudgeContinuous`, `continuousAgreement` (κ_w, ICC(2,1), bootstrap CIs) | `/` |
-| (chosen, rejected) for DPO/KTO/PPO | `extractPreferences` | `/rl` |
-| Verifiable reward signal | `extractVerifiableReward` | `/rl` |
-| Step-level / PRM training data | `extractStepRewards`, `prmTrainingPairs` | `/rl` |
-| Estimate policy value off-policy | `offPolicyEstimateAll` (IPS + SNIPS + DR) | `/rl` |
-| GEPA / reflective prompt mutation | `buildReflectionPrompt`, `parseReflectionResponse`, Ax-GEPA `SteeringOptimizer` | `/` `/optimization` |
-| Auto-research (read runs → propose) | `analyzeOptimizationResult`, `PredictiveValidityResearcher` | `/rl` |
-| Active curriculum (variance / Thompson) | `allocateCurriculum` | `/rl` |
-| Tournament ratings (Bradley-Terry + Elo) | `fitBradleyTerry`, `applyEloUpdate` | `/rl` |
-| Adversarial scenario search | `adversarialScenarioSearch` | `/rl` |
-| Contamination probe (held-out perturb) | `runContaminationProbe` | `/rl` |
-| Reward hacking signatures | `detectRewardHacking` | `/rl` |
-| Compute curves (best-of-N, self-consist, Pareto) | `runComputeCurve`, `bestOfN`, `selfConsistency`, `paretoFrontier` | `/rl` |
-| Knowledge gap separated from reasoning gap | `scoreKnowledgeReadiness` | `/` |
-| Release gate (paired evidence + holdouts) | `evaluateReleaseConfidence`, `HeldOutGate` | `/reporting` |
-| Launch report (decision-grade) | `renderReleaseReport`, `researchReport` | `/reporting` |
-
-## Import Paths
-
-| Subpath | Use for |
-| --- | --- |
-| `@tangle-network/agent-eval/contract` | **LAND-tier surface** — `selfImprove`, `runCampaign`, `runImprovementLoop`, `runEval`, `Dispatch`, `Mutator`, `Gate`, `defaultProductionGate`, `gepaDriver`, `diffRuns`, storage backends. New code starts here. |
-| `@tangle-network/agent-eval/hosted` | **EXPAND-tier surface** — `createHostedClient`, wire-format types, `HOSTED_WIRE_VERSION`. Ships eval-run events + trace spans to any orchestrator that speaks the spec. |
-| `@tangle-network/agent-eval/adapters/otel` | OTel→hosted bridge — `createOtelBridge` forwards OTel-shape spans (TraceAI, OpenLLMetry, OTel SDK) into the hosted-tier ingest. |
-| `@tangle-network/agent-eval/adapters/langchain` | LangChain executor adapter — wrap a LangChain runnable as a `Dispatch`. |
-| `@tangle-network/agent-eval/adapters/http` | Distributed driver — `httpDispatch` + `runDispatchServer` for cross-machine campaigns. |
-| `@tangle-network/agent-eval/campaign` | Lower-level campaign primitives — `runCampaign`, driver implementations, storage. |
-| `@tangle-network/agent-eval/multishot` | Multi-shot optimization primitives. |
-| `@tangle-network/agent-eval/control` | `observe → validate → decide → act`, action policy, propose/review loops |
-| `@tangle-network/agent-eval/traces` | trace stores, emitters, TraceAnalyst, replay |
-| `@tangle-network/agent-eval/optimization` | feedback trajectories, multi-shot, prompt evolution, GEPA, EvalCampaign |
-| `@tangle-network/agent-eval/reporting` | release confidence, paired stats, sequential e-values, launch reports |
-| `@tangle-network/agent-eval/rl` | adapters, verifiable rewards, preferences, OPE, PRM, contamination, tournaments, adversarial, compute curves, auto-research |
-| `@tangle-network/agent-eval/wire` | HTTP/RPC server + schemas (same protocol the Python client speaks) |
-| `@tangle-network/agent-eval/benchmarks` | benchmark adapter contracts and reference wrappers |
-| `@tangle-network/agent-eval/matrix` | N-axis cartesian runner over substrate types — see [`src/matrix/`](./src/matrix/) |
-
-The root export remains available for convenience; new code should prefer
-focused subpaths. Anything under `/rl`, `/pipelines`, `/meta-eval`, `/prm`,
-or `/builder-eval` is only reachable via its subpath.
-
-## API stability
-
-Public exports are tagged with JSDoc stability markers so consumers can see
-status at the call site (IDE hover, language server, declaration files).
+Both intake adapters preserve every signal in the source — multi-rater scores stay rater-keyed so the report can compute inter-rater agreement and surface the disagreement triage list.
 
-| Tag | Meaning |
-| --- | --- |
-| `@stable` | API frozen at this major. Breaking changes require a major bump. |
-| `@experimental` | Interface may evolve before becoming `@stable`. Pin the patch version if you depend on it. |
-| `@internal` | Not part of the public contract. Use the documented subpath instead. |
+---
 
-The `/rl` subpath is the most active surface. See
-[`src/rl/index.ts`](./src/rl/index.ts) for the current stable/experimental
-breakdown.
+## How it compares
 
-## Capture integrity
+| | LangSmith | Braintrust | Phoenix | **agent-eval** |
+|---|:---:|:---:|:---:|:---:|
+| Closed-loop self-improvement | ✱ human-in-loop | ✱ experiment-driven | — | ✓ autonomous + gated |
+| Statistical lift CI (paired bootstrap) | — | partial | — | ✓ |
+| Judge calibration + bias detection | — | — | — | ✓ |
+| Inter-rater agreement + disagreement triage | — | — | — | ✓ |
+| Contamination / canary check | — | — | — | ✓ |
+| AI-driven failure clustering | partial | — | partial | ✓ |
+| Cost-quality Pareto | — | — | — | ✓ |
+| Multi-language clients (TS + Python) | TS + Py | TS + Py | TS + Py | ✓ TS + Py |
+| Self-hostable / no-SaaS option | — | — | OSS | ✓ MIT, OSS |
+| Substrate vs SaaS shape | SaaS | SaaS | OSS server | **library** |
+| Hosted tier (optional) | required | required | optional | optional |
 
-Launch-grade benchmark runs need four things that are easy to forget in glue
-code: (1) raw HTTP capture alongside the structured spans so a reviewer can
-verify which route answered, (2) a preflight assertion that the configured
-client points at the intended provider, (3) a run-end assertion that the
-expected events were actually written, and (4) auto-execution of the trace
-analyst as part of the run lifecycle.
+Position: agent-eval is the **substrate** (one library, decision-grade output); the others are SaaS *around* the substrate. If you want a closed loop that ships your prompt under statistical confidence, you call agent-eval. If you want a dashboard rendered from your data, you pipe agent-eval into the hosted tier or your own renderer.
 
-```ts
-import {
-  TraceEmitter, FileSystemRawProviderSink, callLlm, assertLlmRoute,
-  assertRunCaptured, throwIfRunIncomplete,
-} from '@tangle-network/agent-eval'
-import { traceAnalystOnRunComplete } from '@tangle-network/agent-eval/traces'
+---
 
-const sink = new FileSystemRawProviderSink({ dir: `${workDir}/raw-events` })
-assertLlmRoute(llmOpts, { requireExplicitBaseUrl: true, allowedBaseUrls, requireAuth: true })
+## Customer journeys
 
-const emitter = new TraceEmitter(store, {
-  onRunComplete: [traceAnalystOnRunComplete({ analyze: analystOpts, save })],
-})
-await emitter.startRun(/* ... */)
-// LLM calls flow through callLlm with `{ rawSink: sink, traceContext: { runId, spanId } }`.
-await emitter.endRun({ pass, score })
+Three runnable examples — each is self-contained, each shows the actual output.
 
-throwIfRunIncomplete(await assertRunCaptured(store, emitter.runId, {
-  llmSpansMin: 1, rawSink: sink, requireRawCoverageOfLlmSpans: true, requireOutcome: true,
-}))
-```
+| Journey | Example | Who it's for |
+|---|---|---|
+| **Closed loop** — improve a prompt under statistical confidence | [`examples/selfimprove-quickstart/`](./examples/selfimprove-quickstart/) | Teams with scenarios + judges + agent in hand |
+| **Multi-rater feedback corpus** — turn Obsidian/Sheets/CSV ratings into actionable insights | [`examples/customer-feedback-loop/`](./examples/customer-feedback-loop/) | Teams reviewing AI outputs by hand who want to compress that taste into per-member LLM judges + close the loop |
+| **Production OTel traces** — analyze logs you already have, no closed loop required | [`examples/customer-otel-traces/`](./examples/customer-otel-traces/) | Teams running agents in prod with observability, no eval discipline yet |
 
-Directives, rationale, and shipped-bug context are in
-[`SKILL.md` § Capture integrity](./.claude/skills/agent-eval/SKILL.md#capture-integrity-required-for-launch-grade-adoption).
-
-## Examples
-
-Each example has its own README with what it demonstrates, expected output,
-and runtime. See [`examples/`](./examples/).
-
-- [`examples/multi-shot-optimization`](./examples/multi-shot-optimization/README.md):
-  optimize full trajectories with held-out promotion.
-- [`examples/same-sandbox-harness`](./examples/same-sandbox-harness/README.md):
-  run setup/build/test and evidence checks in one workspace.
-- [`examples/benchmarks`](./examples/benchmarks/README.md):
-  benchmark adapter shape and reference wrappers.
-- [`examples/auto-research-with-agent-builder`](./examples/auto-research-with-agent-builder/README.md):
-  closed loop — score, reflect, mutate, re-score, repeat.
-- [`examples/fine-tune-with-prime-rl`](./examples/fine-tune-with-prime-rl/README.md):
-  RunRecord → preferences → trainer (prime-rl) → next campaign.
-- [`examples/production-loop`](./examples/production-loop/README.md):
-  ingest prod traces + feedback, cluster failures, evolve, gate, open a PR.
-
-## Matrix
-
-`@tangle-network/agent-eval/matrix` is an N-axis cartesian runner over the
-substrate types you already use — `AgentProfile` from
-`@tangle-network/sandbox`, `Driver` / `Validator` from
-`@tangle-network/agent-runtime`, rubric records, anything. It does not wrap
-substrate types; the caller passes them in axis values, the runner iterates
-the cartesian, and the aggregator returns per-axis pass / score / cost /
-duration summaries.
+Each example consists of a `README.md` and a single `index.ts` runnable via `pnpm tsx`; running it prints the resulting `InsightReport` to stdout.
 
-```ts
-import { runAgentMatrix } from '@tangle-network/agent-eval/matrix'
-
-const result = await runAgentMatrix({
-  axes: [
-    { name: 'scenario', values: scenarios.map((s) => ({ id: s.id, value: s })) },
-    { name: 'profile',  values: profiles.map((p)  => ({ id: p.name, value: p })) },
-    { name: 'thinking', values: [
-      { id: 'low', value: 'low' }, { id: 'high', value: 'high' },
-    ] },
-  ],
-  reps: 3,
-  maxConcurrency: 4,
-  costCeiling: 5.0,
-  filter: (cell) => !(cell.axes.scenario.value.hard === 5 && cell.axes.thinking.id === 'low'),
-  runCell: async (cell) => runScenario(cell.axes.scenario.value, cell.axes.profile.value),
-})
+---
 
-console.log(result.byAxis.profile)  // per-profile passRate / meanScore / p90 / cost
-```
+## Subpath entry points
 
-See [`src/matrix/`](./src/matrix/) for the full surface.
+| Subpath | What it gives you |
+|---|---|
+| `@tangle-network/agent-eval/contract` | **The headline surface.** `selfImprove`, `analyzeRuns`, `runImprovementLoop`, `runCampaign`, `runEval`, `diffRuns`, intake adapters (`fromFeedbackTable`, `fromOtelSpans`), drivers (`gepaDriver`, `evolutionaryDriver`), gates (`defaultProductionGate`, `heldOutGate`, `composeGate`), storage. **New code starts here.** |
+| `@tangle-network/agent-eval/hosted` | Hosted-tier wire-format types + `createHostedClient` to ship eval-run events + trace spans to any orchestrator speaking the spec |
+| `@tangle-network/agent-eval/adapters/otel` | `createOtelBridge` — forwards OpenTelemetry-shape spans into the hosted-tier ingest |
+| `@tangle-network/agent-eval/adapters/langchain` | LangChain runnable → `Dispatch` adapter |
+| `@tangle-network/agent-eval/adapters/http` | `httpDispatch` + `runDispatchServer` for distributed campaigns across machines |
+| `@tangle-network/agent-eval/campaign` | Lower-level campaign primitives (storage, drivers, types) |
+| `@tangle-network/agent-eval/multishot` | N-shot persona × shot matrix runner |
+| `@tangle-network/agent-eval/control` | Agent control loop primitives (`runAgentControlLoop`, action policy, propose/review) |
+| `@tangle-network/agent-eval/traces` | Trace stores, emitters, OTLP-JSONL replay |
+| `@tangle-network/agent-eval/reporting` | Release confidence, paired stats, sequential e-values, launch reports |
+| `@tangle-network/agent-eval/rl` | RL bridge — verifiable rewards, preferences, OPE, PRM, tournaments, contamination, compute curves, auto-research |
+| `@tangle-network/agent-eval/matrix` | N-axis cartesian over substrate types |
+| `@tangle-network/agent-eval/wire` | HTTP/RPC server + Zod schemas (same protocol the Python client speaks) |
+| `@tangle-network/agent-eval/benchmarks` | Benchmark adapter contracts and reference wrappers |
 
-## Docs
+The root export remains available for backward compatibility; new code should prefer focused subpaths. Anything under `/rl`, `/pipelines`, `/meta-eval`, `/prm`, or `/builder-eval` is **only** reachable via its subpath.
 
-Read in this order:
+---
 
-1. [Concepts](./docs/concepts.md) — mental model, 5 min
-2. [Product Eval Adoption](./docs/product-eval-adoption.md)
-3. [Control Runtime](./docs/control-runtime.md)
-4. [Feedback Trajectories](./docs/feedback-trajectories.md)
-5. [Multi-Shot Optimization](./docs/multi-shot-optimization.md)
-6. [Trace Analysis](./docs/trace-analysis.md)
-7. [Knowledge Readiness](./docs/knowledge-readiness.md)
-8. [Integration Launch Gates](./docs/integration-launch-gates.md)
-9. [Wire Protocol](./docs/wire-protocol.md) — required for non-TypeScript consumers
+## Concepts + design
 
-## CLI / Wire Protocol
+- [`docs/concepts.md`](./docs/concepts.md) — five types, three top-level functions, the layering rule, the wire protocol contract
+- [`docs/insight-report.md`](./docs/insight-report.md) — annotated walkthrough of every section of the decision packet
+- [`docs/customer-journeys.md`](./docs/customer-journeys.md) — three end-to-end journeys with code + expected output
+- [`docs/adapters-observability.md`](./docs/adapters-observability.md) — composing agent-eval with LangSmith, Langfuse, Phoenix, OpenLLMetry, TraceAI
+- [`docs/wire-protocol.md`](./docs/wire-protocol.md) — the HTTP/RPC contract Python (and any future language) speaks
+- [`docs/hosted-ingest-spec.md`](./docs/hosted-ingest-spec.md) — the hosted-tier wire format, frozen at `2026-05-26.v1`
+- [`docs/design/`](./docs/design/) — RFCs + architectural notes
 
-```sh
-npm i -g @tangle-network/agent-eval
-agent-eval serve --port 5005
+The `.claude/skills/agent-eval/SKILL.md` skill ships embedded directives so LLM agents writing integration code don't reintroduce historical bug classes.
+
+---
+
+## Hosted tier
+
+Wire your loop to a hosted orchestrator (ours, or your own implementation of the spec) with one config:
+
+```ts
+await selfImprove({
+  scenarios, dispatch, judges, baselineSurface,
+  hostedTenant: {
+    endpoint: 'https://intelligence.tangle.tools',
+    apiKey: process.env.TANGLE_API_KEY!,
+    tenantId: 'your-tenant',
+  },
+})
 ```
 
-Python:
+The substrate runs the loop in your process. Only the eval-run events + (optional) trace spans go to the orchestrator. Your scenarios, your judges, your raw data — never sent. Spec at [`docs/hosted-ingest-spec.md`](./docs/hosted-ingest-spec.md); reference receiver at [`examples/hosted-ingest-server/`](./examples/hosted-ingest-server/).
+
+---
+
+## Install + run
 
 ```sh
+pnpm add @tangle-network/agent-eval
+# or, from Python:
 pip install agent-eval-rpc
 ```
 
-```py
-from agent_eval_rpc import Client
-client = Client()  # auto-detects HTTP server, falls back to subprocess
-score = await client.judge(content=output, rubric_name="anti-slop")
-```
+Run an example:
 
-TypeScript is the source of truth. Python is a thin transport client over the
-generated OpenAPI schema. Schema drift is enforced impossible at release time
-(version-locked CI).
+```sh
+pnpm tsx examples/selfimprove-quickstart/index.ts
+pnpm tsx examples/customer-feedback-loop/index.ts
+pnpm tsx examples/customer-otel-traces/index.ts
+```
 
-## Development
+Run the test suite:
 
 ```sh
 pnpm install
-pnpm typecheck
+pnpm build
 pnpm test
-pnpm lint        # biome
-pnpm build       # tsup + openapi.json
 ```
 
-## Related Packages
+---
+
+## Stability + versioning
+
+Public exports carry JSDoc stability markers visible in IDE hover + `.d.ts`:
+
+| Tag | Meaning |
+|---|---|
+| `@stable` | API frozen at this major. Breaking changes require a major bump. |
+| `@experimental` | Interface may evolve before becoming `@stable`. Pin the patch version if you depend on it. |
+| `@internal` | Not part of the public contract. Use the documented subpath instead. |
 
-- [`@tangle-network/agent-runtime`](https://www.npmjs.com/package/@tangle-network/agent-runtime):
-  production session/runtime layer.
-- [`@tangle-network/agent-knowledge`](https://www.npmjs.com/package/@tangle-network/agent-knowledge):
-  source-grounded knowledge bases and readiness.
-- [`@tangle-network/agent-integrations`](https://www.npmjs.com/package/@tangle-network/agent-integrations):
-  connection, grant, capability, and integration invocation contracts.
+[`CHANGELOG.md`](./CHANGELOG.md) tracks every release with what's new / additive / breaking.
 
-Together: `agent-runtime` is where the agent runs; `agent-knowledge` is what
-it knows; `agent-integrations` is what it can do; `agent-eval` is how it gets
-better.
+---
 
 ## License
 
-MIT
+MIT. See [`LICENSE`](./LICENSE).
diff --git a/clients/python/pyproject.toml b/clients/python/pyproject.toml
index 175ea47b..b132b646 100644
--- a/clients/python/pyproject.toml
+++ b/clients/python/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "agent-eval-rpc"
-version = "0.50.0"
+version = "0.50.1"
 description = "Python RPC client for @tangle-network/agent-eval — judge content against rubrics over HTTP or stdio RPC. Eval logic runs in the Node runtime; this package is a thin wire client."
 readme = "README.md"
 requires-python = ">=3.10"
diff --git a/clients/python/src/agent_eval_rpc/__init__.py b/clients/python/src/agent_eval_rpc/__init__.py
index ca3bea39..b1bc1271 100644
--- a/clients/python/src/agent_eval_rpc/__init__.py
+++ b/clients/python/src/agent_eval_rpc/__init__.py
@@ -58,7 +58,7 @@
 try:
     __version__ = version("agent-eval-rpc")
 except PackageNotFoundError:
-    __version__ = "0.50.0"
+    __version__ = "0.50.1"
 
 __all__ = [
     "Client",
diff --git a/docs/concepts.md b/docs/concepts.md
index 8aafb503..80e2439a 100644
--- a/docs/concepts.md
+++ b/docs/concepts.md
@@ -9,6 +9,26 @@ connected, or the answer lacks required sources. The package gives products a
 shared way to record runs, check outcomes, classify failures, compare variants,
 and make release decisions.
 
+## The three top-level functions
+
+Everything funnels through `/contract`. Three entries, one shape coming back:
+
+| Function | When to call it | What you give it | What you get back |
+|---|---|---|---|
+| **`selfImprove()`** | You have a closed loop — scenarios, judge, agent in hand, and you want the substrate to propose better candidates + gate them. | scenarios, agent, judge, baseline surface | `SelfImproveResult.insight: InsightReport` + ship/hold verdict + winner surface |
+| **`analyzeRuns()`** | You have observed runs (production traces, an approve/reject corpus, a CSV gold set) and want the same rigor packet without invoking an agent. | `RunRecord[]` + optional flags | `InsightReport` |
+| **Intake adapters** (`fromFeedbackTable`, `fromOtelSpans`) | Your data isn't already in `RunRecord` shape — it's in Obsidian, Sheets, an OTel collector, etc. | source-specific input | `RunRecord[]` ready to pipe into `analyzeRuns()` |
+
+The three customer maturity stages — logs only → ratings → closed loop — map exactly to the three functions. See [`customer-journeys.md`](./customer-journeys.md) for the runnable walkthroughs.
+
+The shape of the answer — `InsightReport` — is identical across all three paths. Distributional summary, paired-bootstrap lift CI, judge stats, inter-rater agreement, cost-quality Pareto, failure clusters, contamination check, outcome correlation, release axes, and a ranked recommendations array. Walked through section-by-section in [`insight-report.md`](./insight-report.md).
+
+## The layering rule
+
+`agent-eval` is the **substrate** at the bottom of the Tangle agent stack. `agent-runtime` and `agent-knowledge` depend on it; `agent-eval` MUST NOT import from either. Primitives that "feel like" they belong in a consumer but are actually substrate-shaped (validator verdicts, run records, scenarios, judge scores) live here. Primitives that genuinely require a running agent loop (`ValidationCtx` with iteration + signal + traceEmitter, sandbox `AgentRunSpec`) stay in `agent-runtime`.
+
+The test: *does this concept make sense WITHOUT a running agent loop?* If yes, it's substrate. If no, it's runtime. The full rule is in [`/CLAUDE.md`](../CLAUDE.md#repo-layering--this-package-is-the-substrate).
+
 ## Main Objects
 
 | Thing | What it is | One-line example |
diff --git a/docs/customer-journeys.md b/docs/customer-journeys.md
new file mode 100644
index 00000000..6bfd8956
--- /dev/null
+++ b/docs/customer-journeys.md
@@ -0,0 +1,208 @@
+# Customer journeys
+
+Three end-to-end journeys covering the surface of `@tangle-network/agent-eval`. Each one is a runnable example under `examples/` — clone the repo and run `pnpm tsx examples/<journey>/index.ts` to see the actual output.
+
+The three journeys map to three customer-maturity stages:
+
+1. **Logs but no eval discipline** → [Production traces journey](#1-production-traces-journey-customer-otel-traces)
+2. **Ratings but no closed loop** → [Feedback corpus journey](#2-feedback-corpus-journey-customer-feedback-loop)
+3. **Scenarios, judge, agent — full closed loop** → [Closed-loop journey](#3-closed-loop-journey-selfimprove-quickstart)
+
+Each section: what the customer has, what they want, the code, what the report looks like.
+
+---
+
+## 1. Production traces journey — `customer-otel-traces`
+
+**The customer:** an agentic GTM-as-a-service company. Multiple agent steps in prod (social media posting, image generation, translation). OTel observability piped to their collector. Doesn't run formal evals. CTO hand-rolled their tracing.
+
+**The frustration:** "Which step is unreliable? What's our cost-quality profile? Where do we fix next?" They have the data; they don't have the answer.
+
+**What they need from agent-eval:** day-1 analysis of their existing logs. No scenarios, no judges, no closed loop. Just turn the trace stream into a decision packet.
+
+### The code
+
+```ts
+import { analyzeRuns, fromOtelSpans } from '@tangle-network/agent-eval/contract'
+
+const runs = fromOtelSpans({ spans: yourOtelStream })
+const report = await analyzeRuns({ runs })
+
+// report.failureClusters → root causes
+// report.costQuality.pareto → cost-vs-quality scatter
+// report.composite → distribution
+// report.recommendations → top-3 actions
+```
+
+### What the report shows
+
+```
+Runs analyzed:     40
+Composite mean:    0.721 (p50: 0.717, p95: 0.925, stddev: 0.210)
+Cost mean:         $0.103 (p95: $0.131)
+
+── Failures ──
+6 runs with status=ERROR or failureMode set:
+  tool.search  (3x)
+  agent.turn   (3x)
+
+── Cost-quality Pareto ──
+1 candidate(s) plotted; 1 on the frontier
+  otel-default: cost=$0.103 quality=0.721  (frontier)
+
+── Recommendations ──
+[medium] expand-corpus — Mean composite 0.721 has room
+```
+
+### Next steps for this customer
+
+1. Wire an `AnalystRegistry` to cluster the 6 failures by root cause via LLM analysis.
+2. Add `outcomeSignal` once they have downstream conversion / engagement / post-engagement data; the report then fits a reward model showing whether their score predicts the customer outcome.
+3. Once they identify a step worth optimizing (translation, say), graduate to journey #3 — wrap that step in a `Dispatch` and call `selfImprove()`.
+
+**Runnable:** [`examples/customer-otel-traces/`](../examples/customer-otel-traces/)
+
+---
+
+## 2. Feedback corpus journey — `customer-feedback-loop`
+
+**The customer:** a research-validation team. A GitHub Action fires `claude -p` against the next claim, writes the research output to Obsidian. Three reviewers (Alice, Bob, Carol) tag results `#approved` or `#rejected`. Outputs feed a knowledge base. Knowledge feeds content. Content feeds engagement. The founder wants more engagement faster.
+
+**The frustration:** "We disagree on what's good. We don't know if our 'good' actually drives engagement. Reviewing every claim is slow."
+
+**What they need from agent-eval:** turn the approve/reject corpus into actionable signal:
+- Where do reviewers disagree? (triage list)
+- Can we synthesize each reviewer's taste into an LLM judge? (auto-grade)
+- Does the taste actually predict downstream engagement? (close the loop)
+
+### The code
+
+```ts
+import { analyzeRuns, fromFeedbackTable } from '@tangle-network/agent-eval/contract'
+
+// 1. Parse Obsidian #approved / #rejected tags into a flat table:
+const ratings = parseObsidianVault('./research-vault')
+// [{ runId: 'claim-1', rater: 'alice', rating: true }, ...]
+
+// 2. Pipe through the adapter:
+const { runs, raterScores } = fromFeedbackTable({ ratings })
+
+// 3. Analyze:
+const report = await analyzeRuns({
+  runs,
+  raterScores,
+  // Optional: close the loop with engagement data once you have it.
+  outcomeSignal: { metric: 'engagement_rate', valueByRunId: enrichedFromProd },
+})
+
+// report.interRater.disagreementCases → top 20 claims worth a meeting
+// report.outcomeCorrelation → does team taste predict engagement?
+// report.recommendations → action list
+```
+
+### What the report shows
+
+```
+Runs analyzed:     30
+Composite mean:    0.756 (approve rate ~76%)
+
+── Inter-rater agreement ──
+Raters:               3 (alice, bob, carol)
+Jointly rated runs:   30
+Pairwise pearson κ:
+  alice::bob     0.53
+  alice::carol   0.55
+  bob::carol     0.21
+Mean κ:               0.43
+
+── Top 5 disagreement cases ──
+  claim-1   range=1.00  ratings: alice=0, bob=0, carol=1
+  claim-7   range=1.00  ratings: alice=0, bob=1, carol=0
+  ...
+
+── Recommendations ──
+[high] recalibrate — Inter-rater agreement κ=0.43 is below 0.5
+  Raters disagree on what 'good' looks like. Refine the rubric or triage the disagreement cases.
+```
+
+### Next steps for this customer
+
+1. **Triage meeting on the disagreement cases.** Mean κ=0.43 means the rubric is ambiguous; clarify it on the cases that split.
+2. **Calibrate one LLM judge per reviewer.** Each reviewer's history is the gold signal — use the substrate primitive `calibrateJudge` against `raterScores` filtered to that reviewer.
+3. **Add engagement as `outcomeSignal`** once the content downstream is instrumented. The `outcomeCorrelation` section tells the team whether their taste predicts the founder's engagement goal — and if not, the linear reward model says how to retarget.
+4. **Graduate to journey #3** — wrap the research-generation `claude -p` call as a `Dispatch`, use the calibrated judges, run `selfImprove()` nightly. Open a PR against the GitHub Action when the holdout approval rate beats baseline.
+
+**Runnable:** [`examples/customer-feedback-loop/`](../examples/customer-feedback-loop/)
+
+---
+
+## 3. Closed-loop journey — `selfimprove-quickstart`
+
+**The customer:** a team with a scenario corpus, a judge, and an agent. Wants to improve the prompt under statistical confidence — propose better candidates, gate on holdout lift, ship the winner.
+
+**The frustration:** "We can run an A/B by hand but we don't know if the improvement is real. We don't have time to run paired bootstrap by hand. We want a function that decides."
+
+**What they need from agent-eval:** the closed loop in one function — propose, score, gate, ship — with the full rigor packet on the way out.
+
+### The code
+
+```ts
+import { selfImprove } from '@tangle-network/agent-eval/contract'
+
+const result = await selfImprove({
+  scenarios,
+  agent: async (surface, scenario) =>
+    await myAgent.run({ systemPrompt: (surface as { systemPrompt: string }).systemPrompt, scenario }),
+  judge: {
+    name: 'rubric',
+    dimensions: [{ key: 'clarity', weight: 1 }, { key: 'concision', weight: 1 }],
+    score: async ({ artifact }) => myJudgeFn(artifact),
+  },
+  baselineSurface: { kind: 'prompt', systemPrompt: 'You write marketing copy...' },
+  budget: { generations: 3, populationSize: 2 },
+})
+
+result.gateDecision   // 'ship' | 'hold' | ...
+result.insight        // full decision packet
+```
+
+### What the report shows
+
+```
+═══ selfImprove() decision packet ═══
+
+Gate decision:        ship
+Raw lift:             +0.194
+
+── Statistical lift (paired bootstrap) ──
+delta:    +0.254
+CI95:     [0.254, 0.254]
+pValue:   1.0000
+Cohen's d: 0.00
+MDE @ 80% power: 2.802
+required n at observed effect: 244
+
+── Recommendations ──
+[critical] ship — Ship — lift 0.254 (95% CI 0.254..0.254)
+```
+
+### Next steps for this customer
+
+1. **Ship the winner.** Either accept `result.winner.surface` programmatically and roll it out, or pass `autoOnPromote: 'pr'` + a GitHub repo to have `selfImprove()` open a PR for you.
+2. **Wire `hostedTenant`** to ship the decision packet to a dashboard (the hosted Intelligence orchestrator, or your own implementation of the wire spec).
+3. **Add `canaryScenarios`** to guard against the holdout leaking into the candidate prompt.
+4. **Add `outcomeSignal`** in `analyzeRuns()` for any post-deploy reruns to verify the predicted lift actually shows up in real outcomes.
+
+**Runnable:** [`examples/selfimprove-quickstart/`](../examples/selfimprove-quickstart/)
+
+---
+
+## How the three journeys compose
+
+Journey #1 + #2 + #3 are **maturity stages**, not exclusive products. A team typically:
+
+1. Starts with **#1** (analyze production logs) to find what's broken.
+2. Adds **#2** (feedback corpus) once they have a sense of where to improve, to calibrate what "good" means.
+3. Graduates to **#3** (closed loop) once they have scenarios + judges, to automate the improvement.
+
+Same substrate, same `InsightReport` shape, no rip-and-replace between stages. The data you collect in #1 informs the scenarios you derive in #2 which feed the loop in #3.
diff --git a/docs/insight-report.md b/docs/insight-report.md
new file mode 100644
index 00000000..f57f86ea
--- /dev/null
+++ b/docs/insight-report.md
@@ -0,0 +1,337 @@
+# `InsightReport` — the decision packet
+
+The single shape every analysis call returns. `selfImprove()` embeds it in `SelfImproveResult.insight`; `analyzeRuns()` returns it directly. The hosted-tier wire format carries it on `EvalRunEvent.insightReport?`.
+
+Every section is **opt-in based on what your data supports** — the function never invents signal. If your runs don't carry judge scores, `judges` is empty. If there's no baseline/candidate split, `lift` is undefined. The shape is consistent; population is honest.
+
+This page walks through every section with a realistic (synthetic) example and explains how to act on it.
+
+---
+
+## At a glance
+
+```ts
+interface InsightReport {
+  n: number                              // runs analyzed
+  composite: ScalarDistribution          // always
+  perDimension: Record<string, ScalarDistribution>   // when judgeScores carry dimensions
+  costQuality: { cost: ScalarDistribution; pareto: ParetoFigureSpec }   // always
+  judges: Record<string, JudgeInsight>   // when runs carry judge scores
+  interRater?: InterRaterInsight         // when raterScores supplied
+  lift?: LiftInsight                     // when baseline + candidate present
+  failureClusters?: FailureClusterInsight    // when AnalystRegistry wired
+  contamination?: ContaminationInsight   // when canaryScenarios supplied
+  outcomeCorrelation?: OutcomeCorrelationInsight   // when outcomeSignal supplied
+  release: ReleaseSummary                // always
+  recommendations: Recommendation[]      // always — read this FIRST
+}
+```
+
+---
+
+## `n` + `composite` + `perDimension` — distributional summary
+
+Always present. The basic "where are my numbers" view.
+
+```jsonc
+{
+  "n": 30,
+  "composite": {
+    "n": 30,
+    "mean": 0.683, "p50": 0.667, "p95": 1.000, "stddev": 0.231,
+    "min": 0.0, "max": 1.0,
+    "histogram": [
+      { "lo": 0.0,  "hi": 0.083, "count": 5 },
+      { "lo": 0.083, "hi": 0.167, "count": 0 },
+      // ...12 bins by default
+    ]
+  },
+  "perDimension": {
+    "clarity":   { "mean": 0.72, "p50": 0.75, "p95": 0.95, "stddev": 0.18, /* ... */ },
+    "concision": { "mean": 0.65, "p50": 0.68, "p95": 0.88, "stddev": 0.21, /* ... */ }
+  }
+}
+```
+
+**Read first:** the `composite.mean`. If it's < 0.5, your agent has a ceiling problem, not a tuning problem.
+
+**Read next:** `perDimension`. If `clarity` is high but `concision` is low, your prompts get the right ideas in too many words — different fix than "wrong ideas."
+
+**Use the histogram for:** finding bimodal failure modes. Populated bins near 0 and near 1, with little in between, mean your agent has two distinct behaviors, not one noisy one.
+
+---
+
+## `costQuality` — cost-vs-quality Pareto
+
+Always present. `cost.histogram` is the per-run cost distribution; `pareto` is the substrate's `ParetoFigureSpec`.
+
+```jsonc
+{
+  "costQuality": {
+    "cost": {
+      "mean": 0.024, "p95": 0.041,
+      "histogram": [/* */]
+    },
+    "pareto": {
+      "kind": "pareto-cost-quality",
+      "split": "holdout",
+      "axes": { "x": "costUsd", "y": "score" },
+      "points": [
+        { "candidateId": "baseline", "cost": 0.018, "quality": 0.58, "n": 20, "onFrontier": true },
+        { "candidateId": "winner",   "cost": 0.027, "quality": 0.65, "n": 20, "onFrontier": true }
+      ]
+    }
+  }
+}
+```
+
+**Use this when:** comparing prompts, models, or candidate surfaces. The Pareto frontier is your menu of "best you can do at each cost level."
+
+**Render with:** any chart library — `points` is plain JSON. Hosted-tier dashboards render this as a scatter with the frontier highlighted.
+
+---
+
+## `judges` — per-judge mean
+
+Populated when run records carry `outcome.judgeScores`.
+
+```jsonc
+{
+  "judges": {
+    "domain-expert":   { "n": 30, "meanScore": 0.71 },
+    "helpfulness-llm": { "n": 30, "meanScore": 0.62 }
+  }
+}
+```
+
+The substrate's full judge-calibration suite (positional bias, self-preference, verbosity bias) lives in `/reporting` and operates on **paired-by-condition** inputs that `analyzeRuns` doesn't synthesize from raw `RunRecord[]`. Wire them yourself when you have the paired data; the report's `judges` map is the corpus-level slice.
+
+**Use this when:** comparing multiple judges over the same corpus. A big gap between two judges' means is the first signal that one of them is mis-calibrated.
+
+---
+
+## `interRater` — multi-rater agreement + disagreement triage
+
+Populated when `analyzeRuns({ raterScores })` is supplied — typically via `fromFeedbackTable()`.
+
+```jsonc
+{
+  "interRater": {
+    "raters": 3,
+    "jointlyRated": 30,
+    "kappa": 0.71,
+    "perPair": {
+      "alice::bob":   0.78,
+      "alice::carol": 0.65,
+      "bob::carol":   0.69
+    },
+    "disagreementCases": [
+      { "runId": "claim-7", "range": 1.00,
+        "ratings": [{"rater":"alice","score":1},{"rater":"bob","score":1},{"rater":"carol","score":0}] },
+      { "runId": "claim-13", "range": 1.00,
+        "ratings": [{"rater":"alice","score":0},{"rater":"bob","score":0},{"rater":"carol","score":1}] }
+      // ...top 20 by range
+    ]
+  }
+}
+```
+
+**Read first:** the mean `kappa`. A value below 0.5 means raters disagree on what "good" looks like — surface the disagreement cases at the next review meeting.
+
+**Use this when:** building per-rater LLM judges. Each rater's individual scores are the gold signal you calibrate against. Once a calibrated LLM agrees with the human ≥85% of the time, you can auto-grade and escalate only the disagreement cases.
+
+---
+
+## `lift` — paired-bootstrap statistical lift
+
+Populated when both a baseline and a candidate are present (auto-detected from two distinct `candidateId`s, or explicit via `baselineCandidateId` + `candidateCandidateId`).
+
+```jsonc
+{
+  "lift": {
+    "baselineMean": 0.58,
+    "candidateMean": 0.65,
+    "delta": 0.07,
+    "ci95": [0.04, 0.10],          // bootstrap CI on the delta
+    "pValue": 0.0008,              // paired t-test
+    "n": 40,                       // paired observations
+    "cohensD": 0.41,
+    "mde": 0.06,                   // min detectable effect at current n, 80% power
+    "requiredN": 38                // n needed for observed delta at 80% power
+  }
+}
+```
+
+**Decision rule:**
+- `ci95[0] > threshold` → **SHIP.** Lower bound above your delta threshold means the lift is real at 95% confidence.
+- `ci95[0] ≤ threshold < ci95[1]` → **INCONCLUSIVE.** Expand the corpus or wait for more data.
+- `ci95[1] ≤ threshold` → **HOLD.** No evidence the candidate is better.
+
+The `recommendations` array surfaces exactly this decision (`kind: 'ship' | 'hold' | 'expand-corpus'`) — that's what consumers should read.
+
+**Why bootstrap, not t-test alone:** paired bootstrap is distribution-free. Your judge scores are bounded in [0,1] and almost never normal; the bootstrap CI is the honest one.
+
+---
+
+## `failureClusters` — grouped failure modes
+
+Populated when an `AnalystRegistry` is passed via `analyzeRuns({ analyst })`. The substrate runs each failed run through the registered analysts and groups findings by `analyst_id` / `area`.
+
+```jsonc
+{
+  "failureClusters": {
+    "totalFailures": 11,
+    "clusters": [
+      { "id": "off-topic-drift", "name": "off-topic-drift",
+        "share": 0.45, "exemplars": ["run-12", "run-19", "run-33"] },
+      { "id": "over-confidence", "name": "over-confidence",
+        "share": 0.27, "exemplars": ["run-3", "run-21"] },
+      { "id": "format-mismatch", "name": "format-mismatch",
+        "share": 0.18, "exemplars": ["run-41", "run-44"] }
+    ]
+  }
+}
+```
+
+**Read first:** the top cluster's `share`. If one cluster is > 40% of failures, fix that pattern before doing anything else.
+
+**Use this when:** triaging a regression. Failure clusters tell you "fix this kind of thing first."
+
+**To wire it:** register analysts in `AnalystRegistry`. See `src/analyst/registry.ts` and `src/analyst/kinds.ts` for the four built-in kinds (`failure-mode`, `improvement`, `knowledge-gap`, `knowledge-poisoning`).
+
+---
+
+## `contamination` — canary check
+
+Populated when canary scenarios are passed via `analyzeRuns({ canaryScenarios })`. Each canary carries a sentinel string the agent should never emit; the report counts leaks.
+
+```jsonc
+{
+  "contamination": {
+    "leaks": 0,
+    "holdoutAuditPassed": true,
+    "details": []
+  }
+}
+```
+
+When `leaks > 0`:
+
+```jsonc
+{
+  "contamination": {
+    "leaks": 2,
+    "holdoutAuditPassed": false,
+    "details": [
+      { "runId": "run-12", "canary": "xyz-secret-canary-123", "matched": "...the secret xyz-secret-canary-123 says..." }
+    ]
+  }
+}
+```
+
+**When this fails:** your holdout corpus has leaked into training context. The `lift` number is **unreliable**. Investigate before shipping anything.
+
+---
+
+## `outcomeCorrelation` — closing the loop on real outcomes
+
+Populated when `outcomeSignal: { metric, valueByRunId }` is supplied.
+
+```jsonc
+{
+  "outcomeCorrelation": {
+    "metric": "engagement_rate",
+    "n": 80,
+    "pearson": 0.72,           // linear correlation
+    "spearman": 0.69,          // rank correlation (robust to monotonic nonlinearity)
+    "rewardModel": {
+      "intercept": 0.04,
+      "slope": 1.93,
+      "r2": 0.52               // share of outcome variance the judge explains
+    }
+  }
+}
+```
+
+This is the layer that says **"does my judge's taste actually predict the metric the business cares about?"**
+
+**Read first:** `spearman`. If it's < 0.3 in absolute value, your judges are scoring something different from what wins downstream. Refit the judges (use the customer's downstream signal as gold) or change the rubric.
+
+**The reward model** is the simple linear `y = intercept + slope * composite`. Use it to:
+- Predict the engagement of a new run from its composite score alone.
+- Set a `composite` threshold for "must beat X to ship" based on the engagement equivalent.
+
+---
+
+## `release` — pass/warn/fail axes
+
+Always present. Roll-up across three axes — quality lift, contamination, composite distribution.
+
+```jsonc
+{
+  "release": {
+    "status": "pass",
+    "axes": [
+      { "name": "quality-lift", "status": "pass",
+        "detail": "delta=0.070, CI95=[0.040, 0.100], n=40" },
+      { "name": "contamination", "status": "pass",
+        "detail": "0 canary leak(s)" },
+      { "name": "composite-distribution", "status": "pass",
+        "detail": "mean=0.683, p50=0.667, p95=1.000 over n=30" }
+    ],
+    "issues": []
+  }
+}
+```
+
+Overall `status` is `fail` if any axis fails; `warn` if any axis warns; `pass` otherwise.
+
+**Use this when:** wiring agent-eval into CI. A `release.status === 'pass'` from `analyzeRuns` on the candidate vs baseline is your green-light gate.
+
+---
+
+## `recommendations` — the actionable layer
+
+Always present. Read this first.
+
+```jsonc
+{
+  "recommendations": [
+    { "priority": "critical", "kind": "ship",
+      "title": "Ship — lift 0.070 (95% CI 0.040..0.100)",
+      "detail": "Holdout lift exceeds threshold 0.02 with 95% bootstrap confidence (n=40, p=0.0008, d=0.41).",
+      "evidencePath": "lift" },
+    { "priority": "high", "kind": "investigate",
+      "title": "Top failure cluster: off-topic-drift (45% of failures)",
+      "detail": "11 runs failed. The largest cluster groups 3 exemplars under 'off-topic-drift'.",
+      "evidencePath": "failureClusters.clusters[0]" }
+  ]
+}
+```
+
+| `kind` | When emitted |
+|---|---|
+| `ship` | lift CI lower bound > threshold |
+| `hold` | lift CI upper bound ≤ threshold |
+| `expand-corpus` | lift CI straddles threshold — more data needed |
+| `fix` | canary contamination detected |
+| `recalibrate` | inter-rater κ < 0.5, OR outcome correlation < 0.3 |
+| `investigate` | top failure cluster accounts for a large share of failures |
+
+`evidencePath` points back into the report (`"lift"`, `"contamination"`, `"failureClusters.clusters[0]"`) so a UI can deep-link from each recommendation to its evidence.
+
+---
+
+## How `analyzeRuns` populates each section
+
+| Section | Required input |
+|---|---|
+| `composite`, `perDimension`, `costQuality`, `release`, `recommendations` | `runs` |
+| `judges` | `runs` with `outcome.judgeScores` |
+| `interRater` | `raterScores` (≥ 2 raters jointly rated some runs) |
+| `lift` | two distinct `candidateId`s in `runs` (or explicit baseline/candidate ids) |
+| `failureClusters` | `analyst` registry passed in |
+| `contamination` | `canaryScenarios` passed in |
+| `outcomeCorrelation` | `outcomeSignal` passed in |
+
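+All optional sections (`interRater`, `lift`, `failureClusters`, `contamination`, `outcomeCorrelation`) are `T | undefined`, never empty objects; `judges` is the exception — it is an empty map when no run carries judge scores. If a section is missing, your inputs didn't support it — the report is honest about that.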
diff --git a/examples/customer-feedback-loop/README.md b/examples/customer-feedback-loop/README.md
new file mode 100644
index 00000000..8c278b5e
--- /dev/null
+++ b/examples/customer-feedback-loop/README.md
@@ -0,0 +1,55 @@
+# Customer feedback loop — multi-rater approve/reject corpus → decision packet
+
+The journey for teams who already review AI outputs by hand: an Obsidian vault with `#approved` / `#rejected` tags, a Google Sheet of ratings, a Postgres feedback table. You have the corpus; you want to **compress that taste into LLM judges**, find where raters disagree, and (eventually) close the loop.
+
+```sh
+pnpm tsx examples/customer-feedback-loop/index.ts
+```
+
+## What this example does
+
+Synthesises a realistic 30-claim research corpus with three reviewers (Alice, Bob, Carol). Reviewers agree most of the time, but ~15% of claims are borderline and the reviewers tend to split on them (three raters cannot split evenly, so expect 2–1 votes). Then:
+
+1. Pipes the raw `(runId, rater, rating)` rows through `fromFeedbackTable()` to get `RunRecord[] + raterScores`.
+2. Calls `analyzeRuns({ runs, raterScores })`.
+3. Prints the decision packet — distributional summary, inter-rater agreement, the disagreement triage list, and the recommendations.
+
+## What you'll see
+
+```
+═══ Customer feedback corpus — decision packet ═══
+
+Runs analyzed:     30
+Composite mean:    0.683 (p50: 0.667, p95: 1.000)
+Approve rate:      ~68%
+
+── Inter-rater agreement ──
+Raters:               3 (alice, bob, carol)
+Jointly rated runs:   30
+Pairwise pearson κ:
+  alice::bob:   0.78
+  alice::carol: 0.65
+  bob::carol:   0.69
+Mean κ:               0.71
+
+── Top 5 disagreement cases (worth a triage meeting) ──
+  claim-7  range=1.00  ratings: alice=1, bob=1, carol=0
+  claim-13 range=1.00  ratings: alice=0, bob=0, carol=1
+  ...
+
+── Recommendations ──
+[medium] recalibrate — Top inter-rater range cases worth a review
+  Surface the 5 claims with highest disagreement at the next triage meeting.
+
+═══ end ═══
+```
+
+## What to do with the output
+
+1. **Skim the disagreement cases first.** They're your team's calibration boundary — where the rubric is ambiguous.
+2. **Capture each member's taste.** The per-rater scores let you train a calibrated LLM-as-judge per member; once the LLM-judge agrees with the human ≥85% of the time, you can auto-grade in real time and only escalate close calls.
+3. **Close the loop.** Once you have judges, wrap the underlying research generation in a `Dispatch` and call `selfImprove()` — propose better research prompts gated on holdout approval rate.
+
+## Files
+
+- `index.ts` — the runnable script
diff --git a/examples/customer-feedback-loop/index.ts b/examples/customer-feedback-loop/index.ts
new file mode 100644
index 00000000..772735ec
--- /dev/null
+++ b/examples/customer-feedback-loop/index.ts
@@ -0,0 +1,105 @@
+/**
+ * Customer feedback loop — multi-rater approve/reject corpus → decision packet.
+ *
+ * Run with: pnpm tsx examples/customer-feedback-loop/index.ts
+ *
+ * Synthesises a 30-claim research corpus reviewed by 3 raters with realistic
+ * agreement noise. Pipes through fromFeedbackTable() + analyzeRuns(), then
+ * prints the decision packet — focus on the inter-rater agreement section
+ * and the top disagreement triage list.
+ */
+
+import {
+  analyzeRuns,
+  fromFeedbackTable,
+  type FeedbackTableRow,
+} from '../../src/contract'
+
+const N_CLAIMS = 30 // corpus size; run ids are claim-1 … claim-30
+const RATERS = ['alice', 'bob', 'carol'] // rater ids; synthesise() keys the borderline bias on these exact strings
+
+// Builds one { runId, rater, rating } row per claim × rater; every draw comes from pseudoRand, so output is repeatable.
+function synthesise(): FeedbackTableRow[] {
+  const rows: FeedbackTableRow[] = []
+  for (let i = 0; i < N_CLAIMS; i++) {
+    const runId = `claim-${i + 1}`
+    // Tier by index: multiples of 7 (i = 0, 7, …) are borderline, remaining multiples of 6
+    // are bad, the rest good — 5 / 4 / 21 of the 30 claims (≈ 17% / 13% / 70%).
+    const tier = i % 7 === 0 ? 'borderline' : i % 6 === 0 ? 'bad' : 'good'
+    for (const rater of RATERS) {
+      let approve: boolean
+      if (tier === 'good') {
+        approve = pseudoRand(runId + rater) > 0.1 // approve unless the draw is ≤ 0.1 (~90% target)
+      } else if (tier === 'bad') {
+        approve = pseudoRand(runId + rater) > 0.85 // approve only when the draw is > 0.85 (~15% target)
+      } else {
+        // Borderline — per-rater threshold: alice is pickiest (0.7), carol most lenient (0.3), bob 0.5.
+        const bias = rater === 'alice' ? 0.7 : rater === 'carol' ? 0.3 : 0.5
+        approve = pseudoRand(runId + rater) > bias
+      }
+      rows.push({ runId, rater, rating: approve })
+    }
+  }
+  return rows
+}
+
+function pseudoRand(s: string): number { // deterministic string → [0, 1] hash; same input always gives the same output
+  let h = 2166136261 >>> 0 // 32-bit FNV offset basis
+  for (let i = 0; i < s.length; i++) {
+    h ^= s.charCodeAt(i) // FNV-1a-style step: xor in the UTF-16 code unit first …
+    h = Math.imul(h, 16777619) >>> 0 // … then multiply by the FNV prime, kept as unsigned 32-bit
+  }
+  return (h >>> 0) / 0xffffffff // scale to 0..1 inclusive (h can equal 0xffffffff)
+}
+
+async function main() { // synthesise ratings → adapt → analyse → print the packet as plain text
+  const rows = synthesise()
+  const { runs, raterScores } = fromFeedbackTable({ ratings: rows }) // flat rating rows → runs + per-rater scores
+  const report = await analyzeRuns({ runs, raterScores })
+
+  console.log('═══ Customer feedback corpus — decision packet ═══')
+  console.log()
+  console.log(`Runs analyzed:     ${report.n}`)
+  console.log(
+    `Composite mean:    ${report.composite.mean.toFixed(3)} ` +
+      `(p50: ${report.composite.p50.toFixed(3)}, p95: ${report.composite.p95.toFixed(3)})`,
+  )
+  const approveRate = (report.composite.mean * 100).toFixed(0) // presumably the composite is the mean approve vote — confirm against fromFeedbackTable
+  console.log(`Approve rate:      ~${approveRate}%`)
+  console.log()
+
+  if (report.interRater) { // section is optional; the whole agreement block is skipped when absent
+    const ir = report.interRater
+    console.log('── Inter-rater agreement ──')
+    console.log(`Raters:               ${ir.raters} (${RATERS.join(', ')})`) // NOTE(review): names come from the local RATERS constant, not from the report
+    console.log(`Jointly rated runs:   ${ir.jointlyRated}`)
+    console.log('Pairwise pearson κ:') // NOTE(review): label mixes "pearson" and "κ" — confirm which statistic perPair holds
+    for (const [pair, k] of Object.entries(ir.perPair)) {
+      console.log(`  ${pair.padEnd(14)} ${k.toFixed(2)}`)
+    }
+    console.log(`Mean κ:               ${ir.kappa.toFixed(2)}`)
+    console.log()
+
+    console.log('── Top 5 disagreement cases (worth a triage meeting) ──')
+    for (const c of ir.disagreementCases.slice(0, 5)) { // takes the first five as given; assumes the list arrives most-contested first — TODO confirm
+      const ratingStr = c.ratings
+        .map((r) => `${r.rater}=${r.score.toFixed(0)}`)
+        .join(', ')
+      console.log(`  ${c.runId.padEnd(10)} range=${c.range.toFixed(2)}  ratings: ${ratingStr}`)
+    }
+    console.log()
+  }
+
+  console.log('── Recommendations ──')
+  for (const r of report.recommendations) { // prints nothing under the heading when the list is empty
+    console.log(`[${r.priority}] ${r.kind} — ${r.title}`)
+    console.log(`  ${r.detail}`)
+  }
+  console.log()
+  console.log('═══ end ═══')
+}
+
+main().catch((err) => { // entry point: log any rejection and exit non-zero
+  console.error(err)
+  process.exit(1)
+})
diff --git a/examples/customer-otel-traces/README.md b/examples/customer-otel-traces/README.md
new file mode 100644
index 00000000..deeec320
--- /dev/null
+++ b/examples/customer-otel-traces/README.md
@@ -0,0 +1,54 @@
+# Customer OTel traces — production logs → decision packet
+
+The journey for teams running agents in prod with observability but **no eval discipline yet**. You have OTel spans piped to your collector. You want to know: which agent steps are unreliable, what's breaking and where, what's the cost-quality profile, where to fix next.
+
+```sh
+pnpm tsx examples/customer-otel-traces/index.ts
+```
+
+## What this example does
+
+Synthesises 40 production runs as OTel `TraceSpanEvent[]`. Some succeed; some fail. Each carries the usual GenAI attributes — `tangle.model`, `tangle.cost.usd`, `gen_ai.usage.{input,output}_tokens`, `tangle.score`. Failed runs have `status.code: 'ERROR'`. Then:
+
+1. Pipes the spans through `fromOtelSpans()` to get `RunRecord[]`.
+2. Calls `analyzeRuns({ runs })`.
+3. Prints the decision packet — composite + cost distribution, Pareto, failure surfacing, recommendations.
+
+No agent invocation, no scenarios, no closed loop. **Just analysis of what already happened.** This is the day-1 product for teams without eval discipline.
+
+## What you'll see
+
+```
+═══ Production OTel corpus — decision packet ═══
+
+Runs analyzed:     40
+Composite mean:    0.638 (p50: 0.715, p95: 0.910, stddev: 0.252)
+Cost mean:         $0.084 (p95: $0.142)
+
+── Failures ──
+6 runs with status=ERROR or failureMode set:
+  agent.turn  (5x)
+  tool.search (1x)
+
+── Cost-quality Pareto ──
+1 candidate(s) plotted; 1 on the frontier
+  otel-default: cost=$0.084 quality=0.638  (frontier)
+
+── Recommendations ──
+[medium] expand-corpus — Mean composite 0.638 has room
+  Composite distribution sits below 0.80; investigate the 6 failures and
+  the lower tail of the histogram before claiming the agent is healthy.
+
+═══ end ═══
+```
+
+## What to do with the output
+
+1. **Read the failure surface first.** Which span names appear repeatedly under `status.code: ERROR`? That's where to dig.
+2. **Inspect the Pareto.** If multiple candidates appear (different models / prompts in prod), the frontier tells you which is cost-optimal at each quality level.
+3. **Wire an `AnalystRegistry`.** Pass `{ analyst }` to `analyzeRuns()` to cluster failures by root cause via LLM-driven analysis. The report's `failureClusters` section fills in.
+4. **Add `outcomeSignal`.** When you have downstream engagement / approval / pass-rate data, pass it as `outcomeSignal` and the report surfaces a Pearson + Spearman correlation between the judge composite and the real-world outcome, plus a fitted linear reward model. That's how you find out whether your judges' taste matches your customers'.
+
+## Files
+
+- `index.ts` — the runnable script
diff --git a/examples/customer-otel-traces/index.ts b/examples/customer-otel-traces/index.ts
new file mode 100644
index 00000000..b706b3ba
--- /dev/null
+++ b/examples/customer-otel-traces/index.ts
@@ -0,0 +1,127 @@
+/**
+ * Customer OTel traces — production logs → decision packet.
+ *
+ * Run with: pnpm tsx examples/customer-otel-traces/index.ts
+ *
+ * Synthesises 40 production agent runs as OTel `TraceSpanEvent[]`, runs them
+ * through `fromOtelSpans()` to get RunRecord[], then calls analyzeRuns().
+ * No closed loop required — this is the day-1 path for teams with logs but
+ * no eval discipline.
+ */
+
+import { analyzeRuns, fromOtelSpans } from '../../src/contract'
+import type { TraceSpanEvent } from '../../src/hosted/types'
+
+const N_RUNS = 40 // number of synthetic runs; synthesise() emits one span per run
+
+function synthesise(): TraceSpanEvent[] { // one synthetic span per run; all values derive from pseudoRand(runId…), so output is repeatable
+  const spans: TraceSpanEvent[] = []
+  for (let i = 0; i < N_RUNS; i++) {
+    const runId = `run-${i + 1}`
+    const failed = i % 7 === 0 // i = 0, 7, …, 35 → 6 of the 40 runs (15%)
+    const baseTime = 1_700_000_000_000_000_000 + i * 1_000_000_000 // NOTE(review): above Number.MAX_SAFE_INTEGER; doubles step by 256 here, so the derived end time is rounded
+    const cost = 0.05 + (pseudoRand(runId) * 0.12) // $0.05 .. $0.17
+    const score = failed ? 0.2 + pseudoRand(runId + 's') * 0.2 : 0.6 + pseudoRand(runId + 's') * 0.35 // failed: 0.2..0.4, ok: 0.6..0.95
+    const inputTokens = 800 + Math.floor(pseudoRand(runId + 'i') * 1400) // 800..2200
+    const outputTokens = 200 + Math.floor(pseudoRand(runId + 'o') * 600) // 200..800
+
+    spans.push({
+      traceId: `trace-${i}`,
+      spanId: `span-root-${i}`,
+      name: failed && i % 14 === 0 ? 'tool.search' : 'agent.turn', // failures at i = 0, 14, 28 are tool.search; every other span is agent.turn
+      startTimeUnixNano: baseTime,
+      endTimeUnixNano: baseTime + Math.floor(pseudoRand(runId + 'd') * 5_000_000_000), // duration of 0..5 s, in ns
+      attributes: {
+        'tangle.runId': runId,
+        'tangle.model': 'gpt-4o@2025-04-15', // same model on every run
+        'tangle.cost.usd': cost,
+        'gen_ai.usage.input_tokens': inputTokens,
+        'gen_ai.usage.output_tokens': outputTokens,
+        'tangle.score': score,
+      },
+      status: { code: failed ? 'ERROR' : 'OK' },
+    })
+  }
+  return spans
+}
+
+function pseudoRand(s: string): number { // deterministic string → [0, 1] hash; same input always gives the same output
+  let h = 2166136261 >>> 0 // 32-bit FNV offset basis
+  for (let i = 0; i < s.length; i++) {
+    h ^= s.charCodeAt(i) // FNV-1a-style step: xor in the UTF-16 code unit first …
+    h = Math.imul(h, 16777619) >>> 0 // … then multiply by the FNV prime, kept as unsigned 32-bit
+  }
+  return (h >>> 0) / 0xffffffff // scale to 0..1 inclusive (h can equal 0xffffffff)
+}
+
+async function main() { // synthesise spans → adapt → analyse → print the packet as plain text
+  const spans = synthesise()
+  const runs = fromOtelSpans({ spans }) // adapter: OTel spans → RunRecord[]
+  const report = await analyzeRuns({ runs })
+
+  console.log('═══ Production OTel corpus — decision packet ═══')
+  console.log()
+  console.log(`Runs analyzed:     ${report.n}`)
+  console.log(
+    `Composite mean:    ${report.composite.mean.toFixed(3)} ` +
+      `(p50: ${report.composite.p50.toFixed(3)}, ` +
+      `p95: ${report.composite.p95.toFixed(3)}, ` +
+      `stddev: ${report.composite.stddev.toFixed(3)})`,
+  )
+  console.log(
+    `Cost mean:         $${report.costQuality.cost.mean.toFixed(3)} ` +
+      `(p95: $${report.costQuality.cost.p95.toFixed(3)})`,
+  )
+  console.log()
+
+  // Failure surface — computed here from the run records, not read from the report
+  const failureCount = runs.filter((r) => r.failureMode !== undefined).length // NOTE(review): counts any defined value, while the tally below skips falsy ones (e.g. '')
+  if (failureCount > 0) {
+    console.log('── Failures ──')
+    const byName = new Map<string, number>() // failureMode → occurrences
+    for (const r of runs) {
+      if (r.failureMode) byName.set(r.failureMode, (byName.get(r.failureMode) ?? 0) + 1)
+    }
+    console.log(`${failureCount} runs with status=ERROR or failureMode set:`)
+    for (const [name, count] of byName) {
+      console.log(`  ${name.padEnd(12)} (${count}x)`)
+    }
+    console.log()
+  }
+
+  console.log('── Cost-quality Pareto ──')
+  console.log(
+    `${report.costQuality.pareto.points.length} candidate(s) plotted; ` +
+      `${report.costQuality.pareto.points.filter((p) => p.onFrontier).length} on the frontier`,
+  )
+  for (const p of report.costQuality.pareto.points) { // one line per plotted point
+    console.log(
+      `  ${p.candidateId}: cost=$${p.cost.toFixed(3)} quality=${p.quality.toFixed(3)}` +
+        `${p.onFrontier ? '  (frontier)' : ''}`,
+    )
+  }
+  console.log()
+
+  console.log('── Recommendations ──')
+  if (report.recommendations.length === 0) { // NOTE(review): this fallback is written by the script, not produced by analyzeRuns
+    console.log(
+      `[medium] expand-corpus — Mean composite ${report.composite.mean.toFixed(3)} has room`,
+    )
+    console.log(
+      '  Composite distribution sits below 0.80; investigate the failures and the lower tail', // NOTE(review): the 0.80 claim is never checked against the actual mean
+    )
+    console.log('  of the histogram before claiming the agent is healthy.')
+  } else {
+    for (const r of report.recommendations) {
+      console.log(`[${r.priority}] ${r.kind} — ${r.title}`)
+      console.log(`  ${r.detail}`)
+    }
+  }
+  console.log()
+  console.log('═══ end ═══')
+}
+
+main().catch((err) => { // entry point: log any rejection and exit non-zero
+  console.error(err)
+  process.exit(1)
+})
diff --git a/examples/selfimprove-quickstart/README.md b/examples/selfimprove-quickstart/README.md
new file mode 100644
index 00000000..ee3c0613
--- /dev/null
+++ b/examples/selfimprove-quickstart/README.md
@@ -0,0 +1,61 @@
+# `selfImprove()` quickstart
+
+The closed-loop journey. You have a prompt, a set of scenarios, a judge, and an agent. You want the substrate to propose better prompts, gate them on statistical lift, and tell you which one to ship.
+
+```sh
+pnpm tsx examples/selfimprove-quickstart/index.ts
+```
+
+## What this example does
+
+1. Defines a tiny scenario corpus (3 marketing-copy prompts).
+2. Wires a synthetic `agent` that simulates an agent producing artifacts with deterministic noise (higher score when surface contains "tight" / "specific").
+3. Wires a simple judge that scores artifacts on `clarity` and `concision`.
+4. Wires a synthetic `ImprovementDriver` that proposes two surface variants per generation (so the example runs without LLM credits).
+5. Calls `selfImprove()` with a 1-generation budget against in-memory campaign storage.
+6. Prints the full decision packet.
+
+The agent, judge, and driver are all synthetic so the example runs offline. For real use:
+- Replace `agent` with your actual agent + scenario interpreter.
+- Replace `judge.score` with your real LLM-as-judge (or a `langchainJudge` from `/adapters/langchain`).
+- Drop the custom `driver` — selfImprove() defaults to `gepaDriver` (reflective LLM mutation), which needs an LLM endpoint configured via `opts.llm`.
+
+## What you should see
+
+```
+═══ selfImprove() decision packet ═══
+
+Gate decision:        ship
+Raw lift:             +0.194
+Generations explored: 1
+Total cost:           $0.000
+
+── Statistical lift (paired bootstrap, n=1) ──
+delta:    +0.254
+CI95:     [0.254, 0.254]
+pValue:   1.0000
+Cohen's d: 0.00
+MDE @ 80% power: 2.802
+required n at observed effect: 244
+
+── Composite distribution (n=3 cells) ──
+mean: 0.653, p50: 0.720, p95: 0.743, stddev: 0.114
+
+── Cost-quality Pareto ──
+2 candidates plotted; 1 on the frontier
+
+── Per-judge mean scores ──
+  rubric: 0.653 (n=3)
+
+── Recommendations ──
+[critical] ship — Ship — lift 0.254 (95% CI 0.254..0.254)
+  Holdout lift exceeds threshold 0.02 with 95% bootstrap confidence (n=1, p=1.0000, d=0.00).
+
+═══ end ═══
+```
+
+Note: with only 3 scenarios and a 50% holdout fraction, the paired lift is computed on a single observation — useful to see the shape of the packet, not statistically informative. That is why the sample above shows a zero-width CI and p = 1.0000 alongside a `ship` recommendation; do not read that recommendation as evidence. Real corpora should be ≥ 20 scenarios with ≥ 3 reps for a meaningful CI on the lift. The `requiredN` field tells you exactly how many you'd need.
+
+## Files
+
+- `index.ts` — the runnable script
diff --git a/examples/selfimprove-quickstart/index.ts b/examples/selfimprove-quickstart/index.ts
new file mode 100644
index 00000000..d2b8a6fc
--- /dev/null
+++ b/examples/selfimprove-quickstart/index.ts
@@ -0,0 +1,171 @@
+/**
+ * selfImprove() quickstart — closed-loop improvement with a decision packet.
+ *
+ * Run with: pnpm tsx examples/selfimprove-quickstart/index.ts
+ *
+ * Everything in this file is synthetic so the example works offline. The
+ * dispatch + judge are deterministic-with-noise stand-ins; replace them
+ * with your real agent + your real judge to point the loop at production.
+ */
+
+import type { ImprovementDriver, MutableSurface, Scenario } from '../../src/contract'
+import { selfImprove } from '../../src/contract'
+
+interface CopyScenario extends Scenario { // base Scenario plus a copywriting brief
+  brief: string
+}
+
+const scenarios: CopyScenario[] = [ // three-item toy corpus; NOTE(review): dispatch() in this file reads only `id`, never `brief`
+  { id: 'launch', kind: 'copy', brief: 'announce a new pricing tier' },
+  { id: 'feature', kind: 'copy', brief: 'highlight a new collaboration feature' },
+  { id: 'event', kind: 'copy', brief: 'invite to a customer roundtable' },
+]
+
+// Synthetic agent: quality = 0.4 base + keyword bonuses + up to 0.2 of hashed jitter, capped at 1.
+async function dispatch({
+  scenario,
+  systemPrompt,
+}: {
+  scenario: CopyScenario
+  systemPrompt: string
+}): Promise<{ text: string; quality: number }> {
+  const tightnessBonus = systemPrompt.includes('tight') ? 0.18 : 0 // +0.18 when the prompt mentions 'tight'
+  const specificBonus = systemPrompt.includes('specific') ? 0.12 : 0 // +0.12 when it mentions 'specific'
+  const noise = hash(scenario.id + systemPrompt) // fixed per (scenario, prompt) pair, in 0..1
+  const quality = Math.min(1, 0.4 + tightnessBonus + specificBonus + 0.2 * noise)
+  return {
+    text: `[${scenario.id}] ${systemPrompt.slice(0, 40)}…`, // only the first 40 prompt characters are echoed
+    quality,
+  }
+}
+
+function hash(s: string): number { // deterministic string → [0, 1] value via a base-31 polynomial hash
+  let h = 0
+  for (let i = 0; i < s.length; i++) {
+    h = (h * 31 + s.charCodeAt(i)) >>> 0 // >>> 0 wraps the running value to unsigned 32-bit each step
+  }
+  return h / 0xffffffff // scale to 0..1
+}
+
+// Synthetic judge: 'clarity' / 'concision' = artifact.quality ± a small jitter hashed from
+// artifact.text (repeatable run to run, unlike Math.random()); composite is their plain mean.
+async function judge({
+  artifact,
+}: {
+  artifact: { text: string; quality: number }
+}) {
+  const clarity = clamp(artifact.quality + 0.05 * hash(artifact.text + ':clarity')) // jitter adds up to +0.05
+  const concision = clamp(artifact.quality - 0.03 * hash(artifact.text + ':concision')) // jitter removes up to 0.03
+  const composite = (clarity + concision) / 2
+  return {
+    dimensions: { clarity, concision },
+    composite,
+    notes: '',
+  }
+}
+
+function clamp(x: number): number { // clip x into the closed range [0, 1]
+  return Math.max(0, Math.min(1, x))
+}
+
+// Synthetic driver: deterministically proposes up to `populationSize` variants (at most 4)
+// per call, appending "Write <keyword> engaging copy." for 'tight,', 'specific,', 'punchy,',
+// 'concrete,' in that order. Lets the example run offline. In real use, you'd use the
+// default `gepaDriver` (reflective LLM mutation) from `/contract`.
+const syntheticDriver: ImprovementDriver = {
+  kind: 'synthetic-quickstart',
+  async propose({ currentSurface, populationSize }) {
+    const current = currentSurface as { kind: string; systemPrompt: string } // narrow to the prompt shape this example uses
+    const additions = ['tight,', 'specific,', 'punchy,', 'concrete,'] // only the first two earn a bonus in dispatch()
+    return additions.slice(0, populationSize).map((kw) => ({
+      kind: current.kind,
+      systemPrompt: `${current.systemPrompt} Write ${kw} engaging copy.`,
+    })) as MutableSurface[]
+  },
+}
+
+async function main() { // run one selfImprove() generation on the synthetic pieces, then print the packet
+  const result = await selfImprove({
+    scenarios,
+    agent: async (surface, scenario) =>
+      dispatch({
+        scenario,
+        systemPrompt: (surface as { systemPrompt: string }).systemPrompt,
+      }),
+    judge: {
+      name: 'rubric',
+      dimensions: [
+        { key: 'clarity', weight: 1 },
+        { key: 'concision', weight: 1 },
+      ],
+      score: judge,
+    },
+    baselineSurface: {
+      kind: 'prompt',
+      systemPrompt: 'You write marketing copy. Keep it short.',
+    },
+    driver: syntheticDriver,
+    budget: { generations: 1, populationSize: 2, holdoutFraction: 0.5 }, // populationSize 2 → the driver's 'tight,' and 'specific,' variants
+  })
+
+  const i = result.insight // the report whose sections are printed below
+  console.log('═══ selfImprove() decision packet ═══')
+  console.log()
+  console.log(`Gate decision:        ${result.gateDecision}`)
+  console.log(`Raw lift:             ${signed(result.lift)}`)
+  console.log(`Generations explored: ${result.generationsExplored}`)
+  console.log(`Total cost:           $${result.totalCostUsd.toFixed(3)}`)
+  console.log()
+
+  if (i.lift) { // optional section; skipped when absent
+    console.log(`── Statistical lift (paired bootstrap, n=${i.lift.n}) ──`)
+    console.log(`delta:    ${signed(i.lift.delta)}`)
+    console.log(`CI95:     [${i.lift.ci95[0].toFixed(3)}, ${i.lift.ci95[1].toFixed(3)}]`)
+    console.log(`pValue:   ${i.lift.pValue.toFixed(4)}`)
+    console.log(`Cohen's d: ${i.lift.cohensD.toFixed(2)}`)
+    console.log(`MDE @ 80% power: ${i.lift.mde.toFixed(3)}`)
+    console.log(`required n at observed effect: ${i.lift.requiredN}`)
+    console.log()
+  }
+
+  console.log(`── Composite distribution (n=${i.composite.n} cells) ──`)
+  console.log(
+    `mean: ${i.composite.mean.toFixed(3)}, ` +
+      `p50: ${i.composite.p50.toFixed(3)}, ` +
+      `p95: ${i.composite.p95.toFixed(3)}, ` +
+      `stddev: ${i.composite.stddev.toFixed(3)}`,
+  )
+  console.log()
+
+  console.log('── Cost-quality Pareto ──')
+  console.log(
+    `${i.costQuality.pareto.points.length} candidates plotted; ` +
+      `${i.costQuality.pareto.points.filter((p) => p.onFrontier).length} on the frontier`,
+  )
+  console.log()
+
+  if (i.judges && Object.keys(i.judges).length > 0) { // judges is documented as optional; Object.keys(undefined) would throw
+    console.log('── Per-judge mean scores ──')
+    for (const [name, j] of Object.entries(i.judges)) {
+      console.log(`  ${name}: ${j.meanScore.toFixed(3)} (n=${j.n})`)
+    }
+    console.log()
+  }
+
+  console.log('── Recommendations ──')
+  for (const r of i.recommendations) { // prints nothing under the heading when the list is empty
+    console.log(`[${r.priority}] ${r.kind} — ${r.title}`)
+    console.log(`  ${r.detail}`)
+  }
+  console.log()
+  console.log('═══ end ═══')
+}
+
+function signed(n: number): string { // three decimals, with an explicit '+' for values ≥ 0
+  return `${n >= 0 ? '+' : ''}${n.toFixed(3)}`
+}
+
+main().catch((err) => { // entry point: log any rejection and exit non-zero
+  console.error(err)
+  process.exit(1)
+})
diff --git a/package.json b/package.json
index 156e472f..db67d0e7 100644
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "@tangle-network/agent-eval",
-  "version": "0.50.0",
+  "version": "0.50.1",
   "description": "Substrate for self-improving agents: traces, verifiable rewards, preferences, GEPA / reflective mutation, auto-research, replay, sequential anytime-valid stats, and release gates.",
   "homepage": "https://github.com/tangle-network/agent-eval#readme",
   "repository": {