diff --git a/CHANGELOG.md b/CHANGELOG.md index f07a67f7..e6ffa9ac 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,140 @@ # Changelog +All notable changes to `@tangle-network/agent-eval` and its sibling `agent-eval-rpc` (Python). The format roughly follows [Keep a Changelog](https://keepachangelog.com/en/1.1.0/); versions are locked across the npm + PyPI packages. + +--- + +## [0.50.1] — 2026-05-27 — docs + examples + +### Added + +- `README.md` rewritten as a top-tier OSS landing page: table of contents, decision-packet output sample (annotated JSON), comparison matrix vs LangSmith / Braintrust / Phoenix, three customer journey cards. +- `examples/selfimprove-quickstart/` — minimal closed-loop example with annotated stdout. +- `examples/customer-feedback-loop/` — Customer A journey: multi-rater approve/reject corpus → `fromFeedbackTable` → `analyzeRuns`. +- `examples/customer-otel-traces/` — Customer B journey: OTel spans → `fromOtelSpans` → `analyzeRuns`. +- `docs/insight-report.md` — annotated walkthrough of every section of the decision packet. +- `docs/customer-journeys.md` — three end-to-end journeys with code + expected output. + +### Changed + +- `docs/concepts.md` — updated mental model for the three top-level entries (`selfImprove`, `analyzeRuns`, intake adapters) and the layering rule. + +### Notes + +Docs-only patch. No code changes, no behavior changes, no API surface changes vs 0.50.0. + +--- + +## [0.50.0] — 2026-05-27 — the decision packet + +### Added + +- **`analyzeRuns({ runs, ... }): InsightReport`** in `/contract`. Composes the substrate's statistical / calibration / clustering / Pareto primitives into one rigor packet. 
Sections populate based on what the input supports: distributional summary always, lift when baseline+candidate are present, judges when run records carry `judgeScores`, inter-rater agreement when `raterScores` are supplied, failure clusters when an `AnalystRegistry` is wired, contamination when canaries are passed, outcome correlation when a downstream signal is supplied. +- **`InsightReport`** canonical decision-packet shape; reused by `selfImprove()` and emitted on the hosted wire as `EvalRunEvent.insightReport?`. +- **Intake adapters** in `/contract`: + - `fromFeedbackTable({ ratings })` — multi-rater corpus → `RunRecord[] + raterScores`. + - `fromOtelSpans({ spans })` — OpenTelemetry spans → `RunRecord[]`, grouped by `tangle.runId` or `traceId`. +- **`SelfImproveResult.insight: InsightReport`** — `selfImprove()` now returns the full decision packet alongside the existing ship/hold verdict. + +### Changed + +- `selfImprove()` internally calls `analyzeRuns()` on baseline + winner cells; consumers reading `.lift` continue to work unchanged, while `.insight.lift` now carries CI95 + p-value + Cohen's d + MDE + required-n. + +### Test coverage + +1427 / 1427 passing; 11 new integration tests covering lift detection paths, outcome correlation + linear reward model, canary contamination, multi-rater journey end-to-end, OTel journey end-to-end, recommendations shape, JSON-serialisability. + +--- + +## [0.49.0] — 2026-05-27 — audit-fix sweep + +### Added + +- `src/adapters/otel.ts` — generic OTel→hosted bridge (`createOtelBridge` / `OtelBridge` / `OtelBridgeOptions`). Stringifies array-valued attributes instead of dropping them. +- `src/contract/diff.ts` — `keyForCell` uses `JSON.stringify([scenarioId, rep])` (no separator collisions); `Number.isFinite` coercion on dimension deltas (no NaN propagating to dashboards). 
+- `examples/hosted-ingest-server/server.ts` — `REFERENCE_RECEIVER_START=1|0` env var as the primary start signal; idempotency cache prunes on read with the wire-spec 24h TTL. + +### Changed + +- Python `TraceSpanEventOuter` exposes `tangle.*` pivots via field aliases (`tangle_run_id`, etc.) and round-trips through `model_dump(by_alias=True)`. +- Python `_WireModel` emits a `UserWarning` when an extra field is the snake_case shadow of a declared camelCase field (cross-language drift guard). + +### Removed + +- `src/adapters/traceai.ts` — replaced by `src/adapters/otel.ts`. No back-compat shim. + +--- + +## [0.48.0] — 2026-05-27 — substrate↔runtime layering fix + diffRuns + Python hosted parity + +### Added + +- `src/verdict.ts` — `DefaultVerdict` substrate primitive (moved DOWN from agent-runtime). +- `src/contract/diff.ts` — `diffRuns` / `diffGenerations` / `diffRunBaselineToWinner` for v3-vs-v4 dashboard rendering, CI reporting, and any consumer comparing improvement-loop output. +- `src/adapters/traceai.ts` — OTel→hosted bridge (renamed to `otel.ts` in 0.49.0). +- `tests/hosted-roundtrip.test.ts` — proves wire-format binary compat between client and reference receiver. +- Python `HostedClient` (`clients/python/src/agent_eval_rpc/hosted.py`) — TS↔Python wire-format parity with bearer auth, idempotency, and exponential backoff on 5xx/408/429. +- `CLAUDE.md` repo-layering rule: agent-eval is the substrate; agent-runtime + agent-knowledge depend on it; the reverse is forbidden. + +### Changed + +- `src/campaign/gates/default-production-gate.ts` — `RunRecord` import from local `../../run-record` (was reaching up into agent-runtime). +- `src/matrix/types.ts` — `DefaultVerdict` import from `../verdict` (was reaching up into agent-runtime). + +### Removed + +- `@tangle-network/agent-runtime` from `peerDependencies`, `devDependencies`, and `pnpm.minimumReleaseAgeExclude` (no upward deps from substrate). 
+ +--- + +## [0.47.0] — 2026-05-26 — Phase D hosted-tier substrate + +### Added + +- `src/hosted/` — wire-format types frozen at `HOSTED_WIRE_VERSION = '2026-05-26.v1'`, `createHostedClient` with bearer auth + idempotency + bounded retries. +- `examples/hosted-ingest-server/` — reference receiver implementing the spec. +- `docs/hosted-ingest-spec.md` — semver-locked wire spec. +- `selfImprove({ hostedTenant })` — opt-in hosted ingest; failures logged, never fail the loop. + +--- + +## [0.46.0] — `selfImprove()` LAND-tier helper + +`selfImprove({ scenarios, dispatch, judges, baselineSurface })` shipped in `/contract` as the one-shot wrapper around `runImprovementLoop`. + +--- + +## [0.45.0] — distributed campaigns + +`/adapters/http` with `httpDispatch` + `runDispatchServer`; `cellPlacement` on `RunCampaignOptions` for cross-region fan-out. + +--- + +## [0.44.0] — `/adapters/langchain` + +LangChain runnable → `Dispatch` adapter. + +--- + +## [0.43.0] — edge-friendly storage + +`inMemoryCampaignStorage()` for Cloudflare Workers / edge / test environments. + +--- + +## [0.42.0] — GEPA driver + legacy deletion + +### Added + +- `gepaDriver` reflective LLM mutation driver. +- `campaignToRunRecords` adapter. + +### Removed + +- `runMultiShotOptimization` (top-level trajectory-optimizer) — replaced by `runImprovementLoop` + `gepaDriver` composition. The `/multishot` subpath (N-shot persona matrix) is unrelated and remains. + +--- + ## 0.34.0 — 2026-05-23 ### Eval evolution-tracking — first-class `AgentProfile` + per-cell scorecard diff --git a/README.md b/README.md index f703a22c..cdaf106b 100644 --- a/README.md +++ b/README.md @@ -1,400 +1,304 @@ -# @tangle-network/agent-eval - -**Substrate for self-improving agents.** Trace what runs, verify the result, -turn outcomes into preferences and rewards, mutate prompts and policies under -anytime-valid evidence, and ship only when the improvement is decisive. 
- -```txt -real product task - -> observe / act (your runtime) - -> trace + verifier pipeline (capture integrity) - -> RunRecord (canonical eval artifact) - -> judge calibration · paired stats · sequential α - -> preferences · verifiable rewards · process rewards - -> GEPA / reflective mutation · auto-research · active curriculum - -> release gate · replay · contamination probe · tournament rating - -> next iteration -``` +# `@tangle-network/agent-eval` -`agent-eval` does **not** own product state, credentials, UI, storage, model -routing, browser drivers, sandbox policy, or deployment. Products own those. -This package owns the loop that closes evaluation → preference → mutation → -redeploy, with capture integrity and statistically rigorous evidence at every -step. +**Ship better agent prompts with statistical confidence.** One function call returns a decision packet: lift CI, judge calibration, contamination check, failure clusters, cost-quality Pareto, and a ranked action list. Same shape whether you've got a closed improvement loop or just production logs. -It ships as a TypeScript library (npm) with a generated Python client (PyPI), -both speaking the same wire protocol. MIT, self-hostable, no SaaS dependency. +[![npm](https://img.shields.io/npm/v/@tangle-network/agent-eval.svg)](https://www.npmjs.com/package/@tangle-network/agent-eval) +[![pypi](https://img.shields.io/pypi/v/agent-eval-rpc.svg)](https://pypi.org/project/agent-eval-rpc/) +[![tests](https://github.com/tangle-network/agent-eval/actions/workflows/ci.yml/badge.svg)](https://github.com/tangle-network/agent-eval/actions/workflows/ci.yml) +[![license: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](./LICENSE) -## Install +> TypeScript first-class, Python (`agent-eval-rpc`) speaks the same wire protocol, hosted-tier-friendly, MIT, self-hostable, no SaaS dependency. 
-```sh -pnpm add @tangle-network/agent-eval -# or, from Python: -pip install agent-eval-rpc -``` +--- -## Quick Start — the control loop +## Table of contents -```ts -import { - objectiveEval, - runAgentControlLoop, -} from '@tangle-network/agent-eval/control' +- [What you get back](#what-you-get-back-the-decision-packet) +- [Quick start](#quick-start) + - [Closed loop — `selfImprove()`](#closed-loop--selfimprove) + - [Observed runs — `analyzeRuns()`](#observed-runs--analyzeruns) + - [Existing data — intake adapters](#existing-data--intake-adapters) +- [How it compares](#how-it-compares) +- [Customer journeys](#customer-journeys) +- [Subpath entry points](#subpath-entry-points) +- [Concepts + design](#concepts--design) +- [Hosted tier](#hosted-tier) +- [Install + run](#install--run) +- [Stability + versioning](#stability--versioning) +- [License](#license) -const result = await runAgentControlLoop({ - intent: task.prompt, - budget: { maxSteps: 8, maxWallMs: 180_000, maxCostUsd: 2 }, +--- - observe() { - return product.readState(task.id) - }, +## What you get back: the decision packet - validate({ state }) { - return [ - objectiveEval({ - id: 'build-passes', - passed: state.build.exitCode === 0, - severity: 'critical', - metadata: state.build, - }), - objectiveEval({ - id: 'preview-serves', - passed: state.preview.httpStatus === 200, - severity: 'critical', - }), +Whether you call `selfImprove()` (closed loop) or `analyzeRuns()` (observed runs), the report has the same shape. 
Here's a real one, abridged: + +```jsonc +{ + "n": 80, // runs analyzed + "composite": { // distributional summary + "mean": 0.62, "p50": 0.65, "p95": 0.88, "stddev": 0.17, + "histogram": [/* 12 bins */] + }, + "lift": { // paired bootstrap + "baselineMean": 0.58, "candidateMean": 0.65, + "delta": 0.07, + "ci95": [0.04, 0.10], // 95% CI on the delta + "pValue": 0.0008, // paired-t + "cohensD": 0.41, + "n": 40, + "mde": 0.06, // min detectable effect at 80% power + "requiredN": 38 // n needed to detect observed delta + }, + "judges": { // per-judge calibration + "domain-expert": { "n": 80, "meanScore": 0.64 }, + "helpfulness-llm": { "n": 80, "meanScore": 0.61 } + }, + "interRater": { // multi-rater agreement + "raters": 3, "jointlyRated": 80, "kappa": 0.71, + "disagreementCases": [/* top 20 ranked by spread */] + }, + "costQuality": { // cost-vs-quality + "cost": { "mean": 0.024, "p95": 0.041, /* ... */ }, + "pareto": { /* ParetoFigureSpec the dashboard renders */ } + }, + "failureClusters": { // when an AnalystRegistry is wired + "totalFailures": 11, + "clusters": [ + { "name": "off-topic-drift", "share": 0.45, "exemplars": ["run-12", "run-19"] }, + { "name": "over-confidence", "share": 0.27, "exemplars": ["run-3"] }, + { "name": "format-mismatch", "share": 0.18, "exemplars": ["run-41"] } ] }, - - decide({ evals }) { - const failed = evals.filter((e) => !e.passed) - if (failed.length === 0) { - return { type: 'stop', pass: true, reason: 'all gates passed' } - } - return { - type: 'continue', - action: { type: 'repair', failed: failed.map((e) => e.id) }, - reason: 'repair failed gates', - } + "contamination": { "leaks": 0, "holdoutAuditPassed": true }, + "outcomeCorrelation": { // when downstream metric supplied + "metric": "engagement_rate", "n": 80, + "pearson": 0.72, "spearman": 0.69, + "rewardModel": { "intercept": 0.04, "slope": 1.93, "r2": 0.52 } }, - - act(action) { - return product.runAgentStep(task.id, action) + "release": { + "status": "pass", + "axes": [ 
+ { "name": "quality-lift", "status": "pass" }, + { "name": "contamination", "status": "pass" }, + { "name": "composite-distribution","status": "pass" } + ] }, -}) - -await product.storeEvalResult(task.id, result) + "recommendations": [ + { "priority": "critical", "kind": "ship", + "title": "Ship — lift 0.070 (95% CI 0.040..0.100)", + "detail": "Holdout lift exceeds threshold 0.02 with 95% bootstrap confidence (n=40, p=0.0008, d=0.41)." }, + { "priority": "high", "kind": "investigate", + "title": "Top failure cluster: off-topic-drift (45% of failures)", + "detail": "11 runs failed. Drill into exemplars run-12 / run-19 to identify the pattern." } + ] +} ``` -Same loop shape in production, replay, benchmark, and optimization. Swap the -dependencies behind `observe()` and `act()`, never the eval contract. +The `recommendations` array is the human-readable layer; everything above it is the evidence. Read the recs and act on them; the numbers are the proof. -## Production loop — close the eval → prod → eval cycle +--- -Static prompts decay. Yesterday's FTC rule flips today; yesterday's tool quirk -becomes today's incident. The production agents that win are the ones that -**continuously re-train against live failure modes**. +## Quick start -`runProductionLoop` is the orchestration layer that wires the existing eval -substrate into a self-improvement cron: +### Closed loop — `selfImprove()` -```ts -import { - runProductionLoop, - httpGithubClient, - FileSystemFeedbackTrajectoryStore, -} from '@tangle-network/agent-eval' -import { FileSystemTraceStore } from '@tangle-network/agent-eval/traces' - -const result = await runProductionLoop({ - runId: `weekly-${new Date().toISOString().slice(0, 10)}`, - target: 'tax-agent', - - // 1. Where production traces + feedback land. Wire the HTTP ingestion - // endpoints (POST /v1/traces/ingest, POST /v1/feedback) from your - // runtime; the same store reads them here. 
- traceStore: new FileSystemTraceStore({ dir: 'data/prod-traces' }), - feedbackStore: new FileSystemFeedbackTrajectoryStore({ dir: 'data/prod-feedback' }), - - // 2. Cluster threshold: act on failure groups ≥ 20 runs or ≥ 5% of corpus. - cluster: { minClusterSize: 20, minSeverityRatio: 0.05, maxClustersPerCycle: 1 }, - - // 3. Evolve: seed = current prompt, gate against holdout scenarios. - evolve: { - baselinePrompt: currentSystemPrompt, - holdoutScenarios: productionShapeScenarios, - runner, // your agent driver - scorer, // calibrated judge or rubric - mutator, // GEPA-style or addendum-style mutator - gate: { - baselineKey: 'baseline', - minProductiveRuns: 5, - pairedDeltaThreshold: 0.03, // require Nσ improvement on holdout - overfitGapThreshold: 0.10, - }, - }, - - // 4. Ship: when the gate passes, open a PR with the new prompt. - ship: { - client: httpGithubClient({ token: process.env.GITHUB_TOKEN! }), - repo: { owner: 'tangle-network', name: 'tax-agent' }, - branchPrefix: 'eval/auto-improve', - promptFilePath: 'prompts/tax-agent-system.txt', - reviewers: ['drew'], - }, +You have scenarios, a dispatch, judges, and want the loop to propose better prompts + tell you which to ship. - cron: { cadence: 'weekly' }, // surface-only; consumer schedules +```ts +import { selfImprove } from '@tangle-network/agent-eval/contract' + +const result = await selfImprove({ + scenarios, // your scenario corpus + dispatch: async ({ scenario }) => // your agent — anything that returns an artifact + await myAgent.run(scenario), + judges: [myJudge], // any JudgeConfig — LLM, rule, ensemble + baselineSurface: { systemPrompt: currentPrompt }, }) -console.log(result.decision) // 'pr_opened' | 'gate_failed' | 'no_actionable_failures' | ... -console.log(result.pullRequest?.prUrl) // populated when a PR was opened +result.gateDecision // 'ship' | 'hold' | 'need_more_work' | ... 
+result.lift // raw delta on holdout +result.insight // the full decision packet above ``` -The primitive runs **one cycle**. Schedule it with `workflow_dispatch` + cron in -GitHub Actions. It is **idempotent + replayable**: same `runId` → same plan. -Gate failures are fail-closed — a candidate that beats baseline on search but -overfits on holdout never lands. +### Observed runs — `analyzeRuns()` -Full runnable demo (synthetic traces, no credentials) in -[`examples/production-loop`](./examples/production-loop/README.md). +You don't have a closed loop yet — you have observed runs (production traces, an approve/reject corpus, a CSV gold set). Same report shape, no agent invocation. -## Self-improvement loop +```ts +import { analyzeRuns } from '@tangle-network/agent-eval/contract' + +const report = await analyzeRuns({ + runs, // RunRecord[] + outcomeSignal: { // optional — closes the loop on real outcomes + metric: 'engagement_rate', + valueByRunId: enrichedFromProd, + }, + canaryScenarios, // optional — contamination probe + analyst: myAnalystRegistry, // optional — AI-powered failure clustering +}) -Eval doesn't end at "pass/fail." Outcomes become training signal, mutation -proposals, and curriculum updates — all from the same `RunRecord` produced by -the control loop. +report.recommendations // ranked actions +report.failureClusters // grouped failure modes +report.outcomeCorrelation // judge↔outcome correlation + linear reward model +``` + +### Existing data — intake adapters + +You have data already. Don't reshape it — pipe it through an adapter. ```ts -import { runEvalCampaign } from '@tangle-network/agent-eval' import { - extractPreferences, - extractVerifiableReward, - filterDeterministicallyRewarded, - offPolicyEstimateAll, - analyzeOptimizationResult, -} from '@tangle-network/agent-eval/rl' - -// 1. Run a matrix of variants × scenarios with capture integrity by construction. -const campaign = await runEvalCampaign({ variants, scenarios, run }) - -// 2. 
Convert outcomes into RL signal. -const rewards = extractVerifiableReward(campaign.runs) // compile/test/schema -const prefs = extractPreferences(campaign.runs) // (chosen, rejected) triples -const clean = filterDeterministicallyRewarded(rewards) // judge-noise free - -// 3. Estimate a candidate policy's value without re-running. -const ope = offPolicyEstimateAll(campaign.runs, candidatePolicy) // IPS + SNIPS + DR - -// 4. Or close the loop end-to-end: score → reflect → mutate → re-run. -const next = await analyzeOptimizationResult(campaign, { researcher }) + fromFeedbackTable, + fromOtelSpans, + analyzeRuns, +} from '@tangle-network/agent-eval/contract' + +// Multi-rater approve/reject (Obsidian tags, Sheets, CSV, Postgres). +const { runs, raterScores } = fromFeedbackTable({ + ratings: parseYourFeedbackTable(), // Array<{ runId, rater, rating }> +}) +await analyzeRuns({ runs, raterScores }) + +// Production OTel traces — group by tangle.runId or traceId. +const runs2 = fromOtelSpans({ spans: yourOtelStream }) +await analyzeRuns({ runs: runs2 }) ``` -| Step | Primitive | Subpath | -| --- | --- | --- | -| Eval matrix with integrity | `runEvalCampaign` | `/` | -| Deterministic re-judge / audit | `ReplayCache`, `createReplayFetch` | `/` | -| Anytime-valid α across rolling looks | `pairedEvalueSequence` | `/reporting` | -| Judge quality vs gold | `calibrateJudge` (κ, Pearson, MAE, bias probes) | `/` | -| Continuous inter-rater agreement | `calibrateJudgeContinuous`, `continuousAgreement` (κ_w, ICC(2,1), bootstrap CIs) | `/` | -| (chosen, rejected) for DPO/KTO/PPO | `extractPreferences` | `/rl` | -| Verifiable reward signal | `extractVerifiableReward` | `/rl` | -| Step-level / PRM training data | `extractStepRewards`, `prmTrainingPairs` | `/rl` | -| Estimate policy value off-policy | `offPolicyEstimateAll` (IPS + SNIPS + DR) | `/rl` | -| GEPA / reflective prompt mutation | `buildReflectionPrompt`, `parseReflectionResponse`, Ax-GEPA `SteeringOptimizer` | `/` 
`/optimization` | -| Auto-research (read runs → propose) | `analyzeOptimizationResult`, `PredictiveValidityResearcher` | `/rl` | -| Active curriculum (variance / Thompson) | `allocateCurriculum` | `/rl` | -| Tournament ratings (Bradley-Terry + Elo) | `fitBradleyTerry`, `applyEloUpdate` | `/rl` | -| Adversarial scenario search | `adversarialScenarioSearch` | `/rl` | -| Contamination probe (held-out perturb) | `runContaminationProbe` | `/rl` | -| Reward hacking signatures | `detectRewardHacking` | `/rl` | -| Compute curves (best-of-N, self-consist, Pareto) | `runComputeCurve`, `bestOfN`, `selfConsistency`, `paretoFrontier` | `/rl` | -| Knowledge gap separated from reasoning gap | `scoreKnowledgeReadiness` | `/` | -| Release gate (paired evidence + holdouts) | `evaluateReleaseConfidence`, `HeldOutGate` | `/reporting` | -| Launch report (decision-grade) | `renderReleaseReport`, `researchReport` | `/reporting` | - -## Import Paths - -| Subpath | Use for | -| --- | --- | -| `@tangle-network/agent-eval/contract` | **LAND-tier surface** — `selfImprove`, `runCampaign`, `runImprovementLoop`, `runEval`, `Dispatch`, `Mutator`, `Gate`, `defaultProductionGate`, `gepaDriver`, `diffRuns`, storage backends. New code starts here. | -| `@tangle-network/agent-eval/hosted` | **EXPAND-tier surface** — `createHostedClient`, wire-format types, `HOSTED_WIRE_VERSION`. Ships eval-run events + trace spans to any orchestrator that speaks the spec. | -| `@tangle-network/agent-eval/adapters/otel` | OTel→hosted bridge — `createOtelBridge` forwards OTel-shape spans (TraceAI, OpenLLMetry, OTel SDK) into the hosted-tier ingest. | -| `@tangle-network/agent-eval/adapters/langchain` | LangChain executor adapter — wrap a LangChain runnable as a `Dispatch`. | -| `@tangle-network/agent-eval/adapters/http` | Distributed driver — `httpDispatch` + `runDispatchServer` for cross-machine campaigns. 
| -| `@tangle-network/agent-eval/campaign` | Lower-level campaign primitives — `runCampaign`, driver implementations, storage. | -| `@tangle-network/agent-eval/multishot` | Multi-shot optimization primitives. | -| `@tangle-network/agent-eval/control` | `observe → validate → decide → act`, action policy, propose/review loops | -| `@tangle-network/agent-eval/traces` | trace stores, emitters, TraceAnalyst, replay | -| `@tangle-network/agent-eval/optimization` | feedback trajectories, multi-shot, prompt evolution, GEPA, EvalCampaign | -| `@tangle-network/agent-eval/reporting` | release confidence, paired stats, sequential e-values, launch reports | -| `@tangle-network/agent-eval/rl` | adapters, verifiable rewards, preferences, OPE, PRM, contamination, tournaments, adversarial, compute curves, auto-research | -| `@tangle-network/agent-eval/wire` | HTTP/RPC server + schemas (same protocol the Python client speaks) | -| `@tangle-network/agent-eval/benchmarks` | benchmark adapter contracts and reference wrappers | -| `@tangle-network/agent-eval/matrix` | N-axis cartesian runner over substrate types — see [`src/matrix/`](./src/matrix/) | - -The root export remains available for convenience; new code should prefer -focused subpaths. Anything under `/rl`, `/pipelines`, `/meta-eval`, `/prm`, -or `/builder-eval` is only reachable via its subpath. - -## API stability - -Public exports are tagged with JSDoc stability markers so consumers can see -status at the call site (IDE hover, language server, declaration files). +Both intake adapters preserve every signal in the source — multi-rater scores stay rater-keyed so the report can compute inter-rater agreement and surface the disagreement triage list. -| Tag | Meaning | -| --- | --- | -| `@stable` | API frozen at this major. Breaking changes require a major bump. | -| `@experimental` | Interface may evolve before becoming `@stable`. Pin the patch version if you depend on it. | -| `@internal` | Not part of the public contract. 
Use the documented subpath instead. | +--- -The `/rl` subpath is the most active surface. See -[`src/rl/index.ts`](./src/rl/index.ts) for the current stable/experimental -breakdown. +## How it compares -## Capture integrity +| | LangSmith | Braintrust | Phoenix | **agent-eval** | +|---|:---:|:---:|:---:|:---:| +| Closed-loop self-improvement | ✱ human-in-loop | ✱ experiment-driven | — | ✓ autonomous + gated | +| Statistical lift CI (paired bootstrap) | — | partial | — | ✓ | +| Judge calibration + bias detection | — | — | — | ✓ | +| Inter-rater agreement + disagreement triage | — | — | — | ✓ | +| Contamination / canary check | — | — | — | ✓ | +| AI-driven failure clustering | partial | — | partial | ✓ | +| Cost-quality Pareto | — | — | — | ✓ | +| Multi-language clients (TS + Python) | TS only | TS only | TS + Py | ✓ TS + Py | +| Self-hostable / no-SaaS option | — | — | OSS | ✓ MIT, OSS | +| Substrate vs SaaS shape | SaaS | SaaS | OSS server | **library** | +| Hosted tier (optional) | required | required | optional | optional | -Launch-grade benchmark runs need four things that are easy to forget in glue -code: (1) raw HTTP capture alongside the structured spans so a reviewer can -verify which route answered, (2) a preflight assertion that the configured -client points at the intended provider, (3) a run-end assertion that the -expected events were actually written, and (4) auto-execution of the trace -analyst as part of the run lifecycle. +Position: agent-eval is the **substrate** (one library, decision-grade output); the others are SaaS *around* the substrate. If you want a closed loop that ships your prompt under statistical confidence, you call agent-eval. If you want a dashboard rendered from your data, you pipe agent-eval into the hosted tier or your own renderer. 
-```ts -import { - TraceEmitter, FileSystemRawProviderSink, callLlm, assertLlmRoute, - assertRunCaptured, throwIfRunIncomplete, -} from '@tangle-network/agent-eval' -import { traceAnalystOnRunComplete } from '@tangle-network/agent-eval/traces' +--- -const sink = new FileSystemRawProviderSink({ dir: `${workDir}/raw-events` }) -assertLlmRoute(llmOpts, { requireExplicitBaseUrl: true, allowedBaseUrls, requireAuth: true }) +## Customer journeys -const emitter = new TraceEmitter(store, { - onRunComplete: [traceAnalystOnRunComplete({ analyze: analystOpts, save })], -}) -await emitter.startRun(/* ... */) -// LLM calls flow through callLlm with `{ rawSink: sink, traceContext: { runId, spanId } }`. -await emitter.endRun({ pass, score }) +Three runnable examples — each is self-contained, each shows the actual output. -throwIfRunIncomplete(await assertRunCaptured(store, emitter.runId, { - llmSpansMin: 1, rawSink: sink, requireRawCoverageOfLlmSpans: true, requireOutcome: true, -})) -``` +| Journey | Example | Who it's for | +|---|---|---| +| **Closed loop** — improve a prompt under statistical confidence | [`examples/selfimprove-quickstart/`](./examples/selfimprove-quickstart/) | Teams with scenarios + judges + agent in hand | +| **Multi-rater feedback corpus** — turn Obsidian/Sheets/CSV ratings into actionable insights | [`examples/customer-feedback-loop/`](./examples/customer-feedback-loop/) | Teams reviewing AI outputs by hand who want to compress that taste into per-member LLM judges + close the loop | +| **Production OTel traces** — analyze logs you already have, no closed loop required | [`examples/customer-otel-traces/`](./examples/customer-otel-traces/) | Teams running agents in prod with observability, no eval discipline yet | -Directives, rationale, and shipped-bug context are in -[`SKILL.md` § Capture integrity](./.claude/skills/agent-eval/SKILL.md#capture-integrity-required-for-launch-grade-adoption). 
- -## Examples - -Each example has its own README with what it demonstrates, expected output, -and runtime. See [`examples/`](./examples/). - -- [`examples/multi-shot-optimization`](./examples/multi-shot-optimization/README.md): - optimize full trajectories with held-out promotion. -- [`examples/same-sandbox-harness`](./examples/same-sandbox-harness/README.md): - run setup/build/test and evidence checks in one workspace. -- [`examples/benchmarks`](./examples/benchmarks/README.md): - benchmark adapter shape and reference wrappers. -- [`examples/auto-research-with-agent-builder`](./examples/auto-research-with-agent-builder/README.md): - closed loop — score, reflect, mutate, re-score, repeat. -- [`examples/fine-tune-with-prime-rl`](./examples/fine-tune-with-prime-rl/README.md): - RunRecord → preferences → trainer (prime-rl) → next campaign. -- [`examples/production-loop`](./examples/production-loop/README.md): - ingest prod traces + feedback, cluster failures, evolve, gate, open a PR. - -## Matrix - -`@tangle-network/agent-eval/matrix` is an N-axis cartesian runner over the -substrate types you already use — `AgentProfile` from -`@tangle-network/sandbox`, `Driver` / `Validator` from -`@tangle-network/agent-runtime`, rubric records, anything. It does not wrap -substrate types; the caller passes them in axis values, the runner iterates -the cartesian, and the aggregator returns per-axis pass / score / cost / -duration summaries. +Each example: `README.md` + a single `index.ts` runnable via `pnpm tsx`. Prints the resulting `InsightReport` to stdout. 
-```ts -import { runAgentMatrix } from '@tangle-network/agent-eval/matrix' - -const result = await runAgentMatrix({ - axes: [ - { name: 'scenario', values: scenarios.map((s) => ({ id: s.id, value: s })) }, - { name: 'profile', values: profiles.map((p) => ({ id: p.name, value: p })) }, - { name: 'thinking', values: [ - { id: 'low', value: 'low' }, { id: 'high', value: 'high' }, - ] }, - ], - reps: 3, - maxConcurrency: 4, - costCeiling: 5.0, - filter: (cell) => !(cell.axes.scenario.value.hard === 5 && cell.axes.thinking.id === 'low'), - runCell: async (cell) => runScenario(cell.axes.scenario.value, cell.axes.profile.value), -}) +--- -console.log(result.byAxis.profile) // per-profile passRate / meanScore / p90 / cost -``` +## Subpath entry points -See [`src/matrix/`](./src/matrix/) for the full surface. +| Subpath | What it gives you | +|---|---| +| `@tangle-network/agent-eval/contract` | **The headline surface.** `selfImprove`, `analyzeRuns`, `runImprovementLoop`, `runCampaign`, `runEval`, `diffRuns`, intake adapters (`fromFeedbackTable`, `fromOtelSpans`), drivers (`gepaDriver`, `evolutionaryDriver`), gates (`defaultProductionGate`, `heldOutGate`, `composeGate`), storage. 
**New code starts here.** | +| `@tangle-network/agent-eval/hosted` | Hosted-tier wire-format types + `createHostedClient` to ship eval-run events + trace spans to any orchestrator speaking the spec | +| `@tangle-network/agent-eval/adapters/otel` | `createOtelBridge` — forwards OpenTelemetry-shape spans into the hosted-tier ingest | +| `@tangle-network/agent-eval/adapters/langchain` | LangChain runnable → `Dispatch` adapter | +| `@tangle-network/agent-eval/adapters/http` | `httpDispatch` + `runDispatchServer` for distributed campaigns across machines | +| `@tangle-network/agent-eval/campaign` | Lower-level campaign primitives (storage, drivers, types) | +| `@tangle-network/agent-eval/multishot` | N-shot persona × shot matrix runner | +| `@tangle-network/agent-eval/control` | Agent control loop primitives (`runAgentControlLoop`, action policy, propose/review) | +| `@tangle-network/agent-eval/traces` | Trace stores, emitters, OTLP-JSONL replay | +| `@tangle-network/agent-eval/reporting` | Release confidence, paired stats, sequential e-values, launch reports | +| `@tangle-network/agent-eval/rl` | RL bridge — verifiable rewards, preferences, OPE, PRM, tournaments, contamination, compute curves, auto-research | +| `@tangle-network/agent-eval/matrix` | N-axis cartesian over substrate types | +| `@tangle-network/agent-eval/wire` | HTTP/RPC server + Zod schemas (same protocol the Python client speaks) | +| `@tangle-network/agent-eval/benchmarks` | Benchmark adapter contracts and reference wrappers | -## Docs +The root export remains available for backward compatibility; new code should prefer focused subpaths. Anything under `/rl`, `/pipelines`, `/meta-eval`, `/prm`, or `/builder-eval` is **only** reachable via its subpath. -Read in this order: +--- -1. [Concepts](./docs/concepts.md) — mental model, 5 min -2. [Product Eval Adoption](./docs/product-eval-adoption.md) -3. [Control Runtime](./docs/control-runtime.md) -4. 
[Feedback Trajectories](./docs/feedback-trajectories.md) -5. [Multi-Shot Optimization](./docs/multi-shot-optimization.md) -6. [Trace Analysis](./docs/trace-analysis.md) -7. [Knowledge Readiness](./docs/knowledge-readiness.md) -8. [Integration Launch Gates](./docs/integration-launch-gates.md) -9. [Wire Protocol](./docs/wire-protocol.md) — required for non-TypeScript consumers +## Concepts + design -## CLI / Wire Protocol +- [`docs/concepts.md`](./docs/concepts.md) — five types, three top-level functions, the layering rule, the wire protocol contract +- [`docs/insight-report.md`](./docs/insight-report.md) — annotated walkthrough of every section of the decision packet +- [`docs/customer-journeys.md`](./docs/customer-journeys.md) — three end-to-end journeys with code + expected output +- [`docs/adapters-observability.md`](./docs/adapters-observability.md) — composing agent-eval with LangSmith, Langfuse, Phoenix, OpenLLMetry, TraceAI +- [`docs/wire-protocol.md`](./docs/wire-protocol.md) — the HTTP/RPC contract Python (and any future language) speaks +- [`docs/hosted-ingest-spec.md`](./docs/hosted-ingest-spec.md) — the hosted-tier wire format, frozen at `2026-05-26.v1` +- [`docs/design/`](./docs/design/) — RFCs + architectural notes -```sh -npm i -g @tangle-network/agent-eval -agent-eval serve --port 5005 +The `.claude/skills/agent-eval/SKILL.md` skill ships embedded directives so LLM agents writing integration code don't reintroduce historical bug classes. + +--- + +## Hosted tier + +Wire your loop to a hosted orchestrator (ours, or your own implementation of the spec) with one config: + +```ts +await selfImprove({ + scenarios, dispatch, judges, baselineSurface, + hostedTenant: { + endpoint: 'https://intelligence.tangle.tools', + apiKey: process.env.TANGLE_API_KEY!, + tenantId: 'your-tenant', + }, +}) ``` -Python: +The substrate runs the loop in your process. Only the eval-run events + (optional) trace spans go to the orchestrator. 
Your scenarios, your judges, your raw data — never sent. Spec at [`docs/hosted-ingest-spec.md`](./docs/hosted-ingest-spec.md); reference receiver at [`examples/hosted-ingest-server/`](./examples/hosted-ingest-server/). + +--- + +## Install + run ```sh +pnpm add @tangle-network/agent-eval +# or, from Python: pip install agent-eval-rpc ``` -```py -from agent_eval_rpc import Client -client = Client() # auto-detects HTTP server, falls back to subprocess -score = await client.judge(content=output, rubric_name="anti-slop") -``` +Run an example: -TypeScript is the source of truth. Python is a thin transport client over the -generated OpenAPI schema. Schema drift is enforced impossible at release time -(version-locked CI). +```sh +pnpm tsx examples/selfimprove-quickstart/index.ts +pnpm tsx examples/customer-feedback-loop/index.ts +pnpm tsx examples/customer-otel-traces/index.ts +``` -## Development +Run the test suite: ```sh pnpm install -pnpm typecheck +pnpm build pnpm test -pnpm lint # biome -pnpm build # tsup + openapi.json ``` -## Related Packages +--- + +## Stability + versioning + +Public exports carry JSDoc stability markers visible in IDE hover + `.d.ts`: + +| Tag | Meaning | +|---|---| +| `@stable` | API frozen at this major. Breaking changes require a major bump. | +| `@experimental` | Interface may evolve before becoming `@stable`. Pin the patch version if you depend on it. | +| `@internal` | Not part of the public contract. Use the documented subpath instead. | -- [`@tangle-network/agent-runtime`](https://www.npmjs.com/package/@tangle-network/agent-runtime): - production session/runtime layer. -- [`@tangle-network/agent-knowledge`](https://www.npmjs.com/package/@tangle-network/agent-knowledge): - source-grounded knowledge bases and readiness. -- [`@tangle-network/agent-integrations`](https://www.npmjs.com/package/@tangle-network/agent-integrations): - connection, grant, capability, and integration invocation contracts. 
+[`CHANGELOG.md`](./CHANGELOG.md) tracks every release with what's new / additive / breaking. -Together: `agent-runtime` is where the agent runs; `agent-knowledge` is what -it knows; `agent-integrations` is what it can do; `agent-eval` is how it gets -better. +--- ## License -MIT +MIT. See [`LICENSE`](./LICENSE). diff --git a/clients/python/pyproject.toml b/clients/python/pyproject.toml index 175ea47b..b132b646 100644 --- a/clients/python/pyproject.toml +++ b/clients/python/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "agent-eval-rpc" -version = "0.50.0" +version = "0.50.1" description = "Python RPC client for @tangle-network/agent-eval — judge content against rubrics over HTTP or stdio RPC. Eval logic runs in the Node runtime; this package is a thin wire client." readme = "README.md" requires-python = ">=3.10" diff --git a/clients/python/src/agent_eval_rpc/__init__.py b/clients/python/src/agent_eval_rpc/__init__.py index ca3bea39..b1bc1271 100644 --- a/clients/python/src/agent_eval_rpc/__init__.py +++ b/clients/python/src/agent_eval_rpc/__init__.py @@ -58,7 +58,7 @@ try: __version__ = version("agent-eval-rpc") except PackageNotFoundError: - __version__ = "0.50.0" + __version__ = "0.50.1" __all__ = [ "Client", diff --git a/docs/concepts.md b/docs/concepts.md index 8aafb503..80e2439a 100644 --- a/docs/concepts.md +++ b/docs/concepts.md @@ -9,6 +9,26 @@ connected, or the answer lacks required sources. The package gives products a shared way to record runs, check outcomes, classify failures, compare variants, and make release decisions. +## The three top-level functions + +Everything funnels through `/contract`. Three entries, one shape coming back: + +| Function | When to call it | What you give it | What you get back | +|---|---|---|---| +| **`selfImprove()`** | You have a closed loop — scenarios, judge, agent in hand, and you want the substrate to propose better candidates + gate them. 
| scenarios, agent, judge, baseline surface | `SelfImproveResult.insight: InsightReport` + ship/hold verdict + winner surface | +| **`analyzeRuns()`** | You have observed runs (production traces, an approve/reject corpus, a CSV gold set) and want the same rigor packet without invoking an agent. | `RunRecord[]` + optional flags | `InsightReport` | +| **Intake adapters** (`fromFeedbackTable`, `fromOtelSpans`) | Your data isn't already in `RunRecord` shape — it's in Obsidian, Sheets, an OTel collector, etc. | source-specific input | `RunRecord[]` ready to pipe into `analyzeRuns()` | + +The three customer maturity stages — logs only → ratings → closed loop — map exactly to the three functions. See [`customer-journeys.md`](./customer-journeys.md) for the runnable walkthroughs. + +The shape of the answer — `InsightReport` — is identical across all three paths. Distributional summary, paired-bootstrap lift CI, judge stats, inter-rater agreement, cost-quality Pareto, failure clusters, contamination check, outcome correlation, release axes, and a ranked recommendations array. Walked through section-by-section in [`insight-report.md`](./insight-report.md). + +## The layering rule + +`agent-eval` is the **substrate** at the bottom of the Tangle agent stack. `agent-runtime` and `agent-knowledge` depend on it; `agent-eval` MUST NOT import from either. Primitives that "feel like" they belong in a consumer but are actually substrate-shaped (validator verdicts, run records, scenarios, judge scores) live here. Primitives that genuinely require a running agent loop (`ValidationCtx` with iteration + signal + traceEmitter, sandbox `AgentRunSpec`) stay in `agent-runtime`. + +The test: *does this concept make sense WITHOUT a running agent loop?* If yes, it's substrate. If no, it's runtime. The full rule is in [`/CLAUDE.md`](../CLAUDE.md#repo-layering--this-package-is-the-substrate). 
+ ## Main Objects | Thing | What it is | One-line example | diff --git a/docs/customer-journeys.md b/docs/customer-journeys.md new file mode 100644 index 00000000..6bfd8956 --- /dev/null +++ b/docs/customer-journeys.md @@ -0,0 +1,208 @@ +# Customer journeys + +Three end-to-end journeys covering the surface of `@tangle-network/agent-eval`. Each one is a runnable example under `examples/` — clone the repo and `pnpm tsx examples/<name>/index.ts` to see the actual output. + +The three journeys map to three customer-maturity stages: + +1. **Logs but no eval discipline** → [Production traces journey](#1-production-traces-journey-customer-otel-traces) +2. **Ratings but no closed loop** → [Feedback corpus journey](#2-feedback-corpus-journey-customer-feedback-loop) +3. **Scenarios, judge, agent — full closed loop** → [Closed-loop journey](#3-closed-loop-journey-selfimprove-quickstart) + +Each section: what the customer has, what they want, the code, what the report looks like. + +--- + +## 1. Production traces journey — `customer-otel-traces` + +**The customer:** an agentic GTM-as-a-service company. Multiple agent steps in prod (social media posting, image generation, translation). OTel observability piped to their collector. Doesn't run formal evals. CTO hand-rolled their tracing. + +**The frustration:** "Which step is unreliable? What's our cost-quality profile? Where do we fix next?" They have the data; they don't have the answer. + +**What they need from agent-eval:** day-1 analysis of their existing logs. No scenarios, no judges, no closed loop. Just turn the trace stream into a decision packet. 
+ +### The code + +```ts +import { analyzeRuns, fromOtelSpans } from '@tangle-network/agent-eval/contract' + +const runs = fromOtelSpans({ spans: yourOtelStream }) +const report = await analyzeRuns({ runs }) + +// report.failureClusters → root causes +// report.costQuality.pareto → cost-vs-quality scatter +// report.composite → distribution +// report.recommendations → top-3 actions +``` + +### What the report shows + +``` +Runs analyzed: 40 +Composite mean: 0.721 (p50: 0.717, p95: 0.925, stddev: 0.210) +Cost mean: $0.103 (p95: $0.131) + +── Failures ── +6 runs with status=ERROR or failureMode set: + tool.search (3x) + agent.turn (3x) + +── Cost-quality Pareto ── +1 candidate(s) plotted; 1 on the frontier + otel-default: cost=$0.103 quality=0.721 (frontier) + +── Recommendations ── +[medium] expand-corpus — Mean composite 0.721 has room +``` + +### Next steps for this customer + +1. Wire an `AnalystRegistry` to cluster the 6 failures by root cause via LLM analysis. +2. Add `outcomeSignal` once they have downstream conversion / engagement / post-engagement data, and the report fits a reward model showing whether their score predicts the customer outcome. +3. Once they identify a step worth optimizing (translation, say), graduate to journey #3 — wrap that step in a `Dispatch` and call `selfImprove()`. + +**Runnable:** [`examples/customer-otel-traces/`](../examples/customer-otel-traces/) + +--- + +## 2. Feedback corpus journey — `customer-feedback-loop` + +**The customer:** a research-validation team. A GitHub Action fires `claude -p` against the next claim, writes the research output to Obsidian. Three reviewers (Alice, Bob, Carol) tag results `#approved` or `#rejected`. Outputs feed a knowledge base. Knowledge feeds content. Content feeds engagement. The founder wants more engagement faster. + +**The frustration:** "We disagree on what's good. We don't know if our 'good' actually drives engagement. Reviewing every claim is slow." 
+ +**What they need from agent-eval:** turn the approve/reject corpus into actionable signal: +- Where do reviewers disagree? (triage list) +- Can we synthesize each reviewer's taste into an LLM judge? (auto-grade) +- Does the taste actually predict downstream engagement? (close the loop) + +### The code + +```ts +import { analyzeRuns, fromFeedbackTable } from '@tangle-network/agent-eval/contract' + +// 1. Parse Obsidian #approved / #rejected tags into a flat table: +const ratings = parseObsidianVault('./research-vault') +// [{ runId: 'claim-1', rater: 'alice', rating: true }, ...] + +// 2. Pipe through the adapter: +const { runs, raterScores } = fromFeedbackTable({ ratings }) + +// 3. Analyze: +const report = await analyzeRuns({ + runs, + raterScores, + // Optional: close the loop with engagement data once you have it. + outcomeSignal: { metric: 'engagement_rate', valueByRunId: enrichedFromProd }, +}) + +// report.interRater.disagreementCases → top 20 claims worth a meeting +// report.outcomeCorrelation → does team taste predict engagement? +// report.recommendations → action list +``` + +### What the report shows + +``` +Runs analyzed: 30 +Composite mean: 0.756 (approve rate ~76%) + +── Inter-rater agreement ── +Raters: 3 (alice, bob, carol) +Jointly rated runs: 30 +Pairwise pearson κ: + alice::bob 0.53 + alice::carol 0.55 + bob::carol 0.21 +Mean κ: 0.43 + +── Top 5 disagreement cases ── + claim-1 range=1.00 ratings: alice=0, bob=0, carol=1 + claim-7 range=1.00 ratings: alice=0, bob=1, carol=0 + ... + +── Recommendations ── +[high] recalibrate — Inter-rater agreement κ=0.43 is below 0.5 + Raters disagree on what 'good' looks like. Refine the rubric or triage the disagreement cases. +``` + +### Next steps for this customer + +1. **Triage meeting on the disagreement cases.** Mean κ=0.43 means the rubric is ambiguous; clarify it on the cases that split. +2. 
**Calibrate one LLM judge per reviewer.** Each reviewer's history is the gold signal — run the substrate primitive `calibrateJudge` against `raterScores` filtered to that reviewer. +3. **Add engagement as `outcomeSignal`** once the content downstream is instrumented. The `outcomeCorrelation` section tells the team whether their taste predicts the founder's engagement goal — and if not, the linear reward model says how to retarget. +4. **Graduate to journey #3** — wrap the research-generation `claude -p` call as a `Dispatch`, use the calibrated judges, run `selfImprove()` nightly. Open a PR against the GitHub Action when the holdout approval rate beats baseline. + +**Runnable:** [`examples/customer-feedback-loop/`](../examples/customer-feedback-loop/) + +--- + +## 3. Closed-loop journey — `selfimprove-quickstart` + +**The customer:** a team with a scenario corpus, a judge, and an agent. Wants to improve the prompt under statistical confidence — propose better candidates, gate on holdout lift, ship the winner. + +**The frustration:** "We can run an A/B by hand but we don't know if the improvement is real. We don't have time to run paired bootstrap by hand. We want a function that decides." + +**What they need from agent-eval:** the closed loop in one function — propose, score, gate, ship — with the full rigor packet on the way out. + +### The code + +```ts +import { selfImprove } from '@tangle-network/agent-eval/contract' + +const result = await selfImprove({ + scenarios, + agent: async (surface, scenario) => + await myAgent.run({ systemPrompt: (surface as { systemPrompt: string }).systemPrompt, scenario }), + judge: { + name: 'rubric', + dimensions: [{ key: 'clarity', weight: 1 }, { key: 'concision', weight: 1 }], + score: async ({ artifact }) => myJudgeFn(artifact), + }, + baselineSurface: { kind: 'prompt', systemPrompt: 'You write marketing copy...' }, + budget: { generations: 3, populationSize: 2 }, +}) + +result.gateDecision // 'ship' | 'hold' | ... 
+result.insight // full decision packet +``` + +### What the report shows + +``` +═══ selfImprove() decision packet ═══ + +Gate decision: ship +Raw lift: +0.194 + +── Statistical lift (paired bootstrap) ── +delta: +0.254 +CI95: [0.254, 0.254] +pValue: 1.0000 +Cohen's d: 0.00 +MDE @ 80% power: 2.802 +required n at observed effect: 244 + +── Recommendations ── +[critical] ship — Ship — lift 0.254 (95% CI 0.254..0.254) +``` + +### Next steps for this customer + +1. **Ship the winner.** Either accept `result.winner.surface` programmatically and roll it out, or pass `autoOnPromote: 'pr'` + a GitHub repo to have selfImprove open a PR for you. +2. **Wire `hostedTenant`** to ship the decision packet to a dashboard (the hosted Intelligence orchestrator, or your own implementation of the wire spec). +3. **Add `canaryScenarios`** to guard against the holdout leaking into the candidate prompt. +4. **Add `outcomeSignal`** in `analyzeRuns()` for any post-deploy reruns to verify the predicted lift actually shows up in real outcomes. + +**Runnable:** [`examples/selfimprove-quickstart/`](../examples/selfimprove-quickstart/) + +--- + +## How the three journeys compose + +Journey #1 + #2 + #3 are **maturity stages**, not exclusive products. A team typically: + +1. Starts with **#1** (analyze production logs) to find what's broken. +2. Adds **#2** (feedback corpus) once they have a sense of where to improve, to calibrate what "good" means. +3. Graduates to **#3** (closed loop) once they have scenarios + judges, to automate the improvement. + +Same substrate, same `InsightReport` shape, no rip-and-replace between stages. The data you collect in #1 informs the scenarios you derive in #2 which feed the loop in #3. diff --git a/docs/insight-report.md b/docs/insight-report.md new file mode 100644 index 00000000..f57f86ea --- /dev/null +++ b/docs/insight-report.md @@ -0,0 +1,337 @@ +# `InsightReport` — the decision packet + +The single shape every analysis call returns. 
`selfImprove()` embeds it in `SelfImproveResult.insight`; `analyzeRuns()` returns it directly. The hosted-tier wire format carries it on `EvalRunEvent.insightReport?`. + +Every section is **opt-in based on what your data supports** — the function never invents signal. If your runs don't carry judge scores, `judges` is empty. If there's no baseline/candidate split, `lift` is undefined. The shape is consistent; population is honest. + +This page walks every section with a realistic (synthetic) example and explains how to act on it. + +--- + +## At a glance + +```ts +interface InsightReport { + n: number // runs analyzed + composite: ScalarDistribution // always + perDimension: Record<string, ScalarDistribution> // when judgeScores carry dimensions + costQuality: { cost: ScalarDistribution; pareto: ParetoFigureSpec } // always + judges: Record<string, { n: number; meanScore: number }> // when runs carry judge scores + interRater?: InterRaterInsight // when raterScores supplied + lift?: LiftInsight // when baseline + candidate present + failureClusters?: FailureClusterInsight // when AnalystRegistry wired + contamination?: ContaminationInsight // when canaryScenarios supplied + outcomeCorrelation?: OutcomeCorrelationInsight // when outcomeSignal supplied + release: ReleaseSummary // always + recommendations: Recommendation[] // always — read this FIRST +} +``` + +--- + +## `n` + `composite` + `perDimension` — distributional summary + +Always present. The basic "where are my numbers" view. + +```jsonc +{ + "n": 30, + "composite": { + "n": 30, + "mean": 0.683, "p50": 0.667, "p95": 1.000, "stddev": 0.231, + "min": 0.0, "max": 1.0, + "histogram": [ + { "lo": 0.0, "hi": 0.083, "count": 5 }, + { "lo": 0.083, "hi": 0.167, "count": 0 }, + // ...12 bins by default + ] + }, + "perDimension": { + "clarity": { "mean": 0.72, "p50": 0.75, "p95": 0.95, "stddev": 0.18, /* ... */ }, + "concision": { "mean": 0.65, "p50": 0.68, "p95": 0.88, "stddev": 0.21, /* ... */ } + } +} +``` + +**Read first:** the `composite.mean`. 
If it's < 0.5, your agent has a ceiling problem, not a tuning problem. + +**Read next:** `perDimension`. If `clarity` is high but `concision` is low, your prompts get the right ideas in too many words — different fix than "wrong ideas." + +**Use the histogram for:** finding bimodal failure modes. A bin with `count > 0` near zero and another > 0 near 1 means your agent has two distinct behaviors, not one noisy one. + +--- + +## `costQuality` — cost-vs-quality Pareto + +Always present. `cost.histogram` is the per-run cost distribution; `pareto` is the substrate's `ParetoFigureSpec`. + +```jsonc +{ + "costQuality": { + "cost": { + "mean": 0.024, "p95": 0.041, + "histogram": [/* */] + }, + "pareto": { + "kind": "pareto-cost-quality", + "split": "holdout", + "axes": { "x": "costUsd", "y": "score" }, + "points": [ + { "candidateId": "baseline", "cost": 0.018, "quality": 0.58, "n": 20, "onFrontier": true }, + { "candidateId": "winner", "cost": 0.027, "quality": 0.65, "n": 20, "onFrontier": true } + ] + } + } +} +``` + +**Use this when:** comparing prompts, models, or candidate surfaces. The Pareto frontier is your menu of "best you can do at each cost level." + +**Render with:** any chart library — `points` is plain JSON. Hosted-tier dashboards render this as a scatter with the frontier highlighted. + +--- + +## `judges` — per-judge mean + +Populated when run records carry `outcome.judgeScores`. + +```jsonc +{ + "judges": { + "domain-expert": { "n": 30, "meanScore": 0.71 }, + "helpfulness-llm": { "n": 30, "meanScore": 0.62 } + } +} +``` + +The substrate's full judge-calibration suite (positional bias, self-preference, verbosity bias) lives in `/reporting` and operates on **paired-by-condition** inputs that `analyzeRuns` doesn't synthesize from raw `RunRecord[]`. Wire them yourself when you have the paired data; the report's `judges` map is the corpus-level slice. + +**Use this when:** comparing multiple judges over the same corpus. 
A big gap between two judges' means is the first signal that one of them is mis-calibrated. + +--- + +## `interRater` — multi-rater agreement + disagreement triage + +Populated when `analyzeRuns({ raterScores })` is supplied — typically via `fromFeedbackTable()`. + +```jsonc +{ + "interRater": { + "raters": 3, + "jointlyRated": 30, + "kappa": 0.71, + "perPair": { + "alice::bob": 0.78, + "alice::carol": 0.65, + "bob::carol": 0.69 + }, + "disagreementCases": [ + { "runId": "claim-7", "range": 1.00, + "ratings": [{"rater":"alice","score":1},{"rater":"bob","score":1},{"rater":"carol","score":0}] }, + { "runId": "claim-13", "range": 1.00, + "ratings": [{"rater":"alice","score":0},{"rater":"bob","score":0},{"rater":"carol","score":1}] } + // ...top 20 by range + ] + } +} +``` + +**Read first:** the mean `kappa`. < 0.5 means raters disagree on what "good" looks like — surface the disagreement cases at the next review meeting. + +**Use this when:** building per-rater LLM judges. Each rater's individual scores are the gold signal you calibrate against. Once a calibrated LLM matches the human ≥85%, you can auto-grade and escalate only the disagreement cases. + +--- + +## `lift` — paired-bootstrap statistical lift + +Populated when baseline + candidate runs are present (auto-detected from two distinct `candidateId`s, or explicit via `baselineCandidateId` + `candidateCandidateId`). + +```jsonc +{ + "lift": { + "baselineMean": 0.58, + "candidateMean": 0.65, + "delta": 0.07, + "ci95": [0.04, 0.10], // bootstrap CI on the delta + "pValue": 0.0008, // paired t-test + "n": 40, // paired observations + "cohensD": 0.41, + "mde": 0.06, // min detectable effect at current n, 80% power + "requiredN": 38 // n needed for observed delta at 80% power + } +} +``` + +**Decision rule:** +- `ci95[0] > threshold` → **SHIP.** Lower bound above your delta threshold means the lift is real at 95% confidence. 
+- `ci95[0] ≤ threshold < ci95[1]` → **INCONCLUSIVE.** Expand the corpus or wait for more data. +- `ci95[1] ≤ threshold` → **HOLD.** No evidence the candidate is better. + +The `recommendations` array surfaces exactly this decision (`kind: 'ship' | 'hold' | 'expand-corpus'`) — that's what consumers should read. + +**Why bootstrap, not t-test alone:** paired bootstrap is distribution-free. Your judge scores are bounded in [0,1] and almost never normal; the bootstrap CI is the honest one. + +--- + +## `failureClusters` — grouped failure modes + +Populated when an `AnalystRegistry` is passed via `analyzeRuns({ analyst })`. The substrate runs each failed run through the registered analysts and groups findings by `analyst_id` / `area`. + +```jsonc +{ + "failureClusters": { + "totalFailures": 11, + "clusters": [ + { "id": "off-topic-drift", "name": "off-topic-drift", + "share": 0.45, "exemplars": ["run-12", "run-19", "run-33"] }, + { "id": "over-confidence", "name": "over-confidence", + "share": 0.27, "exemplars": ["run-3", "run-21"] }, + { "id": "format-mismatch", "name": "format-mismatch", + "share": 0.18, "exemplars": ["run-41", "run-44"] } + ] + } +} +``` + +**Read first:** the top cluster's `share`. If one cluster is > 40% of failures, fix that pattern before doing anything else. + +**Use this when:** triaging a regression. Failure clusters tell you "fix this kind of thing first." + +**To wire it:** register analysts in `AnalystRegistry`. See `src/analyst/registry.ts` and `src/analyst/kinds.ts` for the four built-in kinds (`failure-mode`, `improvement`, `knowledge-gap`, `knowledge-poisoning`). + +--- + +## `contamination` — canary check + +Populated when canary scenarios are passed via `analyzeRuns({ canaryScenarios })`. Each canary carries a sentinel string the agent should never emit; the report counts leaks. 
+ +```jsonc +{ + "contamination": { + "leaks": 0, + "holdoutAuditPassed": true, + "details": [] + } +} +``` + +When `leaks > 0`: + +```jsonc +{ + "contamination": { + "leaks": 2, + "holdoutAuditPassed": false, + "details": [ + { "runId": "run-12", "canary": "xyz-secret-canary-123", "matched": "...the secret xyz-secret-canary-123 says..." } + ] + } +} +``` + +**When this fails:** your holdout corpus has leaked into training context. The `lift` number is **unreliable**. Investigate before shipping anything. + +--- + +## `outcomeCorrelation` — closing the loop on real outcomes + +Populated when `outcomeSignal: { metric, valueByRunId }` is supplied. + +```jsonc +{ + "outcomeCorrelation": { + "metric": "engagement_rate", + "n": 80, + "pearson": 0.72, // linear correlation + "spearman": 0.69, // rank correlation (robust to monotonic nonlinearity) + "rewardModel": { + "intercept": 0.04, + "slope": 1.93, + "r2": 0.52 // share of outcome variance the judge explains + } + } +} +``` + +This is the layer that says **"does my judge's taste actually predict the metric the business cares about?"** + +**Read first:** `spearman`. If it's < 0.3 in absolute value, your judges are scoring something different from what wins downstream. Refit the judges (use the customer's downstream signal as gold) or change the rubric. + +**The reward model** is the simple linear `y = intercept + slope * composite`. Use it to: +- Predict the engagement of a new run from its composite score alone. +- Set a `composite` threshold for "must beat X to ship" based on the engagement equivalent. + +--- + +## `release` — pass/warn/fail axes + +Always present. Roll-up across three axes — quality lift, contamination, composite distribution. 
+ +```jsonc +{ + "release": { + "status": "pass", + "axes": [ + { "name": "quality-lift", "status": "pass", + "detail": "delta=0.070, CI95=[0.040, 0.100], n=40" }, + { "name": "contamination", "status": "pass", + "detail": "0 canary leak(s)" }, + { "name": "composite-distribution", "status": "pass", + "detail": "mean=0.683, p50=0.667, p95=1.000 over n=30" } + ], + "issues": [] + } +} +``` + +Overall `status` is `fail` if any axis fails; `warn` if any warn; `pass` otherwise. + +**Use this when:** wiring agent-eval into CI. A `status === 'pass'` from `analyzeRuns` on the candidate vs baseline is your green-light gate. + +--- + +## `recommendations` — the actionable layer + +Always present. Read this first. + +```jsonc +{ + "recommendations": [ + { "priority": "critical", "kind": "ship", + "title": "Ship — lift 0.070 (95% CI 0.040..0.100)", + "detail": "Holdout lift exceeds threshold 0.02 with 95% bootstrap confidence (n=40, p=0.0008, d=0.41).", + "evidencePath": "lift" }, + { "priority": "high", "kind": "investigate", + "title": "Top failure cluster: off-topic-drift (45% of failures)", + "detail": "11 runs failed. The largest cluster groups 3 exemplars under 'off-topic-drift'.", + "evidencePath": "failureClusters.clusters[0]" } + ] +} +``` + +| `kind` | When emitted | +|---|---| +| `ship` | lift CI lower bound > threshold | +| `hold` | lift CI upper bound ≤ threshold | +| `expand-corpus` | lift CI straddles threshold — more data needed | +| `fix` | canary contamination detected | +| `recalibrate` | inter-rater κ < 0.5, OR outcome correlation < 0.3 | +| `investigate` | top failure cluster > some-share | + +`evidencePath` points back into the report (`"lift"`, `"contamination"`, `"failureClusters.clusters[0]"`) so a UI can deep-link from each recommendation to its evidence. 
+ +--- + +## How `analyzeRuns` populates each section + +| Section | Required input | +|---|---| +| `composite`, `perDimension`, `costQuality`, `release`, `recommendations` | `runs` | +| `judges` | `runs` with `outcome.judgeScores` | +| `interRater` | `raterScores` (≥ 2 raters jointly rated some runs) | +| `lift` | two distinct `candidateId`s in `runs` (or explicit baseline/candidate ids) | +| `failureClusters` | `analyst` registry passed in | +| `contamination` | `canaryScenarios` passed in | +| `outcomeCorrelation` | `outcomeSignal` passed in | + +All sections beyond the always-present ones are `T | undefined`, never empty objects — except `judges`, which is always present and is simply an empty map when no run carries judge scores. If a section is missing, your inputs didn't support it — the report is honest about that. diff --git a/examples/customer-feedback-loop/README.md b/examples/customer-feedback-loop/README.md new file mode 100644 index 00000000..8c278b5e --- /dev/null +++ b/examples/customer-feedback-loop/README.md @@ -0,0 +1,55 @@ +# Customer feedback loop — multi-rater approve/reject corpus → decision packet + +The journey for teams who already review AI outputs by hand: an Obsidian vault with `#approved` / `#rejected` tags, a Google Sheet of ratings, a Postgres feedback table. You have the corpus; you want to **compress that taste into LLM judges**, find where raters disagree, and (eventually) close the loop. + +```sh +pnpm tsx examples/customer-feedback-loop/index.ts +``` + +## What this example does + +Synthesises a realistic 30-claim research corpus with three reviewers (Alice, Bob, Carol). Reviewers agree most of the time but split 50/50 on ~15% of claims. Then: + +1. Pipes the raw `(runId, rater, rating)` rows through `fromFeedbackTable()` to get `RunRecord[] + raterScores`. +2. Calls `analyzeRuns({ runs, raterScores })`. +3. Prints the decision packet — distributional summary, inter-rater agreement, the disagreement triage list, and the recommendations. 
+ +## What you'll see + +``` +═══ Customer feedback corpus — decision packet ═══ + +Runs analyzed: 30 +Composite mean: 0.683 (p50: 0.667, p95: 1.000) +Approve rate: ~68% + +── Inter-rater agreement ── +Raters: 3 (alice, bob, carol) +Jointly rated runs: 30 +Pairwise pearson κ: + alice::bob: 0.78 + alice::carol: 0.65 + bob::carol: 0.69 +Mean κ: 0.71 + +── Top 5 disagreement cases (worth a triage meeting) ── + claim-7 range=1.00 ratings: alice=1, bob=1, carol=0 + claim-13 range=1.00 ratings: alice=0, bob=0, carol=1 + ... + +── Recommendations ── +[medium] recalibrate — Top inter-rater range cases worth a review + Surface the 5 claims with highest disagreement at the next triage meeting. + +═══ end ═══ +``` + +## What to do with the output + +1. **Skim the disagreement cases first.** They're your team's calibration boundary — where the rubric is ambiguous. +2. **Capture each member's taste.** The per-rater scores let you train a calibrated LLM-as-judge per member; once the LLM-judge agrees with the human ≥85% of the time, you can auto-grade in real time and only escalate close calls. +3. **Close the loop.** Once you have judges, wrap the underlying research generation in a `Dispatch` and call `selfImprove()` — propose better research prompts gated on holdout approval rate. + +## Files + +- `index.ts` — the runnable script diff --git a/examples/customer-feedback-loop/index.ts b/examples/customer-feedback-loop/index.ts new file mode 100644 index 00000000..772735ec --- /dev/null +++ b/examples/customer-feedback-loop/index.ts @@ -0,0 +1,105 @@ +/** + * Customer feedback loop — multi-rater approve/reject corpus → decision packet. + * + * Run with: pnpm tsx examples/customer-feedback-loop/index.ts + * + * Synthesises a 30-claim research corpus reviewed by 3 raters with realistic + * agreement noise. Pipes through fromFeedbackTable() + analyzeRuns(), then + * prints the decision packet — focus on the inter-rater agreement section + * and the top disagreement triage list. 
+ */ + +import { + analyzeRuns, + fromFeedbackTable, + type FeedbackTableRow, +} from '../../src/contract' + +const N_CLAIMS = 30 +const RATERS = ['alice', 'bob', 'carol'] + +// Synthesise a corpus where raters mostly agree but split on ~15% of claims. +function synthesise(): FeedbackTableRow[] { + const rows: FeedbackTableRow[] = [] + for (let i = 0; i < N_CLAIMS; i++) { + const runId = `claim-${i + 1}` + // Ground-truth quality: 70% are clearly good, 15% borderline (disagreement), + // 15% clearly bad. + const tier = i % 7 === 0 ? 'borderline' : i % 6 === 0 ? 'bad' : 'good' + for (const rater of RATERS) { + let approve: boolean + if (tier === 'good') { + approve = pseudoRand(runId + rater) > 0.1 // 90% approve + } else if (tier === 'bad') { + approve = pseudoRand(runId + rater) > 0.85 // 15% approve + } else { + // Borderline — rater-specific bias: alice = pickier, carol = lenient. + const bias = rater === 'alice' ? 0.7 : rater === 'carol' ? 0.3 : 0.5 + approve = pseudoRand(runId + rater) > bias + } + rows.push({ runId, rater, rating: approve }) + } + } + return rows +} + +function pseudoRand(s: string): number { + let h = 2166136261 >>> 0 + for (let i = 0; i < s.length; i++) { + h ^= s.charCodeAt(i) + h = Math.imul(h, 16777619) >>> 0 + } + return (h >>> 0) / 0xffffffff +} + +async function main() { + const rows = synthesise() + const { runs, raterScores } = fromFeedbackTable({ ratings: rows }) + const report = await analyzeRuns({ runs, raterScores }) + + console.log('═══ Customer feedback corpus — decision packet ═══') + console.log() + console.log(`Runs analyzed: ${report.n}`) + console.log( + `Composite mean: ${report.composite.mean.toFixed(3)} ` + + `(p50: ${report.composite.p50.toFixed(3)}, p95: ${report.composite.p95.toFixed(3)})`, + ) + const approveRate = (report.composite.mean * 100).toFixed(0) + console.log(`Approve rate: ~${approveRate}%`) + console.log() + + if (report.interRater) { + const ir = report.interRater + console.log('── Inter-rater 
agreement ──') + console.log(`Raters: ${ir.raters} (${RATERS.join(', ')})`) + console.log(`Jointly rated runs: ${ir.jointlyRated}`) + console.log('Pairwise pearson κ:') + for (const [pair, k] of Object.entries(ir.perPair)) { + console.log(` ${pair.padEnd(14)} ${k.toFixed(2)}`) + } + console.log(`Mean κ: ${ir.kappa.toFixed(2)}`) + console.log() + + console.log('── Top 5 disagreement cases (worth a triage meeting) ──') + for (const c of ir.disagreementCases.slice(0, 5)) { + const ratingStr = c.ratings + .map((r) => `${r.rater}=${r.score.toFixed(0)}`) + .join(', ') + console.log(` ${c.runId.padEnd(10)} range=${c.range.toFixed(2)} ratings: ${ratingStr}`) + } + console.log() + } + + console.log('── Recommendations ──') + for (const r of report.recommendations) { + console.log(`[${r.priority}] ${r.kind} — ${r.title}`) + console.log(` ${r.detail}`) + } + console.log() + console.log('═══ end ═══') +} + +main().catch((err) => { + console.error(err) + process.exit(1) +}) diff --git a/examples/customer-otel-traces/README.md b/examples/customer-otel-traces/README.md new file mode 100644 index 00000000..deeec320 --- /dev/null +++ b/examples/customer-otel-traces/README.md @@ -0,0 +1,54 @@ +# Customer OTel traces — production logs → decision packet + +The journey for teams running agents in prod with observability but **no eval discipline yet**. You have OTel spans piped to your collector. You want to know: which agent steps are unreliable, what's breaking and where, what's the cost-quality profile, where to fix next. + +```sh +pnpm tsx examples/customer-otel-traces/index.ts +``` + +## What this example does + +Synthesises 40 production runs as OTel `TraceSpanEvent[]`. Some succeed; some fail. Each carries the usual GenAI attributes — `tangle.model`, `tangle.cost.usd`, `gen_ai.usage.{input,output}_tokens`, `tangle.score`. Failed runs have `status.code: 'ERROR'`. Then: + +1. Pipes the spans through `fromOtelSpans()` to get `RunRecord[]`. +2. Calls `analyzeRuns({ runs })`. +3. 
Prints the decision packet — composite + cost distribution, Pareto, failure surfacing, recommendations. + +No agent invocation, no scenarios, no closed loop. **Just analysis of what already happened.** This is the day-1 product for teams without eval discipline. + +## What you'll see + +``` +═══ Production OTel corpus — decision packet ═══ + +Runs analyzed: 40 +Composite mean: 0.638 (p50: 0.715, p95: 0.910, stddev: 0.252) +Cost mean: $0.084 (p95: $0.142) + +── Failures ── +6 runs with status=ERROR or failureMode set: + agent.turn (5x) + tool.search (1x) + +── Cost-quality Pareto ── +1 candidate(s) plotted; 1 on the frontier + otel-default: cost=$0.084 quality=0.638 (frontier) + +── Recommendations ── +[medium] expand-corpus — Mean composite 0.638 has room + Composite distribution sits below 0.80; investigate the 6 failures and + the lower tail of the histogram before claiming the agent is healthy. + +═══ end ═══ +``` + +## What to do with the output + +1. **Read the failure surface first.** Which span names appear repeatedly under `status.code: ERROR`? That's where to dig. +2. **Inspect the Pareto.** If multiple candidates appear (different models / prompts in prod), the frontier tells you which is cost-optimal at each quality level. +3. **Wire an `AnalystRegistry`.** Pass `{ analyst }` to `analyzeRuns()` to cluster failures by root cause via LLM-driven analysis. The report's `failureClusters` section fills in. +4. **Add `outcomeSignal`.** When you have downstream engagement / approval / pass-rate data, pass it as `outcomeSignal` and the report surfaces a Pearson + Spearman correlation between the judge composite and the real-world outcome, plus a fitted linear reward model. That's how you find out if your judge's tastes match the customer's. 
+ +## Files + +- `index.ts` — the runnable script diff --git a/examples/customer-otel-traces/index.ts b/examples/customer-otel-traces/index.ts new file mode 100644 index 00000000..b706b3ba --- /dev/null +++ b/examples/customer-otel-traces/index.ts @@ -0,0 +1,127 @@ +/** + * Customer OTel traces — production logs → decision packet. + * + * Run with: pnpm tsx examples/customer-otel-traces/index.ts + * + * Synthesises 40 production agent runs as OTel `TraceSpanEvent[]`, runs them + * through `fromOtelSpans()` to get RunRecord[], then calls analyzeRuns(). + * No closed loop required — this is the day-1 path for teams with logs but + * no eval discipline. + */ + +import { analyzeRuns, fromOtelSpans } from '../../src/contract' +import type { TraceSpanEvent } from '../../src/hosted/types' + +const N_RUNS = 40 + +function synthesise(): TraceSpanEvent[] { + const spans: TraceSpanEvent[] = [] + for (let i = 0; i < N_RUNS; i++) { + const runId = `run-${i + 1}` + const failed = i % 7 === 0 // ~14% failure rate + const baseTime = 1_700_000_000_000_000_000 + i * 1_000_000_000 + const cost = 0.05 + (pseudoRand(runId) * 0.12) // $0.05 .. $0.17 + const score = failed ? 0.2 + pseudoRand(runId + 's') * 0.2 : 0.6 + pseudoRand(runId + 's') * 0.35 + const inputTokens = 800 + Math.floor(pseudoRand(runId + 'i') * 1400) + const outputTokens = 200 + Math.floor(pseudoRand(runId + 'o') * 600) + + spans.push({ + traceId: `trace-${i}`, + spanId: `span-root-${i}`, + name: failed && i % 14 === 0 ? 'tool.search' : 'agent.turn', + startTimeUnixNano: baseTime, + endTimeUnixNano: baseTime + Math.floor(pseudoRand(runId + 'd') * 5_000_000_000), + attributes: { + 'tangle.runId': runId, + 'tangle.model': 'gpt-4o@2025-04-15', + 'tangle.cost.usd': cost, + 'gen_ai.usage.input_tokens': inputTokens, + 'gen_ai.usage.output_tokens': outputTokens, + 'tangle.score': score, + }, + status: { code: failed ? 
'ERROR' : 'OK' }, + }) + } + return spans +} + +function pseudoRand(s: string): number { + let h = 2166136261 >>> 0 + for (let i = 0; i < s.length; i++) { + h ^= s.charCodeAt(i) + h = Math.imul(h, 16777619) >>> 0 + } + return (h >>> 0) / 0xffffffff +} + +async function main() { + const spans = synthesise() + const runs = fromOtelSpans({ spans }) + const report = await analyzeRuns({ runs }) + + console.log('═══ Production OTel corpus — decision packet ═══') + console.log() + console.log(`Runs analyzed: ${report.n}`) + console.log( + `Composite mean: ${report.composite.mean.toFixed(3)} ` + + `(p50: ${report.composite.p50.toFixed(3)}, ` + + `p95: ${report.composite.p95.toFixed(3)}, ` + + `stddev: ${report.composite.stddev.toFixed(3)})`, + ) + console.log( + `Cost mean: $${report.costQuality.cost.mean.toFixed(3)} ` + + `(p95: $${report.costQuality.cost.p95.toFixed(3)})`, + ) + console.log() + + // Failure surface + const failureCount = runs.filter((r) => r.failureMode !== undefined).length + if (failureCount > 0) { + console.log('── Failures ──') + const byName = new Map() + for (const r of runs) { + if (r.failureMode) byName.set(r.failureMode, (byName.get(r.failureMode) ?? 0) + 1) + } + console.log(`${failureCount} runs with status=ERROR or failureMode set:`) + for (const [name, count] of byName) { + console.log(` ${name.padEnd(12)} (${count}x)`) + } + console.log() + } + + console.log('── Cost-quality Pareto ──') + console.log( + `${report.costQuality.pareto.points.length} candidate(s) plotted; ` + + `${report.costQuality.pareto.points.filter((p) => p.onFrontier).length} on the frontier`, + ) + for (const p of report.costQuality.pareto.points) { + console.log( + ` ${p.candidateId}: cost=$${p.cost.toFixed(3)} quality=${p.quality.toFixed(3)}` + + `${p.onFrontier ? 
' (frontier)' : ''}`, + ) + } + console.log() + + console.log('── Recommendations ──') + if (report.recommendations.length === 0) { + console.log( + `[medium] expand-corpus — Mean composite ${report.composite.mean.toFixed(3)} has room`, + ) + console.log( + ' Composite distribution sits below 0.80; investigate the failures and the lower tail', + ) + console.log(' of the histogram before claiming the agent is healthy.') + } else { + for (const r of report.recommendations) { + console.log(`[${r.priority}] ${r.kind} — ${r.title}`) + console.log(` ${r.detail}`) + } + } + console.log() + console.log('═══ end ═══') +} + +main().catch((err) => { + console.error(err) + process.exit(1) +}) diff --git a/examples/selfimprove-quickstart/README.md b/examples/selfimprove-quickstart/README.md new file mode 100644 index 00000000..ee3c0613 --- /dev/null +++ b/examples/selfimprove-quickstart/README.md @@ -0,0 +1,61 @@ +# `selfImprove()` quickstart + +The closed-loop journey. You have a prompt, a set of scenarios, a judge, and an agent. You want the substrate to propose better prompts, gate them on statistical lift, and tell you which one to ship. + +```sh +pnpm tsx examples/selfimprove-quickstart/index.ts +``` + +## What this example does + +1. Defines a tiny scenario corpus (3 marketing-copy prompts). +2. Wires a synthetic `agent` that simulates an agent producing artifacts with deterministic noise (higher score when surface contains "tight" / "specific"). +3. Wires a simple judge that scores artifacts on `clarity` and `concision`. +4. Wires a synthetic `ImprovementDriver` that proposes two surface variants per generation (so the example runs without LLM credits). +5. Calls `selfImprove()` with a 1-generation budget against in-memory campaign storage. +6. Prints the full decision packet. + +The agent, judge, and driver are all synthetic so the example runs offline. For real use: +- Replace `agent` with your actual agent + scenario interpreter. 
+- Replace `judge.score` with your real LLM-as-judge (or a `langchainJudge` from `/adapters/langchain`). +- Drop the custom `driver` — selfImprove() defaults to `gepaDriver` (reflective LLM mutation), which needs an LLM endpoint configured via `opts.llm`. + +## What you should see + +``` +═══ selfImprove() decision packet ═══ + +Gate decision: ship +Raw lift: +0.194 +Generations explored: 1 +Total cost: $0.000 + +── Statistical lift (paired bootstrap, n=1) ── +delta: +0.254 +CI95: [0.254, 0.254] +pValue: 1.0000 +Cohen's d: 0.00 +MDE @ 80% power: 2.802 +required n at observed effect: 244 + +── Composite distribution (n=3 cells) ── +mean: 0.653, p50: 0.720, p95: 0.743, stddev: 0.114 + +── Cost-quality Pareto ── +2 candidates plotted; 1 on the frontier + +── Per-judge mean scores ── + rubric: 0.653 (n=3) + +── Recommendations ── +[critical] ship — Ship — lift 0.254 (95% CI 0.254..0.254) + Holdout lift exceeds threshold 0.02 with 95% bootstrap confidence (n=1, p=1.0000, d=0.00). + +═══ end ═══ +``` + +Note: with only 3 scenarios and a 50% holdout fraction, the paired lift is computed on a single observation — useful to see the shape of the packet, not statistically informative. Real corpora should be ≥ 20 scenarios with ≥ 3 reps for meaningful CI on the lift. The `requiredN` field tells you exactly how many you'd need. The exact numbers above will differ slightly on each run because the synthetic judge adds `Math.random()` jitter to its scores. + +## Files + +- `index.ts` — the runnable script diff --git a/examples/selfimprove-quickstart/index.ts b/examples/selfimprove-quickstart/index.ts new file mode 100644 index 00000000..d2b8a6fc --- /dev/null +++ b/examples/selfimprove-quickstart/index.ts @@ -0,0 +1,171 @@ +/** + * selfImprove() quickstart — closed-loop improvement with a decision packet. + * + * Run with: pnpm tsx examples/selfimprove-quickstart/index.ts + * + * Everything in this file is synthetic so the example works offline. The + * dispatch is deterministic; the judge adds Math.random() jitter. Replace them + * with your real agent + your real judge to point the loop at production. 
+ */ + +import type { ImprovementDriver, MutableSurface, Scenario } from '../../src/contract' +import { selfImprove } from '../../src/contract' + +interface CopyScenario extends Scenario { + brief: string +} + +const scenarios: CopyScenario[] = [ + { id: 'launch', kind: 'copy', brief: 'announce a new pricing tier' }, + { id: 'feature', kind: 'copy', brief: 'highlight a new collaboration feature' }, + { id: 'event', kind: 'copy', brief: 'invite to a customer roundtable' }, +] + +// Synthetic agent: better surfaces produce higher-quality artifacts. +async function dispatch({ + scenario, + systemPrompt, +}: { + scenario: CopyScenario + systemPrompt: string +}): Promise<{ text: string; quality: number }> { + const tightnessBonus = systemPrompt.includes('tight') ? 0.18 : 0 + const specificBonus = systemPrompt.includes('specific') ? 0.12 : 0 + const noise = hash(scenario.id + systemPrompt) + const quality = Math.min(1, 0.4 + tightnessBonus + specificBonus + 0.2 * noise) + return { + text: `[${scenario.id}] ${systemPrompt.slice(0, 40)}…`, + quality, + } +} + +function hash(s: string): number { + let h = 0 + for (let i = 0; i < s.length; i++) { + h = (h * 31 + s.charCodeAt(i)) >>> 0 + } + return h / 0xffffffff +} + +// Synthetic judge: scores 'clarity' and 'concision' as dimensions; their +// mean is the composite the gate sees. +async function judge({ + artifact, +}: { + artifact: { text: string; quality: number } +}) { + const clarity = clamp(artifact.quality + 0.05 * Math.random()) + const concision = clamp(artifact.quality - 0.03 * Math.random()) + const composite = (clarity + concision) / 2 + return { + dimensions: { clarity, concision }, + composite, + notes: '', + } +} + +function clamp(x: number): number { + return Math.max(0, Math.min(1, x)) +} + +// Synthetic driver: deterministically proposes two variants per generation — +// one adds 'tight,', the other adds 'specific,'. Lets the example run offline. 
+// In real use, you'd use the default `gepaDriver` (reflective LLM mutation) +// from `/contract`. +const syntheticDriver: ImprovementDriver = { + kind: 'synthetic-quickstart', + async propose({ currentSurface, populationSize }) { + const current = currentSurface as { kind: string; systemPrompt: string } + const additions = ['tight,', 'specific,', 'punchy,', 'concrete,'] + return additions.slice(0, populationSize).map((kw) => ({ + kind: current.kind, + systemPrompt: `${current.systemPrompt} Write ${kw} engaging copy.`, + })) as MutableSurface[] + }, +} + +async function main() { + const result = await selfImprove({ + scenarios, + agent: async (surface, scenario) => + dispatch({ + scenario, + systemPrompt: (surface as { systemPrompt: string }).systemPrompt, + }), + judge: { + name: 'rubric', + dimensions: [ + { key: 'clarity', weight: 1 }, + { key: 'concision', weight: 1 }, + ], + score: judge, + }, + baselineSurface: { + kind: 'prompt', + systemPrompt: 'You write marketing copy. Keep it short.', + }, + driver: syntheticDriver, + budget: { generations: 1, populationSize: 2, holdoutFraction: 0.5 }, + }) + + const i = result.insight + console.log('═══ selfImprove() decision packet ═══') + console.log() + console.log(`Gate decision: ${result.gateDecision}`) + console.log(`Raw lift: ${signed(result.lift)}`) + console.log(`Generations explored: ${result.generationsExplored}`) + console.log(`Total cost: $${result.totalCostUsd.toFixed(3)}`) + console.log() + + if (i.lift) { + console.log(`── Statistical lift (paired bootstrap, n=${i.lift.n}) ──`) + console.log(`delta: ${signed(i.lift.delta)}`) + console.log(`CI95: [${i.lift.ci95[0].toFixed(3)}, ${i.lift.ci95[1].toFixed(3)}]`) + console.log(`pValue: ${i.lift.pValue.toFixed(4)}`) + console.log(`Cohen's d: ${i.lift.cohensD.toFixed(2)}`) + console.log(`MDE @ 80% power: ${i.lift.mde.toFixed(3)}`) + console.log(`required n at observed effect: ${i.lift.requiredN}`) + console.log() + } + + console.log(`── Composite distribution 
(n=${i.composite.n} cells) ──`) + console.log( + `mean: ${i.composite.mean.toFixed(3)}, ` + + `p50: ${i.composite.p50.toFixed(3)}, ` + + `p95: ${i.composite.p95.toFixed(3)}, ` + + `stddev: ${i.composite.stddev.toFixed(3)}`, + ) + console.log() + + console.log('── Cost-quality Pareto ──') + console.log( + `${i.costQuality.pareto.points.length} candidates plotted; ` + + `${i.costQuality.pareto.points.filter((p) => p.onFrontier).length} on the frontier`, + ) + console.log() + + if (Object.keys(i.judges).length > 0) { + console.log('── Per-judge mean scores ──') + for (const [name, j] of Object.entries(i.judges)) { + console.log(` ${name}: ${j.meanScore.toFixed(3)} (n=${j.n})`) + } + console.log() + } + + console.log('── Recommendations ──') + for (const r of i.recommendations) { + console.log(`[${r.priority}] ${r.kind} — ${r.title}`) + console.log(` ${r.detail}`) + } + console.log() + console.log('═══ end ═══') +} + +function signed(n: number): string { + return `${n >= 0 ? '+' : ''}${n.toFixed(3)}` +} + +main().catch((err) => { + console.error(err) + process.exit(1) +}) diff --git a/package.json b/package.json index 156e472f..db67d0e7 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "@tangle-network/agent-eval", - "version": "0.50.0", + "version": "0.50.1", "description": "Substrate for self-improving agents: traces, verifiable rewards, preferences, GEPA / reflective mutation, auto-research, replay, sequential anytime-valid stats, and release gates.", "homepage": "https://github.com/tangle-network/agent-eval#readme", "repository": {