feat(daemon): Critique Theater Phase 12 observability foundations

Lands the metrics registry, the structured logger, the /api/metrics
route, and the adapter-degraded bump that wires up the first data
point. The orchestrator-side bumps for runs / rounds / composite /
must-fix / interrupted / parser_errors / protocol_version land in a
follow-up commit on this branch (kept separate so the wiring diff
reads cleanly against the registry shape).

Surfaces added:

- apps/daemon/src/metrics/index.ts: 9 Prometheus series under the
  open_design_critique_* namespace with the histogram buckets the
  spec calls out (round_duration_ms at 100 / 250 / 500 / 1000 /
  2500 / 5000 / 10000 / 30000 / 60000 ms; composite_score at
  0-10 integer steps).
- apps/daemon/src/logging/critique.ts: 6 typed events, one JSON line
  per call on stdout, namespaced critique. Matches the JSON-per-line
  convention cli.ts already uses; no new logger framework.
- apps/daemon/src/server.ts: GET /api/metrics route. Honors
  OD_METRICS_ENDPOINT=disabled to opt out for air-gapped installs.
- apps/daemon/src/critique/adapter-degraded.ts: markDegraded now
  bumps degraded_total so the adapter-health dashboard panel
  reflects every TTL refresh and every fresh mark.

Deps: prom-client ^15.1.0, @opentelemetry/api ^1.9.0 added to
apps/daemon/package.json. Both are zero-config no-ops without an
exporter wired; daemon bundle size impact is ~150 KB uncompressed.
The @opentelemetry/api dep is in place ahead of the OTel-spans
follow-up commit; it adds no behavior on this commit.

Tests:
- tests/metrics/critique.test.ts (3 cases): registry shape +
  exposition text + reset-between-tests
- tests/logging/critique.test.ts (4 cases): event shape + ordering
  + newline framing + namespace stamping

Verification (Windows-local):
- pnpm --filter @open-design/daemon typecheck: clean
- New metrics + logging suites: 7 / 7 green
- Existing adapter-degraded + conformance + rollout suites:
  22 / 22 green; the bump is non-breaking
This commit is contained in:
Nagendhra
2026-05-12 12:11:52 -04:00
parent 09bf396bcb
commit 2b8b7445a1
8 changed files with 462 additions and 2 deletions

View File

@@ -37,12 +37,14 @@
"@open-design/platform": "workspace:*",
"@open-design/sidecar": "workspace:*",
"@open-design/sidecar-proto": "workspace:*",
"@opentelemetry/api": "^1.9.0",
"better-sqlite3": "^12.9.0",
"blake3-wasm": "2.1.5",
"chokidar": "^5.0.0",
"express": "^4.19.2",
"jszip": "^3.10.1",
"multer": "^2.1.1",
"prom-client": "^15.1.0",
"undici": "^7.16.0"
},
"devDependencies": {

View File

@@ -16,6 +16,8 @@
import type { DegradedReason } from '@open-design/contracts/critique';
import { critiqueDegradedTotal } from '../metrics/index.js';
export type DegradedSource = 'conformance' | 'orchestrator' | 'manual';
export interface DegradedEntry {
@@ -65,6 +67,13 @@ export function markDegraded(
expiresAtMs: markedAtMs + ttlMs,
};
store.set(adapterId, entry);
// Phase 12: every degraded mark increments the Prometheus counter so
// the dashboard's "adapter health" panel reflects the same rate the
// orchestrator and conformance harness observe. Bump is unconditional
// (every call records, including overwrites of a still-live entry)
// because the dashboard rate query divides over the time window, not
// over distinct adapters.
critiqueDegradedTotal.inc({ reason, adapter: adapterId });
return entry;
}

View File

@@ -0,0 +1,72 @@
/**
* Critique Theater structured logger (Phase 12).
*
* Six events, one JSON object per line on stdout, namespaced
* `critique`. Matches the JSON-line convention `cli.ts` and
* `mcp-live-artifacts-server.ts` already write so an operator's
* existing log pipeline (Loki, Cloudwatch, Datadog, etc.) ingests
* critique events without a new adapter.
*
* Why a discriminated union instead of pino / winston: the daemon
* already does JSON-per-line writes through `process.stdout`; adding
* pino would either wrap that surface (a refactor outside Phase 12's
* scope) or run two logger systems side by side. The thin wrapper
* below tests via `process.stdout.write` capture and a future system
* swap can replace the implementation without touching the call sites.
*/
export type CritiqueLogEvent =
| {
event: 'run_started';
runId: string;
adapter: string;
skill: string;
protocolVersion: number;
}
| {
event: 'round_closed';
runId: string;
round: number;
composite: number;
mustFix: number;
decision: 'continue' | 'ship';
}
| {
event: 'run_shipped';
runId: string;
round: number;
composite: number;
status: string;
}
| {
event: 'degraded';
runId: string;
reason: string;
adapter: string;
}
| {
event: 'parser_recover';
runId: string;
kind: string;
position: number;
}
| {
event: 'run_failed';
runId: string;
cause: string;
error?: string;
};
/**
* Emit one JSON line for the given critique event. The timestamp is
* ISO-8601 with millisecond precision so an aggregator that ingests
* multiple log streams can stable-sort across them.
*/
export function logCritique(e: CritiqueLogEvent): void {
const line = JSON.stringify({
...e,
namespace: 'critique',
timestamp: new Date().toISOString(),
});
process.stdout.write(line + '\n');
}

View File

@@ -0,0 +1,128 @@
/**
* Critique Theater Prometheus registry (Phase 12).
*
* Nine series, all under the `open_design_critique_*` namespace, scoped
* to the default `prom-client` registry so a single `/api/metrics`
* scrape returns everything the dashboard panels query.
*
* Label cardinality is bounded:
* - `adapter` is the agent id, capped at installation-time discovery
* - `skill` is the SKILL.md id, capped at the skills registry size
* - `panelist` / `dim` are closed enums (`PANELIST_ROLES`, dimension
* names from the parser contract)
* - `reason` / `cause` / `kind` are closed enums in
* `packages/contracts/src/critique.ts`
* - `round` is small-integer (typical 1-3)
* - `status` and `version` are closed enums / small integers
*
* Histogram buckets follow `specs/current/critique-theater.md` §
* Observability and are intentionally explicit so an operator can tune
* them after the first 1000 runs without code changes.
*
* No call signed off on a particular skill label being globally
* available; orchestrator call sites pass `skill ?? 'unknown'` when the
* spawn handler did not thread the skill id. Defaulting to 'unknown'
* keeps the series shape stable while threading the label is a
* separate fix.
*/
import {
Counter,
Gauge,
Histogram,
register,
} from 'prom-client';
export const critiqueRunsTotal = new Counter({
name: 'open_design_critique_runs_total',
help: 'Total Critique Theater runs that reached a terminal phase.',
labelNames: ['status', 'adapter', 'skill'] as const,
registers: [register],
});
export const critiqueRoundsTotal = new Counter({
name: 'open_design_critique_rounds_total',
help: 'Total round_end events processed across all runs.',
labelNames: ['adapter', 'skill'] as const,
registers: [register],
});
export const critiqueRoundDurationMs = new Histogram({
name: 'open_design_critique_round_duration_ms',
help: 'Wall-clock duration of a single round, in milliseconds.',
labelNames: ['adapter', 'skill', 'round'] as const,
buckets: [100, 250, 500, 1000, 2500, 5000, 10000, 30000, 60000],
registers: [register],
});
export const critiqueCompositeScore = new Histogram({
name: 'open_design_critique_composite_score',
help: 'Composite score reported by round_end events.',
labelNames: ['adapter', 'skill'] as const,
buckets: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
registers: [register],
});
export const critiqueMustFixTotal = new Counter({
name: 'open_design_critique_must_fix_total',
help: 'Total panelist_must_fix events emitted across all runs.',
labelNames: ['panelist', 'dim', 'adapter', 'skill'] as const,
registers: [register],
});
export const critiqueDegradedTotal = new Counter({
name: 'open_design_critique_degraded_total',
help: 'Total times an adapter was marked degraded (TTL-bounded).',
labelNames: ['reason', 'adapter'] as const,
registers: [register],
});
export const critiqueInterruptedTotal = new Counter({
name: 'open_design_critique_interrupted_total',
help: 'Total runs terminated via the user interrupt path.',
labelNames: ['adapter'] as const,
registers: [register],
});
export const critiqueParserErrorsTotal = new Counter({
name: 'open_design_critique_parser_errors_total',
help: 'Total parser_warning events surfaced from the SSE stream.',
labelNames: ['kind', 'adapter'] as const,
registers: [register],
});
export const critiqueProtocolVersion = new Gauge({
name: 'open_design_critique_protocol_version',
help: 'Highest protocolVersion observed in a run_started frame.',
labelNames: ['version'] as const,
registers: [register],
});
/**
* Exposed to the `/api/metrics` route. Returns the full Prometheus
* exposition format as a single string with the correct content type
* available on `register.contentType`.
*/
export async function getCritiqueMetrics(): Promise<string> {
return register.metrics();
}
export { register };
/**
* Test-only: wipe the default registry so vitest cases can assert
* series shape without leakage from a prior `runOrchestrator` call.
* Production code never reaches this; the function name is deliberate
* so a grep audit flags any accidental production caller.
*/
export function __resetCritiqueMetricsForTests(): void {
critiqueRunsTotal.reset();
critiqueRoundsTotal.reset();
critiqueRoundDurationMs.reset();
critiqueCompositeScore.reset();
critiqueMustFixTotal.reset();
critiqueDegradedTotal.reset();
critiqueInterruptedTotal.reset();
critiqueParserErrorsTotal.reset();
critiqueProtocolVersion.reset();
}

View File

@@ -69,6 +69,7 @@ import { runOrchestrator } from './critique/orchestrator.js';
import { createRunRegistry } from './critique/run-registry.js';
import { handleCritiqueInterrupt } from './critique/interrupt-handler.js';
import { handleCritiqueArtifact } from './critique/artifact-handler.js';
import { getCritiqueMetrics, register } from './metrics/index.js';
import {
isCritiqueEnabled,
parseEnvEnabled,
@@ -2262,6 +2263,19 @@ export async function startServer({
res.json({ version });
});
// Prometheus scrape endpoint (Phase 12). Returns the full exposition
// format string. Operators put this behind their existing auth proxy;
// there is no built-in authn on the daemon HTTP server. To disable
// the endpoint entirely (air-gapped installs, regulatory contexts),
// set `OD_METRICS_ENDPOINT=disabled`; the route is registered only
// when that env value is not the literal string 'disabled'.
if (process.env.OD_METRICS_ENDPOINT !== 'disabled') {
app.get('/api/metrics', async (_req, res) => {
res.setHeader('Content-Type', register.contentType);
res.send(await getCritiqueMetrics());
});
}
registerConnectorRoutes(app, {
sendApiError,
authorizeToolRequest,

View File

@@ -0,0 +1,112 @@
/**
* Critique Theater structured logger test (Phase 12).
*
* Captures `process.stdout.write` while `logCritique` fires, parses the
* resulting JSON lines, and asserts the documented event shape. Locks
* the contract so an ingest pipeline (Loki / Datadog / Cloudwatch) that
* keys on `namespace=critique` and `event=...` does not silently break
* after a future refactor.
*/
import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest';
import { logCritique } from '../../src/logging/critique.js';
let captured: string[] = [];
let writeSpy: ReturnType<typeof vi.spyOn>;
beforeEach(() => {
captured = [];
writeSpy = vi.spyOn(process.stdout, 'write').mockImplementation((chunk: unknown) => {
if (typeof chunk === 'string') captured.push(chunk);
return true;
});
});
afterEach(() => {
writeSpy.mockRestore();
});
function parseLines(): Array<Record<string, unknown>> {
return captured
.join('')
.split('\n')
.filter((line) => line.length > 0)
.map((line) => JSON.parse(line) as Record<string, unknown>);
}
describe('logCritique structured events (Phase 12)', () => {
it('emits run_started with adapter / skill / protocolVersion', () => {
logCritique({
event: 'run_started',
runId: 'r-1',
adapter: 'mock',
skill: 'unit-test',
protocolVersion: 1,
});
const lines = parseLines();
expect(lines).toHaveLength(1);
expect(lines[0]).toMatchObject({
event: 'run_started',
runId: 'r-1',
adapter: 'mock',
skill: 'unit-test',
protocolVersion: 1,
namespace: 'critique',
});
expect(typeof lines[0]!.timestamp).toBe('string');
});
it('emits round_closed with composite / mustFix / decision', () => {
logCritique({
event: 'round_closed',
runId: 'r-1',
round: 1,
composite: 8.6,
mustFix: 0,
decision: 'ship',
});
expect(parseLines()[0]).toMatchObject({
event: 'round_closed',
round: 1,
composite: 8.6,
mustFix: 0,
decision: 'ship',
namespace: 'critique',
});
});
it('emits run_shipped / degraded / parser_recover / run_failed', () => {
logCritique({
event: 'run_shipped', runId: 'r-1', round: 1, composite: 8.6, status: 'shipped',
});
logCritique({
event: 'degraded', runId: 'r-2', reason: 'malformed_block', adapter: 'mock',
});
logCritique({
event: 'parser_recover', runId: 'r-3', kind: 'composite_mismatch', position: 12,
});
logCritique({
event: 'run_failed', runId: 'r-4', cause: 'cli_exit_nonzero',
});
const lines = parseLines();
expect(lines).toHaveLength(4);
expect(lines.map((l) => l.event)).toEqual([
'run_shipped', 'degraded', 'parser_recover', 'run_failed',
]);
for (const line of lines) {
expect(line.namespace).toBe('critique');
expect(typeof line.timestamp).toBe('string');
}
});
it('writes exactly one newline-terminated line per call', () => {
logCritique({ event: 'run_started', runId: 'r-1', adapter: 'm', skill: 's', protocolVersion: 1 });
logCritique({ event: 'run_started', runId: 'r-2', adapter: 'm', skill: 's', protocolVersion: 1 });
const joined = captured.join('');
const lines = joined.split('\n');
// joined ends with '\n' so split produces a trailing empty string.
expect(lines).toHaveLength(3);
expect(lines[2]).toBe('');
});
});

View File

@@ -0,0 +1,89 @@
/**
* Critique Theater Prometheus registry shape test (Phase 12).
*
* Asserts every series the dashboard depends on is registered in the
* default registry and renders through `getCritiqueMetrics()` with the
* documented metric type. A rename or accidental drop would otherwise
* surface as an empty Grafana panel only at scrape time; this test
* catches it inside the daemon vitest lane.
*/
import { afterEach, describe, expect, it } from 'vitest';
import {
__resetCritiqueMetricsForTests,
critiqueCompositeScore,
critiqueDegradedTotal,
critiqueInterruptedTotal,
critiqueMustFixTotal,
critiqueParserErrorsTotal,
critiqueProtocolVersion,
critiqueRoundDurationMs,
critiqueRoundsTotal,
critiqueRunsTotal,
getCritiqueMetrics,
register,
} from '../../src/metrics/index.js';
afterEach(() => __resetCritiqueMetricsForTests());
const EXPECTED_SERIES = [
{ name: 'open_design_critique_runs_total', type: 'counter' as const },
{ name: 'open_design_critique_rounds_total', type: 'counter' as const },
{ name: 'open_design_critique_round_duration_ms', type: 'histogram' as const },
{ name: 'open_design_critique_composite_score', type: 'histogram' as const },
{ name: 'open_design_critique_must_fix_total', type: 'counter' as const },
{ name: 'open_design_critique_degraded_total', type: 'counter' as const },
{ name: 'open_design_critique_interrupted_total', type: 'counter' as const },
{ name: 'open_design_critique_parser_errors_total', type: 'counter' as const },
{ name: 'open_design_critique_protocol_version', type: 'gauge' as const },
];
describe('critique metrics registry (Phase 12)', () => {
it('registers every expected series in the default registry', () => {
const registered = register
.getMetricsAsArray()
.map((m) => ({ name: m.name, type: m.type }));
for (const want of EXPECTED_SERIES) {
const match = registered.find((r) => r.name === want.name);
expect(match, `expected ${want.name} to be registered`).toBeDefined();
expect(match!.type).toBe(want.type);
}
});
it('renders a happy-path bump into the exposition text', async () => {
critiqueRunsTotal.inc({ status: 'shipped', adapter: 'mock', skill: 'unit-test' });
critiqueRoundsTotal.inc({ adapter: 'mock', skill: 'unit-test' });
critiqueRoundDurationMs.observe({ adapter: 'mock', skill: 'unit-test', round: '1' }, 1234);
critiqueCompositeScore.observe({ adapter: 'mock', skill: 'unit-test' }, 8.6);
critiqueMustFixTotal.inc({
panelist: 'critic',
dim: 'hierarchy',
adapter: 'mock',
skill: 'unit-test',
});
critiqueDegradedTotal.inc({ reason: 'malformed_block', adapter: 'mock' });
critiqueInterruptedTotal.inc({ adapter: 'mock' });
critiqueParserErrorsTotal.inc({ kind: 'composite_mismatch', adapter: 'mock' });
critiqueProtocolVersion.set({ version: '1' }, 1);
const text = await getCritiqueMetrics();
expect(text).toContain('open_design_critique_runs_total{status="shipped",adapter="mock",skill="unit-test"} 1');
expect(text).toContain('open_design_critique_rounds_total{adapter="mock",skill="unit-test"} 1');
expect(text).toContain('open_design_critique_must_fix_total{panelist="critic",dim="hierarchy",adapter="mock",skill="unit-test"} 1');
expect(text).toContain('open_design_critique_degraded_total{reason="malformed_block",adapter="mock"} 1');
expect(text).toContain('open_design_critique_interrupted_total{adapter="mock"} 1');
expect(text).toContain('open_design_critique_parser_errors_total{kind="composite_mismatch",adapter="mock"} 1');
expect(text).toContain('open_design_critique_protocol_version{version="1"} 1');
// Histogram exposes _bucket / _sum / _count lines instead of a flat value.
expect(text).toContain('open_design_critique_round_duration_ms_bucket{');
expect(text).toContain('open_design_critique_composite_score_bucket{');
});
it('resets the registry between tests so cases stay isolated', async () => {
critiqueRunsTotal.inc({ status: 'shipped', adapter: 'a', skill: 's' }, 5);
__resetCritiqueMetricsForTests();
const text = await getCritiqueMetrics();
expect(text).not.toContain('open_design_critique_runs_total{status="shipped",adapter="a",skill="s"} 5');
});
});

38
pnpm-lock.yaml generated
View File

@@ -44,6 +44,9 @@ importers:
'@open-design/sidecar-proto':
specifier: workspace:*
version: link:../../packages/sidecar-proto
'@opentelemetry/api':
specifier: ^1.9.0
version: 1.9.1
better-sqlite3:
specifier: ^12.9.0
version: 12.9.0
@@ -62,6 +65,9 @@ importers:
multer:
specifier: ^2.1.1
version: 2.1.1
prom-client:
specifier: ^15.1.0
version: 15.1.3
undici:
specifier: ^7.16.0
version: 7.25.0
@@ -212,7 +218,7 @@ importers:
version: link:../../packages/sidecar-proto
next:
specifier: ^16.2.5
version: 16.2.5(@playwright/test@1.59.1)(react-dom@18.3.1(react@18.3.1))(react@18.3.1)
version: 16.2.5(@opentelemetry/api@1.9.1)(@playwright/test@1.59.1)(react-dom@18.3.1(react@18.3.1))(react@18.3.1)
openai:
specifier: ^6.36.0
version: 6.36.0(zod@4.4.2)
@@ -1343,6 +1349,10 @@ packages:
cpu: [x64]
os: [win32]
'@opentelemetry/api@1.9.1':
resolution: {integrity: sha512-gLyJlPHPZYdAk1JENA9LeHejZe1Ti77/pTeFm/nMXmQH/HFZlcS/O2XJB+L8fkbrNSqhdtlvjBVjxwUYanNH5Q==}
engines: {node: '>=8.0.0'}
'@oslojs/encoding@1.1.0':
resolution: {integrity: sha512-70wQhgYmndg4GCPxPPxPGevRKqTIJ2Nh4OkiMWmDAVYsTQ+Ta7Sq+rPevXyXGdzr30/qZBnyOalCszoMxlyldQ==}
@@ -2007,6 +2017,9 @@ packages:
bindings@1.5.0:
resolution: {integrity: sha512-p2q/t/mhvuOj/UeLlV6566GD/guowlr0hHxClI0W9m7MWYkL1F0hLo+0Aexs9HSPCtR1SXQ0TD3MMKrXZajbiQ==}
bintrees@1.0.2:
resolution: {integrity: sha512-VOMgTMwjAaUG580SXn3LacVgjurrbMme7ZZNYGSSV7mmtY6QQRh0Eg3pwIcntQ77DErK1L0NxkbetjcoXzVwKw==}
bl@4.1.0:
resolution: {integrity: sha512-1W07cM9gS6DcLperZfFSj+bWLtaPGSOHWhPiGzXmvVJbRLdG82sH/Kn8EtW1VqWVA54AKf2h5k5BbnIbwF3h6w==}
@@ -3692,6 +3705,10 @@ packages:
resolution: {integrity: sha512-7PiHtLll5LdnKIMw100I+8xJXR5gW2QwWYkT6iJva0bXitZKa/XMrSbdmg3r2Xnaidz9Qumd0VPaMrZlF9V9sA==}
engines: {node: '>=0.4.0'}
prom-client@15.1.3:
resolution: {integrity: sha512-6ZiOBfCywsD4k1BN9IX0uZhF+tJkV8q8llP64G5Hajs4JOeVLPCwpPVcpXy3BwYiUGgyJzsJJQeOIv7+hDSq8g==}
engines: {node: ^16 || ^18 || >=20}
promise-retry@2.0.1:
resolution: {integrity: sha512-y+WKFlBR8BGXnsNlIHFGPZmyDf3DFMoLhaflAnyZgV6rG6xu+JwesTo2Q9R6XwYmtmwAFCkAk3e35jEdoeh/3g==}
engines: {node: '>=10'}
@@ -4123,6 +4140,9 @@ packages:
resolution: {integrity: sha512-tOG/7GyXpFevhXVh8jOPJrmtRpOTsYqUIkVdVooZYJS/z8WhfQUX8RJILmeuJNinGAMSu1veBr4asSHFt5/hng==}
engines: {node: '>=18'}
tdigest@0.1.2:
resolution: {integrity: sha512-+G0LLgjjo9BZX2MfdvPfH+MKLCrxlXSYec5DaPYP1fe6Iyhf0/fSmJ0bFiZ1F8BT6cGXl2LpltQptzjXKWEkKA==}
temp-file@3.4.0:
resolution: {integrity: sha512-C5tjlC/HCtVUOi3KWVokd4vHVViOmGjtLwIh4MuzPo/nMYTV/p1urt3RnMz2IWXDdKEGJH3k5+KPxtqRsUYGtg==}
@@ -5505,6 +5525,8 @@ snapshots:
'@next/swc-win32-x64-msvc@16.2.5':
optional: true
'@opentelemetry/api@1.9.1': {}
'@oslojs/encoding@1.1.0': {}
'@playwright/test@1.59.1':
@@ -6263,6 +6285,8 @@ snapshots:
dependencies:
file-uri-to-path: 1.0.0
bintrees@1.0.2: {}
bl@4.1.0:
dependencies:
buffer: 5.7.1
@@ -8066,7 +8090,7 @@ snapshots:
neotraverse@0.6.18: {}
next@16.2.5(@playwright/test@1.59.1)(react-dom@18.3.1(react@18.3.1))(react@18.3.1):
next@16.2.5(@opentelemetry/api@1.9.1)(@playwright/test@1.59.1)(react-dom@18.3.1(react@18.3.1))(react@18.3.1):
dependencies:
'@next/env': 16.2.5
'@swc/helpers': 0.5.15
@@ -8085,6 +8109,7 @@ snapshots:
'@next/swc-linux-x64-musl': 16.2.5
'@next/swc-win32-arm64-msvc': 16.2.5
'@next/swc-win32-x64-msvc': 16.2.5
'@opentelemetry/api': 1.9.1
'@playwright/test': 1.59.1
sharp: 0.34.5
transitivePeerDependencies:
@@ -8317,6 +8342,11 @@ snapshots:
progress@2.0.3: {}
prom-client@15.1.3:
dependencies:
'@opentelemetry/api': 1.9.1
tdigest: 0.1.2
promise-retry@2.0.1:
dependencies:
err-code: 2.0.3
@@ -8917,6 +8947,10 @@ snapshots:
minizlib: 3.1.0
yallist: 5.0.0
tdigest@0.1.2:
dependencies:
bintrees: 1.0.2
temp-file@3.4.0:
dependencies:
async-exit-hook: 2.0.1