Files
khashayar bec23ed020 Add Devin plugin (plugins/devin): MCP server + ATIF-v1.7 harvest
Wires the skillopt_sleep engine into Devin (Cognition) via an MCP server,
following the same thin-shell pattern as plugins/copilot.

- mcp_server.py: stdlib-only stdio MCP server exposing the standard sleep_*
  tools (status, dry-run, run, adopt, harvest). REPO_ROOT defaults to ../.. so
  it finds skillopt_sleep automatically when run from plugins/devin/.
- harvest_devin.py: converts Devin ATIF-v1.7 transcripts, agentmemory, and
  .devin/skills/*/SKILL.md into the Claude Code-compatible JSONL the engine
  consumes; enriches with taskKey + outcome envelopes (hard test/build signal
  or judge rubric). Workspace auto-detection; cross-platform paths.
- judge.py, mcp-config.example.json, devin-rules.snippet.md, README.md.
- plugins/README.md: add Devin to the platform + install tables.

No changes to skillopt_sleep; shells out to `python -m skillopt_sleep` like the
other plugins. Pure stdlib; default backend mock (no API spend).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-25 10:42:52 +02:00

130 lines
5.2 KiB
Python

#!/usr/bin/env python3
"""Reference judge for SkillOpt-Sleep — score a candidate reply against a rubric.
Tasks harvested without a hard test/build signal get ``verifier: "judge"`` and a
``rubric`` (see ``_build_rubric`` in harvest_devin.py). This module is the
scorer the validation gate calls for those tasks: given the rubric and a
candidate reply produced during replay, it returns a score in ``[0, 1]``. The
gate accepts a skill edit only if the *new* skill scores strictly higher on the
held-out tasks.
It is self-contained on purpose — in a full deployment the SkillOpt engine owns
replay+scoring, but having a runnable reference here lets you sanity-check the
judge path without the engine.
Backends (select via ``SKILLOPT_JUDGE``):
* ``heuristic`` (default) — keyword-coverage, offline, no API key, deterministic.
* ``claude`` — LLM judge via the Anthropic API (needs ANTHROPIC_API_KEY).
Usage:
python judge.py --rubric rubric.json --reply reply.txt
echo "<reply>" | python judge.py --rubric-inline '["Addresses OrderService", ...]'
"""
from __future__ import annotations
import argparse
import json
import os
import re
import sys
from typing import List
_STOPWORDS = {"addresses", "resolves", "implements", "without", "introducing",
"behavior", "request", "response", "concrete", "actionable", "not",
"the", "and", "that", "with", "stated", "reported", "actually",
"preserves", "improving", "structure", "requested", "satisfies"}
# Cheap, fast model is the right default for a judge.
_JUDGE_MODEL = os.environ.get("SKILLOPT_JUDGE_MODEL", "claude-haiku-4-5-20251001")
def _content_words(text: str) -> List[str]:
return [w for w in re.findall(r"[A-Za-z][A-Za-z0-9_.\-]{3,}", text.lower())
if w not in _STOPWORDS]
def heuristic_score(reply: str, rubric: List[str]) -> float:
"""Fraction of rubric criteria whose key content words appear in the reply.
Crude but deterministic: each criterion is 'met' if at least one of its
content words shows up in the candidate reply. Good enough to smoke-test the
gate wiring; swap in the claude backend for real judging.
"""
if not rubric:
return 0.0
low = reply.lower()
met = 0
for criterion in rubric:
words = _content_words(criterion)
if not words: # nothing to check → treat as met
met += 1
continue
if any(w in low for w in words):
met += 1
return round(met / len(rubric), 3)
def claude_score(reply: str, rubric: List[str]) -> float:
"""LLM judge via the Anthropic API. Returns a 0..1 score.
Stdlib-only (urllib) so this file stays dependency-free. Falls back to the
heuristic if the key is missing or the call fails, so the gate never hard-errors.
"""
api_key = os.environ.get("ANTHROPIC_API_KEY")
if not api_key:
print("[judge] ANTHROPIC_API_KEY unset — using heuristic", file=sys.stderr)
return heuristic_score(reply, rubric)
import urllib.request
rubric_block = "\n".join(f"- {c}" for c in rubric)
prompt = (
"You are scoring an AI agent's reply against a rubric. For each criterion, "
"decide if the reply satisfies it. Respond with ONLY a number between 0 and "
"1 — the fraction of criteria satisfied.\n\n"
f"Rubric:\n{rubric_block}\n\nReply:\n{reply}\n\nScore:"
)
body = json.dumps({
"model": _JUDGE_MODEL,
"max_tokens": 8,
"messages": [{"role": "user", "content": prompt}],
}).encode()
req = urllib.request.Request(
"https://api.anthropic.com/v1/messages", data=body,
headers={"content-type": "application/json", "x-api-key": api_key,
"anthropic-version": "2023-06-01"},
)
try:
with urllib.request.urlopen(req, timeout=30) as resp:
data = json.load(resp)
text = "".join(b.get("text", "") for b in data.get("content", []))
m = re.search(r"[01](?:\.\d+)?", text)
return max(0.0, min(1.0, float(m.group(0)))) if m else heuristic_score(reply, rubric)
except Exception as exc: # network/auth/parse — degrade gracefully
print(f"[judge] claude backend failed ({exc}) — using heuristic", file=sys.stderr)
return heuristic_score(reply, rubric)
def score(reply: str, rubric: List[str]) -> float:
backend = os.environ.get("SKILLOPT_JUDGE", "heuristic")
return claude_score(reply, rubric) if backend == "claude" else heuristic_score(reply, rubric)
def main(argv=None) -> int:
p = argparse.ArgumentParser(description="Score a reply against a rubric (0..1)")
g = p.add_mutually_exclusive_group(required=True)
g.add_argument("--rubric", help="Path to a JSON file containing a list of criteria")
g.add_argument("--rubric-inline", help="Inline JSON list of criteria")
p.add_argument("--reply", help="Path to the reply text (default: stdin)")
args = p.parse_args(argv)
rubric = (json.load(open(args.rubric, encoding="utf-8")) if args.rubric
else json.loads(args.rubric_inline))
reply = (open(args.reply, encoding="utf-8").read() if args.reply
else sys.stdin.read())
print(score(reply, rubric))
return 0
if __name__ == "__main__":
raise SystemExit(main())