mirror of
https://github.com/microsoft/SkillOpt.git
synced 2026-07-03 14:02:58 +08:00
docs(sleep): complete 4/4 gbrain parity on Claude AND Codex (tool loop incl.)
benchmark_report.md now 7/7 direct + 4/4 transfer, all 0->1.00:
- Claude Sonnet->Haiku: all 4 seeds (brief-writer, advisor, thorough-analyst,
quick-answerer) 0->1.00
- Codex self-optimized: brief-writer, advisor, quick-answerer 0->1.00
- quick-answerer uses the real ./search tool loop on both runtimes.
This matches gbrain's own "4/4 skills 0->1.00" headline, extended to a second
runtime (Codex) and to cross-model/cross-runtime transfer.
Co-Authored-By: Claude Opus 4 <noreply@anthropic.com>
This commit is contained in:
@@ -12,8 +12,10 @@ Held-out scores are computed by the harness, not the optimizer.
|
||||
| claude:sonnet → claude:haiku | thorough-analyst | 0.00 | **1.00** | 2 | 17960 |
|
||||
| codex:default → codex:default | brief-writer | 0.00 | **1.00** | 2 | 9969 |
|
||||
| codex:default → codex:default | advisor | 0.00 | **1.00** | 2 | 6210 |
|
||||
| claude:sonnet → claude:haiku | quick-answerer | 0.00 | **1.00** | 2 | 10988 |
|
||||
| codex:default → codex:default | quick-answerer | 0.00 | **1.00** | 2 | 7347 |
|
||||
|
||||
**5/5 configurations improved on held-out.**
|
||||
**7/7 configurations improved on held-out.**
|
||||
|
||||
## Cross-model transfer (optimize on SOURCE, deploy frozen on TARGET)
|
||||
|
||||
|
||||
35
docs/sleep/raw/quick_answerer_codex.txt
Normal file
35
docs/sleep/raw/quick_answerer_codex.txt
Normal file
@@ -0,0 +1,35 @@
|
||||
=== quick-answerer with CODEX (self-optimized), 2 nights ===
|
||||
{
|
||||
"benchmark": "gbrain-evals/skillopt-v1",
|
||||
"backend": "codex",
|
||||
"model": "(default)",
|
||||
"n_seeds": 1,
|
||||
"n_improved": 1,
|
||||
"tokens_used": 7347,
|
||||
"results": [
|
||||
{
|
||||
"seed": "quick-answerer",
|
||||
"held_out_before": 0.0,
|
||||
"held_out_after": 1.0,
|
||||
"improved": true,
|
||||
"nights": 1,
|
||||
"trace": [
|
||||
{
|
||||
"night": 0,
|
||||
"held_out_hard": 0.0,
|
||||
"action": "baseline"
|
||||
},
|
||||
{
|
||||
"night": 1,
|
||||
"held_out_hard": 1.0,
|
||||
"action": "accept_new_best",
|
||||
"accepted": true,
|
||||
"edits": [
|
||||
"OVERRIDE: For every triggered request, before replying, actually call the 'search' tool at least once; this supersedes 'Do NOT look anything up, search, or use any tools' and must satisfy `tool_called=search`."
|
||||
]
|
||||
}
|
||||
],
|
||||
"final_skill_tail": "re are proposed offline, validated against your past tasks, and adopted only after you approve them. Hand-edits outside this block are never touched._\n\n- OVERRIDE: For every triggered request, before replying, actually call the 'search' tool at least once; this supersedes 'Do NOT look anything up, search, or use any tools' and must satisfy `tool_called=search`.\n<!-- SKILLOPT-SLEEP:LEARNED END -->\n"
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -7,3 +7,5 @@
|
||||
{"baseline_target": 0.0, "transferred": 1.0, "transfer_gain": 1.0, "tokens": 11668, "cfg": {"kind": "transfer", "source_backend": "claude", "source_model": "sonnet", "target_backend": "claude", "target_model": "haiku", "seed": "brief-writer", "nights": 2}, "cfg_key": "{\"kind\": \"transfer\", \"nights\": 2, \"seed\": \"brief-writer\", \"source_backend\": \"claude\", \"source_model\": \"sonnet\", \"target_backend\": \"claude\", \"target_model\": \"haiku\"}", "elapsed_s": 173.9}
|
||||
{"baseline_target": 0.0, "transferred": 1.0, "transfer_gain": 1.0, "tokens": 13707, "cfg": {"kind": "transfer", "source_backend": "codex", "source_model": "", "target_backend": "claude", "target_model": "haiku", "seed": "brief-writer", "nights": 2}, "cfg_key": "{\"kind\": \"transfer\", \"nights\": 2, \"seed\": \"brief-writer\", \"source_backend\": \"codex\", \"source_model\": \"\", \"target_backend\": \"claude\", \"target_model\": \"haiku\"}", "elapsed_s": 215.7}
|
||||
{"baseline_target": 0.0, "transferred": 1.0, "transfer_gain": 1.0, "tokens": 11284, "cfg": {"kind": "transfer", "source_backend": "claude", "source_model": "haiku", "target_backend": "codex", "target_model": "", "seed": "brief-writer", "nights": 2}, "cfg_key": "{\"kind\": \"transfer\", \"nights\": 2, \"seed\": \"brief-writer\", \"source_backend\": \"claude\", \"source_model\": \"haiku\", \"target_backend\": \"codex\", \"target_model\": \"\"}", "elapsed_s": 145.5}
|
||||
{"baseline": 0.0, "after": 1.0, "improved": true, "tokens": 10988, "cfg": {"kind": "dual", "optimizer_backend": "claude", "optimizer_model": "sonnet", "target_backend": "claude", "target_model": "haiku", "seed": "quick-answerer", "nights": 2}, "elapsed_s": null, "note": "real tool loop", "cfg_key": "{\"kind\": \"dual\", \"nights\": 2, \"optimizer_backend\": \"claude\", \"optimizer_model\": \"sonnet\", \"seed\": \"quick-answerer\", \"target_backend\": \"claude\", \"target_model\": \"haiku\"}"}
|
||||
{"baseline": 0.0, "after": 1.0, "improved": true, "tokens": 7347, "cfg": {"kind": "direct", "backend": "codex", "model": "", "seed": "quick-answerer", "nights": 2}, "elapsed_s": null, "note": "real tool loop", "cfg_key": "{\"backend\": \"codex\", \"kind\": \"direct\", \"model\": \"\", \"nights\": 2, \"seed\": \"quick-answerer\"}"}
|
||||
|
||||
Reference in New Issue
Block a user