mirror of
https://github.com/microsoft/SkillOpt.git
synced 2026-07-03 14:02:58 +08:00
refactor: rename teacher/student to optimizer/target, remove best skills, fix slow update
- Rename teacher -> optimizer, student -> target across all code, configs, docs, prompts
- CLI: --teacher_model -> --optimizer_model, --student_model -> --target_model
- Remove best_skill files, keep only initial skills
- Fix slow update gate (force write into skill)
- Fix SLOW_UPDATE marker stripping
- Remove deep_reflect and meta_reflect mechanisms
- Update .env.example with export prefix and azure_cli docs
- Add endpoint empty validation in azure_openai.py

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
25
.env.example
25
.env.example
@@ -1,24 +1,25 @@
|
||||
# SkillOpt Environment Variables
|
||||
# Copy this file to .env and fill in your values.
|
||||
# Usage: set -a; source .env; set +a
|
||||
|
||||
# ── Azure OpenAI (required for openai_chat backend) ──────────────────
|
||||
AZURE_OPENAI_ENDPOINT=https://your-resource.openai.azure.com/
|
||||
AZURE_OPENAI_API_VERSION=2024-12-01-preview
|
||||
export AZURE_OPENAI_ENDPOINT=https://your-resource.openai.azure.com/
|
||||
export AZURE_OPENAI_API_VERSION=2024-12-01-preview
|
||||
# Authentication: choose one method
|
||||
# Option 1: API Key
|
||||
AZURE_OPENAI_API_KEY=
|
||||
# Option 2: Azure CLI (set auth_mode=azure_cli in config)
|
||||
# Option 3: Managed Identity (set auth_mode=managed_identity + client_id in config)
|
||||
export AZURE_OPENAI_API_KEY=
|
||||
# Option 2: Azure CLI (no API key needed, recommended on Azure VMs)
|
||||
# export AZURE_OPENAI_AUTH_MODE=azure_cli
|
||||
# Option 3: Managed Identity
|
||||
# export AZURE_OPENAI_AUTH_MODE=managed_identity
|
||||
# export AZURE_OPENAI_MANAGED_IDENTITY_CLIENT_ID=your-client-id
|
||||
|
||||
# ── OpenAI (alternative to Azure) ────────────────────────────────────
|
||||
# OPENAI_API_KEY=sk-...
|
||||
# export OPENAI_API_KEY=sk-...
|
||||
|
||||
# ── Anthropic / Claude (for claude_chat backend) ─────────────────────
|
||||
# ANTHROPIC_API_KEY=sk-ant-...
|
||||
# export ANTHROPIC_API_KEY=sk-ant-...
|
||||
|
||||
# ── Qwen Local Model (for qwen_chat backend) ────────────────────────
|
||||
# QWEN_CHAT_BASE_URL=http://localhost:8000/v1
|
||||
# QWEN_CHAT_MODEL=Qwen/Qwen3.5-4B
|
||||
|
||||
# ── Ray (optional, for distributed rollout) ──────────────────────────
|
||||
# RAY_ADDRESS=auto
|
||||
# export QWEN_CHAT_BASE_URL=http://localhost:8000/v1
|
||||
# export QWEN_CHAT_MODEL=Qwen/Qwen3.5-4B
|
||||
|
||||
31
.gradio/certificate.pem
Normal file
31
.gradio/certificate.pem
Normal file
@@ -0,0 +1,31 @@
|
||||
-----BEGIN CERTIFICATE-----
|
||||
MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
|
||||
TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
|
||||
cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
|
||||
WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
|
||||
ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
|
||||
MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
|
||||
h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
|
||||
0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
|
||||
A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
|
||||
T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
|
||||
B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
|
||||
B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
|
||||
KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
|
||||
OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
|
||||
jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
|
||||
qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
|
||||
rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
|
||||
HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
|
||||
hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
|
||||
ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
|
||||
3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
|
||||
NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
|
||||
ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
|
||||
TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
|
||||
jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
|
||||
oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
|
||||
4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
|
||||
mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
|
||||
emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
|
||||
-----END CERTIFICATE-----
|
||||
64
README.md
64
README.md
@@ -19,16 +19,16 @@ SkillOpt is a framework for optimizing a natural-language **skill document** thr
|
||||
|
||||
It does **not** fine-tune model parameters. Instead, it treats the skill document as the optimization target:
|
||||
|
||||
- The **student** model executes tasks with the current skill
|
||||
- The **teacher** model analyzes trajectories and proposes edits
|
||||
- The **target** model executes tasks with the current skill
|
||||
- The **optimizer** model analyzes trajectories and proposes edits
|
||||
- The framework merges, ranks, applies, and validates those edits
|
||||
- Only validated skill updates are kept
|
||||
|
||||
| Deep Learning | SkillOpt |
|
||||
|---|---|
|
||||
| Model weights | Skill document (Markdown) |
|
||||
| Forward pass | Rollout (student executes tasks) |
|
||||
| Loss computation | Reflect (teacher analyzes trajectories) |
|
||||
| Forward pass | Rollout (target executes tasks) |
|
||||
| Loss computation | Reflect (optimizer analyzes trajectories) |
|
||||
| Gradient | Edit patches (proposed skill improvements) |
|
||||
| Gradient clipping | Edit ranking & selection (`learning_rate`) |
|
||||
| Weight update | Patch application to skill document |
|
||||
@@ -59,17 +59,17 @@ This gives a training-style loop for prompt / policy optimization:
|
||||
Every training step executes the following pipeline in `skillopt/engine/trainer.py`:
|
||||
|
||||
1. **Rollout**
|
||||
The student model runs a batch of tasks using the current skill.
|
||||
The target model runs a batch of tasks using the current skill.
|
||||
|
||||
2. **Reflect**
|
||||
The teacher analyzes minibatches of trajectories and emits raw patches.
|
||||
The optimizer analyzes minibatches of trajectories and emits raw patches.
|
||||
Failure-driven and success-driven patches are tracked separately.
|
||||
|
||||
3. **Aggregate**
|
||||
Raw patches are merged hierarchically. Metadata such as `support_count` and `source_type` is carried into the merged patch so later ranking can use it.
|
||||
|
||||
4. **Select**
|
||||
The teacher ranks the merged edit pool and keeps up to `edit_budget` edits.
|
||||
The optimizer ranks the merged edit pool and keeps up to `edit_budget` edits.
|
||||
|
||||
5. **Update**
|
||||
The selected edits are applied to the skill document. The framework records an `edit_apply_report.json` so you can see which edits actually landed, which were skipped, and why.
|
||||
@@ -84,7 +84,7 @@ Inside an epoch, the trainer maintains a step buffer containing:
|
||||
- Compact failure-pattern summaries from previous steps
|
||||
- Rejected edits and their score deltas
|
||||
|
||||
That context is fed back into later reflection calls so the teacher can avoid repeating ineffective edits and can focus on unsolved error patterns.
|
||||
That context is fed back into later reflection calls so the optimizer can avoid repeating ineffective edits and can focus on unsolved error patterns.
|
||||
|
||||
### Epoch-Level Mechanisms
|
||||
|
||||
@@ -96,7 +96,7 @@ This guidance is **not** blindly written through — it is converted into a cand
|
||||
|
||||
#### Meta Skill
|
||||
|
||||
`meta_skill` is teacher-side cross-epoch memory. It does not directly edit the current skill. Instead, it writes a compact memory artifact describing longer-term patterns across adjacent epochs. That memory is loaded into later reflection / merge / ranking calls as extra context.
|
||||
`meta_skill` is optimizer-side cross-epoch memory. It does not directly edit the current skill. Instead, it writes a compact memory artifact describing longer-term patterns across adjacent epochs. That memory is loaded into later reflection / merge / ranking calls as extra context.
|
||||
|
||||
#### Meta Reflect
|
||||
|
||||
@@ -161,10 +161,10 @@ SkillOpt uses a hierarchical YAML configuration system. Each benchmark config in
|
||||
|
||||
```yaml
|
||||
model:
|
||||
teacher_backend: openai_chat # openai_chat | claude_chat | qwen_chat
|
||||
student_backend: openai_chat # openai_chat | claude_chat | codex_exec | qwen_chat
|
||||
teacher: gpt-5.5 # teacher model deployment name
|
||||
student: gpt-5.5 # student model deployment name
|
||||
optimizer_backend: openai_chat # openai_chat | claude_chat | qwen_chat
|
||||
target_backend: openai_chat # openai_chat | claude_chat | codex_exec | qwen_chat
|
||||
optimizer: gpt-5.5 # optimizer model deployment name
|
||||
target: gpt-5.5 # target model deployment name
|
||||
reasoning_effort: medium # low | medium | high
|
||||
|
||||
train:
|
||||
@@ -205,8 +205,8 @@ Override any config key from the command line:
|
||||
```bash
|
||||
python scripts/train.py \
|
||||
--config configs/searchqa/default.yaml \
|
||||
--cfg-options model.teacher_backend=openai_chat \
|
||||
model.student_backend=codex_exec \
|
||||
--cfg-options model.optimizer_backend=openai_chat \
|
||||
model.target_backend=codex_exec \
|
||||
train.batch_size=40 \
|
||||
optimizer.learning_rate=4
|
||||
|
||||
@@ -214,8 +214,8 @@ python scripts/train.py \
|
||||
python scripts/train.py \
|
||||
--config configs/searchqa/default.yaml \
|
||||
--backend azure_openai \
|
||||
--teacher_model gpt-5.5 \
|
||||
--student_model gpt-5.5 \
|
||||
--optimizer_model gpt-5.5 \
|
||||
--target_model gpt-5.5 \
|
||||
--reasoning_effort medium
|
||||
```
|
||||
|
||||
@@ -227,19 +227,19 @@ All model access goes through the unified backend router in `skillopt/model/`.
|
||||
|
||||
| Backend | Use case | Config key |
|
||||
|---|---|---|
|
||||
| `openai_chat` | Azure OpenAI / OpenAI API | teacher / student |
|
||||
| `claude_chat` | Anthropic Claude | teacher / student |
|
||||
| `codex_exec` | Codex execution harness | student only |
|
||||
| `qwen_chat` | Local Qwen via vLLM | teacher / student |
|
||||
| `openai_chat` | Azure OpenAI / OpenAI API | optimizer / target |
|
||||
| `claude_chat` | Anthropic Claude | optimizer / target |
|
||||
| `codex_exec` | Codex execution harness | target only |
|
||||
| `qwen_chat` | Local Qwen via vLLM | optimizer / target |
|
||||
|
||||
Separate teacher/student endpoints are supported:
|
||||
Separate optimizer/target endpoints are supported:
|
||||
|
||||
```yaml
|
||||
model:
|
||||
teacher_backend: openai_chat
|
||||
student_backend: codex_exec
|
||||
teacher: gpt-5.5
|
||||
student: gpt-5.5-codex
|
||||
optimizer_backend: openai_chat
|
||||
target_backend: codex_exec
|
||||
optimizer: gpt-5.5
|
||||
target: gpt-5.5-codex
|
||||
```
|
||||
|
||||
---
|
||||
@@ -292,15 +292,15 @@ Basic training:
|
||||
python scripts/train.py --config configs/searchqa/default.yaml
|
||||
```
|
||||
|
||||
Exec harness (Codex student):
|
||||
Exec harness (Codex target):
|
||||
|
||||
```bash
|
||||
python scripts/train.py \
|
||||
--config configs/searchqa/default.yaml \
|
||||
--teacher_backend openai_chat \
|
||||
--student_backend codex_exec \
|
||||
--teacher_model gpt-5.5 \
|
||||
--student_model gpt-5.5-codex \
|
||||
--optimizer_backend openai_chat \
|
||||
--target_backend codex_exec \
|
||||
--optimizer_model gpt-5.5 \
|
||||
--target_model gpt-5.5-codex \
|
||||
--use_deep_reflect true \
|
||||
--skill_update_mode rewrite_from_suggestions
|
||||
```
|
||||
@@ -366,7 +366,7 @@ The trainer resumes from `runtime_state.json` when present. That state tracks:
|
||||
1. Create `skillopt/envs/<your_env>/` with:
|
||||
- `adapter.py` — implements `EnvAdapter`
|
||||
- `dataloader.py` — data loading logic
|
||||
- `rollout.py` — student execution logic
|
||||
- `rollout.py` — target execution logic
|
||||
- `skills/initial.md` — initial skill document
|
||||
2. Add a config at `configs/<your_env>/default.yaml`
|
||||
3. Register in `skillopt/envs/__init__.py`
|
||||
|
||||
@@ -3,10 +3,10 @@
|
||||
|
||||
model:
|
||||
backend: azure_openai
|
||||
teacher: gpt-5.5
|
||||
student: gpt-5.5
|
||||
teacher_backend: openai_chat
|
||||
student_backend: openai_chat
|
||||
optimizer: gpt-5.5
|
||||
target: gpt-5.5
|
||||
optimizer_backend: openai_chat
|
||||
target_backend: openai_chat
|
||||
reasoning_effort: medium
|
||||
rewrite_reasoning_effort: ""
|
||||
rewrite_max_completion_tokens: 64000
|
||||
@@ -24,25 +24,25 @@ model:
|
||||
claude_code_exec_use_sdk: auto
|
||||
claude_code_exec_effort: medium
|
||||
claude_code_exec_max_thinking_tokens: 16384
|
||||
codex_trace_to_teacher: true
|
||||
codex_trace_to_optimizer: true
|
||||
azure_openai_endpoint: "" # e.g. "https://your-resource.openai.azure.com/"
|
||||
azure_openai_api_version: "2024-12-01-preview"
|
||||
azure_openai_api_key: "" # Fill locally if you do not export AZURE_OPENAI_API_KEY
|
||||
azure_openai_auth_mode: azure_cli
|
||||
azure_openai_ad_scope: "https://cognitiveservices.azure.com/.default"
|
||||
azure_openai_managed_identity_client_id: ""
|
||||
teacher_azure_openai_endpoint: "" # e.g. "https://your-resource.openai.azure.com/"
|
||||
teacher_azure_openai_api_version: "2024-12-01-preview"
|
||||
teacher_azure_openai_api_key: ""
|
||||
teacher_azure_openai_auth_mode: azure_cli
|
||||
teacher_azure_openai_ad_scope: "https://cognitiveservices.azure.com/.default"
|
||||
teacher_azure_openai_managed_identity_client_id: ""
|
||||
student_azure_openai_endpoint: "" # e.g. "https://your-resource.openai.azure.com/"
|
||||
student_azure_openai_api_version: "2024-12-01-preview"
|
||||
student_azure_openai_api_key: ""
|
||||
student_azure_openai_auth_mode: azure_cli
|
||||
student_azure_openai_ad_scope: "https://cognitiveservices.azure.com/.default"
|
||||
student_azure_openai_managed_identity_client_id: ""
|
||||
optimizer_azure_openai_endpoint: "" # e.g. "https://your-resource.openai.azure.com/"
|
||||
optimizer_azure_openai_api_version: "2024-12-01-preview"
|
||||
optimizer_azure_openai_api_key: ""
|
||||
optimizer_azure_openai_auth_mode: azure_cli
|
||||
optimizer_azure_openai_ad_scope: "https://cognitiveservices.azure.com/.default"
|
||||
optimizer_azure_openai_managed_identity_client_id: ""
|
||||
target_azure_openai_endpoint: "" # e.g. "https://your-resource.openai.azure.com/"
|
||||
target_azure_openai_api_version: "2024-12-01-preview"
|
||||
target_azure_openai_api_key: ""
|
||||
target_azure_openai_auth_mode: azure_cli
|
||||
target_azure_openai_ad_scope: "https://cognitiveservices.azure.com/.default"
|
||||
target_azure_openai_managed_identity_client_id: ""
|
||||
|
||||
train:
|
||||
num_epochs: 4
|
||||
@@ -57,9 +57,6 @@ gradient:
|
||||
analyst_workers: 16
|
||||
max_analyst_rounds: 3
|
||||
failure_only: false
|
||||
use_deep_reflect: false
|
||||
deep_reflect_failures: 4
|
||||
deep_reflect_successes: 2
|
||||
|
||||
optimizer:
|
||||
learning_rate: 4 # max edits per step (edit_budget)
|
||||
@@ -67,8 +64,6 @@ optimizer:
|
||||
lr_scheduler: cosine # constant / linear / cosine / autonomous
|
||||
lr_control_mode: fixed # fixed / autonomous / none
|
||||
skill_update_mode: patch # patch / rewrite_from_suggestions / full_rewrite_minibatch
|
||||
use_meta_reflect: false
|
||||
meta_learning_rate: 4 # max edits per epoch-level meta-reflect
|
||||
use_slow_update: true
|
||||
slow_update_samples: 20
|
||||
longitudinal_pair_policy: mixed # mixed / changed / unchanged
|
||||
@@ -89,5 +84,5 @@ env:
|
||||
split_dir: ""
|
||||
data_path: ""
|
||||
split_output_dir: ""
|
||||
exec_timeout: 120 # per student model/code-agent call timeout in seconds
|
||||
exec_timeout: 120 # per target model/code-agent call timeout in seconds
|
||||
out_root: ""
|
||||
|
||||
@@ -10,7 +10,6 @@ gradient:
|
||||
|
||||
optimizer:
|
||||
learning_rate: 4
|
||||
use_meta_reflect: false
|
||||
|
||||
evaluation:
|
||||
sel_env_num: 0
|
||||
|
||||
@@ -25,8 +25,8 @@ Benchmark configs inherit from `_base_/default.yaml` and override specific value
|
||||
```yaml
|
||||
model:
|
||||
backend: azure_openai # azure_openai | openai_chat | claude_code_exec | qwen
|
||||
teacher: gpt-5.5 # Teacher model (for reflection)
|
||||
student: gpt-5.5 # Student model (for rollout)
|
||||
optimizer: gpt-5.5 # Optimizer model (for reflection)
|
||||
target: gpt-5.5 # Target model (for rollout)
|
||||
```
|
||||
|
||||
### Training
|
||||
|
||||
@@ -7,9 +7,9 @@ SkillOpt is designed around a core insight: **optimizing natural-language prompt
|
||||
| Deep Learning | SkillOpt | Description |
|
||||
|---|---|---|
|
||||
| **Model weights** | Skill document (Markdown) | The thing being optimized |
|
||||
| **Forward pass** | Rollout | Student executes tasks using current skill |
|
||||
| **Forward pass** | Rollout | Target executes tasks using current skill |
|
||||
| **Loss function** | Task evaluator | Scores task execution quality |
|
||||
| **Backpropagation** | Reflect | Teacher analyzes failures → edit patches |
|
||||
| **Backpropagation** | Reflect | Optimizer analyzes failures → edit patches |
|
||||
| **Gradients** | Edit patches | Proposed changes to the skill |
|
||||
| **Gradient aggregation** | Patch aggregation | Merge similar edits |
|
||||
| **Gradient clipping** | Edit selection | Cap max edits per step |
|
||||
@@ -21,7 +21,7 @@ SkillOpt is designed around a core insight: **optimizing natural-language prompt
|
||||
| **Training step** | Step | One rollout → reflect → update cycle |
|
||||
| **Epoch** | Epoch | Full pass with slow update + meta memory |
|
||||
| **Momentum** | Slow update | Longitudinal comparison at epoch boundary |
|
||||
| **Meta-learning** | Meta skill | Cross-epoch teacher strategy memory |
|
||||
| **Meta-learning** | Meta skill | Cross-epoch optimizer strategy memory |
|
||||
| **Batch size** | `batch_size` | Tasks sampled per rollout |
|
||||
| **Data parallelism** | `analyst_workers` | Parallel reflection workers |
|
||||
| **Training set** | Train split | Items used for rollout |
|
||||
@@ -44,7 +44,7 @@ From our experiments, these DL intuitions transfer well:
|
||||
- **Cosine schedule > constant** — same as in DL, cosine annealing helps convergence
|
||||
- **Moderate LR (4-16) > very high/low** — too few edits = slow learning, too many = noisy
|
||||
- **Slow update helps** — longitudinal comparison prevents catastrophic forgetting across epochs
|
||||
- **Meta skill memory improves reflection** — teacher benefits from cross-epoch strategy notes
|
||||
- **Meta skill memory improves reflection** — optimizer benefits from cross-epoch strategy notes
|
||||
|
||||
!!! warning "What doesn't transfer"
|
||||
- **Larger batch size ≠ better** — larger rollout batches have diminishing returns due to API costs
|
||||
|
||||
@@ -33,7 +33,7 @@ optimizer:
|
||||
learning_rate: 4 # (max edits per step)
|
||||
lr_scheduler: cosine # (learning rate schedule)
|
||||
use_slow_update: true # (momentum at epoch boundary)
|
||||
use_meta_skill: true # (cross-epoch teacher memory)
|
||||
use_meta_skill: true # (cross-epoch optimizer memory)
|
||||
|
||||
gradient:
|
||||
analyst_workers: 16 # (parallel reflection workers)
|
||||
|
||||
@@ -76,7 +76,7 @@ class MyBenchmarkEnv(EnvAdapter):
|
||||
Args:
|
||||
item: The data item to process
|
||||
skill: Current skill document content
|
||||
model: The student model instance
|
||||
model: The target model instance
|
||||
|
||||
Returns:
|
||||
TaskResult with prediction, score, and trajectory
|
||||
|
||||
@@ -70,7 +70,7 @@ Track your skill's evolution through:
|
||||
1. **Start with a seed skill** (`env.skill_init`) if you have domain knowledge — it converges faster
|
||||
2. **Use cosine LR schedule** — aggressive early exploration + careful late refinement
|
||||
3. **Enable slow update** (`use_slow_update: true`) to prevent forgetting across epochs
|
||||
4. **Enable meta skill** (`use_meta_skill: true`) so the teacher accumulates strategy memory
|
||||
4. **Enable meta skill** (`use_meta_skill: true`) so the optimizer accumulates strategy memory
|
||||
|
||||
## Next Steps
|
||||
|
||||
|
||||
@@ -10,8 +10,8 @@ SkillOpt's core insight: **optimizing natural-language skill documents follows t
|
||||
│ │
|
||||
│ for epoch in epochs: │
|
||||
│ for step in steps: │
|
||||
│ 1. Rollout — Student executes tasks │
|
||||
│ 2. Reflect — Teacher analyzes trajectories │
|
||||
│ 1. Rollout — Target executes tasks │
|
||||
│ 2. Reflect — Optimizer analyzes trajectories │
|
||||
│ 3. Aggregate — Hierarchical merge of patches │
|
||||
│ 4. Select — Rank & clip edits (learning rate) │
|
||||
│ 5. Update — Apply patches to skill doc │
|
||||
@@ -27,7 +27,7 @@ SkillOpt's core insight: **optimizing natural-language skill documents follows t
|
||||
|
||||
### 1. Rollout (Forward Pass)
|
||||
|
||||
The **student** model executes tasks using the current skill document as its prompt. Each task produces a trajectory and a score.
|
||||
The **target** model executes tasks using the current skill document as its prompt. Each task produces a trajectory and a score.
|
||||
|
||||
```python
|
||||
# Analogy: forward pass through the network
|
||||
@@ -37,7 +37,7 @@ scores = evaluate(predictions, ground_truth)
|
||||
|
||||
### 2. Reflect (Backward Pass)
|
||||
|
||||
The **teacher** model analyzes failed trajectories and produces **edit patches** — structured suggestions for improving the skill document.
|
||||
The **optimizer** model analyzes failed trajectories and produces **edit patches** — structured suggestions for improving the skill document.
|
||||
|
||||
Two modes:
|
||||
|
||||
@@ -84,7 +84,7 @@ At the end of each epoch (starting from epoch 2), the system performs a **longit
|
||||
|
||||
### Meta Skill
|
||||
|
||||
A **meta-skill memory** accumulates high-level strategy notes across the entire training run. At the end of each epoch, the teacher reflects on what changed between epochs and produces a compact memory that is provided as additional context during future reflection steps.
|
||||
A **meta-skill memory** accumulates high-level strategy notes across the entire training run. At the end of each epoch, the optimizer reflects on what changed between epochs and produces a compact memory that is provided as additional context during future reflection steps.
|
||||
|
||||
## Next Steps
|
||||
|
||||
|
||||
@@ -26,7 +26,7 @@ hide:
|
||||
<div class="pipeline-stage" id="stage-rollout">
|
||||
<div class="stage-icon">🎯</div>
|
||||
<div class="stage-label">Rollout</div>
|
||||
<div class="stage-desc">Student executes tasks</div>
|
||||
<div class="stage-desc">Target executes tasks</div>
|
||||
</div>
|
||||
|
||||
<div class="pipeline-arrow"><div class="flow-line"></div></div>
|
||||
@@ -34,7 +34,7 @@ hide:
|
||||
<div class="pipeline-stage" id="stage-reflect">
|
||||
<div class="stage-icon">🔍</div>
|
||||
<div class="stage-label">Reflect</div>
|
||||
<div class="stage-desc">Teacher analyzes trajectories</div>
|
||||
<div class="stage-desc">Optimizer analyzes trajectories</div>
|
||||
</div>
|
||||
|
||||
<div class="pipeline-arrow"><div class="flow-line"></div></div>
|
||||
@@ -88,8 +88,8 @@ SkillOpt brings the familiar deep-learning training paradigm to agentic prompt o
|
||||
| Deep Learning | SkillOpt |
|
||||
|---|---|
|
||||
| Model weights | Skill document (Markdown) |
|
||||
| Forward pass | Rollout (student executes tasks) |
|
||||
| Loss / gradient | Reflect (teacher produces edit patches) |
|
||||
| Forward pass | Rollout (target executes tasks) |
|
||||
| Loss / gradient | Reflect (optimizer produces edit patches) |
|
||||
| Gradient clipping | Edit selection (`learning_rate` = max edits) |
|
||||
| SGD step | Patch application to skill |
|
||||
| Validation set | Gated evaluation on selection split |
|
||||
|
||||
@@ -7,8 +7,8 @@ Complete reference for all SkillOpt configuration parameters.
|
||||
| Parameter | Type | Default | Description |
|
||||
|---|---|---|---|
|
||||
| `model.backend` | str | `azure_openai` | Backend: `azure_openai` / `openai_chat` / `claude_code_exec` / `qwen` |
|
||||
| `model.teacher` | str | `gpt-5.5` | Teacher model (for reflection & slow update) |
|
||||
| `model.student` | str | `gpt-5.5` | Student model (for rollout execution) |
|
||||
| `model.optimizer` | str | `gpt-5.5` | Optimizer model (for reflection & slow update) |
|
||||
| `model.target` | str | `gpt-5.5` | Target model (for rollout execution) |
|
||||
| `model.reasoning_effort` | str | `medium` | Reasoning effort level |
|
||||
|
||||
## Training (`train`)
|
||||
@@ -40,7 +40,7 @@ Complete reference for all SkillOpt configuration parameters.
|
||||
| `optimizer.skill_update_mode` | str | `patch` | — | `patch` / `rewrite_from_suggestions` / `full_rewrite_minibatch` |
|
||||
| `optimizer.use_slow_update` | bool | `true` | Momentum | Epoch-boundary longitudinal comparison & guidance |
|
||||
| `optimizer.slow_update_samples` | int | 20 | — | Samples for slow update evaluation |
|
||||
| `optimizer.use_meta_skill` | bool | `true` | Meta-learning | Cross-epoch teacher-side strategy memory |
|
||||
| `optimizer.use_meta_skill` | bool | `true` | Meta-learning | Cross-epoch optimizer-side strategy memory |
|
||||
| `optimizer.longitudinal_pair_policy` | str | `mixed` | — | `mixed` / `changed` / `unchanged` |
|
||||
|
||||
## Evaluation (`evaluation`)
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
#!/usr/bin/env python3
|
||||
"""ReflACT eval-only: run a single skill on a dataset without training.
|
||||
"""SkillOpt eval-only: run a single skill on a dataset without training.
|
||||
|
||||
Usage
|
||||
-----
|
||||
@@ -29,10 +29,10 @@ from skillopt.model import (
|
||||
configure_claude_code_exec,
|
||||
configure_codex_exec,
|
||||
set_reasoning_effort,
|
||||
set_student_backend,
|
||||
set_student_deployment,
|
||||
set_teacher_backend,
|
||||
set_teacher_deployment,
|
||||
set_target_backend,
|
||||
set_target_deployment,
|
||||
set_optimizer_backend,
|
||||
set_optimizer_deployment,
|
||||
)
|
||||
from skillopt.model.common import default_model_for_backend, normalize_backend_name
|
||||
|
||||
@@ -126,7 +126,7 @@ _BOOL = lambda x: str(x).lower() in ("true", "1", "yes") # noqa: E731
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
|
||||
p = argparse.ArgumentParser(description="ReflACT eval-only")
|
||||
p = argparse.ArgumentParser(description="SkillOpt eval-only")
|
||||
p.add_argument("--config", type=str, required=True)
|
||||
p.add_argument("--skill", type=str, required=True,
|
||||
help="Path to skill .md file to evaluate")
|
||||
@@ -138,10 +138,10 @@ def parse_args() -> argparse.Namespace:
|
||||
p.add_argument("--env", type=str)
|
||||
p.add_argument("--backend", type=str,
|
||||
choices=["azure_openai", "codex", "codex_exec", "claude", "claude_chat", "claude_code_exec"])
|
||||
p.add_argument("--teacher_model", type=str)
|
||||
p.add_argument("--student_model", type=str)
|
||||
p.add_argument("--teacher_backend", type=str)
|
||||
p.add_argument("--student_backend", type=str)
|
||||
p.add_argument("--optimizer_model", type=str)
|
||||
p.add_argument("--target_model", type=str)
|
||||
p.add_argument("--optimizer_backend", type=str)
|
||||
p.add_argument("--target_backend", type=str)
|
||||
p.add_argument("--reasoning_effort", type=str,
|
||||
choices=["", "low", "medium", "high", "xhigh", "max"])
|
||||
p.add_argument("--azure_endpoint", type=str)
|
||||
@@ -153,18 +153,18 @@ def parse_args() -> argparse.Namespace:
|
||||
p.add_argument("--azure_openai_auth_mode", type=str)
|
||||
p.add_argument("--azure_openai_ad_scope", type=str)
|
||||
p.add_argument("--azure_openai_managed_identity_client_id", type=str)
|
||||
p.add_argument("--teacher_azure_openai_endpoint", type=str)
|
||||
p.add_argument("--teacher_azure_openai_api_version", type=str)
|
||||
p.add_argument("--teacher_azure_openai_api_key", type=str)
|
||||
p.add_argument("--teacher_azure_openai_auth_mode", type=str)
|
||||
p.add_argument("--teacher_azure_openai_ad_scope", type=str)
|
||||
p.add_argument("--teacher_azure_openai_managed_identity_client_id", type=str)
|
||||
p.add_argument("--student_azure_openai_endpoint", type=str)
|
||||
p.add_argument("--student_azure_openai_api_version", type=str)
|
||||
p.add_argument("--student_azure_openai_api_key", type=str)
|
||||
p.add_argument("--student_azure_openai_auth_mode", type=str)
|
||||
p.add_argument("--student_azure_openai_ad_scope", type=str)
|
||||
p.add_argument("--student_azure_openai_managed_identity_client_id", type=str)
|
||||
p.add_argument("--optimizer_azure_openai_endpoint", type=str)
|
||||
p.add_argument("--optimizer_azure_openai_api_version", type=str)
|
||||
p.add_argument("--optimizer_azure_openai_api_key", type=str)
|
||||
p.add_argument("--optimizer_azure_openai_auth_mode", type=str)
|
||||
p.add_argument("--optimizer_azure_openai_ad_scope", type=str)
|
||||
p.add_argument("--optimizer_azure_openai_managed_identity_client_id", type=str)
|
||||
p.add_argument("--target_azure_openai_endpoint", type=str)
|
||||
p.add_argument("--target_azure_openai_api_version", type=str)
|
||||
p.add_argument("--target_azure_openai_api_key", type=str)
|
||||
p.add_argument("--target_azure_openai_auth_mode", type=str)
|
||||
p.add_argument("--target_azure_openai_ad_scope", type=str)
|
||||
p.add_argument("--target_azure_openai_managed_identity_client_id", type=str)
|
||||
p.add_argument("--codex_exec_path", type=str)
|
||||
p.add_argument("--codex_exec_sandbox", type=str)
|
||||
p.add_argument("--codex_exec_profile", type=str)
|
||||
@@ -214,10 +214,10 @@ def main() -> None:
|
||||
from skillopt.config import apply_overrides
|
||||
_MAP = {
|
||||
"backend": "model.backend",
|
||||
"teacher_model": "model.teacher",
|
||||
"student_model": "model.student",
|
||||
"teacher_backend": "model.teacher_backend",
|
||||
"student_backend": "model.student_backend",
|
||||
"optimizer_model": "model.optimizer",
|
||||
"target_model": "model.target",
|
||||
"optimizer_backend": "model.optimizer_backend",
|
||||
"target_backend": "model.target_backend",
|
||||
"reasoning_effort": "model.reasoning_effort",
|
||||
"azure_endpoint": "model.azure_endpoint",
|
||||
"azure_api_version": "model.azure_api_version",
|
||||
@@ -228,18 +228,18 @@ def main() -> None:
|
||||
"azure_openai_auth_mode": "model.azure_openai_auth_mode",
|
||||
"azure_openai_ad_scope": "model.azure_openai_ad_scope",
|
||||
"azure_openai_managed_identity_client_id": "model.azure_openai_managed_identity_client_id",
|
||||
"teacher_azure_openai_endpoint": "model.teacher_azure_openai_endpoint",
|
||||
"teacher_azure_openai_api_version": "model.teacher_azure_openai_api_version",
|
||||
"teacher_azure_openai_api_key": "model.teacher_azure_openai_api_key",
|
||||
"teacher_azure_openai_auth_mode": "model.teacher_azure_openai_auth_mode",
|
||||
"teacher_azure_openai_ad_scope": "model.teacher_azure_openai_ad_scope",
|
||||
"teacher_azure_openai_managed_identity_client_id": "model.teacher_azure_openai_managed_identity_client_id",
|
||||
"student_azure_openai_endpoint": "model.student_azure_openai_endpoint",
|
||||
"student_azure_openai_api_version": "model.student_azure_openai_api_version",
|
||||
"student_azure_openai_api_key": "model.student_azure_openai_api_key",
|
||||
"student_azure_openai_auth_mode": "model.student_azure_openai_auth_mode",
|
||||
"student_azure_openai_ad_scope": "model.student_azure_openai_ad_scope",
|
||||
"student_azure_openai_managed_identity_client_id": "model.student_azure_openai_managed_identity_client_id",
|
||||
"optimizer_azure_openai_endpoint": "model.optimizer_azure_openai_endpoint",
|
||||
"optimizer_azure_openai_api_version": "model.optimizer_azure_openai_api_version",
|
||||
"optimizer_azure_openai_api_key": "model.optimizer_azure_openai_api_key",
|
||||
"optimizer_azure_openai_auth_mode": "model.optimizer_azure_openai_auth_mode",
|
||||
"optimizer_azure_openai_ad_scope": "model.optimizer_azure_openai_ad_scope",
|
||||
"optimizer_azure_openai_managed_identity_client_id": "model.optimizer_azure_openai_managed_identity_client_id",
|
||||
"target_azure_openai_endpoint": "model.target_azure_openai_endpoint",
|
||||
"target_azure_openai_api_version": "model.target_azure_openai_api_version",
|
||||
"target_azure_openai_api_key": "model.target_azure_openai_api_key",
|
||||
"target_azure_openai_auth_mode": "model.target_azure_openai_auth_mode",
|
||||
"target_azure_openai_ad_scope": "model.target_azure_openai_ad_scope",
|
||||
"target_azure_openai_managed_identity_client_id": "model.target_azure_openai_managed_identity_client_id",
|
||||
"codex_exec_path": "model.codex_exec_path",
|
||||
"codex_exec_sandbox": "model.codex_exec_sandbox",
|
||||
"codex_exec_profile": "model.codex_exec_profile",
|
||||
@@ -288,7 +288,7 @@ def main() -> None:
|
||||
explicit_backend = str(option).split("=", 1)[1].strip()
|
||||
break
|
||||
|
||||
backend = normalize_backend_name(cfg.get("model_backend") or cfg.get("student_backend") or "azure_openai")
|
||||
backend = normalize_backend_name(cfg.get("model_backend") or cfg.get("target_backend") or "azure_openai")
|
||||
|
||||
def _has_model_override(dotted_key: str, legacy_key: str) -> bool:
|
||||
if getattr(args, legacy_key, None) is not None:
|
||||
@@ -303,43 +303,43 @@ def main() -> None:
|
||||
backend = normalize_backend_name(explicit_backend)
|
||||
cfg["model_backend"] = backend
|
||||
if backend in {"claude", "claude_chat"}:
|
||||
cfg.setdefault("teacher_backend", "claude_chat")
|
||||
cfg.setdefault("student_backend", "claude_chat")
|
||||
cfg.setdefault("optimizer_backend", "claude_chat")
|
||||
cfg.setdefault("target_backend", "claude_chat")
|
||||
elif backend in {"codex", "codex_exec"}:
|
||||
cfg.setdefault("teacher_backend", "openai_chat")
|
||||
cfg.setdefault("student_backend", "codex_exec")
|
||||
cfg.setdefault("optimizer_backend", "openai_chat")
|
||||
cfg.setdefault("target_backend", "codex_exec")
|
||||
elif backend == "claude_code_exec":
|
||||
cfg.setdefault("teacher_backend", "openai_chat")
|
||||
cfg.setdefault("student_backend", "claude_code_exec")
|
||||
cfg.setdefault("optimizer_backend", "openai_chat")
|
||||
cfg.setdefault("target_backend", "claude_code_exec")
|
||||
else:
|
||||
cfg.setdefault("teacher_backend", "openai_chat")
|
||||
cfg.setdefault("student_backend", "openai_chat")
|
||||
cfg.setdefault("optimizer_backend", "openai_chat")
|
||||
cfg.setdefault("target_backend", "openai_chat")
|
||||
else:
|
||||
cfg.setdefault("teacher_backend", "openai_chat")
|
||||
cfg.setdefault("student_backend", "openai_chat")
|
||||
cfg.setdefault("optimizer_backend", "openai_chat")
|
||||
cfg.setdefault("target_backend", "openai_chat")
|
||||
|
||||
if cfg.get("teacher_backend") == "claude_chat":
|
||||
if cfg.get("optimizer_backend") == "claude_chat":
|
||||
if (
|
||||
str(cfg.get("teacher_model", "") or "").strip() in _OPENAI_DEFAULT_MODEL_SENTINELS
|
||||
and not _has_model_override("model.teacher", "teacher_model")
|
||||
str(cfg.get("optimizer_model", "") or "").strip() in _OPENAI_DEFAULT_MODEL_SENTINELS
|
||||
and not _has_model_override("model.optimizer", "optimizer_model")
|
||||
):
|
||||
cfg["teacher_model"] = default_model_for_backend("claude_chat")
|
||||
if cfg.get("student_backend") == "claude_chat":
|
||||
cfg["optimizer_model"] = default_model_for_backend("claude_chat")
|
||||
if cfg.get("target_backend") == "claude_chat":
|
||||
if (
|
||||
str(cfg.get("student_model", "") or "").strip() in _OPENAI_DEFAULT_MODEL_SENTINELS
|
||||
and not _has_model_override("model.student", "student_model")
|
||||
str(cfg.get("target_model", "") or "").strip() in _OPENAI_DEFAULT_MODEL_SENTINELS
|
||||
and not _has_model_override("model.target", "target_model")
|
||||
):
|
||||
cfg["student_model"] = default_model_for_backend("claude_chat")
|
||||
if cfg.get("student_backend") == "claude_code_exec":
|
||||
cfg["target_model"] = default_model_for_backend("claude_chat")
|
||||
if cfg.get("target_backend") == "claude_code_exec":
|
||||
if (
|
||||
str(cfg.get("student_model", "") or "").strip() in _OPENAI_DEFAULT_MODEL_SENTINELS
|
||||
and not _has_model_override("model.student", "student_model")
|
||||
str(cfg.get("target_model", "") or "").strip() in _OPENAI_DEFAULT_MODEL_SENTINELS
|
||||
and not _has_model_override("model.target", "target_model")
|
||||
):
|
||||
cfg["student_model"] = default_model_for_backend("claude_chat")
|
||||
cfg["target_model"] = default_model_for_backend("claude_chat")
|
||||
|
||||
if not cfg.get("out_root"):
|
||||
env = cfg.get("env", "unknown")
|
||||
model = cfg.get("student_model", "unknown").replace("/", "-")
|
||||
model = cfg.get("target_model", "unknown").replace("/", "-")
|
||||
ts = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
|
||||
cfg["out_root"] = os.path.join("outputs", f"eval_{env}_{model}_{ts}")
|
||||
|
||||
@@ -362,27 +362,27 @@ def main() -> None:
|
||||
auth_mode=cfg.get("azure_openai_auth_mode") or None,
|
||||
ad_scope=cfg.get("azure_openai_ad_scope") or None,
|
||||
managed_identity_client_id=cfg.get("azure_openai_managed_identity_client_id") or None,
|
||||
teacher_endpoint=cfg.get("teacher_azure_openai_endpoint") or None,
|
||||
teacher_api_version=cfg.get("teacher_azure_openai_api_version") or None,
|
||||
teacher_api_key=cfg.get("teacher_azure_openai_api_key") or None,
|
||||
teacher_auth_mode=cfg.get("teacher_azure_openai_auth_mode") or None,
|
||||
teacher_ad_scope=cfg.get("teacher_azure_openai_ad_scope") or None,
|
||||
teacher_managed_identity_client_id=(
|
||||
cfg.get("teacher_azure_openai_managed_identity_client_id") or None
|
||||
optimizer_endpoint=cfg.get("optimizer_azure_openai_endpoint") or None,
|
||||
optimizer_api_version=cfg.get("optimizer_azure_openai_api_version") or None,
|
||||
optimizer_api_key=cfg.get("optimizer_azure_openai_api_key") or None,
|
||||
optimizer_auth_mode=cfg.get("optimizer_azure_openai_auth_mode") or None,
|
||||
optimizer_ad_scope=cfg.get("optimizer_azure_openai_ad_scope") or None,
|
||||
optimizer_managed_identity_client_id=(
|
||||
cfg.get("optimizer_azure_openai_managed_identity_client_id") or None
|
||||
),
|
||||
student_endpoint=cfg.get("student_azure_openai_endpoint") or None,
|
||||
student_api_version=cfg.get("student_azure_openai_api_version") or None,
|
||||
student_api_key=cfg.get("student_azure_openai_api_key") or None,
|
||||
student_auth_mode=cfg.get("student_azure_openai_auth_mode") or None,
|
||||
student_ad_scope=cfg.get("student_azure_openai_ad_scope") or None,
|
||||
student_managed_identity_client_id=(
|
||||
cfg.get("student_azure_openai_managed_identity_client_id") or None
|
||||
target_endpoint=cfg.get("target_azure_openai_endpoint") or None,
|
||||
target_api_version=cfg.get("target_azure_openai_api_version") or None,
|
||||
target_api_key=cfg.get("target_azure_openai_api_key") or None,
|
||||
target_auth_mode=cfg.get("target_azure_openai_auth_mode") or None,
|
||||
target_ad_scope=cfg.get("target_azure_openai_ad_scope") or None,
|
||||
target_managed_identity_client_id=(
|
||||
cfg.get("target_azure_openai_managed_identity_client_id") or None
|
||||
),
|
||||
)
|
||||
set_teacher_backend(cfg.get("teacher_backend", "openai_chat"))
|
||||
set_student_backend(cfg.get("student_backend", "openai_chat"))
|
||||
set_teacher_deployment(cfg.get("teacher_model", default_model_for_backend(backend)))
|
||||
set_student_deployment(cfg.get("student_model", default_model_for_backend(backend)))
|
||||
set_optimizer_backend(cfg.get("optimizer_backend", "openai_chat"))
|
||||
set_target_backend(cfg.get("target_backend", "openai_chat"))
|
||||
set_optimizer_deployment(cfg.get("optimizer_model", default_model_for_backend(backend)))
|
||||
set_target_deployment(cfg.get("target_model", default_model_for_backend(backend)))
|
||||
configure_codex_exec(
|
||||
path=cfg.get("codex_exec_path", "codex"),
|
||||
sandbox=cfg.get("codex_exec_sandbox", "workspace-write"),
|
||||
|
||||
@@ -1,28 +1,26 @@
|
||||
#!/usr/bin/env bash
|
||||
# ──────────────────────────────────────────────────────────────────────────────
|
||||
# ReflACT — ALFWorld training launch script
|
||||
# SkillOpt — ALFWorld training launch script
|
||||
#
|
||||
# Prerequisites:
|
||||
# pip install -e ".[alfworld]"
|
||||
# pip install alfworld[full] && alfworld-download
|
||||
#
|
||||
# Usage:
|
||||
# bash scripts/run_alfworld.sh
|
||||
# bash scripts/run_alfworld.sh --num_epochs 2 --edit_budget 6
|
||||
# bash scripts/run_alfworld.sh --split_dir /path/to/alfworld_split
|
||||
# ──────────────────────────────────────────────────────────────────────────────
|
||||
set -euo pipefail
|
||||
|
||||
# ── Paths ────────────────────────────────────────────────────────────────────
|
||||
WORKSPACE="${WORKSPACE:-$(cd "$(dirname "$0")/../.." && pwd)}"
|
||||
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
|
||||
PROJECT_ROOT="$(dirname "${SCRIPT_DIR}")"
|
||||
|
||||
# Activate conda environment
|
||||
export PATH="${WORKSPACE}/miniconda3/envs/skillopt/bin:${WORKSPACE}/miniconda3/bin:${PATH}"
|
||||
|
||||
# ALFWorld data — uses ~/.cache/alfworld by default (standard alfworld location)
|
||||
export ALFWORLD_DATA="${ALFWORLD_DATA:-${HOME}/.cache/alfworld}"
|
||||
|
||||
# Ensure ReflACT is importable
|
||||
export PYTHONPATH="${PROJECT_ROOT}:${PYTHONPATH:-}"
|
||||
|
||||
# ── Verify ALFWorld data exists ──────────────────────────────────────────────
|
||||
# ALFWorld data — uses ~/.cache/alfworld by default
|
||||
export ALFWORLD_DATA="${ALFWORLD_DATA:-${HOME}/.cache/alfworld}"
|
||||
|
||||
if [ ! -d "${ALFWORLD_DATA}/json_2.1.1" ]; then
|
||||
echo "ERROR: ALFWorld data not found at ${ALFWORLD_DATA}/json_2.1.1"
|
||||
echo ""
|
||||
@@ -34,25 +32,17 @@ if [ ! -d "${ALFWORLD_DATA}/json_2.1.1" ]; then
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# ── Azure OpenAI credentials ────────────────────────────────────────────────
|
||||
export AZURE_OPENAI_ENDPOINT="${AZURE_OPENAI_ENDPOINT:?Set AZURE_OPENAI_ENDPOINT}"
|
||||
export AZURE_OPENAI_API_KEY="${AZURE_OPENAI_API_KEY:?Set AZURE_OPENAI_API_KEY}"
|
||||
export AZURE_OPENAI_API_VERSION="${AZURE_OPENAI_API_VERSION:-2025-04-01-preview}"
|
||||
OPTIMIZER_MODEL="${OPTIMIZER_MODEL:-gpt-5.5}"
|
||||
TARGET_MODEL="${TARGET_MODEL:-gpt-5.5}"
|
||||
|
||||
# ── Model configuration ─────────────────────────────────────────────────────
|
||||
export TEACHER_DEPLOYMENT="${TEACHER_DEPLOYMENT:-gpt-5.5}"
|
||||
export STUDENT_DEPLOYMENT="${STUDENT_DEPLOYMENT:-gpt-5.5}"
|
||||
|
||||
# ── Output directory ─────────────────────────────────────────────────────────
|
||||
TIMESTAMP=$(date +%Y%m%d_%H%M%S)
|
||||
DEFAULT_OUT_ROOT="${PROJECT_ROOT}/outputs/skillopt_alfworld_${STUDENT_DEPLOYMENT}_${TIMESTAMP}"
|
||||
DEFAULT_OUT_ROOT="${PROJECT_ROOT}/outputs/skillopt_alfworld_${TARGET_MODEL}_${TIMESTAMP}"
|
||||
|
||||
# ── Run ──────────────────────────────────────────────────────────────────────
|
||||
echo "============================================================"
|
||||
echo " ReflACT — Reflective Agent Tuning (ALFWorld)"
|
||||
echo " SkillOpt — ALFWorld Training"
|
||||
echo "============================================================"
|
||||
echo " Teacher: ${TEACHER_DEPLOYMENT}"
|
||||
echo " Student: ${STUDENT_DEPLOYMENT}"
|
||||
echo " Optimizer: ${OPTIMIZER_MODEL}"
|
||||
echo " Target: ${TARGET_MODEL}"
|
||||
echo " ALFWORLD_DATA: ${ALFWORLD_DATA}"
|
||||
echo " Output: ${DEFAULT_OUT_ROOT}"
|
||||
echo "============================================================"
|
||||
@@ -60,7 +50,9 @@ echo "============================================================"
|
||||
cd "${PROJECT_ROOT}"
|
||||
|
||||
python scripts/train.py \
|
||||
--config configs/alfworld_default.yaml \
|
||||
--config configs/alfworld/default.yaml \
|
||||
--optimizer_model "${OPTIMIZER_MODEL}" \
|
||||
--target_model "${TARGET_MODEL}" \
|
||||
--out_root "${DEFAULT_OUT_ROOT}" \
|
||||
"$@"
|
||||
|
||||
|
||||
@@ -1,41 +1,38 @@
|
||||
#!/usr/bin/env bash
|
||||
# ──────────────────────────────────────────────────────────────────────────────
|
||||
# ReflACT — SearchQA training launch script
|
||||
# SkillOpt — SearchQA training launch script
|
||||
#
|
||||
# Usage:
|
||||
# bash scripts/run_searchqa.sh
|
||||
# bash scripts/run_searchqa.sh --data_path data/searchqa_train_2000.json
|
||||
# bash scripts/run_searchqa.sh --num_epochs 2 --edit_budget 6
|
||||
# bash scripts/run_searchqa.sh --split_dir /path/to/searchqa_split
|
||||
# ──────────────────────────────────────────────────────────────────────────────
|
||||
set -euo pipefail
|
||||
|
||||
# ── Paths ────────────────────────────────────────────────────────────────────
|
||||
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
|
||||
PROJECT_ROOT="$(dirname "${SCRIPT_DIR}")"
|
||||
|
||||
# Ensure ReflACT is importable
|
||||
export PYTHONPATH="${PROJECT_ROOT}:${PYTHONPATH:-}"
|
||||
|
||||
# ── Model configuration ─────────────────────────────────────────────────────
|
||||
export TEACHER_DEPLOYMENT="${TEACHER_DEPLOYMENT:-gpt-5.5}"
|
||||
export STUDENT_DEPLOYMENT="${STUDENT_DEPLOYMENT:-gpt-5.5}"
|
||||
OPTIMIZER_MODEL="${OPTIMIZER_MODEL:-gpt-5.5}"
|
||||
TARGET_MODEL="${TARGET_MODEL:-gpt-5.5}"
|
||||
|
||||
# ── Output directory ─────────────────────────────────────────────────────────
|
||||
TIMESTAMP=$(date +%Y%m%d_%H%M%S)
|
||||
DEFAULT_OUT_ROOT="${PROJECT_ROOT}/outputs/skillopt_searchqa_${STUDENT_DEPLOYMENT}_${TIMESTAMP}"
|
||||
DEFAULT_OUT_ROOT="${PROJECT_ROOT}/outputs/skillopt_searchqa_${TARGET_MODEL}_${TIMESTAMP}"
|
||||
|
||||
# ── Run ──────────────────────────────────────────────────────────────────────
|
||||
echo "============================================================"
|
||||
echo " ReflACT — Reflective Agent Tuning (SearchQA)"
|
||||
echo " SkillOpt — SearchQA Training"
|
||||
echo "============================================================"
|
||||
echo " Teacher: ${TEACHER_DEPLOYMENT}"
|
||||
echo " Student: ${STUDENT_DEPLOYMENT}"
|
||||
echo " Optimizer: ${OPTIMIZER_MODEL}"
|
||||
echo " Target: ${TARGET_MODEL}"
|
||||
echo "============================================================"
|
||||
|
||||
cd "${PROJECT_ROOT}"
|
||||
|
||||
python scripts/train.py \
|
||||
--config configs/searchqa_default.yaml \
|
||||
--config configs/searchqa/default.yaml \
|
||||
--optimizer_model "${OPTIMIZER_MODEL}" \
|
||||
--target_model "${TARGET_MODEL}" \
|
||||
--out_root "${DEFAULT_OUT_ROOT}" \
|
||||
"$@"
|
||||
|
||||
|
||||
@@ -1,46 +1,37 @@
|
||||
#!/usr/bin/env bash
|
||||
# ──────────────────────────────────────────────────────────────────────────────
|
||||
# ReflACT — SpreadsheetBench training launch script
|
||||
# SkillOpt — SpreadsheetBench training launch script
|
||||
#
|
||||
# Usage:
|
||||
# bash scripts/run_spreadsheetbench.sh \
|
||||
# --data_root /path/to/data \
|
||||
# --jsonl_path /path/to/benchmark.jsonl
|
||||
#
|
||||
# bash scripts/run_spreadsheetbench.sh \
|
||||
# --data_root /path/to/data \
|
||||
# --jsonl_path /path/to/benchmark.jsonl \
|
||||
# --num_epochs 2 --edit_budget 6
|
||||
# bash scripts/run_spreadsheetbench.sh --split_dir /path/to/split --data_root /path/to/data
|
||||
# bash scripts/run_spreadsheetbench.sh --num_epochs 2 --edit_budget 6
|
||||
# ──────────────────────────────────────────────────────────────────────────────
|
||||
set -euo pipefail
|
||||
|
||||
# ── Paths ────────────────────────────────────────────────────────────────────
|
||||
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
|
||||
PROJECT_ROOT="$(dirname "${SCRIPT_DIR}")"
|
||||
|
||||
# Ensure ReflACT is importable
|
||||
export PYTHONPATH="${PROJECT_ROOT}:${PYTHONPATH:-}"
|
||||
|
||||
# ── Model configuration ─────────────────────────────────────────────────────
|
||||
export TEACHER_DEPLOYMENT="${TEACHER_DEPLOYMENT:-gpt-5.5}"
|
||||
export STUDENT_DEPLOYMENT="${STUDENT_DEPLOYMENT:-gpt-5.5}"
|
||||
OPTIMIZER_MODEL="${OPTIMIZER_MODEL:-gpt-5.5}"
|
||||
TARGET_MODEL="${TARGET_MODEL:-gpt-5.5}"
|
||||
|
||||
# ── Output directory ─────────────────────────────────────────────────────────
|
||||
TIMESTAMP=$(date +%Y%m%d_%H%M%S)
|
||||
DEFAULT_OUT_ROOT="${PROJECT_ROOT}/outputs/skillopt_spreadsheetbench_${STUDENT_DEPLOYMENT}_${TIMESTAMP}"
|
||||
DEFAULT_OUT_ROOT="${PROJECT_ROOT}/outputs/skillopt_spreadsheetbench_${TARGET_MODEL}_${TIMESTAMP}"
|
||||
|
||||
# ── Run ──────────────────────────────────────────────────────────────────────
|
||||
echo "============================================================"
|
||||
echo " ReflACT — Reflective Agent Tuning (SpreadsheetBench)"
|
||||
echo " SkillOpt — SpreadsheetBench Training"
|
||||
echo "============================================================"
|
||||
echo " Teacher: ${TEACHER_DEPLOYMENT}"
|
||||
echo " Student: ${STUDENT_DEPLOYMENT}"
|
||||
echo " Optimizer: ${OPTIMIZER_MODEL}"
|
||||
echo " Target: ${TARGET_MODEL}"
|
||||
echo "============================================================"
|
||||
|
||||
cd "${PROJECT_ROOT}"
|
||||
|
||||
python scripts/train.py \
|
||||
--config configs/spreadsheetbench_default.yaml \
|
||||
--config configs/spreadsheetbench/default.yaml \
|
||||
--optimizer_model "${OPTIMIZER_MODEL}" \
|
||||
--target_model "${TARGET_MODEL}" \
|
||||
--out_root "${DEFAULT_OUT_ROOT}" \
|
||||
"$@"
|
||||
|
||||
|
||||
154
scripts/train.py
154
scripts/train.py
@@ -1,5 +1,5 @@
|
||||
#!/usr/bin/env python3
|
||||
"""ReflACT unified training entry point.
|
||||
"""SkillOpt unified training entry point.
|
||||
|
||||
Usage
|
||||
-----
|
||||
@@ -125,7 +125,7 @@ _BOOL = lambda x: x.lower() in ("true", "1", "yes") # noqa: E731
|
||||
|
||||
def parse_args() -> argparse.Namespace:
|
||||
p = argparse.ArgumentParser(
|
||||
description="ReflACT: Reflective Agent Tuning",
|
||||
description="SkillOpt: Executive Strategy for Self-Evolving Agent Skills",
|
||||
formatter_class=argparse.RawDescriptionHelpFormatter,
|
||||
epilog=__doc__,
|
||||
)
|
||||
@@ -138,10 +138,10 @@ def parse_args() -> argparse.Namespace:
|
||||
p.add_argument("--env", type=str)
|
||||
p.add_argument("--backend", type=str,
|
||||
choices=["azure_openai", "codex", "codex_exec", "claude", "claude_chat", "claude_code_exec", "qwen", "qwen_chat"])
|
||||
p.add_argument("--teacher_model", type=str)
|
||||
p.add_argument("--student_model", type=str)
|
||||
p.add_argument("--teacher_backend", type=str)
|
||||
p.add_argument("--student_backend", type=str)
|
||||
p.add_argument("--optimizer_model", type=str)
|
||||
p.add_argument("--target_model", type=str)
|
||||
p.add_argument("--optimizer_backend", type=str)
|
||||
p.add_argument("--target_backend", type=str)
|
||||
p.add_argument("--reasoning_effort", type=str,
|
||||
choices=["", "low", "medium", "high", "xhigh", "max"])
|
||||
p.add_argument("--rewrite_reasoning_effort", type=str)
|
||||
@@ -155,18 +155,18 @@ def parse_args() -> argparse.Namespace:
|
||||
p.add_argument("--azure_openai_auth_mode", type=str)
|
||||
p.add_argument("--azure_openai_ad_scope", type=str)
|
||||
p.add_argument("--azure_openai_managed_identity_client_id", type=str)
|
||||
p.add_argument("--teacher_azure_openai_endpoint", type=str)
|
||||
p.add_argument("--teacher_azure_openai_api_version", type=str)
|
||||
p.add_argument("--teacher_azure_openai_api_key", type=str)
|
||||
p.add_argument("--teacher_azure_openai_auth_mode", type=str)
|
||||
p.add_argument("--teacher_azure_openai_ad_scope", type=str)
|
||||
p.add_argument("--teacher_azure_openai_managed_identity_client_id", type=str)
|
||||
p.add_argument("--student_azure_openai_endpoint", type=str)
|
||||
p.add_argument("--student_azure_openai_api_version", type=str)
|
||||
p.add_argument("--student_azure_openai_api_key", type=str)
|
||||
p.add_argument("--student_azure_openai_auth_mode", type=str)
|
||||
p.add_argument("--student_azure_openai_ad_scope", type=str)
|
||||
p.add_argument("--student_azure_openai_managed_identity_client_id", type=str)
|
||||
p.add_argument("--optimizer_azure_openai_endpoint", type=str)
|
||||
p.add_argument("--optimizer_azure_openai_api_version", type=str)
|
||||
p.add_argument("--optimizer_azure_openai_api_key", type=str)
|
||||
p.add_argument("--optimizer_azure_openai_auth_mode", type=str)
|
||||
p.add_argument("--optimizer_azure_openai_ad_scope", type=str)
|
||||
p.add_argument("--optimizer_azure_openai_managed_identity_client_id", type=str)
|
||||
p.add_argument("--target_azure_openai_endpoint", type=str)
|
||||
p.add_argument("--target_azure_openai_api_version", type=str)
|
||||
p.add_argument("--target_azure_openai_api_key", type=str)
|
||||
p.add_argument("--target_azure_openai_auth_mode", type=str)
|
||||
p.add_argument("--target_azure_openai_ad_scope", type=str)
|
||||
p.add_argument("--target_azure_openai_managed_identity_client_id", type=str)
|
||||
p.add_argument("--qwen_chat_base_url", type=str)
|
||||
p.add_argument("--qwen_chat_api_key", type=str)
|
||||
p.add_argument("--qwen_chat_temperature", type=float)
|
||||
@@ -187,7 +187,7 @@ def parse_args() -> argparse.Namespace:
|
||||
p.add_argument("--claude_code_exec_use_sdk", type=str)
|
||||
p.add_argument("--claude_code_exec_effort", type=str)
|
||||
p.add_argument("--claude_code_exec_max_thinking_tokens", type=int)
|
||||
p.add_argument("--codex_trace_to_teacher", type=_BOOL)
|
||||
p.add_argument("--codex_trace_to_optimizer", type=_BOOL)
|
||||
p.add_argument("--skill_init", type=str)
|
||||
p.add_argument("--num_epochs", type=int)
|
||||
p.add_argument("--train_size", type=int)
|
||||
@@ -212,8 +212,6 @@ def parse_args() -> argparse.Namespace:
|
||||
p.add_argument("--analyst_workers", type=int)
|
||||
p.add_argument("--failure_only", type=_BOOL)
|
||||
p.add_argument("--minibatch_size", type=int)
|
||||
p.add_argument("--use_meta_reflect", type=_BOOL)
|
||||
p.add_argument("--meta_edit_budget", type=int)
|
||||
p.add_argument("--skill_update_mode", type=str,
|
||||
choices=[
|
||||
"patch",
|
||||
@@ -224,9 +222,6 @@ def parse_args() -> argparse.Namespace:
|
||||
"full_rewrite_minibatch",
|
||||
"minibatch_full_rewrite",
|
||||
])
|
||||
p.add_argument("--use_deep_reflect", type=_BOOL)
|
||||
p.add_argument("--deep_reflect_failures", type=int)
|
||||
p.add_argument("--deep_reflect_successes", type=int)
|
||||
p.add_argument("--use_slow_update", type=_BOOL)
|
||||
p.add_argument("--slow_update_samples", type=int)
|
||||
p.add_argument("--longitudinal_pair_policy", type=str,
|
||||
@@ -260,10 +255,10 @@ def parse_args() -> argparse.Namespace:
|
||||
|
||||
_LEGACY_TO_STRUCTURED: dict[str, str] = {
|
||||
"backend": "model.backend",
|
||||
"teacher_model": "model.teacher",
|
||||
"student_model": "model.student",
|
||||
"teacher_backend": "model.teacher_backend",
|
||||
"student_backend": "model.student_backend",
|
||||
"optimizer_model": "model.optimizer",
|
||||
"target_model": "model.target",
|
||||
"optimizer_backend": "model.optimizer_backend",
|
||||
"target_backend": "model.target_backend",
|
||||
"reasoning_effort": "model.reasoning_effort",
|
||||
"rewrite_reasoning_effort": "model.rewrite_reasoning_effort",
|
||||
"rewrite_max_completion_tokens": "model.rewrite_max_completion_tokens",
|
||||
@@ -276,18 +271,18 @@ _LEGACY_TO_STRUCTURED: dict[str, str] = {
|
||||
"azure_openai_auth_mode": "model.azure_openai_auth_mode",
|
||||
"azure_openai_ad_scope": "model.azure_openai_ad_scope",
|
||||
"azure_openai_managed_identity_client_id": "model.azure_openai_managed_identity_client_id",
|
||||
"teacher_azure_openai_endpoint": "model.teacher_azure_openai_endpoint",
|
||||
"teacher_azure_openai_api_version": "model.teacher_azure_openai_api_version",
|
||||
"teacher_azure_openai_api_key": "model.teacher_azure_openai_api_key",
|
||||
"teacher_azure_openai_auth_mode": "model.teacher_azure_openai_auth_mode",
|
||||
"teacher_azure_openai_ad_scope": "model.teacher_azure_openai_ad_scope",
|
||||
"teacher_azure_openai_managed_identity_client_id": "model.teacher_azure_openai_managed_identity_client_id",
|
||||
"student_azure_openai_endpoint": "model.student_azure_openai_endpoint",
|
||||
"student_azure_openai_api_version": "model.student_azure_openai_api_version",
|
||||
"student_azure_openai_api_key": "model.student_azure_openai_api_key",
|
||||
"student_azure_openai_auth_mode": "model.student_azure_openai_auth_mode",
|
||||
"student_azure_openai_ad_scope": "model.student_azure_openai_ad_scope",
|
||||
"student_azure_openai_managed_identity_client_id": "model.student_azure_openai_managed_identity_client_id",
|
||||
"optimizer_azure_openai_endpoint": "model.optimizer_azure_openai_endpoint",
|
||||
"optimizer_azure_openai_api_version": "model.optimizer_azure_openai_api_version",
|
||||
"optimizer_azure_openai_api_key": "model.optimizer_azure_openai_api_key",
|
||||
"optimizer_azure_openai_auth_mode": "model.optimizer_azure_openai_auth_mode",
|
||||
"optimizer_azure_openai_ad_scope": "model.optimizer_azure_openai_ad_scope",
|
||||
"optimizer_azure_openai_managed_identity_client_id": "model.optimizer_azure_openai_managed_identity_client_id",
|
||||
"target_azure_openai_endpoint": "model.target_azure_openai_endpoint",
|
||||
"target_azure_openai_api_version": "model.target_azure_openai_api_version",
|
||||
"target_azure_openai_api_key": "model.target_azure_openai_api_key",
|
||||
"target_azure_openai_auth_mode": "model.target_azure_openai_auth_mode",
|
||||
"target_azure_openai_ad_scope": "model.target_azure_openai_ad_scope",
|
||||
"target_azure_openai_managed_identity_client_id": "model.target_azure_openai_managed_identity_client_id",
|
||||
"qwen_chat_base_url": "model.qwen_chat_base_url",
|
||||
"qwen_chat_api_key": "model.qwen_chat_api_key",
|
||||
"qwen_chat_temperature": "model.qwen_chat_temperature",
|
||||
@@ -308,7 +303,7 @@ _LEGACY_TO_STRUCTURED: dict[str, str] = {
|
||||
"claude_code_exec_use_sdk": "model.claude_code_exec_use_sdk",
|
||||
"claude_code_exec_effort": "model.claude_code_exec_effort",
|
||||
"claude_code_exec_max_thinking_tokens": "model.claude_code_exec_max_thinking_tokens",
|
||||
"codex_trace_to_teacher": "model.codex_trace_to_teacher",
|
||||
"codex_trace_to_optimizer": "model.codex_trace_to_optimizer",
|
||||
"num_epochs": "train.num_epochs",
|
||||
"train_size": "train.train_size",
|
||||
"steps_per_epoch": "train.steps_per_epoch",
|
||||
@@ -320,16 +315,11 @@ _LEGACY_TO_STRUCTURED: dict[str, str] = {
|
||||
"analyst_workers": "gradient.analyst_workers",
|
||||
"max_analyst_rounds": "gradient.max_analyst_rounds",
|
||||
"failure_only": "gradient.failure_only",
|
||||
"use_deep_reflect": "gradient.use_deep_reflect",
|
||||
"deep_reflect_failures": "gradient.deep_reflect_failures",
|
||||
"deep_reflect_successes": "gradient.deep_reflect_successes",
|
||||
"edit_budget": "optimizer.learning_rate",
|
||||
"min_edit_budget": "optimizer.min_learning_rate",
|
||||
"lr_scheduler": "optimizer.lr_scheduler",
|
||||
"lr_control_mode": "optimizer.lr_control_mode",
|
||||
"skill_update_mode": "optimizer.skill_update_mode",
|
||||
"use_meta_reflect": "optimizer.use_meta_reflect",
|
||||
"meta_edit_budget": "optimizer.meta_learning_rate",
|
||||
"use_slow_update": "optimizer.use_slow_update",
|
||||
"slow_update_samples": "optimizer.slow_update_samples",
|
||||
"longitudinal_pair_policy": "optimizer.longitudinal_pair_policy",
|
||||
@@ -387,7 +377,7 @@ def load_config(args: argparse.Namespace) -> dict:
|
||||
explicit_backend = str(option).split("=", 1)[1].strip()
|
||||
break
|
||||
|
||||
backend = normalize_backend_name(flat.get("model_backend") or flat.get("student_backend") or "azure_openai")
|
||||
backend = normalize_backend_name(flat.get("model_backend") or flat.get("target_backend") or "azure_openai")
|
||||
|
||||
def _has_model_override(dotted_key: str, legacy_key: str) -> bool:
|
||||
if getattr(args, legacy_key, None) is not None:
|
||||
@@ -402,53 +392,53 @@ def load_config(args: argparse.Namespace) -> dict:
|
||||
backend = normalize_backend_name(explicit_backend)
|
||||
flat["model_backend"] = backend
|
||||
if backend in {"claude", "claude_chat"}:
|
||||
flat.setdefault("teacher_backend", "claude_chat")
|
||||
flat.setdefault("student_backend", "claude_chat")
|
||||
flat.setdefault("optimizer_backend", "claude_chat")
|
||||
flat.setdefault("target_backend", "claude_chat")
|
||||
elif backend in {"codex", "codex_exec"}:
|
||||
flat.setdefault("teacher_backend", "openai_chat")
|
||||
flat.setdefault("student_backend", "codex_exec")
|
||||
flat.setdefault("optimizer_backend", "openai_chat")
|
||||
flat.setdefault("target_backend", "codex_exec")
|
||||
elif backend == "claude_code_exec":
|
||||
flat.setdefault("teacher_backend", "openai_chat")
|
||||
flat.setdefault("student_backend", "claude_code_exec")
|
||||
flat.setdefault("optimizer_backend", "openai_chat")
|
||||
flat.setdefault("target_backend", "claude_code_exec")
|
||||
elif backend in {"qwen", "qwen_chat"}:
|
||||
flat.setdefault("teacher_backend", "openai_chat")
|
||||
flat.setdefault("student_backend", "qwen_chat")
|
||||
flat.setdefault("optimizer_backend", "openai_chat")
|
||||
flat.setdefault("target_backend", "qwen_chat")
|
||||
else:
|
||||
flat.setdefault("teacher_backend", "openai_chat")
|
||||
flat.setdefault("student_backend", "openai_chat")
|
||||
flat.setdefault("optimizer_backend", "openai_chat")
|
||||
flat.setdefault("target_backend", "openai_chat")
|
||||
else:
|
||||
flat.setdefault("teacher_backend", "openai_chat")
|
||||
flat.setdefault("student_backend", "openai_chat")
|
||||
flat.setdefault("optimizer_backend", "openai_chat")
|
||||
flat.setdefault("target_backend", "openai_chat")
|
||||
|
||||
if flat.get("teacher_backend") == "claude_chat":
|
||||
if flat.get("optimizer_backend") == "claude_chat":
|
||||
if (
|
||||
str(flat.get("teacher_model", "") or "").strip() in _OPENAI_DEFAULT_MODEL_SENTINELS
|
||||
and not _has_model_override("model.teacher", "teacher_model")
|
||||
str(flat.get("optimizer_model", "") or "").strip() in _OPENAI_DEFAULT_MODEL_SENTINELS
|
||||
and not _has_model_override("model.optimizer", "optimizer_model")
|
||||
):
|
||||
flat["teacher_model"] = default_model_for_backend("claude_chat")
|
||||
if flat.get("student_backend") == "claude_chat":
|
||||
flat["optimizer_model"] = default_model_for_backend("claude_chat")
|
||||
if flat.get("target_backend") == "claude_chat":
|
||||
if (
|
||||
str(flat.get("student_model", "") or "").strip() in _OPENAI_DEFAULT_MODEL_SENTINELS
|
||||
and not _has_model_override("model.student", "student_model")
|
||||
str(flat.get("target_model", "") or "").strip() in _OPENAI_DEFAULT_MODEL_SENTINELS
|
||||
and not _has_model_override("model.target", "target_model")
|
||||
):
|
||||
flat["student_model"] = default_model_for_backend("claude_chat")
|
||||
if flat.get("student_backend") == "claude_code_exec":
|
||||
flat["target_model"] = default_model_for_backend("claude_chat")
|
||||
if flat.get("target_backend") == "claude_code_exec":
|
||||
if (
|
||||
str(flat.get("student_model", "") or "").strip() in _OPENAI_DEFAULT_MODEL_SENTINELS
|
||||
and not _has_model_override("model.student", "student_model")
|
||||
str(flat.get("target_model", "") or "").strip() in _OPENAI_DEFAULT_MODEL_SENTINELS
|
||||
and not _has_model_override("model.target", "target_model")
|
||||
):
|
||||
flat["student_model"] = default_model_for_backend("claude_chat")
|
||||
if flat.get("student_backend") == "qwen_chat":
|
||||
flat["target_model"] = default_model_for_backend("claude_chat")
|
||||
if flat.get("target_backend") == "qwen_chat":
|
||||
if (
|
||||
str(flat.get("student_model", "") or "").strip() in _OPENAI_DEFAULT_MODEL_SENTINELS
|
||||
and not _has_model_override("model.student", "student_model")
|
||||
str(flat.get("target_model", "") or "").strip() in _OPENAI_DEFAULT_MODEL_SENTINELS
|
||||
and not _has_model_override("model.target", "target_model")
|
||||
):
|
||||
flat["student_model"] = default_model_for_backend("qwen_chat")
|
||||
flat["target_model"] = default_model_for_backend("qwen_chat")
|
||||
|
||||
# Auto-generate output root
|
||||
if not flat.get("out_root"):
|
||||
env = flat.get("env", "unknown")
|
||||
model = flat.get("teacher_model", "unknown").replace("/", "-")
|
||||
model = flat.get("optimizer_model", "unknown").replace("/", "-")
|
||||
ts = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
|
||||
flat["out_root"] = os.path.join("outputs", f"skillopt_{env}_{model}_{ts}")
|
||||
|
||||
@@ -463,13 +453,13 @@ def main() -> None:
|
||||
cfg = load_config(args)
|
||||
|
||||
print(f"\n{'='*60}")
|
||||
print(f" ReflACT — Reflective Agent Tuning")
|
||||
print(f" SkillOpt — Executive Strategy for Self-Evolving Agent Skills")
|
||||
print(f"{'='*60}")
|
||||
print(f" env: {cfg.get('env')}")
|
||||
print(f" teacher_model: {cfg.get('teacher_model')}")
|
||||
print(f" student_model: {cfg.get('student_model')}")
|
||||
print(f" teacher_backend:{cfg.get('teacher_backend', 'openai_chat')}")
|
||||
print(f" student_backend:{cfg.get('student_backend', 'openai_chat')}")
|
||||
print(f" optimizer_model: {cfg.get('optimizer_model')}")
|
||||
print(f" target_model: {cfg.get('target_model')}")
|
||||
print(f" optimizer_backend:{cfg.get('optimizer_backend', 'openai_chat')}")
|
||||
print(f" target_backend:{cfg.get('target_backend', 'openai_chat')}")
|
||||
print(f" reasoning: {cfg.get('reasoning_effort') or 'off'}")
|
||||
print(f" rewrite_effort: {cfg.get('rewrite_reasoning_effort') or 'off'}")
|
||||
print(f" epochs: {cfg.get('num_epochs')}")
|
||||
@@ -482,8 +472,8 @@ def main() -> None:
|
||||
print(f" min_edit_budget:{cfg.get('min_edit_budget', 2)}")
|
||||
print(f" minibatch_size: {cfg.get('minibatch_size')}")
|
||||
print(f" seed: {cfg.get('seed')}")
|
||||
print(f" meta_reflect: {cfg.get('use_meta_reflect', False)}")
|
||||
print(f" meta_skill: {cfg.get('use_meta_skill', False)}")
|
||||
print(f" slow_update: {cfg.get('use_slow_update', False)}")
|
||||
print(f" out_root: {cfg.get('out_root')}")
|
||||
print(f"{'='*60}\n")
|
||||
|
||||
|
||||
@@ -21,7 +21,6 @@ from skillopt.types import ( # noqa: F401
|
||||
FailureSummaryEntry,
|
||||
GateAction,
|
||||
GateResult,
|
||||
MetaReflectResult,
|
||||
Patch,
|
||||
RawPatch,
|
||||
RolloutResult,
|
||||
|
||||
@@ -30,10 +30,10 @@ _STRUCTURED_SECTIONS = frozenset({
|
||||
|
||||
_FLATTEN_MAP: dict[str, str] = {
|
||||
"model.backend": "model_backend",
|
||||
"model.teacher": "teacher_model",
|
||||
"model.student": "student_model",
|
||||
"model.teacher_backend": "teacher_backend",
|
||||
"model.student_backend": "student_backend",
|
||||
"model.optimizer": "optimizer_model",
|
||||
"model.target": "target_model",
|
||||
"model.optimizer_backend": "optimizer_backend",
|
||||
"model.target_backend": "target_backend",
|
||||
"model.reasoning_effort": "reasoning_effort",
|
||||
"model.rewrite_reasoning_effort": "rewrite_reasoning_effort",
|
||||
"model.rewrite_max_completion_tokens": "rewrite_max_completion_tokens",
|
||||
@@ -51,7 +51,7 @@ _FLATTEN_MAP: dict[str, str] = {
|
||||
"model.claude_code_exec_use_sdk": "claude_code_exec_use_sdk",
|
||||
"model.claude_code_exec_effort": "claude_code_exec_effort",
|
||||
"model.claude_code_exec_max_thinking_tokens": "claude_code_exec_max_thinking_tokens",
|
||||
"model.codex_trace_to_teacher": "codex_trace_to_teacher",
|
||||
"model.codex_trace_to_optimizer": "codex_trace_to_optimizer",
|
||||
"model.azure_endpoint": "azure_endpoint",
|
||||
"model.azure_api_version": "azure_api_version",
|
||||
"model.azure_api_key": "azure_api_key",
|
||||
@@ -61,18 +61,18 @@ _FLATTEN_MAP: dict[str, str] = {
|
||||
"model.azure_openai_auth_mode": "azure_openai_auth_mode",
|
||||
"model.azure_openai_ad_scope": "azure_openai_ad_scope",
|
||||
"model.azure_openai_managed_identity_client_id": "azure_openai_managed_identity_client_id",
|
||||
"model.teacher_azure_openai_endpoint": "teacher_azure_openai_endpoint",
|
||||
"model.teacher_azure_openai_api_version": "teacher_azure_openai_api_version",
|
||||
"model.teacher_azure_openai_api_key": "teacher_azure_openai_api_key",
|
||||
"model.teacher_azure_openai_auth_mode": "teacher_azure_openai_auth_mode",
|
||||
"model.teacher_azure_openai_ad_scope": "teacher_azure_openai_ad_scope",
|
||||
"model.teacher_azure_openai_managed_identity_client_id": "teacher_azure_openai_managed_identity_client_id",
|
||||
"model.student_azure_openai_endpoint": "student_azure_openai_endpoint",
|
||||
"model.student_azure_openai_api_version": "student_azure_openai_api_version",
|
||||
"model.student_azure_openai_api_key": "student_azure_openai_api_key",
|
||||
"model.student_azure_openai_auth_mode": "student_azure_openai_auth_mode",
|
||||
"model.student_azure_openai_ad_scope": "student_azure_openai_ad_scope",
|
||||
"model.student_azure_openai_managed_identity_client_id": "student_azure_openai_managed_identity_client_id",
|
||||
"model.optimizer_azure_openai_endpoint": "optimizer_azure_openai_endpoint",
|
||||
"model.optimizer_azure_openai_api_version": "optimizer_azure_openai_api_version",
|
||||
"model.optimizer_azure_openai_api_key": "optimizer_azure_openai_api_key",
|
||||
"model.optimizer_azure_openai_auth_mode": "optimizer_azure_openai_auth_mode",
|
||||
"model.optimizer_azure_openai_ad_scope": "optimizer_azure_openai_ad_scope",
|
||||
"model.optimizer_azure_openai_managed_identity_client_id": "optimizer_azure_openai_managed_identity_client_id",
|
||||
"model.target_azure_openai_endpoint": "target_azure_openai_endpoint",
|
||||
"model.target_azure_openai_api_version": "target_azure_openai_api_version",
|
||||
"model.target_azure_openai_api_key": "target_azure_openai_api_key",
|
||||
"model.target_azure_openai_auth_mode": "target_azure_openai_auth_mode",
|
||||
"model.target_azure_openai_ad_scope": "target_azure_openai_ad_scope",
|
||||
"model.target_azure_openai_managed_identity_client_id": "target_azure_openai_managed_identity_client_id",
|
||||
"model.qwen_chat_base_url": "qwen_chat_base_url",
|
||||
"model.qwen_chat_api_key": "qwen_chat_api_key",
|
||||
"model.qwen_chat_temperature": "qwen_chat_temperature",
|
||||
@@ -89,16 +89,12 @@ _FLATTEN_MAP: dict[str, str] = {
|
||||
"gradient.merge_batch_size": "merge_batch_size",
|
||||
"gradient.analyst_workers": "analyst_workers",
|
||||
"gradient.failure_only": "failure_only",
|
||||
"gradient.use_deep_reflect": "use_deep_reflect",
|
||||
"gradient.deep_reflect_failures": "deep_reflect_failures",
|
||||
"gradient.deep_reflect_successes": "deep_reflect_successes",
|
||||
"gradient.max_analyst_rounds": "max_analyst_rounds",
|
||||
"optimizer.learning_rate": "edit_budget",
|
||||
"optimizer.min_learning_rate": "min_edit_budget",
|
||||
"optimizer.lr_scheduler": "lr_scheduler",
|
||||
"optimizer.lr_control_mode": "lr_control_mode",
|
||||
"optimizer.skill_update_mode": "skill_update_mode",
|
||||
"optimizer.use_meta_reflect": "use_meta_reflect",
|
||||
"optimizer.meta_learning_rate": "meta_edit_budget",
|
||||
"optimizer.use_slow_update": "use_slow_update",
|
||||
"optimizer.slow_update_samples": "slow_update_samples",
|
||||
|
||||
@@ -26,7 +26,6 @@ from skillopt.datasets.base import BatchSpec
|
||||
from skillopt.envs.base import EnvAdapter
|
||||
from skillopt.evaluation.gate import evaluate_gate
|
||||
from skillopt.gradient.aggregate import merge_patches
|
||||
from skillopt.optimizer.meta_reflect import build_epoch_history, run_meta_reflect
|
||||
from skillopt.optimizer.meta_skill import run_meta_skill
|
||||
from skillopt.optimizer.clip import rank_and_select
|
||||
from skillopt.optimizer.lr_autonomous import decide_autonomous_learning_rate
|
||||
@@ -56,10 +55,10 @@ from skillopt.model import (
|
||||
get_token_summary,
|
||||
reset_token_tracker,
|
||||
set_reasoning_effort,
|
||||
set_student_backend,
|
||||
set_student_deployment,
|
||||
set_teacher_backend,
|
||||
set_teacher_deployment,
|
||||
set_target_backend,
|
||||
set_target_deployment,
|
||||
set_optimizer_backend,
|
||||
set_optimizer_deployment,
|
||||
)
|
||||
from skillopt.utils import compute_score, skill_hash
|
||||
|
||||
@@ -132,7 +131,7 @@ def _normalise_lr_control_mode(mode: str | None) -> str:
|
||||
"scheduled": "fixed",
|
||||
"autonomous": "autonomous",
|
||||
"auto": "autonomous",
|
||||
"teacher": "autonomous",
|
||||
"optimizer": "autonomous",
|
||||
"none": "none",
|
||||
"off": "none",
|
||||
"no_lr": "none",
|
||||
@@ -570,47 +569,47 @@ class ReflACTTrainer:
|
||||
auth_mode=cfg.get("azure_openai_auth_mode") or None,
|
||||
ad_scope=cfg.get("azure_openai_ad_scope") or None,
|
||||
managed_identity_client_id=cfg.get("azure_openai_managed_identity_client_id") or None,
|
||||
teacher_endpoint=cfg.get("teacher_azure_openai_endpoint") or None,
|
||||
teacher_api_version=cfg.get("teacher_azure_openai_api_version") or None,
|
||||
teacher_api_key=cfg.get("teacher_azure_openai_api_key") or None,
|
||||
teacher_auth_mode=cfg.get("teacher_azure_openai_auth_mode") or None,
|
||||
teacher_ad_scope=cfg.get("teacher_azure_openai_ad_scope") or None,
|
||||
teacher_managed_identity_client_id=(
|
||||
cfg.get("teacher_azure_openai_managed_identity_client_id") or None
|
||||
optimizer_endpoint=cfg.get("optimizer_azure_openai_endpoint") or None,
|
||||
optimizer_api_version=cfg.get("optimizer_azure_openai_api_version") or None,
|
||||
optimizer_api_key=cfg.get("optimizer_azure_openai_api_key") or None,
|
||||
optimizer_auth_mode=cfg.get("optimizer_azure_openai_auth_mode") or None,
|
||||
optimizer_ad_scope=cfg.get("optimizer_azure_openai_ad_scope") or None,
|
||||
optimizer_managed_identity_client_id=(
|
||||
cfg.get("optimizer_azure_openai_managed_identity_client_id") or None
|
||||
),
|
||||
student_endpoint=cfg.get("student_azure_openai_endpoint") or None,
|
||||
student_api_version=cfg.get("student_azure_openai_api_version") or None,
|
||||
student_api_key=cfg.get("student_azure_openai_api_key") or None,
|
||||
student_auth_mode=cfg.get("student_azure_openai_auth_mode") or None,
|
||||
student_ad_scope=cfg.get("student_azure_openai_ad_scope") or None,
|
||||
student_managed_identity_client_id=(
|
||||
cfg.get("student_azure_openai_managed_identity_client_id") or None
|
||||
target_endpoint=cfg.get("target_azure_openai_endpoint") or None,
|
||||
target_api_version=cfg.get("target_azure_openai_api_version") or None,
|
||||
target_api_key=cfg.get("target_azure_openai_api_key") or None,
|
||||
target_auth_mode=cfg.get("target_azure_openai_auth_mode") or None,
|
||||
target_ad_scope=cfg.get("target_azure_openai_ad_scope") or None,
|
||||
target_managed_identity_client_id=(
|
||||
cfg.get("target_azure_openai_managed_identity_client_id") or None
|
||||
),
|
||||
)
|
||||
teacher_backend = cfg.get("teacher_backend")
|
||||
student_backend = cfg.get("student_backend")
|
||||
if not teacher_backend or not student_backend:
|
||||
optimizer_backend = cfg.get("optimizer_backend")
|
||||
target_backend = cfg.get("target_backend")
|
||||
if not optimizer_backend or not target_backend:
|
||||
if backend in {"claude", "claude_chat"}:
|
||||
teacher_backend = teacher_backend or "claude_chat"
|
||||
student_backend = student_backend or "claude_chat"
|
||||
optimizer_backend = optimizer_backend or "claude_chat"
|
||||
target_backend = target_backend or "claude_chat"
|
||||
elif backend in {"codex", "codex_exec"}:
|
||||
teacher_backend = teacher_backend or "openai_chat"
|
||||
student_backend = student_backend or "codex_exec"
|
||||
optimizer_backend = optimizer_backend or "openai_chat"
|
||||
target_backend = target_backend or "codex_exec"
|
||||
elif backend == "claude_code_exec":
|
||||
teacher_backend = teacher_backend or "openai_chat"
|
||||
student_backend = student_backend or "claude_code_exec"
|
||||
optimizer_backend = optimizer_backend or "openai_chat"
|
||||
target_backend = target_backend or "claude_code_exec"
|
||||
elif backend in {"qwen", "qwen_chat"}:
|
||||
teacher_backend = teacher_backend or "openai_chat"
|
||||
student_backend = student_backend or "qwen_chat"
|
||||
optimizer_backend = optimizer_backend or "openai_chat"
|
||||
target_backend = target_backend or "qwen_chat"
|
||||
else:
|
||||
teacher_backend = teacher_backend or "openai_chat"
|
||||
student_backend = student_backend or "openai_chat"
|
||||
cfg["teacher_backend"] = teacher_backend
|
||||
cfg["student_backend"] = student_backend
|
||||
set_teacher_backend(teacher_backend)
|
||||
set_student_backend(student_backend)
|
||||
set_teacher_deployment(cfg["teacher_model"])
|
||||
set_student_deployment(cfg["student_model"])
|
||||
optimizer_backend = optimizer_backend or "openai_chat"
|
||||
target_backend = target_backend or "openai_chat"
|
||||
cfg["optimizer_backend"] = optimizer_backend
|
||||
cfg["target_backend"] = target_backend
|
||||
set_optimizer_backend(optimizer_backend)
|
||||
set_target_backend(target_backend)
|
||||
set_optimizer_deployment(cfg["optimizer_model"])
|
||||
set_target_deployment(cfg["target_model"])
|
||||
configure_codex_exec(
|
||||
path=cfg.get("codex_exec_path", "codex"),
|
||||
sandbox=cfg.get("codex_exec_sandbox", "workspace-write"),
|
||||
@@ -637,19 +636,17 @@ class ReflACTTrainer:
|
||||
max_tokens=cfg.get("qwen_chat_max_tokens"),
|
||||
enable_thinking=cfg.get("qwen_chat_enable_thinking"),
|
||||
)
|
||||
os.environ["REFLACT_CODEX_TRACE_TO_TEACHER"] = (
|
||||
os.environ["REFLACT_CODEX_TRACE_TO_OPTIMIZER"] = (
|
||||
"1"
|
||||
if student_backend == "codex_exec" and cfg.get("codex_trace_to_teacher", False)
|
||||
if target_backend == "codex_exec" and cfg.get("codex_trace_to_optimizer", False)
|
||||
else "0"
|
||||
)
|
||||
reasoning = cfg.get("reasoning_effort", "") or None
|
||||
set_reasoning_effort(reasoning)
|
||||
if student_backend == "claude_code_exec" and cfg.get("use_deep_reflect", False):
|
||||
raise NotImplementedError("claude_code_exec does not support use_deep_reflect yet.")
|
||||
print(
|
||||
f" [model config] backend={backend} "
|
||||
f"teacher={cfg['teacher_model']} ({teacher_backend}) "
|
||||
f"student={cfg['student_model']} ({student_backend}) "
|
||||
f"optimizer={cfg['optimizer_model']} ({optimizer_backend}) "
|
||||
f"target={cfg['target_model']} ({target_backend}) "
|
||||
f"reasoning={reasoning or 'off'}"
|
||||
)
|
||||
|
||||
@@ -897,7 +894,7 @@ class ReflACTTrainer:
|
||||
epoch_rng.shuffle(shuffled_seeds)
|
||||
|
||||
# Step buffer: accumulates per-step context (failure patterns +
|
||||
# rejected edits) within this epoch so teachers see full history.
|
||||
# rejected edits) within this epoch so optimizers see full history.
|
||||
step_buffer: list[dict] = []
|
||||
active_meta_skill = (
|
||||
_load_meta_skill_content(out_root, epoch - 1)
|
||||
@@ -948,7 +945,6 @@ class ReflACTTrainer:
|
||||
accum_rollout_stats: list[dict] = []
|
||||
total_rollout_time = 0.0
|
||||
total_reflect_time = 0.0
|
||||
total_deep_reflect_time = 0.0
|
||||
|
||||
for a in range(accumulation):
|
||||
batch_idx = step_in_epoch * accumulation + a
|
||||
@@ -1013,33 +1009,6 @@ class ReflACTTrainer:
|
||||
f"success_patches={len(success_patches)}"
|
||||
)
|
||||
|
||||
deep_failure_patches: list[dict] = []
|
||||
deep_success_patches: list[dict] = []
|
||||
if cfg.get("use_deep_reflect", False):
|
||||
t_phase = time.time()
|
||||
deep_raw_patches = adapter.deep_reflect(
|
||||
rollout_results,
|
||||
current_skill,
|
||||
batch_dir,
|
||||
env_manager=train_env,
|
||||
prediction_dir=pred_dir,
|
||||
random_seed=batch_seed,
|
||||
step_buffer_context=step_buffer_context,
|
||||
meta_skill_context=active_meta_skill,
|
||||
)
|
||||
deep_failure_patches, deep_success_patches = _normalise_patches(
|
||||
deep_raw_patches,
|
||||
update_mode=update_mode,
|
||||
)
|
||||
all_failure_patches.extend(deep_failure_patches)
|
||||
all_success_patches.extend(deep_success_patches)
|
||||
all_raw_patches.extend(deep_raw_patches)
|
||||
total_deep_reflect_time += time.time() - t_phase
|
||||
print(
|
||||
f" [2b/6 DEEP REFLECT] failure_patches={len(deep_failure_patches)} "
|
||||
f"success_patches={len(deep_success_patches)}"
|
||||
)
|
||||
|
||||
# Track per-batch stats
|
||||
accum_rollout_stats.append({
|
||||
"batch_idx": a,
|
||||
@@ -1049,8 +1018,6 @@ class ReflACTTrainer:
|
||||
"soft": r_soft,
|
||||
"n_failure_patches": len(failure_patches),
|
||||
"n_success_patches": len(success_patches),
|
||||
"n_deep_failure_patches": len(deep_failure_patches),
|
||||
"n_deep_success_patches": len(deep_success_patches),
|
||||
})
|
||||
|
||||
# ── End of accumulation loop ─────────────────────────────
|
||||
@@ -1066,8 +1033,6 @@ class ReflACTTrainer:
|
||||
step_rec["accumulation_batches"] = accum_rollout_stats
|
||||
step_rec["timing"]["rollout_s"] = round(total_rollout_time, 1)
|
||||
step_rec["timing"]["reflect_s"] = round(total_reflect_time, 1)
|
||||
if cfg.get("use_deep_reflect", False):
|
||||
step_rec["timing"]["deep_reflect_s"] = round(total_deep_reflect_time, 1)
|
||||
|
||||
n_total_patches = len(all_failure_patches) + len(all_success_patches)
|
||||
step_rec["n_patches"] = n_total_patches
|
||||
@@ -1383,7 +1348,7 @@ class ReflACTTrainer:
|
||||
|
||||
step_buffer.append(buf_entry)
|
||||
|
||||
# Persist for meta-reflect
|
||||
# Persist step digest for step buffer context
|
||||
digest_path = os.path.join(step_dir, "trajectory_digest.json")
|
||||
with open(digest_path, "w") as f:
|
||||
json.dump(buf_entry, f, indent=2, ensure_ascii=False)
|
||||
@@ -1431,7 +1396,6 @@ class ReflACTTrainer:
|
||||
f"dt={step_rec['wall_time_s']}s\n"
|
||||
f" timing: rollout={timing.get('rollout_s',0)}s "
|
||||
f"reflect={timing.get('reflect_s',0)}s "
|
||||
f"deep_reflect={timing.get('deep_reflect_s',0)}s "
|
||||
f"aggregate={timing.get('aggregate_s',0)}s "
|
||||
f"select={timing.get('select_s',0)}s "
|
||||
f"evaluate={timing.get('evaluate_s',0)}s"
|
||||
@@ -1463,12 +1427,17 @@ class ReflACTTrainer:
|
||||
epoch_comparison_pairs = None
|
||||
if (
|
||||
slow_saved.get("slow_update_content")
|
||||
and slow_saved.get("action") in {"accept", "accept_new_best"}
|
||||
and slow_saved.get("action") in {
|
||||
"accept", "accept_new_best", "force_accept",
|
||||
}
|
||||
and epoch >= 2
|
||||
):
|
||||
current_skill = replace_slow_update_field(
|
||||
current_skill, slow_saved["slow_update_content"],
|
||||
)
|
||||
best_skill = replace_slow_update_field(
|
||||
best_skill, slow_saved["slow_update_content"],
|
||||
)
|
||||
elif epoch == 1:
|
||||
# Epoch 1: inject empty placeholder
|
||||
os.makedirs(slow_dir, exist_ok=True)
|
||||
@@ -1577,7 +1546,7 @@ class ReflACTTrainer:
|
||||
# 5. Extract previous slow update guidance for reflection
|
||||
existing_guidance = extract_slow_update_field(current_skill)
|
||||
|
||||
# 6. Teacher analysis (with reflection on previous guidance)
|
||||
# 6. Optimizer analysis (with reflection on previous guidance)
|
||||
slow_result = run_slow_update(
|
||||
current_skill,
|
||||
results_prev,
|
||||
@@ -1608,67 +1577,29 @@ class ReflACTTrainer:
|
||||
"observed across adjacent epochs."
|
||||
)
|
||||
|
||||
if slow_candidate_hash in sel_cache:
|
||||
slow_sel_hard, slow_sel_soft = sel_cache[slow_candidate_hash]
|
||||
print(
|
||||
f" [slow gate] cache hit: hard={slow_sel_hard:.4f}"
|
||||
)
|
||||
else:
|
||||
sel_env, sel_n = _build_eval_env(
|
||||
split="valid_seen",
|
||||
env_num=cfg["sel_env_num"],
|
||||
seed=seed,
|
||||
)
|
||||
print(f" [slow gate] selection items={sel_n}")
|
||||
slow_eval_dir = os.path.join(slow_dir, "selection_eval")
|
||||
slow_eval_results = adapter.rollout(
|
||||
sel_env, slow_candidate, slow_eval_dir,
|
||||
)
|
||||
slow_sel_hard, slow_sel_soft = compute_score(slow_eval_results)
|
||||
sel_cache[slow_candidate_hash] = (slow_sel_hard, slow_sel_soft)
|
||||
|
||||
slow_gate = evaluate_gate(
|
||||
candidate_skill=slow_candidate,
|
||||
cand_hard=slow_sel_hard,
|
||||
current_skill=current_skill,
|
||||
current_score=current_score,
|
||||
best_skill=best_skill,
|
||||
best_score=best_score,
|
||||
best_step=best_step,
|
||||
global_step=global_step,
|
||||
# Slow update field is force-updated into both
|
||||
# current_skill and best_skill unconditionally.
|
||||
# The epoch-level longitudinal guidance should always
|
||||
# persist — it must not be gated by step-level
|
||||
# selection scores.
|
||||
slow_content = slow_result["slow_update_content"]
|
||||
current_skill = replace_slow_update_field(
|
||||
current_skill, slow_content,
|
||||
)
|
||||
slow_result["selection_hard"] = slow_sel_hard
|
||||
slow_result["selection_soft"] = slow_sel_soft
|
||||
slow_result["action"] = slow_gate.action
|
||||
prev_current = current_score
|
||||
prev_best = best_score
|
||||
current_skill = slow_gate.current_skill
|
||||
current_score = slow_gate.current_score
|
||||
best_skill = slow_gate.best_skill
|
||||
best_score = slow_gate.best_score
|
||||
best_step = slow_gate.best_step
|
||||
if slow_gate.action in {"accept", "accept_new_best"}:
|
||||
current_origin = f"slow_update_epoch_{epoch:02d}"
|
||||
if slow_gate.action == "accept_new_best":
|
||||
best_origin = current_origin
|
||||
print(
|
||||
f" [slow gate] ACCEPT (new best) "
|
||||
f"hard={slow_sel_hard:.4f} > prev best {prev_best:.4f}"
|
||||
)
|
||||
elif slow_gate.action == "accept":
|
||||
print(
|
||||
f" [slow gate] ACCEPT "
|
||||
f"hard={slow_sel_hard:.4f} > current={prev_current:.4f}"
|
||||
)
|
||||
else:
|
||||
print(
|
||||
f" [slow gate] REJECT "
|
||||
f"hard={slow_sel_hard:.4f} <= current={current_score:.4f}"
|
||||
)
|
||||
best_skill = replace_slow_update_field(
|
||||
best_skill, slow_content,
|
||||
)
|
||||
# Update caches so downstream steps use the
|
||||
# slow-update-injected skill for hashing.
|
||||
slow_candidate_hash = skill_hash(current_skill)
|
||||
sel_cache[slow_candidate_hash] = (current_score, 0.0)
|
||||
|
||||
slow_result["action"] = "force_accept"
|
||||
current_origin = f"slow_update_epoch_{epoch:02d}"
|
||||
|
||||
print(
|
||||
f" [slow update] guidance written "
|
||||
f"({len(slow_result['slow_update_content'])} chars), "
|
||||
f" [slow update] force-injected into current & best "
|
||||
f"({len(slow_content)} chars), "
|
||||
f"{slow_time}s"
|
||||
)
|
||||
else:
|
||||
@@ -1693,7 +1624,7 @@ class ReflACTTrainer:
|
||||
f"current={current_score:.4f} best={best_score:.4f}"
|
||||
)
|
||||
|
||||
# ── META SKILL (end of epoch, teacher-side memory) ─────────
|
||||
# ── META SKILL (end of epoch, optimizer-side memory) ─────────
|
||||
use_meta_skill = cfg.get("use_meta_skill", False)
|
||||
if use_meta_skill:
|
||||
meta_skill_dir = os.path.join(out_root, "meta_skill", f"epoch_{epoch:02d}")
|
||||
@@ -1713,7 +1644,7 @@ class ReflACTTrainer:
|
||||
print(
|
||||
f"\n {'='*60}\n"
|
||||
f" META SKILL — Epoch {epoch} "
|
||||
f"(teacher memory from epoch {epoch-1} vs {epoch})\n"
|
||||
f"(optimizer memory from epoch {epoch-1} vs {epoch})\n"
|
||||
f" {'='*60}"
|
||||
)
|
||||
|
||||
@@ -1806,232 +1737,6 @@ class ReflACTTrainer:
|
||||
with open(meta_skill_done_path, "w") as f:
|
||||
json.dump(meta_skill_result, f, indent=2, ensure_ascii=False)
|
||||
|
||||
# ── META-REFLECT (end of epoch) ─────────────────────────────
|
||||
use_meta = cfg.get("use_meta_reflect", False)
|
||||
if use_meta:
|
||||
# Collect this epoch's step records from history
|
||||
epoch_records = [
|
||||
h for h in history if h.get("epoch") == epoch
|
||||
]
|
||||
if epoch_records:
|
||||
meta_step_tag = f"meta_epoch_{epoch}"
|
||||
meta_dir = os.path.join(out_root, "meta_reflect", f"epoch_{epoch:02d}")
|
||||
meta_done_path = os.path.join(meta_dir, "meta_result.json")
|
||||
|
||||
# Resume support: skip if already done
|
||||
if os.path.exists(meta_done_path):
|
||||
with open(meta_done_path) as f:
|
||||
meta_result = json.load(f)
|
||||
meta_summary = meta_result.get("meta_summary", "")
|
||||
meta_action = meta_result.get("action", "unknown")
|
||||
print(
|
||||
f"\n [META-REFLECT epoch {epoch}] "
|
||||
f"resumed — {meta_action}"
|
||||
)
|
||||
else:
|
||||
os.makedirs(meta_dir, exist_ok=True)
|
||||
print(
|
||||
f"\n {'='*60}\n"
|
||||
f" META-REFLECT — Epoch {epoch} "
|
||||
f"({len(epoch_records)} steps)\n"
|
||||
f" {'='*60}"
|
||||
)
|
||||
|
||||
meta_edit_budget = cfg.get("meta_edit_budget", 4)
|
||||
|
||||
# Build epoch history text
|
||||
epoch_history_text = build_epoch_history(
|
||||
epoch_records, out_root,
|
||||
update_mode=update_mode,
|
||||
)
|
||||
|
||||
# Load previous meta summary
|
||||
prev_meta_path = os.path.join(
|
||||
out_root, "meta_reflect",
|
||||
f"epoch_{epoch - 1:02d}", "meta_result.json",
|
||||
)
|
||||
prev_meta_summary = ""
|
||||
if os.path.exists(prev_meta_path):
|
||||
try:
|
||||
with open(prev_meta_path) as f:
|
||||
prev = json.load(f)
|
||||
prev_meta_summary = prev.get("meta_summary", "")
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Get env-specific meta prompt if available
|
||||
meta_system = adapter.get_meta_reflect_prompt() \
|
||||
if hasattr(adapter, "get_meta_reflect_prompt") else None
|
||||
|
||||
# Run meta-reflect
|
||||
t_meta = time.time()
|
||||
meta_result = run_meta_reflect(
|
||||
skill_content=current_skill,
|
||||
epoch_history_text=epoch_history_text,
|
||||
prev_meta_summary=prev_meta_summary,
|
||||
meta_edit_budget=meta_edit_budget,
|
||||
system_prompt=meta_system,
|
||||
update_mode=update_mode,
|
||||
)
|
||||
meta_time = round(time.time() - t_meta, 1)
|
||||
|
||||
meta_items = get_payload_items(meta_result.get("patch", {}) if meta_result else {}, update_mode)
|
||||
if meta_result and meta_items:
|
||||
for item in meta_items:
|
||||
item.setdefault("update_origin", "meta_reflect_momentum")
|
||||
item.setdefault(
|
||||
"update_target",
|
||||
"Consolidate epoch-level accepted/rejected edit patterns.",
|
||||
)
|
||||
meta_summary = meta_result.get("meta_summary", "")
|
||||
print(
|
||||
f" [meta-reflect] "
|
||||
f"{len(meta_items)} {payload_label(update_mode)} proposed, "
|
||||
f"{meta_time}s"
|
||||
)
|
||||
|
||||
meta_rewrite_result = None
|
||||
if update_mode == "rewrite_from_suggestions":
|
||||
meta_rewrite_result = rewrite_skill_from_suggestions(
|
||||
current_skill,
|
||||
meta_result["patch"],
|
||||
env=cfg.get("env"),
|
||||
reasoning_effort=rewrite_reasoning_effort,
|
||||
max_completion_tokens=rewrite_max_completion_tokens,
|
||||
)
|
||||
if meta_rewrite_result and meta_rewrite_result.get("new_skill"):
|
||||
meta_candidate = meta_rewrite_result["new_skill"]
|
||||
meta_apply_report = []
|
||||
else:
|
||||
meta_candidate = current_skill
|
||||
meta_apply_report = []
|
||||
else:
|
||||
meta_candidate, meta_apply_report = apply_patch_with_report(
|
||||
current_skill, meta_result["patch"],
|
||||
)
|
||||
meta_cand_hash = skill_hash(meta_candidate)
|
||||
|
||||
# Save meta candidate
|
||||
with open(os.path.join(meta_dir, "meta_candidate.md"), "w") as f:
|
||||
f.write(meta_candidate)
|
||||
with open(os.path.join(meta_dir, "meta_patch.json"), "w") as f:
|
||||
json.dump(meta_result, f, indent=2, ensure_ascii=False)
|
||||
if meta_apply_report:
|
||||
with open(os.path.join(meta_dir, "meta_edit_apply_report.json"), "w") as f:
|
||||
json.dump(meta_apply_report, f, indent=2, ensure_ascii=False)
|
||||
if meta_rewrite_result:
|
||||
with open(os.path.join(meta_dir, "meta_rewrite_result.json"), "w") as f:
|
||||
json.dump(meta_rewrite_result, f, indent=2, ensure_ascii=False)
|
||||
meta_result["rewrite_change_summary"] = meta_rewrite_result.get("change_summary", [])
|
||||
|
||||
if update_mode == "rewrite_from_suggestions" and meta_rewrite_result is None:
|
||||
meta_action = "skip_no_rewrite"
|
||||
meta_result["action"] = meta_action
|
||||
meta_result["meta_summary"] = meta_summary
|
||||
meta_result["time_s"] = meta_time
|
||||
print(
|
||||
" [meta-reflect] no usable rewrite generated — "
|
||||
f"skill unchanged, {meta_time}s"
|
||||
)
|
||||
else:
|
||||
# Gate: evaluate meta candidate
|
||||
if meta_cand_hash in sel_cache:
|
||||
meta_hard, meta_soft = sel_cache[meta_cand_hash]
|
||||
print(
|
||||
f" [meta-gate] "
|
||||
f"cache hit: hard={meta_hard:.4f}"
|
||||
)
|
||||
else:
|
||||
sel_env, _ = _build_eval_env(
|
||||
split="valid_seen",
|
||||
env_num=cfg["sel_env_num"],
|
||||
seed=seed,
|
||||
)
|
||||
meta_eval_dir = os.path.join(meta_dir, "selection_eval")
|
||||
meta_eval_results = adapter.rollout(
|
||||
sel_env, meta_candidate, meta_eval_dir,
|
||||
)
|
||||
meta_hard, meta_soft = compute_score(meta_eval_results)
|
||||
sel_cache[meta_cand_hash] = (meta_hard, meta_soft)
|
||||
|
||||
meta_gate = evaluate_gate(
|
||||
candidate_skill=meta_candidate,
|
||||
cand_hard=meta_hard,
|
||||
current_skill=current_skill,
|
||||
current_score=current_score,
|
||||
best_skill=best_skill,
|
||||
best_score=best_score,
|
||||
best_step=best_step,
|
||||
global_step=global_step,
|
||||
)
|
||||
meta_action = meta_gate.action
|
||||
prev_score = current_score
|
||||
current_skill = meta_gate.current_skill
|
||||
current_score = meta_gate.current_score
|
||||
best_skill = meta_gate.best_skill
|
||||
best_score = meta_gate.best_score
|
||||
best_step = meta_gate.best_step
|
||||
if meta_gate.action in {"accept", "accept_new_best"}:
|
||||
current_origin = f"meta_reflect_epoch_{epoch:02d}"
|
||||
if meta_gate.action == "accept_new_best":
|
||||
best_origin = current_origin
|
||||
if meta_gate.action == "accept_new_best":
|
||||
print(
|
||||
f" [meta-gate] ACCEPT (new best) "
|
||||
f"hard={meta_hard:.4f} > "
|
||||
f"prev best {prev_score:.4f}"
|
||||
)
|
||||
elif meta_gate.action == "accept":
|
||||
print(
|
||||
f" [meta-gate] ACCEPT "
|
||||
f"hard={meta_hard:.4f} > "
|
||||
f"current={prev_score:.4f}"
|
||||
)
|
||||
else:
|
||||
print(
|
||||
f" [meta-gate] REJECT "
|
||||
f"hard={meta_hard:.4f} <= "
|
||||
f"current={current_score:.4f}"
|
||||
)
|
||||
|
||||
# Save meta result with gate outcome
|
||||
meta_result["action"] = meta_action
|
||||
meta_result["gate_score"] = meta_hard
|
||||
meta_result["time_s"] = meta_time
|
||||
meta_result["update_origin"] = "meta_reflect_momentum"
|
||||
meta_result["update_target"] = (
|
||||
"Consolidate epoch-level editing directions that helped or hurt."
|
||||
)
|
||||
else:
|
||||
meta_summary = meta_result.get("meta_summary", "") if meta_result else ""
|
||||
meta_action = f"skip_no_{payload_label(update_mode)}"
|
||||
if meta_result is None:
|
||||
meta_result = {}
|
||||
meta_result["action"] = meta_action
|
||||
meta_result["meta_summary"] = meta_summary
|
||||
meta_result["time_s"] = meta_time
|
||||
print(
|
||||
f" [meta-reflect] no {payload_label(update_mode)} proposed — "
|
||||
f"skill unchanged, {meta_time}s"
|
||||
)
|
||||
|
||||
# Persist
|
||||
with open(meta_done_path, "w") as f:
|
||||
json.dump(meta_result, f, indent=2, ensure_ascii=False)
|
||||
|
||||
# Save updated skill after meta-reflect
|
||||
_save_skill(out_root, global_step, current_skill)
|
||||
with open(os.path.join(out_root, "best_skill.md"), "w") as f:
|
||||
f.write(best_skill)
|
||||
_persist_runtime_state(global_step)
|
||||
|
||||
print(
|
||||
f"\n [META-REFLECT epoch {epoch} done] "
|
||||
f"action={meta_action} "
|
||||
f"current={current_score:.4f} "
|
||||
f"best={best_score:.4f}"
|
||||
)
|
||||
|
||||
# ── Save best skill ──────────────────────────────────────────────
|
||||
with open(os.path.join(out_root, "best_skill.md"), "w") as f:
|
||||
f.write(best_skill)
|
||||
|
||||
@@ -31,7 +31,7 @@ optimizer:
|
||||
learning_rate: 4 # Max edits per step (edit budget)
|
||||
lr_scheduler: cosine # cosine | linear | constant | autonomous
|
||||
use_slow_update: true # Epoch-boundary momentum
|
||||
use_meta_skill: true # Cross-epoch teacher memory
|
||||
use_meta_skill: true # Cross-epoch optimizer memory
|
||||
|
||||
# ── Evaluation ───────────────────────────────────
|
||||
evaluation:
|
||||
@@ -41,5 +41,5 @@ evaluation:
|
||||
# ── Model ────────────────────────────────────────
|
||||
model:
|
||||
backend: azure_openai # azure_openai | openai_chat | claude_code_exec | qwen
|
||||
teacher: gpt-5.5
|
||||
student: gpt-5.5
|
||||
optimizer: gpt-4o
|
||||
target: gpt-4o
|
||||
|
||||
@@ -4,7 +4,7 @@ Benchmark Environment Template
|
||||
Copy this file and implement the TODO sections to add a new benchmark.
|
||||
|
||||
The EnvAdapter is responsible for:
|
||||
1. Executing tasks using the student model + current skill document
|
||||
1. Executing tasks using the target model + current skill document
|
||||
2. Evaluating predictions against ground truth
|
||||
3. Returning structured results for the training loop
|
||||
"""
|
||||
@@ -25,12 +25,12 @@ class TemplateBenchmarkEnv(EnvAdapter):
|
||||
|
||||
async def execute(self, item, skill: str, model):
|
||||
"""
|
||||
Execute a single task with the student model.
|
||||
Execute a single task with the target model.
|
||||
|
||||
Args:
|
||||
item: DataItem with .id, .input, .ground_truth, .metadata
|
||||
skill: Current skill document content (Markdown string)
|
||||
model: Student model backend instance
|
||||
model: Target model backend instance
|
||||
|
||||
Returns:
|
||||
TaskResult with prediction, score, and trajectory
|
||||
@@ -38,7 +38,7 @@ class TemplateBenchmarkEnv(EnvAdapter):
|
||||
# Step 1: Build the prompt combining skill + task input
|
||||
prompt = self.build_prompt(item, skill)
|
||||
|
||||
# Step 2: Call the student model
|
||||
# Step 2: Call the target model
|
||||
# TODO: Customize the message format for your benchmark
|
||||
messages = [
|
||||
{"role": "system", "content": skill},
|
||||
|
||||
@@ -9,7 +9,6 @@ from dataclasses import dataclass
|
||||
import json
|
||||
import os
|
||||
|
||||
from skillopt.gradient.deep_probe import generate_deep_probe_instruction
|
||||
from skillopt.datasets.base import BatchSpec
|
||||
from skillopt.envs.base import EnvAdapter
|
||||
from skillopt.envs.alfworld.dataloader import ALFWorldDataLoader
|
||||
@@ -82,11 +81,7 @@ class ALFWorldAdapter(EnvAdapter):
|
||||
analyst_workers: int = 16,
|
||||
failure_only: bool = False,
|
||||
minibatch_size: int = 8,
|
||||
edit_budget: int = 4,
|
||||
use_deep_reflect: bool = False,
|
||||
deep_reflect_failures: int = 4,
|
||||
deep_reflect_successes: int = 2,
|
||||
) -> None:
|
||||
edit_budget: int = 4, ) -> None:
|
||||
self.max_steps = max_steps
|
||||
self.workers = max(int(workers or 1), 1)
|
||||
self.max_api_workers = max_api_workers
|
||||
@@ -94,9 +89,6 @@ class ALFWorldAdapter(EnvAdapter):
|
||||
self.failure_only = failure_only
|
||||
self.minibatch_size = minibatch_size
|
||||
self.edit_budget = edit_budget
|
||||
self.use_deep_reflect = use_deep_reflect
|
||||
self.deep_reflect_failures = deep_reflect_failures
|
||||
self.deep_reflect_successes = deep_reflect_successes
|
||||
self.dataloader = ALFWorldDataLoader(
|
||||
split_dir=split_dir,
|
||||
data_path=data_path,
|
||||
@@ -457,129 +449,6 @@ class ALFWorldAdapter(EnvAdapter):
|
||||
meta_skill_context=meta_skill_context,
|
||||
)
|
||||
|
||||
def deep_reflect(
    self,
    results: list[dict],
    skill_content: str,
    out_dir: str,
    **kwargs,
) -> list[dict | None]:
    """Re-roll a small subset of the batch with a diagnostic probe and reflect on it.

    Pipeline, in order: pick items with ``select_representative_items``,
    attach reference context, ask ``generate_deep_probe_instruction`` for a
    probe, persist it to ``<out_dir>/deep_reflect/probe.json``, re-run the
    selected gamefiles through ``_run_batch`` in diagnostic mode, and hand the
    new results to ``run_minibatch_reflect``.

    Args:
        results: Per-task result rows of the current batch; rows are matched
            to selected items by their ``"id"``.
        skill_content: Current skill document text.
        out_dir: Step output directory; ``deep_reflect/`` is created below it.
        **kwargs: Optional ``prediction_dir`` (defaults to
            ``<out_dir>/predictions``), ``random_seed``,
            ``step_buffer_context`` and ``meta_skill_context``.

    Returns:
        Whatever ``run_minibatch_reflect`` returns, or ``[]`` when the feature
        is disabled, nothing was selected, no probe was produced, or any
        selected item has no ``gamefile``.
    """
    # Feature flag: nothing runs unless the adapter was configured for it.
    if not self.use_deep_reflect:
        return []

    prediction_dir = kwargs.get("prediction_dir", os.path.join(out_dir, "predictions"))
    random_seed = kwargs.get("random_seed")
    step_buffer_context = kwargs.get("step_buffer_context", "")
    meta_skill_context = kwargs.get("meta_skill_context", "")
    # NOTE(review): `results` is passed for both positional arguments; the
    # LiveMathematicianBench adapter passes an env_manager list (or None) in
    # the second slot -- confirm this is intended here.
    selected_items = self.select_representative_items(
        results,
        results,
        n_failures=self.deep_reflect_failures,
        n_successes=self.deep_reflect_successes,
        seed=random_seed,
    )
    if not selected_items:
        return []

    # Restrict the batch rows to the selected ids (compared as strings).
    selected_ids = {str(item["id"]) for item in selected_items}
    selected_results = [row for row in results if str(row.get("id")) in selected_ids]
    selected_examples = self.attach_reference_context(selected_results, selected_items)

    # Count how many selected items expose each reference field and build the
    # per-item summary that is stored in probe.json.
    field_counts: dict[str, int] = {}
    selected_metadata: list[dict] = []
    for item in selected_items:
        meta = self.get_reference_metadata(item)
        for field in meta["fields"]:
            field_counts[field] = field_counts.get(field, 0) + 1
        selected_metadata.append({
            "id": str(item["id"]),
            "task_type": str(item.get("task_type") or "alfworld"),
            "gamefile": str(item.get("gamefile") or ""),
            "reference_fields": meta["fields"],
            "reference_preview": meta["preview"],
        })

    deep_dir = os.path.join(out_dir, "deep_reflect")
    rollout_dir = os.path.join(deep_dir, "rollout")
    patches_dir = os.path.join(deep_dir, "patches")
    os.makedirs(deep_dir, exist_ok=True)
    # Human-readable "field(count/total)" list; "none" when no field was seen.
    field_summary = ", ".join(
        f"{field}({count}/{len(selected_items)})"
        for field, count in sorted(field_counts.items())
    ) or "none"
    print(
        f" [2b/6 DEEP REFLECT setup] selected={len(selected_items)} "
        f"reference_fields={field_summary}"
    )
    probe = generate_deep_probe_instruction(
        skill_content=skill_content,
        items=selected_examples,
        prediction_dir=prediction_dir,
        system_prompt=self.get_deep_probe_prompt(),
        step_buffer_context=step_buffer_context,
        meta_skill_context=meta_skill_context,
        output_requirements=[
            "- Some trajectories may include a hidden Reference block. Use it to target the student's latent subgoal, missing precondition, or next-step intent, but do not reveal or paraphrase that reference to the student.",
            "- The instruction must request a brief diagnostic readout inside the existing <think>...</think> block.",
            "- The student must still output exactly one admissible action inside <action>...</action>.",
            "- Do not ask for exhaustive inventories, full plans, or long chain-of-thought.",
            "- The instruction text should be ready to append directly to the student's prompt.",
        ],
    )
    if not probe:
        return []

    # The probe is persisted before the gamefile check below, so probe.json
    # stays on disk even when this method then returns [].
    with open(os.path.join(deep_dir, "probe.json"), "w", encoding="utf-8") as f:
        json.dump(
            {
                **probe,
                "reference_summary": {
                    "selected_count": len(selected_items),
                    "field_counts": field_counts,
                },
                "selected_examples": selected_metadata,
            },
            f,
            ensure_ascii=False,
            indent=2,
        )

    # Every selected item needs a gamefile to be replayed; bail out otherwise.
    gamefiles = [str(item.get("gamefile") or "") for item in selected_items]
    if any(not gamefile for gamefile in gamefiles):
        return []
    # Dataset/split are inferred from the first gamefile only.
    eval_dataset, is_train = self._infer_dataset_from_gamefile(gamefiles[0])
    deep_env = ALFWorldBatchRun(
        env_num=len(selected_items),
        eval_dataset=eval_dataset,
        # NOTE(review): `or` also replaces an explicit seed of 0 with 42 -- confirm.
        seed=random_seed or 42,
        is_train=is_train,
        specific_gamefiles=gamefiles,
        workers=min(self.workers, max(len(selected_items), 1)),
        result_ids=[str(item["id"]) for item in selected_items],
    )
    deep_results = self._run_batch(
        deep_env,
        skill_content=skill_content,
        out_dir=rollout_dir,
        diagnostic_mode=True,
        diagnostic_instruction=probe["probe_instruction"],
    )
    deep_results = self.attach_reference_context(deep_results, selected_items)
    # Reflection reads predictions from the diagnostic rollout, not the
    # original batch's prediction_dir.
    return run_minibatch_reflect(
        results=deep_results,
        skill_content=skill_content,
        prediction_dir=os.path.join(rollout_dir, "predictions"),
        patches_dir=patches_dir,
        workers=self.analyst_workers,
        failure_only=self.failure_only,
        minibatch_size=self.minibatch_size,
        edit_budget=self.edit_budget,
        random_seed=random_seed,
        error_system=self.get_error_minibatch_prompt(),
        success_system=self.get_success_minibatch_prompt(),
        step_buffer_context=step_buffer_context,
        meta_skill_context=meta_skill_context,
    )
|
||||
|
||||
def get_task_types(self) -> list[str]:
|
||||
return list(TASKS)
|
||||
|
||||
@@ -15,7 +15,7 @@ import time
|
||||
import concurrent.futures
|
||||
import numpy as np
|
||||
|
||||
from skillopt.model import chat_student
|
||||
from skillopt.model import chat_target
|
||||
|
||||
# ── Constants ─────────────────────────────────────────────────────────────────
|
||||
|
||||
@@ -210,7 +210,7 @@ def run_alfworld_batch(
|
||||
|
||||
def call_api(idx):
|
||||
try:
|
||||
response, _ = chat_student(
|
||||
response, _ = chat_target(
|
||||
system="You are an expert agent operating in the ALFRED Embodied Environment.",
|
||||
user=prompts[idx],
|
||||
max_completion_tokens=max_completion_tokens,
|
||||
|
||||
@@ -31,7 +31,6 @@ import os
|
||||
import random
|
||||
|
||||
from skillopt.datasets.base import BaseDataLoader, BatchSpec
|
||||
from skillopt.model.codex_harness import extract_codex_trace_prefix, format_codex_trace_steps, parse_codex_raw
|
||||
from skillopt.prompts import load_prompt
|
||||
|
||||
|
||||
@@ -60,24 +59,8 @@ class EnvAdapter(ABC):
|
||||
"""Return whether this adapter requires Ray runtime initialization."""
|
||||
return False
|
||||
|
||||
def deep_reflect(
|
||||
self,
|
||||
results: list[dict],
|
||||
skill_content: str,
|
||||
out_dir: str,
|
||||
**kwargs,
|
||||
) -> list[dict | None]:
|
||||
"""Optional deeper diagnostic reflection pass.
|
||||
|
||||
Default behavior is a no-op. Dataset-backed adapters may override this
|
||||
to re-query the student on a small representative subset of the current
|
||||
batch using minimally-perturbed diagnostic prompts that expose
|
||||
intermediate reasoning state.
|
||||
"""
|
||||
return []
|
||||
|
||||
def build_reference_text(self, item: dict) -> str:
    """Return the item's hidden reference material for reflection, if any.

    Reads ``item["reference_text"]``; a missing or falsy value yields an empty
    string, anything else is converted with ``str`` and stripped of
    surrounding whitespace.
    """
    raw_reference = item.get("reference_text")
    if not raw_reference:
        return ""
    return str(raw_reference).strip()
|
||||
|
||||
def get_reference_metadata(self, item: dict) -> dict:
|
||||
@@ -90,65 +73,6 @@ class EnvAdapter(ABC):
|
||||
"preview": reference_text[:400],
|
||||
}
|
||||
|
||||
def get_codex_deep_probe_prompt(self) -> str | None:
    """Load the ``deep_probe_codex`` prompt for this adapter's environment.

    The environment name is read from ``self._cfg["env_name"]``; when the
    adapter has no ``_cfg`` attribute (or no such key) ``None`` is passed as
    the ``env`` argument of ``load_prompt``.
    """
    adapter_cfg = getattr(self, "_cfg", {})
    return load_prompt("deep_probe_codex", env=adapter_cfg.get("env_name"))
|
||||
|
||||
def attach_codex_probe_context(
    self,
    results: list[dict],
    prediction_dir: str,
) -> list[dict]:
    """Return copies of ``results`` annotated with compact Codex step metadata.

    For each row, ``<prediction_dir>/<id>/codex_raw.txt`` is looked up. When
    the file exists, the copy gains ``codex_probe_trace_steps`` (formatted
    trace) and ``codex_probe_step_count`` (number of parsed steps). Rows
    without a raw trace file are copied unchanged. The input rows are never
    mutated.
    """

    def _annotate(row: dict) -> dict:
        annotated = dict(row)
        trace_file = os.path.join(prediction_dir, str(row.get("id")), "codex_raw.txt")
        if not os.path.exists(trace_file):
            return annotated
        with open(trace_file, encoding="utf-8") as handle:
            raw_trace = handle.read()
        parsed_trace = parse_codex_raw(raw_trace)
        annotated["codex_probe_trace_steps"] = format_codex_trace_steps(raw_trace)
        annotated["codex_probe_step_count"] = len(parsed_trace["steps"])
        return annotated

    return [_annotate(row) for row in results]
|
||||
|
||||
def resolve_codex_probe_target(
|
||||
self,
|
||||
*,
|
||||
selected_items: list[dict],
|
||||
selected_examples: list[dict],
|
||||
prediction_dir: str,
|
||||
probe: dict,
|
||||
) -> tuple[list[dict], dict[str, str] | None, dict]:
|
||||
"""Resolve the teacher-selected codex probe target and raw trace prefix."""
|
||||
target_id = str(probe.get("probe_target_id", "")).strip()
|
||||
selected_id_set = {str(item["id"]) for item in selected_items}
|
||||
if target_id not in selected_id_set:
|
||||
target_id = str(selected_items[0]["id"])
|
||||
target_item = next(item for item in selected_items if str(item["id"]) == target_id)
|
||||
target_result = next(
|
||||
(row for row in selected_examples if str(row.get("id")) == target_id),
|
||||
None,
|
||||
)
|
||||
max_probe_step = int((target_result or {}).get("codex_probe_step_count", 0))
|
||||
default_probe_step = max_probe_step - 1 if max_probe_step > 1 else max_probe_step
|
||||
probe_after_step = int(probe.get("probe_after_step", default_probe_step))
|
||||
if max_probe_step > 0:
|
||||
probe_after_step = max(0, min(probe_after_step, max_probe_step))
|
||||
else:
|
||||
probe_after_step = 0
|
||||
raw_path = os.path.join(prediction_dir, target_id, "codex_raw.txt")
|
||||
trace_prefix = ""
|
||||
if os.path.exists(raw_path):
|
||||
with open(raw_path, encoding="utf-8") as f:
|
||||
trace_prefix = extract_codex_trace_prefix(f.read(), after_step=probe_after_step)
|
||||
updated_probe = dict(probe)
|
||||
updated_probe["probe_target_id"] = target_id
|
||||
updated_probe["probe_after_step"] = probe_after_step
|
||||
return [target_item], {target_id: trace_prefix}, updated_probe
|
||||
|
||||
def attach_reference_context(
|
||||
self,
|
||||
results: list[dict],
|
||||
@@ -383,14 +307,3 @@ class EnvAdapter(ABC):
|
||||
if prompt is not None:
|
||||
return prompt
|
||||
return self._load_env_prompt("analyst_success")
|
||||
|
||||
def get_deep_probe_prompt(self) -> str | None:
|
||||
return self._load_env_prompt("deep_probe")
|
||||
|
||||
def get_meta_reflect_prompt(self) -> str | None:
|
||||
update_mode = getattr(self, "_cfg", {}).get("skill_update_mode", "patch")
|
||||
if str(update_mode).strip().lower() == "rewrite_from_suggestions":
|
||||
prompt = self._load_env_prompt("meta_reflect_rewrite")
|
||||
if prompt is not None:
|
||||
return prompt
|
||||
return self._load_env_prompt("meta_reflect")
|
||||
|
||||
@@ -4,7 +4,6 @@ import os
|
||||
|
||||
from skillopt.datasets.base import BatchSpec
|
||||
from skillopt.envs.base import EnvAdapter
|
||||
from skillopt.envs.deep_reflect import run_no_reference_deep_reflect
|
||||
from skillopt.envs.docvqa.dataloader import DocVQADataLoader
|
||||
from skillopt.envs.docvqa.rollout import run_batch
|
||||
from skillopt.gradient.reflect import run_minibatch_reflect
|
||||
@@ -28,11 +27,7 @@ class DocVQAAdapter(EnvAdapter):
|
||||
edit_budget: int = 4,
|
||||
seed: int = 42,
|
||||
limit: int = 0,
|
||||
image_detail: str = "auto",
|
||||
use_deep_reflect: bool = False,
|
||||
deep_reflect_failures: int = 4,
|
||||
deep_reflect_successes: int = 2,
|
||||
) -> None:
|
||||
image_detail: str = "auto", ) -> None:
|
||||
self.max_turns = max_turns
|
||||
self.exec_timeout = exec_timeout
|
||||
self.workers = workers
|
||||
@@ -41,9 +36,6 @@ class DocVQAAdapter(EnvAdapter):
|
||||
self.minibatch_size = minibatch_size
|
||||
self.edit_budget = edit_budget
|
||||
self.image_detail = image_detail
|
||||
self.use_deep_reflect = use_deep_reflect
|
||||
self.deep_reflect_failures = deep_reflect_failures
|
||||
self.deep_reflect_successes = deep_reflect_successes
|
||||
self.dataloader = DocVQADataLoader(
|
||||
split_dir=split_dir,
|
||||
data_path=data_path,
|
||||
@@ -109,38 +101,6 @@ class DocVQAAdapter(EnvAdapter):
|
||||
update_mode=getattr(self, "_cfg", {}).get("skill_update_mode", "patch"),
|
||||
)
|
||||
|
||||
def deep_reflect(
    self,
    results: list[dict],
    skill_content: str,
    out_dir: str,
    **kwargs,
) -> list[dict | None]:
    """Delegate DocVQA deep reflection to ``run_no_reference_deep_reflect``.

    Forwards ``env_manager``, ``prediction_dir``, ``random_seed`` and
    ``step_buffer_context`` from ``kwargs`` together with the DocVQA-specific
    probe requirements and a per-item metadata builder, and returns the
    helper's result unchanged.
    """

    def _describe_item(item):
        # Compact per-item summary handed to the helper as metadata_builder.
        return {
            "id": str(item.get("id")),
            "task_type": str(item.get("task_type") or "docvqa"),
            "question_preview": str(item.get("question") or "")[:200],
            "image_path": item.get("image_path", ""),
            "docId": item.get("docId", ""),
            "page": item.get("ucsf_document_page_no", ""),
        }

    probe_requirements = [
        "- There is no hidden reference block. Use only the document image prompt, student output, and evaluation result to infer what intermediate state is worth probing.",
        "- The instruction must explicitly request a short <analysis>...</analysis> block before the final <answer>...</answer>.",
        "- The readout should focus on visual region, field/table/figure label, OCR text read, candidate answer, and answer-format normalization.",
        "- Do not ask for exhaustive transcription or a full chain-of-thought.",
        "- The instruction text should be ready to append directly to the student's prompt.",
    ]
    forwarded = {
        "env_manager": kwargs.get("env_manager"),
        "prediction_dir": kwargs.get("prediction_dir"),
        "random_seed": kwargs.get("random_seed"),
        "step_buffer_context": kwargs.get("step_buffer_context", ""),
    }
    return run_no_reference_deep_reflect(
        self,
        results,
        skill_content,
        out_dir,
        output_requirements=probe_requirements,
        metadata_builder=_describe_item,
        **forwarded,
    )
|
||||
|
||||
def get_task_types(self) -> list[str]:
|
||||
seen: list[str] = []
|
||||
|
||||
@@ -6,8 +6,8 @@ import time
|
||||
from concurrent.futures import FIRST_COMPLETED, ThreadPoolExecutor, wait
|
||||
|
||||
from skillopt.envs.docvqa.evaluator import evaluate
|
||||
from skillopt.model import chat_student_messages, get_student_backend, is_student_exec_backend
|
||||
from skillopt.model.codex_harness import prepare_workspace, render_skill_md, run_student_exec
|
||||
from skillopt.model import chat_target_messages, get_target_backend, is_target_exec_backend
|
||||
from skillopt.model.codex_harness import prepare_workspace, render_skill_md, run_target_exec
|
||||
from skillopt.prompts import load_prompt
|
||||
|
||||
|
||||
@@ -112,11 +112,11 @@ def _run_codex_once(
|
||||
images=[item["image_path"]],
|
||||
)
|
||||
prompt = (
|
||||
"Use the `skillopt-student` skill available in this workspace.\n"
|
||||
"Use the `skillopt-target` skill available in this workspace.\n"
|
||||
"Read `task.md`, inspect the attached document image, and answer the DocVQA question.\n"
|
||||
"Return the final answer inside <answer>...</answer>."
|
||||
)
|
||||
final_message, raw = run_student_exec(
|
||||
final_message, raw = run_target_exec(
|
||||
work_dir=work_dir,
|
||||
prompt=prompt,
|
||||
model=model,
|
||||
@@ -158,7 +158,7 @@ def process_one(
|
||||
system_prompt = ""
|
||||
user_text = ""
|
||||
conversation: list[dict] = []
|
||||
if is_student_exec_backend():
|
||||
if is_target_exec_backend():
|
||||
from skillopt.model import azure_openai as _llm
|
||||
|
||||
conversation = [
|
||||
@@ -172,7 +172,7 @@ def process_one(
|
||||
pred_dir=os.path.join(out_root, "predictions", item_id),
|
||||
item=item,
|
||||
skill_content=skill_content,
|
||||
model=_llm.STUDENT_DEPLOYMENT,
|
||||
model=_llm.TARGET_DEPLOYMENT,
|
||||
timeout=exec_timeout,
|
||||
image_detail=image_detail,
|
||||
diagnostic_mode=diagnostic_mode if turn == 0 else False,
|
||||
@@ -198,7 +198,7 @@ def process_one(
|
||||
]
|
||||
for turn in range(max_turns):
|
||||
if turn == 0:
|
||||
resp_text, _ = chat_student_messages(
|
||||
resp_text, _ = chat_target_messages(
|
||||
messages=messages,
|
||||
max_completion_tokens=768,
|
||||
retries=5,
|
||||
@@ -212,7 +212,7 @@ def process_one(
|
||||
{"role": "assistant", "content": response},
|
||||
{"role": "user", "content": "Review the same image carefully and answer again. Keep the final answer inside <answer>...</answer>."},
|
||||
]
|
||||
resp_text, _ = chat_student_messages(
|
||||
resp_text, _ = chat_target_messages(
|
||||
messages=refinement_messages,
|
||||
max_completion_tokens=512,
|
||||
retries=5,
|
||||
@@ -230,9 +230,9 @@ def process_one(
|
||||
|
||||
pred_dir = os.path.join(out_root, "predictions", item_id)
|
||||
os.makedirs(pred_dir, exist_ok=True)
|
||||
with open(os.path.join(pred_dir, "student_system_prompt.txt"), "w", encoding="utf-8") as f:
|
||||
with open(os.path.join(pred_dir, "target_system_prompt.txt"), "w", encoding="utf-8") as f:
|
||||
f.write(system_prompt)
|
||||
with open(os.path.join(pred_dir, "student_user_prompt.txt"), "w", encoding="utf-8") as f:
|
||||
with open(os.path.join(pred_dir, "target_user_prompt.txt"), "w", encoding="utf-8") as f:
|
||||
f.write(user_text)
|
||||
|
||||
eval_result = evaluate(response, item.get("answers", []))
|
||||
|
||||
@@ -4,13 +4,12 @@ from __future__ import annotations
|
||||
import json
|
||||
import os
|
||||
|
||||
from skillopt.gradient.deep_probe import generate_deep_probe_instruction
|
||||
from skillopt.datasets.base import BatchSpec
|
||||
from skillopt.gradient.reflect import run_minibatch_reflect
|
||||
from skillopt.envs.base import EnvAdapter
|
||||
from skillopt.envs.livemathematicianbench.dataloader import LiveMathematicianBenchDataLoader
|
||||
from skillopt.envs.livemathematicianbench.rollout import run_batch
|
||||
from skillopt.model import get_student_backend
|
||||
from skillopt.model import get_target_backend
|
||||
|
||||
|
||||
class LiveMathematicianBenchAdapter(EnvAdapter):
|
||||
@@ -61,11 +60,7 @@ class LiveMathematicianBenchAdapter(EnvAdapter):
|
||||
limit: int = 0,
|
||||
shuffle_choices: bool = True,
|
||||
use_theorem: bool = False,
|
||||
use_sketch: bool = False,
|
||||
use_deep_reflect: bool = False,
|
||||
deep_reflect_failures: int = 4,
|
||||
deep_reflect_successes: int = 2,
|
||||
) -> None:
|
||||
use_sketch: bool = False, ) -> None:
|
||||
self.max_turns = max_turns
|
||||
self.exec_timeout = exec_timeout
|
||||
self.workers = workers
|
||||
@@ -75,9 +70,6 @@ class LiveMathematicianBenchAdapter(EnvAdapter):
|
||||
self.edit_budget = edit_budget
|
||||
self.use_theorem = use_theorem
|
||||
self.use_sketch = use_sketch
|
||||
self.use_deep_reflect = use_deep_reflect
|
||||
self.deep_reflect_failures = deep_reflect_failures
|
||||
self.deep_reflect_successes = deep_reflect_successes
|
||||
self.dataloader = LiveMathematicianBenchDataLoader(
|
||||
split_dir=split_dir,
|
||||
data_path=data_path,
|
||||
@@ -161,122 +153,6 @@ class LiveMathematicianBenchAdapter(EnvAdapter):
|
||||
update_mode=getattr(self, "_cfg", {}).get("skill_update_mode", "patch"),
|
||||
)
|
||||
|
||||
def deep_reflect(
    self,
    results: list[dict],
    skill_content: str,
    out_dir: str,
    **kwargs,
) -> list[dict | None]:
    """Re-run a small subset of the batch with a diagnostic probe and reflect on it.

    Pipeline, in order: pick items with ``select_representative_items``,
    attach reference (and, on the ``codex_exec`` backend, Codex trace)
    context, ask ``generate_deep_probe_instruction`` for a probe, persist it
    to ``<out_dir>/deep_reflect/probe.json``, re-run the selected items via
    ``run_batch`` in diagnostic mode, and hand the new results to
    ``run_minibatch_reflect``.

    Args:
        results: Per-task result rows of the current batch; rows are matched
            to selected items by their ``"id"``.
        skill_content: Current skill document text.
        out_dir: Step output directory; ``deep_reflect/`` is created below it.
        **kwargs: Optional ``env_manager``, ``prediction_dir`` (defaults to
            ``<out_dir>/predictions``), ``random_seed``,
            ``step_buffer_context`` and ``meta_skill_context``.

    Returns:
        Whatever ``run_minibatch_reflect`` returns, or ``[]`` when the feature
        is disabled, nothing was selected, or no probe was produced.
    """
    # Feature flag: nothing runs unless the adapter was configured for it.
    if not self.use_deep_reflect:
        return []

    env_manager = kwargs.get("env_manager")
    prediction_dir = kwargs.get("prediction_dir", os.path.join(out_dir, "predictions"))
    random_seed = kwargs.get("random_seed")
    step_buffer_context = kwargs.get("step_buffer_context", "")
    meta_skill_context = kwargs.get("meta_skill_context", "")
    # The codex_exec backend gets a dedicated probe prompt and a single
    # probe target (see the two `if codex_backend:` branches below).
    codex_backend = get_student_backend() == "codex_exec"
    selected_items = self.select_representative_items(
        results,
        # Only a list-typed env_manager is forwarded; anything else becomes None.
        env_manager if isinstance(env_manager, list) else None,
        n_failures=self.deep_reflect_failures,
        n_successes=self.deep_reflect_successes,
        seed=random_seed,
    )
    if not selected_items:
        return []
    # Restrict the batch rows to the selected ids (compared as strings).
    selected_ids = {str(item["id"]) for item in selected_items}
    selected_results = [row for row in results if str(row.get("id")) in selected_ids]
    selected_examples = self.attach_reference_context(selected_results, selected_items)
    if codex_backend:
        selected_examples = self.attach_codex_probe_context(selected_examples, prediction_dir)
    # Per-item summary stored in probe.json, plus counts of items whose
    # reference metadata lists a "theorem" / "sketch" field.
    selected_metadata = []
    theorem_count = 0
    sketch_count = 0
    for item in selected_items:
        meta = self.get_reference_metadata(item)
        if "theorem" in meta["fields"]:
            theorem_count += 1
        if "sketch" in meta["fields"]:
            sketch_count += 1
        selected_metadata.append({
            "id": str(item["id"]),
            # First entry of `theorem_type` when present and non-empty,
            # otherwise the generic "math_mcq" label.
            "task_type": str(item.get("theorem_type", ["math_mcq"])[0] if item.get("theorem_type") else "math_mcq"),
            "reference_fields": meta["fields"],
            "reference_preview": meta["preview"],
        })

    deep_dir = os.path.join(out_dir, "deep_reflect")
    rollout_dir = os.path.join(deep_dir, "rollout")
    patches_dir = os.path.join(deep_dir, "patches")
    os.makedirs(deep_dir, exist_ok=True)
    print(
        f" [2b/6 DEEP REFLECT setup] selected={len(selected_items)} "
        f"reference_fields=theorem({theorem_count}/{len(selected_items)}),"
        f"sketch({sketch_count}/{len(selected_items)})"
    )
    probe = generate_deep_probe_instruction(
        skill_content=skill_content,
        items=selected_examples,
        prediction_dir=prediction_dir,
        system_prompt=self.get_codex_deep_probe_prompt() if codex_backend else self.get_deep_probe_prompt(),
        step_buffer_context=step_buffer_context,
        meta_skill_context=meta_skill_context,
    )
    if not probe:
        return []
    diagnostic_trace_context_by_id = None
    if codex_backend:
        # Narrows selected_items to the single probe target and rewrites the
        # probe with the resolved target id / step.
        selected_items, diagnostic_trace_context_by_id, probe = self.resolve_codex_probe_target(
            selected_items=selected_items,
            selected_examples=selected_examples,
            prediction_dir=prediction_dir,
            probe=probe,
        )
    # NOTE(review): on the codex path `selected_count` is taken after the
    # narrowing above, while the field counts and `selected_examples` still
    # describe the full pre-narrowing selection -- confirm this is intended.
    probe_record = {
        **probe,
        "reference_summary": {
            "selected_count": len(selected_items),
            "field_counts": {
                "theorem": theorem_count,
                "sketch": sketch_count,
            },
        },
        "selected_examples": selected_metadata,
    }
    with open(os.path.join(deep_dir, "probe.json"), "w", encoding="utf-8") as f:
        json.dump(probe_record, f, ensure_ascii=False, indent=2)
    deep_results = run_batch(
        items=selected_items,
        out_root=rollout_dir,
        skill_content=skill_content,
        max_turns=self.max_turns,
        workers=min(self.workers, max(len(selected_items), 1)),
        use_theorem=self.use_theorem,
        use_sketch=self.use_sketch,
        diagnostic_mode=True,
        diagnostic_instruction=probe["probe_instruction"],
        diagnostic_trace_context_by_id=diagnostic_trace_context_by_id,
        task_timeout=self.exec_timeout,
    )
    deep_results = self.attach_reference_context(deep_results, selected_items)
    # Reflection reads predictions from the diagnostic rollout, not the
    # original batch's prediction_dir.
    return run_minibatch_reflect(
        results=deep_results,
        skill_content=skill_content,
        prediction_dir=os.path.join(rollout_dir, "predictions"),
        patches_dir=patches_dir,
        workers=self.analyst_workers,
        failure_only=self.failure_only,
        minibatch_size=self.minibatch_size,
        edit_budget=self.edit_budget,
        random_seed=random_seed,
        error_system=self.get_error_minibatch_prompt(),
        success_system=self.get_success_minibatch_prompt(),
        step_buffer_context=step_buffer_context,
        meta_skill_context=meta_skill_context,
        update_mode=getattr(self, "_cfg", {}).get("skill_update_mode", "patch"),
    )
|
||||
|
||||
def get_task_types(self) -> list[str]:
|
||||
return self.dataloader.get_task_types()
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
You are an expert failure-analysis agent for theorem-grounded mathematical multiple-choice questions.
|
||||
|
||||
You will be given MULTIPLE failed trajectories from a single minibatch and the current skill document.
|
||||
Each trajectory includes the student's response and an evaluation result showing the predicted option
|
||||
Each trajectory includes the target's response and an evaluation result showing the predicted option
|
||||
versus the correct option.
|
||||
|
||||
Your job is to identify COMMON reasoning failures across the batch and propose concise skill edits.
|
||||
|
||||
@@ -7,8 +7,8 @@ import time
|
||||
from concurrent.futures import FIRST_COMPLETED, ThreadPoolExecutor, wait
|
||||
|
||||
from skillopt.envs.livemathematicianbench.evaluator import evaluate
|
||||
from skillopt.model import chat_student, get_student_backend, is_student_exec_backend
|
||||
from skillopt.model.codex_harness import prepare_workspace, render_skill_md, run_student_exec
|
||||
from skillopt.model import chat_target, get_target_backend, is_target_exec_backend
|
||||
from skillopt.model.codex_harness import prepare_workspace, render_skill_md, run_target_exec
|
||||
from skillopt.prompts import load_prompt
|
||||
|
||||
def _build_system(skill_content: str) -> str:
|
||||
@@ -95,11 +95,11 @@ def _run_codex_once(
|
||||
work_dir = os.path.join(pred_dir, "codex_exec")
|
||||
prepare_workspace(work_dir=work_dir, skill_md=skill_md, task_text=task_text)
|
||||
prompt = (
|
||||
"Use the `skillopt-student` skill available in this workspace.\n"
|
||||
"Use the `skillopt-target` skill available in this workspace.\n"
|
||||
"Read `task.md` and solve the multiple-choice problem.\n"
|
||||
"Output only the final choice label inside <answer>...</answer>."
|
||||
)
|
||||
final_message, raw = run_student_exec(
|
||||
final_message, raw = run_target_exec(
|
||||
work_dir=work_dir,
|
||||
prompt=prompt,
|
||||
model=model,
|
||||
@@ -143,7 +143,7 @@ def process_one(
|
||||
pred_dir = os.path.join(out_root, "predictions", item_id)
|
||||
os.makedirs(pred_dir, exist_ok=True)
|
||||
|
||||
if is_student_exec_backend():
|
||||
if is_target_exec_backend():
|
||||
from skillopt.model import azure_openai as _llm
|
||||
|
||||
conversation: list[dict] = []
|
||||
@@ -155,7 +155,7 @@ def process_one(
|
||||
pred_dir=pred_dir,
|
||||
skill_content=skill_content,
|
||||
item=item,
|
||||
model=_llm.STUDENT_DEPLOYMENT,
|
||||
model=_llm.TARGET_DEPLOYMENT,
|
||||
timeout=exec_timeout,
|
||||
use_theorem=use_theorem,
|
||||
use_sketch=use_sketch,
|
||||
@@ -172,9 +172,9 @@ def process_one(
|
||||
result["agent_ok"] = True
|
||||
result["n_turns"] = len(conversation)
|
||||
|
||||
with open(os.path.join(pred_dir, "student_system_prompt.txt"), "w", encoding="utf-8") as f:
|
||||
with open(os.path.join(pred_dir, "target_system_prompt.txt"), "w", encoding="utf-8") as f:
|
||||
f.write(system)
|
||||
with open(os.path.join(pred_dir, "student_user_prompt.txt"), "w", encoding="utf-8") as f:
|
||||
with open(os.path.join(pred_dir, "target_user_prompt.txt"), "w", encoding="utf-8") as f:
|
||||
f.write(user)
|
||||
|
||||
eval_result = evaluate(response, item["correct_choice"], item["choices"])
|
||||
@@ -216,7 +216,7 @@ def process_one(
|
||||
|
||||
for turn in range(max_turns):
|
||||
if turn == 0:
|
||||
resp_text, _ = chat_student(
|
||||
resp_text, _ = chat_target(
|
||||
system=system,
|
||||
user=user,
|
||||
max_completion_tokens=16384,
|
||||
@@ -230,7 +230,7 @@ def process_one(
|
||||
"Re-evaluate the exact option wording. If needed, correct it. "
|
||||
"Output only the final choice label inside <answer>...</answer>."
|
||||
)
|
||||
resp_text, _ = chat_student(
|
||||
resp_text, _ = chat_target(
|
||||
system=system,
|
||||
user=refinement,
|
||||
max_completion_tokens=16384,
|
||||
@@ -247,9 +247,9 @@ def process_one(
|
||||
result["agent_ok"] = True
|
||||
result["n_turns"] = len(conversation)
|
||||
|
||||
with open(os.path.join(pred_dir, "student_system_prompt.txt"), "w", encoding="utf-8") as f:
|
||||
with open(os.path.join(pred_dir, "target_system_prompt.txt"), "w", encoding="utf-8") as f:
|
||||
f.write(system)
|
||||
with open(os.path.join(pred_dir, "student_user_prompt.txt"), "w", encoding="utf-8") as f:
|
||||
with open(os.path.join(pred_dir, "target_user_prompt.txt"), "w", encoding="utf-8") as f:
|
||||
f.write(user)
|
||||
|
||||
eval_result = evaluate(response, item["correct_choice"], item["choices"])
|
||||
|
||||
@@ -4,7 +4,6 @@ import os
|
||||
|
||||
from skillopt.datasets.base import BatchSpec
|
||||
from skillopt.envs.base import EnvAdapter
|
||||
from skillopt.envs.deep_reflect import run_no_reference_deep_reflect
|
||||
from skillopt.envs.officeqa.dataloader import OfficeQADataLoader
|
||||
from skillopt.envs.officeqa.rollout import run_batch
|
||||
from skillopt.gradient.reflect import run_minibatch_reflect
|
||||
@@ -37,11 +36,7 @@ class OfficeQAAdapter(EnvAdapter):
|
||||
search_timeout_seconds: int = 20,
|
||||
use_local_tools: bool = True,
|
||||
data_dirs: list[str] | str | None = None,
|
||||
docs_dirs: list[str] | str | None = None,
|
||||
use_deep_reflect: bool = False,
|
||||
deep_reflect_failures: int = 4,
|
||||
deep_reflect_successes: int = 2,
|
||||
) -> None:
|
||||
docs_dirs: list[str] | str | None = None, ) -> None:
|
||||
self.workers = workers
|
||||
self.analyst_workers = analyst_workers
|
||||
self.failure_only = failure_only
|
||||
@@ -58,9 +53,6 @@ class OfficeQAAdapter(EnvAdapter):
|
||||
self.search_timeout_seconds = int(search_timeout_seconds)
|
||||
self.use_local_tools = bool(use_local_tools)
|
||||
self.data_dirs = data_dirs if data_dirs is not None else docs_dirs
|
||||
self.use_deep_reflect = use_deep_reflect
|
||||
self.deep_reflect_failures = deep_reflect_failures
|
||||
self.deep_reflect_successes = deep_reflect_successes
|
||||
self.dataloader = OfficeQADataLoader(
|
||||
split_dir=split_dir,
|
||||
data_path=data_path,
|
||||
@@ -133,37 +125,6 @@ class OfficeQAAdapter(EnvAdapter):
|
||||
update_mode=getattr(self, "_cfg", {}).get("skill_update_mode", "patch"),
|
||||
)
|
||||
|
||||
def deep_reflect(
|
||||
self,
|
||||
results: list[dict],
|
||||
skill_content: str,
|
||||
out_dir: str,
|
||||
**kwargs,
|
||||
) -> list[dict | None]:
|
||||
return run_no_reference_deep_reflect(
|
||||
self,
|
||||
results,
|
||||
skill_content,
|
||||
out_dir,
|
||||
env_manager=kwargs.get("env_manager"),
|
||||
prediction_dir=kwargs.get("prediction_dir"),
|
||||
random_seed=kwargs.get("random_seed"),
|
||||
step_buffer_context=kwargs.get("step_buffer_context", ""),
|
||||
output_requirements=[
|
||||
"- There is no hidden reference block. Use only the question, candidate files, tool trace, student output, and evaluation result to infer what intermediate state is worth probing.",
|
||||
"- The instruction must explicitly request a short <analysis>...</analysis> block before the final <answer>...</answer>.",
|
||||
"- The readout should focus on selected document/file, evidence span or table, extracted value, units, and any date or fiscal-period normalization.",
|
||||
"- Do not ask for exhaustive copying of source text or a full chain-of-thought.",
|
||||
"- The instruction text should be ready to append directly to the student's prompt.",
|
||||
],
|
||||
metadata_builder=lambda item: {
|
||||
"id": str(item.get("id")),
|
||||
"task_type": str(item.get("task_type") or "officeqa"),
|
||||
"question_preview": str(item.get("question") or "")[:200],
|
||||
"source_files": item.get("source_files", []),
|
||||
"source_docs": item.get("source_docs", []),
|
||||
},
|
||||
)
|
||||
|
||||
def get_task_types(self) -> list[str]:
|
||||
seen: list[str] = []
|
||||
|
||||
@@ -14,8 +14,8 @@ try:
|
||||
from skillopt.envs.sealqa.tool_runtime import custom_search
|
||||
except ImportError:
|
||||
custom_search = None # type: ignore[assignment]
|
||||
from skillopt.model import chat_student_messages, get_student_backend, is_student_exec_backend
|
||||
from skillopt.model.codex_harness import prepare_workspace, render_skill_md, run_student_exec
|
||||
from skillopt.model import chat_target_messages, get_target_backend, is_target_exec_backend
|
||||
from skillopt.model.codex_harness import prepare_workspace, render_skill_md, run_target_exec
|
||||
from skillopt.prompts import load_prompt
|
||||
_TOOL_SCHEMAS = [
|
||||
{
|
||||
@@ -299,12 +299,12 @@ def _run_codex_once(
|
||||
link_dirs=_docs_link_targets(docs_roots),
|
||||
)
|
||||
prompt = (
|
||||
"Use the `skillopt-student` skill available in this workspace.\n"
|
||||
"Use the `skillopt-target` skill available in this workspace.\n"
|
||||
"Read `task.md`, inspect or search the full OfficeQA corpus under `docs/`, and answer the question.\n"
|
||||
"Treat candidate files in `task.md` as hints, not an access limit.\n"
|
||||
"Return the final answer inside <answer>...</answer>."
|
||||
)
|
||||
final_message, raw = run_student_exec(
|
||||
final_message, raw = run_target_exec(
|
||||
work_dir=work_dir,
|
||||
prompt=prompt,
|
||||
model=model,
|
||||
@@ -356,8 +356,8 @@ def _run_custom_search_process(
|
||||
raise ValueError("custom_search mode requires a non-empty search_api_url")
|
||||
if not os.environ.get(search_auth_env, "").strip():
|
||||
raise ValueError(f"custom_search mode requires auth token env var {search_auth_env}")
|
||||
if get_student_backend() not in {"openai_chat", "qwen_chat"}:
|
||||
raise ValueError("custom_search mode is only supported with student_backend='openai_chat' or 'qwen_chat'")
|
||||
if get_target_backend() not in {"openai_chat", "qwen_chat"}:
|
||||
raise ValueError("custom_search mode is only supported with target_backend='openai_chat' or 'qwen_chat'")
|
||||
system = _build_system(
|
||||
skill_content,
|
||||
search_mode=_CUSTOM_SEARCH_MODE,
|
||||
@@ -385,7 +385,7 @@ def _run_custom_search_process(
|
||||
fail_reason = ""
|
||||
last_response_metadata: dict = {}
|
||||
for turn in range(1, max_tool_turns + 1):
|
||||
message, _ = chat_student_messages(
|
||||
message, _ = chat_target_messages(
|
||||
messages=messages,
|
||||
max_completion_tokens=max_completion_tokens,
|
||||
retries=5,
|
||||
@@ -439,8 +439,8 @@ def _run_azure_search_process(
|
||||
diagnostic_mode: bool,
|
||||
diagnostic_instruction: str,
|
||||
) -> tuple[str, str, str, str, list[dict], str, dict]:
|
||||
if get_student_backend() != "openai_chat":
|
||||
raise ValueError("azure_search mode is only supported with student_backend='openai_chat'")
|
||||
if get_target_backend() != "openai_chat":
|
||||
raise ValueError("azure_search mode is only supported with target_backend='openai_chat'")
|
||||
system = _build_system(skill_content, search_mode=_AZURE_SEARCH_MODE)
|
||||
user = _build_user(
|
||||
item,
|
||||
@@ -453,7 +453,7 @@ def _run_azure_search_process(
|
||||
{"role": "user", "content": user},
|
||||
]
|
||||
conversation: list[dict] = [{"role": "user", "content": user}]
|
||||
message, _ = chat_student_messages(
|
||||
message, _ = chat_target_messages(
|
||||
messages=messages,
|
||||
max_completion_tokens=max_completion_tokens,
|
||||
retries=5,
|
||||
@@ -494,7 +494,7 @@ def _run_offline_no_tools_process(
|
||||
{"role": "user", "content": user},
|
||||
]
|
||||
conversation: list[dict] = [{"role": "user", "content": user}]
|
||||
message, _ = chat_student_messages(
|
||||
message, _ = chat_target_messages(
|
||||
messages=messages,
|
||||
max_completion_tokens=max_completion_tokens,
|
||||
retries=5,
|
||||
@@ -616,7 +616,7 @@ def process_one(
|
||||
candidate_files=candidate_files,
|
||||
oracle_context=oracle_context,
|
||||
)
|
||||
elif is_student_exec_backend():
|
||||
elif is_target_exec_backend():
|
||||
from skillopt.model import azure_openai as _llm
|
||||
response = ""
|
||||
system = ""
|
||||
@@ -628,7 +628,7 @@ def process_one(
|
||||
skill_content=skill_content,
|
||||
candidate_files=candidate_files,
|
||||
docs_roots=docs_roots,
|
||||
model=_llm.STUDENT_DEPLOYMENT,
|
||||
model=_llm.TARGET_DEPLOYMENT,
|
||||
timeout=180,
|
||||
diagnostic_mode=diagnostic_mode if turn == 1 else False,
|
||||
diagnostic_instruction=diagnostic_instruction if turn == 1 else "",
|
||||
@@ -650,7 +650,7 @@ def process_one(
|
||||
{"role": "user", "content": user},
|
||||
]
|
||||
for turn in range(1, max_tool_turns + 1):
|
||||
message, _ = chat_student_messages(
|
||||
message, _ = chat_target_messages(
|
||||
messages=messages,
|
||||
max_completion_tokens=768,
|
||||
retries=5,
|
||||
@@ -688,9 +688,9 @@ def process_one(
|
||||
break
|
||||
except Exception as e: # noqa: BLE001
|
||||
fail_reason = f"error: {e}"
|
||||
with open(os.path.join(pred_dir, "student_system_prompt.txt"), "w", encoding="utf-8") as f:
|
||||
with open(os.path.join(pred_dir, "target_system_prompt.txt"), "w", encoding="utf-8") as f:
|
||||
f.write(system)
|
||||
with open(os.path.join(pred_dir, "student_user_prompt.txt"), "w", encoding="utf-8") as f:
|
||||
with open(os.path.join(pred_dir, "target_user_prompt.txt"), "w", encoding="utf-8") as f:
|
||||
f.write(user)
|
||||
with open(os.path.join(pred_dir, "conversation.json"), "w", encoding="utf-8") as f:
|
||||
json.dump(conversation, f, ensure_ascii=False, indent=2)
|
||||
@@ -714,8 +714,8 @@ def process_one(
|
||||
"agent_ok": not fail_reason,
|
||||
"n_turns": len(conversation),
|
||||
"last_finish_reason": last_response_metadata.get("finish_reason", ""),
|
||||
"student_system_prompt": system,
|
||||
"student_user_prompt": user,
|
||||
"target_system_prompt": system,
|
||||
"target_user_prompt": user,
|
||||
}
|
||||
return result
|
||||
def run_batch(
|
||||
|
||||
@@ -4,13 +4,12 @@ from __future__ import annotations
|
||||
import json
|
||||
import os
|
||||
|
||||
from skillopt.gradient.deep_probe import generate_deep_probe_instruction
|
||||
from skillopt.datasets.base import BatchSpec
|
||||
from skillopt.envs.base import EnvAdapter
|
||||
from skillopt.envs.searchqa.dataloader import SearchQADataLoader
|
||||
from skillopt.envs.searchqa.rollout import run_batch
|
||||
from skillopt.gradient.reflect import run_minibatch_reflect
|
||||
from skillopt.model import get_student_backend
|
||||
from skillopt.model import get_target_backend
|
||||
|
||||
|
||||
class SearchQAAdapter(EnvAdapter):
|
||||
@@ -32,11 +31,7 @@ class SearchQAAdapter(EnvAdapter):
|
||||
minibatch_size: int = 8,
|
||||
edit_budget: int = 4,
|
||||
seed: int = 42,
|
||||
limit: int = 0,
|
||||
use_deep_reflect: bool = False,
|
||||
deep_reflect_failures: int = 4,
|
||||
deep_reflect_successes: int = 2,
|
||||
) -> None:
|
||||
limit: int = 0, ) -> None:
|
||||
self.max_turns = max_turns
|
||||
self.exec_timeout = exec_timeout
|
||||
self.workers = workers
|
||||
@@ -44,9 +39,6 @@ class SearchQAAdapter(EnvAdapter):
|
||||
self.failure_only = failure_only
|
||||
self.minibatch_size = minibatch_size
|
||||
self.edit_budget = edit_budget
|
||||
self.use_deep_reflect = use_deep_reflect
|
||||
self.deep_reflect_failures = deep_reflect_failures
|
||||
self.deep_reflect_successes = deep_reflect_successes
|
||||
self.dataloader = SearchQADataLoader(
|
||||
split_dir=split_dir,
|
||||
data_path=data_path,
|
||||
@@ -128,121 +120,6 @@ class SearchQAAdapter(EnvAdapter):
|
||||
update_mode=getattr(self, "_cfg", {}).get("skill_update_mode", "patch"),
|
||||
)
|
||||
|
||||
def deep_reflect(
|
||||
self,
|
||||
results: list[dict],
|
||||
skill_content: str,
|
||||
out_dir: str,
|
||||
**kwargs,
|
||||
) -> list[dict | None]:
|
||||
if not self.use_deep_reflect:
|
||||
return []
|
||||
|
||||
env_manager = kwargs.get("env_manager")
|
||||
if not isinstance(env_manager, list):
|
||||
return []
|
||||
|
||||
prediction_dir = kwargs.get("prediction_dir", os.path.join(out_dir, "predictions"))
|
||||
random_seed = kwargs.get("random_seed")
|
||||
step_buffer_context = kwargs.get("step_buffer_context", "")
|
||||
meta_skill_context = kwargs.get("meta_skill_context", "")
|
||||
codex_backend = get_student_backend() == "codex_exec"
|
||||
selected_items = self.select_representative_items(
|
||||
results,
|
||||
env_manager,
|
||||
n_failures=self.deep_reflect_failures,
|
||||
n_successes=self.deep_reflect_successes,
|
||||
seed=random_seed,
|
||||
)
|
||||
if not selected_items:
|
||||
return []
|
||||
|
||||
selected_ids = {str(item["id"]) for item in selected_items}
|
||||
selected_results = [row for row in results if str(row.get("id")) in selected_ids]
|
||||
selected_examples = (
|
||||
self.attach_codex_probe_context(selected_results, prediction_dir)
|
||||
if codex_backend
|
||||
else selected_results
|
||||
)
|
||||
selected_metadata = [
|
||||
{
|
||||
"id": str(item["id"]),
|
||||
"question_preview": str(item.get("question") or "")[:200],
|
||||
"has_context": bool(str(item.get("context") or "").strip()),
|
||||
"n_gold_answers": len(item.get("answers") or []),
|
||||
}
|
||||
for item in selected_items
|
||||
]
|
||||
|
||||
deep_dir = os.path.join(out_dir, "deep_reflect")
|
||||
rollout_dir = os.path.join(deep_dir, "rollout")
|
||||
patches_dir = os.path.join(deep_dir, "patches")
|
||||
os.makedirs(deep_dir, exist_ok=True)
|
||||
print(
|
||||
f" [2b/6 DEEP REFLECT setup] selected={len(selected_items)} "
|
||||
f"mode=no_reference_probe"
|
||||
)
|
||||
probe = generate_deep_probe_instruction(
|
||||
skill_content=skill_content,
|
||||
items=selected_examples,
|
||||
prediction_dir=prediction_dir,
|
||||
system_prompt=self.get_codex_deep_probe_prompt() if codex_backend else self.get_deep_probe_prompt(),
|
||||
step_buffer_context=step_buffer_context,
|
||||
meta_skill_context=meta_skill_context,
|
||||
output_requirements=[
|
||||
"- There is no hidden reference block. Use only the question, provided context, the student's output, and the evaluation result to infer what intermediate state is worth probing.",
|
||||
"- The instruction must explicitly request a short <analysis>...</analysis> block before the final <answer>...</answer>.",
|
||||
"- The readout should focus on likely evidence span, top candidate and runner-up, decisive clue, or a few short intermediate conclusions.",
|
||||
"- Do not ask for exhaustive copying of the context or a full chain-of-thought.",
|
||||
"- The instruction text should be ready to append directly to the student's prompt.",
|
||||
],
|
||||
)
|
||||
if not probe:
|
||||
return []
|
||||
diagnostic_trace_context_by_id = None
|
||||
if codex_backend:
|
||||
selected_items, diagnostic_trace_context_by_id, probe = self.resolve_codex_probe_target(
|
||||
selected_items=selected_items,
|
||||
selected_examples=selected_examples,
|
||||
prediction_dir=prediction_dir,
|
||||
probe=probe,
|
||||
)
|
||||
|
||||
with open(os.path.join(deep_dir, "probe.json"), "w", encoding="utf-8") as f:
|
||||
json.dump(
|
||||
{
|
||||
**probe,
|
||||
"selected_examples": selected_metadata,
|
||||
},
|
||||
f,
|
||||
ensure_ascii=False,
|
||||
indent=2,
|
||||
)
|
||||
|
||||
deep_results = self.rollout(
|
||||
selected_items,
|
||||
skill_content,
|
||||
rollout_dir,
|
||||
diagnostic_mode=True,
|
||||
diagnostic_instruction=probe["probe_instruction"],
|
||||
diagnostic_trace_context_by_id=diagnostic_trace_context_by_id,
|
||||
)
|
||||
return run_minibatch_reflect(
|
||||
results=deep_results,
|
||||
skill_content=skill_content,
|
||||
prediction_dir=os.path.join(rollout_dir, "predictions"),
|
||||
patches_dir=patches_dir,
|
||||
workers=self.analyst_workers,
|
||||
failure_only=self.failure_only,
|
||||
minibatch_size=self.minibatch_size,
|
||||
edit_budget=self.edit_budget,
|
||||
random_seed=random_seed,
|
||||
error_system=self.get_error_minibatch_prompt(),
|
||||
success_system=self.get_success_minibatch_prompt(),
|
||||
step_buffer_context=step_buffer_context,
|
||||
meta_skill_context=meta_skill_context,
|
||||
update_mode=getattr(self, "_cfg", {}).get("skill_update_mode", "patch"),
|
||||
)
|
||||
|
||||
def get_task_types(self) -> list[str]:
|
||||
return ["qa"]
|
||||
|
||||
@@ -16,8 +16,8 @@ import time
|
||||
import traceback
|
||||
from concurrent.futures import FIRST_COMPLETED, ThreadPoolExecutor, wait
|
||||
|
||||
from skillopt.model import chat_student, get_student_backend, is_student_exec_backend
|
||||
from skillopt.model.codex_harness import prepare_workspace, render_skill_md, run_student_exec
|
||||
from skillopt.model import chat_target, get_target_backend, is_target_exec_backend
|
||||
from skillopt.model.codex_harness import prepare_workspace, render_skill_md, run_target_exec
|
||||
from skillopt.prompts import load_prompt
|
||||
from skillopt.envs.searchqa.evaluator import evaluate
|
||||
|
||||
@@ -123,11 +123,11 @@ def _run_codex_once(
|
||||
task_text=task_text,
|
||||
)
|
||||
prompt = (
|
||||
"Use the `skillopt-student` skill available in this workspace.\n"
|
||||
"Use the `skillopt-target` skill available in this workspace.\n"
|
||||
"Read `task.md` and answer the SearchQA question.\n"
|
||||
"Return the final answer inside <answer>...</answer>."
|
||||
)
|
||||
final_message, raw = run_student_exec(
|
||||
final_message, raw = run_target_exec(
|
||||
work_dir=work_dir,
|
||||
prompt=prompt,
|
||||
model=model,
|
||||
@@ -192,7 +192,7 @@ def process_one(
|
||||
pred_dir = os.path.join(out_root, "predictions", item_id)
|
||||
os.makedirs(pred_dir, exist_ok=True)
|
||||
|
||||
if is_student_exec_backend():
|
||||
if is_target_exec_backend():
|
||||
from skillopt.model import azure_openai as _llm
|
||||
|
||||
conversation: list[dict] = []
|
||||
@@ -205,7 +205,7 @@ def process_one(
|
||||
skill_content=skill_content,
|
||||
question=question,
|
||||
context=context,
|
||||
model=_llm.STUDENT_DEPLOYMENT,
|
||||
model=_llm.TARGET_DEPLOYMENT,
|
||||
timeout=exec_timeout,
|
||||
diagnostic_mode=diagnostic_mode if turn == 0 else False,
|
||||
diagnostic_instruction=diagnostic_instruction if turn == 0 else "",
|
||||
@@ -220,9 +220,9 @@ def process_one(
|
||||
result["agent_ok"] = True
|
||||
result["n_turns"] = len(conversation)
|
||||
|
||||
with open(os.path.join(pred_dir, "student_system_prompt.txt"), "w") as f:
|
||||
with open(os.path.join(pred_dir, "target_system_prompt.txt"), "w") as f:
|
||||
f.write(system)
|
||||
with open(os.path.join(pred_dir, "student_user_prompt.txt"), "w") as f:
|
||||
with open(os.path.join(pred_dir, "target_user_prompt.txt"), "w") as f:
|
||||
f.write(user)
|
||||
with open(os.path.join(pred_dir, "conversation.json"), "w") as f:
|
||||
json.dump(conversation, f, ensure_ascii=False, indent=2)
|
||||
@@ -266,7 +266,7 @@ def process_one(
|
||||
|
||||
for turn in range(max_turns):
|
||||
if turn == 0:
|
||||
resp_text, _ = chat_student(
|
||||
resp_text, _ = chat_target(
|
||||
system=system, user=user,
|
||||
max_completion_tokens=512,
|
||||
retries=5, stage="rollout",
|
||||
@@ -279,7 +279,7 @@ def process_one(
|
||||
f"If correct, repeat it. If wrong, provide a corrected answer.\n"
|
||||
f"Use <answer>...</answer> tags for your final answer."
|
||||
)
|
||||
resp_text, _ = chat_student(
|
||||
resp_text, _ = chat_target(
|
||||
system=system, user=refinement,
|
||||
max_completion_tokens=512,
|
||||
retries=5, stage="rollout",
|
||||
@@ -297,9 +297,9 @@ def process_one(
|
||||
result["n_turns"] = len(conversation)
|
||||
|
||||
# Save conversation
|
||||
with open(os.path.join(pred_dir, "student_system_prompt.txt"), "w") as f:
|
||||
with open(os.path.join(pred_dir, "target_system_prompt.txt"), "w") as f:
|
||||
f.write(system)
|
||||
with open(os.path.join(pred_dir, "student_user_prompt.txt"), "w") as f:
|
||||
with open(os.path.join(pred_dir, "target_user_prompt.txt"), "w") as f:
|
||||
f.write(user)
|
||||
with open(os.path.join(pred_dir, "conversation.json"), "w") as f:
|
||||
json.dump(conversation, f, ensure_ascii=False, indent=2)
|
||||
|
||||
@@ -8,7 +8,6 @@ from __future__ import annotations
|
||||
import json
|
||||
import os
|
||||
|
||||
from skillopt.gradient.deep_probe import generate_deep_probe_instruction
|
||||
from skillopt.datasets.base import BatchSpec
|
||||
from skillopt.envs.base import EnvAdapter
|
||||
from skillopt.envs.spreadsheetbench.dataloader import SpreadsheetBenchDataLoader
|
||||
@@ -18,7 +17,7 @@ from skillopt.envs.spreadsheetbench.rollout import (
|
||||
run_spreadsheet_batch_codegen,
|
||||
)
|
||||
from skillopt.gradient.reflect import run_minibatch_reflect
|
||||
from skillopt.model import get_student_backend, is_student_exec_backend
|
||||
from skillopt.model import get_target_backend, is_target_exec_backend
|
||||
|
||||
|
||||
# Task types used for per-category breakdowns
|
||||
@@ -45,11 +44,7 @@ class SpreadsheetBenchAdapter(EnvAdapter):
|
||||
failure_only: bool = False,
|
||||
minibatch_size: int = 8,
|
||||
edit_budget: int = 4,
|
||||
seed: int = 42,
|
||||
use_deep_reflect: bool = False,
|
||||
deep_reflect_failures: int = 4,
|
||||
deep_reflect_successes: int = 2,
|
||||
) -> None:
|
||||
seed: int = 42, ) -> None:
|
||||
self.data_root = data_root
|
||||
self.mode = mode # "single", "multi", or "react"
|
||||
self.max_turns = max_turns
|
||||
@@ -59,9 +54,6 @@ class SpreadsheetBenchAdapter(EnvAdapter):
|
||||
self.failure_only = failure_only
|
||||
self.minibatch_size = minibatch_size
|
||||
self.edit_budget = edit_budget
|
||||
self.use_deep_reflect = use_deep_reflect
|
||||
self.deep_reflect_failures = deep_reflect_failures
|
||||
self.deep_reflect_successes = deep_reflect_successes
|
||||
self.dataloader = SpreadsheetBenchDataLoader(
|
||||
split_dir=split_dir,
|
||||
data_path=data_path,
|
||||
@@ -75,9 +67,9 @@ class SpreadsheetBenchAdapter(EnvAdapter):
|
||||
|
||||
def setup(self, cfg: dict) -> None:
|
||||
super().setup(cfg)
|
||||
if is_student_exec_backend() and self.mode != "single":
|
||||
if is_target_exec_backend() and self.mode != "single":
|
||||
raise NotImplementedError(
|
||||
"Exec student backends are currently supported only for SpreadsheetBench mode=single."
|
||||
"Exec target backends are currently supported only for SpreadsheetBench mode=single."
|
||||
)
|
||||
self.dataloader.setup(cfg)
|
||||
|
||||
@@ -190,120 +182,6 @@ class SpreadsheetBenchAdapter(EnvAdapter):
|
||||
update_mode=getattr(self, "_cfg", {}).get("skill_update_mode", "patch"),
|
||||
)
|
||||
|
||||
def deep_reflect(
|
||||
self,
|
||||
results: list[dict],
|
||||
skill_content: str,
|
||||
out_dir: str,
|
||||
**kwargs,
|
||||
) -> list[dict | None]:
|
||||
if not self.use_deep_reflect:
|
||||
return []
|
||||
|
||||
env_manager = kwargs.get("env_manager")
|
||||
if not isinstance(env_manager, list):
|
||||
return []
|
||||
|
||||
prediction_dir = kwargs.get("prediction_dir", os.path.join(out_dir, "predictions"))
|
||||
random_seed = kwargs.get("random_seed")
|
||||
step_buffer_context = kwargs.get("step_buffer_context", "")
|
||||
meta_skill_context = kwargs.get("meta_skill_context", "")
|
||||
codex_backend = get_student_backend() == "codex_exec"
|
||||
selected_items = self.select_representative_items(
|
||||
results,
|
||||
env_manager,
|
||||
n_failures=self.deep_reflect_failures,
|
||||
n_successes=self.deep_reflect_successes,
|
||||
seed=random_seed,
|
||||
)
|
||||
if not selected_items:
|
||||
return []
|
||||
|
||||
selected_ids = {str(item["id"]) for item in selected_items}
|
||||
selected_results = [row for row in results if str(row.get("id")) in selected_ids]
|
||||
selected_examples = (
|
||||
self.attach_codex_probe_context(selected_results, prediction_dir)
|
||||
if codex_backend
|
||||
else selected_results
|
||||
)
|
||||
selected_metadata = [
|
||||
{
|
||||
"id": str(item["id"]),
|
||||
"instruction_type": str(item.get("instruction_type") or ""),
|
||||
"answer_position": str(item.get("answer_position") or ""),
|
||||
}
|
||||
for item in selected_items
|
||||
]
|
||||
|
||||
deep_dir = os.path.join(out_dir, "deep_reflect")
|
||||
rollout_dir = os.path.join(deep_dir, "rollout")
|
||||
patches_dir = os.path.join(deep_dir, "patches")
|
||||
os.makedirs(deep_dir, exist_ok=True)
|
||||
print(
|
||||
f" [2b/6 DEEP REFLECT setup] selected={len(selected_items)} "
|
||||
f"mode={self.mode}"
|
||||
)
|
||||
probe = generate_deep_probe_instruction(
|
||||
skill_content=skill_content,
|
||||
items=selected_examples,
|
||||
prediction_dir=prediction_dir,
|
||||
system_prompt=self.get_codex_deep_probe_prompt() if codex_backend else self.get_deep_probe_prompt(),
|
||||
step_buffer_context=step_buffer_context,
|
||||
meta_skill_context=meta_skill_context,
|
||||
output_requirements=[
|
||||
"- The instruction must ask for a short structured diagnostic readout before the student writes code or starts tool use.",
|
||||
"- The readout should focus on task family, source/target region, and decisive transformation rule.",
|
||||
"- The student must still complete the original spreadsheet task.",
|
||||
"- Keep the readout concise and avoid exhaustive cell enumeration.",
|
||||
"- The instruction text should be ready to append directly to the student's prompt.",
|
||||
],
|
||||
)
|
||||
if not probe:
|
||||
return []
|
||||
diagnostic_trace_context_by_id = None
|
||||
if codex_backend:
|
||||
selected_items, diagnostic_trace_context_by_id, probe = self.resolve_codex_probe_target(
|
||||
selected_items=selected_items,
|
||||
selected_examples=selected_examples,
|
||||
prediction_dir=prediction_dir,
|
||||
probe=probe,
|
||||
)
|
||||
|
||||
with open(os.path.join(deep_dir, "probe.json"), "w", encoding="utf-8") as f:
|
||||
json.dump(
|
||||
{
|
||||
**probe,
|
||||
"selected_examples": selected_metadata,
|
||||
},
|
||||
f,
|
||||
ensure_ascii=False,
|
||||
indent=2,
|
||||
)
|
||||
|
||||
deep_results = self.rollout(
|
||||
selected_items,
|
||||
skill_content,
|
||||
rollout_dir,
|
||||
diagnostic_mode=True,
|
||||
diagnostic_instruction=probe["probe_instruction"],
|
||||
diagnostic_trace_context_by_id=diagnostic_trace_context_by_id,
|
||||
)
|
||||
return run_minibatch_reflect(
|
||||
results=deep_results,
|
||||
skill_content=skill_content,
|
||||
prediction_dir=os.path.join(rollout_dir, "predictions"),
|
||||
patches_dir=patches_dir,
|
||||
workers=self.analyst_workers,
|
||||
failure_only=self.failure_only,
|
||||
minibatch_size=self.minibatch_size,
|
||||
edit_budget=self.edit_budget,
|
||||
random_seed=random_seed,
|
||||
error_system=self.get_error_minibatch_prompt(),
|
||||
success_system=self.get_success_minibatch_prompt(),
|
||||
step_buffer_context=step_buffer_context,
|
||||
meta_skill_context=meta_skill_context,
|
||||
update_mode=getattr(self, "_cfg", {}).get("skill_update_mode", "patch"),
|
||||
)
|
||||
|
||||
def get_task_types(self) -> list[str]:
|
||||
return list(TASK_TYPES)
|
||||
|
||||
@@ -30,12 +30,12 @@ def _timeout_handler(signum, frame):
|
||||
|
||||
from skillopt.model.azure_openai import (
|
||||
get_reasoning_effort,
|
||||
get_student_client,
|
||||
get_target_client,
|
||||
_needs_responses_api,
|
||||
tracker,
|
||||
)
|
||||
from skillopt.model import get_codex_exec_config, get_student_backend, is_student_exec_backend
|
||||
from skillopt.model.codex_harness import prepare_workspace, render_skill_md, run_student_exec
|
||||
from skillopt.model import get_codex_exec_config, get_target_backend, is_target_exec_backend
|
||||
from skillopt.model.codex_harness import prepare_workspace, render_skill_md, run_target_exec
|
||||
from skillopt.prompts import load_prompt
|
||||
from skillopt.envs.spreadsheetbench.executor import run_generated_code
|
||||
from skillopt.envs.spreadsheetbench.evaluator import evaluate
|
||||
@@ -44,13 +44,13 @@ from skillopt.envs.spreadsheetbench.evaluator import evaluate
|
||||
# ── Eval feedback helper (no golden value leakage) ─────────────────────────
|
||||
|
||||
def _build_eval_feedback(verify_report: str) -> str:
|
||||
"""Build Student feedback from a verify report, hiding expected values.
|
||||
"""Build Target feedback from a verify report, hiding expected values.
|
||||
|
||||
The verify report contains lines like:
|
||||
Sheet1!D2: got=None, expected=0 ✗
|
||||
Sheet1!D10: got=None, expected=None ✓
|
||||
|
||||
We strip the ``expected=...`` part so the Student sees only its own
|
||||
We strip the ``expected=...`` part so the Target sees only its own
|
||||
output and whether each cell is correct or wrong.
|
||||
"""
|
||||
import re
|
||||
@@ -203,7 +203,7 @@ def _llm_call_with_retry(call_fn, *, retries: int = 5, timeout: int = 120):
|
||||
|
||||
def _get_deployment() -> str:
|
||||
from skillopt.model import azure_openai as _llm
|
||||
return _llm.STUDENT_DEPLOYMENT
|
||||
return _llm.TARGET_DEPLOYMENT
|
||||
|
||||
|
||||
def _build_codex_skill(skill_content: str) -> str:
|
||||
@@ -242,7 +242,7 @@ def _build_codex_task(
|
||||
return (
|
||||
f"{prompt}\n\n"
|
||||
"## Codex Harness Task\n"
|
||||
"- Read `.agents/skills/skillopt-student/SKILL.md` before writing code; do not call a Skill tool.\n"
|
||||
"- Read `.agents/skills/skillopt-target/SKILL.md` before writing code; do not call a Skill tool.\n"
|
||||
"- Read and optionally inspect `input.xlsx` in this workspace.\n"
|
||||
"- Write the final Python solution to `solution.py`.\n"
|
||||
"- The script should use the provided `INPUT_PATH` and `OUTPUT_PATH` variables.\n"
|
||||
@@ -296,7 +296,7 @@ def _prepare_codex_workspace(
|
||||
diagnostic_trace_context=diagnostic_trace_context,
|
||||
)
|
||||
prompt = (
|
||||
"Read `.agents/skills/skillopt-student/SKILL.md` directly; do not call a Skill tool.\n"
|
||||
"Read `.agents/skills/skillopt-target/SKILL.md` directly; do not call a Skill tool.\n"
|
||||
"Read `task.md`, inspect `input.xlsx` if useful, and write the final solution to `solution.py`.\n"
|
||||
"You may run `python run_solution.py` to validate the script locally.\n"
|
||||
"In your final response, briefly confirm whether `solution.py` was written and summarize the approach."
|
||||
@@ -319,7 +319,7 @@ def _run_exec_backend(
|
||||
model: str,
|
||||
timeout: int,
|
||||
) -> tuple[str, str]:
|
||||
return run_student_exec(
|
||||
return run_target_exec(
|
||||
work_dir=work_dir,
|
||||
prompt=prompt,
|
||||
model=model,
|
||||
@@ -416,7 +416,7 @@ def run_single(
|
||||
|
||||
Returns ``{"code": str, "raw": str, "n_turns": 1}``.
|
||||
"""
|
||||
if is_student_exec_backend():
|
||||
if is_target_exec_backend():
|
||||
deadline = time.time() + task_timeout
|
||||
deployment = _get_deployment()
|
||||
work_dir, skill_md, task_md, prompt = _prepare_codex_workspace(
|
||||
@@ -449,12 +449,12 @@ def run_single(
|
||||
"raw": raw or final_message,
|
||||
"n_turns": 1,
|
||||
"conversation": [{"role": "assistant", "content": final_message or raw}],
|
||||
"student_system_prompt": skill_md,
|
||||
"student_user_prompt": f"{prompt}\n\n## Task File\n\n{task_md}",
|
||||
"target_system_prompt": skill_md,
|
||||
"target_user_prompt": f"{prompt}\n\n## Task File\n\n{task_md}",
|
||||
}
|
||||
|
||||
deadline = time.time() + task_timeout
|
||||
client = get_student_client()
|
||||
client = get_target_client()
|
||||
deployment = _get_deployment()
|
||||
system = _build_system(skill_content)
|
||||
user = _build_user(
|
||||
@@ -483,8 +483,8 @@ def run_single(
|
||||
"raw": raw,
|
||||
"n_turns": 1,
|
||||
"conversation": [{"role": "assistant", "content": raw}],
|
||||
"student_system_prompt": system,
|
||||
"student_user_prompt": user,
|
||||
"target_system_prompt": system,
|
||||
"target_user_prompt": user,
|
||||
}
|
||||
|
||||
|
||||
@@ -520,7 +520,7 @@ def run_multi(
|
||||
|
||||
Returns ``{"code": str, "raw": str, "n_turns": int, "conversation": [...]}``.
|
||||
"""
|
||||
if is_student_exec_backend():
|
||||
if is_target_exec_backend():
|
||||
deadline = time.time() + task_timeout
|
||||
deployment = _get_deployment()
|
||||
work_dir, skill_md, task_md, initial_prompt = _prepare_codex_workspace(
|
||||
@@ -613,12 +613,12 @@ def run_multi(
|
||||
"raw": raw or final_message,
|
||||
"n_turns": len([m for m in conversation if m["role"] == "assistant"]),
|
||||
"conversation": conversation,
|
||||
"student_system_prompt": skill_md,
|
||||
"student_user_prompt": f"{initial_prompt}\n\n## Task File\n\n{task_md}",
|
||||
"target_system_prompt": skill_md,
|
||||
"target_user_prompt": f"{initial_prompt}\n\n## Task File\n\n{task_md}",
|
||||
}
|
||||
|
||||
deadline = time.time() + task_timeout
|
||||
client = get_student_client()
|
||||
client = get_target_client()
|
||||
deployment = _get_deployment()
|
||||
system = _build_system(skill_content)
|
||||
user = _build_user(
|
||||
@@ -699,6 +699,6 @@ def run_multi(
|
||||
"raw": raw,
|
||||
"n_turns": turn + 1,
|
||||
"conversation": conversation,
|
||||
"student_system_prompt": system,
|
||||
"student_user_prompt": user,
|
||||
"target_system_prompt": system,
|
||||
"target_user_prompt": user,
|
||||
}
|
||||
|
||||
@@ -11,7 +11,7 @@ import json
|
||||
import os
|
||||
import subprocess
|
||||
|
||||
from skillopt.model import chat_student_messages
|
||||
from skillopt.model import chat_target_messages
|
||||
from skillopt.prompts import load_prompt
|
||||
|
||||
# ── Tool schemas ─────────────────────────────────────────────────────────────
|
||||
@@ -298,7 +298,7 @@ def _react_loop(
|
||||
n_turns = 0
|
||||
|
||||
for _ in range(max_turns):
|
||||
message, _ = chat_student_messages(
|
||||
message, _ = chat_target_messages(
|
||||
messages=messages,
|
||||
tools=[BASH_TOOL_CHAT, WRITE_FILE_TOOL_CHAT],
|
||||
tool_choice="auto",
|
||||
@@ -390,6 +390,6 @@ def run_react(
|
||||
diagnostic_trace_context=diagnostic_trace_context,
|
||||
)
|
||||
result = _react_loop(system, user, work_dir, max_turns, max_output_tokens)
|
||||
result["student_system_prompt"] = system
|
||||
result["student_user_prompt"] = user
|
||||
result["target_system_prompt"] = system
|
||||
result["target_user_prompt"] = user
|
||||
return result
|
||||
|
||||
@@ -233,37 +233,37 @@ def process_one(
|
||||
|
||||
no1, ip1, _ = cases[0]
|
||||
pred_path_1 = os.path.join(task_out_dir, f"{no1}_pred.xlsx")
|
||||
student_prompt_parts = [
|
||||
target_prompt_parts = [
|
||||
f"# Instruction\n{instruction}",
|
||||
f"# Input file\n{ip1}",
|
||||
f"# Output file\n{pred_path_1}",
|
||||
]
|
||||
if instruction_type:
|
||||
student_prompt_parts.append(f"# Instruction type\n{instruction_type}")
|
||||
target_prompt_parts.append(f"# Instruction type\n{instruction_type}")
|
||||
if answer_position_eval:
|
||||
student_prompt_parts.append(f"# Answer position\n{answer_position_eval}")
|
||||
target_prompt_parts.append(f"# Answer position\n{answer_position_eval}")
|
||||
if diagnostic_trace_context.strip():
|
||||
student_prompt_parts.insert(
|
||||
target_prompt_parts.insert(
|
||||
0,
|
||||
"# Previous Codex Trace Snapshot\n"
|
||||
"This is a partial transcript from an earlier attempt. Use it as your current reasoning context.\n\n"
|
||||
f"{diagnostic_trace_context.strip()}",
|
||||
)
|
||||
if diagnostic_mode and diagnostic_instruction.strip():
|
||||
student_prompt_parts.append(f"# Training readout\n{diagnostic_instruction.strip()}")
|
||||
student_user_prompt = "\n\n".join(student_prompt_parts)
|
||||
target_prompt_parts.append(f"# Training readout\n{diagnostic_instruction.strip()}")
|
||||
target_user_prompt = "\n\n".join(target_prompt_parts)
|
||||
try:
|
||||
from skillopt.envs.spreadsheetbench.react_agent import _build_system
|
||||
student_system_prompt = _build_system(skill_content)
|
||||
target_system_prompt = _build_system(skill_content)
|
||||
except Exception:
|
||||
student_system_prompt = ""
|
||||
if student_system_prompt:
|
||||
with open(os.path.join(task_out_dir, "student_system_prompt.txt"), "w") as f:
|
||||
f.write(student_system_prompt)
|
||||
result["student_system_prompt"] = student_system_prompt
|
||||
with open(os.path.join(task_out_dir, "student_user_prompt.txt"), "w") as f:
|
||||
f.write(student_user_prompt)
|
||||
result["student_user_prompt"] = student_user_prompt
|
||||
target_system_prompt = ""
|
||||
if target_system_prompt:
|
||||
with open(os.path.join(task_out_dir, "target_system_prompt.txt"), "w") as f:
|
||||
f.write(target_system_prompt)
|
||||
result["target_system_prompt"] = target_system_prompt
|
||||
with open(os.path.join(task_out_dir, "target_user_prompt.txt"), "w") as f:
|
||||
f.write(target_user_prompt)
|
||||
result["target_user_prompt"] = target_user_prompt
|
||||
|
||||
# ── Stage 1: run ReAct agent on test case 1 ─────────────────────
|
||||
result["phase"] = "agent"
|
||||
@@ -288,14 +288,14 @@ def process_one(
|
||||
diagnostic_trace_context=diagnostic_trace_context,
|
||||
)
|
||||
result["n_turns"] = agent_result.get("n_turns", 0)
|
||||
if agent_result.get("student_system_prompt"):
|
||||
with open(os.path.join(task_out_dir, "student_system_prompt.txt"), "w") as f:
|
||||
f.write(agent_result["student_system_prompt"])
|
||||
result["student_system_prompt"] = agent_result["student_system_prompt"]
|
||||
if agent_result.get("student_user_prompt"):
|
||||
with open(os.path.join(task_out_dir, "student_user_prompt.txt"), "w") as f:
|
||||
f.write(agent_result["student_user_prompt"])
|
||||
result["student_user_prompt"] = agent_result["student_user_prompt"]
|
||||
if agent_result.get("target_system_prompt"):
|
||||
with open(os.path.join(task_out_dir, "target_system_prompt.txt"), "w") as f:
|
||||
f.write(agent_result["target_system_prompt"])
|
||||
result["target_system_prompt"] = agent_result["target_system_prompt"]
|
||||
if agent_result.get("target_user_prompt"):
|
||||
with open(os.path.join(task_out_dir, "target_user_prompt.txt"), "w") as f:
|
||||
f.write(agent_result["target_user_prompt"])
|
||||
result["target_user_prompt"] = agent_result["target_user_prompt"]
|
||||
|
||||
# Save conversation log
|
||||
with open(os.path.join(task_out_dir, "conversation.json"), "w") as f:
|
||||
@@ -606,7 +606,7 @@ def process_one_codegen(
|
||||
task_out_dir = os.path.join(out_root, "predictions", task_id)
|
||||
os.makedirs(task_out_dir, exist_ok=True)
|
||||
|
||||
# ── Save context for Teacher (Reflect stage) ──────────────────
|
||||
# ── Save context for Optimizer (Reflect stage) ──────────────────
|
||||
from skillopt.envs.spreadsheetbench.codegen_agent import (
|
||||
_preview_workbook, _build_system, _build_user,
|
||||
)
|
||||
@@ -615,8 +615,8 @@ def process_one_codegen(
|
||||
preview_text = _preview_workbook(first_input_for_preview)
|
||||
except Exception:
|
||||
preview_text = "(preview failed)"
|
||||
student_system = _build_system(skill_content)
|
||||
student_user = _build_user(
|
||||
target_system = _build_system(skill_content)
|
||||
target_user = _build_user(
|
||||
instruction,
|
||||
first_input_for_preview,
|
||||
instruction_type,
|
||||
@@ -628,14 +628,14 @@ def process_one_codegen(
|
||||
|
||||
with open(os.path.join(task_out_dir, "spreadsheet_preview.txt"), "w") as f:
|
||||
f.write(preview_text)
|
||||
with open(os.path.join(task_out_dir, "student_system_prompt.txt"), "w") as f:
|
||||
f.write(student_system)
|
||||
with open(os.path.join(task_out_dir, "student_user_prompt.txt"), "w") as f:
|
||||
f.write(student_user)
|
||||
with open(os.path.join(task_out_dir, "target_system_prompt.txt"), "w") as f:
|
||||
f.write(target_system)
|
||||
with open(os.path.join(task_out_dir, "target_user_prompt.txt"), "w") as f:
|
||||
f.write(target_user)
|
||||
|
||||
result["spreadsheet_preview"] = preview_text
|
||||
result["student_system_prompt"] = student_system
|
||||
result["student_user_prompt"] = student_user
|
||||
result["target_system_prompt"] = target_system
|
||||
result["target_user_prompt"] = target_user
|
||||
|
||||
# ── LLM phase ──────────────────────────────────────────────────
|
||||
result["phase"] = "llm"
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
"""ReflACT Gradient -- trajectory analysis and patch generation.
|
||||
"""SkillOpt Gradient -- trajectory analysis and patch generation.
|
||||
|
||||
Analogous to gradient computation in neural network training: analyzes
|
||||
minibatch rollout trajectories to produce skill-edit patches (the "gradient"
|
||||
@@ -8,10 +8,8 @@ Modules
|
||||
-------
|
||||
- reflect: minibatch trajectory analysis (gradient computation)
|
||||
- aggregate: hierarchical patch merging (gradient aggregation)
|
||||
- deep_probe: diagnostic probe generation (gradient probing)
|
||||
"""
|
||||
from skillopt.gradient.reflect import ( # noqa: F401
|
||||
run_minibatch_reflect,
|
||||
)
|
||||
from skillopt.gradient.aggregate import merge_patches # noqa: F401
|
||||
from skillopt.gradient.deep_probe import generate_deep_probe_instruction # noqa: F401
|
||||
|
||||
@@ -9,7 +9,7 @@ from __future__ import annotations
|
||||
import json
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
|
||||
from skillopt.model import chat_teacher
|
||||
from skillopt.model import chat_optimizer
|
||||
from skillopt.optimizer.meta_skill import format_meta_skill_context
|
||||
from skillopt.optimizer.update_modes import (
|
||||
get_payload_items,
|
||||
@@ -33,17 +33,17 @@ def _merge_batch(
|
||||
meta_skill_context: str = "",
|
||||
level: int = 1,
|
||||
) -> dict:
|
||||
"""Call teacher LLM to merge a batch of patches into one."""
|
||||
"""Call optimizer LLM to merge a batch of patches into one."""
|
||||
patches_text = json.dumps(patches, ensure_ascii=False, indent=2)
|
||||
user = (
|
||||
f"## Current Skill\n{skill_content}\n\n"
|
||||
f"## Patches to merge ({len(patches)} total, merge level {level})\n{patches_text}"
|
||||
)
|
||||
teacher_ctx = format_meta_skill_context(meta_skill_context)
|
||||
if teacher_ctx:
|
||||
user = f"{teacher_ctx}\n\n{user}"
|
||||
optimizer_ctx = format_meta_skill_context(meta_skill_context)
|
||||
if optimizer_ctx:
|
||||
user = f"{optimizer_ctx}\n\n{user}"
|
||||
try:
|
||||
response, _ = chat_teacher(
|
||||
response, _ = chat_optimizer(
|
||||
system=system_prompt,
|
||||
user=user,
|
||||
max_completion_tokens=64000 if is_full_rewrite_minibatch_mode(update_mode) else 4096,
|
||||
@@ -224,11 +224,11 @@ def merge_patches(
|
||||
f"{len(s_edits)} edits\n\n"
|
||||
f"{combined_text}"
|
||||
)
|
||||
teacher_ctx = format_meta_skill_context(meta_skill_context)
|
||||
if teacher_ctx:
|
||||
user = f"{teacher_ctx}\n\n{user}"
|
||||
optimizer_ctx = format_meta_skill_context(meta_skill_context)
|
||||
if optimizer_ctx:
|
||||
user = f"{optimizer_ctx}\n\n{user}"
|
||||
try:
|
||||
response, _ = chat_teacher(
|
||||
response, _ = chat_optimizer(
|
||||
system=merge_final_prompt,
|
||||
user=user,
|
||||
max_completion_tokens=64000 if is_full_rewrite_minibatch_mode(update_mode) else 4096,
|
||||
|
||||
@@ -15,8 +15,8 @@ Public API
|
||||
----------
|
||||
- :func:`fmt_trajectory` -- format one conversation into text
|
||||
- :func:`fmt_minibatch_trajectories` -- format multiple trajectories for batch analysis
|
||||
- :func:`run_error_analyst_minibatch` -- one teacher call for a group of failures
|
||||
- :func:`run_success_analyst_minibatch` -- one teacher call for a group of successes
|
||||
- :func:`run_error_analyst_minibatch` -- one optimizer call for a group of failures
|
||||
- :func:`run_success_analyst_minibatch` -- one optimizer call for a group of successes
|
||||
- :func:`run_minibatch_reflect` -- full reflect stage dispatcher
|
||||
"""
|
||||
from __future__ import annotations
|
||||
@@ -27,7 +27,7 @@ import random
|
||||
import traceback
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
|
||||
from skillopt.model import chat_teacher
|
||||
from skillopt.model import chat_optimizer
|
||||
from skillopt.optimizer.meta_skill import format_meta_skill_context
|
||||
from skillopt.optimizer.update_modes import (
|
||||
get_payload_items,
|
||||
@@ -115,7 +115,7 @@ def fmt_minibatch_trajectories(
|
||||
``"task_type"``, ``"fail_reason"``, etc. Reads ``conversation.json``
|
||||
for each and formats them together with trajectory headers.
|
||||
|
||||
If available, includes the spreadsheet preview and student system prompt
|
||||
If available, includes the spreadsheet preview and target system prompt
|
||||
so the analyst can see what the agent saw.
|
||||
|
||||
Parameters
|
||||
@@ -160,32 +160,32 @@ def fmt_minibatch_trajectories(
|
||||
f"{reference_text[:4000]}\n"
|
||||
)
|
||||
|
||||
# ── Append student context (what the agent saw) ──────────────
|
||||
student_prompt = item.get("student_system_prompt", "")
|
||||
if not student_prompt:
|
||||
prompt_path = os.path.join(prediction_dir, tid, "student_system_prompt.txt")
|
||||
# ── Append target context (what the agent saw) ──────────────
|
||||
target_prompt = item.get("target_system_prompt", "")
|
||||
if not target_prompt:
|
||||
prompt_path = os.path.join(prediction_dir, tid, "target_system_prompt.txt")
|
||||
if os.path.exists(prompt_path):
|
||||
with open(prompt_path) as f:
|
||||
student_prompt = f.read()
|
||||
if student_prompt:
|
||||
target_prompt = f.read()
|
||||
if target_prompt:
|
||||
header += (
|
||||
f"\n#### Student System Prompt\n"
|
||||
f"{student_prompt[:3000]}\n"
|
||||
f"\n#### Target System Prompt\n"
|
||||
f"{target_prompt[:3000]}\n"
|
||||
)
|
||||
|
||||
user_prompt = item.get("student_user_prompt", "")
|
||||
user_prompt = item.get("target_user_prompt", "")
|
||||
if not user_prompt:
|
||||
user_prompt_path = os.path.join(prediction_dir, tid, "student_user_prompt.txt")
|
||||
user_prompt_path = os.path.join(prediction_dir, tid, "target_user_prompt.txt")
|
||||
if os.path.exists(user_prompt_path):
|
||||
with open(user_prompt_path) as f:
|
||||
user_prompt = f.read()
|
||||
if user_prompt:
|
||||
header += (
|
||||
f"\n#### Student User Prompt\n"
|
||||
f"\n#### Target User Prompt\n"
|
||||
f"{user_prompt[:3000]}\n"
|
||||
)
|
||||
|
||||
if os.environ.get("REFLACT_CODEX_TRACE_TO_TEACHER", "0") == "1":
|
||||
if os.environ.get("REFLACT_CODEX_TRACE_TO_OPTIMIZER", "0") == "1":
|
||||
codex_trace_summary = item.get("codex_trace_summary", "")
|
||||
if not codex_trace_summary:
|
||||
codex_trace_summary_path = os.path.join(prediction_dir, tid, "codex_trace_summary.txt")
|
||||
@@ -262,7 +262,7 @@ def run_error_analyst_minibatch(
|
||||
meta_skill_context: str = "",
|
||||
update_mode: str = "patch",
|
||||
) -> dict | None:
|
||||
"""Analyze a minibatch of failed trajectories in one teacher call.
|
||||
"""Analyze a minibatch of failed trajectories in one optimizer call.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
@@ -315,13 +315,13 @@ def run_error_analyst_minibatch(
|
||||
ctx = f"{ctx}\n{trajectory_memory_context}" if ctx else trajectory_memory_context
|
||||
if ctx.strip():
|
||||
user += f"## Previous Steps in This Epoch\n{ctx}\n\n"
|
||||
teacher_ctx = format_meta_skill_context(meta_skill_context)
|
||||
if teacher_ctx:
|
||||
user += teacher_ctx + "\n\n"
|
||||
optimizer_ctx = format_meta_skill_context(meta_skill_context)
|
||||
if optimizer_ctx:
|
||||
user += optimizer_ctx + "\n\n"
|
||||
user += f"## Failed Trajectories ({len(items)} total)\n{trajectories_text}"
|
||||
|
||||
try:
|
||||
response, _ = chat_teacher(
|
||||
response, _ = chat_optimizer(
|
||||
system=actual_system, user=user,
|
||||
max_completion_tokens=64000 if is_full_rewrite_minibatch_mode(mode) else 4096,
|
||||
retries=3,
|
||||
@@ -350,7 +350,7 @@ def run_success_analyst_minibatch(
|
||||
meta_skill_context: str = "",
|
||||
update_mode: str = "patch",
|
||||
) -> dict | None:
|
||||
"""Analyze a minibatch of successful trajectories in one teacher call.
|
||||
"""Analyze a minibatch of successful trajectories in one optimizer call.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
@@ -390,13 +390,13 @@ def run_success_analyst_minibatch(
|
||||
ctx = step_buffer_context or trajectory_memory_context or ""
|
||||
if ctx.strip():
|
||||
user += f"## Previous Steps in This Epoch\n{ctx}\n\n"
|
||||
teacher_ctx = format_meta_skill_context(meta_skill_context)
|
||||
if teacher_ctx:
|
||||
user += teacher_ctx + "\n\n"
|
||||
optimizer_ctx = format_meta_skill_context(meta_skill_context)
|
||||
if optimizer_ctx:
|
||||
user += optimizer_ctx + "\n\n"
|
||||
user += f"## Successful Trajectories ({len(items)} total)\n{trajectories_text}"
|
||||
|
||||
try:
|
||||
response, _ = chat_teacher(
|
||||
response, _ = chat_optimizer(
|
||||
system=actual_system, user=user,
|
||||
max_completion_tokens=64000 if is_full_rewrite_minibatch_mode(mode) else 4096,
|
||||
retries=3,
|
||||
@@ -454,7 +454,7 @@ def run_minibatch_reflect(
|
||||
meta_skill_context: str = "",
|
||||
update_mode: str = "patch",
|
||||
) -> list[dict | None]:
|
||||
"""Full minibatch reflect stage: group → parallel teacher calls → patches.
|
||||
"""Full minibatch reflect stage: group → parallel optimizer calls → patches.
|
||||
|
||||
Separates failure and success trajectories, splits each into minibatches
|
||||
of size M, runs all minibatches in parallel, and saves patch files.
|
||||
@@ -470,7 +470,7 @@ def run_minibatch_reflect(
|
||||
patches_dir : str
|
||||
Path to save per-minibatch patch JSON files.
|
||||
workers : int
|
||||
Max parallel teacher calls.
|
||||
Max parallel optimizer calls.
|
||||
failure_only : bool
|
||||
If True, skip success trajectories.
|
||||
minibatch_size : int
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
"""ReflACT model API with runtime backend selection for the student path."""
|
||||
"""ReflACT model API with runtime backend selection for the target path."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
@@ -12,73 +12,73 @@ from skillopt.model.backend_config import ( # noqa: F401
|
||||
configure_codex_exec,
|
||||
get_claude_code_exec_config,
|
||||
get_codex_exec_config,
|
||||
get_student_backend,
|
||||
get_teacher_backend,
|
||||
is_student_chat_backend,
|
||||
is_student_exec_backend,
|
||||
is_teacher_chat_backend,
|
||||
set_student_backend,
|
||||
set_teacher_backend,
|
||||
get_target_backend,
|
||||
get_optimizer_backend,
|
||||
is_target_chat_backend,
|
||||
is_target_exec_backend,
|
||||
is_optimizer_chat_backend,
|
||||
set_target_backend,
|
||||
set_optimizer_backend,
|
||||
)
|
||||
|
||||
|
||||
def set_backend(name: str | None) -> str:
|
||||
"""Backward-compatible global backend setter.
|
||||
|
||||
Historically the codebase used one shared backend for both teacher and
|
||||
student. Keep that entry point so older scripts continue to work, while
|
||||
mapping it onto the split teacher/student backend model.
|
||||
Historically the codebase used one shared backend for both optimizer and
|
||||
target. Keep that entry point so older scripts continue to work, while
|
||||
mapping it onto the split optimizer/target backend model.
|
||||
"""
|
||||
normalized = str(name or "azure_openai").strip().lower()
|
||||
if normalized in {"azure_openai", "openai_chat", "azure", "azure-openai"}:
|
||||
set_teacher_backend("openai_chat")
|
||||
set_student_backend("openai_chat")
|
||||
set_optimizer_backend("openai_chat")
|
||||
set_target_backend("openai_chat")
|
||||
return "azure_openai"
|
||||
if normalized in {"claude", "claude_chat", "anthropic"}:
|
||||
set_teacher_backend("claude_chat")
|
||||
set_student_backend("claude_chat")
|
||||
set_optimizer_backend("claude_chat")
|
||||
set_target_backend("claude_chat")
|
||||
return "claude_chat"
|
||||
if normalized == "codex":
|
||||
set_teacher_backend("openai_chat")
|
||||
set_student_backend("codex_exec")
|
||||
set_optimizer_backend("openai_chat")
|
||||
set_target_backend("codex_exec")
|
||||
return "codex"
|
||||
if normalized in {"codex_exec", "claude_code_exec"}:
|
||||
set_teacher_backend("openai_chat")
|
||||
set_student_backend(normalized)
|
||||
set_optimizer_backend("openai_chat")
|
||||
set_target_backend(normalized)
|
||||
return normalized
|
||||
if normalized in {"qwen", "qwen_chat"}:
|
||||
set_teacher_backend("openai_chat")
|
||||
set_student_backend("qwen_chat")
|
||||
set_optimizer_backend("openai_chat")
|
||||
set_target_backend("qwen_chat")
|
||||
return "qwen_chat"
|
||||
raise ValueError(f"Unsupported legacy backend: {name!r}")
|
||||
|
||||
|
||||
def get_backend_name() -> str:
|
||||
"""Best-effort backward-compatible backend summary."""
|
||||
teacher = get_teacher_backend()
|
||||
student = get_student_backend()
|
||||
if teacher == "claude_chat" and student == "claude_chat":
|
||||
optimizer = get_optimizer_backend()
|
||||
target = get_target_backend()
|
||||
if optimizer == "claude_chat" and target == "claude_chat":
|
||||
return "claude_chat"
|
||||
if teacher == "openai_chat" and student == "openai_chat":
|
||||
if optimizer == "openai_chat" and target == "openai_chat":
|
||||
return "azure_openai"
|
||||
if teacher == "openai_chat" and student == "codex_exec":
|
||||
if optimizer == "openai_chat" and target == "codex_exec":
|
||||
return "codex"
|
||||
if teacher == "openai_chat" and student == "qwen_chat":
|
||||
if optimizer == "openai_chat" and target == "qwen_chat":
|
||||
return "qwen_chat"
|
||||
return f"{teacher}+{student}"
|
||||
return f"{optimizer}+{target}"
|
||||
|
||||
|
||||
def chat_teacher(
|
||||
def chat_optimizer(
|
||||
system: str,
|
||||
user: str,
|
||||
max_completion_tokens: int = 16384,
|
||||
retries: int = 5,
|
||||
stage: str = "teacher",
|
||||
stage: str = "optimizer",
|
||||
reasoning_effort: str | None = None,
|
||||
timeout: int | None = None,
|
||||
) -> tuple[str, dict]:
|
||||
if get_teacher_backend() == "claude_chat":
|
||||
return _claude.chat_teacher(
|
||||
if get_optimizer_backend() == "claude_chat":
|
||||
return _claude.chat_optimizer(
|
||||
system=system,
|
||||
user=user,
|
||||
max_completion_tokens=max_completion_tokens,
|
||||
@@ -86,7 +86,7 @@ def chat_teacher(
|
||||
stage=stage,
|
||||
timeout=timeout,
|
||||
)
|
||||
return _openai.chat_teacher(
|
||||
return _openai.chat_optimizer(
|
||||
system=system,
|
||||
user=user,
|
||||
max_completion_tokens=max_completion_tokens,
|
||||
@@ -97,17 +97,17 @@ def chat_teacher(
|
||||
)
|
||||
|
||||
|
||||
def chat_student(
|
||||
def chat_target(
|
||||
system: str,
|
||||
user: str,
|
||||
max_completion_tokens: int = 16384,
|
||||
retries: int = 5,
|
||||
stage: str = "student",
|
||||
stage: str = "target",
|
||||
reasoning_effort: str | None = None,
|
||||
timeout: int | None = None,
|
||||
) -> tuple[str, dict]:
|
||||
if get_student_backend() == "claude_chat":
|
||||
return _claude.chat_student(
|
||||
if get_target_backend() == "claude_chat":
|
||||
return _claude.chat_target(
|
||||
system=system,
|
||||
user=user,
|
||||
max_completion_tokens=max_completion_tokens,
|
||||
@@ -115,8 +115,8 @@ def chat_student(
|
||||
stage=stage,
|
||||
timeout=timeout,
|
||||
)
|
||||
if get_student_backend() == "qwen_chat":
|
||||
return _qwen.chat_student(
|
||||
if get_target_backend() == "qwen_chat":
|
||||
return _qwen.chat_target(
|
||||
system=system,
|
||||
user=user,
|
||||
max_completion_tokens=max_completion_tokens,
|
||||
@@ -124,12 +124,12 @@ def chat_student(
|
||||
stage=stage,
|
||||
reasoning_effort=reasoning_effort,
|
||||
)
|
||||
if not is_student_chat_backend():
|
||||
if not is_target_chat_backend():
|
||||
raise NotImplementedError(
|
||||
"chat_student is only supported with student_backend=openai_chat, claude_chat, or qwen_chat. "
|
||||
"chat_target is only supported with target_backend=openai_chat, claude_chat, or qwen_chat. "
|
||||
"Exec backends are handled in environment-specific rollout code."
|
||||
)
|
||||
return _openai.chat_student(
|
||||
return _openai.chat_target(
|
||||
system=system,
|
||||
user=user,
|
||||
max_completion_tokens=max_completion_tokens,
|
||||
@@ -140,11 +140,11 @@ def chat_student(
|
||||
)
|
||||
|
||||
|
||||
def chat_teacher_messages(
|
||||
def chat_optimizer_messages(
|
||||
messages: list[dict[str, Any]],
|
||||
max_completion_tokens: int = 16384,
|
||||
retries: int = 5,
|
||||
stage: str = "teacher",
|
||||
stage: str = "optimizer",
|
||||
reasoning_effort: str | None = None,
|
||||
*,
|
||||
tools: list[dict[str, Any]] | None = None,
|
||||
@@ -152,8 +152,8 @@ def chat_teacher_messages(
|
||||
return_message: bool = False,
|
||||
timeout: int | None = None,
|
||||
) -> tuple[Any, dict]:
|
||||
if get_teacher_backend() == "claude_chat":
|
||||
return _claude.chat_teacher_messages(
|
||||
if get_optimizer_backend() == "claude_chat":
|
||||
return _claude.chat_optimizer_messages(
|
||||
messages=messages,
|
||||
max_completion_tokens=max_completion_tokens,
|
||||
retries=retries,
|
||||
@@ -163,7 +163,7 @@ def chat_teacher_messages(
|
||||
return_message=return_message,
|
||||
timeout=timeout,
|
||||
)
|
||||
return _openai.chat_teacher_messages(
|
||||
return _openai.chat_optimizer_messages(
|
||||
messages=messages,
|
||||
max_completion_tokens=max_completion_tokens,
|
||||
retries=retries,
|
||||
@@ -176,11 +176,11 @@ def chat_teacher_messages(
|
||||
)
|
||||
|
||||
|
||||
def chat_student_messages(
|
||||
def chat_target_messages(
|
||||
messages: list[dict[str, Any]],
|
||||
max_completion_tokens: int = 16384,
|
||||
retries: int = 5,
|
||||
stage: str = "student",
|
||||
stage: str = "target",
|
||||
reasoning_effort: str | None = None,
|
||||
*,
|
||||
tools: list[dict[str, Any]] | None = None,
|
||||
@@ -188,8 +188,8 @@ def chat_student_messages(
|
||||
return_message: bool = False,
|
||||
timeout: int | None = None,
|
||||
) -> tuple[Any, dict]:
|
||||
if get_student_backend() == "claude_chat":
|
||||
return _claude.chat_student_messages(
|
||||
if get_target_backend() == "claude_chat":
|
||||
return _claude.chat_target_messages(
|
||||
messages=messages,
|
||||
max_completion_tokens=max_completion_tokens,
|
||||
retries=retries,
|
||||
@@ -199,8 +199,8 @@ def chat_student_messages(
|
||||
return_message=return_message,
|
||||
timeout=timeout,
|
||||
)
|
||||
if get_student_backend() == "qwen_chat":
|
||||
return _qwen.chat_student_messages(
|
||||
if get_target_backend() == "qwen_chat":
|
||||
return _qwen.chat_target_messages(
|
||||
messages=messages,
|
||||
max_completion_tokens=max_completion_tokens,
|
||||
retries=retries,
|
||||
@@ -210,12 +210,12 @@ def chat_student_messages(
|
||||
tool_choice=tool_choice,
|
||||
return_message=return_message,
|
||||
)
|
||||
if not is_student_chat_backend():
|
||||
if not is_target_chat_backend():
|
||||
raise NotImplementedError(
|
||||
"chat_student_messages is only supported with student_backend=openai_chat, claude_chat, or qwen_chat. "
|
||||
"chat_target_messages is only supported with target_backend=openai_chat, claude_chat, or qwen_chat. "
|
||||
"Exec backends are handled in environment-specific rollout code."
|
||||
)
|
||||
return _openai.chat_student_messages(
|
||||
return _openai.chat_target_messages(
|
||||
messages=messages,
|
||||
max_completion_tokens=max_completion_tokens,
|
||||
retries=retries,
|
||||
@@ -332,18 +332,18 @@ def configure_azure_openai(
|
||||
auth_mode: str | None = None,
|
||||
ad_scope: str | None = None,
|
||||
managed_identity_client_id: str | None = None,
|
||||
teacher_endpoint: str | None = None,
|
||||
teacher_api_version: str | None = None,
|
||||
teacher_api_key: str | None = None,
|
||||
teacher_auth_mode: str | None = None,
|
||||
teacher_ad_scope: str | None = None,
|
||||
teacher_managed_identity_client_id: str | None = None,
|
||||
student_endpoint: str | None = None,
|
||||
student_api_version: str | None = None,
|
||||
student_api_key: str | None = None,
|
||||
student_auth_mode: str | None = None,
|
||||
student_ad_scope: str | None = None,
|
||||
student_managed_identity_client_id: str | None = None,
|
||||
optimizer_endpoint: str | None = None,
|
||||
optimizer_api_version: str | None = None,
|
||||
optimizer_api_key: str | None = None,
|
||||
optimizer_auth_mode: str | None = None,
|
||||
optimizer_ad_scope: str | None = None,
|
||||
optimizer_managed_identity_client_id: str | None = None,
|
||||
target_endpoint: str | None = None,
|
||||
target_api_version: str | None = None,
|
||||
target_api_key: str | None = None,
|
||||
target_auth_mode: str | None = None,
|
||||
target_ad_scope: str | None = None,
|
||||
target_managed_identity_client_id: str | None = None,
|
||||
) -> None:
|
||||
_openai.configure_azure_openai(
|
||||
endpoint=endpoint,
|
||||
@@ -352,18 +352,18 @@ def configure_azure_openai(
|
||||
auth_mode=auth_mode,
|
||||
ad_scope=ad_scope,
|
||||
managed_identity_client_id=managed_identity_client_id,
|
||||
teacher_endpoint=teacher_endpoint,
|
||||
teacher_api_version=teacher_api_version,
|
||||
teacher_api_key=teacher_api_key,
|
||||
teacher_auth_mode=teacher_auth_mode,
|
||||
teacher_ad_scope=teacher_ad_scope,
|
||||
teacher_managed_identity_client_id=teacher_managed_identity_client_id,
|
||||
student_endpoint=student_endpoint,
|
||||
student_api_version=student_api_version,
|
||||
student_api_key=student_api_key,
|
||||
student_auth_mode=student_auth_mode,
|
||||
student_ad_scope=student_ad_scope,
|
||||
student_managed_identity_client_id=student_managed_identity_client_id,
|
||||
optimizer_endpoint=optimizer_endpoint,
|
||||
optimizer_api_version=optimizer_api_version,
|
||||
optimizer_api_key=optimizer_api_key,
|
||||
optimizer_auth_mode=optimizer_auth_mode,
|
||||
optimizer_ad_scope=optimizer_ad_scope,
|
||||
optimizer_managed_identity_client_id=optimizer_managed_identity_client_id,
|
||||
target_endpoint=target_endpoint,
|
||||
target_api_version=target_api_version,
|
||||
target_api_key=target_api_key,
|
||||
target_auth_mode=target_auth_mode,
|
||||
target_ad_scope=target_ad_scope,
|
||||
target_managed_identity_client_id=target_managed_identity_client_id,
|
||||
)
|
||||
|
||||
|
||||
@@ -392,12 +392,12 @@ def set_reasoning_effort(effort: str | None) -> None:
|
||||
_qwen.set_reasoning_effort(effort)
|
||||
|
||||
|
||||
def set_student_deployment(deployment: str) -> None:
|
||||
_openai.set_student_deployment(deployment)
|
||||
_claude.set_student_deployment(deployment)
|
||||
_qwen.set_student_deployment(deployment)
|
||||
def set_target_deployment(deployment: str) -> None:
|
||||
_openai.set_target_deployment(deployment)
|
||||
_claude.set_target_deployment(deployment)
|
||||
_qwen.set_target_deployment(deployment)
|
||||
|
||||
|
||||
def set_teacher_deployment(deployment: str) -> None:
|
||||
_openai.set_teacher_deployment(deployment)
|
||||
_claude.set_teacher_deployment(deployment)
|
||||
def set_optimizer_deployment(deployment: str) -> None:
|
||||
_openai.set_optimizer_deployment(deployment)
|
||||
_claude.set_optimizer_deployment(deployment)
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
"""ReflACT Model backend — Azure OpenAI wrapper with token tracking.
|
||||
|
||||
Provides teacher/student dual-deployment chat functions and a global
|
||||
Provides optimizer/target dual-deployment chat functions and a global
|
||||
TokenTracker for per-stage cost accounting. Previously llm/azure_openai.py.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
@@ -35,69 +35,69 @@ MANAGED_IDENTITY_CLIENT_ID = os.environ.get(
|
||||
"",
|
||||
).strip()
|
||||
|
||||
TEACHER_ENDPOINT = (
|
||||
os.environ.get("TEACHER_AZURE_OPENAI_ENDPOINT")
|
||||
or os.environ.get("AZURE_OPENAI_TEACHER_ENDPOINT")
|
||||
OPTIMIZER_ENDPOINT = (
|
||||
os.environ.get("OPTIMIZER_AZURE_OPENAI_ENDPOINT")
|
||||
or os.environ.get("AZURE_OPENAI_OPTIMIZER_ENDPOINT")
|
||||
or ENDPOINT
|
||||
)
|
||||
STUDENT_ENDPOINT = (
|
||||
os.environ.get("STUDENT_AZURE_OPENAI_ENDPOINT")
|
||||
or os.environ.get("AZURE_OPENAI_STUDENT_ENDPOINT")
|
||||
TARGET_ENDPOINT = (
|
||||
os.environ.get("TARGET_AZURE_OPENAI_ENDPOINT")
|
||||
or os.environ.get("AZURE_OPENAI_TARGET_ENDPOINT")
|
||||
or ENDPOINT
|
||||
)
|
||||
TEACHER_API_VERSION = (
|
||||
os.environ.get("TEACHER_AZURE_OPENAI_API_VERSION")
|
||||
or os.environ.get("AZURE_OPENAI_TEACHER_API_VERSION")
|
||||
OPTIMIZER_API_VERSION = (
|
||||
os.environ.get("OPTIMIZER_AZURE_OPENAI_API_VERSION")
|
||||
or os.environ.get("AZURE_OPENAI_OPTIMIZER_API_VERSION")
|
||||
or API_VERSION
|
||||
)
|
||||
STUDENT_API_VERSION = (
|
||||
os.environ.get("STUDENT_AZURE_OPENAI_API_VERSION")
|
||||
or os.environ.get("AZURE_OPENAI_STUDENT_API_VERSION")
|
||||
TARGET_API_VERSION = (
|
||||
os.environ.get("TARGET_AZURE_OPENAI_API_VERSION")
|
||||
or os.environ.get("AZURE_OPENAI_TARGET_API_VERSION")
|
||||
or API_VERSION
|
||||
)
|
||||
TEACHER_API_KEY = (
|
||||
os.environ.get("TEACHER_AZURE_OPENAI_API_KEY")
|
||||
or os.environ.get("AZURE_OPENAI_TEACHER_API_KEY")
|
||||
OPTIMIZER_API_KEY = (
|
||||
os.environ.get("OPTIMIZER_AZURE_OPENAI_API_KEY")
|
||||
or os.environ.get("AZURE_OPENAI_OPTIMIZER_API_KEY")
|
||||
or API_KEY
|
||||
)
|
||||
STUDENT_API_KEY = (
|
||||
os.environ.get("STUDENT_AZURE_OPENAI_API_KEY")
|
||||
or os.environ.get("AZURE_OPENAI_STUDENT_API_KEY")
|
||||
TARGET_API_KEY = (
|
||||
os.environ.get("TARGET_AZURE_OPENAI_API_KEY")
|
||||
or os.environ.get("AZURE_OPENAI_TARGET_API_KEY")
|
||||
or API_KEY
|
||||
)
|
||||
TEACHER_AUTH_MODE = (
|
||||
os.environ.get("TEACHER_AZURE_OPENAI_AUTH_MODE")
|
||||
or os.environ.get("AZURE_OPENAI_TEACHER_AUTH_MODE")
|
||||
OPTIMIZER_AUTH_MODE = (
|
||||
os.environ.get("OPTIMIZER_AZURE_OPENAI_AUTH_MODE")
|
||||
or os.environ.get("AZURE_OPENAI_OPTIMIZER_AUTH_MODE")
|
||||
or AUTH_MODE
|
||||
).strip().lower()
|
||||
STUDENT_AUTH_MODE = (
|
||||
os.environ.get("STUDENT_AZURE_OPENAI_AUTH_MODE")
|
||||
or os.environ.get("AZURE_OPENAI_STUDENT_AUTH_MODE")
|
||||
TARGET_AUTH_MODE = (
|
||||
os.environ.get("TARGET_AZURE_OPENAI_AUTH_MODE")
|
||||
or os.environ.get("AZURE_OPENAI_TARGET_AUTH_MODE")
|
||||
or AUTH_MODE
|
||||
).strip().lower()
|
||||
TEACHER_AD_SCOPE = (
|
||||
os.environ.get("TEACHER_AZURE_OPENAI_AD_SCOPE")
|
||||
or os.environ.get("AZURE_OPENAI_TEACHER_AD_SCOPE")
|
||||
OPTIMIZER_AD_SCOPE = (
|
||||
os.environ.get("OPTIMIZER_AZURE_OPENAI_AD_SCOPE")
|
||||
or os.environ.get("AZURE_OPENAI_OPTIMIZER_AD_SCOPE")
|
||||
or AD_SCOPE
|
||||
)
|
||||
STUDENT_AD_SCOPE = (
|
||||
os.environ.get("STUDENT_AZURE_OPENAI_AD_SCOPE")
|
||||
or os.environ.get("AZURE_OPENAI_STUDENT_AD_SCOPE")
|
||||
TARGET_AD_SCOPE = (
|
||||
os.environ.get("TARGET_AZURE_OPENAI_AD_SCOPE")
|
||||
or os.environ.get("AZURE_OPENAI_TARGET_AD_SCOPE")
|
||||
or AD_SCOPE
|
||||
)
|
||||
TEACHER_MANAGED_IDENTITY_CLIENT_ID = (
|
||||
os.environ.get("TEACHER_AZURE_OPENAI_MANAGED_IDENTITY_CLIENT_ID")
|
||||
or os.environ.get("AZURE_OPENAI_TEACHER_MANAGED_IDENTITY_CLIENT_ID")
|
||||
OPTIMIZER_MANAGED_IDENTITY_CLIENT_ID = (
|
||||
os.environ.get("OPTIMIZER_AZURE_OPENAI_MANAGED_IDENTITY_CLIENT_ID")
|
||||
or os.environ.get("AZURE_OPENAI_OPTIMIZER_MANAGED_IDENTITY_CLIENT_ID")
|
||||
or MANAGED_IDENTITY_CLIENT_ID
|
||||
).strip()
|
||||
STUDENT_MANAGED_IDENTITY_CLIENT_ID = (
|
||||
os.environ.get("STUDENT_AZURE_OPENAI_MANAGED_IDENTITY_CLIENT_ID")
|
||||
or os.environ.get("AZURE_OPENAI_STUDENT_MANAGED_IDENTITY_CLIENT_ID")
|
||||
TARGET_MANAGED_IDENTITY_CLIENT_ID = (
|
||||
os.environ.get("TARGET_AZURE_OPENAI_MANAGED_IDENTITY_CLIENT_ID")
|
||||
or os.environ.get("AZURE_OPENAI_TARGET_MANAGED_IDENTITY_CLIENT_ID")
|
||||
or MANAGED_IDENTITY_CLIENT_ID
|
||||
).strip()
|
||||
|
||||
TEACHER_DEPLOYMENT = os.environ.get("TEACHER_DEPLOYMENT", "gpt-5.5")
|
||||
STUDENT_DEPLOYMENT = os.environ.get("STUDENT_DEPLOYMENT", "gpt-5.5")
|
||||
OPTIMIZER_DEPLOYMENT = os.environ.get("OPTIMIZER_DEPLOYMENT", "gpt-4o")
|
||||
TARGET_DEPLOYMENT = os.environ.get("TARGET_DEPLOYMENT", "gpt-4o")
|
||||
|
||||
REASONING_EFFORT: str | None = None
|
||||
|
||||
@@ -177,30 +177,30 @@ tracker = TokenTracker()
|
||||
|
||||
# ── Client management ─────────────────────────────────────────────────────────
|
||||
|
||||
_teacher_client: AzureOpenAI | None = None
|
||||
_student_client: AzureOpenAI | None = None
|
||||
_teacher_lock = threading.Lock()
|
||||
_student_lock = threading.Lock()
|
||||
_optimizer_client: AzureOpenAI | None = None
|
||||
_target_client: AzureOpenAI | None = None
|
||||
_optimizer_lock = threading.Lock()
|
||||
_target_lock = threading.Lock()
|
||||
|
||||
|
||||
def _role_config(role: str) -> dict[str, str]:
|
||||
if role == "teacher":
|
||||
if role == "optimizer":
|
||||
return {
|
||||
"endpoint": TEACHER_ENDPOINT,
|
||||
"api_version": TEACHER_API_VERSION,
|
||||
"api_key": TEACHER_API_KEY,
|
||||
"auth_mode": TEACHER_AUTH_MODE,
|
||||
"ad_scope": TEACHER_AD_SCOPE,
|
||||
"managed_identity_client_id": TEACHER_MANAGED_IDENTITY_CLIENT_ID,
|
||||
"endpoint": OPTIMIZER_ENDPOINT,
|
||||
"api_version": OPTIMIZER_API_VERSION,
|
||||
"api_key": OPTIMIZER_API_KEY,
|
||||
"auth_mode": OPTIMIZER_AUTH_MODE,
|
||||
"ad_scope": OPTIMIZER_AD_SCOPE,
|
||||
"managed_identity_client_id": OPTIMIZER_MANAGED_IDENTITY_CLIENT_ID,
|
||||
}
|
||||
if role == "student":
|
||||
if role == "target":
|
||||
return {
|
||||
"endpoint": STUDENT_ENDPOINT,
|
||||
"api_version": STUDENT_API_VERSION,
|
||||
"api_key": STUDENT_API_KEY,
|
||||
"auth_mode": STUDENT_AUTH_MODE,
|
||||
"ad_scope": STUDENT_AD_SCOPE,
|
||||
"managed_identity_client_id": STUDENT_MANAGED_IDENTITY_CLIENT_ID,
|
||||
"endpoint": TARGET_ENDPOINT,
|
||||
"api_version": TARGET_API_VERSION,
|
||||
"api_key": TARGET_API_KEY,
|
||||
"auth_mode": TARGET_AUTH_MODE,
|
||||
"ad_scope": TARGET_AD_SCOPE,
|
||||
"managed_identity_client_id": TARGET_MANAGED_IDENTITY_CLIENT_ID,
|
||||
}
|
||||
raise ValueError(f"Unknown Azure OpenAI client role: {role!r}")
|
||||
|
||||
@@ -280,6 +280,12 @@ def _make_azure_cli_token_provider(ad_scope: str):
|
||||
|
||||
def _make_client(role: str) -> AzureOpenAI:
|
||||
cfg = _role_config(role)
|
||||
if not cfg["endpoint"]:
|
||||
raise ValueError(
|
||||
f"Azure OpenAI endpoint is not configured for {role}. "
|
||||
"Pass --azure_openai_endpoint https://your-resource.openai.azure.com/ "
|
||||
"or set AZURE_OPENAI_ENDPOINT in your environment."
|
||||
)
|
||||
auth_mode = cfg["auth_mode"]
|
||||
if auth_mode in {"api_key", "key"}:
|
||||
if not cfg["api_key"]:
|
||||
@@ -303,29 +309,29 @@ def _make_client(role: str) -> AzureOpenAI:
|
||||
)
|
||||
|
||||
|
||||
def get_teacher_client() -> AzureOpenAI:
|
||||
global _teacher_client
|
||||
with _teacher_lock:
|
||||
if _teacher_client is None:
|
||||
_teacher_client = _make_client("teacher")
|
||||
return _teacher_client
|
||||
def get_optimizer_client() -> AzureOpenAI:
|
||||
global _optimizer_client
|
||||
with _optimizer_lock:
|
||||
if _optimizer_client is None:
|
||||
_optimizer_client = _make_client("optimizer")
|
||||
return _optimizer_client
|
||||
|
||||
|
||||
def get_student_client() -> AzureOpenAI | OpenAI:
|
||||
global _student_client
|
||||
with _student_lock:
|
||||
if _student_client is None:
|
||||
def get_target_client() -> AzureOpenAI | OpenAI:
|
||||
global _target_client
|
||||
with _target_lock:
|
||||
if _target_client is None:
|
||||
# When using qwen_chat backend, return an OpenAI client pointing to vLLM
|
||||
from skillopt.model.backend_config import get_student_backend
|
||||
if get_student_backend() == "qwen_chat":
|
||||
from skillopt.model.backend_config import get_target_backend
|
||||
if get_target_backend() == "qwen_chat":
|
||||
from skillopt.model import qwen_backend as _qwen
|
||||
_student_client = OpenAI(
|
||||
_target_client = OpenAI(
|
||||
base_url=_qwen.BASE_URL,
|
||||
api_key=_qwen.API_KEY or "dummy",
|
||||
)
|
||||
else:
|
||||
_student_client = _make_client("student")
|
||||
return _student_client
|
||||
_target_client = _make_client("target")
|
||||
return _target_client
|
||||
|
||||
|
||||
def _needs_responses_api(deployment: str) -> bool:
|
||||
@@ -587,25 +593,25 @@ def configure_azure_openai(
|
||||
auth_mode: str | None = None,
|
||||
ad_scope: str | None = None,
|
||||
managed_identity_client_id: str | None = None,
|
||||
teacher_endpoint: str | None = None,
|
||||
teacher_api_version: str | None = None,
|
||||
teacher_api_key: str | None = None,
|
||||
teacher_auth_mode: str | None = None,
|
||||
teacher_ad_scope: str | None = None,
|
||||
teacher_managed_identity_client_id: str | None = None,
|
||||
student_endpoint: str | None = None,
|
||||
student_api_version: str | None = None,
|
||||
student_api_key: str | None = None,
|
||||
student_auth_mode: str | None = None,
|
||||
student_ad_scope: str | None = None,
|
||||
student_managed_identity_client_id: str | None = None,
|
||||
optimizer_endpoint: str | None = None,
|
||||
optimizer_api_version: str | None = None,
|
||||
optimizer_api_key: str | None = None,
|
||||
optimizer_auth_mode: str | None = None,
|
||||
optimizer_ad_scope: str | None = None,
|
||||
optimizer_managed_identity_client_id: str | None = None,
|
||||
target_endpoint: str | None = None,
|
||||
target_api_version: str | None = None,
|
||||
target_api_key: str | None = None,
|
||||
target_auth_mode: str | None = None,
|
||||
target_ad_scope: str | None = None,
|
||||
target_managed_identity_client_id: str | None = None,
|
||||
) -> None:
|
||||
global ENDPOINT, API_VERSION, API_KEY, AUTH_MODE, AD_SCOPE, MANAGED_IDENTITY_CLIENT_ID
|
||||
global TEACHER_ENDPOINT, TEACHER_API_VERSION, TEACHER_API_KEY, TEACHER_AUTH_MODE
|
||||
global TEACHER_AD_SCOPE, TEACHER_MANAGED_IDENTITY_CLIENT_ID
|
||||
global STUDENT_ENDPOINT, STUDENT_API_VERSION, STUDENT_API_KEY, STUDENT_AUTH_MODE
|
||||
global STUDENT_AD_SCOPE, STUDENT_MANAGED_IDENTITY_CLIENT_ID
|
||||
global _teacher_client, _student_client
|
||||
global OPTIMIZER_ENDPOINT, OPTIMIZER_API_VERSION, OPTIMIZER_API_KEY, OPTIMIZER_AUTH_MODE
|
||||
global OPTIMIZER_AD_SCOPE, OPTIMIZER_MANAGED_IDENTITY_CLIENT_ID
|
||||
global TARGET_ENDPOINT, TARGET_API_VERSION, TARGET_API_KEY, TARGET_AUTH_MODE
|
||||
global TARGET_AD_SCOPE, TARGET_MANAGED_IDENTITY_CLIENT_ID
|
||||
global _optimizer_client, _target_client
|
||||
|
||||
def _clean(value: str | None, *, lower: bool = False) -> str | None:
|
||||
if value is None:
|
||||
@@ -641,72 +647,72 @@ def configure_azure_openai(
|
||||
"AZURE_OPENAI_MANAGED_IDENTITY_CLIENT_ID",
|
||||
)
|
||||
|
||||
resolved_teacher_endpoint = _clean(teacher_endpoint) or shared_endpoint
|
||||
resolved_teacher_api_version = _clean(teacher_api_version) or shared_api_version
|
||||
resolved_teacher_api_key = _clean(teacher_api_key) or shared_api_key
|
||||
resolved_teacher_auth_mode = _clean(teacher_auth_mode, lower=True) or shared_auth_mode
|
||||
resolved_teacher_ad_scope = _clean(teacher_ad_scope) or shared_ad_scope
|
||||
resolved_teacher_mi = (
|
||||
_clean(teacher_managed_identity_client_id)
|
||||
resolved_optimizer_endpoint = _clean(optimizer_endpoint) or shared_endpoint
|
||||
resolved_optimizer_api_version = _clean(optimizer_api_version) or shared_api_version
|
||||
resolved_optimizer_api_key = _clean(optimizer_api_key) or shared_api_key
|
||||
resolved_optimizer_auth_mode = _clean(optimizer_auth_mode, lower=True) or shared_auth_mode
|
||||
resolved_optimizer_ad_scope = _clean(optimizer_ad_scope) or shared_ad_scope
|
||||
resolved_optimizer_mi = (
|
||||
_clean(optimizer_managed_identity_client_id)
|
||||
or shared_managed_identity_client_id
|
||||
)
|
||||
resolved_student_endpoint = _clean(student_endpoint) or shared_endpoint
|
||||
resolved_student_api_version = _clean(student_api_version) or shared_api_version
|
||||
resolved_student_api_key = _clean(student_api_key) or shared_api_key
|
||||
resolved_student_auth_mode = _clean(student_auth_mode, lower=True) or shared_auth_mode
|
||||
resolved_student_ad_scope = _clean(student_ad_scope) or shared_ad_scope
|
||||
resolved_student_mi = (
|
||||
_clean(student_managed_identity_client_id)
|
||||
resolved_target_endpoint = _clean(target_endpoint) or shared_endpoint
|
||||
resolved_target_api_version = _clean(target_api_version) or shared_api_version
|
||||
resolved_target_api_key = _clean(target_api_key) or shared_api_key
|
||||
resolved_target_auth_mode = _clean(target_auth_mode, lower=True) or shared_auth_mode
|
||||
resolved_target_ad_scope = _clean(target_ad_scope) or shared_ad_scope
|
||||
resolved_target_mi = (
|
||||
_clean(target_managed_identity_client_id)
|
||||
or shared_managed_identity_client_id
|
||||
)
|
||||
|
||||
_set("TEACHER_ENDPOINT", resolved_teacher_endpoint, "TEACHER_AZURE_OPENAI_ENDPOINT")
|
||||
_set("OPTIMIZER_ENDPOINT", resolved_optimizer_endpoint, "OPTIMIZER_AZURE_OPENAI_ENDPOINT")
|
||||
_set(
|
||||
"TEACHER_API_VERSION",
|
||||
resolved_teacher_api_version,
|
||||
"TEACHER_AZURE_OPENAI_API_VERSION",
|
||||
"OPTIMIZER_API_VERSION",
|
||||
resolved_optimizer_api_version,
|
||||
"OPTIMIZER_AZURE_OPENAI_API_VERSION",
|
||||
)
|
||||
_set("TEACHER_API_KEY", resolved_teacher_api_key, "TEACHER_AZURE_OPENAI_API_KEY")
|
||||
_set("TEACHER_AUTH_MODE", resolved_teacher_auth_mode, "TEACHER_AZURE_OPENAI_AUTH_MODE")
|
||||
_set("TEACHER_AD_SCOPE", resolved_teacher_ad_scope, "TEACHER_AZURE_OPENAI_AD_SCOPE")
|
||||
_set("OPTIMIZER_API_KEY", resolved_optimizer_api_key, "OPTIMIZER_AZURE_OPENAI_API_KEY")
|
||||
_set("OPTIMIZER_AUTH_MODE", resolved_optimizer_auth_mode, "OPTIMIZER_AZURE_OPENAI_AUTH_MODE")
|
||||
_set("OPTIMIZER_AD_SCOPE", resolved_optimizer_ad_scope, "OPTIMIZER_AZURE_OPENAI_AD_SCOPE")
|
||||
_set(
|
||||
"TEACHER_MANAGED_IDENTITY_CLIENT_ID",
|
||||
resolved_teacher_mi,
|
||||
"TEACHER_AZURE_OPENAI_MANAGED_IDENTITY_CLIENT_ID",
|
||||
"OPTIMIZER_MANAGED_IDENTITY_CLIENT_ID",
|
||||
resolved_optimizer_mi,
|
||||
"OPTIMIZER_AZURE_OPENAI_MANAGED_IDENTITY_CLIENT_ID",
|
||||
)
|
||||
_set("STUDENT_ENDPOINT", resolved_student_endpoint, "STUDENT_AZURE_OPENAI_ENDPOINT")
|
||||
_set("TARGET_ENDPOINT", resolved_target_endpoint, "TARGET_AZURE_OPENAI_ENDPOINT")
|
||||
_set(
|
||||
"STUDENT_API_VERSION",
|
||||
resolved_student_api_version,
|
||||
"STUDENT_AZURE_OPENAI_API_VERSION",
|
||||
"TARGET_API_VERSION",
|
||||
resolved_target_api_version,
|
||||
"TARGET_AZURE_OPENAI_API_VERSION",
|
||||
)
|
||||
_set("STUDENT_API_KEY", resolved_student_api_key, "STUDENT_AZURE_OPENAI_API_KEY")
|
||||
_set("STUDENT_AUTH_MODE", resolved_student_auth_mode, "STUDENT_AZURE_OPENAI_AUTH_MODE")
|
||||
_set("STUDENT_AD_SCOPE", resolved_student_ad_scope, "STUDENT_AZURE_OPENAI_AD_SCOPE")
|
||||
_set("TARGET_API_KEY", resolved_target_api_key, "TARGET_AZURE_OPENAI_API_KEY")
|
||||
_set("TARGET_AUTH_MODE", resolved_target_auth_mode, "TARGET_AZURE_OPENAI_AUTH_MODE")
|
||||
_set("TARGET_AD_SCOPE", resolved_target_ad_scope, "TARGET_AZURE_OPENAI_AD_SCOPE")
|
||||
_set(
|
||||
"STUDENT_MANAGED_IDENTITY_CLIENT_ID",
|
||||
resolved_student_mi,
|
||||
"STUDENT_AZURE_OPENAI_MANAGED_IDENTITY_CLIENT_ID",
|
||||
"TARGET_MANAGED_IDENTITY_CLIENT_ID",
|
||||
resolved_target_mi,
|
||||
"TARGET_AZURE_OPENAI_MANAGED_IDENTITY_CLIENT_ID",
|
||||
)
|
||||
|
||||
with _teacher_lock:
|
||||
_teacher_client = None
|
||||
with _student_lock:
|
||||
_student_client = None
|
||||
with _optimizer_lock:
|
||||
_optimizer_client = None
|
||||
with _target_lock:
|
||||
_target_client = None
|
||||
|
||||
|
||||
def chat_teacher(
|
||||
def chat_optimizer(
|
||||
system: str,
|
||||
user: str,
|
||||
max_completion_tokens: int = 16384,
|
||||
retries: int = 5,
|
||||
stage: str = "teacher",
|
||||
stage: str = "optimizer",
|
||||
reasoning_effort: str | None = None,
|
||||
timeout: int | None = None,
|
||||
) -> tuple[str, dict]:
|
||||
"""Call the teacher model. Returns (response_text, usage_dict)."""
|
||||
"""Call the optimizer model. Returns (response_text, usage_dict)."""
|
||||
return _chat_impl(
|
||||
get_teacher_client(), TEACHER_DEPLOYMENT,
|
||||
get_optimizer_client(), OPTIMIZER_DEPLOYMENT,
|
||||
system, user, max_completion_tokens, retries, stage, reasoning_effort, timeout,
|
||||
)
|
||||
|
||||
@@ -723,7 +729,7 @@ def chat_with_deployment(
|
||||
) -> tuple[str, dict]:
|
||||
"""Call an arbitrary deployment using the shared Azure client."""
|
||||
return _chat_impl(
|
||||
get_teacher_client(),
|
||||
get_optimizer_client(),
|
||||
deployment,
|
||||
system,
|
||||
user,
|
||||
@@ -735,27 +741,27 @@ def chat_with_deployment(
|
||||
)
|
||||
|
||||
|
||||
def chat_student(
|
||||
def chat_target(
|
||||
system: str,
|
||||
user: str,
|
||||
max_completion_tokens: int = 16384,
|
||||
retries: int = 5,
|
||||
stage: str = "student",
|
||||
stage: str = "target",
|
||||
reasoning_effort: str | None = None,
|
||||
timeout: int | None = None,
|
||||
) -> tuple[str, dict]:
|
||||
"""Call the student model. Returns (response_text, usage_dict)."""
|
||||
"""Call the target model. Returns (response_text, usage_dict)."""
|
||||
return _chat_impl(
|
||||
get_student_client(), STUDENT_DEPLOYMENT,
|
||||
get_target_client(), TARGET_DEPLOYMENT,
|
||||
system, user, max_completion_tokens, retries, stage, reasoning_effort, timeout,
|
||||
)
|
||||
|
||||
|
||||
def chat_teacher_messages(
|
||||
def chat_optimizer_messages(
|
||||
messages: list[dict[str, Any]],
|
||||
max_completion_tokens: int = 16384,
|
||||
retries: int = 5,
|
||||
stage: str = "teacher",
|
||||
stage: str = "optimizer",
|
||||
reasoning_effort: str | None = None,
|
||||
*,
|
||||
tools: list[dict[str, Any]] | None = None,
|
||||
@@ -763,10 +769,10 @@ def chat_teacher_messages(
|
||||
return_message: bool = False,
|
||||
timeout: int | None = None,
|
||||
) -> tuple[Any, dict]:
|
||||
"""Call the teacher model with a pre-built chat message list."""
|
||||
"""Call the optimizer model with a pre-built chat message list."""
|
||||
return _chat_messages_impl(
|
||||
get_teacher_client(),
|
||||
TEACHER_DEPLOYMENT,
|
||||
get_optimizer_client(),
|
||||
OPTIMIZER_DEPLOYMENT,
|
||||
messages,
|
||||
max_completion_tokens,
|
||||
retries,
|
||||
@@ -794,7 +800,7 @@ def chat_messages_with_deployment(
|
||||
) -> tuple[Any, dict]:
|
||||
"""Call an arbitrary deployment with a pre-built chat message list."""
|
||||
return _chat_messages_impl(
|
||||
get_teacher_client(),
|
||||
get_optimizer_client(),
|
||||
deployment,
|
||||
messages,
|
||||
max_completion_tokens,
|
||||
@@ -808,11 +814,11 @@ def chat_messages_with_deployment(
|
||||
)
|
||||
|
||||
|
||||
def chat_student_messages(
|
||||
def chat_target_messages(
|
||||
messages: list[dict[str, Any]],
|
||||
max_completion_tokens: int = 16384,
|
||||
retries: int = 5,
|
||||
stage: str = "student",
|
||||
stage: str = "target",
|
||||
reasoning_effort: str | None = None,
|
||||
*,
|
||||
tools: list[dict[str, Any]] | None = None,
|
||||
@@ -820,10 +826,10 @@ def chat_student_messages(
|
||||
return_message: bool = False,
|
||||
timeout: int | None = None,
|
||||
) -> tuple[Any, dict]:
|
||||
"""Call the student model with a pre-built chat message list."""
|
||||
"""Call the target model with a pre-built chat message list."""
|
||||
return _chat_messages_impl(
|
||||
get_student_client(),
|
||||
STUDENT_DEPLOYMENT,
|
||||
get_target_client(),
|
||||
TARGET_DEPLOYMENT,
|
||||
messages,
|
||||
max_completion_tokens,
|
||||
retries,
|
||||
@@ -845,14 +851,14 @@ def reset_token_tracker() -> None:
|
||||
tracker.reset()
|
||||
|
||||
|
||||
def set_student_deployment(deployment: str) -> None:
|
||||
"""Change student deployment at runtime."""
|
||||
global _student_client, STUDENT_DEPLOYMENT
|
||||
STUDENT_DEPLOYMENT = deployment
|
||||
os.environ["STUDENT_DEPLOYMENT"] = deployment
|
||||
def set_target_deployment(deployment: str) -> None:
|
||||
"""Change target deployment at runtime."""
|
||||
global _target_client, TARGET_DEPLOYMENT
|
||||
TARGET_DEPLOYMENT = deployment
|
||||
os.environ["TARGET_DEPLOYMENT"] = deployment
|
||||
os.environ["AZURE_OPENAI_DEPLOYMENT"] = deployment
|
||||
with _student_lock:
|
||||
_student_client = None
|
||||
with _target_lock:
|
||||
_target_client = None
|
||||
try:
|
||||
import llm_client as _legacy
|
||||
_legacy.DEPLOYMENT = deployment
|
||||
@@ -872,10 +878,10 @@ def get_reasoning_effort() -> str | None:
|
||||
return REASONING_EFFORT
|
||||
|
||||
|
||||
def set_teacher_deployment(deployment: str) -> None:
|
||||
"""Change teacher deployment at runtime."""
|
||||
global _teacher_client, TEACHER_DEPLOYMENT
|
||||
TEACHER_DEPLOYMENT = deployment
|
||||
os.environ["TEACHER_DEPLOYMENT"] = deployment
|
||||
with _teacher_lock:
|
||||
_teacher_client = None
|
||||
def set_optimizer_deployment(deployment: str) -> None:
|
||||
"""Change optimizer deployment at runtime."""
|
||||
global _optimizer_client, OPTIMIZER_DEPLOYMENT
|
||||
OPTIMIZER_DEPLOYMENT = deployment
|
||||
os.environ["OPTIMIZER_DEPLOYMENT"] = deployment
|
||||
with _optimizer_lock:
|
||||
_optimizer_client = None
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
"""Runtime backend configuration for teacher/student model calls."""
|
||||
"""Runtime backend configuration for optimizer/target model calls."""
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
@@ -12,8 +12,8 @@ def _parse_bool(value: str | None, default: bool) -> bool:
|
||||
return str(value).strip().lower() in {"1", "true", "yes", "on"}
|
||||
|
||||
|
||||
TEACHER_BACKEND = normalize_backend_name(os.environ.get("TEACHER_BACKEND", "openai_chat"))
|
||||
STUDENT_BACKEND = normalize_backend_name(os.environ.get("STUDENT_BACKEND", "openai_chat"))
|
||||
OPTIMIZER_BACKEND = normalize_backend_name(os.environ.get("OPTIMIZER_BACKEND", "openai_chat"))
|
||||
TARGET_BACKEND = normalize_backend_name(os.environ.get("TARGET_BACKEND", "openai_chat"))
|
||||
|
||||
CODEX_EXEC_PATH = os.environ.get("CODEX_EXEC_PATH", "codex")
|
||||
CODEX_EXEC_SANDBOX = os.environ.get("CODEX_EXEC_SANDBOX", "workspace-write")
|
||||
@@ -46,46 +46,46 @@ CLAUDE_CODE_EXEC_MAX_THINKING_TOKENS = max(
|
||||
)
|
||||
|
||||
|
||||
def set_teacher_backend(backend: str) -> None:
|
||||
global TEACHER_BACKEND
|
||||
TEACHER_BACKEND = normalize_backend_name(backend or "openai_chat")
|
||||
if TEACHER_BACKEND not in {"openai_chat", "claude_chat"}:
|
||||
def set_optimizer_backend(backend: str) -> None:
|
||||
global OPTIMIZER_BACKEND
|
||||
OPTIMIZER_BACKEND = normalize_backend_name(backend or "openai_chat")
|
||||
if OPTIMIZER_BACKEND not in {"openai_chat", "claude_chat"}:
|
||||
raise ValueError(
|
||||
f"Unsupported teacher backend: {TEACHER_BACKEND!r}. "
|
||||
f"Unsupported optimizer backend: {OPTIMIZER_BACKEND!r}. "
|
||||
"Supported values are 'openai_chat' and 'claude_chat'."
|
||||
)
|
||||
os.environ["TEACHER_BACKEND"] = TEACHER_BACKEND
|
||||
os.environ["OPTIMIZER_BACKEND"] = OPTIMIZER_BACKEND
|
||||
|
||||
|
||||
def get_teacher_backend() -> str:
|
||||
return TEACHER_BACKEND
|
||||
def get_optimizer_backend() -> str:
|
||||
return OPTIMIZER_BACKEND
|
||||
|
||||
|
||||
def set_student_backend(backend: str) -> None:
|
||||
global STUDENT_BACKEND
|
||||
STUDENT_BACKEND = normalize_backend_name(backend or "openai_chat")
|
||||
if STUDENT_BACKEND not in {"openai_chat", "claude_chat", "qwen_chat", "codex_exec", "claude_code_exec"}:
|
||||
def set_target_backend(backend: str) -> None:
|
||||
global TARGET_BACKEND
|
||||
TARGET_BACKEND = normalize_backend_name(backend or "openai_chat")
|
||||
if TARGET_BACKEND not in {"openai_chat", "claude_chat", "qwen_chat", "codex_exec", "claude_code_exec"}:
|
||||
raise ValueError(
|
||||
f"Unsupported student backend: {STUDENT_BACKEND!r}. "
|
||||
f"Unsupported target backend: {TARGET_BACKEND!r}. "
|
||||
"Supported values are 'openai_chat', 'claude_chat', 'qwen_chat', 'codex_exec', and 'claude_code_exec'."
|
||||
)
|
||||
os.environ["STUDENT_BACKEND"] = STUDENT_BACKEND
|
||||
os.environ["TARGET_BACKEND"] = TARGET_BACKEND
|
||||
|
||||
|
||||
def get_student_backend() -> str:
|
||||
return STUDENT_BACKEND
|
||||
def get_target_backend() -> str:
|
||||
return TARGET_BACKEND
|
||||
|
||||
|
||||
def is_student_exec_backend() -> bool:
|
||||
return STUDENT_BACKEND in {"codex_exec", "claude_code_exec"}
|
||||
def is_target_exec_backend() -> bool:
|
||||
return TARGET_BACKEND in {"codex_exec", "claude_code_exec"}
|
||||
|
||||
|
||||
def is_teacher_chat_backend() -> bool:
|
||||
return TEACHER_BACKEND in {"openai_chat", "claude_chat"}
|
||||
def is_optimizer_chat_backend() -> bool:
|
||||
return OPTIMIZER_BACKEND in {"openai_chat", "claude_chat"}
|
||||
|
||||
|
||||
def is_student_chat_backend() -> bool:
|
||||
return STUDENT_BACKEND in {"openai_chat", "claude_chat", "qwen_chat"}
|
||||
def is_target_chat_backend() -> bool:
|
||||
return TARGET_BACKEND in {"openai_chat", "claude_chat", "qwen_chat"}
|
||||
|
||||
|
||||
def configure_codex_exec(
|
||||
|
||||
@@ -19,8 +19,8 @@ CLAUDE_PERMISSION_MODE = os.environ.get("CLAUDE_PERMISSION_MODE", "dontAsk")
|
||||
CLAUDE_SETTING_SOURCES = os.environ.get("CLAUDE_SETTING_SOURCES", "user,project")
|
||||
CLAUDE_ALLOW_ATTACHMENT_READ = os.environ.get("CLAUDE_ALLOW_ATTACHMENT_READ", "1").strip().lower() not in {"0", "false", "no"}
|
||||
|
||||
TEACHER_DEPLOYMENT = os.environ.get("TEACHER_DEPLOYMENT", "claude-sonnet-4-6")
|
||||
STUDENT_DEPLOYMENT = os.environ.get("STUDENT_DEPLOYMENT", "claude-sonnet-4-6")
|
||||
OPTIMIZER_DEPLOYMENT = os.environ.get("OPTIMIZER_DEPLOYMENT", "claude-sonnet-4-6")
|
||||
TARGET_DEPLOYMENT = os.environ.get("TARGET_DEPLOYMENT", "claude-sonnet-4-6")
|
||||
REASONING_EFFORT: str | None = None
|
||||
_VALID_EFFORTS = {"low", "medium", "high", "xhigh", "max"}
|
||||
|
||||
@@ -292,7 +292,7 @@ def _compat_message_from_payload(payload: Any) -> CompatAssistantMessage:
|
||||
def _call_messages(messages: list[dict[str, Any]], max_completion_tokens: int, retries: int, stage: str, *, tools: list[dict[str, Any]] | None = None, tool_choice: str | dict[str, Any] | None = None, return_message: bool = False, deployment: str | None = None, timeout: int | None = None) -> tuple[Any, dict[str, int]]:
|
||||
del max_completion_tokens
|
||||
system, prompt, attachments = _build_prompt_from_messages(messages, tools=tools, tool_choice=tool_choice, structured_output=return_message)
|
||||
model = deployment or STUDENT_DEPLOYMENT
|
||||
model = deployment or TARGET_DEPLOYMENT
|
||||
last_err = None
|
||||
for attempt in range(retries):
|
||||
try:
|
||||
@@ -307,14 +307,14 @@ def _call_messages(messages: list[dict[str, Any]], max_completion_tokens: int, r
|
||||
raise RuntimeError(f"Claude backend failed after {retries} retries: {last_err}")
|
||||
|
||||
|
||||
def chat_teacher(system: str, user: str, max_completion_tokens: int = 16384, retries: int = 5, stage: str = "teacher", timeout: int | None = None) -> tuple[str, dict[str, int]]:
|
||||
def chat_optimizer(system: str, user: str, max_completion_tokens: int = 16384, retries: int = 5, stage: str = "optimizer", timeout: int | None = None) -> tuple[str, dict[str, int]]:
|
||||
messages = [{"role": "system", "content": system}, {"role": "user", "content": user}]
|
||||
return _call_messages(messages, max_completion_tokens, retries, stage, deployment=TEACHER_DEPLOYMENT, timeout=timeout)
|
||||
return _call_messages(messages, max_completion_tokens, retries, stage, deployment=OPTIMIZER_DEPLOYMENT, timeout=timeout)
|
||||
|
||||
|
||||
def chat_student(system: str, user: str, max_completion_tokens: int = 16384, retries: int = 5, stage: str = "student", timeout: int | None = None) -> tuple[str, dict[str, int]]:
|
||||
def chat_target(system: str, user: str, max_completion_tokens: int = 16384, retries: int = 5, stage: str = "target", timeout: int | None = None) -> tuple[str, dict[str, int]]:
|
||||
messages = [{"role": "system", "content": system}, {"role": "user", "content": user}]
|
||||
return _call_messages(messages, max_completion_tokens, retries, stage, deployment=STUDENT_DEPLOYMENT, timeout=timeout)
|
||||
return _call_messages(messages, max_completion_tokens, retries, stage, deployment=TARGET_DEPLOYMENT, timeout=timeout)
|
||||
|
||||
|
||||
def chat_with_deployment(deployment: str, system: str, user: str, max_completion_tokens: int = 16384, retries: int = 5, stage: str = "custom", timeout: int | None = None) -> tuple[str, dict[str, int]]:
|
||||
@@ -322,12 +322,12 @@ def chat_with_deployment(deployment: str, system: str, user: str, max_completion
|
||||
return _call_messages(messages, max_completion_tokens, retries, stage, deployment=deployment, timeout=timeout)
|
||||
|
||||
|
||||
def chat_teacher_messages(messages: list[dict[str, Any]], max_completion_tokens: int = 16384, retries: int = 5, stage: str = "teacher", *, tools: list[dict[str, Any]] | None = None, tool_choice: str | dict[str, Any] | None = None, return_message: bool = False, timeout: int | None = None) -> tuple[Any, dict[str, int]]:
|
||||
return _call_messages(messages, max_completion_tokens, retries, stage, tools=tools, tool_choice=tool_choice, return_message=return_message, deployment=TEACHER_DEPLOYMENT, timeout=timeout)
|
||||
def chat_optimizer_messages(messages: list[dict[str, Any]], max_completion_tokens: int = 16384, retries: int = 5, stage: str = "optimizer", *, tools: list[dict[str, Any]] | None = None, tool_choice: str | dict[str, Any] | None = None, return_message: bool = False, timeout: int | None = None) -> tuple[Any, dict[str, int]]:
|
||||
return _call_messages(messages, max_completion_tokens, retries, stage, tools=tools, tool_choice=tool_choice, return_message=return_message, deployment=OPTIMIZER_DEPLOYMENT, timeout=timeout)
|
||||
|
||||
|
||||
def chat_student_messages(messages: list[dict[str, Any]], max_completion_tokens: int = 16384, retries: int = 5, stage: str = "student", *, tools: list[dict[str, Any]] | None = None, tool_choice: str | dict[str, Any] | None = None, return_message: bool = False, timeout: int | None = None) -> tuple[Any, dict[str, int]]:
|
||||
return _call_messages(messages, max_completion_tokens, retries, stage, tools=tools, tool_choice=tool_choice, return_message=return_message, deployment=STUDENT_DEPLOYMENT, timeout=timeout)
|
||||
def chat_target_messages(messages: list[dict[str, Any]], max_completion_tokens: int = 16384, retries: int = 5, stage: str = "target", *, tools: list[dict[str, Any]] | None = None, tool_choice: str | dict[str, Any] | None = None, return_message: bool = False, timeout: int | None = None) -> tuple[Any, dict[str, int]]:
|
||||
return _call_messages(messages, max_completion_tokens, retries, stage, tools=tools, tool_choice=tool_choice, return_message=return_message, deployment=TARGET_DEPLOYMENT, timeout=timeout)
|
||||
|
||||
|
||||
def chat_messages_with_deployment(deployment: str, messages: list[dict[str, Any]], max_completion_tokens: int = 16384, retries: int = 5, stage: str = "custom", *, tools: list[dict[str, Any]] | None = None, tool_choice: str | dict[str, Any] | None = None, return_message: bool = False, timeout: int | None = None) -> tuple[Any, dict[str, int]]:
|
||||
@@ -347,13 +347,13 @@ def set_reasoning_effort(effort: str | None) -> None:
|
||||
REASONING_EFFORT = effort if effort else None
|
||||
|
||||
|
||||
def set_student_deployment(deployment: str) -> None:
|
||||
global STUDENT_DEPLOYMENT
|
||||
STUDENT_DEPLOYMENT = deployment or default_model_for_backend("claude")
|
||||
os.environ["STUDENT_DEPLOYMENT"] = STUDENT_DEPLOYMENT
|
||||
def set_target_deployment(deployment: str) -> None:
|
||||
global TARGET_DEPLOYMENT
|
||||
TARGET_DEPLOYMENT = deployment or default_model_for_backend("claude")
|
||||
os.environ["TARGET_DEPLOYMENT"] = TARGET_DEPLOYMENT
|
||||
|
||||
|
||||
def set_teacher_deployment(deployment: str) -> None:
|
||||
global TEACHER_DEPLOYMENT
|
||||
TEACHER_DEPLOYMENT = deployment or default_model_for_backend("claude")
|
||||
os.environ["TEACHER_DEPLOYMENT"] = TEACHER_DEPLOYMENT
|
||||
def set_optimizer_deployment(deployment: str) -> None:
|
||||
global OPTIMIZER_DEPLOYMENT
|
||||
OPTIMIZER_DEPLOYMENT = deployment or default_model_for_backend("claude")
|
||||
os.environ["OPTIMIZER_DEPLOYMENT"] = OPTIMIZER_DEPLOYMENT
|
||||
|
||||
@@ -24,8 +24,8 @@ CODEX_BIN = os.environ.get("CODEX_CLI_BIN", "codex")
|
||||
CODEX_PROFILE = os.environ.get("CODEX_PROFILE", "review")
|
||||
CODEX_SANDBOX_MODE = os.environ.get("CODEX_SANDBOX_MODE", "read-only")
|
||||
|
||||
TEACHER_DEPLOYMENT = os.environ.get("TEACHER_DEPLOYMENT", "gpt-5.5")
|
||||
STUDENT_DEPLOYMENT = os.environ.get("STUDENT_DEPLOYMENT", "gpt-5.5")
|
||||
OPTIMIZER_DEPLOYMENT = os.environ.get("OPTIMIZER_DEPLOYMENT", "gpt-4o")
|
||||
TARGET_DEPLOYMENT = os.environ.get("TARGET_DEPLOYMENT", "gpt-4o")
|
||||
|
||||
REASONING_EFFORT: str | None = None
|
||||
|
||||
@@ -508,16 +508,16 @@ def chat_messages_with_model(
|
||||
)
|
||||
|
||||
|
||||
def chat_teacher(
|
||||
def chat_optimizer(
|
||||
system: str,
|
||||
user: str,
|
||||
max_completion_tokens: int = 16384,
|
||||
retries: int = 5,
|
||||
stage: str = "teacher",
|
||||
stage: str = "optimizer",
|
||||
timeout: int | None = None,
|
||||
) -> tuple[str, dict[str, int]]:
|
||||
return chat_with_model(
|
||||
model=TEACHER_DEPLOYMENT,
|
||||
model=OPTIMIZER_DEPLOYMENT,
|
||||
system=system,
|
||||
user=user,
|
||||
max_completion_tokens=max_completion_tokens,
|
||||
@@ -547,16 +547,16 @@ def chat_with_deployment(
|
||||
)
|
||||
|
||||
|
||||
def chat_student(
|
||||
def chat_target(
|
||||
system: str,
|
||||
user: str,
|
||||
max_completion_tokens: int = 16384,
|
||||
retries: int = 5,
|
||||
stage: str = "student",
|
||||
stage: str = "target",
|
||||
timeout: int | None = None,
|
||||
) -> tuple[str, dict[str, int]]:
|
||||
return chat_with_model(
|
||||
model=STUDENT_DEPLOYMENT,
|
||||
model=TARGET_DEPLOYMENT,
|
||||
system=system,
|
||||
user=user,
|
||||
max_completion_tokens=max_completion_tokens,
|
||||
@@ -566,11 +566,11 @@ def chat_student(
|
||||
)
|
||||
|
||||
|
||||
def chat_teacher_messages(
|
||||
def chat_optimizer_messages(
|
||||
messages: list[dict[str, Any]],
|
||||
max_completion_tokens: int = 16384,
|
||||
retries: int = 5,
|
||||
stage: str = "teacher",
|
||||
stage: str = "optimizer",
|
||||
*,
|
||||
tools: list[dict[str, Any]] | None = None,
|
||||
tool_choice: str | dict[str, Any] | None = None,
|
||||
@@ -578,7 +578,7 @@ def chat_teacher_messages(
|
||||
timeout: int | None = None,
|
||||
) -> tuple[Any, dict[str, int]]:
|
||||
return _chat_messages_impl(
|
||||
TEACHER_DEPLOYMENT,
|
||||
OPTIMIZER_DEPLOYMENT,
|
||||
messages,
|
||||
max_completion_tokens,
|
||||
retries,
|
||||
@@ -615,11 +615,11 @@ def chat_messages_with_deployment(
|
||||
)
|
||||
|
||||
|
||||
def chat_student_messages(
|
||||
def chat_target_messages(
|
||||
messages: list[dict[str, Any]],
|
||||
max_completion_tokens: int = 16384,
|
||||
retries: int = 5,
|
||||
stage: str = "student",
|
||||
stage: str = "target",
|
||||
*,
|
||||
tools: list[dict[str, Any]] | None = None,
|
||||
tool_choice: str | dict[str, Any] | None = None,
|
||||
@@ -627,7 +627,7 @@ def chat_student_messages(
|
||||
timeout: int | None = None,
|
||||
) -> tuple[Any, dict[str, int]]:
|
||||
return _chat_messages_impl(
|
||||
STUDENT_DEPLOYMENT,
|
||||
TARGET_DEPLOYMENT,
|
||||
messages,
|
||||
max_completion_tokens,
|
||||
retries,
|
||||
@@ -647,10 +647,10 @@ def reset_token_tracker() -> None:
|
||||
tracker.reset()
|
||||
|
||||
|
||||
def set_student_deployment(deployment: str) -> None:
|
||||
global STUDENT_DEPLOYMENT
|
||||
STUDENT_DEPLOYMENT = deployment
|
||||
os.environ["STUDENT_DEPLOYMENT"] = deployment
|
||||
def set_target_deployment(deployment: str) -> None:
|
||||
global TARGET_DEPLOYMENT
|
||||
TARGET_DEPLOYMENT = deployment
|
||||
os.environ["TARGET_DEPLOYMENT"] = deployment
|
||||
|
||||
|
||||
def set_reasoning_effort(effort: str | None) -> None:
|
||||
@@ -658,7 +658,7 @@ def set_reasoning_effort(effort: str | None) -> None:
|
||||
REASONING_EFFORT = effort if effort else None
|
||||
|
||||
|
||||
def set_teacher_deployment(deployment: str) -> None:
|
||||
global TEACHER_DEPLOYMENT
|
||||
TEACHER_DEPLOYMENT = deployment
|
||||
os.environ["TEACHER_DEPLOYMENT"] = deployment
|
||||
def set_optimizer_deployment(deployment: str) -> None:
    """Select *deployment* as the optimizer deployment.

    The value is stored in the module-level ``OPTIMIZER_DEPLOYMENT`` and then
    written to the ``OPTIMIZER_DEPLOYMENT`` environment variable.
    """
    global OPTIMIZER_DEPLOYMENT
    # Targets are bound left to right: the module global first, then the env var.
    OPTIMIZER_DEPLOYMENT = os.environ["OPTIMIZER_DEPLOYMENT"] = deployment
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
"""Helpers for running exec backends as the student harness."""
|
||||
"""Helpers for running exec backends as the target harness."""
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
@@ -14,7 +14,7 @@ from typing import Any
|
||||
from skillopt.model.backend_config import (
|
||||
get_claude_code_exec_config,
|
||||
get_codex_exec_config,
|
||||
get_student_backend,
|
||||
get_target_backend,
|
||||
)
|
||||
|
||||
|
||||
@@ -38,7 +38,7 @@ ANSWER_SCHEMA: dict[str, Any] = {
|
||||
def render_skill_md(
|
||||
skill_content: str,
|
||||
*,
|
||||
name: str = "skillopt-student",
|
||||
name: str = "skillopt-target",
|
||||
description: str = "Dynamic ReflACT skill for the current benchmark task.",
|
||||
preamble: str = "",
|
||||
) -> str:
|
||||
@@ -49,7 +49,7 @@ def render_skill_md(
|
||||
f'description: "{description}"',
|
||||
"---",
|
||||
"",
|
||||
"# ReflACT Student Skill",
|
||||
"# ReflACT Target Skill",
|
||||
"",
|
||||
]
|
||||
if preamble.strip():
|
||||
@@ -77,9 +77,9 @@ def prepare_workspace(
|
||||
) -> tuple[str, str]:
|
||||
if os.path.exists(work_dir):
|
||||
shutil.rmtree(work_dir)
|
||||
os.makedirs(os.path.join(work_dir, ".agents", "skills", "skillopt-student"), exist_ok=True)
|
||||
os.makedirs(os.path.join(work_dir, ".agents", "skills", "skillopt-target"), exist_ok=True)
|
||||
|
||||
skill_path = os.path.join(work_dir, ".agents", "skills", "skillopt-student", "SKILL.md")
|
||||
skill_path = os.path.join(work_dir, ".agents", "skills", "skillopt-target", "SKILL.md")
|
||||
with open(skill_path, "w", encoding="utf-8") as f:
|
||||
f.write(skill_md)
|
||||
|
||||
@@ -318,7 +318,7 @@ def parse_codex_raw(raw: str) -> dict:
|
||||
|
||||
|
||||
def format_codex_trace_steps(raw: str, *, max_chars: int = 4000) -> str:
|
||||
"""Render parsed Codex trace into numbered compact steps for teacher prompts."""
|
||||
"""Render parsed Codex trace into numbered compact steps for optimizer prompts."""
|
||||
parsed = parse_codex_raw(raw)
|
||||
steps = parsed["steps"]
|
||||
if not steps:
|
||||
@@ -474,12 +474,12 @@ def _exec_prompt(prompt: str, *, allow_file_edits: bool = False) -> str:
|
||||
)
|
||||
return (
|
||||
"Use the workspace files to solve the task. Read task.md and the skill at "
|
||||
".agents/skills/skillopt-student/SKILL.md before answering. "
|
||||
".agents/skills/skillopt-target/SKILL.md before answering. "
|
||||
"If ATTACHMENTS.md exists, read it and inspect the listed local files. "
|
||||
"Do not call a Skill tool; the ReflACT guidance is a local markdown file. "
|
||||
f"Do not ask for permission. {edit_instruction}"
|
||||
"Return only the final answer text, keeping any required <answer>...</answer> tags exactly.\n\n"
|
||||
f"{_normalize_student_exec_prompt(prompt)}"
|
||||
f"{_normalize_target_exec_prompt(prompt)}"
|
||||
)
|
||||
|
||||
|
||||
@@ -489,20 +489,20 @@ def _retry_prompt(prompt: str, attempt: int) -> str:
|
||||
return (
|
||||
f"{prompt}\n\n"
|
||||
"Previous execution returned an empty final response. Re-read task.md and "
|
||||
".agents/skills/skillopt-student/SKILL.md. If ATTACHMENTS.md exists, use the listed files. "
|
||||
".agents/skills/skillopt-target/SKILL.md. If ATTACHMENTS.md exists, use the listed files. "
|
||||
"Then produce the final answer inside <answer>...</answer>."
|
||||
)
|
||||
|
||||
|
||||
def _normalize_student_exec_prompt(prompt: str) -> str:
|
||||
def _normalize_target_exec_prompt(prompt: str) -> str:
|
||||
"""Avoid wording that makes Claude Code call an unregistered Skill tool."""
|
||||
text = prompt or ""
|
||||
replacements = {
|
||||
"Use the `skillopt-student` skill available in this workspace.": (
|
||||
"Read `.agents/skills/skillopt-student/SKILL.md` directly; do not call a Skill tool."
|
||||
"Use the `skillopt-target` skill available in this workspace.": (
|
||||
"Read `.agents/skills/skillopt-target/SKILL.md` directly; do not call a Skill tool."
|
||||
),
|
||||
"- Use the local `skillopt-student` skill before writing code.": (
|
||||
"- Read `.agents/skills/skillopt-student/SKILL.md` before writing code; do not call a Skill tool."
|
||||
"- Use the local `skillopt-target` skill before writing code.": (
|
||||
"- Read `.agents/skills/skillopt-target/SKILL.md` before writing code; do not call a Skill tool."
|
||||
),
|
||||
}
|
||||
for old, new in replacements.items():
|
||||
@@ -586,7 +586,7 @@ def _run_claude_code_sdk_exec(
|
||||
"preset": "claude_code",
|
||||
"append": (
|
||||
"Use the workspace files to solve the task. Read task.md and the skill at "
|
||||
".agents/skills/skillopt-student/SKILL.md before answering. "
|
||||
".agents/skills/skillopt-target/SKILL.md before answering. "
|
||||
"If ATTACHMENTS.md exists, read it and inspect the listed local files. "
|
||||
"Do not call a Skill tool; the ReflACT guidance is a local markdown file. "
|
||||
+ (
|
||||
@@ -619,7 +619,7 @@ def _run_claude_code_sdk_exec(
|
||||
|
||||
messages = []
|
||||
async with ClaudeSDKClient(options) as client:
|
||||
await client.query(_normalize_student_exec_prompt(prompt))
|
||||
await client.query(_normalize_target_exec_prompt(prompt))
|
||||
messages = [msg async for msg in client.receive_response()]
|
||||
last = messages[-1] if messages else None
|
||||
raw_structured_output = _extract_claude_structured_output(messages)
|
||||
@@ -1016,7 +1016,7 @@ def run_codex_exec(
|
||||
return last_response, combined
|
||||
|
||||
|
||||
def run_student_exec(
|
||||
def run_target_exec(
|
||||
*,
|
||||
work_dir: str,
|
||||
prompt: str,
|
||||
@@ -1030,7 +1030,7 @@ def run_student_exec(
|
||||
full_auto: bool | None = None,
|
||||
allow_file_edits: bool = False,
|
||||
) -> tuple[str, str]:
|
||||
backend = get_student_backend()
|
||||
backend = get_target_backend()
|
||||
if backend == "codex_exec":
|
||||
return run_codex_exec(
|
||||
work_dir=work_dir,
|
||||
|
||||
@@ -17,10 +17,10 @@ _RESPONSES_API_MODELS = {
|
||||
}
|
||||
|
||||
_BACKEND_DEFAULT_MODELS = {
|
||||
"azure_openai": "gpt-5.5",
|
||||
"openai_chat": "gpt-5.5",
|
||||
"codex": "gpt-5.5",
|
||||
"codex_exec": "gpt-5.5",
|
||||
"azure_openai": "gpt-4o",
|
||||
"openai_chat": "gpt-4o",
|
||||
"codex": "gpt-4o",
|
||||
"codex_exec": "gpt-4o",
|
||||
"claude": "claude-sonnet-4-6",
|
||||
"claude_chat": "claude-sonnet-4-6",
|
||||
"claude_code_exec": "claude-sonnet-4-6",
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
"""OpenAI-compatible Qwen chat backend for the student path."""
|
||||
"""OpenAI-compatible Qwen chat backend for the target path."""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
@@ -32,8 +32,8 @@ ENABLE_THINKING = os.environ.get("QWEN_CHAT_ENABLE_THINKING", "false").strip().l
|
||||
"on",
|
||||
}
|
||||
|
||||
STUDENT_DEPLOYMENT = os.environ.get(
|
||||
"STUDENT_DEPLOYMENT",
|
||||
TARGET_DEPLOYMENT = os.environ.get(
|
||||
"TARGET_DEPLOYMENT",
|
||||
default_model_for_backend("qwen_chat"),
|
||||
)
|
||||
|
||||
@@ -140,7 +140,7 @@ def _chat_messages_impl(
|
||||
timeout: float | None = None,
|
||||
) -> tuple[Any, dict[str, int]]:
|
||||
payload: dict[str, Any] = {
|
||||
"model": deployment or STUDENT_DEPLOYMENT,
|
||||
"model": deployment or TARGET_DEPLOYMENT,
|
||||
"messages": _json_safe(messages),
|
||||
"max_tokens": min(max_completion_tokens, MAX_TOKENS),
|
||||
}
|
||||
@@ -214,12 +214,12 @@ def get_max_tokens() -> int:
|
||||
return MAX_TOKENS
|
||||
|
||||
|
||||
def chat_student(
|
||||
def chat_target(
|
||||
system: str,
|
||||
user: str,
|
||||
max_completion_tokens: int = 16384,
|
||||
retries: int = 5,
|
||||
stage: str = "student",
|
||||
stage: str = "target",
|
||||
reasoning_effort: str | None = None,
|
||||
timeout: float | None = None,
|
||||
) -> tuple[str, dict[str, int]]:
|
||||
@@ -234,11 +234,11 @@ def chat_student(
|
||||
)
|
||||
|
||||
|
||||
def chat_student_messages(
|
||||
def chat_target_messages(
|
||||
messages: list[dict[str, Any]],
|
||||
max_completion_tokens: int = 16384,
|
||||
retries: int = 5,
|
||||
stage: str = "student",
|
||||
stage: str = "target",
|
||||
reasoning_effort: str | None = None,
|
||||
*,
|
||||
tools: list[dict[str, Any]] | None = None,
|
||||
@@ -271,7 +271,7 @@ def set_reasoning_effort(effort: str | None) -> None:
|
||||
del effort
|
||||
|
||||
|
||||
def set_student_deployment(deployment: str) -> None:
|
||||
global STUDENT_DEPLOYMENT
|
||||
STUDENT_DEPLOYMENT = deployment or default_model_for_backend("qwen_chat")
|
||||
os.environ["STUDENT_DEPLOYMENT"] = STUDENT_DEPLOYMENT
|
||||
def set_target_deployment(deployment: str) -> None:
    """Select the target deployment for the Qwen chat backend.

    A falsy *deployment* falls back to ``default_model_for_backend("qwen_chat")``.
    The chosen name is stored in the module-level ``TARGET_DEPLOYMENT`` and
    then written to the ``TARGET_DEPLOYMENT`` environment variable.
    """
    global TARGET_DEPLOYMENT
    if deployment:
        chosen = deployment
    else:
        chosen = default_model_for_backend("qwen_chat")
    # Targets are bound left to right: the module global first, then the env var.
    TARGET_DEPLOYMENT = os.environ["TARGET_DEPLOYMENT"] = chosen
|
||||
|
||||
@@ -43,15 +43,15 @@ def get_backend_name() -> str:
|
||||
return _ACTIVE_BACKEND
|
||||
|
||||
|
||||
def chat_teacher(
|
||||
def chat_optimizer(
|
||||
system: str,
|
||||
user: str,
|
||||
max_completion_tokens: int = 16384,
|
||||
retries: int = 5,
|
||||
stage: str = "teacher",
|
||||
stage: str = "optimizer",
|
||||
timeout: int | None = None,
|
||||
) -> tuple[str, dict[str, int]]:
|
||||
return _backend_module(_ACTIVE_BACKEND).chat_teacher(
|
||||
return _backend_module(_ACTIVE_BACKEND).chat_optimizer(
|
||||
system=system,
|
||||
user=user,
|
||||
max_completion_tokens=max_completion_tokens,
|
||||
@@ -61,15 +61,15 @@ def chat_teacher(
|
||||
)
|
||||
|
||||
|
||||
def chat_student(
|
||||
def chat_target(
|
||||
system: str,
|
||||
user: str,
|
||||
max_completion_tokens: int = 16384,
|
||||
retries: int = 5,
|
||||
stage: str = "student",
|
||||
stage: str = "target",
|
||||
timeout: int | None = None,
|
||||
) -> tuple[str, dict[str, int]]:
|
||||
return _backend_module(_ACTIVE_BACKEND).chat_student(
|
||||
return _backend_module(_ACTIVE_BACKEND).chat_target(
|
||||
system=system,
|
||||
user=user,
|
||||
max_completion_tokens=max_completion_tokens,
|
||||
@@ -99,18 +99,18 @@ def chat_with_deployment(
|
||||
)
|
||||
|
||||
|
||||
def chat_teacher_messages(
|
||||
def chat_optimizer_messages(
|
||||
messages: list[dict[str, Any]],
|
||||
max_completion_tokens: int = 16384,
|
||||
retries: int = 5,
|
||||
stage: str = "teacher",
|
||||
stage: str = "optimizer",
|
||||
*,
|
||||
tools: list[dict[str, Any]] | None = None,
|
||||
tool_choice: str | dict[str, Any] | None = None,
|
||||
return_message: bool = False,
|
||||
timeout: int | None = None,
|
||||
) -> tuple[Any, dict[str, int]]:
|
||||
return _backend_module(_ACTIVE_BACKEND).chat_teacher_messages(
|
||||
return _backend_module(_ACTIVE_BACKEND).chat_optimizer_messages(
|
||||
messages=messages,
|
||||
max_completion_tokens=max_completion_tokens,
|
||||
retries=retries,
|
||||
@@ -122,18 +122,18 @@ def chat_teacher_messages(
|
||||
)
|
||||
|
||||
|
||||
def chat_student_messages(
|
||||
def chat_target_messages(
|
||||
messages: list[dict[str, Any]],
|
||||
max_completion_tokens: int = 16384,
|
||||
retries: int = 5,
|
||||
stage: str = "student",
|
||||
stage: str = "target",
|
||||
*,
|
||||
tools: list[dict[str, Any]] | None = None,
|
||||
tool_choice: str | dict[str, Any] | None = None,
|
||||
return_message: bool = False,
|
||||
timeout: int | None = None,
|
||||
) -> tuple[Any, dict[str, int]]:
|
||||
return _backend_module(_ACTIVE_BACKEND).chat_student_messages(
|
||||
return _backend_module(_ACTIVE_BACKEND).chat_target_messages(
|
||||
messages=messages,
|
||||
max_completion_tokens=max_completion_tokens,
|
||||
retries=retries,
|
||||
@@ -183,14 +183,14 @@ def set_reasoning_effort(effort: str | None) -> None:
|
||||
module.set_reasoning_effort(effort)
|
||||
|
||||
|
||||
def set_student_deployment(deployment: str) -> None:
|
||||
def set_target_deployment(deployment: str) -> None:
|
||||
for module in _all_backend_modules():
|
||||
module.set_student_deployment(deployment)
|
||||
module.set_target_deployment(deployment)
|
||||
|
||||
|
||||
def set_teacher_deployment(deployment: str) -> None:
|
||||
def set_optimizer_deployment(deployment: str) -> None:
|
||||
for module in _all_backend_modules():
|
||||
module.set_teacher_deployment(deployment)
|
||||
module.set_optimizer_deployment(deployment)
|
||||
|
||||
|
||||
def configure_azure_openai(
|
||||
@@ -201,18 +201,18 @@ def configure_azure_openai(
|
||||
auth_mode: str | None = None,
|
||||
ad_scope: str | None = None,
|
||||
managed_identity_client_id: str | None = None,
|
||||
teacher_endpoint: str | None = None,
|
||||
teacher_api_version: str | None = None,
|
||||
teacher_api_key: str | None = None,
|
||||
teacher_auth_mode: str | None = None,
|
||||
teacher_ad_scope: str | None = None,
|
||||
teacher_managed_identity_client_id: str | None = None,
|
||||
student_endpoint: str | None = None,
|
||||
student_api_version: str | None = None,
|
||||
student_api_key: str | None = None,
|
||||
student_auth_mode: str | None = None,
|
||||
student_ad_scope: str | None = None,
|
||||
student_managed_identity_client_id: str | None = None,
|
||||
optimizer_endpoint: str | None = None,
|
||||
optimizer_api_version: str | None = None,
|
||||
optimizer_api_key: str | None = None,
|
||||
optimizer_auth_mode: str | None = None,
|
||||
optimizer_ad_scope: str | None = None,
|
||||
optimizer_managed_identity_client_id: str | None = None,
|
||||
target_endpoint: str | None = None,
|
||||
target_api_version: str | None = None,
|
||||
target_api_key: str | None = None,
|
||||
target_auth_mode: str | None = None,
|
||||
target_ad_scope: str | None = None,
|
||||
target_managed_identity_client_id: str | None = None,
|
||||
) -> None:
|
||||
azure_openai.configure_azure_openai(
|
||||
endpoint=endpoint,
|
||||
@@ -221,16 +221,16 @@ def configure_azure_openai(
|
||||
auth_mode=auth_mode,
|
||||
ad_scope=ad_scope,
|
||||
managed_identity_client_id=managed_identity_client_id,
|
||||
teacher_endpoint=teacher_endpoint,
|
||||
teacher_api_version=teacher_api_version,
|
||||
teacher_api_key=teacher_api_key,
|
||||
teacher_auth_mode=teacher_auth_mode,
|
||||
teacher_ad_scope=teacher_ad_scope,
|
||||
teacher_managed_identity_client_id=teacher_managed_identity_client_id,
|
||||
student_endpoint=student_endpoint,
|
||||
student_api_version=student_api_version,
|
||||
student_api_key=student_api_key,
|
||||
student_auth_mode=student_auth_mode,
|
||||
student_ad_scope=student_ad_scope,
|
||||
student_managed_identity_client_id=student_managed_identity_client_id,
|
||||
optimizer_endpoint=optimizer_endpoint,
|
||||
optimizer_api_version=optimizer_api_version,
|
||||
optimizer_api_key=optimizer_api_key,
|
||||
optimizer_auth_mode=optimizer_auth_mode,
|
||||
optimizer_ad_scope=optimizer_ad_scope,
|
||||
optimizer_managed_identity_client_id=optimizer_managed_identity_client_id,
|
||||
target_endpoint=target_endpoint,
|
||||
target_api_version=target_api_version,
|
||||
target_api_key=target_api_key,
|
||||
target_auth_mode=target_auth_mode,
|
||||
target_ad_scope=target_ad_scope,
|
||||
target_managed_identity_client_id=target_managed_identity_client_id,
|
||||
)
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
"""ReflACT Optimizer -- skill update operations.
|
||||
"""SkillOpt Optimizer -- skill update operations.
|
||||
|
||||
Analogous to the optimizer in neural network training: applies the computed
|
||||
"gradient" (patches) to the current skill document to produce an updated
|
||||
@@ -8,8 +8,8 @@ Modules
|
||||
-------
|
||||
- skill: edit application (optimizer.step() / parameter update)
|
||||
- clip: edit ranking and selection (gradient clipping)
|
||||
- meta_reflect: epoch-level macro refinement (momentum)
|
||||
- slow_update: longitudinal comparison and guidance (EMA / regularization)
|
||||
- meta_skill: cross-epoch memory for optimizer context
|
||||
"""
|
||||
from skillopt.optimizer.skill import apply_edit, apply_patch # noqa: F401
|
||||
from skillopt.optimizer.clip import rank_and_select # noqa: F401
|
||||
|
||||
@@ -6,7 +6,7 @@ effective step size. Previously core/select.py.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from skillopt.model import chat_teacher
|
||||
from skillopt.model import chat_optimizer
|
||||
from skillopt.optimizer.meta_skill import format_meta_skill_context
|
||||
from skillopt.optimizer.update_modes import (
|
||||
describe_item,
|
||||
@@ -29,10 +29,10 @@ def rank_and_select(
|
||||
meta_skill_context: str = "",
|
||||
update_mode: str = "patch",
|
||||
) -> dict:
|
||||
"""Use a teacher LLM to rank edits by importance, then keep top-L.
|
||||
"""Use a optimizer LLM to rank edits by importance, then keep top-L.
|
||||
|
||||
If the edit pool is within budget, returns the patch unchanged.
|
||||
Otherwise, calls the teacher to rank and select the most impactful edits.
|
||||
Otherwise, calls the optimizer to rank and select the most impactful edits.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
@@ -54,7 +54,7 @@ def rank_and_select(
|
||||
if len(edits) <= max_edits:
|
||||
return patch
|
||||
|
||||
# Build the edit pool description for the teacher
|
||||
# Build the edit pool description for the optimizer
|
||||
edits_desc = []
|
||||
for i, edit in enumerate(edits):
|
||||
edits_desc.append(f"[{i}] {describe_item(edit, update_mode, max_chars=500)}")
|
||||
@@ -66,13 +66,13 @@ def rank_and_select(
|
||||
+ f"\n\nSelect the {max_edits} most important {payload_label(update_mode)}. "
|
||||
f"Return their 0-based indices in priority order."
|
||||
)
|
||||
teacher_ctx = format_meta_skill_context(meta_skill_context)
|
||||
if teacher_ctx:
|
||||
user = f"{teacher_ctx}\n\n{user}"
|
||||
optimizer_ctx = format_meta_skill_context(meta_skill_context)
|
||||
if optimizer_ctx:
|
||||
user = f"{optimizer_ctx}\n\n{user}"
|
||||
prompt_name = "ranking_rewrite" if is_rewrite_mode(update_mode) else "ranking"
|
||||
|
||||
try:
|
||||
response, _ = chat_teacher(
|
||||
response, _ = chat_optimizer(
|
||||
system=load_prompt(prompt_name), user=user,
|
||||
max_completion_tokens=2048, retries=3, stage="ranking",
|
||||
)
|
||||
@@ -94,7 +94,7 @@ def rank_and_select(
|
||||
if selected:
|
||||
return {
|
||||
"reasoning": patch.get("reasoning", "")
|
||||
+ f" [teacher-ranked: selected {len(selected)}/{len(edits)} {payload_label(update_mode)}]",
|
||||
+ f" [optimizer-ranked: selected {len(selected)}/{len(edits)} {payload_label(update_mode)}]",
|
||||
payload_key(update_mode): selected,
|
||||
"ranking_details": result,
|
||||
}
|
||||
|
||||
@@ -1,11 +1,11 @@
|
||||
"""Teacher-driven autonomous update-size decisions."""
|
||||
"""Optimizer-driven autonomous update-size decisions."""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import re
|
||||
from typing import Any
|
||||
|
||||
from skillopt.model import chat_teacher
|
||||
from skillopt.model import chat_optimizer
|
||||
from skillopt.optimizer.meta_skill import format_meta_skill_context
|
||||
from skillopt.optimizer.update_modes import describe_item, get_payload_items, payload_label
|
||||
from skillopt.prompts import load_prompt
|
||||
@@ -39,7 +39,7 @@ def decide_autonomous_learning_rate(
|
||||
step_buffer_context: str = "",
|
||||
meta_skill_context: str = "",
|
||||
) -> dict:
|
||||
"""Ask the teacher to choose the number of update items for this step.
|
||||
"""Ask the optimizer to choose the number of update items for this step.
|
||||
|
||||
The prompt intentionally avoids default budgets, candidate budget lists, or
|
||||
scheduler history. The only hard post-processing is validity: the returned
|
||||
@@ -65,15 +65,15 @@ def decide_autonomous_learning_rate(
|
||||
)
|
||||
if step_buffer_context.strip():
|
||||
user += f"\n\n## Previous Steps in This Epoch\n{step_buffer_context}"
|
||||
teacher_ctx = format_meta_skill_context(meta_skill_context)
|
||||
if teacher_ctx:
|
||||
user = f"{teacher_ctx}\n\n{user}"
|
||||
optimizer_ctx = format_meta_skill_context(meta_skill_context)
|
||||
if optimizer_ctx:
|
||||
user = f"{optimizer_ctx}\n\n{user}"
|
||||
|
||||
response = ""
|
||||
parsed: dict | None = None
|
||||
decision: int | None = None
|
||||
try:
|
||||
response, _ = chat_teacher(
|
||||
response, _ = chat_optimizer(
|
||||
system=load_prompt("lr_autonomous"),
|
||||
user=user,
|
||||
max_completion_tokens=2048,
|
||||
|
||||
@@ -1,28 +1,28 @@
|
||||
"""Teacher-side meta skill memory for cross-epoch optimization guidance.
|
||||
"""Optimizer-side meta skill memory for cross-epoch optimization guidance.
|
||||
|
||||
This module maintains a compact teacher-facing memory distilled from
|
||||
This module maintains a compact optimizer-facing memory distilled from
|
||||
adjacent-epoch skill comparisons. Unlike ``slow_update``, it does not
|
||||
modify the student skill document. Instead, it produces guidance meant to
|
||||
improve future teacher behavior when proposing, merging, and ranking edits.
|
||||
modify the target skill document. Instead, it produces guidance meant to
|
||||
improve future optimizer behavior when proposing, merging, and ranking edits.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import traceback
|
||||
|
||||
from skillopt.model import chat_teacher
|
||||
from skillopt.model import chat_optimizer
|
||||
from skillopt.optimizer.slow_update import format_comparison_text
|
||||
from skillopt.prompts import load_prompt
|
||||
from skillopt.utils import extract_json
|
||||
|
||||
|
||||
def format_meta_skill_context(meta_skill_content: str) -> str:
|
||||
"""Render teacher memory into a prompt-ready context block."""
|
||||
"""Render optimizer memory into a prompt-ready context block."""
|
||||
content = (meta_skill_content or "").strip()
|
||||
if not content:
|
||||
return ""
|
||||
return (
|
||||
"## Teacher Meta Skill\n"
|
||||
"This is teacher-side memory distilled from prior epoch transitions in "
|
||||
"## Optimizer Meta Skill\n"
|
||||
"This is optimizer-side memory distilled from prior epoch transitions in "
|
||||
"this environment. Use it to improve how you propose, merge, and rank "
|
||||
"skill edits. Prefer it when the current evidence is ambiguous, but do "
|
||||
"not force it if the current trajectories clearly contradict it.\n\n"
|
||||
@@ -38,7 +38,7 @@ def run_meta_skill(
|
||||
prev_meta_skill_content: str = "",
|
||||
system_prompt: str | None = None,
|
||||
) -> dict | None:
|
||||
"""Produce updated teacher-side meta skill from adjacent epochs."""
|
||||
"""Produce updated optimizer-side meta skill from adjacent epochs."""
|
||||
actual_system = system_prompt if system_prompt is not None else load_prompt("meta_skill")
|
||||
|
||||
prev_skill_display = prev_skill
|
||||
@@ -52,15 +52,15 @@ def run_meta_skill(
|
||||
prev_meta_section = (
|
||||
prev_meta_skill_content.strip()
|
||||
if prev_meta_skill_content and prev_meta_skill_content.strip()
|
||||
else "(No previous teacher meta skill — this is the first update.)"
|
||||
else "(No previous optimizer meta skill — this is the first update.)"
|
||||
)
|
||||
|
||||
comparison_text = format_comparison_text(comparison_pairs)
|
||||
user = (
|
||||
f"## Previous Epoch Last-Step Skill\n{prev_skill_display}\n\n"
|
||||
f"## Current Epoch Last-Step Skill\n{curr_skill_display}\n\n"
|
||||
f"## Previous Teacher Meta Skill\n"
|
||||
f"The following teacher memory was available during the current epoch. "
|
||||
f"## Previous Optimizer Meta Skill\n"
|
||||
f"The following optimizer memory was available during the current epoch. "
|
||||
f"Reflect on whether it improved or harmed the quality of edits.\n\n"
|
||||
f"{prev_meta_section}\n\n"
|
||||
f"## Longitudinal Comparison (same tasks, two last-step skills)\n"
|
||||
@@ -68,7 +68,7 @@ def run_meta_skill(
|
||||
)
|
||||
|
||||
try:
|
||||
response, _ = chat_teacher(
|
||||
response, _ = chat_optimizer(
|
||||
system=actual_system,
|
||||
user=user,
|
||||
max_completion_tokens=3072,
|
||||
|
||||
@@ -1,9 +1,9 @@
|
||||
"""Teacher-driven full skill rewrite from selected revise_suggestions."""
|
||||
"""Optimizer-driven full skill rewrite from selected revise_suggestions."""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
|
||||
from skillopt.model import chat_teacher
|
||||
from skillopt.model import chat_optimizer
|
||||
from skillopt.prompts import load_prompt
|
||||
from skillopt.optimizer.update_modes import get_payload_items
|
||||
from skillopt.utils import extract_json
|
||||
@@ -40,7 +40,7 @@ def rewrite_skill_from_suggestions(
|
||||
)
|
||||
|
||||
try:
|
||||
response, _ = chat_teacher(
|
||||
response, _ = chat_optimizer(
|
||||
system=actual_system,
|
||||
user=user,
|
||||
max_completion_tokens=max_completion_tokens,
|
||||
|
||||
@@ -28,9 +28,19 @@ def _is_in_slow_update_region(skill: str, target: str) -> bool:
|
||||
return start_idx <= target_idx < region_end
|
||||
|
||||
|
||||
def _strip_slow_update_markers(text: str) -> str:
    """Return *text* with every SLOW_UPDATE start and end marker deleted.

    Used on edit content so that an edit cannot introduce duplicate markers.
    """
    # Start markers are removed before end markers.
    for marker in (SLOW_UPDATE_START, SLOW_UPDATE_END):
        text = text.replace(marker, "")
    return text
|
||||
|
||||
|
||||
def _edit_fields(edit: EditType | dict) -> tuple[str, str, str]:
    """Return ``(op, content, target)`` for *edit*.

    Each field is read as an attribute when *edit* has one, otherwise through
    ``edit.get(field, "")``. ``content`` is whitespace-stripped and then passed
    through ``_strip_slow_update_markers``; ``op`` and ``target`` are returned
    exactly as read.
    """
    def _read(field: str):
        # Attribute access wins; fall back to mapping-style lookup.
        if hasattr(edit, field):
            return getattr(edit, field)
        return edit.get(field, "")

    op = _read("op")
    content = _strip_slow_update_markers(_read("content").strip())
    target = _read("target")
    return op, content, target
|
||||
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
|
||||
At the end of each epoch, the slow update compares rollout performance of the
|
||||
same sample set under the previous epoch's skill vs. the current epoch's skill
|
||||
(Markov: only adjacent epochs). A teacher analyzes regressions, improvements,
|
||||
(Markov: only adjacent epochs). An optimizer analyzes regressions, improvements,
|
||||
and persistent failures, then writes a free-form guidance block into a
|
||||
**protected** section of the skill document. This section cannot be modified by
|
||||
step-level analyst edits — only the slow update process overwrites it.
|
||||
@@ -14,7 +14,7 @@ Public API
|
||||
- :func:`replace_slow_update_field` — overwrite content
|
||||
- :func:`has_slow_update_field` — check if markers are present
|
||||
- :func:`build_comparison_text` — format side-by-side rollout results
|
||||
- :func:`run_slow_update` — teacher call to produce guidance
|
||||
- :func:`run_slow_update` — optimizer call to produce guidance
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
@@ -22,7 +22,7 @@ import json
|
||||
import os
|
||||
import traceback
|
||||
|
||||
from skillopt.model import chat_teacher
|
||||
from skillopt.model import chat_optimizer
|
||||
from skillopt.prompts import load_prompt
|
||||
from skillopt.utils import extract_json
|
||||
|
||||
@@ -57,16 +57,35 @@ def extract_slow_update_field(skill: str) -> str:
|
||||
return skill[inner_start:end].strip()
|
||||
|
||||
|
||||
def replace_slow_update_field(skill: str, new_content: str) -> str:
|
||||
start = skill.find(SLOW_UPDATE_START)
|
||||
end = skill.find(SLOW_UPDATE_END)
|
||||
if start == -1 or end == -1:
|
||||
skill = inject_empty_slow_update_field(skill)
|
||||
def _strip_all_slow_update_fields(skill: str) -> str:
    """Remove every SLOW_UPDATE_START/END pair (and content between) from *skill*.

    Parameters
    ----------
    skill:
        Skill document text that may contain zero or more slow-update regions,
        possibly with unmatched markers.

    Returns
    -------
    str
        The text with all marker pairs and their enclosed content removed, any
        unmatched start or end markers deleted (their surrounding text is
        kept), runs of three or more newlines collapsed to two, and trailing
        whitespace stripped.
    """
    while True:
        start = skill.find(SLOW_UPDATE_START)
        if start == -1:
            break
        end = skill.find(SLOW_UPDATE_END, start)
        if end == -1:
            # No end marker follows the first start marker, so this one and
            # every later start marker is an orphan. Remove them all; removing
            # only the first would leave stray start markers behind.
            skill = skill.replace(SLOW_UPDATE_START, "")
            break
        # Drop the whole region, markers included.
        skill = skill[:start] + skill[end + len(SLOW_UPDATE_END):]
    # Clean up stray end markers (those with no start marker before them).
    skill = skill.replace(SLOW_UPDATE_END, "")
    # Collapse excess blank lines left behind by the removals.
    while "\n\n\n" in skill:
        skill = skill.replace("\n\n\n", "\n\n")
    return skill.rstrip()
|
||||
|
||||
|
||||
def replace_slow_update_field(skill: str, new_content: str) -> str:
    """Return *skill* with its slow-update region replaced by *new_content*.

    All existing slow-update regions are stripped first so the result holds
    exactly one region. The new region is appended at the end of the document:
    a blank line, the start marker, the stripped *new_content*, and the end
    marker, each on its own line.
    """
    cleaned = _strip_all_slow_update_fields(skill)
    body = new_content.strip()
    pieces = [
        cleaned,
        "\n\n",
        SLOW_UPDATE_START,
        "\n",
        body,
        "\n",
        SLOW_UPDATE_END,
        "\n",
    ]
    return "".join(pieces)
|
||||
|
||||
|
||||
# ── Comparison text builder ─────────────────────────────────────────────────
|
||||
@@ -212,7 +231,7 @@ def save_comparison_pairs(pairs: list[dict], out_path: str) -> None:
|
||||
|
||||
|
||||
def format_comparison_text(pairs: list[dict]) -> str:
|
||||
"""Format structured comparison pairs into teacher-readable text."""
|
||||
"""Format structured comparison pairs into optimizer-readable text."""
|
||||
by_cat: dict[str, list[dict]] = {
|
||||
"regressed": [],
|
||||
"persistent_fail": [],
|
||||
@@ -277,7 +296,7 @@ def format_comparison_text(pairs: list[dict]) -> str:
|
||||
|
||||
|
||||
|
||||
# ── Teacher call ────────────────────────────────────────────────────────────
|
||||
# ── Optimizer call ────────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def run_slow_update(
|
||||
@@ -293,7 +312,7 @@ def run_slow_update(
|
||||
comparison_pairs: list[dict] | None = None,
|
||||
system_prompt: str | None = None,
|
||||
) -> dict | None:
|
||||
"""Run the slow update teacher call for one epoch boundary.
|
||||
"""Run the slow update optimizer call for one epoch boundary.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
@@ -355,7 +374,7 @@ def run_slow_update(
|
||||
)
|
||||
|
||||
try:
|
||||
response, _ = chat_teacher(
|
||||
response, _ = chat_optimizer(
|
||||
system=actual_system,
|
||||
user=user,
|
||||
max_completion_tokens=4096,
|
||||
|
||||
@@ -10,7 +10,7 @@ the batch and propose a concise set of skill-revision suggestions.
|
||||
2. Identify the most prevalent, systematic failure patterns across them.
|
||||
3. For each pattern, classify its failure type.
|
||||
4. Propose revision suggestions that address the COMMON patterns, not individual edge cases.
|
||||
5. Suggestions must be generalizable and should help a later teacher rewrite the full skill document.
|
||||
5. Suggestions must be generalizable and should help a later optimizer rewrite the full skill document.
|
||||
6. Do not hardcode task-specific values.
|
||||
|
||||
You will be told the maximum number of suggestions (the budget L). Produce AT MOST L suggestions,
|
||||
@@ -29,7 +29,7 @@ Respond ONLY with a valid JSON object (no markdown fences, no extra text):
|
||||
"type": "add_rule|remove_rule|merge_rules|reorganize|compress|clarify",
|
||||
"title": "<short title>",
|
||||
"motivation": "<why this matters>",
|
||||
"instruction": "<what the rewriting teacher should change in the skill>",
|
||||
"instruction": "<what the rewriting optimizer should change in the skill>",
|
||||
"priority_hint": "high|medium|low"
|
||||
}
|
||||
]
|
||||
|
||||
@@ -24,7 +24,7 @@ Respond ONLY with a valid JSON object:
|
||||
"type": "add_rule|remove_rule|merge_rules|reorganize|compress|clarify",
|
||||
"title": "<short title>",
|
||||
"motivation": "<why this matters>",
|
||||
"instruction": "<what the rewriting teacher should change in the skill>",
|
||||
"instruction": "<what the rewriting optimizer should change in the skill>",
|
||||
"priority_hint": "high|medium|low"
|
||||
}
|
||||
]
|
||||
|
||||
@@ -7,7 +7,7 @@ Merge guidelines:
|
||||
2. Resolve conflicts by keeping the more general, better-justified direction.
|
||||
3. Preserve unique high-impact corrective insights.
|
||||
4. Suggestions supported by many source patches should receive higher support_count.
|
||||
5. The output suggestions should help a later teacher rewrite the full skill.
|
||||
5. The output suggestions should help a later optimizer rewrite the full skill.
|
||||
|
||||
Respond ONLY with a valid JSON object:
|
||||
{
|
||||
@@ -17,7 +17,7 @@ Respond ONLY with a valid JSON object:
|
||||
"type": "add_rule|remove_rule|merge_rules|reorganize|compress|clarify",
|
||||
"title": "<short title>",
|
||||
"motivation": "<why this matters>",
|
||||
"instruction": "<what the rewriting teacher should change in the skill>",
|
||||
"instruction": "<what the rewriting optimizer should change in the skill>",
|
||||
"priority_hint": "high|medium|low",
|
||||
"support_count": <integer>,
|
||||
"source_type": "failure"
|
||||
|
||||
@@ -16,7 +16,7 @@ Respond ONLY with a valid JSON object:
|
||||
"type": "add_rule|remove_rule|merge_rules|reorganize|compress|clarify",
|
||||
"title": "<short title>",
|
||||
"motivation": "<why this matters>",
|
||||
"instruction": "<what the rewriting teacher should change in the skill>",
|
||||
"instruction": "<what the rewriting optimizer should change in the skill>",
|
||||
"priority_hint": "high|medium|low",
|
||||
"support_count": <integer>,
|
||||
"source_type": "failure|success"
|
||||
|
||||
@@ -6,7 +6,7 @@ Merge guidelines:
|
||||
1. Deduplicate overlapping success patterns.
|
||||
2. Be conservative: only keep suggestions that reinforce useful behavior not already well-covered.
|
||||
3. Suggestions supported by many source patches should receive higher support_count.
|
||||
4. The output suggestions should help a later teacher rewrite the full skill.
|
||||
4. The output suggestions should help a later optimizer rewrite the full skill.
|
||||
|
||||
Respond ONLY with a valid JSON object:
|
||||
{
|
||||
@@ -16,7 +16,7 @@ Respond ONLY with a valid JSON object:
|
||||
"type": "add_rule|remove_rule|merge_rules|reorganize|compress|clarify",
|
||||
"title": "<short title>",
|
||||
"motivation": "<why this matters>",
|
||||
"instruction": "<what the rewriting teacher should change in the skill>",
|
||||
"instruction": "<what the rewriting optimizer should change in the skill>",
|
||||
"priority_hint": "high|medium|low",
|
||||
"support_count": <integer>,
|
||||
"source_type": "success"
|
||||
|
||||
@@ -1,19 +1,19 @@
|
||||
You are a teacher-coach for an AI agent skill optimization system.
|
||||
You are an optimizer-coach for an AI agent skill optimization system.
|
||||
|
||||
Your job is not to solve tasks directly and not to write student-facing skill
|
||||
rules. Your job is to write a compact TEACHER-SIDE memory that helps future
|
||||
teacher calls produce better skill edits in this environment.
|
||||
Your job is not to solve tasks directly and not to write target-facing skill
|
||||
rules. Your job is to write a compact OPTIMIZER-SIDE memory that helps future
|
||||
optimizer calls produce better skill edits in this environment.
|
||||
|
||||
## What You Receive
|
||||
|
||||
1. The previous epoch's last-step skill.
|
||||
2. The current epoch's last-step skill.
|
||||
3. A longitudinal comparison on the SAME sampled tasks under those two skills.
|
||||
4. The previous teacher meta skill, if one existed.
|
||||
4. The previous optimizer meta skill, if one existed.
|
||||
|
||||
## Your Goal
|
||||
|
||||
Write a concise meta skill that improves future teacher behavior in stages such
|
||||
Write a concise meta skill that improves future optimizer behavior in stages such
|
||||
as failure analysis, success analysis, patch merging, and edit ranking.
|
||||
|
||||
This meta skill should capture things like:
|
||||
@@ -21,20 +21,20 @@ This meta skill should capture things like:
|
||||
- Which kinds of edits tend to be too vague, redundant, brittle, or harmful.
|
||||
- What level of abstraction works best for rules here.
|
||||
- What failure-repair patterns should be prioritized.
|
||||
- What regression risks future teacher calls should guard against.
|
||||
- What regression risks future optimizer calls should guard against.
|
||||
|
||||
## Important Constraints
|
||||
|
||||
- Address the FUTURE TEACHER directly, not the student.
|
||||
- Address the FUTURE OPTIMIZER directly, not the target.
|
||||
- Focus on how to write better edits and organize better skill updates.
|
||||
- Use evidence from the adjacent-epoch comparison, not generic advice.
|
||||
- Keep it compact and high-signal. Prefer a few durable principles.
|
||||
- Revise or remove parts of the previous meta skill if they did not help.
|
||||
- Do not output student-facing task instructions.
|
||||
- Do not output target-facing task instructions.
|
||||
- Do not restate the whole skill; summarize editing strategy.
|
||||
|
||||
Respond ONLY with a valid JSON object:
|
||||
{
|
||||
"reasoning": "<brief reflection on what editing directions helped or hurt>",
|
||||
"meta_skill_content": "<compact teacher-side guidance for future edit generation and selection>"
|
||||
"meta_skill_content": "<compact optimizer-side guidance for future edit generation and selection>"
|
||||
}
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
You are an expert skill-optimization teacher. You receive a skill document and a pool
|
||||
You are an expert skill-optimization optimizer. You receive a skill document and a pool
|
||||
of proposed edits. Your job is to RANK the edits by importance and select the top ones.
|
||||
|
||||
Ranking criteria (in order of priority):
|
||||
|
||||
@@ -1,11 +1,11 @@
|
||||
You are an expert skill-optimization teacher. You receive a skill document and a pool
|
||||
You are an expert skill-optimization optimizer. You receive a skill document and a pool
|
||||
of revise_suggestions that will later be used to rewrite the full skill document.
|
||||
Rank the suggestions by importance and select the top ones.
|
||||
|
||||
Ranking criteria:
|
||||
1. Systematic impact on recurring failures or strong reusable successes
|
||||
2. Complementarity with the current skill
|
||||
3. Rewrite utility: how much the suggestion helps a later teacher improve structure, clarity, or coverage
|
||||
3. Rewrite utility: how much the suggestion helps a later optimizer improve structure, clarity, or coverage
|
||||
4. Generality and actionability
|
||||
|
||||
Respond ONLY with a valid JSON object:
|
||||
|
||||
@@ -4,7 +4,7 @@ You will receive:
|
||||
1. The current skill document
|
||||
2. A selected set of revise_suggestions distilled from trajectory analysis
|
||||
|
||||
Your job is to rewrite the FULL student skill document so it incorporates the
|
||||
Your job is to rewrite the FULL target skill document so it incorporates the
|
||||
selected suggestions coherently.
|
||||
|
||||
Hard requirements:
|
||||
@@ -12,7 +12,7 @@ Hard requirements:
|
||||
2. Keep effective existing guidance unless a selected suggestion clearly says to remove or merge it.
|
||||
3. Prefer consolidation and clarity over making the document longer.
|
||||
4. Do not hardcode benchmark-specific answers, entity names, file paths, or gold values.
|
||||
5. Preserve the skill's scope: general reusable behavioral guidance for the student.
|
||||
5. Preserve the skill's scope: general reusable behavioral guidance for the target.
|
||||
6. Do not modify content inside the protected slow-update block between
|
||||
<!-- SLOW_UPDATE_START --> and <!-- SLOW_UPDATE_END --> except to keep it intact.
|
||||
7. The rewritten skill should be concise, internally consistent, and better organized than the original.
|
||||
|
||||
@@ -41,16 +41,16 @@ all subsequent step-level optimization — only you can overwrite it at the next
|
||||
epoch boundary.
|
||||
|
||||
Your guidance must:
|
||||
- Be written as **direct, actionable instructions** to the student model
|
||||
- Be written as **direct, actionable instructions** to the target model
|
||||
(the AI agent that will read and follow the skill).
|
||||
- Focus on helping the student get problems RIGHT — not on analysis or
|
||||
- Focus on helping the target get problems RIGHT — not on analysis or
|
||||
explanation of what went wrong.
|
||||
- Prioritize: (1) preventing regressions, (2) fixing persistent failures,
|
||||
(3) reinforcing successful patterns.
|
||||
- Be concise but comprehensive — you have no length limit, but every sentence
|
||||
should earn its place.
|
||||
- NOT duplicate content already in the main skill body — complement it.
|
||||
- Address the student directly (e.g., "When you encounter X, always do Y"
|
||||
- Address the target directly (e.g., "When you encounter X, always do Y"
|
||||
rather than "The agent should...").
|
||||
|
||||
Respond ONLY with a valid JSON object (no markdown fences, no extra text):
|
||||
|
||||
@@ -118,8 +118,8 @@ class RolloutResult:
|
||||
predicted_answer: str = ""
|
||||
question: str = ""
|
||||
reference_text: str = ""
|
||||
student_system_prompt: str = ""
|
||||
student_user_prompt: str = ""
|
||||
target_system_prompt: str = ""
|
||||
target_user_prompt: str = ""
|
||||
spreadsheet_preview: str = ""
|
||||
extras: dict[str, Any] = field(default_factory=dict)
|
||||
|
||||
@@ -151,8 +151,8 @@ class RolloutResult:
|
||||
predicted_answer=str(d.get("predicted_answer", "")),
|
||||
question=str(d.get("question", "")),
|
||||
reference_text=str(d.get("reference_text", "")),
|
||||
student_system_prompt=str(d.get("student_system_prompt", "")),
|
||||
student_user_prompt=str(d.get("student_user_prompt", "")),
|
||||
target_system_prompt=str(d.get("target_system_prompt", "")),
|
||||
target_user_prompt=str(d.get("target_user_prompt", "")),
|
||||
spreadsheet_preview=str(d.get("spreadsheet_preview", "")),
|
||||
extras=extras,
|
||||
)
|
||||
@@ -166,7 +166,7 @@ class RolloutResult:
|
||||
for attr in (
|
||||
"n_turns", "fail_reason", "task_type", "task_description",
|
||||
"predicted_answer", "question", "reference_text",
|
||||
"student_system_prompt", "student_user_prompt",
|
||||
"target_system_prompt", "target_user_prompt",
|
||||
"spreadsheet_preview",
|
||||
):
|
||||
val = getattr(self, attr)
|
||||
@@ -244,57 +244,6 @@ class RawPatch:
|
||||
return d
|
||||
|
||||
|
||||
# ── Epoch-level: META_REFLECT ────────────────────────────────────────────
|
||||
|
||||
@dataclass
|
||||
class MetaReflectResult:
|
||||
"""Output of the epoch-level meta-reflect stage (momentum)."""
|
||||
|
||||
meta_summary: str
|
||||
patch: Patch
|
||||
action: str = ""
|
||||
gate_score: float | None = None
|
||||
time_s: float | None = None
|
||||
candidate_hash: str = ""
|
||||
update_origin: str = ""
|
||||
update_target: str = ""
|
||||
|
||||
@classmethod
|
||||
def from_dict(cls, d: dict | None) -> MetaReflectResult | None:
|
||||
if d is None:
|
||||
return None
|
||||
patch_raw = d.get("patch", {})
|
||||
return cls(
|
||||
meta_summary=d.get("meta_summary", ""),
|
||||
patch=Patch.from_dict(patch_raw) if isinstance(patch_raw, dict) else Patch(),
|
||||
action=d.get("action", ""),
|
||||
gate_score=d.get("gate_score"),
|
||||
time_s=d.get("time_s"),
|
||||
candidate_hash=d.get("candidate_hash", ""),
|
||||
update_origin=d.get("update_origin", ""),
|
||||
update_target=d.get("update_target", ""),
|
||||
)
|
||||
|
||||
def to_dict(self) -> dict:
|
||||
d: dict[str, Any] = {
|
||||
"meta_summary": self.meta_summary,
|
||||
"patch": self.patch.to_dict(),
|
||||
}
|
||||
if self.action:
|
||||
d["action"] = self.action
|
||||
if self.gate_score is not None:
|
||||
d["gate_score"] = self.gate_score
|
||||
if self.time_s is not None:
|
||||
d["time_s"] = self.time_s
|
||||
if self.candidate_hash:
|
||||
d["candidate_hash"] = self.candidate_hash
|
||||
if self.update_origin:
|
||||
d["update_origin"] = self.update_origin
|
||||
if self.update_target:
|
||||
d["update_target"] = self.update_target
|
||||
return d
|
||||
|
||||
|
||||
# ── Epoch-level: SLOW_UPDATE ─────────────────────────────────────────────
|
||||
|
||||
@dataclass
|
||||
|
||||
@@ -86,8 +86,8 @@ class TrainingManager:
|
||||
if line and not line.startswith("#") and "=" in line:
|
||||
k, v = line.split("=", 1)
|
||||
env[k] = v
|
||||
# Propagate TEACHER_* to base AZURE_OPENAI_* when base is missing,
|
||||
# so student/default endpoints inherit from teacher config.
|
||||
# Propagate OPTIMIZER_* to base AZURE_OPENAI_* when base is missing,
|
||||
# so target/default endpoints inherit from optimizer config.
|
||||
_propagate = [
|
||||
("ENDPOINT", ""), ("API_VERSION", ""), ("AUTH_MODE", ""),
|
||||
("MANAGED_IDENTITY_CLIENT_ID", ""), ("AD_SCOPE", ""),
|
||||
@@ -95,9 +95,9 @@ class TrainingManager:
|
||||
]
|
||||
for suffix, _ in _propagate:
|
||||
base_key = f"AZURE_OPENAI_{suffix}"
|
||||
teacher_key = f"TEACHER_AZURE_OPENAI_{suffix}"
|
||||
if not env.get(base_key) and env.get(teacher_key):
|
||||
env[base_key] = env[teacher_key]
|
||||
optimizer_key = f"OPTIMIZER_AZURE_OPENAI_{suffix}"
|
||||
if not env.get(base_key) and env.get(optimizer_key):
|
||||
env[base_key] = env[optimizer_key]
|
||||
|
||||
try:
|
||||
proc = subprocess.Popen(
|
||||
@@ -398,7 +398,7 @@ def build_ui():
|
||||
use_slow_update = gr.Checkbox(value=True,
|
||||
label="Slow Update (epoch-boundary momentum)")
|
||||
use_meta_skill = gr.Checkbox(value=True,
|
||||
label="Meta Skill (cross-epoch teacher memory)")
|
||||
label="Meta Skill (cross-epoch optimizer memory)")
|
||||
use_gate = gr.Checkbox(value=True,
|
||||
label="Gate (validation-based accept/reject)")
|
||||
|
||||
@@ -533,10 +533,13 @@ def main():
|
||||
parser = argparse.ArgumentParser(description="SkillOpt WebUI")
|
||||
parser.add_argument("--port", type=int, default=7860)
|
||||
parser.add_argument("--share", action="store_true")
|
||||
parser.add_argument("--host", type=str, default="0.0.0.0",
|
||||
help="Server host. Use 0.0.0.0 for public access.")
|
||||
args = parser.parse_args()
|
||||
|
||||
app = build_ui()
|
||||
app.launch(
|
||||
server_name=args.host,
|
||||
server_port=args.port,
|
||||
share=args.share,
|
||||
theme=gr.themes.Soft(primary_hue="indigo"),
|
||||
|
||||
Reference in New Issue
Block a user