diff --git a/.env.example b/.env.example index 89d0b35..faf9c85 100644 --- a/.env.example +++ b/.env.example @@ -1,24 +1,25 @@ # SkillOpt Environment Variables # Copy this file to .env and fill in your values. +# Usage: set -a; source .env; set +a # ── Azure OpenAI (required for openai_chat backend) ────────────────── -AZURE_OPENAI_ENDPOINT=https://your-resource.openai.azure.com/ -AZURE_OPENAI_API_VERSION=2024-12-01-preview +export AZURE_OPENAI_ENDPOINT=https://your-resource.openai.azure.com/ +export AZURE_OPENAI_API_VERSION=2024-12-01-preview # Authentication: choose one method # Option 1: API Key -AZURE_OPENAI_API_KEY= -# Option 2: Azure CLI (set auth_mode=azure_cli in config) -# Option 3: Managed Identity (set auth_mode=managed_identity + client_id in config) +export AZURE_OPENAI_API_KEY= +# Option 2: Azure CLI (no API key needed, recommended on Azure VMs) +# export AZURE_OPENAI_AUTH_MODE=azure_cli +# Option 3: Managed Identity +# export AZURE_OPENAI_AUTH_MODE=managed_identity +# export AZURE_OPENAI_MANAGED_IDENTITY_CLIENT_ID=your-client-id # ── OpenAI (alternative to Azure) ──────────────────────────────────── -# OPENAI_API_KEY=sk-... +# export OPENAI_API_KEY=sk-... # ── Anthropic / Claude (for claude_chat backend) ───────────────────── -# ANTHROPIC_API_KEY=sk-ant-... +# export ANTHROPIC_API_KEY=sk-ant-... 
# ── Qwen Local Model (for qwen_chat backend) ──────────────────────── -# QWEN_CHAT_BASE_URL=http://localhost:8000/v1 -# QWEN_CHAT_MODEL=Qwen/Qwen3.5-4B - -# ── Ray (optional, for distributed rollout) ────────────────────────── -# RAY_ADDRESS=auto +# export QWEN_CHAT_BASE_URL=http://localhost:8000/v1 +# export QWEN_CHAT_MODEL=Qwen/Qwen3.5-4B diff --git a/.gradio/certificate.pem b/.gradio/certificate.pem new file mode 100644 index 0000000..b85c803 --- /dev/null +++ b/.gradio/certificate.pem @@ -0,0 +1,31 @@ +-----BEGIN CERTIFICATE----- +MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw +TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh +cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4 +WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu +ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY +MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc +h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+ +0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U +A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW +T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH +B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC +B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv +KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn +OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn +jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw +qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI +rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV +HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq +hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL +ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ +3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK +NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5 
+ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur +TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC +jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc +oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq +4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA +mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d +emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc= +-----END CERTIFICATE----- diff --git a/README.md b/README.md index 381de7f..2d1edde 100644 --- a/README.md +++ b/README.md @@ -19,16 +19,16 @@ SkillOpt is a framework for optimizing a natural-language **skill document** thr It does **not** fine-tune model parameters. Instead, it treats the skill document as the optimization target: -- The **student** model executes tasks with the current skill -- The **teacher** model analyzes trajectories and proposes edits +- The **target** model executes tasks with the current skill +- The **optimizer** model analyzes trajectories and proposes edits - The framework merges, ranks, applies, and validates those edits - Only validated skill updates are kept | Deep Learning | SkillOpt | |---|---| | Model weights | Skill document (Markdown) | -| Forward pass | Rollout (student executes tasks) | -| Loss computation | Reflect (teacher analyzes trajectories) | +| Forward pass | Rollout (target executes tasks) | +| Loss computation | Reflect (optimizer analyzes trajectories) | | Gradient | Edit patches (proposed skill improvements) | | Gradient clipping | Edit ranking & selection (`learning_rate`) | | Weight update | Patch application to skill document | @@ -59,17 +59,17 @@ This gives a training-style loop for prompt / policy optimization: Every training step executes the following pipeline in `skillopt/engine/trainer.py`: 1. **Rollout** - The student model runs a batch of tasks using the current skill. + The target model runs a batch of tasks using the current skill. 2. 
**Reflect** - The teacher analyzes minibatches of trajectories and emits raw patches. + The optimizer analyzes minibatches of trajectories and emits raw patches. Failure-driven and success-driven patches are tracked separately. 3. **Aggregate** Raw patches are merged hierarchically. Metadata such as `support_count` and `source_type` is carried into the merged patch so later ranking can use it. 4. **Select** - The teacher ranks the merged edit pool and keeps up to `edit_budget` edits. + The optimizer ranks the merged edit pool and keeps up to `edit_budget` edits. 5. **Update** The selected edits are applied to the skill document. The framework records an `edit_apply_report.json` so you can see which edits actually landed, which were skipped, and why. @@ -84,7 +84,7 @@ Inside an epoch, the trainer maintains a step buffer containing: - Compact failure-pattern summaries from previous steps - Rejected edits and their score deltas -That context is fed back into later reflection calls so the teacher can avoid repeating ineffective edits and can focus on unsolved error patterns. +That context is fed back into later reflection calls so the optimizer can avoid repeating ineffective edits and can focus on unsolved error patterns. ### Epoch-Level Mechanisms @@ -96,7 +96,7 @@ This guidance is **not** blindly written through — it is converted into a cand #### Meta Skill -`meta_skill` is teacher-side cross-epoch memory. It does not directly edit the current skill. Instead, it writes a compact memory artifact describing longer-term patterns across adjacent epochs. That memory is loaded into later reflection / merge / ranking calls as extra context. +`meta_skill` is optimizer-side cross-epoch memory. It does not directly edit the current skill. Instead, it writes a compact memory artifact describing longer-term patterns across adjacent epochs. That memory is loaded into later reflection / merge / ranking calls as extra context. 
#### Meta Reflect @@ -161,10 +161,10 @@ SkillOpt uses a hierarchical YAML configuration system. Each benchmark config in ```yaml model: - teacher_backend: openai_chat # openai_chat | claude_chat | qwen_chat - student_backend: openai_chat # openai_chat | claude_chat | codex_exec | qwen_chat - teacher: gpt-5.5 # teacher model deployment name - student: gpt-5.5 # student model deployment name + optimizer_backend: openai_chat # openai_chat | claude_chat | qwen_chat + target_backend: openai_chat # openai_chat | claude_chat | codex_exec | qwen_chat + optimizer: gpt-5.5 # optimizer model deployment name + target: gpt-5.5 # target model deployment name reasoning_effort: medium # low | medium | high train: @@ -205,8 +205,8 @@ Override any config key from the command line: ```bash python scripts/train.py \ --config configs/searchqa/default.yaml \ - --cfg-options model.teacher_backend=openai_chat \ - model.student_backend=codex_exec \ + --cfg-options model.optimizer_backend=openai_chat \ + model.target_backend=codex_exec \ train.batch_size=40 \ optimizer.learning_rate=4 @@ -214,8 +214,8 @@ python scripts/train.py \ python scripts/train.py \ --config configs/searchqa/default.yaml \ --backend azure_openai \ - --teacher_model gpt-5.5 \ - --student_model gpt-5.5 \ + --optimizer_model gpt-5.5 \ + --target_model gpt-5.5 \ --reasoning_effort medium ``` @@ -227,19 +227,19 @@ All model access goes through the unified backend router in `skillopt/model/`. 
| Backend | Use case | Config key | |---|---|---| -| `openai_chat` | Azure OpenAI / OpenAI API | teacher / student | -| `claude_chat` | Anthropic Claude | teacher / student | -| `codex_exec` | Codex execution harness | student only | -| `qwen_chat` | Local Qwen via vLLM | teacher / student | +| `openai_chat` | Azure OpenAI / OpenAI API | optimizer / target | +| `claude_chat` | Anthropic Claude | optimizer / target | +| `codex_exec` | Codex execution harness | target only | +| `qwen_chat` | Local Qwen via vLLM | optimizer / target | -Separate teacher/student endpoints are supported: +Separate optimizer/target endpoints are supported: ```yaml model: - teacher_backend: openai_chat - student_backend: codex_exec - teacher: gpt-5.5 - student: gpt-5.5-codex + optimizer_backend: openai_chat + target_backend: codex_exec + optimizer: gpt-5.5 + target: gpt-5.5-codex ``` --- @@ -292,15 +292,15 @@ Basic training: python scripts/train.py --config configs/searchqa/default.yaml ``` -Exec harness (Codex student): +Exec harness (Codex target): ```bash python scripts/train.py \ --config configs/searchqa/default.yaml \ - --teacher_backend openai_chat \ - --student_backend codex_exec \ - --teacher_model gpt-5.5 \ - --student_model gpt-5.5-codex \ + --optimizer_backend openai_chat \ + --target_backend codex_exec \ + --optimizer_model gpt-5.5 \ + --target_model gpt-5.5-codex \ --use_deep_reflect true \ --skill_update_mode rewrite_from_suggestions ``` @@ -366,7 +366,7 @@ The trainer resumes from `runtime_state.json` when present. That state tracks: 1. Create `skillopt/envs//` with: - `adapter.py` — implements `EnvAdapter` - `dataloader.py` — data loading logic - - `rollout.py` — student execution logic + - `rollout.py` — target execution logic - `skills/initial.md` — initial skill document 2. Add a config at `configs//default.yaml` 3. 
Register in `skillopt/envs/__init__.py` diff --git a/configs/_base_/default.yaml b/configs/_base_/default.yaml index 4f485d8..a454b72 100644 --- a/configs/_base_/default.yaml +++ b/configs/_base_/default.yaml @@ -3,10 +3,10 @@ model: backend: azure_openai - teacher: gpt-5.5 - student: gpt-5.5 - teacher_backend: openai_chat - student_backend: openai_chat + optimizer: gpt-5.5 + target: gpt-5.5 + optimizer_backend: openai_chat + target_backend: openai_chat reasoning_effort: medium rewrite_reasoning_effort: "" rewrite_max_completion_tokens: 64000 @@ -24,25 +24,25 @@ model: claude_code_exec_use_sdk: auto claude_code_exec_effort: medium claude_code_exec_max_thinking_tokens: 16384 - codex_trace_to_teacher: true + codex_trace_to_optimizer: true azure_openai_endpoint: "" # e.g. "https://your-resource.openai.azure.com/" azure_openai_api_version: "2024-12-01-preview" azure_openai_api_key: "" # Fill locally if you do not export AZURE_OPENAI_API_KEY azure_openai_auth_mode: azure_cli azure_openai_ad_scope: "https://cognitiveservices.azure.com/.default" azure_openai_managed_identity_client_id: "" - teacher_azure_openai_endpoint: "" # e.g. "https://your-resource.openai.azure.com/" - teacher_azure_openai_api_version: "2024-12-01-preview" - teacher_azure_openai_api_key: "" - teacher_azure_openai_auth_mode: azure_cli - teacher_azure_openai_ad_scope: "https://cognitiveservices.azure.com/.default" - teacher_azure_openai_managed_identity_client_id: "" - student_azure_openai_endpoint: "" # e.g. "https://your-resource.openai.azure.com/" - student_azure_openai_api_version: "2024-12-01-preview" - student_azure_openai_api_key: "" - student_azure_openai_auth_mode: azure_cli - student_azure_openai_ad_scope: "https://cognitiveservices.azure.com/.default" - student_azure_openai_managed_identity_client_id: "" + optimizer_azure_openai_endpoint: "" # e.g. 
"https://your-resource.openai.azure.com/" + optimizer_azure_openai_api_version: "2024-12-01-preview" + optimizer_azure_openai_api_key: "" + optimizer_azure_openai_auth_mode: azure_cli + optimizer_azure_openai_ad_scope: "https://cognitiveservices.azure.com/.default" + optimizer_azure_openai_managed_identity_client_id: "" + target_azure_openai_endpoint: "" # e.g. "https://your-resource.openai.azure.com/" + target_azure_openai_api_version: "2024-12-01-preview" + target_azure_openai_api_key: "" + target_azure_openai_auth_mode: azure_cli + target_azure_openai_ad_scope: "https://cognitiveservices.azure.com/.default" + target_azure_openai_managed_identity_client_id: "" train: num_epochs: 4 @@ -57,9 +57,6 @@ gradient: analyst_workers: 16 max_analyst_rounds: 3 failure_only: false - use_deep_reflect: false - deep_reflect_failures: 4 - deep_reflect_successes: 2 optimizer: learning_rate: 4 # max edits per step (edit_budget) @@ -67,8 +64,6 @@ optimizer: lr_scheduler: cosine # constant / linear / cosine / autonomous lr_control_mode: fixed # fixed / autonomous / none skill_update_mode: patch # patch / rewrite_from_suggestions / full_rewrite_minibatch - use_meta_reflect: false - meta_learning_rate: 4 # max edits per epoch-level meta-reflect use_slow_update: true slow_update_samples: 20 longitudinal_pair_policy: mixed # mixed / changed / unchanged @@ -89,5 +84,5 @@ env: split_dir: "" data_path: "" split_output_dir: "" - exec_timeout: 120 # per student model/code-agent call timeout in seconds + exec_timeout: 120 # per target model/code-agent call timeout in seconds out_root: "" diff --git a/configs/alfworld/default.yaml b/configs/alfworld/default.yaml index 69b1049..d769224 100644 --- a/configs/alfworld/default.yaml +++ b/configs/alfworld/default.yaml @@ -10,7 +10,6 @@ gradient: optimizer: learning_rate: 4 - use_meta_reflect: false evaluation: sel_env_num: 0 diff --git a/docs/guide/configuration.md b/docs/guide/configuration.md index 68f844b..55a3a86 100644 --- 
a/docs/guide/configuration.md +++ b/docs/guide/configuration.md @@ -25,8 +25,8 @@ Benchmark configs inherit from `_base_/default.yaml` and override specific value ```yaml model: backend: azure_openai # azure_openai | openai_chat | claude_code_exec | qwen - teacher: gpt-5.5 # Teacher model (for reflection) - student: gpt-5.5 # Student model (for rollout) + optimizer: gpt-5.5 # Optimizer model (for reflection) + target: gpt-5.5 # Target model (for rollout) ``` ### Training diff --git a/docs/guide/dl-analogy.md b/docs/guide/dl-analogy.md index 48b79a5..758566f 100644 --- a/docs/guide/dl-analogy.md +++ b/docs/guide/dl-analogy.md @@ -7,9 +7,9 @@ SkillOpt is designed around a core insight: **optimizing natural-language prompt | Deep Learning | SkillOpt | Description | |---|---|---| | **Model weights** | Skill document (Markdown) | The thing being optimized | -| **Forward pass** | Rollout | Student executes tasks using current skill | +| **Forward pass** | Rollout | Target executes tasks using current skill | | **Loss function** | Task evaluator | Scores task execution quality | -| **Backpropagation** | Reflect | Teacher analyzes failures → edit patches | +| **Backpropagation** | Reflect | Optimizer analyzes failures → edit patches | | **Gradients** | Edit patches | Proposed changes to the skill | | **Gradient aggregation** | Patch aggregation | Merge similar edits | | **Gradient clipping** | Edit selection | Cap max edits per step | @@ -21,7 +21,7 @@ SkillOpt is designed around a core insight: **optimizing natural-language prompt | **Training step** | Step | One rollout → reflect → update cycle | | **Epoch** | Epoch | Full pass with slow update + meta memory | | **Momentum** | Slow update | Longitudinal comparison at epoch boundary | -| **Meta-learning** | Meta skill | Cross-epoch teacher strategy memory | +| **Meta-learning** | Meta skill | Cross-epoch optimizer strategy memory | | **Batch size** | `batch_size` | Tasks sampled per rollout | | **Data parallelism** | 
`analyst_workers` | Parallel reflection workers | | **Training set** | Train split | Items used for rollout | @@ -44,7 +44,7 @@ From our experiments, these DL intuitions transfer well: - **Cosine schedule > constant** — same as in DL, cosine annealing helps convergence - **Moderate LR (4-16) > very high/low** — too few edits = slow learning, too many = noisy - **Slow update helps** — longitudinal comparison prevents catastrophic forgetting across epochs - - **Meta skill memory improves reflection** — teacher benefits from cross-epoch strategy notes + - **Meta skill memory improves reflection** — optimizer benefits from cross-epoch strategy notes !!! warning "What doesn't transfer" - **Batch size ≠ better** — larger rollout batches have diminishing returns due to API costs diff --git a/docs/guide/first-experiment.md b/docs/guide/first-experiment.md index e5a1a48..2a65589 100644 --- a/docs/guide/first-experiment.md +++ b/docs/guide/first-experiment.md @@ -33,7 +33,7 @@ optimizer: learning_rate: 4 # (max edits per step) lr_scheduler: cosine # (learning rate schedule) use_slow_update: true # (momentum at epoch boundary) - use_meta_skill: true # (cross-epoch teacher memory) + use_meta_skill: true # (cross-epoch optimizer memory) gradient: analyst_workers: 16 # (parallel reflection workers) diff --git a/docs/guide/new-benchmark.md b/docs/guide/new-benchmark.md index 5b1b8ad..091c385 100644 --- a/docs/guide/new-benchmark.md +++ b/docs/guide/new-benchmark.md @@ -76,7 +76,7 @@ class MyBenchmarkEnv(EnvAdapter): Args: item: The data item to process skill: Current skill document content - model: The student model instance + model: The target model instance Returns: TaskResult with prediction, score, and trajectory diff --git a/docs/guide/skill-document.md b/docs/guide/skill-document.md index bc6aaa8..62d1a34 100644 --- a/docs/guide/skill-document.md +++ b/docs/guide/skill-document.md @@ -70,7 +70,7 @@ Track your skill's evolution through: 1. 
**Start with a seed skill** (`env.skill_init`) if you have domain knowledge — it converges faster 2. **Use cosine LR schedule** — aggressive early exploration + careful late refinement 3. **Enable slow update** (`use_slow_update: true`) to prevent forgetting across epochs - 4. **Enable meta skill** (`use_meta_skill: true`) so the teacher accumulates strategy memory + 4. **Enable meta skill** (`use_meta_skill: true`) so the optimizer accumulates strategy memory ## Next Steps diff --git a/docs/guide/training-loop.md b/docs/guide/training-loop.md index 8c8d118..7922305 100644 --- a/docs/guide/training-loop.md +++ b/docs/guide/training-loop.md @@ -10,8 +10,8 @@ SkillOpt's core insight: **optimizing natural-language skill documents follows t │ │ │ for epoch in epochs: │ │ for step in steps: │ -│ 1. Rollout — Student executes tasks │ -│ 2. Reflect — Teacher analyzes trajectories │ +│ 1. Rollout — Target executes tasks │ +│ 2. Reflect — Optimizer analyzes trajectories │ │ 3. Aggregate — Hierarchical merge of patches │ │ 4. Select — Rank & clip edits (learning rate) │ │ 5. Update — Apply patches to skill doc │ @@ -27,7 +27,7 @@ SkillOpt's core insight: **optimizing natural-language skill documents follows t ### 1. Rollout (Forward Pass) -The **student** model executes tasks using the current skill document as its prompt. Each task produces a trajectory and a score. +The **target** model executes tasks using the current skill document as its prompt. Each task produces a trajectory and a score. ```python # Analogy: forward pass through the network @@ -37,7 +37,7 @@ scores = evaluate(predictions, ground_truth) ### 2. Reflect (Backward Pass) -The **teacher** model analyzes failed trajectories and produces **edit patches** — structured suggestions for improving the skill document. +The **optimizer** model analyzes failed trajectories and produces **edit patches** — structured suggestions for improving the skill document. 
Two modes: @@ -84,7 +84,7 @@ At the end of each epoch (starting from epoch 2), the system performs a **longit ### Meta Skill -A **meta-skill memory** accumulates high-level strategy notes across the entire training run. At the end of each epoch, the teacher reflects on what changed between epochs and produces a compact memory that is provided as additional context during future reflection steps. +A **meta-skill memory** accumulates high-level strategy notes across the entire training run. At the end of each epoch, the optimizer reflects on what changed between epochs and produces a compact memory that is provided as additional context during future reflection steps. ## Next Steps diff --git a/docs/index.md b/docs/index.md index abbc6e6..2dfe6da 100644 --- a/docs/index.md +++ b/docs/index.md @@ -26,7 +26,7 @@ hide:
🎯
Rollout
-
Student executes tasks
+
Target executes tasks
@@ -34,7 +34,7 @@ hide:
🔍
Reflect
-
Teacher analyzes trajectories
+
Optimizer analyzes trajectories
@@ -88,8 +88,8 @@ SkillOpt brings the familiar deep-learning training paradigm to agentic prompt o | Deep Learning | SkillOpt | |---|---| | Model weights | Skill document (Markdown) | -| Forward pass | Rollout (student executes tasks) | -| Loss / gradient | Reflect (teacher produces edit patches) | +| Forward pass | Rollout (target executes tasks) | +| Loss / gradient | Reflect (optimizer produces edit patches) | | Gradient clipping | Edit selection (`learning_rate` = max edits) | | SGD step | Patch application to skill | | Validation set | Gated evaluation on selection split | diff --git a/docs/reference/config.md b/docs/reference/config.md index 43aff23..eec0472 100644 --- a/docs/reference/config.md +++ b/docs/reference/config.md @@ -7,8 +7,8 @@ Complete reference for all SkillOpt configuration parameters. | Parameter | Type | Default | Description | |---|---|---|---| | `model.backend` | str | `azure_openai` | Backend: `azure_openai` / `openai_chat` / `claude_code_exec` / `qwen` | -| `model.teacher` | str | `gpt-5.5` | Teacher model (for reflection & slow update) | -| `model.student` | str | `gpt-5.5` | Student model (for rollout execution) | +| `model.optimizer` | str | `gpt-5.5` | Optimizer model (for reflection & slow update) | +| `model.target` | str | `gpt-5.5` | Target model (for rollout execution) | | `model.reasoning_effort` | str | `medium` | Reasoning effort level | ## Training (`train`) @@ -40,7 +40,7 @@ Complete reference for all SkillOpt configuration parameters. 
| `optimizer.skill_update_mode` | str | `patch` | — | `patch` / `rewrite_from_suggestions` / `full_rewrite_minibatch` | | `optimizer.use_slow_update` | bool | `true` | Momentum | Epoch-boundary longitudinal comparison & guidance | | `optimizer.slow_update_samples` | int | 20 | — | Samples for slow update evaluation | -| `optimizer.use_meta_skill` | bool | `true` | Meta-learning | Cross-epoch teacher-side strategy memory | +| `optimizer.use_meta_skill` | bool | `true` | Meta-learning | Cross-epoch optimizer-side strategy memory | | `optimizer.longitudinal_pair_policy` | str | `mixed` | — | `mixed` / `changed` / `unchanged` | ## Evaluation (`evaluation`) diff --git a/scripts/eval_only.py b/scripts/eval_only.py index a14d3c5..ec6cd37 100644 --- a/scripts/eval_only.py +++ b/scripts/eval_only.py @@ -1,5 +1,5 @@ #!/usr/bin/env python3 -"""ReflACT eval-only: run a single skill on a dataset without training. +"""SkillOpt eval-only: run a single skill on a dataset without training. Usage ----- @@ -29,10 +29,10 @@ from skillopt.model import ( configure_claude_code_exec, configure_codex_exec, set_reasoning_effort, - set_student_backend, - set_student_deployment, - set_teacher_backend, - set_teacher_deployment, + set_target_backend, + set_target_deployment, + set_optimizer_backend, + set_optimizer_deployment, ) from skillopt.model.common import default_model_for_backend, normalize_backend_name @@ -126,7 +126,7 @@ _BOOL = lambda x: str(x).lower() in ("true", "1", "yes") # noqa: E731 def parse_args() -> argparse.Namespace: - p = argparse.ArgumentParser(description="ReflACT eval-only") + p = argparse.ArgumentParser(description="SkillOpt eval-only") p.add_argument("--config", type=str, required=True) p.add_argument("--skill", type=str, required=True, help="Path to skill .md file to evaluate") @@ -138,10 +138,10 @@ def parse_args() -> argparse.Namespace: p.add_argument("--env", type=str) p.add_argument("--backend", type=str, choices=["azure_openai", "codex", "codex_exec", "claude", 
"claude_chat", "claude_code_exec"]) - p.add_argument("--teacher_model", type=str) - p.add_argument("--student_model", type=str) - p.add_argument("--teacher_backend", type=str) - p.add_argument("--student_backend", type=str) + p.add_argument("--optimizer_model", type=str) + p.add_argument("--target_model", type=str) + p.add_argument("--optimizer_backend", type=str) + p.add_argument("--target_backend", type=str) p.add_argument("--reasoning_effort", type=str, choices=["", "low", "medium", "high", "xhigh", "max"]) p.add_argument("--azure_endpoint", type=str) @@ -153,18 +153,18 @@ def parse_args() -> argparse.Namespace: p.add_argument("--azure_openai_auth_mode", type=str) p.add_argument("--azure_openai_ad_scope", type=str) p.add_argument("--azure_openai_managed_identity_client_id", type=str) - p.add_argument("--teacher_azure_openai_endpoint", type=str) - p.add_argument("--teacher_azure_openai_api_version", type=str) - p.add_argument("--teacher_azure_openai_api_key", type=str) - p.add_argument("--teacher_azure_openai_auth_mode", type=str) - p.add_argument("--teacher_azure_openai_ad_scope", type=str) - p.add_argument("--teacher_azure_openai_managed_identity_client_id", type=str) - p.add_argument("--student_azure_openai_endpoint", type=str) - p.add_argument("--student_azure_openai_api_version", type=str) - p.add_argument("--student_azure_openai_api_key", type=str) - p.add_argument("--student_azure_openai_auth_mode", type=str) - p.add_argument("--student_azure_openai_ad_scope", type=str) - p.add_argument("--student_azure_openai_managed_identity_client_id", type=str) + p.add_argument("--optimizer_azure_openai_endpoint", type=str) + p.add_argument("--optimizer_azure_openai_api_version", type=str) + p.add_argument("--optimizer_azure_openai_api_key", type=str) + p.add_argument("--optimizer_azure_openai_auth_mode", type=str) + p.add_argument("--optimizer_azure_openai_ad_scope", type=str) + p.add_argument("--optimizer_azure_openai_managed_identity_client_id", type=str) + 
p.add_argument("--target_azure_openai_endpoint", type=str) + p.add_argument("--target_azure_openai_api_version", type=str) + p.add_argument("--target_azure_openai_api_key", type=str) + p.add_argument("--target_azure_openai_auth_mode", type=str) + p.add_argument("--target_azure_openai_ad_scope", type=str) + p.add_argument("--target_azure_openai_managed_identity_client_id", type=str) p.add_argument("--codex_exec_path", type=str) p.add_argument("--codex_exec_sandbox", type=str) p.add_argument("--codex_exec_profile", type=str) @@ -214,10 +214,10 @@ def main() -> None: from skillopt.config import apply_overrides _MAP = { "backend": "model.backend", - "teacher_model": "model.teacher", - "student_model": "model.student", - "teacher_backend": "model.teacher_backend", - "student_backend": "model.student_backend", + "optimizer_model": "model.optimizer", + "target_model": "model.target", + "optimizer_backend": "model.optimizer_backend", + "target_backend": "model.target_backend", "reasoning_effort": "model.reasoning_effort", "azure_endpoint": "model.azure_endpoint", "azure_api_version": "model.azure_api_version", @@ -228,18 +228,18 @@ def main() -> None: "azure_openai_auth_mode": "model.azure_openai_auth_mode", "azure_openai_ad_scope": "model.azure_openai_ad_scope", "azure_openai_managed_identity_client_id": "model.azure_openai_managed_identity_client_id", - "teacher_azure_openai_endpoint": "model.teacher_azure_openai_endpoint", - "teacher_azure_openai_api_version": "model.teacher_azure_openai_api_version", - "teacher_azure_openai_api_key": "model.teacher_azure_openai_api_key", - "teacher_azure_openai_auth_mode": "model.teacher_azure_openai_auth_mode", - "teacher_azure_openai_ad_scope": "model.teacher_azure_openai_ad_scope", - "teacher_azure_openai_managed_identity_client_id": "model.teacher_azure_openai_managed_identity_client_id", - "student_azure_openai_endpoint": "model.student_azure_openai_endpoint", - "student_azure_openai_api_version": 
"model.student_azure_openai_api_version", - "student_azure_openai_api_key": "model.student_azure_openai_api_key", - "student_azure_openai_auth_mode": "model.student_azure_openai_auth_mode", - "student_azure_openai_ad_scope": "model.student_azure_openai_ad_scope", - "student_azure_openai_managed_identity_client_id": "model.student_azure_openai_managed_identity_client_id", + "optimizer_azure_openai_endpoint": "model.optimizer_azure_openai_endpoint", + "optimizer_azure_openai_api_version": "model.optimizer_azure_openai_api_version", + "optimizer_azure_openai_api_key": "model.optimizer_azure_openai_api_key", + "optimizer_azure_openai_auth_mode": "model.optimizer_azure_openai_auth_mode", + "optimizer_azure_openai_ad_scope": "model.optimizer_azure_openai_ad_scope", + "optimizer_azure_openai_managed_identity_client_id": "model.optimizer_azure_openai_managed_identity_client_id", + "target_azure_openai_endpoint": "model.target_azure_openai_endpoint", + "target_azure_openai_api_version": "model.target_azure_openai_api_version", + "target_azure_openai_api_key": "model.target_azure_openai_api_key", + "target_azure_openai_auth_mode": "model.target_azure_openai_auth_mode", + "target_azure_openai_ad_scope": "model.target_azure_openai_ad_scope", + "target_azure_openai_managed_identity_client_id": "model.target_azure_openai_managed_identity_client_id", "codex_exec_path": "model.codex_exec_path", "codex_exec_sandbox": "model.codex_exec_sandbox", "codex_exec_profile": "model.codex_exec_profile", @@ -288,7 +288,7 @@ def main() -> None: explicit_backend = str(option).split("=", 1)[1].strip() break - backend = normalize_backend_name(cfg.get("model_backend") or cfg.get("student_backend") or "azure_openai") + backend = normalize_backend_name(cfg.get("model_backend") or cfg.get("target_backend") or "azure_openai") def _has_model_override(dotted_key: str, legacy_key: str) -> bool: if getattr(args, legacy_key, None) is not None: @@ -303,43 +303,43 @@ def main() -> None: backend = 
normalize_backend_name(explicit_backend) cfg["model_backend"] = backend if backend in {"claude", "claude_chat"}: - cfg.setdefault("teacher_backend", "claude_chat") - cfg.setdefault("student_backend", "claude_chat") + cfg.setdefault("optimizer_backend", "claude_chat") + cfg.setdefault("target_backend", "claude_chat") elif backend in {"codex", "codex_exec"}: - cfg.setdefault("teacher_backend", "openai_chat") - cfg.setdefault("student_backend", "codex_exec") + cfg.setdefault("optimizer_backend", "openai_chat") + cfg.setdefault("target_backend", "codex_exec") elif backend == "claude_code_exec": - cfg.setdefault("teacher_backend", "openai_chat") - cfg.setdefault("student_backend", "claude_code_exec") + cfg.setdefault("optimizer_backend", "openai_chat") + cfg.setdefault("target_backend", "claude_code_exec") else: - cfg.setdefault("teacher_backend", "openai_chat") - cfg.setdefault("student_backend", "openai_chat") + cfg.setdefault("optimizer_backend", "openai_chat") + cfg.setdefault("target_backend", "openai_chat") else: - cfg.setdefault("teacher_backend", "openai_chat") - cfg.setdefault("student_backend", "openai_chat") + cfg.setdefault("optimizer_backend", "openai_chat") + cfg.setdefault("target_backend", "openai_chat") - if cfg.get("teacher_backend") == "claude_chat": + if cfg.get("optimizer_backend") == "claude_chat": if ( - str(cfg.get("teacher_model", "") or "").strip() in _OPENAI_DEFAULT_MODEL_SENTINELS - and not _has_model_override("model.teacher", "teacher_model") + str(cfg.get("optimizer_model", "") or "").strip() in _OPENAI_DEFAULT_MODEL_SENTINELS + and not _has_model_override("model.optimizer", "optimizer_model") ): - cfg["teacher_model"] = default_model_for_backend("claude_chat") - if cfg.get("student_backend") == "claude_chat": + cfg["optimizer_model"] = default_model_for_backend("claude_chat") + if cfg.get("target_backend") == "claude_chat": if ( - str(cfg.get("student_model", "") or "").strip() in _OPENAI_DEFAULT_MODEL_SENTINELS - and not 
_has_model_override("model.student", "student_model") + str(cfg.get("target_model", "") or "").strip() in _OPENAI_DEFAULT_MODEL_SENTINELS + and not _has_model_override("model.target", "target_model") ): - cfg["student_model"] = default_model_for_backend("claude_chat") - if cfg.get("student_backend") == "claude_code_exec": + cfg["target_model"] = default_model_for_backend("claude_chat") + if cfg.get("target_backend") == "claude_code_exec": if ( - str(cfg.get("student_model", "") or "").strip() in _OPENAI_DEFAULT_MODEL_SENTINELS - and not _has_model_override("model.student", "student_model") + str(cfg.get("target_model", "") or "").strip() in _OPENAI_DEFAULT_MODEL_SENTINELS + and not _has_model_override("model.target", "target_model") ): - cfg["student_model"] = default_model_for_backend("claude_chat") + cfg["target_model"] = default_model_for_backend("claude_chat") if not cfg.get("out_root"): env = cfg.get("env", "unknown") - model = cfg.get("student_model", "unknown").replace("/", "-") + model = cfg.get("target_model", "unknown").replace("/", "-") ts = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") cfg["out_root"] = os.path.join("outputs", f"eval_{env}_{model}_{ts}") @@ -362,27 +362,27 @@ def main() -> None: auth_mode=cfg.get("azure_openai_auth_mode") or None, ad_scope=cfg.get("azure_openai_ad_scope") or None, managed_identity_client_id=cfg.get("azure_openai_managed_identity_client_id") or None, - teacher_endpoint=cfg.get("teacher_azure_openai_endpoint") or None, - teacher_api_version=cfg.get("teacher_azure_openai_api_version") or None, - teacher_api_key=cfg.get("teacher_azure_openai_api_key") or None, - teacher_auth_mode=cfg.get("teacher_azure_openai_auth_mode") or None, - teacher_ad_scope=cfg.get("teacher_azure_openai_ad_scope") or None, - teacher_managed_identity_client_id=( - cfg.get("teacher_azure_openai_managed_identity_client_id") or None + optimizer_endpoint=cfg.get("optimizer_azure_openai_endpoint") or None, + 
optimizer_api_version=cfg.get("optimizer_azure_openai_api_version") or None, + optimizer_api_key=cfg.get("optimizer_azure_openai_api_key") or None, + optimizer_auth_mode=cfg.get("optimizer_azure_openai_auth_mode") or None, + optimizer_ad_scope=cfg.get("optimizer_azure_openai_ad_scope") or None, + optimizer_managed_identity_client_id=( + cfg.get("optimizer_azure_openai_managed_identity_client_id") or None ), - student_endpoint=cfg.get("student_azure_openai_endpoint") or None, - student_api_version=cfg.get("student_azure_openai_api_version") or None, - student_api_key=cfg.get("student_azure_openai_api_key") or None, - student_auth_mode=cfg.get("student_azure_openai_auth_mode") or None, - student_ad_scope=cfg.get("student_azure_openai_ad_scope") or None, - student_managed_identity_client_id=( - cfg.get("student_azure_openai_managed_identity_client_id") or None + target_endpoint=cfg.get("target_azure_openai_endpoint") or None, + target_api_version=cfg.get("target_azure_openai_api_version") or None, + target_api_key=cfg.get("target_azure_openai_api_key") or None, + target_auth_mode=cfg.get("target_azure_openai_auth_mode") or None, + target_ad_scope=cfg.get("target_azure_openai_ad_scope") or None, + target_managed_identity_client_id=( + cfg.get("target_azure_openai_managed_identity_client_id") or None ), ) - set_teacher_backend(cfg.get("teacher_backend", "openai_chat")) - set_student_backend(cfg.get("student_backend", "openai_chat")) - set_teacher_deployment(cfg.get("teacher_model", default_model_for_backend(backend))) - set_student_deployment(cfg.get("student_model", default_model_for_backend(backend))) + set_optimizer_backend(cfg.get("optimizer_backend", "openai_chat")) + set_target_backend(cfg.get("target_backend", "openai_chat")) + set_optimizer_deployment(cfg.get("optimizer_model", default_model_for_backend(backend))) + set_target_deployment(cfg.get("target_model", default_model_for_backend(backend))) configure_codex_exec( path=cfg.get("codex_exec_path", "codex"), 
sandbox=cfg.get("codex_exec_sandbox", "workspace-write"), diff --git a/scripts/run_alfworld.sh b/scripts/run_alfworld.sh index 54ee8b7..05c5c93 100755 --- a/scripts/run_alfworld.sh +++ b/scripts/run_alfworld.sh @@ -1,28 +1,26 @@ #!/usr/bin/env bash # ────────────────────────────────────────────────────────────────────────────── -# ReflACT — ALFWorld training launch script +# SkillOpt — ALFWorld training launch script +# +# Prerequisites: +# pip install -e ".[alfworld]" +# pip install alfworld[full] && alfworld-download # # Usage: # bash scripts/run_alfworld.sh # bash scripts/run_alfworld.sh --num_epochs 2 --edit_budget 6 +# bash scripts/run_alfworld.sh --split_dir /path/to/alfworld_split # ────────────────────────────────────────────────────────────────────────────── set -euo pipefail -# ── Paths ──────────────────────────────────────────────────────────────────── -WORKSPACE="${WORKSPACE:-$(cd "$(dirname "$0")/../.." && pwd)}" SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" PROJECT_ROOT="$(dirname "${SCRIPT_DIR}")" -# Activate conda environment -export PATH="${WORKSPACE}/miniconda3/envs/skillopt/bin:${WORKSPACE}/miniconda3/bin:${PATH}" - -# ALFWorld data — uses ~/.cache/alfworld by default (standard alfworld location) -export ALFWORLD_DATA="${ALFWORLD_DATA:-${HOME}/.cache/alfworld}" - -# Ensure ReflACT is importable export PYTHONPATH="${PROJECT_ROOT}:${PYTHONPATH:-}" -# ── Verify ALFWorld data exists ────────────────────────────────────────────── +# ALFWorld data — uses ~/.cache/alfworld by default +export ALFWORLD_DATA="${ALFWORLD_DATA:-${HOME}/.cache/alfworld}" + if [ ! -d "${ALFWORLD_DATA}/json_2.1.1" ]; then echo "ERROR: ALFWorld data not found at ${ALFWORLD_DATA}/json_2.1.1" echo "" @@ -34,25 +32,17 @@ if [ ! 
-d "${ALFWORLD_DATA}/json_2.1.1" ]; then exit 1 fi -# ── Azure OpenAI credentials ──────────────────────────────────────────────── -export AZURE_OPENAI_ENDPOINT="${AZURE_OPENAI_ENDPOINT:?Set AZURE_OPENAI_ENDPOINT}" -export AZURE_OPENAI_API_KEY="${AZURE_OPENAI_API_KEY:?Set AZURE_OPENAI_API_KEY}" -export AZURE_OPENAI_API_VERSION="${AZURE_OPENAI_API_VERSION:-2025-04-01-preview}" +OPTIMIZER_MODEL="${OPTIMIZER_MODEL:-gpt-5.5}" +TARGET_MODEL="${TARGET_MODEL:-gpt-5.5}" -# ── Model configuration ───────────────────────────────────────────────────── -export TEACHER_DEPLOYMENT="${TEACHER_DEPLOYMENT:-gpt-5.5}" -export STUDENT_DEPLOYMENT="${STUDENT_DEPLOYMENT:-gpt-5.5}" - -# ── Output directory ───────────────────────────────────────────────────────── TIMESTAMP=$(date +%Y%m%d_%H%M%S) -DEFAULT_OUT_ROOT="${PROJECT_ROOT}/outputs/skillopt_alfworld_${STUDENT_DEPLOYMENT}_${TIMESTAMP}" +DEFAULT_OUT_ROOT="${PROJECT_ROOT}/outputs/skillopt_alfworld_${TARGET_MODEL}_${TIMESTAMP}" -# ── Run ────────────────────────────────────────────────────────────────────── echo "============================================================" -echo " ReflACT — Reflective Agent Tuning (ALFWorld)" +echo " SkillOpt — ALFWorld Training" echo "============================================================" -echo " Teacher: ${TEACHER_DEPLOYMENT}" -echo " Student: ${STUDENT_DEPLOYMENT}" +echo " Optimizer: ${OPTIMIZER_MODEL}" +echo " Target: ${TARGET_MODEL}" echo " ALFWORLD_DATA: ${ALFWORLD_DATA}" echo " Output: ${DEFAULT_OUT_ROOT}" echo "============================================================" @@ -60,7 +50,9 @@ echo "============================================================" cd "${PROJECT_ROOT}" python scripts/train.py \ - --config configs/alfworld_default.yaml \ + --config configs/alfworld/default.yaml \ + --optimizer_model "${OPTIMIZER_MODEL}" \ + --target_model "${TARGET_MODEL}" \ --out_root "${DEFAULT_OUT_ROOT}" \ "$@" diff --git a/scripts/run_searchqa.sh b/scripts/run_searchqa.sh index 
16bb1d2..0f7a7cb 100755 --- a/scripts/run_searchqa.sh +++ b/scripts/run_searchqa.sh @@ -1,41 +1,38 @@ #!/usr/bin/env bash # ────────────────────────────────────────────────────────────────────────────── -# ReflACT — SearchQA training launch script +# SkillOpt — SearchQA training launch script # # Usage: # bash scripts/run_searchqa.sh -# bash scripts/run_searchqa.sh --data_path data/searchqa_train_2000.json # bash scripts/run_searchqa.sh --num_epochs 2 --edit_budget 6 +# bash scripts/run_searchqa.sh --split_dir /path/to/searchqa_split # ────────────────────────────────────────────────────────────────────────────── set -euo pipefail -# ── Paths ──────────────────────────────────────────────────────────────────── SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" PROJECT_ROOT="$(dirname "${SCRIPT_DIR}")" -# Ensure ReflACT is importable export PYTHONPATH="${PROJECT_ROOT}:${PYTHONPATH:-}" -# ── Model configuration ───────────────────────────────────────────────────── -export TEACHER_DEPLOYMENT="${TEACHER_DEPLOYMENT:-gpt-5.5}" -export STUDENT_DEPLOYMENT="${STUDENT_DEPLOYMENT:-gpt-5.5}" +OPTIMIZER_MODEL="${OPTIMIZER_MODEL:-gpt-5.5}" +TARGET_MODEL="${TARGET_MODEL:-gpt-5.5}" -# ── Output directory ───────────────────────────────────────────────────────── TIMESTAMP=$(date +%Y%m%d_%H%M%S) -DEFAULT_OUT_ROOT="${PROJECT_ROOT}/outputs/skillopt_searchqa_${STUDENT_DEPLOYMENT}_${TIMESTAMP}" +DEFAULT_OUT_ROOT="${PROJECT_ROOT}/outputs/skillopt_searchqa_${TARGET_MODEL}_${TIMESTAMP}" -# ── Run ────────────────────────────────────────────────────────────────────── echo "============================================================" -echo " ReflACT — Reflective Agent Tuning (SearchQA)" +echo " SkillOpt — SearchQA Training" echo "============================================================" -echo " Teacher: ${TEACHER_DEPLOYMENT}" -echo " Student: ${STUDENT_DEPLOYMENT}" +echo " Optimizer: ${OPTIMIZER_MODEL}" +echo " Target: ${TARGET_MODEL}" echo 
"============================================================" cd "${PROJECT_ROOT}" python scripts/train.py \ - --config configs/searchqa_default.yaml \ + --config configs/searchqa/default.yaml \ + --optimizer_model "${OPTIMIZER_MODEL}" \ + --target_model "${TARGET_MODEL}" \ --out_root "${DEFAULT_OUT_ROOT}" \ "$@" diff --git a/scripts/run_spreadsheetbench.sh b/scripts/run_spreadsheetbench.sh index 74e998f..bcbb32c 100755 --- a/scripts/run_spreadsheetbench.sh +++ b/scripts/run_spreadsheetbench.sh @@ -1,46 +1,37 @@ #!/usr/bin/env bash # ────────────────────────────────────────────────────────────────────────────── -# ReflACT — SpreadsheetBench training launch script +# SkillOpt — SpreadsheetBench training launch script # # Usage: -# bash scripts/run_spreadsheetbench.sh \ -# --data_root /path/to/data \ -# --jsonl_path /path/to/benchmark.jsonl -# -# bash scripts/run_spreadsheetbench.sh \ -# --data_root /path/to/data \ -# --jsonl_path /path/to/benchmark.jsonl \ -# --num_epochs 2 --edit_budget 6 +# bash scripts/run_spreadsheetbench.sh --split_dir /path/to/split --data_root /path/to/data +# bash scripts/run_spreadsheetbench.sh --num_epochs 2 --edit_budget 6 # ────────────────────────────────────────────────────────────────────────────── set -euo pipefail -# ── Paths ──────────────────────────────────────────────────────────────────── SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" PROJECT_ROOT="$(dirname "${SCRIPT_DIR}")" -# Ensure ReflACT is importable export PYTHONPATH="${PROJECT_ROOT}:${PYTHONPATH:-}" -# ── Model configuration ───────────────────────────────────────────────────── -export TEACHER_DEPLOYMENT="${TEACHER_DEPLOYMENT:-gpt-5.5}" -export STUDENT_DEPLOYMENT="${STUDENT_DEPLOYMENT:-gpt-5.5}" +OPTIMIZER_MODEL="${OPTIMIZER_MODEL:-gpt-5.5}" +TARGET_MODEL="${TARGET_MODEL:-gpt-5.5}" -# ── Output directory ───────────────────────────────────────────────────────── TIMESTAMP=$(date +%Y%m%d_%H%M%S) 
-DEFAULT_OUT_ROOT="${PROJECT_ROOT}/outputs/skillopt_spreadsheetbench_${STUDENT_DEPLOYMENT}_${TIMESTAMP}" +DEFAULT_OUT_ROOT="${PROJECT_ROOT}/outputs/skillopt_spreadsheetbench_${TARGET_MODEL}_${TIMESTAMP}" -# ── Run ────────────────────────────────────────────────────────────────────── echo "============================================================" -echo " ReflACT — Reflective Agent Tuning (SpreadsheetBench)" +echo " SkillOpt — SpreadsheetBench Training" echo "============================================================" -echo " Teacher: ${TEACHER_DEPLOYMENT}" -echo " Student: ${STUDENT_DEPLOYMENT}" +echo " Optimizer: ${OPTIMIZER_MODEL}" +echo " Target: ${TARGET_MODEL}" echo "============================================================" cd "${PROJECT_ROOT}" python scripts/train.py \ - --config configs/spreadsheetbench_default.yaml \ + --config configs/spreadsheetbench/default.yaml \ + --optimizer_model "${OPTIMIZER_MODEL}" \ + --target_model "${TARGET_MODEL}" \ --out_root "${DEFAULT_OUT_ROOT}" \ "$@" diff --git a/scripts/train.py b/scripts/train.py index bf510b5..4d9473f 100644 --- a/scripts/train.py +++ b/scripts/train.py @@ -1,5 +1,5 @@ #!/usr/bin/env python3 -"""ReflACT unified training entry point. +"""SkillOpt unified training entry point. 
Usage ----- @@ -125,7 +125,7 @@ _BOOL = lambda x: x.lower() in ("true", "1", "yes") # noqa: E731 def parse_args() -> argparse.Namespace: p = argparse.ArgumentParser( - description="ReflACT: Reflective Agent Tuning", + description="SkillOpt: Executive Strategy for Self-Evolving Agent Skills", formatter_class=argparse.RawDescriptionHelpFormatter, epilog=__doc__, ) @@ -138,10 +138,10 @@ def parse_args() -> argparse.Namespace: p.add_argument("--env", type=str) p.add_argument("--backend", type=str, choices=["azure_openai", "codex", "codex_exec", "claude", "claude_chat", "claude_code_exec", "qwen", "qwen_chat"]) - p.add_argument("--teacher_model", type=str) - p.add_argument("--student_model", type=str) - p.add_argument("--teacher_backend", type=str) - p.add_argument("--student_backend", type=str) + p.add_argument("--optimizer_model", type=str) + p.add_argument("--target_model", type=str) + p.add_argument("--optimizer_backend", type=str) + p.add_argument("--target_backend", type=str) p.add_argument("--reasoning_effort", type=str, choices=["", "low", "medium", "high", "xhigh", "max"]) p.add_argument("--rewrite_reasoning_effort", type=str) @@ -155,18 +155,18 @@ def parse_args() -> argparse.Namespace: p.add_argument("--azure_openai_auth_mode", type=str) p.add_argument("--azure_openai_ad_scope", type=str) p.add_argument("--azure_openai_managed_identity_client_id", type=str) - p.add_argument("--teacher_azure_openai_endpoint", type=str) - p.add_argument("--teacher_azure_openai_api_version", type=str) - p.add_argument("--teacher_azure_openai_api_key", type=str) - p.add_argument("--teacher_azure_openai_auth_mode", type=str) - p.add_argument("--teacher_azure_openai_ad_scope", type=str) - p.add_argument("--teacher_azure_openai_managed_identity_client_id", type=str) - p.add_argument("--student_azure_openai_endpoint", type=str) - p.add_argument("--student_azure_openai_api_version", type=str) - p.add_argument("--student_azure_openai_api_key", type=str) - 
p.add_argument("--student_azure_openai_auth_mode", type=str) - p.add_argument("--student_azure_openai_ad_scope", type=str) - p.add_argument("--student_azure_openai_managed_identity_client_id", type=str) + p.add_argument("--optimizer_azure_openai_endpoint", type=str) + p.add_argument("--optimizer_azure_openai_api_version", type=str) + p.add_argument("--optimizer_azure_openai_api_key", type=str) + p.add_argument("--optimizer_azure_openai_auth_mode", type=str) + p.add_argument("--optimizer_azure_openai_ad_scope", type=str) + p.add_argument("--optimizer_azure_openai_managed_identity_client_id", type=str) + p.add_argument("--target_azure_openai_endpoint", type=str) + p.add_argument("--target_azure_openai_api_version", type=str) + p.add_argument("--target_azure_openai_api_key", type=str) + p.add_argument("--target_azure_openai_auth_mode", type=str) + p.add_argument("--target_azure_openai_ad_scope", type=str) + p.add_argument("--target_azure_openai_managed_identity_client_id", type=str) p.add_argument("--qwen_chat_base_url", type=str) p.add_argument("--qwen_chat_api_key", type=str) p.add_argument("--qwen_chat_temperature", type=float) @@ -187,7 +187,7 @@ def parse_args() -> argparse.Namespace: p.add_argument("--claude_code_exec_use_sdk", type=str) p.add_argument("--claude_code_exec_effort", type=str) p.add_argument("--claude_code_exec_max_thinking_tokens", type=int) - p.add_argument("--codex_trace_to_teacher", type=_BOOL) + p.add_argument("--codex_trace_to_optimizer", type=_BOOL) p.add_argument("--skill_init", type=str) p.add_argument("--num_epochs", type=int) p.add_argument("--train_size", type=int) @@ -212,8 +212,6 @@ def parse_args() -> argparse.Namespace: p.add_argument("--analyst_workers", type=int) p.add_argument("--failure_only", type=_BOOL) p.add_argument("--minibatch_size", type=int) - p.add_argument("--use_meta_reflect", type=_BOOL) - p.add_argument("--meta_edit_budget", type=int) p.add_argument("--skill_update_mode", type=str, choices=[ "patch", @@ -224,9 
+222,6 @@ def parse_args() -> argparse.Namespace: "full_rewrite_minibatch", "minibatch_full_rewrite", ]) - p.add_argument("--use_deep_reflect", type=_BOOL) - p.add_argument("--deep_reflect_failures", type=int) - p.add_argument("--deep_reflect_successes", type=int) p.add_argument("--use_slow_update", type=_BOOL) p.add_argument("--slow_update_samples", type=int) p.add_argument("--longitudinal_pair_policy", type=str, @@ -260,10 +255,10 @@ def parse_args() -> argparse.Namespace: _LEGACY_TO_STRUCTURED: dict[str, str] = { "backend": "model.backend", - "teacher_model": "model.teacher", - "student_model": "model.student", - "teacher_backend": "model.teacher_backend", - "student_backend": "model.student_backend", + "optimizer_model": "model.optimizer", + "target_model": "model.target", + "optimizer_backend": "model.optimizer_backend", + "target_backend": "model.target_backend", "reasoning_effort": "model.reasoning_effort", "rewrite_reasoning_effort": "model.rewrite_reasoning_effort", "rewrite_max_completion_tokens": "model.rewrite_max_completion_tokens", @@ -276,18 +271,18 @@ _LEGACY_TO_STRUCTURED: dict[str, str] = { "azure_openai_auth_mode": "model.azure_openai_auth_mode", "azure_openai_ad_scope": "model.azure_openai_ad_scope", "azure_openai_managed_identity_client_id": "model.azure_openai_managed_identity_client_id", - "teacher_azure_openai_endpoint": "model.teacher_azure_openai_endpoint", - "teacher_azure_openai_api_version": "model.teacher_azure_openai_api_version", - "teacher_azure_openai_api_key": "model.teacher_azure_openai_api_key", - "teacher_azure_openai_auth_mode": "model.teacher_azure_openai_auth_mode", - "teacher_azure_openai_ad_scope": "model.teacher_azure_openai_ad_scope", - "teacher_azure_openai_managed_identity_client_id": "model.teacher_azure_openai_managed_identity_client_id", - "student_azure_openai_endpoint": "model.student_azure_openai_endpoint", - "student_azure_openai_api_version": "model.student_azure_openai_api_version", - 
"student_azure_openai_api_key": "model.student_azure_openai_api_key", - "student_azure_openai_auth_mode": "model.student_azure_openai_auth_mode", - "student_azure_openai_ad_scope": "model.student_azure_openai_ad_scope", - "student_azure_openai_managed_identity_client_id": "model.student_azure_openai_managed_identity_client_id", + "optimizer_azure_openai_endpoint": "model.optimizer_azure_openai_endpoint", + "optimizer_azure_openai_api_version": "model.optimizer_azure_openai_api_version", + "optimizer_azure_openai_api_key": "model.optimizer_azure_openai_api_key", + "optimizer_azure_openai_auth_mode": "model.optimizer_azure_openai_auth_mode", + "optimizer_azure_openai_ad_scope": "model.optimizer_azure_openai_ad_scope", + "optimizer_azure_openai_managed_identity_client_id": "model.optimizer_azure_openai_managed_identity_client_id", + "target_azure_openai_endpoint": "model.target_azure_openai_endpoint", + "target_azure_openai_api_version": "model.target_azure_openai_api_version", + "target_azure_openai_api_key": "model.target_azure_openai_api_key", + "target_azure_openai_auth_mode": "model.target_azure_openai_auth_mode", + "target_azure_openai_ad_scope": "model.target_azure_openai_ad_scope", + "target_azure_openai_managed_identity_client_id": "model.target_azure_openai_managed_identity_client_id", "qwen_chat_base_url": "model.qwen_chat_base_url", "qwen_chat_api_key": "model.qwen_chat_api_key", "qwen_chat_temperature": "model.qwen_chat_temperature", @@ -308,7 +303,7 @@ _LEGACY_TO_STRUCTURED: dict[str, str] = { "claude_code_exec_use_sdk": "model.claude_code_exec_use_sdk", "claude_code_exec_effort": "model.claude_code_exec_effort", "claude_code_exec_max_thinking_tokens": "model.claude_code_exec_max_thinking_tokens", - "codex_trace_to_teacher": "model.codex_trace_to_teacher", + "codex_trace_to_optimizer": "model.codex_trace_to_optimizer", "num_epochs": "train.num_epochs", "train_size": "train.train_size", "steps_per_epoch": "train.steps_per_epoch", @@ -320,16 +315,11 @@ 
_LEGACY_TO_STRUCTURED: dict[str, str] = { "analyst_workers": "gradient.analyst_workers", "max_analyst_rounds": "gradient.max_analyst_rounds", "failure_only": "gradient.failure_only", - "use_deep_reflect": "gradient.use_deep_reflect", - "deep_reflect_failures": "gradient.deep_reflect_failures", - "deep_reflect_successes": "gradient.deep_reflect_successes", "edit_budget": "optimizer.learning_rate", "min_edit_budget": "optimizer.min_learning_rate", "lr_scheduler": "optimizer.lr_scheduler", "lr_control_mode": "optimizer.lr_control_mode", "skill_update_mode": "optimizer.skill_update_mode", - "use_meta_reflect": "optimizer.use_meta_reflect", - "meta_edit_budget": "optimizer.meta_learning_rate", "use_slow_update": "optimizer.use_slow_update", "slow_update_samples": "optimizer.slow_update_samples", "longitudinal_pair_policy": "optimizer.longitudinal_pair_policy", @@ -387,7 +377,7 @@ def load_config(args: argparse.Namespace) -> dict: explicit_backend = str(option).split("=", 1)[1].strip() break - backend = normalize_backend_name(flat.get("model_backend") or flat.get("student_backend") or "azure_openai") + backend = normalize_backend_name(flat.get("model_backend") or flat.get("target_backend") or "azure_openai") def _has_model_override(dotted_key: str, legacy_key: str) -> bool: if getattr(args, legacy_key, None) is not None: @@ -402,53 +392,53 @@ def load_config(args: argparse.Namespace) -> dict: backend = normalize_backend_name(explicit_backend) flat["model_backend"] = backend if backend in {"claude", "claude_chat"}: - flat.setdefault("teacher_backend", "claude_chat") - flat.setdefault("student_backend", "claude_chat") + flat.setdefault("optimizer_backend", "claude_chat") + flat.setdefault("target_backend", "claude_chat") elif backend in {"codex", "codex_exec"}: - flat.setdefault("teacher_backend", "openai_chat") - flat.setdefault("student_backend", "codex_exec") + flat.setdefault("optimizer_backend", "openai_chat") + flat.setdefault("target_backend", "codex_exec") elif 
backend == "claude_code_exec": - flat.setdefault("teacher_backend", "openai_chat") - flat.setdefault("student_backend", "claude_code_exec") + flat.setdefault("optimizer_backend", "openai_chat") + flat.setdefault("target_backend", "claude_code_exec") elif backend in {"qwen", "qwen_chat"}: - flat.setdefault("teacher_backend", "openai_chat") - flat.setdefault("student_backend", "qwen_chat") + flat.setdefault("optimizer_backend", "openai_chat") + flat.setdefault("target_backend", "qwen_chat") else: - flat.setdefault("teacher_backend", "openai_chat") - flat.setdefault("student_backend", "openai_chat") + flat.setdefault("optimizer_backend", "openai_chat") + flat.setdefault("target_backend", "openai_chat") else: - flat.setdefault("teacher_backend", "openai_chat") - flat.setdefault("student_backend", "openai_chat") + flat.setdefault("optimizer_backend", "openai_chat") + flat.setdefault("target_backend", "openai_chat") - if flat.get("teacher_backend") == "claude_chat": + if flat.get("optimizer_backend") == "claude_chat": if ( - str(flat.get("teacher_model", "") or "").strip() in _OPENAI_DEFAULT_MODEL_SENTINELS - and not _has_model_override("model.teacher", "teacher_model") + str(flat.get("optimizer_model", "") or "").strip() in _OPENAI_DEFAULT_MODEL_SENTINELS + and not _has_model_override("model.optimizer", "optimizer_model") ): - flat["teacher_model"] = default_model_for_backend("claude_chat") - if flat.get("student_backend") == "claude_chat": + flat["optimizer_model"] = default_model_for_backend("claude_chat") + if flat.get("target_backend") == "claude_chat": if ( - str(flat.get("student_model", "") or "").strip() in _OPENAI_DEFAULT_MODEL_SENTINELS - and not _has_model_override("model.student", "student_model") + str(flat.get("target_model", "") or "").strip() in _OPENAI_DEFAULT_MODEL_SENTINELS + and not _has_model_override("model.target", "target_model") ): - flat["student_model"] = default_model_for_backend("claude_chat") - if flat.get("student_backend") == 
"claude_code_exec": + flat["target_model"] = default_model_for_backend("claude_chat") + if flat.get("target_backend") == "claude_code_exec": if ( - str(flat.get("student_model", "") or "").strip() in _OPENAI_DEFAULT_MODEL_SENTINELS - and not _has_model_override("model.student", "student_model") + str(flat.get("target_model", "") or "").strip() in _OPENAI_DEFAULT_MODEL_SENTINELS + and not _has_model_override("model.target", "target_model") ): - flat["student_model"] = default_model_for_backend("claude_chat") - if flat.get("student_backend") == "qwen_chat": + flat["target_model"] = default_model_for_backend("claude_chat") + if flat.get("target_backend") == "qwen_chat": if ( - str(flat.get("student_model", "") or "").strip() in _OPENAI_DEFAULT_MODEL_SENTINELS - and not _has_model_override("model.student", "student_model") + str(flat.get("target_model", "") or "").strip() in _OPENAI_DEFAULT_MODEL_SENTINELS + and not _has_model_override("model.target", "target_model") ): - flat["student_model"] = default_model_for_backend("qwen_chat") + flat["target_model"] = default_model_for_backend("qwen_chat") # Auto-generate output root if not flat.get("out_root"): env = flat.get("env", "unknown") - model = flat.get("teacher_model", "unknown").replace("/", "-") + model = flat.get("optimizer_model", "unknown").replace("/", "-") ts = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") flat["out_root"] = os.path.join("outputs", f"skillopt_{env}_{model}_{ts}") @@ -463,13 +453,13 @@ def main() -> None: cfg = load_config(args) print(f"\n{'='*60}") - print(f" ReflACT — Reflective Agent Tuning") + print(f" SkillOpt — Executive Strategy for Self-Evolving Agent Skills") print(f"{'='*60}") print(f" env: {cfg.get('env')}") - print(f" teacher_model: {cfg.get('teacher_model')}") - print(f" student_model: {cfg.get('student_model')}") - print(f" teacher_backend:{cfg.get('teacher_backend', 'openai_chat')}") - print(f" student_backend:{cfg.get('student_backend', 'openai_chat')}") + print(f" 
optimizer_model: {cfg.get('optimizer_model')}") + print(f" target_model: {cfg.get('target_model')}") + print(f" optimizer_backend:{cfg.get('optimizer_backend', 'openai_chat')}") + print(f" target_backend:{cfg.get('target_backend', 'openai_chat')}") print(f" reasoning: {cfg.get('reasoning_effort') or 'off'}") print(f" rewrite_effort: {cfg.get('rewrite_reasoning_effort') or 'off'}") print(f" epochs: {cfg.get('num_epochs')}") @@ -482,8 +472,8 @@ def main() -> None: print(f" min_edit_budget:{cfg.get('min_edit_budget', 2)}") print(f" minibatch_size: {cfg.get('minibatch_size')}") print(f" seed: {cfg.get('seed')}") - print(f" meta_reflect: {cfg.get('use_meta_reflect', False)}") print(f" meta_skill: {cfg.get('use_meta_skill', False)}") + print(f" slow_update: {cfg.get('use_slow_update', False)}") print(f" out_root: {cfg.get('out_root')}") print(f"{'='*60}\n") diff --git a/skillopt/__init__.py b/skillopt/__init__.py index 1e957a9..a41cfaa 100644 --- a/skillopt/__init__.py +++ b/skillopt/__init__.py @@ -21,7 +21,6 @@ from skillopt.types import ( # noqa: F401 FailureSummaryEntry, GateAction, GateResult, - MetaReflectResult, Patch, RawPatch, RolloutResult, diff --git a/skillopt/config.py b/skillopt/config.py index fec39ed..cc61a27 100644 --- a/skillopt/config.py +++ b/skillopt/config.py @@ -30,10 +30,10 @@ _STRUCTURED_SECTIONS = frozenset({ _FLATTEN_MAP: dict[str, str] = { "model.backend": "model_backend", - "model.teacher": "teacher_model", - "model.student": "student_model", - "model.teacher_backend": "teacher_backend", - "model.student_backend": "student_backend", + "model.optimizer": "optimizer_model", + "model.target": "target_model", + "model.optimizer_backend": "optimizer_backend", + "model.target_backend": "target_backend", "model.reasoning_effort": "reasoning_effort", "model.rewrite_reasoning_effort": "rewrite_reasoning_effort", "model.rewrite_max_completion_tokens": "rewrite_max_completion_tokens", @@ -51,7 +51,7 @@ _FLATTEN_MAP: dict[str, str] = { 
"model.claude_code_exec_use_sdk": "claude_code_exec_use_sdk", "model.claude_code_exec_effort": "claude_code_exec_effort", "model.claude_code_exec_max_thinking_tokens": "claude_code_exec_max_thinking_tokens", - "model.codex_trace_to_teacher": "codex_trace_to_teacher", + "model.codex_trace_to_optimizer": "codex_trace_to_optimizer", "model.azure_endpoint": "azure_endpoint", "model.azure_api_version": "azure_api_version", "model.azure_api_key": "azure_api_key", @@ -61,18 +61,18 @@ _FLATTEN_MAP: dict[str, str] = { "model.azure_openai_auth_mode": "azure_openai_auth_mode", "model.azure_openai_ad_scope": "azure_openai_ad_scope", "model.azure_openai_managed_identity_client_id": "azure_openai_managed_identity_client_id", - "model.teacher_azure_openai_endpoint": "teacher_azure_openai_endpoint", - "model.teacher_azure_openai_api_version": "teacher_azure_openai_api_version", - "model.teacher_azure_openai_api_key": "teacher_azure_openai_api_key", - "model.teacher_azure_openai_auth_mode": "teacher_azure_openai_auth_mode", - "model.teacher_azure_openai_ad_scope": "teacher_azure_openai_ad_scope", - "model.teacher_azure_openai_managed_identity_client_id": "teacher_azure_openai_managed_identity_client_id", - "model.student_azure_openai_endpoint": "student_azure_openai_endpoint", - "model.student_azure_openai_api_version": "student_azure_openai_api_version", - "model.student_azure_openai_api_key": "student_azure_openai_api_key", - "model.student_azure_openai_auth_mode": "student_azure_openai_auth_mode", - "model.student_azure_openai_ad_scope": "student_azure_openai_ad_scope", - "model.student_azure_openai_managed_identity_client_id": "student_azure_openai_managed_identity_client_id", + "model.optimizer_azure_openai_endpoint": "optimizer_azure_openai_endpoint", + "model.optimizer_azure_openai_api_version": "optimizer_azure_openai_api_version", + "model.optimizer_azure_openai_api_key": "optimizer_azure_openai_api_key", + "model.optimizer_azure_openai_auth_mode": 
"optimizer_azure_openai_auth_mode", + "model.optimizer_azure_openai_ad_scope": "optimizer_azure_openai_ad_scope", + "model.optimizer_azure_openai_managed_identity_client_id": "optimizer_azure_openai_managed_identity_client_id", + "model.target_azure_openai_endpoint": "target_azure_openai_endpoint", + "model.target_azure_openai_api_version": "target_azure_openai_api_version", + "model.target_azure_openai_api_key": "target_azure_openai_api_key", + "model.target_azure_openai_auth_mode": "target_azure_openai_auth_mode", + "model.target_azure_openai_ad_scope": "target_azure_openai_ad_scope", + "model.target_azure_openai_managed_identity_client_id": "target_azure_openai_managed_identity_client_id", "model.qwen_chat_base_url": "qwen_chat_base_url", "model.qwen_chat_api_key": "qwen_chat_api_key", "model.qwen_chat_temperature": "qwen_chat_temperature", @@ -89,16 +89,12 @@ _FLATTEN_MAP: dict[str, str] = { "gradient.merge_batch_size": "merge_batch_size", "gradient.analyst_workers": "analyst_workers", "gradient.failure_only": "failure_only", - "gradient.use_deep_reflect": "use_deep_reflect", - "gradient.deep_reflect_failures": "deep_reflect_failures", - "gradient.deep_reflect_successes": "deep_reflect_successes", "gradient.max_analyst_rounds": "max_analyst_rounds", "optimizer.learning_rate": "edit_budget", "optimizer.min_learning_rate": "min_edit_budget", "optimizer.lr_scheduler": "lr_scheduler", "optimizer.lr_control_mode": "lr_control_mode", "optimizer.skill_update_mode": "skill_update_mode", - "optimizer.use_meta_reflect": "use_meta_reflect", "optimizer.meta_learning_rate": "meta_edit_budget", "optimizer.use_slow_update": "use_slow_update", "optimizer.slow_update_samples": "slow_update_samples", diff --git a/skillopt/engine/trainer.py b/skillopt/engine/trainer.py index 79f6049..d12982c 100644 --- a/skillopt/engine/trainer.py +++ b/skillopt/engine/trainer.py @@ -26,7 +26,6 @@ from skillopt.datasets.base import BatchSpec from skillopt.envs.base import EnvAdapter from 
skillopt.evaluation.gate import evaluate_gate from skillopt.gradient.aggregate import merge_patches -from skillopt.optimizer.meta_reflect import build_epoch_history, run_meta_reflect from skillopt.optimizer.meta_skill import run_meta_skill from skillopt.optimizer.clip import rank_and_select from skillopt.optimizer.lr_autonomous import decide_autonomous_learning_rate @@ -56,10 +55,10 @@ from skillopt.model import ( get_token_summary, reset_token_tracker, set_reasoning_effort, - set_student_backend, - set_student_deployment, - set_teacher_backend, - set_teacher_deployment, + set_target_backend, + set_target_deployment, + set_optimizer_backend, + set_optimizer_deployment, ) from skillopt.utils import compute_score, skill_hash @@ -132,7 +131,7 @@ def _normalise_lr_control_mode(mode: str | None) -> str: "scheduled": "fixed", "autonomous": "autonomous", "auto": "autonomous", - "teacher": "autonomous", + "optimizer": "autonomous", "none": "none", "off": "none", "no_lr": "none", @@ -570,47 +569,47 @@ class ReflACTTrainer: auth_mode=cfg.get("azure_openai_auth_mode") or None, ad_scope=cfg.get("azure_openai_ad_scope") or None, managed_identity_client_id=cfg.get("azure_openai_managed_identity_client_id") or None, - teacher_endpoint=cfg.get("teacher_azure_openai_endpoint") or None, - teacher_api_version=cfg.get("teacher_azure_openai_api_version") or None, - teacher_api_key=cfg.get("teacher_azure_openai_api_key") or None, - teacher_auth_mode=cfg.get("teacher_azure_openai_auth_mode") or None, - teacher_ad_scope=cfg.get("teacher_azure_openai_ad_scope") or None, - teacher_managed_identity_client_id=( - cfg.get("teacher_azure_openai_managed_identity_client_id") or None + optimizer_endpoint=cfg.get("optimizer_azure_openai_endpoint") or None, + optimizer_api_version=cfg.get("optimizer_azure_openai_api_version") or None, + optimizer_api_key=cfg.get("optimizer_azure_openai_api_key") or None, + optimizer_auth_mode=cfg.get("optimizer_azure_openai_auth_mode") or None, + 
optimizer_ad_scope=cfg.get("optimizer_azure_openai_ad_scope") or None, + optimizer_managed_identity_client_id=( + cfg.get("optimizer_azure_openai_managed_identity_client_id") or None ), - student_endpoint=cfg.get("student_azure_openai_endpoint") or None, - student_api_version=cfg.get("student_azure_openai_api_version") or None, - student_api_key=cfg.get("student_azure_openai_api_key") or None, - student_auth_mode=cfg.get("student_azure_openai_auth_mode") or None, - student_ad_scope=cfg.get("student_azure_openai_ad_scope") or None, - student_managed_identity_client_id=( - cfg.get("student_azure_openai_managed_identity_client_id") or None + target_endpoint=cfg.get("target_azure_openai_endpoint") or None, + target_api_version=cfg.get("target_azure_openai_api_version") or None, + target_api_key=cfg.get("target_azure_openai_api_key") or None, + target_auth_mode=cfg.get("target_azure_openai_auth_mode") or None, + target_ad_scope=cfg.get("target_azure_openai_ad_scope") or None, + target_managed_identity_client_id=( + cfg.get("target_azure_openai_managed_identity_client_id") or None ), ) - teacher_backend = cfg.get("teacher_backend") - student_backend = cfg.get("student_backend") - if not teacher_backend or not student_backend: + optimizer_backend = cfg.get("optimizer_backend") + target_backend = cfg.get("target_backend") + if not optimizer_backend or not target_backend: if backend in {"claude", "claude_chat"}: - teacher_backend = teacher_backend or "claude_chat" - student_backend = student_backend or "claude_chat" + optimizer_backend = optimizer_backend or "claude_chat" + target_backend = target_backend or "claude_chat" elif backend in {"codex", "codex_exec"}: - teacher_backend = teacher_backend or "openai_chat" - student_backend = student_backend or "codex_exec" + optimizer_backend = optimizer_backend or "openai_chat" + target_backend = target_backend or "codex_exec" elif backend == "claude_code_exec": - teacher_backend = teacher_backend or "openai_chat" - 
student_backend = student_backend or "claude_code_exec" + optimizer_backend = optimizer_backend or "openai_chat" + target_backend = target_backend or "claude_code_exec" elif backend in {"qwen", "qwen_chat"}: - teacher_backend = teacher_backend or "openai_chat" - student_backend = student_backend or "qwen_chat" + optimizer_backend = optimizer_backend or "openai_chat" + target_backend = target_backend or "qwen_chat" else: - teacher_backend = teacher_backend or "openai_chat" - student_backend = student_backend or "openai_chat" - cfg["teacher_backend"] = teacher_backend - cfg["student_backend"] = student_backend - set_teacher_backend(teacher_backend) - set_student_backend(student_backend) - set_teacher_deployment(cfg["teacher_model"]) - set_student_deployment(cfg["student_model"]) + optimizer_backend = optimizer_backend or "openai_chat" + target_backend = target_backend or "openai_chat" + cfg["optimizer_backend"] = optimizer_backend + cfg["target_backend"] = target_backend + set_optimizer_backend(optimizer_backend) + set_target_backend(target_backend) + set_optimizer_deployment(cfg["optimizer_model"]) + set_target_deployment(cfg["target_model"]) configure_codex_exec( path=cfg.get("codex_exec_path", "codex"), sandbox=cfg.get("codex_exec_sandbox", "workspace-write"), @@ -637,19 +636,17 @@ class ReflACTTrainer: max_tokens=cfg.get("qwen_chat_max_tokens"), enable_thinking=cfg.get("qwen_chat_enable_thinking"), ) - os.environ["REFLACT_CODEX_TRACE_TO_TEACHER"] = ( + os.environ["REFLACT_CODEX_TRACE_TO_OPTIMIZER"] = ( "1" - if student_backend == "codex_exec" and cfg.get("codex_trace_to_teacher", False) + if target_backend == "codex_exec" and cfg.get("codex_trace_to_optimizer", False) else "0" ) reasoning = cfg.get("reasoning_effort", "") or None set_reasoning_effort(reasoning) - if student_backend == "claude_code_exec" and cfg.get("use_deep_reflect", False): - raise NotImplementedError("claude_code_exec does not support use_deep_reflect yet.") print( f" [model config] 
backend={backend} " - f"teacher={cfg['teacher_model']} ({teacher_backend}) " - f"student={cfg['student_model']} ({student_backend}) " + f"optimizer={cfg['optimizer_model']} ({optimizer_backend}) " + f"target={cfg['target_model']} ({target_backend}) " f"reasoning={reasoning or 'off'}" ) @@ -897,7 +894,7 @@ class ReflACTTrainer: epoch_rng.shuffle(shuffled_seeds) # Step buffer: accumulates per-step context (failure patterns + - # rejected edits) within this epoch so teachers see full history. + # rejected edits) within this epoch so optimizers see full history. step_buffer: list[dict] = [] active_meta_skill = ( _load_meta_skill_content(out_root, epoch - 1) @@ -948,7 +945,6 @@ class ReflACTTrainer: accum_rollout_stats: list[dict] = [] total_rollout_time = 0.0 total_reflect_time = 0.0 - total_deep_reflect_time = 0.0 for a in range(accumulation): batch_idx = step_in_epoch * accumulation + a @@ -1013,33 +1009,6 @@ class ReflACTTrainer: f"success_patches={len(success_patches)}" ) - deep_failure_patches: list[dict] = [] - deep_success_patches: list[dict] = [] - if cfg.get("use_deep_reflect", False): - t_phase = time.time() - deep_raw_patches = adapter.deep_reflect( - rollout_results, - current_skill, - batch_dir, - env_manager=train_env, - prediction_dir=pred_dir, - random_seed=batch_seed, - step_buffer_context=step_buffer_context, - meta_skill_context=active_meta_skill, - ) - deep_failure_patches, deep_success_patches = _normalise_patches( - deep_raw_patches, - update_mode=update_mode, - ) - all_failure_patches.extend(deep_failure_patches) - all_success_patches.extend(deep_success_patches) - all_raw_patches.extend(deep_raw_patches) - total_deep_reflect_time += time.time() - t_phase - print( - f" [2b/6 DEEP REFLECT] failure_patches={len(deep_failure_patches)} " - f"success_patches={len(deep_success_patches)}" - ) - # Track per-batch stats accum_rollout_stats.append({ "batch_idx": a, @@ -1049,8 +1018,6 @@ class ReflACTTrainer: "soft": r_soft, "n_failure_patches": 
len(failure_patches), "n_success_patches": len(success_patches), - "n_deep_failure_patches": len(deep_failure_patches), - "n_deep_success_patches": len(deep_success_patches), }) # ── End of accumulation loop ───────────────────────────── @@ -1066,8 +1033,6 @@ class ReflACTTrainer: step_rec["accumulation_batches"] = accum_rollout_stats step_rec["timing"]["rollout_s"] = round(total_rollout_time, 1) step_rec["timing"]["reflect_s"] = round(total_reflect_time, 1) - if cfg.get("use_deep_reflect", False): - step_rec["timing"]["deep_reflect_s"] = round(total_deep_reflect_time, 1) n_total_patches = len(all_failure_patches) + len(all_success_patches) step_rec["n_patches"] = n_total_patches @@ -1383,7 +1348,7 @@ class ReflACTTrainer: step_buffer.append(buf_entry) - # Persist for meta-reflect + # Persist step digest for step buffer context digest_path = os.path.join(step_dir, "trajectory_digest.json") with open(digest_path, "w") as f: json.dump(buf_entry, f, indent=2, ensure_ascii=False) @@ -1431,7 +1396,6 @@ class ReflACTTrainer: f"dt={step_rec['wall_time_s']}s\n" f" timing: rollout={timing.get('rollout_s',0)}s " f"reflect={timing.get('reflect_s',0)}s " - f"deep_reflect={timing.get('deep_reflect_s',0)}s " f"aggregate={timing.get('aggregate_s',0)}s " f"select={timing.get('select_s',0)}s " f"evaluate={timing.get('evaluate_s',0)}s" @@ -1463,12 +1427,17 @@ class ReflACTTrainer: epoch_comparison_pairs = None if ( slow_saved.get("slow_update_content") - and slow_saved.get("action") in {"accept", "accept_new_best"} + and slow_saved.get("action") in { + "accept", "accept_new_best", "force_accept", + } and epoch >= 2 ): current_skill = replace_slow_update_field( current_skill, slow_saved["slow_update_content"], ) + best_skill = replace_slow_update_field( + best_skill, slow_saved["slow_update_content"], + ) elif epoch == 1: # Epoch 1: inject empty placeholder os.makedirs(slow_dir, exist_ok=True) @@ -1577,7 +1546,7 @@ class ReflACTTrainer: # 5. 
Extract previous slow update guidance for reflection existing_guidance = extract_slow_update_field(current_skill) - # 6. Teacher analysis (with reflection on previous guidance) + # 6. Optimizer analysis (with reflection on previous guidance) slow_result = run_slow_update( current_skill, results_prev, @@ -1608,67 +1577,29 @@ class ReflACTTrainer: "observed across adjacent epochs." ) - if slow_candidate_hash in sel_cache: - slow_sel_hard, slow_sel_soft = sel_cache[slow_candidate_hash] - print( - f" [slow gate] cache hit: hard={slow_sel_hard:.4f}" - ) - else: - sel_env, sel_n = _build_eval_env( - split="valid_seen", - env_num=cfg["sel_env_num"], - seed=seed, - ) - print(f" [slow gate] selection items={sel_n}") - slow_eval_dir = os.path.join(slow_dir, "selection_eval") - slow_eval_results = adapter.rollout( - sel_env, slow_candidate, slow_eval_dir, - ) - slow_sel_hard, slow_sel_soft = compute_score(slow_eval_results) - sel_cache[slow_candidate_hash] = (slow_sel_hard, slow_sel_soft) - - slow_gate = evaluate_gate( - candidate_skill=slow_candidate, - cand_hard=slow_sel_hard, - current_skill=current_skill, - current_score=current_score, - best_skill=best_skill, - best_score=best_score, - best_step=best_step, - global_step=global_step, + # Slow update field is force-updated into both + # current_skill and best_skill unconditionally. + # The epoch-level longitudinal guidance should always + # persist — it must not be gated by step-level + # selection scores. 
+ slow_content = slow_result["slow_update_content"] + current_skill = replace_slow_update_field( + current_skill, slow_content, ) - slow_result["selection_hard"] = slow_sel_hard - slow_result["selection_soft"] = slow_sel_soft - slow_result["action"] = slow_gate.action - prev_current = current_score - prev_best = best_score - current_skill = slow_gate.current_skill - current_score = slow_gate.current_score - best_skill = slow_gate.best_skill - best_score = slow_gate.best_score - best_step = slow_gate.best_step - if slow_gate.action in {"accept", "accept_new_best"}: - current_origin = f"slow_update_epoch_{epoch:02d}" - if slow_gate.action == "accept_new_best": - best_origin = current_origin - print( - f" [slow gate] ACCEPT (new best) " - f"hard={slow_sel_hard:.4f} > prev best {prev_best:.4f}" - ) - elif slow_gate.action == "accept": - print( - f" [slow gate] ACCEPT " - f"hard={slow_sel_hard:.4f} > current={prev_current:.4f}" - ) - else: - print( - f" [slow gate] REJECT " - f"hard={slow_sel_hard:.4f} <= current={current_score:.4f}" - ) + best_skill = replace_slow_update_field( + best_skill, slow_content, + ) + # Update caches so downstream steps use the + # slow-update-injected skill for hashing. 
+ slow_candidate_hash = skill_hash(current_skill) + sel_cache[slow_candidate_hash] = (current_score, 0.0) + + slow_result["action"] = "force_accept" + current_origin = f"slow_update_epoch_{epoch:02d}" print( - f" [slow update] guidance written " - f"({len(slow_result['slow_update_content'])} chars), " + f" [slow update] force-injected into current & best " + f"({len(slow_content)} chars), " f"{slow_time}s" ) else: @@ -1693,7 +1624,7 @@ class ReflACTTrainer: f"current={current_score:.4f} best={best_score:.4f}" ) - # ── META SKILL (end of epoch, teacher-side memory) ───────── + # ── META SKILL (end of epoch, optimizer-side memory) ───────── use_meta_skill = cfg.get("use_meta_skill", False) if use_meta_skill: meta_skill_dir = os.path.join(out_root, "meta_skill", f"epoch_{epoch:02d}") @@ -1713,7 +1644,7 @@ class ReflACTTrainer: print( f"\n {'='*60}\n" f" META SKILL — Epoch {epoch} " - f"(teacher memory from epoch {epoch-1} vs {epoch})\n" + f"(optimizer memory from epoch {epoch-1} vs {epoch})\n" f" {'='*60}" ) @@ -1806,232 +1737,6 @@ class ReflACTTrainer: with open(meta_skill_done_path, "w") as f: json.dump(meta_skill_result, f, indent=2, ensure_ascii=False) - # ── META-REFLECT (end of epoch) ───────────────────────────── - use_meta = cfg.get("use_meta_reflect", False) - if use_meta: - # Collect this epoch's step records from history - epoch_records = [ - h for h in history if h.get("epoch") == epoch - ] - if epoch_records: - meta_step_tag = f"meta_epoch_{epoch}" - meta_dir = os.path.join(out_root, "meta_reflect", f"epoch_{epoch:02d}") - meta_done_path = os.path.join(meta_dir, "meta_result.json") - - # Resume support: skip if already done - if os.path.exists(meta_done_path): - with open(meta_done_path) as f: - meta_result = json.load(f) - meta_summary = meta_result.get("meta_summary", "") - meta_action = meta_result.get("action", "unknown") - print( - f"\n [META-REFLECT epoch {epoch}] " - f"resumed — {meta_action}" - ) - else: - os.makedirs(meta_dir, exist_ok=True) - 
print( - f"\n {'='*60}\n" - f" META-REFLECT — Epoch {epoch} " - f"({len(epoch_records)} steps)\n" - f" {'='*60}" - ) - - meta_edit_budget = cfg.get("meta_edit_budget", 4) - - # Build epoch history text - epoch_history_text = build_epoch_history( - epoch_records, out_root, - update_mode=update_mode, - ) - - # Load previous meta summary - prev_meta_path = os.path.join( - out_root, "meta_reflect", - f"epoch_{epoch - 1:02d}", "meta_result.json", - ) - prev_meta_summary = "" - if os.path.exists(prev_meta_path): - try: - with open(prev_meta_path) as f: - prev = json.load(f) - prev_meta_summary = prev.get("meta_summary", "") - except Exception: - pass - - # Get env-specific meta prompt if available - meta_system = adapter.get_meta_reflect_prompt() \ - if hasattr(adapter, "get_meta_reflect_prompt") else None - - # Run meta-reflect - t_meta = time.time() - meta_result = run_meta_reflect( - skill_content=current_skill, - epoch_history_text=epoch_history_text, - prev_meta_summary=prev_meta_summary, - meta_edit_budget=meta_edit_budget, - system_prompt=meta_system, - update_mode=update_mode, - ) - meta_time = round(time.time() - t_meta, 1) - - meta_items = get_payload_items(meta_result.get("patch", {}) if meta_result else {}, update_mode) - if meta_result and meta_items: - for item in meta_items: - item.setdefault("update_origin", "meta_reflect_momentum") - item.setdefault( - "update_target", - "Consolidate epoch-level accepted/rejected edit patterns.", - ) - meta_summary = meta_result.get("meta_summary", "") - print( - f" [meta-reflect] " - f"{len(meta_items)} {payload_label(update_mode)} proposed, " - f"{meta_time}s" - ) - - meta_rewrite_result = None - if update_mode == "rewrite_from_suggestions": - meta_rewrite_result = rewrite_skill_from_suggestions( - current_skill, - meta_result["patch"], - env=cfg.get("env"), - reasoning_effort=rewrite_reasoning_effort, - max_completion_tokens=rewrite_max_completion_tokens, - ) - if meta_rewrite_result and 
meta_rewrite_result.get("new_skill"): - meta_candidate = meta_rewrite_result["new_skill"] - meta_apply_report = [] - else: - meta_candidate = current_skill - meta_apply_report = [] - else: - meta_candidate, meta_apply_report = apply_patch_with_report( - current_skill, meta_result["patch"], - ) - meta_cand_hash = skill_hash(meta_candidate) - - # Save meta candidate - with open(os.path.join(meta_dir, "meta_candidate.md"), "w") as f: - f.write(meta_candidate) - with open(os.path.join(meta_dir, "meta_patch.json"), "w") as f: - json.dump(meta_result, f, indent=2, ensure_ascii=False) - if meta_apply_report: - with open(os.path.join(meta_dir, "meta_edit_apply_report.json"), "w") as f: - json.dump(meta_apply_report, f, indent=2, ensure_ascii=False) - if meta_rewrite_result: - with open(os.path.join(meta_dir, "meta_rewrite_result.json"), "w") as f: - json.dump(meta_rewrite_result, f, indent=2, ensure_ascii=False) - meta_result["rewrite_change_summary"] = meta_rewrite_result.get("change_summary", []) - - if update_mode == "rewrite_from_suggestions" and meta_rewrite_result is None: - meta_action = "skip_no_rewrite" - meta_result["action"] = meta_action - meta_result["meta_summary"] = meta_summary - meta_result["time_s"] = meta_time - print( - " [meta-reflect] no usable rewrite generated — " - f"skill unchanged, {meta_time}s" - ) - else: - # Gate: evaluate meta candidate - if meta_cand_hash in sel_cache: - meta_hard, meta_soft = sel_cache[meta_cand_hash] - print( - f" [meta-gate] " - f"cache hit: hard={meta_hard:.4f}" - ) - else: - sel_env, _ = _build_eval_env( - split="valid_seen", - env_num=cfg["sel_env_num"], - seed=seed, - ) - meta_eval_dir = os.path.join(meta_dir, "selection_eval") - meta_eval_results = adapter.rollout( - sel_env, meta_candidate, meta_eval_dir, - ) - meta_hard, meta_soft = compute_score(meta_eval_results) - sel_cache[meta_cand_hash] = (meta_hard, meta_soft) - - meta_gate = evaluate_gate( - candidate_skill=meta_candidate, - cand_hard=meta_hard, - 
current_skill=current_skill, - current_score=current_score, - best_skill=best_skill, - best_score=best_score, - best_step=best_step, - global_step=global_step, - ) - meta_action = meta_gate.action - prev_score = current_score - current_skill = meta_gate.current_skill - current_score = meta_gate.current_score - best_skill = meta_gate.best_skill - best_score = meta_gate.best_score - best_step = meta_gate.best_step - if meta_gate.action in {"accept", "accept_new_best"}: - current_origin = f"meta_reflect_epoch_{epoch:02d}" - if meta_gate.action == "accept_new_best": - best_origin = current_origin - if meta_gate.action == "accept_new_best": - print( - f" [meta-gate] ACCEPT (new best) " - f"hard={meta_hard:.4f} > " - f"prev best {prev_score:.4f}" - ) - elif meta_gate.action == "accept": - print( - f" [meta-gate] ACCEPT " - f"hard={meta_hard:.4f} > " - f"current={prev_score:.4f}" - ) - else: - print( - f" [meta-gate] REJECT " - f"hard={meta_hard:.4f} <= " - f"current={current_score:.4f}" - ) - - # Save meta result with gate outcome - meta_result["action"] = meta_action - meta_result["gate_score"] = meta_hard - meta_result["time_s"] = meta_time - meta_result["update_origin"] = "meta_reflect_momentum" - meta_result["update_target"] = ( - "Consolidate epoch-level editing directions that helped or hurt." 
- ) - else: - meta_summary = meta_result.get("meta_summary", "") if meta_result else "" - meta_action = f"skip_no_{payload_label(update_mode)}" - if meta_result is None: - meta_result = {} - meta_result["action"] = meta_action - meta_result["meta_summary"] = meta_summary - meta_result["time_s"] = meta_time - print( - f" [meta-reflect] no {payload_label(update_mode)} proposed — " - f"skill unchanged, {meta_time}s" - ) - - # Persist - with open(meta_done_path, "w") as f: - json.dump(meta_result, f, indent=2, ensure_ascii=False) - - # Save updated skill after meta-reflect - _save_skill(out_root, global_step, current_skill) - with open(os.path.join(out_root, "best_skill.md"), "w") as f: - f.write(best_skill) - _persist_runtime_state(global_step) - - print( - f"\n [META-REFLECT epoch {epoch} done] " - f"action={meta_action} " - f"current={current_score:.4f} " - f"best={best_score:.4f}" - ) - # ── Save best skill ────────────────────────────────────────────── with open(os.path.join(out_root, "best_skill.md"), "w") as f: f.write(best_skill) diff --git a/skillopt/envs/_template/config_template.yaml b/skillopt/envs/_template/config_template.yaml index 80dc569..74369b9 100644 --- a/skillopt/envs/_template/config_template.yaml +++ b/skillopt/envs/_template/config_template.yaml @@ -31,7 +31,7 @@ optimizer: learning_rate: 4 # Max edits per step (edit budget) lr_scheduler: cosine # cosine | linear | constant | autonomous use_slow_update: true # Epoch-boundary momentum - use_meta_skill: true # Cross-epoch teacher memory + use_meta_skill: true # Cross-epoch optimizer memory # ── Evaluation ─────────────────────────────────── evaluation: @@ -41,5 +41,5 @@ evaluation: # ── Model ──────────────────────────────────────── model: backend: azure_openai # azure_openai | openai_chat | claude_code_exec | qwen - teacher: gpt-5.5 - student: gpt-5.5 + optimizer: gpt-4o + target: gpt-4o diff --git a/skillopt/envs/_template/env_template.py b/skillopt/envs/_template/env_template.py index 
5eac94c..5b0b2d3 100644 --- a/skillopt/envs/_template/env_template.py +++ b/skillopt/envs/_template/env_template.py @@ -4,7 +4,7 @@ Benchmark Environment Template Copy this file and implement the TODO sections to add a new benchmark. The EnvAdapter is responsible for: -1. Executing tasks using the student model + current skill document +1. Executing tasks using the target model + current skill document 2. Evaluating predictions against ground truth 3. Returning structured results for the training loop """ @@ -25,12 +25,12 @@ class TemplateBenchmarkEnv(EnvAdapter): async def execute(self, item, skill: str, model): """ - Execute a single task with the student model. + Execute a single task with the target model. Args: item: DataItem with .id, .input, .ground_truth, .metadata skill: Current skill document content (Markdown string) - model: Student model backend instance + model: Target model backend instance Returns: TaskResult with prediction, score, and trajectory @@ -38,7 +38,7 @@ class TemplateBenchmarkEnv(EnvAdapter): # Step 1: Build the prompt combining skill + task input prompt = self.build_prompt(item, skill) - # Step 2: Call the student model + # Step 2: Call the target model # TODO: Customize the message format for your benchmark messages = [ {"role": "system", "content": skill}, diff --git a/skillopt/envs/alfworld/adapter.py b/skillopt/envs/alfworld/adapter.py index 9fd0909..41bf73b 100644 --- a/skillopt/envs/alfworld/adapter.py +++ b/skillopt/envs/alfworld/adapter.py @@ -9,7 +9,6 @@ from dataclasses import dataclass import json import os -from skillopt.gradient.deep_probe import generate_deep_probe_instruction from skillopt.datasets.base import BatchSpec from skillopt.envs.base import EnvAdapter from skillopt.envs.alfworld.dataloader import ALFWorldDataLoader @@ -82,11 +81,7 @@ class ALFWorldAdapter(EnvAdapter): analyst_workers: int = 16, failure_only: bool = False, minibatch_size: int = 8, - edit_budget: int = 4, - use_deep_reflect: bool = False, - 
deep_reflect_failures: int = 4, - deep_reflect_successes: int = 2, - ) -> None: + edit_budget: int = 4, ) -> None: self.max_steps = max_steps self.workers = max(int(workers or 1), 1) self.max_api_workers = max_api_workers @@ -94,9 +89,6 @@ class ALFWorldAdapter(EnvAdapter): self.failure_only = failure_only self.minibatch_size = minibatch_size self.edit_budget = edit_budget - self.use_deep_reflect = use_deep_reflect - self.deep_reflect_failures = deep_reflect_failures - self.deep_reflect_successes = deep_reflect_successes self.dataloader = ALFWorldDataLoader( split_dir=split_dir, data_path=data_path, @@ -457,129 +449,6 @@ class ALFWorldAdapter(EnvAdapter): meta_skill_context=meta_skill_context, ) - def deep_reflect( - self, - results: list[dict], - skill_content: str, - out_dir: str, - **kwargs, - ) -> list[dict | None]: - if not self.use_deep_reflect: - return [] - - prediction_dir = kwargs.get("prediction_dir", os.path.join(out_dir, "predictions")) - random_seed = kwargs.get("random_seed") - step_buffer_context = kwargs.get("step_buffer_context", "") - meta_skill_context = kwargs.get("meta_skill_context", "") - selected_items = self.select_representative_items( - results, - results, - n_failures=self.deep_reflect_failures, - n_successes=self.deep_reflect_successes, - seed=random_seed, - ) - if not selected_items: - return [] - - selected_ids = {str(item["id"]) for item in selected_items} - selected_results = [row for row in results if str(row.get("id")) in selected_ids] - selected_examples = self.attach_reference_context(selected_results, selected_items) - - field_counts: dict[str, int] = {} - selected_metadata: list[dict] = [] - for item in selected_items: - meta = self.get_reference_metadata(item) - for field in meta["fields"]: - field_counts[field] = field_counts.get(field, 0) + 1 - selected_metadata.append({ - "id": str(item["id"]), - "task_type": str(item.get("task_type") or "alfworld"), - "gamefile": str(item.get("gamefile") or ""), - "reference_fields": 
meta["fields"], - "reference_preview": meta["preview"], - }) - - deep_dir = os.path.join(out_dir, "deep_reflect") - rollout_dir = os.path.join(deep_dir, "rollout") - patches_dir = os.path.join(deep_dir, "patches") - os.makedirs(deep_dir, exist_ok=True) - field_summary = ", ".join( - f"{field}({count}/{len(selected_items)})" - for field, count in sorted(field_counts.items()) - ) or "none" - print( - f" [2b/6 DEEP REFLECT setup] selected={len(selected_items)} " - f"reference_fields={field_summary}" - ) - probe = generate_deep_probe_instruction( - skill_content=skill_content, - items=selected_examples, - prediction_dir=prediction_dir, - system_prompt=self.get_deep_probe_prompt(), - step_buffer_context=step_buffer_context, - meta_skill_context=meta_skill_context, - output_requirements=[ - "- Some trajectories may include a hidden Reference block. Use it to target the student's latent subgoal, missing precondition, or next-step intent, but do not reveal or paraphrase that reference to the student.", - "- The instruction must request a brief diagnostic readout inside the existing ... 
block.", - "- The student must still output exactly one admissible action inside ....", - "- Do not ask for exhaustive inventories, full plans, or long chain-of-thought.", - "- The instruction text should be ready to append directly to the student's prompt.", - ], - ) - if not probe: - return [] - - with open(os.path.join(deep_dir, "probe.json"), "w", encoding="utf-8") as f: - json.dump( - { - **probe, - "reference_summary": { - "selected_count": len(selected_items), - "field_counts": field_counts, - }, - "selected_examples": selected_metadata, - }, - f, - ensure_ascii=False, - indent=2, - ) - - gamefiles = [str(item.get("gamefile") or "") for item in selected_items] - if any(not gamefile for gamefile in gamefiles): - return [] - eval_dataset, is_train = self._infer_dataset_from_gamefile(gamefiles[0]) - deep_env = ALFWorldBatchRun( - env_num=len(selected_items), - eval_dataset=eval_dataset, - seed=random_seed or 42, - is_train=is_train, - specific_gamefiles=gamefiles, - workers=min(self.workers, max(len(selected_items), 1)), - result_ids=[str(item["id"]) for item in selected_items], - ) - deep_results = self._run_batch( - deep_env, - skill_content=skill_content, - out_dir=rollout_dir, - diagnostic_mode=True, - diagnostic_instruction=probe["probe_instruction"], - ) - deep_results = self.attach_reference_context(deep_results, selected_items) - return run_minibatch_reflect( - results=deep_results, - skill_content=skill_content, - prediction_dir=os.path.join(rollout_dir, "predictions"), - patches_dir=patches_dir, - workers=self.analyst_workers, - failure_only=self.failure_only, - minibatch_size=self.minibatch_size, - edit_budget=self.edit_budget, - random_seed=random_seed, - error_system=self.get_error_minibatch_prompt(), - success_system=self.get_success_minibatch_prompt(), - step_buffer_context=step_buffer_context, - meta_skill_context=meta_skill_context, - ) def get_task_types(self) -> list[str]: return list(TASKS) diff --git a/skillopt/envs/alfworld/rollout.py 
b/skillopt/envs/alfworld/rollout.py index 0faf867..afd84dd 100644 --- a/skillopt/envs/alfworld/rollout.py +++ b/skillopt/envs/alfworld/rollout.py @@ -15,7 +15,7 @@ import time import concurrent.futures import numpy as np -from skillopt.model import chat_student +from skillopt.model import chat_target # ── Constants ───────────────────────────────────────────────────────────────── @@ -210,7 +210,7 @@ def run_alfworld_batch( def call_api(idx): try: - response, _ = chat_student( + response, _ = chat_target( system="You are an expert agent operating in the ALFRED Embodied Environment.", user=prompts[idx], max_completion_tokens=max_completion_tokens, diff --git a/skillopt/envs/base.py b/skillopt/envs/base.py index 4267944..c2e57ea 100644 --- a/skillopt/envs/base.py +++ b/skillopt/envs/base.py @@ -31,7 +31,6 @@ import os import random from skillopt.datasets.base import BaseDataLoader, BatchSpec -from skillopt.model.codex_harness import extract_codex_trace_prefix, format_codex_trace_steps, parse_codex_raw from skillopt.prompts import load_prompt @@ -60,24 +59,8 @@ class EnvAdapter(ABC): """Return whether this adapter requires Ray runtime initialization.""" return False - def deep_reflect( - self, - results: list[dict], - skill_content: str, - out_dir: str, - **kwargs, - ) -> list[dict | None]: - """Optional deeper diagnostic reflection pass. - - Default behavior is a no-op. Dataset-backed adapters may override this - to re-query the student on a small representative subset of the current - batch using minimally-perturbed diagnostic prompts that expose - intermediate reasoning state. 
- """ - return [] - def build_reference_text(self, item: dict) -> str: - """Return hidden reference material for deep reflection, if any.""" + """Return hidden reference material for reflection, if any.""" return str(item.get("reference_text") or "").strip() def get_reference_metadata(self, item: dict) -> dict: @@ -90,65 +73,6 @@ class EnvAdapter(ABC): "preview": reference_text[:400], } - def get_codex_deep_probe_prompt(self) -> str | None: - env_name = getattr(self, "_cfg", {}).get("env_name") - return load_prompt("deep_probe_codex", env=env_name) - - def attach_codex_probe_context( - self, - results: list[dict], - prediction_dir: str, - ) -> list[dict]: - """Attach compact Codex step metadata for codex-aware deep reflection.""" - enriched: list[dict] = [] - for row in results: - merged = dict(row) - tid = str(row.get("id")) - raw_path = os.path.join(prediction_dir, tid, "codex_raw.txt") - if os.path.exists(raw_path): - with open(raw_path, encoding="utf-8") as f: - raw = f.read() - parsed = parse_codex_raw(raw) - merged["codex_probe_trace_steps"] = format_codex_trace_steps(raw) - merged["codex_probe_step_count"] = len(parsed["steps"]) - enriched.append(merged) - return enriched - - def resolve_codex_probe_target( - self, - *, - selected_items: list[dict], - selected_examples: list[dict], - prediction_dir: str, - probe: dict, - ) -> tuple[list[dict], dict[str, str] | None, dict]: - """Resolve the teacher-selected codex probe target and raw trace prefix.""" - target_id = str(probe.get("probe_target_id", "")).strip() - selected_id_set = {str(item["id"]) for item in selected_items} - if target_id not in selected_id_set: - target_id = str(selected_items[0]["id"]) - target_item = next(item for item in selected_items if str(item["id"]) == target_id) - target_result = next( - (row for row in selected_examples if str(row.get("id")) == target_id), - None, - ) - max_probe_step = int((target_result or {}).get("codex_probe_step_count", 0)) - default_probe_step = max_probe_step 
- 1 if max_probe_step > 1 else max_probe_step - probe_after_step = int(probe.get("probe_after_step", default_probe_step)) - if max_probe_step > 0: - probe_after_step = max(0, min(probe_after_step, max_probe_step)) - else: - probe_after_step = 0 - raw_path = os.path.join(prediction_dir, target_id, "codex_raw.txt") - trace_prefix = "" - if os.path.exists(raw_path): - with open(raw_path, encoding="utf-8") as f: - trace_prefix = extract_codex_trace_prefix(f.read(), after_step=probe_after_step) - updated_probe = dict(probe) - updated_probe["probe_target_id"] = target_id - updated_probe["probe_after_step"] = probe_after_step - return [target_item], {target_id: trace_prefix}, updated_probe - def attach_reference_context( self, results: list[dict], @@ -383,14 +307,3 @@ class EnvAdapter(ABC): if prompt is not None: return prompt return self._load_env_prompt("analyst_success") - - def get_deep_probe_prompt(self) -> str | None: - return self._load_env_prompt("deep_probe") - - def get_meta_reflect_prompt(self) -> str | None: - update_mode = getattr(self, "_cfg", {}).get("skill_update_mode", "patch") - if str(update_mode).strip().lower() == "rewrite_from_suggestions": - prompt = self._load_env_prompt("meta_reflect_rewrite") - if prompt is not None: - return prompt - return self._load_env_prompt("meta_reflect") diff --git a/skillopt/envs/docvqa/adapter.py b/skillopt/envs/docvqa/adapter.py index 7a86520..5c95a0b 100644 --- a/skillopt/envs/docvqa/adapter.py +++ b/skillopt/envs/docvqa/adapter.py @@ -4,7 +4,6 @@ import os from skillopt.datasets.base import BatchSpec from skillopt.envs.base import EnvAdapter -from skillopt.envs.deep_reflect import run_no_reference_deep_reflect from skillopt.envs.docvqa.dataloader import DocVQADataLoader from skillopt.envs.docvqa.rollout import run_batch from skillopt.gradient.reflect import run_minibatch_reflect @@ -28,11 +27,7 @@ class DocVQAAdapter(EnvAdapter): edit_budget: int = 4, seed: int = 42, limit: int = 0, - image_detail: str = "auto", - 
use_deep_reflect: bool = False, - deep_reflect_failures: int = 4, - deep_reflect_successes: int = 2, - ) -> None: + image_detail: str = "auto", ) -> None: self.max_turns = max_turns self.exec_timeout = exec_timeout self.workers = workers @@ -41,9 +36,6 @@ class DocVQAAdapter(EnvAdapter): self.minibatch_size = minibatch_size self.edit_budget = edit_budget self.image_detail = image_detail - self.use_deep_reflect = use_deep_reflect - self.deep_reflect_failures = deep_reflect_failures - self.deep_reflect_successes = deep_reflect_successes self.dataloader = DocVQADataLoader( split_dir=split_dir, data_path=data_path, @@ -109,38 +101,6 @@ class DocVQAAdapter(EnvAdapter): update_mode=getattr(self, "_cfg", {}).get("skill_update_mode", "patch"), ) - def deep_reflect( - self, - results: list[dict], - skill_content: str, - out_dir: str, - **kwargs, - ) -> list[dict | None]: - return run_no_reference_deep_reflect( - self, - results, - skill_content, - out_dir, - env_manager=kwargs.get("env_manager"), - prediction_dir=kwargs.get("prediction_dir"), - random_seed=kwargs.get("random_seed"), - step_buffer_context=kwargs.get("step_buffer_context", ""), - output_requirements=[ - "- There is no hidden reference block. Use only the document image prompt, student output, and evaluation result to infer what intermediate state is worth probing.", - "- The instruction must explicitly request a short ... 
block before the final ....", - "- The readout should focus on visual region, field/table/figure label, OCR text read, candidate answer, and answer-format normalization.", - "- Do not ask for exhaustive transcription or a full chain-of-thought.", - "- The instruction text should be ready to append directly to the student's prompt.", - ], - metadata_builder=lambda item: { - "id": str(item.get("id")), - "task_type": str(item.get("task_type") or "docvqa"), - "question_preview": str(item.get("question") or "")[:200], - "image_path": item.get("image_path", ""), - "docId": item.get("docId", ""), - "page": item.get("ucsf_document_page_no", ""), - }, - ) def get_task_types(self) -> list[str]: seen: list[str] = [] diff --git a/skillopt/envs/docvqa/rollout.py b/skillopt/envs/docvqa/rollout.py index 825a7ca..14d4bb0 100644 --- a/skillopt/envs/docvqa/rollout.py +++ b/skillopt/envs/docvqa/rollout.py @@ -6,8 +6,8 @@ import time from concurrent.futures import FIRST_COMPLETED, ThreadPoolExecutor, wait from skillopt.envs.docvqa.evaluator import evaluate -from skillopt.model import chat_student_messages, get_student_backend, is_student_exec_backend -from skillopt.model.codex_harness import prepare_workspace, render_skill_md, run_student_exec +from skillopt.model import chat_target_messages, get_target_backend, is_target_exec_backend +from skillopt.model.codex_harness import prepare_workspace, render_skill_md, run_target_exec from skillopt.prompts import load_prompt @@ -112,11 +112,11 @@ def _run_codex_once( images=[item["image_path"]], ) prompt = ( - "Use the `skillopt-student` skill available in this workspace.\n" + "Use the `skillopt-target` skill available in this workspace.\n" "Read `task.md`, inspect the attached document image, and answer the DocVQA question.\n" "Return the final answer inside ...." 
) - final_message, raw = run_student_exec( + final_message, raw = run_target_exec( work_dir=work_dir, prompt=prompt, model=model, @@ -158,7 +158,7 @@ def process_one( system_prompt = "" user_text = "" conversation: list[dict] = [] - if is_student_exec_backend(): + if is_target_exec_backend(): from skillopt.model import azure_openai as _llm conversation = [ @@ -172,7 +172,7 @@ def process_one( pred_dir=os.path.join(out_root, "predictions", item_id), item=item, skill_content=skill_content, - model=_llm.STUDENT_DEPLOYMENT, + model=_llm.TARGET_DEPLOYMENT, timeout=exec_timeout, image_detail=image_detail, diagnostic_mode=diagnostic_mode if turn == 0 else False, @@ -198,7 +198,7 @@ def process_one( ] for turn in range(max_turns): if turn == 0: - resp_text, _ = chat_student_messages( + resp_text, _ = chat_target_messages( messages=messages, max_completion_tokens=768, retries=5, @@ -212,7 +212,7 @@ def process_one( {"role": "assistant", "content": response}, {"role": "user", "content": "Review the same image carefully and answer again. 
Keep the final answer inside ...."}, ] - resp_text, _ = chat_student_messages( + resp_text, _ = chat_target_messages( messages=refinement_messages, max_completion_tokens=512, retries=5, @@ -230,9 +230,9 @@ def process_one( pred_dir = os.path.join(out_root, "predictions", item_id) os.makedirs(pred_dir, exist_ok=True) - with open(os.path.join(pred_dir, "student_system_prompt.txt"), "w", encoding="utf-8") as f: + with open(os.path.join(pred_dir, "target_system_prompt.txt"), "w", encoding="utf-8") as f: f.write(system_prompt) - with open(os.path.join(pred_dir, "student_user_prompt.txt"), "w", encoding="utf-8") as f: + with open(os.path.join(pred_dir, "target_user_prompt.txt"), "w", encoding="utf-8") as f: f.write(user_text) eval_result = evaluate(response, item.get("answers", [])) diff --git a/skillopt/envs/livemathematicianbench/adapter.py b/skillopt/envs/livemathematicianbench/adapter.py index 15afbaf..b98090c 100644 --- a/skillopt/envs/livemathematicianbench/adapter.py +++ b/skillopt/envs/livemathematicianbench/adapter.py @@ -4,13 +4,12 @@ from __future__ import annotations import json import os -from skillopt.gradient.deep_probe import generate_deep_probe_instruction from skillopt.datasets.base import BatchSpec from skillopt.gradient.reflect import run_minibatch_reflect from skillopt.envs.base import EnvAdapter from skillopt.envs.livemathematicianbench.dataloader import LiveMathematicianBenchDataLoader from skillopt.envs.livemathematicianbench.rollout import run_batch -from skillopt.model import get_student_backend +from skillopt.model import get_target_backend class LiveMathematicianBenchAdapter(EnvAdapter): @@ -61,11 +60,7 @@ class LiveMathematicianBenchAdapter(EnvAdapter): limit: int = 0, shuffle_choices: bool = True, use_theorem: bool = False, - use_sketch: bool = False, - use_deep_reflect: bool = False, - deep_reflect_failures: int = 4, - deep_reflect_successes: int = 2, - ) -> None: + use_sketch: bool = False, ) -> None: self.max_turns = max_turns 
self.exec_timeout = exec_timeout self.workers = workers @@ -75,9 +70,6 @@ class LiveMathematicianBenchAdapter(EnvAdapter): self.edit_budget = edit_budget self.use_theorem = use_theorem self.use_sketch = use_sketch - self.use_deep_reflect = use_deep_reflect - self.deep_reflect_failures = deep_reflect_failures - self.deep_reflect_successes = deep_reflect_successes self.dataloader = LiveMathematicianBenchDataLoader( split_dir=split_dir, data_path=data_path, @@ -161,122 +153,6 @@ class LiveMathematicianBenchAdapter(EnvAdapter): update_mode=getattr(self, "_cfg", {}).get("skill_update_mode", "patch"), ) - def deep_reflect( - self, - results: list[dict], - skill_content: str, - out_dir: str, - **kwargs, - ) -> list[dict | None]: - if not self.use_deep_reflect: - return [] - - env_manager = kwargs.get("env_manager") - prediction_dir = kwargs.get("prediction_dir", os.path.join(out_dir, "predictions")) - random_seed = kwargs.get("random_seed") - step_buffer_context = kwargs.get("step_buffer_context", "") - meta_skill_context = kwargs.get("meta_skill_context", "") - codex_backend = get_student_backend() == "codex_exec" - selected_items = self.select_representative_items( - results, - env_manager if isinstance(env_manager, list) else None, - n_failures=self.deep_reflect_failures, - n_successes=self.deep_reflect_successes, - seed=random_seed, - ) - if not selected_items: - return [] - selected_ids = {str(item["id"]) for item in selected_items} - selected_results = [row for row in results if str(row.get("id")) in selected_ids] - selected_examples = self.attach_reference_context(selected_results, selected_items) - if codex_backend: - selected_examples = self.attach_codex_probe_context(selected_examples, prediction_dir) - selected_metadata = [] - theorem_count = 0 - sketch_count = 0 - for item in selected_items: - meta = self.get_reference_metadata(item) - if "theorem" in meta["fields"]: - theorem_count += 1 - if "sketch" in meta["fields"]: - sketch_count += 1 - 
selected_metadata.append({ - "id": str(item["id"]), - "task_type": str(item.get("theorem_type", ["math_mcq"])[0] if item.get("theorem_type") else "math_mcq"), - "reference_fields": meta["fields"], - "reference_preview": meta["preview"], - }) - - deep_dir = os.path.join(out_dir, "deep_reflect") - rollout_dir = os.path.join(deep_dir, "rollout") - patches_dir = os.path.join(deep_dir, "patches") - os.makedirs(deep_dir, exist_ok=True) - print( - f" [2b/6 DEEP REFLECT setup] selected={len(selected_items)} " - f"reference_fields=theorem({theorem_count}/{len(selected_items)})," - f"sketch({sketch_count}/{len(selected_items)})" - ) - probe = generate_deep_probe_instruction( - skill_content=skill_content, - items=selected_examples, - prediction_dir=prediction_dir, - system_prompt=self.get_codex_deep_probe_prompt() if codex_backend else self.get_deep_probe_prompt(), - step_buffer_context=step_buffer_context, - meta_skill_context=meta_skill_context, - ) - if not probe: - return [] - diagnostic_trace_context_by_id = None - if codex_backend: - selected_items, diagnostic_trace_context_by_id, probe = self.resolve_codex_probe_target( - selected_items=selected_items, - selected_examples=selected_examples, - prediction_dir=prediction_dir, - probe=probe, - ) - probe_record = { - **probe, - "reference_summary": { - "selected_count": len(selected_items), - "field_counts": { - "theorem": theorem_count, - "sketch": sketch_count, - }, - }, - "selected_examples": selected_metadata, - } - with open(os.path.join(deep_dir, "probe.json"), "w", encoding="utf-8") as f: - json.dump(probe_record, f, ensure_ascii=False, indent=2) - deep_results = run_batch( - items=selected_items, - out_root=rollout_dir, - skill_content=skill_content, - max_turns=self.max_turns, - workers=min(self.workers, max(len(selected_items), 1)), - use_theorem=self.use_theorem, - use_sketch=self.use_sketch, - diagnostic_mode=True, - diagnostic_instruction=probe["probe_instruction"], - 
diagnostic_trace_context_by_id=diagnostic_trace_context_by_id, - task_timeout=self.exec_timeout, - ) - deep_results = self.attach_reference_context(deep_results, selected_items) - return run_minibatch_reflect( - results=deep_results, - skill_content=skill_content, - prediction_dir=os.path.join(rollout_dir, "predictions"), - patches_dir=patches_dir, - workers=self.analyst_workers, - failure_only=self.failure_only, - minibatch_size=self.minibatch_size, - edit_budget=self.edit_budget, - random_seed=random_seed, - error_system=self.get_error_minibatch_prompt(), - success_system=self.get_success_minibatch_prompt(), - step_buffer_context=step_buffer_context, - meta_skill_context=meta_skill_context, - update_mode=getattr(self, "_cfg", {}).get("skill_update_mode", "patch"), - ) def get_task_types(self) -> list[str]: return self.dataloader.get_task_types() diff --git a/skillopt/envs/livemathematicianbench/prompts/analyst_error.md b/skillopt/envs/livemathematicianbench/prompts/analyst_error.md index 7a78d10..dac1d04 100644 --- a/skillopt/envs/livemathematicianbench/prompts/analyst_error.md +++ b/skillopt/envs/livemathematicianbench/prompts/analyst_error.md @@ -1,7 +1,7 @@ You are an expert failure-analysis agent for theorem-grounded mathematical multiple-choice questions. You will be given MULTIPLE failed trajectories from a single minibatch and the current skill document. -Each trajectory includes the student's response and an evaluation result showing the predicted option +Each trajectory includes the target's response and an evaluation result showing the predicted option versus the correct option. Your job is to identify COMMON reasoning failures across the batch and propose concise skill edits. 
diff --git a/skillopt/envs/livemathematicianbench/rollout.py b/skillopt/envs/livemathematicianbench/rollout.py index 0734b7a..de4f3dc 100644 --- a/skillopt/envs/livemathematicianbench/rollout.py +++ b/skillopt/envs/livemathematicianbench/rollout.py @@ -7,8 +7,8 @@ import time from concurrent.futures import FIRST_COMPLETED, ThreadPoolExecutor, wait from skillopt.envs.livemathematicianbench.evaluator import evaluate -from skillopt.model import chat_student, get_student_backend, is_student_exec_backend -from skillopt.model.codex_harness import prepare_workspace, render_skill_md, run_student_exec +from skillopt.model import chat_target, get_target_backend, is_target_exec_backend +from skillopt.model.codex_harness import prepare_workspace, render_skill_md, run_target_exec from skillopt.prompts import load_prompt def _build_system(skill_content: str) -> str: @@ -95,11 +95,11 @@ def _run_codex_once( work_dir = os.path.join(pred_dir, "codex_exec") prepare_workspace(work_dir=work_dir, skill_md=skill_md, task_text=task_text) prompt = ( - "Use the `skillopt-student` skill available in this workspace.\n" + "Use the `skillopt-target` skill available in this workspace.\n" "Read `task.md` and solve the multiple-choice problem.\n" "Output only the final choice label inside ...." 
) - final_message, raw = run_student_exec( + final_message, raw = run_target_exec( work_dir=work_dir, prompt=prompt, model=model, @@ -143,7 +143,7 @@ def process_one( pred_dir = os.path.join(out_root, "predictions", item_id) os.makedirs(pred_dir, exist_ok=True) - if is_student_exec_backend(): + if is_target_exec_backend(): from skillopt.model import azure_openai as _llm conversation: list[dict] = [] @@ -155,7 +155,7 @@ def process_one( pred_dir=pred_dir, skill_content=skill_content, item=item, - model=_llm.STUDENT_DEPLOYMENT, + model=_llm.TARGET_DEPLOYMENT, timeout=exec_timeout, use_theorem=use_theorem, use_sketch=use_sketch, @@ -172,9 +172,9 @@ def process_one( result["agent_ok"] = True result["n_turns"] = len(conversation) - with open(os.path.join(pred_dir, "student_system_prompt.txt"), "w", encoding="utf-8") as f: + with open(os.path.join(pred_dir, "target_system_prompt.txt"), "w", encoding="utf-8") as f: f.write(system) - with open(os.path.join(pred_dir, "student_user_prompt.txt"), "w", encoding="utf-8") as f: + with open(os.path.join(pred_dir, "target_user_prompt.txt"), "w", encoding="utf-8") as f: f.write(user) eval_result = evaluate(response, item["correct_choice"], item["choices"]) @@ -216,7 +216,7 @@ def process_one( for turn in range(max_turns): if turn == 0: - resp_text, _ = chat_student( + resp_text, _ = chat_target( system=system, user=user, max_completion_tokens=16384, @@ -230,7 +230,7 @@ def process_one( "Re-evaluate the exact option wording. If needed, correct it. " "Output only the final choice label inside ...." 
) - resp_text, _ = chat_student( + resp_text, _ = chat_target( system=system, user=refinement, max_completion_tokens=16384, @@ -247,9 +247,9 @@ def process_one( result["agent_ok"] = True result["n_turns"] = len(conversation) - with open(os.path.join(pred_dir, "student_system_prompt.txt"), "w", encoding="utf-8") as f: + with open(os.path.join(pred_dir, "target_system_prompt.txt"), "w", encoding="utf-8") as f: f.write(system) - with open(os.path.join(pred_dir, "student_user_prompt.txt"), "w", encoding="utf-8") as f: + with open(os.path.join(pred_dir, "target_user_prompt.txt"), "w", encoding="utf-8") as f: f.write(user) eval_result = evaluate(response, item["correct_choice"], item["choices"]) diff --git a/skillopt/envs/officeqa/adapter.py b/skillopt/envs/officeqa/adapter.py index b97acc6..b504309 100644 --- a/skillopt/envs/officeqa/adapter.py +++ b/skillopt/envs/officeqa/adapter.py @@ -4,7 +4,6 @@ import os from skillopt.datasets.base import BatchSpec from skillopt.envs.base import EnvAdapter -from skillopt.envs.deep_reflect import run_no_reference_deep_reflect from skillopt.envs.officeqa.dataloader import OfficeQADataLoader from skillopt.envs.officeqa.rollout import run_batch from skillopt.gradient.reflect import run_minibatch_reflect @@ -37,11 +36,7 @@ class OfficeQAAdapter(EnvAdapter): search_timeout_seconds: int = 20, use_local_tools: bool = True, data_dirs: list[str] | str | None = None, - docs_dirs: list[str] | str | None = None, - use_deep_reflect: bool = False, - deep_reflect_failures: int = 4, - deep_reflect_successes: int = 2, - ) -> None: + docs_dirs: list[str] | str | None = None, ) -> None: self.workers = workers self.analyst_workers = analyst_workers self.failure_only = failure_only @@ -58,9 +53,6 @@ class OfficeQAAdapter(EnvAdapter): self.search_timeout_seconds = int(search_timeout_seconds) self.use_local_tools = bool(use_local_tools) self.data_dirs = data_dirs if data_dirs is not None else docs_dirs - self.use_deep_reflect = use_deep_reflect - 
self.deep_reflect_failures = deep_reflect_failures - self.deep_reflect_successes = deep_reflect_successes self.dataloader = OfficeQADataLoader( split_dir=split_dir, data_path=data_path, @@ -133,37 +125,6 @@ class OfficeQAAdapter(EnvAdapter): update_mode=getattr(self, "_cfg", {}).get("skill_update_mode", "patch"), ) - def deep_reflect( - self, - results: list[dict], - skill_content: str, - out_dir: str, - **kwargs, - ) -> list[dict | None]: - return run_no_reference_deep_reflect( - self, - results, - skill_content, - out_dir, - env_manager=kwargs.get("env_manager"), - prediction_dir=kwargs.get("prediction_dir"), - random_seed=kwargs.get("random_seed"), - step_buffer_context=kwargs.get("step_buffer_context", ""), - output_requirements=[ - "- There is no hidden reference block. Use only the question, candidate files, tool trace, student output, and evaluation result to infer what intermediate state is worth probing.", - "- The instruction must explicitly request a short ... block before the final ....", - "- The readout should focus on selected document/file, evidence span or table, extracted value, units, and any date or fiscal-period normalization.", - "- Do not ask for exhaustive copying of source text or a full chain-of-thought.", - "- The instruction text should be ready to append directly to the student's prompt.", - ], - metadata_builder=lambda item: { - "id": str(item.get("id")), - "task_type": str(item.get("task_type") or "officeqa"), - "question_preview": str(item.get("question") or "")[:200], - "source_files": item.get("source_files", []), - "source_docs": item.get("source_docs", []), - }, - ) def get_task_types(self) -> list[str]: seen: list[str] = [] diff --git a/skillopt/envs/officeqa/rollout.py b/skillopt/envs/officeqa/rollout.py index 0428717..7d3a37a 100644 --- a/skillopt/envs/officeqa/rollout.py +++ b/skillopt/envs/officeqa/rollout.py @@ -14,8 +14,8 @@ try: from skillopt.envs.sealqa.tool_runtime import custom_search except ImportError: custom_search 
= None # type: ignore[assignment] -from skillopt.model import chat_student_messages, get_student_backend, is_student_exec_backend -from skillopt.model.codex_harness import prepare_workspace, render_skill_md, run_student_exec +from skillopt.model import chat_target_messages, get_target_backend, is_target_exec_backend +from skillopt.model.codex_harness import prepare_workspace, render_skill_md, run_target_exec from skillopt.prompts import load_prompt _TOOL_SCHEMAS = [ { @@ -299,12 +299,12 @@ def _run_codex_once( link_dirs=_docs_link_targets(docs_roots), ) prompt = ( - "Use the `skillopt-student` skill available in this workspace.\n" + "Use the `skillopt-target` skill available in this workspace.\n" "Read `task.md`, inspect or search the full OfficeQA corpus under `docs/`, and answer the question.\n" "Treat candidate files in `task.md` as hints, not an access limit.\n" "Return the final answer inside ...." ) - final_message, raw = run_student_exec( + final_message, raw = run_target_exec( work_dir=work_dir, prompt=prompt, model=model, @@ -356,8 +356,8 @@ def _run_custom_search_process( raise ValueError("custom_search mode requires a non-empty search_api_url") if not os.environ.get(search_auth_env, "").strip(): raise ValueError(f"custom_search mode requires auth token env var {search_auth_env}") - if get_student_backend() not in {"openai_chat", "qwen_chat"}: - raise ValueError("custom_search mode is only supported with student_backend='openai_chat' or 'qwen_chat'") + if get_target_backend() not in {"openai_chat", "qwen_chat"}: + raise ValueError("custom_search mode is only supported with target_backend='openai_chat' or 'qwen_chat'") system = _build_system( skill_content, search_mode=_CUSTOM_SEARCH_MODE, @@ -385,7 +385,7 @@ def _run_custom_search_process( fail_reason = "" last_response_metadata: dict = {} for turn in range(1, max_tool_turns + 1): - message, _ = chat_student_messages( + message, _ = chat_target_messages( messages=messages, 
max_completion_tokens=max_completion_tokens, retries=5, @@ -439,8 +439,8 @@ def _run_azure_search_process( diagnostic_mode: bool, diagnostic_instruction: str, ) -> tuple[str, str, str, str, list[dict], str, dict]: - if get_student_backend() != "openai_chat": - raise ValueError("azure_search mode is only supported with student_backend='openai_chat'") + if get_target_backend() != "openai_chat": + raise ValueError("azure_search mode is only supported with target_backend='openai_chat'") system = _build_system(skill_content, search_mode=_AZURE_SEARCH_MODE) user = _build_user( item, @@ -453,7 +453,7 @@ def _run_azure_search_process( {"role": "user", "content": user}, ] conversation: list[dict] = [{"role": "user", "content": user}] - message, _ = chat_student_messages( + message, _ = chat_target_messages( messages=messages, max_completion_tokens=max_completion_tokens, retries=5, @@ -494,7 +494,7 @@ def _run_offline_no_tools_process( {"role": "user", "content": user}, ] conversation: list[dict] = [{"role": "user", "content": user}] - message, _ = chat_student_messages( + message, _ = chat_target_messages( messages=messages, max_completion_tokens=max_completion_tokens, retries=5, @@ -616,7 +616,7 @@ def process_one( candidate_files=candidate_files, oracle_context=oracle_context, ) - elif is_student_exec_backend(): + elif is_target_exec_backend(): from skillopt.model import azure_openai as _llm response = "" system = "" @@ -628,7 +628,7 @@ def process_one( skill_content=skill_content, candidate_files=candidate_files, docs_roots=docs_roots, - model=_llm.STUDENT_DEPLOYMENT, + model=_llm.TARGET_DEPLOYMENT, timeout=180, diagnostic_mode=diagnostic_mode if turn == 1 else False, diagnostic_instruction=diagnostic_instruction if turn == 1 else "", @@ -650,7 +650,7 @@ def process_one( {"role": "user", "content": user}, ] for turn in range(1, max_tool_turns + 1): - message, _ = chat_student_messages( + message, _ = chat_target_messages( messages=messages, max_completion_tokens=768, 
retries=5, @@ -688,9 +688,9 @@ def process_one( break except Exception as e: # noqa: BLE001 fail_reason = f"error: {e}" - with open(os.path.join(pred_dir, "student_system_prompt.txt"), "w", encoding="utf-8") as f: + with open(os.path.join(pred_dir, "target_system_prompt.txt"), "w", encoding="utf-8") as f: f.write(system) - with open(os.path.join(pred_dir, "student_user_prompt.txt"), "w", encoding="utf-8") as f: + with open(os.path.join(pred_dir, "target_user_prompt.txt"), "w", encoding="utf-8") as f: f.write(user) with open(os.path.join(pred_dir, "conversation.json"), "w", encoding="utf-8") as f: json.dump(conversation, f, ensure_ascii=False, indent=2) @@ -714,8 +714,8 @@ def process_one( "agent_ok": not fail_reason, "n_turns": len(conversation), "last_finish_reason": last_response_metadata.get("finish_reason", ""), - "student_system_prompt": system, - "student_user_prompt": user, + "target_system_prompt": system, + "target_user_prompt": user, } return result def run_batch( diff --git a/skillopt/envs/searchqa/adapter.py b/skillopt/envs/searchqa/adapter.py index b803bcd..15afbd0 100644 --- a/skillopt/envs/searchqa/adapter.py +++ b/skillopt/envs/searchqa/adapter.py @@ -4,13 +4,12 @@ from __future__ import annotations import json import os -from skillopt.gradient.deep_probe import generate_deep_probe_instruction from skillopt.datasets.base import BatchSpec from skillopt.envs.base import EnvAdapter from skillopt.envs.searchqa.dataloader import SearchQADataLoader from skillopt.envs.searchqa.rollout import run_batch from skillopt.gradient.reflect import run_minibatch_reflect -from skillopt.model import get_student_backend +from skillopt.model import get_target_backend class SearchQAAdapter(EnvAdapter): @@ -32,11 +31,7 @@ class SearchQAAdapter(EnvAdapter): minibatch_size: int = 8, edit_budget: int = 4, seed: int = 42, - limit: int = 0, - use_deep_reflect: bool = False, - deep_reflect_failures: int = 4, - deep_reflect_successes: int = 2, - ) -> None: + limit: int = 0, ) -> 
None: self.max_turns = max_turns self.exec_timeout = exec_timeout self.workers = workers @@ -44,9 +39,6 @@ class SearchQAAdapter(EnvAdapter): self.failure_only = failure_only self.minibatch_size = minibatch_size self.edit_budget = edit_budget - self.use_deep_reflect = use_deep_reflect - self.deep_reflect_failures = deep_reflect_failures - self.deep_reflect_successes = deep_reflect_successes self.dataloader = SearchQADataLoader( split_dir=split_dir, data_path=data_path, @@ -128,121 +120,6 @@ class SearchQAAdapter(EnvAdapter): update_mode=getattr(self, "_cfg", {}).get("skill_update_mode", "patch"), ) - def deep_reflect( - self, - results: list[dict], - skill_content: str, - out_dir: str, - **kwargs, - ) -> list[dict | None]: - if not self.use_deep_reflect: - return [] - - env_manager = kwargs.get("env_manager") - if not isinstance(env_manager, list): - return [] - - prediction_dir = kwargs.get("prediction_dir", os.path.join(out_dir, "predictions")) - random_seed = kwargs.get("random_seed") - step_buffer_context = kwargs.get("step_buffer_context", "") - meta_skill_context = kwargs.get("meta_skill_context", "") - codex_backend = get_student_backend() == "codex_exec" - selected_items = self.select_representative_items( - results, - env_manager, - n_failures=self.deep_reflect_failures, - n_successes=self.deep_reflect_successes, - seed=random_seed, - ) - if not selected_items: - return [] - - selected_ids = {str(item["id"]) for item in selected_items} - selected_results = [row for row in results if str(row.get("id")) in selected_ids] - selected_examples = ( - self.attach_codex_probe_context(selected_results, prediction_dir) - if codex_backend - else selected_results - ) - selected_metadata = [ - { - "id": str(item["id"]), - "question_preview": str(item.get("question") or "")[:200], - "has_context": bool(str(item.get("context") or "").strip()), - "n_gold_answers": len(item.get("answers") or []), - } - for item in selected_items - ] - - deep_dir = os.path.join(out_dir, 
"deep_reflect") - rollout_dir = os.path.join(deep_dir, "rollout") - patches_dir = os.path.join(deep_dir, "patches") - os.makedirs(deep_dir, exist_ok=True) - print( - f" [2b/6 DEEP REFLECT setup] selected={len(selected_items)} " - f"mode=no_reference_probe" - ) - probe = generate_deep_probe_instruction( - skill_content=skill_content, - items=selected_examples, - prediction_dir=prediction_dir, - system_prompt=self.get_codex_deep_probe_prompt() if codex_backend else self.get_deep_probe_prompt(), - step_buffer_context=step_buffer_context, - meta_skill_context=meta_skill_context, - output_requirements=[ - "- There is no hidden reference block. Use only the question, provided context, the student's output, and the evaluation result to infer what intermediate state is worth probing.", - "- The instruction must explicitly request a short ... block before the final ....", - "- The readout should focus on likely evidence span, top candidate and runner-up, decisive clue, or a few short intermediate conclusions.", - "- Do not ask for exhaustive copying of the context or a full chain-of-thought.", - "- The instruction text should be ready to append directly to the student's prompt.", - ], - ) - if not probe: - return [] - diagnostic_trace_context_by_id = None - if codex_backend: - selected_items, diagnostic_trace_context_by_id, probe = self.resolve_codex_probe_target( - selected_items=selected_items, - selected_examples=selected_examples, - prediction_dir=prediction_dir, - probe=probe, - ) - - with open(os.path.join(deep_dir, "probe.json"), "w", encoding="utf-8") as f: - json.dump( - { - **probe, - "selected_examples": selected_metadata, - }, - f, - ensure_ascii=False, - indent=2, - ) - - deep_results = self.rollout( - selected_items, - skill_content, - rollout_dir, - diagnostic_mode=True, - diagnostic_instruction=probe["probe_instruction"], - diagnostic_trace_context_by_id=diagnostic_trace_context_by_id, - ) - return run_minibatch_reflect( - results=deep_results, - 
skill_content=skill_content, - prediction_dir=os.path.join(rollout_dir, "predictions"), - patches_dir=patches_dir, - workers=self.analyst_workers, - failure_only=self.failure_only, - minibatch_size=self.minibatch_size, - edit_budget=self.edit_budget, - random_seed=random_seed, - error_system=self.get_error_minibatch_prompt(), - success_system=self.get_success_minibatch_prompt(), - step_buffer_context=step_buffer_context, - meta_skill_context=meta_skill_context, - update_mode=getattr(self, "_cfg", {}).get("skill_update_mode", "patch"), - ) def get_task_types(self) -> list[str]: return ["qa"] diff --git a/skillopt/envs/searchqa/rollout.py b/skillopt/envs/searchqa/rollout.py index 731790f..b94f671 100644 --- a/skillopt/envs/searchqa/rollout.py +++ b/skillopt/envs/searchqa/rollout.py @@ -16,8 +16,8 @@ import time import traceback from concurrent.futures import FIRST_COMPLETED, ThreadPoolExecutor, wait -from skillopt.model import chat_student, get_student_backend, is_student_exec_backend -from skillopt.model.codex_harness import prepare_workspace, render_skill_md, run_student_exec +from skillopt.model import chat_target, get_target_backend, is_target_exec_backend +from skillopt.model.codex_harness import prepare_workspace, render_skill_md, run_target_exec from skillopt.prompts import load_prompt from skillopt.envs.searchqa.evaluator import evaluate @@ -123,11 +123,11 @@ def _run_codex_once( task_text=task_text, ) prompt = ( - "Use the `skillopt-student` skill available in this workspace.\n" + "Use the `skillopt-target` skill available in this workspace.\n" "Read `task.md` and answer the SearchQA question.\n" "Return the final answer inside ...." 
) - final_message, raw = run_student_exec( + final_message, raw = run_target_exec( work_dir=work_dir, prompt=prompt, model=model, @@ -192,7 +192,7 @@ def process_one( pred_dir = os.path.join(out_root, "predictions", item_id) os.makedirs(pred_dir, exist_ok=True) - if is_student_exec_backend(): + if is_target_exec_backend(): from skillopt.model import azure_openai as _llm conversation: list[dict] = [] @@ -205,7 +205,7 @@ def process_one( skill_content=skill_content, question=question, context=context, - model=_llm.STUDENT_DEPLOYMENT, + model=_llm.TARGET_DEPLOYMENT, timeout=exec_timeout, diagnostic_mode=diagnostic_mode if turn == 0 else False, diagnostic_instruction=diagnostic_instruction if turn == 0 else "", @@ -220,9 +220,9 @@ def process_one( result["agent_ok"] = True result["n_turns"] = len(conversation) - with open(os.path.join(pred_dir, "student_system_prompt.txt"), "w") as f: + with open(os.path.join(pred_dir, "target_system_prompt.txt"), "w") as f: f.write(system) - with open(os.path.join(pred_dir, "student_user_prompt.txt"), "w") as f: + with open(os.path.join(pred_dir, "target_user_prompt.txt"), "w") as f: f.write(user) with open(os.path.join(pred_dir, "conversation.json"), "w") as f: json.dump(conversation, f, ensure_ascii=False, indent=2) @@ -266,7 +266,7 @@ def process_one( for turn in range(max_turns): if turn == 0: - resp_text, _ = chat_student( + resp_text, _ = chat_target( system=system, user=user, max_completion_tokens=512, retries=5, stage="rollout", @@ -279,7 +279,7 @@ def process_one( f"If correct, repeat it. If wrong, provide a corrected answer.\n" f"Use ... tags for your final answer." 
) - resp_text, _ = chat_student( + resp_text, _ = chat_target( system=system, user=refinement, max_completion_tokens=512, retries=5, stage="rollout", @@ -297,9 +297,9 @@ def process_one( result["n_turns"] = len(conversation) # Save conversation - with open(os.path.join(pred_dir, "student_system_prompt.txt"), "w") as f: + with open(os.path.join(pred_dir, "target_system_prompt.txt"), "w") as f: f.write(system) - with open(os.path.join(pred_dir, "student_user_prompt.txt"), "w") as f: + with open(os.path.join(pred_dir, "target_user_prompt.txt"), "w") as f: f.write(user) with open(os.path.join(pred_dir, "conversation.json"), "w") as f: json.dump(conversation, f, ensure_ascii=False, indent=2) diff --git a/skillopt/envs/spreadsheetbench/adapter.py b/skillopt/envs/spreadsheetbench/adapter.py index 7f69c85..c43ae98 100644 --- a/skillopt/envs/spreadsheetbench/adapter.py +++ b/skillopt/envs/spreadsheetbench/adapter.py @@ -8,7 +8,6 @@ from __future__ import annotations import json import os -from skillopt.gradient.deep_probe import generate_deep_probe_instruction from skillopt.datasets.base import BatchSpec from skillopt.envs.base import EnvAdapter from skillopt.envs.spreadsheetbench.dataloader import SpreadsheetBenchDataLoader @@ -18,7 +17,7 @@ from skillopt.envs.spreadsheetbench.rollout import ( run_spreadsheet_batch_codegen, ) from skillopt.gradient.reflect import run_minibatch_reflect -from skillopt.model import get_student_backend, is_student_exec_backend +from skillopt.model import get_target_backend, is_target_exec_backend # Task types used for per-category breakdowns @@ -45,11 +44,7 @@ class SpreadsheetBenchAdapter(EnvAdapter): failure_only: bool = False, minibatch_size: int = 8, edit_budget: int = 4, - seed: int = 42, - use_deep_reflect: bool = False, - deep_reflect_failures: int = 4, - deep_reflect_successes: int = 2, - ) -> None: + seed: int = 42, ) -> None: self.data_root = data_root self.mode = mode # "single", "multi", or "react" self.max_turns = max_turns @@ 
-59,9 +54,6 @@ class SpreadsheetBenchAdapter(EnvAdapter): self.failure_only = failure_only self.minibatch_size = minibatch_size self.edit_budget = edit_budget - self.use_deep_reflect = use_deep_reflect - self.deep_reflect_failures = deep_reflect_failures - self.deep_reflect_successes = deep_reflect_successes self.dataloader = SpreadsheetBenchDataLoader( split_dir=split_dir, data_path=data_path, @@ -75,9 +67,9 @@ class SpreadsheetBenchAdapter(EnvAdapter): def setup(self, cfg: dict) -> None: super().setup(cfg) - if is_student_exec_backend() and self.mode != "single": + if is_target_exec_backend() and self.mode != "single": raise NotImplementedError( - "Exec student backends are currently supported only for SpreadsheetBench mode=single." + "Exec target backends are currently supported only for SpreadsheetBench mode=single." ) self.dataloader.setup(cfg) @@ -190,120 +182,6 @@ class SpreadsheetBenchAdapter(EnvAdapter): update_mode=getattr(self, "_cfg", {}).get("skill_update_mode", "patch"), ) - def deep_reflect( - self, - results: list[dict], - skill_content: str, - out_dir: str, - **kwargs, - ) -> list[dict | None]: - if not self.use_deep_reflect: - return [] - - env_manager = kwargs.get("env_manager") - if not isinstance(env_manager, list): - return [] - - prediction_dir = kwargs.get("prediction_dir", os.path.join(out_dir, "predictions")) - random_seed = kwargs.get("random_seed") - step_buffer_context = kwargs.get("step_buffer_context", "") - meta_skill_context = kwargs.get("meta_skill_context", "") - codex_backend = get_student_backend() == "codex_exec" - selected_items = self.select_representative_items( - results, - env_manager, - n_failures=self.deep_reflect_failures, - n_successes=self.deep_reflect_successes, - seed=random_seed, - ) - if not selected_items: - return [] - - selected_ids = {str(item["id"]) for item in selected_items} - selected_results = [row for row in results if str(row.get("id")) in selected_ids] - selected_examples = ( - 
self.attach_codex_probe_context(selected_results, prediction_dir) - if codex_backend - else selected_results - ) - selected_metadata = [ - { - "id": str(item["id"]), - "instruction_type": str(item.get("instruction_type") or ""), - "answer_position": str(item.get("answer_position") or ""), - } - for item in selected_items - ] - - deep_dir = os.path.join(out_dir, "deep_reflect") - rollout_dir = os.path.join(deep_dir, "rollout") - patches_dir = os.path.join(deep_dir, "patches") - os.makedirs(deep_dir, exist_ok=True) - print( - f" [2b/6 DEEP REFLECT setup] selected={len(selected_items)} " - f"mode={self.mode}" - ) - probe = generate_deep_probe_instruction( - skill_content=skill_content, - items=selected_examples, - prediction_dir=prediction_dir, - system_prompt=self.get_codex_deep_probe_prompt() if codex_backend else self.get_deep_probe_prompt(), - step_buffer_context=step_buffer_context, - meta_skill_context=meta_skill_context, - output_requirements=[ - "- The instruction must ask for a short structured diagnostic readout before the student writes code or starts tool use.", - "- The readout should focus on task family, source/target region, and decisive transformation rule.", - "- The student must still complete the original spreadsheet task.", - "- Keep the readout concise and avoid exhaustive cell enumeration.", - "- The instruction text should be ready to append directly to the student's prompt.", - ], - ) - if not probe: - return [] - diagnostic_trace_context_by_id = None - if codex_backend: - selected_items, diagnostic_trace_context_by_id, probe = self.resolve_codex_probe_target( - selected_items=selected_items, - selected_examples=selected_examples, - prediction_dir=prediction_dir, - probe=probe, - ) - - with open(os.path.join(deep_dir, "probe.json"), "w", encoding="utf-8") as f: - json.dump( - { - **probe, - "selected_examples": selected_metadata, - }, - f, - ensure_ascii=False, - indent=2, - ) - - deep_results = self.rollout( - selected_items, - skill_content, 
- rollout_dir, - diagnostic_mode=True, - diagnostic_instruction=probe["probe_instruction"], - diagnostic_trace_context_by_id=diagnostic_trace_context_by_id, - ) - return run_minibatch_reflect( - results=deep_results, - skill_content=skill_content, - prediction_dir=os.path.join(rollout_dir, "predictions"), - patches_dir=patches_dir, - workers=self.analyst_workers, - failure_only=self.failure_only, - minibatch_size=self.minibatch_size, - edit_budget=self.edit_budget, - random_seed=random_seed, - error_system=self.get_error_minibatch_prompt(), - success_system=self.get_success_minibatch_prompt(), - step_buffer_context=step_buffer_context, - meta_skill_context=meta_skill_context, - update_mode=getattr(self, "_cfg", {}).get("skill_update_mode", "patch"), - ) def get_task_types(self) -> list[str]: return list(TASK_TYPES) diff --git a/skillopt/envs/spreadsheetbench/codegen_agent.py b/skillopt/envs/spreadsheetbench/codegen_agent.py index 8a6b48f..937d385 100644 --- a/skillopt/envs/spreadsheetbench/codegen_agent.py +++ b/skillopt/envs/spreadsheetbench/codegen_agent.py @@ -30,12 +30,12 @@ def _timeout_handler(signum, frame): from skillopt.model.azure_openai import ( get_reasoning_effort, - get_student_client, + get_target_client, _needs_responses_api, tracker, ) -from skillopt.model import get_codex_exec_config, get_student_backend, is_student_exec_backend -from skillopt.model.codex_harness import prepare_workspace, render_skill_md, run_student_exec +from skillopt.model import get_codex_exec_config, get_target_backend, is_target_exec_backend +from skillopt.model.codex_harness import prepare_workspace, render_skill_md, run_target_exec from skillopt.prompts import load_prompt from skillopt.envs.spreadsheetbench.executor import run_generated_code from skillopt.envs.spreadsheetbench.evaluator import evaluate @@ -44,13 +44,13 @@ from skillopt.envs.spreadsheetbench.evaluator import evaluate # ── Eval feedback helper (no golden value leakage) ───────────────────────── def 
_build_eval_feedback(verify_report: str) -> str: - """Build Student feedback from a verify report, hiding expected values. + """Build Target feedback from a verify report, hiding expected values. The verify report contains lines like: Sheet1!D2: got=None, expected=0 ✗ Sheet1!D10: got=None, expected=None ✓ - We strip the ``expected=...`` part so the Student sees only its own + We strip the ``expected=...`` part so the Target sees only its own output and whether each cell is correct or wrong. """ import re @@ -203,7 +203,7 @@ def _llm_call_with_retry(call_fn, *, retries: int = 5, timeout: int = 120): def _get_deployment() -> str: from skillopt.model import azure_openai as _llm - return _llm.STUDENT_DEPLOYMENT + return _llm.TARGET_DEPLOYMENT def _build_codex_skill(skill_content: str) -> str: @@ -242,7 +242,7 @@ def _build_codex_task( return ( f"{prompt}\n\n" "## Codex Harness Task\n" - "- Read `.agents/skills/skillopt-student/SKILL.md` before writing code; do not call a Skill tool.\n" + "- Read `.agents/skills/skillopt-target/SKILL.md` before writing code; do not call a Skill tool.\n" "- Read and optionally inspect `input.xlsx` in this workspace.\n" "- Write the final Python solution to `solution.py`.\n" "- The script should use the provided `INPUT_PATH` and `OUTPUT_PATH` variables.\n" @@ -296,7 +296,7 @@ def _prepare_codex_workspace( diagnostic_trace_context=diagnostic_trace_context, ) prompt = ( - "Read `.agents/skills/skillopt-student/SKILL.md` directly; do not call a Skill tool.\n" + "Read `.agents/skills/skillopt-target/SKILL.md` directly; do not call a Skill tool.\n" "Read `task.md`, inspect `input.xlsx` if useful, and write the final solution to `solution.py`.\n" "You may run `python run_solution.py` to validate the script locally.\n" "In your final response, briefly confirm whether `solution.py` was written and summarize the approach." 
@@ -319,7 +319,7 @@ def _run_exec_backend( model: str, timeout: int, ) -> tuple[str, str]: - return run_student_exec( + return run_target_exec( work_dir=work_dir, prompt=prompt, model=model, @@ -416,7 +416,7 @@ def run_single( Returns ``{"code": str, "raw": str, "n_turns": 1}``. """ - if is_student_exec_backend(): + if is_target_exec_backend(): deadline = time.time() + task_timeout deployment = _get_deployment() work_dir, skill_md, task_md, prompt = _prepare_codex_workspace( @@ -449,12 +449,12 @@ def run_single( "raw": raw or final_message, "n_turns": 1, "conversation": [{"role": "assistant", "content": final_message or raw}], - "student_system_prompt": skill_md, - "student_user_prompt": f"{prompt}\n\n## Task File\n\n{task_md}", + "target_system_prompt": skill_md, + "target_user_prompt": f"{prompt}\n\n## Task File\n\n{task_md}", } deadline = time.time() + task_timeout - client = get_student_client() + client = get_target_client() deployment = _get_deployment() system = _build_system(skill_content) user = _build_user( @@ -483,8 +483,8 @@ def run_single( "raw": raw, "n_turns": 1, "conversation": [{"role": "assistant", "content": raw}], - "student_system_prompt": system, - "student_user_prompt": user, + "target_system_prompt": system, + "target_user_prompt": user, } @@ -520,7 +520,7 @@ def run_multi( Returns ``{"code": str, "raw": str, "n_turns": int, "conversation": [...]}``. 
""" - if is_student_exec_backend(): + if is_target_exec_backend(): deadline = time.time() + task_timeout deployment = _get_deployment() work_dir, skill_md, task_md, initial_prompt = _prepare_codex_workspace( @@ -613,12 +613,12 @@ def run_multi( "raw": raw or final_message, "n_turns": len([m for m in conversation if m["role"] == "assistant"]), "conversation": conversation, - "student_system_prompt": skill_md, - "student_user_prompt": f"{initial_prompt}\n\n## Task File\n\n{task_md}", + "target_system_prompt": skill_md, + "target_user_prompt": f"{initial_prompt}\n\n## Task File\n\n{task_md}", } deadline = time.time() + task_timeout - client = get_student_client() + client = get_target_client() deployment = _get_deployment() system = _build_system(skill_content) user = _build_user( @@ -699,6 +699,6 @@ def run_multi( "raw": raw, "n_turns": turn + 1, "conversation": conversation, - "student_system_prompt": system, - "student_user_prompt": user, + "target_system_prompt": system, + "target_user_prompt": user, } diff --git a/skillopt/envs/spreadsheetbench/react_agent.py b/skillopt/envs/spreadsheetbench/react_agent.py index 252ca2a..ff296a8 100644 --- a/skillopt/envs/spreadsheetbench/react_agent.py +++ b/skillopt/envs/spreadsheetbench/react_agent.py @@ -11,7 +11,7 @@ import json import os import subprocess -from skillopt.model import chat_student_messages +from skillopt.model import chat_target_messages from skillopt.prompts import load_prompt # ── Tool schemas ───────────────────────────────────────────────────────────── @@ -298,7 +298,7 @@ def _react_loop( n_turns = 0 for _ in range(max_turns): - message, _ = chat_student_messages( + message, _ = chat_target_messages( messages=messages, tools=[BASH_TOOL_CHAT, WRITE_FILE_TOOL_CHAT], tool_choice="auto", @@ -390,6 +390,6 @@ def run_react( diagnostic_trace_context=diagnostic_trace_context, ) result = _react_loop(system, user, work_dir, max_turns, max_output_tokens) - result["student_system_prompt"] = system - 
result["student_user_prompt"] = user + result["target_system_prompt"] = system + result["target_user_prompt"] = user return result diff --git a/skillopt/envs/spreadsheetbench/rollout.py b/skillopt/envs/spreadsheetbench/rollout.py index 7cc616d..d9c35d6 100644 --- a/skillopt/envs/spreadsheetbench/rollout.py +++ b/skillopt/envs/spreadsheetbench/rollout.py @@ -233,37 +233,37 @@ def process_one( no1, ip1, _ = cases[0] pred_path_1 = os.path.join(task_out_dir, f"{no1}_pred.xlsx") - student_prompt_parts = [ + target_prompt_parts = [ f"# Instruction\n{instruction}", f"# Input file\n{ip1}", f"# Output file\n{pred_path_1}", ] if instruction_type: - student_prompt_parts.append(f"# Instruction type\n{instruction_type}") + target_prompt_parts.append(f"# Instruction type\n{instruction_type}") if answer_position_eval: - student_prompt_parts.append(f"# Answer position\n{answer_position_eval}") + target_prompt_parts.append(f"# Answer position\n{answer_position_eval}") if diagnostic_trace_context.strip(): - student_prompt_parts.insert( + target_prompt_parts.insert( 0, "# Previous Codex Trace Snapshot\n" "This is a partial transcript from an earlier attempt. 
Use it as your current reasoning context.\n\n" f"{diagnostic_trace_context.strip()}", ) if diagnostic_mode and diagnostic_instruction.strip(): - student_prompt_parts.append(f"# Training readout\n{diagnostic_instruction.strip()}") - student_user_prompt = "\n\n".join(student_prompt_parts) + target_prompt_parts.append(f"# Training readout\n{diagnostic_instruction.strip()}") + target_user_prompt = "\n\n".join(target_prompt_parts) try: from skillopt.envs.spreadsheetbench.react_agent import _build_system - student_system_prompt = _build_system(skill_content) + target_system_prompt = _build_system(skill_content) except Exception: - student_system_prompt = "" - if student_system_prompt: - with open(os.path.join(task_out_dir, "student_system_prompt.txt"), "w") as f: - f.write(student_system_prompt) - result["student_system_prompt"] = student_system_prompt - with open(os.path.join(task_out_dir, "student_user_prompt.txt"), "w") as f: - f.write(student_user_prompt) - result["student_user_prompt"] = student_user_prompt + target_system_prompt = "" + if target_system_prompt: + with open(os.path.join(task_out_dir, "target_system_prompt.txt"), "w") as f: + f.write(target_system_prompt) + result["target_system_prompt"] = target_system_prompt + with open(os.path.join(task_out_dir, "target_user_prompt.txt"), "w") as f: + f.write(target_user_prompt) + result["target_user_prompt"] = target_user_prompt # ── Stage 1: run ReAct agent on test case 1 ───────────────────── result["phase"] = "agent" @@ -288,14 +288,14 @@ def process_one( diagnostic_trace_context=diagnostic_trace_context, ) result["n_turns"] = agent_result.get("n_turns", 0) - if agent_result.get("student_system_prompt"): - with open(os.path.join(task_out_dir, "student_system_prompt.txt"), "w") as f: - f.write(agent_result["student_system_prompt"]) - result["student_system_prompt"] = agent_result["student_system_prompt"] - if agent_result.get("student_user_prompt"): - with open(os.path.join(task_out_dir, 
"student_user_prompt.txt"), "w") as f: - f.write(agent_result["student_user_prompt"]) - result["student_user_prompt"] = agent_result["student_user_prompt"] + if agent_result.get("target_system_prompt"): + with open(os.path.join(task_out_dir, "target_system_prompt.txt"), "w") as f: + f.write(agent_result["target_system_prompt"]) + result["target_system_prompt"] = agent_result["target_system_prompt"] + if agent_result.get("target_user_prompt"): + with open(os.path.join(task_out_dir, "target_user_prompt.txt"), "w") as f: + f.write(agent_result["target_user_prompt"]) + result["target_user_prompt"] = agent_result["target_user_prompt"] # Save conversation log with open(os.path.join(task_out_dir, "conversation.json"), "w") as f: @@ -606,7 +606,7 @@ def process_one_codegen( task_out_dir = os.path.join(out_root, "predictions", task_id) os.makedirs(task_out_dir, exist_ok=True) - # ── Save context for Teacher (Reflect stage) ────────────────── + # ── Save context for Optimizer (Reflect stage) ────────────────── from skillopt.envs.spreadsheetbench.codegen_agent import ( _preview_workbook, _build_system, _build_user, ) @@ -615,8 +615,8 @@ def process_one_codegen( preview_text = _preview_workbook(first_input_for_preview) except Exception: preview_text = "(preview failed)" - student_system = _build_system(skill_content) - student_user = _build_user( + target_system = _build_system(skill_content) + target_user = _build_user( instruction, first_input_for_preview, instruction_type, @@ -628,14 +628,14 @@ def process_one_codegen( with open(os.path.join(task_out_dir, "spreadsheet_preview.txt"), "w") as f: f.write(preview_text) - with open(os.path.join(task_out_dir, "student_system_prompt.txt"), "w") as f: - f.write(student_system) - with open(os.path.join(task_out_dir, "student_user_prompt.txt"), "w") as f: - f.write(student_user) + with open(os.path.join(task_out_dir, "target_system_prompt.txt"), "w") as f: + f.write(target_system) + with open(os.path.join(task_out_dir, 
"target_user_prompt.txt"), "w") as f: + f.write(target_user) result["spreadsheet_preview"] = preview_text - result["student_system_prompt"] = student_system - result["student_user_prompt"] = student_user + result["target_system_prompt"] = target_system + result["target_user_prompt"] = target_user # ── LLM phase ────────────────────────────────────────────────── result["phase"] = "llm" diff --git a/skillopt/gradient/__init__.py b/skillopt/gradient/__init__.py index 65b5416..0b05ef8 100644 --- a/skillopt/gradient/__init__.py +++ b/skillopt/gradient/__init__.py @@ -1,4 +1,4 @@ -"""ReflACT Gradient -- trajectory analysis and patch generation. +"""SkillOpt Gradient -- trajectory analysis and patch generation. Analogous to gradient computation in neural network training: analyzes minibatch rollout trajectories to produce skill-edit patches (the "gradient" @@ -8,10 +8,8 @@ Modules ------- - reflect: minibatch trajectory analysis (gradient computation) - aggregate: hierarchical patch merging (gradient aggregation) -- deep_probe: diagnostic probe generation (gradient probing) """ from skillopt.gradient.reflect import ( # noqa: F401 run_minibatch_reflect, ) from skillopt.gradient.aggregate import merge_patches # noqa: F401 -from skillopt.gradient.deep_probe import generate_deep_probe_instruction # noqa: F401 diff --git a/skillopt/gradient/aggregate.py b/skillopt/gradient/aggregate.py index 43ef74c..cdad87c 100644 --- a/skillopt/gradient/aggregate.py +++ b/skillopt/gradient/aggregate.py @@ -9,7 +9,7 @@ from __future__ import annotations import json from concurrent.futures import ThreadPoolExecutor, as_completed -from skillopt.model import chat_teacher +from skillopt.model import chat_optimizer from skillopt.optimizer.meta_skill import format_meta_skill_context from skillopt.optimizer.update_modes import ( get_payload_items, @@ -33,17 +33,17 @@ def _merge_batch( meta_skill_context: str = "", level: int = 1, ) -> dict: - """Call teacher LLM to merge a batch of patches into 
one.""" + """Call optimizer LLM to merge a batch of patches into one.""" patches_text = json.dumps(patches, ensure_ascii=False, indent=2) user = ( f"## Current Skill\n{skill_content}\n\n" f"## Patches to merge ({len(patches)} total, merge level {level})\n{patches_text}" ) - teacher_ctx = format_meta_skill_context(meta_skill_context) - if teacher_ctx: - user = f"{teacher_ctx}\n\n{user}" + optimizer_ctx = format_meta_skill_context(meta_skill_context) + if optimizer_ctx: + user = f"{optimizer_ctx}\n\n{user}" try: - response, _ = chat_teacher( + response, _ = chat_optimizer( system=system_prompt, user=user, max_completion_tokens=64000 if is_full_rewrite_minibatch_mode(update_mode) else 4096, @@ -224,11 +224,11 @@ def merge_patches( f"{len(s_edits)} edits\n\n" f"{combined_text}" ) - teacher_ctx = format_meta_skill_context(meta_skill_context) - if teacher_ctx: - user = f"{teacher_ctx}\n\n{user}" + optimizer_ctx = format_meta_skill_context(meta_skill_context) + if optimizer_ctx: + user = f"{optimizer_ctx}\n\n{user}" try: - response, _ = chat_teacher( + response, _ = chat_optimizer( system=merge_final_prompt, user=user, max_completion_tokens=64000 if is_full_rewrite_minibatch_mode(update_mode) else 4096, diff --git a/skillopt/gradient/reflect.py b/skillopt/gradient/reflect.py index 4579aed..197484f 100644 --- a/skillopt/gradient/reflect.py +++ b/skillopt/gradient/reflect.py @@ -15,8 +15,8 @@ Public API ---------- - :func:`fmt_trajectory` -- format one conversation into text - :func:`fmt_minibatch_trajectories` -- format multiple trajectories for batch analysis -- :func:`run_error_analyst_minibatch` -- one teacher call for a group of failures -- :func:`run_success_analyst_minibatch` -- one teacher call for a group of successes +- :func:`run_error_analyst_minibatch` -- one optimizer call for a group of failures +- :func:`run_success_analyst_minibatch` -- one optimizer call for a group of successes - :func:`run_minibatch_reflect` -- full reflect stage dispatcher """ from 
__future__ import annotations @@ -27,7 +27,7 @@ import random import traceback from concurrent.futures import ThreadPoolExecutor, as_completed -from skillopt.model import chat_teacher +from skillopt.model import chat_optimizer from skillopt.optimizer.meta_skill import format_meta_skill_context from skillopt.optimizer.update_modes import ( get_payload_items, @@ -115,7 +115,7 @@ def fmt_minibatch_trajectories( ``"task_type"``, ``"fail_reason"``, etc. Reads ``conversation.json`` for each and formats them together with trajectory headers. - If available, includes the spreadsheet preview and student system prompt + If available, includes the spreadsheet preview and target system prompt so the analyst can see what the agent saw. Parameters @@ -160,32 +160,32 @@ def fmt_minibatch_trajectories( f"{reference_text[:4000]}\n" ) - # ── Append student context (what the agent saw) ────────────── - student_prompt = item.get("student_system_prompt", "") - if not student_prompt: - prompt_path = os.path.join(prediction_dir, tid, "student_system_prompt.txt") + # ── Append target context (what the agent saw) ────────────── + target_prompt = item.get("target_system_prompt", "") + if not target_prompt: + prompt_path = os.path.join(prediction_dir, tid, "target_system_prompt.txt") if os.path.exists(prompt_path): with open(prompt_path) as f: - student_prompt = f.read() - if student_prompt: + target_prompt = f.read() + if target_prompt: header += ( - f"\n#### Student System Prompt\n" - f"{student_prompt[:3000]}\n" + f"\n#### Target System Prompt\n" + f"{target_prompt[:3000]}\n" ) - user_prompt = item.get("student_user_prompt", "") + user_prompt = item.get("target_user_prompt", "") if not user_prompt: - user_prompt_path = os.path.join(prediction_dir, tid, "student_user_prompt.txt") + user_prompt_path = os.path.join(prediction_dir, tid, "target_user_prompt.txt") if os.path.exists(user_prompt_path): with open(user_prompt_path) as f: user_prompt = f.read() if user_prompt: header += ( - f"\n#### 
Student User Prompt\n" + f"\n#### Target User Prompt\n" f"{user_prompt[:3000]}\n" ) - if os.environ.get("REFLACT_CODEX_TRACE_TO_TEACHER", "0") == "1": + if os.environ.get("REFLACT_CODEX_TRACE_TO_OPTIMIZER", "0") == "1": codex_trace_summary = item.get("codex_trace_summary", "") if not codex_trace_summary: codex_trace_summary_path = os.path.join(prediction_dir, tid, "codex_trace_summary.txt") @@ -262,7 +262,7 @@ def run_error_analyst_minibatch( meta_skill_context: str = "", update_mode: str = "patch", ) -> dict | None: - """Analyze a minibatch of failed trajectories in one teacher call. + """Analyze a minibatch of failed trajectories in one optimizer call. Parameters ---------- @@ -315,13 +315,13 @@ def run_error_analyst_minibatch( ctx = f"{ctx}\n{trajectory_memory_context}" if ctx else trajectory_memory_context if ctx.strip(): user += f"## Previous Steps in This Epoch\n{ctx}\n\n" - teacher_ctx = format_meta_skill_context(meta_skill_context) - if teacher_ctx: - user += teacher_ctx + "\n\n" + optimizer_ctx = format_meta_skill_context(meta_skill_context) + if optimizer_ctx: + user += optimizer_ctx + "\n\n" user += f"## Failed Trajectories ({len(items)} total)\n{trajectories_text}" try: - response, _ = chat_teacher( + response, _ = chat_optimizer( system=actual_system, user=user, max_completion_tokens=64000 if is_full_rewrite_minibatch_mode(mode) else 4096, retries=3, @@ -350,7 +350,7 @@ def run_success_analyst_minibatch( meta_skill_context: str = "", update_mode: str = "patch", ) -> dict | None: - """Analyze a minibatch of successful trajectories in one teacher call. + """Analyze a minibatch of successful trajectories in one optimizer call. 
Parameters ---------- @@ -390,13 +390,13 @@ def run_success_analyst_minibatch( ctx = step_buffer_context or trajectory_memory_context or "" if ctx.strip(): user += f"## Previous Steps in This Epoch\n{ctx}\n\n" - teacher_ctx = format_meta_skill_context(meta_skill_context) - if teacher_ctx: - user += teacher_ctx + "\n\n" + optimizer_ctx = format_meta_skill_context(meta_skill_context) + if optimizer_ctx: + user += optimizer_ctx + "\n\n" user += f"## Successful Trajectories ({len(items)} total)\n{trajectories_text}" try: - response, _ = chat_teacher( + response, _ = chat_optimizer( system=actual_system, user=user, max_completion_tokens=64000 if is_full_rewrite_minibatch_mode(mode) else 4096, retries=3, @@ -454,7 +454,7 @@ def run_minibatch_reflect( meta_skill_context: str = "", update_mode: str = "patch", ) -> list[dict | None]: - """Full minibatch reflect stage: group → parallel teacher calls → patches. + """Full minibatch reflect stage: group → parallel optimizer calls → patches. Separates failure and success trajectories, splits each into minibatches of size M, runs all minibatches in parallel, and saves patch files. @@ -470,7 +470,7 @@ def run_minibatch_reflect( patches_dir : str Path to save per-minibatch patch JSON files. workers : int - Max parallel teacher calls. + Max parallel optimizer calls. failure_only : bool If True, skip success trajectories. 
minibatch_size : int diff --git a/skillopt/model/__init__.py b/skillopt/model/__init__.py index 5edde31..bd33aa1 100644 --- a/skillopt/model/__init__.py +++ b/skillopt/model/__init__.py @@ -1,4 +1,4 @@ -"""ReflACT model API with runtime backend selection for the student path.""" +"""ReflACT model API with runtime backend selection for the target path.""" from __future__ import annotations @@ -12,73 +12,73 @@ from skillopt.model.backend_config import ( # noqa: F401 configure_codex_exec, get_claude_code_exec_config, get_codex_exec_config, - get_student_backend, - get_teacher_backend, - is_student_chat_backend, - is_student_exec_backend, - is_teacher_chat_backend, - set_student_backend, - set_teacher_backend, + get_target_backend, + get_optimizer_backend, + is_target_chat_backend, + is_target_exec_backend, + is_optimizer_chat_backend, + set_target_backend, + set_optimizer_backend, ) def set_backend(name: str | None) -> str: """Backward-compatible global backend setter. - Historically the codebase used one shared backend for both teacher and - student. Keep that entry point so older scripts continue to work, while - mapping it onto the split teacher/student backend model. + Historically the codebase used one shared backend for both optimizer and + target. Keep that entry point so older scripts continue to work, while + mapping it onto the split optimizer/target backend model. 
""" normalized = str(name or "azure_openai").strip().lower() if normalized in {"azure_openai", "openai_chat", "azure", "azure-openai"}: - set_teacher_backend("openai_chat") - set_student_backend("openai_chat") + set_optimizer_backend("openai_chat") + set_target_backend("openai_chat") return "azure_openai" if normalized in {"claude", "claude_chat", "anthropic"}: - set_teacher_backend("claude_chat") - set_student_backend("claude_chat") + set_optimizer_backend("claude_chat") + set_target_backend("claude_chat") return "claude_chat" if normalized == "codex": - set_teacher_backend("openai_chat") - set_student_backend("codex_exec") + set_optimizer_backend("openai_chat") + set_target_backend("codex_exec") return "codex" if normalized in {"codex_exec", "claude_code_exec"}: - set_teacher_backend("openai_chat") - set_student_backend(normalized) + set_optimizer_backend("openai_chat") + set_target_backend(normalized) return normalized if normalized in {"qwen", "qwen_chat"}: - set_teacher_backend("openai_chat") - set_student_backend("qwen_chat") + set_optimizer_backend("openai_chat") + set_target_backend("qwen_chat") return "qwen_chat" raise ValueError(f"Unsupported legacy backend: {name!r}") def get_backend_name() -> str: """Best-effort backward-compatible backend summary.""" - teacher = get_teacher_backend() - student = get_student_backend() - if teacher == "claude_chat" and student == "claude_chat": + optimizer = get_optimizer_backend() + target = get_target_backend() + if optimizer == "claude_chat" and target == "claude_chat": return "claude_chat" - if teacher == "openai_chat" and student == "openai_chat": + if optimizer == "openai_chat" and target == "openai_chat": return "azure_openai" - if teacher == "openai_chat" and student == "codex_exec": + if optimizer == "openai_chat" and target == "codex_exec": return "codex" - if teacher == "openai_chat" and student == "qwen_chat": + if optimizer == "openai_chat" and target == "qwen_chat": return "qwen_chat" - return 
f"{teacher}+{student}" + return f"{optimizer}+{target}" -def chat_teacher( +def chat_optimizer( system: str, user: str, max_completion_tokens: int = 16384, retries: int = 5, - stage: str = "teacher", + stage: str = "optimizer", reasoning_effort: str | None = None, timeout: int | None = None, ) -> tuple[str, dict]: - if get_teacher_backend() == "claude_chat": - return _claude.chat_teacher( + if get_optimizer_backend() == "claude_chat": + return _claude.chat_optimizer( system=system, user=user, max_completion_tokens=max_completion_tokens, @@ -86,7 +86,7 @@ def chat_teacher( stage=stage, timeout=timeout, ) - return _openai.chat_teacher( + return _openai.chat_optimizer( system=system, user=user, max_completion_tokens=max_completion_tokens, @@ -97,17 +97,17 @@ def chat_teacher( ) -def chat_student( +def chat_target( system: str, user: str, max_completion_tokens: int = 16384, retries: int = 5, - stage: str = "student", + stage: str = "target", reasoning_effort: str | None = None, timeout: int | None = None, ) -> tuple[str, dict]: - if get_student_backend() == "claude_chat": - return _claude.chat_student( + if get_target_backend() == "claude_chat": + return _claude.chat_target( system=system, user=user, max_completion_tokens=max_completion_tokens, @@ -115,8 +115,8 @@ def chat_student( stage=stage, timeout=timeout, ) - if get_student_backend() == "qwen_chat": - return _qwen.chat_student( + if get_target_backend() == "qwen_chat": + return _qwen.chat_target( system=system, user=user, max_completion_tokens=max_completion_tokens, @@ -124,12 +124,12 @@ def chat_student( stage=stage, reasoning_effort=reasoning_effort, ) - if not is_student_chat_backend(): + if not is_target_chat_backend(): raise NotImplementedError( - "chat_student is only supported with student_backend=openai_chat, claude_chat, or qwen_chat. " + "chat_target is only supported with target_backend=openai_chat, claude_chat, or qwen_chat. " "Exec backends are handled in environment-specific rollout code." 
) - return _openai.chat_student( + return _openai.chat_target( system=system, user=user, max_completion_tokens=max_completion_tokens, @@ -140,11 +140,11 @@ def chat_student( ) -def chat_teacher_messages( +def chat_optimizer_messages( messages: list[dict[str, Any]], max_completion_tokens: int = 16384, retries: int = 5, - stage: str = "teacher", + stage: str = "optimizer", reasoning_effort: str | None = None, *, tools: list[dict[str, Any]] | None = None, @@ -152,8 +152,8 @@ def chat_teacher_messages( return_message: bool = False, timeout: int | None = None, ) -> tuple[Any, dict]: - if get_teacher_backend() == "claude_chat": - return _claude.chat_teacher_messages( + if get_optimizer_backend() == "claude_chat": + return _claude.chat_optimizer_messages( messages=messages, max_completion_tokens=max_completion_tokens, retries=retries, @@ -163,7 +163,7 @@ def chat_teacher_messages( return_message=return_message, timeout=timeout, ) - return _openai.chat_teacher_messages( + return _openai.chat_optimizer_messages( messages=messages, max_completion_tokens=max_completion_tokens, retries=retries, @@ -176,11 +176,11 @@ def chat_teacher_messages( ) -def chat_student_messages( +def chat_target_messages( messages: list[dict[str, Any]], max_completion_tokens: int = 16384, retries: int = 5, - stage: str = "student", + stage: str = "target", reasoning_effort: str | None = None, *, tools: list[dict[str, Any]] | None = None, @@ -188,8 +188,8 @@ def chat_student_messages( return_message: bool = False, timeout: int | None = None, ) -> tuple[Any, dict]: - if get_student_backend() == "claude_chat": - return _claude.chat_student_messages( + if get_target_backend() == "claude_chat": + return _claude.chat_target_messages( messages=messages, max_completion_tokens=max_completion_tokens, retries=retries, @@ -199,8 +199,8 @@ def chat_student_messages( return_message=return_message, timeout=timeout, ) - if get_student_backend() == "qwen_chat": - return _qwen.chat_student_messages( + if 
get_target_backend() == "qwen_chat": + return _qwen.chat_target_messages( messages=messages, max_completion_tokens=max_completion_tokens, retries=retries, @@ -210,12 +210,12 @@ def chat_student_messages( tool_choice=tool_choice, return_message=return_message, ) - if not is_student_chat_backend(): + if not is_target_chat_backend(): raise NotImplementedError( - "chat_student_messages is only supported with student_backend=openai_chat, claude_chat, or qwen_chat. " + "chat_target_messages is only supported with target_backend=openai_chat, claude_chat, or qwen_chat. " "Exec backends are handled in environment-specific rollout code." ) - return _openai.chat_student_messages( + return _openai.chat_target_messages( messages=messages, max_completion_tokens=max_completion_tokens, retries=retries, @@ -332,18 +332,18 @@ def configure_azure_openai( auth_mode: str | None = None, ad_scope: str | None = None, managed_identity_client_id: str | None = None, - teacher_endpoint: str | None = None, - teacher_api_version: str | None = None, - teacher_api_key: str | None = None, - teacher_auth_mode: str | None = None, - teacher_ad_scope: str | None = None, - teacher_managed_identity_client_id: str | None = None, - student_endpoint: str | None = None, - student_api_version: str | None = None, - student_api_key: str | None = None, - student_auth_mode: str | None = None, - student_ad_scope: str | None = None, - student_managed_identity_client_id: str | None = None, + optimizer_endpoint: str | None = None, + optimizer_api_version: str | None = None, + optimizer_api_key: str | None = None, + optimizer_auth_mode: str | None = None, + optimizer_ad_scope: str | None = None, + optimizer_managed_identity_client_id: str | None = None, + target_endpoint: str | None = None, + target_api_version: str | None = None, + target_api_key: str | None = None, + target_auth_mode: str | None = None, + target_ad_scope: str | None = None, + target_managed_identity_client_id: str | None = None, ) -> None: 
_openai.configure_azure_openai( endpoint=endpoint, @@ -352,18 +352,18 @@ def configure_azure_openai( auth_mode=auth_mode, ad_scope=ad_scope, managed_identity_client_id=managed_identity_client_id, - teacher_endpoint=teacher_endpoint, - teacher_api_version=teacher_api_version, - teacher_api_key=teacher_api_key, - teacher_auth_mode=teacher_auth_mode, - teacher_ad_scope=teacher_ad_scope, - teacher_managed_identity_client_id=teacher_managed_identity_client_id, - student_endpoint=student_endpoint, - student_api_version=student_api_version, - student_api_key=student_api_key, - student_auth_mode=student_auth_mode, - student_ad_scope=student_ad_scope, - student_managed_identity_client_id=student_managed_identity_client_id, + optimizer_endpoint=optimizer_endpoint, + optimizer_api_version=optimizer_api_version, + optimizer_api_key=optimizer_api_key, + optimizer_auth_mode=optimizer_auth_mode, + optimizer_ad_scope=optimizer_ad_scope, + optimizer_managed_identity_client_id=optimizer_managed_identity_client_id, + target_endpoint=target_endpoint, + target_api_version=target_api_version, + target_api_key=target_api_key, + target_auth_mode=target_auth_mode, + target_ad_scope=target_ad_scope, + target_managed_identity_client_id=target_managed_identity_client_id, ) @@ -392,12 +392,12 @@ def set_reasoning_effort(effort: str | None) -> None: _qwen.set_reasoning_effort(effort) -def set_student_deployment(deployment: str) -> None: - _openai.set_student_deployment(deployment) - _claude.set_student_deployment(deployment) - _qwen.set_student_deployment(deployment) +def set_target_deployment(deployment: str) -> None: + _openai.set_target_deployment(deployment) + _claude.set_target_deployment(deployment) + _qwen.set_target_deployment(deployment) -def set_teacher_deployment(deployment: str) -> None: - _openai.set_teacher_deployment(deployment) - _claude.set_teacher_deployment(deployment) +def set_optimizer_deployment(deployment: str) -> None: + _openai.set_optimizer_deployment(deployment) + 
_claude.set_optimizer_deployment(deployment) diff --git a/skillopt/model/azure_openai.py b/skillopt/model/azure_openai.py index 9ba4733..92a86e7 100644 --- a/skillopt/model/azure_openai.py +++ b/skillopt/model/azure_openai.py @@ -1,6 +1,6 @@ """ReflACT Model backend — Azure OpenAI wrapper with token tracking. -Provides teacher/student dual-deployment chat functions and a global +Provides optimizer/target dual-deployment chat functions and a global TokenTracker for per-stage cost accounting. Previously llm/azure_openai.py. """ from __future__ import annotations @@ -35,69 +35,69 @@ MANAGED_IDENTITY_CLIENT_ID = os.environ.get( "", ).strip() -TEACHER_ENDPOINT = ( - os.environ.get("TEACHER_AZURE_OPENAI_ENDPOINT") - or os.environ.get("AZURE_OPENAI_TEACHER_ENDPOINT") +OPTIMIZER_ENDPOINT = ( + os.environ.get("OPTIMIZER_AZURE_OPENAI_ENDPOINT") + or os.environ.get("AZURE_OPENAI_OPTIMIZER_ENDPOINT") or ENDPOINT ) -STUDENT_ENDPOINT = ( - os.environ.get("STUDENT_AZURE_OPENAI_ENDPOINT") - or os.environ.get("AZURE_OPENAI_STUDENT_ENDPOINT") +TARGET_ENDPOINT = ( + os.environ.get("TARGET_AZURE_OPENAI_ENDPOINT") + or os.environ.get("AZURE_OPENAI_TARGET_ENDPOINT") or ENDPOINT ) -TEACHER_API_VERSION = ( - os.environ.get("TEACHER_AZURE_OPENAI_API_VERSION") - or os.environ.get("AZURE_OPENAI_TEACHER_API_VERSION") +OPTIMIZER_API_VERSION = ( + os.environ.get("OPTIMIZER_AZURE_OPENAI_API_VERSION") + or os.environ.get("AZURE_OPENAI_OPTIMIZER_API_VERSION") or API_VERSION ) -STUDENT_API_VERSION = ( - os.environ.get("STUDENT_AZURE_OPENAI_API_VERSION") - or os.environ.get("AZURE_OPENAI_STUDENT_API_VERSION") +TARGET_API_VERSION = ( + os.environ.get("TARGET_AZURE_OPENAI_API_VERSION") + or os.environ.get("AZURE_OPENAI_TARGET_API_VERSION") or API_VERSION ) -TEACHER_API_KEY = ( - os.environ.get("TEACHER_AZURE_OPENAI_API_KEY") - or os.environ.get("AZURE_OPENAI_TEACHER_API_KEY") +OPTIMIZER_API_KEY = ( + os.environ.get("OPTIMIZER_AZURE_OPENAI_API_KEY") + or os.environ.get("AZURE_OPENAI_OPTIMIZER_API_KEY") 
or API_KEY ) -STUDENT_API_KEY = ( - os.environ.get("STUDENT_AZURE_OPENAI_API_KEY") - or os.environ.get("AZURE_OPENAI_STUDENT_API_KEY") +TARGET_API_KEY = ( + os.environ.get("TARGET_AZURE_OPENAI_API_KEY") + or os.environ.get("AZURE_OPENAI_TARGET_API_KEY") or API_KEY ) -TEACHER_AUTH_MODE = ( - os.environ.get("TEACHER_AZURE_OPENAI_AUTH_MODE") - or os.environ.get("AZURE_OPENAI_TEACHER_AUTH_MODE") +OPTIMIZER_AUTH_MODE = ( + os.environ.get("OPTIMIZER_AZURE_OPENAI_AUTH_MODE") + or os.environ.get("AZURE_OPENAI_OPTIMIZER_AUTH_MODE") or AUTH_MODE ).strip().lower() -STUDENT_AUTH_MODE = ( - os.environ.get("STUDENT_AZURE_OPENAI_AUTH_MODE") - or os.environ.get("AZURE_OPENAI_STUDENT_AUTH_MODE") +TARGET_AUTH_MODE = ( + os.environ.get("TARGET_AZURE_OPENAI_AUTH_MODE") + or os.environ.get("AZURE_OPENAI_TARGET_AUTH_MODE") or AUTH_MODE ).strip().lower() -TEACHER_AD_SCOPE = ( - os.environ.get("TEACHER_AZURE_OPENAI_AD_SCOPE") - or os.environ.get("AZURE_OPENAI_TEACHER_AD_SCOPE") +OPTIMIZER_AD_SCOPE = ( + os.environ.get("OPTIMIZER_AZURE_OPENAI_AD_SCOPE") + or os.environ.get("AZURE_OPENAI_OPTIMIZER_AD_SCOPE") or AD_SCOPE ) -STUDENT_AD_SCOPE = ( - os.environ.get("STUDENT_AZURE_OPENAI_AD_SCOPE") - or os.environ.get("AZURE_OPENAI_STUDENT_AD_SCOPE") +TARGET_AD_SCOPE = ( + os.environ.get("TARGET_AZURE_OPENAI_AD_SCOPE") + or os.environ.get("AZURE_OPENAI_TARGET_AD_SCOPE") or AD_SCOPE ) -TEACHER_MANAGED_IDENTITY_CLIENT_ID = ( - os.environ.get("TEACHER_AZURE_OPENAI_MANAGED_IDENTITY_CLIENT_ID") - or os.environ.get("AZURE_OPENAI_TEACHER_MANAGED_IDENTITY_CLIENT_ID") +OPTIMIZER_MANAGED_IDENTITY_CLIENT_ID = ( + os.environ.get("OPTIMIZER_AZURE_OPENAI_MANAGED_IDENTITY_CLIENT_ID") + or os.environ.get("AZURE_OPENAI_OPTIMIZER_MANAGED_IDENTITY_CLIENT_ID") or MANAGED_IDENTITY_CLIENT_ID ).strip() -STUDENT_MANAGED_IDENTITY_CLIENT_ID = ( - os.environ.get("STUDENT_AZURE_OPENAI_MANAGED_IDENTITY_CLIENT_ID") - or os.environ.get("AZURE_OPENAI_STUDENT_MANAGED_IDENTITY_CLIENT_ID") +TARGET_MANAGED_IDENTITY_CLIENT_ID = ( + 
os.environ.get("TARGET_AZURE_OPENAI_MANAGED_IDENTITY_CLIENT_ID") + or os.environ.get("AZURE_OPENAI_TARGET_MANAGED_IDENTITY_CLIENT_ID") or MANAGED_IDENTITY_CLIENT_ID ).strip() -TEACHER_DEPLOYMENT = os.environ.get("TEACHER_DEPLOYMENT", "gpt-5.5") -STUDENT_DEPLOYMENT = os.environ.get("STUDENT_DEPLOYMENT", "gpt-5.5") +OPTIMIZER_DEPLOYMENT = os.environ.get("OPTIMIZER_DEPLOYMENT", "gpt-4o") +TARGET_DEPLOYMENT = os.environ.get("TARGET_DEPLOYMENT", "gpt-4o") REASONING_EFFORT: str | None = None @@ -177,30 +177,30 @@ tracker = TokenTracker() # ── Client management ───────────────────────────────────────────────────────── -_teacher_client: AzureOpenAI | None = None -_student_client: AzureOpenAI | None = None -_teacher_lock = threading.Lock() -_student_lock = threading.Lock() +_optimizer_client: AzureOpenAI | None = None +_target_client: AzureOpenAI | None = None +_optimizer_lock = threading.Lock() +_target_lock = threading.Lock() def _role_config(role: str) -> dict[str, str]: - if role == "teacher": + if role == "optimizer": return { - "endpoint": TEACHER_ENDPOINT, - "api_version": TEACHER_API_VERSION, - "api_key": TEACHER_API_KEY, - "auth_mode": TEACHER_AUTH_MODE, - "ad_scope": TEACHER_AD_SCOPE, - "managed_identity_client_id": TEACHER_MANAGED_IDENTITY_CLIENT_ID, + "endpoint": OPTIMIZER_ENDPOINT, + "api_version": OPTIMIZER_API_VERSION, + "api_key": OPTIMIZER_API_KEY, + "auth_mode": OPTIMIZER_AUTH_MODE, + "ad_scope": OPTIMIZER_AD_SCOPE, + "managed_identity_client_id": OPTIMIZER_MANAGED_IDENTITY_CLIENT_ID, } - if role == "student": + if role == "target": return { - "endpoint": STUDENT_ENDPOINT, - "api_version": STUDENT_API_VERSION, - "api_key": STUDENT_API_KEY, - "auth_mode": STUDENT_AUTH_MODE, - "ad_scope": STUDENT_AD_SCOPE, - "managed_identity_client_id": STUDENT_MANAGED_IDENTITY_CLIENT_ID, + "endpoint": TARGET_ENDPOINT, + "api_version": TARGET_API_VERSION, + "api_key": TARGET_API_KEY, + "auth_mode": TARGET_AUTH_MODE, + "ad_scope": TARGET_AD_SCOPE, + 
"managed_identity_client_id": TARGET_MANAGED_IDENTITY_CLIENT_ID, } raise ValueError(f"Unknown Azure OpenAI client role: {role!r}") @@ -280,6 +280,12 @@ def _make_azure_cli_token_provider(ad_scope: str): def _make_client(role: str) -> AzureOpenAI: cfg = _role_config(role) + if not cfg["endpoint"]: + raise ValueError( + f"Azure OpenAI endpoint is not configured for {role}. " + "Pass --azure_openai_endpoint https://your-resource.openai.azure.com/ " + "or set AZURE_OPENAI_ENDPOINT in your environment." + ) auth_mode = cfg["auth_mode"] if auth_mode in {"api_key", "key"}: if not cfg["api_key"]: @@ -303,29 +309,29 @@ def _make_client(role: str) -> AzureOpenAI: ) -def get_teacher_client() -> AzureOpenAI: - global _teacher_client - with _teacher_lock: - if _teacher_client is None: - _teacher_client = _make_client("teacher") - return _teacher_client +def get_optimizer_client() -> AzureOpenAI: + global _optimizer_client + with _optimizer_lock: + if _optimizer_client is None: + _optimizer_client = _make_client("optimizer") + return _optimizer_client -def get_student_client() -> AzureOpenAI | OpenAI: - global _student_client - with _student_lock: - if _student_client is None: +def get_target_client() -> AzureOpenAI | OpenAI: + global _target_client + with _target_lock: + if _target_client is None: # When using qwen_chat backend, return an OpenAI client pointing to vLLM - from skillopt.model.backend_config import get_student_backend - if get_student_backend() == "qwen_chat": + from skillopt.model.backend_config import get_target_backend + if get_target_backend() == "qwen_chat": from skillopt.model import qwen_backend as _qwen - _student_client = OpenAI( + _target_client = OpenAI( base_url=_qwen.BASE_URL, api_key=_qwen.API_KEY or "dummy", ) else: - _student_client = _make_client("student") - return _student_client + _target_client = _make_client("target") + return _target_client def _needs_responses_api(deployment: str) -> bool: @@ -587,25 +593,25 @@ def configure_azure_openai( 
auth_mode: str | None = None, ad_scope: str | None = None, managed_identity_client_id: str | None = None, - teacher_endpoint: str | None = None, - teacher_api_version: str | None = None, - teacher_api_key: str | None = None, - teacher_auth_mode: str | None = None, - teacher_ad_scope: str | None = None, - teacher_managed_identity_client_id: str | None = None, - student_endpoint: str | None = None, - student_api_version: str | None = None, - student_api_key: str | None = None, - student_auth_mode: str | None = None, - student_ad_scope: str | None = None, - student_managed_identity_client_id: str | None = None, + optimizer_endpoint: str | None = None, + optimizer_api_version: str | None = None, + optimizer_api_key: str | None = None, + optimizer_auth_mode: str | None = None, + optimizer_ad_scope: str | None = None, + optimizer_managed_identity_client_id: str | None = None, + target_endpoint: str | None = None, + target_api_version: str | None = None, + target_api_key: str | None = None, + target_auth_mode: str | None = None, + target_ad_scope: str | None = None, + target_managed_identity_client_id: str | None = None, ) -> None: global ENDPOINT, API_VERSION, API_KEY, AUTH_MODE, AD_SCOPE, MANAGED_IDENTITY_CLIENT_ID - global TEACHER_ENDPOINT, TEACHER_API_VERSION, TEACHER_API_KEY, TEACHER_AUTH_MODE - global TEACHER_AD_SCOPE, TEACHER_MANAGED_IDENTITY_CLIENT_ID - global STUDENT_ENDPOINT, STUDENT_API_VERSION, STUDENT_API_KEY, STUDENT_AUTH_MODE - global STUDENT_AD_SCOPE, STUDENT_MANAGED_IDENTITY_CLIENT_ID - global _teacher_client, _student_client + global OPTIMIZER_ENDPOINT, OPTIMIZER_API_VERSION, OPTIMIZER_API_KEY, OPTIMIZER_AUTH_MODE + global OPTIMIZER_AD_SCOPE, OPTIMIZER_MANAGED_IDENTITY_CLIENT_ID + global TARGET_ENDPOINT, TARGET_API_VERSION, TARGET_API_KEY, TARGET_AUTH_MODE + global TARGET_AD_SCOPE, TARGET_MANAGED_IDENTITY_CLIENT_ID + global _optimizer_client, _target_client def _clean(value: str | None, *, lower: bool = False) -> str | None: if value is None: @@ -641,72 
+647,72 @@ def configure_azure_openai( "AZURE_OPENAI_MANAGED_IDENTITY_CLIENT_ID", ) - resolved_teacher_endpoint = _clean(teacher_endpoint) or shared_endpoint - resolved_teacher_api_version = _clean(teacher_api_version) or shared_api_version - resolved_teacher_api_key = _clean(teacher_api_key) or shared_api_key - resolved_teacher_auth_mode = _clean(teacher_auth_mode, lower=True) or shared_auth_mode - resolved_teacher_ad_scope = _clean(teacher_ad_scope) or shared_ad_scope - resolved_teacher_mi = ( - _clean(teacher_managed_identity_client_id) + resolved_optimizer_endpoint = _clean(optimizer_endpoint) or shared_endpoint + resolved_optimizer_api_version = _clean(optimizer_api_version) or shared_api_version + resolved_optimizer_api_key = _clean(optimizer_api_key) or shared_api_key + resolved_optimizer_auth_mode = _clean(optimizer_auth_mode, lower=True) or shared_auth_mode + resolved_optimizer_ad_scope = _clean(optimizer_ad_scope) or shared_ad_scope + resolved_optimizer_mi = ( + _clean(optimizer_managed_identity_client_id) or shared_managed_identity_client_id ) - resolved_student_endpoint = _clean(student_endpoint) or shared_endpoint - resolved_student_api_version = _clean(student_api_version) or shared_api_version - resolved_student_api_key = _clean(student_api_key) or shared_api_key - resolved_student_auth_mode = _clean(student_auth_mode, lower=True) or shared_auth_mode - resolved_student_ad_scope = _clean(student_ad_scope) or shared_ad_scope - resolved_student_mi = ( - _clean(student_managed_identity_client_id) + resolved_target_endpoint = _clean(target_endpoint) or shared_endpoint + resolved_target_api_version = _clean(target_api_version) or shared_api_version + resolved_target_api_key = _clean(target_api_key) or shared_api_key + resolved_target_auth_mode = _clean(target_auth_mode, lower=True) or shared_auth_mode + resolved_target_ad_scope = _clean(target_ad_scope) or shared_ad_scope + resolved_target_mi = ( + _clean(target_managed_identity_client_id) or 
shared_managed_identity_client_id ) - _set("TEACHER_ENDPOINT", resolved_teacher_endpoint, "TEACHER_AZURE_OPENAI_ENDPOINT") + _set("OPTIMIZER_ENDPOINT", resolved_optimizer_endpoint, "OPTIMIZER_AZURE_OPENAI_ENDPOINT") _set( - "TEACHER_API_VERSION", - resolved_teacher_api_version, - "TEACHER_AZURE_OPENAI_API_VERSION", + "OPTIMIZER_API_VERSION", + resolved_optimizer_api_version, + "OPTIMIZER_AZURE_OPENAI_API_VERSION", ) - _set("TEACHER_API_KEY", resolved_teacher_api_key, "TEACHER_AZURE_OPENAI_API_KEY") - _set("TEACHER_AUTH_MODE", resolved_teacher_auth_mode, "TEACHER_AZURE_OPENAI_AUTH_MODE") - _set("TEACHER_AD_SCOPE", resolved_teacher_ad_scope, "TEACHER_AZURE_OPENAI_AD_SCOPE") + _set("OPTIMIZER_API_KEY", resolved_optimizer_api_key, "OPTIMIZER_AZURE_OPENAI_API_KEY") + _set("OPTIMIZER_AUTH_MODE", resolved_optimizer_auth_mode, "OPTIMIZER_AZURE_OPENAI_AUTH_MODE") + _set("OPTIMIZER_AD_SCOPE", resolved_optimizer_ad_scope, "OPTIMIZER_AZURE_OPENAI_AD_SCOPE") _set( - "TEACHER_MANAGED_IDENTITY_CLIENT_ID", - resolved_teacher_mi, - "TEACHER_AZURE_OPENAI_MANAGED_IDENTITY_CLIENT_ID", + "OPTIMIZER_MANAGED_IDENTITY_CLIENT_ID", + resolved_optimizer_mi, + "OPTIMIZER_AZURE_OPENAI_MANAGED_IDENTITY_CLIENT_ID", ) - _set("STUDENT_ENDPOINT", resolved_student_endpoint, "STUDENT_AZURE_OPENAI_ENDPOINT") + _set("TARGET_ENDPOINT", resolved_target_endpoint, "TARGET_AZURE_OPENAI_ENDPOINT") _set( - "STUDENT_API_VERSION", - resolved_student_api_version, - "STUDENT_AZURE_OPENAI_API_VERSION", + "TARGET_API_VERSION", + resolved_target_api_version, + "TARGET_AZURE_OPENAI_API_VERSION", ) - _set("STUDENT_API_KEY", resolved_student_api_key, "STUDENT_AZURE_OPENAI_API_KEY") - _set("STUDENT_AUTH_MODE", resolved_student_auth_mode, "STUDENT_AZURE_OPENAI_AUTH_MODE") - _set("STUDENT_AD_SCOPE", resolved_student_ad_scope, "STUDENT_AZURE_OPENAI_AD_SCOPE") + _set("TARGET_API_KEY", resolved_target_api_key, "TARGET_AZURE_OPENAI_API_KEY") + _set("TARGET_AUTH_MODE", resolved_target_auth_mode, 
"TARGET_AZURE_OPENAI_AUTH_MODE") + _set("TARGET_AD_SCOPE", resolved_target_ad_scope, "TARGET_AZURE_OPENAI_AD_SCOPE") _set( - "STUDENT_MANAGED_IDENTITY_CLIENT_ID", - resolved_student_mi, - "STUDENT_AZURE_OPENAI_MANAGED_IDENTITY_CLIENT_ID", + "TARGET_MANAGED_IDENTITY_CLIENT_ID", + resolved_target_mi, + "TARGET_AZURE_OPENAI_MANAGED_IDENTITY_CLIENT_ID", ) - with _teacher_lock: - _teacher_client = None - with _student_lock: - _student_client = None + with _optimizer_lock: + _optimizer_client = None + with _target_lock: + _target_client = None -def chat_teacher( +def chat_optimizer( system: str, user: str, max_completion_tokens: int = 16384, retries: int = 5, - stage: str = "teacher", + stage: str = "optimizer", reasoning_effort: str | None = None, timeout: int | None = None, ) -> tuple[str, dict]: - """Call the teacher model. Returns (response_text, usage_dict).""" + """Call the optimizer model. Returns (response_text, usage_dict).""" return _chat_impl( - get_teacher_client(), TEACHER_DEPLOYMENT, + get_optimizer_client(), OPTIMIZER_DEPLOYMENT, system, user, max_completion_tokens, retries, stage, reasoning_effort, timeout, ) @@ -723,7 +729,7 @@ def chat_with_deployment( ) -> tuple[str, dict]: """Call an arbitrary deployment using the shared Azure client.""" return _chat_impl( - get_teacher_client(), + get_optimizer_client(), deployment, system, user, @@ -735,27 +741,27 @@ def chat_with_deployment( ) -def chat_student( +def chat_target( system: str, user: str, max_completion_tokens: int = 16384, retries: int = 5, - stage: str = "student", + stage: str = "target", reasoning_effort: str | None = None, timeout: int | None = None, ) -> tuple[str, dict]: - """Call the student model. Returns (response_text, usage_dict).""" + """Call the target model. 
Returns (response_text, usage_dict).""" return _chat_impl( - get_student_client(), STUDENT_DEPLOYMENT, + get_target_client(), TARGET_DEPLOYMENT, system, user, max_completion_tokens, retries, stage, reasoning_effort, timeout, ) -def chat_teacher_messages( +def chat_optimizer_messages( messages: list[dict[str, Any]], max_completion_tokens: int = 16384, retries: int = 5, - stage: str = "teacher", + stage: str = "optimizer", reasoning_effort: str | None = None, *, tools: list[dict[str, Any]] | None = None, @@ -763,10 +769,10 @@ def chat_teacher_messages( return_message: bool = False, timeout: int | None = None, ) -> tuple[Any, dict]: - """Call the teacher model with a pre-built chat message list.""" + """Call the optimizer model with a pre-built chat message list.""" return _chat_messages_impl( - get_teacher_client(), - TEACHER_DEPLOYMENT, + get_optimizer_client(), + OPTIMIZER_DEPLOYMENT, messages, max_completion_tokens, retries, @@ -794,7 +800,7 @@ def chat_messages_with_deployment( ) -> tuple[Any, dict]: """Call an arbitrary deployment with a pre-built chat message list.""" return _chat_messages_impl( - get_teacher_client(), + get_optimizer_client(), deployment, messages, max_completion_tokens, @@ -808,11 +814,11 @@ def chat_messages_with_deployment( ) -def chat_student_messages( +def chat_target_messages( messages: list[dict[str, Any]], max_completion_tokens: int = 16384, retries: int = 5, - stage: str = "student", + stage: str = "target", reasoning_effort: str | None = None, *, tools: list[dict[str, Any]] | None = None, @@ -820,10 +826,10 @@ def chat_student_messages( return_message: bool = False, timeout: int | None = None, ) -> tuple[Any, dict]: - """Call the student model with a pre-built chat message list.""" + """Call the target model with a pre-built chat message list.""" return _chat_messages_impl( - get_student_client(), - STUDENT_DEPLOYMENT, + get_target_client(), + TARGET_DEPLOYMENT, messages, max_completion_tokens, retries, @@ -845,14 +851,14 @@ def 
reset_token_tracker() -> None: tracker.reset() -def set_student_deployment(deployment: str) -> None: - """Change student deployment at runtime.""" - global _student_client, STUDENT_DEPLOYMENT - STUDENT_DEPLOYMENT = deployment - os.environ["STUDENT_DEPLOYMENT"] = deployment +def set_target_deployment(deployment: str) -> None: + """Change target deployment at runtime.""" + global _target_client, TARGET_DEPLOYMENT + TARGET_DEPLOYMENT = deployment + os.environ["TARGET_DEPLOYMENT"] = deployment os.environ["AZURE_OPENAI_DEPLOYMENT"] = deployment - with _student_lock: - _student_client = None + with _target_lock: + _target_client = None try: import llm_client as _legacy _legacy.DEPLOYMENT = deployment @@ -872,10 +878,10 @@ def get_reasoning_effort() -> str | None: return REASONING_EFFORT -def set_teacher_deployment(deployment: str) -> None: - """Change teacher deployment at runtime.""" - global _teacher_client, TEACHER_DEPLOYMENT - TEACHER_DEPLOYMENT = deployment - os.environ["TEACHER_DEPLOYMENT"] = deployment - with _teacher_lock: - _teacher_client = None +def set_optimizer_deployment(deployment: str) -> None: + """Change optimizer deployment at runtime.""" + global _optimizer_client, OPTIMIZER_DEPLOYMENT + OPTIMIZER_DEPLOYMENT = deployment + os.environ["OPTIMIZER_DEPLOYMENT"] = deployment + with _optimizer_lock: + _optimizer_client = None diff --git a/skillopt/model/backend_config.py b/skillopt/model/backend_config.py index 545b736..cf36b7e 100644 --- a/skillopt/model/backend_config.py +++ b/skillopt/model/backend_config.py @@ -1,4 +1,4 @@ -"""Runtime backend configuration for teacher/student model calls.""" +"""Runtime backend configuration for optimizer/target model calls.""" from __future__ import annotations import os @@ -12,8 +12,8 @@ def _parse_bool(value: str | None, default: bool) -> bool: return str(value).strip().lower() in {"1", "true", "yes", "on"} -TEACHER_BACKEND = normalize_backend_name(os.environ.get("TEACHER_BACKEND", "openai_chat")) -STUDENT_BACKEND = 
normalize_backend_name(os.environ.get("STUDENT_BACKEND", "openai_chat")) +OPTIMIZER_BACKEND = normalize_backend_name(os.environ.get("OPTIMIZER_BACKEND", "openai_chat")) +TARGET_BACKEND = normalize_backend_name(os.environ.get("TARGET_BACKEND", "openai_chat")) CODEX_EXEC_PATH = os.environ.get("CODEX_EXEC_PATH", "codex") CODEX_EXEC_SANDBOX = os.environ.get("CODEX_EXEC_SANDBOX", "workspace-write") @@ -46,46 +46,46 @@ CLAUDE_CODE_EXEC_MAX_THINKING_TOKENS = max( ) -def set_teacher_backend(backend: str) -> None: - global TEACHER_BACKEND - TEACHER_BACKEND = normalize_backend_name(backend or "openai_chat") - if TEACHER_BACKEND not in {"openai_chat", "claude_chat"}: +def set_optimizer_backend(backend: str) -> None: + global OPTIMIZER_BACKEND + OPTIMIZER_BACKEND = normalize_backend_name(backend or "openai_chat") + if OPTIMIZER_BACKEND not in {"openai_chat", "claude_chat"}: raise ValueError( - f"Unsupported teacher backend: {TEACHER_BACKEND!r}. " + f"Unsupported optimizer backend: {OPTIMIZER_BACKEND!r}. " "Supported values are 'openai_chat' and 'claude_chat'." ) - os.environ["TEACHER_BACKEND"] = TEACHER_BACKEND + os.environ["OPTIMIZER_BACKEND"] = OPTIMIZER_BACKEND -def get_teacher_backend() -> str: - return TEACHER_BACKEND +def get_optimizer_backend() -> str: + return OPTIMIZER_BACKEND -def set_student_backend(backend: str) -> None: - global STUDENT_BACKEND - STUDENT_BACKEND = normalize_backend_name(backend or "openai_chat") - if STUDENT_BACKEND not in {"openai_chat", "claude_chat", "qwen_chat", "codex_exec", "claude_code_exec"}: +def set_target_backend(backend: str) -> None: + global TARGET_BACKEND + TARGET_BACKEND = normalize_backend_name(backend or "openai_chat") + if TARGET_BACKEND not in {"openai_chat", "claude_chat", "qwen_chat", "codex_exec", "claude_code_exec"}: raise ValueError( - f"Unsupported student backend: {STUDENT_BACKEND!r}. " + f"Unsupported target backend: {TARGET_BACKEND!r}. 
" "Supported values are 'openai_chat', 'claude_chat', 'qwen_chat', 'codex_exec', and 'claude_code_exec'." ) - os.environ["STUDENT_BACKEND"] = STUDENT_BACKEND + os.environ["TARGET_BACKEND"] = TARGET_BACKEND -def get_student_backend() -> str: - return STUDENT_BACKEND +def get_target_backend() -> str: + return TARGET_BACKEND -def is_student_exec_backend() -> bool: - return STUDENT_BACKEND in {"codex_exec", "claude_code_exec"} +def is_target_exec_backend() -> bool: + return TARGET_BACKEND in {"codex_exec", "claude_code_exec"} -def is_teacher_chat_backend() -> bool: - return TEACHER_BACKEND in {"openai_chat", "claude_chat"} +def is_optimizer_chat_backend() -> bool: + return OPTIMIZER_BACKEND in {"openai_chat", "claude_chat"} -def is_student_chat_backend() -> bool: - return STUDENT_BACKEND in {"openai_chat", "claude_chat", "qwen_chat"} +def is_target_chat_backend() -> bool: + return TARGET_BACKEND in {"openai_chat", "claude_chat", "qwen_chat"} def configure_codex_exec( diff --git a/skillopt/model/claude_backend.py b/skillopt/model/claude_backend.py index a4ca3a1..22031b7 100644 --- a/skillopt/model/claude_backend.py +++ b/skillopt/model/claude_backend.py @@ -19,8 +19,8 @@ CLAUDE_PERMISSION_MODE = os.environ.get("CLAUDE_PERMISSION_MODE", "dontAsk") CLAUDE_SETTING_SOURCES = os.environ.get("CLAUDE_SETTING_SOURCES", "user,project") CLAUDE_ALLOW_ATTACHMENT_READ = os.environ.get("CLAUDE_ALLOW_ATTACHMENT_READ", "1").strip().lower() not in {"0", "false", "no"} -TEACHER_DEPLOYMENT = os.environ.get("TEACHER_DEPLOYMENT", "claude-sonnet-4-6") -STUDENT_DEPLOYMENT = os.environ.get("STUDENT_DEPLOYMENT", "claude-sonnet-4-6") +OPTIMIZER_DEPLOYMENT = os.environ.get("OPTIMIZER_DEPLOYMENT", "claude-sonnet-4-6") +TARGET_DEPLOYMENT = os.environ.get("TARGET_DEPLOYMENT", "claude-sonnet-4-6") REASONING_EFFORT: str | None = None _VALID_EFFORTS = {"low", "medium", "high", "xhigh", "max"} @@ -292,7 +292,7 @@ def _compat_message_from_payload(payload: Any) -> CompatAssistantMessage: def 
_call_messages(messages: list[dict[str, Any]], max_completion_tokens: int, retries: int, stage: str, *, tools: list[dict[str, Any]] | None = None, tool_choice: str | dict[str, Any] | None = None, return_message: bool = False, deployment: str | None = None, timeout: int | None = None) -> tuple[Any, dict[str, int]]: del max_completion_tokens system, prompt, attachments = _build_prompt_from_messages(messages, tools=tools, tool_choice=tool_choice, structured_output=return_message) - model = deployment or STUDENT_DEPLOYMENT + model = deployment or TARGET_DEPLOYMENT last_err = None for attempt in range(retries): try: @@ -307,14 +307,14 @@ def _call_messages(messages: list[dict[str, Any]], max_completion_tokens: int, r raise RuntimeError(f"Claude backend failed after {retries} retries: {last_err}") -def chat_teacher(system: str, user: str, max_completion_tokens: int = 16384, retries: int = 5, stage: str = "teacher", timeout: int | None = None) -> tuple[str, dict[str, int]]: +def chat_optimizer(system: str, user: str, max_completion_tokens: int = 16384, retries: int = 5, stage: str = "optimizer", timeout: int | None = None) -> tuple[str, dict[str, int]]: messages = [{"role": "system", "content": system}, {"role": "user", "content": user}] - return _call_messages(messages, max_completion_tokens, retries, stage, deployment=TEACHER_DEPLOYMENT, timeout=timeout) + return _call_messages(messages, max_completion_tokens, retries, stage, deployment=OPTIMIZER_DEPLOYMENT, timeout=timeout) -def chat_student(system: str, user: str, max_completion_tokens: int = 16384, retries: int = 5, stage: str = "student", timeout: int | None = None) -> tuple[str, dict[str, int]]: +def chat_target(system: str, user: str, max_completion_tokens: int = 16384, retries: int = 5, stage: str = "target", timeout: int | None = None) -> tuple[str, dict[str, int]]: messages = [{"role": "system", "content": system}, {"role": "user", "content": user}] - return _call_messages(messages, max_completion_tokens, 
retries, stage, deployment=STUDENT_DEPLOYMENT, timeout=timeout) + return _call_messages(messages, max_completion_tokens, retries, stage, deployment=TARGET_DEPLOYMENT, timeout=timeout) def chat_with_deployment(deployment: str, system: str, user: str, max_completion_tokens: int = 16384, retries: int = 5, stage: str = "custom", timeout: int | None = None) -> tuple[str, dict[str, int]]: @@ -322,12 +322,12 @@ def chat_with_deployment(deployment: str, system: str, user: str, max_completion return _call_messages(messages, max_completion_tokens, retries, stage, deployment=deployment, timeout=timeout) -def chat_teacher_messages(messages: list[dict[str, Any]], max_completion_tokens: int = 16384, retries: int = 5, stage: str = "teacher", *, tools: list[dict[str, Any]] | None = None, tool_choice: str | dict[str, Any] | None = None, return_message: bool = False, timeout: int | None = None) -> tuple[Any, dict[str, int]]: - return _call_messages(messages, max_completion_tokens, retries, stage, tools=tools, tool_choice=tool_choice, return_message=return_message, deployment=TEACHER_DEPLOYMENT, timeout=timeout) +def chat_optimizer_messages(messages: list[dict[str, Any]], max_completion_tokens: int = 16384, retries: int = 5, stage: str = "optimizer", *, tools: list[dict[str, Any]] | None = None, tool_choice: str | dict[str, Any] | None = None, return_message: bool = False, timeout: int | None = None) -> tuple[Any, dict[str, int]]: + return _call_messages(messages, max_completion_tokens, retries, stage, tools=tools, tool_choice=tool_choice, return_message=return_message, deployment=OPTIMIZER_DEPLOYMENT, timeout=timeout) -def chat_student_messages(messages: list[dict[str, Any]], max_completion_tokens: int = 16384, retries: int = 5, stage: str = "student", *, tools: list[dict[str, Any]] | None = None, tool_choice: str | dict[str, Any] | None = None, return_message: bool = False, timeout: int | None = None) -> tuple[Any, dict[str, int]]: - return _call_messages(messages, 
max_completion_tokens, retries, stage, tools=tools, tool_choice=tool_choice, return_message=return_message, deployment=STUDENT_DEPLOYMENT, timeout=timeout) +def chat_target_messages(messages: list[dict[str, Any]], max_completion_tokens: int = 16384, retries: int = 5, stage: str = "target", *, tools: list[dict[str, Any]] | None = None, tool_choice: str | dict[str, Any] | None = None, return_message: bool = False, timeout: int | None = None) -> tuple[Any, dict[str, int]]: + return _call_messages(messages, max_completion_tokens, retries, stage, tools=tools, tool_choice=tool_choice, return_message=return_message, deployment=TARGET_DEPLOYMENT, timeout=timeout) def chat_messages_with_deployment(deployment: str, messages: list[dict[str, Any]], max_completion_tokens: int = 16384, retries: int = 5, stage: str = "custom", *, tools: list[dict[str, Any]] | None = None, tool_choice: str | dict[str, Any] | None = None, return_message: bool = False, timeout: int | None = None) -> tuple[Any, dict[str, int]]: @@ -347,13 +347,13 @@ def set_reasoning_effort(effort: str | None) -> None: REASONING_EFFORT = effort if effort else None -def set_student_deployment(deployment: str) -> None: - global STUDENT_DEPLOYMENT - STUDENT_DEPLOYMENT = deployment or default_model_for_backend("claude") - os.environ["STUDENT_DEPLOYMENT"] = STUDENT_DEPLOYMENT +def set_target_deployment(deployment: str) -> None: + global TARGET_DEPLOYMENT + TARGET_DEPLOYMENT = deployment or default_model_for_backend("claude") + os.environ["TARGET_DEPLOYMENT"] = TARGET_DEPLOYMENT -def set_teacher_deployment(deployment: str) -> None: - global TEACHER_DEPLOYMENT - TEACHER_DEPLOYMENT = deployment or default_model_for_backend("claude") - os.environ["TEACHER_DEPLOYMENT"] = TEACHER_DEPLOYMENT +def set_optimizer_deployment(deployment: str) -> None: + global OPTIMIZER_DEPLOYMENT + OPTIMIZER_DEPLOYMENT = deployment or default_model_for_backend("claude") + os.environ["OPTIMIZER_DEPLOYMENT"] = OPTIMIZER_DEPLOYMENT diff --git 
a/skillopt/model/codex_backend.py b/skillopt/model/codex_backend.py index c69599f..d9ab615 100644 --- a/skillopt/model/codex_backend.py +++ b/skillopt/model/codex_backend.py @@ -24,8 +24,8 @@ CODEX_BIN = os.environ.get("CODEX_CLI_BIN", "codex") CODEX_PROFILE = os.environ.get("CODEX_PROFILE", "review") CODEX_SANDBOX_MODE = os.environ.get("CODEX_SANDBOX_MODE", "read-only") -TEACHER_DEPLOYMENT = os.environ.get("TEACHER_DEPLOYMENT", "gpt-5.5") -STUDENT_DEPLOYMENT = os.environ.get("STUDENT_DEPLOYMENT", "gpt-5.5") +OPTIMIZER_DEPLOYMENT = os.environ.get("OPTIMIZER_DEPLOYMENT", "gpt-4o") +TARGET_DEPLOYMENT = os.environ.get("TARGET_DEPLOYMENT", "gpt-4o") REASONING_EFFORT: str | None = None @@ -508,16 +508,16 @@ def chat_messages_with_model( ) -def chat_teacher( +def chat_optimizer( system: str, user: str, max_completion_tokens: int = 16384, retries: int = 5, - stage: str = "teacher", + stage: str = "optimizer", timeout: int | None = None, ) -> tuple[str, dict[str, int]]: return chat_with_model( - model=TEACHER_DEPLOYMENT, + model=OPTIMIZER_DEPLOYMENT, system=system, user=user, max_completion_tokens=max_completion_tokens, @@ -547,16 +547,16 @@ def chat_with_deployment( ) -def chat_student( +def chat_target( system: str, user: str, max_completion_tokens: int = 16384, retries: int = 5, - stage: str = "student", + stage: str = "target", timeout: int | None = None, ) -> tuple[str, dict[str, int]]: return chat_with_model( - model=STUDENT_DEPLOYMENT, + model=TARGET_DEPLOYMENT, system=system, user=user, max_completion_tokens=max_completion_tokens, @@ -566,11 +566,11 @@ def chat_student( ) -def chat_teacher_messages( +def chat_optimizer_messages( messages: list[dict[str, Any]], max_completion_tokens: int = 16384, retries: int = 5, - stage: str = "teacher", + stage: str = "optimizer", *, tools: list[dict[str, Any]] | None = None, tool_choice: str | dict[str, Any] | None = None, @@ -578,7 +578,7 @@ def chat_teacher_messages( timeout: int | None = None, ) -> tuple[Any, dict[str, int]]: 
return _chat_messages_impl( - TEACHER_DEPLOYMENT, + OPTIMIZER_DEPLOYMENT, messages, max_completion_tokens, retries, @@ -615,11 +615,11 @@ def chat_messages_with_deployment( ) -def chat_student_messages( +def chat_target_messages( messages: list[dict[str, Any]], max_completion_tokens: int = 16384, retries: int = 5, - stage: str = "student", + stage: str = "target", *, tools: list[dict[str, Any]] | None = None, tool_choice: str | dict[str, Any] | None = None, @@ -627,7 +627,7 @@ def chat_student_messages( timeout: int | None = None, ) -> tuple[Any, dict[str, int]]: return _chat_messages_impl( - STUDENT_DEPLOYMENT, + TARGET_DEPLOYMENT, messages, max_completion_tokens, retries, @@ -647,10 +647,10 @@ def reset_token_tracker() -> None: tracker.reset() -def set_student_deployment(deployment: str) -> None: - global STUDENT_DEPLOYMENT - STUDENT_DEPLOYMENT = deployment - os.environ["STUDENT_DEPLOYMENT"] = deployment +def set_target_deployment(deployment: str) -> None: + global TARGET_DEPLOYMENT + TARGET_DEPLOYMENT = deployment + os.environ["TARGET_DEPLOYMENT"] = deployment def set_reasoning_effort(effort: str | None) -> None: @@ -658,7 +658,7 @@ def set_reasoning_effort(effort: str | None) -> None: REASONING_EFFORT = effort if effort else None -def set_teacher_deployment(deployment: str) -> None: - global TEACHER_DEPLOYMENT - TEACHER_DEPLOYMENT = deployment - os.environ["TEACHER_DEPLOYMENT"] = deployment +def set_optimizer_deployment(deployment: str) -> None: + global OPTIMIZER_DEPLOYMENT + OPTIMIZER_DEPLOYMENT = deployment + os.environ["OPTIMIZER_DEPLOYMENT"] = deployment diff --git a/skillopt/model/codex_harness.py b/skillopt/model/codex_harness.py index c5486f2..a8513ea 100644 --- a/skillopt/model/codex_harness.py +++ b/skillopt/model/codex_harness.py @@ -1,4 +1,4 @@ -"""Helpers for running exec backends as the student harness.""" +"""Helpers for running exec backends as the target harness.""" from __future__ import annotations import asyncio @@ -14,7 +14,7 @@ from typing 
import Any from skillopt.model.backend_config import ( get_claude_code_exec_config, get_codex_exec_config, - get_student_backend, + get_target_backend, ) @@ -38,7 +38,7 @@ ANSWER_SCHEMA: dict[str, Any] = { def render_skill_md( skill_content: str, *, - name: str = "skillopt-student", + name: str = "skillopt-target", description: str = "Dynamic ReflACT skill for the current benchmark task.", preamble: str = "", ) -> str: @@ -49,7 +49,7 @@ def render_skill_md( f'description: "{description}"', "---", "", - "# ReflACT Student Skill", + "# ReflACT Target Skill", "", ] if preamble.strip(): @@ -77,9 +77,9 @@ def prepare_workspace( ) -> tuple[str, str]: if os.path.exists(work_dir): shutil.rmtree(work_dir) - os.makedirs(os.path.join(work_dir, ".agents", "skills", "skillopt-student"), exist_ok=True) + os.makedirs(os.path.join(work_dir, ".agents", "skills", "skillopt-target"), exist_ok=True) - skill_path = os.path.join(work_dir, ".agents", "skills", "skillopt-student", "SKILL.md") + skill_path = os.path.join(work_dir, ".agents", "skills", "skillopt-target", "SKILL.md") with open(skill_path, "w", encoding="utf-8") as f: f.write(skill_md) @@ -318,7 +318,7 @@ def parse_codex_raw(raw: str) -> dict: def format_codex_trace_steps(raw: str, *, max_chars: int = 4000) -> str: - """Render parsed Codex trace into numbered compact steps for teacher prompts.""" + """Render parsed Codex trace into numbered compact steps for optimizer prompts.""" parsed = parse_codex_raw(raw) steps = parsed["steps"] if not steps: @@ -474,12 +474,12 @@ def _exec_prompt(prompt: str, *, allow_file_edits: bool = False) -> str: ) return ( "Use the workspace files to solve the task. Read task.md and the skill at " - ".agents/skills/skillopt-student/SKILL.md before answering. " + ".agents/skills/skillopt-target/SKILL.md before answering. " "If ATTACHMENTS.md exists, read it and inspect the listed local files. " "Do not call a Skill tool; the ReflACT guidance is a local markdown file. " f"Do not ask for permission. 
{edit_instruction}" "Return only the final answer text, keeping any required ... tags exactly.\n\n" - f"{_normalize_student_exec_prompt(prompt)}" + f"{_normalize_target_exec_prompt(prompt)}" ) @@ -489,20 +489,20 @@ def _retry_prompt(prompt: str, attempt: int) -> str: return ( f"{prompt}\n\n" "Previous execution returned an empty final response. Re-read task.md and " - ".agents/skills/skillopt-student/SKILL.md. If ATTACHMENTS.md exists, use the listed files. " + ".agents/skills/skillopt-target/SKILL.md. If ATTACHMENTS.md exists, use the listed files. " "Then produce the final answer inside ...." ) -def _normalize_student_exec_prompt(prompt: str) -> str: +def _normalize_target_exec_prompt(prompt: str) -> str: """Avoid wording that makes Claude Code call an unregistered Skill tool.""" text = prompt or "" replacements = { - "Use the `skillopt-student` skill available in this workspace.": ( - "Read `.agents/skills/skillopt-student/SKILL.md` directly; do not call a Skill tool." + "Use the `skillopt-target` skill available in this workspace.": ( + "Read `.agents/skills/skillopt-target/SKILL.md` directly; do not call a Skill tool." ), - "- Use the local `skillopt-student` skill before writing code.": ( - "- Read `.agents/skills/skillopt-student/SKILL.md` before writing code; do not call a Skill tool." + "- Use the local `skillopt-target` skill before writing code.": ( + "- Read `.agents/skills/skillopt-target/SKILL.md` before writing code; do not call a Skill tool." ), } for old, new in replacements.items(): @@ -586,7 +586,7 @@ def _run_claude_code_sdk_exec( "preset": "claude_code", "append": ( "Use the workspace files to solve the task. Read task.md and the skill at " - ".agents/skills/skillopt-student/SKILL.md before answering. " + ".agents/skills/skillopt-target/SKILL.md before answering. " "If ATTACHMENTS.md exists, read it and inspect the listed local files. " "Do not call a Skill tool; the ReflACT guidance is a local markdown file. 
" + ( @@ -619,7 +619,7 @@ def _run_claude_code_sdk_exec( messages = [] async with ClaudeSDKClient(options) as client: - await client.query(_normalize_student_exec_prompt(prompt)) + await client.query(_normalize_target_exec_prompt(prompt)) messages = [msg async for msg in client.receive_response()] last = messages[-1] if messages else None raw_structured_output = _extract_claude_structured_output(messages) @@ -1016,7 +1016,7 @@ def run_codex_exec( return last_response, combined -def run_student_exec( +def run_target_exec( *, work_dir: str, prompt: str, @@ -1030,7 +1030,7 @@ def run_student_exec( full_auto: bool | None = None, allow_file_edits: bool = False, ) -> tuple[str, str]: - backend = get_student_backend() + backend = get_target_backend() if backend == "codex_exec": return run_codex_exec( work_dir=work_dir, diff --git a/skillopt/model/common.py b/skillopt/model/common.py index d64faec..ee90e38 100644 --- a/skillopt/model/common.py +++ b/skillopt/model/common.py @@ -17,10 +17,10 @@ _RESPONSES_API_MODELS = { } _BACKEND_DEFAULT_MODELS = { - "azure_openai": "gpt-5.5", - "openai_chat": "gpt-5.5", - "codex": "gpt-5.5", - "codex_exec": "gpt-5.5", + "azure_openai": "gpt-4o", + "openai_chat": "gpt-4o", + "codex": "gpt-4o", + "codex_exec": "gpt-4o", "claude": "claude-sonnet-4-6", "claude_chat": "claude-sonnet-4-6", "claude_code_exec": "claude-sonnet-4-6", diff --git a/skillopt/model/qwen_backend.py b/skillopt/model/qwen_backend.py index b4a676a..6184196 100644 --- a/skillopt/model/qwen_backend.py +++ b/skillopt/model/qwen_backend.py @@ -1,4 +1,4 @@ -"""OpenAI-compatible Qwen chat backend for the student path.""" +"""OpenAI-compatible Qwen chat backend for the target path.""" from __future__ import annotations import json @@ -32,8 +32,8 @@ ENABLE_THINKING = os.environ.get("QWEN_CHAT_ENABLE_THINKING", "false").strip().l "on", } -STUDENT_DEPLOYMENT = os.environ.get( - "STUDENT_DEPLOYMENT", +TARGET_DEPLOYMENT = os.environ.get( + "TARGET_DEPLOYMENT", 
default_model_for_backend("qwen_chat"), ) @@ -140,7 +140,7 @@ def _chat_messages_impl( timeout: float | None = None, ) -> tuple[Any, dict[str, int]]: payload: dict[str, Any] = { - "model": deployment or STUDENT_DEPLOYMENT, + "model": deployment or TARGET_DEPLOYMENT, "messages": _json_safe(messages), "max_tokens": min(max_completion_tokens, MAX_TOKENS), } @@ -214,12 +214,12 @@ def get_max_tokens() -> int: return MAX_TOKENS -def chat_student( +def chat_target( system: str, user: str, max_completion_tokens: int = 16384, retries: int = 5, - stage: str = "student", + stage: str = "target", reasoning_effort: str | None = None, timeout: float | None = None, ) -> tuple[str, dict[str, int]]: @@ -234,11 +234,11 @@ def chat_student( ) -def chat_student_messages( +def chat_target_messages( messages: list[dict[str, Any]], max_completion_tokens: int = 16384, retries: int = 5, - stage: str = "student", + stage: str = "target", reasoning_effort: str | None = None, *, tools: list[dict[str, Any]] | None = None, @@ -271,7 +271,7 @@ def set_reasoning_effort(effort: str | None) -> None: del effort -def set_student_deployment(deployment: str) -> None: - global STUDENT_DEPLOYMENT - STUDENT_DEPLOYMENT = deployment or default_model_for_backend("qwen_chat") - os.environ["STUDENT_DEPLOYMENT"] = STUDENT_DEPLOYMENT +def set_target_deployment(deployment: str) -> None: + global TARGET_DEPLOYMENT + TARGET_DEPLOYMENT = deployment or default_model_for_backend("qwen_chat") + os.environ["TARGET_DEPLOYMENT"] = TARGET_DEPLOYMENT diff --git a/skillopt/model/router.py b/skillopt/model/router.py index cf8179c..0863761 100644 --- a/skillopt/model/router.py +++ b/skillopt/model/router.py @@ -43,15 +43,15 @@ def get_backend_name() -> str: return _ACTIVE_BACKEND -def chat_teacher( +def chat_optimizer( system: str, user: str, max_completion_tokens: int = 16384, retries: int = 5, - stage: str = "teacher", + stage: str = "optimizer", timeout: int | None = None, ) -> tuple[str, dict[str, int]]: - return 
_backend_module(_ACTIVE_BACKEND).chat_teacher( + return _backend_module(_ACTIVE_BACKEND).chat_optimizer( system=system, user=user, max_completion_tokens=max_completion_tokens, @@ -61,15 +61,15 @@ def chat_teacher( ) -def chat_student( +def chat_target( system: str, user: str, max_completion_tokens: int = 16384, retries: int = 5, - stage: str = "student", + stage: str = "target", timeout: int | None = None, ) -> tuple[str, dict[str, int]]: - return _backend_module(_ACTIVE_BACKEND).chat_student( + return _backend_module(_ACTIVE_BACKEND).chat_target( system=system, user=user, max_completion_tokens=max_completion_tokens, @@ -99,18 +99,18 @@ def chat_with_deployment( ) -def chat_teacher_messages( +def chat_optimizer_messages( messages: list[dict[str, Any]], max_completion_tokens: int = 16384, retries: int = 5, - stage: str = "teacher", + stage: str = "optimizer", *, tools: list[dict[str, Any]] | None = None, tool_choice: str | dict[str, Any] | None = None, return_message: bool = False, timeout: int | None = None, ) -> tuple[Any, dict[str, int]]: - return _backend_module(_ACTIVE_BACKEND).chat_teacher_messages( + return _backend_module(_ACTIVE_BACKEND).chat_optimizer_messages( messages=messages, max_completion_tokens=max_completion_tokens, retries=retries, @@ -122,18 +122,18 @@ def chat_teacher_messages( ) -def chat_student_messages( +def chat_target_messages( messages: list[dict[str, Any]], max_completion_tokens: int = 16384, retries: int = 5, - stage: str = "student", + stage: str = "target", *, tools: list[dict[str, Any]] | None = None, tool_choice: str | dict[str, Any] | None = None, return_message: bool = False, timeout: int | None = None, ) -> tuple[Any, dict[str, int]]: - return _backend_module(_ACTIVE_BACKEND).chat_student_messages( + return _backend_module(_ACTIVE_BACKEND).chat_target_messages( messages=messages, max_completion_tokens=max_completion_tokens, retries=retries, @@ -183,14 +183,14 @@ def set_reasoning_effort(effort: str | None) -> None: 
module.set_reasoning_effort(effort) -def set_student_deployment(deployment: str) -> None: +def set_target_deployment(deployment: str) -> None: for module in _all_backend_modules(): - module.set_student_deployment(deployment) + module.set_target_deployment(deployment) -def set_teacher_deployment(deployment: str) -> None: +def set_optimizer_deployment(deployment: str) -> None: for module in _all_backend_modules(): - module.set_teacher_deployment(deployment) + module.set_optimizer_deployment(deployment) def configure_azure_openai( @@ -201,18 +201,18 @@ def configure_azure_openai( auth_mode: str | None = None, ad_scope: str | None = None, managed_identity_client_id: str | None = None, - teacher_endpoint: str | None = None, - teacher_api_version: str | None = None, - teacher_api_key: str | None = None, - teacher_auth_mode: str | None = None, - teacher_ad_scope: str | None = None, - teacher_managed_identity_client_id: str | None = None, - student_endpoint: str | None = None, - student_api_version: str | None = None, - student_api_key: str | None = None, - student_auth_mode: str | None = None, - student_ad_scope: str | None = None, - student_managed_identity_client_id: str | None = None, + optimizer_endpoint: str | None = None, + optimizer_api_version: str | None = None, + optimizer_api_key: str | None = None, + optimizer_auth_mode: str | None = None, + optimizer_ad_scope: str | None = None, + optimizer_managed_identity_client_id: str | None = None, + target_endpoint: str | None = None, + target_api_version: str | None = None, + target_api_key: str | None = None, + target_auth_mode: str | None = None, + target_ad_scope: str | None = None, + target_managed_identity_client_id: str | None = None, ) -> None: azure_openai.configure_azure_openai( endpoint=endpoint, @@ -221,16 +221,16 @@ def configure_azure_openai( auth_mode=auth_mode, ad_scope=ad_scope, managed_identity_client_id=managed_identity_client_id, - teacher_endpoint=teacher_endpoint, - 
teacher_api_version=teacher_api_version, - teacher_api_key=teacher_api_key, - teacher_auth_mode=teacher_auth_mode, - teacher_ad_scope=teacher_ad_scope, - teacher_managed_identity_client_id=teacher_managed_identity_client_id, - student_endpoint=student_endpoint, - student_api_version=student_api_version, - student_api_key=student_api_key, - student_auth_mode=student_auth_mode, - student_ad_scope=student_ad_scope, - student_managed_identity_client_id=student_managed_identity_client_id, + optimizer_endpoint=optimizer_endpoint, + optimizer_api_version=optimizer_api_version, + optimizer_api_key=optimizer_api_key, + optimizer_auth_mode=optimizer_auth_mode, + optimizer_ad_scope=optimizer_ad_scope, + optimizer_managed_identity_client_id=optimizer_managed_identity_client_id, + target_endpoint=target_endpoint, + target_api_version=target_api_version, + target_api_key=target_api_key, + target_auth_mode=target_auth_mode, + target_ad_scope=target_ad_scope, + target_managed_identity_client_id=target_managed_identity_client_id, ) diff --git a/skillopt/optimizer/__init__.py b/skillopt/optimizer/__init__.py index aadc376..c9e690b 100644 --- a/skillopt/optimizer/__init__.py +++ b/skillopt/optimizer/__init__.py @@ -1,4 +1,4 @@ -"""ReflACT Optimizer -- skill update operations. +"""SkillOpt Optimizer -- skill update operations. 
Analogous to the optimizer in neural network training: applies the computed "gradient" (patches) to the current skill document to produce an updated @@ -8,8 +8,8 @@ Modules ------- - skill: edit application (optimizer.step() / parameter update) - clip: edit ranking and selection (gradient clipping) -- meta_reflect: epoch-level macro refinement (momentum) - slow_update: longitudinal comparison and guidance (EMA / regularization) +- meta_skill: cross-epoch memory for optimizer context """ from skillopt.optimizer.skill import apply_edit, apply_patch # noqa: F401 from skillopt.optimizer.clip import rank_and_select # noqa: F401 diff --git a/skillopt/optimizer/clip.py b/skillopt/optimizer/clip.py index 6ecab30..7add26d 100644 --- a/skillopt/optimizer/clip.py +++ b/skillopt/optimizer/clip.py @@ -6,7 +6,7 @@ effective step size. Previously core/select.py. """ from __future__ import annotations -from skillopt.model import chat_teacher +from skillopt.model import chat_optimizer from skillopt.optimizer.meta_skill import format_meta_skill_context from skillopt.optimizer.update_modes import ( describe_item, @@ -29,10 +29,10 @@ def rank_and_select( meta_skill_context: str = "", update_mode: str = "patch", ) -> dict: - """Use a teacher LLM to rank edits by importance, then keep top-L. + """Use a optimizer LLM to rank edits by importance, then keep top-L. If the edit pool is within budget, returns the patch unchanged. - Otherwise, calls the teacher to rank and select the most impactful edits. + Otherwise, calls the optimizer to rank and select the most impactful edits. 
Parameters ---------- @@ -54,7 +54,7 @@ def rank_and_select( if len(edits) <= max_edits: return patch - # Build the edit pool description for the teacher + # Build the edit pool description for the optimizer edits_desc = [] for i, edit in enumerate(edits): edits_desc.append(f"[{i}] {describe_item(edit, update_mode, max_chars=500)}") @@ -66,13 +66,13 @@ def rank_and_select( + f"\n\nSelect the {max_edits} most important {payload_label(update_mode)}. " f"Return their 0-based indices in priority order." ) - teacher_ctx = format_meta_skill_context(meta_skill_context) - if teacher_ctx: - user = f"{teacher_ctx}\n\n{user}" + optimizer_ctx = format_meta_skill_context(meta_skill_context) + if optimizer_ctx: + user = f"{optimizer_ctx}\n\n{user}" prompt_name = "ranking_rewrite" if is_rewrite_mode(update_mode) else "ranking" try: - response, _ = chat_teacher( + response, _ = chat_optimizer( system=load_prompt(prompt_name), user=user, max_completion_tokens=2048, retries=3, stage="ranking", ) @@ -94,7 +94,7 @@ def rank_and_select( if selected: return { "reasoning": patch.get("reasoning", "") - + f" [teacher-ranked: selected {len(selected)}/{len(edits)} {payload_label(update_mode)}]", + + f" [optimizer-ranked: selected {len(selected)}/{len(edits)} {payload_label(update_mode)}]", payload_key(update_mode): selected, "ranking_details": result, } diff --git a/skillopt/optimizer/lr_autonomous.py b/skillopt/optimizer/lr_autonomous.py index f8045e7..95a4bba 100644 --- a/skillopt/optimizer/lr_autonomous.py +++ b/skillopt/optimizer/lr_autonomous.py @@ -1,11 +1,11 @@ -"""Teacher-driven autonomous update-size decisions.""" +"""Optimizer-driven autonomous update-size decisions.""" from __future__ import annotations import json import re from typing import Any -from skillopt.model import chat_teacher +from skillopt.model import chat_optimizer from skillopt.optimizer.meta_skill import format_meta_skill_context from skillopt.optimizer.update_modes import describe_item, get_payload_items, 
payload_label from skillopt.prompts import load_prompt @@ -39,7 +39,7 @@ def decide_autonomous_learning_rate( step_buffer_context: str = "", meta_skill_context: str = "", ) -> dict: - """Ask the teacher to choose the number of update items for this step. + """Ask the optimizer to choose the number of update items for this step. The prompt intentionally avoids default budgets, candidate budget lists, or scheduler history. The only hard post-processing is validity: the returned @@ -65,15 +65,15 @@ def decide_autonomous_learning_rate( ) if step_buffer_context.strip(): user += f"\n\n## Previous Steps in This Epoch\n{step_buffer_context}" - teacher_ctx = format_meta_skill_context(meta_skill_context) - if teacher_ctx: - user = f"{teacher_ctx}\n\n{user}" + optimizer_ctx = format_meta_skill_context(meta_skill_context) + if optimizer_ctx: + user = f"{optimizer_ctx}\n\n{user}" response = "" parsed: dict | None = None decision: int | None = None try: - response, _ = chat_teacher( + response, _ = chat_optimizer( system=load_prompt("lr_autonomous"), user=user, max_completion_tokens=2048, diff --git a/skillopt/optimizer/meta_skill.py b/skillopt/optimizer/meta_skill.py index 04494c7..3342454 100644 --- a/skillopt/optimizer/meta_skill.py +++ b/skillopt/optimizer/meta_skill.py @@ -1,28 +1,28 @@ -"""Teacher-side meta skill memory for cross-epoch optimization guidance. +"""Optimizer-side meta skill memory for cross-epoch optimization guidance. -This module maintains a compact teacher-facing memory distilled from +This module maintains a compact optimizer-facing memory distilled from adjacent-epoch skill comparisons. Unlike ``slow_update``, it does not -modify the student skill document. Instead, it produces guidance meant to -improve future teacher behavior when proposing, merging, and ranking edits. +modify the target skill document. Instead, it produces guidance meant to +improve future optimizer behavior when proposing, merging, and ranking edits. 
""" from __future__ import annotations import traceback -from skillopt.model import chat_teacher +from skillopt.model import chat_optimizer from skillopt.optimizer.slow_update import format_comparison_text from skillopt.prompts import load_prompt from skillopt.utils import extract_json def format_meta_skill_context(meta_skill_content: str) -> str: - """Render teacher memory into a prompt-ready context block.""" + """Render optimizer memory into a prompt-ready context block.""" content = (meta_skill_content or "").strip() if not content: return "" return ( - "## Teacher Meta Skill\n" - "This is teacher-side memory distilled from prior epoch transitions in " + "## Optimizer Meta Skill\n" + "This is optimizer-side memory distilled from prior epoch transitions in " "this environment. Use it to improve how you propose, merge, and rank " "skill edits. Prefer it when the current evidence is ambiguous, but do " "not force it if the current trajectories clearly contradict it.\n\n" @@ -38,7 +38,7 @@ def run_meta_skill( prev_meta_skill_content: str = "", system_prompt: str | None = None, ) -> dict | None: - """Produce updated teacher-side meta skill from adjacent epochs.""" + """Produce updated optimizer-side meta skill from adjacent epochs.""" actual_system = system_prompt if system_prompt is not None else load_prompt("meta_skill") prev_skill_display = prev_skill @@ -52,15 +52,15 @@ def run_meta_skill( prev_meta_section = ( prev_meta_skill_content.strip() if prev_meta_skill_content and prev_meta_skill_content.strip() - else "(No previous teacher meta skill — this is the first update.)" + else "(No previous optimizer meta skill — this is the first update.)" ) comparison_text = format_comparison_text(comparison_pairs) user = ( f"## Previous Epoch Last-Step Skill\n{prev_skill_display}\n\n" f"## Current Epoch Last-Step Skill\n{curr_skill_display}\n\n" - f"## Previous Teacher Meta Skill\n" - f"The following teacher memory was available during the current epoch. 
" + f"## Previous Optimizer Meta Skill\n" + f"The following optimizer memory was available during the current epoch. " f"Reflect on whether it improved or harmed the quality of edits.\n\n" f"{prev_meta_section}\n\n" f"## Longitudinal Comparison (same tasks, two last-step skills)\n" @@ -68,7 +68,7 @@ def run_meta_skill( ) try: - response, _ = chat_teacher( + response, _ = chat_optimizer( system=actual_system, user=user, max_completion_tokens=3072, diff --git a/skillopt/optimizer/rewrite.py b/skillopt/optimizer/rewrite.py index 23bf075..f8b062b 100644 --- a/skillopt/optimizer/rewrite.py +++ b/skillopt/optimizer/rewrite.py @@ -1,9 +1,9 @@ -"""Teacher-driven full skill rewrite from selected revise_suggestions.""" +"""Optimizer-driven full skill rewrite from selected revise_suggestions.""" from __future__ import annotations import json -from skillopt.model import chat_teacher +from skillopt.model import chat_optimizer from skillopt.prompts import load_prompt from skillopt.optimizer.update_modes import get_payload_items from skillopt.utils import extract_json @@ -40,7 +40,7 @@ def rewrite_skill_from_suggestions( ) try: - response, _ = chat_teacher( + response, _ = chat_optimizer( system=actual_system, user=user, max_completion_tokens=max_completion_tokens, diff --git a/skillopt/optimizer/skill.py b/skillopt/optimizer/skill.py index 5230dda..0a8855f 100644 --- a/skillopt/optimizer/skill.py +++ b/skillopt/optimizer/skill.py @@ -28,9 +28,19 @@ def _is_in_slow_update_region(skill: str, target: str) -> bool: return start_idx <= target_idx < region_end +def _strip_slow_update_markers(text: str) -> str: + """Remove any SLOW_UPDATE markers from edit content to prevent duplication.""" + return ( + text.replace(SLOW_UPDATE_START, "") + .replace(SLOW_UPDATE_END, "") + ) + + def _edit_fields(edit: EditType | dict) -> tuple[str, str, str]: op = edit.op if hasattr(edit, "op") else edit.get("op", "") - content = (edit.content if hasattr(edit, "content") else edit.get("content", 
"")).strip() + content = _strip_slow_update_markers( + (edit.content if hasattr(edit, "content") else edit.get("content", "")).strip() + ) target = edit.target if hasattr(edit, "target") else edit.get("target", "") return op, content, target diff --git a/skillopt/optimizer/slow_update.py b/skillopt/optimizer/slow_update.py index f28140b..16a0f08 100644 --- a/skillopt/optimizer/slow_update.py +++ b/skillopt/optimizer/slow_update.py @@ -2,7 +2,7 @@ At the end of each epoch, the slow update compares rollout performance of the same sample set under the previous epoch's skill vs. the current epoch's skill -(Markov: only adjacent epochs). A teacher analyzes regressions, improvements, +(Markov: only adjacent epochs). A optimizer analyzes regressions, improvements, and persistent failures, then writes a free-form guidance block into a **protected** section of the skill document. This section cannot be modified by step-level analyst edits — only the slow update process overwrites it. @@ -14,7 +14,7 @@ Public API - :func:`replace_slow_update_field` — overwrite content - :func:`has_slow_update_field` — check if markers are present - :func:`build_comparison_text` — format side-by-side rollout results -- :func:`run_slow_update` — teacher call to produce guidance +- :func:`run_slow_update` — optimizer call to produce guidance """ from __future__ import annotations @@ -22,7 +22,7 @@ import json import os import traceback -from skillopt.model import chat_teacher +from skillopt.model import chat_optimizer from skillopt.prompts import load_prompt from skillopt.utils import extract_json @@ -57,16 +57,35 @@ def extract_slow_update_field(skill: str) -> str: return skill[inner_start:end].strip() -def replace_slow_update_field(skill: str, new_content: str) -> str: - start = skill.find(SLOW_UPDATE_START) - end = skill.find(SLOW_UPDATE_END) - if start == -1 or end == -1: - skill = inject_empty_slow_update_field(skill) +def _strip_all_slow_update_fields(skill: str) -> str: + """Remove every 
SLOW_UPDATE_START/END pair (and content between) from *skill*.""" + while True: start = skill.find(SLOW_UPDATE_START) - end = skill.find(SLOW_UPDATE_END) - before = skill[:start + len(SLOW_UPDATE_START)] - after = skill[end:] - return before + "\n" + new_content.strip() + "\n" + after + if start == -1: + break + end = skill.find(SLOW_UPDATE_END, start) + if end == -1: + # Orphan start marker — remove it + skill = skill[:start] + skill[start + len(SLOW_UPDATE_START):] + break + skill = skill[:start] + skill[end + len(SLOW_UPDATE_END):] + # Clean up stray end markers + skill = skill.replace(SLOW_UPDATE_END, "") + # Collapse excess blank lines left behind + while "\n\n\n" in skill: + skill = skill.replace("\n\n\n", "\n\n") + return skill.rstrip() + + +def replace_slow_update_field(skill: str, new_content: str) -> str: + # Remove all existing slow update regions first to guarantee exactly one. + skill = _strip_all_slow_update_fields(skill) + block = ( + f"\n\n{SLOW_UPDATE_START}\n" + f"{new_content.strip()}\n" + f"{SLOW_UPDATE_END}\n" + ) + return skill + block # ── Comparison text builder ───────────────────────────────────────────────── @@ -212,7 +231,7 @@ def save_comparison_pairs(pairs: list[dict], out_path: str) -> None: def format_comparison_text(pairs: list[dict]) -> str: - """Format structured comparison pairs into teacher-readable text.""" + """Format structured comparison pairs into optimizer-readable text.""" by_cat: dict[str, list[dict]] = { "regressed": [], "persistent_fail": [], @@ -277,7 +296,7 @@ def format_comparison_text(pairs: list[dict]) -> str: -# ── Teacher call ──────────────────────────────────────────────────────────── +# ── Optimizer call ──────────────────────────────────────────────────────────── def run_slow_update( @@ -293,7 +312,7 @@ def run_slow_update( comparison_pairs: list[dict] | None = None, system_prompt: str | None = None, ) -> dict | None: - """Run the slow update teacher call for one epoch boundary. 
+ """Run the slow update optimizer call for one epoch boundary. Parameters ---------- @@ -355,7 +374,7 @@ def run_slow_update( ) try: - response, _ = chat_teacher( + response, _ = chat_optimizer( system=actual_system, user=user, max_completion_tokens=4096, diff --git a/skillopt/prompts/analyst_error_rewrite.md b/skillopt/prompts/analyst_error_rewrite.md index d8c9f21..1d34d0e 100644 --- a/skillopt/prompts/analyst_error_rewrite.md +++ b/skillopt/prompts/analyst_error_rewrite.md @@ -10,7 +10,7 @@ the batch and propose a concise set of skill-revision suggestions. 2. Identify the most prevalent, systematic failure patterns across them. 3. For each pattern, classify its failure type. 4. Propose revision suggestions that address the COMMON patterns, not individual edge cases. -5. Suggestions must be generalizable and should help a later teacher rewrite the full skill document. +5. Suggestions must be generalizable and should help a later optimizer rewrite the full skill document. 6. Do not hardcode task-specific values. You will be told the maximum number of suggestions (the budget L). 
Produce AT MOST L suggestions, @@ -29,7 +29,7 @@ Respond ONLY with a valid JSON object (no markdown fences, no extra text): "type": "add_rule|remove_rule|merge_rules|reorganize|compress|clarify", "title": "", "motivation": "", - "instruction": "", + "instruction": "", "priority_hint": "high|medium|low" } ] diff --git a/skillopt/prompts/analyst_success_rewrite.md b/skillopt/prompts/analyst_success_rewrite.md index 2bf7245..1291b6d 100644 --- a/skillopt/prompts/analyst_success_rewrite.md +++ b/skillopt/prompts/analyst_success_rewrite.md @@ -24,7 +24,7 @@ Respond ONLY with a valid JSON object: "type": "add_rule|remove_rule|merge_rules|reorganize|compress|clarify", "title": "", "motivation": "", - "instruction": "", + "instruction": "", "priority_hint": "high|medium|low" } ] diff --git a/skillopt/prompts/merge_failure_rewrite.md b/skillopt/prompts/merge_failure_rewrite.md index 6081b9f..f86c079 100644 --- a/skillopt/prompts/merge_failure_rewrite.md +++ b/skillopt/prompts/merge_failure_rewrite.md @@ -7,7 +7,7 @@ Merge guidelines: 2. Resolve conflicts by keeping the more general, better-justified direction. 3. Preserve unique high-impact corrective insights. 4. Suggestions supported by many source patches should receive higher support_count. -5. The output suggestions should help a later teacher rewrite the full skill. +5. The output suggestions should help a later optimizer rewrite the full skill. 
Respond ONLY with a valid JSON object: { @@ -17,7 +17,7 @@ Respond ONLY with a valid JSON object: "type": "add_rule|remove_rule|merge_rules|reorganize|compress|clarify", "title": "", "motivation": "", - "instruction": "", + "instruction": "", "priority_hint": "high|medium|low", "support_count": , "source_type": "failure" diff --git a/skillopt/prompts/merge_final_rewrite.md b/skillopt/prompts/merge_final_rewrite.md index 88402a8..7fe3e29 100644 --- a/skillopt/prompts/merge_final_rewrite.md +++ b/skillopt/prompts/merge_final_rewrite.md @@ -16,7 +16,7 @@ Respond ONLY with a valid JSON object: "type": "add_rule|remove_rule|merge_rules|reorganize|compress|clarify", "title": "", "motivation": "", - "instruction": "", + "instruction": "", "priority_hint": "high|medium|low", "support_count": , "source_type": "failure|success" diff --git a/skillopt/prompts/merge_success_rewrite.md b/skillopt/prompts/merge_success_rewrite.md index 40e86ac..e823893 100644 --- a/skillopt/prompts/merge_success_rewrite.md +++ b/skillopt/prompts/merge_success_rewrite.md @@ -6,7 +6,7 @@ Merge guidelines: 1. Deduplicate overlapping success patterns. 2. Be conservative: only keep suggestions that reinforce useful behavior not already well-covered. 3. Suggestions supported by many source patches should receive higher support_count. -4. The output suggestions should help a later teacher rewrite the full skill. +4. The output suggestions should help a later optimizer rewrite the full skill. 
Respond ONLY with a valid JSON object: { @@ -16,7 +16,7 @@ Respond ONLY with a valid JSON object: "type": "add_rule|remove_rule|merge_rules|reorganize|compress|clarify", "title": "", "motivation": "", - "instruction": "", + "instruction": "", "priority_hint": "high|medium|low", "support_count": , "source_type": "success" diff --git a/skillopt/prompts/meta_skill.md b/skillopt/prompts/meta_skill.md index a0d0778..f094973 100644 --- a/skillopt/prompts/meta_skill.md +++ b/skillopt/prompts/meta_skill.md @@ -1,19 +1,19 @@ -You are a teacher-coach for an AI agent skill optimization system. +You are an optimizer-coach for an AI agent skill optimization system. -Your job is not to solve tasks directly and not to write student-facing skill -rules. Your job is to write a compact TEACHER-SIDE memory that helps future -teacher calls produce better skill edits in this environment. +Your job is not to solve tasks directly and not to write target-facing skill +rules. Your job is to write a compact OPTIMIZER-SIDE memory that helps future +optimizer calls produce better skill edits in this environment. ## What You Receive 1. The previous epoch's last-step skill. 2. The current epoch's last-step skill. 3. A longitudinal comparison on the SAME sampled tasks under those two skills. -4. The previous teacher meta skill, if one existed. +4. The previous optimizer meta skill, if one existed. ## Your Goal -Write a concise meta skill that improves future teacher behavior in stages such +Write a concise meta skill that improves future optimizer behavior in stages such as failure analysis, success analysis, patch merging, and edit ranking. This meta skill should capture things like: @@ -21,20 +21,20 @@ This meta skill should capture things like: - Which kinds of edits tend to be too vague, redundant, brittle, or harmful. - What level of abstraction works best for rules here. - What failure-repair patterns should be prioritized. -- What regression risks future teacher calls should guard against.
+- What regression risks future optimizer calls should guard against. ## Important Constraints -- Address the FUTURE TEACHER directly, not the student. +- Address the FUTURE OPTIMIZER directly, not the target. - Focus on how to write better edits and organize better skill updates. - Use evidence from the adjacent-epoch comparison, not generic advice. - Keep it compact and high-signal. Prefer a few durable principles. - Revise or remove parts of the previous meta skill if they did not help. -- Do not output student-facing task instructions. +- Do not output target-facing task instructions. - Do not restate the whole skill; summarize editing strategy. Respond ONLY with a valid JSON object: { "reasoning": "", - "meta_skill_content": "" + "meta_skill_content": "" } diff --git a/skillopt/prompts/ranking.md b/skillopt/prompts/ranking.md index 4eb564c..05575a7 100644 --- a/skillopt/prompts/ranking.md +++ b/skillopt/prompts/ranking.md @@ -1,4 +1,4 @@ -You are an expert skill-optimization teacher. You receive a skill document and a pool +You are an expert skill-optimization optimizer. You receive a skill document and a pool of proposed edits. Your job is to RANK the edits by importance and select the top ones. Ranking criteria (in order of priority): diff --git a/skillopt/prompts/ranking_rewrite.md b/skillopt/prompts/ranking_rewrite.md index ccbef5e..065787a 100644 --- a/skillopt/prompts/ranking_rewrite.md +++ b/skillopt/prompts/ranking_rewrite.md @@ -1,11 +1,11 @@ -You are an expert skill-optimization teacher. You receive a skill document and a pool +You are an expert skill-optimization optimizer. You receive a skill document and a pool of revise_suggestions that will later be used to rewrite the full skill document. Rank the suggestions by importance and select the top ones. Ranking criteria: 1. Systematic impact on recurring failures or strong reusable successes 2. Complementarity with the current skill -3. 
Rewrite utility: how much the suggestion helps a later teacher improve structure, clarity, or coverage +3. Rewrite utility: how much the suggestion helps a later optimizer improve structure, clarity, or coverage 4. Generality and actionability Respond ONLY with a valid JSON object: diff --git a/skillopt/prompts/rewrite_skill.md b/skillopt/prompts/rewrite_skill.md index 2bd7203..78f2688 100644 --- a/skillopt/prompts/rewrite_skill.md +++ b/skillopt/prompts/rewrite_skill.md @@ -4,7 +4,7 @@ You will receive: 1. The current skill document 2. A selected set of revise_suggestions distilled from trajectory analysis -Your job is to rewrite the FULL student skill document so it incorporates the +Your job is to rewrite the FULL target skill document so it incorporates the selected suggestions coherently. Hard requirements: @@ -12,7 +12,7 @@ Hard requirements: 2. Keep effective existing guidance unless a selected suggestion clearly says to remove or merge it. 3. Prefer consolidation and clarity over making the document longer. 4. Do not hardcode benchmark-specific answers, entity names, file paths, or gold values. -5. Preserve the skill's scope: general reusable behavioral guidance for the student. +5. Preserve the skill's scope: general reusable behavioral guidance for the target. 6. Do not modify content inside the protected slow-update block between and except to keep it intact. 7. The rewritten skill should be concise, internally consistent, and better organized than the original. diff --git a/skillopt/prompts/slow_update.md b/skillopt/prompts/slow_update.md index d7274ea..38b1c66 100644 --- a/skillopt/prompts/slow_update.md +++ b/skillopt/prompts/slow_update.md @@ -41,16 +41,16 @@ all subsequent step-level optimization — only you can overwrite it at the next epoch boundary. 
Your guidance must: -- Be written as **direct, actionable instructions** to the student model +- Be written as **direct, actionable instructions** to the target model (the AI agent that will read and follow the skill). -- Focus on helping the student get problems RIGHT — not on analysis or +- Focus on helping the target get problems RIGHT — not on analysis or explanation of what went wrong. - Prioritize: (1) preventing regressions, (2) fixing persistent failures, (3) reinforcing successful patterns. - Be concise but comprehensive — you have no length limit, but every sentence should earn its place. - NOT duplicate content already in the main skill body — complement it. -- Address the student directly (e.g., "When you encounter X, always do Y" +- Address the target directly (e.g., "When you encounter X, always do Y" rather than "The agent should..."). Respond ONLY with a valid JSON object (no markdown fences, no extra text): diff --git a/skillopt/types.py b/skillopt/types.py index 868fc0f..9c23edb 100644 --- a/skillopt/types.py +++ b/skillopt/types.py @@ -118,8 +118,8 @@ class RolloutResult: predicted_answer: str = "" question: str = "" reference_text: str = "" - student_system_prompt: str = "" - student_user_prompt: str = "" + target_system_prompt: str = "" + target_user_prompt: str = "" spreadsheet_preview: str = "" extras: dict[str, Any] = field(default_factory=dict) @@ -151,8 +151,8 @@ class RolloutResult: predicted_answer=str(d.get("predicted_answer", "")), question=str(d.get("question", "")), reference_text=str(d.get("reference_text", "")), - student_system_prompt=str(d.get("student_system_prompt", "")), - student_user_prompt=str(d.get("student_user_prompt", "")), + target_system_prompt=str(d.get("target_system_prompt", "")), + target_user_prompt=str(d.get("target_user_prompt", "")), spreadsheet_preview=str(d.get("spreadsheet_preview", "")), extras=extras, ) @@ -166,7 +166,7 @@ class RolloutResult: for attr in ( "n_turns", "fail_reason", "task_type", 
"task_description", "predicted_answer", "question", "reference_text", - "student_system_prompt", "student_user_prompt", + "target_system_prompt", "target_user_prompt", "spreadsheet_preview", ): val = getattr(self, attr) @@ -244,57 +244,6 @@ class RawPatch: return d -# ── Epoch-level: META_REFLECT ──────────────────────────────────────────── - -@dataclass -class MetaReflectResult: - """Output of the epoch-level meta-reflect stage (momentum).""" - - meta_summary: str - patch: Patch - action: str = "" - gate_score: float | None = None - time_s: float | None = None - candidate_hash: str = "" - update_origin: str = "" - update_target: str = "" - - @classmethod - def from_dict(cls, d: dict | None) -> MetaReflectResult | None: - if d is None: - return None - patch_raw = d.get("patch", {}) - return cls( - meta_summary=d.get("meta_summary", ""), - patch=Patch.from_dict(patch_raw) if isinstance(patch_raw, dict) else Patch(), - action=d.get("action", ""), - gate_score=d.get("gate_score"), - time_s=d.get("time_s"), - candidate_hash=d.get("candidate_hash", ""), - update_origin=d.get("update_origin", ""), - update_target=d.get("update_target", ""), - ) - - def to_dict(self) -> dict: - d: dict[str, Any] = { - "meta_summary": self.meta_summary, - "patch": self.patch.to_dict(), - } - if self.action: - d["action"] = self.action - if self.gate_score is not None: - d["gate_score"] = self.gate_score - if self.time_s is not None: - d["time_s"] = self.time_s - if self.candidate_hash: - d["candidate_hash"] = self.candidate_hash - if self.update_origin: - d["update_origin"] = self.update_origin - if self.update_target: - d["update_target"] = self.update_target - return d - - # ── Epoch-level: SLOW_UPDATE ───────────────────────────────────────────── @dataclass diff --git a/skillopt_webui/app.py b/skillopt_webui/app.py index 01cb561..ef0c68f 100644 --- a/skillopt_webui/app.py +++ b/skillopt_webui/app.py @@ -86,8 +86,8 @@ class TrainingManager: if line and not line.startswith("#") and "=" in 
line: k, v = line.split("=", 1) env[k] = v - # Propagate TEACHER_* to base AZURE_OPENAI_* when base is missing, - # so student/default endpoints inherit from teacher config. + # Propagate OPTIMIZER_* to base AZURE_OPENAI_* when base is missing, + # so target/default endpoints inherit from optimizer config. _propagate = [ ("ENDPOINT", ""), ("API_VERSION", ""), ("AUTH_MODE", ""), ("MANAGED_IDENTITY_CLIENT_ID", ""), ("AD_SCOPE", ""), @@ -95,9 +95,9 @@ class TrainingManager: ] for suffix, _ in _propagate: base_key = f"AZURE_OPENAI_{suffix}" - teacher_key = f"TEACHER_AZURE_OPENAI_{suffix}" - if not env.get(base_key) and env.get(teacher_key): - env[base_key] = env[teacher_key] + optimizer_key = f"OPTIMIZER_AZURE_OPENAI_{suffix}" + if not env.get(base_key) and env.get(optimizer_key): + env[base_key] = env[optimizer_key] try: proc = subprocess.Popen( @@ -398,7 +398,7 @@ def build_ui(): use_slow_update = gr.Checkbox(value=True, label="Slow Update (epoch-boundary momentum)") use_meta_skill = gr.Checkbox(value=True, - label="Meta Skill (cross-epoch teacher memory)") + label="Meta Skill (cross-epoch optimizer memory)") use_gate = gr.Checkbox(value=True, label="Gate (validation-based accept/reject)") @@ -533,10 +533,13 @@ def main(): parser = argparse.ArgumentParser(description="SkillOpt WebUI") parser.add_argument("--port", type=int, default=7860) parser.add_argument("--share", action="store_true") + parser.add_argument("--host", type=str, default="0.0.0.0", + help="Server host. Use 0.0.0.0 for public access.") args = parser.parse_args() app = build_ui() app.launch( + server_name=args.host, server_port=args.port, share=args.share, theme=gr.themes.Soft(primary_hue="indigo"),