fix(model): forward Qwen timeout and only set enable_thinking when true

Two bugs made local vLLM targets score acc=0.000: the router did not
forward 'timeout' to the Qwen backend (so runs used the 300s default),
and qwen_backend always injected chat_template_kwargs.enable_thinking,
which non-Qwen vLLM servers reject or answer with <think> output and no
<answer> tag. Forward timeout and only set the field when enabled.

Closes #28

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
Matt Van Horn
2026-06-07 07:41:35 -07:00
parent ee9931ec01
commit c31c50be51
3 changed files with 231 additions and 1 deletions

View File

@@ -142,6 +142,7 @@ def chat_target(
retries=retries,
stage=stage,
reasoning_effort=reasoning_effort,
timeout=timeout,
)
if get_target_backend() == "minimax_chat":
return _minimax.chat_target(
@@ -249,6 +250,7 @@ def chat_target_messages(
tools=tools,
tool_choice=tool_choice,
return_message=return_message,
timeout=timeout,
)
if get_target_backend() == "minimax_chat":
return _minimax.chat_target_messages(

View File

@@ -191,7 +191,8 @@ def _chat_messages_impl(
"messages": _json_safe(messages),
"max_tokens": min(max_completion_tokens, config.max_tokens),
}
payload["chat_template_kwargs"] = {"enable_thinking": config.enable_thinking}
if config.enable_thinking:
payload["chat_template_kwargs"] = {"enable_thinking": True}
if config.temperature is not None:
payload["temperature"] = config.temperature
if tools:

227
tests/test_qwen_backend.py Normal file
View File

@@ -0,0 +1,227 @@
"""Tests for the OpenAI-compatible Qwen chat backend."""
from __future__ import annotations
import importlib.util
import json
import os
import sys
import types
from collections.abc import Iterator
from dataclasses import fields
from typing import Any
import pytest
from skillopt.envs.searchqa.evaluator import extract_answer
# Setting-name suffixes that get combined with a role prefix and "QWEN_CHAT_" below.
_QWEN_CONFIG_ENV_KEYS = (
    "BASE_URL",
    "API_KEY",
    "TEMPERATURE",
    "TIMEOUT_SECONDS",
    "MAX_TOKENS",
    "ENABLE_THINKING",
)
# Environment variable names: the two backend selectors followed by every
# "<prefix>QWEN_CHAT_<suffix>" combination (prefix varies slowest).
_ENV_KEYS = (
    "OPTIMIZER_BACKEND",
    "TARGET_BACKEND",
    *(
        f"{prefix}QWEN_CHAT_{key}"
        for prefix in ("", "OPTIMIZER_", "TARGET_")
        for key in _QWEN_CONFIG_ENV_KEYS
    ),
)
class _FakeResponse:
    """Context-manager object whose ``read()`` returns a payload encoded as JSON bytes."""

    def __init__(self, payload: dict[str, Any]) -> None:
        self._payload = payload

    def __enter__(self) -> _FakeResponse:
        return self

    def __exit__(self, exc_type: object, exc: object, traceback: object) -> None:
        # A None result means exceptions raised inside the ``with`` body propagate.
        return None

    def read(self) -> bytes:
        """Return the stored payload serialised as UTF-8 encoded JSON."""
        serialised = json.dumps(self._payload)
        return serialised.encode("utf-8")
class _UrlopenRecorder:
    """Callable that logs each request it receives and answers with a canned completion.

    Every call appends ``{"payload": <decoded JSON body>, "timeout": <timeout>}`` to
    ``calls`` and returns a ``_FakeResponse`` carrying ``content`` as the message text.
    """

    def __init__(self, content: str = "<answer>yes</answer>") -> None:
        self.content = content
        self.calls: list[dict[str, Any]] = []

    def __call__(self, request: Any, timeout: float | None = None) -> _FakeResponse:
        decoded_body = json.loads(request.data.decode("utf-8"))
        self.calls.append({"payload": decoded_body, "timeout": timeout})

        choice = {
            "message": {"content": self.content},
            "finish_reason": "stop",
        }
        usage = {
            "prompt_tokens": 2,
            "completion_tokens": 1,
            "total_tokens": 3,
        }
        return _FakeResponse({"choices": [choice], "usage": usage})
class _OpenAIClientStub:
    """Placeholder client that only remembers the arguments it was constructed with."""

    def __init__(self, *args: Any, **kwargs: Any) -> None:
        self.args, self.kwargs = args, kwargs
def _install_openai_stub() -> None:
    """Register a placeholder ``openai`` module when no real one is available.

    Returns without changes if ``openai`` is already in ``sys.modules`` or the
    import system can locate it. Otherwise a new module exposing ``AzureOpenAI``
    and ``OpenAI`` (both bound to ``_OpenAIClientStub``) is stored in ``sys.modules``.
    """
    already_loaded = "openai" in sys.modules
    # ``find_spec`` is only consulted when the module is not loaded yet.
    if already_loaded or importlib.util.find_spec("openai") is not None:
        return

    stub_module = types.ModuleType("openai")
    for client_name in ("AzureOpenAI", "OpenAI"):
        setattr(stub_module, client_name, _OpenAIClientStub)
    sys.modules["openai"] = stub_module
def _import_model_modules() -> tuple[Any, Any, Any]:
    """Import the model package lazily, after the ``openai`` stub step has run.

    Returns:
        ``(skillopt.model, backend_config, qwen_backend)``.
    """
    # Must run before the imports below so ``openai`` is resolvable by then.
    _install_openai_stub()

    import skillopt.model as model_module
    from skillopt.model import backend_config, qwen_backend

    modules = (model_module, backend_config, qwen_backend)
    return modules
def _snapshot_config(config: Any) -> dict[str, Any]:
    """Copy every dataclass field of ``config`` into a ``{field name: value}`` dict."""
    snapshot: dict[str, Any] = {}
    for spec in fields(config):
        snapshot[spec.name] = getattr(config, spec.name)
    return snapshot
def _restore_config(config: Any, snapshot: dict[str, Any]) -> None:
    """Assign each ``snapshot`` entry back onto ``config`` as an attribute."""
    for name in snapshot:
        setattr(config, name, snapshot[name])
@pytest.fixture(autouse=True)
def isolate_qwen_state() -> Iterator[tuple[Any, Any]]:
    """Save Qwen backend state before each test and put it back afterwards.

    Captured before the test: the fields of both Qwen config objects, the
    selected optimizer/target backends, and the environment variables named
    in ``_ENV_KEYS``. The token tracker is reset on both sides of the test.

    Yields:
        ``(model_module, qwen_backend)`` for the test to use.
    """
    model_module, backend_config, qwen_backend = _import_model_modules()

    saved_optimizer_cfg = _snapshot_config(qwen_backend.OPTIMIZER_CONFIG)
    saved_target_cfg = _snapshot_config(qwen_backend.TARGET_CONFIG)
    saved_optimizer_backend = backend_config.get_optimizer_backend()
    saved_target_backend = backend_config.get_target_backend()
    saved_env = {name: os.environ.get(name) for name in _ENV_KEYS}
    qwen_backend.reset_token_tracker()

    yield model_module, qwen_backend

    qwen_backend.reset_token_tracker()
    _restore_config(qwen_backend.OPTIMIZER_CONFIG, saved_optimizer_cfg)
    _restore_config(qwen_backend.TARGET_CONFIG, saved_target_cfg)
    backend_config.set_optimizer_backend(saved_optimizer_backend)
    backend_config.set_target_backend(saved_target_backend)
    for name, previous in saved_env.items():
        if previous is not None:
            os.environ[name] = previous
            continue
        # The variable was unset before the test; make sure it is unset again.
        os.environ.pop(name, None)
def _use_qwen_target(model_module: Any, qwen_backend: Any, enable_thinking: bool) -> None:
    """Select the ``qwen_chat`` target backend and give its config fixed test values.

    Only ``enable_thinking`` varies between callers; every other setting is constant.
    """
    model_module.set_target_backend("qwen_chat")

    settings = {
        "base_url": "http://qwen.example/v1",
        "api_key": "",
        "timeout_seconds": 300.0,
        "max_tokens": 8000,
        "temperature": None,
        "enable_thinking": enable_thinking,
        "deployment": "qwen-test",
    }
    target_config = qwen_backend.TARGET_CONFIG
    for attr, value in settings.items():
        setattr(target_config, attr, value)
def _record_urlopen(
    monkeypatch: pytest.MonkeyPatch,
    qwen_backend: Any,
    content: str = "<answer>yes</answer>",
) -> _UrlopenRecorder:
    """Swap ``urlopen`` on the backend's ``urllib.request`` for a recorder and return it.

    ``content`` is the message text the recorder puts in its canned response.
    """
    fake_urlopen = _UrlopenRecorder(content)
    request_module = qwen_backend.urllib.request
    monkeypatch.setattr(request_module, "urlopen", fake_urlopen)
    return fake_urlopen
def test_chat_target_omits_chat_template_kwargs_when_thinking_disabled(
    monkeypatch: pytest.MonkeyPatch,
    isolate_qwen_state: tuple[Any, Any],
) -> None:
    """Thinking off: no ``chat_template_kwargs`` in the payload, caller timeout reaches urlopen."""
    model_module, qwen_backend = isolate_qwen_state
    _use_qwen_target(model_module, qwen_backend, enable_thinking=False)
    recorder = _record_urlopen(monkeypatch, qwen_backend)

    call_options = {"max_completion_tokens": 128, "retries": 1, "timeout": 10.0}
    reply, token_usage = model_module.chat_target("system", "user", **call_options)

    assert reply == "<answer>yes</answer>"
    assert token_usage["total_tokens"] == 3
    first_call = recorder.calls[0]
    assert "chat_template_kwargs" not in first_call["payload"]
    assert first_call["timeout"] == 10.0
def test_chat_target_includes_chat_template_kwargs_when_thinking_enabled(
    monkeypatch: pytest.MonkeyPatch,
    isolate_qwen_state: tuple[Any, Any],
) -> None:
    """Thinking on: the payload carries ``enable_thinking`` and the answer is still extractable."""
    model_module, qwen_backend = isolate_qwen_state
    _use_qwen_target(model_module, qwen_backend, enable_thinking=True)
    thinking_reply = "<think>working</think>\n<answer>yes</answer>"
    recorder = _record_urlopen(monkeypatch, qwen_backend, content=thinking_reply)

    call_options = {"max_completion_tokens": 128, "retries": 1}
    reply, _ = model_module.chat_target("system", "user", **call_options)

    sent_payload = recorder.calls[0]["payload"]
    assert sent_payload["chat_template_kwargs"] == {"enable_thinking": True}
    assert extract_answer(reply) == "yes"
def test_chat_target_messages_forwards_timeout_to_qwen_backend(
    monkeypatch: pytest.MonkeyPatch,
    isolate_qwen_state: tuple[Any, Any],
) -> None:
    """``chat_target_messages`` passes the caller's ``timeout`` through to ``urlopen``."""
    model_module, qwen_backend = isolate_qwen_state
    _use_qwen_target(model_module, qwen_backend, enable_thinking=False)
    recorder = _record_urlopen(monkeypatch, qwen_backend)

    conversation = [{"role": "user", "content": "question"}]
    reply, _ = model_module.chat_target_messages(
        conversation,
        max_completion_tokens=128,
        retries=1,
        timeout=10.0,
    )

    assert reply == "<answer>yes</answer>"
    observed_timeout = recorder.calls[0]["timeout"]
    assert observed_timeout == 10.0
def test_configure_qwen_chat_runtime_toggle_controls_payload(
    monkeypatch: pytest.MonkeyPatch,
    isolate_qwen_state: tuple[Any, Any],
) -> None:
    """Toggling ``enable_thinking`` via ``configure_qwen_chat`` changes the next request's payload."""
    model_module, qwen_backend = isolate_qwen_state
    _use_qwen_target(model_module, qwen_backend, enable_thinking=False)
    recorder = _record_urlopen(monkeypatch, qwen_backend)

    # One request per setting: thinking switched on first, then off.
    for thinking in (True, False):
        model_module.configure_qwen_chat(enable_thinking=thinking)
        model_module.chat_target("system", "user", max_completion_tokens=128, retries=1)

    enabled_call, disabled_call = recorder.calls[0], recorder.calls[1]
    assert enabled_call["payload"]["chat_template_kwargs"] == {"enable_thinking": True}
    assert "chat_template_kwargs" not in disabled_call["payload"]