mirror of
https://github.com/microsoft/SkillOpt.git
synced 2026-07-03 14:02:58 +08:00
fix(model): forward Qwen timeout and only set enable_thinking when true
Two bugs made local vLLM targets score acc=0.000: the router did not forward 'timeout' to the Qwen backend (so runs used the 300s default), and qwen_backend always injected chat_template_kwargs.enable_thinking, which non-Qwen vLLM servers reject or answer with <think> output and no <answer> tag. Forward timeout and only set the field when enabled. Closes #28 Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -142,6 +142,7 @@ def chat_target(
|
||||
retries=retries,
|
||||
stage=stage,
|
||||
reasoning_effort=reasoning_effort,
|
||||
timeout=timeout,
|
||||
)
|
||||
if get_target_backend() == "minimax_chat":
|
||||
return _minimax.chat_target(
|
||||
@@ -249,6 +250,7 @@ def chat_target_messages(
|
||||
tools=tools,
|
||||
tool_choice=tool_choice,
|
||||
return_message=return_message,
|
||||
timeout=timeout,
|
||||
)
|
||||
if get_target_backend() == "minimax_chat":
|
||||
return _minimax.chat_target_messages(
|
||||
|
||||
@@ -191,7 +191,8 @@ def _chat_messages_impl(
|
||||
"messages": _json_safe(messages),
|
||||
"max_tokens": min(max_completion_tokens, config.max_tokens),
|
||||
}
|
||||
payload["chat_template_kwargs"] = {"enable_thinking": config.enable_thinking}
|
||||
if config.enable_thinking:
|
||||
payload["chat_template_kwargs"] = {"enable_thinking": True}
|
||||
if config.temperature is not None:
|
||||
payload["temperature"] = config.temperature
|
||||
if tools:
|
||||
|
||||
227
tests/test_qwen_backend.py
Normal file
227
tests/test_qwen_backend.py
Normal file
@@ -0,0 +1,227 @@
|
||||
"""Tests for the OpenAI-compatible Qwen chat backend."""
|
||||
from __future__ import annotations
|
||||
|
||||
import importlib.util
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import types
|
||||
from collections.abc import Iterator
|
||||
from dataclasses import fields
|
||||
from typing import Any
|
||||
|
||||
import pytest
|
||||
|
||||
from skillopt.envs.searchqa.evaluator import extract_answer
|
||||
|
||||
|
||||
_QWEN_CONFIG_ENV_KEYS = (
|
||||
"BASE_URL",
|
||||
"API_KEY",
|
||||
"TEMPERATURE",
|
||||
"TIMEOUT_SECONDS",
|
||||
"MAX_TOKENS",
|
||||
"ENABLE_THINKING",
|
||||
)
|
||||
_ENV_KEYS = ("OPTIMIZER_BACKEND", "TARGET_BACKEND") + tuple(
|
||||
f"{prefix}QWEN_CHAT_{key}"
|
||||
for prefix in ("", "OPTIMIZER_", "TARGET_")
|
||||
for key in _QWEN_CONFIG_ENV_KEYS
|
||||
)
|
||||
|
||||
|
||||
class _FakeResponse:
|
||||
def __init__(self, payload: dict[str, Any]) -> None:
|
||||
self._payload = payload
|
||||
|
||||
def __enter__(self) -> _FakeResponse:
|
||||
return self
|
||||
|
||||
def __exit__(self, exc_type: object, exc: object, traceback: object) -> None:
|
||||
return None
|
||||
|
||||
def read(self) -> bytes:
|
||||
return json.dumps(self._payload).encode("utf-8")
|
||||
|
||||
|
||||
class _UrlopenRecorder:
|
||||
def __init__(self, content: str = "<answer>yes</answer>") -> None:
|
||||
self.content = content
|
||||
self.calls: list[dict[str, Any]] = []
|
||||
|
||||
def __call__(self, request: Any, timeout: float | None = None) -> _FakeResponse:
|
||||
request_data = request.data.decode("utf-8")
|
||||
self.calls.append(
|
||||
{
|
||||
"payload": json.loads(request_data),
|
||||
"timeout": timeout,
|
||||
}
|
||||
)
|
||||
return _FakeResponse(
|
||||
{
|
||||
"choices": [
|
||||
{
|
||||
"message": {"content": self.content},
|
||||
"finish_reason": "stop",
|
||||
}
|
||||
],
|
||||
"usage": {
|
||||
"prompt_tokens": 2,
|
||||
"completion_tokens": 1,
|
||||
"total_tokens": 3,
|
||||
},
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
class _OpenAIClientStub:
|
||||
def __init__(self, *args: Any, **kwargs: Any) -> None:
|
||||
self.args = args
|
||||
self.kwargs = kwargs
|
||||
|
||||
|
||||
def _install_openai_stub() -> None:
|
||||
if "openai" in sys.modules or importlib.util.find_spec("openai") is not None:
|
||||
return
|
||||
openai_stub = types.ModuleType("openai")
|
||||
openai_stub.AzureOpenAI = _OpenAIClientStub
|
||||
openai_stub.OpenAI = _OpenAIClientStub
|
||||
sys.modules["openai"] = openai_stub
|
||||
|
||||
|
||||
def _import_model_modules() -> tuple[Any, Any, Any]:
|
||||
_install_openai_stub()
|
||||
import skillopt.model as model_module
|
||||
from skillopt.model import backend_config, qwen_backend
|
||||
|
||||
return model_module, backend_config, qwen_backend
|
||||
|
||||
|
||||
def _snapshot_config(config: Any) -> dict[str, Any]:
|
||||
return {field.name: getattr(config, field.name) for field in fields(config)}
|
||||
|
||||
|
||||
def _restore_config(config: Any, snapshot: dict[str, Any]) -> None:
|
||||
for key, value in snapshot.items():
|
||||
setattr(config, key, value)
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def isolate_qwen_state() -> Iterator[tuple[Any, Any]]:
|
||||
model_module, backend_config, qwen_backend = _import_model_modules()
|
||||
optimizer_config = _snapshot_config(qwen_backend.OPTIMIZER_CONFIG)
|
||||
target_config = _snapshot_config(qwen_backend.TARGET_CONFIG)
|
||||
optimizer_backend = backend_config.get_optimizer_backend()
|
||||
target_backend = backend_config.get_target_backend()
|
||||
env = {key: os.environ.get(key) for key in _ENV_KEYS}
|
||||
qwen_backend.reset_token_tracker()
|
||||
yield model_module, qwen_backend
|
||||
qwen_backend.reset_token_tracker()
|
||||
_restore_config(qwen_backend.OPTIMIZER_CONFIG, optimizer_config)
|
||||
_restore_config(qwen_backend.TARGET_CONFIG, target_config)
|
||||
backend_config.set_optimizer_backend(optimizer_backend)
|
||||
backend_config.set_target_backend(target_backend)
|
||||
for key, value in env.items():
|
||||
if value is None:
|
||||
os.environ.pop(key, None)
|
||||
else:
|
||||
os.environ[key] = value
|
||||
|
||||
|
||||
def _use_qwen_target(model_module: Any, qwen_backend: Any, enable_thinking: bool) -> None:
|
||||
model_module.set_target_backend("qwen_chat")
|
||||
qwen_backend.TARGET_CONFIG.base_url = "http://qwen.example/v1"
|
||||
qwen_backend.TARGET_CONFIG.api_key = ""
|
||||
qwen_backend.TARGET_CONFIG.timeout_seconds = 300.0
|
||||
qwen_backend.TARGET_CONFIG.max_tokens = 8000
|
||||
qwen_backend.TARGET_CONFIG.temperature = None
|
||||
qwen_backend.TARGET_CONFIG.enable_thinking = enable_thinking
|
||||
qwen_backend.TARGET_CONFIG.deployment = "qwen-test"
|
||||
|
||||
|
||||
def _record_urlopen(
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
qwen_backend: Any,
|
||||
content: str = "<answer>yes</answer>",
|
||||
) -> _UrlopenRecorder:
|
||||
recorder = _UrlopenRecorder(content)
|
||||
monkeypatch.setattr(qwen_backend.urllib.request, "urlopen", recorder)
|
||||
return recorder
|
||||
|
||||
|
||||
def test_chat_target_omits_chat_template_kwargs_when_thinking_disabled(
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
isolate_qwen_state: tuple[Any, Any],
|
||||
) -> None:
|
||||
model_module, qwen_backend = isolate_qwen_state
|
||||
_use_qwen_target(model_module, qwen_backend, enable_thinking=False)
|
||||
recorder = _record_urlopen(monkeypatch, qwen_backend)
|
||||
|
||||
text, usage = model_module.chat_target(
|
||||
"system",
|
||||
"user",
|
||||
max_completion_tokens=128,
|
||||
retries=1,
|
||||
timeout=10.0,
|
||||
)
|
||||
|
||||
assert text == "<answer>yes</answer>"
|
||||
assert usage["total_tokens"] == 3
|
||||
assert "chat_template_kwargs" not in recorder.calls[0]["payload"]
|
||||
assert recorder.calls[0]["timeout"] == 10.0
|
||||
|
||||
|
||||
def test_chat_target_includes_chat_template_kwargs_when_thinking_enabled(
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
isolate_qwen_state: tuple[Any, Any],
|
||||
) -> None:
|
||||
model_module, qwen_backend = isolate_qwen_state
|
||||
_use_qwen_target(model_module, qwen_backend, enable_thinking=True)
|
||||
content = "<think>working</think>\n<answer>yes</answer>"
|
||||
recorder = _record_urlopen(monkeypatch, qwen_backend, content=content)
|
||||
|
||||
text, _ = model_module.chat_target(
|
||||
"system",
|
||||
"user",
|
||||
max_completion_tokens=128,
|
||||
retries=1,
|
||||
)
|
||||
|
||||
assert recorder.calls[0]["payload"]["chat_template_kwargs"] == {"enable_thinking": True}
|
||||
assert extract_answer(text) == "yes"
|
||||
|
||||
|
||||
def test_chat_target_messages_forwards_timeout_to_qwen_backend(
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
isolate_qwen_state: tuple[Any, Any],
|
||||
) -> None:
|
||||
model_module, qwen_backend = isolate_qwen_state
|
||||
_use_qwen_target(model_module, qwen_backend, enable_thinking=False)
|
||||
recorder = _record_urlopen(monkeypatch, qwen_backend)
|
||||
|
||||
text, _ = model_module.chat_target_messages(
|
||||
[{"role": "user", "content": "question"}],
|
||||
max_completion_tokens=128,
|
||||
retries=1,
|
||||
timeout=10.0,
|
||||
)
|
||||
|
||||
assert text == "<answer>yes</answer>"
|
||||
assert recorder.calls[0]["timeout"] == 10.0
|
||||
|
||||
|
||||
def test_configure_qwen_chat_runtime_toggle_controls_payload(
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
isolate_qwen_state: tuple[Any, Any],
|
||||
) -> None:
|
||||
model_module, qwen_backend = isolate_qwen_state
|
||||
_use_qwen_target(model_module, qwen_backend, enable_thinking=False)
|
||||
recorder = _record_urlopen(monkeypatch, qwen_backend)
|
||||
|
||||
model_module.configure_qwen_chat(enable_thinking=True)
|
||||
model_module.chat_target("system", "user", max_completion_tokens=128, retries=1)
|
||||
model_module.configure_qwen_chat(enable_thinking=False)
|
||||
model_module.chat_target("system", "user", max_completion_tokens=128, retries=1)
|
||||
|
||||
assert recorder.calls[0]["payload"]["chat_template_kwargs"] == {"enable_thinking": True}
|
||||
assert "chat_template_kwargs" not in recorder.calls[1]["payload"]
|
||||
Reference in New Issue
Block a user