Merge pull request #40 from mvanhorn/fix/28-qwen-chat-timeout-and-thinking-tag

fix: forward Qwen target timeout and gate enable_thinking for vLLM targets
This commit is contained in:
Yifan Yang
2026-06-08 01:42:50 +08:00
committed by GitHub
3 changed files with 231 additions and 1 deletions

View File

@@ -142,6 +142,7 @@ def chat_target(
retries=retries,
stage=stage,
reasoning_effort=reasoning_effort,
timeout=timeout,
)
if get_target_backend() == "minimax_chat":
return _minimax.chat_target(
@@ -249,6 +250,7 @@ def chat_target_messages(
tools=tools,
tool_choice=tool_choice,
return_message=return_message,
timeout=timeout,
)
if get_target_backend() == "minimax_chat":
return _minimax.chat_target_messages(

View File

@@ -191,7 +191,8 @@ def _chat_messages_impl(
"messages": _json_safe(messages),
"max_tokens": min(max_completion_tokens, config.max_tokens),
}
payload["chat_template_kwargs"] = {"enable_thinking": config.enable_thinking}
if config.enable_thinking:
payload["chat_template_kwargs"] = {"enable_thinking": True}
if config.temperature is not None:
payload["temperature"] = config.temperature
if tools:

227
tests/test_qwen_backend.py Normal file
View File

@@ -0,0 +1,227 @@
"""Tests for the OpenAI-compatible Qwen chat backend."""
from __future__ import annotations
import importlib.util
import json
import os
import sys
import types
from collections.abc import Iterator
from dataclasses import fields
from typing import Any
import pytest
from skillopt.envs.searchqa.evaluator import extract_answer
_QWEN_CONFIG_ENV_KEYS = (
"BASE_URL",
"API_KEY",
"TEMPERATURE",
"TIMEOUT_SECONDS",
"MAX_TOKENS",
"ENABLE_THINKING",
)
_ENV_KEYS = ("OPTIMIZER_BACKEND", "TARGET_BACKEND") + tuple(
f"{prefix}QWEN_CHAT_{key}"
for prefix in ("", "OPTIMIZER_", "TARGET_")
for key in _QWEN_CONFIG_ENV_KEYS
)
class _FakeResponse:
def __init__(self, payload: dict[str, Any]) -> None:
self._payload = payload
def __enter__(self) -> _FakeResponse:
return self
def __exit__(self, exc_type: object, exc: object, traceback: object) -> None:
return None
def read(self) -> bytes:
return json.dumps(self._payload).encode("utf-8")
class _UrlopenRecorder:
def __init__(self, content: str = "<answer>yes</answer>") -> None:
self.content = content
self.calls: list[dict[str, Any]] = []
def __call__(self, request: Any, timeout: float | None = None) -> _FakeResponse:
request_data = request.data.decode("utf-8")
self.calls.append(
{
"payload": json.loads(request_data),
"timeout": timeout,
}
)
return _FakeResponse(
{
"choices": [
{
"message": {"content": self.content},
"finish_reason": "stop",
}
],
"usage": {
"prompt_tokens": 2,
"completion_tokens": 1,
"total_tokens": 3,
},
}
)
class _OpenAIClientStub:
def __init__(self, *args: Any, **kwargs: Any) -> None:
self.args = args
self.kwargs = kwargs
def _install_openai_stub() -> None:
if "openai" in sys.modules or importlib.util.find_spec("openai") is not None:
return
openai_stub = types.ModuleType("openai")
openai_stub.AzureOpenAI = _OpenAIClientStub
openai_stub.OpenAI = _OpenAIClientStub
sys.modules["openai"] = openai_stub
def _import_model_modules() -> tuple[Any, Any, Any]:
_install_openai_stub()
import skillopt.model as model_module
from skillopt.model import backend_config, qwen_backend
return model_module, backend_config, qwen_backend
def _snapshot_config(config: Any) -> dict[str, Any]:
return {field.name: getattr(config, field.name) for field in fields(config)}
def _restore_config(config: Any, snapshot: dict[str, Any]) -> None:
for key, value in snapshot.items():
setattr(config, key, value)
@pytest.fixture(autouse=True)
def isolate_qwen_state() -> Iterator[tuple[Any, Any]]:
model_module, backend_config, qwen_backend = _import_model_modules()
optimizer_config = _snapshot_config(qwen_backend.OPTIMIZER_CONFIG)
target_config = _snapshot_config(qwen_backend.TARGET_CONFIG)
optimizer_backend = backend_config.get_optimizer_backend()
target_backend = backend_config.get_target_backend()
env = {key: os.environ.get(key) for key in _ENV_KEYS}
qwen_backend.reset_token_tracker()
yield model_module, qwen_backend
qwen_backend.reset_token_tracker()
_restore_config(qwen_backend.OPTIMIZER_CONFIG, optimizer_config)
_restore_config(qwen_backend.TARGET_CONFIG, target_config)
backend_config.set_optimizer_backend(optimizer_backend)
backend_config.set_target_backend(target_backend)
for key, value in env.items():
if value is None:
os.environ.pop(key, None)
else:
os.environ[key] = value
def _use_qwen_target(model_module: Any, qwen_backend: Any, enable_thinking: bool) -> None:
model_module.set_target_backend("qwen_chat")
qwen_backend.TARGET_CONFIG.base_url = "http://qwen.example/v1"
qwen_backend.TARGET_CONFIG.api_key = ""
qwen_backend.TARGET_CONFIG.timeout_seconds = 300.0
qwen_backend.TARGET_CONFIG.max_tokens = 8000
qwen_backend.TARGET_CONFIG.temperature = None
qwen_backend.TARGET_CONFIG.enable_thinking = enable_thinking
qwen_backend.TARGET_CONFIG.deployment = "qwen-test"
def _record_urlopen(
monkeypatch: pytest.MonkeyPatch,
qwen_backend: Any,
content: str = "<answer>yes</answer>",
) -> _UrlopenRecorder:
recorder = _UrlopenRecorder(content)
monkeypatch.setattr(qwen_backend.urllib.request, "urlopen", recorder)
return recorder
def test_chat_target_omits_chat_template_kwargs_when_thinking_disabled(
monkeypatch: pytest.MonkeyPatch,
isolate_qwen_state: tuple[Any, Any],
) -> None:
model_module, qwen_backend = isolate_qwen_state
_use_qwen_target(model_module, qwen_backend, enable_thinking=False)
recorder = _record_urlopen(monkeypatch, qwen_backend)
text, usage = model_module.chat_target(
"system",
"user",
max_completion_tokens=128,
retries=1,
timeout=10.0,
)
assert text == "<answer>yes</answer>"
assert usage["total_tokens"] == 3
assert "chat_template_kwargs" not in recorder.calls[0]["payload"]
assert recorder.calls[0]["timeout"] == 10.0
def test_chat_target_includes_chat_template_kwargs_when_thinking_enabled(
monkeypatch: pytest.MonkeyPatch,
isolate_qwen_state: tuple[Any, Any],
) -> None:
model_module, qwen_backend = isolate_qwen_state
_use_qwen_target(model_module, qwen_backend, enable_thinking=True)
content = "<think>working</think>\n<answer>yes</answer>"
recorder = _record_urlopen(monkeypatch, qwen_backend, content=content)
text, _ = model_module.chat_target(
"system",
"user",
max_completion_tokens=128,
retries=1,
)
assert recorder.calls[0]["payload"]["chat_template_kwargs"] == {"enable_thinking": True}
assert extract_answer(text) == "yes"
def test_chat_target_messages_forwards_timeout_to_qwen_backend(
monkeypatch: pytest.MonkeyPatch,
isolate_qwen_state: tuple[Any, Any],
) -> None:
model_module, qwen_backend = isolate_qwen_state
_use_qwen_target(model_module, qwen_backend, enable_thinking=False)
recorder = _record_urlopen(monkeypatch, qwen_backend)
text, _ = model_module.chat_target_messages(
[{"role": "user", "content": "question"}],
max_completion_tokens=128,
retries=1,
timeout=10.0,
)
assert text == "<answer>yes</answer>"
assert recorder.calls[0]["timeout"] == 10.0
def test_configure_qwen_chat_runtime_toggle_controls_payload(
monkeypatch: pytest.MonkeyPatch,
isolate_qwen_state: tuple[Any, Any],
) -> None:
model_module, qwen_backend = isolate_qwen_state
_use_qwen_target(model_module, qwen_backend, enable_thinking=False)
recorder = _record_urlopen(monkeypatch, qwen_backend)
model_module.configure_qwen_chat(enable_thinking=True)
model_module.chat_target("system", "user", max_completion_tokens=128, retries=1)
model_module.configure_qwen_chat(enable_thinking=False)
model_module.chat_target("system", "user", max_completion_tokens=128, retries=1)
assert recorder.calls[0]["payload"]["chat_template_kwargs"] == {"enable_thinking": True}
assert "chat_template_kwargs" not in recorder.calls[1]["payload"]