mirror of
https://github.com/microsoft/SkillOpt.git
synced 2026-07-03 14:02:58 +08:00
Two bugs made local vLLM targets score acc=0.000: the router did not forward 'timeout' to the Qwen backend (so runs used the 300s default), and qwen_backend always injected chat_template_kwargs.enable_thinking, which non-Qwen vLLM servers reject or answer with <think> output and no <answer> tag. Forward timeout and only set the field when enabled. Closes #28 Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
228 lines
7.3 KiB
Python
228 lines
7.3 KiB
Python
"""Tests for the OpenAI-compatible Qwen chat backend."""
|
|
from __future__ import annotations
|
|
|
|
import importlib.util
|
|
import json
|
|
import os
|
|
import sys
|
|
import types
|
|
from collections.abc import Iterator
|
|
from dataclasses import fields
|
|
from typing import Any
|
|
|
|
import pytest
|
|
|
|
from skillopt.envs.searchqa.evaluator import extract_answer
|
|
|
|
|
|
_QWEN_CONFIG_ENV_KEYS = (
|
|
"BASE_URL",
|
|
"API_KEY",
|
|
"TEMPERATURE",
|
|
"TIMEOUT_SECONDS",
|
|
"MAX_TOKENS",
|
|
"ENABLE_THINKING",
|
|
)
|
|
_ENV_KEYS = ("OPTIMIZER_BACKEND", "TARGET_BACKEND") + tuple(
|
|
f"{prefix}QWEN_CHAT_{key}"
|
|
for prefix in ("", "OPTIMIZER_", "TARGET_")
|
|
for key in _QWEN_CONFIG_ENV_KEYS
|
|
)
|
|
|
|
|
|
class _FakeResponse:
|
|
def __init__(self, payload: dict[str, Any]) -> None:
|
|
self._payload = payload
|
|
|
|
def __enter__(self) -> _FakeResponse:
|
|
return self
|
|
|
|
def __exit__(self, exc_type: object, exc: object, traceback: object) -> None:
|
|
return None
|
|
|
|
def read(self) -> bytes:
|
|
return json.dumps(self._payload).encode("utf-8")
|
|
|
|
|
|
class _UrlopenRecorder:
|
|
def __init__(self, content: str = "<answer>yes</answer>") -> None:
|
|
self.content = content
|
|
self.calls: list[dict[str, Any]] = []
|
|
|
|
def __call__(self, request: Any, timeout: float | None = None) -> _FakeResponse:
|
|
request_data = request.data.decode("utf-8")
|
|
self.calls.append(
|
|
{
|
|
"payload": json.loads(request_data),
|
|
"timeout": timeout,
|
|
}
|
|
)
|
|
return _FakeResponse(
|
|
{
|
|
"choices": [
|
|
{
|
|
"message": {"content": self.content},
|
|
"finish_reason": "stop",
|
|
}
|
|
],
|
|
"usage": {
|
|
"prompt_tokens": 2,
|
|
"completion_tokens": 1,
|
|
"total_tokens": 3,
|
|
},
|
|
}
|
|
)
|
|
|
|
|
|
class _OpenAIClientStub:
|
|
def __init__(self, *args: Any, **kwargs: Any) -> None:
|
|
self.args = args
|
|
self.kwargs = kwargs
|
|
|
|
|
|
def _install_openai_stub() -> None:
|
|
if "openai" in sys.modules or importlib.util.find_spec("openai") is not None:
|
|
return
|
|
openai_stub = types.ModuleType("openai")
|
|
openai_stub.AzureOpenAI = _OpenAIClientStub
|
|
openai_stub.OpenAI = _OpenAIClientStub
|
|
sys.modules["openai"] = openai_stub
|
|
|
|
|
|
def _import_model_modules() -> tuple[Any, Any, Any]:
    """Import the model package and return the modules the tests need.

    The imports are deferred to call time so the ``openai`` stub can be
    installed first.

    Returns:
        ``(skillopt.model, skillopt.model.backend_config,
        skillopt.model.qwen_backend)``.
    """
    _install_openai_stub()
    import skillopt.model as model_module
    from skillopt.model import backend_config, qwen_backend

    return model_module, backend_config, qwen_backend
|
|
|
|
|
|
def _snapshot_config(config: Any) -> dict[str, Any]:
|
|
return {field.name: getattr(config, field.name) for field in fields(config)}
|
|
|
|
|
|
def _restore_config(config: Any, snapshot: dict[str, Any]) -> None:
|
|
for key, value in snapshot.items():
|
|
setattr(config, key, value)
|
|
|
|
|
|
@pytest.fixture(autouse=True)
def isolate_qwen_state() -> Iterator[tuple[Any, Any]]:
    """Snapshot Qwen backend state around each test and restore it afterwards.

    Being autouse, the fixture wraps every test in this module. Before the test
    it records both Qwen config objects, the selected optimizer/target backends
    and the environment variables listed in ``_ENV_KEYS``, then resets the token
    tracker. After the test it resets the tracker again and puts everything
    back.

    Yields:
        ``(model_module, qwen_backend)`` for tests that request the fixture by
        name.
    """
    model_module, backend_config, qwen_backend = _import_model_modules()

    # Capture everything a test may mutate.
    optimizer_config = _snapshot_config(qwen_backend.OPTIMIZER_CONFIG)
    target_config = _snapshot_config(qwen_backend.TARGET_CONFIG)
    optimizer_backend = backend_config.get_optimizer_backend()
    target_backend = backend_config.get_target_backend()
    env = {key: os.environ.get(key) for key in _ENV_KEYS}

    qwen_backend.reset_token_tracker()
    yield model_module, qwen_backend

    # Teardown: undo the test's changes.
    qwen_backend.reset_token_tracker()
    _restore_config(qwen_backend.OPTIMIZER_CONFIG, optimizer_config)
    _restore_config(qwen_backend.TARGET_CONFIG, target_config)
    backend_config.set_optimizer_backend(optimizer_backend)
    backend_config.set_target_backend(target_backend)
    for key, value in env.items():
        if value is None:
            # The variable was unset before the test; make sure it is unset again.
            os.environ.pop(key, None)
        else:
            os.environ[key] = value
|
|
|
|
|
|
def _use_qwen_target(model_module: Any, qwen_backend: Any, enable_thinking: bool) -> None:
|
|
model_module.set_target_backend("qwen_chat")
|
|
qwen_backend.TARGET_CONFIG.base_url = "http://qwen.example/v1"
|
|
qwen_backend.TARGET_CONFIG.api_key = ""
|
|
qwen_backend.TARGET_CONFIG.timeout_seconds = 300.0
|
|
qwen_backend.TARGET_CONFIG.max_tokens = 8000
|
|
qwen_backend.TARGET_CONFIG.temperature = None
|
|
qwen_backend.TARGET_CONFIG.enable_thinking = enable_thinking
|
|
qwen_backend.TARGET_CONFIG.deployment = "qwen-test"
|
|
|
|
|
|
def _record_urlopen(
    monkeypatch: pytest.MonkeyPatch,
    qwen_backend: Any,
    content: str = "<answer>yes</answer>",
) -> _UrlopenRecorder:
    """Replace ``urlopen`` as seen by the Qwen backend with a recording fake.

    Args:
        monkeypatch: pytest fixture used to apply the patch, so it is undone
            automatically when the test ends.
        qwen_backend: module whose ``urllib.request.urlopen`` attribute is
            replaced.
        content: message content the fake completion returns.

    Returns:
        The recorder, whose ``calls`` list exposes the captured payloads and
        timeouts.
    """
    recorder = _UrlopenRecorder(content)
    monkeypatch.setattr(qwen_backend.urllib.request, "urlopen", recorder)
    return recorder
|
|
|
|
|
|
def test_chat_target_omits_chat_template_kwargs_when_thinking_disabled(
    monkeypatch: pytest.MonkeyPatch,
    isolate_qwen_state: tuple[Any, Any],
) -> None:
    """With thinking disabled, the request has no ``chat_template_kwargs`` field.

    Also checks that the caller's ``timeout`` reaches ``urlopen`` instead of the
    configured 300 s default.
    """
    model_module, qwen_backend = isolate_qwen_state
    _use_qwen_target(model_module, qwen_backend, enable_thinking=False)
    recorder = _record_urlopen(monkeypatch, qwen_backend)

    text, usage = model_module.chat_target(
        "system",
        "user",
        max_completion_tokens=128,
        retries=1,
        timeout=10.0,
    )

    assert text == "<answer>yes</answer>"
    assert usage["total_tokens"] == 3
    assert "chat_template_kwargs" not in recorder.calls[0]["payload"]
    assert recorder.calls[0]["timeout"] == 10.0
|
|
|
|
|
|
def test_chat_target_includes_chat_template_kwargs_when_thinking_enabled(
    monkeypatch: pytest.MonkeyPatch,
    isolate_qwen_state: tuple[Any, Any],
) -> None:
    """With thinking enabled, the request carries ``enable_thinking: True``.

    The canned reply contains a ``<think>`` section before the ``<answer>`` tag;
    ``extract_answer`` must still yield the answer text.
    """
    model_module, qwen_backend = isolate_qwen_state
    _use_qwen_target(model_module, qwen_backend, enable_thinking=True)
    content = "<think>working</think>\n<answer>yes</answer>"
    recorder = _record_urlopen(monkeypatch, qwen_backend, content=content)

    text, _ = model_module.chat_target(
        "system",
        "user",
        max_completion_tokens=128,
        retries=1,
    )

    assert recorder.calls[0]["payload"]["chat_template_kwargs"] == {"enable_thinking": True}
    assert extract_answer(text) == "yes"
|
|
|
|
|
|
def test_chat_target_messages_forwards_timeout_to_qwen_backend(
    monkeypatch: pytest.MonkeyPatch,
    isolate_qwen_state: tuple[Any, Any],
) -> None:
    """``chat_target_messages`` passes the caller's ``timeout`` through to ``urlopen``."""
    model_module, qwen_backend = isolate_qwen_state
    _use_qwen_target(model_module, qwen_backend, enable_thinking=False)
    recorder = _record_urlopen(monkeypatch, qwen_backend)

    text, _ = model_module.chat_target_messages(
        [{"role": "user", "content": "question"}],
        max_completion_tokens=128,
        retries=1,
        timeout=10.0,
    )

    assert text == "<answer>yes</answer>"
    # The configured default is 300.0, so 10.0 can only come from the argument.
    assert recorder.calls[0]["timeout"] == 10.0
|
|
|
|
|
|
def test_configure_qwen_chat_runtime_toggle_controls_payload(
    monkeypatch: pytest.MonkeyPatch,
    isolate_qwen_state: tuple[Any, Any],
) -> None:
    """Toggling ``enable_thinking`` via ``configure_qwen_chat`` affects the next request.

    The first call (thinking on) must include ``chat_template_kwargs``; the
    second call (thinking off again) must omit the field entirely.
    """
    model_module, qwen_backend = isolate_qwen_state
    _use_qwen_target(model_module, qwen_backend, enable_thinking=False)
    recorder = _record_urlopen(monkeypatch, qwen_backend)

    model_module.configure_qwen_chat(enable_thinking=True)
    model_module.chat_target("system", "user", max_completion_tokens=128, retries=1)
    model_module.configure_qwen_chat(enable_thinking=False)
    model_module.chat_target("system", "user", max_completion_tokens=128, retries=1)

    assert recorder.calls[0]["payload"]["chat_template_kwargs"] == {"enable_thinking": True}
    assert "chat_template_kwargs" not in recorder.calls[1]["payload"]
|