import logging
import os
import time
from typing import Optional

from openai import OpenAI

logger = logging.getLogger(__name__)


class LLMClient:
    """Low-level OpenAI-compatible LLM client with retry and token tracking.

    The API key is read from the ``DASHSCOPE_API_KEY`` environment variable.
    Every successful response adds its reported token counts to running
    totals exposed through :attr:`usage`.

    Usage::

        llm = LLMClient()
        content = llm.chat("qwen3.5-flash", [{"role": "user", "content": "Hello"}])
        print(llm.usage)
    """

    # Default model identifiers. They are not read inside this class;
    # presumably callers pass them to ``chat`` -- verify against call sites.
    IMAGE_MODEL = "qwen3-vl-plus"
    TEXT_MODEL = "qwen3.5-flash-2026-02-23"

    # Default per-request timeout, passed to the SDK as ``timeout``.
    TIMEOUT = 120
    # Total number of attempts made by ``_retry`` (not "extra" retries).
    MAX_RETRIES = 3

    def __init__(
        self,
        *,
        base_url: str = "https://dashscope.aliyuncs.com/compatible-mode/v1",
        timeout: int | None = None,
    ):
        """Create a client bound to *base_url*.

        Args:
            base_url: Root URL of the OpenAI-compatible endpoint.
            timeout: Default per-request timeout. ``None`` (or ``0``) falls
                back to :attr:`TIMEOUT`.

        Raises:
            ValueError: If ``DASHSCOPE_API_KEY`` is unset or empty.
        """
        key = os.environ.get("DASHSCOPE_API_KEY", "")
        if not key:
            raise ValueError("DASHSCOPE_API_KEY environment variable is not set.")
        # NOTE(review): the OpenAI SDK may also retry internally; if so those
        # retries multiply with MAX_RETRIES below -- confirm the intended total.
        self._client = OpenAI(api_key=key, base_url=base_url)
        # ``or`` is deliberate: a falsy timeout (None or 0) means "use default".
        self._timeout = timeout or self.TIMEOUT
        self._prompt_tokens = 0
        self._completion_tokens = 0

    @property
    def usage(self) -> dict:
        """Return the accumulated token counts.

        The dict has the keys ``prompt_tokens``, ``completion_tokens`` and
        ``total_tokens`` (the sum of the first two).
        """
        return {
            "prompt_tokens": self._prompt_tokens,
            "completion_tokens": self._completion_tokens,
            "total_tokens": self._prompt_tokens + self._completion_tokens,
        }

    @staticmethod
    def estimate_tokens(text: str) -> int:
        """Return a quick token estimate for *text* (always at least 1).

        Characters in U+4E00-U+9FFF (CJK Unified Ideographs) or
        U+3000-U+303F (CJK Symbols and Punctuation) are counted at about
        1.7 characters per token; everything else at about 3.0.
        """
        # Explicit escapes keep the range bounds unambiguous; the lower bound
        # of the punctuation block (U+3000, ideographic space) is visually
        # indistinguishable from an ASCII space when written literally.
        cjk = sum(
            1
            for c in text
            if "\u4e00" <= c <= "\u9fff" or "\u3000" <= c <= "\u303f"
        )
        other = len(text) - cjk
        return max(1, int(cjk / 1.7 + other / 3.0))

    @staticmethod
    def estimate_image_tokens() -> int:
        """Fixed estimate for one vision-model image (~500 tokens)."""
        return 500

    def chat(
        self,
        model: str,
        messages: list[dict],
        *,
        timeout: int | None = None,
        response_format: dict | None = None,
    ) -> str:
        """Send a chat completion request and return the response content.

        Failed attempts are retried via :meth:`_retry`, and the token usage
        reported by each response is added to the running totals.

        Args:
            model: Model identifier forwarded to the endpoint.
            messages: Chat messages forwarded unchanged to the SDK.
            timeout: Per-call timeout; ``None`` (or ``0``) uses the instance
                default.
            response_format: Optional ``response_format`` payload. When
                given, ``temperature`` is also forced to 0.

        Returns:
            The non-empty content of the first choice.

        Raises:
            RuntimeError: If every attempt fails or returns empty content.
        """
        label = f"chat({model})"

        def _call():
            # Monotonic clock: elapsed time must not be affected by
            # wall-clock adjustments.
            t0 = time.monotonic()
            kwargs = dict(
                model=model,
                messages=messages,
                timeout=timeout or self._timeout,
            )
            if response_format is not None:
                kwargs["response_format"] = response_format
                # Presumably pinned to make structured output repeatable.
                kwargs["temperature"] = 0
            resp = self._client.chat.completions.create(**kwargs)
            content = resp.choices[0].message.content
            usg = resp.usage
            if usg:
                # ``or 0`` guards against a backend reporting a count as
                # None, which would otherwise raise after a successful call
                # and trigger a needless retry.
                self._prompt_tokens += usg.prompt_tokens or 0
                self._completion_tokens += usg.completion_tokens or 0
            elapsed = time.monotonic() - t0
            logger.info(
                "%s: %d chars in %.1fs",
                label,
                len(content) if content else 0,
                elapsed,
            )
            if not content:
                # Raised inside the retried callable, so an empty reply is
                # retried like any other failure.
                raise RuntimeError("Empty response from LLM")
            return content

        return self._retry(_call, label)

    def _retry(self, fn, label: str) -> str:
        """Call *fn()* with exponential-backoff retry.

        Makes up to :attr:`MAX_RETRIES` attempts, sleeping ``2 ** attempt``
        seconds (1, 2, 4, ...) between them; there is no sleep after the
        final attempt.

        Args:
            fn: Zero-argument callable to invoke.
            label: Prefix used in log and error messages.

        Returns:
            Whatever *fn* returns on its first successful call.

        Raises:
            RuntimeError: When all attempts fail; chained to the last error.
        """
        last_error: Optional[Exception] = None
        for attempt in range(self.MAX_RETRIES):
            try:
                return fn()
            except Exception as e:
                # NOTE(review): every Exception is retried, including ones
                # that are unlikely to succeed on retry (e.g. bad
                # credentials); narrowing is left to a follow-up.
                last_error = e
                logger.warning(
                    "%s error (attempt %d/%d): %s",
                    label,
                    attempt + 1,
                    self.MAX_RETRIES,
                    e,
                )
                if attempt < self.MAX_RETRIES - 1:
                    time.sleep(2 ** attempt)
        raise RuntimeError(f"{label}: all retries exhausted") from last_error