import base64
import logging
import os
from typing import Optional

from LLM import LLMClient

logger = logging.getLogger(__name__)

# ---------------------------------------------------------------------------
# Prompts
# ---------------------------------------------------------------------------

# Instruction sent alongside every image. The text is runtime data and is
# reproduced exactly as found (sections separated by single spaces).
# NOTE(review): the label line ends with a bare "type:" and the prompt lists
# neither the labels accepted by ImageParser._VALID_TYPES nor the "[[UI]]"
# marker the parser checks for -- confirm the intended wording.
PROMPT_IMAGE = (
    "请分析这张图片,判断类型并输出文字描述。 "
    "## 判断图片类型 "
    "如果是 **流程图 / 架构图 / 状态图 / 时序图 / 活动图**,详细描述: "
    "- 图中所有节点/步骤/状态/组件的名称 "
    "- 所有连线/箭头/转换关系及其方向 "
    "- 所有分支条件、判断逻辑和判断结果 "
    "- 所有文字标注、注释、标签 "
    "- 图的整体结构和逻辑流程 "
    "- 如果图片包含多个子图,拆解描述 "
    "如果是 **其他类型**(UI原型图 / 界面截图 / 设计稿 / 手机屏幕截图 / 网页截图等),简要描述图片内容。 "
    "## 输出格式 "
    "**1. 类型标签(单独一行):** type: "
    "**2. 文字描述:** "
    "该图片的详细文字描述。 "
    "不要输出 ---YAML--- 分隔符或 YAML 内容,不要添加任何额外的解释或问候语。"
)


# ---------------------------------------------------------------------------
# ImageParser
# ---------------------------------------------------------------------------

class ImageParser:
    """Vision LLM wrapper that turns an image into a type label and a description.

    Usage::

        parser = ImageParser()
        result = parser.parse_image("images/img1.png")
    """

    # Labels accepted from the model's type line; any other value leaves the
    # type at the "other" fallback.
    _VALID_TYPES = {"flowchart", "architecture", "state", "sequence", "activity", "text"}

    # Prefixes that mark the type line. Both the ASCII colon and the
    # full-width colon (U+FF1A) are accepted.
    _TYPE_PREFIXES = ("type:", "type\uff1a", "类型:", "类型\uff1a")

    # File extension (lower-case, without the dot) -> MIME type for the data URL.
    _MIME_BY_EXT = {
        "png": "image/png",
        "jpg": "image/jpeg",
        "jpeg": "image/jpeg",
        "gif": "image/gif",
        "bmp": "image/bmp",
        "webp": "image/webp",
        "svg": "image/svg+xml",
        "tif": "image/tiff",
        "tiff": "image/tiff",
    }

    def __init__(self, llm: LLMClient | None = None):
        """Store the LLM client, creating a default ``LLMClient()`` when none is given."""
        # Identity check so that an explicitly supplied client is always used,
        # even if it happens to evaluate as falsy.
        self._llm = llm if llm is not None else LLMClient()

    @property
    def usage(self) -> dict:
        """Return the ``usage`` attribute of the underlying LLM client."""
        return self._llm.usage

    def parse_image(self, image_path: str) -> Optional[dict]:
        """Send the image at *image_path* to the vision model and parse the reply.

        Returns:
            ``{"type": ..., "description": ...}`` on success; ``None`` when
            the reply starts with ``[[UI]]``; or ``{"type": "other",
            "description": "", "error": ...}`` when the client raises
            ``RuntimeError``.

        Raises:
            OSError: if the image file cannot be opened or read (the read
                happens outside the ``try`` block, so it propagates).
        """
        logger.info("Parsing image: %s", image_path)

        with open(image_path, "rb") as f:
            img_b64 = base64.b64encode(f.read()).decode()
        mime = self._mime_type(image_path)

        # The image is passed inline as a base64 data URL, followed by the prompt.
        messages = [{
            "role": "user",
            "content": [
                {"type": "image_url", "image_url": {"url": f"data:{mime};base64,{img_b64}"}},
                {"type": "text", "text": PROMPT_IMAGE},
            ],
        }]

        try:
            content = self._llm.chat(model=LLMClient.IMAGE_MODEL, messages=messages)
        except RuntimeError as e:
            logger.error(str(e))
            return {"type": "other", "description": "", "error": str(e)}

        parsed = self._parse_type_and_description(content)
        if parsed is None:
            return None
        parsed_type, description = parsed
        return {"type": parsed_type, "description": description}

    # ---- internals ----------------------------------------------------------

    def _parse_type_and_description(self, content: Optional[str]) -> Optional[tuple[str, str]]:
        """Extract ``(type, description)`` from the LLM response text.

        Lines starting with one of ``_TYPE_PREFIXES`` are consumed as type
        lines until one carries a value found in ``_VALID_TYPES``; every other
        line becomes part of the description. The type defaults to ``"other"``.

        Returns:
            ``None`` when the response starts with ``[[UI]]``, otherwise the
            ``(type, description)`` tuple. A ``None`` or empty response yields
            ``("other", "")``.
        """
        # Guard against a client that returns None instead of a string.
        text = (content or "").strip()
        if text.startswith("[[UI]]"):
            return None

        parsed_type = "other"
        desc_lines: list[str] = []
        for line in text.splitlines():
            stripped = line.strip()

            # Type lines are only looked for while the type is still unresolved;
            # later "type:" lines are ordinary description text.
            prefix = None
            if parsed_type == "other":
                prefix = next(
                    (p for p in self._TYPE_PREFIXES if stripped.startswith(p)),
                    None,
                )
            if prefix is None:
                desc_lines.append(line)
                continue

            # A type line never enters the description, even when its value
            # is not a recognised label.
            type_val = stripped[len(prefix):].strip().lower()
            if type_val in self._VALID_TYPES:
                parsed_type = type_val

        return parsed_type, "\n".join(desc_lines).strip()

    @staticmethod
    def _mime_type(image_path: str) -> str:
        """Map the file extension of *image_path* to a MIME type.

        The lookup is case-insensitive; unknown or missing extensions fall
        back to ``"image/png"``.
        """
        ext = os.path.splitext(image_path)[1].lstrip(".").lower()
        return ImageParser._MIME_BY_EXT.get(ext, "image/png")