# document_analyzer/skills/doc_parser_skill/scripts/image_parser.py

import base64
import logging
import os
from typing import Optional

from LLM import LLMClient

# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)

# ---------------------------------------------------------------------------
# Prompts
# ---------------------------------------------------------------------------

# Instruction text (in Chinese) sent as the text part of the user message in
# ImageParser.parse_image. It asks the model to:
#   * classify the image; diagrams (flowchart / architecture / state /
#     sequence / activity) get a detailed description of nodes, edges,
#     branch conditions, labels and overall flow, everything else (UI
#     mockups, screenshots, design drafts, ...) gets a brief description;
#   * reply with a standalone line ``type: <label>`` where the label is one of
#     flowchart|architecture|state|sequence|activity|other, followed by the
#     free-text description;
#   * emit no ``---YAML---`` separator, YAML content, explanations or
#     greetings.
# The text is runtime data for the model; do not translate or reformat it.
PROMPT_IMAGE = """请分析这张图片，判断类型并输出文字描述。

## 判断图片类型

如果是 **流程图 / 架构图 / 状态图 / 时序图 / 活动图**，详细描述：
- 图中所有节点/步骤/状态/组件的名称
- 所有连线/箭头/转换关系及其方向
- 所有分支条件、判断逻辑和判断结果
- 所有文字标注、注释、标签
- 图的整体结构和逻辑流程
- 如果图片包含多个子图，拆解描述

如果是 **其他类型**（UI原型图 / 界面截图 / 设计稿 / 手机屏幕截图 / 网页截图等），简要描述图片内容。

## 输出格式

**1. 类型标签（单独一行）：**
type: <flowchart|architecture|state|sequence|activity|other>

**2. 文字描述：**
该图片的详细文字描述。

不要输出 ---YAML--- 分隔符或 YAML 内容，不要添加任何额外的解释或问候语。"""


# ---------------------------------------------------------------------------
# ImageParser
# ---------------------------------------------------------------------------

class ImageParser:
    """Vision-LLM wrapper that classifies an image and describes it in text.

    The image file is base64-encoded into a ``data:`` URL and sent to
    ``LLMClient.IMAGE_MODEL`` together with ``PROMPT_IMAGE``; the reply is
    then split into a type label and a free-text description.

    Usage::

        parser = ImageParser()
        result = parser.parse_image("images/img1.png")
    """

    # Labels accepted on the reply's type line. ``other`` is the label
    # PROMPT_IMAGE requests for non-diagram images; ``text`` is not requested
    # by the prompt and is kept only so previously accepted replies still parse.
    _VALID_TYPES = frozenset(
        {"flowchart", "architecture", "state", "sequence", "activity", "text", "other"}
    )

    # Accepted spellings of the type-line prefix, with both the ASCII colon
    # and the full-width colon (U+FF1A).
    _TYPE_PREFIXES = ("type:", "type：", "类型:", "类型：")

    # Marker that makes the parser skip an image entirely.
    _UI_MARKER = "[[UI]]"

    # File extension (lower-case, no dot) -> MIME type used in the data URL.
    _MIME_BY_EXT = {
        "png": "image/png",
        "jpg": "image/jpeg",
        "jpeg": "image/jpeg",
        "gif": "image/gif",
        "bmp": "image/bmp",
        "webp": "image/webp",
        "svg": "image/svg+xml",
        "tif": "image/tiff",
        "tiff": "image/tiff",
    }
    _DEFAULT_MIME = "image/png"

    def __init__(self, llm: "LLMClient | None" = None):
        """Create a parser.

        Args:
            llm: Client used for the vision call. When omitted (or falsy) a
                new ``LLMClient()`` is created.
        """
        self._llm = llm or LLMClient()

    @property
    def usage(self) -> dict:
        """Return the ``usage`` attribute of the underlying LLM client."""
        return self._llm.usage

    def parse_image(self, image_path: str) -> Optional[dict]:
        """Parse an image and return its type and description (no YAML IR).

        Args:
            image_path: Path of the image file to read and send to the model.

        Returns:
            * ``{"type": ..., "description": ...}`` on success;
            * ``{"type": "other", "description": "", "error": ...}`` when the
              LLM call raises ``RuntimeError``;
            * ``None`` when the reply starts with the ``[[UI]]`` marker.

        Raises:
            OSError: If ``image_path`` cannot be opened or read; the file is
                read before the guarded LLM call, so this is not converted
                into an error dict.
        """
        logger.info("Parsing image: %s", image_path)

        with open(image_path, "rb") as f:
            img_b64 = base64.b64encode(f.read()).decode()
        mime = self._mime_type(image_path)

        try:
            content = self._llm.chat(
                model=LLMClient.IMAGE_MODEL,
                messages=[{
                    "role": "user",
                    "content": [
                        {"type": "image_url", "image_url": {"url": f"data:{mime};base64,{img_b64}"}},
                        {"type": "text", "text": PROMPT_IMAGE},
                    ],
                }],
            )
        except RuntimeError as e:
            # Best effort: report the failure in-band so the caller can go on
            # with other images.
            logger.error("%s", e)
            return {"type": "other", "description": "", "error": str(e)}

        parsed = self._parse_type_and_description(content)
        if parsed is None:
            return None
        image_type, description = parsed
        return {"type": image_type, "description": description}

    # ---- internals ----------------------------------------------------------

    def _parse_type_and_description(self, content: Optional[str]) -> Optional[tuple[str, str]]:
        """Extract ``(type, description)`` from an LLM reply.

        Lines that start with a type prefix (``type:`` / ``类型:``, ASCII or
        full-width colon) are treated as label lines until the first one
        carrying a recognised label is found; such lines are left out of the
        description. Once a label has been recognised, every remaining line
        belongs to the description, even if it starts with ``type:``.

        Args:
            content: Raw reply text. ``None`` is treated like an empty reply.

        Returns:
            ``None`` when the reply starts with ``[[UI]]`` (skip); otherwise
            ``(type, description)`` where ``type`` defaults to ``"other"``
            when no recognised label line is present.
        """
        # Tolerate a missing reply instead of failing on ``None.strip()``.
        content = (content or "").strip()
        if content.startswith(self._UI_MARKER):
            return None

        parsed_type = "other"
        # Tracked separately from ``parsed_type``: "other" is itself a valid
        # label, so it cannot double as the "nothing found yet" state.
        type_found = False
        desc_lines: list[str] = []
        for line in content.splitlines():
            if not type_found:
                label = self._type_label_value(line)
                if label is not None:
                    if label in self._VALID_TYPES:
                        parsed_type = label
                        type_found = True
                    # A label line is dropped whether or not it was recognised.
                    continue
            desc_lines.append(line)

        return parsed_type, "\n".join(desc_lines).strip()

    @staticmethod
    def _type_label_value(line: str) -> Optional[str]:
        """Return the lower-cased value of a type-label line, else ``None``."""
        stripped = line.strip()
        for prefix in ImageParser._TYPE_PREFIXES:
            if stripped.startswith(prefix):
                return stripped[len(prefix):].strip().lower()
        return None

    @staticmethod
    def _mime_type(image_path: str) -> str:
        """Map the file extension of ``image_path`` to a MIME type.

        The match is case-insensitive; unknown or missing extensions fall
        back to ``image/png``.
        """
        ext = os.path.splitext(image_path)[1].lstrip(".").lower()
        return ImageParser._MIME_BY_EXT.get(ext, ImageParser._DEFAULT_MIME)