Files
document_analyzer/skills/doc_parser_skill/scripts/image_parser.py
T
pzhang_zywl 40567a4fb6
CI / test (push) Successful in 30s
Initial commit: document_analyzer with CI/CD pipeline
- 4 skill pipeline (doc_parser, conflict_detection, ir_generation, resolution_application)
- CI workflow on push/PR (.gitea/workflows/ci.yml)
- Auto-issue on CI failure (.gitea/workflows/auto-issue.yml)
- Pytest smoke tests (tests/test_sample.py)

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-29 20:00:26 +08:00

124 lines
4.2 KiB
Python

import base64
import logging
import os
from typing import Optional
from LLM import LLMClient
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Prompts
# ---------------------------------------------------------------------------
# Instruction prompt (in Chinese) sent as the text part of the user message in
# ImageParser.parse_image, next to the base64-encoded image.
#
# It asks the model to:
#   * classify the image; for flowchart / architecture / state / sequence /
#     activity diagrams, describe nodes, edges and their direction, branch
#     conditions, text annotations, overall structure, and any sub-diagrams;
#   * for anything else (UI mockups, screenshots, design drafts, ...), give a
#     brief description only;
#   * answer with a standalone label line
#     ``type: <flowchart|architecture|state|sequence|activity|other>``
#     followed by the text description;
#   * emit no ``---YAML---`` separator, YAML content, explanations or greetings.
#
# The label line is what ImageParser._parse_type_and_description looks for, so
# the output-format section must stay in sync with that parser.
PROMPT_IMAGE = """请分析这张图片,判断类型并输出文字描述。
## 判断图片类型
如果是 **流程图 / 架构图 / 状态图 / 时序图 / 活动图**,详细描述:
- 图中所有节点/步骤/状态/组件的名称
- 所有连线/箭头/转换关系及其方向
- 所有分支条件、判断逻辑和判断结果
- 所有文字标注、注释、标签
- 图的整体结构和逻辑流程
- 如果图片包含多个子图,拆解描述
如果是 **其他类型**(UI原型图 / 界面截图 / 设计稿 / 手机屏幕截图 / 网页截图等),简要描述图片内容。
## 输出格式
**1. 类型标签(单独一行):**
type: <flowchart|architecture|state|sequence|activity|other>
**2. 文字描述:**
该图片的详细文字描述。
不要输出 ---YAML--- 分隔符或 YAML 内容,不要添加任何额外的解释或问候语。"""
# ---------------------------------------------------------------------------
# ImageParser
# ---------------------------------------------------------------------------
class ImageParser:
    """Vision-LLM wrapper that classifies an image and describes it in text.

    The image file is base64-encoded into a ``data:`` URL and sent together
    with ``PROMPT_IMAGE`` to ``LLMClient.IMAGE_MODEL``. The reply is split
    into a type label and a free-text description (no YAML IR).

    Usage::

        parser = ImageParser()
        result = parser.parse_image("images/img1.png")
    """

    # Labels accepted on the model's ``type:`` line. ``other`` is the label
    # the prompt requests for non-diagram images and is also the fallback when
    # no valid label is found; ``text`` is kept for backward compatibility.
    _VALID_TYPES = {
        "flowchart", "architecture", "state", "sequence", "activity",
        "text", "other",
    }

    # Keys that introduce the label line (English and Chinese spellings).
    _LABEL_KEYS = ("type", "类型")

    # File extension (lower case, no dot) -> MIME type used in the data URL.
    _MIME_TYPES = {
        "png": "image/png",
        "jpg": "image/jpeg",
        "jpeg": "image/jpeg",
        "gif": "image/gif",
        "bmp": "image/bmp",
        "webp": "image/webp",
        "svg": "image/svg+xml",
        "tif": "image/tiff",
        "tiff": "image/tiff",
    }

    def __init__(self, llm: "LLMClient | None" = None):
        """Create a parser.

        Args:
            llm: Client used for the vision call. A new ``LLMClient()`` is
                created when omitted.
        """
        # ``is not None`` (rather than truthiness) so that a client object
        # that happens to evaluate as falsy is not silently replaced.
        self._llm = llm if llm is not None else LLMClient()

    @property
    def usage(self) -> dict:
        """Return the ``usage`` value exposed by the underlying LLM client."""
        return self._llm.usage

    def parse_image(self, image_path: str) -> Optional[dict]:
        """Parse an image and return its type and description (no YAML IR).

        Args:
            image_path: Path of the image file to read and send to the model.

        Returns:
            ``{"type": ..., "description": ...}`` on success.
            ``{"type": "other", "description": "", "error": ...}`` when the
            chat call raises ``RuntimeError``.
            ``None`` when the reply starts with the ``[[UI]]`` skip marker.

        Raises:
            OSError: If the image file cannot be opened or read. File access
                is deliberately outside the ``try`` so only LLM failures are
                converted into an error result.
        """
        logger.info("Parsing image: %s", image_path)
        with open(image_path, "rb") as f:
            # base64 output is pure ASCII, so decoding cannot fail.
            img_b64 = base64.b64encode(f.read()).decode("ascii")
        mime = self._mime_type(image_path)
        try:
            content = self._llm.chat(
                model=LLMClient.IMAGE_MODEL,
                messages=[{
                    "role": "user",
                    "content": [
                        {
                            "type": "image_url",
                            "image_url": {"url": f"data:{mime};base64,{img_b64}"},
                        },
                        {"type": "text", "text": PROMPT_IMAGE},
                    ],
                }],
            )
        except RuntimeError as e:
            logger.error("%s", e)
            return {"type": "other", "description": "", "error": str(e)}

        parsed = self._parse_type_and_description(content)
        if parsed is None:
            return None
        image_type, description = parsed
        return {"type": image_type, "description": description}

    # ---- internals ----------------------------------------------------------

    def _parse_type_and_description(self, content: str) -> Optional[tuple[str, str]]:
        """Extract ``(type, description)`` from the LLM response.

        The first line of the form ``type: <label>`` (or ``类型: <label>``) is
        taken as the label line and removed from the description. Matching
        ignores case, accepts the full-width colon and tolerates Markdown
        emphasis or ``<...>`` around the key or value. If the label is not one
        of ``_VALID_TYPES`` the type stays ``"other"``. Every other line,
        including later ``type:`` lines, is kept in the description.

        Args:
            content: Raw model reply. ``None`` or an empty string is treated
                as an empty reply.

        Returns:
            ``(type, description)``, or ``None`` when the reply starts with
            ``[[UI]]`` (skip).
        """
        content = (content or "").strip()
        if content.startswith("[[UI]]"):
            return None

        parsed_type = "other"
        label_seen = False
        desc_lines: list[str] = []
        for line in content.splitlines():
            # Only the first label line is consumed, so description lines that
            # merely begin with "type:" are never dropped afterwards.
            label = None if label_seen else self._label_value(line)
            if label is None:
                desc_lines.append(line)
                continue
            label_seen = True
            if label in self._VALID_TYPES:
                parsed_type = label
        return parsed_type, "\n".join(desc_lines).strip()

    @staticmethod
    def _label_value(line: str) -> Optional[str]:
        """Return the normalised label if *line* is a type-label line, else ``None``."""
        # Treat the full-width colon like the ASCII one before splitting. The
        # normalised text is only used for the label, never for the description.
        key, sep, value = line.replace(":", ":").partition(":")
        if not sep:
            return None
        # Strip whitespace and Markdown emphasis/code markers around the key,
        # e.g. "**type:** flowchart" or "`type`: flowchart".
        if key.strip(" \t*_`").lower() not in ImageParser._LABEL_KEYS:
            return None
        # Drop decoration the model may copy from the prompt, e.g. "<flowchart>".
        return value.strip(" \t*_`<>\"'.。").lower()

    @staticmethod
    def _mime_type(image_path: str) -> str:
        """Return the MIME type for *image_path* based on its file extension.

        Unknown or missing extensions fall back to ``image/png``.
        """
        ext = os.path.splitext(image_path)[1].lstrip(".").lower()
        return ImageParser._MIME_TYPES.get(ext, "image/png")