Initial commit: document_analyzer with CI/CD pipeline

- 4-skill pipeline (doc_parser, conflict_detection, ir_generation, resolution_application)
- CI workflow on push/PR (.gitea/workflows/ci.yml)
- Auto-issue on CI failure (.gitea/workflows/auto-issue.yml)
- Pytest smoke tests (tests/test_sample.py)

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-29 20:00:26 +08:00
commit 40567a4fb6
22 changed files with 2898 additions and 0 deletions
@@ -0,0 +1,105 @@
+import logging
+import os
+import time
+from typing import Optional
+
+from openai import OpenAI
+
+logger = logging.getLogger(__name__)
+
+
+class LLMClient:
+    """Low-level OpenAI-compatible LLM client with retry and token tracking.
+
+    The API key is read from the ``DASHSCOPE_API_KEY`` environment variable.
+    Token counts reported by the API are accumulated across calls and exposed
+    through :attr:`usage`.
+
+    Usage::
+
+        llm = LLMClient()
+        content = llm.chat("qwen3.5-flash", [{"role": "user", "content": "Hello"}])
+        print(llm.usage)
+    """
+
+    IMAGE_MODEL = "qwen3-vl-plus"
+    TEXT_MODEL = "qwen3.5-flash-2026-02-23"
+    TIMEOUT = 120  # default ``timeout`` value handed to the SDK per request
+    MAX_RETRIES = 3  # total number of attempts made by _retry()
+
+    def __init__(
+        self,
+        *,
+        base_url: str = "https://dashscope.aliyuncs.com/compatible-mode/v1",
+        timeout: int | None = None,
+    ):
+        """Create the underlying SDK client.
+
+        Args:
+            base_url: Endpoint of the OpenAI-compatible API.
+            timeout: Default request timeout; falls back to :attr:`TIMEOUT`
+                when omitted (or falsy).
+
+        Raises:
+            ValueError: If ``DASHSCOPE_API_KEY`` is unset or empty.
+        """
+        key = os.environ.get("DASHSCOPE_API_KEY", "")
+        if not key:
+            raise ValueError("DASHSCOPE_API_KEY environment variable is not set.")
+        self._client = OpenAI(api_key=key, base_url=base_url)
+        self._timeout = timeout or self.TIMEOUT
+        self._prompt_tokens = 0
+        self._completion_tokens = 0
+
+    @property
+    def usage(self) -> dict:
+        """Return accumulated counts as ``{prompt_tokens, completion_tokens, total_tokens}``."""
+        return {
+            "prompt_tokens": self._prompt_tokens,
+            "completion_tokens": self._completion_tokens,
+            "total_tokens": self._prompt_tokens + self._completion_tokens,
+        }
+
+    @staticmethod
+    def estimate_tokens(text: str) -> int:
+        """Quick token estimate: ~1.7 CJK chars per token, ~3.0 other chars per token.
+
+        Always returns at least 1, even for empty *text*.
+        """
+        # U+4E00..U+9FFF: CJK Unified Ideographs; U+3000..U+303F: CJK Symbols
+        # and Punctuation.  Escapes are used because U+3000 is an invisible
+        # ideographic space that is easy to corrupt when written literally.
+        cjk = sum(
+            1 for c in text
+            if "\u4e00" <= c <= "\u9fff" or "\u3000" <= c <= "\u303f"
+        )
+        other = len(text) - cjk
+        return max(1, int(cjk / 1.7 + other / 3.0))
+
+    @staticmethod
+    def estimate_image_tokens() -> int:
+        """Fixed estimate for one vision-model image (~500 tokens)."""
+        return 500
+
+    def chat(
+        self, model: str, messages: list[dict], *, timeout: int | None = None,
+        response_format: dict | None = None,
+    ) -> str:
+        """Send a chat completion request and return the response content.
+
+        Automatically retries on failure and accumulates token usage.  The
+        request is always sent with ``temperature=0``.
+
+        Args:
+            model: Model name passed to the API.
+            messages: Chat messages in OpenAI format.
+            timeout: Per-call timeout; defaults to the client-wide timeout.
+            response_format: Optional ``response_format`` payload; omitted from
+                the request when ``None``.
+
+        Raises:
+            RuntimeError: If every attempt fails or returns empty content.
+        """
+        label = f"chat({model})"
+
+        def _call():
+            # Monotonic clock: elapsed time must not jump with wall-clock changes.
+            t0 = time.monotonic()
+            kwargs = {
+                "model": model,
+                "messages": messages,
+                "timeout": timeout or self._timeout,
+            }
+            if response_format is not None:
+                kwargs["response_format"] = response_format
+            kwargs["temperature"] = 0
+            resp = self._client.chat.completions.create(**kwargs)
+            content = resp.choices[0].message.content
+            usg = resp.usage
+            if usg:
+                # Tolerate backends that report a usage object with missing
+                # counts; a TypeError here would trigger pointless retries.
+                self._prompt_tokens += usg.prompt_tokens or 0
+                self._completion_tokens += usg.completion_tokens or 0
+            elapsed = time.monotonic() - t0
+            logger.info("%s: %d chars in %.1fs", label, len(content) if content else 0, elapsed)
+            if not content:
+                # Raised inside the retried callable so an empty reply is retried.
+                raise RuntimeError("Empty response from LLM")
+            return content
+
+        return self._retry(_call, label)
+
+    def _retry(self, fn, label: str) -> str:
+        """Call *fn()* with exponential-backoff retry.
+
+        Makes up to :attr:`MAX_RETRIES` attempts, sleeping ``2 ** attempt``
+        seconds between them (no sleep after the final attempt).
+
+        Raises:
+            RuntimeError: When all attempts fail; chained from the last error.
+        """
+        last_error: Optional[Exception] = None
+        for attempt in range(self.MAX_RETRIES):
+            try:
+                return fn()
+            except Exception as e:
+                last_error = e
+                logger.warning(
+                    "%s error (attempt %d/%d): %s",
+                    label, attempt + 1, self.MAX_RETRIES, e,
+                )
+                if attempt < self.MAX_RETRIES - 1:
+                    time.sleep(2 ** attempt)
+        raise RuntimeError(f"{label}: all retries exhausted") from last_error
@@ -0,0 +1,106 @@
+import argparse
+import json
+import logging
+import os
+import sys
+import time
+
+sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+
+from image_parser import ImageParser
+from LLM import LLMClient
+from word_parser import WordParser
+
+# Configure root logging when this module is loaded: INFO level, timestamped records.
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s [%(levelname)s] %(message)s",
+    datefmt="%Y-%m-%d %H:%M:%S",
+)
+
+logger = logging.getLogger(__name__)
+
+# Pause (seconds, passed to time.sleep) between consecutive images in parse_document().
+RATE_LIMIT_DELAY = 0.5
+
+
+def parse_document(
+    docx_path: str,
+    output_dir: str = "output",
+    *,
+    dry_run: bool = False,
+) -> dict:
+    """Parse a .docx file: extract text structure and parse embedded images.
+
+    Produces ``<basename>_parsed.json`` in *output_dir* and saves the embedded
+    images under ``<output_dir>/images``.
+
+    Args:
+        docx_path: Path to the Word document.
+        output_dir: Directory for the JSON output and extracted images;
+            created if missing.
+        dry_run: When true, no vision-LLM client is created and no API call
+            is made; each image gets a ``[DRY RUN]`` placeholder result.
+
+    Returns:
+        The dict that was written to the JSON file, with keys ``source``,
+        ``sections``, ``image_sources`` and ``image_analysis``.
+    """
+    word = WordParser(docx_path)
+
+    basename = os.path.splitext(os.path.basename(docx_path))[0]
+    os.makedirs(output_dir, exist_ok=True)
+    images_dir = os.path.join(output_dir, "images")
+
+    # ---- extract sections and images -----------------------------------------
+    sections, image_sources = word.extract_sections()
+    logger.info("Document has %d sections, %d image sources", len(sections), len(image_sources))
+
+    # ---- parse images ----------------------------------------------------------
+    images = word.extract_images(images_dir)
+    logger.info("Found %d images in document", len(images))
+
+    image_analysis: list[dict] = []
+
+    if images:
+        # A dry run never talks to the API, so it must not build the client:
+        # constructing it requires the API key environment variable.
+        image_parser = None if dry_run else ImageParser()
+        total = len(images)
+        for position, img in enumerate(images, start=1):
+            logger.info("[image %d/%d] rid=%s", position, total, img["rid"])
+            if image_parser is None:
+                est = LLMClient.estimate_image_tokens()
+                logger.info("  [DRY RUN] would call vision LLM (~%d tokens)", est)
+                result = {"type": "other", "description": "[DRY RUN]"}
+            else:
+                result = image_parser.parse_image(img["path"])
+                if result is None:
+                    result = {"type": "other", "description": ""}
+            result["rid"] = img["rid"]
+            result["path"] = img["path"]
+            image_analysis.append(result)
+            # Rate limiting only matters between real API calls.
+            if image_parser is not None and position < total:
+                time.sleep(RATE_LIMIT_DELAY)
+
+        if image_parser is not None:
+            usg = image_parser.usage
+            logger.info("Tokens: %d prompt + %d completion = %d total",
+                        usg["prompt_tokens"], usg["completion_tokens"], usg["total_tokens"])
+    else:
+        logger.info("No images found in document")
+
+    # ---- build output --------------------------------------------------------
+    output = {
+        "source": os.path.abspath(docx_path),
+        "sections": sections,
+        "image_sources": image_sources,
+        "image_analysis": image_analysis,
+    }
+
+    parsed_path = os.path.join(output_dir, f"{basename}_parsed.json")
+    with open(parsed_path, "w", encoding="utf-8") as f:
+        json.dump(output, f, ensure_ascii=False, indent=2)
+    logger.info("Saved: %s", parsed_path)
+
+    return output
+
+
+# ---------------------------------------------------------------------------
+# CLI
+# ---------------------------------------------------------------------------
+
+if __name__ == "__main__":
+    # Command-line entry point: parse one .docx and write <basename>_parsed.json.
+    parser = argparse.ArgumentParser(
+        description="Parse a .docx file: extract text structure and parse images.",
+    )
+    parser.add_argument("input", metavar="input.docx", help="Path to the Word document")
+    parser.add_argument("output_dir", nargs="?", default="output", metavar="output_dir",
+                        help="Directory for output files (default: output/)")
+    # The help text describes what a dry run actually does: no prompts are
+    # printed; a per-image token estimate is logged instead.
+    parser.add_argument("--dry-run", action="store_true",
+                        help="Skip vision-LLM calls: log a token estimate per image "
+                             "and write placeholder image results.")
+
+    args = parser.parse_args()
+    parse_document(args.input, args.output_dir, dry_run=args.dry_run)
@@ -0,0 +1,123 @@
+import base64
+import logging
+import os
+from typing import Optional
+
+from LLM import LLMClient
+
+logger = logging.getLogger(__name__)
+
+# ---------------------------------------------------------------------------
+# Prompts
+# ---------------------------------------------------------------------------
+
+# Chinese-language instruction sent together with each image.  It asks the
+# model to classify the picture, emit a single ``type: <...>`` label line and
+# then a free-text description.  The string is sent verbatim; do not reformat.
+PROMPT_IMAGE = """请分析这张图片，判断类型并输出文字描述。
+
+## 判断图片类型
+
+如果是 **流程图 / 架构图 / 状态图 / 时序图 / 活动图**，详细描述：
+- 图中所有节点/步骤/状态/组件的名称
+- 所有连线/箭头/转换关系及其方向
+- 所有分支条件、判断逻辑和判断结果
+- 所有文字标注、注释、标签
+- 图的整体结构和逻辑流程
+- 如果图片包含多个子图，拆解描述
+
+如果是 **其他类型**（UI原型图 / 界面截图 / 设计稿 / 手机屏幕截图 / 网页截图等），简要描述图片内容。
+
+## 输出格式
+
+**1. 类型标签（单独一行）：**
+type: <flowchart|architecture|state|sequence|activity|other>
+
+**2. 文字描述：**
+该图片的详细文字描述。
+
+不要输出 ---YAML--- 分隔符或 YAML 内容，不要添加任何额外的解释或问候语。"""
+
+
+# ---------------------------------------------------------------------------
+# ImageParser
+# ---------------------------------------------------------------------------
+
+class ImageParser:
+    """Vision LLM wrapper for parsing images (type + description).
+
+    Usage::
+
+        parser = ImageParser()
+        result = parser.parse_image("images/img1.png")
+    """
+
+    # "other" is listed so that an explicit ``type: other`` line counts as a
+    # recognised label instead of leaving the parser waiting for another one.
+    _VALID_TYPES = {
+        "flowchart", "architecture", "state", "sequence", "activity", "text", "other",
+    }
+    # Label words that may introduce the type line.
+    _TYPE_LABELS = ("type", "类型")
+
+    def __init__(self, llm: LLMClient | None = None):
+        """Wrap *llm*, or create a default :class:`LLMClient` when omitted."""
+        self._llm = llm or LLMClient()
+
+    @property
+    def usage(self) -> dict:
+        """Token usage accumulated by the wrapped LLM client."""
+        return self._llm.usage
+
+    def parse_image(self, image_path: str) -> Optional[dict]:
+        """Parse an image and return its type and description (no YAML IR).
+
+        Returns:
+            ``{type, description}`` on success; the same shape plus an
+            ``error`` key (empty description, type ``"other"``) when the LLM
+            call fails; or *None* when the response starts with ``[[UI]]``.
+        """
+        logger.info("Parsing image: %s", image_path)
+
+        with open(image_path, "rb") as f:
+            img_b64 = base64.b64encode(f.read()).decode()
+        mime = self._mime_type(image_path)
+
+        try:
+            content = self._llm.chat(
+                model=LLMClient.IMAGE_MODEL,
+                messages=[{
+                    "role": "user",
+                    "content": [
+                        {"type": "image_url", "image_url": {"url": f"data:{mime};base64,{img_b64}"}},
+                        {"type": "text", "text": PROMPT_IMAGE},
+                    ],
+                }],
+            )
+        except RuntimeError as e:
+            logger.error("%s", e)
+            return {"type": "other", "description": "", "error": str(e)}
+
+        parsed = self._parse_type_and_description(content)
+        if parsed is None:
+            return None
+        return {"type": parsed[0], "description": parsed[1]}
+
+    # ---- internals ----------------------------------------------------------
+
+    def _parse_type_and_description(self, content: str) -> Optional[tuple[str, str]]:
+        """Extract ``(type, description)`` from LLM response.
+
+        Label lines (``type: x`` / ``类型: x``) are removed from the description
+        until one carries a recognised type; after that every line is kept.
+        The type defaults to ``"other"``.
+
+        Returns *None* for ``[[UI]]`` (skip).
+        """
+        content = content.strip()
+        if content.startswith("[[UI]]"):
+            return None
+
+        parsed_type = "other"
+        type_found = False
+        desc_lines: list[str] = []
+        for line in content.splitlines():
+            value = None if type_found else self._type_label_value(line)
+            if value is None:
+                desc_lines.append(line)
+            elif value in self._VALID_TYPES:
+                parsed_type = value
+                type_found = True
+            # An unrecognised label value is dropped; keep looking for a valid one.
+
+        return parsed_type, "\n".join(desc_lines).strip()
+
+    @classmethod
+    def _type_label_value(cls, line: str) -> Optional[str]:
+        """Return the lower-cased value of a type-label line, or *None*.
+
+        Accepts an ASCII or full-width colon and tolerates Markdown emphasis
+        or angle brackets around the label and value, e.g. ``**type:** <state>``.
+        """
+        stripped = line.strip().strip("*").strip()
+        lowered = stripped.lower()
+        for label in cls._TYPE_LABELS:
+            if not lowered.startswith(label):
+                continue
+            rest = stripped[len(label):]
+            # "\uff1a" is the full-width colon commonly used in Chinese text.
+            if rest[:1] in (":", "\uff1a"):
+                return rest[1:].strip(" \t*<>`").lower()
+        return None
+
+    @staticmethod
+    def _mime_type(image_path: str) -> str:
+        """Map the file extension to a MIME type, defaulting to ``image/png``."""
+        ext = os.path.splitext(image_path)[1].lstrip(".").lower()
+        return {
+            "png": "image/png", "jpg": "image/jpeg", "jpeg": "image/jpeg",
+            "gif": "image/gif", "bmp": "image/bmp",
+            "webp": "image/webp", "svg": "image/svg+xml",
+            "tiff": "image/tiff", "tif": "image/tiff",
+        }.get(ext, "image/png")
@@ -0,0 +1,239 @@
+import logging
+import os
+
+from docx import Document
+from docx.table import Table
+from docx.text.paragraph import Paragraph
+
+logger = logging.getLogger(__name__)
+
+# Maps an image part's content type to the file extension used when the
+# image is written to disk; unknown content types fall back to ".png" at the
+# lookup site in WordParser.extract_images().
+IMAGE_EXT = {
+    "image/png": ".png",
+    "image/jpeg": ".jpg",
+    "image/gif": ".gif",
+    "image/bmp": ".bmp",
+    "image/tiff": ".tiff",
+    "image/webp": ".webp",
+    "image/x-emf": ".emf",
+    "image/x-wmf": ".wmf",
+    "image/svg+xml": ".svg",
+}
+
+
+class WordParser:
+    """Parse a .docx file — extract images, split body into sections.
+
+    Usage::
+
+        parser = WordParser("doc.docx")
+        parser.extract_images("images/")
+        sections, image_sources = parser.extract_sections()
+    """
+
+    HEADER_CELL_MAX_LEN = 20  # max chars per cell to treat first row as header
+    MAX_OUTLINE_LVL = 8  # w:outlineLvl 0-8 are heading levels 1-9; 9 means body text
+
+    WML_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
+    DRAW_NS = "http://schemas.openxmlformats.org/drawingml/2006/main"
+    REL_NS = "http://schemas.openxmlformats.org/officeDocument/2006/relationships"
+
+    def __init__(self, docx_path: str):
+        """Open *docx_path*.
+
+        Raises:
+            FileNotFoundError: If *docx_path* is not an existing file.
+        """
+        if not os.path.isfile(docx_path):
+            raise FileNotFoundError(f"Document not found: {docx_path}")
+        self._doc = Document(docx_path)
+
+    # ---- public API ---------------------------------------------------------
+
+    def extract_images(self, images_dir: str) -> list[dict]:
+        """Save all embedded images to *images_dir*.  Returns ``[{rid, path}, ...]``.
+
+        Image relationships that point outside the package (linked images)
+        have no bytes to save and are skipped.
+        """
+        os.makedirs(images_dir, exist_ok=True)
+        images: list[dict] = []
+        for rel in self._doc.part.rels.values():
+            if "image" not in rel.reltype:
+                continue
+            # Accessing target_part on an external relationship raises, so
+            # linked images must be filtered out before touching it.
+            if getattr(rel, "is_external", False):
+                continue
+            ext = IMAGE_EXT.get(rel.target_part.content_type, ".png")
+            name = f"image_{rel.rId}{ext}"
+            path = os.path.join(images_dir, name)
+            with open(path, "wb") as f:
+                f.write(rel.target_part.blob)
+            images.append({"rid": rel.rId, "path": path})
+        return images
+
+    def extract_sections(self) -> tuple[list[dict], dict[str, dict]]:
+        """Walk document body and split into sections by heading.
+
+        A section is emitted only if it has at least one block or image, so a
+        heading directly followed by another heading produces no section.
+        Paragraph and table numbering restarts at every non-empty heading.
+
+        Returns:
+            *sections* —  ``[{source, blocks, images}, ...]``
+                Each block is ``{type, index, text}`` (paragraph) or
+                ``{type, table, headers, rows}`` (table).
+            *image_sources* —  ``rid → {section, table?, row?, column?, name?}``
+                For images in a detected header row, ``row`` is ``0``.
+        """
+        sections: list[dict] = []
+        current_source = ""
+        blocks: list[dict] = []
+        section_images: list[str] = []
+        image_sources: dict[str, dict] = {}
+        para_idx = 0
+        tbl_idx = 0
+
+        for child in self._doc.element.body:
+            tag = child.tag.split("}")[-1] if "}" in child.tag else child.tag
+
+            if tag == "p":
+                para = Paragraph(child, self._doc)
+
+                if self._heading_level(para) is not None:
+                    heading_text = para.text.strip()
+                    if heading_text:  # ignore empty heading-like paragraphs
+                        if blocks or section_images:
+                            sections.append({
+                                "source": current_source,
+                                "blocks": blocks,
+                                "images": list(section_images),
+                            })
+                            blocks = []
+                            section_images = []
+                        # Restart numbering even when the previous section was
+                        # not emitted (e.g. it only held an empty table).
+                        para_idx = 0
+                        tbl_idx = 0
+                        current_source = heading_text
+                    continue
+
+                text = para.text.strip()
+
+                # Scan for images — append [[IMAGE:rid]] markers
+                for run in para.runs:
+                    for rid in self._images_in(run._element):
+                        text += f" [[IMAGE:{rid}]]"
+                        section_images.append(rid)
+                        image_sources[rid] = {"section": current_source}
+
+                if text.strip():
+                    blocks.append({"type": "para", "index": para_idx + 1, "text": text.strip()})
+                    para_idx += 1
+
+            elif tag == "tbl":
+                tbl_idx += 1
+                all_rows, all_images = self._read_table(Table(child, self._doc))
+                for row_imgs in all_images:
+                    for cell_imgs in row_imgs:
+                        section_images.extend(cell_imgs)
+
+                if len(all_rows) >= 2:
+                    # Heuristic: first row is a header if every cell is short
+                    first_row = all_rows[0]
+                    has_header = all(len(c) < self.HEADER_CELL_MAX_LEN for c in first_row)
+                    if has_header:
+                        headers = first_row
+                        data_rows_slice = zip(all_rows[1:], all_images[1:])
+                        # Header cells are not emitted as data rows, but their
+                        # images still need a source entry (row 0 = header).
+                        for ci, imgs in enumerate(all_images[0]):
+                            for rid in imgs:
+                                image_sources[rid] = {
+                                    "section": current_source,
+                                    "table": tbl_idx,
+                                    "row": 0,
+                                    "column": ci + 1,
+                                    "name": headers[ci],
+                                }
+                    else:
+                        headers = [f"列{ci + 1}" for ci in range(len(first_row))]
+                        data_rows_slice = zip(all_rows, all_images)
+
+                    data_rows: list[dict] = []
+
+                    for ri, (row_data, row_imgs) in enumerate(data_rows_slice):
+                        columns: list[dict] = []
+                        # Rows may be wider or narrower than the header row.
+                        max_cols = max(len(headers), len(row_data))
+                        for ci in range(max_cols):
+                            hdr = headers[ci] if ci < len(headers) else ""
+                            txt = row_data[ci] if ci < len(row_data) else ""
+                            columns.append({
+                                "name": hdr,
+                                "row": ri + 1,
+                                "col": ci + 1,
+                                "text": txt,
+                            })
+
+                            # Register image sources with structured location
+                            imgs = row_imgs[ci] if ci < len(row_imgs) else []
+                            for rid in imgs:
+                                image_sources[rid] = {
+                                    "section": current_source,
+                                    "table": tbl_idx,
+                                    "row": ri + 1,
+                                    "column": ci + 1,
+                                    "name": hdr,
+                                }
+
+                        data_rows.append({"columns": columns})
+
+                    blocks.append({
+                        "type": "table",
+                        "table": tbl_idx,
+                        "headers": headers,
+                        "rows": data_rows,
+                    })
+                elif all_rows:
+                    # Degenerate table (only header or single row) — treat as plain rows
+                    for ri, (row_data, row_imgs) in enumerate(zip(all_rows, all_images)):
+                        # Images here were added to the section list above, so
+                        # they must get a source entry as well.
+                        for ci, imgs in enumerate(row_imgs):
+                            for rid in imgs:
+                                image_sources[rid] = {
+                                    "section": current_source,
+                                    "table": tbl_idx,
+                                    "row": ri + 1,
+                                    "column": ci + 1,
+                                }
+                        row_text = " | ".join(row_data)
+                        if row_text.strip():
+                            blocks.append({
+                                "type": "para",
+                                "index": para_idx + 1,
+                                "text": row_text,
+                            })
+                            para_idx += 1
+
+        if blocks or section_images:
+            sections.append({
+                "source": current_source,
+                "blocks": blocks,
+                "images": list(section_images),
+            })
+
+        return sections, image_sources
+
+    # ---- internals ----------------------------------------------------------
+
+    def _read_table(self, table: Table) -> tuple[list[list[str]], list[list[list[str]]]]:
+        """Return ``(rows, images)`` for *table*, skipping fully empty rows.
+
+        ``rows[r][c]`` is the cell text with one ``[[IMAGE:rid]]`` marker
+        appended per image; ``images[r][c]`` lists the rids found in that cell.
+        """
+        all_rows: list[list[str]] = []
+        all_images: list[list[list[str]]] = []  # row → col → [rids]
+        for row in table.rows:
+            row_texts: list[str] = []
+            row_cell_images: list[list[str]] = []
+            for cell in row.cells:
+                cell_imgs: list[str] = []
+                for cp in cell.paragraphs:
+                    for run in cp.runs:
+                        cell_imgs.extend(self._images_in(run._element))
+                markers = [f"[[IMAGE:{rid}]]" for rid in cell_imgs]
+                row_texts.append(" ".join([cell.text.strip(), *markers]).strip())
+                row_cell_images.append(cell_imgs)
+            if any(row_texts) or any(row_cell_images):
+                all_rows.append(row_texts)
+                all_images.append(row_cell_images)
+        return all_rows, all_images
+
+    def _heading_level(self, para: Paragraph) -> int | None:
+        """Heading level 1-9, or *None* if not a heading.
+
+        The style name (``Heading N`` / ``标题 N``) is checked first; otherwise
+        the paragraph's own ``w:outlineLvl`` is used.
+        """
+        if para.style and para.style.name:
+            name = para.style.name
+            for prefix in ("Heading", "标题"):
+                if name.startswith(prefix):
+                    try:
+                        return int(name.split()[-1])
+                    except (ValueError, IndexError):
+                        pass
+        pPr = para._element.find(f"{{{self.WML_NS}}}pPr")
+        if pPr is not None:
+            ol = pPr.find(f"{{{self.WML_NS}}}outlineLvl")
+            if ol is not None:
+                val = ol.get(f"{{{self.WML_NS}}}val")
+                if val is not None:
+                    try:
+                        level = int(val)
+                    except ValueError:
+                        return None
+                    # outlineLvl 9 marks body text, not a tenth heading level.
+                    if 0 <= level <= self.MAX_OUTLINE_LVL:
+                        return level + 1
+        return None
+
+    def _images_in(self, element) -> list[str]:
+        """Return rId values for drawings embedded in *element*."""
+        rids: list[str] = []
+        for drawing in element.findall(f".//{{{self.WML_NS}}}drawing"):
+            blip = drawing.find(f".//{{{self.DRAW_NS}}}blip")
+            if blip is not None:
+                rid = blip.get(f"{{{self.REL_NS}}}embed")
+                if rid:
+                    rids.append(rid)
+        return rids