Initial commit: document_analyzer with CI/CD pipeline

- 4-skill pipeline (doc_parser, conflict_detection, ir_generation, resolution_application)
- CI workflow on push/PR (.gitea/workflows/ci.yml)
- Auto-issue on CI failure (.gitea/workflows/auto-issue.yml)
- Pytest smoke tests (tests/test_sample.py)

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-29 20:00:26 +08:00
commit 40567a4fb6
22 changed files with 2898 additions and 0 deletions
@@ -0,0 +1,106 @@
+import argparse
+import json
+import logging
+import os
+import sys
+import time
+
+# Put this file's own directory at the front of sys.path before the imports
+# below run. Presumably image_parser, LLM and word_parser are sibling modules
+# that must resolve when this file is executed directly -- TODO confirm.
+sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+
+from image_parser import ImageParser
+from LLM import LLMClient
+from word_parser import WordParser
+
+# Configure logging at import time: INFO level, timestamped "[LEVEL] message"
+# lines.
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s [%(levelname)s] %(message)s",
+    datefmt="%Y-%m-%d %H:%M:%S",
+)
+
+logger = logging.getLogger(__name__)
+
+# Pause, in seconds, that parse_document inserts between consecutive images.
+RATE_LIMIT_DELAY = 0.5
+
+
+def _analyze_images(images: list[dict], *, dry_run: bool) -> list[dict]:
+    """Return one analysis record per extracted image.
+
+    Each record is the vision result (or a placeholder) with the image's
+    ``rid`` and ``path`` added.  In dry-run mode no ``ImageParser`` is
+    created, nothing is sent to the LLM and no rate-limit pause is taken.
+    """
+    # Only build the client when it will actually be used, so a dry run does
+    # not depend on whatever ImageParser() needs at construction time.
+    llm = None if dry_run else ImageParser()
+    total = len(images)
+    records: list[dict] = []
+
+    for index, img in enumerate(images, start=1):
+        logger.info("[image %d/%d] rid=%s", index, total, img["rid"])
+        if dry_run:
+            est = LLMClient.estimate_image_tokens()
+            logger.info("  [DRY RUN] would call vision LLM (~%d tokens)", est)
+            result = {"type": "other", "description": "[DRY RUN]"}
+        else:
+            result = llm.parse_image(img["path"])
+            # A None result is replaced by an empty placeholder record.
+            if result is None:
+                result = {"type": "other", "description": ""}
+
+        # Build a new dict instead of mutating the parser's return value.
+        records.append({**result, "rid": img["rid"], "path": img["path"]})
+
+        # Throttle real API calls only, and never after the last image.
+        if not dry_run and index < total:
+            time.sleep(RATE_LIMIT_DELAY)
+
+    if llm is not None:
+        usg = llm.usage
+        logger.info("Tokens: %d prompt + %d completion = %d total",
+                    usg["prompt_tokens"], usg["completion_tokens"], usg["total_tokens"])
+
+    return records
+
+
+def parse_document(
+    docx_path: str,
+    output_dir: str = "output",
+    *,
+    dry_run: bool = False,
+) -> dict:
+    """Parse a .docx file: extract text structure and parse embedded images.
+
+    Produces ``<basename>_parsed.json`` in *output_dir*.
+
+    Args:
+        docx_path: Path to the Word document to parse.
+        output_dir: Directory for the JSON result; created if missing.  The
+            ``images`` sub-path inside it is handed to
+            ``WordParser.extract_images``.
+        dry_run: When true, skip the vision LLM entirely and record a
+            ``"[DRY RUN]"`` placeholder for every image.
+
+    Returns:
+        The dict that was written to disk, with keys ``source``,
+        ``sections``, ``image_sources`` and ``image_analysis``.
+    """
+    word = WordParser(docx_path)
+
+    basename = os.path.splitext(os.path.basename(docx_path))[0]
+    os.makedirs(output_dir, exist_ok=True)
+    images_dir = os.path.join(output_dir, "images")
+
+    # ---- extract sections and images -----------------------------------------
+    sections, image_sources = word.extract_sections()
+    logger.info("Document has %d sections, %d image sources", len(sections), len(image_sources))
+
+    # ---- parse images ----------------------------------------------------------
+    images = word.extract_images(images_dir)
+    logger.info("Found %d images in document", len(images))
+
+    image_analysis: list[dict] = []
+    if images:
+        image_analysis = _analyze_images(images, dry_run=dry_run)
+    else:
+        logger.info("No images found in document")
+
+    # ---- build output --------------------------------------------------------
+    output = {
+        "source": os.path.abspath(docx_path),
+        "sections": sections,
+        "image_sources": image_sources,
+        "image_analysis": image_analysis,
+    }
+
+    parsed_path = os.path.join(output_dir, f"{basename}_parsed.json")
+    with open(parsed_path, "w", encoding="utf-8") as f:
+        json.dump(output, f, ensure_ascii=False, indent=2)
+    logger.info("Saved: %s", parsed_path)
+
+    return output
+
+
+# ---------------------------------------------------------------------------
+# CLI
+# ---------------------------------------------------------------------------
+
+if __name__ == "__main__":
+    # Command-line entry point: a positional input .docx, an optional output
+    # directory, and a --dry-run switch forwarded to parse_document().
+    parser = argparse.ArgumentParser(
+        description="Parse a .docx file: extract text structure and parse images.",
+    )
+    parser.add_argument("input", metavar="input.docx", help="Path to the Word document")
+    parser.add_argument("output_dir", nargs="?", default="output", metavar="output_dir",
+                        help="Directory for output files (default: output/)")
+    # The dry-run path logs a token estimate per image and stores placeholder
+    # results; it does not print prompts, so the help text describes that.
+    parser.add_argument("--dry-run", action="store_true",
+                        help="Skip vision LLM calls; log an estimated token count "
+                             "per image and write placeholder results.")
+
+    args = parser.parse_args()
+    parse_document(args.input, args.output_dir, dry_run=args.dry_run)