Initial commit: document_analyzer with CI/CD pipeline
CI / test (push) Successful in 30s

- 4-skill pipeline (doc_parser, conflict_detection, ir_generation, resolution_application)
- CI workflow on push/PR (.gitea/workflows/ci.yml)
- Auto-issue on CI failure (.gitea/workflows/auto-issue.yml)
- Pytest smoke tests (tests/test_sample.py)

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
2026-05-29 20:00:26 +08:00
commit 40567a4fb6
22 changed files with 2898 additions and 0 deletions
@@ -0,0 +1,106 @@
import argparse
import json
import logging
import os
import sys
import time
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from image_parser import ImageParser
from LLM import LLMClient
from word_parser import WordParser
# Seconds slept between consecutive images in parse_document() so that
# back-to-back vision-LLM requests are spaced out.
RATE_LIMIT_DELAY = 0.5

# Module-level logger; its records go through the root configuration below.
logger = logging.getLogger(__name__)

# Configure root logging at import time: INFO and above, timestamped lines.
logging.basicConfig(
    format="%(asctime)s [%(levelname)s] %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    level=logging.INFO,
)
def parse_document(
    docx_path: str,
    output_dir: str = "output",
    *,
    dry_run: bool = False,
) -> dict:
    """Parse a .docx file: extract text structure and parse embedded images.

    Produces ``<basename>_parsed.json`` in *output_dir* (the directory is
    created when missing) and returns the same data that was written.

    Args:
        docx_path: Path to the Word document to parse.
        output_dir: Directory that receives the JSON report. Its ``images``
            sub-directory path is handed to ``WordParser.extract_images``.
        dry_run: When true, no vision-LLM request is made. Every image gets a
            ``[DRY RUN]`` placeholder entry, an estimated token count is
            logged, and no rate-limit pause is inserted between images.

    Returns:
        A dict with the keys ``source`` (absolute path of *docx_path*),
        ``sections``, ``image_sources`` and ``image_analysis``. The last one
        holds one entry per extracted image, tagged with the image's ``rid``
        and ``path``.
    """
    word = WordParser(docx_path)
    basename = os.path.splitext(os.path.basename(docx_path))[0]
    os.makedirs(output_dir, exist_ok=True)
    images_dir = os.path.join(output_dir, "images")

    # ---- extract sections and images -----------------------------------------
    sections, image_sources = word.extract_sections()
    logger.info("Document has %d sections, %d image sources", len(sections), len(image_sources))

    # ---- parse images ----------------------------------------------------------
    images = word.extract_images(images_dir)
    logger.info("Found %d images in document", len(images))

    image_analysis: list[dict] = []
    if images:
        # NOTE(review): the parser is created even for a dry run; if its
        # constructor needs API credentials, dry runs need them too — confirm.
        llm = ImageParser()
        total = len(images)
        for position, img in enumerate(images, start=1):
            logger.info("[image %d/%d] rid=%s", position, total, img["rid"])
            if dry_run:
                est = LLMClient.estimate_image_tokens()
                logger.info(" [DRY RUN] would call vision LLM (~%d tokens)", est)
                result = {"type": "other", "description": "[DRY RUN]"}
            else:
                result = llm.parse_image(img["path"])
                if result is None:
                    # Fall back to an empty placeholder so every image still
                    # gets an entry in the report.
                    result = {"type": "other", "description": ""}
            result["rid"] = img["rid"]
            result["path"] = img["path"]
            image_analysis.append(result)
            # Throttle only between real API calls: a dry run sends nothing,
            # so pausing there would merely slow it down. No pause is needed
            # after the last image either.
            if not dry_run and position < total:
                time.sleep(RATE_LIMIT_DELAY)
        usage = llm.usage
        logger.info("Tokens: %d prompt + %d completion = %d total",
                    usage["prompt_tokens"], usage["completion_tokens"], usage["total_tokens"])
    else:
        logger.info("No images found in document")

    # ---- build output --------------------------------------------------------
    output = {
        "source": os.path.abspath(docx_path),
        "sections": sections,
        "image_sources": image_sources,
        "image_analysis": image_analysis,
    }
    parsed_path = os.path.join(output_dir, f"{basename}_parsed.json")
    # ensure_ascii=False keeps non-ASCII document text readable in the report.
    with open(parsed_path, "w", encoding="utf-8") as f:
        json.dump(output, f, ensure_ascii=False, indent=2)
    logger.info("Saved: %s", parsed_path)
    return output
# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------
def build_arg_parser() -> argparse.ArgumentParser:
    """Build the command-line parser for the document-parsing script.

    Returns:
        A parser accepting the positional ``input`` path, an optional
        positional ``output_dir`` (default ``output``) and the ``--dry-run``
        flag.
    """
    parser = argparse.ArgumentParser(
        description="Parse a .docx file: extract text structure and parse images.",
    )
    parser.add_argument("input", metavar="input.docx", help="Path to the Word document")
    parser.add_argument(
        "output_dir",
        nargs="?",
        default="output",
        metavar="output_dir",
        help="Directory for output files (default: output/)",
    )
    # The dry-run path logs a token estimate per image; it never prints the
    # prompts themselves, so the help text describes exactly that.
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Skip vision LLM calls; log an estimated token count per image instead.",
    )
    return parser


def main(argv=None) -> None:
    """Run the CLI: parse arguments and process the given document.

    Args:
        argv: Argument list to parse; ``None`` makes argparse read
            ``sys.argv[1:]``.
    """
    args = build_arg_parser().parse_args(argv)
    parse_document(args.input, args.output_dir, dry_run=args.dry_run)


if __name__ == "__main__":
    main()