40567a4fb6
CI / test (push) Successful in 30s
- 4-skill pipeline (doc_parser, conflict_detection, ir_generation, resolution_application)
- CI workflow on push/PR (.gitea/workflows/ci.yml)
- Auto-issue on CI failure (.gitea/workflows/auto-issue.yml)
- Pytest smoke tests (tests/test_sample.py)

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
107 lines
3.6 KiB
Python
107 lines
3.6 KiB
Python
import argparse
|
|
import json
|
|
import logging
|
|
import os
|
|
import sys
|
|
import time
|
|
|
|
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
|
|
|
from image_parser import ImageParser
|
|
from LLM import LLMClient
|
|
from word_parser import WordParser
|
|
|
|
# Root logging setup: INFO and above, each record prefixed with a
# second-resolution timestamp and its level name.
logging.basicConfig(
    datefmt="%Y-%m-%d %H:%M:%S",
    format="%(asctime)s [%(levelname)s] %(message)s",
    level=logging.INFO,
)

# Logger named after this module; used for all progress messages below.
logger = logging.getLogger(__name__)

# Pause, in seconds, passed to time.sleep() between consecutive images.
RATE_LIMIT_DELAY = 0.5
|
|
|
|
|
|
def parse_document(
    docx_path: str,
    output_dir: str = "output",
    *,
    dry_run: bool = False,
) -> dict:
    """Parse a .docx file: extract text structure and parse embedded images.

    Sections and image sources are read through ``WordParser``. Embedded
    images are extracted into ``<output_dir>/images`` and each one is passed
    to ``ImageParser.parse_image`` unless *dry_run* is set.

    Produces ``<basename>_parsed.json`` in *output_dir*.

    Args:
        docx_path: Path to the Word document.
        output_dir: Directory that receives the JSON result and the
            ``images`` sub-directory; created if it does not exist.
        dry_run: When true, no image is sent to the vision LLM. Each image
            gets a ``"[DRY RUN]"`` placeholder result and an estimated
            token count is logged instead.

    Returns:
        The dict that was written to the JSON file, with the keys
        ``source``, ``sections``, ``image_sources`` and ``image_analysis``.
    """
    word = WordParser(docx_path)

    basename = os.path.splitext(os.path.basename(docx_path))[0]
    os.makedirs(output_dir, exist_ok=True)
    images_dir = os.path.join(output_dir, "images")

    # ---- extract sections and images -----------------------------------------
    sections, image_sources = word.extract_sections()
    logger.info("Document has %d sections, %d image sources", len(sections), len(image_sources))

    # ---- parse images ----------------------------------------------------------
    images = word.extract_images(images_dir)
    logger.info("Found %d images in document", len(images))

    image_analysis: list[dict] = []

    if images:
        llm = ImageParser()
        total = len(images)
        for i, img in enumerate(images):
            logger.info("[image %d/%d] rid=%s", i + 1, total, img["rid"])
            if dry_run:
                est = LLMClient.estimate_image_tokens()
                logger.info("  [DRY RUN] would call vision LLM (~%d tokens)", est)
                result = {"type": "other", "description": "[DRY RUN]"}
            else:
                result = llm.parse_image(img["path"])
                if result is None:
                    # Keep one entry per image even when parsing yields nothing.
                    result = {"type": "other", "description": ""}
            result["rid"] = img["rid"]
            result["path"] = img["path"]
            image_analysis.append(result)
            # Throttle only real API traffic: a dry run makes no requests, so
            # sleeping there would just slow it down. No pause after the last
            # image either.
            if not dry_run and i < total - 1:
                time.sleep(RATE_LIMIT_DELAY)

        usg = llm.usage
        logger.info("Tokens: %d prompt + %d completion = %d total",
                    usg["prompt_tokens"], usg["completion_tokens"], usg["total_tokens"])
    else:
        logger.info("No images found in document")

    # ---- build output --------------------------------------------------------
    output = {
        "source": os.path.abspath(docx_path),
        "sections": sections,
        "image_sources": image_sources,
        "image_analysis": image_analysis,
    }

    parsed_path = os.path.join(output_dir, f"{basename}_parsed.json")
    with open(parsed_path, "w", encoding="utf-8") as f:
        json.dump(output, f, ensure_ascii=False, indent=2)
    logger.info("Saved: %s", parsed_path)

    return output
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# CLI
|
|
# ---------------------------------------------------------------------------
|
|
|
|
if __name__ == "__main__":
    # Command-line entry point: read the arguments and run the parser once.
    parser = argparse.ArgumentParser(
        description="Parse a .docx file: extract text structure and parse images.",
    )
    parser.add_argument("input", metavar="input.docx", help="Path to the Word document")
    parser.add_argument(
        "output_dir",
        nargs="?",
        default="output",
        metavar="output_dir",
        help="Directory for output files (default: output/)",
    )
    # The help text describes what a dry run actually does in parse_document:
    # it logs a token estimate per image and stores placeholder results; it
    # does not print prompts.
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Skip vision LLM calls; log estimated token usage and write "
             "placeholder image results.",
    )

    args = parser.parse_args()
    parse_document(args.input, args.output_dir, dry_run=args.dry_run)
|