import argparse
import json
import logging
import os
import sys
import time

# Put this file's directory first on sys.path before the project imports below.
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))

from image_parser import ImageParser
from LLM import LLMClient
from word_parser import WordParser

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)
logger = logging.getLogger(__name__)

# Seconds to wait between two consecutive vision-LLM image calls.
RATE_LIMIT_DELAY = 0.5


def parse_document(
    docx_path: str,
    output_dir: str = "output",
    *,
    dry_run: bool = False,
) -> dict:
    """Parse a .docx file: extract text structure and parse embedded images.

    Produces ``<basename>_parsed.json`` in *output_dir*, where ``<basename>``
    is the file name of *docx_path* without its extension.

    Args:
        docx_path: Path to the Word document to parse.
        output_dir: Directory for the JSON result; created if missing. The
            ``images`` sub-directory path is handed to
            ``WordParser.extract_images``.
        dry_run: When true, no image is sent to the vision LLM. Each image
            gets a ``"[DRY RUN]"`` placeholder result and an estimated token
            count is logged instead.

    Returns:
        The dict that was written to disk, with keys ``source`` (absolute
        path of *docx_path*), ``sections``, ``image_sources`` and
        ``image_analysis`` (one dict per image, carrying at least ``rid``
        and ``path``).
    """
    word = WordParser(docx_path)
    basename = os.path.splitext(os.path.basename(docx_path))[0]
    os.makedirs(output_dir, exist_ok=True)
    images_dir = os.path.join(output_dir, "images")

    # ---- extract sections and images ----------------------------------------
    sections, image_sources = word.extract_sections()
    logger.info("Document has %d sections, %d image sources",
                len(sections), len(image_sources))

    # ---- parse images --------------------------------------------------------
    images = word.extract_images(images_dir)
    logger.info("Found %d images in document", len(images))

    image_analysis: list[dict] = []
    if images:
        # NOTE(review): the parser is also constructed in dry-run mode so the
        # usage summary below keeps working; confirm that ImageParser() does
        # not need credentials at construction time.
        llm = ImageParser()
        last_index = len(images) - 1
        for i, img in enumerate(images):
            logger.info("[image %d/%d] rid=%s", i + 1, len(images), img["rid"])
            if dry_run:
                est = LLMClient.estimate_image_tokens()
                logger.info(" [DRY RUN] would call vision LLM (~%d tokens)", est)
                result = {"type": "other", "description": "[DRY RUN]"}
            else:
                result = llm.parse_image(img["path"])
                if result is None:
                    # Keep one entry per image even when parsing yields nothing.
                    result = {"type": "other", "description": ""}
            result["rid"] = img["rid"]
            result["path"] = img["path"]
            image_analysis.append(result)
            # Throttle only after a real API call: a dry run sends nothing, so
            # sleeping there would just add dead time per image.
            if not dry_run and i < last_index:
                time.sleep(RATE_LIMIT_DELAY)
        usg = llm.usage
        logger.info("Tokens: %d prompt + %d completion = %d total",
                    usg["prompt_tokens"], usg["completion_tokens"],
                    usg["total_tokens"])
    else:
        logger.info("No images found in document")

    # ---- build output --------------------------------------------------------
    output = {
        "source": os.path.abspath(docx_path),
        "sections": sections,
        "image_sources": image_sources,
        "image_analysis": image_analysis,
    }
    parsed_path = os.path.join(output_dir, f"{basename}_parsed.json")
    with open(parsed_path, "w", encoding="utf-8") as f:
        json.dump(output, f, ensure_ascii=False, indent=2)
    logger.info("Saved: %s", parsed_path)
    return output


# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Parse a .docx file: extract text structure and parse images.",
    )
    parser.add_argument("input", metavar="input.docx",
                        help="Path to the Word document")
    parser.add_argument("output_dir", nargs="?", default="output",
                        metavar="output_dir",
                        help="Directory for output files (default: output/)")
    parser.add_argument("--dry-run", action="store_true",
                        help="Print LLM prompts without calling the API.")
    args = parser.parse_args()
    parse_document(args.input, args.output_dir, dry_run=args.dry_run)