# document_analyzer/skills/doc_parser_skill/scripts/doc_parser.py

import argparse
import json
import logging
import os
import sys
import time

sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))

from image_parser import ImageParser
from LLM import LLMClient
from word_parser import WordParser

# Configure the root logger at import time: INFO level, timestamped lines of
# the form "YYYY-MM-DD HH:MM:SS [LEVEL] message".
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)

# Module-level logger used by parse_document for progress messages.
logger = logging.getLogger(__name__)

# Pause, in seconds, inserted between consecutive images in parse_document
# (passed to time.sleep).
RATE_LIMIT_DELAY = 0.5


def parse_document(
    docx_path: str,
    output_dir: str = "output",
    *,
    dry_run: bool = False,
) -> dict:
    """Parse a .docx file: extract text structure and parse embedded images.

    Produces ``<basename>_parsed.json`` in *output_dir*, creating the
    directory if it does not exist. ``<output_dir>/images`` is handed to
    ``WordParser.extract_images`` as the image target directory.

    Args:
        docx_path: Path to the Word document.
        output_dir: Directory that receives the JSON report.
        dry_run: When true, ``parse_image`` is never called; every image gets
            a ``"[DRY RUN]"`` placeholder entry and the estimated token count
            is logged instead. No rate-limit pause is applied in this mode.

    Returns:
        The dict that was serialized to disk, with the keys ``source``,
        ``sections``, ``image_sources`` and ``image_analysis``.
    """
    word = WordParser(docx_path)

    basename = os.path.splitext(os.path.basename(docx_path))[0]
    os.makedirs(output_dir, exist_ok=True)
    images_dir = os.path.join(output_dir, "images")

    # ---- extract sections and images -----------------------------------------
    sections, image_sources = word.extract_sections()
    logger.info("Document has %d sections, %d image sources", len(sections), len(image_sources))

    # ---- parse images ----------------------------------------------------------
    images = word.extract_images(images_dir)
    logger.info("Found %d images in document", len(images))

    image_analysis: list[dict] = []

    if images:
        llm = ImageParser()
        total = len(images)
        for position, img in enumerate(images, start=1):
            logger.info("[image %d/%d] rid=%s", position, total, img["rid"])
            if dry_run:
                est = LLMClient.estimate_image_tokens()
                logger.info("  [DRY RUN] would call vision LLM (~%d tokens)", est)
                result = {"type": "other", "description": "[DRY RUN]"}
            else:
                result = llm.parse_image(img["path"])
                if result is None:
                    # Keep one entry per image even when the parser yields nothing.
                    result = {"type": "other", "description": ""}
            # Tie the analysis back to the image it describes.
            result["rid"] = img["rid"]
            result["path"] = img["path"]
            image_analysis.append(result)
            # Throttle only between real API calls: a dry run issues none, so
            # sleeping there would just slow the run down. No pause after the
            # last image either.
            if not dry_run and position < total:
                time.sleep(RATE_LIMIT_DELAY)

        usg = llm.usage
        logger.info("Tokens: %d prompt + %d completion = %d total",
                    usg["prompt_tokens"], usg["completion_tokens"], usg["total_tokens"])
    else:
        logger.info("No images found in document")

    # ---- build output --------------------------------------------------------
    output = {
        "source": os.path.abspath(docx_path),
        "sections": sections,
        "image_sources": image_sources,
        "image_analysis": image_analysis,
    }

    parsed_path = os.path.join(output_dir, f"{basename}_parsed.json")
    with open(parsed_path, "w", encoding="utf-8") as f:
        # ensure_ascii=False keeps non-ASCII document text readable in the file.
        json.dump(output, f, ensure_ascii=False, indent=2)
    logger.info("Saved: %s", parsed_path)

    return output


# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------

def build_arg_parser() -> argparse.ArgumentParser:
    """Build the command-line parser for this script.

    Positional arguments: the input ``.docx`` path and an optional output
    directory (default ``output``). ``--dry-run`` is a boolean flag.
    """
    parser = argparse.ArgumentParser(
        description="Parse a .docx file: extract text structure and parse images.",
    )
    parser.add_argument("input", metavar="input.docx", help="Path to the Word document")
    parser.add_argument("output_dir", nargs="?", default="output", metavar="output_dir",
                        help="Directory for output files (default: output/)")
    # The help text describes what the dry-run path actually does: it logs a
    # token estimate per image and never prints prompts.
    parser.add_argument("--dry-run", action="store_true",
                        help="Skip vision LLM calls; log the estimated token count "
                             "per image instead.")
    return parser


def main(argv=None) -> None:
    """Parse command-line arguments and run ``parse_document``.

    Args:
        argv: Argument list without the program name; ``None`` makes argparse
            read ``sys.argv[1:]``.
    """
    args = build_arg_parser().parse_args(argv)
    parse_document(args.input, args.output_dir, dry_run=args.dry_run)


if __name__ == "__main__":
    main()