- 4-skill pipeline (doc_parser, conflict_detection, ir_generation, resolution_application)
- CI workflow on push/PR (.gitea/workflows/ci.yml)
- Auto-issue on CI failure (.gitea/workflows/auto-issue.yml)
- Pytest smoke tests (tests/test_sample.py)

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,106 @@
|
||||
import argparse
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
|
||||
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
||||
|
||||
from image_parser import ImageParser
|
||||
from LLM import LLMClient
|
||||
from word_parser import WordParser
|
||||
|
||||
# Pause, in seconds, passed to time.sleep() between consecutive images.
RATE_LIMIT_DELAY = 0.5

# Configure root logging once at import time: INFO level, with each record
# rendered as "YYYY-MM-DD HH:MM:SS [LEVEL] message".
logging.basicConfig(
    format="%(asctime)s [%(levelname)s] %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    level=logging.INFO,
)

# Module-level logger used by everything in this file.
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def parse_document(
    docx_path: str,
    output_dir: str = "output",
    *,
    dry_run: bool = False,
) -> dict:
    """Parse a .docx file: extract text structure and parse embedded images.

    Produces ``<basename>_parsed.json`` in *output_dir*.

    Args:
        docx_path: Path to the Word document to parse.
        output_dir: Directory that receives the JSON result; created if it
            does not exist. ``<output_dir>/images`` is handed to
            ``WordParser.extract_images`` as the image directory.
        dry_run: When true, no vision-LLM request is made. Every image gets
            a ``"[DRY RUN]"`` placeholder result, only the estimated token
            count is logged, and the rate-limit pause is skipped.

    Returns:
        The dictionary that was written to the JSON file, with the keys
        ``source``, ``sections``, ``image_sources`` and ``image_analysis``.
    """
    word = WordParser(docx_path)

    basename = os.path.splitext(os.path.basename(docx_path))[0]
    os.makedirs(output_dir, exist_ok=True)
    images_dir = os.path.join(output_dir, "images")

    # ---- extract sections and images -----------------------------------------
    sections, image_sources = word.extract_sections()
    logger.info("Document has %d sections, %d image sources", len(sections), len(image_sources))

    # ---- parse images ----------------------------------------------------------
    images = word.extract_images(images_dir)
    logger.info("Found %d images in document", len(images))

    image_analysis: list[dict] = []

    if images:
        # NOTE(review): the parser is still constructed in dry-run mode because
        # its ``usage`` counters are logged below — confirm the constructor
        # does not require API credentials.
        llm = ImageParser()
        last_index = len(images) - 1
        for i, img in enumerate(images):
            logger.info("[image %d/%d] rid=%s", i + 1, len(images), img["rid"])
            if dry_run:
                est = LLMClient.estimate_image_tokens()
                logger.info(" [DRY RUN] would call vision LLM (~%d tokens)", est)
                result = {"type": "other", "description": "[DRY RUN]"}
            else:
                result = llm.parse_image(img["path"])
            if result is None:
                # The parser returned nothing for this image; record an empty
                # placeholder so every image still has an entry in the output.
                result = {"type": "other", "description": ""}
            result["rid"] = img["rid"]
            result["path"] = img["path"]
            image_analysis.append(result)
            # Throttle only real API traffic: a dry run sends no requests, so
            # sleeping there would just slow it down. No pause after the last
            # image either.
            if not dry_run and i < last_index:
                time.sleep(RATE_LIMIT_DELAY)

        usg = llm.usage
        logger.info("Tokens: %d prompt + %d completion = %d total",
                    usg["prompt_tokens"], usg["completion_tokens"], usg["total_tokens"])
    else:
        logger.info("No images found in document")

    # ---- build output --------------------------------------------------------
    output = {
        "source": os.path.abspath(docx_path),
        "sections": sections,
        "image_sources": image_sources,
        "image_analysis": image_analysis,
    }

    parsed_path = os.path.join(output_dir, f"{basename}_parsed.json")
    with open(parsed_path, "w", encoding="utf-8") as f:
        json.dump(output, f, ensure_ascii=False, indent=2)
    logger.info("Saved: %s", parsed_path)

    return output
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# CLI
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
if __name__ == "__main__":
    # Command-line entry point: parse one Word document and write the result
    # into the given output directory (``output`` when omitted).
    parser = argparse.ArgumentParser(
        description="Parse a .docx file: extract text structure and parse images.",
    )
    parser.add_argument("input", metavar="input.docx", help="Path to the Word document")
    parser.add_argument(
        "output_dir",
        nargs="?",
        default="output",
        metavar="output_dir",
        help="Directory for output files (default: output/)",
    )
    # The help text describes what the dry-run path in parse_document really
    # does: it prints no prompts, it logs an estimated token count per image
    # and stores "[DRY RUN]" placeholder results.
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Skip vision LLM calls; log estimated token counts and write "
             "placeholder image results.",
    )

    args = parser.parse_args()
    parse_document(args.input, args.output_dir, dry_run=args.dry_run)
|
||||
Reference in New Issue
Block a user