- 4 skill pipeline (doc_parser, conflict_detection, ir_generation, resolution_application) - CI workflow on push/PR (.gitea/workflows/ci.yml) - Auto-issue on CI failure (.gitea/workflows/auto-issue.yml) - Pytest smoke tests (tests/test_sample.py) Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,105 @@
|
||||
import logging
|
||||
import os
|
||||
import time
|
||||
from typing import Optional
|
||||
|
||||
from openai import OpenAI
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class LLMClient:
|
||||
"""Low-level OpenAI-compatible LLM client with retry and token tracking.
|
||||
|
||||
Usage::
|
||||
|
||||
llm = LLMClient()
|
||||
content = llm.chat("qwen3.5-flash", [{"role": "user", "content": "Hello"}])
|
||||
print(llm.usage)
|
||||
"""
|
||||
|
||||
IMAGE_MODEL = "qwen3-vl-plus"
|
||||
TEXT_MODEL = "qwen3.5-flash-2026-02-23"
|
||||
TIMEOUT = 120
|
||||
MAX_RETRIES = 3
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
*,
|
||||
base_url: str = "https://dashscope.aliyuncs.com/compatible-mode/v1",
|
||||
timeout: int | None = None,
|
||||
):
|
||||
key = os.environ.get("DASHSCOPE_API_KEY", "")
|
||||
if not key:
|
||||
raise ValueError("DASHSCOPE_API_KEY environment variable is not set.")
|
||||
self._client = OpenAI(api_key=key, base_url=base_url)
|
||||
self._timeout = timeout or self.TIMEOUT
|
||||
self._prompt_tokens = 0
|
||||
self._completion_tokens = 0
|
||||
|
||||
@property
|
||||
def usage(self) -> dict:
|
||||
"""Return accumulated token counts as ``{prompt, completion, total}``."""
|
||||
return {
|
||||
"prompt_tokens": self._prompt_tokens,
|
||||
"completion_tokens": self._completion_tokens,
|
||||
"total_tokens": self._prompt_tokens + self._completion_tokens,
|
||||
}
|
||||
|
||||
@staticmethod
|
||||
def estimate_tokens(text: str) -> int:
|
||||
"""Quick token estimate. CJK ≈1.7/token, others ≈3.0/token."""
|
||||
cjk = sum(1 for c in text if '一' <= c <= '鿿' or ' ' <= c <= '〿')
|
||||
other = len(text) - cjk
|
||||
return max(1, int(cjk / 1.7 + other / 3.0))
|
||||
|
||||
@staticmethod
|
||||
def estimate_image_tokens() -> int:
|
||||
"""Fixed estimate for one vision-model image (~500 tokens)."""
|
||||
return 500
|
||||
|
||||
def chat(
|
||||
self, model: str, messages: list[dict], *, timeout: int | None = None,
|
||||
response_format: dict | None = None,
|
||||
) -> str:
|
||||
"""Send a chat completion request and return the response content.
|
||||
|
||||
Automatically retries on failure and accumulates token usage.
|
||||
"""
|
||||
label = f"chat({model})"
|
||||
|
||||
def _call():
|
||||
t0 = time.time()
|
||||
kwargs = dict(model=model, messages=messages, timeout=timeout or self._timeout)
|
||||
if response_format is not None:
|
||||
kwargs["response_format"] = response_format
|
||||
kwargs["temperature"] = 0
|
||||
resp = self._client.chat.completions.create(**kwargs)
|
||||
content = resp.choices[0].message.content
|
||||
usg = resp.usage
|
||||
if usg:
|
||||
self._prompt_tokens += usg.prompt_tokens
|
||||
self._completion_tokens += usg.completion_tokens
|
||||
elapsed = time.time() - t0
|
||||
logger.info("%s: %d chars in %.1fs", label, len(content) if content else 0, elapsed)
|
||||
if not content:
|
||||
raise RuntimeError("Empty response from LLM")
|
||||
return content
|
||||
|
||||
return self._retry(_call, label)
|
||||
|
||||
def _retry(self, fn, label: str) -> str:
|
||||
"""Call *fn()* with exponential-backoff retry."""
|
||||
last_error: Optional[Exception] = None
|
||||
for attempt in range(self.MAX_RETRIES):
|
||||
try:
|
||||
return fn()
|
||||
except Exception as e:
|
||||
last_error = e
|
||||
logger.warning(
|
||||
"%s error (attempt %d/%d): %s",
|
||||
label, attempt + 1, self.MAX_RETRIES, e,
|
||||
)
|
||||
if attempt < self.MAX_RETRIES - 1:
|
||||
time.sleep(2 ** attempt)
|
||||
raise RuntimeError(f"{label}: all retries exhausted") from last_error
|
||||
@@ -0,0 +1,106 @@
|
||||
import argparse
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
|
||||
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
||||
|
||||
from image_parser import ImageParser
|
||||
from LLM import LLMClient
|
||||
from word_parser import WordParser
|
||||
|
||||
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)

logger = logging.getLogger(__name__)

# Seconds to pause between consecutive vision-model requests.
RATE_LIMIT_DELAY = 0.5


def parse_document(
    docx_path: str,
    output_dir: str = "output",
    *,
    dry_run: bool = False,
) -> dict:
    """Parse a .docx file: extract text structure and parse embedded images.

    Produces ``<basename>_parsed.json`` in *output_dir* and saves the
    extracted images under ``<output_dir>/images``.

    Args:
        docx_path: Path to the Word document.
        output_dir: Directory for the JSON file and the image folder;
            created when missing.
        dry_run: When true, no vision model is contacted (and no API key is
            needed); each image gets a ``"[DRY RUN]"`` placeholder entry.

    Returns:
        The dict that was written to disk, with the keys ``source``,
        ``sections``, ``image_sources`` and ``image_analysis``.
    """
    word = WordParser(docx_path)

    basename = os.path.splitext(os.path.basename(docx_path))[0]
    os.makedirs(output_dir, exist_ok=True)
    images_dir = os.path.join(output_dir, "images")

    # ---- extract sections and images -----------------------------------------
    sections, image_sources = word.extract_sections()
    logger.info("Document has %d sections, %d image sources", len(sections), len(image_sources))

    # ---- parse images ----------------------------------------------------------
    images = word.extract_images(images_dir)
    logger.info("Found %d images in document", len(images))

    image_analysis: list[dict] = []

    if images:
        # The parser builds an LLM client, which requires the API key; only
        # create it when requests are actually going to be sent.
        vision = None if dry_run else ImageParser()
        for i, img in enumerate(images):
            logger.info("[image %d/%d] rid=%s", i + 1, len(images), img["rid"])
            if vision is None:
                est = LLMClient.estimate_image_tokens()
                logger.info(" [DRY RUN] would call vision LLM (~%d tokens)", est)
                result = {"type": "other", "description": "[DRY RUN]"}
            else:
                result = vision.parse_image(img["path"])
                if result is None:
                    # parse_image() may return None (skipped image); keep a
                    # placeholder so every extracted image has an entry.
                    result = {"type": "other", "description": ""}
            result["rid"] = img["rid"]
            result["path"] = img["path"]
            image_analysis.append(result)
            # Rate limiting only matters when requests are really made.
            if vision is not None and i < len(images) - 1:
                time.sleep(RATE_LIMIT_DELAY)

        if vision is not None:
            usg = vision.usage
            logger.info("Tokens: %d prompt + %d completion = %d total",
                        usg["prompt_tokens"], usg["completion_tokens"], usg["total_tokens"])
    else:
        logger.info("No images found in document")

    # ---- build output --------------------------------------------------------
    output = {
        "source": os.path.abspath(docx_path),
        "sections": sections,
        "image_sources": image_sources,
        "image_analysis": image_analysis,
    }

    parsed_path = os.path.join(output_dir, f"{basename}_parsed.json")
    with open(parsed_path, "w", encoding="utf-8") as f:
        json.dump(output, f, ensure_ascii=False, indent=2)
    logger.info("Saved: %s", parsed_path)

    return output
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# CLI
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
if __name__ == "__main__":
    # Command-line entry point: parse one document and write its JSON result.
    parser = argparse.ArgumentParser(
        description="Parse a .docx file: extract text structure and parse images.",
    )
    parser.add_argument("input", metavar="input.docx", help="Path to the Word document")
    parser.add_argument("output_dir", nargs="?", default="output", metavar="output_dir",
                        help="Directory for output files (default: output/)")
    # The dry run logs a token estimate per image; it does not print prompts.
    parser.add_argument("--dry-run", action="store_true",
                        help="Skip the vision LLM calls and log estimated token usage instead.")

    args = parser.parse_args()
    parse_document(args.input, args.output_dir, dry_run=args.dry_run)
|
||||
@@ -0,0 +1,123 @@
|
||||
import base64
|
||||
import logging
|
||||
import os
|
||||
from typing import Optional
|
||||
|
||||
from LLM import LLMClient
|
||||
|
||||
logger = logging.getLogger(__name__)

# ---------------------------------------------------------------------------
# Prompts
# ---------------------------------------------------------------------------

# Instruction sent with every image. It asks for a "type: <label>" line
# followed by a free-text description.
PROMPT_IMAGE = """请分析这张图片,判断类型并输出文字描述。

## 判断图片类型

如果是 **流程图 / 架构图 / 状态图 / 时序图 / 活动图**,详细描述:
- 图中所有节点/步骤/状态/组件的名称
- 所有连线/箭头/转换关系及其方向
- 所有分支条件、判断逻辑和判断结果
- 所有文字标注、注释、标签
- 图的整体结构和逻辑流程
- 如果图片包含多个子图,拆解描述

如果是 **其他类型**(UI原型图 / 界面截图 / 设计稿 / 手机屏幕截图 / 网页截图等),简要描述图片内容。

## 输出格式

**1. 类型标签(单独一行):**
type: <flowchart|architecture|state|sequence|activity|other>

**2. 文字描述:**
该图片的详细文字描述。

不要输出 ---YAML--- 分隔符或 YAML 内容,不要添加任何额外的解释或问候语。"""


# ---------------------------------------------------------------------------
# ImageParser
# ---------------------------------------------------------------------------

class ImageParser:
    """Vision LLM wrapper for parsing images (type + description).

    Usage::

        parser = ImageParser()
        result = parser.parse_image("images/img1.png")
    """

    # Labels accepted on the "type:" line. "other" is what the prompt offers
    # for non-diagram images; "text" is kept for backward compatibility.
    _VALID_TYPES = {
        "flowchart", "architecture", "state", "sequence", "activity",
        "other", "text",
    }

    def __init__(self, llm: "LLMClient | None" = None):
        """Wrap *llm*, or create a default ``LLMClient`` when none is given."""
        self._llm = llm or LLMClient()

    @property
    def usage(self) -> dict:
        """Token usage accumulated by the wrapped LLM client."""
        return self._llm.usage

    def parse_image(self, image_path: str) -> Optional[dict]:
        """Parse an image and return its type and description (no YAML IR).

        Args:
            image_path: Path of the image file; read and sent as a base64
                data URL.

        Returns:
            ``{type, description}`` on success; ``{type, description, error}``
            with type ``"other"`` when the LLM call raised ``RuntimeError``;
            *None* when the response starts with the ``[[UI]]`` skip marker.
        """
        logger.info("Parsing image: %s", image_path)

        with open(image_path, "rb") as f:
            img_b64 = base64.b64encode(f.read()).decode()
        mime = self._mime_type(image_path)

        try:
            content = self._llm.chat(
                model=LLMClient.IMAGE_MODEL,
                messages=[{
                    "role": "user",
                    "content": [
                        {"type": "image_url", "image_url": {"url": f"data:{mime};base64,{img_b64}"}},
                        {"type": "text", "text": PROMPT_IMAGE},
                    ],
                }],
            )
        except RuntimeError as e:
            # Best effort: report the failure in the result instead of
            # aborting the caller's loop over images.
            logger.error("%s", e)
            return {"type": "other", "description": "", "error": str(e)}

        parsed = self._parse_type_and_description(content)
        if parsed is None:
            return None
        return {"type": parsed[0], "description": parsed[1]}

    # ---- internals ----------------------------------------------------------

    def _parse_type_and_description(self, content: str) -> Optional[tuple[str, str]]:
        """Extract ``(type, description)`` from LLM response.

        The first line starting with ``type:`` / ``类型:`` (ASCII or
        full-width colon) that carries a valid label sets the type; label
        lines with an unknown value seen before that are dropped. Every
        other line becomes part of the description. The type defaults to
        ``"other"``.

        Returns *None* for ``[[UI]]`` (skip).
        """
        content = content.strip()
        if content.startswith("[[UI]]"):
            return None

        parsed_type = "other"
        # Tracked separately from parsed_type so that an explicit
        # "type: other" also ends the search; otherwise later description
        # lines beginning with "type:" would be swallowed.
        type_found = False
        desc_lines: list[str] = []
        for line in content.splitlines():
            stripped = line.strip()
            # Accept the full-width colon (U+FF1A) after the label as well.
            normalized = stripped.replace("\uff1a", ":", 1)
            if not type_found and normalized.startswith(("type:", "类型:")):
                type_val = normalized.split(":", 1)[1].strip().lower()
                if type_val in self._VALID_TYPES:
                    parsed_type = type_val
                    type_found = True
            else:
                desc_lines.append(line)

        return parsed_type, "\n".join(desc_lines).strip()

    @staticmethod
    def _mime_type(image_path: str) -> str:
        """Map the file extension to a MIME type (``image/png`` if unknown)."""
        ext = os.path.splitext(image_path)[1].lstrip(".").lower()
        return {
            "png": "image/png", "jpg": "image/jpeg", "jpeg": "image/jpeg",
            "gif": "image/gif", "bmp": "image/bmp",
            "webp": "image/webp", "svg": "image/svg+xml",
            "tiff": "image/tiff", "tif": "image/tiff",
        }.get(ext, "image/png")
|
||||
@@ -0,0 +1,239 @@
|
||||
import logging
|
||||
import os
|
||||
|
||||
from docx import Document
|
||||
from docx.table import Table
|
||||
from docx.text.paragraph import Paragraph
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
IMAGE_EXT = {
|
||||
"image/png": ".png",
|
||||
"image/jpeg": ".jpg",
|
||||
"image/gif": ".gif",
|
||||
"image/bmp": ".bmp",
|
||||
"image/tiff": ".tiff",
|
||||
"image/webp": ".webp",
|
||||
"image/x-emf": ".emf",
|
||||
"image/x-wmf": ".wmf",
|
||||
"image/svg+xml": ".svg",
|
||||
}
|
||||
|
||||
|
||||
class WordParser:
|
||||
"""Parse a .docx file — extract images, split body into sections.
|
||||
|
||||
Usage::
|
||||
|
||||
parser = WordParser("doc.docx")
|
||||
parser.extract_images("images/")
|
||||
sections, image_sources = parser.extract_sections()
|
||||
"""
|
||||
|
||||
HEADER_CELL_MAX_LEN = 20 # max chars per cell to treat first row as header
|
||||
|
||||
WML_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
|
||||
DRAW_NS = "http://schemas.openxmlformats.org/drawingml/2006/main"
|
||||
REL_NS = "http://schemas.openxmlformats.org/officeDocument/2006/relationships"
|
||||
|
||||
def __init__(self, docx_path: str):
|
||||
if not os.path.isfile(docx_path):
|
||||
raise FileNotFoundError(f"Document not found: {docx_path}")
|
||||
self._doc = Document(docx_path)
|
||||
|
||||
# ---- public API ---------------------------------------------------------
|
||||
|
||||
def extract_images(self, images_dir: str) -> list[dict]:
|
||||
"""Save all images to *images_dir*. Returns ``[{rid, path}, ...]``."""
|
||||
os.makedirs(images_dir, exist_ok=True)
|
||||
images: list[dict] = []
|
||||
for rel in self._doc.part.rels.values():
|
||||
if "image" not in rel.reltype:
|
||||
continue
|
||||
ext = IMAGE_EXT.get(rel.target_part.content_type, ".png")
|
||||
name = f"image_{rel.rId}{ext}"
|
||||
path = os.path.join(images_dir, name)
|
||||
with open(path, "wb") as f:
|
||||
f.write(rel.target_part.blob)
|
||||
images.append({"rid": rel.rId, "path": path})
|
||||
return images
|
||||
|
||||
def extract_sections(self) -> tuple[list[dict], dict[str, dict]]:
|
||||
"""Walk document body and split into sections by heading.
|
||||
|
||||
Returns:
|
||||
*sections* — ``[{source, blocks, images}, ...]``
|
||||
Each block is ``{type, index, text}`` (paragraph) or
|
||||
``{type, table, headers, rows}`` (table).
|
||||
*image_sources* — ``rid → {section, table?, row?, column?, name?}``
|
||||
"""
|
||||
sections: list[dict] = []
|
||||
current_source = ""
|
||||
blocks: list[dict] = []
|
||||
section_images: list[str] = []
|
||||
image_sources: dict[str, dict] = {}
|
||||
para_idx = 0
|
||||
tbl_idx = 0
|
||||
|
||||
for child in self._doc.element.body:
|
||||
tag = child.tag.split("}")[-1] if "}" in child.tag else child.tag
|
||||
|
||||
if tag == "p":
|
||||
para = Paragraph(child, self._doc)
|
||||
|
||||
if self._heading_level(para) is not None:
|
||||
heading_text = para.text.strip()
|
||||
if heading_text: # ignore empty heading-like paragraphs
|
||||
if blocks or section_images:
|
||||
sections.append({
|
||||
"source": current_source,
|
||||
"blocks": blocks,
|
||||
"images": list(section_images),
|
||||
})
|
||||
blocks = []
|
||||
section_images = []
|
||||
para_idx = 0
|
||||
tbl_idx = 0
|
||||
current_source = heading_text
|
||||
continue
|
||||
|
||||
text = para.text.strip()
|
||||
|
||||
# Scan for images — append [[IMAGE:rid]] markers
|
||||
for run in para.runs:
|
||||
for rid in self._images_in(run._element):
|
||||
text += f" [[IMAGE:{rid}]]"
|
||||
section_images.append(rid)
|
||||
image_sources[rid] = {"section": current_source}
|
||||
|
||||
if text.strip():
|
||||
blocks.append({"type": "para", "index": para_idx + 1, "text": text.strip()})
|
||||
para_idx += 1
|
||||
|
||||
elif tag == "tbl":
|
||||
tbl_idx += 1
|
||||
table = Table(child, self._doc)
|
||||
|
||||
# Collect all rows as [[cell_text, ...], ...]
|
||||
all_rows: list[list[str]] = []
|
||||
all_images: list[list[list[str]]] = [] # row → col → [rids]
|
||||
for row in table.rows:
|
||||
row_texts: list[str] = []
|
||||
row_cell_images: list[list[str]] = []
|
||||
for cell in row.cells:
|
||||
cell_text = cell.text.strip()
|
||||
cell_imgs: list[str] = []
|
||||
for cp in cell.paragraphs:
|
||||
for run in cp.runs:
|
||||
for rid in self._images_in(run._element):
|
||||
cell_imgs.append(rid)
|
||||
# Replace images with markers in text
|
||||
for rid in cell_imgs:
|
||||
cell_text += f" [[IMAGE:{rid}]]"
|
||||
section_images.append(rid)
|
||||
row_texts.append(cell_text.strip())
|
||||
row_cell_images.append(cell_imgs)
|
||||
if any(row_texts) or any(row_cell_images):
|
||||
all_rows.append(row_texts)
|
||||
all_images.append(row_cell_images)
|
||||
|
||||
if len(all_rows) >= 2:
|
||||
# Heuristic: first row is a header if every cell is short
|
||||
first_row = all_rows[0]
|
||||
has_header = all(len(c) < self.HEADER_CELL_MAX_LEN for c in first_row)
|
||||
if has_header:
|
||||
headers = first_row
|
||||
data_rows_slice = zip(all_rows[1:], all_images[1:])
|
||||
else:
|
||||
headers = [f"列{ci + 1}" for ci in range(len(first_row))]
|
||||
data_rows_slice = zip(all_rows, all_images)
|
||||
|
||||
data_rows: list[dict] = []
|
||||
|
||||
for ri, (row_data, row_imgs) in enumerate(data_rows_slice):
|
||||
columns: list[dict] = []
|
||||
max_cols = max(len(headers), len(row_data))
|
||||
for ci in range(max_cols):
|
||||
hdr = headers[ci] if ci < len(headers) else ""
|
||||
txt = row_data[ci] if ci < len(row_data) else ""
|
||||
columns.append({
|
||||
"name": hdr,
|
||||
"row": ri + 1,
|
||||
"col": ci + 1,
|
||||
"text": txt,
|
||||
})
|
||||
|
||||
# Register image sources with structured location
|
||||
imgs = row_imgs[ci] if ci < len(row_imgs) else []
|
||||
for rid in imgs:
|
||||
image_sources[rid] = {
|
||||
"section": current_source,
|
||||
"table": tbl_idx,
|
||||
"row": ri + 1,
|
||||
"column": ci + 1,
|
||||
"name": hdr,
|
||||
}
|
||||
|
||||
data_rows.append({"columns": columns})
|
||||
|
||||
blocks.append({
|
||||
"type": "table",
|
||||
"table": tbl_idx,
|
||||
"headers": headers,
|
||||
"rows": data_rows,
|
||||
})
|
||||
elif all_rows:
|
||||
# Degenerate table (only header or single row) — treat as plain rows
|
||||
for ri, row_data in enumerate(all_rows):
|
||||
row_text = " | ".join(row_data)
|
||||
if row_text.strip():
|
||||
blocks.append({
|
||||
"type": "para",
|
||||
"index": para_idx + 1,
|
||||
"text": row_text,
|
||||
})
|
||||
para_idx += 1
|
||||
|
||||
if blocks or section_images:
|
||||
sections.append({
|
||||
"source": current_source,
|
||||
"blocks": blocks,
|
||||
"images": list(section_images),
|
||||
})
|
||||
|
||||
return sections, image_sources
|
||||
|
||||
# ---- internals ----------------------------------------------------------
|
||||
|
||||
def _heading_level(self, para: Paragraph) -> int | None:
|
||||
"""Heading level 1-9, or *None* if not a heading."""
|
||||
if para.style and para.style.name:
|
||||
name = para.style.name
|
||||
for prefix in ("Heading", "标题"):
|
||||
if name.startswith(prefix):
|
||||
try:
|
||||
return int(name.split()[-1])
|
||||
except (ValueError, IndexError):
|
||||
pass
|
||||
pPr = para._element.find(f"{{{self.WML_NS}}}pPr")
|
||||
if pPr is not None:
|
||||
ol = pPr.find(f"{{{self.WML_NS}}}outlineLvl")
|
||||
if ol is not None:
|
||||
val = ol.get(f"{{{self.WML_NS}}}val")
|
||||
if val is not None:
|
||||
try:
|
||||
return int(val) + 1
|
||||
except ValueError:
|
||||
pass
|
||||
return None
|
||||
|
||||
def _images_in(self, element) -> list[str]:
|
||||
"""Return rId values for drawings embedded in *element*."""
|
||||
rids: list[str] = []
|
||||
for drawing in element.findall(f".//{{{self.WML_NS}}}drawing"):
|
||||
blip = drawing.find(f".//{{{self.DRAW_NS}}}blip")
|
||||
if blip is not None:
|
||||
rid = blip.get(f"{{{self.REL_NS}}}embed")
|
||||
if rid:
|
||||
rids.append(rid)
|
||||
return rids
|
||||
Reference in New Issue
Block a user