Initial commit: document_analyzer with CI/CD pipeline
CI / test (push) Successful in 30s

- 4 skill pipeline (doc_parser, conflict_detection, ir_generation, resolution_application)
- CI workflow on push/PR (.gitea/workflows/ci.yml)
- Auto-issue on CI failure (.gitea/workflows/auto-issue.yml)
- Pytest smoke tests (tests/test_sample.py)

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
2026-05-29 20:00:26 +08:00
commit 40567a4fb6
22 changed files with 2898 additions and 0 deletions
+37
View File
@@ -0,0 +1,37 @@
---
name: 冲突检测技能
description: 分析解析后的文档,检测图表类图像与其相应文本描述之间的矛盾和条件不匹配。
---
# 冲突检测技能
## 概述
此技能识别解析文档中文本内容与视觉内容之间的潜在冲突。它特别针对图表类图像(流程图、架构图、状态图、序列图和活动图)并交叉检查其描述与同文档部分的文本内容。
## 功能
该技能:
- 从解析的文档结构中识别图表类图像
- 将图像描述与同一文档部分中的相应文本内容进行交叉引用
- 检测视觉表示和文本表示之间的矛盾和条件不匹配
- 生成已识别冲突的结构化列表,并标明每个冲突所在的位置
- 专门针对流程图、架构图、状态图、序列图和活动图
## 输入要求
- 解析文档JSON文件的路径(由文档解析技能生成)
- 可选输出目录规范
- 可选试运行标志,在不调用API的情况下预览大语言模型提示
## 输出
该技能生成一个结构化JSON文件,文件名为输入文件的基本名称(去掉末尾的'_parsed')后跟'_conflicts.json',包含:
- 冲突对象列表,每个对象说明差异的具体细节
- 标识每个冲突发生位置的节标识符
- 冲突图像和文本内容的片段
- 每个冲突的类型分类(例如:直接矛盾、条件不一致、范围不一致)
## 集成点
此技能使用文档解析技能的输出,并为解决方案应用技能提供输入。冲突的解决通常需要人工确认后才能进入下一阶段。
@@ -0,0 +1,105 @@
import logging
import os
import time
from typing import Optional
from openai import OpenAI
logger = logging.getLogger(__name__)
class LLMClient:
"""Low-level OpenAI-compatible LLM client with retry and token tracking.
Usage::
llm = LLMClient()
content = llm.chat("qwen3.5-flash", [{"role": "user", "content": "Hello"}])
print(llm.usage)
"""
IMAGE_MODEL = "qwen3-vl-plus"
TEXT_MODEL = "qwen3.5-flash-2026-02-23"
TIMEOUT = 120
MAX_RETRIES = 3
def __init__(
self,
*,
base_url: str = "https://dashscope.aliyuncs.com/compatible-mode/v1",
timeout: int | None = None,
):
key = os.environ.get("DASHSCOPE_API_KEY", "")
if not key:
raise ValueError("DASHSCOPE_API_KEY environment variable is not set.")
self._client = OpenAI(api_key=key, base_url=base_url)
self._timeout = timeout or self.TIMEOUT
self._prompt_tokens = 0
self._completion_tokens = 0
@property
def usage(self) -> dict:
"""Return accumulated token counts as ``{prompt, completion, total}``."""
return {
"prompt_tokens": self._prompt_tokens,
"completion_tokens": self._completion_tokens,
"total_tokens": self._prompt_tokens + self._completion_tokens,
}
@staticmethod
def estimate_tokens(text: str) -> int:
"""Quick token estimate. CJK ≈1.7/token, others ≈3.0/token."""
cjk = sum(1 for c in text if '' <= c <= '鿿' or ' ' <= c <= '')
other = len(text) - cjk
return max(1, int(cjk / 1.7 + other / 3.0))
@staticmethod
def estimate_image_tokens() -> int:
"""Fixed estimate for one vision-model image (~500 tokens)."""
return 500
def chat(
self, model: str, messages: list[dict], *, timeout: int | None = None,
response_format: dict | None = None,
) -> str:
"""Send a chat completion request and return the response content.
Automatically retries on failure and accumulates token usage.
"""
label = f"chat({model})"
def _call():
t0 = time.time()
kwargs = dict(model=model, messages=messages, timeout=timeout or self._timeout)
if response_format is not None:
kwargs["response_format"] = response_format
kwargs["temperature"] = 0
resp = self._client.chat.completions.create(**kwargs)
content = resp.choices[0].message.content
usg = resp.usage
if usg:
self._prompt_tokens += usg.prompt_tokens
self._completion_tokens += usg.completion_tokens
elapsed = time.time() - t0
logger.info("%s: %d chars in %.1fs", label, len(content) if content else 0, elapsed)
if not content:
raise RuntimeError("Empty response from LLM")
return content
return self._retry(_call, label)
def _retry(self, fn, label: str) -> str:
"""Call *fn()* with exponential-backoff retry."""
last_error: Optional[Exception] = None
for attempt in range(self.MAX_RETRIES):
try:
return fn()
except Exception as e:
last_error = e
logger.warning(
"%s error (attempt %d/%d): %s",
label, attempt + 1, self.MAX_RETRIES, e,
)
if attempt < self.MAX_RETRIES - 1:
time.sleep(2 ** attempt)
raise RuntimeError(f"{label}: all retries exhausted") from last_error
@@ -0,0 +1,280 @@
#!/usr/bin/env python3
"""Detect logical conflicts between image analysis and text in ``_parsed.json``.
Usage::
python scripts/detect_conflicts.py D:/projects/jike/output/车机娱乐系统禁止功能文档_精简_parsed.json [--output-dir DIR]
For each diagram-type image (flowchart, architecture, state, sequence, activity),
the script locates its section via *image_sources*, grabs the corresponding text
blocks, and calls an LLM to find contradictions/condition-mismatches between the
image description and the text.
Output: ``<basename>_conflicts.json``
"""
import argparse
import json
import logging
import os
import re
import sys
import time
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from LLM import LLMClient
# Configure logging at import time: INFO level with timestamped records.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)
logger = logging.getLogger(__name__)
# Delay handed to time.sleep() between LLM conflict checks.
RATE_LIMIT_DELAY = 0.5
# Image "type" values that are treated as diagrams and checked for conflicts.
DIAGRAM_TYPES = {"flowchart", "architecture", "state", "sequence", "activity"}
# Sections whose stripped text has fewer characters than this are skipped.
MIN_TEXT_CHARS = 20
# Prompt template for one image-vs-text consistency check. It is filled with
# str.format() using the fields image_description, text_description and
# section_name; the doubled braces ({{ }}) are the literal braces of the JSON
# example shown to the model. The model is instructed to reply either with the
# [[NO_CONFLICT]] marker or with a JSON array of conflict objects.
PROMPT_DETECT_CONFLICT = """你是一个文档一致性检查专家。以下内容来自同一份需求文档的同一个章节,包含两部分:
## 部分1:图片(流程图/架构图/状态图)的描述
```
{image_description}
```
## 部分2:同章节的文字描述
```
{text_description}
```
## 你的任务
检查这两部分之间是否存在**逻辑矛盾或条件不一致**。
你需要关注的冲突类型:
1. **condition_mismatch**(条件不一致):两者描述了同一规则,但触发条件、阈值、时序不同。
例如:图片说"车速≥15km/h且持续5秒",文字说"车速≥10km/h且持续3秒"
例如:图片说"非P档限制",文字说"车速>0限制"
2. **contradiction**(直接矛盾):两者对同一事物的描述完全相反。
例如:图片说"功能X被禁止",文字说"功能X可用"
例如:图片说"开关默认关闭",文字说"开关默认开启"
3. **scope_mismatch**(范围不一致):两者描述的场景/地域/设备范围不同。
例如:图片说"国内方案",文字说"海外方案"
例如:图片说"CSD中控屏",文字描述包含"PSD副驾屏"
## 输出格式
如果**没有冲突**,只输出:
```
[[NO_CONFLICT]]
```
如果**有冲突**,输出以下JSON数组(不要任何其他文字):
```json
[
{{
"conflict_type": "condition_mismatch",
"severity": "high",
"section": "{section_name}",
"image_snippet": "图片中描述的关键内容(摘录)",
"text_snippet": "文字中描述的关键内容(摘录)",
"description": "用中文说明冲突的具体差异"
}}
]
```
注意:
- 每个冲突一个条目,不要合并
- severity: high(功能正确性受影响)| medium(边界条件模糊)| low(表达方式差异)
- 输出必须是**严格合法的JSON数组**,不要有尾随逗号
- 如果没有严格冲突,输出 [[NO_CONFLICT]]
"""
def _build_text_for_section(sections: list[dict], section_name: str) -> str:
"""Build a single text block for the given section name."""
texts: list[str] = []
for sec in sections:
if sec.get("source", "") == section_name:
for blk in sec.get("blocks", []):
if blk["type"] == "para":
texts.append(blk["text"])
elif blk["type"] == "table":
table_lines = [f"表格 {blk['table']}:"]
for ri, row in enumerate(blk.get("rows", [])):
cols = row.get("columns", [])
parts = [f"{c['name']}: {c['text']}" for c in cols]
table_lines.append(f"{ri + 1}: {' | '.join(parts)}")
texts.append("\n".join(table_lines))
return "\n\n".join(texts)
def _parse_conflict_json(content: str) -> list[dict]:
"""Extract JSON array from LLM response, handling markdown fences."""
stripped = content.strip()
if "[[NO_CONFLICT]]" in stripped:
return []
# Remove markdown code fences
if "```json" in stripped:
stripped = stripped.split("```json", 1)[1]
if "```" in stripped:
stripped = stripped.split("```", 1)[0]
elif "```" in stripped:
stripped = stripped.split("```", 1)[1]
if "```" in stripped:
stripped = stripped.split("```", 1)[0]
stripped = stripped.strip()
if not stripped:
return []
# Try to find a JSON array
match = re.search(r"\[\s*\{.*\}\s*\]", stripped, re.DOTALL)
if match:
stripped = match.group()
try:
conflicts = json.loads(stripped)
if isinstance(conflicts, list):
return conflicts
return []
except json.JSONDecodeError as e:
logger.warning("Failed to parse conflict JSON: %s", e)
logger.debug("Raw content: %s", stripped)
return []
def detect_conflicts(
    parsed_path: str,
    output_dir: str | None = None,
    *,
    dry_run: bool = False,
) -> list[dict]:
    """Load ``_parsed.json`` and detect image-vs-text conflicts.

    For every entry of ``image_analysis`` whose type is in ``DIAGRAM_TYPES``
    and that has a description, the text of the section named in
    ``image_sources`` is collected and sent to the LLM together with the
    description. Each returned conflict is enriched with ``rid``,
    ``image_path``, ``section`` and, when known, ``source_location``.

    Args:
        parsed_path: Path to the parsed-document JSON file.
        output_dir: Directory for the result file; defaults to the directory
            containing *parsed_path*.
        dry_run: When true, log each prompt instead of calling the API. No
            client is created (so no API key is needed) and no file is
            written, so earlier results are not overwritten.

    Returns:
        A flat list of conflict dicts. Unless *dry_run* is set, the same list
        is written to ``<basename>_conflicts.json`` where ``<basename>`` is
        the input file name without extension and trailing ``_parsed``.

    Raises:
        ValueError: From ``LLMClient`` when a real call is needed and the API
            key is not configured.
    """
    with open(parsed_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    basename = os.path.splitext(os.path.basename(parsed_path))[0]
    basename = basename.removesuffix("_parsed")
    if output_dir is None:
        output_dir = os.path.dirname(os.path.abspath(parsed_path))
    if not dry_run:
        # Create the directory up front so a bad path fails before any
        # LLM call is paid for.
        os.makedirs(output_dir, exist_ok=True)

    sections = data.get("sections", [])
    image_sources = data.get("image_sources", {})
    image_analysis = data.get("image_analysis", [])

    llm: LLMClient | None = None  # created lazily, only when a call is made
    calls_made = 0
    all_conflicts: list[dict] = []

    # ---- For each diagram image, compare with its section text -------------
    for img in image_analysis:
        img_type = img.get("type", "other")
        rid = img.get("rid", "")
        # ``or ""`` also covers an explicit null description.
        description = (img.get("description") or "").strip()
        if img_type not in DIAGRAM_TYPES or not description:
            logger.info("Skip conflict check: rid=%s type=%s", rid, img_type)
            continue

        # Find source section
        src = image_sources.get(rid, {})
        section_name = src.get("section", "")
        if not section_name:
            logger.warning("No section found for rid=%s, skipping", rid)
            continue

        # Build text from the same section
        text_content = _build_text_for_section(sections, section_name)
        text_len = len(text_content.strip())
        if text_len < MIN_TEXT_CHARS:
            logger.info("Section text too short (%d chars) for rid=%s, skip", text_len, rid)
            continue

        logger.info("Checking conflicts: rid=%s section=%s (desc=%d chars, text=%d chars)",
                    rid, section_name, len(description), text_len)

        prompt = PROMPT_DETECT_CONFLICT.format(
            image_description=description,
            text_description=text_content,
            section_name=section_name,
        )
        if dry_run:
            logger.info("  [DRY RUN] would call LLM to detect conflicts")
            logger.info("  [DRY RUN] prompt for rid=%s:\n%s", rid, prompt)
            continue

        if llm is None:
            llm = LLMClient()
        if calls_made:
            # Pause only between consecutive API calls, never after the last.
            time.sleep(RATE_LIMIT_DELAY)
        calls_made += 1

        try:
            raw = llm.chat(
                model=LLMClient.TEXT_MODEL,
                messages=[{"role": "user", "content": prompt}],
            )
        except RuntimeError as e:
            # One failed image must not abort the remaining checks.
            logger.error("Conflict check failed: %s", e)
            continue
        logger.info("Conflict check response: %d chars", len(raw))

        # Only dict entries can be enriched below.
        conflicts = [c for c in _parse_conflict_json(raw) if isinstance(c, dict)]

        # Enrich with location info
        for c in conflicts:
            c["rid"] = rid
            c["image_path"] = img.get("path", "")
            c.setdefault("section", section_name)
            # NOTE(review): falsy values (e.g. row 0) are not recorded —
            # confirm rows/tables are never legitimately 0 or empty.
            if src.get("table"):
                c.setdefault("source_location", {})["table"] = src["table"]
            if src.get("row"):
                c.setdefault("source_location", {})["image_row"] = src["row"]

        all_conflicts.extend(conflicts)
        logger.info("  Found %d conflicts for rid=%s", len(conflicts), rid)

    # ---- Save ---------------------------------------------------------------
    conflicts_path = os.path.join(output_dir, f"{basename}_conflicts.json")
    if dry_run:
        logger.info("[DRY RUN] not writing %s", conflicts_path)
    else:
        with open(conflicts_path, "w", encoding="utf-8") as f:
            json.dump(all_conflicts, f, ensure_ascii=False, indent=2)
        logger.info("Saved: %s (%d conflicts)", conflicts_path, len(all_conflicts))

    # ---- Summary ------------------------------------------------------------
    if llm is not None:
        usg = llm.usage
    else:
        usg = {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0}
    logger.info("Tokens: %d prompt + %d completion = %d total",
                usg["prompt_tokens"], usg["completion_tokens"], usg["total_tokens"])
    return all_conflicts
# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    # Command-line entry point: one positional input file plus two options,
    # forwarded unchanged to detect_conflicts().
    cli = argparse.ArgumentParser(description="Detect image-vs-text conflicts in parsed document.")
    cli.add_argument(
        "input",
        metavar="parsed.json",
        help="Path to _parsed.json from doc_parser",
    )
    cli.add_argument(
        "--output-dir",
        metavar="DIR",
        default=None,
        help="Output directory (default: same as input)",
    )
    cli.add_argument(
        "--dry-run",
        action="store_true",
        help="Print LLM prompts without calling the API.",
    )
    opts = cli.parse_args()
    detect_conflicts(opts.input, output_dir=opts.output_dir, dry_run=opts.dry_run)