doc_parser_skill: - New: verify_flowchart.py (flowchart validation) - Updated: LLM.py (multi-provider: DeepSeek + DashScope) - Updated: image_parser.py (logic tree support, external prompts) - Updated: SKILL.md, prompts/image_prompt.md conflict_detection_skill: - Updated: LLM.py (multi-provider sync) - Updated: detect_conflicts.py (logic tree text conversion) ir_generation_skill: - Replaced old scripts/LLM.py + ir_generator.py with standalone project - New: main.py, config.py, step1-3_*.py, ensemble_merge.py - New: prompts/, tests/ subdirectories tests: - New: acceptance/ test suite with schema validation - Fixed: conftest no longer globally skips non-acceptance tests - Updated: test_sample.py for new ir_generation structure Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -1,105 +0,0 @@
|
||||
import logging
|
||||
import os
|
||||
import time
|
||||
from typing import Optional
|
||||
|
||||
from openai import OpenAI
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class LLMClient:
|
||||
"""Low-level OpenAI-compatible LLM client with retry and token tracking.
|
||||
|
||||
Usage::
|
||||
|
||||
llm = LLMClient()
|
||||
content = llm.chat("qwen3.5-flash", [{"role": "user", "content": "Hello"}])
|
||||
print(llm.usage)
|
||||
"""
|
||||
|
||||
IMAGE_MODEL = "qwen3-vl-plus"
|
||||
TEXT_MODEL = "qwen3.5-flash-2026-02-23"
|
||||
TIMEOUT = 120
|
||||
MAX_RETRIES = 3
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
*,
|
||||
base_url: str = "https://dashscope.aliyuncs.com/compatible-mode/v1",
|
||||
timeout: int | None = None,
|
||||
):
|
||||
key = os.environ.get("DASHSCOPE_API_KEY", "")
|
||||
if not key:
|
||||
raise ValueError("DASHSCOPE_API_KEY environment variable is not set.")
|
||||
self._client = OpenAI(api_key=key, base_url=base_url)
|
||||
self._timeout = timeout or self.TIMEOUT
|
||||
self._prompt_tokens = 0
|
||||
self._completion_tokens = 0
|
||||
|
||||
@property
|
||||
def usage(self) -> dict:
|
||||
"""Return accumulated token counts as ``{prompt, completion, total}``."""
|
||||
return {
|
||||
"prompt_tokens": self._prompt_tokens,
|
||||
"completion_tokens": self._completion_tokens,
|
||||
"total_tokens": self._prompt_tokens + self._completion_tokens,
|
||||
}
|
||||
|
||||
@staticmethod
|
||||
def estimate_tokens(text: str) -> int:
|
||||
"""Quick token estimate. CJK ≈1.7/token, others ≈3.0/token."""
|
||||
cjk = sum(1 for c in text if '一' <= c <= '鿿' or ' ' <= c <= '〿')
|
||||
other = len(text) - cjk
|
||||
return max(1, int(cjk / 1.7 + other / 3.0))
|
||||
|
||||
@staticmethod
|
||||
def estimate_image_tokens() -> int:
|
||||
"""Fixed estimate for one vision-model image (~500 tokens)."""
|
||||
return 500
|
||||
|
||||
def chat(
|
||||
self, model: str, messages: list[dict], *, timeout: int | None = None,
|
||||
response_format: dict | None = None,
|
||||
) -> str:
|
||||
"""Send a chat completion request and return the response content.
|
||||
|
||||
Automatically retries on failure and accumulates token usage.
|
||||
"""
|
||||
label = f"chat({model})"
|
||||
|
||||
def _call():
|
||||
t0 = time.time()
|
||||
kwargs = dict(model=model, messages=messages, timeout=timeout or self._timeout)
|
||||
if response_format is not None:
|
||||
kwargs["response_format"] = response_format
|
||||
kwargs["temperature"] = 0
|
||||
resp = self._client.chat.completions.create(**kwargs)
|
||||
content = resp.choices[0].message.content
|
||||
usg = resp.usage
|
||||
if usg:
|
||||
self._prompt_tokens += usg.prompt_tokens
|
||||
self._completion_tokens += usg.completion_tokens
|
||||
elapsed = time.time() - t0
|
||||
logger.info("%s: %d chars in %.1fs", label, len(content) if content else 0, elapsed)
|
||||
if not content:
|
||||
raise RuntimeError("Empty response from LLM")
|
||||
return content
|
||||
|
||||
return self._retry(_call, label)
|
||||
|
||||
def _retry(self, fn, label: str) -> str:
|
||||
"""Call *fn()* with exponential-backoff retry."""
|
||||
last_error: Optional[Exception] = None
|
||||
for attempt in range(self.MAX_RETRIES):
|
||||
try:
|
||||
return fn()
|
||||
except Exception as e:
|
||||
last_error = e
|
||||
logger.warning(
|
||||
"%s error (attempt %d/%d): %s",
|
||||
label, attempt + 1, self.MAX_RETRIES, e,
|
||||
)
|
||||
if attempt < self.MAX_RETRIES - 1:
|
||||
time.sleep(2 ** attempt)
|
||||
raise RuntimeError(f"{label}: all retries exhausted") from last_error
|
||||
@@ -1,359 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Generate JSON intermediate representation from ``_parsed.json`` or ``_updated.json``.
|
||||
|
||||
Sends the JSON document directly to the LLM for analysis. If the document exceeds
|
||||
``MAX_ANALYSIS_TOKENS``, sections are batched greedily without splitting any
|
||||
individual section. Conflict corrections from ``resolved_conflicts`` are included
|
||||
so the output respects user arbitration decisions.
|
||||
|
||||
Usage::
|
||||
|
||||
python scripts/ir_generator.py output/<basename>_updated.json [output_dir] [--dry-run]
|
||||
|
||||
Output: ``<basename>_ir.json``
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
|
||||
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
||||
|
||||
from LLM import LLMClient
|
||||
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format="%(asctime)s [%(levelname)s] %(message)s",
|
||||
datefmt="%Y-%m-%d %H:%M:%S",
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Configuration
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
RATE_LIMIT_DELAY = 0.5  # seconds passed to time.sleep() between batched LLM requests in generate_ir
|
||||
MAX_ANALYSIS_TOKENS = 6000  # max content size per LLM call; NOTE: generate_ir compares this against JSON character counts (len of the dumped string), not tokens
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Prompt
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
PROMPT = """你是一个需求文档分析助手。请分析以下需求文档的JSON内容,输出结构化JSON。
|
||||
|
||||
## 已知修正(来自冲突检测)
|
||||
以下内容已确认修正,生成JSON时请**使用修正后的值**,不要同时输出两个版本。
|
||||
{conflict_context}
|
||||
|
||||
## 待分析内容(JSON格式)
|
||||
|
||||
{content}
|
||||
|
||||
## JSON字段说明
|
||||
- sections: 文档章节列表,每个章节含 source(章节标题)和 blocks(内容块数组)
|
||||
- blocks: 类型含 para(段落,字段 text)和 table(表格,字段 rows,每行含 columns 数组)
|
||||
- image_sources: 图片所在章节映射,key 为图片 rid
|
||||
- image_analysis: 图片分析结果,每个含 rid、type(流程图/架构图/状态图等)、description
|
||||
- resolved_conflicts: 已知修正列表,每个含 section、conflict_type、correction、source
|
||||
|
||||
## 功能点定义
|
||||
|
||||
只有满足以下**全部条件**的才视为功能点:
|
||||
1. 描述了一个**系统或软件要实现的具体行为**(有触发条件、执行动作、状态变化或逻辑规则)
|
||||
2. 该行为直接由**系统或框架**执行(不是人的操作流程、管理流程)
|
||||
3. 对用户或系统有**可观察的效果**
|
||||
|
||||
**以下内容不是功能点,不要输出:**
|
||||
- 术语/缩略词定义(
|
||||
- 文档背景、范围说明(如 "本文档涵盖xxx")
|
||||
- 变更日志、版本记录、编制人信息
|
||||
- 文档结构描述(如 "产品简介用户场景说明")
|
||||
- 纯文本的概述、没有具体行为的介绍
|
||||
|
||||
## 决策树/流程图分解规则(重要)
|
||||
|
||||
图片分析(image_analysis)中的流程图和决策树描述包含丰富的功能逻辑,**必须完全分解**:
|
||||
|
||||
1. **每个叶子路径 = 一个独立 function**:从根节点到每个最终结果的完整路径,都拆成一个 function
|
||||
2. **每个判断分支 = 一个独立 function**:菱形判断节点的每个分支方向和对应的结果,单独作为一个 function
|
||||
3. **不同约束条件 = 不同 function**:例如"通过接入SDK限制"和"通过系统限制"是不同约束机制,必须分别列出
|
||||
4. **不要合并不同路径**:即使最终结果相同,只要到达路径不同,就是不同的 function
|
||||
|
||||
## 输出格式
|
||||
|
||||
只输出功能点,每个功能点格式如下:
|
||||
|
||||
{
|
||||
"function": "功能名称",
|
||||
"source": {
|
||||
"section": "章节名",
|
||||
"location": "原文位置(如:正文第1段、表格1第2行、图片rId13)"
|
||||
},
|
||||
"trigger": {
|
||||
"type": "AND或者OR",
|
||||
"conditions": [
|
||||
"触发条件1",
|
||||
"触发条件2"
|
||||
]
|
||||
},
|
||||
"actions": {
|
||||
"场景/角色": [
|
||||
"动作1",
|
||||
"动作2"
|
||||
]
|
||||
}
|
||||
}
|
||||
|
||||
## 输出原则
|
||||
|
||||
1. **只输出功能点**,没有功能点就输出空数组 []
|
||||
2. 每个功能点**必须**包含 source.section 和 source.location
|
||||
3. location 必须是具体的原文位置标签(如 "正文第1段"、"表格1"、"图片rId13")
|
||||
4. **一个 function 只对应一种行为逻辑(一条完整路径)**。决策树中的每个分支路径(从根到叶子)必须拆成独立 function,conditions 中明确写出该路径上的所有判断条件和分支方向。
|
||||
5. **穷举所有分支**:流程图/决策树中的每一条分支路径都要输出对应的 function,不能遗漏任何子逻辑。
|
||||
6. 没有 trigger 或 actions 的字段直接**省略**,不要写 null 或空列表/空对象
|
||||
7. 所有功能点全部列出,**宁多勿漏**
|
||||
8. **已知修正**中确认的信息,使用修正后的值
|
||||
9. 输出一个JSON数组,不要用 ```json 代码块包裹,直接输出纯JSON
|
||||
"""
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _parse_llm_response(raw: str) -> list | dict | str | None:
|
||||
"""Parse JSON from LLM response, handling markdown code fences."""
|
||||
if raw is None:
|
||||
return None
|
||||
stripped = raw.strip()
|
||||
if stripped.startswith("```"):
|
||||
nl = stripped.find("\n")
|
||||
stripped = stripped[nl + 1:] if nl != -1 else stripped[3:]
|
||||
if stripped.endswith("```"):
|
||||
stripped = stripped[:-3]
|
||||
try:
|
||||
return json.loads(stripped)
|
||||
except json.JSONDecodeError:
|
||||
logger.warning(" Failed to parse JSON, returning raw text")
|
||||
return raw
|
||||
|
||||
|
||||
def _build_conflict_context(
|
||||
section_name: str | None,
|
||||
resolved_conflicts: list[dict],
|
||||
) -> str:
|
||||
"""Build conflict correction context for a section, or all if section_name is None."""
|
||||
if section_name is None:
|
||||
relevant = resolved_conflicts
|
||||
else:
|
||||
relevant = [c for c in resolved_conflicts if c.get("section", "") == section_name]
|
||||
if not relevant:
|
||||
return "没有"
|
||||
|
||||
lines: list[str] = []
|
||||
for c in relevant:
|
||||
correction = c.get("correction", "")
|
||||
conflict_type = c.get("conflict_type", "")
|
||||
source = c.get("source", "")
|
||||
lines.append(f"- 冲突类型:{conflict_type},依据:{source}")
|
||||
lines.append(f" 修正后的值:{correction}")
|
||||
|
||||
return "\n".join(lines)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# LLM analysis
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _analyze_content(
    content: str,
    conflict_context: str,
    llm: LLMClient,
    *,
    dry_run: bool = False,
) -> list[dict]:
    """Send content to the LLM and return IR entries.

    Args:
        content: JSON text of the document (or batch) to analyse.
        conflict_context: Rendered conflict corrections inserted into the prompt.
        llm: Client used for the request (and for the dry-run token estimate).
        dry_run: When true, only log an estimated prompt size and return ``[]``.

    Returns:
        The list of IR entries. A failed request or an unparseable reply is
        logged and yields ``[]`` instead of raising.
    """
    # str.replace is used instead of str.format because the prompt template
    # itself contains literal braces (the JSON example).
    prompt = PROMPT.replace("{conflict_context}", conflict_context).replace("{content}", content)

    if dry_run:
        est = llm.estimate_tokens(prompt)
        logger.info(" [DRY RUN] prompt ~%d tokens", est)
        return []

    try:
        raw = llm.chat(
            model=LLMClient.TEXT_MODEL,
            messages=[{"role": "user", "content": prompt}],
            response_format={"type": "json_object"},
        )
        logger.info(" Response: %d chars", len(raw))
    except RuntimeError as e:
        logger.error(" Analysis failed: %s", e)
        return []

    parsed = _parse_llm_response(raw)
    if isinstance(parsed, list):
        return parsed
    if isinstance(parsed, dict):
        # The request forces JSON-object mode while the prompt asks for an
        # array, so the model may wrap the array in a single-key object such
        # as {"functions": [...]}. Unwrap that shape instead of storing the
        # wrapper as one bogus IR entry. A dict carrying "function" is a
        # genuine single entry and is kept as-is.
        if "function" not in parsed and len(parsed) == 1:
            (inner,) = parsed.values()
            if isinstance(inner, list) and all(isinstance(item, dict) for item in inner):
                return inner
        return [parsed]

    logger.warning(" Unparseable response, raw length: %d", len(raw))
    return []
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Main
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _greedy_section_batches(
    sized_sections: list[tuple[dict, int]],
    limit: int,
) -> list[list[dict]]:
    """Group consecutive sections into batches whose summed size stays within *limit*.

    Sections are never split. A section that is larger than *limit* on its own
    becomes a single-section batch (with a warning) rather than being dropped.
    """
    batches: list[list[dict]] = []
    current: list[dict] = []
    current_size = 0
    for sec, size in sized_sections:
        # Close the running batch when this section would push it over the limit.
        if current and current_size + size > limit:
            batches.append(current)
            current, current_size = [], 0
        if size > limit:
            logger.warning(
                "Section [%s] is %d chars (> %d); sending it as its own batch",
                sec.get("source", ""), size, limit,
            )
        current.append(sec)
        current_size += size
    if current:
        batches.append(current)
    return batches


def generate_ir(
    parsed_path: str,
    output_dir: str = "output",
    *,
    dry_run: bool = False,
) -> dict:
    """Read parsed/updated JSON and generate JSON IR.

    Produces ``<basename>_ir.json`` in *output_dir*. Documents whose JSON dump is
    shorter than ``MAX_ANALYSIS_TOKENS`` characters are analysed in one request;
    larger ones are sent in greedy batches of whole sections, each batch
    carrying the images and conflict corrections of its own sections.

    Args:
        parsed_path: Path to the ``_parsed.json`` / ``_updated.json`` input.
        output_dir: Directory that receives the IR file (created if missing).
        dry_run: Forwarded to ``_analyze_content``; no API request is made and
            the written IR is empty.

    Returns:
        ``{"ir": <list of entries>, "path": <path of the written file>}``.
    """
    with open(parsed_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    basename = os.path.splitext(os.path.basename(parsed_path))[0]
    for suffix in ("_parsed", "_updated"):
        if basename.endswith(suffix):
            basename = basename[:-len(suffix)]
            break
    # ``or "."`` keeps an empty output_dir (current directory) from failing.
    os.makedirs(output_dir or ".", exist_ok=True)

    llm = LLMClient()
    ir_output: list[dict] = []

    sections = data.get("sections", [])
    image_sources = data.get("image_sources", {})
    image_analysis = data.get("image_analysis", [])
    resolved_conflicts = data.get("resolved_conflicts", [])

    # Build full document JSON to measure size
    full_doc = {
        "sections": sections,
        "image_sources": image_sources,
        "image_analysis": image_analysis,
    }
    full_json = json.dumps(full_doc, ensure_ascii=False)
    total_chars = len(full_json)
    logger.info("Total document JSON chars: %d", total_chars)

    if total_chars < MAX_ANALYSIS_TOKENS:
        logger.info("Document fits in one request (< %d chars)", MAX_ANALYSIS_TOKENS)
        conflict_ctx = _build_conflict_context(None, resolved_conflicts)
        entries = _analyze_content(full_json, conflict_ctx, llm, dry_run=dry_run)
        ir_output.extend(entries)
    else:
        logger.info("Document is large (>= %d chars), batching sections", MAX_ANALYSIS_TOKENS)

        # Filter to non-empty sections, measure effective size per section
        # (section JSON + image_sources + image_analysis for images in that section)
        sec_sizes: list[tuple[dict, int]] = []
        for sec in sections:
            if not sec.get("blocks"):
                continue
            sec_chars = len(json.dumps(sec, ensure_ascii=False))
            # Add image overhead for this section
            sec_name = sec.get("source", "")
            sec_rids = {
                rid for rid, src in image_sources.items()
                if src.get("section", "") == sec_name
            }
            if sec_rids:
                overhead_doc = {
                    "image_sources": {
                        rid: src for rid, src in image_sources.items() if rid in sec_rids
                    },
                    "image_analysis": [
                        img for img in image_analysis if img.get("rid", "") in sec_rids
                    ],
                }
                sec_chars += len(json.dumps(overhead_doc, ensure_ascii=False))
            sec_sizes.append((sec, sec_chars))

        # Greedy batch: never split a section; oversize sections go out alone.
        for batch in _greedy_section_batches(sec_sizes, MAX_ANALYSIS_TOKENS):
            # Collect sections and their images for this batch
            batch_names = [s.get("source", "") for s in batch]
            name_lookup = set(batch_names)
            batch_image_sources = {
                rid: src for rid, src in image_sources.items()
                if src.get("section", "") in name_lookup
            }
            batch_images = [
                img for img in image_analysis
                if image_sources.get(img.get("rid", ""), {}).get("section", "") in name_lookup
            ]

            batch_doc = {
                "sections": batch,
                "image_sources": batch_image_sources,
                "image_analysis": batch_images,
            }
            batch_json = json.dumps(batch_doc, ensure_ascii=False)

            # Merge conflict contexts of the batch's sections, skipping the
            # "nothing to report" placeholder.
            ctx_parts = []
            for sn in batch_names:
                ctx = _build_conflict_context(sn, resolved_conflicts)
                if ctx != "没有":
                    ctx_parts.append(ctx)
            conflict_ctx = "\n".join(ctx_parts) if ctx_parts else "没有"

            label = " + ".join(batch_names)
            logger.info("Batch [%s]: %d sections, %d chars", label, len(batch), len(batch_json))
            entries = _analyze_content(batch_json, conflict_ctx, llm, dry_run=dry_run)
            ir_output.extend(entries)
            time.sleep(RATE_LIMIT_DELAY)

    # ---- save ----------------------------------------------------------------
    ir_path = os.path.join(output_dir, f"{basename}_ir.json")
    with open(ir_path, "w", encoding="utf-8") as f:
        json.dump(ir_output, f, ensure_ascii=False, indent=2)
    logger.info("Saved: %s (%d entries)", ir_path, len(ir_output))

    # ---- summary -------------------------------------------------------------
    usg = llm.usage
    logger.info("Tokens: %d prompt + %d completion = %d total",
                usg["prompt_tokens"], usg["completion_tokens"], usg["total_tokens"])
    logger.info("Output: %s", ir_path)

    return {"ir": ir_output, "path": ir_path}
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# CLI
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
if __name__ == "__main__":
    # Command-line entry point: parse the arguments and run the generator once.
    cli = argparse.ArgumentParser(
        description="Generate JSON intermediate representation from parsed/updated JSON.",
    )
    cli.add_argument(
        "input",
        metavar="parsed.json",
        help="Path to _parsed.json or _updated.json",
    )
    cli.add_argument(
        "output_dir",
        nargs="?",
        default="output",
        metavar="output_dir",
        help="Directory for output files (default: output/)",
    )
    cli.add_argument(
        "--dry-run",
        action="store_true",
        help="Print token estimates without calling the API.",
    )

    options = cli.parse_args()
    generate_ir(options.input, options.output_dir, dry_run=options.dry_run)
|
||||
Reference in New Issue
Block a user