sync: update all skills from latest workspace code

doc_parser_skill:
- New: verify_flowchart.py (flowchart validation)
- Updated: LLM.py (multi-provider: DeepSeek + DashScope)
- Updated: image_parser.py (logic tree support, external prompts)
- Updated: SKILL.md, prompts/image_prompt.md

conflict_detection_skill:
- Updated: LLM.py (multi-provider sync)
- Updated: detect_conflicts.py (logic tree text conversion)

ir_generation_skill:
- Replaced old scripts/LLM.py + ir_generator.py with standalone project
- New: main.py, config.py, step1-3_*.py, ensemble_merge.py
- New: prompts/, tests/ subdirectories

tests:
- New: acceptance/ test suite with schema validation
- Fixed: conftest no longer globally skips non-acceptance tests
- Updated: test_sample.py for new ir_generation structure

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-30 22:45:08 +08:00
parent db64df2da1
commit fec4c09ee0
35 changed files with 8021 additions and 530 deletions
@@ -1,105 +0,0 @@
-import logging
-import os
-import time
-from typing import Optional
-
-from openai import OpenAI
-
-logger = logging.getLogger(__name__)
-
-
-class LLMClient:
-    """Low-level OpenAI-compatible LLM client with retry and token tracking.
-
-    Usage::
-
-        llm = LLMClient()
-        content = llm.chat("qwen3.5-flash", [{"role": "user", "content": "Hello"}])
-        print(llm.usage)
-    """
-
-    IMAGE_MODEL = "qwen3-vl-plus"
-    TEXT_MODEL = "qwen3.5-flash-2026-02-23"
-    TIMEOUT = 120
-    MAX_RETRIES = 3
-
-    def __init__(
-        self,
-        *,
-        base_url: str = "https://dashscope.aliyuncs.com/compatible-mode/v1",
-        timeout: int | None = None,
-    ):
-        key = os.environ.get("DASHSCOPE_API_KEY", "")
-        if not key:
-            raise ValueError("DASHSCOPE_API_KEY environment variable is not set.")
-        self._client = OpenAI(api_key=key, base_url=base_url)
-        self._timeout = timeout or self.TIMEOUT
-        self._prompt_tokens = 0
-        self._completion_tokens = 0
-
-    @property
-    def usage(self) -> dict:
-        """Return accumulated token counts as ``{prompt, completion, total}``."""
-        return {
-            "prompt_tokens": self._prompt_tokens,
-            "completion_tokens": self._completion_tokens,
-            "total_tokens": self._prompt_tokens + self._completion_tokens,
-        }
-
-    @staticmethod
-    def estimate_tokens(text: str) -> int:
-        """Quick token estimate.  CJK ≈1.7/token, others ≈3.0/token."""
-        cjk = sum(1 for c in text if '一' <= c <= '鿿' or '　' <= c <= '〿')
-        other = len(text) - cjk
-        return max(1, int(cjk / 1.7 + other / 3.0))
-
-    @staticmethod
-    def estimate_image_tokens() -> int:
-        """Fixed estimate for one vision-model image (~500 tokens)."""
-        return 500
-
-    def chat(
-        self, model: str, messages: list[dict], *, timeout: int | None = None,
-        response_format: dict | None = None,
-    ) -> str:
-        """Send a chat completion request and return the response content.
-
-        Automatically retries on failure and accumulates token usage.
-        """
-        label = f"chat({model})"
-
-        def _call():
-            t0 = time.time()
-            kwargs = dict(model=model, messages=messages, timeout=timeout or self._timeout)
-            if response_format is not None:
-                kwargs["response_format"] = response_format
-            kwargs["temperature"] = 0
-            resp = self._client.chat.completions.create(**kwargs)
-            content = resp.choices[0].message.content
-            usg = resp.usage
-            if usg:
-                self._prompt_tokens += usg.prompt_tokens
-                self._completion_tokens += usg.completion_tokens
-            elapsed = time.time() - t0
-            logger.info("%s: %d chars in %.1fs", label, len(content) if content else 0, elapsed)
-            if not content:
-                raise RuntimeError("Empty response from LLM")
-            return content
-
-        return self._retry(_call, label)
-
-    def _retry(self, fn, label: str) -> str:
-        """Call *fn()* with exponential-backoff retry."""
-        last_error: Optional[Exception] = None
-        for attempt in range(self.MAX_RETRIES):
-            try:
-                return fn()
-            except Exception as e:
-                last_error = e
-                logger.warning(
-                    "%s error (attempt %d/%d): %s",
-                    label, attempt + 1, self.MAX_RETRIES, e,
-                )
-                if attempt < self.MAX_RETRIES - 1:
-                    time.sleep(2 ** attempt)
-        raise RuntimeError(f"{label}: all retries exhausted") from last_error
@@ -1,359 +0,0 @@
-#!/usr/bin/env python3
-"""Generate JSON intermediate representation from ``_parsed.json`` or ``_updated.json``.
-
-Sends the JSON document directly to the LLM for analysis. If the document exceeds
-``MAX_ANALYSIS_TOKENS``, sections are batched greedily without splitting any
-individual section. Conflict corrections from ``resolved_conflicts`` are included
-so the output respects user arbitration decisions.
-
-Usage::
-
-    python scripts/ir_generator.py output/<basename>_updated.json [output_dir] [--dry-run]
-
-Output: ``<basename>_ir.json``
-"""
-
-import argparse
-import json
-import logging
-import os
-import sys
-import time
-
-sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
-
-from LLM import LLMClient
-
-logging.basicConfig(
-    level=logging.INFO,
-    format="%(asctime)s [%(levelname)s] %(message)s",
-    datefmt="%Y-%m-%d %H:%M:%S",
-)
-
-logger = logging.getLogger(__name__)
-
-# ---------------------------------------------------------------------------
-# Configuration
-# ---------------------------------------------------------------------------
-
-RATE_LIMIT_DELAY = 0.5
-MAX_ANALYSIS_TOKENS = 6000  # max content size per LLM call
-
-
-# ---------------------------------------------------------------------------
-# Prompt
-# ---------------------------------------------------------------------------
-
-PROMPT = """你是一个需求文档分析助手。请分析以下需求文档的JSON内容，输出结构化JSON。
-
-## 已知修正（来自冲突检测）
-以下内容已确认修正，生成JSON时请**使用修正后的值**，不要同时输出两个版本。
-{conflict_context}
-
-## 待分析内容（JSON格式）
-
-{content}
-
-## JSON字段说明
- sections: 文档章节列表，每个章节含 source（章节标题）和 blocks（内容块数组）
- blocks: 类型含 para（段落，字段 text）和 table（表格，字段 rows，每行含 columns 数组）
- image_sources: 图片所在章节映射，key 为图片 rid
- image_analysis: 图片分析结果，每个含 rid、type（流程图/架构图/状态图等）、description
- resolved_conflicts: 已知修正列表，每个含 section、conflict_type、correction、source
-
-## 功能点定义
-
-只有满足以下**全部条件**的才视为功能点：
-1. 描述了一个**系统或软件要实现的具体行为**（有触发条件、执行动作、状态变化或逻辑规则）
-2. 该行为直接由**系统或框架**执行（不是人的操作流程、管理流程）
-3. 对用户或系统有**可观察的效果**
-
-**以下内容不是功能点，不要输出：**
- 术语/缩略词定义（
- 文档背景、范围说明（如 "本文档涵盖xxx"）
- 变更日志、版本记录、编制人信息
- 文档结构描述（如 "产品简介用户场景说明"）
- 纯文本的概述、没有具体行为的介绍
-
-## 决策树/流程图分解规则（重要）
-
-图片分析（image_analysis）中的流程图和决策树描述包含丰富的功能逻辑，**必须完全分解**：
-
-1. **每个叶子路径 = 一个独立 function**：从根节点到每个最终结果的完整路径，都拆成一个 function
-2. **每个判断分支 = 一个独立 function**：菱形判断节点的每个分支方向和对应的结果，单独作为一个 function
-3. **不同约束条件 = 不同 function**：例如"通过接入SDK限制"和"通过系统限制"是不同约束机制，必须分别列出
-4. **不要合并不同路径**：即使最终结果相同，只要到达路径不同，就是不同的 function
-
-## 输出格式
-
-只输出功能点，每个功能点格式如下：
-
-{
-  "function": "功能名称",
-  "source": {
-    "section": "章节名",
-    "location": "原文位置（如：正文第1段、表格1第2行、图片rId13）"
-  },
-  "trigger": {
-    "type": "AND或者OR",
-    "conditions": [
-      "触发条件1",
-      "触发条件2"
-    ]
-  },
-  "actions": {
-    "场景/角色": [
-      "动作1",
-      "动作2"
-    ]
-  }
-}
-
-## 输出原则
-
-1. **只输出功能点**，没有功能点就输出空数组 []
-2. 每个功能点**必须**包含 source.section 和 source.location
-3. location 必须是具体的原文位置标签（如 "正文第1段"、"表格1"、"图片rId13"）
-4. **一个 function 只对应一种行为逻辑（一条完整路径）**。决策树中的每个分支路径（从根到叶子）必须拆成独立 function，conditions 中明确写出该路径上的所有判断条件和分支方向。
-5. **穷举所有分支**：流程图/决策树中的每一条分支路径都要输出对应的 function，不能遗漏任何子逻辑。
-6. 没有 trigger 或 actions 的字段直接**省略**，不要写 null 或空列表/空对象
-7. 所有功能点全部列出，**宁多勿漏**
-8. **已知修正**中确认的信息，使用修正后的值
-9. 输出一个JSON数组，不要用 ```json 代码块包裹，直接输出纯JSON
-"""
-
-
-# ---------------------------------------------------------------------------
-# Helpers
-# ---------------------------------------------------------------------------
-
-def _parse_llm_response(raw: str) -> list | dict | str | None:
-    """Parse JSON from LLM response, handling markdown code fences."""
-    if raw is None:
-        return None
-    stripped = raw.strip()
-    if stripped.startswith("```"):
-        nl = stripped.find("\n")
-        stripped = stripped[nl + 1:] if nl != -1 else stripped[3:]
-    if stripped.endswith("```"):
-        stripped = stripped[:-3]
-    try:
-        return json.loads(stripped)
-    except json.JSONDecodeError:
-        logger.warning("  Failed to parse JSON, returning raw text")
-        return raw
-
-
-def _build_conflict_context(
-    section_name: str | None,
-    resolved_conflicts: list[dict],
-) -> str:
-    """Build conflict correction context for a section, or all if section_name is None."""
-    if section_name is None:
-        relevant = resolved_conflicts
-    else:
-        relevant = [c for c in resolved_conflicts if c.get("section", "") == section_name]
-    if not relevant:
-        return "没有"
-
-    lines: list[str] = []
-    for c in relevant:
-        correction = c.get("correction", "")
-        conflict_type = c.get("conflict_type", "")
-        source = c.get("source", "")
-        lines.append(f"- 冲突类型：{conflict_type}，依据：{source}")
-        lines.append(f"  修正后的值：{correction}")
-
-    return "\n".join(lines)
-
-
-# ---------------------------------------------------------------------------
-# LLM analysis
-# ---------------------------------------------------------------------------
-
-def _analyze_content(
-    content: str,
-    conflict_context: str,
-    llm: LLMClient,
-    *,
-    dry_run: bool = False,
-) -> list[dict]:
-    """Send content to the LLM and return IR entries."""
-    prompt = PROMPT.replace("{conflict_context}", conflict_context).replace("{content}", content)
-
-    if dry_run:
-        est = llm.estimate_tokens(prompt)
-        logger.info("  [DRY RUN] prompt ~%d tokens", est)
-        return []
-
-    try:
-        raw = llm.chat(
-            model=LLMClient.TEXT_MODEL,
-            messages=[{"role": "user", "content": prompt}],
-            response_format={"type": "json_object"},
-        )
-        logger.info("  Response: %d chars", len(raw))
-    except RuntimeError as e:
-        logger.error("  Analysis failed: %s", e)
-        return []
-
-    parsed = _parse_llm_response(raw)
-    if isinstance(parsed, list):
-        return parsed
-    elif isinstance(parsed, dict):
-        return [parsed]
-    else:
-        logger.warning("  Unparseable response, raw length: %d", len(raw))
-        return []
-
-
-# ---------------------------------------------------------------------------
-# Main
-# ---------------------------------------------------------------------------
-
-def generate_ir(
-    parsed_path: str,
-    output_dir: str = "output",
-    *,
-    dry_run: bool = False,
-) -> dict:
-    """Read parsed/updated JSON and generate JSON IR.
-
-    Produces ``<basename>_ir.json`` in *output_dir*.
-    """
-    with open(parsed_path, "r", encoding="utf-8") as f:
-        data = json.load(f)
-
-    basename = os.path.splitext(os.path.basename(parsed_path))[0]
-    for suffix in ("_parsed", "_updated"):
-        if basename.endswith(suffix):
-            basename = basename[:-len(suffix)]
-            break
-    os.makedirs(output_dir, exist_ok=True)
-
-    llm = LLMClient()
-    ir_output: list[dict] = []
-
-    sections = data.get("sections", [])
-    image_sources = data.get("image_sources", {})
-    image_analysis = data.get("image_analysis", [])
-    resolved_conflicts = data.get("resolved_conflicts", [])
-
-    # Build full document JSON to measure size
-    full_doc = {
-        "sections": sections,
-        "image_sources": image_sources,
-        "image_analysis": image_analysis,
-    }
-    full_json = json.dumps(full_doc, ensure_ascii=False)
-    total_chars = len(full_json)
-    logger.info("Total document JSON chars: %d", total_chars)
-
-    if total_chars < MAX_ANALYSIS_TOKENS:
-        logger.info("Document fits in one request (< %d chars)", MAX_ANALYSIS_TOKENS)
-        conflict_ctx = _build_conflict_context(None, resolved_conflicts)
-        entries = _analyze_content(full_json, conflict_ctx, llm, dry_run=dry_run)
-        ir_output.extend(entries)
-    else:
-        logger.info("Document is large (>= %d chars), batching sections", MAX_ANALYSIS_TOKENS)
-
-        # Filter to non-empty sections, measure effective size per section
-        # (section JSON + image_sources + image_analysis for images in that section)
-        sec_sizes = []
-        for sec in sections:
-            if not sec.get("blocks"):
-                continue
-            sec_json = json.dumps(sec, ensure_ascii=False)
-            sec_chars = len(sec_json)
-            # Add image overhead for this section
-            sec_name = sec.get("source", "")
-            sec_rids = [rid for rid, src in image_sources.items()
-                        if src.get("section", "") == sec_name]
-            if sec_rids:
-                overhead_doc = {
-                    "image_sources": {rid: image_sources[rid] for rid in sec_rids},
-                    "image_analysis": [img for img in image_analysis
-                                       if img.get("rid", "") in sec_rids],
-                }
-                sec_chars += len(json.dumps(overhead_doc, ensure_ascii=False))
-            sec_sizes.append((sec, sec_chars))
-
-        # Greedy batch: never split a section, keep adding until next exceeds limit
-        i = 0
-        while i < len(sec_sizes):
-            batch = []
-            batch_size = 0
-            while i < len(sec_sizes) and batch_size + sec_sizes[i][1] <= MAX_ANALYSIS_TOKENS:
-                batch.append(sec_sizes[i][0])
-                batch_size += sec_sizes[i][1]
-                i += 1
-
-            if not batch:
-                i += 1
-                continue
-
-            # Collect sections and their images for this batch
-            batch_names = [s.get("source", "") for s in batch]
-            batch_image_sources = {
-                rid: src for rid, src in image_sources.items()
-                if src.get("section", "") in batch_names
-            }
-            batch_images = [
-                img for img in image_analysis
-                if image_sources.get(img.get("rid", ""), {}).get("section", "") in batch_names
-            ]
-
-            batch_doc = {
-                "sections": batch,
-                "image_sources": batch_image_sources,
-                "image_analysis": batch_images,
-            }
-            batch_json = json.dumps(batch_doc, ensure_ascii=False)
-
-            # Merge conflict contexts
-            ctx_parts = []
-            for sn in batch_names:
-                ctx = _build_conflict_context(sn, resolved_conflicts)
-                if ctx != "没有":
-                    ctx_parts.append(ctx)
-            conflict_ctx = "\n".join(ctx_parts) if ctx_parts else "没有"
-
-            label = " + ".join(batch_names)
-            logger.info("Batch [%s]: %d sections, %d chars", label, len(batch), len(batch_json))
-            entries = _analyze_content(batch_json, conflict_ctx, llm, dry_run=dry_run)
-            ir_output.extend(entries)
-            time.sleep(RATE_LIMIT_DELAY)
-
-    # ---- save ----------------------------------------------------------------
-    ir_path = os.path.join(output_dir, f"{basename}_ir.json")
-    os.makedirs(os.path.dirname(ir_path) or ".", exist_ok=True)
-    with open(ir_path, "w", encoding="utf-8") as f:
-        json.dump(ir_output, f, ensure_ascii=False, indent=2)
-    logger.info("Saved: %s (%d entries)", ir_path, len(ir_output))
-
-    # ---- summary -------------------------------------------------------------
-    usg = llm.usage
-    logger.info("Tokens: %d prompt + %d completion = %d total",
-                usg["prompt_tokens"], usg["completion_tokens"], usg["total_tokens"])
-    logger.info("Output: %s", ir_path)
-
-    return {"ir": ir_output, "path": ir_path}
-
-
-# ---------------------------------------------------------------------------
-# CLI
-# ---------------------------------------------------------------------------
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(
-        description="Generate JSON intermediate representation from parsed/updated JSON.",
-    )
-    parser.add_argument("input", metavar="parsed.json",
-                        help="Path to _parsed.json or _updated.json")
-    parser.add_argument("output_dir", nargs="?", default="output", metavar="output_dir",
-                        help="Directory for output files (default: output/)")
-    parser.add_argument("--dry-run", action="store_true",
-                        help="Print token estimates without calling the API.")
-
-    args = parser.parse_args()
-    generate_ir(args.input, args.output_dir, dry_run=args.dry_run)