sync: update all skills from latest workspace code

doc_parser_skill:
- New: verify_flowchart.py (flowchart validation)
- Updated: LLM.py (multi-provider: DeepSeek + DashScope)
- Updated: image_parser.py (logic tree support, external prompts)
- Updated: SKILL.md, prompts/image_prompt.md

conflict_detection_skill:
- Updated: LLM.py (multi-provider sync)
- Updated: detect_conflicts.py (logic tree text conversion)

ir_generation_skill:
- Replaced old scripts/LLM.py + ir_generator.py with standalone project
- New: main.py, config.py, step1-3_*.py, ensemble_merge.py
- New: prompts/, tests/ subdirectories

tests:
- New: acceptance/ test suite with schema validation
- Fixed: conftest no longer globally skips non-acceptance tests
- Updated: test_sample.py for new ir_generation structure

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-30 22:45:08 +08:00
parent db64df2da1
commit fec4c09ee0
35 changed files with 8021 additions and 530 deletions
@@ -0,0 +1 @@
+# Tests package for document_analyzer
@@ -0,0 +1 @@
+# QE Acceptance Tests for document_analyzer
@@ -0,0 +1,186 @@
+"""Pytest configuration and shared fixtures for QE acceptance tests.
+
+Usage::
+
+    pytest tests/acceptance/ -v --run-acceptance [--acceptance-runs=3]
+
+Environment variables:
+    DASHSCOPE_API_KEY  — LLM API key (required for Layers B/C)
+    TEST_IR_PATH       — path to IR JSON to validate (default: ir_final.json sample)
+    TEST_PARSED_PATH   — path to _parsed.json or _updated.json for coverage analysis
+"""
+
+from __future__ import annotations
+
+import json
+import os
+import sys
+import tempfile
+from pathlib import Path
+from typing import Any
+
+import pytest
+
+# ── Path setup ──────────────────────────────────────────────────────────────
+
+_PROJECT_ROOT = Path(__file__).resolve().parent.parent.parent
+sys.path.insert(0, str(_PROJECT_ROOT))
+
+
+def _skill_path(skill_name: str) -> str:
+    return str(_PROJECT_ROOT / "skills" / skill_name / "scripts")
+
+
+# ── pytest configuration ────────────────────────────────────────────────────
+
+
+def pytest_addoption(parser):
+    parser.addoption(
+        "--run-acceptance",
+        action="store_true",
+        default=False,
+        help="Run QE acceptance tests (requires DASHSCOPE_API_KEY)",
+    )
+    parser.addoption(
+        "--acceptance-runs",
+        type=int,
+        default=1,
+        help="Number of IR generation runs for Layer B stability testing (default: 1 = skip)",
+    )
+    parser.addoption(
+        "--ir-path",
+        type=str,
+        default=None,
+        help="Path to IR JSON file to validate",
+    )
+    parser.addoption(
+        "--parsed-path",
+        type=str,
+        default=None,
+        help="Path to _parsed.json or _updated.json for coverage analysis",
+    )
+
+
+def pytest_configure(config):
+    config.addinivalue_line(
+        "markers",
+        "acceptance: QE acceptance test (requires --run-acceptance flag and DASHSCOPE_API_KEY)",
+    )
+
+
+def pytest_collection_modifyitems(config, items):
+    acceptance_dir = str(_PROJECT_ROOT / "tests" / "acceptance")
+    acceptance_items = [i for i in items if str(i.fspath).startswith(acceptance_dir)]
+    non_acceptance_items = [i for i in items if not str(i.fspath).startswith(acceptance_dir)]
+
+    if not config.getoption("--run-acceptance"):
+        skip_msg = pytest.mark.skip(reason="Need --run-acceptance flag to run")
+        for item in acceptance_items:
+            item.add_marker(skip_msg)
+        # Don't skip non-acceptance tests
+        return
+
+    if not os.environ.get("DASHSCOPE_API_KEY"):
+        skip_msg = pytest.mark.skip(reason="DASHSCOPE_API_KEY not set")
+        for item in acceptance_items:
+            item.add_marker(skip_msg)
+
+
+# ── Shared fixtures ─────────────────────────────────────────────────────────
+
+
+@pytest.fixture(scope="session")
+def project_root() -> Path:
+    """Session-scoped fixture exposing the module-level ``_PROJECT_ROOT`` path."""
+    return _PROJECT_ROOT
+
+
+@pytest.fixture(scope="session")
+def ir_path(request) -> str:
+    """Path to the IR JSON file under test."""
+    path = (
+        request.config.getoption("--ir-path")
+        or os.environ.get("TEST_IR_PATH")
+        or str(
+            Path.home()
+            / ".openclaw/workspace/skills/doc_parser_skill/output/ir_final.json"
+        )
+    )
+    if not os.path.exists(path):
+        pytest.skip(f"IR file not found: {path}")
+    return path
+
+
+@pytest.fixture(scope="session")
+def ir_data(ir_path: str) -> dict:
+    """Load the IR JSON data."""
+    with open(ir_path, "r", encoding="utf-8") as f:
+        return json.load(f)
+
+
+@pytest.fixture(scope="session")
+def parsed_path(request) -> str | None:
+    """Path to the corresponding _parsed.json or _updated.json."""
+    path = (
+        request.config.getoption("--parsed-path")
+        or os.environ.get("TEST_PARSED_PATH")
+        or str(
+            _PROJECT_ROOT
+            / "skills/ir_generation_skill/车机娱乐系统禁止功能文档_精简_updated.json"
+        )
+    )
+    if os.path.exists(path):
+        return path
+    return None
+
+
+@pytest.fixture(scope="session")
+def parsed_data(parsed_path: str | None) -> dict | None:
+    """Load the parsed document JSON for coverage analysis."""
+    if parsed_path is None:
+        return None
+    with open(parsed_path, "r", encoding="utf-8") as f:
+        return json.load(f)
+
+
+@pytest.fixture(scope="session")
+def llm_client():
+    """Create an LLMClient instance for acceptance tests.
+
+    The ``doc_parser_skill`` scripts directory is prepended to ``sys.path`` so
+    that ``LLM`` can be imported as a top-level module, then ``LLMClient`` is
+    instantiated with no arguments.
+    """
+    # NOTE(review): the client is presumably configured from environment
+    # variables such as DASHSCOPE_API_KEY — confirm against LLM.py.
+    sys.path.insert(0, _skill_path("doc_parser_skill"))
+    from LLM import LLMClient
+
+    return LLMClient()
+
+
+@pytest.fixture(scope="session")
+def acceptance_runs(request) -> int:
+    """Return the value of the ``--acceptance-runs`` option (falls back to 1)."""
+    return request.config.getoption("--acceptance-runs", default=1)
+
+
+# ── Pipeline runner ─────────────────────────────────────────────────────────
+
+
+@pytest.fixture(scope="session")
+def run_ir_pipeline():
+    """Return a callable that runs the IR generation pipeline on a parsed JSON.
+
+    Usage::
+
+        ir_data, ir_path = run_ir_pipeline(parsed_json_path, output_dir)
+
+    When the fixture is set up, the ``ir_generation_skill`` scripts directory is
+    prepended to ``sys.path`` and ``generate_ir`` is imported from
+    ``ir_generator``.
+    """
+    # NOTE(review): assumes ``ir_generator.generate_ir`` is still importable from
+    # the skill's scripts directory — confirm the module exists in the current layout.
+    sys.path.insert(0, _skill_path("ir_generation_skill"))
+    from ir_generator import generate_ir
+
+    def _run(parsed_path: str, output_dir: str | None = None) -> tuple[dict, str]:
+        """Run IR generation and return (ir_data, ir_path).
+
+        The first element is whatever ``generate_ir`` stored under ``"ir"``
+        (``[]`` if absent); the second is the value under ``"path"`` (``""`` if absent).
+        """
+        # Without an explicit output_dir a new temp directory is created; it is
+        # not removed by this function.
+        out = output_dir or tempfile.mkdtemp(prefix="qe_acceptance_")
+        result = generate_ir(parsed_path, out, dry_run=False)
+        ir_list = result.get("ir", [])
+        ir_path = result.get("path", "")
+        # NOTE(review): ir_generator presumably produces a list; it is returned
+        # as-is here, so any wrapping into the rich dict format is left to the caller.
+        return ir_list, ir_path
+
+    return _run
@@ -0,0 +1,325 @@
+"""Rich IR schema definition and validators for the document_analyzer QE framework.
+
+Target format is the production IR (``ir_final.json``):
+  {feature, feature_id, config_defaults?, rules: [{rule_id, path, description,
+    priority, sources, precondition, trigger, actions}]}
+"""
+
+from __future__ import annotations
+
+import re
+from typing import Any
+
+# ── Constants ────────────────────────────────────────────────────────────────
+
+# Allowed values for ``sources[].type``.
+VALID_SOURCE_TYPES = {"table", "logic_tree", "text"}
+# Allowed values for ``actions[].type``.
+VALID_ACTION_TYPES = {"system", "user_interaction"}
+# Allowed values for a rule's ``priority``.
+VALID_PRIORITIES = {"P0", "P1", "P2"}
+# Allowed values for ``trigger.operator``.
+VALID_TRIGGER_OPERATORS = {"AND", "OR"}
+
+# rule_id pattern: FEAT-NNN-SCOPE-TYPE-...-PATH-NN (variable middle segments).
+# Concretely: uppercase letters, "-", digits, then one or more "-UPPERCASE"
+# segments, and finally "-" followed by digits.
+RULE_ID_RE = re.compile(
+    r"^[A-Z]+-\d+(-[A-Z]+)+-\d+$"
+)
+
+
+# ── Validation helpers ──────────────────────────────────────────────────────
+
+def _check(condition: bool, message: str) -> list[str]:
+    """Return a list with an error message if *condition* is False, else empty list."""
+    return [] if condition else [message]
+
+
+def validate_rule(rule: dict, index: int = 0) -> list[str]:
+    """Validate a single rule dict. Returns a (possibly empty) list of error strings."""
+    errors: list[str] = []
+    label = f"rules[{index}]"
+
+    if not isinstance(rule, dict):
+        return [f"{label}: not a dict"]
+
+    # ── required top-level fields ──
+    for field in ("rule_id", "description"):
+        errors.extend(_check(
+            isinstance(rule.get(field), str) and bool(rule[field].strip()),
+            f'{label}.{field}: required non-empty string',
+        ))
+
+    # sources is a list, not a string — validated separately below
+
+    # ── rule_id naming ──
+    rid = rule.get("rule_id", "")
+    if rid and isinstance(rid, str):
+        errors.extend(_check(
+            bool(RULE_ID_RE.match(rid)),
+            f'{label}.rule_id: "{rid}" does not match pattern FEAT-NNN-SCOPE-TYPE-PATH-NN',
+        ))
+
+    # ── priority ──
+    priority = rule.get("priority")
+    if priority is not None:
+        errors.extend(_check(
+            priority in VALID_PRIORITIES,
+            f'{label}.priority: "{priority}" not in {VALID_PRIORITIES}',
+        ))
+
+    # ── path ──
+    path = rule.get("path")
+    if path is not None:
+        if not isinstance(path, list):
+            errors.append(f"{label}.path: must be a list")
+        elif len(path) == 0:
+            errors.append(f"{label}.path: must not be empty")
+        elif not all(isinstance(p, str) and p.strip() for p in path):
+            errors.append(f"{label}.path: all segments must be non-empty strings")
+
+    # ── sources[] ──
+    sources = rule.get("sources", [])
+    if not isinstance(sources, list):
+        errors.append(f"{label}.sources: must be a list")
+    elif len(sources) == 0:
+        errors.append(f"{label}.sources: must have at least one source")
+    else:
+        for si, src in enumerate(sources):
+            errors.extend(_validate_source(src, f"{label}.sources[{si}]"))
+
+    # ── precondition ──
+    precondition = rule.get("precondition")
+    if precondition is not None:
+        if not isinstance(precondition, dict):
+            errors.append(f"{label}.precondition: must be a dict")
+        elif len(precondition) == 0:
+            errors.append(f"{label}.precondition: must not be empty")
+
+    # ── trigger ──
+    trigger = rule.get("trigger")
+    if trigger is not None:
+        if not isinstance(trigger, dict):
+            errors.append(f"{label}.trigger: must be a dict")
+        else:
+            errors.extend(_validate_trigger(trigger, f"{label}.trigger"))
+
+    # ── actions ──
+    actions = rule.get("actions")
+    if actions is not None:
+        if not isinstance(actions, list):
+            errors.append(f"{label}.actions: must be a list")
+        else:
+            for ai, act in enumerate(actions):
+                errors.extend(_validate_action(act, f"{label}.actions[{ai}]"))
+
+    # ── no null values at any depth ──
+    errors.extend(_find_nulls(rule, label))
+
+    return errors
+
+
+def _validate_source(src: dict, label: str) -> list[str]:
+    errors: list[str] = []
+    if not isinstance(src, dict):
+        return [f"{label}: not a dict"]
+
+    stype = src.get("type", "")
+    errors.extend(_check(
+        stype in VALID_SOURCE_TYPES,
+        f'{label}.type: "{stype}" not in {VALID_SOURCE_TYPES}',
+    ))
+
+    priority = src.get("priority", "")
+    if priority:
+        errors.extend(_check(
+            priority in ("primary_source", "supplementary"),
+            f'{label}.priority: "{priority}" must be primary_source or supplementary',
+        ))
+
+    # type-specific fields
+    if stype == "table":
+        errors.extend(_check(
+            isinstance(src.get("section"), str) and bool(src["section"].strip()),
+            f"{label}.section: required non-empty string for table source",
+        ))
+        errors.extend(_check(
+            isinstance(src.get("row"), int),
+            f"{label}.row: required int for table source",
+        ))
+    elif stype == "logic_tree":
+        errors.extend(_check(
+            isinstance(src.get("image_id"), str) and bool(src["image_id"].strip()),
+            f"{label}.image_id: required non-empty string for logic_tree source",
+        ))
+        node_ids = src.get("node_ids", [])
+        errors.extend(_check(
+            isinstance(node_ids, list) and len(node_ids) > 0,
+            f"{label}.node_ids: required non-empty list for logic_tree source",
+        ))
+    elif stype == "text":
+        errors.extend(_check(
+            isinstance(src.get("section"), str) and bool(src["section"].strip()),
+            f"{label}.section: required non-empty string for text source",
+        ))
+
+    return errors
+
+
+def _validate_trigger(trigger: dict, label: str) -> list[str]:
+    errors: list[str] = []
+    operator = trigger.get("operator", "")
+    errors.extend(_check(
+        operator in VALID_TRIGGER_OPERATORS,
+        f'{label}.operator: "{operator}" not in {VALID_TRIGGER_OPERATORS}',
+    ))
+
+    conditions = trigger.get("conditions")
+    if conditions is not None:
+        if not isinstance(conditions, list):
+            errors.append(f"{label}.conditions: must be a list")
+        else:
+            for ci, cond in enumerate(conditions):
+                if not isinstance(cond, dict):
+                    errors.append(f"{label}.conditions[{ci}]: not a dict")
+                else:
+                    errors.extend(_check(
+                        isinstance(cond.get("signal"), str) and bool(cond["signal"].strip()),
+                        f"{label}.conditions[{ci}].signal: required non-empty string",
+                    ))
+                    errors.extend(_check(
+                        "operator" in cond,
+                        f"{label}.conditions[{ci}].operator: required",
+                    ))
+    # empty conditions is valid (e.g. "switch always off, no conditions")
+
+    return errors
+
+
+def _validate_action(action: dict, label: str) -> list[str]:
+    errors: list[str] = []
+    if not isinstance(action, dict):
+        return [f"{label}: not a dict"]
+
+    atype = action.get("type", "")
+    errors.extend(_check(
+        atype in VALID_ACTION_TYPES,
+        f'{label}.type: "{atype}" not in {VALID_ACTION_TYPES}',
+    ))
+    errors.extend(_check(
+        isinstance(action.get("description"), str) and bool(action["description"].strip()),
+        f"{label}.description: required non-empty string",
+    ))
+
+    return errors
+
+
+def _find_nulls(obj: Any, label: str) -> list[str]:
+    """Find any None values at any depth in *obj*."""
+    errors: list[str] = []
+    if obj is None:
+        return [f"{label}: null value"]
+    elif isinstance(obj, dict):
+        for k, v in obj.items():
+            errors.extend(_find_nulls(v, f"{label}.{k}"))
+    elif isinstance(obj, list):
+        for i, v in enumerate(obj):
+            errors.extend(_find_nulls(v, f"{label}[{i}]"))
+    return errors
+
+
+# ── Top-level validation ────────────────────────────────────────────────────
+
+def validate_ir(ir_data: dict) -> dict:
+    """Validate the entire IR document.
+
+    Returns:
+        {
+            "valid": bool,
+            "errors": [str, ...],
+            "stats": {total_rules, valid_rules, has_config_defaults, ...}
+        }
+    """
+    errors: list[str] = []
+    stats = {"total_rules": 0, "valid_rules": 0, "has_config_defaults": False, "features": 0}
+
+    if not isinstance(ir_data, dict):
+        return {"valid": False, "errors": ["IR root is not a dict"], "stats": stats}
+
+    # top-level required fields
+    for field in ("feature", "feature_id", "rules"):
+        if field not in ir_data:
+            errors.append(f"root.{field}: missing required field")
+        elif field in ("feature", "feature_id") and not (
+            isinstance(ir_data[field], str) and ir_data[field].strip()
+        ):
+            errors.append(f"root.{field}: must be non-empty string")
+
+    # config_defaults (optional)
+    if "config_defaults" in ir_data:
+        stats["has_config_defaults"] = True
+        cd = ir_data["config_defaults"]
+        if not isinstance(cd, dict):
+            errors.append("root.config_defaults: must be a dict")
+
+    # rules array
+    rules = ir_data.get("rules", [])
+    if not isinstance(rules, list):
+        errors.append("root.rules: must be a list")
+    else:
+        stats["total_rules"] = len(rules)
+        if len(rules) == 0:
+            errors.append("root.rules: must have at least one rule")
+        else:
+            for i, rule in enumerate(rules):
+                rule_errors = validate_rule(rule, i)
+                if rule_errors:
+                    errors.extend(rule_errors)
+                else:
+                    stats["valid_rules"] += 1
+
+    # feature count
+    if isinstance(ir_data.get("feature_id"), str):
+        stats["features"] = 1
+
+    return {
+        "valid": len(errors) == 0,
+        "errors": errors,
+        "stats": stats,
+    }
+
+
+# ── Summary helpers ─────────────────────────────────────────────────────────
+
+def schema_checklist(ir_data: dict) -> list[dict]:
+    """Run individual checks and return a checklist for reporting.
+
+    Each item: {"check": str, "passed": bool, "detail": str}
+    """
+    report = validate_ir(ir_data)
+    checks: list[dict] = []
+
+    def _add(name: str, passed: bool, detail: str = ""):
+        checks.append({"check": name, "passed": passed, "detail": detail})
+
+    # Top-level
+    _add("root is dict", isinstance(ir_data, dict))
+    _add("root.feature present", isinstance(ir_data.get("feature"), str) and bool(ir_data["feature"].strip()))
+    _add("root.feature_id present", isinstance(ir_data.get("feature_id"), str) and bool(ir_data["feature_id"].strip()))
+    _add("root.rules is non-empty list", isinstance(ir_data.get("rules"), list) and len(ir_data["rules"]) > 0)
+
+    # Per-rule checks
+    rules = ir_data.get("rules", []) if isinstance(ir_data, dict) else []
+    rule_ids = []
+    for i, rule in enumerate(rules):
+        if not isinstance(rule, dict):
+            continue
+        rid = rule.get("rule_id", f"rules[{i}]")
+        rule_ids.append(rid)
+
+        errs = validate_rule(rule, i)
+        _add(f"{rid}: valid", len(errs) == 0, "; ".join(errs) if errs else "")
+
+    # Aggregate checks
+    _add("no duplicate rule_ids", len(rule_ids) == len(set(rule_ids)),
+         f"duplicates: {[r for r in rule_ids if rule_ids.count(r) > 1]}" if len(rule_ids) != len(set(rule_ids)) else "")
+
+    _add("all rules valid", report["valid"],
+         f"{report['stats']['valid_rules']}/{report['stats']['total_rules']} valid")
+
+    return checks
@@ -0,0 +1,178 @@
+"""Structured JSON report generation for QE acceptance test results.
+
+Produces a unified report with three-layer verdict:
+  Layer A – Schema compliance
+  Layer B – Structural coverage + stability
+  Layer C – LLM QE expert audit
+
+Final verdict: PASS (releasable) or FAIL (blocked).
+"""
+
+from __future__ import annotations
+
+import json
+import time
+from pathlib import Path
+from typing import Any
+
+
+def generate_report(
+    schema_result: dict,
+    coverage_result: dict,
+    audit_result: dict | None,
+    *,
+    commit: str = "",
+    branch: str = "main",
+    output_path: str | None = None,
+) -> dict:
+    """Assemble the three-layer report and return it.
+
+    Args:
+        schema_result: ``{"verdict": "PASS"|"FAIL", "total_checks": N, "passed": N, "failed": N}``
+        coverage_result: ``{"verdict": "PASS"|"FAIL", "coverage_rate": float,
+                           "stability": {"runs": N, "values": [...], "std": float}}``
+        audit_result: ``{"verdict": "ACCEPT"|"REJECT", "inadequate_ratio": float,
+                         "rationale": str, "section_assessments": [...]}`` or None
+        commit: git commit SHA
+        branch: branch name
+        output_path: if set, write the report JSON to this path
+
+    Returns the report dict.
+    """
+    layers: dict[str, Any] = {
+        "A_schema": schema_result,
+        "B_coverage": coverage_result,
+    }
+    if audit_result is not None:
+        layers["C_qe_audit"] = audit_result
+
+    # ── final verdict ──
+    a_pass = schema_result.get("verdict") == "PASS"
+    b_pass = coverage_result.get("verdict") == "PASS"
+    c_pass = (
+        audit_result is None
+        or audit_result.get("verdict") == "ACCEPT"
+    )
+    all_pass = a_pass and b_pass and c_pass
+
+    report = {
+        "timestamp": time.strftime("%Y-%m-%dT%H:%M:%S"),
+        "commit": commit,
+        "branch": branch,
+        "layers": layers,
+        "final_verdict": "PASS" if all_pass else "FAIL",
+        "releasable": all_pass,
+        "failure_details": _failure_details(layers),
+    }
+
+    if output_path:
+        out = Path(output_path)
+        out.parent.mkdir(parents=True, exist_ok=True)
+        out.write_text(json.dumps(report, ensure_ascii=False, indent=2), encoding="utf-8")
+
+    return report
+
+
+def _failure_details(layers: dict) -> list[str]:
+    """Summarise which layers failed and why."""
+    details: list[str] = []
+
+    schema = layers.get("A_schema", {})
+    if schema.get("verdict") != "PASS":
+        details.append(
+            f"Layer A (Schema): {schema.get('failed', '?')}/{schema.get('total_checks', '?')} checks failed"
+        )
+
+    coverage = layers.get("B_coverage", {})
+    if coverage.get("verdict") != "PASS":
+        cv = coverage.get("coverage_rate", "?")
+        details.append(f"Layer B (Coverage): rate={cv} (threshold: 0.70)")
+
+    audit = layers.get("C_qe_audit", {})
+    if audit.get("verdict") == "REJECT":
+        details.append(
+            f"Layer C (QE Audit): REJECT — inadequate_ratio={audit.get('inadequate_ratio', '?')}"
+        )
+
+    return details
+
+
+# ── Layer-specific result builders ──────────────────────────────────────────
+
+def schema_verdict(errors: list[str], stats: dict) -> dict:
+    """Build Layer A result from schema validation errors & stats."""
+    total = stats.get("total_rules", 0)
+    valid = stats.get("valid_rules", 0)
+    failed_checks = len(errors) + (total - valid)
+
+    return {
+        "verdict": "PASS" if failed_checks == 0 else "FAIL",
+        "total_checks": max(total, 1),  # at minimum, we checked the root
+        "passed": valid if failed_checks == 0 else valid,
+        "failed": failed_checks,
+        "rule_pass_rate": round(valid / max(total, 1), 2) if total > 0 else 0,
+        "sample_errors": errors[:10],  # first 10 for the report
+    }
+
+
+def coverage_verdict(
+    coverage_rate: float,
+    stability_std: float,
+    stability_values: list[float],
+    *,
+    coverage_threshold: float = 0.70,
+    stability_threshold: float = 0.05,
+    section_coverage: dict | None = None,
+    table_coverage: dict | None = None,
+    diagram_coverage: dict | None = None,
+) -> dict:
+    """Build Layer B result from coverage metrics."""
+    b1_pass = coverage_rate >= coverage_threshold
+    b2_pass = stability_std <= stability_threshold
+    both_pass = b1_pass and b2_pass
+
+    result: dict[str, Any] = {
+        "verdict": "PASS" if both_pass else "FAIL",
+        "coverage_rate": round(coverage_rate, 3),
+        "coverage_threshold": coverage_threshold,
+        "coverage_pass": b1_pass,
+        "stability": {
+            "runs": len(stability_values),
+            "values": [round(v, 3) for v in stability_values],
+            "std": round(stability_std, 4),
+            "threshold": stability_threshold,
+            "pass": b2_pass,
+        },
+    }
+
+    if section_coverage:
+        result["section_coverage"] = section_coverage
+    if table_coverage:
+        result["table_coverage"] = table_coverage
+    if diagram_coverage:
+        result["diagram_coverage"] = diagram_coverage
+
+    return result
+
+
+def audit_verdict(audit_data: dict, *, inadequate_threshold: float = 0.30) -> dict:
+    """Build Layer C result from LLM QE audit.
+
+    *audit_data* should contain:
+        inadequate_ratio: float
+        rationale: str
+        section_assessments: list[dict]
+    """
+    ratio = audit_data.get("inadequate_ratio", 1.0)
+    passed = ratio <= inadequate_threshold
+
+    return {
+        "verdict": "ACCEPT" if passed else "REJECT",
+        "inadequate_ratio": round(ratio, 3),
+        "threshold": inadequate_threshold,
+        "rationale": audit_data.get("rationale", ""),
+        "total_sections": audit_data.get("total_functional_sections", 0),
+        "adequate": audit_data.get("adequate", 0),
+        "inadequate": audit_data.get("inadequate", 0),
+        "not_applicable": audit_data.get("not_applicable", 0),
+    }
@@ -0,0 +1,558 @@
+"""QE Acceptance Test — Three-layer main branch health check.
+
+Layer A (Schema):   structural correctness of IR
+Layer B (Coverage): structural source-traceability coverage + stability
+Layer C (QE Audit): LLM as QE expert — functional coverage assessment
+
+Final verdict: all three layers must pass for main to be releasable.
+"""
+
+from __future__ import annotations
+
+import json
+import math
+import os
+import re
+import statistics
+import tempfile
+import time
+from pathlib import Path
+from typing import Any
+
+import pytest
+
+from .ir_schema import validate_ir, schema_checklist
+from .report import generate_report, schema_verdict, coverage_verdict, audit_verdict
+
+
+# ═══════════════════════════════════════════════════════════════════════════════
+# Layer A: SCHEMA — deterministic structural validation
+# ═══════════════════════════════════════════════════════════════════════════════
+
+def test_layer_a_schema(ir_data: dict, request):
+    """Validate IR structure: required fields, types, naming conventions, no nulls."""
+    report = validate_ir(ir_data)
+    checks = schema_checklist(ir_data)
+
+    # Build Layer A result
+    a_errors = report["errors"]
+    a_stats = report["stats"]
+    a_result = schema_verdict(a_errors, a_stats)
+    a_result["checks"] = checks
+
+    # Store for downstream layers & report
+    _stash(request, "layer_a", a_result)
+
+    # Assert
+    assert report["valid"], (
+        f"Schema validation FAILED ({len(a_errors)} errors)\n"
+        + "\n".join(f"  - {e}" for e in a_errors[:20])
+    )
+
+
+# ═══════════════════════════════════════════════════════════════════════════════
+# Layer B: STRUCTURAL COVERAGE + STABILITY
+# ═══════════════════════════════════════════════════════════════════════════════
+
+# Section titles that are NOT functional requirements.
+# Each entry is compiled once; callers use ``search``, so a pattern matches
+# anywhere inside a title, not only at its start.
+NON_FUNCTIONAL_PATTERNS = [
+    re.compile(p) for p in [
+        r"编制.*变更.*日志",  # authoring / change log
+        r"文档背景",  # document background
+        r"文档范围",  # document scope
+        r"术语解释",  # glossary
+        r"参考",  # references
+        r"附录",  # appendix
+        r"版本",  # version
+        r"变更记录",  # change record
+        r"目录",  # table of contents
+        r"前言",  # preface
+        r"概述",  # overview
+        r"简介",  # introduction
+        # NOTE(review): any title matching this also matches the plain "概述"
+        # pattern above, so this entry looks redundant under ``search``.
+        r"概述.*背景",  # overview ... background
+    ]
+]
+
+
+def _is_functional_section(section_name: str) -> bool:
+    """Heuristic: exclude background, glossary, changelog, scope sections.
+
+    Sections that are purely structural — preface, glossary, changelog — are excluded.
+    Sections with numbering like '3.1.1' are always considered functional.
+    """
+    # Numbered sections are functional
+    if _section_number(section_name) != section_name:
+        return True
+    for pat in NON_FUNCTIONAL_PATTERNS:
+        if pat.search(section_name):
+            return False
+    return True
+
+
+def _extract_content_units(parsed_data: dict) -> dict:
+    """Extract countable content units from parsed JSON.
+
+    Returns:
+        {"sections": [{"name": ..., "number": ...}, ...],
+         "table_rows": int, "diagram_images": [rid, ...]}
+    """
+    sections = parsed_data.get("sections", [])
+
+    functional_sections: list[dict] = []
+    total_table_rows = 0
+
+    for sec in sections:
+        name = sec.get("source", "")
+        if _is_functional_section(name):
+            functional_sections.append({
+                "name": name,
+                "number": _section_number(name),
+            })
+
+        for block in sec.get("blocks", []):
+            if block.get("type") == "table":
+                rows = block.get("rows", [])
+                total_table_rows += len(rows)
+
+    # Diagram-type images from image_analysis
+    diagram_rids: list[str] = []
+    for img in parsed_data.get("image_analysis", []):
+        img_type = img.get("type", "")
+        if img_type in ("flowchart", "logic_tree", "architecture",
+                        "state", "sequence", "activity"):
+            diagram_rids.append(img.get("rid", ""))
+
+    return {
+        "functional_sections": functional_sections,
+        "table_rows": total_table_rows,
+        "diagram_images": diagram_rids,
+    }
+
+
+def _section_number(section_name: str) -> str:
+    """Extract leading section number, e.g. '3.1.1 系统限制' → '3.1.1'."""
+    import re
+    m = re.match(r"^([\d.]+)", section_name)
+    return m.group(1) if m else section_name
+
+
+def _section_matches(sec_ref: str, func_sections: list[dict]) -> str | None:
+    """Find a functional section matching *sec_ref*. Returns the section name or None.
+
+    Matching: exact match → starts-with match → number match → substring match.
+    """
+    # exact
+    for s in func_sections:
+        if s["name"] == sec_ref:
+            return s["name"]
+    # starts with section number
+    for s in func_sections:
+        if s["name"].startswith(sec_ref) or sec_ref.startswith(s["name"]):
+            return s["name"]
+    # number match
+    sec_num = _section_number(sec_ref)
+    if sec_num:
+        for s in func_sections:
+            if s["number"] == sec_num:
+                return s["name"]
+    # substring
+    for s in func_sections:
+        if sec_ref in s["name"] or s["name"] in sec_ref:
+            return s["name"]
+    return None
+
+
+def _measure_coverage(ir_data: dict, parsed_data: dict) -> dict:
+    """Compute structural coverage of IR over parsed document.
+
+    Returns:
+        {
+            "section_coverage": {total, covered, rate, uncovered},
+            "table_coverage": {total_rows, covered_rows, rate},
+            "diagram_coverage": {total, covered, rate},
+            "overall_rate": float,
+        }
+    """
+    units = _extract_content_units(parsed_data)
+    rules = ir_data.get("rules", [])
+
+    # ── section coverage ──
+    func_sections = units["functional_sections"]
+    covered_sections: set[str] = set()
+    for rule in rules:
+        for src in rule.get("sources", []):
+            sec_ref = src.get("section", "")
+            if sec_ref:
+                matched = _section_matches(sec_ref, func_sections)
+                if matched:
+                    covered_sections.add(matched)
+
+    section_coverage = {
+        "total": len(func_sections),
+        "covered": len(covered_sections),
+        "rate": round(len(covered_sections) / max(len(func_sections), 1), 3),
+        "uncovered": [s["name"] for s in func_sections
+                      if s["name"] not in covered_sections],
+    }
+
+    # ── table row coverage ──
+    covered_rows: set[tuple] = set()
+    for rule in rules:
+        for src in rule.get("sources", []):
+            if src.get("type") == "table":
+                sec = src.get("section", "")
+                row = src.get("row")
+                if sec and row is not None:
+                    covered_rows.add((sec, row))
+
+    total_rows = units["table_rows"]
+    table_coverage = {
+        "total_rows": total_rows,
+        "covered_rows": len(covered_rows),
+        "rate": round(len(covered_rows) / max(total_rows, 1), 3),
+    }
+
+    # ── diagram coverage ──
+    diagram_rids = units["diagram_images"]
+    covered_rids: set[str] = set()
+    for rule in rules:
+        for src in rule.get("sources", []):
+            if src.get("type") == "logic_tree":
+                img_id = src.get("image_id", "")
+                if img_id and img_id in diagram_rids:
+                    covered_rids.add(img_id)
+
+    diagram_coverage = {
+        "total": len(diagram_rids),
+        "covered": len(covered_rids),
+        "rate": round(len(covered_rids) / max(len(diagram_rids), 1), 3),
+        "uncovered": [r for r in diagram_rids if r not in covered_rids],
+    }
+
+    # ── overall ──
+    rates = [
+        section_coverage["rate"],
+        table_coverage["rate"],
+        diagram_coverage["rate"],
+    ]
+    overall = round(sum(rates) / len(rates), 3) if rates else 0.0
+
+    return {
+        "section_coverage": section_coverage,
+        "table_coverage": table_coverage,
+        "diagram_coverage": diagram_coverage,
+        "overall_rate": overall,
+    }
+
+
+def test_layer_b_coverage(
+    ir_data: dict,
+    parsed_data: dict | None,
+    ir_path: str,
+    acceptance_runs: int,
+    run_ir_pipeline,
+    request,
+):
+    """Layer B: check structural coverage and, on multi-run, its stability.
+
+    B1 measures how much of the parsed source the supplied IR covers.
+    B2 applies only when ``acceptance_runs`` exceeds 1 and the path given via
+    ``--parsed-path`` exists: the IR pipeline is re-run, every regenerated IR
+    is measured, and the standard deviation of the overall rates is taken.
+    The combined verdict is stashed under ``"layer_b"`` for later tests.
+    """
+    if parsed_data is None:
+        pytest.skip("No parsed JSON available for coverage analysis")
+
+    # ── B1: coverage of the IR under test ──
+    cov = _measure_coverage(ir_data, parsed_data)
+    stability_values: list[float] = [cov["overall_rate"]]
+
+    # ── B2: regenerate the IR and re-measure ──
+    # The option is only consulted when more than one run was requested.
+    parsed_path = (
+        request.config.getoption("--parsed-path") if acceptance_runs > 1 else None
+    )
+    if parsed_path and os.path.exists(parsed_path):
+        extra_runs = acceptance_runs - 1
+        for _run in range(extra_runs):
+            try:
+                regenerated, _ = run_ir_pipeline(parsed_path)
+                # The pipeline output is list-format; coverage reads the dict form.
+                rerun_cov = _measure_coverage(
+                    _wrap_list_ir(regenerated), parsed_data
+                )
+                stability_values.append(rerun_cov["overall_rate"])
+                time.sleep(0.5)  # rate limiting between runs
+            except Exception as e:
+                pytest.fail(f"Stability run failed: {e}")
+
+    multi_run = len(stability_values) > 1
+    stability_std = statistics.stdev(stability_values) if multi_run else 0.0
+
+    # Combine both measurements into the Layer B verdict and share it.
+    b_result = coverage_verdict(
+        coverage_rate=cov["overall_rate"],
+        stability_std=stability_std,
+        stability_values=stability_values,
+        section_coverage=cov["section_coverage"],
+        table_coverage=cov["table_coverage"],
+        diagram_coverage=cov["diagram_coverage"],
+    )
+    _stash(request, "layer_b", b_result)
+
+    # B1 must pass on its own; B2 is only asserted when it was measured.
+    assert b_result["coverage_pass"], (
+        f"Coverage {cov['overall_rate']:.1%} < threshold 70%\n"
+        f"  Sections: {cov['section_coverage']['covered']}/{cov['section_coverage']['total']} "
+        f"({cov['section_coverage']['rate']:.1%})\n"
+        f"  Uncovered: {cov['section_coverage']['uncovered']}\n"
+        f"  Table rows: {cov['table_coverage']['covered_rows']}/{cov['table_coverage']['total_rows']} "
+        f"({cov['table_coverage']['rate']:.1%})\n"
+        f"  Diagrams: {cov['diagram_coverage']['covered']}/{cov['diagram_coverage']['total']} "
+        f"({cov['diagram_coverage']['rate']:.1%})\n"
+        f"  Uncovered diagrams: {cov['diagram_coverage']['uncovered']}"
+    )
+
+    if multi_run:
+        assert b_result["stability"]["pass"], (
+            f"Coverage stability std={stability_std:.4f} > threshold 0.05\n"
+            f"  Values across {len(stability_values)} runs: {stability_values}"
+        )
+
+
+def _wrap_list_ir(ir_list: list) -> dict:
+    """Wrap a list-format IR into the dict shape that coverage measurement reads.
+
+    Each dict entry becomes one rule: ``function`` is used as the description,
+    ``trigger`` is carried over (an empty AND trigger when the key is absent),
+    and a ``source`` with a non-empty ``section`` becomes a single ``"text"``
+    source. Entries that are not dicts are dropped but still consume an index,
+    so a rule id reflects the entry's position in ``ir_list``.
+
+    A ``source`` that is missing, ``None`` or not a dict yields a rule with no
+    sources instead of raising; a ``None`` ``ir_list`` yields no rules.
+
+    NOTE(review): only ``"text"`` sources are produced, so rules wrapped here
+    cannot satisfy the ``"table"`` / ``"logic_tree"`` source checks used by the
+    coverage measurement — confirm that is intended for stability runs.
+
+    Args:
+        ir_list: Entries as returned by the IR pipeline.
+
+    Returns:
+        A dict with fixed ``feature`` / ``feature_id`` values and ``rules``.
+    """
+    rules = []
+    for i, entry in enumerate(ir_list or []):
+        if not isinstance(entry, dict):
+            continue
+        rule = {
+            "rule_id": f"GEN-001-RULE-{i:03d}",
+            "description": entry.get("function", ""),
+            "path": [],
+            "priority": "P2",
+            "sources": [],
+            "precondition": {},
+            "trigger": entry.get("trigger", {"operator": "AND", "conditions": []}),
+            "actions": [],
+        }
+        # A null or non-dict "source" must not abort the whole conversion.
+        src = entry.get("source")
+        section = src.get("section") if isinstance(src, dict) else None
+        if section:
+            rule["sources"].append({
+                "type": "text",
+                "section": section,
+                "paragraph": 1,
+                "text_snippet": src.get("location", ""),
+                "priority": "primary_source",
+            })
+        rules.append(rule)
+
+    return {
+        "feature": "generated",
+        "feature_id": "GEN-001",
+        "rules": rules,
+    }
+
+
+# ═══════════════════════════════════════════════════════════════════════════════
+# Layer C: LLM QE EXPERT AUDIT
+# ═══════════════════════════════════════════════════════════════════════════════
+
+# Prompt template for the Layer C audit. It is filled in with ``str.format``
+# using the ``coverage_summary``, ``parsed_content`` and ``ir_content`` fields,
+# which is why the literal braces of the JSON example are doubled
+# (``{{`` / ``}}``). The text is runtime data: changing it changes what the
+# model is asked.
+QE_AUDITOR_PROMPT = """你是一个资深 QE 专家，负责审查需求文档的 IR（中间表示层）是否充分覆盖了源文档的所有可测试功能点。
+
+你不是 IR 的生成者，你是独立的质量审计员。你的职责是判断 IR 的功能覆盖率是否充分。
+
+## 审计输入
+
+### Layer B 结构化覆盖率数据（参考）
+{coverage_summary}
+
+### 源文档内容（Parsed JSON）
+{parsed_content}
+
+### 生成的 IR（待审计）
+{ir_content}
+
+## 审计要求
+
+对源文档中的每个章节逐一评估其功能需求是否被 IR 充分覆盖。
+
+**判断标准**：
+- **adequate**（充分覆盖）：该章节的所有功能需求在 IR 中都有对应的 rule，包括触发条件、执行动作
+- **inadequate**（覆盖不足）：该章节存在功能需求未在 IR 中体现，或描述不完整（缺少触发条件或动作）
+- **not_applicable**（不适用）：该章节为背景介绍、术语定义、变更日志等，不包含功能需求
+
+**注意**：
+- 如果某个章节涉及多个决策路径（如流程图），检查 IR 是否覆盖了每条路径
+- 表格中的每个功能行都应被至少一个 IR rule 覆盖
+- 图片分析中的流程图/决策树节点应被 IR 引用
+
+## 输出格式
+
+请严格输出以下 JSON 格式（不要包含代码块标记）：
+
+{{
+  "total_functional_sections": <number>,
+  "adequate": <number>,
+  "inadequate": <number>,
+  "not_applicable": <number>,
+  "inadequate_ratio": <float>,
+  "verdict": "ACCEPT 或 REJECT",
+  "rationale": "<一句话说明接受或拒绝的理由>",
+  "section_assessments": [
+    {{
+      "section": "<章节名>",
+      "assessment": "adequate | inadequate | not_applicable",
+      "reason": "<评估理由>",
+      "missing": ["<缺失项1>", "<缺失项2>"]  // 仅 inadequate 时需要
+    }}
+  ]
+}}
+
+verdict 判定规则：
+- inadequate_ratio ≤ 0.30 → "ACCEPT"（风险可控）
+- inadequate_ratio > 0.30 → "REJECT"（功能点认知差异大，需要补充 IR）
+"""
+
+
+def test_layer_c_qe_audit(
+    ir_data: dict, parsed_data: dict | None, llm_client, request
+):
+    """Layer C: have an LLM audit how well the IR covers the parsed source.
+
+    The prompt combines the stashed Layer B summary (empty when nothing was
+    stashed under ``"layer_b"``), the parsed document and the IR. The model's
+    JSON reply is turned into a verdict, stashed under ``"layer_c"``, and the
+    test passes only when that verdict is ``"ACCEPT"``.
+
+    Skips when no parsed JSON is available; fails when the LLM call raises or
+    when the reply cannot be parsed as a JSON object.
+    """
+    if parsed_data is None:
+        pytest.skip("No parsed JSON available — cannot run QE audit")
+
+    # ── get Layer B summary for context ──
+    layer_b = _unstash(request, "layer_b") or {}
+    cov_summary = json.dumps(
+        {
+            "coverage_rate": layer_b.get("coverage_rate", "N/A"),
+            "section_coverage": layer_b.get("section_coverage", {}),
+            "diagram_coverage": layer_b.get("diagram_coverage", {}),
+        },
+        ensure_ascii=False,
+        indent=2,
+    )
+
+    # ── prepare content (trim to avoid token overflow) ──
+    # Truncation is by character count, so the embedded JSON may be cut
+    # mid-structure; the marker tells the model the input is partial.
+    parsed_str = json.dumps(parsed_data, ensure_ascii=False)
+    ir_str = json.dumps(ir_data, ensure_ascii=False)
+
+    max_parsed = 12000
+    max_ir = 8000
+    if len(parsed_str) > max_parsed:
+        parsed_str = parsed_str[:max_parsed] + "\n...[truncated]"
+    if len(ir_str) > max_ir:
+        ir_str = ir_str[:max_ir] + "\n...[truncated]"
+
+    prompt = QE_AUDITOR_PROMPT.format(
+        coverage_summary=cov_summary,
+        parsed_content=parsed_str,
+        ir_content=ir_str,
+    )
+
+    # ── call LLM ──
+    try:
+        raw = llm_client.chat(
+            model=llm_client.TEXT_MODEL,
+            messages=[{"role": "user", "content": prompt}],
+            response_format={"type": "json_object"},
+        )
+    except Exception as e:
+        pytest.fail(f"QE audit LLM call failed: {e}")
+
+    # ── parse response ──
+    audit_data = _parse_json_response(raw)
+    if audit_data is None:
+        # The parser also returns None for an empty or None reply, so ``raw``
+        # must not be sliced unconditionally (None[:500] raises TypeError).
+        preview = raw[:500] if raw else repr(raw)
+        pytest.fail(f"QE audit returned unparseable response:\n{preview}")
+
+    # Build Layer C result
+    c_result = audit_verdict(audit_data)
+    c_result["raw_assessments"] = audit_data.get("section_assessments", [])
+    _stash(request, "layer_c", c_result)
+
+    # Assert
+    assert c_result["verdict"] == "ACCEPT", (
+        f"QE Audit REJECTED — inadequate_ratio={c_result['inadequate_ratio']:.1%} > 30%\n"
+        f"  Rationale: {c_result['rationale']}\n"
+        f"  Adequate: {c_result['adequate']}, Inadequate: {c_result['inadequate']}"
+    )
+
+
+# ═══════════════════════════════════════════════════════════════════════════════
+# Final report (runs last)
+# ═══════════════════════════════════════════════════════════════════════════════
+
+def test_final_report(ir_data: dict, ir_path: str, request):
+    """Generate the final three-layer JSON report and enforce its outcome.
+
+    Results stashed by layers A/B/C are collected; a layer that stashed
+    nothing is reported with verdict ``SKIPPED``. The output path handed to
+    ``generate_report`` is the ``--json-report-file`` option when set,
+    otherwise ``acceptance-report.json`` in the current working directory.
+    A summary is printed, and the test fails when the report contains any
+    ``failure_details``.
+    """
+    # NOTE(review): ir_data / ir_path are not used in the body; presumably
+    # they are requested for their fixture side effects — confirm.
+    layer_a = _unstash(request, "layer_a") or {"verdict": "SKIPPED"}
+    layer_b = _unstash(request, "layer_b") or {"verdict": "SKIPPED"}
+    layer_c = _unstash(request, "layer_c") or {"verdict": "SKIPPED"}
+
+    report_path = request.config.getoption("--json-report-file", None) or str(
+        Path.cwd() / "acceptance-report.json"
+    )
+
+    report = generate_report(
+        layer_a,
+        layer_b,
+        layer_c,
+        commit=os.environ.get("GITEA_SHA", ""),
+        branch=os.environ.get("GITEA_BRANCH", "main"),
+        output_path=report_path,
+    )
+
+    # Print summary
+    print(f"\n{'='*60}")
+    print("QE ACCEPTANCE REPORT")
+    print(f"{'='*60}")
+    print(f"  Layer A (Schema):    {layer_a.get('verdict', '?')}")
+    print(f"  Layer B (Coverage):  {layer_b.get('verdict', '?')} "
+          f"(rate={layer_b.get('coverage_rate', '?')})")
+    print(f"  Layer C (QE Audit):  {layer_c.get('verdict', '?')}")
+    print(f"  {'─'*40}")
+    print(f"  FINAL: {report['final_verdict']}  |  "
+          f"Releasable: {report['releasable']}")
+    print(f"  Report: {report_path}")
+    print(f"{'='*60}\n")
+
+    # Fail if any layer failed (aggregate assertion)
+    failures = report.get("failure_details", [])
+    if failures:
+        pytest.fail(
+            "Acceptance tests FAILED:\n" + "\n".join(f"  - {f}" for f in failures)
+        )
+
+
+# ═══════════════════════════════════════════════════════════════════════════════
+# Helpers
+# ═══════════════════════════════════════════════════════════════════════════════
+
+import os  # noqa: E402
+
+# Scratchpad shared by the tests in this module: a test records its result
+# under a key, and tests that run afterwards look it up by the same key.
+_module_stash: dict[str, dict] = {}
+
+
+def _stash(request, key: str, value: dict):
+    """Record ``value`` under ``key`` so later tests in this module can read it.
+
+    ``request`` is accepted but not consulted.
+    """
+    _module_stash.update({key: value})
+
+
+def _unstash(request, key: str) -> dict | None:
+    """Return the result recorded under ``key``, or ``None`` if there is none.
+
+    ``request`` is accepted but not consulted.
+    """
+    try:
+        return _module_stash[key]
+    except KeyError:
+        return None
+
+
+def _parse_json_response(raw: str) -> dict | None:
+    """Parse a JSON object out of an LLM response.
+
+    Surrounding whitespace and a single Markdown code fence (with or without
+    a language tag on the opening line) are removed before decoding.
+
+    Args:
+        raw: The response text; may be empty or ``None``.
+
+    Returns:
+        The decoded object, or ``None`` when ``raw`` is empty, is not valid
+        JSON, or decodes to something other than a JSON object (callers use
+        the result as a dict).
+    """
+    if not raw:
+        return None
+    text = raw.strip()
+    if text.startswith("```"):
+        # Drop the whole opening fence line, including any language tag.
+        _, newline, rest = text.partition("\n")
+        text = rest if newline else text[3:]
+    if text.endswith("```"):
+        text = text[:-3]
+    try:
+        parsed = json.loads(text)
+    except json.JSONDecodeError:
+        return None
+    # json.loads also accepts arrays and scalars; those are not usable here.
+    return parsed if isinstance(parsed, dict) else None
@@ -55,12 +55,17 @@ def test_import_detect_conflicts():

 # -- IR generation tests ------------------------------------------------------

-def test_import_ir_generator():
-    """ir_generator module should be importable."""
+def test_import_ir_main():
+    """ir_generation main module should be importable (new project structure)."""
    os.environ.setdefault("DASHSCOPE_API_KEY", "test-fake-key")
-    _import_from_skill("ir_generation_skill", "ir_generator")
-    import ir_generator
-    assert hasattr(ir_generator, "generate_ir")
+    skill_dir = os.path.join(
+        os.path.dirname(os.path.dirname(__file__)),
+        "skills", "ir_generation_skill"
+    )
+    if skill_dir not in sys.path:
+        sys.path.insert(0, skill_dir)
+    import main
+    assert hasattr(main, "main")


 # -- Resolution application tests ---------------------------------------------
				`@@ -0,0 +1 @@`
				`# QE Acceptance Tests for document_analyzer`