sync: update all skills from latest workspace code
CI / test (push) Successful in 8s

doc_parser_skill:
- New: verify_flowchart.py (flowchart validation)
- Updated: LLM.py (multi-provider: DeepSeek + DashScope)
- Updated: image_parser.py (logic tree support, external prompts)
- Updated: SKILL.md, prompts/image_prompt.md

conflict_detection_skill:
- Updated: LLM.py (multi-provider sync)
- Updated: detect_conflicts.py (logic tree text conversion)

ir_generation_skill:
- Replaced old scripts/LLM.py + ir_generator.py with standalone project
- New: main.py, config.py, step1-3_*.py, ensemble_merge.py
- New: prompts/, tests/ subdirectories

tests:
- New: acceptance/ test suite with schema validation
- Fixed: conftest no longer globally skips non-acceptance tests
- Updated: test_sample.py for new ir_generation structure

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
2026-05-30 22:45:08 +08:00
parent db64df2da1
commit fec4c09ee0
35 changed files with 8021 additions and 530 deletions
+1
View File
@@ -0,0 +1 @@
# Tests package for document_analyzer
+1
View File
@@ -0,0 +1 @@
# QE Acceptance Tests for document_analyzer
+186
View File
@@ -0,0 +1,186 @@
"""Pytest configuration and shared fixtures for QE acceptance tests.
Usage::
pytest tests/acceptance/ -v --run-acceptance [--acceptance-runs=3]
Environment variables:
DASHSCOPE_API_KEY — LLM API key (required for Layers B/C)
TEST_IR_PATH — path to IR JSON to validate (default: ir_final.json sample)
TEST_PARSED_PATH — path to _parsed.json or _updated.json for coverage analysis
"""
from __future__ import annotations
import json
import os
import sys
import tempfile
from pathlib import Path
from typing import Any
import pytest
# ── Path setup ──────────────────────────────────────────────────────────────
# Project root, resolved as the third parent of this file's absolute path.
_PROJECT_ROOT = Path(__file__).resolve().parent.parent.parent
# Put the project root first on sys.path so imports resolve against it.
sys.path.insert(0, str(_PROJECT_ROOT))
def _skill_path(skill_name: str) -> str:
    """Return ``<project root>/skills/<skill_name>/scripts`` as a string."""
    scripts_dir = _PROJECT_ROOT.joinpath("skills", skill_name, "scripts")
    return str(scripts_dir)
# ── pytest configuration ────────────────────────────────────────────────────
def pytest_addoption(parser):
    """Register the command-line options used by the acceptance suite.

    Options are declared in a table and registered in order:
    ``--run-acceptance``, ``--acceptance-runs``, ``--ir-path``, ``--parsed-path``.
    """
    option_table = (
        ("--run-acceptance", {
            "action": "store_true",
            "default": False,
            "help": "Run QE acceptance tests (requires DASHSCOPE_API_KEY)",
        }),
        ("--acceptance-runs", {
            "type": int,
            "default": 1,
            "help": "Number of IR generation runs for Layer B stability testing (default: 1 = skip)",
        }),
        ("--ir-path", {
            "type": str,
            "default": None,
            "help": "Path to IR JSON file to validate",
        }),
        ("--parsed-path", {
            "type": str,
            "default": None,
            "help": "Path to _parsed.json or _updated.json for coverage analysis",
        }),
    )
    for flag, settings in option_table:
        parser.addoption(flag, **settings)
def pytest_configure(config):
    """Add the ``acceptance`` marker description to the ``markers`` ini value."""
    marker_line = (
        "acceptance: QE acceptance test "
        "(requires --run-acceptance flag and DASHSCOPE_API_KEY)"
    )
    config.addinivalue_line("markers", marker_line)
def pytest_collection_modifyitems(config, items):
    """Skip acceptance tests unless they are enabled and an API key is present.

    Only items collected from ``<project root>/tests/acceptance`` are touched;
    every other collected test is left alone. Acceptance items are skipped
    when ``--run-acceptance`` is absent, or when it is given but
    ``DASHSCOPE_API_KEY`` is not set in the environment.
    """
    # Match on a directory boundary so that a sibling such as
    # "tests/acceptance_old" is not mistaken for the acceptance suite.
    acceptance_prefix = str(_PROJECT_ROOT / "tests" / "acceptance") + os.sep
    acceptance_items = [
        item for item in items if str(item.fspath).startswith(acceptance_prefix)
    ]

    if not config.getoption("--run-acceptance"):
        reason = "Need --run-acceptance flag to run"
    elif not os.environ.get("DASHSCOPE_API_KEY"):
        reason = "DASHSCOPE_API_KEY not set"
    else:
        return

    skip_marker = pytest.mark.skip(reason=reason)
    for item in acceptance_items:
        item.add_marker(skip_marker)
# ── Shared fixtures ─────────────────────────────────────────────────────────
@pytest.fixture(scope="session")
def project_root() -> Path:
    """Return the module-level project root path."""
    return _PROJECT_ROOT
@pytest.fixture(scope="session")
def ir_path(request) -> str:
    """Resolve the path of the IR JSON file under test.

    Lookup order: the ``--ir-path`` option, the ``TEST_IR_PATH`` environment
    variable, then the ``ir_final.json`` sample under the user's home
    directory. The requesting test is skipped when the file does not exist.
    """
    candidate = request.config.getoption("--ir-path")
    if not candidate:
        candidate = os.environ.get("TEST_IR_PATH")
    if not candidate:
        sample = (
            Path.home()
            / ".openclaw/workspace/skills/doc_parser_skill/output/ir_final.json"
        )
        candidate = str(sample)

    if os.path.exists(candidate):
        return candidate
    pytest.skip(f"IR file not found: {candidate}")
@pytest.fixture(scope="session")
def ir_data(ir_path: str) -> dict:
    """Read the file at *ir_path* as UTF-8 and return the decoded JSON."""
    raw_text = Path(ir_path).read_text(encoding="utf-8")
    return json.loads(raw_text)
@pytest.fixture(scope="session")
def parsed_path(request) -> str | None:
    """Resolve the parsed-document JSON that corresponds to the IR.

    Lookup order: the ``--parsed-path`` option, the ``TEST_PARSED_PATH``
    environment variable, then the bundled sample under
    ``skills/ir_generation_skill``. Returns ``None`` when the resolved file
    does not exist.
    """
    candidate = request.config.getoption("--parsed-path")
    if not candidate:
        candidate = os.environ.get("TEST_PARSED_PATH")
    if not candidate:
        candidate = str(
            _PROJECT_ROOT
            / "skills/ir_generation_skill/车机娱乐系统禁止功能文档_精简_updated.json"
        )
    return candidate if os.path.exists(candidate) else None
@pytest.fixture(scope="session")
def parsed_data(parsed_path: str | None) -> dict | None:
    """Return the decoded parsed-document JSON, or ``None`` without a path."""
    if parsed_path is not None:
        return json.loads(Path(parsed_path).read_text(encoding="utf-8"))
    return None
@pytest.fixture(scope="session")
def llm_client():
    """Create an LLMClient instance for acceptance tests.

    Uses the DashScope-compatible LLMClient from the project.
    """
    # Make the doc_parser_skill scripts directory importable, then import
    # the client from its LLM module.
    sys.path.insert(0, _skill_path("doc_parser_skill"))
    from LLM import LLMClient
    return LLMClient()
@pytest.fixture(scope="session")
def acceptance_runs(request) -> int:
    """Return the ``--acceptance-runs`` option value (falls back to 1)."""
    return request.config.getoption("--acceptance-runs", default=1)
# ── Pipeline runner ─────────────────────────────────────────────────────────
@pytest.fixture(scope="session")
def run_ir_pipeline():
    """Return a callable that runs the IR generation pipeline on a parsed JSON.

    Usage::

        ir_data, ir_path = run_ir_pipeline(parsed_json_path, output_dir)

    The generator module is imported on the first call instead of at fixture
    set-up, so a test that requests this fixture but never invokes the
    pipeline does not error out when the generator cannot be imported.
    """

    def _run(parsed_path: str, output_dir: str | None = None) -> tuple[dict, str]:
        """Run IR generation and return ``(ir_data, ir_path)``."""
        scripts_dir = _skill_path("ir_generation_skill")
        if scripts_dir not in sys.path:
            sys.path.insert(0, scripts_dir)
        # NOTE(review): confirm ``ir_generator.generate_ir`` is still the
        # entry point of the ir_generation skill.
        from ir_generator import generate_ir

        out = output_dir or tempfile.mkdtemp(prefix="qe_acceptance_")
        result = generate_ir(parsed_path, out, dry_run=False)
        # ir_generator produces a list; callers wrap it into the rich format
        # when they need a dict.
        return result.get("ir", []), result.get("path", "")

    return _run
+325
View File
@@ -0,0 +1,325 @@
"""Rich IR schema definition and validators for the document_analyzer QE framework.
Target format is the production IR (``ir_final.json``):
{feature, feature_id, config_defaults?, rules: [{rule_id, path, description,
priority, sources, precondition, trigger, actions}]}
"""
from __future__ import annotations
import re
from typing import Any
# ── Constants ────────────────────────────────────────────────────────────────
# Allowed values for ``sources[].type``.
VALID_SOURCE_TYPES = {"table", "logic_tree", "text"}
# Allowed values for ``actions[].type``.
VALID_ACTION_TYPES = {"system", "user_interaction"}
# Allowed values for a rule's ``priority``.
VALID_PRIORITIES = {"P0", "P1", "P2"}
# Allowed values for ``trigger.operator``.
VALID_TRIGGER_OPERATORS = {"AND", "OR"}
# rule_id pattern: FEAT-NNN-SCOPE-TYPE-...-PATH-NN (variable middle segments)
# Hyphen-separated: an uppercase word, a number, one or more uppercase words,
# and a trailing number.
RULE_ID_RE = re.compile(
    r"^[A-Z]+-\d+(-[A-Z]+)+-\d+$"
)
# ── Validation helpers ──────────────────────────────────────────────────────
def _check(condition: bool, message: str) -> list[str]:
"""Return a list with an error message if *condition* is False, else empty list."""
return [] if condition else [message]
def validate_rule(rule: dict, index: int = 0) -> list[str]:
    """Validate a single rule and return a (possibly empty) list of errors.

    Args:
        rule: candidate rule object.
        index: position of the rule; used only to label error messages as
            ``rules[index]``.

    ``rule_id`` and ``description`` must be non-empty strings and ``sources``
    must be a non-empty list. ``priority``, ``path``, ``precondition``,
    ``trigger`` and ``actions`` are checked only when present. Any ``None``
    value at any depth is reported as an error.
    """
    errors: list[str] = []
    label = f"rules[{index}]"
    if not isinstance(rule, dict):
        return [f"{label}: not a dict"]

    # ── required top-level fields ──
    for field in ("rule_id", "description"):
        errors.extend(_check(
            isinstance(rule.get(field), str) and bool(rule[field].strip()),
            f'{label}.{field}: required non-empty string',
        ))
    # sources is a list, not a string — validated separately below

    # ── rule_id naming ──
    rid = rule.get("rule_id", "")
    if rid and isinstance(rid, str):
        errors.extend(_check(
            bool(RULE_ID_RE.match(rid)),
            f'{label}.rule_id: "{rid}" does not match pattern FEAT-NNN-SCOPE-TYPE-PATH-NN',
        ))

    # ── priority ──
    priority = rule.get("priority")
    if priority is not None:
        # The isinstance guard keeps an unhashable value (list/dict) from
        # raising TypeError inside the set-membership test.
        errors.extend(_check(
            isinstance(priority, str) and priority in VALID_PRIORITIES,
            f'{label}.priority: "{priority}" not in {VALID_PRIORITIES}',
        ))

    # ── path ──
    path = rule.get("path")
    if path is not None:
        if not isinstance(path, list):
            errors.append(f"{label}.path: must be a list")
        elif len(path) == 0:
            errors.append(f"{label}.path: must not be empty")
        elif not all(isinstance(p, str) and p.strip() for p in path):
            errors.append(f"{label}.path: all segments must be non-empty strings")

    # ── sources[] ──
    sources = rule.get("sources", [])
    if not isinstance(sources, list):
        errors.append(f"{label}.sources: must be a list")
    elif len(sources) == 0:
        errors.append(f"{label}.sources: must have at least one source")
    else:
        for si, src in enumerate(sources):
            errors.extend(_validate_source(src, f"{label}.sources[{si}]"))

    # ── precondition ──
    precondition = rule.get("precondition")
    if precondition is not None:
        if not isinstance(precondition, dict):
            errors.append(f"{label}.precondition: must be a dict")
        elif len(precondition) == 0:
            errors.append(f"{label}.precondition: must not be empty")

    # ── trigger ──
    trigger = rule.get("trigger")
    if trigger is not None:
        if not isinstance(trigger, dict):
            errors.append(f"{label}.trigger: must be a dict")
        else:
            errors.extend(_validate_trigger(trigger, f"{label}.trigger"))

    # ── actions ──
    actions = rule.get("actions")
    if actions is not None:
        if not isinstance(actions, list):
            errors.append(f"{label}.actions: must be a list")
        else:
            for ai, act in enumerate(actions):
                errors.extend(_validate_action(act, f"{label}.actions[{ai}]"))

    # ── no null values at any depth ──
    errors.extend(_find_nulls(rule, label))
    return errors
def _validate_source(src: dict, label: str) -> list[str]:
    """Validate one ``sources[]`` entry and return its error messages.

    ``type`` must be one of ``VALID_SOURCE_TYPES``. A non-empty ``priority``
    must be ``primary_source`` or ``supplementary``. Type-specific fields:
    ``table`` needs ``section`` and an integer ``row``; ``logic_tree`` needs
    ``image_id`` and a non-empty ``node_ids`` list; ``text`` needs ``section``.
    """
    errors: list[str] = []
    if not isinstance(src, dict):
        return [f"{label}: not a dict"]

    stype = src.get("type", "")
    # Guard with isinstance so an unhashable value (list/dict) is reported
    # as invalid instead of raising TypeError in the set lookup.
    errors.extend(_check(
        isinstance(stype, str) and stype in VALID_SOURCE_TYPES,
        f'{label}.type: "{stype}" not in {VALID_SOURCE_TYPES}',
    ))

    priority = src.get("priority", "")
    if priority:
        errors.extend(_check(
            priority in ("primary_source", "supplementary"),
            f'{label}.priority: "{priority}" must be primary_source or supplementary',
        ))

    # type-specific fields
    if stype == "table":
        errors.extend(_check(
            isinstance(src.get("section"), str) and bool(src["section"].strip()),
            f"{label}.section: required non-empty string for table source",
        ))
        row = src.get("row")
        # bool is a subclass of int; a JSON true/false is not a row index.
        errors.extend(_check(
            isinstance(row, int) and not isinstance(row, bool),
            f"{label}.row: required int for table source",
        ))
    elif stype == "logic_tree":
        errors.extend(_check(
            isinstance(src.get("image_id"), str) and bool(src["image_id"].strip()),
            f"{label}.image_id: required non-empty string for logic_tree source",
        ))
        node_ids = src.get("node_ids", [])
        errors.extend(_check(
            isinstance(node_ids, list) and len(node_ids) > 0,
            f"{label}.node_ids: required non-empty list for logic_tree source",
        ))
    elif stype == "text":
        errors.extend(_check(
            isinstance(src.get("section"), str) and bool(src["section"].strip()),
            f"{label}.section: required non-empty string for text source",
        ))
    return errors
def _validate_trigger(trigger: dict, label: str) -> list[str]:
    """Validate a rule's ``trigger`` dict and return its error messages.

    ``operator`` must be one of ``VALID_TRIGGER_OPERATORS``. ``conditions``,
    when present, must be a list of dicts, each with a non-empty string
    ``signal`` and an ``operator`` key. An empty ``conditions`` list is valid.
    """
    errors: list[str] = []
    operator = trigger.get("operator", "")
    # Guard with isinstance so an unhashable value is reported as invalid
    # instead of raising TypeError in the set lookup.
    errors.extend(_check(
        isinstance(operator, str) and operator in VALID_TRIGGER_OPERATORS,
        f'{label}.operator: "{operator}" not in {VALID_TRIGGER_OPERATORS}',
    ))

    conditions = trigger.get("conditions")
    if conditions is not None:
        if not isinstance(conditions, list):
            errors.append(f"{label}.conditions: must be a list")
        else:
            for ci, cond in enumerate(conditions):
                if not isinstance(cond, dict):
                    errors.append(f"{label}.conditions[{ci}]: not a dict")
                else:
                    errors.extend(_check(
                        isinstance(cond.get("signal"), str) and bool(cond["signal"].strip()),
                        f"{label}.conditions[{ci}].signal: required non-empty string",
                    ))
                    errors.extend(_check(
                        "operator" in cond,
                        f"{label}.conditions[{ci}].operator: required",
                    ))
    # empty conditions is valid (e.g. "switch always off, no conditions")
    return errors
def _validate_action(action: dict, label: str) -> list[str]:
    """Validate one ``actions[]`` entry and return its error messages.

    ``type`` must be one of ``VALID_ACTION_TYPES`` and ``description`` must be
    a non-empty string.
    """
    errors: list[str] = []
    if not isinstance(action, dict):
        return [f"{label}: not a dict"]

    atype = action.get("type", "")
    # Guard with isinstance so an unhashable value is reported as invalid
    # instead of raising TypeError in the set lookup.
    errors.extend(_check(
        isinstance(atype, str) and atype in VALID_ACTION_TYPES,
        f'{label}.type: "{atype}" not in {VALID_ACTION_TYPES}',
    ))
    errors.extend(_check(
        isinstance(action.get("description"), str) and bool(action["description"].strip()),
        f"{label}.description: required non-empty string",
    ))
    return errors
def _find_nulls(obj: Any, label: str) -> list[str]:
"""Find any None values at any depth in *obj*."""
errors: list[str] = []
if obj is None:
return [f"{label}: null value"]
elif isinstance(obj, dict):
for k, v in obj.items():
errors.extend(_find_nulls(v, f"{label}.{k}"))
elif isinstance(obj, list):
for i, v in enumerate(obj):
errors.extend(_find_nulls(v, f"{label}[{i}]"))
return errors
# ── Top-level validation ────────────────────────────────────────────────────
def validate_ir(ir_data: dict) -> dict:
    """Validate a whole IR document.

    Returns:
        ``{"valid": bool, "errors": [str, ...], "stats": {...}}`` where
        ``stats`` holds ``total_rules``, ``valid_rules``,
        ``has_config_defaults`` and ``features``.
    """
    stats = {"total_rules": 0, "valid_rules": 0, "has_config_defaults": False, "features": 0}
    if not isinstance(ir_data, dict):
        return {"valid": False, "errors": ["IR root is not a dict"], "stats": stats}

    problems: list[str] = []

    # Required root fields; only the two string fields get a content check.
    for key in ("feature", "feature_id", "rules"):
        if key not in ir_data:
            problems.append(f"root.{key}: missing required field")
            continue
        if key == "rules":
            continue
        value = ir_data[key]
        if not isinstance(value, str) or not value.strip():
            problems.append(f"root.{key}: must be non-empty string")

    # config_defaults is optional but must be a dict when present.
    if "config_defaults" in ir_data:
        stats["has_config_defaults"] = True
        if not isinstance(ir_data["config_defaults"], dict):
            problems.append("root.config_defaults: must be a dict")

    # Rules array: type, non-emptiness, then per-rule validation.
    rules = ir_data.get("rules", [])
    if not isinstance(rules, list):
        problems.append("root.rules: must be a list")
    elif not rules:
        problems.append("root.rules: must have at least one rule")
    else:
        stats["total_rules"] = len(rules)
        for position, rule in enumerate(rules):
            rule_problems = validate_rule(rule, position)
            if rule_problems:
                problems.extend(rule_problems)
            else:
                stats["valid_rules"] += 1

    # A document describes one feature whenever feature_id is a string.
    if isinstance(ir_data.get("feature_id"), str):
        stats["features"] = 1

    return {"valid": not problems, "errors": problems, "stats": stats}
# ── Summary helpers ─────────────────────────────────────────────────────────
def schema_checklist(ir_data: dict) -> list[dict]:
    """Run individual checks and return a checklist for reporting.

    Each item is ``{"check": str, "passed": bool, "detail": str}``. The
    function never raises on malformed input: a non-dict root, a non-list
    ``rules`` value or a non-string ``rule_id`` yields failed checks instead.
    """
    report = validate_ir(ir_data)
    checks: list[dict] = []

    def _add(name: str, passed: bool, detail: str = ""):
        checks.append({"check": name, "passed": passed, "detail": detail})

    def _non_empty_str(value) -> bool:
        return isinstance(value, str) and bool(value.strip())

    # Top-level. A non-dict root is treated as an empty document so the
    # remaining root checks fail cleanly instead of raising AttributeError.
    root_is_dict = isinstance(ir_data, dict)
    root = ir_data if root_is_dict else {}
    _add("root is dict", root_is_dict)
    _add("root.feature present", _non_empty_str(root.get("feature")))
    _add("root.feature_id present", _non_empty_str(root.get("feature_id")))
    rules = root.get("rules")
    rules_is_list = isinstance(rules, list)
    _add("root.rules is non-empty list", rules_is_list and len(rules) > 0)
    if not rules_is_list:
        rules = []

    # Per-rule checks
    rule_ids: list[str] = []
    for i, rule in enumerate(rules):
        if not isinstance(rule, dict):
            continue
        rid = rule.get("rule_id", f"rules[{i}]")
        if not isinstance(rid, str):
            # Non-string ids (possibly unhashable) fall back to the
            # positional label so duplicate detection cannot raise.
            rid = f"rules[{i}]"
        rule_ids.append(rid)
        errs = validate_rule(rule, i)
        _add(f"{rid}: valid", len(errs) == 0, "; ".join(errs) if errs else "")

    # Aggregate checks. Count once, then list every occurrence of a
    # repeated id.
    occurrences: dict[str, int] = {}
    for rid in rule_ids:
        occurrences[rid] = occurrences.get(rid, 0) + 1
    duplicates = [rid for rid in rule_ids if occurrences[rid] > 1]
    _add("no duplicate rule_ids", not duplicates,
         f"duplicates: {duplicates}" if duplicates else "")
    _add("all rules valid", report["valid"],
         f"{report['stats']['valid_rules']}/{report['stats']['total_rules']} valid")
    return checks
+178
View File
@@ -0,0 +1,178 @@
"""Structured JSON report generation for QE acceptance test results.
Produces a unified report with three-layer verdict:
Layer A Schema compliance
Layer B Structural coverage + stability
Layer C LLM QE expert audit
Final verdict: PASS (releasable) or FAIL (blocked).
"""
from __future__ import annotations
import json
import time
from pathlib import Path
from typing import Any
def generate_report(
    schema_result: dict,
    coverage_result: dict,
    audit_result: dict | None,
    *,
    commit: str = "",
    branch: str = "main",
    output_path: str | None = None,
) -> dict:
    """Combine the per-layer results into one report dict.

    Args:
        schema_result: Layer A result; its ``verdict`` must be ``"PASS"``.
        coverage_result: Layer B result; its ``verdict`` must be ``"PASS"``.
        audit_result: Layer C result, or ``None`` when the audit did not run.
            When given, its ``verdict`` must be ``"ACCEPT"``.
        commit: git commit SHA recorded in the report.
        branch: branch name recorded in the report.
        output_path: when set, the report is also written there as UTF-8
            JSON; missing parent directories are created.

    Returns:
        The report dict. ``final_verdict`` is ``"PASS"`` only when every
        supplied layer passed.
    """
    audit_ran = audit_result is not None

    layers: dict[str, Any] = {"A_schema": schema_result, "B_coverage": coverage_result}
    if audit_ran:
        layers["C_qe_audit"] = audit_result

    # ── final verdict ──
    layer_outcomes = [
        schema_result.get("verdict") == "PASS",
        coverage_result.get("verdict") == "PASS",
        (not audit_ran) or audit_result.get("verdict") == "ACCEPT",
    ]
    releasable = all(layer_outcomes)

    report = {
        "timestamp": time.strftime("%Y-%m-%dT%H:%M:%S"),
        "commit": commit,
        "branch": branch,
        "layers": layers,
        "final_verdict": "PASS" if releasable else "FAIL",
        "releasable": releasable,
        "failure_details": _failure_details(layers),
    }

    if output_path:
        target = Path(output_path)
        target.parent.mkdir(parents=True, exist_ok=True)
        serialized = json.dumps(report, ensure_ascii=False, indent=2)
        target.write_text(serialized, encoding="utf-8")
    return report
def _failure_details(layers: dict) -> list[str]:
"""Summarise which layers failed and why."""
details: list[str] = []
schema = layers.get("A_schema", {})
if schema.get("verdict") != "PASS":
details.append(
f"Layer A (Schema): {schema.get('failed', '?')}/{schema.get('total_checks', '?')} checks failed"
)
coverage = layers.get("B_coverage", {})
if coverage.get("verdict") != "PASS":
cv = coverage.get("coverage_rate", "?")
details.append(f"Layer B (Coverage): rate={cv} (threshold: 0.70)")
audit = layers.get("C_qe_audit", {})
if audit.get("verdict") == "REJECT":
details.append(
f"Layer C (QE Audit): REJECT — inadequate_ratio={audit.get('inadequate_ratio', '?')}"
)
return details
# ── Layer-specific result builders ──────────────────────────────────────────
def schema_verdict(errors: list[str], stats: dict) -> dict:
    """Build the Layer A result from schema validation errors and stats.

    Checks are counted as one root-level check plus one check per rule, so
    ``passed + failed == total_checks`` always holds. Errors whose message
    does not start with ``rules[`` are attributed to the root check.

    Args:
        errors: error strings produced by schema validation.
        stats: dict with ``total_rules`` and ``valid_rules``.

    Returns:
        Dict with ``verdict``, ``total_checks``, ``passed``, ``failed``,
        ``rule_pass_rate`` and the first ten errors as ``sample_errors``.
    """
    total = stats.get("total_rules", 0)
    valid = stats.get("valid_rules", 0)
    failed_rules = max(total - valid, 0)

    # The root check fails on any non-rule error. Errors that no failing
    # rule accounts for are charged to it too, so a FAIL verdict always
    # shows at least one failed check.
    root_failed = any(not err.startswith("rules[") for err in errors) or (
        bool(errors) and failed_rules == 0
    )

    total_checks = total + 1  # every rule plus the root document itself
    failed_checks = failed_rules + (1 if root_failed else 0)
    return {
        "verdict": "PASS" if failed_checks == 0 else "FAIL",
        "total_checks": total_checks,
        "passed": total_checks - failed_checks,
        "failed": failed_checks,
        "rule_pass_rate": round(valid / max(total, 1), 2) if total > 0 else 0,
        "sample_errors": errors[:10],  # first 10 for the report
    }
def coverage_verdict(
    coverage_rate: float,
    stability_std: float,
    stability_values: list[float],
    *,
    coverage_threshold: float = 0.70,
    stability_threshold: float = 0.05,
    section_coverage: dict | None = None,
    table_coverage: dict | None = None,
    diagram_coverage: dict | None = None,
) -> dict:
    """Build the Layer B result from coverage and stability metrics.

    The layer passes when *coverage_rate* reaches *coverage_threshold* and
    *stability_std* does not exceed *stability_threshold*. Per-category
    breakdowns are attached only when they are non-empty.
    """
    coverage_ok = coverage_rate >= coverage_threshold
    stability_ok = stability_std <= stability_threshold

    stability_block = {
        "runs": len(stability_values),
        "values": [round(value, 3) for value in stability_values],
        "std": round(stability_std, 4),
        "threshold": stability_threshold,
        "pass": stability_ok,
    }
    result: dict[str, Any] = {
        "verdict": "PASS" if (coverage_ok and stability_ok) else "FAIL",
        "coverage_rate": round(coverage_rate, 3),
        "coverage_threshold": coverage_threshold,
        "coverage_pass": coverage_ok,
        "stability": stability_block,
    }

    optional_breakdowns = (
        ("section_coverage", section_coverage),
        ("table_coverage", table_coverage),
        ("diagram_coverage", diagram_coverage),
    )
    for key, breakdown in optional_breakdowns:
        if breakdown:
            result[key] = breakdown
    return result
def audit_verdict(audit_data: dict, *, inadequate_threshold: float = 0.30) -> dict:
    """Build the Layer C result from the LLM QE audit.

    Args:
        audit_data: decoded audit response. Reads ``inadequate_ratio``,
            ``rationale``, ``total_functional_sections``, ``adequate``,
            ``inadequate`` and ``not_applicable``.
        inadequate_threshold: highest ``inadequate_ratio`` still accepted.

    Returns:
        Dict with ``verdict`` (``"ACCEPT"`` or ``"REJECT"``), the rounded
        ratio, the threshold, the rationale and the section counts.

    The ratio comes from a model response, so it may be missing, ``None``,
    a string or NaN. It is coerced to ``float``; anything that cannot be
    interpreted is treated as 1.0, so a malformed response is rejected
    rather than crashing or being accepted.
    """
    raw_ratio = audit_data.get("inadequate_ratio", 1.0)
    try:
        ratio = float(raw_ratio)
    except (TypeError, ValueError):
        ratio = 1.0
    if ratio != ratio:  # NaN compares unequal to itself
        ratio = 1.0

    passed = ratio <= inadequate_threshold
    return {
        "verdict": "ACCEPT" if passed else "REJECT",
        "inadequate_ratio": round(ratio, 3),
        "threshold": inadequate_threshold,
        "rationale": audit_data.get("rationale", ""),
        "total_sections": audit_data.get("total_functional_sections", 0),
        "adequate": audit_data.get("adequate", 0),
        "inadequate": audit_data.get("inadequate", 0),
        "not_applicable": audit_data.get("not_applicable", 0),
    }
+558
View File
@@ -0,0 +1,558 @@
"""QE Acceptance Test — Three-layer main branch health check.
Layer A (Schema): structural correctness of IR
Layer B (Coverage): structural source-traceability coverage + stability
Layer C (QE Audit): LLM as QE expert — functional coverage assessment
Final verdict: all three layers must pass for main to be releasable.
"""
from __future__ import annotations
import json
import math
import os
import re
import statistics
import tempfile
import time
from pathlib import Path
from typing import Any
import pytest
from .ir_schema import validate_ir, schema_checklist
from .report import generate_report, schema_verdict, coverage_verdict, audit_verdict
# ═══════════════════════════════════════════════════════════════════════════════
# Layer A: SCHEMA — deterministic structural validation
# ═══════════════════════════════════════════════════════════════════════════════
def test_layer_a_schema(ir_data: dict, request):
    """Check IR structure and stash the Layer A result for later use.

    The test fails when schema validation reports any error; up to the first
    twenty errors are listed in the failure message.
    """
    validation = validate_ir(ir_data)
    found_errors = validation["errors"]

    # Build the Layer A result, including the per-check breakdown.
    layer_result = schema_verdict(found_errors, validation["stats"])
    layer_result["checks"] = schema_checklist(ir_data)

    # Store for downstream layers & report
    _stash(request, "layer_a", layer_result)

    error_listing = "\n".join(f" - {e}" for e in found_errors[:20])
    assert validation["valid"], (
        f"Schema validation FAILED ({len(found_errors)} errors)\n" + error_listing
    )
# ═══════════════════════════════════════════════════════════════════════════════
# Layer B: STRUCTURAL COVERAGE + STABILITY
# ═══════════════════════════════════════════════════════════════════════════════
# Section titles that are NOT functional requirements.
# Each entry is compiled once; callers test titles with ``pattern.search``.
NON_FUNCTIONAL_PATTERNS = [
    re.compile(p) for p in [
        r"编制.*变更.*日志",  # authoring / change log
        r"文档背景",  # document background
        r"文档范围",  # document scope
        r"术语解释",  # glossary
        r"参考",  # references
        r"附录",  # appendix
        r"版本",  # version
        r"变更记录",  # change record
        r"目录",  # table of contents
        r"前言",  # preface
        r"概述",  # overview
        r"简介",  # introduction
        r"概述.*背景",  # overview ... background (any title this matches also matches "概述")
    ]
]
def _is_functional_section(section_name: str) -> bool:
    """Decide heuristically whether a section title names functional content.

    A title that starts with a section number (one that ``_section_number``
    can separate from the rest of the title) is always functional. Any other
    title is functional unless it matches a non-functional pattern.
    """
    has_number_prefix = _section_number(section_name) != section_name
    if has_number_prefix:
        return True
    return not any(pattern.search(section_name) for pattern in NON_FUNCTIONAL_PATTERNS)
def _extract_content_units(parsed_data: dict) -> dict:
"""Extract countable content units from parsed JSON.
Returns:
{"sections": [{"name": ..., "number": ...}, ...],
"table_rows": int, "diagram_images": [rid, ...]}
"""
sections = parsed_data.get("sections", [])
functional_sections: list[dict] = []
total_table_rows = 0
for sec in sections:
name = sec.get("source", "")
if _is_functional_section(name):
functional_sections.append({
"name": name,
"number": _section_number(name),
})
for block in sec.get("blocks", []):
if block.get("type") == "table":
rows = block.get("rows", [])
total_table_rows += len(rows)
# Diagram-type images from image_analysis
diagram_rids: list[str] = []
for img in parsed_data.get("image_analysis", []):
img_type = img.get("type", "")
if img_type in ("flowchart", "logic_tree", "architecture",
"state", "sequence", "activity"):
diagram_rids.append(img.get("rid", ""))
return {
"functional_sections": functional_sections,
"table_rows": total_table_rows,
"diagram_images": diagram_rids,
}
def _section_number(section_name: str) -> str:
"""Extract leading section number, e.g. '3.1.1 系统限制''3.1.1'."""
import re
m = re.match(r"^([\d.]+)", section_name)
return m.group(1) if m else section_name
def _section_matches(sec_ref: str, func_sections: list[dict]) -> str | None:
"""Find a functional section matching *sec_ref*. Returns the section name or None.
Matching: exact match → starts-with match → number match → substring match.
"""
# exact
for s in func_sections:
if s["name"] == sec_ref:
return s["name"]
# starts with section number
for s in func_sections:
if s["name"].startswith(sec_ref) or sec_ref.startswith(s["name"]):
return s["name"]
# number match
sec_num = _section_number(sec_ref)
if sec_num:
for s in func_sections:
if s["number"] == sec_num:
return s["name"]
# substring
for s in func_sections:
if sec_ref in s["name"] or s["name"] in sec_ref:
return s["name"]
return None
def _measure_coverage(ir_data: dict, parsed_data: dict) -> dict:
    """Compute structural coverage of the IR over the parsed document.

    Returns:
        {
            "section_coverage": {total, covered, rate, uncovered},
            "table_coverage": {total_rows, covered_rows, rate},
            "diagram_coverage": {total, covered, rate, uncovered},
            "overall_rate": float,
        }

    ``overall_rate`` is the mean of the per-category rates, taken only over
    categories that have something to cover. A document with no tables or
    no diagrams is therefore not penalised with a 0.0 rate for that
    category. When no category has content, ``overall_rate`` is 0.0. Each
    rate is capped at 1.0.
    """
    units = _extract_content_units(parsed_data)
    # Every metric below walks the same flattened list of rule sources.
    all_sources = [
        src
        for rule in ir_data.get("rules", [])
        for src in rule.get("sources", [])
    ]

    def _rate(covered: int, total: int) -> float:
        return round(min(covered, total) / total, 3) if total else 0.0

    # ── section coverage ──
    func_sections = units["functional_sections"]
    covered_sections: set[str] = set()
    for src in all_sources:
        sec_ref = src.get("section", "")
        if sec_ref:
            matched = _section_matches(sec_ref, func_sections)
            if matched:
                covered_sections.add(matched)
    section_coverage = {
        "total": len(func_sections),
        "covered": len(covered_sections),
        "rate": _rate(len(covered_sections), len(func_sections)),
        "uncovered": [s["name"] for s in func_sections
                      if s["name"] not in covered_sections],
    }

    # ── table row coverage ──
    covered_rows: set[tuple] = set()
    for src in all_sources:
        if src.get("type") == "table":
            sec = src.get("section", "")
            row = src.get("row")
            if sec and row is not None:
                covered_rows.add((sec, row))
    total_rows = units["table_rows"]
    table_coverage = {
        "total_rows": total_rows,
        "covered_rows": len(covered_rows),
        "rate": _rate(len(covered_rows), total_rows),
    }

    # ── diagram coverage ──
    diagram_rids = units["diagram_images"]
    known_rids = set(diagram_rids)
    covered_rids: set[str] = set()
    for src in all_sources:
        if src.get("type") == "logic_tree":
            img_id = src.get("image_id", "")
            if img_id and img_id in known_rids:
                covered_rids.add(img_id)
    diagram_coverage = {
        "total": len(diagram_rids),
        "covered": len(covered_rids),
        "rate": _rate(len(covered_rids), len(diagram_rids)),
        "uncovered": [r for r in diagram_rids if r not in covered_rids],
    }

    # ── overall: average only the categories that have content ──
    rates = [
        rate
        for rate, total in (
            (section_coverage["rate"], section_coverage["total"]),
            (table_coverage["rate"], table_coverage["total_rows"]),
            (diagram_coverage["rate"], diagram_coverage["total"]),
        )
        if total > 0
    ]
    overall = round(sum(rates) / len(rates), 3) if rates else 0.0

    return {
        "section_coverage": section_coverage,
        "table_coverage": table_coverage,
        "diagram_coverage": diagram_coverage,
        "overall_rate": overall,
    }
def test_layer_b_coverage(
    ir_data: dict,
    parsed_data: dict | None,
    ir_path: str,
    acceptance_runs: int,
    run_ir_pipeline,
    request,
):
    """Measure structural coverage and (optionally) coverage stability.

    B1 measures coverage of the IR under test. B2 runs only when
    ``acceptance_runs`` is greater than 1: the pipeline is re-run
    ``acceptance_runs - 1`` times and the standard deviation of the overall
    rate is compared with the stability threshold.

    The parsed document used for the re-runs comes from the ``parsed_path``
    fixture, so it honours ``--parsed-path``, ``TEST_PARSED_PATH`` and the
    bundled default alike.
    """
    if parsed_data is None:
        pytest.skip("No parsed JSON available for coverage analysis")

    # ── B1: single-run coverage ──
    cov = _measure_coverage(ir_data, parsed_data)

    # ── B2: stability (multi-run) ──
    stability_values: list[float] = [cov["overall_rate"]]
    stability_std = 0.0
    if acceptance_runs > 1:
        # Use the same resolution as parsed_data. Reading only the CLI
        # option would silently skip the stability runs whenever the path
        # came from the environment or the default.
        parsed_path = request.getfixturevalue("parsed_path")
        if parsed_path and os.path.exists(parsed_path):
            for _ in range(acceptance_runs - 1):
                try:
                    ir_list, _ = run_ir_pipeline(parsed_path)
                    # Convert list-format IR to dict for coverage measurement
                    run_ir = _wrap_list_ir(ir_list)
                    run_cov = _measure_coverage(run_ir, parsed_data)
                    stability_values.append(run_cov["overall_rate"])
                    time.sleep(0.5)  # rate limiting between runs
                except Exception as e:
                    pytest.fail(f"Stability run failed: {e}")
            if len(stability_values) > 1:
                stability_std = statistics.stdev(stability_values)

    # Build Layer B result
    b_result = coverage_verdict(
        coverage_rate=cov["overall_rate"],
        stability_std=stability_std,
        stability_values=stability_values,
        section_coverage=cov["section_coverage"],
        table_coverage=cov["table_coverage"],
        diagram_coverage=cov["diagram_coverage"],
    )
    _stash(request, "layer_b", b_result)

    # Assert — both B1 and B2 must pass
    assert b_result["coverage_pass"], (
        f"Coverage {cov['overall_rate']:.1%} < threshold 70%\n"
        f" Sections: {cov['section_coverage']['covered']}/{cov['section_coverage']['total']} "
        f"({cov['section_coverage']['rate']:.1%})\n"
        f" Uncovered: {cov['section_coverage']['uncovered']}\n"
        f" Table rows: {cov['table_coverage']['covered_rows']}/{cov['table_coverage']['total_rows']} "
        f"({cov['table_coverage']['rate']:.1%})\n"
        f" Diagrams: {cov['diagram_coverage']['covered']}/{cov['diagram_coverage']['total']} "
        f"({cov['diagram_coverage']['rate']:.1%})\n"
        f" Uncovered diagrams: {cov['diagram_coverage']['uncovered']}"
    )
    if len(stability_values) > 1:
        assert b_result["stability"]["pass"], (
            f"Coverage stability std={stability_std:.4f} > threshold 0.05\n"
            f" Values across {len(stability_values)} runs: {stability_values}"
        )
def _wrap_list_ir(ir_list: list) -> dict:
"""Wrap a list-format IR (from ir_generator.py) into a dict for schema compat."""
# Convert simple format to rich format for coverage measurement
rules = []
for i, entry in enumerate(ir_list):
if not isinstance(entry, dict):
continue
rule = {
"rule_id": f"GEN-001-RULE-{i:03d}",
"description": entry.get("function", ""),
"path": [],
"priority": "P2",
"sources": [],
"precondition": {},
"trigger": entry.get("trigger", {"operator": "AND", "conditions": []}),
"actions": [],
}
# Convert source
src = entry.get("source", {})
if src.get("section"):
rule["sources"].append({
"type": "text",
"section": src["section"],
"paragraph": 1,
"text_snippet": src.get("location", ""),
"priority": "primary_source",
})
rules.append(rule)
return {
"feature": "generated",
"feature_id": "GEN-001",
"rules": rules,
}
# ═══════════════════════════════════════════════════════════════════════════════
# Layer C: LLM QE EXPERT AUDIT
# ═══════════════════════════════════════════════════════════════════════════════
# Prompt template for the Layer C audit, filled with ``str.format``.
# Placeholders: coverage_summary, parsed_content, ir_content. Literal braces
# in the JSON example are doubled so that ``format`` leaves them in place.
QE_AUDITOR_PROMPT = """你是一个资深 QE 专家,负责审查需求文档的 IR(中间表示层)是否充分覆盖了源文档的所有可测试功能点。
你不是 IR 的生成者,你是独立的质量审计员。你的职责是判断 IR 的功能覆盖率是否充分。
## 审计输入
### Layer B 结构化覆盖率数据(参考)
{coverage_summary}
### 源文档内容(Parsed JSON)
{parsed_content}
### 生成的 IR(待审计)
{ir_content}
## 审计要求
对源文档中的每个章节逐一评估其功能需求是否被 IR 充分覆盖。
**判断标准**
- **adequate**(充分覆盖):该章节的所有功能需求在 IR 中都有对应的 rule,包括触发条件、执行动作
- **inadequate**(覆盖不足):该章节存在功能需求未在 IR 中体现,或描述不完整(缺少触发条件或动作)
- **not_applicable**(不适用):该章节为背景介绍、术语定义、变更日志等,不包含功能需求
**注意**
- 如果某个章节涉及多个决策路径(如流程图),检查 IR 是否覆盖了每条路径
- 表格中的每个功能行都应被至少一个 IR rule 覆盖
- 图片分析中的流程图/决策树节点应被 IR 引用
## 输出格式
请严格输出以下 JSON 格式(不要包含代码块标记):
{{
"total_functional_sections": <number>,
"adequate": <number>,
"inadequate": <number>,
"not_applicable": <number>,
"inadequate_ratio": <float>,
"verdict": "ACCEPT 或 REJECT",
"rationale": "<一句话说明接受或拒绝的理由>",
"section_assessments": [
{{
"section": "<章节名>",
"assessment": "adequate | inadequate | not_applicable",
"reason": "<评估理由>",
"missing": ["<缺失项1>", "<缺失项2>"] // 仅 inadequate 时需要
}}
]
}}
verdict 判定规则:
- inadequate_ratio ≤ 0.30 → "ACCEPT"(风险可控)
- inadequate_ratio > 0.30 → "REJECT"(功能点认知差异大,需要补充 IR)
"""
def test_layer_c_qe_audit(
    ir_data: dict, parsed_data: dict | None, llm_client, request
):
    """Run the Layer C LLM audit of functional coverage and assert ACCEPT.

    The auditor prompt is built from the stashed Layer B summary, the parsed
    source document and the IR, then sent to ``llm_client.chat``. The verdict
    derived from the reply is stashed under ``"layer_c"`` before asserting.

    Args:
        ir_data: IR under audit; serialized to JSON for the prompt.
        parsed_data: Parsed source document. When ``None`` the test is skipped.
        llm_client: Client exposing ``chat(...)`` and a ``TEXT_MODEL`` attribute.
        request: pytest request object, forwarded to the stash helpers.

    The test fails when the LLM call raises, when the reply cannot be parsed
    as a JSON object, or when the computed verdict is not ``"ACCEPT"``.
    """
    if parsed_data is None:
        pytest.skip("No parsed JSON available — cannot run QE audit")

    # ── get Layer B summary for context ──
    # An empty dict stands in when Layer B has not stashed anything.
    layer_b = _unstash(request, "layer_b") or {}
    cov_summary = json.dumps(
        {
            "coverage_rate": layer_b.get("coverage_rate", "N/A"),
            "section_coverage": layer_b.get("section_coverage", {}),
            "diagram_coverage": layer_b.get("diagram_coverage", {}),
        },
        ensure_ascii=False,
        indent=2,
    )

    # ── prepare content (trim to avoid token overflow) ──
    # The limits are character counts of the serialized JSON, not token counts.
    parsed_str = json.dumps(parsed_data, ensure_ascii=False)
    ir_str = json.dumps(ir_data, ensure_ascii=False)
    max_parsed = 12000
    max_ir = 8000
    if len(parsed_str) > max_parsed:
        parsed_str = parsed_str[:max_parsed] + "\n...[truncated]"
    if len(ir_str) > max_ir:
        ir_str = ir_str[:max_ir] + "\n...[truncated]"

    prompt = QE_AUDITOR_PROMPT.format(
        coverage_summary=cov_summary,
        parsed_content=parsed_str,
        ir_content=ir_str,
    )

    # ── call LLM ──
    # Any client error is reported as a test failure rather than a test error.
    try:
        raw = llm_client.chat(
            model=llm_client.TEXT_MODEL,
            messages=[{"role": "user", "content": prompt}],
            response_format={"type": "json_object"},
        )
    except Exception as e:
        pytest.fail(f"QE audit LLM call failed: {e}")

    # ── parse response ──
    audit_data = _parse_json_response(raw)
    if audit_data is None:
        # _parse_json_response also returns None for an empty or None reply.
        # str() keeps the message unchanged for text replies while letting a
        # None reply be reported here instead of raising TypeError on slicing.
        pytest.fail(
            f"QE audit returned unparseable response:\n{str(raw)[:500]}"
        )

    # Build Layer C result; stash it before asserting so the final report can
    # still read it when the assertion below fails.
    c_result = audit_verdict(audit_data)
    c_result["raw_assessments"] = audit_data.get("section_assessments", [])
    _stash(request, "layer_c", c_result)

    # Assert
    assert c_result["verdict"] == "ACCEPT", (
        f"QE Audit REJECTED — inadequate_ratio={c_result['inadequate_ratio']:.1%} > 30%\n"
        f" Rationale: {c_result['rationale']}\n"
        f" Adequate: {c_result['adequate']}, Inadequate: {c_result['inadequate']}"
    )
# ═══════════════════════════════════════════════════════════════════════════════
# Final report (runs last)
# ═══════════════════════════════════════════════════════════════════════════════
def test_final_report(ir_data: dict, ir_path: str, request):
    """Write and print the combined three-layer report, then gate on it.

    Results stashed by layers A, B and C are collected (a layer with no stashed
    result is reported as ``SKIPPED``), handed to ``generate_report`` together
    with commit/branch taken from the environment, and summarized on stdout.
    The test fails when the report lists any ``failure_details``.

    ``ir_data`` and ``ir_path`` are requested as fixtures but are not read in
    the body.
    """
    # A fresh placeholder dict is built per layer so none of them is shared.
    layer_a, layer_b, layer_c = [
        _unstash(request, name) or {"verdict": "SKIPPED"}
        for name in ("layer_a", "layer_b", "layer_c")
    ]

    # Prefer the path given on the command line; otherwise write into the CWD.
    report_path = request.config.getoption("--json-report-file", None)
    if not report_path:
        report_path = str(Path.cwd() / "acceptance-report.json")

    commit_sha = os.environ.get("GITEA_SHA", "")
    branch_name = os.environ.get("GITEA_BRANCH", "main")
    report = generate_report(
        layer_a,
        layer_b,
        layer_c,
        commit=commit_sha,
        branch=branch_name,
        output_path=report_path,
    )

    # Print summary
    rule = "=" * 60
    print(f"\n{rule}")
    print("QE ACCEPTANCE REPORT")
    print(rule)
    print(f" Layer A (Schema): {layer_a.get('verdict', '?')}")
    print(
        f" Layer B (Coverage): {layer_b.get('verdict', '?')} "
        f"(rate={layer_b.get('coverage_rate', '?')})"
    )
    print(f" Layer C (QE Audit): {layer_c.get('verdict', '?')}")
    # NOTE(review): the repeated string is empty, so this prints a lone space;
    # presumably a separator character was intended — confirm.
    print(f" {''*40}")
    print(
        f" FINAL: {report['final_verdict']} | "
        f"Releasable: {report['releasable']}"
    )
    print(f" Report: {report_path}")
    print(f"{rule}\n")

    # Fail if any layer failed (aggregate assertion)
    failures = report.get("failure_details", [])
    if not failures:
        return
    bullet_lines = "\n".join(f" - {item}" for item in failures)
    pytest.fail("Acceptance tests FAILED:\n" + bullet_lines)
# ═══════════════════════════════════════════════════════════════════════════════
# Helpers
# ═══════════════════════════════════════════════════════════════════════════════
import os # noqa: E402
# Module-level stash for sharing results across tests in the same module.
# Each test function stores its result here via _stash(); later tests read
# earlier results via _unstash(), which yields None for a key nobody stored.
# Keys used by the tests in this module include "layer_a", "layer_b" and
# "layer_c". Being a plain module global, its contents last for as long as the
# module stays imported.
_module_stash: dict[str, dict] = {}
def _stash(request, key: str, value: dict):
    """Record *value* under *key* in the module-wide result stash.

    An existing entry for *key* is replaced. ``request`` is not used by this
    implementation.
    """
    _module_stash.update({key: value})
def _unstash(request, key: str) -> dict | None:
    """Look up a previously stashed result.

    Returns the dict stored under *key*, or ``None`` when nothing was stored
    for it. ``request`` is not used by this implementation.
    """
    try:
        return _module_stash[key]
    except KeyError:
        return None
def _parse_json_response(raw: str) -> dict | None:
    """Parse a JSON object from an LLM response, tolerating markdown code fences.

    Args:
        raw: Response text, optionally wrapped in a fenced code block. An
            empty or ``None`` value is accepted.

    Returns:
        The decoded JSON object, or ``None`` when *raw* is empty, is not valid
        JSON, or decodes to something other than an object (array, string,
        number, ...).
    """
    if not raw:
        return None

    text = raw.strip()
    if text.startswith("```"):
        # Drop the whole opening fence line, which may carry a language tag
        # such as "json"; with no newline at all, drop just the backticks.
        _fence, newline, remainder = text.partition("\n")
        text = remainder if newline else text[3:]
    if text.endswith("```"):
        text = text[:-3]

    try:
        decoded = json.loads(text)
    except json.JSONDecodeError:
        return None

    # Callers use dict methods on the result, so a JSON array or scalar is
    # treated as unparseable rather than handed back under a dict annotation.
    return decoded if isinstance(decoded, dict) else None
+10 -5
View File
@@ -55,12 +55,17 @@ def test_import_detect_conflicts():
# -- IR generation tests ------------------------------------------------------
def test_import_ir_generator():
"""ir_generator module should be importable."""
def test_import_ir_main():
"""ir_generation main module should be importable (new project structure)."""
os.environ.setdefault("DASHSCOPE_API_KEY", "test-fake-key")
_import_from_skill("ir_generation_skill", "ir_generator")
import ir_generator
assert hasattr(ir_generator, "generate_ir")
skill_dir = os.path.join(
os.path.dirname(os.path.dirname(__file__)),
"skills", "ir_generation_skill"
)
if skill_dir not in sys.path:
sys.path.insert(0, skill_dir)
import main
assert hasattr(main, "main")
# -- Resolution application tests ---------------------------------------------