sync: update all skills from latest workspace code
CI / test (push) Successful in 8s

doc_parser_skill:
- New: verify_flowchart.py (flowchart validation)
- Updated: LLM.py (multi-provider: DeepSeek + DashScope)
- Updated: image_parser.py (logic tree support, external prompts)
- Updated: SKILL.md, prompts/image_prompt.md

conflict_detection_skill:
- Updated: LLM.py (multi-provider sync)
- Updated: detect_conflicts.py (logic tree text conversion)

ir_generation_skill:
- Replaced old scripts/LLM.py + ir_generator.py with standalone project
- New: main.py, config.py, step1-3_*.py, ensemble_merge.py
- New: prompts/, tests/ subdirectories

tests:
- New: acceptance/ test suite with schema validation
- Fixed: conftest no longer globally skips non-acceptance tests
- Updated: test_sample.py for new ir_generation structure

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
2026-05-30 22:45:08 +08:00
parent db64df2da1
commit fec4c09ee0
35 changed files with 8021 additions and 530 deletions
+558
View File
@@ -0,0 +1,558 @@
"""QE Acceptance Test — Three-layer main branch health check.
Layer A (Schema): structural correctness of IR
Layer B (Coverage): structural source-traceability coverage + stability
Layer C (QE Audit): LLM as QE expert — functional coverage assessment
Final verdict: all three layers must pass for main to be releasable.
"""
from __future__ import annotations
import json
import math
import re
import statistics
import tempfile
import time
from pathlib import Path
from typing import Any
import pytest
from .ir_schema import validate_ir, schema_checklist
from .report import generate_report, schema_verdict, coverage_verdict, audit_verdict
# ═══════════════════════════════════════════════════════════════════════════════
# Layer A: SCHEMA — deterministic structural validation
# ═══════════════════════════════════════════════════════════════════════════════
def test_layer_a_schema(ir_data: dict, request):
    """Layer A gate: the IR must pass deterministic schema validation.

    Runs ``validate_ir`` and ``schema_checklist`` over *ir_data*, turns the
    errors and stats into a Layer A verdict, stashes that verdict under
    ``"layer_a"`` and only then asserts on validity, so the verdict is
    recorded even when the assertion fails.
    """
    validation = validate_ir(ir_data)
    checklist = schema_checklist(ir_data)

    errors = validation["errors"]
    verdict = schema_verdict(errors, validation["stats"])
    verdict["checks"] = checklist

    # Stash before asserting: later tests read this entry back.
    _stash(request, "layer_a", verdict)

    # The failure message lists at most the first 20 errors.
    assert validation["valid"], (
        f"Schema validation FAILED ({len(errors)} errors)\n"
        + "\n".join(f" - {err}" for err in errors[:20])
    )
# ═══════════════════════════════════════════════════════════════════════════════
# Layer B: STRUCTURAL COVERAGE + STABILITY
# ═══════════════════════════════════════════════════════════════════════════════
# Title fragments that mark a section as NOT being a functional requirement
# (front matter, glossary, change history, ...).  The English glosses on the
# right are translations of the literal pattern text.
NON_FUNCTIONAL_PATTERNS = list(map(re.compile, (
    r"编制.*变更.*日志",  # authoring ... change ... log
    r"文档背景",  # document background
    r"文档范围",  # document scope
    r"术语解释",  # glossary / term explanations
    r"参考",  # references
    r"附录",  # appendix
    r"版本",  # version
    r"变更记录",  # change record
    r"目录",  # table of contents
    r"前言",  # preface
    r"概述",  # overview
    r"简介",  # introduction
    r"概述.*背景",  # overview ... background
)))
def _is_functional_section(section_name: str) -> bool:
    """Decide heuristically whether a section title denotes functional content.

    A title that carries a leading section number followed by more text
    (i.e. ``_section_number`` returns something other than the title itself)
    is always treated as functional.  Any other title is functional unless
    one of ``NON_FUNCTIONAL_PATTERNS`` matches somewhere inside it.
    """
    has_number_prefix = _section_number(section_name) != section_name
    if has_number_prefix:
        return True
    # Un-numbered title: functional only if no exclusion pattern hits.
    return not any(
        pattern.search(section_name) for pattern in NON_FUNCTIONAL_PATTERNS
    )
def _extract_content_units(parsed_data: dict) -> dict:
    """Collect the countable content units of a parsed document.

    Returns a dict with three keys:

    * ``functional_sections`` - ``[{"name": ..., "number": ...}, ...]``, one
      entry per section whose ``source`` title passes
      ``_is_functional_section``;
    * ``table_rows`` - total row count over every block of type ``"table"``,
      counted in all sections (functional or not);
    * ``diagram_images`` - the ``rid`` of each ``image_analysis`` entry whose
      ``type`` is one of the diagram kinds listed below (``""`` when an entry
      has no ``rid``).
    """
    diagram_kinds = (
        "flowchart", "logic_tree", "architecture",
        "state", "sequence", "activity",
    )

    functional: list[dict] = []
    row_count = 0
    for section in parsed_data.get("sections", []):
        title = section.get("source", "")
        if _is_functional_section(title):
            functional.append({"name": title, "number": _section_number(title)})
        row_count += sum(
            len(blk.get("rows", []))
            for blk in section.get("blocks", [])
            if blk.get("type") == "table"
        )

    diagram_ids: list[str] = [
        image.get("rid", "")
        for image in parsed_data.get("image_analysis", [])
        if image.get("type", "") in diagram_kinds
    ]

    return {
        "functional_sections": functional,
        "table_rows": row_count,
        "diagram_images": diagram_ids,
    }
def _section_number(section_name: str) -> str:
    """Return the leading dotted section number of *section_name*.

    Example: ``"3.1.1 系统限制"`` yields ``"3.1.1"``.

    Only digit groups joined by single dots are taken, so a separator dot
    after the number (``"3.1. Foo"``) is not part of the result and a title
    compares equal by number whether or not it was written with that dot.
    A title that does not start with a digit is returned unchanged.
    """
    # re.match anchors at the start of the string, so no explicit "^" is needed.
    found = re.match(r"\d+(?:\.\d+)*", section_name)
    return found.group(0) if found else section_name
def _section_matches(sec_ref: str, func_sections: list[dict]) -> str | None:
    """Find the functional section that *sec_ref* refers to.

    Args:
        sec_ref: Section reference taken from an IR rule source.
        func_sections: Dicts with ``"name"`` and ``"number"`` keys.

    Returns:
        The ``name`` of the first matching section, or ``None``.

    Matching is attempted in four passes, each over all sections in order:
    exact name, prefix in either direction, equal section number, substring
    in either direction.

    An empty *sec_ref* never matches, and sections with an empty name are
    ignored: an empty string is a prefix and a substring of everything, so
    it would otherwise win the prefix pass and hide the real section behind
    a falsy ``""`` result.
    """
    if not sec_ref:
        return None
    named = [s for s in func_sections if s["name"]]

    # Pass 1: exact title.
    for s in named:
        if s["name"] == sec_ref:
            return s["name"]

    # Pass 2: one title is a prefix of the other.
    for s in named:
        if s["name"].startswith(sec_ref) or sec_ref.startswith(s["name"]):
            return s["name"]

    # Pass 3: same leading section number.
    sec_num = _section_number(sec_ref)
    if sec_num:
        for s in named:
            if s["number"] == sec_num:
                return s["name"]

    # Pass 4: one title is contained in the other.
    for s in named:
        if sec_ref in s["name"] or s["name"] in sec_ref:
            return s["name"]

    return None
def _measure_coverage(ir_data: dict, parsed_data: dict) -> dict:
    """Compute structural coverage of the IR over the parsed document.

    Returns:
        {
            "section_coverage": {total, covered, rate, uncovered},
            "table_coverage": {total_rows, covered_rows, rate},
            "diagram_coverage": {total, covered, rate, uncovered},
            "overall_rate": float,
        }

    Each ``rate`` is ``covered / total`` rounded to three decimals and capped
    at 1.0; a dimension whose total is zero reports a rate of 0.0.

    ``overall_rate`` is the mean of the rates of the dimensions that actually
    exist in the document (total > 0).  Averaging in empty dimensions would
    cap a diagram-free, table-free document at one third no matter how well
    its sections are covered.  When no dimension exists it is 0.0.
    """

    def _rate(covered: int, total: int) -> float:
        # Rules may cite rows the parser never counted; keep the rate <= 100%.
        return round(min(covered / total, 1.0), 3) if total else 0.0

    units = _extract_content_units(parsed_data)
    func_sections = units["functional_sections"]
    total_rows = units["table_rows"]
    diagram_rids = units["diagram_images"]
    known_rids = set(diagram_rids)

    covered_sections: set[str] = set()
    covered_rows: set[tuple] = set()
    covered_rids: set[str] = set()

    # One pass over every source of every rule feeds all three dimensions.
    for rule in ir_data.get("rules", []):
        for src in rule.get("sources", []):
            sec_ref = src.get("section", "")
            if sec_ref:
                matched = _section_matches(sec_ref, func_sections)
                if matched:
                    covered_sections.add(matched)

            src_type = src.get("type")
            if src_type == "table":
                row = src.get("row")
                if sec_ref and row is not None:
                    covered_rows.add((sec_ref, row))
            elif src_type == "logic_tree":
                # NOTE(review): only "logic_tree" sources count towards
                # diagram coverage although other diagram kinds are in the
                # total - confirm this matches the IR source schema.
                img_id = src.get("image_id", "")
                if img_id and img_id in known_rids:
                    covered_rids.add(img_id)

    section_coverage = {
        "total": len(func_sections),
        "covered": len(covered_sections),
        "rate": _rate(len(covered_sections), len(func_sections)),
        "uncovered": [s["name"] for s in func_sections
                      if s["name"] not in covered_sections],
    }
    table_coverage = {
        "total_rows": total_rows,
        "covered_rows": len(covered_rows),
        "rate": _rate(len(covered_rows), total_rows),
    }
    diagram_coverage = {
        "total": len(diagram_rids),
        "covered": len(covered_rids),
        "rate": _rate(len(covered_rids), len(diagram_rids)),
        "uncovered": [r for r in diagram_rids if r not in covered_rids],
    }

    # Only dimensions that exist in the document take part in the average.
    applicable = [
        cov["rate"]
        for cov, total in (
            (section_coverage, len(func_sections)),
            (table_coverage, total_rows),
            (diagram_coverage, len(diagram_rids)),
        )
        if total
    ]
    overall = round(sum(applicable) / len(applicable), 3) if applicable else 0.0

    return {
        "section_coverage": section_coverage,
        "table_coverage": table_coverage,
        "diagram_coverage": diagram_coverage,
        "overall_rate": overall,
    }
def test_layer_b_coverage(
    ir_data: dict,
    parsed_data: dict | None,
    ir_path: str,
    acceptance_runs: int,
    run_ir_pipeline,
    request,
):
    """Layer B gate: structural coverage and, optionally, its stability.

    B1 measures coverage of *ir_data* over *parsed_data*.  B2 runs only when
    ``acceptance_runs`` is greater than one and the ``--parsed-path`` option
    names an existing path: the pipeline is re-run ``acceptance_runs - 1``
    times and the sample standard deviation of the overall rates is taken.
    The verdict is stashed under ``"layer_b"`` before any assertion.
    Skipped when there is no parsed document.
    """
    if parsed_data is None:
        pytest.skip("No parsed JSON available for coverage analysis")

    # B1: coverage of the IR under test.
    baseline = _measure_coverage(ir_data, parsed_data)
    sec_cov = baseline["section_coverage"]
    tbl_cov = baseline["table_coverage"]
    dia_cov = baseline["diagram_coverage"]

    # B2: repeat the pipeline and collect the overall rate of each run.
    stability_values: list[float] = [baseline["overall_rate"]]
    parsed_path = None
    if acceptance_runs > 1:
        parsed_path = request.config.getoption("--parsed-path")
    if parsed_path and os.path.exists(parsed_path):
        for _run in range(acceptance_runs - 1):
            try:
                ir_list, _extra = run_ir_pipeline(parsed_path)
                # The pipeline yields list-format IR; wrap it for measuring.
                rerun = _measure_coverage(_wrap_list_ir(ir_list), parsed_data)
                stability_values.append(rerun["overall_rate"])
                time.sleep(0.5)  # rate limiting between runs
            except Exception as e:
                pytest.fail(f"Stability run failed: {e}")

    multi_run = len(stability_values) > 1
    stability_std = statistics.stdev(stability_values) if multi_run else 0.0

    b_result = coverage_verdict(
        coverage_rate=baseline["overall_rate"],
        stability_std=stability_std,
        stability_values=stability_values,
        section_coverage=sec_cov,
        table_coverage=tbl_cov,
        diagram_coverage=dia_cov,
    )
    _stash(request, "layer_b", b_result)

    # Both B1 and B2 must pass.
    assert b_result["coverage_pass"], (
        f"Coverage {baseline['overall_rate']:.1%} < threshold 70%\n"
        f" Sections: {sec_cov['covered']}/{sec_cov['total']} "
        f"({sec_cov['rate']:.1%})\n"
        f" Uncovered: {sec_cov['uncovered']}\n"
        f" Table rows: {tbl_cov['covered_rows']}/{tbl_cov['total_rows']} "
        f"({tbl_cov['rate']:.1%})\n"
        f" Diagrams: {dia_cov['covered']}/{dia_cov['total']} "
        f"({dia_cov['rate']:.1%})\n"
        f" Uncovered diagrams: {dia_cov['uncovered']}"
    )
    if multi_run:
        assert b_result["stability"]["pass"], (
            f"Coverage stability std={stability_std:.4f} > threshold 0.05\n"
            f" Values across {len(stability_values)} runs: {stability_values}"
        )
def _wrap_list_ir(ir_list: list) -> dict:
    """Wrap a list-format IR into the dict format used for coverage measuring.

    Args:
        ir_list: Entries of the simple list format.  Entries that are not
            dicts are skipped, but still consume an index, so ``rule_id``
            reflects the position in the input list.

    Returns:
        ``{"feature": "generated", "feature_id": "GEN-001", "rules": [...]}``
        with one rule per dict entry.  A rule gets a single text source when
        the entry's ``source`` is a dict with a non-empty ``section``.  A
        missing, ``None`` or non-dict ``source`` yields a rule without
        sources instead of raising.
    """
    rules = []
    for i, entry in enumerate(ir_list):
        if not isinstance(entry, dict):
            continue

        sources = []
        src = entry.get("source")
        # Generated IR may carry "source": null or a bare string here.
        section = src.get("section") if isinstance(src, dict) else None
        if section:
            sources.append({
                "type": "text",
                "section": section,
                "paragraph": 1,
                "text_snippet": src.get("location", ""),
                "priority": "primary_source",
            })

        rules.append({
            "rule_id": f"GEN-001-RULE-{i:03d}",
            "description": entry.get("function", ""),
            "path": [],
            "priority": "P2",
            "sources": sources,
            "precondition": {},
            "trigger": entry.get("trigger", {"operator": "AND", "conditions": []}),
            "actions": [],
        })

    return {
        "feature": "generated",
        "feature_id": "GEN-001",
        "rules": rules,
    }
# ═══════════════════════════════════════════════════════════════════════════════
# Layer C: LLM QE EXPERT AUDIT
# ═══════════════════════════════════════════════════════════════════════════════
# Prompt template for the Layer C audit.  It is filled with ``str.format``
# using the keys ``coverage_summary``, ``parsed_content`` and ``ir_content``;
# the doubled braces are literal braces of the JSON template.
# NOTE(review): the JSON template below carries a ``//`` comment while the
# prompt asks for strict JSON - confirm the model does not echo it back.
# NOTE(review): the parentheses on the "Parsed JSON" heading and on the REJECT
# rule are unbalanced - confirm against the repository copy.
QE_AUDITOR_PROMPT = """你是一个资深 QE 专家,负责审查需求文档的 IR(中间表示层)是否充分覆盖了源文档的所有可测试功能点。
你不是 IR 的生成者,你是独立的质量审计员。你的职责是判断 IR 的功能覆盖率是否充分。
## 审计输入
### Layer B 结构化覆盖率数据(参考)
{coverage_summary}
### 源文档内容(Parsed JSON
{parsed_content}
### 生成的 IR(待审计)
{ir_content}
## 审计要求
对源文档中的每个章节逐一评估其功能需求是否被 IR 充分覆盖。
**判断标准**
- **adequate**(充分覆盖):该章节的所有功能需求在 IR 中都有对应的 rule,包括触发条件、执行动作
- **inadequate**(覆盖不足):该章节存在功能需求未在 IR 中体现,或描述不完整(缺少触发条件或动作)
- **not_applicable**(不适用):该章节为背景介绍、术语定义、变更日志等,不包含功能需求
**注意**
- 如果某个章节涉及多个决策路径(如流程图),检查 IR 是否覆盖了每条路径
- 表格中的每个功能行都应被至少一个 IR rule 覆盖
- 图片分析中的流程图/决策树节点应被 IR 引用
## 输出格式
请严格输出以下 JSON 格式(不要包含代码块标记):
{{
"total_functional_sections": <number>,
"adequate": <number>,
"inadequate": <number>,
"not_applicable": <number>,
"inadequate_ratio": <float>,
"verdict": "ACCEPT 或 REJECT",
"rationale": "<一句话说明接受或拒绝的理由>",
"section_assessments": [
{{
"section": "<章节名>",
"assessment": "adequate | inadequate | not_applicable",
"reason": "<评估理由>",
"missing": ["<缺失项1>", "<缺失项2>"] // 仅 inadequate 时需要
}}
]
}}
verdict 判定规则:
- inadequate_ratio ≤ 0.30 → "ACCEPT"(风险可控)
- inadequate_ratio > 0.30 → "REJECT"(功能点认知差异大,需要补充 IR
"""
def test_layer_c_qe_audit(
    ir_data: dict, parsed_data: dict | None, llm_client, request
):
    """Layer C gate: an LLM acting as QE auditor judges functional coverage.

    Builds the audit prompt from the stashed Layer B summary, the parsed
    document and the IR (the two JSON dumps are clipped to 12000 and 8000
    characters respectively), sends it through ``llm_client.chat``, parses
    the reply, stashes the verdict under ``"layer_c"`` and asserts that the
    verdict is ``"ACCEPT"``.  Skipped when there is no parsed document.
    """
    if parsed_data is None:
        pytest.skip("No parsed JSON available — cannot run QE audit")

    def _clip(text: str, limit: int) -> str:
        # Trim oversized payloads to avoid token overflow.
        if len(text) > limit:
            return text[:limit] + "\n...[truncated]"
        return text

    # Layer B numbers are passed to the auditor as context only.
    layer_b = _unstash(request, "layer_b") or {}
    summary_fields = {
        "coverage_rate": layer_b.get("coverage_rate", "N/A"),
        "section_coverage": layer_b.get("section_coverage", {}),
        "diagram_coverage": layer_b.get("diagram_coverage", {}),
    }
    cov_summary = json.dumps(summary_fields, ensure_ascii=False, indent=2)

    parsed_str = json.dumps(parsed_data, ensure_ascii=False)
    ir_str = json.dumps(ir_data, ensure_ascii=False)
    prompt = QE_AUDITOR_PROMPT.format(
        coverage_summary=cov_summary,
        parsed_content=_clip(parsed_str, 12000),
        ir_content=_clip(ir_str, 8000),
    )

    try:
        raw = llm_client.chat(
            model=llm_client.TEXT_MODEL,
            messages=[{"role": "user", "content": prompt}],
            response_format={"type": "json_object"},
        )
    except Exception as e:
        pytest.fail(f"QE audit LLM call failed: {e}")

    audit_data = _parse_json_response(raw)
    if audit_data is None:
        pytest.fail(f"QE audit returned unparseable response:\n{raw[:500]}")

    c_result = audit_verdict(audit_data)
    c_result["raw_assessments"] = audit_data.get("section_assessments", [])
    _stash(request, "layer_c", c_result)

    assert c_result["verdict"] == "ACCEPT", (
        f"QE Audit REJECTED — inadequate_ratio={c_result['inadequate_ratio']:.1%} > 30%\n"
        f" Rationale: {c_result['rationale']}\n"
        f" Adequate: {c_result['adequate']}, Inadequate: {c_result['inadequate']}"
    )
# ═══════════════════════════════════════════════════════════════════════════════
# Final report (runs last)
# ═══════════════════════════════════════════════════════════════════════════════
def test_final_report(ir_data: dict, ir_path: str, request):
    """Write the three-layer JSON report and fail if any layer failed.

    Layer results are read from the module stash; a layer with no stashed
    result is reported as ``{"verdict": "SKIPPED"}``.  The report goes to the
    path given by ``--json-report-file`` or, failing that, to
    ``acceptance-report.json`` in the current working directory.  A summary
    is printed, then the test fails when the generated report lists any
    ``failure_details``.
    """
    layer_a = _unstash(request, "layer_a") or {"verdict": "SKIPPED"}
    layer_b = _unstash(request, "layer_b") or {"verdict": "SKIPPED"}
    layer_c = _unstash(request, "layer_c") or {"verdict": "SKIPPED"}

    report_path = request.config.getoption("--json-report-file", None) or str(
        Path.cwd() / "acceptance-report.json"
    )
    report = generate_report(
        layer_a,
        layer_b,
        layer_c,
        commit=os.environ.get("GITEA_SHA", ""),
        branch=os.environ.get("GITEA_BRANCH", "main"),
        output_path=report_path,
    )

    # Print summary
    print(f"\n{'='*60}")
    print("QE ACCEPTANCE REPORT")
    print(f"{'='*60}")
    print(f" Layer A (Schema): {layer_a.get('verdict', '?')}")
    print(f" Layer B (Coverage): {layer_b.get('verdict', '?')} "
          f"(rate={layer_b.get('coverage_rate', '?')})")
    print(f" Layer C (QE Audit): {layer_c.get('verdict', '?')}")
    # Separator before the final verdict; the source multiplied an empty
    # string here, which printed nothing.
    print(f" {'-'*40}")
    print(f" FINAL: {report['final_verdict']} | "
          f"Releasable: {report['releasable']}")
    print(f" Report: {report_path}")
    print(f"{'='*60}\n")

    # Aggregate assertion: any failed layer fails this test as well.
    failures = report.get("failure_details", [])
    if failures:
        pytest.fail(
            "Acceptance tests FAILED:\n" + "\n".join(f" - {f}" for f in failures)
        )
# ═══════════════════════════════════════════════════════════════════════════════
# Helpers
# ═══════════════════════════════════════════════════════════════════════════════
import os # noqa: E402
# Results are shared between the tests of this module through a plain
# module-level dict: a test writes its result, later tests read it back.
_module_stash: dict[str, dict] = {}


def _stash(request, key: str, value: dict):
    """Record *value* under *key* for later tests in this module.

    *request* is not used by the implementation.
    """
    _module_stash.update({key: value})


def _unstash(request, key: str) -> dict | None:
    """Return what was stored under *key*, or ``None`` if nothing was.

    *request* is not used by the implementation.
    """
    return _module_stash[key] if key in _module_stash else None
def _parse_json_response(raw: str) -> dict | None:
    """Parse a JSON object out of an LLM response.

    Handles a reply wrapped in a markdown code fence.  If the text as a whole
    is not valid JSON, the span from the first ``{`` to the last ``}`` is
    tried as well, which covers replies that put prose around the object.

    Returns:
        The decoded object, or ``None`` when *raw* is empty, no JSON can be
        decoded, or the decoded value is not a dict (callers use ``.get`` on
        the result, so arrays and scalars are rejected here).
    """
    if not raw:
        return None

    text = raw.strip()
    if text.startswith("```"):
        # Drop the whole opening fence line, language tag included.
        _, newline, rest = text.partition("\n")
        text = rest if newline else text[3:]
    if text.endswith("```"):
        text = text[:-3]

    try:
        parsed = json.loads(text)
    except json.JSONDecodeError:
        start, end = text.find("{"), text.rfind("}")
        if not 0 <= start < end:
            return None
        try:
            parsed = json.loads(text[start:end + 1])
        except json.JSONDecodeError:
            return None

    return parsed if isinstance(parsed, dict) else None