doc_parser_skill: - New: verify_flowchart.py (flowchart validation) - Updated: LLM.py (multi-provider: DeepSeek + DashScope) - Updated: image_parser.py (logic tree support, external prompts) - Updated: SKILL.md, prompts/image_prompt.md conflict_detection_skill: - Updated: LLM.py (multi-provider sync) - Updated: detect_conflicts.py (logic tree text conversion) ir_generation_skill: - Replaced old scripts/LLM.py + ir_generator.py with standalone project - New: main.py, config.py, step1-3_*.py, ensemble_merge.py - New: prompts/, tests/ subdirectories tests: - New: acceptance/ test suite with schema validation - Fixed: conftest no longer globally skips non-acceptance tests - Updated: test_sample.py for new ir_generation structure Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1 @@
|
||||
# Tests package for document_analyzer
|
||||
@@ -0,0 +1 @@
|
||||
# QE Acceptance Tests for document_analyzer
|
||||
@@ -0,0 +1,186 @@
|
||||
"""Pytest configuration and shared fixtures for QE acceptance tests.
|
||||
|
||||
Usage::
|
||||
|
||||
pytest tests/acceptance/ -v --run-acceptance [--acceptance-runs=3]
|
||||
|
||||
Environment variables:
|
||||
DASHSCOPE_API_KEY — LLM API key (needed by Layers B/C; when unset, every acceptance test is skipped)
|
||||
TEST_IR_PATH — path to IR JSON to validate (default: ir_final.json sample)
|
||||
TEST_PARSED_PATH — path to _parsed.json or _updated.json for coverage analysis
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import pytest
|
||||
|
||||
# ── Path setup ──────────────────────────────────────────────────────────────
|
||||
|
||||
# Repository root: three directory levels above this file.
_HERE = Path(__file__).resolve()
_PROJECT_ROOT = _HERE.parent.parent.parent
# Put the repository root first on sys.path.
sys.path.insert(0, str(_PROJECT_ROOT))
|
||||
|
||||
|
||||
def _skill_path(skill_name: str) -> str:
    """Return ``<project root>/skills/<skill_name>/scripts`` as a string."""
    scripts_dir = _PROJECT_ROOT.joinpath("skills", skill_name, "scripts")
    return str(scripts_dir)
|
||||
|
||||
|
||||
# ── pytest configuration ────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def pytest_addoption(parser):
    """Register the command-line options used by the acceptance suite.

    Adds ``--run-acceptance`` (boolean flag), ``--acceptance-runs`` (int),
    ``--ir-path`` and ``--parsed-path`` (strings) to *parser*, in that order.
    """
    option_specs = (
        ("--run-acceptance", {
            "action": "store_true",
            "default": False,
            "help": "Run QE acceptance tests (requires DASHSCOPE_API_KEY)",
        }),
        ("--acceptance-runs", {
            "type": int,
            "default": 1,
            "help": "Number of IR generation runs for Layer B stability testing (default: 1 = skip)",
        }),
        ("--ir-path", {
            "type": str,
            "default": None,
            "help": "Path to IR JSON file to validate",
        }),
        ("--parsed-path", {
            "type": str,
            "default": None,
            "help": "Path to _parsed.json or _updated.json for coverage analysis",
        }),
    )
    for flag, settings in option_specs:
        parser.addoption(flag, **settings)
|
||||
|
||||
|
||||
def pytest_configure(config):
    """Declare the ``acceptance`` marker through the ``markers`` ini setting."""
    marker_doc = "acceptance: QE acceptance test (requires --run-acceptance flag and DASHSCOPE_API_KEY)"
    config.addinivalue_line("markers", marker_doc)
|
||||
|
||||
|
||||
def pytest_collection_modifyitems(config, items):
    """Skip acceptance tests unless they were requested and can actually run.

    Tests located under ``<project root>/tests/acceptance`` are skipped when
    ``--run-acceptance`` was not given, or when it was given but the
    ``DASHSCOPE_API_KEY`` environment variable is unset or empty. Tests outside
    that directory are never touched.
    """
    if not config.getoption("--run-acceptance"):
        reason = "Need --run-acceptance flag to run"
    elif not os.environ.get("DASHSCOPE_API_KEY"):
        reason = "DASHSCOPE_API_KEY not set"
    else:
        return  # acceptance tests were requested and the API key is available

    acceptance_dir = _PROJECT_ROOT / "tests" / "acceptance"
    skip_marker = pytest.mark.skip(reason=reason)
    for item in items:
        # Compare path components rather than string prefixes so that a sibling
        # directory such as ``tests/acceptance_extra`` is not skipped by accident.
        if acceptance_dir in Path(str(item.fspath)).parents:
            item.add_marker(skip_marker)
|
||||
|
||||
|
||||
# ── Shared fixtures ─────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
def project_root() -> Path:
    """Session fixture exposing the module-level project root path."""
    root: Path = _PROJECT_ROOT
    return root
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
def ir_path(request) -> str:
    """Resolve the IR JSON file under test; skip when the file does not exist.

    Lookup order: the ``--ir-path`` option, the ``TEST_IR_PATH`` environment
    variable, then a default ``ir_final.json`` below the user's home directory.
    """
    candidate = request.config.getoption("--ir-path")
    if not candidate:
        candidate = os.environ.get("TEST_IR_PATH")
    if not candidate:
        # The home directory is only looked up when neither override is set.
        default_ir = Path.home() / ".openclaw/workspace/skills/doc_parser_skill/output/ir_final.json"
        candidate = str(default_ir)

    if not os.path.exists(candidate):
        pytest.skip(f"IR file not found: {candidate}")
    return candidate
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
def ir_data(ir_path: str) -> dict:
    """Parse and return the JSON document stored at *ir_path* (read as UTF-8)."""
    raw_text = Path(ir_path).read_text(encoding="utf-8")
    return json.loads(raw_text)
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
def parsed_path(request) -> str | None:
    """Locate the parsed document (``_parsed.json`` / ``_updated.json``).

    Lookup order: the ``--parsed-path`` option, the ``TEST_PARSED_PATH``
    environment variable, then a sample file inside ``ir_generation_skill``.
    Returns ``None`` when the chosen path does not exist.
    """
    candidate = request.config.getoption("--parsed-path") or os.environ.get("TEST_PARSED_PATH")
    if not candidate:
        candidate = str(
            _PROJECT_ROOT / "skills/ir_generation_skill/车机娱乐系统禁止功能文档_精简_updated.json"
        )
    return candidate if os.path.exists(candidate) else None
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
def parsed_data(parsed_path: str | None) -> dict | None:
    """Load the parsed document JSON, or return ``None`` when no path is available."""
    if parsed_path is not None:
        with open(parsed_path, "r", encoding="utf-8") as handle:
            return json.load(handle)
    return None
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
def llm_client():
    """Build an ``LLMClient`` with default arguments for the acceptance tests.

    The class is imported from the ``LLM`` module found in the
    ``doc_parser_skill`` scripts directory, which is pushed to the front of
    ``sys.path`` first.
    """
    scripts_dir = _skill_path("doc_parser_skill")
    sys.path.insert(0, scripts_dir)
    from LLM import LLMClient as client_cls

    return client_cls()
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
def acceptance_runs(request) -> int:
    """Value of the ``--acceptance-runs`` option (falls back to 1)."""
    cli_config = request.config
    return cli_config.getoption("--acceptance-runs", default=1)
|
||||
|
||||
|
||||
# ── Pipeline runner ─────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
def run_ir_pipeline():
    """Return a callable that runs the IR generation pipeline on a parsed JSON.

    Usage::

        ir_list, ir_path = run_ir_pipeline(parsed_json_path, output_dir)
    """
    sys.path.insert(0, _skill_path("ir_generation_skill"))
    # NOTE(review): the commit description shipped with this file says the old
    # ir_generator.py was replaced by a standalone project — confirm that this
    # import still resolves.
    from ir_generator import generate_ir

    def _run(parsed_path: str, output_dir: str | None = None) -> tuple[list, str]:
        """Run IR generation and return ``(ir_list, ir_path)``.

        When *output_dir* is omitted a fresh ``qe_acceptance_*`` temporary
        directory is created; it is not removed afterwards.
        """
        out = output_dir or tempfile.mkdtemp(prefix="qe_acceptance_")
        result = generate_ir(parsed_path, out, dry_run=False)
        # The value under "ir" is handed back as-is (it is NOT wrapped into the
        # rich {feature, rules} document); presumably a list — verify against
        # generate_ir.
        return result.get("ir", []), result.get("path", "")

    return _run
|
||||
@@ -0,0 +1,325 @@
|
||||
"""Rich IR schema definition and validators for the document_analyzer QE framework.
|
||||
|
||||
Target format is the production IR (``ir_final.json``):
|
||||
{feature, feature_id, config_defaults?, rules: [{rule_id, path, description,
|
||||
priority, sources, precondition, trigger, actions}]}
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from typing import Any
|
||||
|
||||
# ── Constants ────────────────────────────────────────────────────────────────
|
||||
|
||||
# Closed vocabularies accepted by the validators below.
VALID_SOURCE_TYPES = {"table", "logic_tree", "text"}
VALID_ACTION_TYPES = {"system", "user_interaction"}
VALID_PRIORITIES = {"P0", "P1", "P2"}
VALID_TRIGGER_OPERATORS = {"AND", "OR"}

# rule_id pattern: FEAT-NNN-SCOPE-TYPE-...-PATH-NN (variable middle segments)
# Anchored with ``\Z`` instead of ``$``: ``$`` also matches just before a
# trailing newline, which would let an id such as "FEAT-001-SYS-01\n" pass.
RULE_ID_RE = re.compile(
    r"^[A-Z]+-\d+(-[A-Z]+)+-\d+\Z"
)
|
||||
|
||||
|
||||
# ── Validation helpers ──────────────────────────────────────────────────────
|
||||
|
||||
def _check(condition: bool, message: str) -> list[str]:
    """Return ``[message]`` when *condition* is falsy, otherwise an empty list."""
    if condition:
        return []
    return [message]
|
||||
|
||||
|
||||
def validate_rule(rule: dict, index: int = 0) -> list[str]:
    """Validate a single rule dict and return a (possibly empty) list of error strings.

    Checks, in order: the required string fields (``rule_id``, ``description``),
    ``rule_id`` naming, ``priority``, ``path``, ``sources``, ``precondition``,
    ``trigger``, ``actions`` and finally that no value at any depth is ``None``.
    ``priority``, ``path``, ``precondition``, ``trigger`` and ``actions`` are
    only checked when present and not ``None``; ``sources`` must always hold at
    least one entry.

    Args:
        rule: The rule object taken from the IR ``rules`` array.
        index: Position of the rule in that array; errors are labelled
            ``rules[<index>]``.

    Returns:
        Error messages; an empty list means the rule passed every check.
    """
    label = f"rules[{index}]"
    if not isinstance(rule, dict):
        return [f"{label}: not a dict"]

    errors: list[str] = []

    # ── required top-level fields ──
    for field in ("rule_id", "description"):
        value = rule.get(field)
        errors.extend(_check(
            isinstance(value, str) and bool(value.strip()),
            f'{label}.{field}: required non-empty string',
        ))

    # sources is a list, not a string — validated separately below

    # ── rule_id naming ──
    rid = rule.get("rule_id", "")
    if rid and isinstance(rid, str):
        errors.extend(_check(
            bool(RULE_ID_RE.match(rid)),
            f'{label}.rule_id: "{rid}" does not match pattern FEAT-NNN-SCOPE-TYPE-PATH-NN',
        ))

    # ── priority ──
    priority = rule.get("priority")
    if priority is not None:
        errors.extend(_check(
            # The isinstance guard keeps unhashable JSON values (list/dict)
            # from raising TypeError in the set membership test.
            isinstance(priority, str) and priority in VALID_PRIORITIES,
            f'{label}.priority: "{priority}" not in {VALID_PRIORITIES}',
        ))

    # ── path ──
    path = rule.get("path")
    if path is not None:
        if not isinstance(path, list):
            errors.append(f"{label}.path: must be a list")
        elif not path:
            errors.append(f"{label}.path: must not be empty")
        elif not all(isinstance(p, str) and p.strip() for p in path):
            errors.append(f"{label}.path: all segments must be non-empty strings")

    # ── sources[] ──
    sources = rule.get("sources", [])
    if not isinstance(sources, list):
        errors.append(f"{label}.sources: must be a list")
    elif not sources:
        errors.append(f"{label}.sources: must have at least one source")
    else:
        for si, src in enumerate(sources):
            errors.extend(_validate_source(src, f"{label}.sources[{si}]"))

    # ── precondition ──
    precondition = rule.get("precondition")
    if precondition is not None:
        if not isinstance(precondition, dict):
            errors.append(f"{label}.precondition: must be a dict")
        elif not precondition:
            errors.append(f"{label}.precondition: must not be empty")

    # ── trigger ──
    trigger = rule.get("trigger")
    if trigger is not None:
        if not isinstance(trigger, dict):
            errors.append(f"{label}.trigger: must be a dict")
        else:
            errors.extend(_validate_trigger(trigger, f"{label}.trigger"))

    # ── actions ──
    actions = rule.get("actions")
    if actions is not None:
        if not isinstance(actions, list):
            errors.append(f"{label}.actions: must be a list")
        else:
            for ai, act in enumerate(actions):
                errors.extend(_validate_action(act, f"{label}.actions[{ai}]"))

    # ── no null values at any depth ──
    errors.extend(_find_nulls(rule, label))

    return errors
|
||||
|
||||
|
||||
def _validate_source(src: dict, label: str) -> list[str]:
    """Validate one entry of a rule's ``sources`` list.

    ``type`` must be one of ``VALID_SOURCE_TYPES``; a truthy ``priority`` must
    be ``primary_source`` or ``supplementary``. Further requirements depend on
    the type: ``table`` needs ``section`` and an integer ``row``, ``logic_tree``
    needs ``image_id`` and a non-empty ``node_ids`` list, ``text`` needs
    ``section``.

    Args:
        src: The source object to check.
        label: Prefix for error messages (e.g. ``rules[0].sources[1]``).

    Returns:
        Error messages; empty when the source is valid.
    """
    if not isinstance(src, dict):
        return [f"{label}: not a dict"]

    def _non_empty_str(value: Any) -> bool:
        return isinstance(value, str) and bool(value.strip())

    errors: list[str] = []

    stype = src.get("type", "")
    errors.extend(_check(
        # isinstance guard: an unhashable value (list/dict) must yield an
        # error message, not a TypeError from the set lookup.
        isinstance(stype, str) and stype in VALID_SOURCE_TYPES,
        f'{label}.type: "{stype}" not in {VALID_SOURCE_TYPES}',
    ))

    priority = src.get("priority", "")
    if priority:
        errors.extend(_check(
            priority in ("primary_source", "supplementary"),
            f'{label}.priority: "{priority}" must be primary_source or supplementary',
        ))

    # type-specific fields
    if stype == "table":
        errors.extend(_check(
            _non_empty_str(src.get("section")),
            f"{label}.section: required non-empty string for table source",
        ))
        row = src.get("row")
        errors.extend(_check(
            # bool is a subclass of int; JSON true/false is not a row index.
            isinstance(row, int) and not isinstance(row, bool),
            f"{label}.row: required int for table source",
        ))
    elif stype == "logic_tree":
        errors.extend(_check(
            _non_empty_str(src.get("image_id")),
            f"{label}.image_id: required non-empty string for logic_tree source",
        ))
        node_ids = src.get("node_ids", [])
        errors.extend(_check(
            isinstance(node_ids, list) and len(node_ids) > 0,
            f"{label}.node_ids: required non-empty list for logic_tree source",
        ))
    elif stype == "text":
        errors.extend(_check(
            _non_empty_str(src.get("section")),
            f"{label}.section: required non-empty string for text source",
        ))

    return errors
|
||||
|
||||
|
||||
def _validate_trigger(trigger: dict, label: str) -> list[str]:
    """Validate a rule's ``trigger`` object.

    ``operator`` must be one of ``VALID_TRIGGER_OPERATORS``. ``conditions`` is
    optional; when present it must be a list whose items are dicts carrying a
    non-empty string ``signal`` and an ``operator`` key.

    Args:
        trigger: The trigger dict (the caller has already checked the type).
        label: Prefix for error messages (e.g. ``rules[0].trigger``).

    Returns:
        Error messages; empty when the trigger is valid.
    """
    errors: list[str] = []

    operator = trigger.get("operator", "")
    errors.extend(_check(
        # isinstance guard: unhashable values must not raise in the set lookup.
        isinstance(operator, str) and operator in VALID_TRIGGER_OPERATORS,
        f'{label}.operator: "{operator}" not in {VALID_TRIGGER_OPERATORS}',
    ))

    conditions = trigger.get("conditions")
    if conditions is None:
        return errors
    if not isinstance(conditions, list):
        errors.append(f"{label}.conditions: must be a list")
        return errors

    # empty conditions is valid (e.g. "switch always off, no conditions")
    for ci, cond in enumerate(conditions):
        if not isinstance(cond, dict):
            errors.append(f"{label}.conditions[{ci}]: not a dict")
            continue
        signal = cond.get("signal")
        errors.extend(_check(
            isinstance(signal, str) and bool(signal.strip()),
            f"{label}.conditions[{ci}].signal: required non-empty string",
        ))
        errors.extend(_check(
            "operator" in cond,
            f"{label}.conditions[{ci}].operator: required",
        ))

    return errors
|
||||
|
||||
|
||||
def _validate_action(action: dict, label: str) -> list[str]:
    """Validate one entry of a rule's ``actions`` list.

    The action must be a dict whose ``type`` is one of ``VALID_ACTION_TYPES``
    and whose ``description`` is a non-empty string.

    Args:
        action: The action object to check.
        label: Prefix for error messages (e.g. ``rules[0].actions[2]``).

    Returns:
        Error messages; empty when the action is valid.
    """
    if not isinstance(action, dict):
        return [f"{label}: not a dict"]

    errors: list[str] = []

    atype = action.get("type", "")
    errors.extend(_check(
        # isinstance guard: unhashable values must not raise in the set lookup.
        isinstance(atype, str) and atype in VALID_ACTION_TYPES,
        f'{label}.type: "{atype}" not in {VALID_ACTION_TYPES}',
    ))

    description = action.get("description")
    errors.extend(_check(
        isinstance(description, str) and bool(description.strip()),
        f"{label}.description: required non-empty string",
    ))

    return errors
|
||||
|
||||
|
||||
def _find_nulls(obj: Any, label: str) -> list[str]:
    """Report every ``None`` nested inside *obj* as ``"<path>: null value"``.

    Dict children are addressed as ``label.key`` and list children as
    ``label[index]``; other value types are leaves and never reported.
    """
    if obj is None:
        return [f"{label}: null value"]

    if isinstance(obj, dict):
        children = [(f"{label}.{key}", value) for key, value in obj.items()]
    elif isinstance(obj, list):
        children = [(f"{label}[{pos}]", value) for pos, value in enumerate(obj)]
    else:
        return []

    found: list[str] = []
    for child_label, child in children:
        found += _find_nulls(child, child_label)
    return found
|
||||
|
||||
|
||||
# ── Top-level validation ────────────────────────────────────────────────────
|
||||
|
||||
def validate_ir(ir_data: dict) -> dict:
    """Validate a complete IR document.

    Returns:
        ``{"valid": bool, "errors": [str, ...], "stats": {...}}`` where
        ``stats`` holds ``total_rules``, ``valid_rules``,
        ``has_config_defaults`` and ``features``.
    """
    stats = {"total_rules": 0, "valid_rules": 0, "has_config_defaults": False, "features": 0}
    if not isinstance(ir_data, dict):
        return {"valid": False, "errors": ["IR root is not a dict"], "stats": stats}

    problems: list[str] = []

    # Required root keys; the two identifiers must also be non-blank strings.
    for key in ("feature", "feature_id", "rules"):
        if key not in ir_data:
            problems.append(f"root.{key}: missing required field")
            continue
        if key == "rules":
            continue  # the rules array is inspected in detail further down
        value = ir_data[key]
        if not (isinstance(value, str) and value.strip()):
            problems.append(f"root.{key}: must be non-empty string")

    # Optional config_defaults: only its container type is checked.
    if "config_defaults" in ir_data:
        stats["has_config_defaults"] = True
        if not isinstance(ir_data["config_defaults"], dict):
            problems.append("root.config_defaults: must be a dict")

    # Rules array: must be a non-empty list; each entry is validated on its own.
    rule_list = ir_data.get("rules", [])
    if not isinstance(rule_list, list):
        problems.append("root.rules: must be a list")
    elif not rule_list:
        problems.append("root.rules: must have at least one rule")
    else:
        stats["total_rules"] = len(rule_list)
        for position, rule in enumerate(rule_list):
            found = validate_rule(rule, position)
            if found:
                problems.extend(found)
            else:
                stats["valid_rules"] += 1

    # One document describes one feature, counted when feature_id is a string.
    if isinstance(ir_data.get("feature_id"), str):
        stats["features"] = 1

    return {"valid": not problems, "errors": problems, "stats": stats}
|
||||
|
||||
|
||||
# ── Summary helpers ─────────────────────────────────────────────────────────
|
||||
|
||||
def schema_checklist(ir_data: dict) -> list[dict]:
    """Run individual checks and return a checklist for reporting.

    Each item is ``{"check": str, "passed": bool, "detail": str}``. The list
    holds four root-level checks, one ``"<rule_id>: valid"`` entry per dict
    rule, a duplicate-id check and an overall ``"all rules valid"`` entry.

    A root that is not a dict fails the root checks instead of raising, and a
    rule whose ``rule_id`` is missing or not a string is listed under its
    positional label ``rules[<i>]``.
    """
    from collections import Counter  # local import: only this helper needs it

    report = validate_ir(ir_data)
    checks: list[dict] = []

    def _add(name: str, passed: bool, detail: str = "") -> None:
        checks.append({"check": name, "passed": passed, "detail": detail})

    def _non_empty_str(value: Any) -> bool:
        return isinstance(value, str) and bool(value.strip())

    # Top-level: fall back to an empty mapping so a non-dict root simply fails
    # each check rather than raising AttributeError on ``.get``.
    root = ir_data if isinstance(ir_data, dict) else {}
    rules = root.get("rules")

    _add("root is dict", isinstance(ir_data, dict))
    _add("root.feature present", _non_empty_str(root.get("feature")))
    _add("root.feature_id present", _non_empty_str(root.get("feature_id")))
    _add("root.rules is non-empty list", isinstance(rules, list) and len(rules) > 0)

    # Per-rule checks
    rule_ids: list[str] = []
    for i, rule in enumerate(rules if isinstance(rules, list) else []):
        if not isinstance(rule, dict):
            continue
        rid = rule.get("rule_id")
        if not isinstance(rid, str):
            # Keeps every collected id hashable for the duplicate check below.
            rid = f"rules[{i}]"
        rule_ids.append(rid)

        errs = validate_rule(rule, i)
        _add(f"{rid}: valid", not errs, "; ".join(errs))

    # Aggregate checks
    counts = Counter(rule_ids)
    duplicates = [r for r in rule_ids if counts[r] > 1]
    _add("no duplicate rule_ids", not duplicates,
         f"duplicates: {duplicates}" if duplicates else "")

    _add("all rules valid", report["valid"],
         f"{report['stats']['valid_rules']}/{report['stats']['total_rules']} valid")

    return checks
|
||||
@@ -0,0 +1,178 @@
|
||||
"""Structured JSON report generation for QE acceptance test results.
|
||||
|
||||
Produces a unified report with three-layer verdict:
|
||||
Layer A – Schema compliance
|
||||
Layer B – Structural coverage + stability
|
||||
Layer C – LLM QE expert audit
|
||||
|
||||
Final verdict: PASS (releasable) or FAIL (blocked).
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
|
||||
def generate_report(
    schema_result: dict,
    coverage_result: dict,
    audit_result: dict | None,
    *,
    commit: str = "",
    branch: str = "main",
    output_path: str | None = None,
) -> dict:
    """Combine the per-layer results into one report and return it.

    Args:
        schema_result: Layer A result, e.g.
            ``{"verdict": "PASS"|"FAIL", "total_checks": N, "passed": N, "failed": N}``.
        coverage_result: Layer B result, e.g.
            ``{"verdict": "PASS"|"FAIL", "coverage_rate": float,
            "stability": {"runs": N, "values": [...], "std": float}}``.
        audit_result: Layer C result, e.g.
            ``{"verdict": "ACCEPT"|"REJECT", "inadequate_ratio": float,
            "rationale": str, "section_assessments": [...]}``, or ``None`` to
            leave Layer C out (it then does not block the verdict).
        commit: git commit SHA recorded in the report.
        branch: branch name recorded in the report.
        output_path: when set, the report is also written there as UTF-8 JSON;
            missing parent directories are created.

    Returns:
        The report dict, including ``final_verdict`` (``"PASS"``/``"FAIL"``),
        ``releasable`` and ``failure_details``.
    """
    layers: dict[str, Any] = {"A_schema": schema_result, "B_coverage": coverage_result}
    if audit_result is not None:
        layers["C_qe_audit"] = audit_result

    # ── final verdict: every layer that is present has to pass ──
    layer_outcomes = (
        schema_result.get("verdict") == "PASS",
        coverage_result.get("verdict") == "PASS",
        audit_result is None or audit_result.get("verdict") == "ACCEPT",
    )
    releasable = all(layer_outcomes)

    report = {
        "timestamp": time.strftime("%Y-%m-%dT%H:%M:%S"),
        "commit": commit,
        "branch": branch,
        "layers": layers,
        "final_verdict": "PASS" if releasable else "FAIL",
        "releasable": releasable,
        "failure_details": _failure_details(layers),
    }

    if output_path:
        target = Path(output_path)
        target.parent.mkdir(parents=True, exist_ok=True)
        serialized = json.dumps(report, ensure_ascii=False, indent=2)
        target.write_text(serialized, encoding="utf-8")

    return report
|
||||
|
||||
|
||||
def _failure_details(layers: dict) -> list[str]:
    """Summarise which layers failed and why.

    Args:
        layers: Mapping with the keys ``A_schema``, ``B_coverage`` and,
            optionally, ``C_qe_audit``; each value is that layer's result dict.

    Returns:
        One human-readable line per failed layer (empty when all passed).
        A Layer A or B whose verdict is not ``"PASS"`` counts as failed; a
        present Layer C counts as failed unless its verdict is ``"ACCEPT"``.
    """
    details: list[str] = []

    schema = layers.get("A_schema", {})
    if schema.get("verdict") != "PASS":
        details.append(
            f"Layer A (Schema): {schema.get('failed', '?')}/{schema.get('total_checks', '?')} checks failed"
        )

    coverage = layers.get("B_coverage", {})
    if coverage.get("verdict") != "PASS":
        rate = coverage.get("coverage_rate", "?")
        # Report the threshold stored in the result; 0.70 is only the fallback
        # for results that do not carry one.
        threshold = coverage.get("coverage_threshold", 0.70)
        threshold_text = f"{threshold:.2f}" if isinstance(threshold, (int, float)) else str(threshold)
        line = f"Layer B (Coverage): rate={rate} (threshold: {threshold_text})"

        # Layer B can also fail on stability alone; say so when that is recorded.
        stability = coverage.get("stability")
        if isinstance(stability, dict) and stability.get("pass") is False:
            line += (
                f"; stability std={stability.get('std', '?')}"
                f" (threshold: {stability.get('threshold', '?')})"
            )
        details.append(line)

    audit = layers.get("C_qe_audit")
    # Any verdict other than ACCEPT blocks the release, so report all of them
    # (not just REJECT) to avoid a FAIL report with no explanation.
    if isinstance(audit, dict) and audit.get("verdict") != "ACCEPT":
        details.append(
            f"Layer C (QE Audit): {audit.get('verdict', '?')} — "
            f"inadequate_ratio={audit.get('inadequate_ratio', '?')}"
        )

    return details
|
||||
|
||||
|
||||
# ── Layer-specific result builders ──────────────────────────────────────────
|
||||
|
||||
def schema_verdict(errors: list[str], stats: dict) -> dict:
    """Build the Layer A result from schema validation errors and stats.

    One check is counted per rule plus one for the document root, so that
    ``passed + failed == total_checks`` always holds. A rule fails when it is
    not among ``stats["valid_rules"]``; the root check fails when *errors*
    contains a message that does not start with ``"rules["`` (the label prefix
    used for per-rule errors).

    Args:
        errors: Error strings produced by the schema validation.
        stats: Validation stats; ``total_rules`` and ``valid_rules`` are read
            (both default to 0).

    Returns:
        Dict with ``verdict`` (``"PASS"``/``"FAIL"``), ``total_checks``,
        ``passed``, ``failed``, ``rule_pass_rate`` and ``sample_errors`` (the
        first ten errors).
    """
    total_rules = stats.get("total_rules", 0)
    valid_rules = stats.get("valid_rules", 0)

    root_ok = all(err.startswith("rules[") for err in errors)
    total_checks = total_rules + 1  # every rule, plus the root itself
    passed = valid_rules + (1 if root_ok else 0)
    failed = total_checks - passed

    return {
        # Any reported error fails the layer, even if the counters disagree.
        "verdict": "PASS" if failed == 0 and not errors else "FAIL",
        "total_checks": total_checks,
        "passed": passed,
        "failed": failed,
        "rule_pass_rate": round(valid_rules / total_rules, 2) if total_rules > 0 else 0,
        "sample_errors": errors[:10],  # first 10 for the report
    }
|
||||
|
||||
|
||||
def coverage_verdict(
    coverage_rate: float,
    stability_std: float,
    stability_values: list[float],
    *,
    coverage_threshold: float = 0.70,
    stability_threshold: float = 0.05,
    section_coverage: dict | None = None,
    table_coverage: dict | None = None,
    diagram_coverage: dict | None = None,
) -> dict:
    """Build the Layer B result from coverage metrics.

    The layer passes when *coverage_rate* reaches *coverage_threshold* and
    *stability_std* does not exceed *stability_threshold*. The three optional
    breakdown dicts are copied into the result only when they are non-empty.
    """
    coverage_ok = coverage_rate >= coverage_threshold
    stability_ok = stability_std <= stability_threshold

    stability_summary = {
        "runs": len(stability_values),
        "values": [round(value, 3) for value in stability_values],
        "std": round(stability_std, 4),
        "threshold": stability_threshold,
        "pass": stability_ok,
    }
    result: dict[str, Any] = {
        "verdict": "PASS" if (coverage_ok and stability_ok) else "FAIL",
        "coverage_rate": round(coverage_rate, 3),
        "coverage_threshold": coverage_threshold,
        "coverage_pass": coverage_ok,
        "stability": stability_summary,
    }

    breakdowns = (
        ("section_coverage", section_coverage),
        ("table_coverage", table_coverage),
        ("diagram_coverage", diagram_coverage),
    )
    for key, payload in breakdowns:
        if payload:
            result[key] = payload

    return result
|
||||
|
||||
|
||||
def audit_verdict(audit_data: dict, *, inadequate_threshold: float = 0.30) -> dict:
    """Build the Layer C result from the LLM QE audit.

    *audit_data* should contain:
        inadequate_ratio: float
        rationale: str
        section_assessments: list[dict]

    The audit is accepted when ``inadequate_ratio`` does not exceed
    *inadequate_threshold*. A ratio that is missing, ``None``, NaN or not
    convertible to a number is treated as 1.0 (worst case) so that malformed
    audit output is rejected instead of raising.

    Returns:
        Dict with ``verdict`` (``"ACCEPT"``/``"REJECT"``), the rounded ratio,
        the threshold, the rationale and the section counters read from
        *audit_data* (each defaulting to 0).
    """
    ratio = audit_data.get("inadequate_ratio", 1.0)
    if not isinstance(ratio, (int, float)):
        try:
            ratio = float(ratio)
        except (TypeError, ValueError):
            ratio = 1.0
    if ratio != ratio:  # NaN is the only value that differs from itself
        ratio = 1.0

    accepted = ratio <= inadequate_threshold

    return {
        "verdict": "ACCEPT" if accepted else "REJECT",
        "inadequate_ratio": round(ratio, 3),
        "threshold": inadequate_threshold,
        "rationale": audit_data.get("rationale", ""),
        "total_sections": audit_data.get("total_functional_sections", 0),
        "adequate": audit_data.get("adequate", 0),
        "inadequate": audit_data.get("inadequate", 0),
        "not_applicable": audit_data.get("not_applicable", 0),
    }
|
||||
@@ -0,0 +1,558 @@
|
||||
"""QE Acceptance Test — Three-layer main branch health check.
|
||||
|
||||
Layer A (Schema): structural correctness of IR
|
||||
Layer B (Coverage): structural source-traceability coverage + stability
|
||||
Layer C (QE Audit): LLM as QE expert — functional coverage assessment
|
||||
|
||||
Final verdict: all three layers must pass for main to be releasable.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import math
|
||||
import re
|
||||
import statistics
|
||||
import tempfile
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import pytest
|
||||
|
||||
from .ir_schema import validate_ir, schema_checklist
|
||||
from .report import generate_report, schema_verdict, coverage_verdict, audit_verdict
|
||||
|
||||
|
||||
# ═══════════════════════════════════════════════════════════════════════════════
|
||||
# Layer A: SCHEMA — deterministic structural validation
|
||||
# ═══════════════════════════════════════════════════════════════════════════════
|
||||
|
||||
def test_layer_a_schema(ir_data: dict, request):
    """Layer A: the IR must pass schema validation.

    The assembled Layer A result (verdict, counters and the per-check list) is
    stashed under ``"layer_a"`` before the assertion, so it is recorded even
    when the test fails.
    """
    validation = validate_ir(ir_data)
    checklist = schema_checklist(ir_data)
    found_errors = validation["errors"]

    layer_a = schema_verdict(found_errors, validation["stats"])
    layer_a["checks"] = checklist

    # Store for downstream layers & report
    _stash(request, "layer_a", layer_a)

    assert validation["valid"], (
        f"Schema validation FAILED ({len(found_errors)} errors)\n"
        + "\n".join(f" - {e}" for e in found_errors[:20])
    )
|
||||
|
||||
|
||||
# ═══════════════════════════════════════════════════════════════════════════════
|
||||
# Layer B: STRUCTURAL COVERAGE + STABILITY
|
||||
# ═══════════════════════════════════════════════════════════════════════════════
|
||||
|
||||
# Section titles that are NOT functional requirements
|
||||
NON_FUNCTIONAL_PATTERNS = list(map(re.compile, (
    r"编制.*变更.*日志",
    r"文档背景",
    r"文档范围",
    r"术语解释",
    r"参考",
    r"附录",
    r"版本",
    r"变更记录",
    r"目录",
    r"前言",
    r"概述",
    r"简介",
    r"概述.*背景",
)))
|
||||
|
||||
|
||||
def _is_functional_section(section_name: str) -> bool:
    """Heuristically decide whether a section title is a functional requirement.

    A title for which ``_section_number`` returns something other than the
    title itself (i.e. it carries a leading number such as '3.1.1') is always
    functional. Otherwise the title is functional unless one of
    ``NON_FUNCTIONAL_PATTERNS`` (preface, glossary, changelog, ...) matches.
    """
    has_leading_number = _section_number(section_name) != section_name
    if has_leading_number:
        return True
    return not any(pattern.search(section_name) for pattern in NON_FUNCTIONAL_PATTERNS)
|
||||
|
||||
|
||||
def _extract_content_units(parsed_data: dict) -> dict:
    """Extract countable content units from parsed JSON.

    Args:
        parsed_data: Parsed document with optional ``sections`` (each holding a
            ``source`` title and ``blocks``) and ``image_analysis`` entries.
            Keys that are missing or ``null`` are treated as empty.

    Returns:
        ``{"functional_sections": [{"name": ..., "number": ...}, ...],
        "table_rows": int, "diagram_images": [rid, ...]}``
    """
    functional_sections: list[dict] = []
    total_table_rows = 0

    # ``or []`` tolerates JSON nulls, which ``.get(key, [])`` would pass through.
    for sec in parsed_data.get("sections") or []:
        name = sec.get("source", "")
        if _is_functional_section(name):
            functional_sections.append({
                "name": name,
                "number": _section_number(name),
            })

        # Table rows are counted for every section, functional or not.
        for block in sec.get("blocks") or []:
            if block.get("type") == "table":
                total_table_rows += len(block.get("rows") or [])

    # Diagram-type images from image_analysis
    diagram_types = ("flowchart", "logic_tree", "architecture",
                     "state", "sequence", "activity")
    diagram_rids: list[str] = [
        img.get("rid", "")
        for img in parsed_data.get("image_analysis") or []
        if img.get("type", "") in diagram_types
    ]

    return {
        "functional_sections": functional_sections,
        "table_rows": total_table_rows,
        "diagram_images": diagram_rids,
    }
|
||||
|
||||
|
||||
def _section_number(section_name: str) -> str:
    """Extract the leading section number, e.g. '3.1.1 系统限制' → '3.1.1'.

    Only dot-separated digit groups are taken, so a trailing dot is not part
    of the number ('3.1. Title' → '3.1'). When the name does not start with a
    digit, the name itself is returned unchanged.
    """
    m = re.match(r"\d+(?:\.\d+)*", section_name)
    return m.group(0) if m else section_name
|
||||
|
||||
|
||||
def _section_matches(sec_ref: str, func_sections: list[dict]) -> str | None:
    """Find a functional section matching *sec_ref*; return its name or None.

    Matching is tried in this order, first hit wins: exact name → one string
    is a prefix of the other → equal section number → one string contains the
    other.

    An empty *sec_ref* never matches, and sections with an empty name are
    ignored: the empty string is a prefix and a substring of every string, so
    it would otherwise match anything.

    Args:
        sec_ref: Section reference taken from an IR source entry.
        func_sections: Items shaped like ``{"name": str, "number": str}``.
    """
    if not sec_ref:
        return None
    candidates = [s for s in func_sections if s["name"]]

    # exact
    for s in candidates:
        if s["name"] == sec_ref:
            return s["name"]

    # one is a prefix of the other
    for s in candidates:
        if s["name"].startswith(sec_ref) or sec_ref.startswith(s["name"]):
            return s["name"]

    # number match
    sec_num = _section_number(sec_ref)
    if sec_num:
        for s in candidates:
            if s["number"] == sec_num:
                return s["name"]

    # substring
    for s in candidates:
        if sec_ref in s["name"] or s["name"] in sec_ref:
            return s["name"]

    return None
|
||||
|
||||
|
||||
def _measure_coverage(ir_data: dict, parsed_data: dict) -> dict:
    """Compute structural coverage of the IR over the parsed document.

    Three dimensions are derived from the ``sources`` of every IR rule:

    * sections: functional sections matched by a source's ``section``;
    * table rows: distinct ``(section, row)`` pairs of ``type == "table"``
      sources, relative to the number of table rows in the document;
    * diagrams: document diagram rids referenced through the ``image_id``
      of ``type == "logic_tree"`` sources.

    Args:
        ir_data: IR dict; only its ``"rules"`` list is read.
        parsed_data: Parsed document passed to ``_extract_content_units``.

    Returns:
        {
            "section_coverage": {total, covered, rate, uncovered},
            "table_coverage": {total_rows, covered_rows, rate},
            "diagram_coverage": {total, covered, rate, uncovered},
            "overall_rate": float,
        }
        Every rate lies in [0, 1]. ``overall_rate`` is the mean rate of the
        dimensions that occur in the document (total > 0), or 0.0 when
        none does.
    """
    units = _extract_content_units(parsed_data)
    rules = ir_data.get("rules", [])

    func_sections = units["functional_sections"]
    total_rows = units["table_rows"]
    diagram_rids = units["diagram_images"]
    known_rids = set(diagram_rids)  # O(1) membership tests in the loop

    covered_sections: set[str] = set()
    covered_rows: set[tuple] = set()
    covered_rids: set[str] = set()

    # One pass over all sources feeds the three dimensions.
    for rule in rules:
        for src in rule.get("sources", []):
            sec_ref = src.get("section", "")
            if sec_ref:
                matched = _section_matches(sec_ref, func_sections)
                if matched:
                    covered_sections.add(matched)

            src_type = src.get("type")
            if src_type == "table":
                row = src.get("row")
                if sec_ref and row is not None:
                    covered_rows.add((sec_ref, row))
            elif src_type == "logic_tree":
                img_id = src.get("image_id", "")
                if img_id and img_id in known_rids:
                    covered_rids.add(img_id)

    def _rate(covered: int, total: int) -> float:
        """Covered/total clamped to [0, 1]; 0.0 for an empty dimension."""
        if total <= 0:
            return 0.0
        # Table pairs come from the IR and are not validated against the
        # document, so they can outnumber the real rows; cap at 100%.
        return round(min(covered / total, 1.0), 3)

    section_coverage = {
        "total": len(func_sections),
        "covered": len(covered_sections),
        "rate": _rate(len(covered_sections), len(func_sections)),
        "uncovered": [s["name"] for s in func_sections
                      if s["name"] not in covered_sections],
    }
    table_coverage = {
        "total_rows": total_rows,
        "covered_rows": len(covered_rows),
        "rate": _rate(len(covered_rows), total_rows),
    }
    diagram_coverage = {
        "total": len(diagram_rids),
        "covered": len(covered_rids),
        "rate": _rate(len(covered_rids), len(diagram_rids)),
        "uncovered": [r for r in diagram_rids if r not in covered_rids],
    }

    # Average only over dimensions the document actually contains; otherwise
    # a document without tables or diagrams could never exceed 2/3 (or 1/3)
    # no matter how complete the IR is.
    rates = [
        dim["rate"]
        for dim, total in (
            (section_coverage, len(func_sections)),
            (table_coverage, total_rows),
            (diagram_coverage, len(diagram_rids)),
        )
        if total > 0
    ]
    overall = round(sum(rates) / len(rates), 3) if rates else 0.0

    return {
        "section_coverage": section_coverage,
        "table_coverage": table_coverage,
        "diagram_coverage": diagram_coverage,
        "overall_rate": overall,
    }
|
||||
|
||||
|
||||
def test_layer_b_coverage(
    ir_data: dict,
    parsed_data: dict | None,
    ir_path: str,
    acceptance_runs: int,
    run_ir_pipeline,
    request,
):
    """Check structural coverage of the IR and, on multi-run setups, its stability.

    B1 measures coverage of ``ir_data`` over ``parsed_data``. B2 re-runs the
    IR pipeline ``acceptance_runs - 1`` more times (only when more than one
    run is requested and the ``--parsed-path`` option names an existing
    file) and takes the sample standard deviation of the overall rates.
    The verdict is stashed under ``"layer_b"`` before the assertions run.
    """
    if parsed_data is None:
        pytest.skip("No parsed JSON available for coverage analysis")

    # ── B1: coverage of the IR under test ──
    cov = _measure_coverage(ir_data, parsed_data)

    # ── B2: decide how many extra pipeline runs are possible ──
    stability_values: list[float] = [cov["overall_rate"]]
    extra_runs = 0
    parsed_path = None
    if acceptance_runs > 1:
        parsed_path = request.config.getoption("--parsed-path")
        if parsed_path and os.path.exists(parsed_path):
            extra_runs = acceptance_runs - 1

    for _run in range(extra_runs):
        try:
            ir_list, _ = run_ir_pipeline(parsed_path)
            # The pipeline yields list-format IR; wrap it before measuring.
            rerun_cov = _measure_coverage(_wrap_list_ir(ir_list), parsed_data)
            stability_values.append(rerun_cov["overall_rate"])
            time.sleep(0.5)  # rate limiting between runs
        except Exception as e:
            pytest.fail(f"Stability run failed: {e}")

    multi_run = len(stability_values) > 1
    stability_std = statistics.stdev(stability_values) if multi_run else 0.0

    # Build and publish the Layer B result.
    sec_cov = cov["section_coverage"]
    tbl_cov = cov["table_coverage"]
    dia_cov = cov["diagram_coverage"]
    b_result = coverage_verdict(
        coverage_rate=cov["overall_rate"],
        stability_std=stability_std,
        stability_values=stability_values,
        section_coverage=sec_cov,
        table_coverage=tbl_cov,
        diagram_coverage=dia_cov,
    )
    _stash(request, "layer_b", b_result)

    # Both B1 and B2 must pass.
    assert b_result["coverage_pass"], (
        f"Coverage {cov['overall_rate']:.1%} < threshold 70%\n"
        f" Sections: {sec_cov['covered']}/{sec_cov['total']} "
        f"({sec_cov['rate']:.1%})\n"
        f" Uncovered: {sec_cov['uncovered']}\n"
        f" Table rows: {tbl_cov['covered_rows']}/{tbl_cov['total_rows']} "
        f"({tbl_cov['rate']:.1%})\n"
        f" Diagrams: {dia_cov['covered']}/{dia_cov['total']} "
        f"({dia_cov['rate']:.1%})\n"
        f" Uncovered diagrams: {dia_cov['uncovered']}"
    )

    if multi_run:
        assert b_result["stability"]["pass"], (
            f"Coverage stability std={stability_std:.4f} > threshold 0.05\n"
            f" Values across {len(stability_values)} runs: {stability_values}"
        )
|
||||
|
||||
|
||||
def _wrap_list_ir(ir_list: list) -> dict:
|
||||
"""Wrap a list-format IR (from ir_generator.py) into a dict for schema compat."""
|
||||
# Convert simple format to rich format for coverage measurement
|
||||
rules = []
|
||||
for i, entry in enumerate(ir_list):
|
||||
if not isinstance(entry, dict):
|
||||
continue
|
||||
rule = {
|
||||
"rule_id": f"GEN-001-RULE-{i:03d}",
|
||||
"description": entry.get("function", ""),
|
||||
"path": [],
|
||||
"priority": "P2",
|
||||
"sources": [],
|
||||
"precondition": {},
|
||||
"trigger": entry.get("trigger", {"operator": "AND", "conditions": []}),
|
||||
"actions": [],
|
||||
}
|
||||
# Convert source
|
||||
src = entry.get("source", {})
|
||||
if src.get("section"):
|
||||
rule["sources"].append({
|
||||
"type": "text",
|
||||
"section": src["section"],
|
||||
"paragraph": 1,
|
||||
"text_snippet": src.get("location", ""),
|
||||
"priority": "primary_source",
|
||||
})
|
||||
rules.append(rule)
|
||||
|
||||
return {
|
||||
"feature": "generated",
|
||||
"feature_id": "GEN-001",
|
||||
"rules": rules,
|
||||
}
|
||||
|
||||
|
||||
# ═══════════════════════════════════════════════════════════════════════════════
|
||||
# Layer C: LLM QE EXPERT AUDIT
|
||||
# ═══════════════════════════════════════════════════════════════════════════════
|
||||
|
||||
# Prompt template for the Layer C audit. It is filled with str.format() using
# the fields coverage_summary, parsed_content and ir_content; the braces of the
# JSON output example are doubled ({{ }}) so that format() emits them literally.
QE_AUDITOR_PROMPT = """你是一个资深 QE 专家,负责审查需求文档的 IR(中间表示层)是否充分覆盖了源文档的所有可测试功能点。
|
||||
|
||||
你不是 IR 的生成者,你是独立的质量审计员。你的职责是判断 IR 的功能覆盖率是否充分。
|
||||
|
||||
## 审计输入
|
||||
|
||||
### Layer B 结构化覆盖率数据(参考)
|
||||
{coverage_summary}
|
||||
|
||||
### 源文档内容(Parsed JSON)
|
||||
{parsed_content}
|
||||
|
||||
### 生成的 IR(待审计)
|
||||
{ir_content}
|
||||
|
||||
## 审计要求
|
||||
|
||||
对源文档中的每个章节逐一评估其功能需求是否被 IR 充分覆盖。
|
||||
|
||||
**判断标准**:
|
||||
- **adequate**(充分覆盖):该章节的所有功能需求在 IR 中都有对应的 rule,包括触发条件、执行动作
|
||||
- **inadequate**(覆盖不足):该章节存在功能需求未在 IR 中体现,或描述不完整(缺少触发条件或动作)
|
||||
- **not_applicable**(不适用):该章节为背景介绍、术语定义、变更日志等,不包含功能需求
|
||||
|
||||
**注意**:
|
||||
- 如果某个章节涉及多个决策路径(如流程图),检查 IR 是否覆盖了每条路径
|
||||
- 表格中的每个功能行都应被至少一个 IR rule 覆盖
|
||||
- 图片分析中的流程图/决策树节点应被 IR 引用
|
||||
|
||||
## 输出格式
|
||||
|
||||
请严格输出以下 JSON 格式(不要包含代码块标记):
|
||||
|
||||
{{
|
||||
"total_functional_sections": <number>,
|
||||
"adequate": <number>,
|
||||
"inadequate": <number>,
|
||||
"not_applicable": <number>,
|
||||
"inadequate_ratio": <float>,
|
||||
"verdict": "ACCEPT 或 REJECT",
|
||||
"rationale": "<一句话说明接受或拒绝的理由>",
|
||||
"section_assessments": [
|
||||
{{
|
||||
"section": "<章节名>",
|
||||
"assessment": "adequate | inadequate | not_applicable",
|
||||
"reason": "<评估理由>",
|
||||
"missing": ["<缺失项1>", "<缺失项2>"] // 仅 inadequate 时需要
|
||||
}}
|
||||
]
|
||||
}}
|
||||
|
||||
verdict 判定规则:
|
||||
- inadequate_ratio ≤ 0.30 → "ACCEPT"(风险可控)
|
||||
- inadequate_ratio > 0.30 → "REJECT"(功能点认知差异大,需要补充 IR)
|
||||
"""
|
||||
|
||||
|
||||
def test_layer_c_qe_audit(
    ir_data: dict, parsed_data: dict | None, llm_client, request
):
    """LLM QE expert audit of functional coverage.

    Builds the audit prompt from the stashed Layer B result (if any), the
    parsed document and the IR, sends it to ``llm_client.chat`` and asserts
    that the resulting verdict is ``"ACCEPT"``. The verdict is stashed under
    ``"layer_c"`` before asserting. The test is skipped without parsed data
    and fails when the LLM call raises or its answer cannot be parsed.
    """
    if parsed_data is None:
        pytest.skip("No parsed JSON available — cannot run QE audit")

    # ── get Layer B summary for context ──
    layer_b = _unstash(request, "layer_b") or {}
    cov_summary = json.dumps(
        {
            "coverage_rate": layer_b.get("coverage_rate", "N/A"),
            "section_coverage": layer_b.get("section_coverage", {}),
            "diagram_coverage": layer_b.get("diagram_coverage", {}),
        },
        ensure_ascii=False,
        indent=2,
    )

    # ── prepare content (trim to avoid token overflow) ──
    parsed_str = json.dumps(parsed_data, ensure_ascii=False)
    ir_str = json.dumps(ir_data, ensure_ascii=False)

    max_parsed = 12000
    max_ir = 8000
    if len(parsed_str) > max_parsed:
        parsed_str = parsed_str[:max_parsed] + "\n...[truncated]"
    if len(ir_str) > max_ir:
        ir_str = ir_str[:max_ir] + "\n...[truncated]"

    prompt = QE_AUDITOR_PROMPT.format(
        coverage_summary=cov_summary,
        parsed_content=parsed_str,
        ir_content=ir_str,
    )

    # ── call LLM ──
    try:
        raw = llm_client.chat(
            model=llm_client.TEXT_MODEL,
            messages=[{"role": "user", "content": prompt}],
            response_format={"type": "json_object"},
        )
    except Exception as e:
        pytest.fail(f"QE audit LLM call failed: {e}")

    # ── parse response ──
    audit_data = _parse_json_response(raw)
    if audit_data is None:
        # str() keeps the failure message printable when the client returned
        # None; slicing None would raise TypeError and hide the real cause.
        pytest.fail(f"QE audit returned unparseable response:\n{str(raw)[:500]}")

    # Build Layer C result
    c_result = audit_verdict(audit_data)
    c_result["raw_assessments"] = audit_data.get("section_assessments", [])
    _stash(request, "layer_c", c_result)

    # Assert
    assert c_result["verdict"] == "ACCEPT", (
        f"QE Audit REJECTED — inadequate_ratio={c_result['inadequate_ratio']:.1%} > 30%\n"
        f" Rationale: {c_result['rationale']}\n"
        f" Adequate: {c_result['adequate']}, Inadequate: {c_result['inadequate']}"
    )
|
||||
|
||||
|
||||
# ═══════════════════════════════════════════════════════════════════════════════
|
||||
# Final report (runs last)
|
||||
# ═══════════════════════════════════════════════════════════════════════════════
|
||||
|
||||
def test_final_report(ir_data: dict, ir_path: str, request):
    """Generate the final three-layer JSON report and print a summary.

    A layer that stashed no result is reported with verdict ``"SKIPPED"``.
    The report path is the ``--json-report-file`` option value, falling back
    to ``acceptance-report.json`` in the current working directory; it is
    handed to ``generate_report`` as ``output_path``. After printing the
    summary the test fails if the report lists any ``failure_details``.
    """
    results = {
        name: _unstash(request, name) or {"verdict": "SKIPPED"}
        for name in ("layer_a", "layer_b", "layer_c")
    }
    layer_a = results["layer_a"]
    layer_b = results["layer_b"]
    layer_c = results["layer_c"]

    configured_path = request.config.getoption("--json-report-file", None)
    if configured_path:
        report_path = configured_path
    else:
        report_path = str(Path.cwd() / "acceptance-report.json")

    report = generate_report(
        layer_a,
        layer_b,
        layer_c,
        commit=os.environ.get("GITEA_SHA", ""),
        branch=os.environ.get("GITEA_BRANCH", "main"),
        output_path=report_path,
    )

    # Console summary
    banner = "=" * 60
    print(f"\n{banner}")
    print("QE ACCEPTANCE REPORT")
    print(banner)
    print(f" Layer A (Schema): {layer_a.get('verdict', '?')}")
    print(f" Layer B (Coverage): {layer_b.get('verdict', '?')} "
          f"(rate={layer_b.get('coverage_rate', '?')})")
    print(f" Layer C (QE Audit): {layer_c.get('verdict', '?')}")
    print(f" {'─' * 40}")
    print(f" FINAL: {report['final_verdict']} | "
          f"Releasable: {report['releasable']}")
    print(f" Report: {report_path}")
    print(f"{banner}\n")

    # Aggregate assertion: any recorded layer failure fails this test.
    failures = report.get("failure_details", [])
    if not failures:
        return
    pytest.fail(
        "Acceptance tests FAILED:\n" + "\n".join(f" - {f}" for f in failures)
    )
|
||||
|
||||
|
||||
# ═══════════════════════════════════════════════════════════════════════════════
|
||||
# Helpers
|
||||
# ═══════════════════════════════════════════════════════════════════════════════
|
||||
|
||||
import os # noqa: E402
|
||||
|
||||
# Module-level stash for sharing results across tests in the same module.
|
||||
# Each test function stores its result here; later tests read earlier results.
|
||||
_module_stash: dict[str, dict] = {}
|
||||
|
||||
|
||||
def _stash(request, key: str, value: dict):
    """Record *value* under *key* in the module-level stash.

    *request* is accepted for call-site symmetry but is not used.
    """
    _module_stash.update({key: value})
|
||||
|
||||
|
||||
def _unstash(request, key: str) -> dict | None:
    """Return the result stashed under *key*, or None if nothing was stored.

    *request* is accepted for call-site symmetry but is not used.
    """
    if key in _module_stash:
        return _module_stash[key]
    return None
|
||||
|
||||
|
||||
def _parse_json_response(raw: str) -> dict | None:
|
||||
"""Parse JSON from an LLM response, handling markdown code fences."""
|
||||
if not raw:
|
||||
return None
|
||||
text = raw.strip()
|
||||
if text.startswith("```"):
|
||||
nl = text.find("\n")
|
||||
text = text[nl + 1:] if nl != -1 else text[3:]
|
||||
if text.endswith("```"):
|
||||
text = text[:-3]
|
||||
try:
|
||||
return json.loads(text)
|
||||
except json.JSONDecodeError:
|
||||
return None
|
||||
+10
-5
@@ -55,12 +55,17 @@ def test_import_detect_conflicts():
|
||||
|
||||
# -- IR generation tests ------------------------------------------------------
|
||||
|
||||
def test_import_ir_main():
    """ir_generation main module should be importable (new project structure)."""
    # A placeholder key is set so the import does not depend on a real one.
    os.environ.setdefault("DASHSCOPE_API_KEY", "test-fake-key")

    # The skill directory sits two levels above this file, under
    # skills/ir_generation_skill; it must be on sys.path for ``import main``.
    skill_dir = os.path.join(
        os.path.dirname(os.path.dirname(__file__)),
        "skills", "ir_generation_skill"
    )
    if skill_dir not in sys.path:
        sys.path.insert(0, skill_dir)

    import main
    assert hasattr(main, "main")
|
||||
|
||||
|
||||
# -- Resolution application tests ---------------------------------------------
|
||||
|
||||
Reference in New Issue
Block a user