fec4c09ee0
CI / test (push) Successful in 8s
doc_parser_skill: - New: verify_flowchart.py (flowchart validation) - Updated: LLM.py (multi-provider: DeepSeek + DashScope) - Updated: image_parser.py (logic tree support, external prompts) - Updated: SKILL.md, prompts/image_prompt.md conflict_detection_skill: - Updated: LLM.py (multi-provider sync) - Updated: detect_conflicts.py (logic tree text conversion) ir_generation_skill: - Replaced old scripts/LLM.py + ir_generator.py with standalone project - New: main.py, config.py, step1-3_*.py, ensemble_merge.py - New: prompts/, tests/ subdirectories tests: - New: acceptance/ test suite with schema validation - Fixed: conftest no longer globally skips non-acceptance tests - Updated: test_sample.py for new ir_generation structure Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
179 lines
5.8 KiB
Python
179 lines
5.8 KiB
Python
"""Structured JSON report generation for QE acceptance test results.

Produces a unified report with a three-layer verdict:
    Layer A – Schema compliance
    Layer B – Structural coverage + stability
    Layer C – LLM QE expert audit

Final verdict: PASS (releasable) or FAIL (blocked).
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import json
|
||
import time
|
||
from pathlib import Path
|
||
from typing import Any
|
||
|
||
|
||
def generate_report(
    schema_result: dict,
    coverage_result: dict,
    audit_result: dict | None,
    *,
    commit: str = "",
    branch: str = "main",
    output_path: str | None = None,
) -> dict:
    """Combine the per-layer results into one report and return it.

    The report is releasable only when Layer A and Layer B both carry the
    verdict ``"PASS"`` and Layer C is either absent (``audit_result`` is
    ``None``) or carries the verdict ``"ACCEPT"``.

    Args:
        schema_result: Layer A result; its ``"verdict"`` key is read here.
        coverage_result: Layer B result; its ``"verdict"`` key is read here.
        audit_result: Layer C result, or ``None`` when no audit was run. When
            ``None`` the layer is left out of the report and does not block
            the release.
        commit: git commit SHA, copied into the report as-is.
        branch: branch name, copied into the report as-is.
        output_path: when truthy, the report is also written to this path as
            UTF-8 JSON (parent directories are created on demand).

    Returns:
        The report dict with keys ``timestamp``, ``commit``, ``branch``,
        ``layers``, ``final_verdict``, ``releasable`` and ``failure_details``.
    """
    audit_present = audit_result is not None

    layers: dict[str, Any] = {"A_schema": schema_result, "B_coverage": coverage_result}
    if audit_present:
        layers["C_qe_audit"] = audit_result

    # One gate per layer; a missing audit counts as an open gate.
    gates = [
        schema_result.get("verdict") == "PASS",
        coverage_result.get("verdict") == "PASS",
        (not audit_present) or audit_result.get("verdict") == "ACCEPT",
    ]
    releasable = all(gates)

    report = {
        # time.strftime without a time tuple formats the current local time.
        "timestamp": time.strftime("%Y-%m-%dT%H:%M:%S"),
        "commit": commit,
        "branch": branch,
        "layers": layers,
        "final_verdict": "PASS" if releasable else "FAIL",
        "releasable": releasable,
        "failure_details": _failure_details(layers),
    }

    if not output_path:
        return report

    target = Path(output_path)
    target.parent.mkdir(parents=True, exist_ok=True)
    serialized = json.dumps(report, ensure_ascii=False, indent=2)
    target.write_text(serialized, encoding="utf-8")
    return report
|
||
|
||
|
||
def _failure_details(layers: dict) -> list[str]:
    """Summarise which layers failed and why.

    A layer is reported with the same rule the final verdict uses: Layer A and
    Layer B fail unless their verdict is ``"PASS"``; Layer C, when present,
    fails unless its verdict is ``"ACCEPT"`` (not only on ``"REJECT"``), so a
    failing report never comes back with an empty detail list.

    Args:
        layers: mapping holding the layer results under ``"A_schema"``,
            ``"B_coverage"`` and, optionally, ``"C_qe_audit"``.

    Returns:
        At most one human-readable line per failed layer, in A, B, C order.
        Empty when nothing failed.
    """
    details: list[str] = []

    schema = layers.get("A_schema", {})
    if schema.get("verdict") != "PASS":
        details.append(
            f"Layer A (Schema): {schema.get('failed', '?')}/{schema.get('total_checks', '?')} checks failed"
        )

    coverage = layers.get("B_coverage", {})
    if coverage.get("verdict") != "PASS":
        rate = coverage.get("coverage_rate", "?")
        # Report the threshold the layer was actually judged against; 0.70 is
        # only the fallback for results that do not carry one.
        threshold = coverage.get("coverage_threshold", 0.70)
        if isinstance(threshold, (int, float)) and not isinstance(threshold, bool):
            threshold = f"{threshold:.2f}"
        line = f"Layer B (Coverage): rate={rate} (threshold: {threshold})"

        # Layer B can also fail on run-to-run stability alone; say so instead
        # of showing only a coverage rate that may look perfectly fine.
        stability = coverage.get("stability")
        if isinstance(stability, dict) and stability.get("pass") is False:
            line += (
                f"; stability std={stability.get('std', '?')}"
                f" (threshold: {stability.get('threshold', '?')})"
            )
        details.append(line)

    # Layer C is optional: an absent audit is not a failure.
    audit = layers.get("C_qe_audit")
    if audit is not None and audit.get("verdict") != "ACCEPT":
        verdict = audit.get("verdict") or "?"
        details.append(
            f"Layer C (QE Audit): {verdict} — inadequate_ratio={audit.get('inadequate_ratio', '?')}"
        )

    return details
|
||
|
||
|
||
# ── Layer-specific result builders ──────────────────────────────────────────
|
||
|
||
def schema_verdict(errors: list[str], stats: dict) -> dict:
    """Build the Layer A result from schema validation errors and stats.

    Args:
        errors: validation error messages; each one counts as a failed check.
        stats: counters from the validator. ``"total_rules"`` and
            ``"valid_rules"`` are read, each defaulting to 0 when missing.

    Returns:
        A dict with ``verdict`` (``"PASS"`` only when there are no errors and
        no invalid rules), ``total_checks``, ``passed``, ``failed``,
        ``rule_pass_rate`` and ``sample_errors`` (the first 10 errors).
    """
    rule_count = stats.get("total_rules", 0)
    valid_count = stats.get("valid_rules", 0)

    # Every error message and every rule that did not validate is a failure.
    failure_count = len(errors) + (rule_count - valid_count)

    # At minimum the root was checked, so never report zero checks.
    # NOTE(review): ``failed`` can exceed ``total_checks`` when errors are
    # reported on top of invalid rules — confirm that is intended.
    checks_run = max(rule_count, 1)

    if rule_count > 0:
        pass_rate = round(valid_count / checks_run, 2)
    else:
        pass_rate = 0

    return {
        "verdict": "FAIL" if failure_count else "PASS",
        "total_checks": checks_run,
        "passed": valid_count,
        "failed": failure_count,
        "rule_pass_rate": pass_rate,
        "sample_errors": errors[:10],  # keep the report short
    }
|
||
|
||
|
||
def coverage_verdict(
    coverage_rate: float,
    stability_std: float,
    stability_values: list[float],
    *,
    coverage_threshold: float = 0.70,
    stability_threshold: float = 0.05,
    section_coverage: dict | None = None,
    table_coverage: dict | None = None,
    diagram_coverage: dict | None = None,
) -> dict:
    """Build the Layer B result from coverage metrics.

    The layer passes only when both gates hold: ``coverage_rate`` is at least
    ``coverage_threshold`` and ``stability_std`` is at most
    ``stability_threshold``.

    Args:
        coverage_rate: measured coverage rate; reported rounded to 3 places.
        stability_std: spread of the repeated runs; reported rounded to 4 places.
        stability_values: per-run values; each is reported rounded to 3 places
            and their count is reported as ``runs``.
        coverage_threshold: minimum acceptable ``coverage_rate``.
        stability_threshold: maximum acceptable ``stability_std``.
        section_coverage: optional breakdown, attached only when non-empty.
        table_coverage: optional breakdown, attached only when non-empty.
        diagram_coverage: optional breakdown, attached only when non-empty.

    Returns:
        A dict with ``verdict``, ``coverage_rate``, ``coverage_threshold``,
        ``coverage_pass``, a nested ``stability`` dict, and whichever optional
        breakdowns were supplied.
    """
    rate_ok = coverage_rate >= coverage_threshold
    stable = stability_std <= stability_threshold
    verdict = "PASS" if (rate_ok and stable) else "FAIL"

    rounded_rate = round(coverage_rate, 3)
    stability_block = {
        "runs": len(stability_values),
        "values": [round(sample, 3) for sample in stability_values],
        "std": round(stability_std, 4),
        "threshold": stability_threshold,
        "pass": stable,
    }

    result: dict[str, Any] = {
        "verdict": verdict,
        "coverage_rate": rounded_rate,
        "coverage_threshold": coverage_threshold,
        "coverage_pass": rate_ok,
        "stability": stability_block,
    }

    # Optional breakdowns are attached in a fixed order and skipped when
    # missing or empty.
    optional_parts = (
        ("section_coverage", section_coverage),
        ("table_coverage", table_coverage),
        ("diagram_coverage", diagram_coverage),
    )
    for key, payload in optional_parts:
        if payload:
            result[key] = payload

    return result
|
||
|
||
|
||
def audit_verdict(audit_data: dict, *, inadequate_threshold: float = 0.30) -> dict:
    """Build the Layer C result from the LLM QE audit output.

    The audit is accepted when its ``inadequate_ratio`` is at most
    ``inadequate_threshold``. The ratio is treated as untrusted: a missing,
    null, non-numeric or non-finite value is replaced by ``1.0`` so that a
    malformed audit is rejected instead of raising or slipping through.

    Args:
        audit_data: raw audit output. Keys read here: ``inadequate_ratio``,
            ``rationale``, ``total_functional_sections``, ``adequate``,
            ``inadequate`` and ``not_applicable``. Any other keys are ignored.
        inadequate_threshold: largest ``inadequate_ratio`` still accepted.

    Returns:
        A dict with ``verdict`` (``"ACCEPT"`` or ``"REJECT"``),
        ``inadequate_ratio`` (rounded to 3 places), ``threshold``,
        ``rationale``, ``total_sections``, ``adequate``, ``inadequate`` and
        ``not_applicable``.
    """
    import math  # local import: only needed for the finiteness check below

    try:
        ratio = float(audit_data.get("inadequate_ratio", 1.0))
    except (TypeError, ValueError):
        # Null or unparseable value: fail closed, same as a missing key.
        ratio = 1.0
    if not math.isfinite(ratio):
        # NaN/inf would compare unpredictably and serialise to invalid JSON.
        ratio = 1.0

    accepted = ratio <= inadequate_threshold

    return {
        "verdict": "ACCEPT" if accepted else "REJECT",
        "inadequate_ratio": round(ratio, 3),
        "threshold": inadequate_threshold,
        "rationale": audit_data.get("rationale", ""),
        "total_sections": audit_data.get("total_functional_sections", 0),
        "adequate": audit_data.get("adequate", 0),
        "inadequate": audit_data.get("inadequate", 0),
        "not_applicable": audit_data.get("not_applicable", 0),
    }
|