sync: update all skills from latest workspace code

doc_parser_skill: - New: verify_flowchart.py (flowchart validation) - Updated: LLM.py (multi-provider: DeepSeek + DashScope) - Updated: image_parser.py (logic tree support, external prompts) - Updated: SKILL.md, prompts/image_prompt.md conflict_detection_skill: - Updated: LLM.py (multi-provider sync) - Updated: detect_conflicts.py (logic tree text conversion) ir_generation_skill: - Replaced old scripts/LLM.py + ir_generator.py with standalone project - New: main.py, config.py, step1-3_*.py, ensemble_merge.py - New: prompts/, tests/ subdirectories tests: - New: acceptance/ test suite with schema validation - Fixed: conftest no longer globally skips non-acceptance tests - Updated: test_sample.py for new ir_generation structure Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-30 22:45:08 +08:00
parent db64df2da1
commit fec4c09ee0
35 changed files with 8021 additions and 530 deletions
@@ -0,0 +1,593 @@
+"""
+Deterministic ensemble merge for semantic index generation.
+
+All functions are pure Python with zero LLM calls. Fully testable with mock data.
+
+Cross-references N semantic_index outputs (generated with different temperatures)
+and produces a single merged index with confidence scores.
+
+Used by: step1_semantic_index.py
+Tested by: tests/test_ensemble_merge.py
+"""
+
+from collections import defaultdict
+from difflib import SequenceMatcher
+
+
+# =============================================================================
+# Concept Name Similarity
+# =============================================================================
+
def concept_name_similarity(name_a: str, name_b: str) -> float:
    """Compute similarity between two concept names for cross-version matching.

    Strategy (in order of precedence):
      1. Exact string match -> 1.0
      2. Exactly one name is empty -> 0.0
      3. Substring containment (one is a substring of the other):
           - comparable length (shorter / longer >= 0.5) -> 0.875-0.90
           - very different length -> 0.55 (deliberately below 0.7)
      4. SequenceMatcher ratio on character sequences -> 0.0-1.0

    Args:
        name_a: First concept name.
        name_b: Second concept name.

    Returns:
        float in [0.0, 1.0] where >= 0.7 means "likely the same concept".
    """
    if name_a == name_b:
        return 1.0

    # An empty string is a substring of every string, so without this guard an
    # empty name would be scored as "contained" (0.55) instead of "no overlap".
    if not name_a or not name_b:
        return 0.0

    # Substring containment: one name is contained in the other
    if name_a in name_b or name_b in name_a:
        # Only count as similar if they're of comparable length
        # (avoid matching "国内" with "国内行车娱乐限制")
        len_ratio = min(len(name_a), len(name_b)) / max(len(name_a), len(name_b))
        if len_ratio >= 0.5:
            return 0.85 + 0.05 * len_ratio  # range 0.875-0.90
        return 0.55  # too different in length -> below threshold

    return SequenceMatcher(None, name_a, name_b).ratio()
+
+
+# =============================================================================
+# Concept Clustering & Merging
+# =============================================================================
+
def cluster_concepts(
    all_concepts_lists: list[list[dict]],
    similarity_threshold: float = 0.7,
) -> list[list[tuple[int, dict]]]:
    """Group concepts across ensemble versions by name similarity.

    Greedy single pass: versions are visited in list order and, within a
    version, concepts in list order. Each concept is scored against the seed
    (first member) of every existing cluster. It joins the highest-scoring
    cluster when that score reaches the threshold, otherwise it starts a new
    cluster. Concepts with a missing or empty "name" are skipped.

    Args:
        all_concepts_lists: List of concept lists, one per ensemble version.
                            all_concepts_lists[i] = concepts from version i.
        similarity_threshold: Minimum name similarity to join a cluster.

    Returns:
        List of clusters. Each cluster is list of (version_idx, concept_dict).
    """
    grouped: list[list[tuple[int, dict]]] = []

    for version_no, version_concepts in enumerate(all_concepts_lists):
        for concept in version_concepts:
            concept_name = concept.get("name", "")
            if not concept_name:
                continue

            target = None
            top_score = 0.0
            for members in grouped:
                # Only the seed is consulted, so a cluster's identity never
                # drifts as later members are appended.
                seed = members[0][1].get("name", "")
                score = concept_name_similarity(concept_name, seed)
                # Strict '>' keeps the earliest cluster on equal scores.
                if score > top_score:
                    top_score, target = score, members

            if target is None or top_score < similarity_threshold:
                grouped.append([(version_no, concept)])
            else:
                target.append((version_no, concept))

    return grouped
+
+
def merge_concept_cluster(
    cluster: list[tuple[int, dict]],
    total_versions: int,
) -> tuple[dict, str]:
    """Merge a single cluster of matched concepts into one concept dict.

    Rules:
      - name: Longest name (most specific). Tie-break by lower version_idx,
        then by position in the cluster.
      - aliases: Union of all aliases across versions (sorted in the output).
      - defined_in: Union of all defined_in across versions (sorted in the output).
      - parent: Most common non-null parent (voting). Tie-break by the lowest
        version_idx that proposed the parent, then by first appearance.

    Missing or null "name", "aliases" and "defined_in" fields are treated as
    empty rather than raising.

    Args:
        cluster: List of (version_idx, concept_dict) members, in any order.
        total_versions: Total number of ensemble versions.

    Returns:
        (merged_concept_dict, confidence_level) where confidence is
        "high"/"medium"/"low". An empty cluster yields ({}, "low").
    """
    if not cluster:
        return {}, "low"

    # --- name: longest, then lowest version_idx ---
    # min() returns the first minimal element, so members that tie on both
    # length and version fall back to cluster order.
    named = [(c.get("name") or "", v_idx) for v_idx, c in cluster]
    best_name = min(named, key=lambda item: (-len(item[0]), item[1]))[0]

    # --- aliases / defined_in: union ---
    aliases = {a for _, c in cluster for a in (c.get("aliases") or [])}
    defined_in = {d for _, c in cluster for d in (c.get("defined_in") or [])}

    # --- parent: most common non-null parent (vote) ---
    parent_votes = defaultdict(int)
    parent_first_version = {}  # parent -> lowest version_idx that proposed it
    for v_idx, c in cluster:
        p = c.get("parent")
        if p is None:
            continue
        parent_votes[p] += 1
        parent_first_version[p] = min(v_idx, parent_first_version.get(p, v_idx))

    if parent_votes:
        # More votes wins; on equal votes the negated version index makes the
        # lower version_idx compare greater.
        best_parent = max(
            parent_votes,
            key=lambda p: (parent_votes[p], -parent_first_version[p]),
        )
    else:
        best_parent = None

    # --- confidence ---
    versions_present = len({v_idx for v_idx, _ in cluster})
    confidence = compute_confidence_versions(
        versions_present,
        total_versions,
        any(v_idx == 0 for v_idx, _ in cluster),
    )

    merged = {
        "name": best_name,
        "aliases": sorted(aliases),
        "defined_in": sorted(defined_in),
        "parent": best_parent,
        "confidence": confidence,
    }
    return merged, confidence
+
+
+# =============================================================================
+# Unit Similarity Functions
+# =============================================================================
+
def _collect_logic_tree_nodes(unit: dict) -> set[str]:
    """Return every logic tree node ID referenced by a function_unit.

    Only entries of unit["sources"] whose "type" equals "logic_tree"
    contribute; their "logic_tree_nodes" lists are flattened into one set.
    A unit without sources yields an empty set.
    """
    return {
        node_id
        for source in unit.get("sources", [])
        if source.get("type") == "logic_tree"
        for node_id in source.get("logic_tree_nodes", [])
    }
+
+
def unit_node_jaccard(unit_a: dict, unit_b: dict) -> float:
    """Compute Jaccard similarity on logic tree node sets between two units.

    Jaccard(A, B) = |A ∩ B| / |A ∪ B|. Returns 0.0 whenever either unit
    (or both) has no logic tree nodes.
    """
    left = _collect_logic_tree_nodes(unit_a)
    right = _collect_logic_tree_nodes(unit_b)

    # Both sets must be non-empty; this also guarantees a non-zero union.
    if not (left and right):
        return 0.0

    return len(left & right) / len(left | right)
+
+
def path_similarity(path_a: list[str], path_b: list[str]) -> float:
    """Compute similarity between two path arrays.

    Hybrid approach:
      - Sequential similarity (order-aware): SequenceMatcher ratio on the
        paths joined with "|".
      - Set similarity (order-independent): Jaccard on path element sets.
      - Final score: 0.5 * seq_sim + 0.5 * set_sim

    Two empty paths score 1.0; an empty path against a non-empty one scores 0.0.

    Returns:
        float in [0.0, 1.0].
    """
    if not path_a or not path_b:
        # At least one side is empty: identical only when both are.
        return 0.0 if (path_a or path_b) else 1.0

    order_score = SequenceMatcher(
        None, "|".join(path_a), "|".join(path_b)
    ).ratio()

    elems_a, elems_b = set(path_a), set(path_b)
    overlap_score = len(elems_a & elems_b) / len(elems_a | elems_b)

    return 0.5 * order_score + 0.5 * overlap_score
+
+
def unit_similarity(unit_a: dict, unit_b: dict) -> float:
    """Combined similarity between two function_units.

    Weighted combination:
      - 0.6 * unit_node_jaccard  (primary signal: shared logic tree nodes)
      - 0.4 * path_similarity    (secondary signal: agreement of "path" lists)

    A unit without a "path" key is treated as having an empty path.

    Returns:
        float in [0.0, 1.0]. >= 0.5 means "likely the same function_unit".
    """
    node_score = unit_node_jaccard(unit_a, unit_b)
    path_score = path_similarity(unit_a.get("path", []), unit_b.get("path", []))
    return 0.6 * node_score + 0.4 * path_score
+
+
+# =============================================================================
+# Function Unit Clustering & Merging
+# =============================================================================
+
def cluster_function_units(
    all_units_lists: list[list[dict]],
    similarity_threshold: float = 0.5,
) -> list[list[tuple[int, dict]]]:
    """Group function_units across ensemble versions by content similarity.

    Versions are processed in list order, so units from earlier versions become
    cluster seeds and later ones join an existing cluster when similar enough.
    A unit's score against a cluster is its best unit_similarity against any
    current member of that cluster.

    Args:
        all_units_lists: List of unit lists, one per ensemble version.
        similarity_threshold: Minimum unit_similarity to join a cluster.

    Returns:
        List of clusters. Each cluster is list of (version_idx, unit_dict).
    """
    grouped: list[list[tuple[int, dict]]] = []

    for version_no, version_units in enumerate(all_units_lists):
        for candidate in version_units:
            target = None
            top_score = 0.0

            for members in grouped:
                score = max(
                    unit_similarity(candidate, member) for _, member in members
                )
                # Strict '>' keeps the earliest cluster on equal scores.
                if score > top_score:
                    top_score, target = score, members

            if target is None or top_score < similarity_threshold:
                grouped.append([(version_no, candidate)])
            else:
                target.append((version_no, candidate))

    return grouped
+
+
def pick_best_representative(
    cluster: list[tuple[int, dict]],
) -> dict:
    """Select the best function_unit from a cluster as the merged representative.

    Scoring formula (each term normalized to [0, 1] within the cluster):
      - 0.35: Node count (more logic_tree_nodes = more complete trace)
      - 0.25: Source count (more sources = more evidence)
      - 0.20: Description length (longer = more detail, capped at 500 chars;
              the cap applies to both the value and its normalizer, so any
              description of 500+ chars earns the full weight)
      - 0.20: Temperature rank (lower version_idx scores higher)

    On equal scores the member that appears first in the cluster wins.

    Args:
        cluster: List of (version_idx, unit_dict) members.

    Returns:
        A deep copy of the winning unit dict, so callers can mutate the result
        without touching the ensemble inputs. {} for an empty cluster.
    """
    # Function-scope import: keeps this block self-contained without touching
    # the module's import list.
    import copy

    if not cluster:
        return {}

    desc_cap = 500

    node_counts = [len(_collect_logic_tree_nodes(unit)) for _, unit in cluster]
    source_counts = [len(unit.get("sources", [])) for _, unit in cluster]
    # `or ""` tolerates an explicit null description.
    desc_lengths = [
        min(len(unit.get("description") or ""), desc_cap) for _, unit in cluster
    ]

    # max(1, ...) guards the divisions when every member has a zero count.
    max_nodes = max(1, max(node_counts))
    max_sources = max(1, max(source_counts))
    max_desc_len = max(1, max(desc_lengths))
    # Denominator for the rank term: the larger of the cluster size and the
    # highest version index + 1, which keeps temp_rank strictly positive.
    rank_span = max(len(cluster), max(v_idx for v_idx, _ in cluster) + 1)

    def score(position: int) -> float:
        v_idx = cluster[position][0]
        temp_rank = 1.0 - (v_idx / rank_span)
        return (
            0.35 * (node_counts[position] / max_nodes)
            + 0.25 * (source_counts[position] / max_sources)
            + 0.20 * (desc_lengths[position] / max_desc_len)
            + 0.20 * temp_rank
        )

    best_position = max(range(len(cluster)), key=score)
    return copy.deepcopy(cluster[best_position][1])
+
+
def merge_unit_sources(
    cluster: list[tuple[int, dict]],
) -> list[dict]:
    """Union all sources from units in a cluster, deduplicating by source key.

    Dedup key:
      - "logic_tree" sources: ("logic_tree", image_id)
      - every other source:   (type, section, row)

    When the same key appears more than once, the source with the most
    logic_tree_nodes is kept; on a tie the first occurrence in cluster order
    wins. Missing or null "sources" / "logic_tree_nodes" count as empty.

    Args:
        cluster: List of (version_idx, unit_dict) members.

    Returns:
        One shallow copy per distinct key, in order of first appearance.
    """
    source_groups = defaultdict(list)

    for _, unit in cluster:
        for src in unit.get("sources") or []:
            src_type = src.get("type", "")
            if src_type == "logic_tree":
                key = ("logic_tree", src.get("image_id", ""))
            else:
                key = (src_type, src.get("section", ""), src.get("row", ""))
            source_groups[key].append(src)

    # max() returns the first maximal element, which gives the documented
    # "first occurrence wins" tie-break.
    return [
        dict(max(candidates, key=lambda s: len(s.get("logic_tree_nodes") or [])))
        for candidates in source_groups.values()
    ]
+
+
def compute_confidence_versions(
    versions_present: int,
    total_versions: int,
    includes_lowest_temp: bool = False,
) -> str:
    """Compute 3-level confidence based on cross-version agreement.

    - "high": Appears in all versions, OR in at least half of them including
      the lowest-temperature version.
    - "medium": Appears in at least half the versions, but not all and not in
      the lowest-temperature version.
    - "low": Appears in fewer than half, or there are no versions at all.

    Args:
        versions_present: Number of versions this item appeared in.
        total_versions: Total number of ensemble versions.
        includes_lowest_temp: Whether the item appeared in the
            lowest-temperature version.

    Returns:
        "high", "medium" or "low".
    """
    # No versions means no agreement to measure; also avoids dividing by zero.
    if total_versions <= 0:
        return "low"

    ratio = versions_present / total_versions

    if ratio >= 1.0:
        return "high"
    if ratio >= 0.5:
        return "high" if includes_lowest_temp else "medium"
    return "low"
+
+
def ensemble_merge_concepts(
    all_concepts_lists: list[list[dict]],
) -> list[dict]:
    """Merge concepts across all ensemble versions.

    Clusters the concepts, merges each cluster into one concept and tags it
    with "ensemble_support" ("<versions in cluster>/<total versions>"). When
    two clusters merge to the same name only the first is kept. The result is
    ordered by confidence (high first) then name, and parent references are
    validated before returning.

    Returns:
        List of merged concept dicts, each with added "confidence" field.
    """
    version_count = len(all_concepts_lists)
    rank = {"high": 0, "medium": 1, "low": 2}

    result = []
    used_names = set()
    for members in cluster_concepts(all_concepts_lists):
        concept, _ = merge_concept_cluster(members, version_count)
        concept_name = concept.get("name", "")
        if not concept_name or concept_name in used_names:
            continue
        supporting = {version for version, _ in members}
        concept["ensemble_support"] = f"{len(supporting)}/{version_count}"
        result.append(concept)
        used_names.add(concept_name)

    # High confidence first, then alphabetical by name.
    result.sort(
        key=lambda item: (
            rank.get(item.get("confidence", "low"), 3),
            item.get("name", ""),
        )
    )

    return _validate_concept_parents(result)
+
+
def _validate_concept_parents(concepts: list[dict]) -> list[dict]:
    """Post-merge: validate that every concept's parent exists in the list.

    Strategy for dangling parents (parent is set but names no listed concept):
      1. Fuzzy match (concept_name_similarity >= 0.7) against every *other*
         concept name -> re-point the parent, and downgrade confidence one
         level when the match is not exact.
      2. No match -> set parent to None and downgrade confidence one level.

    Candidates are scanned in sorted name order so equal similarity scores
    resolve the same way on every run.

    Args:
        concepts: Merged concept dicts; each must have a "name" key. The dicts
            and the list are modified in place.

    Returns:
        The same list, re-sorted by confidence (high first) then name.
    """
    conf_order = {"high": 0, "medium": 1, "low": 2}
    known_names = {c["name"] for c in concepts}
    # Iterating the set directly would make tie-breaking depend on string hash
    # randomization; a sorted list keeps the merge deterministic.
    candidate_names = sorted(known_names)

    for c in concepts:
        parent = c.get("parent")
        if parent is None or parent in known_names:
            continue

        # Dangling parent — try fuzzy match
        best_match = None
        best_sim = 0.0
        for name in candidate_names:
            if name == c["name"]:
                # Never resolve a dangling parent to the concept itself; that
                # would create a self-referencing hierarchy.
                continue
            sim = concept_name_similarity(parent, name)
            if sim > best_sim:
                best_sim = sim
                best_match = name

        if best_match is not None and best_sim >= 0.7:
            c["parent"] = best_match
            # Downgrade if match was fuzzy (not exact)
            if best_sim < 1.0:
                c["confidence"] = _downgrade_confidence(c.get("confidence", "low"))
        else:
            c["parent"] = None
            c["confidence"] = _downgrade_confidence(c.get("confidence", "low"))

    # Re-sort after confidence changes
    concepts.sort(
        key=lambda c: (conf_order.get(c.get("confidence", "low"), 3), c.get("name", ""))
    )
    return concepts
+
+
def _downgrade_confidence(current: str) -> str:
    """Return the confidence one level below *current*.

    "high" becomes "medium"; every other value (including "medium" and
    "low") becomes "low".
    """
    return "medium" if current == "high" else "low"
+
+
def ensemble_merge_function_units(
    all_units_lists: list[list[dict]],
) -> list[dict]:
    """Merge function_units across all ensemble versions.

    1. Cluster units across versions.
    2. For each cluster: pick the best representative, replace its "sources"
       with the merged sources of the whole cluster, compute confidence.
    3. Sort by confidence (high first) then original unit_id, and renumber as
       FU-ENS-001, FU-ENS-002, ... The previous id is kept under
       "original_unit_id" unless that key is already present.

    Returns:
        List of merged function_unit dicts with added "confidence",
        "ensemble_support" ("<present>/<total>") and "source_versions"
        (count of distinct versions in the cluster) fields.
    """
    version_count = len(all_units_lists)
    rank = {"high": 0, "medium": 1, "low": 2}

    result = []
    for members in cluster_function_units(all_units_lists):
        representative = pick_best_representative(members)
        representative["sources"] = merge_unit_sources(members)

        supporting = {version for version, _ in members}
        support_count = len(supporting)
        # Version index 0 is the lowest-temperature run.
        level = compute_confidence_versions(
            support_count, version_count, 0 in supporting
        )

        representative["confidence"] = level
        representative["ensemble_support"] = f"{support_count}/{version_count}"
        representative["source_versions"] = support_count
        result.append(representative)

    result.sort(
        key=lambda item: (
            rank.get(item.get("confidence", "low"), 3),
            item.get("unit_id", ""),
        )
    )

    for sequence, item in enumerate(result, start=1):
        # Keep the pre-merge id for traceability before overwriting it.
        item.setdefault("original_unit_id", item.get("unit_id", ""))
        item["unit_id"] = f"FU-ENS-{sequence:03d}"

    return result
+
+
+# =============================================================================
+# Top-Level Ensemble Merge
+# =============================================================================
+
def ensemble_merge(
    semantic_indices: list[dict],
) -> dict:
    """Merge N semantic index outputs into one ensemble result.

    Args:
        semantic_indices: List of semantic_index dicts from each temperature run.
                          semantic_indices[0] should be the lowest-temperature version.

    Returns:
        Merged semantic_index dict with structure:
        {
            "feature_name": str,        # most frequent non-empty name
            "ensemble_versions": int,   # number of input indices
            "concepts": [...],
            "function_units": [...],
            "confidence_summary": {...},  # per-level counts; {} for empty input
        }
    """
    if not semantic_indices:
        return {
            "feature_name": "",
            "ensemble_versions": 0,
            "concepts": [],
            "function_units": [],
            "confidence_summary": {},
        }

    def tally_confidence(items: list[dict]) -> dict:
        # Count items per confidence level; a missing level counts as "low".
        counts = defaultdict(int)
        for item in items:
            counts[item.get("confidence", "low")] += 1
        return counts

    # Pull the per-version lists out first, then merge each kind.
    concepts_per_version = [index.get("concepts", []) for index in semantic_indices]
    units_per_version = [index.get("function_units", []) for index in semantic_indices]

    merged_concepts = ensemble_merge_concepts(concepts_per_version)
    merged_units = ensemble_merge_function_units(units_per_version)

    # Feature name: majority vote over non-empty names; max() keeps the name
    # that was seen first when counts are equal.
    votes = defaultdict(int)
    for index in semantic_indices:
        candidate = index.get("feature_name", "")
        if candidate:
            votes[candidate] += 1
    feature_name = max(votes, key=votes.get) if votes else ""

    unit_levels = tally_confidence(merged_units)
    concept_levels = tally_confidence(merged_concepts)

    summary = {
        "total_units": len(merged_units),
        "high": unit_levels.get("high", 0),
        "medium": unit_levels.get("medium", 0),
        "low": unit_levels.get("low", 0),
        "total_concepts": len(merged_concepts),
        "concept_high": concept_levels.get("high", 0),
        "concept_medium": concept_levels.get("medium", 0),
        "concept_low": concept_levels.get("low", 0),
    }

    return {
        "feature_name": feature_name,
        "ensemble_versions": len(semantic_indices),
        "concepts": merged_concepts,
        "function_units": merged_units,
        "confidence_summary": summary,
    }