Compare commits
4 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| ebda8e37d1 | |||
| d1e36b20ee | |||
| 01c93e52d3 | |||
| 7bcd414692 |
@@ -509,10 +509,28 @@ def _quick_validate(
|
|||||||
return True
|
return True
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
def _has_section_content(sec: dict) -> bool:
|
||||||
|
"""Check if a section has meaningful content (text >= 10 chars, table, or image).
|
||||||
|
|
||||||
|
A section is considered "empty" if all its text blocks have fewer than
|
||||||
|
10 characters and it contains no tables or images. These typically come
|
||||||
|
from image-only Word sections that doc_parser cannot extract text from.
|
||||||
|
"""
|
||||||
|
for block in sec.get("blocks", []):
|
||||||
|
blk_type = block.get("type", "")
|
||||||
|
if blk_type == "table":
|
||||||
|
return True
|
||||||
|
if blk_type in ("image", "figure", "picture"):
|
||||||
|
return True
|
||||||
|
text = block.get("text", "")
|
||||||
|
if isinstance(text, str) and len(text.strip()) >= 10:
|
||||||
|
return True
|
||||||
|
return False
|
||||||
|
|
||||||
func_sections = [
|
func_sections = [
|
||||||
s for s in doc.get("sections", [])
|
s for s in doc.get("sections", [])
|
||||||
if _is_functional_section(s.get("source", ""))
|
if _is_functional_section(s.get("source", ""))
|
||||||
and any(b.get("type") in ("para", "table") for b in s.get("blocks", []))
|
and _has_section_content(s)
|
||||||
]
|
]
|
||||||
covered_sections: set[str] = set()
|
covered_sections: set[str] = set()
|
||||||
for fu in units:
|
for fu in units:
|
||||||
|
|||||||
@@ -111,8 +111,8 @@ def load_path_enumeration() -> dict:
|
|||||||
def rule_signature(rule: dict) -> str:
|
def rule_signature(rule: dict) -> str:
|
||||||
"""Generate a dedup signature from path + trigger + actions."""
|
"""Generate a dedup signature from path + trigger + actions."""
|
||||||
path = rule.get("path", [])
|
path = rule.get("path", [])
|
||||||
trigger = rule.get("trigger", {})
|
trigger = rule.get("trigger") or {}
|
||||||
actions = rule.get("actions", [])
|
actions = rule.get("actions") or []
|
||||||
|
|
||||||
conditions = sorted(
|
conditions = sorted(
|
||||||
trigger.get("conditions", []), key=lambda c: c.get("signal", "")
|
trigger.get("conditions", []), key=lambda c: c.get("signal", "")
|
||||||
|
|||||||
@@ -105,6 +105,24 @@ def _is_functional_section(section_name: str) -> bool:
|
|||||||
return True
|
return True
|
||||||
|
|
||||||
|
|
||||||
|
def _has_section_content(sec: dict) -> bool:
|
||||||
|
"""Check if a section has meaningful content (text, table, or image).
|
||||||
|
|
||||||
|
A section is considered "empty" (no real content) if all its text blocks
|
||||||
|
have fewer than 10 characters and it contains no tables or images.
|
||||||
|
"""
|
||||||
|
for block in sec.get("blocks", []):
|
||||||
|
blk_type = block.get("type", "")
|
||||||
|
if blk_type == "table":
|
||||||
|
return True
|
||||||
|
if blk_type in ("image", "figure", "picture"):
|
||||||
|
return True
|
||||||
|
text = block.get("text", "")
|
||||||
|
if isinstance(text, str) and len(text.strip()) >= 10:
|
||||||
|
return True
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
def _extract_content_units(parsed_data: dict) -> dict:
|
def _extract_content_units(parsed_data: dict) -> dict:
|
||||||
"""Extract countable content units from parsed JSON.
|
"""Extract countable content units from parsed JSON.
|
||||||
|
|
||||||
@@ -119,7 +137,7 @@ def _extract_content_units(parsed_data: dict) -> dict:
|
|||||||
|
|
||||||
for sec in sections:
|
for sec in sections:
|
||||||
name = sec.get("source", "")
|
name = sec.get("source", "")
|
||||||
if _is_functional_section(name):
|
if _is_functional_section(name) and _has_section_content(sec):
|
||||||
functional_sections.append({
|
functional_sections.append({
|
||||||
"name": name,
|
"name": name,
|
||||||
"number": _section_number(name),
|
"number": _section_number(name),
|
||||||
|
|||||||
Reference in New Issue
Block a user