Merge pull request 'fix: [test-dev] _extract_content_units 空章节误计为功能章节 - Closes #29' (#30) from test/issue-29 into main
CI / test (push) Successful in 14s

This commit was merged in pull request #30.
This commit is contained in:
2026-06-01 11:24:04 +08:00
+19 -1
View File
@@ -105,6 +105,24 @@ def _is_functional_section(section_name: str) -> bool:
return True return True
def _has_section_content(sec: dict) -> bool:
"""Check if a section has meaningful content (text, table, or image).
A section is considered "empty" (no real content) if all its text blocks
have fewer than 10 characters and it contains no tables or images.
"""
for block in sec.get("blocks", []):
blk_type = block.get("type", "")
if blk_type == "table":
return True
if blk_type in ("image", "figure", "picture"):
return True
text = block.get("text", "")
if isinstance(text, str) and len(text.strip()) >= 10:
return True
return False
def _extract_content_units(parsed_data: dict) -> dict: def _extract_content_units(parsed_data: dict) -> dict:
"""Extract countable content units from parsed JSON. """Extract countable content units from parsed JSON.
@@ -119,7 +137,7 @@ def _extract_content_units(parsed_data: dict) -> dict:
for sec in sections: for sec in sections:
name = sec.get("source", "") name = sec.get("source", "")
if _is_functional_section(name): if _is_functional_section(name) and _has_section_content(sec):
functional_sections.append({ functional_sections.append({
"name": name, "name": name,
"number": _section_number(name), "number": _section_number(name),