feat: 完善 DOCX 解析支持和修复 PDF 异步事件循环问题

sqhyz55 · sqhyz55 · commit cd4d0e13a89a · 2026-01-14T20:43:42.000+08:00
- 修改 DoclingParser 支持 DOCX style metadata
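  - _iter_doc_items 返回 label 信息（yield 的元组由 3 元扩展为 4 元：content_type, text, bbox, label）
  - _iter_doc_items 不再要求 section_header/text 的父节点为 "#/body"，并跳过空白文本（该过滤条件对所有格式生效，包括 PDF）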
  - 新增 _label_to_style 方法映射 Docling label 到 Word 样式名
  - _transfer_to_sections 在没有 bbox 时（DOCX/PPTX）使用 _label_to_style(label) 映射得到的样式名作为 tag（而非空字符串）

- 修复 PDF 解析器异步事件循环冲突
  - 检测是否有运行中的事件循环
  - 如果有，在新线程中创建新的事件循环运行 OCR，调用线程通过 join() 阻塞等待其完成
  - 如果没有，直接使用 asyncio.run()

这些修改使得：
1. DOCX 解析结果包含完整的 style 信息，前端可以正确渲染文档结构
2. PDF 解析可以在已有事件循环的环境中正常工作（如 pytest async 测试）
diff --git a/deepdoc/parser/docling_parser.py b/deepdoc/parser/docling_parser.py
@@ -168,20 +168,32 @@ def crop(self, text: str, ZM: int = 1, need_position: bool = False):
 
         return (pic, positions) if need_position else pic
 
-    def _iter_doc_items(self, doc, has_bbox: bool = True) -> Iterable[tuple[str, Any, Optional[_BBox]]]:
+    def _iter_doc_items(self, doc, has_bbox: bool = True) -> Iterable[tuple[str, Any, Optional[_BBox], str]]:
         """
         Iterate over document items (texts, equations).
         
         Args:
             doc: Docling document object
             has_bbox: Whether the document format supports bbox (PDF=True, DOCX/PPTX=False)
+            
+        Yields:
+            Tuple of (content_type, text, bbox, label) where:
+            - content_type: DoclingContentType value
+            - text: Text content
+            - bbox: Bounding box (None for DOCX/PPTX)
+            - label: Docling label (e.g., "section_header", "text", "list_item", "FORMULA")
         """
         for t in getattr(doc, "texts", []):
             parent = getattr(t, "parent", "")
             ref = getattr(parent, "cref", "") if parent else ""
             label = getattr(t, "label", "")
-            if (label in ("section_header", "text",) and ref in ("#/body",)) or label in ("list_item",):
+            # Accept section_header, text, and list_item labels
+            # For DOCX/PPTX, ref may not be exactly "#/body" (could be "#/groups/0", "#/texts/0", etc.)
+            # So we accept any ref for these labels; the parent ref is no longer checked here
+            if label in ("section_header", "text", "list_item"):
                 text = getattr(t, "text", "") or ""
+                if not text.strip():
+                    continue
                 bbox = None
                 if has_bbox and getattr(t, "prov", None):
                     pn = getattr(t.prov[0], "page_no", None)
@@ -190,10 +202,11 @@ def _iter_doc_items(self, doc, has_bbox: bool = True) -> Iterable[tuple[str, Any
                         bb = [getattr(bb, "l", None), getattr(bb, "t", None), getattr(bb, "r", None), getattr(bb, "b", None)]
                         if pn and bb and len(bb) == 4 and all(b is not None for b in bb):
                             bbox = _BBox(page_no=int(pn), x0=bb[0], y0=bb[1], x1=bb[2], y1=bb[3])
-                yield (DoclingContentType.TEXT.value, text, bbox)
+                yield (DoclingContentType.TEXT.value, text, bbox, label)
 
         for item in getattr(doc, "texts", []):
-            if getattr(item, "label", "") in ("FORMULA",):
+            item_label = getattr(item, "label", "")
+            if item_label in ("FORMULA",):
                 text = getattr(item, "text", "") or ""
                 bbox = None
                 if has_bbox and getattr(item, "prov", None):
@@ -203,7 +216,25 @@ def _iter_doc_items(self, doc, has_bbox: bool = True) -> Iterable[tuple[str, Any
                         bb = [getattr(bb, "l", None), getattr(bb, "t", None), getattr(bb, "r", None), getattr(bb, "b", None)]
                         if pn and bb and len(bb) == 4 and all(b is not None for b in bb):
                             bbox = _BBox(int(pn), bb[0], bb[1], bb[2], bb[3])
-                yield (DoclingContentType.EQUATION.value, text, bbox)
+                yield (DoclingContentType.EQUATION.value, text, bbox, item_label)
+
+    def _label_to_style(self, label: str) -> str:
+        """
+        Map Docling label to Word style name.
+        
+        Args:
+            label: Docling label (e.g., "section_header", "text", "list_item")
+            
+        Returns:
+            Word-style name (e.g., "Heading", "Normal", "List Item")
+        """
+        label_to_style_map = {
+            "section_header": "Heading",
+            "text": "Normal",
+            "list_item": "List Item",
+            "FORMULA": "Equation",
+        }
+        return label_to_style_map.get(label, "Normal")
 
     def _transfer_to_sections(self, doc, parse_method: str, has_bbox: bool = True) -> list[tuple[str, str]]:
         """
@@ -213,9 +244,14 @@ def _transfer_to_sections(self, doc, parse_method: str, has_bbox: bool = True) -
             doc: Docling document object
             parse_method: Parsing method ("raw", "manual", "paper")
             has_bbox: Whether the document format supports bbox
+            
+        Returns:
+            List of (text, tag_or_style) tuples where:
+            - For PDF (has_bbox=True): tag is position tag (e.g., "@@1\t0.0\t100.0\t0.0\t50.0##")
+            - For DOCX/PPTX (has_bbox=False): tag is style name (e.g., "Heading", "Normal")
         """
         sections: list[tuple[str, str]] = []
-        for typ, payload, bbox in self._iter_doc_items(doc, has_bbox=has_bbox):
+        for typ, payload, bbox, label in self._iter_doc_items(doc, has_bbox=has_bbox):
             if typ == DoclingContentType.TEXT.value:
                 section = payload.strip()
                 if not section:
@@ -225,7 +261,14 @@ def _transfer_to_sections(self, doc, parse_method: str, has_bbox: bool = True) -
             else:
                 continue
             
-            tag = self._make_line_tag(bbox) if isinstance(bbox, _BBox) else ""
+            # For PDF (has_bbox=True): use position tag
+            # For DOCX/PPTX (has_bbox=False): use label as style
+            if isinstance(bbox, _BBox):
+                tag = self._make_line_tag(bbox)
+            else:
+                # No bbox, use label as style for DOCX/PPTX
+                tag = self._label_to_style(label)
+            
             if parse_method == "manual":
                 sections.append((section, typ, tag))
             elif parse_method == "paper":
diff --git a/deepdoc/parser/pdf_parser.py b/deepdoc/parser/pdf_parser.py
@@ -18,6 +18,7 @@
 import logging
 import math
 import os
+import queue
 import random
 import re
 import sys
@@ -1185,7 +1186,35 @@ async def wrapper(i=i, img=img, chars=chars, semaphore=semaphore):
 
         start = timer()
 
-        asyncio.run(__img_ocr_launcher())
+        # Handle asyncio.run() in case there's already a running event loop
+        try:
+            # Check if there's a running event loop
+            asyncio.get_running_loop()
+            # If we get here, there's a running loop, so we need to run in a new thread
+            result_queue: queue.Queue = queue.Queue()
+
+            def runner():
+                try:
+                    # Create a new event loop in this thread
+                    new_loop = asyncio.new_event_loop()
+                    asyncio.set_event_loop(new_loop)
+                    try:
+                        result_queue.put((True, new_loop.run_until_complete(__img_ocr_launcher())))
+                    finally:
+                        new_loop.close()
+                except Exception as e:
+                    result_queue.put((False, e))
+
+            thread = threading.Thread(target=runner, daemon=True)
+            thread.start()
+            thread.join()
+
+            success, value = result_queue.get_nowait()
+            if not success:
+                raise value
+        except RuntimeError:
+            # No running event loop: use asyncio.run(). NOTE(review): this also catches a RuntimeError re-raised via `raise value` above
+            asyncio.run(__img_ocr_launcher())
 
         logging.info(f"__images__ {len(self.page_images)} pages cost {timer() - start}s")