Skip to content

Commit cd4d0e1

Browse files
committed
feat: 完善 DOCX 解析支持和修复 PDF 异步事件循环问题
- 修改 DoclingParser 支持 DOCX style metadata
  - _iter_doc_items 返回 label 信息
  - 新增 _label_to_style 方法映射 Docling label 到 Word 样式名
  - _transfer_to_sections 在 DOCX 中使用 label 作为 style(而非空字符串)
- 修复 PDF 解析器异步事件循环冲突
  - 检测是否有运行中的事件循环
  - 如果有,在新线程中创建新的事件循环运行 OCR
  - 如果没有,直接使用 asyncio.run()

这些修改使得:
1. DOCX 解析结果包含完整的 style 信息,前端可以正确渲染文档结构
2. PDF 解析可以在已有事件循环的环境中正常工作(如 pytest async 测试)
1 parent 8a7c146 commit cd4d0e1

File tree

2 files changed

+80
-8
lines changed

2 files changed

+80
-8
lines changed

deepdoc/parser/docling_parser.py

Lines changed: 50 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -168,20 +168,32 @@ def crop(self, text: str, ZM: int = 1, need_position: bool = False):
168168

169169
return (pic, positions) if need_position else pic
170170

171-
def _iter_doc_items(self, doc, has_bbox: bool = True) -> Iterable[tuple[str, Any, Optional[_BBox]]]:
171+
def _iter_doc_items(self, doc, has_bbox: bool = True) -> Iterable[tuple[str, Any, Optional[_BBox], str]]:
172172
"""
173173
Iterate over document items (texts, equations).
174174
175175
Args:
176176
doc: Docling document object
177177
has_bbox: Whether the document format supports bbox (PDF=True, DOCX/PPTX=False)
178+
179+
Yields:
180+
Tuple of (content_type, text, bbox, label) where:
181+
- content_type: DoclingContentType value
182+
- text: Text content
183+
- bbox: Bounding box (None for DOCX/PPTX)
184+
- label: Docling label (e.g., "section_header", "text", "list_item", "FORMULA")
178185
"""
179186
for t in getattr(doc, "texts", []):
180187
parent = getattr(t, "parent", "")
181188
ref = getattr(parent, "cref", "") if parent else ""
182189
label = getattr(t, "label", "")
183-
if (label in ("section_header", "text",) and ref in ("#/body",)) or label in ("list_item",):
190+
# Accept section_header, text, and list_item labels
191+
# For DOCX/PPTX, ref may not be exactly "#/body" (could be "#/groups/0", "#/texts/0", etc.)
192+
# So we accept these labels regardless of the parent ref; "#/body" is no longer required.
193+
if label in ("section_header", "text", "list_item"):
184194
text = getattr(t, "text", "") or ""
195+
if not text.strip():
196+
continue
185197
bbox = None
186198
if has_bbox and getattr(t, "prov", None):
187199
pn = getattr(t.prov[0], "page_no", None)
@@ -190,10 +202,11 @@ def _iter_doc_items(self, doc, has_bbox: bool = True) -> Iterable[tuple[str, Any
190202
bb = [getattr(bb, "l", None), getattr(bb, "t", None), getattr(bb, "r", None), getattr(bb, "b", None)]
191203
if pn and bb and len(bb) == 4 and all(b is not None for b in bb):
192204
bbox = _BBox(page_no=int(pn), x0=bb[0], y0=bb[1], x1=bb[2], y1=bb[3])
193-
yield (DoclingContentType.TEXT.value, text, bbox)
205+
yield (DoclingContentType.TEXT.value, text, bbox, label)
194206

195207
for item in getattr(doc, "texts", []):
196-
if getattr(item, "label", "") in ("FORMULA",):
208+
item_label = getattr(item, "label", "")
209+
if item_label in ("FORMULA",):
197210
text = getattr(item, "text", "") or ""
198211
bbox = None
199212
if has_bbox and getattr(item, "prov", None):
@@ -203,7 +216,25 @@ def _iter_doc_items(self, doc, has_bbox: bool = True) -> Iterable[tuple[str, Any
203216
bb = [getattr(bb, "l", None), getattr(bb, "t", None), getattr(bb, "r", None), getattr(bb, "b", None)]
204217
if pn and bb and len(bb) == 4 and all(b is not None for b in bb):
205218
bbox = _BBox(int(pn), bb[0], bb[1], bb[2], bb[3])
206-
yield (DoclingContentType.EQUATION.value, text, bbox)
219+
yield (DoclingContentType.EQUATION.value, text, bbox, item_label)
220+
221+
def _label_to_style(self, label: str) -> str:
222+
"""
223+
Map Docling label to Word style name.
224+
225+
Args:
226+
label: Docling label (e.g., "section_header", "text", "list_item")
227+
228+
Returns:
229+
Word-style name (e.g., "Heading", "Normal", "List Item")
230+
"""
231+
label_to_style_map = {
232+
"section_header": "Heading",
233+
"text": "Normal",
234+
"list_item": "List Item",
235+
"FORMULA": "Equation",
236+
}
237+
return label_to_style_map.get(label, "Normal")
207238

208239
def _transfer_to_sections(self, doc, parse_method: str, has_bbox: bool = True) -> list[tuple[str, str]]:
209240
"""
@@ -213,9 +244,14 @@ def _transfer_to_sections(self, doc, parse_method: str, has_bbox: bool = True) -
213244
doc: Docling document object
214245
parse_method: Parsing method ("raw", "manual", "paper")
215246
has_bbox: Whether the document format supports bbox
247+
248+
Returns:
249+
List of (text, tag_or_style) tuples (with parse_method "manual" the tuples are (text, content_type, tag_or_style)) where:
250+
- For PDF (has_bbox=True): tag is position tag (e.g., "@@1\t0.0\t100.0\t0.0\t50.0##")
251+
- For DOCX/PPTX (has_bbox=False): tag is style name (e.g., "Heading", "Normal")
216252
"""
217253
sections: list[tuple[str, str]] = []
218-
for typ, payload, bbox in self._iter_doc_items(doc, has_bbox=has_bbox):
254+
for typ, payload, bbox, label in self._iter_doc_items(doc, has_bbox=has_bbox):
219255
if typ == DoclingContentType.TEXT.value:
220256
section = payload.strip()
221257
if not section:
@@ -225,7 +261,14 @@ def _transfer_to_sections(self, doc, parse_method: str, has_bbox: bool = True) -
225261
else:
226262
continue
227263

228-
tag = self._make_line_tag(bbox) if isinstance(bbox, _BBox) else ""
264+
# For PDF (has_bbox=True): use position tag
265+
# For DOCX/PPTX (has_bbox=False): use label as style
266+
if isinstance(bbox, _BBox):
267+
tag = self._make_line_tag(bbox)
268+
else:
269+
# No bbox, use label as style for DOCX/PPTX
270+
tag = self._label_to_style(label)
271+
229272
if parse_method == "manual":
230273
sections.append((section, typ, tag))
231274
elif parse_method == "paper":

deepdoc/parser/pdf_parser.py

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
import logging
1919
import math
2020
import os
21+
import queue
2122
import random
2223
import re
2324
import sys
@@ -1185,7 +1186,35 @@ async def wrapper(i=i, img=img, chars=chars, semaphore=semaphore):
11851186

11861187
start = timer()
11871188

1188-
asyncio.run(__img_ocr_launcher())
1189+
# Handle asyncio.run() in case there's already a running event loop
1190+
try:
1191+
# Check if there's a running event loop
1192+
asyncio.get_running_loop()
1193+
# If we get here, there's a running loop, so we need to run in a new thread
1194+
result_queue: queue.Queue = queue.Queue()
1195+
1196+
def runner():
1197+
try:
1198+
# Create a new event loop in this thread
1199+
new_loop = asyncio.new_event_loop()
1200+
asyncio.set_event_loop(new_loop)
1201+
try:
1202+
result_queue.put((True, new_loop.run_until_complete(__img_ocr_launcher())))
1203+
finally:
1204+
new_loop.close()
1205+
except Exception as e:
1206+
result_queue.put((False, e))
1207+
1208+
thread = threading.Thread(target=runner, daemon=True)
1209+
thread.start()
1210+
thread.join()
1211+
1212+
success, value = result_queue.get_nowait()
1213+
if not success:
1214+
raise value
1215+
except RuntimeError:
1216+
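# No running event loop, safe to use asyncio.run()
# NOTE(review): a RuntimeError re-raised from the worker thread above is also caught here,
# which would call asyncio.run() while a loop is running; consider moving the threaded path out of this try.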
1217+
asyncio.run(__img_ocr_launcher())
11891218

11901219
logging.info(f"__images__ {len(self.page_images)} pages cost {timer() - start}s")
11911220

0 commit comments

Comments
 (0)