@@ -168,20 +168,32 @@ def crop(self, text: str, ZM: int = 1, need_position: bool = False):
168168
169169 return (pic , positions ) if need_position else pic
170170
171- def _iter_doc_items (self , doc , has_bbox : bool = True ) -> Iterable [tuple [str , Any , Optional [_BBox ]]]:
171+ def _iter_doc_items (self , doc , has_bbox : bool = True ) -> Iterable [tuple [str , Any , Optional [_BBox ], str ]]:
172172 """
173173 Iterate over document items (texts, equations).
174174
175175 Args:
176176 doc: Docling document object
177177 has_bbox: Whether the document format supports bbox (PDF=True, DOCX/PPTX=False)
178+
179+ Yields:
180+ Tuple of (content_type, text, bbox, label) where:
181+ - content_type: DoclingContentType value
182+ - text: Text content
183+ - bbox: Bounding box (None for DOCX/PPTX)
184+ - label: Docling label (e.g., "section_header", "text", "list_item", "FORMULA")
178185 """
179186 for t in getattr (doc , "texts" , []):
180187 parent = getattr (t , "parent" , "" )
181188 ref = getattr (parent , "cref" , "" ) if parent else ""
182189 label = getattr (t , "label" , "" )
183- if (label in ("section_header" , "text" ,) and ref in ("#/body" ,)) or label in ("list_item" ,):
190+ # Accept section_header, text, and list_item labels regardless of the parent ref.
191+ # For DOCX/PPTX, ref may not be exactly "#/body" (could be "#/groups/0", "#/texts/0", etc.),
192+ # so the parent ref is not part of the check below; only the label is tested.
193+ if label in ("section_header" , "text" , "list_item" ):
184194 text = getattr (t , "text" , "" ) or ""
195+ if not text .strip ():
196+ continue
185197 bbox = None
186198 if has_bbox and getattr (t , "prov" , None ):
187199 pn = getattr (t .prov [0 ], "page_no" , None )
@@ -190,10 +202,11 @@ def _iter_doc_items(self, doc, has_bbox: bool = True) -> Iterable[tuple[str, Any
190202 bb = [getattr (bb , "l" , None ), getattr (bb , "t" , None ), getattr (bb , "r" , None ), getattr (bb , "b" , None )]
191203 if pn and bb and len (bb ) == 4 and all (b is not None for b in bb ):
192204 bbox = _BBox (page_no = int (pn ), x0 = bb [0 ], y0 = bb [1 ], x1 = bb [2 ], y1 = bb [3 ])
193- yield (DoclingContentType .TEXT .value , text , bbox )
205+ yield (DoclingContentType .TEXT .value , text , bbox , label )
194206
195207 for item in getattr (doc , "texts" , []):
196- if getattr (item , "label" , "" ) in ("FORMULA" ,):
208+ item_label = getattr (item , "label" , "" )
209+ if item_label in ("FORMULA" ,):
197210 text = getattr (item , "text" , "" ) or ""
198211 bbox = None
199212 if has_bbox and getattr (item , "prov" , None ):
@@ -203,7 +216,25 @@ def _iter_doc_items(self, doc, has_bbox: bool = True) -> Iterable[tuple[str, Any
203216 bb = [getattr (bb , "l" , None ), getattr (bb , "t" , None ), getattr (bb , "r" , None ), getattr (bb , "b" , None )]
204217 if pn and bb and len (bb ) == 4 and all (b is not None for b in bb ):
205218 bbox = _BBox (int (pn ), bb [0 ], bb [1 ], bb [2 ], bb [3 ])
206- yield (DoclingContentType .EQUATION .value , text , bbox )
219+ yield (DoclingContentType .EQUATION .value , text , bbox , item_label )
220+
221+ def _label_to_style (self , label : str ) -> str :
222+ """
223+ Map Docling label to Word style name.
224+
225+ Args:
226+ label: Docling label (e.g., "section_header", "text", "list_item")
227+
228+ Returns:
229+ Word-style name (e.g., "Heading", "Normal", "List Item")
230+ """
231+ label_to_style_map = {
232+ "section_header" : "Heading" ,
233+ "text" : "Normal" ,
234+ "list_item" : "List Item" ,
235+ "FORMULA" : "Equation" ,
236+ }
237+ return label_to_style_map .get (label , "Normal" )
207238
208239 def _transfer_to_sections (self , doc , parse_method : str , has_bbox : bool = True ) -> list [tuple [str , str ]]:
209240 """
@@ -213,9 +244,14 @@ def _transfer_to_sections(self, doc, parse_method: str, has_bbox: bool = True) -
213244 doc: Docling document object
214245 parse_method: Parsing method ("raw", "manual", "paper")
215246 has_bbox: Whether the document format supports bbox
247+
248+ Returns:
249+ List of (text, tag_or_style) tuples where:
250+ - For PDF (has_bbox=True): tag is position tag (e.g., "@@1\t 0.0\t 100.0\t 0.0\t 50.0##")
251+ - For DOCX/PPTX (has_bbox=False): tag is style name (e.g., "Heading", "Normal")
216252 """
217253 sections : list [tuple [str , str ]] = []
218- for typ , payload , bbox in self ._iter_doc_items (doc , has_bbox = has_bbox ):
254+ for typ , payload , bbox , label in self ._iter_doc_items (doc , has_bbox = has_bbox ):
219255 if typ == DoclingContentType .TEXT .value :
220256 section = payload .strip ()
221257 if not section :
@@ -225,7 +261,14 @@ def _transfer_to_sections(self, doc, parse_method: str, has_bbox: bool = True) -
225261 else :
226262 continue
227263
228- tag = self ._make_line_tag (bbox ) if isinstance (bbox , _BBox ) else ""
264+ # For PDF (has_bbox=True): use position tag
265+ # For DOCX/PPTX (has_bbox=False): use label as style
266+ if isinstance (bbox , _BBox ):
267+ tag = self ._make_line_tag (bbox )
268+ else :
269+ # No bbox, use label as style for DOCX/PPTX
270+ tag = self ._label_to_style (label )
271+
229272 if parse_method == "manual" :
230273 sections .append ((section , typ , tag ))
231274 elif parse_method == "paper" :
0 commit comments