Skip to content

Commit b5dc1ee

Browse files
committed
fix: restore floating-item footnotes in markdown/html exports
Signed-off-by: Hassan Raza <raihassanraza10@gmail.com>
1 parent 1513f7d commit b5dc1ee

File tree

6 files changed

+290
-32
lines changed

6 files changed

+290
-32
lines changed

docling_core/transforms/chunker/hierarchical_chunker.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,13 @@ def serialize(
100100
table_text = ". ".join(table_text_parts)
101101
parts.append(create_ser_result(text=table_text, span_source=item))
102102

103+
ftn_res = doc_serializer.serialize_footnotes(
104+
item=item,
105+
**kwargs,
106+
)
107+
if ftn_res.text:
108+
parts.append(ftn_res)
109+
103110
text_res = "\n\n".join([r.text for r in parts])
104111

105112
return create_ser_result(text=text_res, span_source=parts)

docling_core/transforms/serializer/common.py

Lines changed: 25 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -316,6 +316,29 @@ def get_excluded_refs(self, **kwargs: Any) -> set[str]:
316316
self._excluded_refs_cache[params_json] = refs
317317
return refs
318318

319+
def _serialize_referenced_text_items(
    self,
    refs: Iterable[Any],
    **kwargs: Any,
) -> list[SerializationResult]:
    """Serialize referenced text items while bypassing the top-level skip path.

    Every reference is resolved against ``self.doc``. A resolved item is kept
    only if it is a ``TextItem`` and its ``self_ref`` is not in the set
    returned by ``self.get_excluded_refs(**kwargs)``. Kept items are passed
    directly to ``self.text_serializer.serialize`` with
    ``is_inline_scope=True``.

    Args:
        refs: References to resolve; each must provide ``resolve(doc)``.
        **kwargs: Serialization parameters, forwarded both to
            ``get_excluded_refs`` and to the text serializer.

    Returns:
        One serialization result per kept text item, in the order of ``refs``.
    """
    excluded_refs = self.get_excluded_refs(**kwargs)

    # ``is_inline_scope`` is always forced to True here. Merging it into the
    # forwarded kwargs (rather than passing it alongside ``**kwargs``) avoids
    # a "got multiple values for keyword argument" TypeError when the caller
    # already supplied that key.
    ser_kwargs: dict[str, Any] = {**kwargs, "is_inline_scope": True}

    results: list[SerializationResult] = []
    for ref in refs:
        it = ref.resolve(self.doc)
        if not isinstance(it, TextItem) or it.self_ref in excluded_refs:
            continue
        results.append(
            self.text_serializer.serialize(
                item=it,
                doc_serializer=self,
                doc=self.doc,
                **ser_kwargs,
            )
        )

    return results
341+
319342
@abstractmethod
320343
def serialize_doc(
321344
self,
@@ -621,18 +644,12 @@ def serialize_footnotes(
621644
) -> SerializationResult:
622645
"""Serialize the item's footnotes."""
623646
params = self.params.merge_with_patch(patch=kwargs)
624-
results: list[SerializationResult] = []
625647
if DocItemLabel.FOOTNOTE in params.labels:
626-
results = [
627-
create_ser_result(text=it.text, span_source=it)
628-
for ftn in item.footnotes
629-
if isinstance(it := ftn.resolve(self.doc), TextItem)
630-
and it.self_ref not in self.get_excluded_refs(**kwargs)
631-
]
648+
results = self._serialize_referenced_text_items(item.footnotes, **kwargs)
632649
# FIXME: using the caption_delimiter for now ...
633650
text_res = params.caption_delim.join([r.text for r in results])
634-
text_res = self.post_process(text=text_res)
635651
else:
652+
results = []
636653
text_res = ""
637654
return create_ser_result(text=text_res, span_source=results)
638655

docling_core/transforms/serializer/html.py

Lines changed: 57 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -363,10 +363,10 @@ def serialize(
363363
**kwargs: Any,
364364
) -> SerializationResult:
365365
"""Serializes the passed table item to HTML."""
366-
res_parts: list[SerializationResult] = []
366+
table_parts: list[SerializationResult] = []
367367
cap_res = doc_serializer.serialize_captions(item=item, tag="caption", **kwargs)
368368
if cap_res.text:
369-
res_parts.append(cap_res)
369+
table_parts.append(cap_res)
370370

371371
if item.self_ref not in doc_serializer.get_excluded_refs(**kwargs):
372372
body = ""
@@ -416,10 +416,18 @@ def serialize(
416416

417417
if body:
418418
body = f"<tbody>{body}</tbody>"
419-
res_parts.append(create_ser_result(text=body, span_source=span_source))
419+
table_parts.append(create_ser_result(text=body, span_source=span_source))
420+
421+
res_parts: list[SerializationResult] = []
422+
if table_parts:
423+
table_text = "".join([r.text for r in table_parts])
424+
res_parts.append(create_ser_result(text=f"<table>{table_text}</table>", span_source=table_parts))
425+
426+
ftn_res = doc_serializer.serialize_footnotes(item=item, **kwargs)
427+
if ftn_res.text:
428+
res_parts.append(ftn_res)
420429

421430
text_res = "".join([r.text for r in res_parts])
422-
text_res = f"<table>{text_res}</table>" if text_res else ""
423431

424432
return create_ser_result(text=text_res, span_source=res_parts)
425433

@@ -610,6 +618,10 @@ def get_img_row(imgb64: str, ind: int) -> str:
610618
details_html = f"<details><summary>Meta</summary>{meta_res.text}</details>"
611619
res_parts.append(create_ser_result(text=details_html, span_source=[meta_res]))
612620

621+
ftn_res = doc_serializer.serialize_footnotes(item=item, **kwargs)
622+
if ftn_res.text:
623+
res_parts.append(ftn_res)
624+
613625
text_res = "".join([r.text for r in res_parts])
614626
if text_res:
615627
text_res = f"<figure>{text_res}</figure>"
@@ -1210,6 +1222,47 @@ def serialize_captions(
12101222
text_res = f"<{tag}>{text_res}</{tag}>"
12111223
return create_ser_result(text=text_res, span_source=results)
12121224

1225+
@override
def serialize_footnotes(
    self,
    item: FloatingItem,
    **kwargs: Any,
) -> SerializationResult:
    """Render the footnotes of a floating item as one HTML block.

    Each footnote becomes a ``<div class="footnote">``; all of them are
    wrapped in a single styled ``<div class="footnotes" role="note">``.

    Args:
        item: Floating item whose ``footnotes`` references are serialized.
        **kwargs: Parameter overrides merged into ``self.params`` and
            forwarded to the footnote text serialization.

    Returns:
        An empty result when the footnote label is not enabled or nothing
        was serialized; otherwise the wrapped HTML with the per-footnote
        results as span source.
    """
    merged = self.params.merge_with_patch(patch=kwargs)
    footnotes = (
        self._serialize_referenced_text_items(item.footnotes, **kwargs)
        if DocItemLabel.FOOTNOTE in merged.labels
        else []
    )
    if not footnotes:
        return create_ser_result()

    entries: list[SerializationResult] = []
    for position, rendered in enumerate(footnotes):
        # Direction is derived from the first span's source text, if any.
        source_item = rendered.spans[0].item if rendered.spans else None
        dir_str = ""
        if isinstance(source_item, TextItem):
            text_dir = get_text_direction(source_item.text)
            if text_dir == "rtl":
                dir_str = f' dir="{text_dir}"'

        # Only footnotes after the first get spacing above them.
        margin_top = "0.35em" if position else "0"
        entry_html = f'<div class="footnote"{dir_str} style="margin-top: {margin_top};">{rendered.text}</div>'
        entries.append(create_ser_result(text=entry_html, span_source=[rendered]))

    inner_html = "".join(entry.text for entry in entries)
    wrapped_html = (
        '<div class="footnotes" role="note" '
        'style="margin-top: 0.65em; padding-top: 0.45em; border-top: 1px solid #ddd; '
        'color: #666; font-size: 0.95em; line-height: 1.5; text-align: left;">'
        f"{inner_html}</div>"
    )
    return create_ser_result(text=wrapped_html, span_source=entries)
1265+
12131266
def _generate_head(self) -> str:
12141267
"""Generate the HTML head section with metadata and styles."""
12151268
params = self.params

docling_core/transforms/serializer/markdown.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -561,6 +561,13 @@ def serialize(
561561
if table_text:
562562
res_parts.append(create_ser_result(text=table_text, span_source=item))
563563

564+
ftn_res = doc_serializer.serialize_footnotes(
565+
item=item,
566+
**kwargs,
567+
)
568+
if ftn_res.text:
569+
res_parts.append(ftn_res)
570+
564571
text_res = "\n\n".join([r.text for r in res_parts])
565572

566573
return create_ser_result(text=text_res, span_source=res_parts)
@@ -621,6 +628,14 @@ def serialize(
621628
md_table_content = temp_table.export_to_markdown(temp_doc)
622629
if len(md_table_content) > 0:
623630
res_parts.append(create_ser_result(text=md_table_content, span_source=item))
631+
632+
ftn_res = doc_serializer.serialize_footnotes(
633+
item=item,
634+
**kwargs,
635+
)
636+
if ftn_res.text:
637+
res_parts.append(ftn_res)
638+
624639
text_res = "\n\n".join([r.text for r in res_parts if r.text])
625640

626641
return create_ser_result(text=text_res, span_source=res_parts)
@@ -911,6 +926,21 @@ def post_process(
911926
)
912927
return res
913928

929+
@override
def serialize_footnotes(
    self,
    item: FloatingItem,
    **kwargs: Any,
) -> SerializationResult:
    """Serialize footnotes as separate Markdown blocks.

    Args:
        item: Floating item whose ``footnotes`` references are serialized.
        **kwargs: Parameter overrides merged into ``self.params`` and
            forwarded to the footnote text serialization.

    Returns:
        An empty result when the footnote label is not enabled; otherwise
        the footnote texts joined by a blank line, with the individual
        results as span source.
    """
    effective = self.params.merge_with_patch(patch=kwargs)
    if DocItemLabel.FOOTNOTE in effective.labels:
        footnotes = self._serialize_referenced_text_items(item.footnotes, **kwargs)
        # A blank line between entries keeps each footnote its own block.
        joined = "\n\n".join(res.text for res in footnotes)
        return create_ser_result(text=joined, span_source=footnotes)

    return create_ser_result()
943+
914944
@override
915945
def serialize_doc(
916946
self,

test/test_hierarchical_chunker.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -180,6 +180,31 @@ def test_triplet_table_serializer_single_column():
180180
expected = "Country = Italy. Country = Canada. Country = Switzerland"
181181
assert result.text == expected, f"Expected '{expected}', got '{result.text}'"
182182

183+
184+
def test_triplet_table_serializer_includes_footnotes():
    """Regression: table footnotes must be preserved in chunking serialization."""

    # Single-column table: a header row followed by one data row.
    data = TableData(num_cols=1)
    for row in (["Country"], ["Italy"]):
        data.add_row(row)

    doc = DoclingDocument(name="table_footnotes")
    doc.add_table(data=data)

    # The table is the only item added so far; take the first yielded entry.
    first_entry = next(iter(doc.iterate_items()))
    table_item = first_entry[0]

    # Attach a footnote text item to the table by reference.
    footnote_item = doc.add_text(label=DocItemLabel.FOOTNOTE, text="Country footnote")
    table_item.footnotes.append(footnote_item.get_ref())

    doc_serializer = ChunkingDocSerializer(doc=doc)
    table_serializer = TripletTableSerializer()
    result = table_serializer.serialize(item=table_item, doc_serializer=doc_serializer, doc=doc)

    serialized = result.text
    assert serialized == "Country = Italy\n\nCountry footnote"
    # The footnote must appear exactly once, i.e. not be duplicated.
    assert serialized.count("Country footnote") == 1
206+
207+
183208
def test_chunk_rich_table_custom_serializer(rich_table_doc: DoclingDocument):
184209
doc = rich_table_doc
185210

0 commit comments

Comments
 (0)