Skip to content

Commit be81d87

Browse files
committed
fix: restore floating-item footnotes in markdown/html exports
Signed-off-by: Hassan Raza <raihassanraza10@gmail.com>
1 parent 1513f7d commit be81d87

File tree

7 files changed

+353 additions, -40 deletions (lines changed)

7 files changed

+353
-40
lines changed

docling_core/transforms/chunker/hierarchical_chunker.py

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -100,6 +100,13 @@ def serialize(
100100
table_text = ". ".join(table_text_parts)
101101
parts.append(create_ser_result(text=table_text, span_source=item))
102102

103+
ftn_res = doc_serializer.serialize_footnotes(
104+
item=item,
105+
**kwargs,
106+
)
107+
if ftn_res.text:
108+
parts.append(ftn_res)
109+
103110
text_res = "\n\n".join([r.text for r in parts])
104111

105112
return create_ser_result(text=text_res, span_source=parts)

docling_core/transforms/serializer/common.py

Lines changed: 27 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -52,6 +52,7 @@
5252
PictureDataType,
5353
PictureItem,
5454
PictureMoleculeData,
55+
RefItem,
5556
Script,
5657
TableAnnotationType,
5758
TableItem,
@@ -316,6 +317,29 @@ def get_excluded_refs(self, **kwargs: Any) -> set[str]:
316317
self._excluded_refs_cache[params_json] = refs
317318
return refs
318319

320+
def _serialize_referenced_text_items(
321+
self,
322+
refs: Iterable[RefItem],
323+
**kwargs: Any,
324+
) -> list[SerializationResult]:
325+
"""Serialize referenced text items while bypassing the top-level skip path."""
326+
excluded_refs = self.get_excluded_refs(**kwargs)
327+
results: list[SerializationResult] = []
328+
329+
for ref in refs:
330+
if isinstance(it := ref.resolve(self.doc), TextItem) and it.self_ref not in excluded_refs:
331+
results.append(
332+
self.text_serializer.serialize(
333+
item=it,
334+
doc_serializer=self,
335+
doc=self.doc,
336+
is_inline_scope=True,
337+
**kwargs,
338+
)
339+
)
340+
341+
return results
342+
319343
@abstractmethod
320344
def serialize_doc(
321345
self,
@@ -621,18 +645,12 @@ def serialize_footnotes(
621645
) -> SerializationResult:
622646
"""Serialize the item's footnotes."""
623647
params = self.params.merge_with_patch(patch=kwargs)
624-
results: list[SerializationResult] = []
625648
if DocItemLabel.FOOTNOTE in params.labels:
626-
results = [
627-
create_ser_result(text=it.text, span_source=it)
628-
for ftn in item.footnotes
629-
if isinstance(it := ftn.resolve(self.doc), TextItem)
630-
and it.self_ref not in self.get_excluded_refs(**kwargs)
631-
]
632-
# FIXME: using the caption_delimiter for now ...
649+
results = self._serialize_referenced_text_items(item.footnotes, **kwargs)
650+
# Plain-text serializers keep floating-item metadata compact.
633651
text_res = params.caption_delim.join([r.text for r in results])
634-
text_res = self.post_process(text=text_res)
635652
else:
653+
results = []
636654
text_res = ""
637655
return create_ser_result(text=text_res, span_source=results)
638656

docling_core/transforms/serializer/html.py

Lines changed: 74 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -39,8 +39,10 @@
3939
create_ser_result,
4040
)
4141
from docling_core.transforms.serializer.html_styles import (
42+
_get_css_for_footnotes,
4243
_get_css_for_single_column,
4344
_get_css_for_split_page,
45+
_get_css_with_no_styling,
4446
)
4547
from docling_core.transforms.visualizer.base import BaseVisualizer
4648
from docling_core.types.doc.base import ImageRefMode
@@ -363,10 +365,10 @@ def serialize(
363365
**kwargs: Any,
364366
) -> SerializationResult:
365367
"""Serializes the passed table item to HTML."""
366-
res_parts: list[SerializationResult] = []
368+
table_parts: list[SerializationResult] = []
367369
cap_res = doc_serializer.serialize_captions(item=item, tag="caption", **kwargs)
368370
if cap_res.text:
369-
res_parts.append(cap_res)
371+
table_parts.append(cap_res)
370372

371373
if item.self_ref not in doc_serializer.get_excluded_refs(**kwargs):
372374
body = ""
@@ -416,10 +418,18 @@ def serialize(
416418

417419
if body:
418420
body = f"<tbody>{body}</tbody>"
419-
res_parts.append(create_ser_result(text=body, span_source=span_source))
421+
table_parts.append(create_ser_result(text=body, span_source=span_source))
422+
423+
res_parts: list[SerializationResult] = []
424+
if table_parts:
425+
table_text = "".join([r.text for r in table_parts])
426+
res_parts.append(create_ser_result(text=f"<table>{table_text}</table>", span_source=table_parts))
427+
428+
ftn_res = doc_serializer.serialize_footnotes(item=item, **kwargs)
429+
if ftn_res.text:
430+
res_parts.append(ftn_res)
420431

421432
text_res = "".join([r.text for r in res_parts])
422-
text_res = f"<table>{text_res}</table>" if text_res else ""
423433

424434
return create_ser_result(text=text_res, span_source=res_parts)
425435

@@ -610,6 +620,10 @@ def get_img_row(imgb64: str, ind: int) -> str:
610620
details_html = f"<details><summary>Meta</summary>{meta_res.text}</details>"
611621
res_parts.append(create_ser_result(text=details_html, span_source=[meta_res]))
612622

623+
ftn_res = doc_serializer.serialize_footnotes(item=item, **kwargs)
624+
if ftn_res.text:
625+
res_parts.append(ftn_res)
626+
613627
text_res = "".join([r.text for r in res_parts])
614628
if text_res:
615629
text_res = f"<figure>{text_res}</figure>"
@@ -1210,6 +1224,39 @@ def serialize_captions(
12101224
text_res = f"<{tag}>{text_res}</{tag}>"
12111225
return create_ser_result(text=text_res, span_source=results)
12121226

1227+
@override
1228+
def serialize_footnotes(
1229+
self,
1230+
item: FloatingItem,
1231+
**kwargs: Any,
1232+
) -> SerializationResult:
1233+
"""Serialize the item's footnotes."""
1234+
params = self.params.merge_with_patch(patch=kwargs)
1235+
if DocItemLabel.FOOTNOTE not in params.labels:
1236+
return create_ser_result()
1237+
1238+
raw_results = self._serialize_referenced_text_items(item.footnotes, **kwargs)
1239+
if not raw_results:
1240+
return create_ser_result()
1241+
1242+
results: list[SerializationResult] = []
1243+
for ser_res in raw_results:
1244+
dir_str = ""
1245+
if ser_res.spans and isinstance(ser_res.spans[0].item, TextItem):
1246+
text_dir = get_text_direction(ser_res.spans[0].item.text)
1247+
dir_str = f' dir="{text_dir}"' if text_dir == "rtl" else ""
1248+
1249+
results.append(
1250+
create_ser_result(
1251+
text=f'<div class="footnote"{dir_str}>{ser_res.text}</div>',
1252+
span_source=[ser_res],
1253+
)
1254+
)
1255+
1256+
text_res = "".join([r.text for r in results])
1257+
text_res = f'<div class="footnotes" role="note">{text_res}</div>'
1258+
return create_ser_result(text=text_res, span_source=results)
1259+
12131260
def _generate_head(self) -> str:
12141261
"""Generate the HTML head section with metadata and styles."""
12151262
params = self.params
@@ -1236,8 +1283,12 @@ def _generate_head(self) -> str:
12361283
head_parts.append(f"<style>\n{params.css_styles}\n</style>")
12371284
elif self.params.output_style == HTMLOutputStyle.SPLIT_PAGE:
12381285
head_parts.append(_get_css_for_split_page())
1286+
if self._has_visible_footnotes():
1287+
head_parts.append(_get_css_for_footnotes())
12391288
elif self.params.output_style == HTMLOutputStyle.SINGLE_COLUMN:
12401289
head_parts.append(_get_css_for_single_column())
1290+
if self._has_visible_footnotes():
1291+
head_parts.append(_get_css_for_footnotes())
12411292
else:
12421293
raise ValueError(f"unknown output-style: {self.params.output_style}")
12431294

@@ -1250,7 +1301,25 @@ def _generate_head(self) -> str:
12501301

12511302
def _get_default_css(self) -> str:
12521303
"""Return default CSS styles for the HTML document."""
1253-
return "<style></style>"
1304+
return _get_css_with_no_styling()
1305+
1306+
def _has_visible_footnotes(self) -> bool:
1307+
"""Whether the serialized output includes floating-item footnotes."""
1308+
if DocItemLabel.FOOTNOTE not in self.params.labels:
1309+
return False
1310+
1311+
excluded_refs = self.get_excluded_refs()
1312+
for items in (self.doc.tables, self.doc.pictures):
1313+
for item in items:
1314+
if item.self_ref in excluded_refs:
1315+
continue
1316+
1317+
for footnote_ref in item.footnotes:
1318+
if isinstance(it := footnote_ref.resolve(self.doc), TextItem):
1319+
if it.self_ref not in excluded_refs:
1320+
return True
1321+
1322+
return False
12541323

12551324
@override
12561325
def requires_page_break(self):

docling_core/transforms/serializer/html_styles.py

Lines changed: 37 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -1,14 +1,42 @@
11
"""HTML styles for different export modes."""
22

33

4+
def _wrap_style(css: str, *, trailing_newline: bool = False) -> str:
5+
"""Wrap CSS in a style tag."""
6+
suffix = "\n" if trailing_newline else ""
7+
return f"<style>{css}</style>{suffix}"
8+
9+
410
def _get_css_with_no_styling() -> str:
511
"""Return default CSS styles for the HTML document."""
6-
return "<style></style>"
12+
return _wrap_style("")
13+
14+
15+
_FOOTNOTE_CSS = """
16+
.footnotes {
17+
margin-top: 0.65em;
18+
padding-top: 0.45em;
19+
border-top: 1px solid #ddd;
20+
color: #666;
21+
font-size: 0.95em;
22+
line-height: 1.5;
23+
text-align: left;
24+
}
25+
.footnote + .footnote {
26+
margin-top: 0.35em;
27+
}
28+
"""
29+
30+
31+
def _get_css_for_footnotes() -> str:
32+
"""Return CSS styles for floating-item footnotes."""
33+
return _wrap_style(_FOOTNOTE_CSS)
734

835

936
def _get_css_for_split_page() -> str:
1037
"""Return default CSS styles for the HTML document."""
11-
return """<style>
38+
return _wrap_style(
39+
"""
1240
html {
1341
background-color: #e1e1e1;
1442
font-family: Arial, sans-serif;
@@ -87,13 +115,15 @@ def _get_css_for_split_page() -> str:
87115
word-wrap: break-word;
88116
/*overflow-wrap: break-word;*/
89117
}
90-
</style>
91-
"""
118+
""",
119+
trailing_newline=True,
120+
)
92121

93122

94123
def _get_css_for_single_column() -> str:
95124
"""Return CSS styles for the single-column HTML document."""
96-
return """<style>
125+
return _wrap_style(
126+
"""
97127
html {
98128
background-color: #f5f5f5;
99129
font-family: Arial, sans-serif;
@@ -209,4 +239,5 @@ def _get_css_for_single_column() -> str:
209239
color: #666;
210240
margin-top: 0.5em;
211241
}
212-
</style>"""
242+
"""
243+
)

docling_core/transforms/serializer/markdown.py

Lines changed: 30 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -561,6 +561,13 @@ def serialize(
561561
if table_text:
562562
res_parts.append(create_ser_result(text=table_text, span_source=item))
563563

564+
ftn_res = doc_serializer.serialize_footnotes(
565+
item=item,
566+
**kwargs,
567+
)
568+
if ftn_res.text:
569+
res_parts.append(ftn_res)
570+
564571
text_res = "\n\n".join([r.text for r in res_parts])
565572

566573
return create_ser_result(text=text_res, span_source=res_parts)
@@ -621,6 +628,14 @@ def serialize(
621628
md_table_content = temp_table.export_to_markdown(temp_doc)
622629
if len(md_table_content) > 0:
623630
res_parts.append(create_ser_result(text=md_table_content, span_source=item))
631+
632+
ftn_res = doc_serializer.serialize_footnotes(
633+
item=item,
634+
**kwargs,
635+
)
636+
if ftn_res.text:
637+
res_parts.append(ftn_res)
638+
624639
text_res = "\n\n".join([r.text for r in res_parts if r.text])
625640

626641
return create_ser_result(text=text_res, span_source=res_parts)
@@ -911,6 +926,21 @@ def post_process(
911926
)
912927
return res
913928

929+
@override
930+
def serialize_footnotes(
931+
self,
932+
item: FloatingItem,
933+
**kwargs: Any,
934+
) -> SerializationResult:
935+
"""Serialize footnotes as separate Markdown blocks."""
936+
params = self.params.merge_with_patch(patch=kwargs)
937+
if DocItemLabel.FOOTNOTE not in params.labels:
938+
return create_ser_result()
939+
940+
results = self._serialize_referenced_text_items(item.footnotes, **kwargs)
941+
text_res = "\n\n".join([r.text for r in results])
942+
return create_ser_result(text=text_res, span_source=results)
943+
914944
@override
915945
def serialize_doc(
916946
self,

test/test_hierarchical_chunker.py

Lines changed: 25 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -180,6 +180,31 @@ def test_triplet_table_serializer_single_column():
180180
expected = "Country = Italy. Country = Canada. Country = Switzerland"
181181
assert result.text == expected, f"Expected '{expected}', got '{result.text}'"
182182

183+
184+
def test_triplet_table_serializer_includes_footnotes():
185+
"""Regression: table footnotes must be preserved in chunking serialization."""
186+
187+
doc = DoclingDocument(name="table_footnotes")
188+
table_data = TableData(num_cols=1)
189+
table_data.add_row(["Country"])
190+
table_data.add_row(["Italy"])
191+
doc.add_table(data=table_data)
192+
193+
table_item = next(iter(doc.iterate_items()))[0]
194+
footnote = doc.add_text(label=DocItemLabel.FOOTNOTE, text="Country footnote")
195+
table_item.footnotes.append(footnote.get_ref())
196+
197+
serializer = ChunkingDocSerializer(doc=doc)
198+
result = TripletTableSerializer().serialize(
199+
item=table_item,
200+
doc_serializer=serializer,
201+
doc=doc,
202+
)
203+
204+
assert result.text == "Country = Italy\n\nCountry footnote"
205+
assert result.text.count("Country footnote") == 1
206+
207+
183208
def test_chunk_rich_table_custom_serializer(rich_table_doc: DoclingDocument):
184209
doc = rich_table_doc
185210

0 commit comments

Comments
 (0)