diff --git a/docling/backend/msexcel_backend.py b/docling/backend/msexcel_backend.py index 1304f7b1bf..0978d45a6c 100644 --- a/docling/backend/msexcel_backend.py +++ b/docling/backend/msexcel_backend.py @@ -256,8 +256,8 @@ def _convert_workbook(self, doc: DoclingDocument) -> DoclingDocument: self.parents[0] = doc.add_group( parent=None, - label=GroupLabel.SECTION, - name=f"sheet: {name}", + label=GroupLabel.SHEET, + name=name, content_layer=self._get_sheet_content_layer(sheet), ) doc = self._convert_sheet(doc, sheet, page_no) diff --git a/tests/data/groundtruth/docling_v2/1706.03762_main.tex.md b/tests/data/groundtruth/docling_v2/1706.03762_main.tex.md index 8e7849a82c..db788a9cf6 100644 --- a/tests/data/groundtruth/docling_v2/1706.03762_main.tex.md +++ b/tests/data/groundtruth/docling_v2/1706.03762_main.tex.md @@ -336,26 +336,26 @@ Variations on the Transformer architecture. Unlisted values are identical to tho | 0pt2.0ex | 2*$N$ | 2*$d_{\text{model}}$ | 2*$d_{\text{ff}}$ | 2*$h$ | 2*$d_k$ | 2*$d_v$ | 2*$P_{drop}$ | 2*$\epsilon_{ls}$ | train | PPL | BLEU | params | |----------------|---------|---------------------------------------------|---------------------|---------|-----------|-----------|----------------|---------------------|---------|-------|--------|--------------| | | | | | | | | | | steps | (dev) | (dev) | $\times10^6$ | -| 0pt2.0ex base | 6 | 512 | 2048 | 8 | 64 | 64 | 0.1 | 0.1 | 100K | 4.92 | 25.8 | 65 | -| 0pt2.0ex 4*(A) | | | | 1 | 512 | 512 | | | | 5.29 | 24.9 | | -| | | | | 4 | 128 | 128 | | | | 5.00 | 25.5 | | -| | | | | 16 | 32 | 32 | | | | 4.91 | 25.8 | | -| | | | | 32 | 16 | 16 | | | | 5.01 | 25.4 | | -| 0pt2.0ex 2*(B) | | | | | 16 | | | | | 5.16 | 25.1 | 58 | -| | | | | | 32 | | | | | 5.01 | 25.4 | 60 | +| 0pt2.0ex base | 6 | 512 | 2048 | 8 | 64 | 64 | 0.1 | 0.1 | 100K | 4.92 | 25.8 | 65 | +| 0pt2.0ex 4*(A) | | | | 1 | 512 | 512 | | | | 5.29 | 24.9 | | +| | | | | 4 | 128 | 128 | | | | 5.00 | 25.5 | | +| | | | | 16 | 32 | 32 | | | | 4.91 | 25.8 | | +| | | | | 32 | 16 | 16 | | | | 5.01 | 25.4 | | +| 0pt2.0ex 2*(B) | | | | | 16 | | | | | 5.16 | 25.1 | 58 | +| | | | | | 32 | | | | | 5.01 | 25.4 | 60 | | 0pt2.0ex 7*(C) | 2 | | | | | | | | | 6.11 | 23.7 | 36 | | | 4 | | | | | | | | | 5.19 | 25.3 | 50 | | | 8 | | | | | | | | | 4.88 | 25.5 | 80 | -| | | 256 | | | 32 | 32 | | | | 5.75 | 24.5 | 28 | -| | | 1024 | | | 128 | 128 | | | | 4.66 | 26.0 | 168 | +| | | 256 | | | 32 | 32 | | | | 5.75 | 24.5 | 28 | +| | | 1024 | | | 128 | 128 | | | | 4.66 | 26.0 | 168 | | | | | 1024 | | | | | | | 5.12 | 25.4 | 53 | | | | | 4096 | | | | | | | 4.75 | 26.2 | 90 | -| 0pt2.0ex 4*(D) | | | | | | | 0 | | | 5.77 | 24.6 | | +| 0pt2.0ex 4*(D) | | | | | | | 0.0 | | | 5.77 | 24.6 | | | | | | | | | | 0.2 | | | 4.95 | 25.5 | | -| | | | | | | | | 0 | | 4.67 | 25.3 | | +| | | | | | | | | 0.0 | | 4.67 | 25.3 | | | | | | | | | | | 0.2 | | 5.47 | 25.7 | | | 0pt2.0ex (E) | | 7cpositional embedding instead of sinusoids | | 4.92 | 25.7 | | | | | | | | -| 0pt2.0ex big | 6 | 1024 | 4096 | 16 | | | 0.3 | | 300K | 4.33 | 26.4 | 213 | +| 0pt2.0ex big | 6 | 1024 | 4096 | 16 | | | 0.3 | | 300K | 4.33 | 26.4 | 213 | | | | | | | | | | | | | | | To evaluate the importance of different components of the Transformer, we varied our base model in different ways, measuring the change in performance on English-to-German translation on the development set, newstest2013. We used beam search as described in the previous section, but no checkpoint averaging. We present these results in Table diff --git a/tests/data/groundtruth/docling_v2/2203.01017v2.md b/tests/data/groundtruth/docling_v2/2203.01017v2.md index 152a4b5277..208ca178a0 100644 --- a/tests/data/groundtruth/docling_v2/2203.01017v2.md +++ b/tests/data/groundtruth/docling_v2/2203.01017v2.md @@ -184,16 +184,16 @@ Structure. As shown in Tab. 2, TableFormer outperforms all SOTA methods across d | Model | Dataset | Simple | TEDS Complex | All | |-------------|-----------|----------|----------------|-------| -| EDD | PTN | 91.1 | 88.7 | 89.9 | +| EDD | PTN | 91.1 | 88.7 | 89.9 | | GTE | PTN | - | - | 93.01 | | TableFormer | PTN | 98.5 | 95.0 | 96.75 | -| EDD | FTN | 88.4 | 92.08 | 90.6 | +| EDD | FTN | 88.4 | 92.08 | 90.6 | | GTE | FTN | - | - | 87.14 | | GTE (FT) | FTN | - | - | 91.02 | -| TableFormer | FTN | 97.5 | 96.0 | 96.8 | -| EDD | TB | 86.0 | - | 86 | -| TableFormer | TB | 89.6 | - | 89.6 | -| TableFormer | STN | 96.9 | 95.7 | 96.7 | +| TableFormer | FTN | 97.5 | 96.0 | 96.8 | +| EDD | TB | 86.0 | - | 86.0 | +| TableFormer | TB | 89.6 | - | 89.6 | +| TableFormer | STN | 96.9 | 95.7 | 96.7 | Table 2: Structure results on PubTabNet (PTN), FinTabNet (FTN), TableBank (TB) and SynthTabNet (STN). @@ -215,9 +215,9 @@ Table 4: Results of structure with content retrieved using cell detection on Pub | Model | Simple | TEDS Complex | All | |-------------|----------|----------------|-------| -| Tabula | 78 | 57.8 | 67.9 | +| Tabula | 78.0 | 57.8 | 67.9 | | Traprange | 60.8 | 49.9 | 55.4 | -| Camelot | 80 | 66 | 73 | +| Camelot | 80.0 | 66.0 | 73.0 | | Acrobat Pro | 68.9 | 61.8 | 65.3 | | EDD | 91.2 | 85.4 | 88.3 | | TableFormer | 95.4 | 90.1 | 93.6 | diff --git a/tests/data/groundtruth/docling_v2/2305.03393_main.tex.md b/tests/data/groundtruth/docling_v2/2305.03393_main.tex.md index de5e3ceb90..bc35a4d408 100644 --- a/tests/data/groundtruth/docling_v2/2305.03393_main.tex.md +++ b/tests/data/groundtruth/docling_v2/2305.03393_main.tex.md @@ -169,14 +169,14 @@ HPO performed in OTSL and HTML representation on the same transformer-based Tabl | 2*c]@c@# enc-layers | 2*c]@c@# dec-layers | 2*Language | 3c|TEDs | 2*mAP (0.75) | 2*Inference time (secs) | | | |------------------------|------------------------|--------------|----------------|------------------|-----------------------------|-------|------| | 4-6 | | | 1c|simple | 1c|complex | all | | | -| 2*6 | 2*6 | OTSL | 0.965 | 0.934 | 0.955 | 0.88 | 2.73 | +| 2*6 | 2*6 | OTSL | 0.965 | 0.934 | 0.955 | 0.88 | 2.73 | | | | HTML | 0.969 | 0.927 | 0.955 | 0.857 | 5.39 | | 2*4 | 2*4 | OTSL | 0.938 | 0.904 | 0.927 | 0.853 | 1.97 | | | | HTML | 0.952 | 0.909 | 0.938 | 0.843 | 3.77 | | 2*2 | 2*4 | OTSL | 0.923 | 0.897 | 0.915 | 0.859 | 1.91 | | | | HTML | 0.945 | 0.901 | 0.931 | 0.834 | 3.81 | | 2*4 | 2*2 | OTSL | 0.952 | 0.92 | 0.942 | 0.857 | 1.22 | -| | | HTML | 0.944 | 0.903 | 0.931 | 0.824 | 2 | +| | | HTML | 0.944 | 0.903 | 0.931 | 0.824 | 2 | | | | | | | | | | ### Quantitative Results @@ -190,7 +190,7 @@ TSR and cell detection results compared between OTSL and HTML on the PubTabNet[P | 2*Data set | 2*Language | 3c|TEDs | 2*mAP(0.75) | 2*Inference time (secs) | | | |----------------|--------------|----------------|-----------------|-----------------------------|-------|------| | 3-5 | | 1c|simple | 1c|complex | all | | | -| 2*PubTabNet | OTSL | 0.965 | 0.934 | 0.955 | 0.88 | 2.73 | +| 2*PubTabNet | OTSL | 0.965 | 0.934 | 0.955 | 0.88 | 2.73 | | | HTML | 0.969 | 0.927 | 0.955 | 0.857 | 5.39 | | 2*FinTabNet | OTSL | 0.955 | 0.961 | 0.959 | 0.862 | 1.85 | | | HTML | 0.917 | 0.922 | 0.92 | 0.722 | 3.26 | diff --git a/tests/data/groundtruth/docling_v2/2412.19437_main.tex.md b/tests/data/groundtruth/docling_v2/2412.19437_main.tex.md index 2e553d14f7..dd4a5707bb 100644 --- a/tests/data/groundtruth/docling_v2/2412.19437_main.tex.md +++ b/tests/data/groundtruth/docling_v2/2412.19437_main.tex.md @@ -822,16 +822,16 @@ In addition, we perform language-modeling-based evaluation for Pile-test and use | | TriviaQA (EM) | 5-shot | 80.0 | 71.9 | 82.7 | 82.9 | | | | | NaturalQuestions (EM) | 5-shot | 38.6 | 33.2 | 41.5 | 40.0 | | | | | AGIEval (EM) | 0-shot | 57.5 | 75.8 | 60.6 | 79.6 | | | -| 75.4 | 4*Code | HumanEval (Pass@1) | 0-shot | 43.3 | 53.0 | 54.9 | 65.2 | | +| 75.4 | 4*Code | HumanEval (Pass@1) | 0-shot | 43.3 | 53.0 | 54.9 | 65.2 | | | | MBPP (Pass@1) | 3-shot | 65.0 | 72.6 | 68.4 | 75.4 | | | | | LiveCodeBench-Base (Pass@1) | 3-shot | 11.6 | 12.9 | 15.5 | 19.4 | | | | | CRUXEval-I (EM) | 2-shot | 52.5 | 59.1 | 58.5 | 67.3 | | | | | CRUXEval-O (EM) | 2-shot | 49.8 | 59.9 | 59.9 | 69.8 | | | -| (EM) | 3*Math | GSM8K (EM) | 8-shot | 81.6 | 88.3 | 83.5 | 89.3 | | +| (EM) | 3*Math | GSM8K (EM) | 8-shot | 81.6 | 88.3 | 83.5 | 89.3 | | | | MATH (EM) | 4-shot | 43.4 | 54.4 | 49.0 | 61.6 | | | | | MGSM (EM) | 8-shot | 63.6 | 76.2 | 69.9 | 79.8 | | | | | CMath (EM) | 3-shot | 78.7 | 84.5 | 77.3 | 90.7 | | | -| 90.1 | 7*Chinese | CLUEWSC (EM) | 5-shot | 82.0 | 82.5 | 83.0 | 82.7 | | +| 90.1 | 7*Chinese | CLUEWSC (EM) | 5-shot | 82.0 | 82.5 | 83.0 | 82.7 | | | | C-Eval (EM) | 5-shot | 81.4 | 89.2 | 72.5 | 90.1 | | | | | CMMLU (EM) | 5-shot | 84.0 | 89.5 | 73.7 | 88.8 | | | | | CMRC (EM) | 1-shot | 77.4 | 75.8 | 76.0 | 76.3 | | | @@ -1169,8 +1169,8 @@ DeepSeek-V3and Qwen2.5-72B exhibit similar performance levels, indicating that b | Qwen2.5-72B-Instruct | 81.2 | 49.1 | | LLaMA-3.1 405B | 69.3 | 40.5 | | GPT-4o-0513 | 80.4 | 51.1 | -| Claude-Sonnet-3.5-1022 | 85.2 | 52 | -| DeepSeek-V3 | 85.5 | 70 | +| Claude-Sonnet-3.5-1022 | 85.2 | 52.0 | +| DeepSeek-V3 | 85.5 | 70.0 | | | | | English open-ended conversation evaluations. @@ -1208,9 +1208,9 @@ Therefore, we employ DeepSeek-V3along with voting to offer self-feedback on open | GPT-4o-0513 | 96.6 | 70.4 | 86.7 | 84.9 | 84.7 | | GPT-4o-0806 | 96.1 | 76.1 | 88.1 | 86.6 | 86.7 | | GPT-4o-1120 | 95.8 | 71.3 | 86.2 | 85.2 | 84.6 | -| Claude-3.5-sonnet-0620 | 96.4 | 74 | 81.6 | 84.7 | 84.2 | +| Claude-3.5-sonnet-0620 | 96.4 | 74.0 | 81.6 | 84.7 | 84.2 | | Claude-3.5-sonnet-1022 | 96.4 | 79.7 | 91.1 | 87.6 | 88.7 | -| DeepSeek-V3 | 96.9 | 79.8 | 87 | 84.3 | 87 | +| DeepSeek-V3 | 96.9 | 79.8 | 87.0 | 84.3 | 87.0 | | DeepSeek-V3 (maj@6) | 96.9 | 82.6 | 89.5 | 89.2 | 89.6 | | | | | | | | diff --git a/tests/data/groundtruth/docling_v2/escaped_characters.md.md b/tests/data/groundtruth/docling_v2/escaped_characters.md.md index 95bdd09a14..683241cc61 100644 --- a/tests/data/groundtruth/docling_v2/escaped_characters.md.md +++ b/tests/data/groundtruth/docling_v2/escaped_characters.md.md @@ -41,4 +41,4 @@ The pipe symbol (| or `|` ) only needs to be escaped in tables. ## Link -[& < > " '](https://en.wikipedia.org/wiki/Albert_Einstein) +[& < > " '](https://en.wikipedia.org/wiki/Albert_Einstein) \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/mlac-20251231.xml.md b/tests/data/groundtruth/docling_v2/mlac-20251231.xml.md index e363f73f0d..93a9e6d938 100644 --- a/tests/data/groundtruth/docling_v2/mlac-20251231.xml.md +++ b/tests/data/groundtruth/docling_v2/mlac-20251231.xml.md @@ -131,27 +131,27 @@ The calculation of diluted net income per ordinary share does not consider the e The following table presents a reconciliation of the numerator and denominator used to compute basic and diluted net income ordinary share for each class of ordinary shares: -| | | For the Year Ended December 31, | | | | | | | | For the Period from June 14, 2024 (inception) through December 31, | | | | | | | -|-------------------------------------|----|------------------------------------|----------------|------|------|--------------------------|----------------|----|----|------------------------------------------------------------------------|-----------------|------|------|--------------------------|------------------|----| -| | | 2025 | 2025 | 2025 | 2025 | 2025 | 2025 | | | 2024 | 2024 | 2024 | 2024 | 2024 | 2024 | | -| | | Class A Ordinary Shares | | | | Class B Ordinary Shares | | | | Class A Ordinary Shares | | | | Class B Ordinary Shares | | | -| Basic net income per share: | | | | | | | | | | | | | | | | | -| Numerator: | | | | | | | | | | | | | | | | | -| Allocation of net income | | $ | 6.36127e+06 | | | $ | 1.92067e+06 | | | $ | 99286 | | | $ | 343831 | | -| Denominator: | | | | | | | | | | | | | | | | | -| Weighted-average shares outstanding | | | 2.3805e+07 | | | | 7.1875e+06 | | | | 1.78538e+06 | | | | 6.18281e+06 | | -| Basic net income per common stock | | $ | 0.27 | | | $ | 0.27 | | | $ | 0.06 | | | $ | 0.06 | | - -| | | For the Year Ended December 31, | | | | | | | | For the Period from June 14, 2024 (inception) through December 31, | | | | | | | -|-------------------------------------|----|------------------------------------|----------------|------|------|--------------------------|----------------|----|----|------------------------------------------------------------------------|-----------------|------|------|--------------------------|------------------|----| -| | | 2025 | 2025 | 2025 | 2025 | 2025 | 2025 | | | 2024 | 2024 | 2024 | 2024 | 2024 | 2024 | | -| | | Class A Ordinary Shares | | | | Class B Ordinary Shares | | | | Class A Ordinary Shares | | | | Class B Ordinary Shares | | | -| Diluted net income per share: | | | | | | | | | | | | | | | | | -| Numerator: | | | | | | | | | | | | | | | | | -| Allocation of net income | | $ | 6.36127e+06 | | | $ | 1.92067e+06 | | | $ | 93012 | | | $ | 350105 | | -| Denominator: | | | | | | | | | | | | | | | | | -| Weighted-average shares outstanding | | | 2.3805e+07 | | | | 7.1875e+06 | | | | 1.78538e+06 | | | | 6.72031e+06 | | -| Diluted net income per common stock | | $ | 0.27 | | | $ | 0.27 | | | $ | 0.05 | | | $ | 0.05 | | +| | | For the Year Ended December 31, | | | | | | | | For the Period from June 14, 2024 (inception) through December 31, | | | | | | | +|-------------------------------------|----|------------------------------------|------------|------|------|--------------------------|-----------|----|----|------------------------------------------------------------------------|-----------|------|------|--------------------------|-----------|----| +| | | 2025 | 2025 | 2025 | 2025 | 2025 | 2025 | | | 2024 | 2024 | 2024 | 2024 | 2024 | 2024 | | +| | | Class A Ordinary Shares | | | | Class B Ordinary Shares | | | | Class A Ordinary Shares | | | | Class B Ordinary Shares | | | +| Basic net income per share: | | | | | | | | | | | | | | | | | +| Numerator: | | | | | | | | | | | | | | | | | +| Allocation of net income | | $ | 6,361,272 | | | $ | 1,920,674 | | | $ | 99,286 | | | $ | 343,831 | | +| Denominator: | | | | | | | | | | | | | | | | | +| Weighted-average shares outstanding | | | 23,805,000 | | | | 7,187,500 | | | | 1,785,375 | | | | 6,182,813 | | +| Basic net income per common stock | | $ | 0.27 | | | $ | 0.27 | | | $ | 0.06 | | | $ | 0.06 | | + +| | | For the Year Ended December 31, | | | | | | | | For the Period from June 14, 2024 (inception) through December 31, | | | | | | | +|-------------------------------------|----|------------------------------------|------------|------|------|--------------------------|-----------|----|----|------------------------------------------------------------------------|-----------|------|------|--------------------------|-----------|----| +| | | 2025 | 2025 | 2025 | 2025 | 2025 | 2025 | | | 2024 | 2024 | 2024 | 2024 | 2024 | 2024 | | +| | | Class A Ordinary Shares | | | | Class B Ordinary Shares | | | | Class A Ordinary Shares | | | | Class B Ordinary Shares | | | +| Diluted net income per share: | | | | | | | | | | | | | | | | | +| Numerator: | | | | | | | | | | | | | | | | | +| Allocation of net income | | $ | 6,361,272 | | | $ | 1,920,674 | | | $ | 93,012 | | | $ | 350,105 | | +| Denominator: | | | | | | | | | | | | | | | | | +| Weighted-average shares outstanding | | | 23,805,000 | | | | 7,187,500 | | | | 1,785,375 | | | | 6,720,313 | | +| Diluted net income per common stock | | $ | 0.27 | | | $ | 0.27 | | | $ | 0.05 | | | $ | 0.05 | | Concentration of Credit Risk @@ -254,51 +254,51 @@ The calculation of diluted net income per ordinary share does not consider the e The following table presents a reconciliation of the numerator and denominator used to compute basic and diluted net income ordinary share for each class of ordinary shares: -| | | For the Year Ended December 31, | | | | | | | | For the Period from June 14, 2024 (inception) through December 31, | | | | | | | -|-------------------------------------|----|------------------------------------|----------------|------|------|--------------------------|----------------|----|----|------------------------------------------------------------------------|-----------------|------|------|--------------------------|------------------|----| -| | | 2025 | 2025 | 2025 | 2025 | 2025 | 2025 | | | 2024 | 2024 | 2024 | 2024 | 2024 | 2024 | | -| | | Class A Ordinary Shares | | | | Class B Ordinary Shares | | | | Class A Ordinary Shares | | | | Class B Ordinary Shares | | | -| Basic net income per share: | | | | | | | | | | | | | | | | | -| Numerator: | | | | | | | | | | | | | | | | | -| Allocation of net income | | $ | 6.36127e+06 | | | $ | 1.92067e+06 | | | $ | 99286 | | | $ | 343831 | | -| Denominator: | | | | | | | | | | | | | | | | | -| Weighted-average shares outstanding | | | 2.3805e+07 | | | | 7.1875e+06 | | | | 1.78538e+06 | | | | 6.18281e+06 | | -| Basic net income per common stock | | $ | 0.27 | | | $ | 0.27 | | | $ | 0.06 | | | $ | 0.06 | | - -| | | For the Year Ended December 31, | | | | | | | | For the Period from June 14, 2024 (inception) through December 31, | | | | | | | -|-------------------------------------|----|------------------------------------|----------------|------|------|--------------------------|----------------|----|----|------------------------------------------------------------------------|-----------------|------|------|--------------------------|------------------|----| -| | | 2025 | 2025 | 2025 | 2025 | 2025 | 2025 | | | 2024 | 2024 | 2024 | 2024 | 2024 | 2024 | | -| | | Class A Ordinary Shares | | | | Class B Ordinary Shares | | | | Class A Ordinary Shares | | | | Class B Ordinary Shares | | | -| Diluted net income per share: | | | | | | | | | | | | | | | | | -| Numerator: | | | | | | | | | | | | | | | | | -| Allocation of net income | | $ | 6.36127e+06 | | | $ | 1.92067e+06 | | | $ | 93012 | | | $ | 350105 | | -| Denominator: | | | | | | | | | | | | | | | | | -| Weighted-average shares outstanding | | | 2.3805e+07 | | | | 7.1875e+06 | | | | 1.78538e+06 | | | | 6.72031e+06 | | -| Diluted net income per common stock | | $ | 0.27 | | | $ | 0.27 | | | $ | 0.05 | | | $ | 0.05 | | +| | | For the Year Ended December 31, | | | | | | | | For the Period from June 14, 2024 (inception) through December 31, | | | | | | | +|-------------------------------------|----|------------------------------------|------------|------|------|--------------------------|-----------|----|----|------------------------------------------------------------------------|-----------|------|------|--------------------------|-----------|----| +| | | 2025 | 2025 | 2025 | 2025 | 2025 | 2025 | | | 2024 | 2024 | 2024 | 2024 | 2024 | 2024 | | +| | | Class A Ordinary Shares | | | | Class B Ordinary Shares | | | | Class A Ordinary Shares | | | | Class B Ordinary Shares | | | +| Basic net income per share: | | | | | | | | | | | | | | | | | +| Numerator: | | | | | | | | | | | | | | | | | +| Allocation of net income | | $ | 6,361,272 | | | $ | 1,920,674 | | | $ | 99,286 | | | $ | 343,831 | | +| Denominator: | | | | | | | | | | | | | | | | | +| Weighted-average shares outstanding | | | 23,805,000 | | | | 7,187,500 | | | | 1,785,375 | | | | 6,182,813 | | +| Basic net income per common stock | | $ | 0.27 | | | $ | 0.27 | | | $ | 0.06 | | | $ | 0.06 | | + +| | | For the Year Ended December 31, | | | | | | | | For the Period from June 14, 2024 (inception) through December 31, | | | | | | | +|-------------------------------------|----|------------------------------------|------------|------|------|--------------------------|-----------|----|----|------------------------------------------------------------------------|-----------|------|------|--------------------------|-----------|----| +| | | 2025 | 2025 | 2025 | 2025 | 2025 | 2025 | | | 2024 | 2024 | 2024 | 2024 | 2024 | 2024 | | +| | | Class A Ordinary Shares | | | | Class B Ordinary Shares | | | | Class A Ordinary Shares | | | | Class B Ordinary Shares | | | +| Diluted net income per share: | | | | | | | | | | | | | | | | | +| Numerator: | | | | | | | | | | | | | | | | | +| Allocation of net income | | $ | 6,361,272 | | | $ | 1,920,674 | | | $ | 93,012 | | | $ | 350,105 | | +| Denominator: | | | | | | | | | | | | | | | | | +| Weighted-average shares outstanding | | | 23,805,000 | | | | 7,187,500 | | | | 1,785,375 | | | | 6,720,313 | | +| Diluted net income per common stock | | $ | 0.27 | | | $ | 0.27 | | | $ | 0.05 | | | $ | 0.05 | | The following table presents a reconciliation of the numerator and denominator used to compute basic and diluted net income ordinary share for each class of ordinary shares: -| | | For the Year Ended December 31, | | | | | | | | For the Period from June 14, 2024 (inception) through December 31, | | | | | | | -|-------------------------------------|----|------------------------------------|----------------|------|------|--------------------------|----------------|----|----|------------------------------------------------------------------------|-----------------|------|------|--------------------------|------------------|----| -| | | 2025 | 2025 | 2025 | 2025 | 2025 | 2025 | | | 2024 | 2024 | 2024 | 2024 | 2024 | 2024 | | -| | | Class A Ordinary Shares | | | | Class B Ordinary Shares | | | | Class A Ordinary Shares | | | | Class B Ordinary Shares | | | -| Basic net income per share: | | | | | | | | | | | | | | | | | -| Numerator: | | | | | | | | | | | | | | | | | -| Allocation of net income | | $ | 6.36127e+06 | | | $ | 1.92067e+06 | | | $ | 99286 | | | $ | 343831 | | -| Denominator: | | | | | | | | | | | | | | | | | -| Weighted-average shares outstanding | | | 2.3805e+07 | | | | 7.1875e+06 | | | | 1.78538e+06 | | | | 6.18281e+06 | | -| Basic net income per common stock | | $ | 0.27 | | | $ | 0.27 | | | $ | 0.06 | | | $ | 0.06 | | - -| | | For the Year Ended December 31, | | | | | | | | For the Period from June 14, 2024 (inception) through December 31, | | | | | | | -|-------------------------------------|----|------------------------------------|----------------|------|------|--------------------------|----------------|----|----|------------------------------------------------------------------------|-----------------|------|------|--------------------------|------------------|----| -| | | 2025 | 2025 | 2025 | 2025 | 2025 | 2025 | | | 2024 | 2024 | 2024 | 2024 | 2024 | 2024 | | -| | | Class A Ordinary Shares | | | | Class B Ordinary Shares | | | | Class A Ordinary Shares | | | | Class B Ordinary Shares | | | -| Diluted net income per share: | | | | | | | | | | | | | | | | | -| Numerator: | | | | | | | | | | | | | | | | | -| Allocation of net income | | $ | 6.36127e+06 | | | $ | 1.92067e+06 | | | $ | 93012 | | | $ | 350105 | | -| Denominator: | | | | | | | | | | | | | | | | | -| Weighted-average shares outstanding | | | 2.3805e+07 | | | | 7.1875e+06 | | | | 1.78538e+06 | | | | 6.72031e+06 | | -| Diluted net income per common stock | | $ | 0.27 | | | $ | 0.27 | | | $ | 0.05 | | | $ | 0.05 | | +| | | For the Year Ended December 31, | | | | | | | | For the Period from June 14, 2024 (inception) through December 31, | | | | | | | +|-------------------------------------|----|------------------------------------|------------|------|------|--------------------------|-----------|----|----|------------------------------------------------------------------------|-----------|------|------|--------------------------|-----------|----| +| | | 2025 | 2025 | 2025 | 2025 | 2025 | 2025 | | | 2024 | 2024 | 2024 | 2024 | 2024 | 2024 | | +| | | Class A Ordinary Shares | | | | Class B Ordinary Shares | | | | Class A Ordinary Shares | | | | Class B Ordinary Shares | | | +| Basic net income per share: | | | | | | | | | | | | | | | | | +| Numerator: | | | | | | | | | | | | | | | | | +| Allocation of net income | | $ | 6,361,272 | | | $ | 1,920,674 | | | $ | 99,286 | | | $ | 343,831 | | +| Denominator: | | | | | | | | | | | | | | | | | +| Weighted-average shares outstanding | | | 23,805,000 | | | | 7,187,500 | | | | 1,785,375 | | | | 6,182,813 | | +| Basic net income per common stock | | $ | 0.27 | | | $ | 0.27 | | | $ | 0.06 | | | $ | 0.06 | | + +| | | For the Year Ended December 31, | | | | | | | | For the Period from June 14, 2024 (inception) through December 31, | | | | | | | +|-------------------------------------|----|------------------------------------|------------|------|------|--------------------------|-----------|----|----|------------------------------------------------------------------------|-----------|------|------|--------------------------|-----------|----| +| | | 2025 | 2025 | 2025 | 2025 | 2025 | 2025 | | | 2024 | 2024 | 2024 | 2024 | 2024 | 2024 | | +| | | Class A Ordinary Shares | | | | Class B Ordinary Shares | | | | Class A Ordinary Shares | | | | Class B Ordinary Shares | | | +| Diluted net income per share: | | | | | | | | | | | | | | | | | +| Numerator: | | | | | | | | | | | | | | | | | +| Allocation of net income | | $ | 6,361,272 | | | $ | 1,920,674 | | | $ | 93,012 | | | $ | 350,105 | | +| Denominator: | | | | | | | | | | | | | | | | | +| Weighted-average shares outstanding | | | 23,805,000 | | | | 7,187,500 | | | | 1,785,375 | | | | 6,720,313 | | +| Diluted net income per common stock | | $ | 0.27 | | | $ | 0.27 | | | $ | 0.05 | | | $ | 0.05 | | Concentration of Credit Risk @@ -416,7 +416,7 @@ The Public Share Rights have been classified within shareholders' deficit and wi |----------------------------|----|--------------------------------------------------------|-------|----| | Trade price of Unit | | $ | 10.94 | | | Risk-free rate | | | 4.17 | % | -| Market adjustment (1) | | | 9.2 | % | +| Market adjustment (1) | | | 9.2 | % | | Fair value per share right | | $ | 0.09 | | | (1) | Market adjustment reflects additional factors not fully captured by low volatility selection, which may include likelihood of Business Combination occurring, market perception of lack of available or suitable targets, or possible post-acquisition decline of stock price prior to beginning of the exercise period. The adjustment is determined by comparing traded warrant prices to simulated model outputs. | @@ -444,7 +444,7 @@ The following table presents the quantitative information regarding market assum |----------------------------|----|--------------------------------------------------------|-------|----| | Trade price of Unit | | $ | 10.94 | | | Risk-free rate | | | 4.17 | % | -| Market adjustment (1) | | | 9.2 | % | +| Market adjustment (1) | | | 9.2 | % | | Fair value per share right | | $ | 0.09 | | | (1) | Market adjustment reflects additional factors not fully captured by low volatility selection, which may include likelihood of Business Combination occurring, market perception of lack of available or suitable targets, or possible post-acquisition decline of stock price prior to beginning of the exercise period. The adjustment is determined by comparing traded warrant prices to simulated model outputs. | diff --git a/tests/data/groundtruth/docling_v2/pone.0234687.nxml.md b/tests/data/groundtruth/docling_v2/pone.0234687.nxml.md index 6f133259bd..d6d8d81b0d 100644 --- a/tests/data/groundtruth/docling_v2/pone.0234687.nxml.md +++ b/tests/data/groundtruth/docling_v2/pone.0234687.nxml.md @@ -42,18 +42,18 @@ Table 1 Descriptive characteristics of the herd. | Item | Unit | Average | |-------------------------------|-----------|-----------| -| Milking cows | # | 165 | -| Milk production | kg year-1 | 7015 | -| Milk fat | % | 4 | +| Milking cows | # | 165 | +| Milk production | kg year-1 | 7,015 | +| Milk fat | % | 4.0 | | Milk protein | % | 3.3 | -| Length of lactation | days | 305 | -| Body weight | kg | 553 | -| Lactations per cow | # | 4 | -| Replacement rate | % | 25 | -| Cull rate | % | 25 | -| First artificial insemination | months | 16 | -| Weaned | days | 60 | -| Mortality | % | 3 | +| Length of lactation | days | 305 | +| Body weight | kg | 553 | +| Lactations per cow | # | 4 | +| Replacement rate | % | 25 | +| Cull rate | % | 25 | +| First artificial insemination | months | 16 | +| Weaned | days | 60 | +| Mortality | % | 3.0 | ### Data sources and livestock system description @@ -119,33 +119,33 @@ Table 4 GHG emissions from On-farm feed production. | Item | Corn silage | Annual temperate pasture | Annual tropical pasture | Perennial tropical pasture | |-------------------------------------------|---------------|----------------------------|---------------------------|------------------------------| -| DM yield, kg ha-1 | 16000 | 9500 | 11000 | 9500 | +| DM yield, kg ha-1 | 16000 | 9500 | 11000 | 9500 | | Direct N2O emissions to air | | | | | -| N organic fertilizer, kg ha-1a | 150 | 180 | 225 | 225 | -| N synthetic fertilizer | - | 20 | 25 | 25 | -| N from residual DM, kg ha-1b | 70 | 112 | 129 | 112 | -| Emission fator, kg N2O-N (kg N)-1c | 0.002 | 0.002 | 0.002 | 0.002 | -| kg N2O ha-1 from direct emissions | 0.69 | 0.98 | 1.19 | 1.14 | +| N organic fertilizer, kg ha-1a | 150 | 180 | 225 | 225 | +| N synthetic fertilizer | - | 20 | 25 | 25 | +| N from residual DM, kg ha-1b | 70 | 112 | 129 | 112 | +| Emission fator, kg N2O-N (kg N)-1c | 0.002 | 0.002 | 0.002 | 0.002 | +| kg N2O ha-1 from direct emissions | 0.69 | 0.98 | 1.19 | 1.14 | | Indirect N2O emissions to air | | | | | -| kg NH3-N+NOx-N (kg organic N)-1b | 0.2 | 0.2 | 0.2 | 0.2 | -| kg NH3-N+NOx-N (kg synthetic N)-1b | 0.1 | 0.1 | 0.1 | 0.1 | -| kg N2O-N (kg NH3-N+NOx-N)-1b | 0.01 | 0.01 | 0.01 | 0.01 | -| kg N2O ha-1 from NH3+NOx volatilized | 0.47 | 0.60 | 0.75 | 0.75 | +| kg NH3-N+NOx-N (kg organic N)-1b | 0.2 | 0.2 | 0.2 | 0.2 | +| kg NH3-N+NOx-N (kg synthetic N)-1b | 0.1 | 0.1 | 0.1 | 0.1 | +| kg N2O-N (kg NH3-N+NOx-N)-1b | 0.01 | 0.01 | 0.01 | 0.01 | +| kg N2O ha-1 from NH3+NOx volatilized | 0.47 | 0.60 | 0.75 | 0.75 | | Indirect N2O emissions to soil | | | | | -| kg N losses by leaching (kg N)-1b | 0.3 | 0.3 | 0.3 | 0.3 | +| kg N losses by leaching (kg N)-1b | 0.3 | 0.3 | 0.3 | 0.3 | | kg N2O-N (kg N leaching)-1 | 0.0075 | 0.0075 | 0.0075 | 0.0075 | -| kg N2O ha-1 from N losses by leaching | 0.78 | 1.10 | 1.34 | 1.28 | -| kg N2O ha-1 (direct + indirect emissions) | 1.94 | 2.68 | 3.28 | 3.16 | -| kg CO2e ha-1 from N20 emissionsd | 514 | 710 | 869 | 838 | -| kg CO2 ha-1 from lime+ureab | 515 | 721 | 882 | 852 | -| kg CO2 ha-1 from diesel combustione | 802 | 38 | 23 | 12 | -| kg CO2e from secondary sourcesf | 516 | 205 | 225 | 284 | -| Total CO2e emitted, kg ha-1 | 1833 | 964 | 1130 | 1148 | -| Emission factor, kg CO2e (kg DM)-1g | 0.115 | 0.145 | 0.147 | 0.173 | -| Carbon sequestered, kg ha-1h | - | - | - | 570 | -| Sequestered CO2-C, kg ha-1 | - | - | - | 1393 | -| kg CO2e ha-1 (emitted—sequestered) | 1833 | 964 | 1130 | -245 | -| Emission factor, kg CO2e (kg DM)-1i | 0.115 | 0.145 | 0.147 | -0.037 | +| kg N2O ha-1 from N losses by leaching | 0.78 | 1.10 | 1.34 | 1.28 | +| kg N2O ha-1 (direct + indirect emissions) | 1.94 | 2.68 | 3.28 | 3.16 | +| kg CO2e ha-1 from N20 emissionsd | 514 | 710 | 869 | 838 | +| kg CO2 ha-1 from lime+ureab | 515 | 721 | 882 | 852 | +| kg CO2 ha-1 from diesel combustione | 802 | 38 | 23 | 12 | +| kg CO2e from secondary sourcesf | 516 | 205 | 225 | 284 | +| Total CO2e emitted, kg ha-1 | 1833 | 964 | 1130 | 1148 | +| Emission factor, kg CO2e (kg DM)-1g | 0.115 | 0.145 | 0.147 | 0.173 | +| Carbon sequestered, kg ha-1h | - | - | - | 570 | +| Sequestered CO2-C, kg ha-1 | - | - | - | 1393 | +| kg CO2e ha-1 (emitted—sequestered) | 1833 | 964 | 1130 | -245 | +| Emission factor, kg CO2e (kg DM)-1i | 0.115 | 0.145 | 0.147 | -0.037 | ### Animal husbandry @@ -169,15 +169,15 @@ Table 5 Factors for major resource inputs in farm management. |------------------------------------------|----------|-------------------|--------------| | Production and transport of diesel | 0.374 | kg CO2e L-1 | [41] | | Emissions from diesel fuel combustion | 2.637 | kg CO2e L-1 | [41] | -| Production of electricityb | 0.73 | kg CO2e kWh-1 | [41] | +| Production of electricityb | 0.73 | kg CO2e kWh-1 | [41] | | Production of electricity (alternative)c | 0.205 | kg CO2e kWh-1 | [46] | -| Production of machinery | 3.54 | kg CO2e (kg mm)-1 | [42] | +| Production of machinery | 3.54 | kg CO2e (kg mm)-1 | [42] | | Manure handling | | | | -| Fuel for manure handling | 0.6 | L diesel tonne-1 | [42] | -| Machinery for manure handling | 0.17 | kg mm kg-1 | [42] | +| Fuel for manure handling | 0.600 | L diesel tonne-1 | [42] | +| Machinery for manure handling | 0.17 | kg mm kg-1 | [42] | | Milking and confinement | | | | -| Electricity for milking | 0.06 | kWh (kg milk)-1 | [47] | -| Electricity for lightingd | 75 | kWh cow-1 | [47] | +| Electricity for milking | 0.06 | kWh (kg milk)-1 | [47] | +| Electricity for lightingd | 75 | kWh cow-1 | [47] | The amount of fuel use for manure handling were estimated taking into consideration the amount of manure produced per cow and the amounts of fuel required for manure handling (L diesel t-1) [42]. The amount of manure was estimated from OM excretions (kg cow-1), assuming that the manure has 8% ash on DM basis and 60% DM content. The OM excretions were calculated by NDOMI × days in confinement × proportion of daily time that animals stayed on confinement. diff --git a/tests/data/groundtruth/docling_v2/signature_stamp_01.md.md b/tests/data/groundtruth/docling_v2/signature_stamp_01.md.md index 1f54889793..19b7952d47 100644 --- a/tests/data/groundtruth/docling_v2/signature_stamp_01.md.md +++ b/tests/data/groundtruth/docling_v2/signature_stamp_01.md.md @@ -10,4 +10,4 @@ Stamp -Final text here +Final text here \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/xlsx_01.xlsx.itxt b/tests/data/groundtruth/docling_v2/xlsx_01.xlsx.itxt index 216d68f84c..e691d09a9d 100644 --- a/tests/data/groundtruth/docling_v2/xlsx_01.xlsx.itxt +++ b/tests/data/groundtruth/docling_v2/xlsx_01.xlsx.itxt @@ -1,11 +1,11 @@ item-0 at level 0: unspecified: group _root_ - item-1 at level 1: section: group sheet: Sheet1 + item-1 at level 1: sheet: group Sheet1 item-2 at level 2: table with [7x3] - item-3 at level 1: section: group sheet: Sheet2 + item-3 at level 1: sheet: group Sheet2 item-4 at level 2: table with [9x4] item-5 at level 2: table with [5x3] item-6 at level 2: table with [5x3] - item-7 at level 1: section: group sheet: Sheet3 + item-7 at level 1: sheet: group Sheet3 item-8 at level 2: table with [7x3] item-9 at level 2: table with [7x3] item-10 at level 2: picture \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/xlsx_01.xlsx.json b/tests/data/groundtruth/docling_v2/xlsx_01.xlsx.json index 95b0a8ba88..15a0dde352 100644 --- a/tests/data/groundtruth/docling_v2/xlsx_01.xlsx.json +++ b/tests/data/groundtruth/docling_v2/xlsx_01.xlsx.json @@ -46,8 +46,8 @@ } ], "content_layer": "body", - "name": "sheet: Sheet1", - "label": "section" + "name": "Sheet1", + "label": "sheet" }, { "self_ref": "#/groups/1", @@ -66,8 +66,8 @@ } ], "content_layer": "body", - "name": "sheet: Sheet2", - "label": "section" + "name": "Sheet2", + "label": "sheet" }, { "self_ref": "#/groups/2", @@ -86,8 +86,8 @@ } ], "content_layer": "body", - "name": "sheet: Sheet3", - "label": "section" + "name": "Sheet3", + "label": "sheet" }, { "self_ref": "#/groups/3", @@ -100,8 +100,8 @@ } ], "content_layer": "invisible", - "name": "sheet: Sheet4", - "label": "section" + "name": "Sheet4", + "label": "sheet" } ], "texts": [], diff --git a/tests/data/groundtruth/docling_v2/xlsx_02_sample_sales_data.xlsm.itxt b/tests/data/groundtruth/docling_v2/xlsx_02_sample_sales_data.xlsm.itxt index f7965d2401..72d44830e9 100644 --- a/tests/data/groundtruth/docling_v2/xlsx_02_sample_sales_data.xlsm.itxt +++ b/tests/data/groundtruth/docling_v2/xlsx_02_sample_sales_data.xlsm.itxt @@ -1,3 +1,3 @@ item-0 at level 0: unspecified: group _root_ - item-1 at level 1: section: group sheet: SalesData + item-1 at level 1: sheet: group SalesData item-2 at level 2: table with [21x4] \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/xlsx_02_sample_sales_data.xlsm.json b/tests/data/groundtruth/docling_v2/xlsx_02_sample_sales_data.xlsm.json index 7aadda78a5..06aa4a029c 100644 --- a/tests/data/groundtruth/docling_v2/xlsx_02_sample_sales_data.xlsm.json +++ b/tests/data/groundtruth/docling_v2/xlsx_02_sample_sales_data.xlsm.json @@ -37,8 +37,8 @@ } ], "content_layer": "body", - "name": "sheet: SalesData", - "label": "section" + "name": "SalesData", + "label": "sheet" } ], "texts": [], diff --git a/tests/data/groundtruth/docling_v2/xlsx_03_chartsheet.xlsx.itxt b/tests/data/groundtruth/docling_v2/xlsx_03_chartsheet.xlsx.itxt index 2bb5754e49..c3f5a0304a 100644 --- a/tests/data/groundtruth/docling_v2/xlsx_03_chartsheet.xlsx.itxt +++ b/tests/data/groundtruth/docling_v2/xlsx_03_chartsheet.xlsx.itxt @@ -1,4 +1,4 @@ item-0 at level 0: unspecified: group _root_ - item-1 at level 1: section: group sheet: Duck Observations + item-1 at level 1: sheet: group Duck Observations item-2 at level 2: table with [7x4] - item-3 at level 1: section: group sheet: Duck Chart \ No newline at end of file + item-3 at level 1: sheet: group Duck Chart \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/xlsx_03_chartsheet.xlsx.json b/tests/data/groundtruth/docling_v2/xlsx_03_chartsheet.xlsx.json index 41d946bf4c..ecb2042cd1 100644 --- a/tests/data/groundtruth/docling_v2/xlsx_03_chartsheet.xlsx.json +++ b/tests/data/groundtruth/docling_v2/xlsx_03_chartsheet.xlsx.json @@ -40,8 +40,8 @@ } ], "content_layer": "body", - "name": "sheet: Duck Observations", - "label": "section" + "name": "Duck Observations", + "label": "sheet" }, { "self_ref": "#/groups/1", @@ -50,8 +50,8 @@ }, "children": [], "content_layer": "body", - "name": "sheet: Duck Chart", - "label": "section" + "name": "Duck Chart", + "label": "sheet" } ], "texts": [], diff --git a/tests/data/groundtruth/docling_v2/xlsx_04_inflated.xlsx.itxt b/tests/data/groundtruth/docling_v2/xlsx_04_inflated.xlsx.itxt index 216d68f84c..e691d09a9d 100644 --- a/tests/data/groundtruth/docling_v2/xlsx_04_inflated.xlsx.itxt +++ b/tests/data/groundtruth/docling_v2/xlsx_04_inflated.xlsx.itxt @@ -1,11 +1,11 @@ item-0 at level 0: unspecified: group _root_ - item-1 at level 1: section: group sheet: Sheet1 + item-1 at level 1: sheet: group Sheet1 item-2 at level 2: table with [7x3] - item-3 at level 1: section: group sheet: Sheet2 + item-3 at level 1: sheet: group Sheet2 item-4 at level 2: table with [9x4] item-5 at level 2: table with [5x3] item-6 at level 2: table with [5x3] - item-7 at level 1: section: group sheet: Sheet3 + item-7 at level 1: sheet: group Sheet3 item-8 at level 2: table with [7x3] item-9 at level 2: table with [7x3] item-10 at level 2: picture \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/xlsx_04_inflated.xlsx.json b/tests/data/groundtruth/docling_v2/xlsx_04_inflated.xlsx.json index 75fac3d5d6..2fa329b54e 100644 --- a/tests/data/groundtruth/docling_v2/xlsx_04_inflated.xlsx.json +++ b/tests/data/groundtruth/docling_v2/xlsx_04_inflated.xlsx.json @@ -46,8 +46,8 @@ } ], "content_layer": "body", - "name": "sheet: Sheet1", - "label": "section" + "name": "Sheet1", + "label": "sheet" }, { "self_ref": "#/groups/1", @@ -66,8 +66,8 @@ } ], "content_layer": "body", - "name": "sheet: Sheet2", - "label": "section" + "name": "Sheet2", + "label": "sheet" }, { "self_ref": "#/groups/2", @@ -86,8 +86,8 @@ } ], "content_layer": "body", - "name": "sheet: Sheet3", - "label": "section" + "name": "Sheet3", + "label": "sheet" }, { "self_ref": "#/groups/3", @@ -100,8 +100,8 @@ } ], "content_layer": "invisible", - "name": "sheet: Sheet4", - "label": "section" + "name": "Sheet4", + "label": "sheet" } ], "texts": [], diff --git a/tests/data/groundtruth/docling_v2/xlsx_05_table_with_title.xlsx.itxt b/tests/data/groundtruth/docling_v2/xlsx_05_table_with_title.xlsx.itxt index 30e0db5881..f25067a475 100644 --- a/tests/data/groundtruth/docling_v2/xlsx_05_table_with_title.xlsx.itxt +++ b/tests/data/groundtruth/docling_v2/xlsx_05_table_with_title.xlsx.itxt @@ -1,4 +1,4 @@ item-0 at level 0: unspecified: group _root_ - item-1 at level 1: section: group sheet: Duck Observations + item-1 at level 1: sheet: group Duck Observations item-2 at level 2: table with [1x1] item-3 at level 2: table with [7x2] \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/xlsx_05_table_with_title.xlsx.json b/tests/data/groundtruth/docling_v2/xlsx_05_table_with_title.xlsx.json index 752b45e45d..8e1e600140 100644 --- a/tests/data/groundtruth/docling_v2/xlsx_05_table_with_title.xlsx.json +++ b/tests/data/groundtruth/docling_v2/xlsx_05_table_with_title.xlsx.json @@ -40,8 +40,8 @@ } ], "content_layer": "body", - "name": "sheet: Duck Observations", - "label": "section" + "name": "Duck Observations", + "label": "sheet" } ], "texts": [], diff --git a/tests/data/groundtruth/docling_v2/xlsx_06_edge_cases_.xlsx.itxt b/tests/data/groundtruth/docling_v2/xlsx_06_edge_cases_.xlsx.itxt index d94ca88480..5437af9f94 100644 --- a/tests/data/groundtruth/docling_v2/xlsx_06_edge_cases_.xlsx.itxt +++ b/tests/data/groundtruth/docling_v2/xlsx_06_edge_cases_.xlsx.itxt @@ -1,8 +1,8 @@ item-0 at level 0: unspecified: group _root_ - item-1 at level 1: section: group sheet: missing_header_head + item-1 at level 1: sheet: group missing_header_head item-2 at level 2: table with [4x6] - item-3 at level 1: section: group sheet: Attached_left + item-3 at level 1: sheet: group Attached_left item-4 at level 2: table with [4x7] - item-5 at level 1: section: group sheet: Diagonal + item-5 at level 1: sheet: group Diagonal item-6 at level 2: table with [4x6] item-7 at level 2: table with [3x3] \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/xlsx_06_edge_cases_.xlsx.json b/tests/data/groundtruth/docling_v2/xlsx_06_edge_cases_.xlsx.json index ac17943a23..cfcae08ee5 100644 --- a/tests/data/groundtruth/docling_v2/xlsx_06_edge_cases_.xlsx.json +++ b/tests/data/groundtruth/docling_v2/xlsx_06_edge_cases_.xlsx.json @@ -43,8 +43,8 @@ } ], "content_layer": "body", - "name": "sheet: missing_header_head", - "label": "section" + "name": "missing_header_head", + "label": "sheet" }, { "self_ref": "#/groups/1", @@ -57,8 +57,8 @@ } ], "content_layer": "body", - "name": "sheet: Attached_left", - "label": "section" + "name": "Attached_left", + "label": "sheet" }, { "self_ref": "#/groups/2", @@ -74,8 +74,8 @@ } ], "content_layer": "body", - "name": "sheet: Diagonal", - "label": "section" + "name": "Diagonal", + "label": "sheet" } ], "texts": [], diff --git a/tests/data/groundtruth/docling_v2/xlsx_07_gap_tolerance_.xlsx.itxt b/tests/data/groundtruth/docling_v2/xlsx_07_gap_tolerance_.xlsx.itxt index 29d4c11580..480e4a3b20 100644 --- a/tests/data/groundtruth/docling_v2/xlsx_07_gap_tolerance_.xlsx.itxt +++ b/tests/data/groundtruth/docling_v2/xlsx_07_gap_tolerance_.xlsx.itxt @@ -1,5 +1,5 @@ item-0 at level 0: unspecified: group _root_ - item-1 at level 1: section: group sheet: Page 1 + item-1 at level 1: sheet: group Page 1 item-2 at level 2: table with [2x1] item-3 at level 2: table with [3x1] item-4 at level 2: table with [1x1] diff --git a/tests/data/groundtruth/docling_v2/xlsx_07_gap_tolerance_.xlsx.json b/tests/data/groundtruth/docling_v2/xlsx_07_gap_tolerance_.xlsx.json index baecd915d9..aa6c8f98a3 100644 --- a/tests/data/groundtruth/docling_v2/xlsx_07_gap_tolerance_.xlsx.json +++ b/tests/data/groundtruth/docling_v2/xlsx_07_gap_tolerance_.xlsx.json @@ -79,8 +79,8 @@ } ], "content_layer": "body", - "name": "sheet: Page 1", - "label": "section" + "name": "Page 1", + "label": "sheet" } ], "texts": [], diff --git a/tests/data/groundtruth/docling_v2/xlsx_08_one_cell_anchor.xlsx.itxt b/tests/data/groundtruth/docling_v2/xlsx_08_one_cell_anchor.xlsx.itxt index 2e24c295ad..362a3e2c49 100644 --- a/tests/data/groundtruth/docling_v2/xlsx_08_one_cell_anchor.xlsx.itxt +++ b/tests/data/groundtruth/docling_v2/xlsx_08_one_cell_anchor.xlsx.itxt @@ -1,4 +1,4 @@ item-0 at level 0: unspecified: group _root_ - item-1 at level 1: section: group sheet: Sheet1 + item-1 at level 1: sheet: group Sheet1 item-2 at level 2: table with [3x2] item-3 at level 2: picture \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/xlsx_08_one_cell_anchor.xlsx.json b/tests/data/groundtruth/docling_v2/xlsx_08_one_cell_anchor.xlsx.json index 23a0a84f48..3fcb44ac6b 100644 --- a/tests/data/groundtruth/docling_v2/xlsx_08_one_cell_anchor.xlsx.json +++ b/tests/data/groundtruth/docling_v2/xlsx_08_one_cell_anchor.xlsx.json @@ -40,8 +40,8 @@ } ], "content_layer": "body", - "name": "sheet: Sheet1", - "label": "section" + "name": "Sheet1", + "label": "sheet" } ], "texts": [], diff --git a/tests/test_backend_msexcel.py b/tests/test_backend_msexcel.py index c4ffa51668..5c273836d8 100644 --- a/tests/test_backend_msexcel.py +++ b/tests/test_backend_msexcel.py @@ -59,8 +59,14 @@ def documents() -> list[tuple[Path, DoclingDocument]]: def test_e2e_excel_conversions(documents) -> None: + _mod = pytest.importorskip( + "docling_core.transforms.serializer.markdown_excel", + reason="docling-core with MsExcelMarkdownDocSerializer not installed", + ) + MsExcelMarkdownDocSerializer = _mod.MsExcelMarkdownDocSerializer + for gt_path, doc in documents: - pred_md: str = doc.export_to_markdown() + pred_md: str = MsExcelMarkdownDocSerializer(doc=doc).serialize().text assert verify_export(pred_md, str(gt_path) + ".md"), "export to md" pred_itxt: str = doc._export_to_indented_text( @@ -113,7 +119,7 @@ def test_chartsheet(documents) -> None: assert len(doc.pages) == 2 # Chartseet content is for now ignored - assert doc.groups[1].name == "sheet: Duck Chart" + assert doc.groups[1].name == "Duck Chart" assert doc.pages[2].size.height == 0 assert doc.pages[2].size.width == 0 @@ -249,7 +255,7 @@ def test_table_with_title(): conv_result: ConversionResult = converter.convert(path) doc: DoclingDocument = conv_result.document - # With treat_singleton_as_text=True, the singleton title cell should be a TextItem + # With treat_singleton_as_text=True, the singleton title cell should be a TextItem. texts = list(doc.texts) tables = list(doc.tables) @@ -271,6 +277,37 @@ def test_table_with_title(): ) +def test_sheet_names_as_headings(documents) -> None: + """Test that sheet names are rendered as headings in markdown export. + + Sheet groups (``GroupLabel.SHEET``) carry the sheet name. + The ``MsExcelMarkdownDocSerializer`` renders each such group's name as a + level-2 Markdown heading before the group's tables. No heading nodes are + injected into the document model by the backend. + Sheet4 in xlsx_01 is empty and intentionally has no heading. + """ + _mod = pytest.importorskip( + "docling_core.transforms.serializer.markdown_excel", + reason="docling-core with MsExcelMarkdownDocSerializer not installed", + ) + MsExcelMarkdownDocSerializer = _mod.MsExcelMarkdownDocSerializer + + doc = next(item for path, item in documents if path.stem == "xlsx_01") + + # No SectionHeaderItem nodes injected by the backend + headings = [t for t in doc.texts if t.label.value == "section_header"] + assert headings == [], ( + f"Backend should not inject heading nodes; found: {[h.text for h in headings]}" + ) + + # Sheet names appear as ## headings via the custom serializer + non_empty_sheet_names = ["Sheet1", "Sheet2", "Sheet3"] + serializer = MsExcelMarkdownDocSerializer(doc=doc) + md = serializer.serialize().text + for name in non_empty_sheet_names: + assert f"## {name}" in md, f"Expected '## {name}' in markdown output" + + def test_bytesio_stream(): """Test that Excel files can be loaded from BytesIO streams. diff --git a/uv.lock b/uv.lock index a3989046dd..d2996fa086 100644 --- a/uv.lock +++ b/uv.lock @@ -1345,7 +1345,7 @@ examples = [ [[package]] name = "docling-core" -version = "2.73.0" +version = "2.74.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "defusedxml" }, @@ -1361,9 +1361,9 @@ dependencies = [ { name = "typer" }, { name = "typing-extensions" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/4c/e3/b9c3b1a1ea62e5e03d9e844a5cff2f89b7a3e960725a862f009e8553ca3d/docling_core-2.73.0.tar.gz", hash = "sha256:33ffc2b2bf736ed0e079bba296081a26885f6cb08081c828d630ca85a51e22e0", size = 308895, upload-time = "2026-04-09T08:08:51.573Z" } +sdist = { url = "https://files.pythonhosted.org/packages/43/d1/147ec84a59217d63620885e5103f9f40101972e70aae9e1c3b501e5637b8/docling_core-2.74.0.tar.gz", hash = "sha256:e8beb0b84a033c814386b1d990e73cb1c68c6485906c78c841b901577c705dc0", size = 316214, upload-time = "2026-04-17T06:50:28.344Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/7f/c3/08143b7e8fe1b9230ce15e54926859f8c40ec2622fb612f0b2ff13169696/docling_core-2.73.0-py3-none-any.whl", hash = "sha256:4366fab8f4422fbde090ed87d9b091bd25b3b37cdd284dc0b02c9a5e24caaa22", size = 271518, upload-time = "2026-04-09T08:08:49.838Z" }, + { url = "https://files.pythonhosted.org/packages/b4/9e/a7a5a71db047f5f50f5e4a4a43a918f346f97752539f1e5d99c785487497/docling_core-2.74.0-py3-none-any.whl", hash = "sha256:359f101a261cdcfa592bcb0e82dd508bd431f8d9ed49c6938ee271db1d420039", size = 275860, upload-time = "2026-04-17T06:50:26.779Z" }, ] [package.optional-dependencies]