Skip to content

Commit 7bfa2c8

Browse files
committed
Merge branch 'main' of github.com:DS4SD/docling into cau/od-and-image-classifier-api-facet
2 parents 71709af + dbba6ea commit 7bfa2c8

File tree

3 files changed

+150
-23
lines changed

3 files changed

+150
-23
lines changed

docling/datamodel/pipeline_options.py

Lines changed: 13 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
BaseModel,
1111
ConfigDict,
1212
Field,
13+
field_validator,
1314
)
1415
from typing_extensions import deprecated
1516

@@ -58,6 +59,10 @@
5859
SMOLDOCLING_TRANSFORMERS as smoldocling_vlm_conversion_options,
5960
VlmModelType,
6061
)
62+
from docling.models.inference_engines.object_detection.base import (
63+
ObjectDetectionEngineOptionsMixin,
64+
)
65+
from docling.models.inference_engines.vlm.base import VlmEngineOptionsMixin
6166

6267
_log = logging.getLogger(__name__)
6368

@@ -639,7 +644,7 @@ def repo_cache_folder(self) -> str:
639644

640645

641646
class PictureDescriptionVlmEngineOptions(
642-
StagePresetMixin, PictureDescriptionBaseOptions
647+
StagePresetMixin, VlmEngineOptionsMixin, PictureDescriptionBaseOptions
643648
):
644649
"""Configuration for VLM runtime-based picture description.
645650
@@ -667,9 +672,6 @@ class PictureDescriptionVlmEngineOptions(
667672
model_spec: VlmModelSpec = Field(
668673
description="Model specification with runtime-specific overrides"
669674
)
670-
engine_options: BaseVlmEngineOptions = Field(
671-
description="Runtime configuration (transformers, mlx, api, etc.)"
672-
)
673675
prompt: Annotated[
674676
str,
675677
Field(
@@ -715,7 +717,7 @@ class PictureDescriptionVlmEngineOptions(
715717
"""
716718

717719

718-
class VlmConvertOptions(StagePresetMixin, BaseModel):
720+
class VlmConvertOptions(StagePresetMixin, VlmEngineOptionsMixin, BaseModel):
719721
"""Configuration for VLM-based document conversion.
720722
721723
This stage uses vision-language models to convert document pages to
@@ -738,10 +740,6 @@ class VlmConvertOptions(StagePresetMixin, BaseModel):
738740
description="Model specification with runtime-specific overrides"
739741
)
740742

741-
engine_options: BaseVlmEngineOptions = Field(
742-
description="Runtime configuration (transformers, mlx, api, etc.)"
743-
)
744-
745743
scale: float = Field(
746744
default=2.0, description="Image scaling factor for preprocessing"
747745
)
@@ -759,7 +757,7 @@ class VlmConvertOptions(StagePresetMixin, BaseModel):
759757
)
760758

761759

762-
class CodeFormulaVlmOptions(StagePresetMixin, BaseModel):
760+
class CodeFormulaVlmOptions(StagePresetMixin, VlmEngineOptionsMixin, BaseModel):
763761
"""Configuration for VLM-based code and formula extraction.
764762
765763
This stage uses vision-language models to extract code blocks and
@@ -778,10 +776,6 @@ class CodeFormulaVlmOptions(StagePresetMixin, BaseModel):
778776
description="Model specification with runtime-specific overrides"
779777
)
780778

781-
engine_options: BaseVlmEngineOptions = Field(
782-
description="Runtime configuration (transformers, mlx, api, etc.)"
783-
)
784-
785779
scale: float = Field(
786780
default=2.0, description="Image scaling factor for preprocessing"
787781
)
@@ -1119,7 +1113,11 @@ class LayoutOptions(BaseLayoutOptions):
11191113
] = DOCLING_LAYOUT_HERON
11201114

11211115

1122-
class LayoutObjectDetectionOptions(ObjectDetectionStagePresetMixin, BaseLayoutOptions):
1116+
class LayoutObjectDetectionOptions(
1117+
ObjectDetectionStagePresetMixin,
1118+
ObjectDetectionEngineOptionsMixin,
1119+
BaseLayoutOptions,
1120+
):
11231121
"""Options for layout detection using object-detection runtimes."""
11241122

11251123
kind: ClassVar[str] = "layout_object_detection"
@@ -1141,10 +1139,6 @@ class LayoutObjectDetectionOptions(ObjectDetectionStagePresetMixin, BaseLayoutOp
11411139
description="Object-detection model specification for layout analysis",
11421140
)
11431141

1144-
engine_options: BaseObjectDetectionEngineOptions = Field(
1145-
description="Runtime configuration for the object-detection engine",
1146-
)
1147-
11481142

11491143
LayoutObjectDetectionOptions.register_preset(
11501144
stage_model_specs.OBJECT_DETECTION_LAYOUT_HERON

docling/models/inference_engines/object_detection/base.py

Lines changed: 69 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,22 @@
55
import logging
66
from abc import ABC, abstractmethod
77
from enum import Enum
8-
from typing import TYPE_CHECKING, Any, Dict, List, Optional
8+
from typing import (
9+
TYPE_CHECKING,
10+
Any,
11+
ClassVar,
12+
Dict,
13+
List,
14+
Literal,
15+
Optional,
16+
Type,
17+
get_args,
18+
get_origin,
19+
)
920

1021
from PIL.Image import Image
11-
from pydantic import BaseModel, ConfigDict, Field
22+
from pydantic import BaseModel, ConfigDict, Field, field_validator
23+
from pydantic_core import PydanticUndefined
1224

1325
if TYPE_CHECKING:
1426
from docling.datamodel.stage_model_specs import EngineModelConfig
@@ -43,6 +55,61 @@ class BaseObjectDetectionEngineOptions(BaseModel):
4355
description="Minimum confidence score to keep a detection (0.0 to 1.0)",
4456
)
4557

58+
_registry: ClassVar[
59+
dict[ObjectDetectionEngineType, Type[BaseObjectDetectionEngineOptions]]
60+
] = {}
61+
62+
@classmethod
def __pydantic_init_subclass__(cls, **kwargs):
    """Register ``cls`` in the shared registry under its fixed engine type.

    The engine type is read from the subclass's ``engine_type`` field: first
    from a ``Literal[...]`` annotation that pins exactly one value, and
    otherwise from an explicit field default. Subclasses that provide
    neither are left unregistered.
    """
    super().__pydantic_init_subclass__(**kwargs)

    # The base class owns the registry but is never an entry in it.
    if cls is BaseObjectDetectionEngineOptions:
        return

    engine_field = cls.model_fields.get("engine_type")
    if not engine_field:
        return

    resolved = None

    # Preferred source: a Literal annotation pinning exactly one value.
    annotation = engine_field.annotation
    literal_values = get_args(annotation) if get_origin(annotation) is Literal else ()
    if len(literal_values) == 1:
        resolved = literal_values[0]

    # Fallback: an explicit default declared on the field.
    if resolved is None and engine_field.default is not PydanticUndefined:
        resolved = engine_field.default

    if resolved is None:
        return

    BaseObjectDetectionEngineOptions._registry[resolved] = cls
90+
91+
92+
class ObjectDetectionEngineOptionsMixin(BaseModel):
    """Mixin that adds an ``engine_options`` field resolved to a concrete class.

    The field is annotated with the base options type. A ``before`` validator
    uses the ``engine_type`` key of a dict payload to pick the matching class
    from ``BaseObjectDetectionEngineOptions._registry`` and validates the
    payload with it.
    """

    engine_options: BaseObjectDetectionEngineOptions = Field(
        description="Runtime configuration for the object-detection engine",
    )

    @field_validator("engine_options", mode="before")
    @classmethod
    def resolve_engine_options(cls, value: Any) -> Any:
        """Turn a raw ``engine_options`` payload into a registered subclass.

        Args:
            value: The raw input for ``engine_options``: an options instance,
                a dict payload, or anything else.

        Returns:
            ``value`` unchanged when it is already an options instance, is not
            a dict, or names no registered engine type; otherwise the instance
            produced by the registered class's ``model_validate``.
        """
        # Already a concrete options object: nothing to resolve.
        if isinstance(value, BaseObjectDetectionEngineOptions):
            return value

        # dict / JSON case
        if isinstance(value, dict):
            engine_type = value.get("engine_type")
            try:
                model_cls = BaseObjectDetectionEngineOptions._registry.get(engine_type)
            except TypeError:
                # An unhashable engine_type (e.g. a list) cannot be a registry
                # key. Fall through so regular field validation reports it,
                # instead of leaking a raw TypeError out of the validator.
                model_cls = None
            if model_cls is not None:
                return model_cls.model_validate(value)

        return value
112+
46113

47114
class ObjectDetectionEngineInput(BaseModel):
48115
"""Generic input accepted by every object-detection engine."""

docling/models/inference_engines/vlm/base.py

Lines changed: 68 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,22 @@
33
import logging
44
from abc import ABC, abstractmethod
55
from enum import Enum
6-
from typing import TYPE_CHECKING, Any, Dict, List, Optional
6+
from typing import (
7+
TYPE_CHECKING,
8+
Any,
9+
ClassVar,
10+
Dict,
11+
List,
12+
Literal,
13+
Optional,
14+
Type,
15+
get_args,
16+
get_origin,
17+
)
718

819
from PIL.Image import Image
9-
from pydantic import BaseModel, ConfigDict, Field
20+
from pydantic import BaseModel, ConfigDict, Field, field_validator
21+
from pydantic_core import PydanticUndefined
1022

1123
if TYPE_CHECKING:
1224
from docling.datamodel.stage_model_specs import EngineModelConfig
@@ -62,6 +74,60 @@ class BaseVlmEngineOptions(BaseModel):
6274

6375
engine_type: VlmEngineType = Field(description="Type of inference engine to use")
6476

77+
# registry: engine_type → subclass
78+
_registry: ClassVar[Dict[VlmEngineType, Type["BaseVlmEngineOptions"]]] = {}
79+
80+
@classmethod
def __pydantic_init_subclass__(cls, **kwargs):
    """Register ``cls`` in the shared registry under its fixed engine type.

    The engine type is read from the subclass's ``engine_type`` field: first
    from a ``Literal[...]`` annotation that pins exactly one value, and
    otherwise from an explicit field default. Subclasses that provide
    neither are left unregistered.
    """
    super().__pydantic_init_subclass__(**kwargs)

    # The base class owns the registry but is never an entry in it.
    if cls is BaseVlmEngineOptions:
        return

    engine_field = cls.model_fields.get("engine_type")
    if not engine_field:
        return

    resolved = None

    # Preferred source: a Literal annotation pinning exactly one value.
    annotation = engine_field.annotation
    literal_values = get_args(annotation) if get_origin(annotation) is Literal else ()
    if len(literal_values) == 1:
        resolved = literal_values[0]

    # Fallback: an explicit default declared on the field.
    if resolved is None and engine_field.default is not PydanticUndefined:
        resolved = engine_field.default

    if resolved is None:
        return

    BaseVlmEngineOptions._registry[resolved] = cls
108+
109+
110+
class VlmEngineOptionsMixin(BaseModel):
    """Mixin that adds an ``engine_options`` field resolved to a concrete class.

    The field is annotated with the base options type. A ``before`` validator
    uses the ``engine_type`` key of a dict payload to pick the matching class
    from ``BaseVlmEngineOptions._registry`` and validates the payload with it.
    """

    engine_options: BaseVlmEngineOptions = Field(
        description="Runtime configuration (transformers, mlx, api, etc.)"
    )

    @field_validator("engine_options", mode="before")
    @classmethod
    def resolve_engine_options(cls, value: Any) -> Any:
        """Turn a raw ``engine_options`` payload into a registered subclass.

        Args:
            value: The raw input for ``engine_options``: an options instance,
                a dict payload, or anything else.

        Returns:
            ``value`` unchanged when it is already an options instance, is not
            a dict, or names no registered engine type; otherwise the instance
            produced by the registered class's ``model_validate``.
        """
        # Already a concrete options object: nothing to resolve.
        if isinstance(value, BaseVlmEngineOptions):
            return value

        # dict / JSON case
        if isinstance(value, dict):
            engine_type = value.get("engine_type")
            try:
                model_cls = BaseVlmEngineOptions._registry.get(engine_type)
            except TypeError:
                # An unhashable engine_type (e.g. a list) cannot be a registry
                # key. Fall through so regular field validation reports it,
                # instead of leaking a raw TypeError out of the validator.
                model_cls = None
            if model_cls is not None:
                return model_cls.model_validate(value)

        return value
130+
65131

66132
class VlmEngineInput(BaseModel):
67133
"""Input to a VLM inference engine.

0 commit comments

Comments
 (0)