Skip to content

Commit 85b6fef

Browse files
committed
feat: add fast ASR backend
Signed-off-by: BBC, Esquire <bbc@chintellalaw.com>
1 parent c74d378 commit 85b6fef

4 files changed

Lines changed: 496 additions & 9 deletions

File tree

docling/cli/main.py

Lines changed: 38 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,19 @@
5252
WHISPER_TURBO,
5353
WHISPER_TURBO_MLX,
5454
WHISPER_TURBO_NATIVE,
55+
# WhisperS2T models
56+
WHISPER_TINY_S2T,
57+
WHISPER_TINY_EN_S2T,
58+
WHISPER_BASE_S2T,
59+
WHISPER_BASE_EN_S2T,
60+
WHISPER_SMALL_S2T,
61+
WHISPER_SMALL_EN_S2T,
62+
WHISPER_DISTIL_SMALL_EN_S2T,
63+
WHISPER_MEDIUM_S2T,
64+
WHISPER_MEDIUM_EN_S2T,
65+
WHISPER_DISTIL_MEDIUM_EN_S2T,
66+
WHISPER_LARGE_V3_S2T,
67+
WHISPER_DISTIL_LARGE_V3_S2T,
5568
AsrModelType,
5669
)
5770
from docling.datamodel.backend_options import PdfBackendOptions
@@ -874,7 +887,6 @@ def convert( # noqa: C901
874887
# enable_remote_services=enable_remote_services,
875888
# artifacts_path = artifacts_path
876889
)
877-
878890
# Auto-selecting models (choose best implementation for hardware)
879891
if asr_model == AsrModelType.WHISPER_TINY:
880892
asr_pipeline_options.asr_options = WHISPER_TINY
@@ -888,7 +900,6 @@ def convert( # noqa: C901
888900
asr_pipeline_options.asr_options = WHISPER_LARGE
889901
elif asr_model == AsrModelType.WHISPER_TURBO:
890902
asr_pipeline_options.asr_options = WHISPER_TURBO
891-
892903
# Explicit MLX models (force MLX implementation)
893904
elif asr_model == AsrModelType.WHISPER_TINY_MLX:
894905
asr_pipeline_options.asr_options = WHISPER_TINY_MLX
@@ -902,7 +913,6 @@ def convert( # noqa: C901
902913
asr_pipeline_options.asr_options = WHISPER_LARGE_MLX
903914
elif asr_model == AsrModelType.WHISPER_TURBO_MLX:
904915
asr_pipeline_options.asr_options = WHISPER_TURBO_MLX
905-
906916
# Explicit Native models (force native implementation)
907917
elif asr_model == AsrModelType.WHISPER_TINY_NATIVE:
908918
asr_pipeline_options.asr_options = WHISPER_TINY_NATIVE
@@ -916,13 +926,35 @@ def convert( # noqa: C901
916926
asr_pipeline_options.asr_options = WHISPER_LARGE_NATIVE
917927
elif asr_model == AsrModelType.WHISPER_TURBO_NATIVE:
918928
asr_pipeline_options.asr_options = WHISPER_TURBO_NATIVE
919-
929+
# Explicit WhisperS2T models (CTranslate2 backend - fastest)
930+
elif asr_model == AsrModelType.WHISPER_TINY_S2T:
931+
asr_pipeline_options.asr_options = WHISPER_TINY_S2T
932+
elif asr_model == AsrModelType.WHISPER_TINY_EN_S2T:
933+
asr_pipeline_options.asr_options = WHISPER_TINY_EN_S2T
934+
elif asr_model == AsrModelType.WHISPER_BASE_S2T:
935+
asr_pipeline_options.asr_options = WHISPER_BASE_S2T
936+
elif asr_model == AsrModelType.WHISPER_BASE_EN_S2T:
937+
asr_pipeline_options.asr_options = WHISPER_BASE_EN_S2T
938+
elif asr_model == AsrModelType.WHISPER_SMALL_S2T:
939+
asr_pipeline_options.asr_options = WHISPER_SMALL_S2T
940+
elif asr_model == AsrModelType.WHISPER_SMALL_EN_S2T:
941+
asr_pipeline_options.asr_options = WHISPER_SMALL_EN_S2T
942+
elif asr_model == AsrModelType.WHISPER_DISTIL_SMALL_EN_S2T:
943+
asr_pipeline_options.asr_options = WHISPER_DISTIL_SMALL_EN_S2T
944+
elif asr_model == AsrModelType.WHISPER_MEDIUM_S2T:
945+
asr_pipeline_options.asr_options = WHISPER_MEDIUM_S2T
946+
elif asr_model == AsrModelType.WHISPER_MEDIUM_EN_S2T:
947+
asr_pipeline_options.asr_options = WHISPER_MEDIUM_EN_S2T
948+
elif asr_model == AsrModelType.WHISPER_DISTIL_MEDIUM_EN_S2T:
949+
asr_pipeline_options.asr_options = WHISPER_DISTIL_MEDIUM_EN_S2T
950+
elif asr_model == AsrModelType.WHISPER_LARGE_V3_S2T:
951+
asr_pipeline_options.asr_options = WHISPER_LARGE_V3_S2T
952+
elif asr_model == AsrModelType.WHISPER_DISTIL_LARGE_V3_S2T:
953+
asr_pipeline_options.asr_options = WHISPER_DISTIL_LARGE_V3_S2T
920954
else:
921955
_log.error(f"{asr_model} is not known")
922956
raise ValueError(f"{asr_model} is not known")
923-
924957
_log.debug(f"ASR pipeline_options: {asr_pipeline_options}")
925-
926958
audio_format_option = AudioFormatOption(
927959
pipeline_cls=AsrPipeline,
928960
pipeline_options=asr_pipeline_options,

docling/datamodel/asr_model_specs.py

Lines changed: 145 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
InferenceAsrFramework,
1313
InlineAsrMlxWhisperOptions,
1414
InlineAsrNativeWhisperOptions,
15+
InlineAsrWhisperS2TOptions,
1516
TransformersModelType,
1617
)
1718

@@ -463,9 +464,138 @@ def _get_whisper_turbo_model():
463464
max_time_chunk=30.0,
464465
)
465466

467+
# =============================================================================
# WhisperS2T Models (CTranslate2 backend - fastest option for CPU/CUDA)
# =============================================================================

# NOTE(review): every spec below pins language="en", including the
# multilingual checkpoints (tiny, base, small, medium, large-v3). Confirm
# this is the intended default rather than an oversight — forcing "en" on a
# multilingual model restricts it to English transcription.


def _make_s2t_options(repo_id: str, batch_size: int) -> InlineAsrWhisperS2TOptions:
    """Build a WhisperS2T option set with the defaults shared by all specs.

    Only ``repo_id`` and ``batch_size`` vary between the published specs;
    keeping the shared kwargs here prevents the twelve literals below from
    drifting apart.
    """
    return InlineAsrWhisperS2TOptions(
        repo_id=repo_id,
        inference_framework=InferenceAsrFramework.WHISPER_S2T,
        language="en",
        task="transcribe",
        compute_type="float16",
        batch_size=batch_size,
        beam_size=1,
    )


# Tiny models
WHISPER_TINY_S2T = _make_s2t_options("tiny", batch_size=16)
WHISPER_TINY_EN_S2T = _make_s2t_options("tiny.en", batch_size=16)

# Base models
WHISPER_BASE_S2T = _make_s2t_options("base", batch_size=12)
WHISPER_BASE_EN_S2T = _make_s2t_options("base.en", batch_size=12)

# Small models
WHISPER_SMALL_S2T = _make_s2t_options("small", batch_size=8)
WHISPER_SMALL_EN_S2T = _make_s2t_options("small.en", batch_size=8)
WHISPER_DISTIL_SMALL_EN_S2T = _make_s2t_options("distil-small.en", batch_size=10)

# Medium models
WHISPER_MEDIUM_S2T = _make_s2t_options("medium", batch_size=6)
WHISPER_MEDIUM_EN_S2T = _make_s2t_options("medium.en", batch_size=6)
WHISPER_DISTIL_MEDIUM_EN_S2T = _make_s2t_options("distil-medium.en", batch_size=8)

# Large models
WHISPER_LARGE_V3_S2T = _make_s2t_options("large-v3", batch_size=4)
WHISPER_DISTIL_LARGE_V3_S2T = _make_s2t_options("distil-large-v3", batch_size=6)
595+
466596
# Note: The main WHISPER_* models (WHISPER_TURBO, WHISPER_BASE, etc.) automatically
467597
# select the best implementation (MLX on Apple Silicon, Native elsewhere).
468-
# Use the explicit _MLX or _NATIVE variants if you need to force a specific implementation.
598+
# Use the explicit _MLX, _NATIVE, or _S2T variants if you need to force a specific implementation.
469599

470600

471601
class AsrModelType(str, Enum):
@@ -492,3 +622,17 @@ class AsrModelType(str, Enum):
492622
WHISPER_BASE_NATIVE = "whisper_base_native"
493623
WHISPER_LARGE_NATIVE = "whisper_large_native"
494624
WHISPER_TURBO_NATIVE = "whisper_turbo_native"
625+
626+
# Explicit WhisperS2T models (CTranslate2 backend - fastest)
627+
WHISPER_TINY_S2T = "whisper_tiny_s2t"
628+
WHISPER_TINY_EN_S2T = "whisper_tiny_en_s2t"
629+
WHISPER_BASE_S2T = "whisper_base_s2t"
630+
WHISPER_BASE_EN_S2T = "whisper_base_en_s2t"
631+
WHISPER_SMALL_S2T = "whisper_small_s2t"
632+
WHISPER_SMALL_EN_S2T = "whisper_small_en_s2t"
633+
WHISPER_DISTIL_SMALL_EN_S2T = "whisper_distil_small_en_s2t"
634+
WHISPER_MEDIUM_S2T = "whisper_medium_s2t"
635+
WHISPER_MEDIUM_EN_S2T = "whisper_medium_en_s2t"
636+
WHISPER_DISTIL_MEDIUM_EN_S2T = "whisper_distil_medium_en_s2t"
637+
WHISPER_LARGE_V3_S2T = "whisper_large_v3_s2t"
638+
WHISPER_DISTIL_LARGE_V3_S2T = "whisper_distil_large_v3_s2t"

docling/datamodel/pipeline_options_asr_model.py

Lines changed: 113 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ class InferenceAsrFramework(str, Enum):
2929
MLX = "mlx"
3030
# TRANSFORMERS = "transformers" # disabled for now
3131
WHISPER = "whisper"
32+
WHISPER_S2T = "whisper_s2t"
3233

3334

3435
class InlineAsrOptions(BaseAsrOptions):
@@ -262,3 +263,115 @@ class InlineAsrMlxWhisperOptions(InlineAsrOptions):
262263
)
263264
),
264265
] = 2.4
266+
267+
268+
class InlineAsrWhisperS2TOptions(InlineAsrOptions):
    """Configuration for WhisperS2T (CTranslate2-based) high-speed ASR.

    Uses the whisper_s2t library with a CTranslate2 backend for fast
    inference on CPU and CUDA devices. Requires the whisper-s2t-reborn
    package to be installed.
    """

    # Framework selector; fixed to the WhisperS2T backend for this class.
    inference_framework: Annotated[
        InferenceAsrFramework,
        Field(
            description=(
                "Inference framework for ASR. Uses WhisperS2T with CTranslate2 "
                "backend for optimized high-speed inference."
            )
        ),
    ] = InferenceAsrFramework.WHISPER_S2T

    # Transcription language (ISO 639-1).
    language: Annotated[
        str,
        Field(
            description=(
                "Language code for transcription. Use ISO 639-1 codes "
                "(e.g., `en`, `es`, `fr`)."
            ),
            examples=["en", "es", "fr", "de", "ja", "zh"],
        ),
    ] = "en"

    # Whisper task: same-language transcription vs. translation to English.
    task: Annotated[
        str,
        Field(
            description=(
                "ASR task type. `transcribe` converts speech to text in the "
                "same language. `translate` converts speech to English text."
            ),
            examples=["transcribe", "translate"],
        ),
    ] = "transcribe"

    # CTranslate2 numeric precision; lower precision trades accuracy for speed.
    compute_type: Annotated[
        str,
        Field(
            description=(
                "Computation precision for CTranslate2. Options: `float32`, "
                "`float16`, `bfloat16`. Lower precision increases speed and "
                "reduces memory. bfloat16 requires compute capability >= 8.6."
            ),
            examples=["float32", "float16", "bfloat16"],
        ),
    ] = "float16"

    # Parallel audio segments per forward pass.
    batch_size: Annotated[
        int,
        Field(
            description=(
                "Number of audio segments to process in parallel. Higher values "
                "increase throughput but require more VRAM."
            )
        ),
    ] = 8

    # Decoding beam width; 1 = greedy.
    beam_size: Annotated[
        int,
        Field(
            description=(
                "Beam size for beam search decoding. 1 = greedy decoding (fastest), "
                "higher values (e.g., 5) may improve accuracy at cost of speed."
            )
        ),
    ] = 1

    # Word-level timestamps require an extra alignment model.
    word_timestamps: Annotated[
        bool,
        Field(
            description=(
                "Generate word-level timestamps. Requires an additional alignment "
                "model and increases processing time."
            )
        ),
    ] = False

    # CPU-only tuning knob.
    cpu_threads: Annotated[
        int,
        Field(
            description=(
                "Number of CPU threads for inference. Only used when device is CPU."
            )
        ),
    ] = 4

    # CTranslate2 worker parallelism.
    num_workers: Annotated[
        int,
        Field(
            description=(
                "Number of parallel workers for CTranslate2."
            )
        ),
    ] = 1

    # Optional conditioning prompt (domain vocabulary, style).
    initial_prompt: Annotated[
        Optional[str],
        Field(
            description=(
                "Optional text prompt to condition the transcription style or "
                "provide context. Useful for domain-specific vocabulary."
            )
        ),
    ] = None

    # Accelerators this backend can run on (no MPS/MLX support).
    supported_devices: Annotated[
        list[AcceleratorDevice],
        Field(
            description=(
                "Hardware accelerators supported by WhisperS2T."
            )
        ),
    ] = [
        AcceleratorDevice.CPU,
        AcceleratorDevice.CUDA,
    ]

0 commit comments

Comments
 (0)