1212 InferenceAsrFramework ,
1313 InlineAsrMlxWhisperOptions ,
1414 InlineAsrNativeWhisperOptions ,
15+ InlineAsrWhisperS2TOptions ,
1516 TransformersModelType ,
1617)
1718
@@ -463,9 +464,138 @@ def _get_whisper_turbo_model():
463464 max_time_chunk = 30.0 ,
464465)
465466
# =============================================================================
# WhisperS2T Models (CTranslate2 backend - fastest option for CPU/CUDA)
# =============================================================================


def _s2t_preset(repo_id: str, batch_size: int) -> InlineAsrWhisperS2TOptions:
    """Build a WhisperS2T option preset with the defaults shared by every model.

    Every preset uses greedy decoding (beam_size=1) in float16 on the
    WHISPER_S2T framework; only the checkpoint name and the batch size
    (stepped down as model size grows) differ between presets.

    NOTE(review): language is hard-coded to "en" even for the multilingual
    checkpoints (tiny, base, small, medium, large-v3) — confirm this default
    is intentional rather than an oversight.
    """
    return InlineAsrWhisperS2TOptions(
        repo_id=repo_id,
        inference_framework=InferenceAsrFramework.WHISPER_S2T,
        language="en",
        task="transcribe",
        compute_type="float16",
        batch_size=batch_size,
        beam_size=1,
    )


# Tiny models
WHISPER_TINY_S2T = _s2t_preset("tiny", 16)
WHISPER_TINY_EN_S2T = _s2t_preset("tiny.en", 16)

# Base models
WHISPER_BASE_S2T = _s2t_preset("base", 12)
WHISPER_BASE_EN_S2T = _s2t_preset("base.en", 12)

# Small models
WHISPER_SMALL_S2T = _s2t_preset("small", 8)
WHISPER_SMALL_EN_S2T = _s2t_preset("small.en", 8)
WHISPER_DISTIL_SMALL_EN_S2T = _s2t_preset("distil-small.en", 10)

# Medium models
WHISPER_MEDIUM_S2T = _s2t_preset("medium", 6)
WHISPER_MEDIUM_EN_S2T = _s2t_preset("medium.en", 6)
WHISPER_DISTIL_MEDIUM_EN_S2T = _s2t_preset("distil-medium.en", 8)

# Large models
WHISPER_LARGE_V3_S2T = _s2t_preset("large-v3", 4)
WHISPER_DISTIL_LARGE_V3_S2T = _s2t_preset("distil-large-v3", 6)
595+
# Note: The main WHISPER_* models (WHISPER_TURBO, WHISPER_BASE, etc.) automatically
# select the best implementation (MLX on Apple Silicon, Native elsewhere).
# Use the explicit _MLX, _NATIVE, or _S2T variants if you need to force a specific implementation.

470600
471601class AsrModelType (str , Enum ):
@@ -492,3 +622,17 @@ class AsrModelType(str, Enum):
    # Explicit native Whisper models (force the reference whisper backend)
    WHISPER_BASE_NATIVE = "whisper_base_native"
    WHISPER_LARGE_NATIVE = "whisper_large_native"
    WHISPER_TURBO_NATIVE = "whisper_turbo_native"

    # Explicit WhisperS2T models (CTranslate2 backend - fastest)
    # Each value corresponds 1:1 to a module-level WHISPER_*_S2T options
    # constant; the string is the user-facing model-selection identifier.
    WHISPER_TINY_S2T = "whisper_tiny_s2t"
    WHISPER_TINY_EN_S2T = "whisper_tiny_en_s2t"
    WHISPER_BASE_S2T = "whisper_base_s2t"
    WHISPER_BASE_EN_S2T = "whisper_base_en_s2t"
    WHISPER_SMALL_S2T = "whisper_small_s2t"
    WHISPER_SMALL_EN_S2T = "whisper_small_en_s2t"
    WHISPER_DISTIL_SMALL_EN_S2T = "whisper_distil_small_en_s2t"
    WHISPER_MEDIUM_S2T = "whisper_medium_s2t"
    WHISPER_MEDIUM_EN_S2T = "whisper_medium_en_s2t"
    WHISPER_DISTIL_MEDIUM_EN_S2T = "whisper_distil_medium_en_s2t"
    WHISPER_LARGE_V3_S2T = "whisper_large_v3_s2t"
    WHISPER_DISTIL_LARGE_V3_S2T = "whisper_distil_large_v3_s2t"
0 commit comments