22import asyncio
33import base64
44from concurrent .futures import ThreadPoolExecutor
5+ from dataclasses import dataclass
56from enum import Enum
67import json
78import logging
@@ -334,6 +335,16 @@ def __init__(
334335 self .tools_config_list = tools_config_list
335336 self .prompt_knowledge_base = prompt_knowledge_base
336337
338+
@dataclass
class AudioEventAlignment:
    """Audio alignment data containing character-level timing information.

    Attributes:
        chars: Characters covered by the audio event.
        char_start_times_ms: Start time of each character; milliseconds,
            going by the field name.
        char_durations_ms: Duration of each character; milliseconds, going
            by the field name.

    NOTE(review): the three lists are presumably index-aligned (entry ``i``
    of each describes the same character); nothing here enforces equal
    lengths -- confirm against the producer of the ``alignment`` payload.
    """

    chars: List[str]
    char_start_times_ms: List[int]
    char_durations_ms: List[int]
346+
347+
337348class BaseConversation :
338349 """Base class for conversation implementations with shared parameters and logic."""
339350
@@ -364,12 +375,16 @@ def __init__(
def _get_wss_url(self):
    """Build the WebSocket URL used to open the conversation.

    Returns:
        ``self.on_prem_config.on_prem_conversation_url`` when an on-prem
        config is set. Otherwise the client's base URL with its scheme
        switched to ``wss`` (for https) or ``ws`` (anything else), followed
        by the ``v1/convai/conversation`` path and a query string carrying
        the agent id, the SDK source tag and the SDK version.
    """
    if self.on_prem_config:
        return self.on_prem_config.on_prem_conversation_url

    base_http_url = self.client._client_wrapper.get_base_url()
    parsed = urllib.parse.urlparse(base_http_url)
    # URL schemes are case-insensitive and urlparse lower-cases them, so test
    # the parsed scheme rather than the raw string ("HTTPS://..." must still
    # map to wss).
    ws_scheme = "wss" if parsed.scheme.startswith("https") else "ws"
    base_ws_url = parsed._replace(scheme=ws_scheme).geturl()
    # Ensure base URL ends with '/' for proper joining
    if not base_ws_url.endswith("/"):
        base_ws_url += "/"
    return f"{base_ws_url}v1/convai/conversation?agent_id={self.agent_id}&source=python_sdk&version={__version__}"
374389
375390 def _get_signed_url (self ):
@@ -391,7 +406,7 @@ def _create_on_prem_initiation_message(self):
391406 "prompt_knowledge_base" : self .on_prem_config .prompt_knowledge_base ,
392407 }
393408 )
394-
409+
395410 def _create_initiation_message (self ):
396411 return json .dumps (
397412 {
@@ -426,6 +441,15 @@ def _handle_message_core(self, message, message_handler):
426441 audio = base64 .b64decode (event ["audio_base_64" ])
427442 message_handler .handle_audio_output (audio )
428443
444+ if message_handler .callback_audio_alignment and "alignment" in event :
445+ alignment_data = event ["alignment" ]
446+ alignment = AudioEventAlignment (
447+ chars = alignment_data .get ("chars" , []),
448+ char_start_times_ms = alignment_data .get ("char_start_times_ms" , []),
449+ char_durations_ms = alignment_data .get ("char_durations_ms" , []),
450+ )
451+ message_handler .handle_audio_alignment (alignment )
452+
429453 elif message ["type" ] == "agent_response" :
430454 if message_handler .callback_agent_response :
431455 event = message ["agent_response_event" ]
@@ -446,8 +470,7 @@ def _handle_message_core(self, message, message_handler):
446470 if message_handler .callback_agent_response_correction :
447471 event = message ["agent_response_correction_event" ]
448472 message_handler .handle_agent_response_correction (
449- event ["original_agent_response" ].strip (),
450- event ["corrected_agent_response" ].strip ()
473+ event ["original_agent_response" ].strip (), event ["corrected_agent_response" ].strip ()
451474 )
452475
453476 elif message ["type" ] == "user_transcript" :
@@ -488,6 +511,15 @@ async def _handle_message_core_async(self, message, message_handler):
488511 audio = base64 .b64decode (event ["audio_base_64" ])
489512 await message_handler .handle_audio_output (audio )
490513
514+ if message_handler .callback_audio_alignment and "alignment" in event :
515+ alignment_data = event ["alignment" ]
516+ alignment = AudioEventAlignment (
517+ chars = alignment_data .get ("chars" , []),
518+ char_start_times_ms = alignment_data .get ("char_start_times_ms" , []),
519+ char_durations_ms = alignment_data .get ("char_durations_ms" , []),
520+ )
521+ await message_handler .handle_audio_alignment (alignment )
522+
491523 elif message ["type" ] == "agent_response" :
492524 if message_handler .callback_agent_response :
493525 event = message ["agent_response_event" ]
@@ -508,8 +540,7 @@ async def _handle_message_core_async(self, message, message_handler):
508540 if message_handler .callback_agent_response_correction :
509541 event = message ["agent_response_correction_event" ]
510542 await message_handler .handle_agent_response_correction (
511- event ["original_agent_response" ].strip (),
512- event ["corrected_agent_response" ].strip ()
543+ event ["original_agent_response" ].strip (), event ["corrected_agent_response" ].strip ()
513544 )
514545
515546 elif message ["type" ] == "user_transcript" :
@@ -544,6 +575,7 @@ class Conversation(BaseConversation):
544575 callback_agent_chat_response_part : Optional [Callable [[str , AgentChatResponsePartType ], None ]]
545576 callback_user_transcript : Optional [Callable [[str ], None ]]
546577 callback_latency_measurement : Optional [Callable [[int ], None ]]
578+ callback_audio_alignment : Optional [Callable [[AudioEventAlignment ], None ]]
547579 callback_end_session : Optional [Callable ]
548580
549581 _thread : Optional [threading .Thread ]
@@ -565,6 +597,7 @@ def __init__(
565597 callback_agent_chat_response_part : Optional [Callable [[str , AgentChatResponsePartType ], None ]] = None ,
566598 callback_user_transcript : Optional [Callable [[str ], None ]] = None ,
567599 callback_latency_measurement : Optional [Callable [[int ], None ]] = None ,
600+ callback_audio_alignment : Optional [Callable [[AudioEventAlignment ], None ]] = None ,
568601 callback_end_session : Optional [Callable ] = None ,
569602 on_prem_config : Optional [OnPremInitiationData ] = None ,
570603 ):
@@ -587,6 +620,7 @@ def __init__(
587620 First argument is the text chunk, second argument is the type (START, DELTA, STOP).
588621 callback_user_transcript: Callback for user transcripts.
589622 callback_latency_measurement: Callback for latency measurements (in milliseconds).
623+ callback_audio_alignment: Callback for audio alignment data with character-level timing.
590624 """
591625
592626 super ().__init__ (
@@ -605,6 +639,7 @@ def __init__(
605639 self .callback_agent_chat_response_part = callback_agent_chat_response_part
606640 self .callback_user_transcript = callback_user_transcript
607641 self .callback_latency_measurement = callback_latency_measurement
642+ self .callback_audio_alignment = callback_audio_alignment
608643 self .callback_end_session = callback_end_session
609644
610645 self ._thread = None
@@ -751,10 +786,14 @@ def __init__(self, conversation, ws):
751786 self .callback_agent_chat_response_part = conversation .callback_agent_chat_response_part
752787 self .callback_user_transcript = conversation .callback_user_transcript
753788 self .callback_latency_measurement = conversation .callback_latency_measurement
789+ self .callback_audio_alignment = conversation .callback_audio_alignment
754790
def handle_audio_output(self, audio):
    """Forward an audio chunk to the conversation's audio interface.

    Args:
        audio: Audio payload, passed unchanged to
            ``self.conversation.audio_interface.output``. The visible
            message dispatcher supplies base64-decoded bytes.
    """
    self.conversation.audio_interface.output(audio)
757793
def handle_audio_alignment(self, alignment):
    """Pass alignment data to the conversation's audio-alignment callback.

    The visible message dispatcher only calls this after checking that the
    handler's ``callback_audio_alignment`` is truthy; no check is repeated
    here.

    Args:
        alignment: Alignment object handed to
            ``self.conversation.callback_audio_alignment`` unchanged.
    """
    self.conversation.callback_audio_alignment(alignment)
796+
758797 def handle_agent_response (self , response ):
759798 self .conversation .callback_agent_response (response )
760799
@@ -801,6 +840,7 @@ class AsyncConversation(BaseConversation):
801840 callback_agent_chat_response_part : Optional [Callable [[str , AgentChatResponsePartType ], Awaitable [None ]]]
802841 callback_user_transcript : Optional [Callable [[str ], Awaitable [None ]]]
803842 callback_latency_measurement : Optional [Callable [[int ], Awaitable [None ]]]
843+ callback_audio_alignment : Optional [Callable [[AudioEventAlignment ], Awaitable [None ]]]
804844 callback_end_session : Optional [Callable [[], Awaitable [None ]]]
805845
806846 _task : Optional [asyncio .Task ]
@@ -822,6 +862,7 @@ def __init__(
822862 callback_agent_chat_response_part : Optional [Callable [[str , AgentChatResponsePartType ], Awaitable [None ]]] = None ,
823863 callback_user_transcript : Optional [Callable [[str ], Awaitable [None ]]] = None ,
824864 callback_latency_measurement : Optional [Callable [[int ], Awaitable [None ]]] = None ,
865+ callback_audio_alignment : Optional [Callable [[AudioEventAlignment ], Awaitable [None ]]] = None ,
825866 callback_end_session : Optional [Callable [[], Awaitable [None ]]] = None ,
826867 on_prem_config : Optional [OnPremInitiationData ] = None ,
827868 ):
@@ -844,6 +885,7 @@ def __init__(
844885 First argument is the text chunk, second argument is the type (START, DELTA, STOP).
845886 callback_user_transcript: Async callback for user transcripts.
846887 callback_latency_measurement: Async callback for latency measurements (in milliseconds).
888+ callback_audio_alignment: Async callback for audio alignment data with character-level timing.
847889 callback_end_session: Async callback for when session ends.
848890 """
849891
@@ -863,6 +905,7 @@ def __init__(
863905 self .callback_agent_chat_response_part = callback_agent_chat_response_part
864906 self .callback_user_transcript = callback_user_transcript
865907 self .callback_latency_measurement = callback_latency_measurement
908+ self .callback_audio_alignment = callback_audio_alignment
866909 self .callback_end_session = callback_end_session
867910
868911 self ._task = None
@@ -1012,10 +1055,14 @@ def __init__(self, conversation, ws):
10121055 self .callback_agent_chat_response_part = conversation .callback_agent_chat_response_part
10131056 self .callback_user_transcript = conversation .callback_user_transcript
10141057 self .callback_latency_measurement = conversation .callback_latency_measurement
1058+ self .callback_audio_alignment = conversation .callback_audio_alignment
10151059
async def handle_audio_output(self, audio):
    """Await delivery of an audio chunk to the conversation's audio interface.

    Args:
        audio: Audio payload, passed unchanged to
            ``self.conversation.audio_interface.output``, whose result is
            awaited. The visible message dispatcher supplies
            base64-decoded bytes.
    """
    await self.conversation.audio_interface.output(audio)
10181062
async def handle_audio_alignment(self, alignment):
    """Await the conversation's async audio-alignment callback.

    The visible async message dispatcher only calls this after checking
    that the handler's ``callback_audio_alignment`` is truthy; no check is
    repeated here.

    Args:
        alignment: Alignment object handed to
            ``self.conversation.callback_audio_alignment`` unchanged; the
            callback's result is awaited.
    """
    await self.conversation.callback_audio_alignment(alignment)
1065+
10191066 async def handle_agent_response (self , response ):
10201067 await self .conversation .callback_agent_response (response )
10211068
0 commit comments