Skip to content

Commit 45223dc

Browse files
authored
Added audio alignment callback (#713)
1 parent 5d39547 commit 45223dc

1 file changed

Lines changed: 56 additions & 9 deletions

File tree

src/elevenlabs/conversational_ai/conversation.py

Lines changed: 56 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
import asyncio
33
import base64
44
from concurrent.futures import ThreadPoolExecutor
5+
from dataclasses import dataclass
56
from enum import Enum
67
import json
78
import logging
@@ -334,6 +335,16 @@ def __init__(
334335
self.tools_config_list = tools_config_list
335336
self.prompt_knowledge_base = prompt_knowledge_base
336337

338+
339+
@dataclass
340+
class AudioEventAlignment:
341+
"""Audio alignment data containing character-level timing information."""
342+
343+
chars: List[str]
344+
char_start_times_ms: List[int]
345+
char_durations_ms: List[int]
346+
347+
337348
class BaseConversation:
338349
"""Base class for conversation implementations with shared parameters and logic."""
339350

@@ -364,12 +375,16 @@ def __init__(
364375
def _get_wss_url(self):
365376
if self.on_prem_config:
366377
return self.on_prem_config.on_prem_conversation_url
367-
378+
368379
base_http_url = self.client._client_wrapper.get_base_url()
369-
base_ws_url = urllib.parse.urlparse(base_http_url)._replace(scheme="wss" if base_http_url.startswith("https") else "ws").geturl()
380+
base_ws_url = (
381+
urllib.parse.urlparse(base_http_url)
382+
._replace(scheme="wss" if base_http_url.startswith("https") else "ws")
383+
.geturl()
384+
)
370385
# Ensure base URL ends with '/' for proper joining
371-
if not base_ws_url.endswith('/'):
372-
base_ws_url += '/'
386+
if not base_ws_url.endswith("/"):
387+
base_ws_url += "/"
373388
return f"{base_ws_url}v1/convai/conversation?agent_id={self.agent_id}&source=python_sdk&version={__version__}"
374389

375390
def _get_signed_url(self):
@@ -391,7 +406,7 @@ def _create_on_prem_initiation_message(self):
391406
"prompt_knowledge_base": self.on_prem_config.prompt_knowledge_base,
392407
}
393408
)
394-
409+
395410
def _create_initiation_message(self):
396411
return json.dumps(
397412
{
@@ -426,6 +441,15 @@ def _handle_message_core(self, message, message_handler):
426441
audio = base64.b64decode(event["audio_base_64"])
427442
message_handler.handle_audio_output(audio)
428443

444+
if message_handler.callback_audio_alignment and "alignment" in event:
445+
alignment_data = event["alignment"]
446+
alignment = AudioEventAlignment(
447+
chars=alignment_data.get("chars", []),
448+
char_start_times_ms=alignment_data.get("char_start_times_ms", []),
449+
char_durations_ms=alignment_data.get("char_durations_ms", []),
450+
)
451+
message_handler.handle_audio_alignment(alignment)
452+
429453
elif message["type"] == "agent_response":
430454
if message_handler.callback_agent_response:
431455
event = message["agent_response_event"]
@@ -446,8 +470,7 @@ def _handle_message_core(self, message, message_handler):
446470
if message_handler.callback_agent_response_correction:
447471
event = message["agent_response_correction_event"]
448472
message_handler.handle_agent_response_correction(
449-
event["original_agent_response"].strip(),
450-
event["corrected_agent_response"].strip()
473+
event["original_agent_response"].strip(), event["corrected_agent_response"].strip()
451474
)
452475

453476
elif message["type"] == "user_transcript":
@@ -488,6 +511,15 @@ async def _handle_message_core_async(self, message, message_handler):
488511
audio = base64.b64decode(event["audio_base_64"])
489512
await message_handler.handle_audio_output(audio)
490513

514+
if message_handler.callback_audio_alignment and "alignment" in event:
515+
alignment_data = event["alignment"]
516+
alignment = AudioEventAlignment(
517+
chars=alignment_data.get("chars", []),
518+
char_start_times_ms=alignment_data.get("char_start_times_ms", []),
519+
char_durations_ms=alignment_data.get("char_durations_ms", []),
520+
)
521+
await message_handler.handle_audio_alignment(alignment)
522+
491523
elif message["type"] == "agent_response":
492524
if message_handler.callback_agent_response:
493525
event = message["agent_response_event"]
@@ -508,8 +540,7 @@ async def _handle_message_core_async(self, message, message_handler):
508540
if message_handler.callback_agent_response_correction:
509541
event = message["agent_response_correction_event"]
510542
await message_handler.handle_agent_response_correction(
511-
event["original_agent_response"].strip(),
512-
event["corrected_agent_response"].strip()
543+
event["original_agent_response"].strip(), event["corrected_agent_response"].strip()
513544
)
514545

515546
elif message["type"] == "user_transcript":
@@ -544,6 +575,7 @@ class Conversation(BaseConversation):
544575
callback_agent_chat_response_part: Optional[Callable[[str, AgentChatResponsePartType], None]]
545576
callback_user_transcript: Optional[Callable[[str], None]]
546577
callback_latency_measurement: Optional[Callable[[int], None]]
578+
callback_audio_alignment: Optional[Callable[[AudioEventAlignment], None]]
547579
callback_end_session: Optional[Callable]
548580

549581
_thread: Optional[threading.Thread]
@@ -565,6 +597,7 @@ def __init__(
565597
callback_agent_chat_response_part: Optional[Callable[[str, AgentChatResponsePartType], None]] = None,
566598
callback_user_transcript: Optional[Callable[[str], None]] = None,
567599
callback_latency_measurement: Optional[Callable[[int], None]] = None,
600+
callback_audio_alignment: Optional[Callable[[AudioEventAlignment], None]] = None,
568601
callback_end_session: Optional[Callable] = None,
569602
on_prem_config: Optional[OnPremInitiationData] = None,
570603
):
@@ -587,6 +620,7 @@ def __init__(
587620
First argument is the text chunk, second argument is the type (START, DELTA, STOP).
588621
callback_user_transcript: Callback for user transcripts.
589622
callback_latency_measurement: Callback for latency measurements (in milliseconds).
623+
callback_audio_alignment: Callback for audio alignment data with character-level timing.
590624
"""
591625

592626
super().__init__(
@@ -605,6 +639,7 @@ def __init__(
605639
self.callback_agent_chat_response_part = callback_agent_chat_response_part
606640
self.callback_user_transcript = callback_user_transcript
607641
self.callback_latency_measurement = callback_latency_measurement
642+
self.callback_audio_alignment = callback_audio_alignment
608643
self.callback_end_session = callback_end_session
609644

610645
self._thread = None
@@ -751,10 +786,14 @@ def __init__(self, conversation, ws):
751786
self.callback_agent_chat_response_part = conversation.callback_agent_chat_response_part
752787
self.callback_user_transcript = conversation.callback_user_transcript
753788
self.callback_latency_measurement = conversation.callback_latency_measurement
789+
self.callback_audio_alignment = conversation.callback_audio_alignment
754790

755791
def handle_audio_output(self, audio):
756792
self.conversation.audio_interface.output(audio)
757793

794+
def handle_audio_alignment(self, alignment):
795+
self.conversation.callback_audio_alignment(alignment)
796+
758797
def handle_agent_response(self, response):
759798
self.conversation.callback_agent_response(response)
760799

@@ -801,6 +840,7 @@ class AsyncConversation(BaseConversation):
801840
callback_agent_chat_response_part: Optional[Callable[[str, AgentChatResponsePartType], Awaitable[None]]]
802841
callback_user_transcript: Optional[Callable[[str], Awaitable[None]]]
803842
callback_latency_measurement: Optional[Callable[[int], Awaitable[None]]]
843+
callback_audio_alignment: Optional[Callable[[AudioEventAlignment], Awaitable[None]]]
804844
callback_end_session: Optional[Callable[[], Awaitable[None]]]
805845

806846
_task: Optional[asyncio.Task]
@@ -822,6 +862,7 @@ def __init__(
822862
callback_agent_chat_response_part: Optional[Callable[[str, AgentChatResponsePartType], Awaitable[None]]] = None,
823863
callback_user_transcript: Optional[Callable[[str], Awaitable[None]]] = None,
824864
callback_latency_measurement: Optional[Callable[[int], Awaitable[None]]] = None,
865+
callback_audio_alignment: Optional[Callable[[AudioEventAlignment], Awaitable[None]]] = None,
825866
callback_end_session: Optional[Callable[[], Awaitable[None]]] = None,
826867
on_prem_config: Optional[OnPremInitiationData] = None,
827868
):
@@ -844,6 +885,7 @@ def __init__(
844885
First argument is the text chunk, second argument is the type (START, DELTA, STOP).
845886
callback_user_transcript: Async callback for user transcripts.
846887
callback_latency_measurement: Async callback for latency measurements (in milliseconds).
888+
callback_audio_alignment: Async callback for audio alignment data with character-level timing.
847889
callback_end_session: Async callback for when session ends.
848890
"""
849891

@@ -863,6 +905,7 @@ def __init__(
863905
self.callback_agent_chat_response_part = callback_agent_chat_response_part
864906
self.callback_user_transcript = callback_user_transcript
865907
self.callback_latency_measurement = callback_latency_measurement
908+
self.callback_audio_alignment = callback_audio_alignment
866909
self.callback_end_session = callback_end_session
867910

868911
self._task = None
@@ -1012,10 +1055,14 @@ def __init__(self, conversation, ws):
10121055
self.callback_agent_chat_response_part = conversation.callback_agent_chat_response_part
10131056
self.callback_user_transcript = conversation.callback_user_transcript
10141057
self.callback_latency_measurement = conversation.callback_latency_measurement
1058+
self.callback_audio_alignment = conversation.callback_audio_alignment
10151059

10161060
async def handle_audio_output(self, audio):
10171061
await self.conversation.audio_interface.output(audio)
10181062

1063+
async def handle_audio_alignment(self, alignment):
1064+
await self.conversation.callback_audio_alignment(alignment)
1065+
10191066
async def handle_agent_response(self, response):
10201067
await self.conversation.callback_agent_response(response)
10211068

0 commit comments

Comments
 (0)