"""Text-to-speech agent backed by the Gemini TTS preview model."""

import asyncio
import io

from google import genai
from google.genai import types
from pydub import AudioSegment

from ..constants import SAFETY_SETTINGS

TTS_MODEL = "gemini-2.5-flash-preview-tts"

# Prebuilt voice used when the caller does not choose one.
DEFAULT_VOICE = "Kore"

# Layout of the raw PCM stream handed to pydub: 16-bit samples, 24 kHz, mono.
PCM_SAMPLE_WIDTH = 2
PCM_FRAME_RATE = 24000
PCM_CHANNELS = 1


class TTSAgent:
    """Generate speech for a piece of text and return it as OGG/Opus bytes."""

    def __init__(
        self,
        client: genai.client.AsyncClient,
        *,
        voice_name: str = DEFAULT_VOICE,
    ) -> None:
        """Store the client and build the reusable generation config.

        Args:
            client: Async Gemini client used to issue generation requests.
            voice_name: Name of the prebuilt voice to synthesize with.
        """
        self.client = client
        self.content_config = types.GenerateContentConfig(
            response_modalities=[types.Modality.AUDIO],
            speech_config=types.SpeechConfig(
                voice_config=types.VoiceConfig(
                    prebuilt_voice_config=types.PrebuiltVoiceConfig(
                        voice_name=voice_name,
                    )
                )
            ),
            # NOTE(review): safety settings are currently not applied; the
            # SAFETY_SETTINGS import is kept for re-enabling -- confirm intent.
            # safety_settings=SAFETY_SETTINGS,
        )

    async def generate(self, text: str) -> bytes:
        """Synthesize *text* and return the audio encoded as OGG/Opus.

        Args:
            text: The text to be spoken.

        Returns:
            The encoded audio as OGG (libopus) bytes.

        Raises:
            RuntimeError: If the model response carries no audio payload.
        """
        response = await self.client.models.generate_content(
            model=TTS_MODEL, contents=text, config=self.content_config
        )
        pcm = self._extract_pcm(response)
        # Encoding shells out to ffmpeg and blocks until it finishes; run it
        # in a worker thread so the event loop stays responsive.
        return await asyncio.to_thread(self._pcm_to_ogg, pcm)

    @staticmethod
    def _extract_pcm(response) -> bytes:
        """Return the first inline audio payload of the first candidate."""
        candidates = response.candidates or []
        if not candidates:
            raise RuntimeError("TTS response contained no candidates")

        candidate = candidates[0]
        content = candidate.content
        parts = (content.parts if content is not None else None) or []
        for part in parts:
            blob = part.inline_data
            if blob is not None and blob.data:
                return blob.data

        finish_reason = getattr(candidate, "finish_reason", None)
        raise RuntimeError(
            f"TTS response contained no audio data (finish_reason={finish_reason})"
        )

    @staticmethod
    def _pcm_to_ogg(pcm: bytes) -> bytes:
        """Encode raw PCM samples as an OGG container with the Opus codec."""
        audio = AudioSegment(
            pcm,
            sample_width=PCM_SAMPLE_WIDTH,
            frame_rate=PCM_FRAME_RATE,
            channels=PCM_CHANNELS,
        )
        ogg_io = io.BytesIO()
        audio.export(ogg_io, format="ogg", codec="libopus")
        return ogg_io.getvalue()