46 lines
1.3 KiB
Python
46 lines
1.3 KiB
Python
import io
|
|
|
|
from google import genai
|
|
from google.genai import types
|
|
from pydub import AudioSegment
|
|
|
|
from ..constants import SAFETY_SETTINGS
|
|
|
|
TTS_MODEL = "gemini-2.5-flash-preview-tts"
|
|
|
|
|
|
class TTSAgent:
    """Generate speech for a piece of text and return it as OGG/Opus bytes.

    The agent sends the text to the model named by ``TTS_MODEL`` with an
    audio-only response modality, treats the returned inline data as raw PCM
    and re-encodes it to OGG (libopus) through pydub.
    """

    # Layout used to interpret the raw audio bytes returned by the model:
    # 2 bytes per sample, 24000 frames per second, a single channel.
    _SAMPLE_WIDTH = 2
    _FRAME_RATE = 24000
    _CHANNELS = 1

    def __init__(
        self, client: genai.client.AsyncClient, voice_name: str = "Kore"
    ) -> None:
        """Store the client and build the reusable generation config.

        Args:
            client: Async client used to call ``models.generate_content``.
            voice_name: Name of the prebuilt voice to request. Defaults to
                ``"Kore"``.
        """
        self.client = client

        self.content_config = types.GenerateContentConfig(
            response_modalities=[types.Modality.AUDIO],
            speech_config=types.SpeechConfig(
                voice_config=types.VoiceConfig(
                    prebuilt_voice_config=types.PrebuiltVoiceConfig(
                        voice_name=voice_name,
                    )
                )
            ),
            # NOTE(review): safety settings are currently not applied; the
            # line below is disabled - confirm whether that is intended.
            # safety_settings=SAFETY_SETTINGS,
        )

    async def generate(self, text: str) -> bytes:
        """Synthesize ``text`` and return the audio encoded as OGG/Opus.

        Args:
            text: The text to convert to speech.

        Returns:
            The encoded OGG (libopus) audio as bytes.

        Raises:
            RuntimeError: If the model response contains no inline audio data.
        """
        # Imported locally so this block does not depend on a file-level
        # import being added elsewhere.
        import asyncio

        response = await self.client.models.generate_content(
            model=TTS_MODEL, contents=text, config=self.content_config
        )

        pcm = self._extract_audio(response)

        # The export shells out to an encoder and blocks until it finishes;
        # run it in a worker thread so the event loop stays responsive.
        return await asyncio.to_thread(self._encode_ogg_opus, pcm)

    @staticmethod
    def _extract_audio(response) -> bytes:
        """Return the first non-empty inline data payload found in ``response``."""
        for candidate in response.candidates or ():
            content = candidate.content
            parts = content.parts if content is not None else None
            for part in parts or ():
                inline = part.inline_data
                if inline is not None and inline.data:
                    return inline.data
        raise RuntimeError("TTS response contained no audio data")

    @classmethod
    def _encode_ogg_opus(cls, pcm: bytes) -> bytes:
        """Wrap raw audio bytes in an AudioSegment and export them as OGG/Opus."""
        audio = AudioSegment(
            pcm,
            sample_width=cls._SAMPLE_WIDTH,
            frame_rate=cls._FRAME_RATE,
            channels=cls._CHANNELS,
        )

        ogg_io = io.BytesIO()
        audio.export(ogg_io, format="ogg", codec="libopus")
        return ogg_io.getvalue()
|