added TTS agent for voice messages

This commit is contained in:
shinrei
2025-07-01 19:50:06 +00:00
parent 955550e3bf
commit 81875620ff
4 changed files with 63 additions and 1 deletions

View File

@@ -0,0 +1,32 @@
import io
import json
from google import genai
from google.genai import types
from pydub import AudioSegment
from ..content_configs import TTS_CONTENT_CONFIG
class TTSAgent:
    """Text-to-speech agent that turns text into OGG/Opus audio for voice messages.

    The text is sent to a Gemini TTS model; the inline audio bytes of the
    response are treated as raw PCM and re-encoded to OGG/Opus with pydub.
    """

    # Model used for speech synthesis.
    MODEL = "gemini-2.5-flash-preview-tts"

    # Layout assumed for the raw PCM returned by the model:
    # 2-byte (16-bit) samples, 24 kHz, single channel.
    SAMPLE_WIDTH = 2
    FRAME_RATE = 24000
    CHANNELS = 1

    def __init__(self, api_key: str) -> None:
        """Create the asynchronous Gemini client.

        Args:
            api_key: API key passed to ``genai.Client``.
        """
        # NOTE: this code is repeated, which is ugly.
        self.client = genai.Client(api_key=api_key).aio

    async def generate(self, text: str) -> bytes:
        """Synthesise *text* and return it as OGG/Opus encoded bytes.

        Args:
            text: The text to be spoken.

        Returns:
            The encoded audio as ``bytes`` (OGG container, libopus codec).

        Raises:
            ValueError: If the model response carries no audio data.
        """
        # Local import keeps this block self-contained.
        import asyncio

        response = await self.client.models.generate_content(
            model=self.MODEL,
            contents=text,
            config=TTS_CONTENT_CONFIG,
        )
        pcm = self._extract_pcm(response)
        # pydub's export is synchronous (it runs an external encoder), so it
        # is executed in a worker thread to keep the event loop responsive.
        return await asyncio.to_thread(self._pcm_to_ogg, pcm)

    @staticmethod
    def _extract_pcm(response) -> bytes:
        """Return the inline audio bytes of the first part of the first candidate."""
        try:
            data = response.candidates[0].content.parts[0].inline_data.data
        except (AttributeError, IndexError, TypeError) as err:
            # Any link of the chain may be missing or None when the model
            # returned no audio.
            raise ValueError("TTS response contains no audio data") from err
        if not data:
            raise ValueError("TTS response contains no audio data")
        return data

    @classmethod
    def _pcm_to_ogg(cls, pcm: bytes) -> bytes:
        """Encode raw PCM bytes as OGG using the libopus codec."""
        audio = AudioSegment(
            pcm,
            sample_width=cls.SAMPLE_WIDTH,
            frame_rate=cls.FRAME_RATE,
            channels=cls.CHANNELS,
        )
        # Export as .ogg with the opus codec.
        ogg_io = io.BytesIO()
        audio.export(ogg_io, format="ogg", codec="libopus")
        return ogg_io.getvalue()

View File

@@ -2,7 +2,7 @@
from google.genai import types
from .structures import OutputMessage
CONTENT_CONFIG = types.GenerateContentConfig(
MAIN_CONTENT_CONFIG = types.GenerateContentConfig(
system_instruction="meow meow meow", # надо где-то промпт хранить, в бд наверное хезе
thinking_config=types.ThinkingConfig(thinking_budget=0),
response_mime_type="application/json",
@@ -15,3 +15,21 @@ CONTENT_CONFIG = types.GenerateContentConfig(
for category in types.HarmBlockThreshold
]
)
# Generation config for text-to-speech requests: audio-only output spoken
# with the prebuilt 'Kore' voice, with safety filtering switched off for
# the harm categories listed below.
TTS_CONTENT_CONFIG = types.GenerateContentConfig(
    response_modalities=["AUDIO"],
    speech_config=types.SpeechConfig(
        voice_config=types.VoiceConfig(
            prebuilt_voice_config=types.PrebuiltVoiceConfig(
                voice_name='Kore',
            )
        )
    ),
    safety_settings=[
        types.SafetySetting(
            category=category,
            threshold=types.HarmBlockThreshold.OFF,
        )
        # The loop must run over harm *categories*. Iterating
        # HarmBlockThreshold (as before) fed threshold members into the
        # `category` field, which are not valid category values.
        for category in (
            types.HarmCategory.HARM_CATEGORY_HARASSMENT,
            types.HarmCategory.HARM_CATEGORY_HATE_SPEECH,
            types.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
            types.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
        )
    ],
)