added TTS agent for voice messages

2025-07-01 19:50:06 +00:00
parent 955550e3bf
commit 81875620ff
4 changed files with 63 additions and 1 deletions
--- a/src/bot/modules/solaris/agents/tts.py
+++ b/src/bot/modules/solaris/agents/tts.py
@@ -0,0 +1,32 @@
+import asyncio
+import io
+import json
+
+from google import genai
+from google.genai import types
+from pydub import AudioSegment
+
+from ..content_configs import TTS_CONTENT_CONFIG
+
+class TTSAgent:
+    def __init__(self, api_key: str) -> None:
+        # код повторяется некрасиво
+        self.client = genai.Client(api_key=api_key).aio
+    async def generate(self, text: str):
+        response = await self.client.models.generate_content(
+            model="gemini-2.5-flash-preview-tts",
+            contents=text,
+            config=TTS_CONTENT_CONFIG
+        )
+        data = response.candidates[0].content.parts[0].inline_data.data
+        pcm_io = io.BytesIO(data)
+        audio = AudioSegment(
+            pcm_io.read(),
+            sample_width=2,
+            frame_rate=24000,
+            channels=1
+        )
+
+        # Экспортируем как .ogg с кодеком opus
+        ogg_io = io.BytesIO()
+        audio.export(ogg_io, format="ogg", codec="libopus")
+        ogg_bytes = ogg_io.getvalue()
+        return ogg_bytes
+
--- a/src/bot/modules/solaris/content_configs.py
+++ b/src/bot/modules/solaris/content_configs.py
@@ -2,7 +2,7 @@
 from google.genai import types 
 from .structures import OutputMessage

-CONTENT_CONFIG = types.GenerateContentConfig(
+MAIN_CONTENT_CONFIG = types.GenerateContentConfig(
    system_instruction="meow meow meow",  # надо где-то промпт хранить, в бд наверное хезе
    thinking_config=types.ThinkingConfig(thinking_budget=0),
    response_mime_type="application/json",
@@ -15,3 +15,21 @@ CONTENT_CONFIG = types.GenerateContentConfig(
        for category in types.HarmBlockThreshold
    ]
 )
+
+TTS_CONTENT_CONFIG = types.GenerateContentConfig(
+    response_modalities=["AUDIO"],
+    speech_config=types.SpeechConfig(
+        voice_config=types.VoiceConfig(
+            prebuilt_voice_config=types.PrebuiltVoiceConfig(
+                voice_name='Kore',
+            )
+        )
+    ),
+    safety_settings=[
+        types.SafetySetting(
+            category=category,
+            threshold=types.HarmBlockThreshold.OFF
+        )
+        for category in types.HarmBlockThreshold
+    ]
+)