Initial commit: audio-chat with fixes

- Created AGENTS.md with architecture documentation
- Fixed race conditions and async patterns
- Added conversation history to LLM prompts
- Fixed TTS audio shape handling
- Added buffer limits and graceful shutdown
- Fixed client.py with file sending support
- Removed duplicate requirements
- Added .gitignore
2026-05-01 13:01:06 +00:00
commit 1edfd5d62f
13 changed files with 1286 additions and 0 deletions
--- a/engine/tts.py
+++ b/engine/tts.py
@@ -0,0 +1,53 @@
+from transformers import pipeline
+from config import Config
+import numpy as np
+
+
+class TTSEngine:
+    def __init__(self):
+        self.tts_pipeline = None
+        self.config = Config()
+
+    def initialize(self):
+        try:
+            self.tts_pipeline = pipeline(
+                "text-to-speech",
+                self.config.TTS_MODEL,
+                device=0 if __import__("torch").cuda.is_available() else -1,
+            )
+        except Exception:
+       self.tts_pipeline = pipeline(
+            "text-to-speech",
+            model=self._tts_model,
+            device=-1,
+        )
+        self.tts_pipeline.start()
+
+    def synthesize(self, text: str, output_sample_rate: int = 24000) -> np.ndarray:
+        if not self.tts_pipeline:
+            self.initialize()
+
+        result = self.tts_pipeline(
+            text,
+            generate_kwargs={"task": "tts", "language": "ru"},
+            return_tensors=True,
+        )
+
+        audio = result["audio"]
+        # Convert torch tensor to numpy if needed
+        if hasattr(audio, 'numpy'):
+            audio = audio.numpy()
+        elif not isinstance(audio, np.ndarray):
+            audio = np.asarray(audio)
+        
+        # Handle multi-dimensional arrays (batch or stereo)
+        if audio.ndim > 2:
+            # Batch dimension - take first item
+            audio = audio[0]
+        if audio.ndim == 2:
+            # Stereo - mix to mono
+            audio = audio.mean(axis=1)
+        
+        audio = audio.astype(np.float32)
+
+        return audio