Initial commit: audio-chat with fixes

- Created AGENTS.md with architecture documentation
- Fixed race conditions and async patterns
- Added conversation history to LLM prompts
- Fixed TTS audio shape handling
- Added buffer limits and graceful shutdown
- Fixed client.py with file sending support
- Removed duplicate requirements
- Added .gitignore
This commit is contained in:
2026-05-01 13:01:06 +00:00
commit 1edfd5d62f
13 changed files with 1286 additions and 0 deletions

53
engine/tts.py Normal file
View File

@@ -0,0 +1,53 @@
import logging

import numpy as np
from transformers import pipeline

from config import Config
class TTSEngine:
    """Text-to-speech engine backed by a ``transformers`` pipeline.

    The pipeline is created lazily: call :meth:`initialize` explicitly, or let
    the first :meth:`synthesize` call do it.
    """

    def __init__(self) -> None:
        """Create an engine with no model loaded yet."""
        # ``None`` means "not loaded"; populated by initialize().
        self.tts_pipeline = None
        self.config = Config()

    def initialize(self) -> None:
        """Build the ``text-to-speech`` pipeline for ``config.TTS_MODEL``.

        The first attempt uses CUDA device 0 when torch reports that CUDA is
        available, otherwise the CPU. If that attempt fails for any reason,
        the failure is logged and the pipeline is rebuilt on the CPU.

        Raises:
            Exception: whatever the CPU fallback raises if it fails as well.
        """
        model_name = self.config.TTS_MODEL
        try:
            # Imported lazily (as the original did) so that a missing or
            # broken torch install falls through to the CPU attempt below.
            import torch

            device = 0 if torch.cuda.is_available() else -1
            self.tts_pipeline = pipeline(
                "text-to-speech",
                model=model_name,
                device=device,
            )
        except Exception:
            # Best-effort fallback: keep going on the CPU, but leave a trace
            # of why the first attempt failed.
            logging.getLogger(__name__).warning(
                "TTS pipeline initialization failed; retrying on CPU",
                exc_info=True,
            )
            self.tts_pipeline = pipeline(
                "text-to-speech",
                model=model_name,
                device=-1,
            )
        # A pipeline is a plain callable; it has no start() step.

    def synthesize(self, text: str, output_sample_rate: int = 24000) -> np.ndarray:
        """Synthesize ``text`` and return mono ``float32`` audio samples.

        Args:
            text: Text to speak. Empty or whitespace-only text yields an
                empty array without loading or calling the model.
            output_sample_rate: Accepted for interface compatibility. It is
                not applied here: no resampling is performed, so the samples
                are returned at whatever rate the pipeline produced them.

        Returns:
            A 1-D ``numpy.float32`` array of audio samples.
        """
        if not text or not text.strip():
            return np.zeros(0, dtype=np.float32)

        if self.tts_pipeline is None:
            self.initialize()

        # NOTE(review): these call arguments are kept exactly as they were.
        # "task"/"language" and return_tensors may not be accepted by every
        # text-to-speech model or transformers version -- confirm against the
        # configured TTS_MODEL.
        result = self.tts_pipeline(
            text,
            generate_kwargs={"task": "tts", "language": "ru"},
            return_tensors=True,
        )
        return self._to_mono_float32(result["audio"])

    @staticmethod
    def _to_mono_float32(audio) -> np.ndarray:
        """Convert pipeline audio (tensor, array or sequence) to 1-D float32."""
        if isinstance(audio, np.ndarray):
            pass
        elif hasattr(audio, "detach"):
            # torch tensor: drop autograd, move off the GPU and widen half
            # precision types before handing the data to NumPy.
            audio = audio.detach().cpu().float().numpy()
        elif hasattr(audio, "numpy"):
            audio = audio.numpy()
        else:
            audio = np.asarray(audio)

        if audio.size == 0:
            return np.zeros(0, dtype=np.float32)

        # Strip leading batch dimensions, keeping the first item each time.
        while audio.ndim > 2:
            audio = audio[0]

        if audio.ndim == 2:
            # Mix down to mono. The channel axis is taken to be the shorter
            # one, so both (channels, samples) -- including a (1, samples)
            # batch of one -- and (samples, channels) layouts keep the full
            # waveform instead of being collapsed along the time axis.
            channel_axis = 0 if audio.shape[0] < audio.shape[1] else 1
            audio = audio.mean(axis=channel_axis)

        return audio.astype(np.float32)