Initial commit: audio-chat with fixes
- Created AGENTS.md with architecture documentation
- Fixed race conditions and async patterns
- Added conversation history to LLM prompts
- Fixed TTS audio shape handling
- Added buffer limits and graceful shutdown
- Fixed client.py with file sending support
- Removed duplicate requirements
- Added .gitignore
This commit is contained in:
53
engine/tts.py
Normal file
53
engine/tts.py
Normal file
@@ -0,0 +1,53 @@
|
||||
from transformers import pipeline
|
||||
from config import Config
|
||||
import numpy as np
|
||||
|
||||
|
||||
class TTSEngine:
    """Text-to-speech engine built on a ``text-to-speech`` pipeline.

    The pipeline is created lazily: either by an explicit call to
    :meth:`initialize` or on the first call to :meth:`synthesize`.
    """

    def __init__(self):
        # The pipeline is built on demand so constructing the engine is cheap.
        self.tts_pipeline = None
        self.config = Config()

    def initialize(self):
        """Build the pipeline for ``self.config.TTS_MODEL``.

        The first CUDA device is used when torch reports one available;
        otherwise the CPU (``device=-1``) is used. If that first attempt
        fails for any reason, the build is retried once on the CPU. An
        exception raised by the CPU retry propagates to the caller.
        """
        import logging

        try:
            # Imported here so a missing/broken torch install also falls
            # through to the CPU retry, as the original inline import did.
            import torch

            device = 0 if torch.cuda.is_available() else -1
            self.tts_pipeline = pipeline(
                "text-to-speech",
                model=self.config.TTS_MODEL,
                device=device,
            )
        except Exception:
            # Deliberate best-effort fallback, but record why it happened
            # instead of swallowing the error silently.
            logging.getLogger(__name__).warning(
                "TTS pipeline initialization failed; retrying on CPU",
                exc_info=True,
            )
            self.tts_pipeline = pipeline(
                "text-to-speech",
                model=self.config.TTS_MODEL,
                device=-1,
            )

    @staticmethod
    def _to_mono(audio: np.ndarray) -> np.ndarray:
        """Reduce an array with more than one dimension to 1-D mono."""
        # Drop every leading batch dimension, keeping the first item.
        while audio.ndim > 2:
            audio = audio[0]
        if audio.ndim == 2:
            if audio.size == 0:
                return audio.reshape(-1)
            # The channel axis is taken to be the shorter one, so both
            # (channels, samples) and (samples, channels) layouts mix
            # correctly; a (1, N) array becomes N samples, not one value.
            channel_axis = 0 if audio.shape[0] <= audio.shape[1] else 1
            audio = audio.mean(axis=channel_axis)
        return audio

    def synthesize(self, text: str, output_sample_rate: int = 24000) -> np.ndarray:
        """Synthesize ``text`` and return mono ``float32`` samples.

        Args:
            text: Text passed to the pipeline.
            output_sample_rate: Currently unused; no resampling is
                performed here, so the samples are returned as produced
                by the pipeline.

        Returns:
            A ``float32`` NumPy array; batch dimensions are dropped and
            2-D audio is averaged down to a single channel.
        """
        if self.tts_pipeline is None:
            self.initialize()

        # NOTE(review): "task"/"language" generate kwargs and
        # ``return_tensors`` are kept exactly as before; confirm that the
        # configured TTS model/pipeline actually accepts them.
        result = self.tts_pipeline(
            text,
            generate_kwargs={"task": "tts", "language": "ru"},
            return_tensors=True,
        )

        audio = result["audio"]
        if hasattr(audio, "detach"):
            # Tensor-like: move off any accelerator and out of autograd
            # before converting, otherwise ``.numpy()`` raises.
            audio = audio.detach().cpu().numpy()
        elif hasattr(audio, "numpy"):
            audio = audio.numpy()
        audio = np.asarray(audio)

        audio = self._to_mono(audio)
        return audio.astype(np.float32)
|
||||
Reference in New Issue
Block a user