- Add webrtcvad dependency for real-time voice activity detection - Create audio/fade.py with fade-in/fade-out utility - Add VAD voice activation to client recording (sends audio only during speech) - Apply 200ms fade-out to TTS output to avoid abrupt audio cuts - Fix tts.py indentation error in except block
58 lines
1.7 KiB
Python
58 lines
1.7 KiB
Python
import logging

import numpy as np
from transformers import pipeline

from audio.fade import apply_fade
from config import Config
|
|
|
|
|
|
class TTSEngine:
    """Text-to-speech engine backed by a lazily created ``transformers`` pipeline.

    The pipeline is built from ``Config().TTS_MODEL`` either by an explicit
    ``initialize()`` call or automatically on the first ``synthesize()`` call.
    """

    def __init__(self):
        # None means "not loaded yet"; initialize() fills this in.
        self.tts_pipeline = None
        self.config = Config()

    @staticmethod
    def _preferred_device() -> int:
        """Return 0 (first CUDA device) when CUDA is usable, otherwise -1 (CPU)."""
        try:
            # Imported lazily so the module can be imported without torch.
            import torch
        except ImportError:
            return -1
        return 0 if torch.cuda.is_available() else -1

    def initialize(self):
        """Build the text-to-speech pipeline, preferring CUDA when available.

        If constructing the pipeline on the GPU fails for any reason, the
        failure is logged and the pipeline is rebuilt on the CPU. A failure
        on the CPU is not retried and propagates to the caller.

        The pipeline object is callable as soon as it is constructed; no
        separate start step is performed.
        """
        model_name = self.config.TTS_MODEL
        device = self._preferred_device()
        try:
            self.tts_pipeline = pipeline(
                "text-to-speech",
                model=model_name,
                device=device,
            )
        except Exception:
            if device == -1:
                # Already on CPU: an identical retry cannot succeed.
                raise
            logging.getLogger(__name__).warning(
                "TTS pipeline failed to load on device %s; falling back to CPU",
                device,
                exc_info=True,
            )
            self.tts_pipeline = pipeline(
                "text-to-speech",
                model=model_name,
                device=-1,
            )

    @staticmethod
    def _to_mono_float32(audio) -> np.ndarray:
        """Convert pipeline audio output to a mono ``float32`` NumPy array.

        Accepts a NumPy array, a tensor-like object exposing ``numpy()``
        (optionally ``detach()``/``cpu()`` as torch tensors do), or any
        sequence ``np.asarray`` understands. Leading batch dimensions are
        dropped by taking the first item, and 2-D input is averaged over
        its channel axis.
        """
        # Torch tensors on a GPU or tracking gradients cannot be converted
        # directly; detach and move to host memory first.
        if hasattr(audio, "detach"):
            audio = audio.detach()
        if hasattr(audio, "cpu"):
            audio = audio.cpu()
        if hasattr(audio, "numpy"):
            audio = audio.numpy()
        audio = np.asarray(audio)

        if audio.size == 0:
            return np.zeros(0, dtype=np.float32)

        # Batch dimension(s) - take first item until at most 2-D remains.
        while audio.ndim > 2:
            audio = audio[0]

        if audio.ndim == 2:
            # The channel axis is the shorter one: (samples, channels) mixes
            # over axis 1, (channels, samples) over axis 0. Ties keep axis 1.
            channel_axis = 0 if audio.shape[0] < audio.shape[1] else 1
            audio = audio.mean(axis=channel_axis)

        return audio.astype(np.float32)

    def synthesize(self, text: str, output_sample_rate: int = 24000) -> np.ndarray:
        """Synthesize ``text`` and return mono ``float32`` audio samples.

        Args:
            text: Text to convert to speech.
            output_sample_rate: Sample rate passed to ``apply_fade`` to size
                the fade. No resampling is performed here.

        Returns:
            The synthesized audio after ``apply_fade`` has been applied.
        """
        if self.tts_pipeline is None:
            self.initialize()

        # NOTE(review): these keyword arguments are forwarded unchanged from
        # the original code; confirm that the installed transformers
        # text-to-audio pipeline and the configured model accept
        # ``generate_kwargs`` with these keys and ``return_tensors``.
        result = self.tts_pipeline(
            text,
            generate_kwargs={"task": "tts", "language": "ru"},
            return_tensors=True,
        )

        audio = self._to_mono_float32(result["audio"])

        # Apply fade-out to avoid abrupt audio cuts
        audio = apply_fade(audio, output_sample_rate, fade_duration_ms=200)

        return audio
|