import logging

import numpy as np
from transformers import pipeline

from audio.fade import apply_fade
from config import Config

logger = logging.getLogger(__name__)


class TTSEngine:
    """Lazily-initialised wrapper around a Hugging Face text-to-speech pipeline.

    The model name is read from ``Config().TTS_MODEL``. The pipeline itself is
    only built on the first call to :meth:`initialize` or :meth:`synthesize`.
    """

    def __init__(self):
        # Built on demand by ``initialize``; ``None`` means "not loaded yet".
        self.tts_pipeline = None
        self.config = Config()

    def initialize(self):
        """Build the text-to-speech pipeline.

        CUDA device 0 is used when ``torch`` reports CUDA as available,
        otherwise the CPU (``device=-1``). If that first attempt fails for any
        reason (including ``torch`` not being importable), the failure is
        logged and the pipeline is built once more on the CPU.

        Raises:
            Exception: whatever the CPU fallback attempt raises, if it fails
                too.
        """
        model_name = self.config.TTS_MODEL
        try:
            # Imported here so that a missing/broken torch install lands in
            # the CPU fallback below instead of failing at module import.
            import torch

            device = 0 if torch.cuda.is_available() else -1
            self.tts_pipeline = pipeline(
                "text-to-speech",
                model=model_name,
                device=device,
            )
        except Exception:
            # Deliberately broad: any failure on the preferred device gets a
            # second chance on the CPU. Errors from the retry propagate.
            logger.warning(
                "TTS pipeline initialisation failed; retrying on CPU",
                exc_info=True,
            )
            self.tts_pipeline = pipeline(
                "text-to-speech",
                model=model_name,
                device=-1,
            )

    @staticmethod
    def _to_mono_float32(audio) -> np.ndarray:
        """Convert pipeline audio output to a 1-D mono ``float32`` array.

        Accepts a torch tensor, a NumPy array, or anything ``np.asarray`` can
        convert. Leading dimensions beyond two are treated as batch dimensions
        and only the first item is kept. A 2-D array is mixed down by
        averaging over its shorter axis, which is assumed to be the channel
        axis (audio has far more samples than channels), so both
        ``(channels, samples)`` and ``(samples, channels)`` layouts work.
        """
        if hasattr(audio, "detach"):
            # Torch tensor: detach from autograd and move off the GPU first,
            # since ``.numpy()`` refuses CUDA / grad-tracking tensors.
            audio = audio.detach().cpu().numpy()
        elif hasattr(audio, "numpy"):
            audio = audio.numpy()
        else:
            audio = np.asarray(audio)

        if audio.size == 0:
            return np.zeros(0, dtype=np.float32)

        # Strip every leading batch dimension, keeping the first item.
        while audio.ndim > 2:
            audio = audio[0]

        if audio.ndim == 2:
            channel_axis = 0 if audio.shape[0] <= audio.shape[1] else 1
            audio = audio.mean(axis=channel_axis)

        return audio.astype(np.float32).reshape(-1)

    def synthesize(self, text: str, output_sample_rate: int = 24000) -> np.ndarray:
        """Synthesize speech for ``text`` and return it as mono audio.

        The pipeline is initialised on first use. Its ``"audio"`` output is
        converted to a 1-D ``float32`` array and then passed through
        ``apply_fade`` with a 200 ms fade duration.

        Args:
            text: Text to synthesize.
            output_sample_rate: Sample rate handed to ``apply_fade``.
                NOTE(review): no resampling is performed here and the
                pipeline's own reported sampling rate is not consulted -
                confirm this value matches the model's native rate.

        Returns:
            The result of ``apply_fade`` applied to the mono ``float32``
            waveform.
        """
        if self.tts_pipeline is None:
            self.initialize()

        # NOTE(review): these generation options are forwarded to the model
        # unchanged; whether "task"/"language" are accepted depends on the
        # configured TTS model - confirm against Config.TTS_MODEL.
        result = self.tts_pipeline(
            text,
            generate_kwargs={"task": "tts", "language": "ru"},
        )

        audio = self._to_mono_float32(result["audio"])

        # Fade the clip edges to avoid abrupt audio cuts.
        return apply_fade(audio, output_sample_rate, fade_duration_ms=200)