import io
import logging

import numpy as np
from faster_whisper import WhisperModel

from config import Config

logger = logging.getLogger(__name__)


class STTEngine:
    """Speech-to-text wrapper around a lazily loaded ``WhisperModel``.

    The model is not created in ``__init__``; it is built on the first call
    to :meth:`transcribe` (or by calling :meth:`initialize` explicitly).
    """

    def __init__(self):
        # Stays None until initialize() succeeds.
        self.model = None
        self.config = Config()
        self._model_size = self._resolve_model_size(self.config.STT_MODEL)

    def _resolve_model_size(self, model_name: str) -> str:
        """Extract the model size from various naming conventions.

        Args:
            model_name: Configured model identifier.

        Returns:
            The text after the last ``"faster-whisper-"`` marker if the
            marker occurs anywhere in the name; otherwise the text after a
            leading ``"whisper-"`` prefix; otherwise ``model_name`` unchanged.
        """
        # Handle the Systran/faster-whisper-* format (anything before the
        # last marker, such as an organisation prefix, is discarded).
        _, marker, size = model_name.rpartition("faster-whisper-")
        if marker:
            return size

        # Handle the whisper-* format.
        if model_name.startswith("whisper-"):
            return model_name[len("whisper-"):]

        # Direct model names are passed through untouched.
        return model_name

    def _load_model(self, device: str) -> WhisperModel:
        """Build a ``WhisperModel`` for ``device``.

        ``float16`` is requested for ``"cuda"`` and ``int8`` for every other
        device value.
        """
        return WhisperModel(
            self._model_size,
            device=device,
            compute_type="float16" if device == "cuda" else "int8",
            download_root=None,
        )

    def initialize(self):
        """Load the model onto the configured device.

        ``config.DEVICE == "auto"`` prefers ``"cuda"``. If building the CUDA
        model fails in auto mode, the model is loaded on ``"cpu"`` instead so
        that "auto" also works on hosts without a usable GPU. An explicitly
        configured device is never overridden: its errors propagate.

        Raises:
            Whatever ``WhisperModel`` raises for an explicitly configured
            device, or for the CPU retry in auto mode.
        """
        requested = self.config.DEVICE
        auto_select = requested == "auto"
        device = "cuda" if auto_select else requested

        try:
            self.model = self._load_model(device)
        except (RuntimeError, ValueError) as exc:
            # NOTE(review): an unusable CUDA backend is expected to surface
            # as RuntimeError or ValueError from the underlying runtime --
            # confirm against the installed faster-whisper / CTranslate2.
            if not auto_select:
                raise
            logger.warning(
                "Loading the model on CUDA failed (%s); falling back to CPU.",
                exc,
            )
            self.model = self._load_model("cpu")

    def transcribe(
        self,
        audio_bytes: bytes,
        *,
        language: str = "ru",
        beam_size: int = 5,
    ) -> str:
        """Transcribe audio content to text.

        Args:
            audio_bytes: Audio content; it is wrapped in a ``BytesIO`` and
                passed to ``WhisperModel.transcribe``. Presumably a complete
                encoded audio file -- confirm against callers.
            language: Language code forwarded to the model (default ``"ru"``).
            beam_size: Beam size forwarded to the model (default ``5``).

        Returns:
            The segment texts joined with single spaces, with leading and
            trailing whitespace stripped. Empty input yields ``""`` without
            loading the model.
        """
        # Nothing to decode: skip both the model load and the decoder.
        if not audio_bytes:
            return ""

        if self.model is None:
            self.initialize()

        segments, _info = self.model.transcribe(
            io.BytesIO(audio_bytes),
            beam_size=beam_size,
            language=language,
            vad_filter=True,
        )

        # The segments are consumed once, in order, by the join.
        return " ".join(segment.text for segment in segments).strip()