- Created AGENTS.md with architecture documentation - Fixed race conditions and async patterns - Added conversation history to LLM prompts - Fixed TTS audio shape handling - Added buffer limits and graceful shutdown - Fixed client.py with file sending support - Removed duplicate requirements - Added .gitignore
50 lines
1.5 KiB
Python
50 lines
1.5 KiB
Python
from faster_whisper import WhisperModel
|
|
from config import Config
|
|
import io
|
|
import numpy as np
|
|
|
|
|
|
class STTEngine:
    """Speech-to-text engine wrapping a lazily loaded faster-whisper model.

    The constructor only reads configuration; the model itself is created by
    ``initialize()``, which ``transcribe()`` calls on first use if needed.
    """

    def __init__(self):
        # The model is created on demand by initialize().
        self.model = None
        self.config = Config()
        self._model_size = self._resolve_model_size(self.config.STT_MODEL)

    def _resolve_model_size(self, model_name: str) -> str:
        """Extract model size from various naming conventions.

        Args:
            model_name: Configured model identifier, e.g.
                ``"Systran/faster-whisper-large-v3"``, ``"whisper-small"``
                or a bare size such as ``"medium"``.

        Returns:
            The part of the name after the recognised prefix, or the name
            unchanged when no known prefix is present.
        """
        # Handle Systran/faster-whisper-* format
        marker = "faster-whisper-"
        if marker in model_name:
            return model_name.split(marker)[-1]
        # Handle whisper-* format
        prefix = "whisper-"
        if model_name.startswith(prefix):
            return model_name[len(prefix):]
        # Return as-is for direct model names
        return model_name

    def _load_model(self, device: str):
        """Build a WhisperModel on *device* (float16 on CUDA, int8 otherwise)."""
        return WhisperModel(
            self._model_size,
            device=device,
            compute_type="float16" if device == "cuda" else "int8",
            download_root=None,
        )

    def initialize(self):
        """Load the model onto the configured device.

        When ``config.DEVICE`` is ``"auto"``, CUDA is tried first and the
        engine falls back to the CPU if the CUDA model cannot be created.
        Any other value is used as given and errors propagate unchanged.
        """
        requested = self.config.DEVICE
        if requested != "auto":
            self.model = self._load_model(requested)
            return

        try:
            self.model = self._load_model("cuda")
        except (RuntimeError, ValueError):
            # NOTE(review): an unusable CUDA setup is expected to surface as
            # RuntimeError/ValueError from the backend - confirm against the
            # installed CTranslate2 version. "auto" must not hard-require a
            # GPU, so retry on the CPU; a genuine problem (e.g. a bad model
            # name) will simply raise again from this second attempt.
            self.model = self._load_model("cpu")

    def transcribe(self, audio_bytes: bytes, language: str = "ru") -> str:
        """Transcribe an in-memory audio payload to text.

        Args:
            audio_bytes: Encoded audio content; it is handed to the model as
                a file-like object.
            language: Language code passed to the model. Defaults to
                ``"ru"``, the value that was previously hard-coded.

        Returns:
            The segment texts joined with single spaces and stripped of
            surrounding whitespace; ``""`` for empty input.
        """
        # Nothing to decode: skip loading the model and feeding the decoder
        # a zero-length buffer.
        if not audio_bytes:
            return ""

        if self.model is None:
            self.initialize()

        segments, _ = self.model.transcribe(
            io.BytesIO(audio_bytes),
            beam_size=5,
            language=language,
            vad_filter=True,
        )

        # Segment texts are kept verbatim; only the outer whitespace of the
        # joined result is trimmed.
        return " ".join(segment.text for segment in segments).strip()
|