Files
audio-chat/engine/stt.py
noturum 1edfd5d62f Initial commit: audio-chat with fixes
- Created AGENTS.md with architecture documentation
- Fixed race conditions and async patterns
- Added conversation history to LLM prompts
- Fixed TTS audio shape handling
- Added buffer limits and graceful shutdown
- Fixed client.py with file sending support
- Removed duplicate requirements
- Added .gitignore
2026-05-01 13:01:06 +00:00

50 lines
1.5 KiB
Python

from faster_whisper import WhisperModel
from config import Config
import io
import numpy as np
class STTEngine:
    """Speech-to-text engine wrapping a lazily created ``WhisperModel``.

    Construction only reads configuration; the model itself is built by
    ``initialize()``, which ``transcribe()`` calls on first use when the
    model has not been created yet.
    """

    def __init__(self):
        # None means "model not loaded yet"; initialize() fills it in.
        self.model = None
        self.config = Config()
        self._model_size = self._resolve_model_size(self.config.STT_MODEL)

    def _resolve_model_size(self, model_name: str) -> str:
        """Extract the model size from the supported naming conventions.

        * ``"Systran/faster-whisper-large-v3"`` -> ``"large-v3"``
          (everything after the last ``"faster-whisper-"``)
        * ``"whisper-small"`` -> ``"small"``
        * any other name is returned unchanged
        """
        marker = "faster-whisper-"
        if marker in model_name:
            return model_name.split(marker)[-1]

        prefix = "whisper-"
        if model_name.startswith(prefix):
            return model_name[len(prefix):]

        return model_name

    def _load_model(self, device: str):
        """Build a ``WhisperModel`` on *device* (float16 on CUDA, int8 otherwise)."""
        return WhisperModel(
            self._model_size,
            device=device,
            compute_type="float16" if device == "cuda" else "int8",
            download_root=None,
        )

    def initialize(self) -> None:
        """Create the model on the device named by ``config.DEVICE``.

        An explicit device is used as given. ``"auto"`` prefers CUDA and
        falls back to CPU when the CUDA model cannot be created, instead of
        failing outright on hosts without a usable GPU.

        Raises:
            Whatever ``WhisperModel`` raises for an explicit device, or for
            the CPU fallback when ``"auto"`` is configured.
        """
        requested = self.config.DEVICE
        if requested != "auto":
            self.model = self._load_model(requested)
            return

        try:
            self.model = self._load_model("cuda")
        except (RuntimeError, ValueError):
            # NOTE(review): these are the exception types the backend is
            # expected to raise when CUDA is missing or unusable - confirm
            # against the installed faster-whisper / CTranslate2 version.
            self.model = self._load_model("cpu")

    def transcribe(self, audio_bytes: bytes, language: str = "ru") -> str:
        """Transcribe an encoded audio payload to plain text.

        Args:
            audio_bytes: Raw bytes of an audio file; wrapped in a
                ``BytesIO`` and handed to the model.
            language: Language code forwarded to the model. Defaults to
                ``"ru"``, the value that used to be hard-coded.

        Returns:
            The segment texts, each stripped of surrounding whitespace and
            joined with single spaces. ``""`` for an empty payload or when
            no non-blank segment is produced.
        """
        # Nothing to decode: skip model loading and decoding entirely.
        if not audio_bytes:
            return ""

        if self.model is None:
            self.initialize()

        segments, _ = self.model.transcribe(
            io.BytesIO(audio_bytes),
            beam_size=5,
            language=language,
            vad_filter=True,
        )

        # Segment texts may carry their own leading/trailing whitespace;
        # strip each one so the joined result has no doubled spaces.
        pieces = (segment.text.strip() for segment in segments)
        return " ".join(piece for piece in pieces if piece)