Initial commit: audio-chat with fixes

- Created AGENTS.md with architecture documentation
- Fixed race conditions and async patterns
- Added conversation history to LLM prompts
- Fixed TTS audio shape handling
- Added buffer limits and graceful shutdown
- Fixed client.py with file sending support
- Removed duplicate requirements
- Added .gitignore
2026-05-01 13:01:06 +00:00
commit 1edfd5d62f
13 changed files with 1286 additions and 0 deletions
--- a/engine/init.py
+++ b/engine/init.py
--- a/engine/llm.py
+++ b/engine/llm.py
@@ -0,0 +1,61 @@
+from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig
+from config import Config
+import torch
+
+
+class LLMEngine:
+    """Chat-style text generator backed by a Hugging Face causal language model.
+
+    The tokenizer and model named by ``Config.LLM_MODEL`` are loaded lazily:
+    either by an explicit :meth:`initialize` call or on the first
+    :meth:`generate` call.
+    """
+
+    # Default system prompt (Russian): "You are a helpful assistant. Answer in
+    # Russian, briefly and to the point."
+    DEFAULT_SYSTEM_PROMPT = "Ты полезный ассистент. Отвечай на русском языке кратко и по делу."
+
+    def __init__(self):
+        self.model = None
+        self.tokenizer = None
+        self.config = Config()
+
+    def initialize(self):
+        """Load the tokenizer and the model weights for ``Config.LLM_MODEL``.
+
+        Half precision is used when CUDA is available, full precision
+        otherwise; device placement is delegated to ``device_map="auto"``.
+        """
+        dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+
+        # NOTE(security): trust_remote_code=True executes Python code shipped
+        # with the model repository. Only point LLM_MODEL at trusted sources.
+        self.tokenizer = AutoTokenizer.from_pretrained(
+            self.config.LLM_MODEL,
+            trust_remote_code=True,
+        )
+
+        self.model = AutoModelForCausalLM.from_pretrained(
+            self.config.LLM_MODEL,
+            torch_dtype=dtype,
+            device_map="auto",
+            trust_remote_code=True,
+        )
+
+    def generate(self, user_text: str, system_prompt: str = None) -> str:
+        """Generate the assistant reply to a single user message.
+
+        Args:
+            user_text: The user's message.
+            system_prompt: Optional system instruction; when ``None`` the
+                class default prompt is used.
+
+        Returns:
+            The decoded reply with special tokens removed and surrounding
+            whitespace stripped.
+        """
+        # Check both halves: a failed initialize() may have left only one set.
+        if self.model is None or self.tokenizer is None:
+            self.initialize()
+
+        if system_prompt is None:
+            system_prompt = self.DEFAULT_SYSTEM_PROMPT
+
+        messages = [
+            {"role": "system", "content": system_prompt},
+            {"role": "user", "content": user_text},
+        ]
+
+        prompt = self.tokenizer.apply_chat_template(
+            messages,
+            tokenize=False,
+            add_generation_prompt=True,
+        )
+        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
+
+        # Sampling requires a strictly positive temperature; a zero or
+        # negative configured value means "deterministic", so decode greedily
+        # instead of letting generate() fail.
+        temperature = self.config.LLM_TEMPERATURE
+        if temperature is not None and temperature > 0:
+            decoding = {"do_sample": True, "temperature": temperature, "top_p": 0.9}
+        else:
+            decoding = {"do_sample": False}
+
+        with torch.no_grad():
+            outputs = self.model.generate(
+                **inputs,
+                max_new_tokens=self.config.LLM_MAX_TOKENS,
+                repetition_penalty=1.1,
+                **decoding,
+            )
+
+        # generate() returns prompt + continuation; keep only the new tokens.
+        prompt_length = inputs["input_ids"].shape[1]
+        reply = self.tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
+
+        return reply.strip()
--- a/engine/stt.py
+++ b/engine/stt.py
@@ -0,0 +1,49 @@
+from faster_whisper import WhisperModel
+from config import Config
+import io
+import numpy as np
+
+
+class STTEngine:
+    """Speech-to-text engine built on faster-whisper.
+
+    The model is loaded lazily: either by an explicit :meth:`initialize` call
+    or on the first :meth:`transcribe` call with non-empty audio.
+    """
+
+    def __init__(self):
+        self.model = None
+        self.config = Config()
+        self._model_size = self._resolve_model_size(self.config.STT_MODEL)
+
+    def _resolve_model_size(self, model_name: str) -> str:
+        """Extract the model size from the supported naming conventions.
+
+        ``<anything>faster-whisper-<size>`` and ``whisper-<size>`` are reduced
+        to ``<size>``; any other name is returned unchanged.
+        """
+        # Handle Systran/faster-whisper-* format
+        if "faster-whisper-" in model_name:
+            return model_name.split("faster-whisper-")[-1]
+        # Handle whisper-* format
+        if model_name.startswith("whisper-"):
+            return model_name[len("whisper-"):]
+        # Return as-is for direct model names
+        return model_name
+
+    def _load_model(self, device: str):
+        """Build a WhisperModel on *device* (float16 on CUDA, int8 otherwise)."""
+        return WhisperModel(
+            self._model_size,
+            device=device,
+            compute_type="float16" if device == "cuda" else "int8",
+            download_root=None,
+        )
+
+    def initialize(self):
+        """Load the Whisper model on the configured device.
+
+        An explicit ``Config.DEVICE`` is used as given. ``"auto"`` tries CUDA
+        first and falls back to CPU when the CUDA backend cannot be used,
+        rather than assuming a GPU is present.
+        """
+        requested = self.config.DEVICE
+        if requested != "auto":
+            self.model = self._load_model(requested)
+            return
+
+        try:
+            self.model = self._load_model("cuda")
+        except (RuntimeError, ValueError):
+            # No usable CUDA runtime/driver on this host: run on CPU instead.
+            self.model = self._load_model("cpu")
+
+    def transcribe(self, audio_bytes: bytes) -> str:
+        """Transcribe an encoded audio payload to Russian text.
+
+        Args:
+            audio_bytes: Contents of an audio file, passed to the model as an
+                in-memory file object.
+
+        Returns:
+            The segment texts joined by single spaces, or ``""`` when the
+            payload is empty or nothing was recognised.
+        """
+        if not audio_bytes:
+            # Nothing to decode; avoid loading the model or invoking the decoder.
+            return ""
+
+        if self.model is None:
+            self.initialize()
+
+        segments, _ = self.model.transcribe(
+            io.BytesIO(audio_bytes),
+            beam_size=5,
+            language="ru",
+            vad_filter=True,
+        )
+
+        # Strip each piece before joining so that segments carrying their own
+        # leading/trailing whitespace do not produce doubled spaces.
+        pieces = (segment.text.strip() for segment in segments)
+        return " ".join(piece for piece in pieces if piece)
--- a/engine/tts.py
+++ b/engine/tts.py
@@ -0,0 +1,53 @@
+from transformers import pipeline
+from config import Config
+import numpy as np
+
+
+class TTSEngine:
+    """Text-to-speech engine built on the transformers ``text-to-speech`` pipeline.
+
+    The pipeline for ``Config.TTS_MODEL`` is created lazily: either by an
+    explicit :meth:`initialize` call or on the first :meth:`synthesize` call
+    with non-blank text.
+    """
+
+    def __init__(self):
+        self.tts_pipeline = None
+        self.config = Config()
+
+    def initialize(self):
+        """Create the TTS pipeline, preferring the GPU and falling back to CPU.
+
+        If building the pipeline on the GPU fails for any reason, the failure
+        is logged and the pipeline is rebuilt on the CPU. A failure on the CPU
+        propagates to the caller.
+        """
+        import logging
+
+        import torch
+
+        if torch.cuda.is_available():
+            try:
+                self.tts_pipeline = pipeline(
+                    "text-to-speech",
+                    model=self.config.TTS_MODEL,
+                    device=0,
+                )
+                return
+            except Exception:
+                # Deliberate best-effort: any GPU-side failure degrades to CPU.
+                logging.getLogger(__name__).warning(
+                    "TTS pipeline could not be created on GPU; falling back to CPU",
+                    exc_info=True,
+                )
+
+        self.tts_pipeline = pipeline(
+            "text-to-speech",
+            model=self.config.TTS_MODEL,
+            device=-1,
+        )
+
+    @staticmethod
+    def _to_mono(audio: np.ndarray) -> np.ndarray:
+        """Reduce a batched and/or multi-channel array to a 1-D signal."""
+        # Extra leading dimensions are treated as batch: keep the first item.
+        while audio.ndim > 2:
+            audio = audio[0]
+        if audio.ndim == 2:
+            # The layout may be (channels, samples) or (samples, channels).
+            # Heuristic: the shorter axis is the channel axis. Averaging over
+            # it keeps the time axis intact in both layouts.
+            channel_axis = int(np.argmin(audio.shape))
+            audio = audio.mean(axis=channel_axis)
+        return audio
+
+    @staticmethod
+    def _resample(audio: np.ndarray, source_rate: int, target_rate: int) -> np.ndarray:
+        """Linearly resample a 1-D signal from *source_rate* to *target_rate*."""
+        target_length = int(round(audio.shape[0] * target_rate / source_rate))
+        if audio.shape[0] == 0 or target_length == 0:
+            return audio[:0]
+        source_times = np.arange(audio.shape[0]) / source_rate
+        target_times = np.arange(target_length) / target_rate
+        return np.interp(target_times, source_times, audio)
+
+    def synthesize(self, text: str, output_sample_rate: int = 24000) -> np.ndarray:
+        """Synthesize speech for *text*.
+
+        Args:
+            text: Text to speak. Blank text yields an empty array.
+            output_sample_rate: Sample rate of the returned signal. When the
+                pipeline reports a different ``sampling_rate``, the audio is
+                linearly resampled to this rate.
+
+        Returns:
+            A 1-D mono ``float32`` array.
+        """
+        if not text or not text.strip():
+            return np.zeros(0, dtype=np.float32)
+
+        if self.tts_pipeline is None:
+            self.initialize()
+
+        # NOTE(review): these call arguments are kept from the original code;
+        # confirm that the configured TTS model/pipeline accepts them.
+        result = self.tts_pipeline(
+            text,
+            generate_kwargs={"task": "tts", "language": "ru"},
+            return_tensors=True,
+        )
+
+        audio = result["audio"]
+        if hasattr(audio, "detach"):
+            # Torch tensor: detach and move to host memory before converting.
+            audio = audio.detach().cpu().numpy()
+        elif hasattr(audio, "numpy"):
+            audio = audio.numpy()
+        audio = self._to_mono(np.asarray(audio))
+
+        source_rate = result.get("sampling_rate")
+        if source_rate and output_sample_rate and source_rate != output_sample_rate:
+            audio = self._resample(audio, source_rate, output_sample_rate)
+
+        return audio.astype(np.float32)