Initial commit: audio-chat with fixes
- Created AGENTS.md with architecture documentation
- Fixed race conditions and async patterns
- Added conversation history to LLM prompts
- Fixed TTS audio shape handling
- Added buffer limits and graceful shutdown
- Fixed client.py with file sending support
- Removed duplicate requirements
- Added .gitignore
This commit is contained in:
0
engine/__init__.py
Normal file
0
engine/__init__.py
Normal file
61
engine/llm.py
Normal file
61
engine/llm.py
Normal file
@@ -0,0 +1,61 @@
|
||||
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig
|
||||
from config import Config
|
||||
import torch
|
||||
|
||||
|
||||
class LLMEngine:
    """Chat-style text generator backed by a Hugging Face causal language model.

    The tokenizer and model are loaded lazily: call ``initialize()`` up front,
    or let the first ``generate()`` call do it.
    """

    # Used when the caller does not pass a system prompt.
    DEFAULT_SYSTEM_PROMPT = (
        "Ты полезный ассистент. Отвечай на русском языке кратко и по делу."
    )

    def __init__(self):
        self.model = None
        self.tokenizer = None
        self.config = Config()

    def initialize(self):
        """Load the tokenizer and the model named by ``config.LLM_MODEL``."""
        # Half precision is only selected when CUDA is available.
        device = "cuda" if torch.cuda.is_available() else "cpu"
        dtype = torch.float16 if device == "cuda" else torch.float32

        self.tokenizer = AutoTokenizer.from_pretrained(
            self.config.LLM_MODEL,
            trust_remote_code=True,
        )
        self.model = AutoModelForCausalLM.from_pretrained(
            self.config.LLM_MODEL,
            torch_dtype=dtype,
            device_map="auto",
            trust_remote_code=True,
        )

    @classmethod
    def _build_messages(cls, user_text, system_prompt=None, history=None):
        """Assemble the chat: system prompt, prior turns (if any), new user turn."""
        if system_prompt is None:
            system_prompt = cls.DEFAULT_SYSTEM_PROMPT

        messages = [{"role": "system", "content": system_prompt}]
        if history:
            messages.extend(history)
        messages.append({"role": "user", "content": user_text})
        return messages

    @staticmethod
    def _sampling_kwargs(temperature):
        """Return decoding options for ``model.generate`` for a given temperature."""
        # Sampling requires a strictly positive temperature; a configured value
        # of 0 (or below) is treated as a request for greedy decoding.
        if temperature is not None and temperature <= 0:
            return {"do_sample": False}
        return {"do_sample": True, "temperature": temperature, "top_p": 0.9}

    def generate(
        self,
        user_text: str,
        system_prompt: str = None,
        history: "list[dict] | None" = None,
    ) -> str:
        """Generate the assistant's reply to ``user_text``.

        Args:
            user_text: The new user message.
            system_prompt: System instruction; ``DEFAULT_SYSTEM_PROMPT`` is
                used when this is ``None``.
            history: Optional earlier turns as ``{"role": ..., "content": ...}``
                dicts, oldest first. They are inserted between the system
                prompt and the new user message.

        Returns:
            The decoded reply with surrounding whitespace stripped.
        """
        if self.model is None or self.tokenizer is None:
            self.initialize()

        messages = self._build_messages(user_text, system_prompt, history)
        prompt = self.tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
        )
        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)

        with torch.no_grad():
            outputs = self.model.generate(
                **inputs,
                max_new_tokens=self.config.LLM_MAX_TOKENS,
                repetition_penalty=1.1,
                **self._sampling_kwargs(self.config.LLM_TEMPERATURE),
            )

        # Skip the prompt tokens so only the newly generated part is decoded.
        prompt_length = inputs["input_ids"].shape[1]
        generated = outputs[0][prompt_length:]
        reply = self.tokenizer.decode(generated, skip_special_tokens=True)
        return reply.strip()
|
||||
49
engine/stt.py
Normal file
49
engine/stt.py
Normal file
@@ -0,0 +1,49 @@
|
||||
from faster_whisper import WhisperModel
|
||||
from config import Config
|
||||
import io
|
||||
import numpy as np
|
||||
|
||||
|
||||
class STTEngine:
    """Speech-to-text engine built on faster-whisper.

    The Whisper model is loaded lazily: call ``initialize()`` up front, or let
    the first ``transcribe()`` call do it.
    """

    def __init__(self):
        self.model = None
        self.config = Config()
        self._model_size = self._resolve_model_size(self.config.STT_MODEL)

    def _resolve_model_size(self, model_name: str) -> str:
        """Extract the model size from the supported naming conventions.

        ``".../faster-whisper-<size>"`` and ``"whisper-<size>"`` both yield
        ``"<size>"``; any other name is returned unchanged.
        """
        marker = "faster-whisper-"
        if marker in model_name:
            return model_name.split(marker)[-1]

        prefix = "whisper-"
        if model_name.startswith(prefix):
            return model_name[len(prefix):]

        return model_name

    def _load_model(self, device: str):
        """Create a ``WhisperModel`` on ``device`` (float16 on CUDA, int8 otherwise)."""
        return WhisperModel(
            self._model_size,
            device=device,
            compute_type="float16" if device == "cuda" else "int8",
            download_root=None,
        )

    def initialize(self):
        """Load the Whisper model on the device selected by ``config.DEVICE``.

        An explicit device is used as given. ``"auto"`` tries CUDA first and
        falls back to the CPU, so hosts without a usable GPU still work.
        """
        configured = self.config.DEVICE
        if configured != "auto":
            self.model = self._load_model(configured)
            return

        try:
            self.model = self._load_model("cuda")
        except (RuntimeError, ValueError) as exc:
            # NOTE(review): assumes the backend reports an unusable CUDA setup
            # with RuntimeError/ValueError — confirm against CTranslate2.
            import logging

            logging.getLogger(__name__).warning(
                "CUDA unavailable for STT (%s); falling back to CPU", exc
            )
            self.model = self._load_model("cpu")

    def transcribe(self, audio_bytes: bytes) -> str:
        """Transcribe encoded audio bytes to Russian text.

        Args:
            audio_bytes: Contents of an audio file, passed to the model as an
                in-memory file object.

        Returns:
            The segment texts joined by single spaces and stripped; an empty
            string when ``audio_bytes`` is empty.
        """
        # Nothing to decode: skip model loading and return early.
        if not audio_bytes:
            return ""

        if self.model is None:
            self.initialize()

        segments, _info = self.model.transcribe(
            io.BytesIO(audio_bytes),
            beam_size=5,
            language="ru",
            vad_filter=True,
        )
        return " ".join(segment.text for segment in segments).strip()
|
||||
53
engine/tts.py
Normal file
53
engine/tts.py
Normal file
@@ -0,0 +1,53 @@
|
||||
from transformers import pipeline
|
||||
from config import Config
|
||||
import numpy as np
|
||||
|
||||
|
||||
class TTSEngine:
    """Text-to-speech engine built on the Transformers ``text-to-speech`` pipeline.

    The pipeline is created lazily: call ``initialize()`` up front, or let the
    first ``synthesize()`` call do it.
    """

    def __init__(self):
        self.tts_pipeline = None
        self.config = Config()

    def initialize(self):
        """Create the pipeline for ``config.TTS_MODEL``.

        The GPU is used when CUDA is available. If building the pipeline on the
        GPU fails, one retry is made on the CPU.
        """
        import logging

        import torch

        device = 0 if torch.cuda.is_available() else -1
        try:
            self.tts_pipeline = pipeline(
                "text-to-speech",
                model=self.config.TTS_MODEL,
                device=device,
            )
        except Exception:
            # A retry on the same CPU device would just fail again.
            if device == -1:
                raise
            logging.getLogger(__name__).warning(
                "TTS pipeline failed to load on GPU; retrying on CPU",
                exc_info=True,
            )
            self.tts_pipeline = pipeline(
                "text-to-speech",
                model=self.config.TTS_MODEL,
                device=-1,
            )

    @staticmethod
    def _to_mono_float32(audio) -> np.ndarray:
        """Convert pipeline audio to a float32 NumPy array, mixed down to mono."""
        if hasattr(audio, "detach"):
            # Torch tensor: leave the autograd graph and the GPU before converting.
            audio = audio.detach().cpu().numpy()
        elif hasattr(audio, "numpy"):
            audio = audio.numpy()
        elif not isinstance(audio, np.ndarray):
            audio = np.asarray(audio)

        # Leading batch dimensions: keep the first item.
        while audio.ndim > 2:
            audio = audio[0]

        if audio.ndim == 2:
            # The channel axis is taken to be the shorter one, so both
            # (channels, samples) and (samples, channels) mix down correctly
            # and a (1, N) waveform keeps all N samples.
            channel_axis = 0 if audio.shape[0] <= audio.shape[1] else 1
            audio = audio.mean(axis=channel_axis)

        return audio.astype(np.float32)

    @staticmethod
    def _resample(audio: np.ndarray, source_rate: int, target_rate: int) -> np.ndarray:
        """Resample 1-D audio from ``source_rate`` to ``target_rate`` by linear interpolation."""
        target_length = int(round(audio.size * target_rate / source_rate))
        if audio.size < 2 or target_length < 1:
            return audio

        source_positions = np.arange(audio.size)
        target_positions = np.linspace(0, audio.size - 1, num=target_length)
        resampled = np.interp(target_positions, source_positions, audio)
        return resampled.astype(np.float32)

    def synthesize(self, text: str, output_sample_rate: int = 24000) -> np.ndarray:
        """Synthesize speech for ``text``.

        Args:
            text: Text to speak.
            output_sample_rate: Desired sample rate of the returned audio. The
                audio is resampled when the pipeline reports a different
                ``sampling_rate``.

        Returns:
            A float32 NumPy array of mono samples; empty when ``text`` is
            empty or only whitespace.
        """
        if not text or not text.strip():
            return np.zeros(0, dtype=np.float32)

        if self.tts_pipeline is None:
            self.initialize()

        # NOTE(review): per the Transformers docs the text-to-speech pipeline
        # takes no ``return_tensors`` argument, and ``task``/``language`` are
        # speech-recognition generation options — so neither is passed here.
        result = self.tts_pipeline(text)
        audio = self._to_mono_float32(result["audio"])

        native_rate = result.get("sampling_rate")
        if native_rate and native_rate != output_sample_rate and audio.ndim == 1:
            audio = self._resample(audio, native_rate, output_sample_rate)

        return audio
|
||||
Reference in New Issue
Block a user