Initial commit: audio-chat with fixes

- Created AGENTS.md with architecture documentation
- Fixed race conditions and async patterns
- Added conversation history to LLM prompts
- Fixed TTS audio shape handling
- Added buffer limits and graceful shutdown
- Fixed client.py with file sending support
- Removed duplicate requirements
- Added .gitignore
This commit is contained in:
2026-05-01 13:01:06 +00:00
commit 1edfd5d62f
13 changed files with 1286 additions and 0 deletions

12
.gitignore vendored Normal file
View File

@@ -0,0 +1,12 @@
__pycache__/
*.pyc
*.pyo
.env
.venv/
venv/
*.egg-info/
dist/
build/
*.egg
response_*.wav
*.log

51
AGENTS.md Normal file
View File

@@ -0,0 +1,51 @@
# Audio Chat
## Running
```bash
uvicorn main:app --host 0.0.0.0 --port 8000
```
Client (for testing):
```bash
python client.py
```
## Architecture
Single-process FastAPI server. On each WebSocket connection a new `AudioSession` is created with three engines:
| Module | Purpose | Model (default) |
|--------|---------|-----------------|
| `engine/stt.py` | Speech-to-text | Systran/faster-whisper-large-v3 |
| `engine/llm.py` | LLM response generation | Qwen/Qwen2.5-7B-Instruct |
| `engine/tts.py` | Text-to-speech | facebook/mms-tts-rus |
Models are loaded lazily on first use if `initialize()` was not called. STT always runs in Russian (`language="ru"` with VAD).
## WebSocket Protocol
| Direction | Format | Meaning |
|-----------|--------|---------|
| Client → Server | `b"A" + PCM data` | Send audio chunk |
| Client → Server | `b"R"` | Reset conversation |
| Server → Client | `b"O" + WAV bytes` | LLM response as audio |
| Server → Client | `"TEXT:<transcription>"` | Recognized speech |
Audio format: 16-bit PCM mono, 16 kHz input / 24 kHz output.
## Configuration
All settings via `.env` (loaded by `config.py`). Key vars:
- `DEVICE``"cuda"` or `"cpu"` (default `"auto"`)
- `AUDIO_BUFFER_SECONDS` / `CHUNK_SIZE` — silence detection thresholds
- `LLM_MAX_TOKENS` / `LLM_TEMPERATURE` — generation parameters
## Gotchas
- No test suite or linting configured.
- Models download on first use; ensure network access to HuggingFace.
- `AudioSession` holds conversation history (last 6 turns) in memory — each WebSocket reconnect resets it.
- Thread pool executor is fixed at 2 workers; concurrent heavy requests will queue.
- TTS pipeline falls back to CPU (`device=-1`) if GPU initialization fails silently.

0
audio/__init__.py Normal file
View File

50
audio/stream.py Normal file
View File

@@ -0,0 +1,50 @@
import threading
import time
from config import Config
class AudioStreamBuffer:
def __init__(self):
self.config = Config()
self.buffer = b""
self.lock = threading.Lock()
self.event = threading.Event()
self.running = False
# Limit buffer to 10 seconds of audio to prevent OOM
self.max_buffer_bytes = int(self.config.SAMPLE_RATE * 10 * 2)
def start(self):
self.running = True
self.buffer = b""
self.event.clear()
def add_chunk(self, chunk: bytes):
with self.lock:
self.buffer += chunk
# Evict oldest data if buffer exceeds limit
if len(self.buffer) > self.max_buffer_bytes:
self.buffer = self.buffer[-self.max_buffer_bytes // 2:]
if len(self.buffer) >= self.config.CHUNK_SIZE:
self.event.set()
def get_ready_chunk(self, timeout: float = 1.0) -> bytes:
if self.event.wait(timeout=timeout):
with self.lock:
chunk = self.buffer
self.buffer = b""
self.event.clear()
return chunk
return b""
def get_full_buffer(self) -> bytes:
with self.lock:
chunk = self.buffer
self.buffer = b""
return chunk
def stop(self):
self.running = False
self.event.set()
def is_running(self):
return self.running

160
client.py Normal file
View File

@@ -0,0 +1,160 @@
import asyncio
import websockets
import struct
import wave
import numpy as np
# WebSocket URL
WS_URL = "ws://localhost:8000/ws"
async def start_recording():
"""Send start signal (b'S')"""
async with websockets.connect(WS_URL) as ws:
await ws.send(b"S")
async def send_audio(ws, audio_data: bytes):
"""Send audio data (b'A' + raw PCM)"""
await ws.send(b"A" + audio_data)
async def reset_session(ws):
"""Reset conversation (b'R')"""
await ws.send(b"R")
async def receive_messages(ws):
"""Receive TEXT and AUDIO messages"""
while True:
try:
msg = await asyncio.wait_for(ws.recv(), timeout=30.0)
if isinstance(msg, str):
if msg.startswith("TEXT:"):
print(f"[RECognized] {msg[5:]}")
else:
print(f"[Server] {msg}")
elif isinstance(msg, bytes):
if msg[0:1] == b"O":
audio = msg[1:]
print(f"[Audio] Received {len(audio)} bytes")
# Save to file
timestamp = int(asyncio.get_running_loop().time())
filename = f"response_{timestamp}.wav"
with open(filename, "wb") as f:
with wave.open(f, "wb") as wf:
wf.setnchannels(1)
wf.setsampwidth(2)
wf.setframerate(24000)
wf.writeframes(audio)
print(f"[Audio] Saved to {filename}")
except asyncio.TimeoutError:
break
except Exception as e:
print(f"Error: {e}")
break
async def record_and_send():
"""Record audio from microphone and send"""
import pyaudio
CHUNK = 1024
FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = 16000
p = pyaudio.PyAudio()
stream = p.open(format=FORMAT, channels=CHANNELS, rate=RATE, input=True, frames_per_buffer=CHUNK)
async with websockets.connect(WS_URL) as ws:
print("Recording... Press Ctrl+C to stop")
try:
while True:
data = stream.read(CHUNK)
await send_audio(ws, data)
except KeyboardInterrupt:
print("\nStopped recording")
finally:
stream.stop_stream()
stream.close()
p.terminate()
async def send_audio_file(filepath: str):
"""Read and send an audio file to the server."""
try:
with open(filepath, "rb") as f:
file_data = f.read()
except FileNotFoundError:
print(f"Error: File '{filepath}' not found")
return
print(f"Reading audio file: {filepath} ({len(file_data)} bytes)")
async with websockets.connect(WS_URL) as ws:
print("Connected. Sending audio file...")
await ws.send(b"A" + file_data)
print("File sent. Waiting for response...")
try:
while True:
msg = await asyncio.wait_for(ws.recv(), timeout=60.0)
if isinstance(msg, str):
if msg.startswith("TEXT:"):
print(f"[Recognized] {msg[5:]}")
else:
print(f"[Server] {msg}")
elif isinstance(msg, bytes):
if msg[0:1] == b"O":
audio = msg[1:]
timestamp = int(asyncio.get_running_loop().time())
filename = f"response_{timestamp}.wav"
with open(filename, "wb") as f:
with wave.open(f, "wb") as wf:
wf.setnchannels(1)
wf.setsampwidth(2)
wf.setframerate(24000)
wf.writeframes(audio)
print(f"[Audio] Saved response to {filename}")
except asyncio.TimeoutError:
print("Timed out waiting for response")
except Exception as e:
print(f"Error: {e}")
async def client():
"""Main client loop"""
print("Audio Chat Client")
print("1. Record from microphone")
print("2. Send audio file")
choice = input("Choice (1/2): ")
if choice == "1":
p = pyaudio.PyAudio()
stream = p.open(format=pyaudio.paInt16, channels=1, rate=16000, input=True, frames_per_buffer=1024)
async with websockets.connect(WS_URL) as ws:
print("Recording... Press Ctrl+C to stop")
try:
receive_task = asyncio.create_task(receive_messages(ws))
while True:
data = stream.read(1024)
await ws.send(b"A" + data)
except KeyboardInterrupt:
receive_task.cancel()
finally:
stream.stop_stream()
stream.close()
p.terminate()
elif choice == "2":
filepath = input("Enter audio file path: ").strip()
if filepath:
await send_audio_file(filepath)
else:
print("No file path provided")
else:
print("Invalid choice")
if __name__ == "__main__":
asyncio.run(client())

29
config.py Normal file
View File

@@ -0,0 +1,29 @@
import os
from pathlib import Path
from dotenv import load_dotenv
env_path = Path(__file__).parent / ".env"
load_dotenv(env_path)
class Config:
# Models
STT_MODEL = os.getenv("STT_MODEL", "Systran/faster-whisper-large-v3")
LLM_MODEL = os.getenv("LLM_MODEL", "Qwen/Qwen2.5-7B-Instruct")
TTS_MODEL = os.getenv("TTS_MODEL", "facebook/mms-tts-rus")
# Audio settings
SAMPLE_RATE = int(os.getenv("SAMPLE_RATE", "16000"))
AUDIO_BUFFER_SECONDS = float(os.getenv("AUDIO_BUFFER_SECONDS", "2"))
CHUNK_SIZE = int(os.getenv("CHUNK_SIZE", "1024"))
# Server
HOST = os.getenv("HOST", "0.0.0.0")
PORT = int(os.getenv("PORT", "8000"))
# LLM settings
LLM_MAX_TOKENS = int(os.getenv("LLM_MAX_TOKENS", "512"))
LLM_TEMPERATURE = float(os.getenv("LLM_TEMPERATURE", "0.7"))
# GPU
DEVICE = os.getenv("DEVICE", "auto")

0
engine/__init__.py Normal file
View File

61
engine/llm.py Normal file
View File

@@ -0,0 +1,61 @@
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig
from config import Config
import torch
class LLMEngine:
def __init__(self):
self.model = None
self.tokenizer = None
self.config = Config()
def initialize(self):
device = "cuda" if torch.cuda.is_available() else "cpu"
dtype = torch.float16 if device == "cuda" else torch.float32
self.tokenizer = AutoTokenizer.from_pretrained(
self.config.LLM_MODEL,
trust_remote_code=True,
)
self.model = AutoModelForCausalLM.from_pretrained(
self.config.LLM_MODEL,
torch_dtype=dtype,
device_map="auto",
trust_remote_code=True,
)
def generate(self, user_text: str, system_prompt: str = None) -> str:
if not self.model:
self.initialize()
if system_prompt is None:
system_prompt = "Ты полезный ассистент. Отвечай на русском языке кратко и по делу."
messages = [
{"role": "system", "content": system_prompt},
{"role": "user", "content": user_text},
]
text = self.tokenizer.apply_chat_template(
messages,
tokenize=False,
add_generation_prompt=True,
)
inputs = self.tokenizer(text, return_tensors="pt").to(self.model.device)
with torch.no_grad():
outputs = self.model.generate(
**inputs,
max_new_tokens=self.config.LLM_MAX_TOKENS,
temperature=self.config.LLM_TEMPERATURE,
do_sample=True,
top_p=0.9,
repetition_penalty=1.1,
)
generated = outputs[0][inputs["input_ids"].shape[1]:]
response = self.tokenizer.decode(generated, skip_special_tokens=True)
return response.strip()

49
engine/stt.py Normal file
View File

@@ -0,0 +1,49 @@
from faster_whisper import WhisperModel
from config import Config
import io
import numpy as np
class STTEngine:
def __init__(self):
self.model = None
self.config = Config()
self._model_size = self._resolve_model_size(self.config.STT_MODEL)
def _resolve_model_size(self, model_name: str) -> str:
"""Extract model size from various naming conventions."""
# Handle Systran/faster-whisper-* format
if "faster-whisper-" in model_name:
return model_name.split("faster-whisper-")[-1]
# Handle whisper-* format
if model_name.startswith("whisper-"):
return model_name[len("whisper-"):]
# Return as-is for direct model names
return model_name
def initialize(self):
device = "cuda" if self.config.DEVICE == "auto" else self.config.DEVICE
self.model = WhisperModel(
self._model_size,
device=device,
compute_type="float16" if device == "cuda" else "int8",
download_root=None,
)
def transcribe(self, audio_bytes: bytes) -> str:
if not self.model:
self.initialize()
audio_file = io.BytesIO(audio_bytes)
segments, info = self.model.transcribe(
audio_file,
beam_size=5,
language="ru",
vad_filter=True,
)
text = ""
for segment in segments:
text += segment.text + " "
return text.strip()

53
engine/tts.py Normal file
View File

@@ -0,0 +1,53 @@
from transformers import pipeline
from config import Config
import numpy as np
class TTSEngine:
def __init__(self):
self.tts_pipeline = None
self.config = Config()
def initialize(self):
try:
self.tts_pipeline = pipeline(
"text-to-speech",
self.config.TTS_MODEL,
device=0 if __import__("torch").cuda.is_available() else -1,
)
except Exception:
self.tts_pipeline = pipeline(
"text-to-speech",
model=self._tts_model,
device=-1,
)
self.tts_pipeline.start()
def synthesize(self, text: str, output_sample_rate: int = 24000) -> np.ndarray:
if not self.tts_pipeline:
self.initialize()
result = self.tts_pipeline(
text,
generate_kwargs={"task": "tts", "language": "ru"},
return_tensors=True,
)
audio = result["audio"]
# Convert torch tensor to numpy if needed
if hasattr(audio, 'numpy'):
audio = audio.numpy()
elif not isinstance(audio, np.ndarray):
audio = np.asarray(audio)
# Handle multi-dimensional arrays (batch or stereo)
if audio.ndim > 2:
# Batch dimension - take first item
audio = audio[0]
if audio.ndim == 2:
# Stereo - mix to mono
audio = audio.mean(axis=1)
audio = audio.astype(np.float32)
return audio

225
main.py Normal file
View File

@@ -0,0 +1,225 @@
import asyncio
import struct
import wave
import io
import numpy as np
import logging
from pathlib import Path
from fastapi import FastAPI, WebSocket, WebSocketDisconnect, Request
from fastapi.staticfiles import StaticFiles
from fastapi.responses import FileResponse
from pydantic import BaseModel
from typing import Optional
from contextlib import asynccontextmanager
from config import Config
from engine.stt import STTEngine
from engine.llm import LLMEngine
from engine.tts import TTSEngine
from audio.stream import AudioStreamBuffer
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
config = Config()
class Message(BaseModel):
type: str
text: Optional[str] = None
class AudioSession:
def __init__(self):
self.stt = STTEngine()
self.llm = LLMEngine()
self.tts = TTSEngine()
self.audio_buffer = AudioStreamBuffer()
self.conversation_history = []
self.is_processing = False
self.processing_lock = asyncio.Lock()
def initialize_engines(self):
logger.info("Loading STT model...")
self.stt.initialize()
logger.info("Loading LLM model...")
self.llm.initialize()
logger.info("Loading TTS model...")
self.tts.initialize()
logger.info("All models loaded!")
def add_audio_chunk(self, chunk: bytes):
self.audio_buffer.add_chunk(chunk)
def recognize_speech(self) -> str:
ready_chunk = self.audio_buffer.get_ready_chunk(timeout=2.0)
if not ready_chunk:
return ""
loop = asyncio.get_running_loop()
text = loop.run_until_complete(
self._transcribe_async(ready_chunk)
)
return text
async def _transcribe_async(self, audio_bytes: bytes) -> str:
"""Async wrapper for STT transcription."""
loop = asyncio.get_running_loop()
return await loop.run_in_executor(None, self.stt.transcribe, audio_bytes)
def get_llm_response(self, user_text: str) -> str:
system_prompt = "Ты голосовой ассистент. Отвечай кратко, как в разговорной речи. Не используй списки или форматирование."
if self.conversation_history:
context = "\n".join(self.conversation_history[-6:])
full_prompt = f"Предыдущий диалог:\n{context}\n\nПользователь: {user_text}\nТы:"
else:
full_prompt = f"Пользователь: {user_text}\nТы:"
loop = asyncio.get_running_loop()
response = loop.run_until_complete(
self._generate_async(full_prompt, system_prompt)
)
loop.close()
self.conversation_history.append(f"Пользователь: {user_text}")
self.conversation_history.append(f"Ассистент: {response}")
return response
async def _generate_async(self, prompt: str, system_prompt: str) -> str:
"""Async wrapper for LLM generation."""
loop = asyncio.get_running_loop()
return await loop.run_in_executor(None, self.llm.generate, prompt, system_prompt)
def synthesize_speech(self, text: str) -> bytes:
loop = asyncio.get_running_loop()
audio = loop.run_until_complete(
self._synthesize_async(text, 24000)
)
loop.close()
wav_buffer = io.BytesIO()
with wave.open(wav_buffer, "wb") as wf:
wf.setnchannels(1)
wf.setsampwidth(2)
wf.setframerate(24000)
wf.writeframes(audio.tobytes())
return wav_buffer.getvalue()
async def _synthesize_async(self, text: str, sample_rate: int) -> np.ndarray:
"""Async wrapper for TTS synthesis."""
loop = asyncio.get_running_loop()
return await loop.run_in_executor(None, self.tts.synthesize, text, sample_rate)
def process_conversation(self, user_text: str):
logger.info(f"User said: {user_text}")
self.audio_buffer.start()
try:
response = self.get_llm_response(user_text)
logger.info(f"LLM response: {response}")
audio = self.synthesize_speech(response)
logger.info(f"Generated {len(audio)} bytes of audio")
return audio
except Exception as e:
logger.error(f"Error processing conversation: {e}")
return None
def reset(self):
self.conversation_history = []
self.audio_buffer = AudioStreamBuffer()
@asynccontextmanager
async def lifespan(app: FastAPI):
logger.info("Starting audio chat server...")
yield
logger.info("Shutting down...")
app = FastAPI(title="Audio Chat Server", version="1.0.0", lifespan=lifespan)
STATIC_DIR = Path(__file__).parent / "static"
@app.get("/")
async def serve_index():
return FileResponse(STATIC_DIR / "index.html")
app.mount("/static", StaticFiles(directory=str(STATIC_DIR)), name="static")
async def process_audio_chunk(session: AudioSession, payload: bytes, websocket):
"""Handle an audio chunk with proper async handling."""
session.add_audio_chunk(payload)
ready = session.audio_buffer.get_ready_chunk(timeout=0.1)
if ready:
try:
text = session.stt.transcribe(ready)
if text:
await websocket.send_text(f"TEXT:{text}")
async with session.processing_lock:
if not session.is_processing:
session.is_processing = True
try:
audio = await asyncio.to_thread(
session.process_conversation, text
)
if audio:
try:
await websocket.send_bytes(b"O" + audio)
except Exception as e:
logger.error(f"Failed to send audio: {e}")
finally:
session.is_processing = False
except Exception as e:
logger.error(f"Error processing audio chunk: {e}")
@app.websocket("/ws")
async def websocket_endpoint(websocket: WebSocket):
await websocket.accept()
logger.info("Client connected")
session = AudioSession()
session.initialize_engines()
try:
while True:
data = await asyncio.wait_for(websocket.receive_bytes(), timeout=120.0)
msg_type = data[:1]
payload = data[1:]
if msg_type == b"A":
await process_audio_chunk(session, payload, websocket)
elif msg_type == b"R":
session.reset()
await websocket.send_text("RESET")
except WebSocketDisconnect:
logger.info("Client disconnected")
except asyncio.TimeoutError:
logger.info("Client timed out")
except Exception as e:
logger.error(f"WebSocket error: {e}")
finally:
await websocket.close()
@app.get("/health")
async def health_check():
return {"status": "ok"}
if __name__ == "__main__":
import uvicorn
uvicorn.run(app, host=config.HOST, port=config.PORT)

26
requirements.txt Normal file
View File

@@ -0,0 +1,26 @@
# WebSocket server
fastapi==0.115.0
uvicorn[standard]==0.30.6
websockets==13.1
# Speech-to-Text
faster-whisper==1.0.3
soundfile==0.12.1
# LLM
transformers==4.44.0
torch==2.4.1
accelerate==1.0.0
bitsandbytes==0.44.0
# TTS
torchaudio>=2.4.0
# Audio processing
numpy==2.1.1
scipy==1.14.1
# Utilities
python-dotenv==1.0.1
pydantic==2.9.2
pydantic-settings==2.5.2

570
static/index.html Normal file
View File

@@ -0,0 +1,570 @@
<!DOCTYPE html>
<html lang="ru">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Голосовой чат</title>
<style>
* {
margin: 0;
padding: 0;
box-sizing: border-box;
}
body {
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
background: #0f0f23;
color: #e0e0e0;
min-height: 100vh;
display: flex;
flex-direction: column;
}
.header {
background: linear-gradient(135deg, #1a1a3e, #2d1b69);
padding: 20px;
text-align: center;
border-bottom: 1px solid #333;
}
.header h1 {
font-size: 24px;
color: #fff;
margin-bottom: 8px;
}
.status {
display: inline-flex;
align-items: center;
gap: 8px;
padding: 6px 16px;
background: rgba(255,255,255,0.1);
border-radius: 20px;
font-size: 14px;
}
.status-dot {
width: 10px;
height: 10px;
border-radius: 50%;
background: #666;
}
.status-dot.connected {
background: #4caf50;
box-shadow: 0 0 10px #4caf50;
}
.status-dot.recording {
background: #f44336;
animation: pulse 1s infinite;
}
@keyframes pulse {
0%, 100% { opacity: 1; }
50% { opacity: 0.5; }
}
.container {
flex: 1;
display: flex;
flex-direction: column;
max-width: 800px;
margin: 0 auto;
padding: 20px;
width: 100%;
}
.chat-area {
flex: 1;
overflow-y: auto;
margin-bottom: 20px;
padding: 16px;
background: rgba(255,255,255,0.05);
border-radius: 12px;
border: 1px solid rgba(255,255,255,0.1);
}
.message {
margin-bottom: 16px;
padding: 12px 16px;
border-radius: 12px;
max-width: 80%;
}
.message.user {
background: #2d1b69;
margin-left: auto;
border-bottom-right-radius: 4px;
}
.message.assistant {
background: rgba(255,255,255,0.1);
margin-right: auto;
border-bottom-left-radius: 4px;
}
.message-label {
font-size: 11px;
color: #888;
margin-bottom: 4px;
text-transform: uppercase;
letter-spacing: 1px;
}
.message-text {
font-size: 15px;
line-height: 1.5;
}
.controls {
background: rgba(255,255,255,0.05);
padding: 24px;
border-radius: 16px;
border: 1px solid rgba(255,255,255,0.1);
display: flex;
flex-direction: column;
align-items: center;
gap: 20px;
}
.visualizer {
width: 100%;
height: 80px;
background: rgba(0,0,0,0.3);
border-radius: 8px;
overflow: hidden;
}
.btn-group {
display: flex;
gap: 12px;
flex-wrap: wrap;
justify-content: center;
}
button {
padding: 14px 32px;
border: none;
border-radius: 12px;
font-size: 16px;
font-weight: 600;
cursor: pointer;
transition: all 0.3s ease;
display: inline-flex;
align-items: center;
gap: 8px;
}
.btn-primary {
background: linear-gradient(135deg, #6c3ce0, #9b59b6);
color: white;
}
.btn-primary:hover {
transform: translateY(-2px);
box-shadow: 0 4px 15px rgba(108, 60, 224, 0.4);
}
.btn-danger {
background: linear-gradient(135deg, #c0392b, #e74c3c);
color: white;
}
.btn-secondary {
background: rgba(255,255,255,0.1);
color: #ccc;
}
.btn-secondary:hover {
background: rgba(255,255,255,0.2);
}
.btn:disabled {
opacity: 0.5;
cursor: not-allowed;
transform: none !important;
}
.text-indicator {
font-size: 14px;
color: #888;
font-style: italic;
min-height: 20px;
}
.text-indicator.active {
color: #4caf50;
}
.loading {
display: none;
text-align: center;
padding: 20px;
color: #888;
}
.loading.show {
display: block;
}
.spinner {
width: 40px;
height: 40px;
border: 3px solid rgba(255,255,255,0.1);
border-top-color: #6c3ce0;
border-radius: 50%;
animation: spin 1s linear infinite;
margin: 0 auto 12px;
}
@keyframes spin {
to { transform: rotate(360deg); }
}
.config-section {
margin-bottom: 20px;
padding: 16px;
background: rgba(255,255,255,0.03);
border-radius: 8px;
}
.config-section label {
display: block;
margin-bottom: 8px;
font-size: 13px;
color: #888;
}
.config-section input {
width: 100%;
padding: 10px;
background: rgba(0,0,0,0.3);
border: 1px solid rgba(255,255,255,0.1);
border-radius: 6px;
color: #fff;
font-size: 14px;
}
@media (max-width: 600px) {
.container {
padding: 12px;
}
.message {
max-width: 95%;
}
button {
padding: 12px 24px;
font-size: 14px;
}
}
</style>
</head>
<body>
<div class="header">
<h1>🎙️ Голосовой чат с ИИ</h1>
<div class="status">
<div class="status-dot" id="statusDot"></div>
<span id="statusText">Отключено</span>
</div>
</div>
<div class="container">
<div class="config-section">
<label>URL WebSocket сервера</label>
<input type="text" id="wsUrl" value="ws://localhost:8000/ws" placeholder="ws://localhost:8000/ws">
</div>
<div class="chat-area" id="chatArea">
<div class="message assistant">
<div class="message-label">Ассистент</div>
<div class="message-text">Привет! Нажми "Начать запись" и говори со мной.</div>
</div>
</div>
<div class="loading" id="loading">
<div class="spinner"></div>
<div>Генерирую ответ...</div>
</div>
<div class="controls">
<canvas class="visualizer" id="visualizer"></canvas>
<div class="text-indicator" id="textIndicator"></div>
<div class="btn-group">
<button class="btn-primary" id="btnStart" onclick="startRecording()">
🎤 Начать запись
</button>
<button class="btn-danger" id="btnStop" onclick="stopRecording()" disabled>
⏹️ Остановить
</button>
<button class="btn-secondary" onclick="resetChat()">
🔄 Сброс
</button>
</div>
</div>
</div>
<audio id="audioPlayer" style="display:none"></audio>
<script>
const audioPlayer = document.getElementById('audioPlayer');
const chatArea = document.getElementById('chatArea');
const statusDot = document.getElementById('statusDot');
const statusText = document.getElementById('statusText');
const textIndicator = document.getElementById('textIndicator');
const loading = document.getElementById('loading');
const btnStart = document.getElementById('btnStart');
const btnStop = document.getElementById('btnStop');
const visualizer = document.getElementById('visualizer');
const wsUrlInput = document.getElementById('wsUrl');
let ws = null;
let audioContext = null;
let mediaStream = null;
let analyser = null;
let isRecording = false;
let buffer = new Uint8Array();
let animationId = null;
let textTimeout = null;
let currentText = '';
// Resize visualizer
function resizeVisualizer() {
visualizer.width = visualizer.offsetWidth;
visualizer.height = visualizer.offsetHeight;
}
resizeVisualizer();
window.addEventListener('resize', resizeVisualizer);
function setStatus(status, text) {
statusDot.className = 'status-dot ' + status;
statusText.textContent = text;
}
function addMessage(role, text) {
const div = document.createElement('div');
div.className = 'message ' + role;
div.innerHTML = `
<div class="message-label">${role === 'user' ? 'Вы' : 'Ассистент'}</div>
<div class="message-text">${text}</div>
`;
chatArea.appendChild(div);
chatArea.scrollTop = chatArea.scrollHeight;
}
function drawVisualizer() {
if (!analyser) return;
const bufferLength = analyser.frequencyBinCount;
const dataArray = new Uint8Array(bufferLength);
analyser.getByteFrequencyData(dataArray);
const ctx = visualizer.getContext('2d');
ctx.clearRect(0, 0, visualizer.width, visualizer.height);
const barWidth = (visualizer.width / bufferLength) * 2.5;
let x = 0;
for (let i = 0; i < bufferLength; i++) {
const barHeight = (dataArray[i] / 255) * visualizer.height;
ctx.fillStyle = `rgba(108, 60, 224, ${dataArray[i] / 255})`;
ctx.fillRect(x, visualizer.height - barHeight, barWidth, barHeight);
x += barWidth + 1;
}
if (isRecording) {
animationId = requestAnimationFrame(drawVisualizer);
}
}
function connectWebSocket() {
const url = wsUrlInput.value || 'ws://localhost:8000/ws';
ws = new WebSocket(url);
ws.onopen = () => {
setStatus('connected', 'Подключено');
console.log('WebSocket connected');
};
ws.onclose = () => {
setStatus('', 'Отключено');
console.log('WebSocket closed');
};
ws.onerror = (error) => {
console.error('WebSocket error:', error);
setStatus('', 'Ошибка подключения');
};
ws.onmessage = (event) => {
const data = event.data;
if (typeof data === 'string') {
if (data.startsWith('TEXT:')) {
const text = data.substring(5);
currentText = text;
textIndicator.textContent = text;
textIndicator.classList.add('active');
// Add user message after delay
clearTimeout(textTimeout);
textTimeout = setTimeout(() => {
addMessage('user', text);
textIndicator.textContent = '';
textIndicator.classList.remove('active');
currentText = '';
}, 1500);
} else if (data === 'RESET') {
console.log('Session reset');
}
} else if (data instanceof ArrayBuffer) {
const bytes = new Uint8Array(data);
if (bytes[0] === 0x4F) { // 'O'
const audioData = bytes.slice(1);
playAudio(audioData);
}
}
};
}
function playAudio(audioData) {
// Convert 24kHz WAV to 48kHz for browser playback
const sampleRate = 24000;
const targetSampleRate = 48000;
const wavHeaderSize = 44;
const audioContent = audioData.slice(wavHeaderSize);
const numSamples = audioContent.length / 2;
// Create AudioBuffer
const audioCtx = new (window.AudioContext || window.webkitAudioContext)();
const outputBuffer = audioCtx.createBuffer(1, numSamples * (targetSampleRate / sampleRate), targetSampleRate);
const outputData = outputBuffer.getChannelData(0);
// Resample
for (let i = 0; i < numSamples; i++) {
const inputSample = parseInt(audioContent.slice(i * 2, i * 2 + 2).reverse());
const outputSample = inputSample / 32768;
const outputIndex = i * (targetSampleRate / sampleRate);
for (let j = 0; j < (targetSampleRate / sampleRate); j++) {
const idx = Math.floor(outputIndex + j);
if (idx < outputBuffer.length) {
outputData[idx] = outputSample;
}
}
}
// Create source and play
const source = audioCtx.createBufferSource();
source.buffer = outputBuffer;
source.connect(audioCtx.destination);
source.start();
loading.classList.remove('show');
}
async function startRecording() {
try {
// Connect WebSocket
connectWebSocket();
// Request microphone access
mediaStream = await navigator.mediaDevices.getUserMedia({
audio: {
channelCount: 1,
sampleRate: 48000,
echoCancellation: true,
noiseSuppression: true
}
});
// Setup audio context
audioContext = new (window.AudioContext || window.webkitAudioContext)();
const source = audioContext.createMediaStreamSource(mediaStream);
// Setup analyser for visualization
analyser = audioContext.createAnalyser();
analyser.fftSize = 256;
source.connect(analyser);
isRecording = true;
btnStart.disabled = true;
btnStop.disabled = false;
setStatus('recording', 'Запись...');
textIndicator.textContent = 'Слушаю...';
textIndicator.classList.add('active');
drawVisualizer();
// Resample from 48kHz to 16kHz and send
const scriptProcessor = audioContext.createScriptProcessor(4096, 1, 1);
scriptProcessor.onaudioprocess = (event) => {
if (!isRecording) return;
const input = event.inputBuffer.getChannelData(0);
// Resample from 48kHz to 16kHz (3:1 ratio)
const resampled = [];
for (let i = 0; i < input.length; i += 3) {
resampled.push(input[i]);
}
// Convert to PCM 16-bit
const pcm = new Uint8Array(resampled.length * 2);
for (let i = 0; i < resampled.length; i++) {
const sample = Math.max(-1, Math.min(1, resampled[i]));
const int16 = sample < 0 ? sample * 32768 : sample * 32767;
pcm[i * 2] = int16 & 0xFF;
pcm[i * 2 + 1] = (int16 >> 8) & 0xFF;
}
// Send via WebSocket
if (ws && ws.readyState === WebSocket.OPEN) {
ws.send(new Uint8Array([0x41]).concat(pcm).buffer);
}
};
source.connect(scriptProcessor);
scriptProcessor.connect(audioContext.destination);
} catch (err) {
console.error('Error starting recording:', err);
alert('Не удалось получить доступ к микрофону: ' + err.message);
isRecording = false;
btnStart.disabled = false;
btnStop.disabled = true;
}
}
function stopRecording() {
isRecording = false;
if (animationId) {
cancelAnimationFrame(animationId);
}
if (mediaStream) {
mediaStream.getTracks().forEach(t => t.stop());
}
if (audioContext) {
audioContext.close();
}
btnStart.disabled = false;
btnStop.disabled = true;
setStatus('connected', 'Подключено');
textIndicator.textContent = '';
textIndicator.classList.remove('active');
}
async function resetChat() {
if (ws && ws.readyState === WebSocket.OPEN) {
ws.send(new Uint8Array([0x52]).buffer); // 'R'
}
chatArea.innerHTML = '';
addMessage('assistant', 'Чат сброшен. Говорите со мной!');
}
// Auto-connect on page load
window.addEventListener('load', () => {
connectWebSocket();
});
</script>
</body>
</html>