Files
audio-chat/client.py
noturum e2d3cbe783 Add voice activation (VAD) and audio fade
- Add webrtcvad dependency for real-time voice activity detection
- Create audio/fade.py with fade-in/fade-out utility
- Add VAD voice activation to client recording (sends audio only during speech)
- Apply 200ms fade-out to TTS output to avoid abrupt audio cuts
- Fix tts.py indentation error in except block
2026-05-01 13:14:31 +00:00

197 lines
6.4 KiB
Python

import asyncio
import struct
import time
import wave

import numpy as np
import webrtcvad
import websockets
# WebSocket endpoint that every connection opened by this client targets.
WS_URL = "ws://localhost:8000/ws"
async def start_recording(ws=None):
    """Send the start signal (b'S') to the server.

    Args:
        ws: An already-open WebSocket connection to send the signal on, in
            the same way ``send_audio`` and ``reset_session`` take one.
            When omitted, a temporary connection to ``WS_URL`` is opened,
            the signal is sent, and the connection is closed again (the
            original behavior of this function).
    """
    if ws is not None:
        await ws.send(b"S")
        return

    # NOTE(review): a start signal sent on a connection that is closed right
    # afterwards presumably cannot affect any other session -- confirm
    # against the server before relying on the no-argument form.
    async with websockets.connect(WS_URL) as temp_ws:
        await temp_ws.send(b"S")
async def send_audio(ws, audio_data: bytes):
    """Transmit one audio message: the b'A' tag followed by the raw PCM bytes."""
    tagged_frame = b"A" + audio_data
    await ws.send(tagged_frame)
async def reset_session(ws):
    """Ask the server to reset the conversation by sending the b'R' message."""
    reset_signal = b"R"
    await ws.send(reset_signal)
def _save_response_wav(audio: bytes) -> str:
    """Write raw PCM *audio* to a new ``response_<unix-seconds>.wav`` file.

    The file is created exclusively: if the name is already taken (for
    example two replies arriving within the same second, or a file left by an
    earlier run), a numeric suffix is appended instead of overwriting it.

    Returns:
        The name of the file that was written.
    """
    stamp = int(time.time())
    attempt = 0
    while True:
        suffix = f"_{attempt}" if attempt else ""
        filename = f"response_{stamp}{suffix}.wav"
        try:
            # "xb" raises FileExistsError rather than truncating an existing file.
            f = open(filename, "xb")
        except FileExistsError:
            attempt += 1
            continue
        with f, wave.open(f, "wb") as wf:
            # Hard-coded output format: mono, 16-bit samples, 24 kHz.
            # NOTE(review): assumes the server's audio replies use this format.
            wf.setnchannels(1)
            wf.setsampwidth(2)
            wf.setframerate(24000)
            wf.writeframes(audio)
        return filename


async def receive_messages(ws):
    """Receive TEXT and AUDIO messages and print/save them.

    Text messages are printed (the ``TEXT:`` prefix marks recognized speech).
    Binary messages tagged with ``b"O"`` are saved as WAV files in the current
    directory; other binary messages are ignored.

    The loop ends when no message arrives for 30 seconds, or when receiving or
    saving raises an exception (which is printed).

    Args:
        ws: An open WebSocket connection providing an awaitable ``recv()``.
    """
    while True:
        try:
            msg = await asyncio.wait_for(ws.recv(), timeout=30.0)
            if isinstance(msg, str):
                if msg.startswith("TEXT:"):
                    print(f"[RECognized] {msg[5:]}")
                else:
                    print(f"[Server] {msg}")
            elif isinstance(msg, bytes) and msg[0:1] == b"O":
                audio = msg[1:]
                print(f"[Audio] Received {len(audio)} bytes")
                # Previously named after int(loop.time()), a monotonic clock
                # with one-second resolution, so replies could overwrite
                # each other.
                filename = _save_response_wav(audio)
                print(f"[Audio] Saved to {filename}")
        except asyncio.TimeoutError:
            break
        except Exception as e:
            print(f"Error: {e}")
            break
async def record_and_send():
    """Record audio from the microphone and send it with VAD voice activation.

    Audio is captured as 16 kHz, 16-bit mono PCM in 30 ms frames. A frame is
    sent to the server (via ``send_audio``) when the voice activity detector
    classifies it as speech, and for a short run of trailing silent frames
    after speech so utterance endings are not clipped.

    Runs until interrupted (Ctrl+C) or until the connection or audio device
    raises. The audio stream and PyAudio instance are always released.
    """
    import pyaudio
    from concurrent.futures import ThreadPoolExecutor

    FORMAT = pyaudio.paInt16
    CHANNELS = 1
    RATE = 16000
    # webrtcvad only accepts frames of 10, 20 or 30 ms. The previous 1024-sample
    # chunk is 64 ms at 16 kHz, which is_speech() rejects with an error.
    FRAME_MS = 30
    CHUNK = RATE * FRAME_MS // 1000  # 480 samples per frame
    VAD_MODE = 3  # webrtcvad's most aggressive setting for filtering out non-speech
    # Consecutive silent frames after which sending stops. ~300 ms, close to
    # the 5 x 64 ms window the original constants described.
    SILENCE_THRESHOLD = 10

    vad = webrtcvad.Vad()
    vad.set_mode(VAD_MODE)

    p = pyaudio.PyAudio()
    stream = None
    # Dedicated single worker for the blocking microphone reads, so the event
    # loop (and the WebSocket's background work) keeps running during silence,
    # and so shutdown can wait for an in-flight read before closing the stream.
    reader = ThreadPoolExecutor(max_workers=1)
    loop = asyncio.get_running_loop()
    try:
        stream = p.open(
            format=FORMAT,
            channels=CHANNELS,
            rate=RATE,
            input=True,
            frames_per_buffer=CHUNK,
        )
        async with websockets.connect(WS_URL) as ws:
            print("Recording (VAD active)... Press Ctrl+C to stop")
            speech_active = False
            silence_count = 0
            try:
                while True:
                    data = await loop.run_in_executor(reader, stream.read, CHUNK)
                    if vad.is_speech(data, RATE):
                        speech_active = True
                        silence_count = 0
                        await send_audio(ws, data)
                    elif speech_active:
                        silence_count += 1
                        if silence_count < SILENCE_THRESHOLD:
                            await send_audio(ws, data)
                        else:
                            speech_active = False
            except KeyboardInterrupt:
                print("\nStopped recording")
            except asyncio.CancelledError:
                # Under asyncio.run(), Ctrl+C normally reaches a coroutine as
                # task cancellation rather than KeyboardInterrupt.
                print("\nStopped recording")
                raise
    finally:
        # Wait for any read still running in the worker thread before the
        # stream is torn down underneath it.
        reader.shutdown(wait=True)
        if stream is not None:
            stream.stop_stream()
            stream.close()
        p.terminate()
def _save_file_response_wav(audio: bytes) -> str:
    """Write raw PCM *audio* to a new ``response_<unix-seconds>.wav`` file.

    The file is created exclusively; when the name is already taken a numeric
    suffix is appended, so an earlier response is never overwritten.

    Returns:
        The name of the file that was written.
    """
    stamp = int(time.time())
    attempt = 0
    while True:
        suffix = f"_{attempt}" if attempt else ""
        filename = f"response_{stamp}{suffix}.wav"
        try:
            # "xb" raises FileExistsError rather than truncating an existing file.
            f = open(filename, "xb")
        except FileExistsError:
            attempt += 1
            continue
        with f, wave.open(f, "wb") as wf:
            # Hard-coded output format: mono, 16-bit samples, 24 kHz.
            # NOTE(review): assumes the server's audio replies use this format.
            wf.setnchannels(1)
            wf.setsampwidth(2)
            wf.setframerate(24000)
            wf.writeframes(audio)
        return filename


async def send_audio_file(filepath: str):
    """Read and send an audio file to the server, then print/save the replies.

    The file's bytes are sent as a single ``b"A"`` message. Replies are then
    read until 60 seconds pass without a message or an error occurs: text is
    printed, and binary messages tagged ``b"O"`` are saved as WAV files in the
    current directory.

    Args:
        filepath: Path of the audio file to send. If it cannot be read, an
            error is printed and nothing is sent.
    """
    try:
        with open(filepath, "rb") as f:
            file_data = f.read()
    except FileNotFoundError:
        print(f"Error: File '{filepath}' not found")
        return
    except OSError as e:
        # Directories, permission problems, etc. -- report instead of crashing.
        print(f"Error: Could not read '{filepath}': {e}")
        return

    print(f"Reading audio file: {filepath} ({len(file_data)} bytes)")
    async with websockets.connect(WS_URL) as ws:
        print("Connected. Sending audio file...")
        # NOTE(review): the file is sent verbatim after the b"A" tag, including
        # any container header (e.g. WAV) -- confirm the server expects that
        # rather than bare PCM.
        await ws.send(b"A" + file_data)
        print("File sent. Waiting for response...")
        try:
            while True:
                msg = await asyncio.wait_for(ws.recv(), timeout=60.0)
                if isinstance(msg, str):
                    if msg.startswith("TEXT:"):
                        print(f"[Recognized] {msg[5:]}")
                    else:
                        print(f"[Server] {msg}")
                elif isinstance(msg, bytes) and msg[0:1] == b"O":
                    # Previously named after int(loop.time()), a monotonic
                    # clock with one-second resolution, so replies could
                    # overwrite each other.
                    filename = _save_file_response_wav(msg[1:])
                    print(f"[Audio] Saved response to {filename}")
        except asyncio.TimeoutError:
            print("Timed out waiting for response")
        except Exception as e:
            print(f"Error: {e}")
async def _run_microphone_session():
    """Stream voiced microphone audio to the server while handling its replies.

    Captures 16 kHz, 16-bit mono PCM in 30 ms frames, sends a frame as a
    ``b"A"`` message when the voice activity detector flags speech (plus a
    short run of trailing silent frames), and runs ``receive_messages`` on the
    same connection concurrently. The receiver task, audio stream and PyAudio
    instance are always cleaned up.
    """
    # Function-local import, as in record_and_send(): only the microphone
    # path needs PyAudio. The name was previously never imported here.
    import pyaudio
    from concurrent.futures import ThreadPoolExecutor

    RATE = 16000
    # webrtcvad only accepts frames of 10, 20 or 30 ms. The previous 1024-sample
    # chunk is 64 ms at 16 kHz, which is_speech() rejects with an error.
    FRAME_MS = 30
    FRAME_SAMPLES = RATE * FRAME_MS // 1000  # 480 samples per frame
    # Consecutive silent frames after which sending stops (~300 ms).
    SILENCE_THRESHOLD = 10

    vad = webrtcvad.Vad()
    vad.set_mode(3)

    p = pyaudio.PyAudio()
    stream = None
    # Blocking microphone reads run on a dedicated worker thread so the
    # receiver task is not starved while the microphone is silent.
    reader = ThreadPoolExecutor(max_workers=1)
    loop = asyncio.get_running_loop()
    try:
        stream = p.open(
            format=pyaudio.paInt16,
            channels=1,
            rate=RATE,
            input=True,
            frames_per_buffer=FRAME_SAMPLES,
        )
        async with websockets.connect(WS_URL) as ws:
            print("Recording (VAD active)... Press Ctrl+C to stop")
            speech_active = False
            silence_count = 0
            receive_task = asyncio.create_task(receive_messages(ws))
            try:
                while True:
                    data = await loop.run_in_executor(reader, stream.read, FRAME_SAMPLES)
                    if vad.is_speech(data, RATE):
                        speech_active = True
                        silence_count = 0
                        await ws.send(b"A" + data)
                    elif speech_active:
                        silence_count += 1
                        if silence_count < SILENCE_THRESHOLD:
                            await ws.send(b"A" + data)
                        else:
                            speech_active = False
            except KeyboardInterrupt:
                pass  # Ctrl+C is the normal way to end a recording session
            finally:
                # Stop the receiver on every exit path, not only on Ctrl+C.
                receive_task.cancel()
    finally:
        # Wait for any read still running in the worker thread before the
        # stream is torn down underneath it.
        reader.shutdown(wait=True)
        if stream is not None:
            stream.stop_stream()
            stream.close()
        p.terminate()


async def client():
    """Main client loop: prompt for a mode and run it.

    Option ``1`` records from the microphone with voice activation while
    printing/saving server replies; option ``2`` prompts for a file path and
    sends that file. Any other input prints ``Invalid choice``.
    """
    print("Audio Chat Client")
    print("1. Record from microphone")
    print("2. Send audio file")
    choice = input("Choice (1/2): ").strip()
    if choice == "1":
        await _run_microphone_session()
    elif choice == "2":
        filepath = input("Enter audio file path: ").strip()
        if filepath:
            await send_audio_file(filepath)
        else:
            print("No file path provided")
    else:
        print("Invalid choice")
if __name__ == "__main__":
    try:
        asyncio.run(client())
    except KeyboardInterrupt:
        # Ctrl+C is the advertised way to stop recording. asyncio.run() can
        # re-raise it after cancelling the main task, so exit quietly instead
        # of ending the program with a traceback.
        pass