Add voice activation (VAD) and audio fade
- Add webrtcvad dependency for real-time voice activity detection (VAD)
- Create audio/fade.py with fade-in/fade-out utility
- Add VAD-based voice activation to client recording (audio is sent only during speech)
- Apply 200 ms fade-out to TTS output to avoid abrupt audio cuts
- Fix tts.py indentation error in except block
This commit is contained in:
46
client.py
46
client.py
@@ -3,6 +3,7 @@ import websockets
|
||||
import struct
|
||||
import wave
|
||||
import numpy as np
|
||||
import webrtcvad
|
||||
|
||||
# WebSocket URL
|
||||
WS_URL = "ws://localhost:8000/ws"
|
||||
@@ -56,23 +57,41 @@ async def receive_messages(ws):
|
||||
|
||||
|
||||
async def record_and_send():
|
||||
"""Record audio from microphone and send"""
|
||||
"""Record audio from microphone and send with VAD voice activation"""
|
||||
import pyaudio
|
||||
|
||||
CHUNK = 1024
|
||||
FORMAT = pyaudio.paInt16
|
||||
CHANNELS = 1
|
||||
RATE = 16000
|
||||
VAD_MODE = 3
|
||||
SILENCE_THRESHOLD = 5 # consecutive silent chunks to stop sending
|
||||
|
||||
vad = webrtcvad.Vad()
|
||||
vad.set_mode(VAD_MODE)
|
||||
|
||||
p = pyaudio.PyAudio()
|
||||
stream = p.open(format=FORMAT, channels=CHANNELS, rate=RATE, input=True, frames_per_buffer=CHUNK)
|
||||
|
||||
async with websockets.connect(WS_URL) as ws:
|
||||
print("Recording... Press Ctrl+C to stop")
|
||||
print("Recording (VAD active)... Press Ctrl+C to stop")
|
||||
speech_active = False
|
||||
silence_count = 0
|
||||
try:
|
||||
while True:
|
||||
data = stream.read(CHUNK)
|
||||
await send_audio(ws, data)
|
||||
is_speech = vad.is_speech(data, RATE)
|
||||
|
||||
if is_speech:
|
||||
speech_active = True
|
||||
silence_count = 0
|
||||
await send_audio(ws, data)
|
||||
elif speech_active:
|
||||
silence_count += 1
|
||||
if silence_count < SILENCE_THRESHOLD:
|
||||
await send_audio(ws, data)
|
||||
else:
|
||||
speech_active = False
|
||||
except KeyboardInterrupt:
|
||||
print("\nStopped recording")
|
||||
finally:
|
||||
@@ -131,15 +150,32 @@ async def client():
|
||||
choice = input("Choice (1/2): ")
|
||||
|
||||
if choice == "1":
|
||||
import webrtcvad
|
||||
vad = webrtcvad.Vad()
|
||||
vad.set_mode(3)
|
||||
SILENCE_THRESHOLD = 5
|
||||
|
||||
p = pyaudio.PyAudio()
|
||||
stream = p.open(format=pyaudio.paInt16, channels=1, rate=16000, input=True, frames_per_buffer=1024)
|
||||
async with websockets.connect(WS_URL) as ws:
|
||||
print("Recording... Press Ctrl+C to stop")
|
||||
print("Recording (VAD active)... Press Ctrl+C to stop")
|
||||
speech_active = False
|
||||
silence_count = 0
|
||||
try:
|
||||
receive_task = asyncio.create_task(receive_messages(ws))
|
||||
while True:
|
||||
data = stream.read(1024)
|
||||
await ws.send(b"A" + data)
|
||||
is_speech = vad.is_speech(data, 16000)
|
||||
if is_speech:
|
||||
speech_active = True
|
||||
silence_count = 0
|
||||
await ws.send(b"A" + data)
|
||||
elif speech_active:
|
||||
silence_count += 1
|
||||
if silence_count < SILENCE_THRESHOLD:
|
||||
await ws.send(b"A" + data)
|
||||
else:
|
||||
speech_active = False
|
||||
except KeyboardInterrupt:
|
||||
receive_task.cancel()
|
||||
finally:
|
||||
|
||||
Reference in New Issue
Block a user