Add VAD-based voice activation and audio fade

- Add webrtcvad dependency for real-time voice activity detection
- Create audio/fade.py with fade-in/fade-out utility
- Add VAD voice activation to client recording (sends audio only during speech)
- Apply 200ms fade-out to TTS output to avoid abrupt audio cuts
- Fix tts.py indentation error in except block
2026-05-01 13:14:31 +00:00
parent 7b023cc698
commit e2d3cbe783
4 changed files with 85 additions and 10 deletions
--- a/client.py
+++ b/client.py
@@ -3,6 +3,7 @@ import websockets
 import struct
 import wave
 import numpy as np
+import webrtcvad

 # WebSocket URL
 WS_URL = "ws://localhost:8000/ws"
@@ -56,23 +57,41 @@ async def receive_messages(ws):


 async def record_and_send():
-    """Record audio from microphone and send"""
+    """Record audio from microphone and send with VAD voice activation"""
    import pyaudio

    CHUNK = 1024
    FORMAT = pyaudio.paInt16
    CHANNELS = 1
    RATE = 16000
+    VAD_MODE = 3
+    SILENCE_THRESHOLD = 5  # consecutive silent chunks to stop sending
+
+    vad = webrtcvad.Vad()
+    vad.set_mode(VAD_MODE)

    p = pyaudio.PyAudio()
    stream = p.open(format=FORMAT, channels=CHANNELS, rate=RATE, input=True, frames_per_buffer=CHUNK)

    async with websockets.connect(WS_URL) as ws:
-        print("Recording... Press Ctrl+C to stop")
+        print("Recording (VAD active)... Press Ctrl+C to stop")
+        speech_active = False
+        silence_count = 0
        try:
            while True:
                data = stream.read(CHUNK)
-                await send_audio(ws, data)
+                is_speech = vad.is_speech(data, RATE)
+
+                if is_speech:
+                    speech_active = True
+                    silence_count = 0
+                    await send_audio(ws, data)
+                elif speech_active:
+                    silence_count += 1
+                    if silence_count < SILENCE_THRESHOLD:
+                        await send_audio(ws, data)
+                    else:
+                        speech_active = False
        except KeyboardInterrupt:
            print("\nStopped recording")
        finally:
@@ -131,15 +150,32 @@ async def client():
    choice = input("Choice (1/2): ")

    if choice == "1":
+        import webrtcvad
+        vad = webrtcvad.Vad()
+        vad.set_mode(3)
+        SILENCE_THRESHOLD = 5
+
        p = pyaudio.PyAudio()
        stream = p.open(format=pyaudio.paInt16, channels=1, rate=16000, input=True, frames_per_buffer=1024)
        async with websockets.connect(WS_URL) as ws:
-            print("Recording... Press Ctrl+C to stop")
+            print("Recording (VAD active)... Press Ctrl+C to stop")
+            speech_active = False
+            silence_count = 0
            try:
                receive_task = asyncio.create_task(receive_messages(ws))
                while True:
                    data = stream.read(1024)
-                    await ws.send(b"A" + data)
+                    is_speech = vad.is_speech(data, 16000)
+                    if is_speech:
+                        speech_active = True
+                        silence_count = 0
+                        await ws.send(b"A" + data)
+                    elif speech_active:
+                        silence_count += 1
+                        if silence_count < SILENCE_THRESHOLD:
+                            await ws.send(b"A" + data)
+                        else:
+                            speech_active = False
            except KeyboardInterrupt:
                receive_task.cancel()
            finally: