Add voice activation (VAD) and audio fade

- Add webrtcvad dependency for real-time voice activity detection
- Create audio/fade.py with fade-in/fade-out utility
- Add VAD voice activation to client recording (sends audio only during speech)
- Apply 200ms fade-in/fade-out to TTS output to avoid abrupt audio starts and cuts
- Fix tts.py indentation error in except block
This commit is contained in:
2026-05-01 13:14:31 +00:00
parent 7b023cc698
commit e2d3cbe783
4 changed files with 85 additions and 10 deletions

34
audio/fade.py Normal file
View File

@@ -0,0 +1,34 @@
import numpy as np
def apply_fade(audio: np.ndarray, sample_rate: int = 24000, fade_duration_ms: int = 300) -> np.ndarray:
    """Return *audio* with a linear fade-in and fade-out applied.

    The fade runs along the first axis, so both mono ``(samples,)`` and
    multi-channel ``(samples, channels)`` arrays are supported. The input
    array is never modified; a faded copy with the same dtype is returned.

    Args:
        audio: numpy array of audio samples (time on axis 0).
        sample_rate: audio sample rate in Hz.
        fade_duration_ms: duration of each fade ramp in milliseconds.

    Returns:
        A new array with the fades applied. When the input is empty or too
        short to hold a ramp of at least one sample, the input is returned
        unchanged.
    """
    if len(audio) == 0:
        return audio

    fade_samples = int(sample_rate * fade_duration_ms / 1000)
    # Cap each ramp at a quarter of the clip so short clips keep a
    # full-level middle section instead of being all ramp.
    fade_samples = min(fade_samples, len(audio) // 4)
    if fade_samples <= 0:
        return audio

    # Create fade envelopes
    fade_in = np.linspace(0, 1, fade_samples)
    fade_out = np.linspace(1, 0, fade_samples)

    # Work on a copy so the caller's buffer is left untouched.
    faded = np.array(audio, copy=True)

    if faded.ndim > 1:
        # Add trailing singleton axes so the envelope broadcasts across
        # channels rather than against the last axis.
        envelope_shape = (fade_samples,) + (1,) * (faded.ndim - 1)
        fade_in = fade_in.reshape(envelope_shape)
        fade_out = fade_out.reshape(envelope_shape)

    # Slice assignment (instead of in-place ``*=``) lets integer PCM buffers
    # take the float product, cast back to the buffer's own dtype.
    faded[:fade_samples] = faded[:fade_samples] * fade_in
    faded[-fade_samples:] = faded[-fade_samples:] * fade_out
    return faded

View File

@@ -3,6 +3,7 @@ import websockets
import struct import struct
import wave import wave
import numpy as np import numpy as np
import webrtcvad
# WebSocket URL # WebSocket URL
WS_URL = "ws://localhost:8000/ws" WS_URL = "ws://localhost:8000/ws"
@@ -56,23 +57,41 @@ async def receive_messages(ws):
async def record_and_send(): async def record_and_send():
"""Record audio from microphone and send""" """Record audio from microphone and send with VAD voice activation"""
import pyaudio import pyaudio
CHUNK = 1024 CHUNK = 1024
FORMAT = pyaudio.paInt16 FORMAT = pyaudio.paInt16
CHANNELS = 1 CHANNELS = 1
RATE = 16000 RATE = 16000
VAD_MODE = 3
SILENCE_THRESHOLD = 5 # consecutive silent chunks to stop sending
vad = webrtcvad.Vad()
vad.set_mode(VAD_MODE)
p = pyaudio.PyAudio() p = pyaudio.PyAudio()
stream = p.open(format=FORMAT, channels=CHANNELS, rate=RATE, input=True, frames_per_buffer=CHUNK) stream = p.open(format=FORMAT, channels=CHANNELS, rate=RATE, input=True, frames_per_buffer=CHUNK)
async with websockets.connect(WS_URL) as ws: async with websockets.connect(WS_URL) as ws:
print("Recording... Press Ctrl+C to stop") print("Recording (VAD active)... Press Ctrl+C to stop")
speech_active = False
silence_count = 0
try: try:
while True: while True:
data = stream.read(CHUNK) data = stream.read(CHUNK)
await send_audio(ws, data) is_speech = vad.is_speech(data, RATE)
if is_speech:
speech_active = True
silence_count = 0
await send_audio(ws, data)
elif speech_active:
silence_count += 1
if silence_count < SILENCE_THRESHOLD:
await send_audio(ws, data)
else:
speech_active = False
except KeyboardInterrupt: except KeyboardInterrupt:
print("\nStopped recording") print("\nStopped recording")
finally: finally:
@@ -131,15 +150,32 @@ async def client():
choice = input("Choice (1/2): ") choice = input("Choice (1/2): ")
if choice == "1": if choice == "1":
import webrtcvad
vad = webrtcvad.Vad()
vad.set_mode(3)
SILENCE_THRESHOLD = 5
p = pyaudio.PyAudio() p = pyaudio.PyAudio()
stream = p.open(format=pyaudio.paInt16, channels=1, rate=16000, input=True, frames_per_buffer=1024) stream = p.open(format=pyaudio.paInt16, channels=1, rate=16000, input=True, frames_per_buffer=1024)
async with websockets.connect(WS_URL) as ws: async with websockets.connect(WS_URL) as ws:
print("Recording... Press Ctrl+C to stop") print("Recording (VAD active)... Press Ctrl+C to stop")
speech_active = False
silence_count = 0
try: try:
receive_task = asyncio.create_task(receive_messages(ws)) receive_task = asyncio.create_task(receive_messages(ws))
while True: while True:
data = stream.read(1024) data = stream.read(1024)
await ws.send(b"A" + data) is_speech = vad.is_speech(data, 16000)
if is_speech:
speech_active = True
silence_count = 0
await ws.send(b"A" + data)
elif speech_active:
silence_count += 1
if silence_count < SILENCE_THRESHOLD:
await ws.send(b"A" + data)
else:
speech_active = False
except KeyboardInterrupt: except KeyboardInterrupt:
receive_task.cancel() receive_task.cancel()
finally: finally:

View File

@@ -1,6 +1,7 @@
from transformers import pipeline from transformers import pipeline
from config import Config from config import Config
import numpy as np import numpy as np
from audio.fade import apply_fade
class TTSEngine: class TTSEngine:
@@ -16,11 +17,11 @@ class TTSEngine:
device=0 if __import__("torch").cuda.is_available() else -1, device=0 if __import__("torch").cuda.is_available() else -1,
) )
except Exception: except Exception:
self.tts_pipeline = pipeline( self.tts_pipeline = pipeline(
"text-to-speech", "text-to-speech",
model=self._tts_model, model=self._tts_model,
device=-1, device=-1,
) )
self.tts_pipeline.start() self.tts_pipeline.start()
def synthesize(self, text: str, output_sample_rate: int = 24000) -> np.ndarray: def synthesize(self, text: str, output_sample_rate: int = 24000) -> np.ndarray:
@@ -50,4 +51,7 @@ class TTSEngine:
audio = audio.astype(np.float32) audio = audio.astype(np.float32)
# Apply fade-out to avoid abrupt audio cuts
audio = apply_fade(audio, output_sample_rate, fade_duration_ms=200)
return audio return audio

View File

@@ -2,6 +2,7 @@
fastapi fastapi
uvicorn[standard] uvicorn[standard]
websockets websockets
webrtcvad
# Speech-to-Text # Speech-to-Text
faster-whisper faster-whisper