From e2d3cbe78348e074a1334d68847a782ac5fcae1f Mon Sep 17 00:00:00 2001
From: noturum
Date: Fri, 1 May 2026 13:14:31 +0000
Subject: [PATCH] Add voice activation (VAD) and audio fade

- Add webrtcvad dependency for real-time voice activity detection
- Create audio/fade.py with fade-in/fade-out utility
- Add VAD voice activation to client recording (30 ms frames; sends audio
  only during speech)
- Apply 200ms fade-in/fade-out to TTS output to avoid abrupt audio cuts
- Fix tts.py indentation error in except block
---
 audio/fade.py    | 34 ++++++++++++++++++++++++++++++++++
 client.py        | 55 +++++++++++++++++++++++++++++++++++++++++++++++--------
 engine/tts.py    | 14 +++++++++-----
 requirements.txt |  1 +
 4 files changed, 91 insertions(+), 13 deletions(-)
 create mode 100644 audio/fade.py

diff --git a/audio/fade.py b/audio/fade.py
new file mode 100644
index 0000000..16e0bce
--- /dev/null
+++ b/audio/fade.py
@@ -0,0 +1,34 @@
+import numpy as np
+
+
+def apply_fade(audio: np.ndarray, sample_rate: int = 24000, fade_duration_ms: int = 300) -> np.ndarray:
+    """Return a copy of the audio with a linear fade-in and fade-out applied.
+
+    The input array is left untouched. Integer input is promoted to a
+    floating-point dtype so the fractional gain ramp can be applied.
+
+    Args:
+        audio: numpy array of audio samples (treated as 1-D mono)
+        sample_rate: audio sample rate in Hz
+        fade_duration_ms: length of each fade in milliseconds, capped at a
+            quarter of the clip length
+
+    Returns:
+        New audio array with the fades applied
+    """
+    # Copy so the caller's buffer is not modified; promote ints to float.
+    faded = np.array(audio, dtype=np.result_type(audio, np.float32))
+
+    fade_samples = int(sample_rate * fade_duration_ms / 1000)
+    fade_samples = min(fade_samples, len(faded) // 4)
+
+    if fade_samples <= 0:
+        return faded
+
+    # Linear gain ramp 0 -> 1; reversed it becomes the fade-out
+    ramp = np.linspace(0, 1, fade_samples, dtype=faded.dtype)
+
+    faded[:fade_samples] *= ramp
+    faded[-fade_samples:] *= ramp[::-1]
+
+    return faded
diff --git a/client.py b/client.py
index d140fae..fe6e95a 100644
--- a/client.py
+++ b/client.py
@@ -3,6 +3,7 @@ import websockets
 import struct
 import wave
 import numpy as np
+import webrtcvad
 
 # WebSocket URL
 WS_URL = "ws://localhost:8000/ws"
@@ -56,23 +57,41 @@ async def receive_messages(ws):
 
 
 async def record_and_send():
-    """Record audio from microphone and send"""
+    """Record audio from microphone and send with VAD voice activation"""
     import pyaudio
 
-    CHUNK = 1024
+    CHUNK = 480  # 30 ms at 16 kHz; webrtcvad only accepts 10/20/30 ms frames
     FORMAT = pyaudio.paInt16
     CHANNELS = 1
     RATE = 16000
+    VAD_MODE = 3
+    SILENCE_THRESHOLD = 10  # consecutive silent chunks (~300 ms) to stop sending
+
+    vad = webrtcvad.Vad()
+    vad.set_mode(VAD_MODE)
 
     p = pyaudio.PyAudio()
     stream = p.open(format=FORMAT, channels=CHANNELS, rate=RATE, input=True, frames_per_buffer=CHUNK)
 
     async with websockets.connect(WS_URL) as ws:
-        print("Recording... Press Ctrl+C to stop")
+        print("Recording (VAD active)... Press Ctrl+C to stop")
+        speech_active = False
+        silence_count = 0
         try:
             while True:
                 data = stream.read(CHUNK)
-                await send_audio(ws, data)
+                is_speech = vad.is_speech(data, RATE)
+
+                if is_speech:
+                    speech_active = True
+                    silence_count = 0
+                    await send_audio(ws, data)
+                elif speech_active:
+                    silence_count += 1
+                    if silence_count < SILENCE_THRESHOLD:
+                        await send_audio(ws, data)
+                    else:
+                        speech_active = False
         except KeyboardInterrupt:
             print("\nStopped recording")
         finally:
@@ -131,15 +150,35 @@ async def client():
     choice = input("Choice (1/2): ")
 
     if choice == "1":
+        # 30 ms frames at 16 kHz: webrtcvad only accepts 10/20/30 ms frames
+        FRAME_SAMPLES = 480
+        vad = webrtcvad.Vad()
+        vad.set_mode(3)
+        SILENCE_THRESHOLD = 10  # consecutive silent frames (~300 ms) to stop sending
+
         p = pyaudio.PyAudio()
-        stream = p.open(format=pyaudio.paInt16, channels=1, rate=16000, input=True, frames_per_buffer=1024)
+        stream = p.open(format=pyaudio.paInt16, channels=1, rate=16000, input=True, frames_per_buffer=FRAME_SAMPLES)
         async with websockets.connect(WS_URL) as ws:
-            print("Recording... Press Ctrl+C to stop")
+            print("Recording (VAD active)... Press Ctrl+C to stop")
+            speech_active = False
+            silence_count = 0
             try:
                 receive_task = asyncio.create_task(receive_messages(ws))
                 while True:
-                    data = stream.read(1024)
-                    await ws.send(b"A" + data)
+                    data = stream.read(FRAME_SAMPLES)
+                    is_speech = vad.is_speech(data, 16000)
+                    if is_speech:
+                        speech_active = True
+                        silence_count = 0
+                        await ws.send(b"A" + data)
+                    elif speech_active:
+                        silence_count += 1
+                        if silence_count < SILENCE_THRESHOLD:
+                            await ws.send(b"A" + data)
+                        else:
+                            speech_active = False
+                    # Yield so receive_messages can run even when nothing is sent
+                    await asyncio.sleep(0)
             except KeyboardInterrupt:
                 receive_task.cancel()
             finally:
diff --git a/engine/tts.py b/engine/tts.py
index a8019dd..98795d0 100644
--- a/engine/tts.py
+++ b/engine/tts.py
@@ -1,6 +1,7 @@
 from transformers import pipeline
 from config import Config
 import numpy as np
+from audio.fade import apply_fade
 
 
 class TTSEngine:
@@ -16,11 +17,11 @@ class TTSEngine:
                 device=0 if __import__("torch").cuda.is_available() else -1,
             )
         except Exception:
-        self.tts_pipeline = pipeline(
-            "text-to-speech",
-            model=self._tts_model,
-            device=-1,
-        )
+            self.tts_pipeline = pipeline(
+                "text-to-speech",
+                model=self._tts_model,
+                device=-1,
+            )
         self.tts_pipeline.start()
 
     def synthesize(self, text: str, output_sample_rate: int = 24000) -> np.ndarray:
@@ -50,4 +51,7 @@ class TTSEngine:
 
         audio = audio.astype(np.float32)
 
+        # Apply a short fade-in/fade-out to avoid abrupt audio cuts
+        audio = apply_fade(audio, output_sample_rate, fade_duration_ms=200)
+
         return audio
diff --git a/requirements.txt b/requirements.txt
index 9474624..25ab34e 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,6 +2,7 @@
 fastapi
 uvicorn[standard]
 websockets
+webrtcvad
 
 # Speech-to-Text
 faster-whisper