Add voice activation (VAD) and audio fade
- Add webrtcvad dependency for real-time voice activity detection
- Create audio/fade.py with fade-in/fade-out utility
- Add VAD voice activation to client recording (sends audio only during speech)
- Apply 200ms fade-in/fade-out to TTS output to avoid abrupt audio cuts
- Fix tts.py indentation error in except block
This commit is contained in:
34
audio/fade.py
Normal file
34
audio/fade.py
Normal file
@@ -0,0 +1,34 @@
|
|||||||
|
import numpy as np
|
||||||
|
|
||||||
|
|
||||||
|
def apply_fade(
    audio: np.ndarray,
    sample_rate: int = 24000,
    fade_duration_ms: int = 300,
) -> np.ndarray:
    """Apply a linear fade-in and fade-out to an audio array.

    The fade is applied along the first axis, so both 1-D mono audio and
    arrays shaped ``(samples, channels)`` are supported. The input array is
    never modified; a faded copy with the same dtype is returned.

    Args:
        audio: numpy array of audio samples (first axis indexes samples).
        sample_rate: audio sample rate in Hz.
        fade_duration_ms: fade duration in milliseconds. Each fade is capped
            at a quarter of the audio length so the two ramps never overlap.

    Returns:
        Audio array with fade applied. When the audio is empty, or the
        effective fade length is zero samples, the input is returned
        unchanged (same object).
    """
    if len(audio) == 0:
        return audio

    fade_samples = int(sample_rate * fade_duration_ms / 1000)
    # Cap each ramp at a quarter of the clip so short clips keep an
    # un-attenuated middle section.
    fade_samples = min(fade_samples, len(audio) // 4)

    if fade_samples <= 0:
        return audio

    # Work on a copy so the caller's buffer is left untouched.
    faded = np.array(audio, copy=True)

    # Shape the ramps as (fade_samples, 1, ..., 1) so they broadcast over any
    # trailing (e.g. channel) axes.
    ramp_shape = (fade_samples,) + (1,) * (faded.ndim - 1)
    fade_in = np.linspace(0, 1, fade_samples).reshape(ramp_shape)
    fade_out = np.linspace(1, 0, fade_samples).reshape(ramp_shape)

    # Compute in floating point, then assign back: assignment casts to the
    # array's dtype, whereas an in-place ``*=`` raises a casting error for
    # integer PCM data.
    head = faded[:fade_samples] * fade_in
    tail = faded[-fade_samples:] * fade_out

    if np.issubdtype(faded.dtype, np.integer):
        # Round to nearest instead of truncating toward zero on the cast.
        head = np.rint(head)
        tail = np.rint(tail)

    faded[:fade_samples] = head
    faded[-fade_samples:] = tail

    return faded
|
||||||
42
client.py
42
client.py
@@ -3,6 +3,7 @@ import websockets
|
|||||||
import struct
|
import struct
|
||||||
import wave
|
import wave
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
import webrtcvad
|
||||||
|
|
||||||
# WebSocket URL
|
# WebSocket URL
|
||||||
WS_URL = "ws://localhost:8000/ws"
|
WS_URL = "ws://localhost:8000/ws"
|
||||||
@@ -56,23 +57,41 @@ async def receive_messages(ws):
|
|||||||
|
|
||||||
|
|
||||||
async def record_and_send():
|
async def record_and_send():
|
||||||
"""Record audio from microphone and send"""
|
"""Record audio from microphone and send with VAD voice activation"""
|
||||||
import pyaudio
|
import pyaudio
|
||||||
|
|
||||||
CHUNK = 1024
|
CHUNK = 1024
|
||||||
FORMAT = pyaudio.paInt16
|
FORMAT = pyaudio.paInt16
|
||||||
CHANNELS = 1
|
CHANNELS = 1
|
||||||
RATE = 16000
|
RATE = 16000
|
||||||
|
VAD_MODE = 3
|
||||||
|
SILENCE_THRESHOLD = 5 # consecutive silent chunks to stop sending
|
||||||
|
|
||||||
|
vad = webrtcvad.Vad()
|
||||||
|
vad.set_mode(VAD_MODE)
|
||||||
|
|
||||||
p = pyaudio.PyAudio()
|
p = pyaudio.PyAudio()
|
||||||
stream = p.open(format=FORMAT, channels=CHANNELS, rate=RATE, input=True, frames_per_buffer=CHUNK)
|
stream = p.open(format=FORMAT, channels=CHANNELS, rate=RATE, input=True, frames_per_buffer=CHUNK)
|
||||||
|
|
||||||
async with websockets.connect(WS_URL) as ws:
|
async with websockets.connect(WS_URL) as ws:
|
||||||
print("Recording... Press Ctrl+C to stop")
|
print("Recording (VAD active)... Press Ctrl+C to stop")
|
||||||
|
speech_active = False
|
||||||
|
silence_count = 0
|
||||||
try:
|
try:
|
||||||
while True:
|
while True:
|
||||||
data = stream.read(CHUNK)
|
data = stream.read(CHUNK)
|
||||||
|
is_speech = vad.is_speech(data, RATE)
|
||||||
|
|
||||||
|
if is_speech:
|
||||||
|
speech_active = True
|
||||||
|
silence_count = 0
|
||||||
await send_audio(ws, data)
|
await send_audio(ws, data)
|
||||||
|
elif speech_active:
|
||||||
|
silence_count += 1
|
||||||
|
if silence_count < SILENCE_THRESHOLD:
|
||||||
|
await send_audio(ws, data)
|
||||||
|
else:
|
||||||
|
speech_active = False
|
||||||
except KeyboardInterrupt:
|
except KeyboardInterrupt:
|
||||||
print("\nStopped recording")
|
print("\nStopped recording")
|
||||||
finally:
|
finally:
|
||||||
@@ -131,15 +150,32 @@ async def client():
|
|||||||
choice = input("Choice (1/2): ")
|
choice = input("Choice (1/2): ")
|
||||||
|
|
||||||
if choice == "1":
|
if choice == "1":
|
||||||
|
import webrtcvad
|
||||||
|
vad = webrtcvad.Vad()
|
||||||
|
vad.set_mode(3)
|
||||||
|
SILENCE_THRESHOLD = 5
|
||||||
|
|
||||||
p = pyaudio.PyAudio()
|
p = pyaudio.PyAudio()
|
||||||
stream = p.open(format=pyaudio.paInt16, channels=1, rate=16000, input=True, frames_per_buffer=1024)
|
stream = p.open(format=pyaudio.paInt16, channels=1, rate=16000, input=True, frames_per_buffer=1024)
|
||||||
async with websockets.connect(WS_URL) as ws:
|
async with websockets.connect(WS_URL) as ws:
|
||||||
print("Recording... Press Ctrl+C to stop")
|
print("Recording (VAD active)... Press Ctrl+C to stop")
|
||||||
|
speech_active = False
|
||||||
|
silence_count = 0
|
||||||
try:
|
try:
|
||||||
receive_task = asyncio.create_task(receive_messages(ws))
|
receive_task = asyncio.create_task(receive_messages(ws))
|
||||||
while True:
|
while True:
|
||||||
data = stream.read(1024)
|
data = stream.read(1024)
|
||||||
|
is_speech = vad.is_speech(data, 16000)
|
||||||
|
if is_speech:
|
||||||
|
speech_active = True
|
||||||
|
silence_count = 0
|
||||||
await ws.send(b"A" + data)
|
await ws.send(b"A" + data)
|
||||||
|
elif speech_active:
|
||||||
|
silence_count += 1
|
||||||
|
if silence_count < SILENCE_THRESHOLD:
|
||||||
|
await ws.send(b"A" + data)
|
||||||
|
else:
|
||||||
|
speech_active = False
|
||||||
except KeyboardInterrupt:
|
except KeyboardInterrupt:
|
||||||
receive_task.cancel()
|
receive_task.cancel()
|
||||||
finally:
|
finally:
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
from transformers import pipeline
|
from transformers import pipeline
|
||||||
from config import Config
|
from config import Config
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
from audio.fade import apply_fade
|
||||||
|
|
||||||
|
|
||||||
class TTSEngine:
|
class TTSEngine:
|
||||||
@@ -50,4 +51,7 @@ class TTSEngine:
|
|||||||
|
|
||||||
audio = audio.astype(np.float32)
|
audio = audio.astype(np.float32)
|
||||||
|
|
||||||
|
# Apply fade-out to avoid abrupt audio cuts
|
||||||
|
audio = apply_fade(audio, output_sample_rate, fade_duration_ms=200)
|
||||||
|
|
||||||
return audio
|
return audio
|
||||||
|
|||||||
@@ -2,6 +2,7 @@
|
|||||||
fastapi
|
fastapi
|
||||||
uvicorn[standard]
|
uvicorn[standard]
|
||||||
websockets
|
websockets
|
||||||
|
webrtcvad
|
||||||
|
|
||||||
# Speech-to-Text
|
# Speech-to-Text
|
||||||
faster-whisper
|
faster-whisper
|
||||||
|
|||||||
Reference in New Issue
Block a user