From e2d3cbe78348e074a1334d68847a782ac5fcae1f Mon Sep 17 00:00:00 2001
From: noturum <noturum@yandex.ru>
Date: Fri, 1 May 2026 13:14:31 +0000
Subject: [PATCH] Add voice activation (VAD) and audio fade

- Add webrtcvad dependency for real-time voice activity detection
- Create audio/fade.py with fade-in/fade-out utility
- Add VAD voice activation to client recording (sends audio only during speech)
- Apply 200ms fade-in/fade-out to TTS output to avoid abrupt audio starts and cuts
- Fix tts.py indentation error in except block
---
 audio/fade.py    | 34 ++++++++++++++++++++++++++++++++++
 client.py        | 46 +++++++++++++++++++++++++++++++++++++++++-----
 engine/tts.py    | 14 +++++++++-----
 requirements.txt |  1 +
 4 files changed, 85 insertions(+), 10 deletions(-)
 create mode 100644 audio/fade.py

diff --git a/audio/fade.py b/audio/fade.py
new file mode 100644
index 0000000..16e0bce
--- /dev/null
+++ b/audio/fade.py
@@ -0,0 +1,34 @@
+import numpy as np
+
+
+def apply_fade(audio: np.ndarray, sample_rate: int = 24000, fade_duration_ms: int = 300) -> np.ndarray:
+    """Return a copy of ``audio`` with a linear fade-in and fade-out applied.
+
+    The input array is never modified. Non-float input (e.g. int16 PCM) is
+    promoted to float32 so gains are not truncated. Axis 0 is treated as time.
+
+    Args:
+        audio: numpy array of audio samples (1-D, or samples-first N-D)
+        sample_rate: audio sample rate in Hz
+        fade_duration_ms: length of each ramp in milliseconds; capped at a
+            quarter of the signal so the two ramps cannot overlap
+
+    Returns:
+        New array with the fade applied, or ``audio`` itself if no ramp fits
+    """
+    if len(audio) == 0:
+        return audio
+
+    fade_samples = min(int(sample_rate * fade_duration_ms / 1000), len(audio) // 4)
+    if fade_samples <= 0:
+        return audio
+
+    # Work on a copy so the caller's buffer is left untouched.
+    is_float = np.issubdtype(audio.dtype, np.inexact)
+    faded = audio.astype(audio.dtype if is_float else np.float32)
+
+    # Shape the ramps so they broadcast over any trailing (channel) axes.
+    ramp_shape = (-1,) + (1,) * (faded.ndim - 1)
+    faded[:fade_samples] *= np.linspace(0, 1, fade_samples).reshape(ramp_shape)
+    faded[-fade_samples:] *= np.linspace(1, 0, fade_samples).reshape(ramp_shape)
+    return faded
diff --git a/client.py b/client.py
index d140fae..fe6e95a 100644
--- a/client.py
+++ b/client.py
@@ -3,6 +3,7 @@ import websockets
 import struct
 import wave
 import numpy as np
+import webrtcvad
 
 # WebSocket URL
 WS_URL = "ws://localhost:8000/ws"
@@ -56,23 +57,53 @@ async def receive_messages(ws):
 
 
 async def record_and_send():
-    """Record audio from microphone and send"""
+    """Record microphone audio and send it only while speech is detected.
+
+    Each CHUNK is classified with webrtcvad. Chunks are sent while speech is
+    active and until SILENCE_THRESHOLD consecutive silent chunks are seen.
+    """
     import pyaudio
 
     CHUNK = 1024
     FORMAT = pyaudio.paInt16
     CHANNELS = 1
     RATE = 16000
+    VAD_MODE = 3
+    SILENCE_THRESHOLD = 5  # consecutive silent chunks to stop sending
+    # webrtcvad only accepts 10, 20 or 30 ms frames of 16-bit mono PCM, so each
+    # CHUNK (1024 samples = 64 ms) is classified in 30 ms sub-frames.
+    VAD_FRAME_BYTES = RATE * 30 // 1000 * 2
+
+    vad = webrtcvad.Vad()
+    vad.set_mode(VAD_MODE)
 
     p = pyaudio.PyAudio()
     stream = p.open(format=FORMAT, channels=CHANNELS, rate=RATE, input=True, frames_per_buffer=CHUNK)
 
     async with websockets.connect(WS_URL) as ws:
-        print("Recording... Press Ctrl+C to stop")
+        print("Recording (VAD active)... Press Ctrl+C to stop")
+        speech_active = False
+        silence_count = 0
         try:
             while True:
                 data = stream.read(CHUNK)
-                await send_audio(ws, data)
+                # Speech if any complete sub-frame is voiced; the trailing
+                # partial sub-frame is skipped for detection only.
+                is_speech = any(
+                    vad.is_speech(data[i:i + VAD_FRAME_BYTES], RATE)
+                    for i in range(0, len(data) - VAD_FRAME_BYTES + 1, VAD_FRAME_BYTES)
+                )
+
+                if is_speech:
+                    speech_active = True
+                    silence_count = 0
+                    await send_audio(ws, data)
+                elif speech_active:
+                    silence_count += 1
+                    if silence_count < SILENCE_THRESHOLD:
+                        await send_audio(ws, data)
+                    else:
+                        speech_active = False
         except KeyboardInterrupt:
             print("\nStopped recording")
         finally:
@@ -131,15 +162,38 @@ async def client():
     choice = input("Choice (1/2): ")
 
     if choice == "1":
+        vad = webrtcvad.Vad()
+        vad.set_mode(3)
+        SILENCE_THRESHOLD = 5
+        # webrtcvad only accepts 10, 20 or 30 ms frames of 16-bit mono PCM, so
+        # each 1024-sample read (64 ms) is classified in 30 ms sub-frames.
+        VAD_FRAME_BYTES = 16000 * 30 // 1000 * 2
+
         p = pyaudio.PyAudio()
         stream = p.open(format=pyaudio.paInt16, channels=1, rate=16000, input=True, frames_per_buffer=1024)
         async with websockets.connect(WS_URL) as ws:
-            print("Recording... Press Ctrl+C to stop")
+            print("Recording (VAD active)... Press Ctrl+C to stop")
+            speech_active = False
+            silence_count = 0
             try:
                 receive_task = asyncio.create_task(receive_messages(ws))
                 while True:
                     data = stream.read(1024)
-                    await ws.send(b"A" + data)
+                    # Speech if any complete sub-frame is voiced.
+                    is_speech = any(
+                        vad.is_speech(data[i:i + VAD_FRAME_BYTES], 16000)
+                        for i in range(0, len(data) - VAD_FRAME_BYTES + 1, VAD_FRAME_BYTES)
+                    )
+                    if is_speech:
+                        speech_active = True
+                        silence_count = 0
+                        await ws.send(b"A" + data)
+                    elif speech_active:
+                        silence_count += 1
+                        if silence_count < SILENCE_THRESHOLD:
+                            await ws.send(b"A" + data)
+                        else:
+                            speech_active = False
             except KeyboardInterrupt:
                 receive_task.cancel()
             finally:
diff --git a/engine/tts.py b/engine/tts.py
index a8019dd..98795d0 100644
--- a/engine/tts.py
+++ b/engine/tts.py
@@ -1,6 +1,7 @@
 from transformers import pipeline
 from config import Config
 import numpy as np
+from audio.fade import apply_fade
 
 
 class TTSEngine:
@@ -16,11 +17,11 @@ class TTSEngine:
                 device=0 if __import__("torch").cuda.is_available() else -1,
             )
         except Exception:
-       self.tts_pipeline = pipeline(
-            "text-to-speech",
-            model=self._tts_model,
-            device=-1,
-        )
+            self.tts_pipeline = pipeline(
+                "text-to-speech",
+                model=self._tts_model,
+                device=-1,
+            )
         self.tts_pipeline.start()
 
     def synthesize(self, text: str, output_sample_rate: int = 24000) -> np.ndarray:
@@ -50,4 +51,7 @@ class TTSEngine:
         
         audio = audio.astype(np.float32)
 
+        # Apply fade-in/fade-out to avoid abrupt audio starts and cuts
+        audio = apply_fade(audio, output_sample_rate, fade_duration_ms=200)
+
         return audio
diff --git a/requirements.txt b/requirements.txt
index 9474624..25ab34e 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,6 +2,7 @@
 fastapi
 uvicorn[standard]
 websockets
+webrtcvad
 
 # Speech-to-Text
 faster-whisper