Using webrtcvad to capture audio when the user starts speaking and stops speaking (like Siri) and then saving to a .wav file

36 views Asked by helpme At 18 February 2024 at 02:03

I'm trying to make a "Siri-like" audio capture using Python. I would like the system to detect speech to begin recording, and then detect silence to stop recording. I would like this audio to be saved as a playable audio file on my computer (right now I am trying with .wav, but I'm not tied to that format). I have created a class called VoiceActivityDetector using webrtcvad (Google's VAD package).

The code is able to detect speech, detect silence, and then stop after the silence has been detected; however, when it saves the .wav file, this file is just a silent file. The audio has not actually been recorded.

I've been troubleshooting this for a long time, but can't seem to find the answer - this audio stuff is tricky...

import numpy as np
import sounddevice as sd
from scipy.io.wavfile import write
import tempfile
import os
import webrtcvad
import threading

class VoiceActivityDetector:
    """Record a single utterance from the microphone using WebRTC VAD.

    Recording starts when the VAD first reports speech and stops once
    ``padding_duration_ms`` worth of consecutive non-speech frames has been
    seen after that.

    Args:
        frame_duration_ms: Length of each audio frame handed to the VAD.
            webrtcvad only accepts 10, 20 or 30 ms frames.
        padding_duration_ms: How much trailing silence ends the recording;
            the same number of frames is kept from just before speech began.
        vad_aggressiveness: webrtcvad aggressiveness mode (0-3).
    """

    def __init__(self, frame_duration_ms=30, padding_duration_ms=300, vad_aggressiveness=3):
        self.vad = webrtcvad.Vad(vad_aggressiveness)
        self.frame_duration_ms = frame_duration_ms
        self.padding_duration_ms = padding_duration_ms
        self.original_num_padding_frames = padding_duration_ms // frame_duration_ms
        self.num_padding_frames = self.original_num_padding_frames
        self.frames = []
        self.triggered = False
        self.has_spoken = False

    def process_frame(self, frame, sample_rate=16000):
        """Feed one frame of 16-bit mono PCM to the detector.

        Args:
            frame: Bytes-like object holding exactly one frame of audio.
            sample_rate: Sample rate of ``frame`` in Hz.

        Returns:
            ``True`` while recording should continue, ``False`` once enough
            trailing silence has been observed after speech.
        """
        # Take an owned copy. The buffer handed to a raw stream callback is
        # only valid for the duration of that callback; keeping a reference
        # to it (as opposed to its bytes) is what produced silent recordings.
        frame = bytes(frame)
        is_speech = self.vad.is_speech(frame, sample_rate)
        self.frames.append(frame)

        if not self.triggered:
            if is_speech:
                self.triggered = True
                self.has_spoken = True
                # Keep only the last few frames. Always keep at least the
                # current one: a slice of [-0:] would keep the whole buffer.
                keep = max(self.original_num_padding_frames, 1)
                self.frames = self.frames[-keep:]
                print("speech detected")
            return True

        if is_speech:
            # Speech resumed: restart the silence countdown.
            self.num_padding_frames = self.original_num_padding_frames
        elif self.has_spoken:
            self.num_padding_frames -= 1
            print(f'Silence countdown: {self.num_padding_frames}')
            # "<=" rather than "==" so a zero-length padding window (or a
            # countdown that already passed zero) still ends the recording.
            if self.num_padding_frames <= 0:
                print('silence detected...')
                return False  # Stop recording
        return True

    def record(self, fs=16000):
        """Block until one utterance has been captured from the default input.

        Args:
            fs: Sample rate in Hz used for both the input stream and the VAD.

        Returns:
            A tuple ``(samples, fs)`` where ``samples`` is a 1-D ``int16``
            NumPy array built from the collected frames.
        """
        self.stop_recording = threading.Event()  # Use an event to signal stopping

        # Start from a clean state so the same detector can record again.
        self.frames = []
        self.triggered = False
        self.has_spoken = False
        self.num_padding_frames = self.original_num_padding_frames

        def callback(indata, frames, time, status):
            if status:
                print(status)
            if self.stop_recording.is_set():
                # The stream keeps delivering blocks until the ``with`` exits;
                # ignore anything that arrives after the stop decision.
                return
            if not self.process_frame(indata, fs):
                self.stop_recording.set()  # Signal to stop recording

        blocksize = int(fs * self.frame_duration_ms / 1000)
        with sd.RawInputStream(samplerate=fs, blocksize=blocksize, channels=1, dtype='int16', callback=callback):
            while not self.stop_recording.is_set():  # Wait until stop signal is received
                sd.sleep(100)  # Short sleep to remain responsive
        return np.frombuffer(b''.join(self.frames), dtype=np.int16), fs

def get_user_response():
    """Capture one spoken response and write it to ``temporary.wav``.

    Prompts on stdout, records through a fresh ``VoiceActivityDetector``
    and saves the returned samples as a WAV file in the current working
    directory.
    """
    detector = VoiceActivityDetector()
    print("Please start speaking, when finished, pause and the recording will end...")
    samples, sample_rate = detector.record()

    print('Saving the recording...')
    wav_path = "temporary.wav"
    # scipy's writer takes (filename, rate, data) in that order.
    write(wav_path, sample_rate, samples)
    print(f"Recording saved as {wav_path}")


# Run a single capture-and-save cycle when executed as a script.
if __name__ == "__main__":
    get_user_response()

The code above, with the correct imports, runs without error, but the saved audio file is blank.

Original Q&A

TechQA.

Using webrtcvad to capture audio when the user starts speaking and stops speaking (like Siri) and then saving to a .wav file

There are 0 answers

Related Questions in PYTHON

Related Questions in AUDIO

Related Questions in WAV

Related Questions in VOICE-RECOGNITION

Popular Questions

Trending Questions