Is there a way to pause and resume speech recognition in webkitSpeechRecognition when audio is played by the computer? The computer right now seems to be confusing what is user input via the microphone and audio output from a wav file.

Right now I have created the below:

var speechRecognition = window.SpeechRecognition || window.webkitSpeechRecognition; 
// BUG FIX: construct through the feature-detected alias above, not the bare
// prefixed name — `new webkitSpeechRecognition()` throws a ReferenceError in
// any browser that only exposes the unprefixed SpeechRecognition API.
var recognition = new speechRecognition();
    
window.addEventListener('DOMContentLoaded', function () {
    var speakButton = document.getElementById("speak_button");
    speakButton.addEventListener('click', function () {
        // Begin listening, render the 2-minute countdown immediately,
        // then tick it once per second.
        recognition.start();
        updateCountDown();
        setInterval(updateCountDown, 1000);
    });
});


// Latest text captured from the user's microphone; read by communicateToUser().
var transcript;

recognition.addEventListener('result', function (e) {
    // Concatenate the top alternative of every result chunk into one string.
    var pieces = [];
    for (var i = 0; i < e.results.length; i++) {
        pieces.push(e.results[i][0].transcript);
    }
    transcript = pieces.join('');
    console.log(transcript);
    communicateToUser();
});


function communicateToUser() {

    var audio_age = new Audio("age_20.wav");

    // BUG FIX: dropped the `g` flag. A global regex used with .test() keeps
    // `lastIndex` between calls, so a second identical question silently
    // fails to match.
    var age_regular_expression = /(?=.*\bhow\b)(?=.*\bold\b)(?=.*\byou\b)|(?=.*\bgrab\b)(?=.*\byour\b)(?=.*\bage\b)|(?=.*\bwhat\b)(?=.*\byour\b)(?=.*\bage\b)| (?=.*\btell\b)(?=.*\byour\b)(?=.*\bage\b)|(?=.*\bshare\b)(?=.*\byour\b)(?=.*\bage\b)|(?=.*\bshare\b)(?=.*\bhow\b)(?=.*\bold\b)|(?=.*\byou\b)(?=.*\bhow\b)(?=.*\bold\b)/i;

    // If the utterance looks like an age question, answer with the clip.
    if (age_regular_expression.test(transcript)) {
        // Stop listening so the microphone does not transcribe our own audio.
        recognition.stop();
        // BUG FIX: restart only after the clip has finished playing. The
        // original called start() immediately after stop(), so recognition
        // was live again while "I am 20 years old" was still being played —
        // exactly the feedback loop described above. `{ once: true }` keeps
        // the listener from stacking up on repeated questions.
        audio_age.addEventListener('ended', function () {
            recognition.start();
        }, { once: true });
        audio_age.play(); // audio will play "I am 20 years old"
    }

}

The problem is that recognition.stop() does not take effect in time: the microphone keeps capturing the contents of audio_age.wav and converts it to text. As a result, when I speak to the computer again and ask another question, the transcript being analysed still contains the text from the previous exchange.

Any advice would be appreciated.

I was thinking of a solution but I'm not sure how to implement it:
POSSIBLE SOLUTION: stop the recognition, delay for the same number of seconds that the audio file plays for (for example, 5 seconds), and then restart the recognition once those 5 seconds have elapsed.

Thanks!

EDIT FOR CESARE:

// SPEECH RECOGNITION SET UP 

    // SpeechRecognition is still shipped under the webkit prefix in Chrome;
    // fall back to it when the unprefixed name is absent.
    var speechRecognition = window.SpeechRecognition || window.webkitSpeechRecognition; 
    // BUG FIX: construct through the feature-detected alias —
    // `new webkitSpeechRecognition()` throws where only the unprefixed
    // constructor exists.
    var recognition = new speechRecognition();

    window.addEventListener('DOMContentLoaded', function() {
    document.getElementById("speak_button").addEventListener('click', function() {
            recognition.start();
            setInterval(updateCountDown,1000);
            updateCountDown();
        });
    });

// ALL OF THE AUDIO FILES --> WILL BE PLAYED IF REGEX MATCHES TRUE
    
    // Pre-recorded reply clips; each one is played when its matching
    // regular expression (declared further down) fires on the transcript.
    const audio_name = new Audio("name_harry.wav");
    
    const audio_age = new Audio("age_20.wav");
    
    const audio_date_of_birth = new Audio("15_nov_1999.wav");
    
    const audio_occupation = new Audio("grocery_store.wav");


// ON SPEECH START --> WHEN THE MICROPHONE DETECTS SPEECH, PAUSE ANY REPLY AUDIO THAT IS STILL PLAYING
    
    recognition.onspeechstart = function () {
        console.log("SPEECH STARTED");
        // Pause whichever reply clip is currently playing. Only the first
        // non-paused clip is paused, matching the original else-if chain
        // (at most one clip plays at a time anyway).
        const clips = [audio_age, audio_name, audio_date_of_birth, audio_occupation];
        const playing = clips.find((clip) => !clip.paused);
        if (playing) {
            playing.pause();
        }
    };
    
// ON SPEECH END --> WHEN MICROPHONE INPUT STOPS, SPEECH RECOGNITION SHOULD END 

    // When the user falls silent, shut the recognizer down; the 'end'
    // listener below decides whether to spin it back up.
    recognition.onspeechend = function () {
        console.log("SPEECH ENDED");
        recognition.stop();
    };
    
// I have included this because I want the computer to continue listening to the user, but only after the audio is finished playing 

    // BUG FIX: wrap the call in an arrow function. Passing the unbound
    // method (`recognition.start`) detaches it from `recognition`, and
    // Chrome throws "Illegal invocation" when the event fires.
    // NOTE(review): this also restarts recognition after the countdown's
    // recognition.stop() — guard with a flag if a permanent stop is wanted.
    recognition.addEventListener('end', () => recognition.start());

// After audio is ended, speech recognition will start again
    
    // Resume listening once a reply clip finishes. BUG FIX: the unbound
    // `recognition.start` loses `this` when invoked as a handler (Illegal
    // invocation in Chrome), so wrap it in an arrow function.
    const resumeListening = () => recognition.start();
    audio_name.addEventListener('ended', resumeListening);
    audio_age.addEventListener('ended', resumeListening);
    audio_date_of_birth.addEventListener('ended', resumeListening);
    audio_occupation.addEventListener('ended', resumeListening);
    // BUG FIX: removed the audio_height listener — `audio_height` is never
    // declared above, so referencing it threw a ReferenceError and aborted
    // the rest of this script.
    
    
// USED TO OBTAIN THE USER TRANSCRIPT/ACTUAL SPEECH CONTENT

    // Latest recognized utterance; analysed by communicateToUser().
    var transcript;
    
    recognition.addEventListener('result', function (e) {
        // Stitch the top alternative of every result chunk into one string.
        const parts = [];
        for (let i = 0; i < e.results.length; i++) {
            parts.push(e.results[i][0].transcript);
        }
        transcript = parts.join('');
        console.log(transcript);
        communicateToUser();
    });
    
 




     // ALL OF THE REGULAR EXPRESSIONS

    // Question classifiers. Each alternative is a bundle of lookaheads that
    // requires every listed word to appear somewhere in the transcript.
    // BUG FIX: the `g` flag is removed from all four — RegExp.prototype.test
    // on a global regex advances `lastIndex` between calls, so the SECOND
    // time the user asked the same question the match silently failed.
    const name_regex = /what is your name|(?=.*\byour\b)(?=.*\bfull\b)(?=.*\bname\b)|(?=.*\btell\b)(?=.*\bme\b)(?=.*\byour\b)(?=.*\bname\b)|(?=.*\bcan\b)(?=.*\btell\b)(?=.*\bme\b)(?=.*\byour\b)(?=.*\bname\b)|(?=.*\btell\b)(?=.*\bme\b)(?=.*\byour\b)(?=.*\bfull\b)(?=.*\bname\b)|(?=.*\blet\b)(?=.*\bknow\b)(?=.*\bfull\b)(?=.*\bname\b)|(?=.*\bgrab\b)(?=.*\byour\b)(?=.*\bname\b)|(?=.*\bwhat\b)(?=.*\byour\b)(?=.*\bname\b)|(?=.*\bshare\b)(?=.*\bme\b)(?=.*\bfull\b)(?=.*\bname\b)|(?=.*\bwhat\b)(?=.*\byour\b)(?=.*\bfirst\b)(?=.*\band\b)(?=.*\blast\b)(?=.*\bname\b)/i;

const age_regex = /(?=.*\bhow\b)(?=.*\bold\b)(?=.*\byou\b)|(?=.*\bgrab\b)(?=.*\byour\b)(?=.*\bage\b)|(?=.*\bwhat\b)(?=.*\byour\b)(?=.*\bage\b)| (?=.*\btell\b)(?=.*\byour\b)(?=.*\bage\b)|(?=.*\bshare\b)(?=.*\byour\b)(?=.*\bage\b)|(?=.*\bshare\b)(?=.*\bhow\b)(?=.*\bold\b)|(?=.*\byou\b)(?=.*\bhow\b)(?=.*\bold\b)/i;

// BUG FIX: added the missing trailing semicolon.
const date_of_birth_regex = /(?=.*\bdate\b)(?=.*\bof\b)(?=.*\bbirth\b)|(?=.*\byour\b)(?=.*\bdate\b)(?=.*\bof\b)(?=.*\bbirth\b)|(?=.*\bshare\b)(?=.*\byour\b)(?=.*\bdate\b)(?=.*\bof\b)(?=.*\bbirth\b)|(?=.*\bshare\b)(?=.*\bdate\b)(?=.*\bof\b)(?=.*\bbirth\b)|(?=.*\bwhen\b)(?=.*\byou\b)(?=.*\bborn\b)|(?=.*\bwhen\b)(?=.*\byou\b)(?=.*\bborn\b)|(?=.*\bwhat\b)(?=.*\bdate\b)(?=.*\byou\b)(?=.*\bborn\b)/i;

const patient_occupation = /do you have a job|(?=.*\bdo\b)(?=.*\byou\b)(?=.*\bwork\b)|(?=.*\byou\b)(?=.*\bhave\b)(?=.*\bjob\b)|(?=.*\byou\b)(?=.*\bwork\b)(?=.*\bwhere\b)|(?=.*\banything\b)(?=.*\bfor\b)(?=.*\bwork\b)|(?=.*\byou\b)(?=.*\bwork\b)(?=.*\banywhere\b)|(?=.*\bwhat\b)(?=.*\boccupation\b)|(?=.*\byour\b)(?=.*\boccupation\b)|(?=.*\byou\b)(?=.*\boccupation\b)|(?=.*\byour\b)(?=.*\bjob\b)|(?=.*\bwhat\b)(?=.*\byour\b)(?=.*\bjob\b)|(?=.*\byou\b)(?=.*\bjob\b)|(?=.*\bjob\b)/i;

// COMMUNICATE BACK TO USER FUNCTION
 

       function communicateToUser() {
           // Play the canned reply for every question category the
           // transcript matches. Independent ifs (not else-if) so one
           // utterance may trigger several replies, as in the original.
           if (name_regex.test(transcript)) {
               audio_name.play();
           }
           if (age_regex.test(transcript)) {
               audio_age.play();
           }
           if (date_of_birth_regex.test(transcript)) {
               audio_date_of_birth.play();
           }
           // BUG FIX: the original referenced `occuptation_regex`, which is
           // never defined (ReferenceError at runtime); the constant declared
           // above is `patient_occupation`.
           if (patient_occupation.test(transcript)) {
               audio_occupation.play();
           }
       }
         

The updateCountDown function:

function updateCountDown() {

   // `time` is the remaining whole seconds, declared elsewhere in the file
   // (assumed to start at 120 for the 2-minute timer — TODO confirm).
   const minutes = Math.floor(time / 60);
   let seconds = time % 60;

   // BUG FIX: pad every single-digit value, not just 0 and 1 — the original
   // `seconds < 2` rendered e.g. "1:5" instead of "1:05".
   seconds = seconds < 10 ? '0' + seconds : seconds;

   document.getElementById("countdown").innerHTML = `${minutes}:${seconds}`;

   time--;

   time = time < 0 ? 0 : time; 

    // `'00' == 0` is loosely true, so this fires exactly when the clock
    // shows 0:00.
    if (minutes == 0 && seconds == 0) {
        document.getElementById('tableStyle').style.display = "block";
        recognition.stop(); // stop listening when the session time is up
    }

   }
There is 1 answer

Answer by Cesare Polonara (score 18) — BEST ANSWER:

EDIT:

I made a working example, https://stackblitz.com/edit/web-platform-ppcuh9?file=index.html:

let isListening = false; // use this flag to toggle the recognition
let interval; // setInterval handle for the countdown; cleared on abort
const button = document.getElementById('speak_button');

// Text-to-speech helper (project-local class; replaces the .wav replies).
const speaker = new MakeSpeechSynth({
  pitch: 0.5,
  rate: 0.8,
  language: 'en-US',
});

// Chrome still exposes the API only under the webkit prefix.
const SpeechRecognition =
  window.SpeechRecognition || window.webkitSpeechRecognition;
const recognition = new SpeechRecognition();

// One button toggles between "listening" and "stopped".
button.addEventListener('click', function () {
  if (!isListening) {
    // First click: start recognition and the 2-minute countdown.
    console.log('STARTING RECOGNITION');
    recognition.start();
    interval = setInterval(updateCountDown, 1000);
    updateCountDown();
    button.innerText = 'Stop Recognition';
    isListening = true;
    return;
  }
  // Second click: tear everything down.
  console.log('ABORTING RECOGNITION');
  isListening = false;
  recognition.abort();
  clearInterval(interval);
  button.innerText = 'Click Me To Speak';
});

// Lifecycle logging; 'end' also auto-reconnects while the toggle is on,
// which is what keeps recognition alive across Chrome's idle timeouts.
recognition.onaudiostart = function () {
  console.log('RECOGNITION STARTED');
};

recognition.onaudioend = function () {
  console.log('RECOGNITION FINISHED');
};

recognition.onend = function () {
  console.log('RECOGNITION DISCONNECTED');
  if (isListening) {
    recognition.start();
  }
};

recognition.onspeechstart = function () {
  console.log('SPEECH STARTED');
  // You can stop the bot speaking if you want when you speak over him:
  // Comment if you want him to keep speaking

  //Object.values(data).forEach((d) => d.audio.pause());
  if (speaker.isSpeaking) {
    speaker.cancel();
  }
};

recognition.onspeechend = function () {
  console.log('SPEECH ENDED');
};

recognition.addEventListener('result', function (e) {
  // Join the top alternative of every result chunk into one utterance.
  let transcript = '';
  for (let i = 0; i < e.results.length; i += 1) {
    transcript += e.results[i][0].transcript;
  }
  console.log(transcript);
  speakBackToMe(transcript);
});

// Speak the canned answer for every entry in `data` whose regex matches
// the recognized utterance.
function speakBackToMe(str) {
  for (const d of Object.values(data)) {
    if (!d.regex.test(str)) continue;
    // d.audio.play();
    speaker.speak(d.message);
    console.log(d.message);
  }
}

// UPDATE COUNTDOWN
const startingMinutes = 2;
let time = startingMinutes * 60; // remaining whole seconds

// Render the mm:ss countdown once per tick and reveal #tableStyle at 0:00.
function updateCountDown() {
  const minutes = Math.floor(time / 60);
  let seconds = time % 60;
  // BUG FIX: pad all single-digit seconds — the original `seconds < 2`
  // displayed e.g. "1:5" instead of "1:05".
  seconds = seconds < 10 ? '0' + seconds : seconds;
  document.getElementById('countdown').innerHTML = `${minutes}:${seconds}`;
  time--;
  time = time < 0 ? 0 : time;

  // `'00' == 0` is loosely true, so this fires exactly at 0:00.
  if (minutes == 0 && seconds == 0) {
    document.getElementById('tableStyle').style.display = 'table-cell';
  }
}
<div id="app"></div>
<button id="speak_button">Click Me to Speak</button>
<p id="countdown"></p>