Is there a way to pause and resume speech recognition with webkitSpeechRecognition while the computer is playing audio? At the moment the recognizer cannot tell the user's microphone input apart from the audio output of a .wav file that the page plays.
This is what I have so far:
// Resolve the recognizer constructor once: the unprefixed name where the
// browser provides it, the webkit-prefixed one otherwise.
var speechRecognition = window.SpeechRecognition || window.webkitSpeechRecognition;
// Construct through the resolved constructor so the fallback above is actually used.
var recognition = new speechRecognition();
window.addEventListener('DOMContentLoaded', function() {
  // Id of the countdown interval; stays null until the first click starts it.
  let countdownIntervalId = null;
  document.getElementById("speak_button").addEventListener('click', function() {
    try {
      recognition.start();
    } catch (err) {
      // start() throws InvalidStateError while a session is already running;
      // a repeated click is harmless, anything else is a real failure.
      if (err.name !== 'InvalidStateError') {
        throw err;
      }
    }
    if (countdownIntervalId !== null) {
      // The countdown is already ticking; a second interval would make it
      // run twice as fast.
      return;
    }
    // Show the starting value right away, then tick once per second.
    updateCountDown();
    countdownIntervalId = setInterval(updateCountDown, 1000);
  });
});
// Text of what the user said; written by the result listener below and read
// by communicateToUser().
var transcript;
recognition.addEventListener('result', function handleResult(event) {
  // Take the first alternative of every result and concatenate their
  // transcripts with no separator.
  transcript = Array.from(event.results, (result) => result[0].transcript).join('');
  console.log(transcript);
  communicateToUser();
});
/**
 * Plays the "age" answer clip when the current `transcript` asks about age.
 *
 * Recognition is stopped before the clip starts and is started again only
 * once the clip fires `ended`, so the recognizer is not listening while the
 * page's own audio is playing.
 *
 * Reads the outer `transcript` and `recognition` variables; returns nothing.
 */
function communicateToUser() {
  // Every alternative is a group of lookaheads, so it matches when all of its
  // words occur somewhere in the transcript, in any order. No `g` flag:
  // test() only needs a yes/no answer and `g` would make it stateful.
  const age_regular_expression = /(?=.*\bhow\b)(?=.*\bold\b)(?=.*\byou\b)|(?=.*\bgrab\b)(?=.*\byour\b)(?=.*\bage\b)|(?=.*\bwhat\b)(?=.*\byour\b)(?=.*\bage\b)|(?=.*\btell\b)(?=.*\byour\b)(?=.*\bage\b)|(?=.*\bshare\b)(?=.*\byour\b)(?=.*\bage\b)|(?=.*\bshare\b)(?=.*\bhow\b)(?=.*\bold\b)|(?=.*\byou\b)(?=.*\bhow\b)(?=.*\bold\b)/i;
  if (!age_regular_expression.test(transcript)) {
    return;
  }

  const audio_age = new Audio("age_20.wav");

  // Restart recognition, tolerating the case where a session is already running.
  const resumeListening = () => {
    try {
      recognition.start();
    } catch (err) {
      if (err.name !== 'InvalidStateError') {
        throw err;
      }
    }
  };

  // Resume only after the clip has finished, not right after play() returns:
  // play() merely begins playback.
  audio_age.addEventListener('ended', resumeListening, { once: true });

  recognition.stop();
  const playback = audio_age.play();
  if (playback !== undefined) {
    // If playback is rejected, `ended` never fires; resume listening here so
    // the recognizer is not left stopped.
    playback.catch((err) => {
      console.error('Could not play age_20.wav', err);
      resumeListening();
    });
  }
}
The problem is that calling recognition.stop() doesn't seem to work: the microphone keeps capturing the contents of audio_age (age_20.wav) while it plays and converts it to text. So when I speak to the computer again and ask it another question, the transcript that gets analysed still includes the earlier transcript from before.
Any advice would be appreciated.
I was thinking of a solution, but I'm not sure how to implement it:
SOLUTION:
Stop recognition, wait for as many seconds as the audio file plays for (for example 5 seconds), and only start recognition again once those 5 seconds have passed?
Thanks!
EDIT FOR CESARE:
// SPEECH RECOGNITION SET UP
// Use the unprefixed constructor when the browser has it and fall back to the
// webkit-prefixed one; the instance is built from whichever was found.
var speechRecognition = window.SpeechRecognition || window.webkitSpeechRecognition;
var recognition = new speechRecognition();
window.addEventListener('DOMContentLoaded', function() {
  // Set on the first click so later clicks do not stack extra intervals.
  let countdownIntervalId = null;

  function handleSpeakClick() {
    try {
      recognition.start();
    } catch (err) {
      // A session that is already running makes start() throw
      // InvalidStateError; ignore that case and surface everything else.
      if (err.name !== 'InvalidStateError') {
        throw err;
      }
    }
    if (countdownIntervalId === null) {
      // First tick immediately, then one tick per second.
      updateCountDown();
      countdownIntervalId = setInterval(updateCountDown, 1000);
    }
  }

  document.getElementById("speak_button").addEventListener('click', handleSpeakClick);
});
// Response clips, one per question type. communicateToUser() plays a clip
// when the matching regular expression accepts the transcript.
const [audio_name, audio_age, audio_date_of_birth, audio_occupation] = [
  "name_harry.wav",
  "age_20.wav",
  "15_nov_1999.wav",
  "grocery_store.wav",
].map((source) => new Audio(source));
// ON SPEECH START --> when the recognizer reports that speech has begun,
// pause every response clip that is currently playing.
recognition.onspeechstart = () => {
  console.log("SPEECH STARTED");
  // communicateToUser() can start more than one clip for a single transcript,
  // so check all of them instead of stopping at the first one that is playing.
  for (const clip of [audio_age, audio_name, audio_date_of_birth, audio_occupation]) {
    if (!clip.paused) {
      clip.pause();
    }
  }
};
// ON SPEECH END --> once the recognizer reports that speech has stopped,
// log it and stop the current recognition session.
recognition.onspeechend = function handleSpeechEnd() {
  console.log("SPEECH ENDED");
  recognition.stop();
};
// Keep listening after each recognition session, but never while a response
// clip is playing -- otherwise the recognizer would be listening to the
// page's own audio.
const responseClips = [audio_name, audio_age, audio_date_of_birth, audio_occupation];
// audio_height is not declared alongside the other clips; wire it up only when
// it exists so a missing declaration cannot abort the rest of the script.
if (typeof audio_height !== 'undefined') {
  responseClips.push(audio_height);
}

// Start recognition through the recognition object itself. Passing
// `recognition.start` directly as a listener on an audio element would call it
// with the audio element as `this`.
const startListening = () => {
  try {
    recognition.start();
  } catch (err) {
    // Already listening: nothing to do. Anything else is a real failure.
    if (err.name !== 'InvalidStateError') {
      throw err;
    }
  }
};

// When a recognition session ends, restart it right away only if no clip is playing.
recognition.addEventListener('end', () => {
  const isClipPlaying = responseClips.some((clip) => !clip.paused && !clip.ended);
  if (!isClipPlaying) {
    startListening();
  }
});

// After a clip has finished playing, speech recognition starts again.
for (const clip of responseClips) {
  clip.addEventListener('ended', startListening);
}
// Holds the user's recognized speech. Every `result` event rebuilds it from
// the event's results and then hands over to communicateToUser().
var transcript;
recognition.addEventListener('result', ({ results }) => {
  const pieces = [];
  for (const result of Array.from(results)) {
    // Index 0 is the first alternative of this result.
    pieces.push(result[0].transcript);
  }
  transcript = pieces.join('');
  console.log(transcript);
  communicateToUser();
});
// ALL OF THE REGULAR EXPRESSIONS
// Each alternative is either a literal phrase or a group of lookaheads that
// matches when all of its words occur somewhere in the text, in any order.
// The patterns are case-insensitive and deliberately NOT global: with the `g`
// flag, test() resumes from the previous match position, so asking the same
// literal question twice in a row would fail the second time.
const name_regex = /what is your name|(?=.*\byour\b)(?=.*\bfull\b)(?=.*\bname\b)|(?=.*\btell\b)(?=.*\bme\b)(?=.*\byour\b)(?=.*\bname\b)|(?=.*\bcan\b)(?=.*\btell\b)(?=.*\bme\b)(?=.*\byour\b)(?=.*\bname\b)|(?=.*\btell\b)(?=.*\bme\b)(?=.*\byour\b)(?=.*\bfull\b)(?=.*\bname\b)|(?=.*\blet\b)(?=.*\bknow\b)(?=.*\bfull\b)(?=.*\bname\b)|(?=.*\bgrab\b)(?=.*\byour\b)(?=.*\bname\b)|(?=.*\bwhat\b)(?=.*\byour\b)(?=.*\bname\b)|(?=.*\bshare\b)(?=.*\bme\b)(?=.*\bfull\b)(?=.*\bname\b)|(?=.*\bwhat\b)(?=.*\byour\b)(?=.*\bfirst\b)(?=.*\band\b)(?=.*\blast\b)(?=.*\bname\b)/i;
// The "tell ... your ... age" alternative no longer starts with a literal
// space, which had required a space to precede the word "tell".
const age_regex = /(?=.*\bhow\b)(?=.*\bold\b)(?=.*\byou\b)|(?=.*\bgrab\b)(?=.*\byour\b)(?=.*\bage\b)|(?=.*\bwhat\b)(?=.*\byour\b)(?=.*\bage\b)|(?=.*\btell\b)(?=.*\byour\b)(?=.*\bage\b)|(?=.*\bshare\b)(?=.*\byour\b)(?=.*\bage\b)|(?=.*\bshare\b)(?=.*\bhow\b)(?=.*\bold\b)|(?=.*\byou\b)(?=.*\bhow\b)(?=.*\bold\b)/i;
const date_of_birth_regex = /(?=.*\bdate\b)(?=.*\bof\b)(?=.*\bbirth\b)|(?=.*\byour\b)(?=.*\bdate\b)(?=.*\bof\b)(?=.*\bbirth\b)|(?=.*\bshare\b)(?=.*\byour\b)(?=.*\bdate\b)(?=.*\bof\b)(?=.*\bbirth\b)|(?=.*\bshare\b)(?=.*\bdate\b)(?=.*\bof\b)(?=.*\bbirth\b)|(?=.*\bwhen\b)(?=.*\byou\b)(?=.*\bborn\b)|(?=.*\bwhen\b)(?=.*\byou\b)(?=.*\bborn\b)|(?=.*\bwhat\b)(?=.*\bdate\b)(?=.*\byou\b)(?=.*\bborn\b)/i;
const patient_occupation = /do you have a job|(?=.*\bdo\b)(?=.*\byou\b)(?=.*\bwork\b)|(?=.*\byou\b)(?=.*\bhave\b)(?=.*\bjob\b)|(?=.*\byou\b)(?=.*\bwork\b)(?=.*\bwhere\b)|(?=.*\banything\b)(?=.*\bfor\b)(?=.*\bwork\b)|(?=.*\byou\b)(?=.*\bwork\b)(?=.*\banywhere\b)|(?=.*\bwhat\b)(?=.*\boccupation\b)|(?=.*\byour\b)(?=.*\boccupation\b)|(?=.*\byou\b)(?=.*\boccupation\b)|(?=.*\byour\b)(?=.*\bjob\b)|(?=.*\bwhat\b)(?=.*\byour\b)(?=.*\bjob\b)|(?=.*\byou\b)(?=.*\bjob\b)|(?=.*\bjob\b)/i;
// COMMUNICATE BACK TO USER FUNCTION
/**
 * Plays the response clip for every question pattern that matches the
 * current `transcript`.
 *
 * Reads the outer `transcript`, the four question regexes and the four audio
 * clips; returns nothing.
 */
function communicateToUser() {
  const responses = [
    [name_regex, audio_name],
    [age_regex, audio_age],
    [date_of_birth_regex, audio_date_of_birth],
    // The occupation pattern is declared under the name `patient_occupation`.
    [patient_occupation, audio_occupation],
  ];

  for (const [pattern, clip] of responses) {
    // A regex with the `g` flag resumes test() from where its previous match
    // ended; rewind it so every transcript is checked from the beginning.
    pattern.lastIndex = 0;
    if (!pattern.test(transcript)) {
      continue;
    }
    const playback = clip.play();
    if (playback !== undefined) {
      // play() can reject; report it instead of leaving the promise
      // unhandled.
      playback.catch((err) => {
        console.error('Could not play response clip', err);
      });
    }
  }
}
The updateCountDown function:
/**
 * Renders the remaining time as `m:ss` into the #countdown element, then
 * decrements the outer `time` counter (read here as a number of seconds),
 * never letting it drop below 0.
 *
 * When the rendered value is 0:00, the #tableStyle element is shown and
 * speech recognition is stopped.
 */
function updateCountDown() {
  const remaining = time;
  const minutes = Math.floor(remaining / 60);
  const secondsPart = remaining % 60;
  // Always show two digits for the seconds (1:05, not 1:5).
  const paddedSeconds = String(secondsPart).padStart(2, '0');
  document.getElementById("countdown").textContent = `${minutes}:${paddedSeconds}`;

  time = Math.max(remaining - 1, 0);

  // Compare the numeric parts, not the padded display string.
  if (minutes === 0 && secondsPart === 0) {
    document.getElementById('tableStyle').style.display = "block";
    recognition.stop(); // time is up: stop listening
  }
}
EDIT:
I made a working example: https://stackblitz.com/edit/web-platform-ppcuh9?file=index.html