Append time sliced audio to stream sent to dialogflow.streamingDetectIntent()

Prologue

I implemented a search form (a React widget) where a user can click the mic button and fill in the form with their voice.

I am using: RecordRTC for recording the audio, the Dialogflow API for the NLP capability, and socket.io / socket.io-client together with socket.io-stream (`ss`) for streaming the blobs from my React widget to the Node server.
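
The client snippets below assume that these libraries are already imported and that a socket connection to the server is open; a minimal sketch of that setup (the server URL is a placeholder) might look like:

    // Assumed client-side setup (not shown in the snippets below): import the
    // libraries and open the socket that captureUserMedia/endCaptureMedia rely on.
    import recordrtc from 'recordrtc';
    import io from 'socket.io-client';
    import ss from 'socket.io-stream';

    const socket = io('http://localhost:3000'); // placeholder server URL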

Current Implementation

At client side:

    const captureUserMedia = () => {
        setIsRecording(true);
        navigator.getUserMedia(
            { audio: true },
            function (stream) {
                // Record mono 16 kHz WAV (StereoAudioRecorder + desiredSampRate) for Dialogflow.
                const recordAudio = recordrtc(stream, {
                    type: 'audio',
                    mimeType: 'audio/webm',
                    sampleRate: 44100,

                    recorderType: recordrtc.StereoAudioRecorder,
                    numberOfAudioChannels: 1,

                    desiredSampRate: 16000,
                });
                setRecordAudio(recordAudio);
                recordAudio.startRecording();
                setIsRecording(true);
            }, function(error) {
                console.error(JSON.stringify(error));
            });
    };

    const endCaptureMedia = () => {
        setIsRecording(false);
        recordAudio.stopRecording(() => {
            const blob = recordAudio.getBlob();
            console.log(blob);
            // Open a socket.io-stream, emit the metadata, then pipe the whole blob over it.
            const audioStream = ss.createStream();
            ss(socket).emit('stream_audio', audioStream, {
                name: '_temp/stream.wav',
                size: blob.size,
                language: 'en-US',
            });

            ss.createBlobReadStream(blob).pipe(audioStream);
        });
    };

At server side:

    ss(socket).on('stream_audio', (stream, data) => {
      handleAudioStream(sessionId, stream, data);
    });

handleAudioStream

    import path from 'path';
    import fs from 'fs';

    export default async (sessionId, stream, data) => {
      const filename = path.basename(data.name);

      // Persist the incoming audio, then hand the file to detectIntentStream
      // once the write has finished.
      const fileWrite = fs.createWriteStream(filename);
      stream.pipe(fileWrite);
      fileWrite.on('finish', () => {
        detectIntentStream(getAudioQueryRequest(sessionId), filename, async function (response) {

          // callback function body
        });
      });
    };
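
`getAudioQueryRequest` is not shown here; the sketch below is only an assumption of what the initial `streamingDetectIntent` request could look like for the 16 kHz mono audio recorded on the client (`sessionsClient` and `projectId` are assumed to be in scope, and `projectAgentSessionPath` assumes the `@google-cloud/dialogflow` client; older clients expose `sessionPath` instead):

    // Hypothetical shape of getAudioQueryRequest: the first message written to
    // the streamingDetectIntent() stream carries only configuration, no audio bytes.
    const getAudioQueryRequest = (sessionId) => ({
      session: sessionsClient.projectAgentSessionPath(projectId, sessionId),
      queryInput: {
        audioConfig: {
          audioEncoding: 'AUDIO_ENCODING_LINEAR_16', // PCM WAV from StereoAudioRecorder
          sampleRateHertz: 16000,                    // matches desiredSampRate on the client
          languageCode: 'en-US',
        },
      },
    });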

detectIntentStream

    import fs from 'fs';
    import { Transform } from 'stream';
    import pump from 'pump';

    // sessionsClient is assumed to be a Dialogflow SessionsClient created elsewhere.
    export default async (request, filename, cb) => {
      const detectStream = sessionsClient.streamingDetectIntent()
        .on('data', (data) => {
          cb(data);
        })
        .on('error', (e) => {
          console.log(e);
        })
        .on('end', () => {
          console.log('on end');
        });
      // The first message carries the query configuration only.
      detectStream.write(request);
      // Stream the recorded file, wrapping each chunk as { inputAudio: chunk }.
      await pump(
        fs.createReadStream(filename),
        new Transform({
          objectMode: true,
          transform: (obj, _, next) => {
            next(null, { inputAudio: obj });
          },
        }),
        detectStream,
      );
    };

The Issue

The implementation above works fine, but I only collect the blob on the client after the recording has stopped, and only then start sending it. That adds a noticeable delay before I get the recognition transcript and the detected intent. What I am trying to achieve is to start sending `timeSlice`d audio chunks while the recording is still in progress, so that Dialogflow can start detecting the intent while the user is still speaking.

What I tried

So I tried slicing the recording into 3-second intervals with `timeSlice` and streaming each slice from the `ondataavailable` callback.

Client Side:

    const captureUserMedia = () => {
        setIsRecording(true);
        navigator.getUserMedia(
            { audio: true },
            function (stream) {
                const recordAudio = recordrtc(stream, {
                    type: 'audio',
                    mimeType: 'audio/webm',
                    sampleRate: 44100,

                    recorderType: recordrtc.StereoAudioRecorder,
                    numberOfAudioChannels: 1,

                    desiredSampRate: 16000,
                    timeSlice: 3000, // emit a blob every 3 seconds

                    // Called for every 3-second slice; note that a new
                    // socket.io-stream is opened for each slice.
                    ondataavailable: function(blob) {
                        console.log('ondataavailable');
                        console.log(blob);
                        const audioStream = ss.createStream();
                        ss(socket).emit('stream_audio', audioStream, {
                            name: '_temp/stream.wav',
                            size: blob.size,
                            language: 'en-US',
                        });

                        ss.createBlobReadStream(blob).pipe(audioStream);
                    }
                });
                setRecordAudio(recordAudio);
                recordAudio.startRecording();
                setIsRecording(true);
            }, function(error) {
                console.error(JSON.stringify(error));
            });
    };

    const endCaptureMedia = () => {
        setIsRecording(false);
        recordAudio.stopRecording();
    };

But the issues are:

  1. When the second chunk of the stream arrives at the server, it overrides the first chunk, and only the second chunk is passed on to detectIntent. I want each chunk to be appended to the previous one before it is processed by detectIntent (see the sketch after this list for one possible append-on-the-server direction).

  2. The mic button is currently a press-hold-and-speak button, so when the user holds it and speaks for 8 seconds while the recording is sliced at 3-second intervals, the audio after the 6th second never reaches the server.
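
For illustration only, here is a minimal sketch of how the server handler could append every incoming slice to the same temp file instead of overwriting it; the `writeStreams` map, the `'a'` append flag, and `{ end: false }` are assumptions, not part of the implementation above:

    import fs from 'fs';
    import path from 'path';

    // Sketch: keep one append-mode write stream per file so later slices are
    // appended rather than truncating what earlier slices already wrote.
    const writeStreams = new Map();

    ss(socket).on('stream_audio', (stream, data) => {
      const filename = path.basename(data.name);
      if (!writeStreams.has(filename)) {
        // 'a' opens the file in append mode.
        writeStreams.set(filename, fs.createWriteStream(filename, { flags: 'a' }));
      }
      // end: false keeps the write stream open for the next slice.
      stream.pipe(writeStreams.get(filename), { end: false });
    });

This only covers the appending part; wiring the appended audio into a single long-lived streamingDetectIntent call while the user is still speaking is the part I am asking about.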
