Speech recognition API python

48 views

What we are trying to do
We are currently trying to develop a system which uses the Azure Speech recognition API.

Technologies used

  • React front-end
  • Flask Python backend
  • React Media Recorder to record the question
  • Azure Speech Recognition API to perform the Speech-To-Text task.

My question input component contains the following code which handles the call to the api:

 <ReactMediaRecorder
                audio
                render={({ startRecording, stopRecording }) => (
                    <>
                        {/* Mic button: clears the question, shows a prompt, and starts
                            recording.  Visually disabled (via CSS class) while listening. */}
                        <div
                            id="startRecording"
                            className={`${styles.questionInputSendButton} ${!listeningActive ? "" : styles.questionInputSendButtonDisabled}`}
                            aria-label="Speech-to-text button"
                            onClick={() => {
                                setListeningActive(true);
                                setQuestion("");
                                setPlaceholder("Speak into your microphone and then press stop once you finished!");
                                startRecording();
                            }}
                        >
                            <Mic16Filled primaryFill="rgba(255, 64, 0, 1)" />
                        </div>
                        {/* Stop button: ends the recording, which triggers onStop below.
                            Visually disabled (via CSS class) while NOT listening. */}
                        <div
                            id="stopRecording"
                            className={`${styles.questionInputSendButton} ${listeningActive ? "" : styles.questionInputSendButtonDisabled} `}
                            aria-label="Stop recording button"
                            onClick={() => {
                                setListeningActive(false);
                                stopRecording();
                            }}
                        >
                            <Stop16Filled primaryFill="rgba(255, 64, 0, 1)" />
                        </div>
                    </>
                )}
                {/* NOTE(review): the blob produced here is whatever container the
                    browser's MediaRecorder emits (typically webm/ogg-opus), NOT a
                    PCM .wav file — the backend must transcode it before Azure STT. */}
                onStop={async (blobUrl: string, blob: Blob) => {
                    const blobToQuestion = await produceQuestionFromAudioRecording(blobUrl, blob);

                    setQuestion(blobToQuestion);
                }}
            />

This apparently works correctly: both the blob URL and the blob are sent to the API call as expected (I only need the blob in the current implementation, so the signature of the API call might change to require just the blob). The code in the api.ts file is the following:
/**
 * Upload a recorded audio blob to the backend and return the transcribed text.
 *
 * @param blobUrl - Object URL of the recording. Currently unused; kept so the
 *                  existing caller signature stays valid. TODO(review): drop it
 *                  once callers are updated.
 * @param blob    - The recorded audio data, sent as multipart/form-data under
 *                  the "fileAudioWav" key (the name the Flask backend reads).
 * @returns The `text` field of the backend's JSON response.
 * @throws  Error when the backend responds with a non-2xx status.
 */
export async function produceQuestionFromAudioRecording(blobUrl: string, blob: Blob) {
    const formData = new FormData();
    formData.append("fileAudioWav", blob);

    const response = await fetch("chat/generateQuestionFromAudio", {
        method: "POST",
        body: formData
    });

    // response.ok is true only for statuses 200-299, so it already covers
    // the previous `status > 299` check.
    if (!response.ok) throw Error("Received error while generating question.");

    const data = await response.json();

    return data.text;
}

Now in my python backend the code is the following:

@app.route('/chat/generateQuestionFromAudio', methods=["POST"])
async def generateQuestionFromAudio():
    """Transcribe an uploaded audio recording with the Azure Speech SDK.

    Expects a multipart/form-data POST carrying the recording under the
    'fileAudioWav' key.  Returns ``{'text': <transcription>}`` with status
    200 on success, or ``{'text': 'There were errors!'}`` with status 500
    when the Azure credentials are unavailable.
    """
    logger.info("generateQuestionFromAudio called.")

    # NOTE(review): speech_credential is read below but never assigned in this
    # handler -- presumably a module-level global; confirm it is set at startup.
    speech_region = AzureKeyCredential(DefaultConfig.AZURE_SPEECH_REGION)

    # Guard clause: bail out early when credentials are missing.
    if speech_credential is None or speech_region is None:
        logger.info("There were errors in fetching the information")
        return jsonify({'text': 'There were errors!'}), 500

    logger.info("Credentials found, request successfully processed")
    speech_config = speechsdk.SpeechConfig(
        subscription=str(speech_credential.key), region=str(speech_region.key))

    file_audio = request.files['fileAudioWav']
    file_path = os.path.join(
        TEMP_FILE_FOLDER,
        "question_audio_file-" + time.strftime("%Y%m%d%H%M%S", time.localtime()) + ".wav")

    # WARNING(review): a browser MediaRecorder blob is normally a compressed
    # container (webm/ogg-opus), NOT raw PCM.  save_as_wav() writes the bytes
    # verbatim under a PCM WAV header, which is why the saved file plays back
    # as noise and recognition returns ''.  The upload must be decoded to
    # 16 kHz 16-bit mono PCM (e.g. via ffmpeg/pydub) before this step.
    save_as_wav(file_audio.stream.read(), file_path)

    try:
        audio_config = speechsdk.audio.AudioConfig(filename=file_path)
        speech_recognizer: speechsdk.SpeechRecognizer = speechsdk.SpeechRecognizer(
            speech_config=speech_config, audio_config=audio_config)

        speech_recognition_result = speech_recognizer.recognize_once()
        logger.info(speech_recognition_result.text)
        result_text = speech_recognition_result.text
    finally:
        # Always delete the temp file, but tolerate failure: on Windows the
        # recognizer may still hold the file handle, making os.remove raise
        # (the error the original "DA errore qui" comment referred to).
        try:
            os.remove(file_path)
        except OSError:
            logger.warning("Could not delete temp file %s", file_path)

    return jsonify({'text': result_text}), 200


def save_as_wav(audio_data_bytes, output_file_path, sample_rate=16000, sample_width=2, num_channels=1):
    """Wrap raw PCM sample bytes in a WAV container and write them to disk.

    Args:
        audio_data_bytes: Raw, uncompressed PCM frames.  WARNING: the bytes
            are written verbatim with no decoding -- if they are actually a
            compressed container (e.g. the webm/ogg blob a browser
            MediaRecorder produces), the result is a valid WAV header over
            garbage and will play back as noise.
        output_file_path: Destination path of the .wav file (overwritten).
        sample_rate: Frames per second (default 16000 -- the rate Azure
            Speech expects).
        sample_width: Bytes per sample (default 2, i.e. 16-bit).
        num_channels: Channel count (default 1, mono).
    """
    with wave.open(output_file_path, 'wb') as wav_file:
        # Describe the PCM layout in the WAV header...
        wav_file.setnchannels(num_channels)
        wav_file.setsampwidth(sample_width)
        wav_file.setframerate(sample_rate)
        # ...then emit the frames unchanged.
        wav_file.writeframes(audio_data_bytes)



I tried:

  • Saving the fileAudio variable directly, as it arrives, without passing it through the save_as_wav function. In that case I get the following error when trying to create the speech recognizer resource:
Traceback (most recent call last):
  File "C:\Users\-\AppData\Local\Programs\Python\Python312\Lib\site-packages\flask\app.py", line 1455, in wsgi_app
    response = self.full_dispatch_request()
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\-\AppData\Local\Programs\Python\Python312\Lib\site-packages\flask\app.py", line 869, in full_dispatch_request
    rv = self.handle_user_exception(e)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\-\AppData\Local\Programs\Python\Python312\Lib\site-packages\flask\app.py", line 867, in full_dispatch_request
    rv = self.dispatch_request()
         ^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\-\AppData\Local\Programs\Python\Python312\Lib\site-packages\flask\app.py", line 852, in dispatch_request
    return self.ensure_sync(self.view_functions[rule.endpoint])(**view_args)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\-\AppData\Local\Programs\Python\Python312\Lib\site-packages\asgiref\sync.py", line 277, in __call__
    return call_result.result()
           ^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\-\AppData\Local\Programs\Python\Python312\Lib\concurrent\futures\_base.py", line 449, in result
    return self.__get_result()
           ^^^^^^^^^^^^^^^^^^^
  File "C:\Users\-\AppData\Local\Programs\Python\Python312\Lib\concurrent\futures\_base.py", line 401, in __get_result
    raise self._exception
  File "C:\Users\-\AppData\Local\Programs\Python\Python312\Lib\site-packages\asgiref\sync.py", line 353, in main_wrap
    result = await self.awaitable(*args, **kwargs)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\-\Project\backend\app.py", line 292, in generateQuestionFromAudio
    speech_recognizer: speechsdk.SpeechRecognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_config)
                                                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\-\AppData\Local\Programs\Python\Python312\Lib\site-packages\azure\cognitiveservices\speech\speech.py", line 1006, in __init__
    _call_hr_fn(
  File "C:\Users\-\AppData\Local\Programs\Python\Python312\Lib\site-packages\azure\cognitiveservices\speech\interop.py", line 62, in _call_hr_fn
    _raise_if_failed(hr)
  File "C:\Users\-\AppData\Local\Programs\Python\Python312\Lib\site-packages\azure\cognitiveservices\speech\interop.py", line 55, in _raise_if_failed
    __try_get_error(_spx_handle(hr))
  File "C:\Users\-\AppData\Local\Programs\Python\Python312\Lib\site-packages\azure\cognitiveservices\speech\interop.py", line 50, in __try_get_error
    raise RuntimeError(message)
RuntimeError: Exception with error code:
[CALL STACK BEGIN]

    > pal_string_to_wstring
    - pal_string_to_wstring
    - pal_string_to_wstring
    - pal_string_to_wstring
    - pal_string_to_wstring
    - pal_string_to_wstring
    - pal_string_to_wstring
    - pal_string_to_wstring
    - pal_string_to_wstring
    - pal_string_to_wstring
    - pal_string_to_wstring
    - pal_string_to_wstring
    - pal_string_to_wstring
    - pal_string_to_wstring
    - recognizer_create_speech_recognizer_from_config
    - recognizer_create_speech_recognizer_from_config

[CALL STACK END]

  • Saving the audio file using the save_as_wav function. In that case the speech recognizer object gets instantiated correctly without raising any exception, but the saved audio file contains only noise. Since the audio is not intelligible, nothing can be recognized, and the recognized text is always equal to ''.
0

There are 0 answers