What we are trying to do
We are developing a system that uses the Azure Speech Recognition API.
Technologies used
- React front-end
- Flask Python backend
- React Media Recorder to record the question
- Azure Speech Recognition API to perform the speech-to-text task.
My question input component contains the following code, which handles the call to the API:
<ReactMediaRecorder
    audio
    render={({ startRecording, stopRecording }) => (
        <>
            <div
                id="startRecording"
                className={`${styles.questionInputSendButton} ${!listeningActive ? "" : styles.questionInputSendButtonDisabled}`}
                aria-label="Speech-to-text button"
                onClick={() => {
                    setListeningActive(true);
                    setQuestion("");
                    setPlaceholder("Speak into your microphone and then press stop once you have finished!");
                    startRecording();
                }}
            >
                <Mic16Filled primaryFill="rgba(255, 64, 0, 1)" />
            </div>
            <div
                id="stopRecording"
                className={`${styles.questionInputSendButton} ${listeningActive ? "" : styles.questionInputSendButtonDisabled}`}
                aria-label="Stop recording button"
                onClick={() => {
                    setListeningActive(false);
                    stopRecording();
                }}
            >
                <Stop16Filled primaryFill="rgba(255, 64, 0, 1)" />
            </div>
        </>
    )}
    onStop={async (blobUrl: string, blob: Blob) => {
        const blobToQuestion = await produceQuestionFromAudioRecording(blobUrl, blob);
        setQuestion(blobToQuestion);
    }}
/>
This apparently works correctly: the blob URL and the blob are both passed to the API call as expected. (I only need the blob in the current implementation, so the signature of the API call might change to take just the blob.) The code in the api.ts file is the following:
export async function produceQuestionFromAudioRecording(blobUrl: string, blob: Blob) {
    // Create the body of the request and pass it to the request, with the blob file.
    // const blob = await fetch(blobUrl).then(r => r.blob());
    const formData = new FormData();
    formData.append("fileAudioWav", blob);
    const postRequest = {
        method: "POST",
        body: formData
    };
    const response = await fetch("chat/generateQuestionFromAudio", postRequest);
    if (response.status > 299 || !response.ok) throw Error("Received error while generating question.");
    const data = await response.json();
    return data.text;
}
In my Python backend the code is the following:
# Imports used by this endpoint (shown here for completeness):
import os
import time
import wave
import azure.cognitiveservices.speech as speechsdk
from azure.core.credentials import AzureKeyCredential
from flask import request, jsonify

@app.route('/chat/generateQuestionFromAudio', methods=["POST"])
async def generateQuestionFromAudio():
    logger.info("generateQuestionFromAudio called.")
    # speech_credential is an AzureKeyCredential created at startup (not shown);
    # the region string is wrapped in AzureKeyCredential only to reuse .key below.
    speech_region = AzureKeyCredential(DefaultConfig.AZURE_SPEECH_REGION)
    if speech_credential is not None and speech_region is not None:
        logger.info("Credentials found, request successfully processed")
        speech_config = speechsdk.SpeechConfig(subscription=str(speech_credential.key), region=str(speech_region.key))
        fileAudio = request.files['fileAudioWav']
        file_path = os.path.join(TEMP_FILE_FOLDER, "question_audio_file-" + time.strftime("%Y%m%d%H%M%S", time.localtime()) + ".wav")
        save_as_wav(fileAudio.stream.read(), file_path)
        # fileAudio.save(file_path)
        audio_config = speechsdk.audio.AudioConfig(filename=file_path)
        speech_recognizer: speechsdk.SpeechRecognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_config)
        # It does not recognize the audio; I am saving it incorrectly
        speech_recognition_result = speech_recognizer.recognize_once()
        logger.info(speech_recognition_result.text)
        # delete the temp file (this raises an error here)
        os.remove(file_path)
        return jsonify({'text': speech_recognition_result.text}), 200
    else:
        logger.info("There were errors in fetching the information")
        return jsonify({'text': 'There were errors!'}), 500
def save_as_wav(audio_data_bytes, output_file_path, sample_rate=16000, sample_width=2, num_channels=1):
    with wave.open(output_file_path, 'wb') as wav_file:
        # Set WAV file parameters
        wav_file.setnchannels(num_channels)
        wav_file.setsampwidth(sample_width)
        wav_file.setframerate(sample_rate)
        # Write audio data to WAV file
        wav_file.writeframes(audio_data_bytes)
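Note that save_as_wav writes a 16 kHz, mono, 16-bit WAV header and then dumps the incoming bytes as if they were raw PCM frames; nothing is decoded. To see what container the browser actually sent, the first bytes of the upload can be inspected before the file is written. A minimal diagnostic sketch (the helper name inspect_upload is mine):

def inspect_upload(data: bytes) -> str:
    # Guess the container of the uploaded audio from its magic bytes.
    if data[:4] == b"RIFF":
        return "wav"    # RIFF/WAVE file
    if data[:4] == b"\x1a\x45\xdf\xa3":
        return "webm"   # Matroska/WebM EBML header
    if data[:4] == b"OggS":
        return "ogg"    # Ogg container (often Opus from MediaRecorder)
    return "unknown"

Since the stream can only be read once, the route would read it into a variable first, e.g. audio_bytes = fileAudio.stream.read(); logger.info(inspect_upload(audio_bytes)); save_as_wav(audio_bytes, file_path).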
I tried:
- Saving the fileAudio variable directly, as it comes, without going through the save_as_wav function. In that case I get the following error when trying to create the speech recognizer resource:
Traceback (most recent call last):
File "C:\Users\-\AppData\Local\Programs\Python\Python312\Lib\site-packages\flask\app.py", line 1455, in wsgi_app
response = self.full_dispatch_request()
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\-\AppData\Local\Programs\Python\Python312\Lib\site-packages\flask\app.py", line 869, in full_dispatch_request
rv = self.handle_user_exception(e)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\-\AppData\Local\Programs\Python\Python312\Lib\site-packages\flask\app.py", line 867, in full_dispatch_request
rv = self.dispatch_request()
^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\-\AppData\Local\Programs\Python\Python312\Lib\site-packages\flask\app.py", line 852, in dispatch_request
return self.ensure_sync(self.view_functions[rule.endpoint])(**view_args)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\-\AppData\Local\Programs\Python\Python312\Lib\site-packages\asgiref\sync.py", line 277, in __call__
return call_result.result()
^^^^^^^^^^^^^^^^^^^^
File "C:\Users\-\AppData\Local\Programs\Python\Python312\Lib\concurrent\futures\_base.py", line 449, in result
return self.__get_result()
^^^^^^^^^^^^^^^^^^^
File "C:\Users\-\AppData\Local\Programs\Python\Python312\Lib\concurrent\futures\_base.py", line 401, in __get_result
raise self._exception
File "C:\Users\-\AppData\Local\Programs\Python\Python312\Lib\site-packages\asgiref\sync.py", line 353, in main_wrap
result = await self.awaitable(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\-\Project\backend\app.py", line 292, in generateQuestionFromAudio
speech_recognizer: speechsdk.SpeechRecognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_config)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\-\AppData\Local\Programs\Python\Python312\Lib\site-packages\azure\cognitiveservices\speech\speech.py", line 1006, in __init__
_call_hr_fn(
File "C:\Users\-\AppData\Local\Programs\Python\Python312\Lib\site-packages\azure\cognitiveservices\speech\interop.py", line 62, in _call_hr_fn
_raise_if_failed(hr)
File "C:\Users\-\AppData\Local\Programs\Python\Python312\Lib\site-packages\azure\cognitiveservices\speech\interop.py", line 55, in _raise_if_failed
__try_get_error(_spx_handle(hr))
File "C:\Users\-\AppData\Local\Programs\Python\Python312\Lib\site-packages\azure\cognitiveservices\speech\interop.py", line 50, in __try_get_error
raise RuntimeError(message)
RuntimeError: Exception with error code:
[CALL STACK BEGIN]
> pal_string_to_wstring
- pal_string_to_wstring
- pal_string_to_wstring
- pal_string_to_wstring
- pal_string_to_wstring
- pal_string_to_wstring
- pal_string_to_wstring
- pal_string_to_wstring
- pal_string_to_wstring
- pal_string_to_wstring
- pal_string_to_wstring
- pal_string_to_wstring
- pal_string_to_wstring
- pal_string_to_wstring
- recognizer_create_speech_recognizer_from_config
- recognizer_create_speech_recognizer_from_config
[CALL STACK END]
- Saving the audio using the save_as_wav function. In that case the speech recognizer object is instantiated without any exception, but the audio file is saved as noise, so nothing can be recognized: the audio is not intelligible and the recognized text is always equal to ''.
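If the check above shows the blob is actually WebM or Ogg rather than WAV, wrapping its compressed bytes in a PCM WAV header would explain the noise. Under that assumption, a decode step along these lines is what I would try instead of save_as_wav; this is only a sketch (pydub requires ffmpeg on the PATH, and the function name is mine):

import io
from pydub import AudioSegment

def save_as_wav_decoded(audio_data_bytes: bytes, output_file_path: str) -> None:
    # Decode whatever container/codec the browser produced (ffmpeg does the work),
    # then resample to the 16 kHz mono 16-bit PCM layout used above.
    segment = AudioSegment.from_file(io.BytesIO(audio_data_bytes))
    segment = segment.set_frame_rate(16000).set_channels(1).set_sample_width(2)
    segment.export(output_file_path, format="wav")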