Azure speech-to-text speaker identification (or diarization): no text and no guests

673 views Asked by At

I ran this sample code from here, changing only the file name and the number of channels from eight to two (one channel is not supported).

My goal is to test speaker identification, or more precisely diarization, since I'm not providing any speaker voice signatures. All I want is labels such as "speaker1" and "speaker2", just to differentiate between the speakers.

import azure.cognitiveservices.speech as speechsdk
import time
import uuid
from scipy.io import wavfile

# Azure Speech resource credentials (placeholders; replace with real values).
speech_key = "your-subscription-key"
service_region = "your-region"

# Path of the conversation recording handed to the transcriber.
conversationfilename = "myfile.wav"  # Two channels, 16 bits, 16 kHz audio

def conversation_transcription_differentiate_speakers():
    """Transcribe a WAV conversation and print every transcription event.

    Reads ``conversationfilename`` with ``scipy.io.wavfile``, pushes the whole
    recording into a push audio stream whose format is derived from the file
    itself, and prints session, transcription and cancellation events until
    the session stops or is canceled.

    Relies on the module-level ``speech_key``, ``service_region`` and
    ``conversationfilename``. Takes no arguments and returns ``None``.
    """
    # Read the recording first so the declared stream format describes the
    # bytes that are actually pushed, instead of a hard-coded channel count
    # that may not match the file.
    samples_per_second, wav_data = wavfile.read(conversationfilename)
    # scipy returns a 1-D array for mono and (frames, channels) otherwise.
    channels = 1 if wav_data.ndim == 1 else int(wav_data.shape[1])
    # NOTE(review): assumes integer PCM samples; a float WAV would be pushed
    # as raw float bytes and not be valid PCM -- confirm the input files.
    bits_per_sample = wav_data.dtype.itemsize * 8

    speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)
    speech_config.set_property_by_name("ConversationTranscriptionInRoomAndOnline", "true")
    speech_config.set_property_by_name("DifferentiateGuestSpeakers", "true")

    wave_format = speechsdk.audio.AudioStreamFormat(int(samples_per_second), bits_per_sample, channels)
    stream = speechsdk.audio.PushAudioInputStream(stream_format=wave_format)
    audio_config = speechsdk.audio.AudioConfig(stream=stream)

    transcriber = speechsdk.transcription.ConversationTranscriber(audio_config)

    conversation_id = str(uuid.uuid4())
    conversation = speechsdk.transcription.Conversation(speech_config, conversation_id)
    done = False

    def stop_cb(evt: speechsdk.SessionEventArgs):
        """callback that signals to stop continuous transcription upon receiving an event `evt`"""
        print('CLOSING {}'.format(evt))
        nonlocal done
        done = True

    transcriber.transcribed.connect(lambda evt: print('TRANSCRIBED: {}'.format(evt)))
    transcriber.session_started.connect(lambda evt: print('SESSION STARTED: {}'.format(evt)))
    transcriber.session_stopped.connect(lambda evt: print('SESSION STOPPED {}'.format(evt)))
    transcriber.canceled.connect(lambda evt: print('CANCELED {}'.format(evt)))

    # Stop continuous transcription on either session stopped or canceled events
    transcriber.session_stopped.connect(stop_cb)
    transcriber.canceled.connect(stop_cb)

    # Note user voice signatures are not required for speaker differentiation.
    # Use voice signatures when adding participants when more enhanced speaker identification is required.
    # The signatures are used when the module defines them; otherwise the
    # participants are added without one instead of failing with NameError.
    try:
        signature_user1, signature_user2 = voice_signature_user1, voice_signature_user2
    except NameError:
        signature_user1 = signature_user2 = None
    user1 = speechsdk.transcription.Participant("[email protected]", "en-us", signature_user1)
    user2 = speechsdk.transcription.Participant("[email protected]", "en-us", signature_user2)

    conversation.add_participant_async(user1).get()
    conversation.add_participant_async(user2).get()
    transcriber.join_conversation_async(conversation).get()
    # Wait for transcription to actually start before feeding audio.
    transcriber.start_transcribing_async().get()

    stream.write(wav_data.tobytes())
    # Closing the stream signals end of audio to the transcriber.
    stream.close()
    while not done:
        time.sleep(.5)

    # Wait for the stop request to complete before returning.
    transcriber.stop_transcribing_async().get()

This is the output I get:

asdasdasd
SESSION STARTED: SessionEventArgs(session_id=9729b9164e244fb887d014874fa613fe)
TRANSCRIBED: ConversationTranscriptionEventArgs(session_id=9729b9164e244fb887d014874fa613fe, result=ConversationTranscriptionResult(result_id=7a22278656474e138f6dfff2cbfc7c1a, user_id=Unidentified, utterance_id=1_Unidentified_500000, text=, reason=ResultReason.RecognizedSpeech))
TRANSCRIBED: ConversationTranscriptionEventArgs(session_id=9729b9164e244fb887d014874fa613fe, result=ConversationTranscriptionResult(result_id=76245e4a9d9a429eac0b36c663974b1a, user_id=Unidentified, utterance_id=0_Unidentified_8300000, text=, reason=ResultReason.RecognizedSpeech))
TRANSCRIBED: ConversationTranscriptionEventArgs(session_id=9729b9164e244fb887d014874fa613fe, result=ConversationTranscriptionResult(result_id=76794a88432842e49936f0816e60d94a, user_id=Unidentified, utterance_id=0_Unidentified_131300000, text=, reason=ResultReason.RecognizedSpeech))
TRANSCRIBED: ConversationTranscriptionEventArgs(session_id=9729b9164e244fb887d014874fa613fe, result=ConversationTranscriptionResult(result_id=00533cfd0b984e5cbd86d76542f16caa, user_id=Unidentified, utterance_id=1_Unidentified_118900000, text=, reason=ResultReason.RecognizedSpeech))
TRANSCRIBED: ConversationTranscriptionEventArgs(session_id=9729b9164e244fb887d014874fa613fe, result=ConversationTranscriptionResult(result_id=633f782d59a54d5c94676d1557126e79, user_id=Unidentified, utterance_id=1_Unidentified_202800000, text=, reason=ResultReason.RecognizedSpeech))
TRANSCRIBED: ConversationTranscriptionEventArgs(session_id=9729b9164e244fb887d014874fa613fe, result=ConversationTranscriptionResult(result_id=c74c88056dba4900a13e02dfb8583385, user_id=Unidentified, utterance_id=0_Unidentified_267200000, text=, reason=ResultReason.RecognizedSpeech))
CANCELED ConversationTranscriptionCanceledEventArgs(session_id=9729b9164e244fb887d014874fa613fe, result=ConversationTranscriptionResult(result_id=01c1a9d1287c48368ced9a957658701b, user_id=, utterance_id=, text=, reason=ResultReason.Canceled))
CLOSING ConversationTranscriptionCanceledEventArgs(session_id=9729b9164e244fb887d014874fa613fe, result=ConversationTranscriptionResult(result_id=01c1a9d1287c48368ced9a957658701b, user_id=, utterance_id=, text=, reason=ResultReason.Canceled))
SESSION STOPPED SessionEventArgs(session_id=9729b9164e244fb887d014874fa613fe)
CLOSING SessionEventArgs(session_id=9729b9164e244fb887d014874fa613fe)

As you can see, first, I didn't get any text. Second, I don't see any Guest_0 or Guest_1 as explained here.

I tried with several files. What am I doing wrong?

0

There are 0 answers