Transcribing audio from Amazon Connect (Kinesis Video Streams) using Google Cloud Speech-to-Text

I am transcribing audio from Amazon Connect (via Kinesis Video Streams) with Google Cloud Speech-to-Text and I am running into three problems:

  1. The transcription quality is not as good as it should be.
  2. The transcription only works when the language code is "en-US".
  3. I don't get any results when I set the language code to "en-IN" (see the snippet below).
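
For reference, this is how the language code ends up in the recognition config. This is just the relevant part of the full code that follows (v1p1beta1 API):

RecognitionConfig recognitionConfig =
    RecognitionConfig.newBuilder()
        .setEncoding(RecognitionConfig.AudioEncoding.LINEAR16)
        .setLanguageCode(languageCode)   // "en-US" works, "en-IN" returns nothing
        .setSampleRateHertz(8000)        // 8 kHz to match the audio from the Connect KVS stream
        .build();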

Here is my Java code:

package com.amazonaws.transcribestreaming;

import java.io.IOException;
import java.nio.ByteBuffer;
import java.text.DecimalFormat;
import java.util.ArrayList;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.TimeUnit;

import com.amazonaws.kvstranscribestreaming.KVSStreamTrackObject;
import com.amazonaws.kvstranscribestreaming.KVSUtils;
import com.google.api.gax.rpc.ClientStream;
import com.google.api.gax.rpc.ResponseObserver;
import com.google.api.gax.rpc.StreamController;
import com.google.cloud.speech.v1p1beta1.RecognitionConfig;
import com.google.cloud.speech.v1p1beta1.SpeechClient;
import com.google.cloud.speech.v1p1beta1.SpeechRecognitionAlternative;
import com.google.cloud.speech.v1p1beta1.StreamingRecognitionConfig;
import com.google.cloud.speech.v1p1beta1.StreamingRecognitionResult;
import com.google.cloud.speech.v1p1beta1.StreamingRecognizeRequest;
import com.google.cloud.speech.v1p1beta1.StreamingRecognizeResponse;
import com.google.protobuf.ByteString;
import com.google.protobuf.Duration;

public class InfiniteStreamRecognize {

  private static final int STREAMING_LIMIT = 290000; // ~5 minutes

  public static final String RED = "\033[0;31m";
  public static final String GREEN = "\033[0;32m";
  public static final String YELLOW = "\033[0;33m";

  // Queue shared between the KVS audio-reader thread and the streaming-recognize loop
  private static volatile BlockingQueue<byte[]> sharedQueue = new LinkedBlockingQueue<byte[]>();

  private static int restartCounter = 0;
  private static ArrayList<ByteString> audioInput = new ArrayList<ByteString>();
  private static ArrayList<ByteString> lastAudioInput = new ArrayList<ByteString>();
  private static int resultEndTimeInMS = 0;
  private static int isFinalEndTime = 0;
  private static int finalRequestEndTime = 0;
  private static boolean newStream = true;
  private static double bridgingOffset = 0;
  private static boolean lastTranscriptWasFinal = false;
  private static StreamController referenceToStreamController;
  private static ByteString tempByteString;
  private static boolean breakTranscription = false;


  public static String convertMillisToDate(double milliSeconds) {
    long millis = (long) milliSeconds;
    DecimalFormat format = new DecimalFormat();
    format.setMinimumIntegerDigits(2);
    return String.format(
        "%s:%s /",
        format.format(TimeUnit.MILLISECONDS.toMinutes(millis)),
        format.format(
            TimeUnit.MILLISECONDS.toSeconds(millis)
                - TimeUnit.MINUTES.toSeconds(TimeUnit.MILLISECONDS.toMinutes(millis))));
  }

  /** Performs infinite streaming speech recognition */
  public static void infiniteStreamingRecognize(String languageCode, KVSStreamTrackObject kvsStreamTrackObject, String contactId) throws Exception {
      
    // AWS Input buffering
    class AWSAudioBuffer implements Runnable {
      @Override
      public void run() {
        try {
            byte[] data;
            ByteBuffer audioBuffer = KVSUtils.getByteBufferFromStream(kvsStreamTrackObject.getStreamingMkvReader(),
                    kvsStreamTrackObject.getFragmentVisitor(), kvsStreamTrackObject.getTagProcessor(), contactId, kvsStreamTrackObject.getTrackName());
            do{
                data = new byte[audioBuffer.remaining()];
                audioBuffer.get(data);
                
                sharedQueue.put(data.clone());
                audioBuffer = KVSUtils.getByteBufferFromStream(kvsStreamTrackObject.getStreamingMkvReader(),
                        kvsStreamTrackObject.getFragmentVisitor(), kvsStreamTrackObject.getTagProcessor(), contactId, kvsStreamTrackObject.getTrackName());
                // expensive op: write the raw audio chunk to the track's output stream (file)
                kvsStreamTrackObject.getOutputStream().write(data);
                
            }while(data.length>0);
        } catch (Exception e) {
            System.out.println("kvs interrupted, "+e.getMessage());
        } finally {
            // break transcription if no audio available
            breakTranscription = true;
            try {
                System.out.println("Closing the audiofile.");
                kvsStreamTrackObject.getOutputStream().close();
            } catch (IOException e2) {
                System.out.println("Error while closing file:-- "+e2.getMessage());
            }
        }
        
      }
    }

    // Create the KVS audio buffering thread (started once the streaming call is open)
    AWSAudioBuffer awsRunnable = new AWSAudioBuffer();
    Thread awsAudioThread = new Thread(awsRunnable);
    
    // Response observer
    ResponseObserver<StreamingRecognizeResponse> responseObserver = null;
    try (SpeechClient client = SpeechClient.create()) {
      ArrayList<String> languageList = new ArrayList<>();
      languageList.add("en-IN");
      languageList.add("hi-IN");
      ClientStream<StreamingRecognizeRequest> clientStream;
      responseObserver =
          new ResponseObserver<StreamingRecognizeResponse>() {

            ArrayList<StreamingRecognizeResponse> responses = new ArrayList<>();

            public void onStart(StreamController controller) {
              referenceToStreamController = controller;
            }

            public void onResponse(StreamingRecognizeResponse response) {
              responses.add(response);
              StreamingRecognitionResult result = response.getResultsList().get(0);
              Duration resultEndTime = result.getResultEndTime();
              resultEndTimeInMS =
                  (int)
                      ((resultEndTime.getSeconds() * 1000) + (resultEndTime.getNanos() / 1000000));
              double correctedTime =
                  resultEndTimeInMS - bridgingOffset + (STREAMING_LIMIT * restartCounter);

              SpeechRecognitionAlternative alternative = result.getAlternativesList().get(0);
              if (result.getIsFinal()) {
                System.out.print(GREEN);
                System.out.print("\033[2K\r");
                System.out.printf(
                    "%s: %s [confidence: %.2f]\n",
                    convertMillisToDate(correctedTime),
                    alternative.getTranscript(),
                    alternative.getConfidence());
                isFinalEndTime = resultEndTimeInMS;
                lastTranscriptWasFinal = true;
              } else {
                System.out.print(RED);
                System.out.print("\033[2K\r");
                System.out.printf(
                    "%s: %s", convertMillisToDate(correctedTime), alternative.getTranscript());
                lastTranscriptWasFinal = false;
              }
            }

            public void onComplete() {}

            public void onError(Throwable t) {
              // streaming errors are currently swallowed here
            }
          };
      clientStream = client.streamingRecognizeCallable().splitCall(responseObserver);

      RecognitionConfig recognitionConfig =
          RecognitionConfig.newBuilder()
              .setEncoding(RecognitionConfig.AudioEncoding.LINEAR16)
              .setLanguageCode(languageCode)
              .addAllAlternativeLanguageCodes(languageList)
              .setSampleRateHertz(8000)
              .build();

      StreamingRecognitionConfig streamingRecognitionConfig =
          StreamingRecognitionConfig.newBuilder()
              .setConfig(recognitionConfig)
              .setInterimResults(false)
              .build();

      StreamingRecognizeRequest request =
          StreamingRecognizeRequest.newBuilder()
              .setStreamingConfig(streamingRecognitionConfig)
              .build(); // The first request in a streaming call has to be a config

      clientStream.send(request);

      try {
        
          awsAudioThread.start();

        long startTime = System.currentTimeMillis();

        while (true) {

          long estimatedTime = System.currentTimeMillis() - startTime;

          if (estimatedTime >= STREAMING_LIMIT) {

            clientStream.closeSend();
            referenceToStreamController.cancel(); // remove Observer

            if (resultEndTimeInMS > 0) {
              finalRequestEndTime = isFinalEndTime;
            }
            resultEndTimeInMS = 0;

            lastAudioInput = null;
            lastAudioInput = audioInput;
            audioInput = new ArrayList<ByteString>();

            restartCounter++;

            if (!lastTranscriptWasFinal) {
              System.out.print('\n');
            }

            newStream = true;

            clientStream = client.streamingRecognizeCallable().splitCall(responseObserver);

            request =
                StreamingRecognizeRequest.newBuilder()
                    .setStreamingConfig(streamingRecognitionConfig)
                    .build();

            System.out.println(YELLOW);
            System.out.printf("%d: RESTARTING REQUEST\n", restartCounter * STREAMING_LIMIT);

            startTime = System.currentTimeMillis();

          } else {

            if ((newStream) && (lastAudioInput.size() > 0)) {
              // if this is the first audio from a new request
              // calculate amount of unfinalized audio from last request
              // resend the audio to the speech client before incoming audio
              double chunkTime = (double) STREAMING_LIMIT / lastAudioInput.size();
              // ms length of each chunk in previous request audio arrayList
              if (chunkTime != 0) {
                if (bridgingOffset < 0) {
                  // bridging Offset accounts for time of resent audio
                  // calculated from last request
                  bridgingOffset = 0;
                }
                if (bridgingOffset > finalRequestEndTime) {
                  bridgingOffset = finalRequestEndTime;
                }
                int chunksFromMs =
                    (int) Math.floor((finalRequestEndTime - bridgingOffset) / chunkTime);
                // chunks from MS is number of chunks to resend
                bridgingOffset =
                    (int) Math.floor((lastAudioInput.size() - chunksFromMs) * chunkTime);
                // set bridging offset for next request
                for (int i = chunksFromMs; i < lastAudioInput.size(); i++) {
                  request =
                      StreamingRecognizeRequest.newBuilder()
                          .setAudioContent(lastAudioInput.get(i))
                          .build();
                  clientStream.send(request);
                }
              }
              newStream = false;
            }

            tempByteString = ByteString.copyFrom(sharedQueue.take());

            request =
                StreamingRecognizeRequest.newBuilder().setAudioContent(tempByteString).build();

            audioInput.add(tempByteString);
          }
          clientStream.send(request);
          if(breakTranscription) {
              clientStream.closeSend();
              break;
          }
          
        }
      } catch (Exception e) {
        System.out.println(e);
      }
    }
  }
}

When I set the language code to "en-IN", only partial transcripts are produced and they are poor. I also tried alternative languages by passing languageCode="en-US" and adding "en-IN" and "hi-IN" to the alternative language list, but that started producing transcripts in Hindi even when I was speaking English. What am I doing wrong? Can someone help?
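
For clarity, this is roughly what that alternative-language attempt looked like (same v1p1beta1 config as above, only the language settings changed):

ArrayList<String> languageList = new ArrayList<>();
languageList.add("en-IN");
languageList.add("hi-IN");

RecognitionConfig recognitionConfig =
    RecognitionConfig.newBuilder()
        .setEncoding(RecognitionConfig.AudioEncoding.LINEAR16)
        .setLanguageCode("en-US")                     // primary language
        .addAllAlternativeLanguageCodes(languageList) // en-IN and hi-IN as alternatives
        .setSampleRateHertz(8000)
        .build();
// Result: responses came back transcribed in Hindi even for English speech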
