Android: How to upgrade Google Cloud Speech from v1 to v2?

I am working on a project that uses v1 of the google-cloud-speech library, and I want to migrate it to v2, but I have been unable to find any useful migration guide online. I have upgraded all the gRPC libraries and added the Google Cloud Speech library to the project:

protobuf {
    protoc {
        artifact = "com.google.protobuf:protoc:3.25.2:osx-x86_64" // Specify the protoc version
    }
    plugins {
        grpc {
            artifact = 'io.grpc:protoc-gen-grpc-java:1.61.1:osx-x86_64' // Plugin for gRPC code generation
        }
    }
    generateProtoTasks {
        all().each { task ->
            task.builtins {
                java {}
            }
            task.plugins {
                grpc {} // Apply the grpc plugin to generate gRPC-related files
            }
        }
    }
    // Where the generated sources are written
    generatedFilesBaseDir = "$projectDir/src/generated"
}
// The following is in the dependencies section
// gRPC
implementation('io.grpc:grpc-okhttp:1.60.0') {
    exclude group: 'com.google.guava', module: 'guava'
}
implementation('io.grpc:grpc-stub:1.61.0') {
    exclude group: 'com.google.guava', module: 'guava'
}
implementation('com.google.cloud:google-cloud-speech:4.29.0') {
    exclude group: 'com.google.guava', module: 'guava'
}

The following SpeechService class is implemented with the v1 version of the library:

import android.app.Service;
import android.content.Intent;
import android.os.Binder;
import android.os.IBinder;
import androidx.annotation.NonNull;
import androidx.annotation.Nullable;
import android.text.TextUtils;
import android.util.Log;
import com.google.cloud.speech.v1.RecognitionConfig;
import com.google.cloud.speech.v1.SpeechContext;
import com.google.cloud.speech.v1.SpeechGrpc;
import com.google.cloud.speech.v1.SpeechRecognitionAlternative;
import com.google.cloud.speech.v1.StreamingRecognitionConfig;
import com.google.cloud.speech.v1.StreamingRecognitionResult;
import com.google.cloud.speech.v1.StreamingRecognizeRequest;
import com.google.cloud.speech.v1.StreamingRecognizeResponse;
import com.google.protobuf.ByteString;

import java.util.ArrayList;
import java.util.Locale;
import java.util.concurrent.TimeUnit;
import io.grpc.CallOptions;
import io.grpc.Channel;
import io.grpc.ClientCall;
import io.grpc.ClientInterceptor;
import io.grpc.ForwardingClientCall;
import io.grpc.ManagedChannel;
import io.grpc.Metadata;
import io.grpc.MethodDescriptor;
import io.grpc.internal.DnsNameResolverProvider;
import io.grpc.okhttp.OkHttpChannelProvider;
import io.grpc.stub.StreamObserver;


public class SpeechService extends Service {

    public interface Listener {

        /**
         * Called when a new piece of text was recognized by the Speech API.
         *
         * @param text    The text.
         * @param isFinal {@code true} when the API finished processing audio.
         */
        void onSpeechRecognized(String text, boolean isFinal);
        void onError(Throwable t);
        void onEnd();

    }
    private static final String TAG = "SpeechService";
    private static final String HOSTNAME = "speech.googleapis.com";
    private static final int PORT = 443;

    private final SpeechBinder mBinder = new SpeechBinder();
    private final ArrayList<Listener> mListeners = new ArrayList<>();
    private SpeechGrpc.SpeechStub mApi;

    public static volatile boolean finalReceived = false;

    private final StreamObserver<StreamingRecognizeResponse> mResponseObserver
            = new StreamObserver<StreamingRecognizeResponse>() {

        @Override
        public void onNext(StreamingRecognizeResponse response) {
            String text = null;
            boolean isFinal = false;


            if (response.getResultsCount() > 0) {
                final StreamingRecognitionResult result = response.getResults(0);
                isFinal = result.getIsFinal();
                finalReceived = isFinal;
                if (result.getAlternativesCount() > 0) {
                    final SpeechRecognitionAlternative alternative = result.getAlternatives(0);
                    text = alternative.getTranscript();
                }

                Log.e(TAG, "is Final : " + isFinal + " .. " + text);
            }
            if (text != null) {
                for (Listener listener : mListeners) {
                    listener.onSpeechRecognized(text, isFinal);
                }
            }
        }

        @Override
        public void onError(Throwable t) {
            Log.e(TAG, "Error calling the API.", t);
            for (Listener listener : mListeners) {
                listener.onError(t);
            }
        }

        @Override
        public void onCompleted() {
            Log.i(TAG, "API completed start: " + finalReceived);
            if (!finalReceived) {
                for (Listener listener : mListeners) {
                    listener.onEnd();
                }
                finalReceived = false;
            }
            Log.i(TAG, "API completed." + finalReceived);
        }

    };



    private StreamObserver<StreamingRecognizeRequest> mRequestObserver;

    public static SpeechService from(IBinder binder) {
        return ((SpeechBinder) binder).getService();
    }

    private String getSignature(){
        return PackageManagerUtils.getSignature(getPackageManager(), "com..");
    }

    @Override
    public void onCreate() {
        super.onCreate();

        final ManagedChannel channel = new OkHttpChannelProvider()
                .builderForAddress(HOSTNAME, PORT)
                .nameResolverFactory(new DnsNameResolverProvider())
                .intercept(new ApiKeyInterceptor(getSignature()))
                .build();
        mApi = SpeechGrpc.newStub(channel);
    }

    @Override
    public void onDestroy() {
        super.onDestroy();

        // Release the gRPC channel.
        if (mApi != null) {
            final ManagedChannel channel = (ManagedChannel) mApi.getChannel();
            if (channel != null && !channel.isShutdown()) {
                try {
                    channel.shutdown().awaitTermination(5, TimeUnit.SECONDS);
                } catch (InterruptedException e) {
                    Log.e(TAG, "Error shutting down the gRPC channel.", e);
                }
            }
            mApi = null;
        }
    }

    private String getDefaultLanguageCode() {
        final Locale locale = Locale.getDefault();
        final StringBuilder language = new StringBuilder(locale.getLanguage());
        final String country = locale.getCountry();
        if (!TextUtils.isEmpty(country)) {
            language.append("-");
            language.append(country);
        }
        Log.v(TAG,language+"");
        return language.toString();
    }

    @Nullable
    @Override
    public IBinder onBind(Intent intent) {
        return mBinder;
    }

    public void addListener(@NonNull Listener listener) {
        mListeners.add(listener);
    }

    public void removeListener(@NonNull Listener listener) {
        mListeners.remove(listener);
    }

    /**
     * Starts recognizing speech audio.
     *
     * @param sampleRate The sample rate of the audio.
     */

    public void startRecognizing(int sampleRate) {
        if (mApi == null) {
            Log.w(TAG, "API not ready. Ignoring the request.");
            return;
        }
        // Configure the API
        finalReceived = false;
        mRequestObserver = mApi.streamingRecognize(mResponseObserver);

        ArrayList<String> phrases = new ArrayList<>();
        phrases.add("kind");
        phrases.add("kin");
        phrases.add("what is");
        phrases.add("i love");
        phrases.add("like");
        SpeechContext context = SpeechContext.newBuilder().addAllPhrases(phrases).build();
        ArrayList<SpeechContext> contexts = new ArrayList<>();
        contexts.add(context);


        RecognitionConfig config = RecognitionConfig.newBuilder()
//                .setLanguageCode(getDefaultLanguageCode())
                .setEncoding(RecognitionConfig.AudioEncoding.LINEAR16)
                .setSampleRateHertz(sampleRate)
                .addAllSpeechContexts(contexts)
                .setLanguageCode(KaiProfile.getInstance().getLanguage())
                .build();

        mRequestObserver.onNext(StreamingRecognizeRequest.newBuilder()
                .setStreamingConfig(StreamingRecognitionConfig.newBuilder()
                        .setConfig(config)
                        .setInterimResults(true)
                        .setSingleUtterance(false)
                        .build())
                .build());
    }

    /**
     * Recognizes the speech audio. This method should be called every time a chunk of byte buffer
     * is ready.
     *
     * @param data The audio data.
     * @param size The number of elements that are actually relevant in the {@code data}.
     */
    public void recognize(byte[] data, int size) {
        if (mRequestObserver == null) {
            return;
        }
        // Call the streaming recognition API
        mRequestObserver.onNext(StreamingRecognizeRequest.newBuilder()
                .setAudioContent(ByteString.copyFrom(data, 0, size))
                .build());
    }

    /**
     * Finishes recognizing speech audio.
     */
    public void finishRecognizing() {
        if (mRequestObserver == null) {
            return;
        }
        mRequestObserver.onCompleted();
        mRequestObserver = null;
    }


    private class SpeechBinder extends Binder {

        SpeechService getService() {
            return SpeechService.this;
        }

    }



    private static final class ApiKeyInterceptor implements ClientInterceptor {
        private final String signature;

        private static Metadata.Key<String> API_KEY_HEADER =
                Metadata.Key.of("x-goog-api-key", Metadata.ASCII_STRING_MARSHALLER);
        private static Metadata.Key<String> API_APP_KEY =
                Metadata.Key.of("X-Android-Package", Metadata.ASCII_STRING_MARSHALLER);

        private static Metadata.Key<String> APP_CERT_KEY =
                Metadata.Key.of("X-Android-Cert", Metadata.ASCII_STRING_MARSHALLER);
        ApiKeyInterceptor(String signature) {
            this.signature = signature;
        }

        @Override
        public <ReqT, RespT> ClientCall<ReqT, RespT> interceptCall(
                MethodDescriptor<ReqT, RespT> method, CallOptions callOptions, Channel next) {
            ClientCall<ReqT, RespT> call = next.newCall(method, callOptions);
            call = new ForwardingClientCall.SimpleForwardingClientCall<ReqT, RespT>(call) {
                @Override
                public void start(Listener<RespT> responseListener, Metadata headers) {
                    if (BuildConfig.API_KEY != null && !BuildConfig.API_KEY.isEmpty()) {
                        headers.put(API_KEY_HEADER, BuildConfig.API_KEY);
                        headers.put(API_APP_KEY, "com..");
                        headers.put(APP_CERT_KEY, signature);
                        Log.v(TAG,"signature " + signature);
                    }
                    super.start(responseListener, headers);
                }
            };
            return call;
        }
    }
}
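
Before the rewritten class, this is my understanding of how the streaming pieces map from v1 to v2, pieced together from the v2 javadocs (a minimal sketch, ignoring the Android auth plumbing; the language code and sample rate are placeholders):

import com.google.api.gax.rpc.ClientStream;
import com.google.api.gax.rpc.ResponseObserver;
import com.google.api.gax.rpc.StreamController;
import com.google.cloud.speech.v2.ExplicitDecodingConfig;
import com.google.cloud.speech.v2.RecognitionConfig;
import com.google.cloud.speech.v2.SpeechClient;
import com.google.cloud.speech.v2.StreamingRecognitionConfig;
import com.google.cloud.speech.v2.StreamingRecognizeRequest;
import com.google.cloud.speech.v2.StreamingRecognizeResponse;
import com.google.protobuf.ByteString;

public class V2StreamingSketch {

    public static void main(String[] args) throws Exception {
        // v1's SpeechGrpc.SpeechStub is replaced by the gax-based SpeechClient.
        try (SpeechClient client = SpeechClient.create()) {
            ResponseObserver<StreamingRecognizeResponse> observer =
                    new ResponseObserver<StreamingRecognizeResponse>() {
                        @Override public void onStart(StreamController controller) {}
                        @Override public void onResponse(StreamingRecognizeResponse r) { System.out.println(r); }
                        @Override public void onError(Throwable t) { t.printStackTrace(); }
                        @Override public void onComplete() {}
                    };

            // v1: stub.streamingRecognize(responseObserver) returned a StreamObserver;
            // v2: splitCall(...) returns a ClientStream instead.
            ClientStream<StreamingRecognizeRequest> stream =
                    client.streamingRecognizeCallable().splitCall(observer);

            // The first request carries only the config. Encoding and sample rate
            // moved from RecognitionConfig into ExplicitDecodingConfig.
            stream.send(StreamingRecognizeRequest.newBuilder()
                    .setStreamingConfig(StreamingRecognitionConfig.newBuilder()
                            .setConfig(RecognitionConfig.newBuilder()
                                    .addLanguageCodes("en-US")
                                    .setExplicitDecodingConfig(ExplicitDecodingConfig.newBuilder()
                                            .setEncoding(ExplicitDecodingConfig.AudioEncoding.LINEAR16)
                                            .setSampleRateHertz(16000)
                                            .setAudioChannelCount(1))))
                    .build());

            // Audio requests: v1 setAudioContent(...) is now setAudio(...).
            stream.send(StreamingRecognizeRequest.newBuilder()
                    .setAudio(ByteString.copyFrom(new byte[3200]))
                    .build());

            // v1: requestObserver.onCompleted() is now stream.closeSend().
            stream.closeSend();
        }
    }
}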

The following is the SpeechService class rewritten for the v2 version:

import android.app.Service;
import android.content.Intent;
import android.os.Binder;
import android.os.IBinder;
import androidx.annotation.NonNull;
import androidx.annotation.Nullable;
import android.text.TextUtils;
import android.util.Log;

import com.google.api.gax.rpc.ClientStream;
import com.google.api.gax.grpc.GrpcTransportChannel;
import com.google.api.gax.rpc.FixedTransportChannelProvider;
import com.google.api.gax.rpc.ResponseObserver;
import com.google.api.gax.rpc.StreamController;
import com.google.cloud.speech.v2.ExplicitDecodingConfig;
import com.google.cloud.speech.v2.RecognitionConfig;
import com.google.cloud.speech.v2.SpeechRecognitionAlternative;
import com.google.cloud.speech.v2.SpeechSettings;
import com.google.cloud.speech.v2.StreamingRecognitionConfig;
import com.google.cloud.speech.v2.StreamingRecognitionFeatures;
import com.google.cloud.speech.v2.StreamingRecognitionResult;
import com.google.cloud.speech.v2.StreamingRecognizeRequest;
import com.google.cloud.speech.v2.StreamingRecognizeResponse;
import com.google.cloud.speech.v2.SpeechClient;
import com.google.cloud.speech.v2.stub.GrpcSpeechStub;
import com.google.cloud.speech.v2.stub.SpeechStubSettings;
import com.google.protobuf.ByteString;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Locale;
import java.util.concurrent.TimeUnit;
import io.grpc.CallOptions;
import io.grpc.Channel;
import io.grpc.ClientCall;
import io.grpc.ClientInterceptor;
import io.grpc.ForwardingClientCall;
import io.grpc.ManagedChannel;
import io.grpc.Metadata;
import io.grpc.MethodDescriptor;
import io.grpc.internal.DnsNameResolverProvider;
import io.grpc.okhttp.OkHttpChannelProvider;
import io.grpc.stub.StreamObserver;


public class SpeechService extends Service {

    public interface Listener {

        /**
         * Called when a new piece of text was recognized by the Speech API.
         *
         * @param text    The text.
         * @param isFinal {@code true} when the API finished processing audio.
         */
        void onSpeechRecognized(String text, boolean isFinal);
        void onError(Throwable t);
        void onEnd();

    }
    private static final String TAG = "SpeechService";
    private static final String HOSTNAME = "speech.googleapis.com";
    private static final int PORT = 443;

    private ClientStream<StreamingRecognizeRequest> clientStream;
    private SpeechClient speechClient = null;
    private GrpcSpeechStub grpcStub;

    private final SpeechBinder mBinder = new SpeechBinder();
    private final ArrayList<Listener> mListeners = new ArrayList<>();

    public static volatile boolean finalReceived = false;

    private final ResponseObserver<StreamingRecognizeResponse> mResponseObserver
            = new ResponseObserver<StreamingRecognizeResponse>() {

        @Override
        public void onStart(StreamController controller) {

        }

        @Override
        public void onResponse(StreamingRecognizeResponse response) {
            String text = null;
            boolean isFinal = false;


            if (response.getResultsCount() > 0) {
                final StreamingRecognitionResult result = response.getResults(0);
                isFinal = result.getIsFinal();
                finalReceived = isFinal;
                if (result.getAlternativesCount() > 0) {
                    final SpeechRecognitionAlternative alternative = result.getAlternatives(0);
                    text = alternative.getTranscript();
                }

                Log.e(TAG, "is Final : " + isFinal + " .. " + text);
            }
            if (text != null) {
                for (Listener listener : mListeners) {
                    listener.onSpeechRecognized(text, isFinal);
                }
            }
        }

        @Override
        public void onError(Throwable t) {
            Log.e(TAG, "Error calling the API.", t);
            for (Listener listener : mListeners) {
                listener.onError(t);
            }
        }

        @Override
        public void onComplete() {
            Log.i(TAG, "API completed start: " + finalReceived);
            if (!finalReceived) {
                for (Listener listener : mListeners) {
                    listener.onEnd();
                }
                finalReceived = false;
            }
            Log.i(TAG, "API completed." + finalReceived);
        }

    };




    public static SpeechService from(IBinder binder) {
        return ((SpeechBinder) binder).getService();
    }

    private String getSignature(){
        return PackageManagerUtils.getSignature(getPackageManager(), "com...");
    }

    @Override
    public void onCreate() {
        super.onCreate();
        // The v2 client is created lazily in initSpeechClient().
    }

    public void initSpeechClient() {
        if (speechClient == null) {
            try {
                final ManagedChannel channel = new OkHttpChannelProvider()
                        .builderForAddress(HOSTNAME, PORT)
                        .nameResolverFactory(new DnsNameResolverProvider())
                        .intercept(new ApiKeyInterceptor(getSignature()))
                        .build();
                SpeechStubSettings.Builder builder = SpeechStubSettings.newBuilder();
                builder.setCredentialsProvider(new SpeechCredentialsProvider(this));
                builder.setEndpoint(HOSTNAME + ":" + PORT);
                // Route the stub through the intercepted channel so the API key
                // headers are actually attached to outgoing calls.
                builder.setTransportChannelProvider(
                        FixedTransportChannelProvider.create(GrpcTransportChannel.create(channel)));
                grpcStub = GrpcSpeechStub.create(builder.build());
                speechClient = SpeechClient.create(grpcStub);
            } catch (IOException e) {
                Log.e(TAG, "Failed to create the v2 SpeechClient.", e);
            }
        }
    }

    @Override
    public void onDestroy() {
        super.onDestroy();
        // Release the v2 client and its underlying gRPC channel.
        if (speechClient != null) {
            speechClient.close();
            speechClient = null;
            grpcStub = null;
        }
    }

    private String getDefaultLanguageCode() {
        final Locale locale = Locale.getDefault();
        final StringBuilder language = new StringBuilder(locale.getLanguage());
        final String country = locale.getCountry();
        if (!TextUtils.isEmpty(country)) {
            language.append("-");
            language.append(country);
        }
        Log.v(TAG,language+"");
        return language.toString();
    }

    @Nullable
    @Override
    public IBinder onBind(Intent intent) {
        return mBinder;
    }

    public void addListener(@NonNull Listener listener) {
        mListeners.add(listener);
    }

    public void removeListener(@NonNull Listener listener) {
        mListeners.remove(listener);
    }

    /**
     * Starts recognizing speech audio.
     *
     * @param sampleRate The sample rate of the audio.
     */

    public void startRecognizing(int sampleRate) {
        if (speechClient == null) {
            initSpeechClient();
        }
        finalReceived = false;
        clientStream = speechClient.streamingRecognizeCallable().splitCall(mResponseObserver);
        // The first request on the stream carries only the configuration.
        StreamingRecognizeRequest streamRequest = StreamingRecognizeRequest.newBuilder()
                .setStreamingConfig(StreamingRecognitionConfig.newBuilder()
                        .setConfig(RecognitionConfig.newBuilder()
                                .addLanguageCodes(getDefaultLanguageCode())
                                .setExplicitDecodingConfig(ExplicitDecodingConfig.newBuilder()
                                        .setEncoding(ExplicitDecodingConfig.AudioEncoding.LINEAR16)
                                        .setSampleRateHertz(sampleRate))
                                .build())
                        .setStreamingFeatures(StreamingRecognitionFeatures.newBuilder()
                                .setInterimResults(true)
                                .build())
                        .build())
                .build();
        clientStream.send(streamRequest);
    }


    /**
     * Recognizes the speech audio. This method should be called every time a chunk of byte buffer
     * is ready.
     *
     * @param data The audio data.
     * @param size The number of elements that are actually relevant in the {@code data}.
     */
    public void recognize(byte[] data, int size) {
        if (clientStream == null) {
            return;
        }
        // Stream the audio chunk to the API (v2 uses setAudio instead of setAudioContent).
        clientStream.send(StreamingRecognizeRequest.newBuilder()
                .setAudio(ByteString.copyFrom(data, 0, size))
                .build());
    }

    /**
     * Finishes recognizing speech audio.
     */
    public void finishRecognizing() {
        if (clientStream == null) {
            return;
        }
        // closeSend() is the v2 equivalent of StreamObserver.onCompleted().
        clientStream.closeSend();
        clientStream = null;
    }


    private class SpeechBinder extends Binder {

        SpeechService getService() {
            return SpeechService.this;
        }

    }



    private static final class ApiKeyInterceptor implements ClientInterceptor {
        private final String signature;

        private static Metadata.Key<String> API_KEY_HEADER =
                Metadata.Key.of("x-goog-api-key", Metadata.ASCII_STRING_MARSHALLER);
        private static Metadata.Key<String> API_APP_KEY =
                Metadata.Key.of("X-Android-Package", Metadata.ASCII_STRING_MARSHALLER);

        private static Metadata.Key<String> APP_CERT_KEY =
                Metadata.Key.of("X-Android-Cert", Metadata.ASCII_STRING_MARSHALLER);
        ApiKeyInterceptor(String signature) {
            this.signature = signature;
        }

        @Override
        public <ReqT, RespT> ClientCall<ReqT, RespT> interceptCall(
                MethodDescriptor<ReqT, RespT> method, CallOptions callOptions, Channel next) {
            ClientCall<ReqT, RespT> call = next.newCall(method, callOptions);
            call = new ForwardingClientCall.SimpleForwardingClientCall<ReqT, RespT>(call) {
                @Override
                public void start(Listener<RespT> responseListener, Metadata headers) {
                    if (BuildConfig.API_KEY != null && !BuildConfig.API_KEY.isEmpty()) {
                        headers.put(API_KEY_HEADER, BuildConfig.API_KEY);
                        headers.put(API_APP_KEY, "com..");
                        headers.put(APP_CERT_KEY, signature);
                        Log.v(TAG,"signature " + signature);
                    }
                    super.start(responseListener, headers);
                }
            };
            return call;
        }
    }
}
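
For context, the service is driven from my recording thread roughly like this (simplified from my own code; the callback names are mine):

// Simplified driver, shown for context; the callbacks come from my own AudioRecord wrapper.
public class RecognitionDriver {

    private final SpeechService speechService;

    public RecognitionDriver(SpeechService speechService) {
        this.speechService = speechService;
    }

    // Called once when the microphone starts delivering audio.
    public void onVoiceStart(int sampleRate) {
        speechService.startRecognizing(sampleRate);
    }

    // Called for every PCM chunk read from AudioRecord.
    public void onVoice(byte[] data, int size) {
        speechService.recognize(data, size);
    }

    // Called when the utterance ends.
    public void onVoiceEnd() {
        speechService.finishRecognizing();
    }
}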

When I run the v2 version of the speech service and speak into the mic, I get the following error:

Error calling the API.
com.google.api.gax.rpc.InvalidArgumentException: io.grpc.StatusRuntimeException: INVALID_ARGUMENT: Invalid resource field value in the request.
    at com.google.api.gax.rpc.ApiExceptionFactory.createException(ApiExceptionFactory.java:92)
    at com.google.api.gax.grpc.GrpcApiExceptionFactory.create(GrpcApiExceptionFactory.java:98)
    at com.google.api.gax.grpc.GrpcApiExceptionFactory.create(GrpcApiExceptionFactory.java:66)
    at com.google.api.gax.grpc.ExceptionResponseObserver.onErrorImpl(ExceptionResponseObserver.java:82)
    at com.google.api.gax.rpc.StateCheckingResponseObserver.onError(StateCheckingResponseObserver.java:84)
    at com.google.api.gax.grpc.GrpcDirectStreamController$ResponseObserverAdapter.onClose(GrpcDirectStreamController.java:148)
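
Searching the error message suggests that v2 streaming requests are supposed to name a recognizer resource, which my code never sets anywhere. This is my current guess at the missing piece (a sketch only; the project ID is a placeholder, and I have not verified that the default "_" recognizer or the "long" model is right here):

import com.google.cloud.speech.v2.ExplicitDecodingConfig;
import com.google.cloud.speech.v2.RecognitionConfig;
import com.google.cloud.speech.v2.StreamingRecognitionConfig;
import com.google.cloud.speech.v2.StreamingRecognizeRequest;

public class RecognizerGuess {

    // Builds the first streaming request with a recognizer resource name set.
    // "my-project-id" is a placeholder; "_" should be the default recognizer,
    // and I think a model must then be set in the config, but both are guesses.
    static StreamingRecognizeRequest firstRequest(int sampleRate, String languageCode) {
        return StreamingRecognizeRequest.newBuilder()
                .setRecognizer("projects/my-project-id/locations/global/recognizers/_")
                .setStreamingConfig(StreamingRecognitionConfig.newBuilder()
                        .setConfig(RecognitionConfig.newBuilder()
                                .addLanguageCodes(languageCode)
                                .setModel("long")
                                .setExplicitDecodingConfig(ExplicitDecodingConfig.newBuilder()
                                        .setEncoding(ExplicitDecodingConfig.AudioEncoding.LINEAR16)
                                        .setSampleRateHertz(sampleRate)
                                        .setAudioChannelCount(1))))
                .build();
    }
}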

Any help would be appreciated.
