I am working on a project that uses the v1 version of google-cloud-speech, and I want to migrate to the v2 version of the library, but I have been unable to find any useful guides online for this task. I have upgraded all the gRPC libraries and added the Google Cloud Speech library to the project:
protobuf {
    protoc {
        artifact = "com.google.protobuf:protoc:3.25.2:osx-x86_64" // Specify the protoc version
    }
    plugins {
        grpc {
            artifact = 'io.grpc:protoc-gen-grpc-java:1.61.1:osx-x86_64' // Specify the plugin for gRPC if needed
        }
    }
    generateProtoTasks {
        all().each { task ->
            task.builtins {
                java {}
            }
            task.plugins {
                grpc {} // Apply the grpc plugin to generate gRPC-related files
            }
        }
    }
    // Adjust the protoPath to fix the issue
    generatedFilesBaseDir = "$projectDir/src/generated"
}
// The following is in the dependencies section
// gRPC
implementation("io.grpc:grpc-okhttp:1.60.0") {
    exclude group: 'com.google.guava', module: 'guava'
}
implementation('io.grpc:grpc-stub:1.61.0') {
    exclude group: 'com.google.guava', module: 'guava'
}
implementation('com.google.cloud:google-cloud-speech:4.29.0') {
    exclude group: 'com.google.guava', module: 'guava'
}
The following SpeechService class is implemented with the v1 version of the library:
import android.app.Service;
import android.content.Intent;
import android.os.Binder;
import android.os.IBinder;
import androidx.annotation.NonNull;
import androidx.annotation.Nullable;
import android.text.TextUtils;
import android.util.Log;
import com.google.cloud.speech.v1.RecognitionConfig;
import com.google.cloud.speech.v1.SpeechContext;
import com.google.cloud.speech.v1.SpeechGrpc;
import com.google.cloud.speech.v1.SpeechRecognitionAlternative;
import com.google.cloud.speech.v1.StreamingRecognitionConfig;
import com.google.cloud.speech.v1.StreamingRecognitionResult;
import com.google.cloud.speech.v1.StreamingRecognizeRequest;
import com.google.cloud.speech.v1.StreamingRecognizeResponse;
import com.google.protobuf.ByteString;
import java.util.ArrayList;
import java.util.Locale;
import java.util.concurrent.TimeUnit;
import io.grpc.CallOptions;
import io.grpc.Channel;
import io.grpc.ClientCall;
import io.grpc.ClientInterceptor;
import io.grpc.ForwardingClientCall;
import io.grpc.ManagedChannel;
import io.grpc.Metadata;
import io.grpc.MethodDescriptor;
import io.grpc.internal.DnsNameResolverProvider;
import io.grpc.okhttp.OkHttpChannelProvider;
import io.grpc.stub.StreamObserver;
public class SpeechService extends Service {
public interface Listener {
/**
* Called when a new piece of text was recognized by the Speech API.
*
* @param text The text.
* @param isFinal {@code true} when the API finished processing audio.
*/
void onSpeechRecognized(String text, boolean isFinal);
void onError(Throwable t);
void onEnd();
}
private static final String TAG = "SpeechService";
private static final String HOSTNAME = "speech.googleapis.com";
private static final int PORT = 443;
private final SpeechBinder mBinder = new SpeechBinder();
private final ArrayList<Listener> mListeners = new ArrayList<>();
private SpeechGrpc.SpeechStub mApi;
public static volatile boolean finalReceived = false;
private final StreamObserver<StreamingRecognizeResponse> mResponseObserver
= new StreamObserver<StreamingRecognizeResponse>() {
@Override
public void onNext(StreamingRecognizeResponse response) {
String text = null;
boolean isFinal = false;
if (response.getResultsCount() > 0) {
final StreamingRecognitionResult result = response.getResults(0);
isFinal = result.getIsFinal();
finalReceived = isFinal;
if (result.getAlternativesCount() > 0) {
final SpeechRecognitionAlternative alternative = result.getAlternatives(0);
text = alternative.getTranscript();
}
Log.e(TAG, "is Final : " + isFinal + " .. " + text);
}
if (text != null) {
for (Listener listener : mListeners) {
listener.onSpeechRecognized(text, isFinal);
}
}
}
@Override
public void onError(Throwable t) {
Log.e(TAG, "Error calling the API.", t);
for (Listener listener : mListeners) {
listener.onError(t);
}
}
@Override
public void onCompleted() {
Log.i(TAG, "API completed start: " + finalReceived);
if (!finalReceived) {
for (Listener listener : mListeners) {
listener.onEnd();
}
finalReceived = false;
}
Log.i(TAG, "API completed." + finalReceived);
}
};
private StreamObserver<StreamingRecognizeRequest> mRequestObserver;
public static SpeechService from(IBinder binder) {
return ((SpeechBinder) binder).getService();
}
private String getSignature(){
return PackageManagerUtils.getSignature(getPackageManager(), "com..");
}
@Override
public void onCreate() {
super.onCreate();
final ManagedChannel channel = new OkHttpChannelProvider()
.builderForAddress(HOSTNAME, PORT)
.nameResolverFactory(new DnsNameResolverProvider())
.intercept(new ApiKeyInterceptor(getSignature()))
.build();
mApi = SpeechGrpc.newStub(channel);
}
@Override
public void onDestroy() {
super.onDestroy();
// Release the gRPC channel.
if (mApi != null) {
final ManagedChannel channel = (ManagedChannel) mApi.getChannel();
if (channel != null && !channel.isShutdown()) {
try {
channel.shutdown().awaitTermination(5, TimeUnit.SECONDS);
} catch (InterruptedException e) {
Log.e(TAG, "Error shutting down the gRPC channel.", e);
}
}
mApi = null;
}
}
private String getDefaultLanguageCode() {
final Locale locale = Locale.getDefault();
final StringBuilder language = new StringBuilder(locale.getLanguage());
final String country = locale.getCountry();
if (!TextUtils.isEmpty(country)) {
language.append("-");
language.append(country);
}
Log.v(TAG,language+"");
return language.toString();
}
@Nullable
@Override
public IBinder onBind(Intent intent) {
return mBinder;
}
public void addListener(@NonNull Listener listener) {
mListeners.add(listener);
}
public void removeListener(@NonNull Listener listener) {
mListeners.remove(listener);
}
/**
* Starts recognizing speech audio.
*
* @param sampleRate The sample rate of the audio.
*/
public void startRecognizing(int sampleRate) {
if (mApi == null) {
Log.w(TAG, "API not ready. Ignoring the request.");
return;
}
// Configure the API
finalReceived = false;
mRequestObserver = mApi.streamingRecognize(mResponseObserver);
ArrayList<String> phrases = new ArrayList<>();
phrases.add("kind");
phrases.add("kin");
phrases.add("what is");
phrases.add("i love");
phrases.add("like");
SpeechContext context = SpeechContext.newBuilder().addAllPhrases(phrases).build();
ArrayList<SpeechContext> contexts = new ArrayList<>();
contexts.add(context);
RecognitionConfig config = RecognitionConfig.newBuilder()
// .setLanguageCode(getDefaultLanguageCode())
.setEncoding(RecognitionConfig.AudioEncoding.LINEAR16)
.setSampleRateHertz(sampleRate)
.addAllSpeechContexts(contexts)
.setLanguageCode(KaiProfile.getInstance().getLanguage())
.build();
mRequestObserver.onNext(StreamingRecognizeRequest.newBuilder()
.setStreamingConfig(StreamingRecognitionConfig.newBuilder()
.setConfig(config)
.setInterimResults(true)
.setSingleUtterance(false)
.build())
.build());
}
/**
* Recognizes the speech audio. This method should be called every time a chunk of byte buffer
* is ready.
*
* @param data The audio data.
* @param size The number of elements that are actually relevant in the {@code data}.
*/
public void recognize(byte[] data, int size) {
if (mRequestObserver == null) {
return;
}
// Call the streaming recognition API
mRequestObserver.onNext(StreamingRecognizeRequest.newBuilder()
.setAudioContent(ByteString.copyFrom(data, 0, size))
.build());
}
/**
* Finishes recognizing speech audio.
*/
public void finishRecognizing() {
if (mRequestObserver == null) {
return;
}
mRequestObserver.onCompleted();
mRequestObserver = null;
}
private class SpeechBinder extends Binder {
SpeechService getService() {
return SpeechService.this;
}
}
private static final class ApiKeyInterceptor implements ClientInterceptor {
private final String signature;
private static Metadata.Key<String> API_KEY_HEADER =
Metadata.Key.of("x-goog-api-key", Metadata.ASCII_STRING_MARSHALLER);
private static Metadata.Key<String> API_APP_KEY =
Metadata.Key.of("X-Android-Package", Metadata.ASCII_STRING_MARSHALLER);
private static Metadata.Key<String> APP_CERT_KEY =
Metadata.Key.of("X-Android-Cert", Metadata.ASCII_STRING_MARSHALLER);
ApiKeyInterceptor(String signature) {
this.signature = signature;
}
@Override
public <ReqT, RespT> ClientCall<ReqT, RespT> interceptCall(
MethodDescriptor<ReqT, RespT> method, CallOptions callOptions, Channel next) {
ClientCall<ReqT, RespT> call = next.newCall(method, callOptions);
call = new ForwardingClientCall.SimpleForwardingClientCall<ReqT, RespT>(call) {
@Override
public void start(Listener<RespT> responseListener, Metadata headers) {
if (BuildConfig.API_KEY != null && !BuildConfig.API_KEY.isEmpty()) {
headers.put(API_KEY_HEADER, BuildConfig.API_KEY);
headers.put(API_APP_KEY, "com..");
headers.put(APP_CERT_KEY, signature);
Log.v(TAG,"signature " + signature);
}
super.start(responseListener, headers);
}
};
return call;
}
}
}
The following is the SpeechService class after migrating to the v2 version of the library:
import android.app.Service;
import android.content.Intent;
import android.os.Binder;
import android.os.IBinder;
import androidx.annotation.NonNull;
import androidx.annotation.Nullable;
import android.text.TextUtils;
import android.util.Log;
import com.google.api.gax.rpc.ClientStream;
import com.google.api.gax.rpc.FixedTransportChannelProvider;
import com.google.api.gax.rpc.ResponseObserver;
import com.google.api.gax.rpc.StreamController;
import com.google.cloud.speech.v2.ExplicitDecodingConfig;
import com.google.cloud.speech.v2.RecognitionConfig;
import com.google.cloud.speech.v2.SpeechRecognitionAlternative;
import com.google.cloud.speech.v2.SpeechSettings;
import com.google.cloud.speech.v2.StreamingRecognitionConfig;
import com.google.cloud.speech.v2.StreamingRecognitionFeatures;
import com.google.cloud.speech.v2.StreamingRecognitionResult;
import com.google.cloud.speech.v2.StreamingRecognizeRequest;
import com.google.cloud.speech.v2.StreamingRecognizeResponse;
import com.google.cloud.speech.v2.SpeechClient;
import com.google.cloud.speech.v2.stub.GrpcSpeechStub;
import com.google.cloud.speech.v2.stub.SpeechStubSettings;
import com.google.protobuf.ByteString;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Locale;
import java.util.concurrent.TimeUnit;
import io.grpc.CallOptions;
import io.grpc.Channel;
import io.grpc.ClientCall;
import io.grpc.ClientInterceptor;
import io.grpc.ForwardingClientCall;
import io.grpc.ManagedChannel;
import io.grpc.Metadata;
import io.grpc.MethodDescriptor;
import io.grpc.internal.DnsNameResolverProvider;
import io.grpc.okhttp.OkHttpChannelProvider;
import io.grpc.stub.StreamObserver;
public class SpeechService extends Service {
public interface Listener {
/**
* Called when a new piece of text was recognized by the Speech API.
*
* @param text The text.
* @param isFinal {@code true} when the API finished processing audio.
*/
void onSpeechRecognized(String text, boolean isFinal);
void onError(Throwable t);
void onEnd();
}
private static final String TAG = "SpeechService";
private static final String HOSTNAME = "speech.googleapis.com";
private static final int PORT = 443;
private ClientStream<StreamingRecognizeRequest> clientStream;
private SpeechClient speechClient = null;
private GrpcSpeechStub grpcStub;
private final SpeechBinder mBinder = new SpeechBinder();
private final ArrayList<Listener> mListeners = new ArrayList<>();
//private SpeechGrpc.SpeechStub mApi;
public static volatile boolean finalReceived = false;
private final ResponseObserver<StreamingRecognizeResponse> mResponseObserver
= new ResponseObserver<StreamingRecognizeResponse>() {
@Override
public void onStart(StreamController controller) {
}
@Override
public void onResponse(StreamingRecognizeResponse response) {
String text = null;
boolean isFinal = false;
if (response.getResultsCount() > 0) {
final StreamingRecognitionResult result = response.getResults(0);
isFinal = result.getIsFinal();
finalReceived = isFinal;
if (result.getAlternativesCount() > 0) {
final SpeechRecognitionAlternative alternative = result.getAlternatives(0);
text = alternative.getTranscript();
}
Log.e(TAG, "is Final : " + isFinal + " .. " + text);
}
if (text != null) {
for (Listener listener : mListeners) {
listener.onSpeechRecognized(text, isFinal);
}
}
}
@Override
public void onError(Throwable t) {
Log.e(TAG, "Error calling the API.", t);
for (Listener listener : mListeners) {
listener.onError(t);
}
}
@Override
public void onComplete() {
Log.i(TAG, "API completed start: " + finalReceived);
if (!finalReceived) {
for (Listener listener : mListeners) {
listener.onEnd();
}
finalReceived = false;
}
Log.i(TAG, "API completed." + finalReceived);
}
};
private StreamObserver<StreamingRecognizeRequest> mRequestObserver;
public static SpeechService from(IBinder binder) {
return ((SpeechBinder) binder).getService();
}
private String getSignature(){
return PackageManagerUtils.getSignature(getPackageManager(), "com...");
}
@Override
public void onCreate() {
super.onCreate();
/*final ManagedChannel channel = new OkHttpChannelProvider()
.builderForAddress(HOSTNAME, PORT)
.nameResolverFactory(new DnsNameResolverProvider())
.intercept(new ApiKeyInterceptor(getSignature()))
.build();
mApi = SpeechGrpc.newStub(channel);*/
}
public void initSpeechClient() {
if (speechClient == null) {
try {
SpeechStubSettings.Builder builder = SpeechStubSettings.newBuilder();
builder.setCredentialsProvider(new SpeechCredentialsProvider(this));
builder.setEndpoint(HOSTNAME + ":" + PORT);
final ManagedChannel channel = new OkHttpChannelProvider()
.builderForAddress(HOSTNAME, PORT)
.nameResolverFactory(new DnsNameResolverProvider())
.intercept(new ApiKeyInterceptor(getSignature()))
.build();
SpeechSettings.newBuilder().setTransportChannelProvider(FixedTransportChannelProvider.create(channel));
grpcStub = GrpcSpeechStub.create(builder.build());
speechClient = SpeechClient.create(grpcStub);
} catch (IOException e) {
// Handle exception
}
}
}
@Override
public void onDestroy() {
super.onDestroy();
// Release the gRPC channel.
/*if (mApi != null) {
final ManagedChannel channel = (ManagedChannel) mApi.getChannel();
if (channel != null && !channel.isShutdown()) {
try {
channel.shutdown().awaitTermination(5, TimeUnit.SECONDS);
} catch (InterruptedException e) {
Log.e(TAG, "Error shutting down the gRPC channel.", e);
}
}
mApi = null;
}*/
}
private String getDefaultLanguageCode() {
final Locale locale = Locale.getDefault();
final StringBuilder language = new StringBuilder(locale.getLanguage());
final String country = locale.getCountry();
if (!TextUtils.isEmpty(country)) {
language.append("-");
language.append(country);
}
Log.v(TAG,language+"");
return language.toString();
}
@Nullable
@Override
public IBinder onBind(Intent intent) {
return mBinder;
}
public void addListener(@NonNull Listener listener) {
mListeners.add(listener);
}
public void removeListener(@NonNull Listener listener) {
mListeners.remove(listener);
}
/**
* Starts recognizing speech audio.
*
* @param sampleRate The sample rate of the audio.
*/
public void startRecognizing(int sampleRate) {
if (speechClient == null) {
initSpeechClient();
}
clientStream = speechClient.streamingRecognizeCallable().splitCall(mResponseObserver);
StreamingRecognizeRequest streamRequest = StreamingRecognizeRequest.newBuilder()
.setStreamingConfig(StreamingRecognitionConfig.newBuilder()
.setConfig(RecognitionConfig.newBuilder()
.addLanguageCodes(getDefaultLanguageCode())
.setLanguageCodes(0, getDefaultLanguageCode())
.setExplicitDecodingConfig(ExplicitDecodingConfig.newBuilder()
.setEncoding(ExplicitDecodingConfig.AudioEncoding.LINEAR16)
.setSampleRateHertz(sampleRate))
.build())
.setStreamingFeatures(StreamingRecognitionFeatures
.newBuilder()
.setInterimResults(true)
.build())
.build())
.build();
clientStream.send(streamRequest);
}
/*public void startRecognizing(int sampleRate) {
if (mApi == null) {
Log.w(TAG, "API not ready. Ignoring the request.");
return;
}
// Configure the API
finalReceived = false;
mRequestObserver = mApi.streamingRecognize(mResponseObserver);
ArrayList<String> phrases = new ArrayList<>();
phrases.add("kind");
phrases.add("kin");
phrases.add("what is");
phrases.add("i love");
phrases.add("like");
SpeechContext context = SpeechContext.newBuilder().addAllPhrases(phrases).build();
ArrayList<SpeechContext> contexts = new ArrayList<>();
contexts.add(context);
RecognitionConfig config = RecognitionConfig.newBuilder()
// .setLanguageCode(getDefaultLanguageCode())
.setEncoding(RecognitionConfig.AudioEncoding.LINEAR16)
.setSampleRateHertz(sampleRate)
.addAllSpeechContexts(contexts)
.setLanguageCode(KaiProfile.getInstance().getLanguage())
.build();
mRequestObserver.onNext(StreamingRecognizeRequest.newBuilder()
.setStreamingConfig(StreamingRecognitionConfig.newBuilder()
.setConfig(config)
.setInterimResults(true)
.setSingleUtterance(false)
.build())
.build());
}*/
/**
* Recognizes the speech audio. This method should be called every time a chunk of byte buffer
* is ready.
*
* @param data The audio data.
* @param size The number of elements that are actually relevant in the {@code data}.
*/
public void recognize(byte[] data, int size) {
if (mRequestObserver == null) {
return;
}
clientStream.send(
StreamingRecognizeRequest.newBuilder()
.setAudio(ByteString.copyFrom(data, 0, size))
.build()
);
}
/*public void recognize(byte[] data, int size) {
if (mRequestObserver == null) {
return;
}
// Call the streaming recognition API
mRequestObserver.onNext(StreamingRecognizeRequest.newBuilder()
.setAudioContent(ByteString.copyFrom(data, 0, size))
.build());
}*/
/**
* Finishes recognizing speech audio.
*/
public void finishRecognizing() {
if (mRequestObserver == null) {
return;
}
mRequestObserver.onCompleted();
mRequestObserver = null;
}
private class SpeechBinder extends Binder {
SpeechService getService() {
return SpeechService.this;
}
}
private static final class ApiKeyInterceptor implements ClientInterceptor {
private final String signature;
private static Metadata.Key<String> API_KEY_HEADER =
Metadata.Key.of("x-goog-api-key", Metadata.ASCII_STRING_MARSHALLER);
private static Metadata.Key<String> API_APP_KEY =
Metadata.Key.of("X-Android-Package", Metadata.ASCII_STRING_MARSHALLER);
private static Metadata.Key<String> APP_CERT_KEY =
Metadata.Key.of("X-Android-Cert", Metadata.ASCII_STRING_MARSHALLER);
ApiKeyInterceptor(String signature) {
this.signature = signature;
}
@Override
public <ReqT, RespT> ClientCall<ReqT, RespT> interceptCall(
MethodDescriptor<ReqT, RespT> method, CallOptions callOptions, Channel next) {
ClientCall<ReqT, RespT> call = next.newCall(method, callOptions);
call = new ForwardingClientCall.SimpleForwardingClientCall<ReqT, RespT>(call) {
@Override
public void start(Listener<RespT> responseListener, Metadata headers) {
if (BuildConfig.API_KEY != null && !BuildConfig.API_KEY.isEmpty()) {
headers.put(API_KEY_HEADER, BuildConfig.API_KEY);
headers.put(API_APP_KEY, "com..");
headers.put(APP_CERT_KEY, signature);
Log.v(TAG,"signature " + signature);
}
super.start(responseListener, headers);
}
};
return call;
}
}
}
When I run the v2 version of the SpeechService and try to speak into the mic, I get the following error:
Error calling the API.
com.google.api.gax.rpc.InvalidArgumentException: io.grpc.StatusRuntimeException: INVALID_ARGUMENT: Invalid resource field value in the request.
    at com.google.api.gax.rpc.ApiExceptionFactory.createException(ApiExceptionFactory.java:92)
    at com.google.api.gax.grpc.GrpcApiExceptionFactory.create(GrpcApiExceptionFactory.java:98)
    at com.google.api.gax.grpc.GrpcApiExceptionFactory.create(GrpcApiExceptionFactory.java:66)
    at com.google.api.gax.grpc.ExceptionResponseObserver.onErrorImpl(ExceptionResponseObserver.java:82)
    at com.google.api.gax.rpc.StateCheckingResponseObserver.onError(StateCheckingResponseObserver.java:84)
    at com.google.api.gax.grpc.GrpcDirectStreamController$ResponseObserverAdapter.onClose(GrpcDirectStreamController.java:148)
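From the "Invalid resource field value" message, I suspect the problem is the recognizer resource field, which my initial config request never sets. Based on my reading of the v2 proto, I think the first request on the stream is supposed to carry a recognizer name roughly like the sketch below, but this is only my assumption and I have not gotten it to work. PROJECT_ID is a placeholder for my project, and I am not sure whether the "_" default recognizer is the right thing to reference here:
// My guess at the initial config request for v2 (untested sketch).
// "PROJECT_ID" is a placeholder; "_" would be the implicit/default recognizer.
StreamingRecognizeRequest configRequest = StreamingRecognizeRequest.newBuilder()
        .setRecognizer("projects/PROJECT_ID/locations/global/recognizers/_")
        .setStreamingConfig(StreamingRecognitionConfig.newBuilder()
                .setConfig(RecognitionConfig.newBuilder()
                        .addLanguageCodes(getDefaultLanguageCode())
                        .setExplicitDecodingConfig(ExplicitDecodingConfig.newBuilder()
                                .setEncoding(ExplicitDecodingConfig.AudioEncoding.LINEAR16)
                                .setSampleRateHertz(sampleRate)
                                .setAudioChannelCount(1)) // assuming mono audio
                        .build())
                .setStreamingFeatures(StreamingRecognitionFeatures.newBuilder()
                        .setInterimResults(true)
                        .build())
                .build())
        .build();
clientStream.send(configRequest);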
Any help would be appreciated.