Live transcription with Google Speech-to-Text


I want to make a live transcription app with Node.js and the Google Speech-to-Text API.

I am using RecordRTC and socket.io to send audio chunks to the backend server. At the moment I am recording 1-second chunks, and the transcription works, but the API does not treat them as a stream: it sends back a response after processing each chunk. This means I am getting back half sentences and Google can't use the surrounding context to help it recognize the speech.

My question is how to make Google treat my chunks as a continuous stream. Or is there another solution that achieves the same result (transcribing the mic audio live, or very close to live)?

Google has a demo on their website which does exactly what I am looking for, so it should be possible.

My code (mainly from the selfservicekiosk-audio-streaming repo):

ss is socket.io-stream

Server side

io.on("connect", (socket) => {
        socket.on("create-room", (data, cb) => createRoom(socket, data, cb))
        socket.on("disconnecting", () => exitFromRoom(socket))

        // getting the stream, it gets called every 1s with a blob
        ss(socket).on("stream-speech", async function (stream: any, data: any) {

            const filename = path.basename("stream.wav")
            const writeStream = fs.createWriteStream(filename)
           
            stream.pipe(writeStream)
            speech.speechStreamToText(
                stream,
                async function (transcribeObj: any) {
                    socket.emit("transcript", transcribeObj.transcript)
                }
            )
        })
})

async speechStreamToText(stream: any, cb: Function) {
        sttRequest = {
            config: {
                languageCode: "en-US",
                sampleRateHertz: 16000,
                encoding: "WEBM_OPUS",
                enableAutomaticPunctuation: true,
            },
            singleUtterance: false,
        }

        const stt = new speechToText.SpeechClient()
        //setup the stt stream
        const recognizeStream = stt
            .streamingRecognize(sttRequest)
            .on("data", function (data: any) {
                //this gets called every second and I get transcription chunks which usually make little sense
                console.log(data.results[0].alternatives)
            })
            .on("error", (e: any) => {
                console.log(e)
            })
            .on("end", () => {
                //this gets called every second. 
                console.log("on end")
            })

        stream.pipe(recognizeStream)
        stream.on("end", function () {
            console.log("socket.io stream ended")
        })
    }

Client side

const sendBinaryStream = (blob: Blob) => {
    const stream = ss.createStream()
    ss(socket).emit("stream-speech", stream, {
        name: "_temp/stream.wav",
        size: blob.size,
    })
    ss.createBlobReadStream(blob).pipe(stream)
}

useEffect(() => {
        let recorder: any
        if (activeChat) {
            navigator.mediaDevices.getUserMedia({ audio: true, video: false }).then((stream) => {
                streamRef.current = stream
                recorder = new RecordRTC(stream, {
                    type: "audio",
                    mimeType: "audio/webm",
                    sampleRate: 44100,
                    desiredSampleRate: 16000,
                    timeSlice: 1000,
                    numberOfAudioChannels: 1,
                    recorderType: StereoAudioRecorder,
                    ondataavailable(blob: Blob) {
                        sendBinaryStream(blob)
                    },
                })
                recorder.startRecording()
            })
        }
        return () => {
            recorder?.stopRecording()
            streamRef.current?.getTracks().forEach((track) => track.stop())
        }
    }, [])

Any help is appreciated!


There are 2 answers

Seva

I have exactly the same question!

Maybe Google's official demo is using node-record-lpcm16 with SoX: https://cloud.google.com/speech-to-text/docs/streaming-recognize?hl=en
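For reference, this is roughly the shape of that linked docs sample: a single long-lived streamingRecognize call that microphone audio is piped into, rather than one request per chunk. This is only a sketch, assuming SoX is installed on the machine, GOOGLE_APPLICATION_CREDENTIALS is set, and the node-record-lpcm16 package is used as in the docs (note it records from the server's own microphone, so it is not a drop-in for a browser client):

const recorder = require("node-record-lpcm16"); // needs SoX installed locally
const speech = require("@google-cloud/speech");

const client = new speech.SpeechClient();

const request = {
  config: {
    encoding: "LINEAR16",
    sampleRateHertz: 16000,
    languageCode: "en-US",
  },
  interimResults: true, // get partial results while the user is still speaking
};

// One long-lived streaming request: Google keeps context across audio chunks.
const recognizeStream = client
  .streamingRecognize(request)
  .on("error", console.error)
  .on("data", (data) => {
    const result = data.results[0];
    if (result && result.alternatives[0]) {
      console.log(`Transcript: ${result.alternatives[0].transcript}`);
    }
  });

// Pipe raw 16-bit PCM from the microphone (via SoX) into the recognize stream.
recorder
  .record({
    sampleRateHertz: 16000,
    threshold: 0,
    recordProgram: "rec", // try also "arecord" or "sox"
    silence: "10.0",
  })
  .stream()
  .on("error", console.error)
  .pipe(recognizeStream);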

Hamza Kyamanywa

I recommend using the Web Audio API to process the stream and send it to the backend over a WebSocket. Here is a working demo that uses the navigator.mediaDevices.getUserMedia() API to generate an audio stream.

The Web Audio API is a high-level JavaScript API for processing and synthesizing audio in web applications. We can create a RecorderProcessor class that extends AudioWorkletProcessor to record and transcribe audio in real time.

  1. Create the recorderWorkletProcessor.js processor file in the React public folder at public/worklets/recorderWorkletProcessor.js
/**
  An in-place replacement for ScriptProcessorNode using AudioWorklet
*/
class RecorderProcessor extends AudioWorkletProcessor {
  // 0. Determine the buffer size (this is the same as the 1st argument of ScriptProcessor)
  bufferSize = 2048;
  // 1. Track the current buffer fill level
  _bytesWritten = 0;

  // 2. Create a buffer of fixed size
  _buffer = new Float32Array(this.bufferSize);

  constructor() {
    super();
    this.initBuffer();
  }

  initBuffer() {
    this._bytesWritten = 0;
  }

  isBufferEmpty() {
    return this._bytesWritten === 0;
  }

  isBufferFull() {
    return this._bytesWritten === this.bufferSize;
  }

  /**
   * @param {Float32Array[][]} inputs
   * @returns {boolean}
   */
  process(inputs) {
    // Grabbing the 1st channel similar to ScriptProcessorNode
    this.append(inputs[0][0]);

    return true;
  }

  /**
   *
   * @param {Float32Array} channelData
   */
  append(channelData) {
    if (this.isBufferFull()) {
      this.flush();
    }

    if (!channelData) return;

    for (let i = 0; i < channelData.length; i++) {
      this._buffer[this._bytesWritten++] = channelData[i];
    }
  }

  flush() {
    // trim the buffer if ended prematurely
    const buffer =
      this._bytesWritten < this.bufferSize
        ? this._buffer.slice(0, this._bytesWritten)
        : this._buffer;
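    // NOTE: 44100 assumes the AudioContext runs at 44.1 kHz; the worklet-global
    // `sampleRate` could be used instead if the actual context rate differs.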
    const result = this.downsampleBuffer(buffer, 44100, 16000);
    this.port.postMessage(result);
    this.initBuffer();
  }

  downsampleBuffer(buffer, sampleRate, outSampleRate) {
    if (outSampleRate == sampleRate) {
      return buffer;
    }
    if (outSampleRate > sampleRate) {
      throw new Error(
        "downsampling rate show be smaller than original sample rate"
      );
    }
    var sampleRateRatio = sampleRate / outSampleRate;
    var newLength = Math.round(buffer.length / sampleRateRatio);
    var result = new Int16Array(newLength);
    var offsetResult = 0;
    var offsetBuffer = 0;
    while (offsetResult < result.length) {
      var nextOffsetBuffer = Math.round((offsetResult + 1) * sampleRateRatio);
      var accum = 0,
        count = 0;
      for (
        var i = offsetBuffer;
        i < nextOffsetBuffer && i < buffer.length;
        i++
      ) {
        accum += buffer[i];
        count++;
      }

      result[offsetResult] = Math.min(1, accum / count) * 0x7fff;
      offsetResult++;
      offsetBuffer = nextOffsetBuffer;
    }
    return result.buffer;
  }
}

registerProcessor("recorder.worklet", RecorderProcessor);

  2. React client side
/* eslint-disable react-hooks/exhaustive-deps */
import { default as React, useEffect, useState, useRef } from "react";
import { Button } from "react-bootstrap";
import Container from "react-bootstrap/Container";
import * as io from "socket.io-client";

const sampleRate = 16000;

const getMediaStream = () =>
  navigator.mediaDevices.getUserMedia({
    audio: {
      deviceId: "default",
      sampleRate: sampleRate,
      sampleSize: 16,
      channelCount: 1,
    },
    video: false,
  });

interface WordRecognized {
  final: boolean;
  text: string;
}

const AudioToText: React.FC = () => {
  const [connection, setConnection] = useState<io.Socket>();
  const [currentRecognition, setCurrentRecognition] = useState<string>();
  const [recognitionHistory, setRecognitionHistory] = useState<string[]>([]);
  const [isRecording, setIsRecording] = useState<boolean>(false);
  const [recorder, setRecorder] = useState<any>();
  const processorRef = useRef<any>();
  const audioContextRef = useRef<any>();
  const audioInputRef = useRef<any>();

  const speechRecognized = (data: WordRecognized) => {
    if (data.final) {
      setCurrentRecognition("...");
      setRecognitionHistory((old) => [data.text, ...old]);
    } else setCurrentRecognition(data.text + "...");
  };

  const connect = () => {
    connection?.disconnect();
    const socket = io.connect("http://localhost:8081");
    socket.on("connect", () => {
      console.log("connected", socket.id);
      setConnection(socket);
    });

    socket.emit("send_message", "hello world");

    socket.emit("startGoogleCloudStream");

    socket.on("receive_message", (data) => {
      console.log("received message", data);
    });

    socket.on("receive_audio_text", (data) => {
      speechRecognized(data);
      console.log("received audio text", data);
    });

    socket.on("disconnect", () => {
      console.log("disconnected", socket.id);
    });
  };

  const disconnect = () => {
    if (!connection) return;
    connection?.emit("endGoogleCloudStream");
    connection?.disconnect();
    processorRef.current?.disconnect();
    audioInputRef.current?.disconnect();
    audioContextRef.current?.close();
    setConnection(undefined);
    setRecorder(undefined);
    setIsRecording(false);
  };

  useEffect(() => {
    (async () => {
      if (connection) {
        if (isRecording) {
          return;
        }

        const stream = await getMediaStream();

        audioContextRef.current = new window.AudioContext();

        await audioContextRef.current.audioWorklet.addModule(
          "/src/worklets/recorderWorkletProcessor.js"
        );

        audioContextRef.current.resume();

        audioInputRef.current =
          audioContextRef.current.createMediaStreamSource(stream);

        processorRef.current = new AudioWorkletNode(
          audioContextRef.current,
          "recorder.worklet"
        );

        processorRef.current.connect(audioContextRef.current.destination);
        audioContextRef.current.resume();

        audioInputRef.current.connect(processorRef.current);

        processorRef.current.port.onmessage = (event: any) => {
          const audioData = event.data;
          connection.emit("send_audio_data", { audio: audioData });
        };
        setIsRecording(true);
      } else {
        console.error("No connection");
      }
    })();
    return () => {
      if (isRecording) {
        processorRef.current?.disconnect();
        audioInputRef.current?.disconnect();
        if (audioContextRef.current?.state !== "closed") {
          audioContextRef.current?.close();
        }
      }
    };
  }, [connection, isRecording, recorder]);

  return (
    <React.Fragment>
      <Container className="py-5 text-center">
        <Container fluid className="py-5 bg-primary text-light text-center ">
          <Container>
            <Button
              className={isRecording ? "btn-danger" : "btn-outline-light"}
              onClick={connect}
              disabled={isRecording}
            >
              Start
            </Button>
            <Button
              className="btn-outline-light"
              onClick={disconnect}
              disabled={!isRecording}
            >
              Stop
            </Button>
          </Container>
        </Container>
        <Container className="py-5 text-center">
          {recognitionHistory.map((tx, idx) => (
            <p key={idx}>{tx}</p>
          ))}
          <p>{currentRecognition}</p>
        </Container>
      </Container>
    </React.Fragment>
  );
};

export default AudioToText;

  3. Server side (Node.js)
const express = require("express");
const speech = require("@google-cloud/speech");

//use logger
const logger = require("morgan");

//use body parser
const bodyParser = require("body-parser");

//use cors
const cors = require("cors");

const http = require("http");
const { Server } = require("socket.io");

const app = express();

app.use(cors());
app.use(logger("dev"));

app.use(bodyParser.json());

const server = http.createServer(app);

const io = new Server(server, {
  cors: {
    origin: "http://localhost:3000",
    methods: ["GET", "POST"],
  },
});

//TODO: run in terminal first to setup credentials export GOOGLE_APPLICATION_CREDENTIALS="./speech-to-text-key.json"

const speechClient = new speech.SpeechClient();

io.on("connection", (socket) => {
  let recognizeStream = null;
  console.log("** a user connected - " + socket.id + " **\n");

  socket.on("disconnect", () => {
    console.log("** user disconnected ** \n");
  });

  socket.on("send_message", (message) => {
    console.log("message: " + message);
    setTimeout(() => {
      io.emit("receive_message", "got this message" + message);
    }, 1000);
  });

  socket.on("startGoogleCloudStream", function (data) {
    startRecognitionStream(this, data);
  });

  socket.on("endGoogleCloudStream", function () {
    console.log("** ending google cloud stream **\n");
    stopRecognitionStream();
  });

  socket.on("send_audio_data", async (audioData) => {
    io.emit("receive_message", "Got audio data");
    if (recognizeStream !== null) {
      try {
        recognizeStream.write(audioData.audio);
      } catch (err) {
        console.log("Error calling google api " + err);
      }
    } else {
      console.log("RecognizeStream is null");
    }
  });

  function startRecognitionStream(client) {
    console.log("* StartRecognitionStream\n");
    try {
      recognizeStream = speechClient
        .streamingRecognize(request)
        .on("error", console.error)
        .on("data", (data) => {
          const result = data.results[0];
          const isFinal = result.isFinal;

          const transcription = data.results
            .map((result) => result.alternatives[0].transcript)
            .join("\n");

          console.log(`Transcription: `, transcription);

          client.emit("receive_audio_text", {
            text: transcription,
            final: isFinal,
          });
        });
    } catch (err) {
      console.error("Error streaming google api " + err);
    }
  }

  function stopRecognitionStream() {
    if (recognizeStream) {
      console.log("* StopRecognitionStream \n");
      recognizeStream.end();
    }
    recognizeStream = null;
  }
});

server.listen(8081, () => {
  console.log("WebSocket server listening on port 8081.");
});

// =========================== GOOGLE CLOUD SETTINGS ================================ //

// The encoding of the audio file, e.g. 'LINEAR16'
// The sample rate of the audio file in hertz, e.g. 16000
// The BCP-47 language code to use, e.g. 'en-US'
const encoding = "LINEAR16";
const sampleRateHertz = 16000;
const languageCode = "en-US"; //en-US
const alternativeLanguageCodes = ["en-US", "ko-KR"];

const request = {
  config: {
    encoding: encoding,
    sampleRateHertz: sampleRateHertz,
    languageCode: languageCode,
    //alternativeLanguageCodes: alternativeLanguageCodes,
    enableWordTimeOffsets: true,
    enableAutomaticPunctuation: true,
    enableWordConfidence: true,
    enableSpeakerDiarization: true,
    diarizationSpeakerCount: 2,
    model: "video",
    //model: "command_and_search",
    useEnhanced: true,
    speechContexts: [
      {
        phrases: ["hello", "안녕하세요"],
      },
    ],
  },
  interimResults: true,
};