diff --git a/README.md b/README.md index de4c73f..25dc628 100644 --- a/README.md +++ b/README.md @@ -11,10 +11,15 @@ the audio from a bridge using the Google Speech APIs. * A functional Asterisk 16.6.0+ installation. * A conference bridge or phone configured. * Node.JS version 10 or greater. -* Google Speech API credentials set in environment variable GOOGLE_APPLICATION_CREDENTIALS. -See https://cloud.google.com/speech-to-text/docs/ for more information. - -Run `npm install` from the top of the source tree. +* Google Speech API credentials set in environment variable `GOOGLE_APPLICATION_CREDENTIALS`. + * See https://cloud.google.com/speech-to-text/docs/ for more information. +* For Amazon Transcribe: AWS credentials and region set to environment variables + * See https://docs.aws.amazon.com/sdkref/latest/guide/environment-variables.html + - `AWS_ACCESS_KEY_ID` + - `AWS_SECRET_ACCESS_KEY` + - `AWS_REGION` + +Run `npm install` and `npm audit fix` from the top of the source tree. This will install the required npm packages including `node-ari-client` and `@google-cloud/speech`. You can then run the transcriber as `bin/ari-transcriber`. If you add the `-g` option to `npm install` to install system wide, you can just run `ari-transcriber`. @@ -36,7 +41,8 @@ Incoming Audio Server Speech --speechModel Google Speech API model [string] [choices: "phone_call", "video", "default"] [default: "default"] - --speechLang BCP-47 Language code. en-US, fr-CA, etc. [string] [choices: "en-US", "fr-CA"] [default: "en-US"] + --speechProvider Speech engine provider [string] [choices: "google", "aws"] [default: "google"] + --speechLang BCP-47 Language code. en-US, fr-CA, etc. [string] [choices: Language supported by Google/AWS] [default: "en-US"] --speakerDiarization Outputs words associated to speaker index to the console. [boolean] [default: false] ARI @@ -85,7 +91,21 @@ wouldn't need the Local channel and mixing bridge. You don't need the WebSocket transcription server to try this. 
Just a phone to call. -``` +```sh $ export GOOGLE_APPLICATION_CREDENTIALS= $ ari-transcriber --format=slin16 'Local/1234' -```` +``` + +Amazon Transcribe mode: + +```sh +$ cat > env.sh +export AWS_ACCESS_KEY_ID= +export AWS_SECRET_ACCESS_KEY= +export AWS_REGION= +(Type Ctrl+D) + +$ source env.sh +$ ari-transcriber --format=slin16 --speechProvider=aws 'Local/1234' +``` + diff --git a/bin/ari-transcriber b/bin/ari-transcriber index 9e51657..2fdfd9a 100755 --- a/bin/ari-transcriber +++ b/bin/ari-transcriber @@ -69,13 +69,21 @@ require(`yargs`) }, speechLang: { default: 'en-US', - choices: ['en-US', 'fr-CA'], global: true, requiresArg: true, description: "BCP-47 Language code. en-US, fr-CA, etc.", group: "Speech", type: 'string', }, + speechProvider: { + default: 'google', + choices: ['google', 'aws', ], + global: true, + requiresArg: true, + description: "Speech Provider", + group: "Speech", + type: 'string', + }, speakerDiarization: { default: false, global: true, diff --git a/lib/amazon-transcribe-provider.js b/lib/amazon-transcribe-provider.js new file mode 100644 index 0000000..6365873 --- /dev/null +++ b/lib/amazon-transcribe-provider.js @@ -0,0 +1,124 @@ +/* + * Amazon Transcribe speech engine provider + * for Asterisk External Media Sample + */ + +const chalk = require('chalk'); +// Note: "node:stream" does not work with Node.js 10.0 to 14.17.6 +const { Transform, PassThrough } = require('stream'); +const { + TranscribeStreamingClient, + StartStreamTranscriptionCommand, +} = require('@aws-sdk/client-transcribe-streaming'); + +/** + * Amazon Transcribe Provider + * + * @class + */ +class AmazonTranscribeProvider { + + /** + * @constructor + * @param {Object} config Configuration for AWS API TranscribeStreamingClient() + * @param {RtpUdpServerSocket.server(extended from node:dgram.Stream)} socket Raw-RTP audio data socket + * @param {transcriptCallback} transcriptCallback + * @param {resultsCallback} resultsCallback + */ + constructor(config, socket,
transcriptCallback, resultsCallback) { + + this.config = config; + this.socket = socket; + this.transcriptCallback = transcriptCallback; + this.resultsCallback = resultsCallback; + + this.audioInputPayloadStream = new PassThrough({ highWaterMark: 1 * 1024 }); // Stream chunk less than 1 KB + + this.audioInputStreamTransform = new Transform({ + readableHighWaterMark: 1 * 1024, + transform: (chunk, encoding, callback) => { + this.transformer(chunk, encoding, callback); + }, + }); + + this.socket.pipe(this.audioInputStreamTransform); + + // Initialize Amazon Transcribe + this.transcribeClient = new TranscribeStreamingClient(this.config); + + this.startTranscribe(); + } + + /* + * Transform to another stream. + * + * @param {Buffer} chunk Audio chunk from Asterisk + * @param {String} encoding Not used here + * @param {Function} done_callback Called at the end of this function. + */ + transformer(chunk, encoding, done_callback) { + this.audioInputPayloadStream.write(chunk); + + done_callback(); + } + + /** + * Asterisk audio data generator + */ + async* audioGenerator() { + try { + for await (const chunk of this.audioInputPayloadStream) { + yield { AudioEvent: { AudioChunk: chunk } }; + } + + } catch (error) { + console.error("Exception at Audio stream: ", error); + throw error; + } + } + + /** + * Start Amazon Transcribe process + */ + async startTranscribe() { + const command = new StartStreamTranscriptionCommand({ + AudioStream: this.audioGenerator(), + LanguageCode: this.config.LanguageCode, + LanguageModelName: this.config.LanguageModelName, + MediaEncoding: this.config.MediaEncoding, + MediaSampleRateHertz: this.config.MediaSampleRateHertz, + ShowSpeakerLabel: this.config.ShowSpeakerLabel, + }); + + const awsResponse = await this.transcribeClient.send(command); + + for await (const event of awsResponse.TranscriptResultStream) { + const results = event.TranscriptEvent.Transcript.Results; + + if (this.resultsCallback) { + this.resultsCallback(results); + } + + if (0 <
results.length && 0 < results[0].Alternatives.length) { + const isFinal = !results[0].IsPartial; + let stdoutText = results[0].Alternatives[0].Transcript; + + process.stdout.clearLine(); + process.stdout.cursorTo(0); + + if (isFinal) { + process.stdout.write(chalk.green(`${stdoutText}\n`)); + } else { + // Make sure transcript does not exceed console character length + if (stdoutText.length > process.stdout.columns) { + stdoutText = stdoutText.substring(0, process.stdout.columns - 4) + '...'; + } + process.stdout.write(chalk.yellow(`${stdoutText}`)); + } + } + } + +} + +module.exports.AmazonTranscribeProvider = AmazonTranscribeProvider; diff --git a/lib/ari-transcriber.js b/lib/ari-transcriber.js index 6e0bd25..fe00c8c 100644 --- a/lib/ari-transcriber.js +++ b/lib/ari-transcriber.js @@ -17,6 +17,8 @@ const rtp = require('./rtp-udp-server'); const provider = require('./google-speech-provider'); +const awsprovider = require('./amazon-transcribe-provider'); +const { LanguageCode } = require('@aws-sdk/client-transcribe-streaming'); const ari = require('./ari-controller'); const WebSocket = require('ws'); const fs = require('fs'); @@ -78,12 +80,60 @@ class AriTranscriber { } } + + /** + * Output Amazon Transcribe results text with speaker information + * (Only works with awsConfig.ShowSpeakerLabel: true) + * + * @callback resultsCallback + * @param {ResultList} results Response data from Amazon Transcribe + */ + awsResultsCallback(results) { + if (results[0] === undefined) { + console.log("Error: No response"); + return; + } + + if (0 < results.length && 0 < results[0].Alternatives.length) { + const transcript = results[0].Alternatives[0].Transcript; + const isFinal = !results[0].IsPartial; + const speakers = { "unknown_speaker": "" }; + + if (!isFinal) { + return; + } + + // Combine words by speaker ID + for (const item of results[0].Alternatives[0].Items) { + if (undefined === item.Speaker) { + speakers.unknown_speaker += item.Content; + continue; + } + if
(undefined === speakers[item.Speaker]) { + speakers[item.Speaker] = ""; + } + speakers[item.Speaker] += item.Content; + } + + console.log(""); // Output newline + for (const key in speakers) { + // Output transcribe with speaker ID + console.log(` word: ${speakers[key]}, speakerTag: ${key}`); + } + } + } + // The main wrapper async transcriber() { let speechEncoding; let speechRate; let swap16 = false; + // https://docs.aws.amazon.com/transcribe/latest/dg/how-input.html + if (this.opts.format != "slin16" && this.opts.speechProvider == "aws") { + this.opts.format = "slin16"; + console.warn('WARNING: Format replaced forcefully to "slin16". For Amazon Transcribe.'); + } + switch(this.opts.format) { case "ulaw": speechEncoding = "MULAW"; @@ -150,18 +200,66 @@ class AriTranscriber { config.enableSpeakerDiarization = true; config.diarizationSpeakerCount = 5; } + + const awsConfig = { + region: process.env.AWS_REGION, + credentials: { + accessKeyId: process.env.AWS_ACCESS_KEY_ID, + secretAccessKey: process.env.AWS_SECRET_ACCESS_KEY, + }, + LanguageCode: this.opts.speechLang, + LanguageModelName: undefined, + MediaEncoding: "pcm", + MediaSampleRateHertz: speechRate, + ShowSpeakerLabel: false, + }; + if (this.opts.speakerDiarization) { + awsConfig.ShowSpeakerLabel = true; + } + if (this.opts.speechModel != "default") { + awsConfig.LanguageModelName = this.opts.speechModel; + } - // Start the speech provider passing in the audio server socket. 
- this.speechProvider = new provider.GoogleSpeechProvider(config, this.audioServer, + switch(this.opts.speechProvider) { + case "aws": // Amazon Transcribe + if (!Object.values(LanguageCode).find((item) => item === awsConfig.LanguageCode)) { + console.error("Invalid Value: This language is NOT supported by AWS-SDK: " + awsConfig.LanguageCode); + process.exit(1); + } + + this.speechProvider = new awsprovider.AmazonTranscribeProvider(awsConfig, this.audioServer, (text, isFinal) => { - this.transcriptCallback(text, isFinal); + this.transcriptCallback(text, isFinal); }, (results) => { if (this.opts.speakerDiarization) { - this.resultsCallback(results); + this.awsResultsCallback(results); } + } + ); + break; + + case "google": // FALLTHROUGH + default: // Google Speech-to-Text + + if ( !provider.GoogleSupportedLanguages.find((item) => item === config.languageCode) ) { + console.error("Invalid Value: This language is NOT supported by Google Speech-to-Text: " + config.languageCode); + process.exit(1); + } + + // Start the speech provider passing in the audio server socket. + this.speechProvider = new provider.GoogleSpeechProvider(config, this.audioServer, + (text, isFinal) => { + this.transcriptCallback(text, isFinal); }, - ); + (results) => { + if (this.opts.speakerDiarization) { + this.resultsCallback(results); + } + } + ); + break; + } // Kick the whole process off by creating the channels and bridges. 
console.log("Creating Bridge and Channels"); diff --git a/lib/google-speech-provider.js b/lib/google-speech-provider.js index 1522d25..8285593 100644 --- a/lib/google-speech-provider.js +++ b/lib/google-speech-provider.js @@ -221,4 +221,38 @@ class GoogleSpeechProvider { } } +// https://cloud.google.com/speech-to-text/docs/speech-to-text-supported-languages +module.exports.GoogleSupportedLanguages = [ + "af-ZA", "sq-AL", "am-ET", "ar-DZ", "ar-BH", + "ar-EG", "ar-IQ", "ar-IL", "ar-JO", "ar-KW", + "ar-LB", "ar-MR", "ar-MA", "ar-OM", "ar-QA", + "ar-SA", "ar-PS", "ar-SY", "ar-TN", "ar-AE", + "ar-YE", "hy-AM", "az-AZ", "eu-ES", "bn-BD", + "bn-IN", "bs-BA", "bg-BG", "my-MM", "ca-ES", + "cmn-Hans-CN", "cmn-Hans-HK", "cmn-Hant-TW", "yue-Hant-HK", "hr-HR", + "cs-CZ", "da-DK", "nl-BE", "nl-NL", "en-AU", + "en-CA", "en-GH", "en-HK", "en-IN", "en-IE", + "en-KE", "en-NZ", "en-NG", "en-PK", "en-PH", + "en-SG", "en-ZA", "en-TZ", "en-GB", "en-US", + "et-EE", "fil-PH", "fi-FI", "fr-BE", "fr-CA", + "fr-FR", "fr-CH", "gl-ES", "ka-GE", "de-AT", + "de-DE", "de-CH", "el-GR", "gu-IN", "iw-IL", + "hi-IN", "hu-HU", "is-IS", "id-ID", "it-IT", + "it-CH", "ja-JP", "jv-ID", "kn-IN", "kk-KZ", + "km-KH", "rw-RW", "ko-KR", "lo-LA", + "lv-LV", "lt-LT", "mk-MK", "ms-MY", "ml-IN", + "mr-IN", "mn-MN", "ne-NP", "no-NO", "fa-IR", + "pl-PL", "pt-BR", "pt-PT", "pa-Guru-IN", "ro-RO", + "ru-RU", "sr-RS", "si-LK", "sk-SK", "sl-SI", + "st-ZA", "es-AR", "es-BO", "es-CL", "es-CO", + "es-CR", "es-DO", "es-EC", "es-SV", "es-GT", + "es-HN", "es-MX", "es-NI", "es-PA", "es-PY", + "es-PE", "es-PR", "es-ES", "es-US", "es-UY", + "es-VE", "su-ID", "sw-KE", "sw-TZ", "ss-Latn-ZA", + "sv-SE", "ta-IN", "ta-MY", "ta-SG", "ta-LK", + "te-IN", "th-TH", "ts-ZA", "tn-Latn-ZA", "tr-TR", + "uk-UA", "ur-IN", "uz-UZ", "ve-ZA", "vi-VN", + "xh-ZA", "zu-ZA" +]; + module.exports.GoogleSpeechProvider = GoogleSpeechProvider; diff --git a/package.json b/package.json index 97d2b03..53abfc5 100644 --- a/package.json +++
b/package.json @@ -27,6 +27,7 @@ "node": ">=10" }, "dependencies": { + "@aws-sdk/client-transcribe-streaming": "^3.812.0", "@google-cloud/speech": "^3.3.1", "ari-client": "^2.2.0", "chalk": "^2.4.2",