Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 27 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,15 @@ the audio from a bridge using the Google Speech APIs.
* A functional Asterisk 16.6.0+ installation.
* A conference bridge or phone configured.
* Node.JS version 10 or greater.
* Google Speech API credentials set in environment variable GOOGLE_APPLICATION_CREDENTIALS.
See https://cloud.google.com/speech-to-text/docs/ for more information.

Run `npm install` from the top of the source tree.
* Google Speech API credentials set in environment variable `GOOGLE_APPLICATION_CREDENTIALS`.
* See https://cloud.google.com/speech-to-text/docs/ for more information.
* For Amazon Transcribe: AWS credentials and region set in the following environment variables.
* See https://docs.aws.amazon.com/sdkref/latest/guide/environment-variables.html
- `AWS_ACCESS_KEY_ID`
- `AWS_SECRET_ACCESS_KEY`
- `AWS_REGION`

Run `npm install` and `npm audit fix` from the top of the source tree.
This will install the required npm packages including `node-ari-client` and `@google-cloud/speech`.
You can then run the transcriber as `bin/ari-transcriber`. If you add the `-g`
option to `npm install` to install system wide, you can just run `ari-transcriber`.
Expand All @@ -36,7 +41,8 @@ Incoming Audio Server

Speech
--speechModel Google Speech API model [string] [choices: "phone_call", "video", "default"] [default: "default"]
--speechLang BCP-47 Language code. en-US, fr-CA, etc. [string] [choices: "en-US", "fr-CA"] [default: "en-US"]
--speechProvider Speech engine provider [string] [choices: "google", "aws"] [default: "google"]
--speechLang BCP-47 Language code. en-US, fr-CA, etc. [string] [choices: Language supported by Google/AWS] [default: "en-US"]
--speakerDiarization Outputs words associated to speaker index to the console. [boolean] [default: false]

ARI
Expand Down Expand Up @@ -85,7 +91,21 @@ wouldn't need the Local channel and mixing bridge.
You don't need the WebSocket transcription server to try this.
Just a phone to call.

```
```sh
$ export GOOGLE_APPLICATION_CREDENTIALS=<path to Google API credentials>
$ ari-transcriber --format=slin16 'Local/1234'
````
```

Amazon Transcribe mode:

```sh
$ cat > env.sh
export AWS_ACCESS_KEY_ID=<AWS IAM access key>
export AWS_SECRET_ACCESS_KEY=<AWS IAM secret key>
export AWS_REGION=<AWS region>
(Type Ctrl+D)

$ source env.sh
$ ari-transcriber --format=slin16 --speechProvider=aws 'Local/1234'
```

10 changes: 9 additions & 1 deletion bin/ari-transcriber
Original file line number Diff line number Diff line change
Expand Up @@ -69,13 +69,21 @@ require(`yargs`)
},
speechLang: {
default: 'en-US',
choices: ['en-US', 'fr-CA'],
global: true,
requiresArg: true,
description: "BCP-47 Language code. en-US, fr-CA, etc.",
group: "Speech",
type: 'string',
},
speechProvider: {
default: 'google',
choices: ['google', 'aws', ],
global: true,
requiresArg: true,
description: "Speech Provider",
group: "Speech",
type: 'string',
},
speakerDiarization: {
default: false,
global: true,
Expand Down
124 changes: 124 additions & 0 deletions lib/amazon-transcribe-provider.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
/*
* Amazon Transcribe speech engine provider
* for Asterisk External Media Sample
*/

const chalk = require('chalk');
// Note: the "node:stream" specifier is not supported by require() on Node.js 10.0 through 14.17.6, so plain "stream" is used.
const { Transform, PassThrough } = require('stream');
const {
TranscribeStreamingClient,
StartStreamTranscriptionCommand,
} = require('@aws-sdk/client-transcribe-streaming');

/**
 * Amazon Transcribe Provider
 *
 * Pipes the audio received on the supplied socket into an Amazon Transcribe
 * streaming session and reports the transcripts that come back, both on the
 * console and through the optional callbacks.
 *
 * @class
 */
class AmazonTranscribeProvider {

    /**
     * @constructor
     * @param {Object} config Configuration for AWS API TranscribeStreamingClient().
     *        The LanguageCode, LanguageModelName, MediaEncoding,
     *        MediaSampleRateHertz and ShowSpeakerLabel properties are also
     *        forwarded to StartStreamTranscriptionCommand().
     * @param {RtpUdpServerSocket.server(extended from node:dgram.Stream)} socket Raw-RTP audio data socket
     * @param {transcriptCallback} transcriptCallback Optional; called as (text, isFinal) for every transcript
     * @param {resultsCallback} resultsCallback Optional; called with the raw result list of every transcript event
     */
    constructor(config, socket, transcriptCallback, resultsCallback) {

        this.config = config;
        this.socket = socket;
        this.transcriptCallback = transcriptCallback;
        this.resultsCallback = resultsCallback;

        this.audioInputPayloadStream = new PassThrough({ highWaterMark: 1 * 1024 }); // Stream chunk less than 1 KB

        this.audioInputStreamTransform = new Transform({
            readableHighWaterMark: 1 * 1024,
            transform: (chunk, encoding, callback) => {
                this.transformer(chunk, encoding, callback);
            },
        });

        this.socket.pipe(this.audioInputStreamTransform);

        // Initialize Amazon Transcribe
        this.transcribeClient = new TranscribeStreamingClient(this.config);

        // startTranscribe() is async and a constructor cannot await it, so the
        // rejection is handled here instead of being left as a floating promise.
        this.startTranscribe().catch((error) => {
            console.error("Amazon Transcribe streaming failed: ", error);
        });
    }

    /**
     * Forward an audio chunk to the payload stream read by audioGenerator().
     *
     * @param {Buffer} chunk Audio chunk from Asterisk
     * @param {String} encoding Not used here
     * @param {Function} done_callback Must be called once the chunk has been handed over.
     */
    transformer(chunk, encoding, done_callback) {
        this.audioInputPayloadStream.write(chunk);

        done_callback();
    }

    /**
     * Asterisk audio data generator
     *
     * Wraps every chunk of the payload stream in the AudioEvent envelope
     * and yields it. Errors of the underlying stream are logged and rethrown
     * unchanged.
     */
    async* audioGenerator() {
        try {
            for await (const chunk of this.audioInputPayloadStream) {
                yield { AudioEvent: { AudioChunk: chunk } };
            }

        } catch (error) {
            console.error("Exception at Audio stream: ", error);
            // Rethrow the caught error itself (the former code threw an undefined name).
            throw error;
        }
    }

    /**
     * Write one transcript to the console.
     *
     * Final transcripts are written in green followed by a newline. Partial
     * transcripts are written in yellow on the current line and are only shown
     * on a TTY, because they rely on the line being cleared and rewritten.
     *
     * @param {String} text Transcript text
     * @param {Boolean} isFinal True when the transcript is not a partial result
     */
    writeTranscript(text, isFinal) {
        const out = process.stdout;

        // clearLine()/cursorTo() only exist when stdout is a TTY.
        if (out.isTTY) {
            out.clearLine(0);
            out.cursorTo(0);
        }

        if (isFinal) {
            out.write(chalk.green(`${text}\n`));
            return;
        }

        if (!out.isTTY) {
            return;
        }

        let stdoutText = text;
        // Make sure transcript does not exceed console character length
        if (stdoutText.length > out.columns) {
            stdoutText = stdoutText.substring(0, out.columns - 4) + '...';
        }
        out.write(chalk.yellow(`${stdoutText}`));
    }

    /**
     * Start Amazon Transcribe process
     *
     * Sends the streaming command and consumes the transcript result stream
     * until it ends. The returned promise rejects when the request or the
     * result stream fails.
     */
    async startTranscribe() {
        const command = new StartStreamTranscriptionCommand({
            AudioStream: this.audioGenerator(),
            LanguageCode: this.config.LanguageCode,
            LanguageModelName: this.config.LanguageModelName,
            MediaEncoding: this.config.MediaEncoding,
            MediaSampleRateHertz: this.config.MediaSampleRateHertz,
            ShowSpeakerLabel: this.config.ShowSpeakerLabel,
        });

        const awsResponse = await this.transcribeClient.send(command);

        for await (const event of awsResponse.TranscriptResultStream) {
            // Skip stream members that carry no transcript.
            if (!event.TranscriptEvent || !event.TranscriptEvent.Transcript) {
                continue;
            }
            const results = event.TranscriptEvent.Transcript.Results || [];

            if (this.resultsCallback) {
                this.resultsCallback(results);
            }

            if (results.length === 0) {
                continue;
            }
            const alternatives = results[0].Alternatives;
            if (!alternatives || alternatives.length === 0) {
                continue;
            }

            const isFinal = !results[0].IsPartial;
            const transcript = alternatives[0].Transcript;

            this.writeTranscript(transcript, isFinal);

            // Hand the untruncated transcript to the caller-supplied handler.
            if (this.transcriptCallback) {
                this.transcriptCallback(transcript, isFinal);
            }
        }
    }

}

module.exports.AmazonTranscribeProvider = AmazonTranscribeProvider;
108 changes: 103 additions & 5 deletions lib/ari-transcriber.js
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@

const rtp = require('./rtp-udp-server');
const provider = require('./google-speech-provider');
const awsprovider = require('./amazon-transcribe-provider');
const { LanguageCode } = require('@aws-sdk/client-transcribe-streaming');
const ari = require('./ari-controller');
const WebSocket = require('ws');
const fs = require('fs');
Expand Down Expand Up @@ -78,12 +80,60 @@ class AriTranscriber {
}
}

/**
* Output Amazon Transcribe results text with speaker infromation
* (Only works with awsConfig.ShowSpeakerLabel: true)
*
* @callback resultsCallback
* @param {ResultList} results Response data from Amazon Transcribe
*/
awsResultsCallback(results) {
if (!results[0] === undefined) {
console.log("Error: No response");
return;
}

if (0 < results.length && 0 < results[0].Alternatives.length) {
const transcript = results[0].Alternatives[0].Transcript;
const isFinal = !results[0].IsPartial;
const speakers = { "unknown_speaker": "" };

if (!isFinal) {
return;
}

// Combine words by speaker ID
for (const item of results[0].Alternatives[0].Items) {
if (undefined === item.Speaker) {
speakers.unknown_speaker += item.Content;
continue;
}
if (undefined === speakers[item.Speaker]) {
speakers[item.Speaker] = "";
}
speakers[item.Speaker] += item.Content;
}

console.log(""); // Output newline
for (const key in speakers) {
// Output transcribe with speaker ID
console.log(` word: ${speakers[key]}, speakerTag: ${key}`);
}
}
}

// The main wrapper
async transcriber() {
let speechEncoding;
let speechRate;
let swap16 = false;

// https://docs.aws.amazon.com/transcribe/latest/dg/how-input.html
if (this.opts.format != "slin16" && this.opts.speechProvider == "aws") {
this.opts.format = "slin16";
console.warn('WARNING: Format replaced forcefully to "slin16". For Amazon Transcribe.');
}

switch(this.opts.format) {
case "ulaw":
speechEncoding = "MULAW";
Expand Down Expand Up @@ -150,18 +200,66 @@ class AriTranscriber {
config.enableSpeakerDiarization = true;
config.diarizationSpeakerCount = 5;
}

const awsConfig = {
region: process.env.AWS_REGION,
credentials: {
accessKeyId: process.env.AWS_ACCESS_KEY_ID,
secretAccessKey: process.env.AWS_SECRET_ACCESS_KEY,
},
LanguageCode: this.opts.speechLang,
LanguageModelName: undefined,
MediaEncoding: "pcm",
MediaSampleRateHertz: speechRate,
ShowSpeakerLabel: false,
};
if (this.opts.speakerDiarization) {
awsConfig.ShowSpeakerLabel = true;
}
if (this.opts.speechModel != "default") {
awsConfig.LanguageModelName = this.opts.speechModel;
}

// Start the speech provider passing in the audio server socket.
this.speechProvider = new provider.GoogleSpeechProvider(config, this.audioServer,
switch(this.opts.speechProvider) {
case "aws": // Amazon Transcribe
if (!Object.values(LanguageCode).find((item) => item === awsConfig.LanguageCode)) {
console.error("Invalid Value: This language is NOT supported by AWS-SDK: " + awsConfig.LanguageCode);
process.exit(1);
}

this.speechProvider = new awsprovider.AmazonTranscribeProvider(awsConfig, this.audioServer,
(text, isFinal) => {
this.transcriptCallback(text, isFinal);
this.transcriptCallback(text, isFinal);
},
(results) => {
if (this.opts.speakerDiarization) {
this.resultsCallback(results);
this.awsResultsCallback(results);
}
}
);
break;

case "google": // FALLTHROUGH
default: // Google Speech-to-Text

if ( !provider.GoogleSupportedLanguages.find((item) => item === config.languageCode) ) {
console.error("Invalid Value: This language is NOT supported by Google Speech-to-Text: " + config.languageCode);
process.exit(1);
}

// Start the speech provider passing in the audio server socket.
this.speechProvider = new provider.GoogleSpeechProvider(config, this.audioServer,
(text, isFinal) => {
this.transcriptCallback(text, isFinal);
},
);
(results) => {
if (this.opts.speakerDiarization) {
this.resultsCallback(results);
}
}
);
break;
}

// Kick the whole process off by creating the channels and bridges.
console.log("Creating Bridge and Channels");
Expand Down
34 changes: 34 additions & 0 deletions lib/google-speech-provider.js
Original file line number Diff line number Diff line change
Expand Up @@ -221,4 +221,38 @@ class GoogleSpeechProvider {
}
}

// Language codes accepted for the Google Speech-to-Text provider.
// Source: https://cloud.google.com/speech-to-text/docs/speech-to-text-supported-languages
// Each code appears exactly once (a duplicated "km-KH" entry was removed).
module.exports.GoogleSupportedLanguages = [
    "af-ZA", "sq-AL", "am-ET", "ar-DZ", "ar-BH",
    "ar-EG", "ar-IQ", "ar-IL", "ar-JO", "ar-KW",
    "ar-LB", "ar-MR", "ar-MA", "ar-OM", "ar-QA",
    "ar-SA", "ar-PS", "ar-SY", "ar-TN", "ar-AE",
    "ar-YE", "hy-AM", "az-AZ", "eu-ES", "bn-BD",
    "bn-IN", "bs-BA", "bg-BG", "my-MM", "ca-ES",
    "cmn-Hans-CN", "cmn-Hans-HK", "cmn-Hant-TW", "yue-Hant-HK", "hr-HR",
    "cs-CZ", "da-DK", "nl-BE", "nl-NL", "en-AU",
    "en-CA", "en-GH", "en-HK", "en-IN", "en-IE",
    "en-KE", "en-NZ", "en-NG", "en-PK", "en-PH",
    "en-SG", "en-ZA", "en-TZ", "en-GB", "en-US",
    "et-EE", "fil-PH", "fi-FI", "fr-BE", "fr-CA",
    "fr-FR", "fr-CH", "gl-ES", "ka-GE", "de-AT",
    "de-DE", "de-CH", "el-GR", "gu-IN", "iw-IL",
    "hi-IN", "hu-HU", "is-IS", "id-ID", "it-IT",
    "it-CH", "ja-JP", "jv-ID", "kn-IN", "kk-KZ",
    "km-KH", "rw-RW", "ko-KR", "lo-LA",
    "lv-LV", "lt-LT", "mk-MK", "ms-MY", "ml-IN",
    "mr-IN", "mn-MN", "ne-NP", "no-NO", "fa-IR",
    "pl-PL", "pt-BR", "pt-PT", "pa-Guru-IN", "ro-RO",
    "ru-RU", "sr-RS", "si-LK", "sk-SK", "sl-SI",
    "st-ZA", "es-AR", "es-BO", "es-CL", "es-CO",
    "es-CR", "es-DO", "es-EC", "es-SV", "es-GT",
    "es-HN", "es-MX", "es-NI", "es-PA", "es-PY",
    "es-PE", "es-PR", "es-ES", "es-US", "es-UY",
    "es-VE", "su-ID", "sw-KE", "sw-TZ", "ss-Latn-ZA",
    "sv-SE", "ta-IN", "ta-MY", "ta-SG", "ta-LK",
    "te-IN", "th-TH", "ts-ZA", "tn-Latn-ZA", "tr-TR",
    "uk-UA", "ur-IN", "uz-UZ", "ve-ZA", "vi-VN",
    "xh-ZA", "zu-ZA"
];

module.exports.GoogleSpeechProvider = GoogleSpeechProvider;
1 change: 1 addition & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
"node": ">=10"
},
"dependencies": {
"@aws-sdk/client-transcribe-streaming": "^3.812.0",
"@google-cloud/speech": "^3.3.1",
"ari-client": "^2.2.0",
"chalk": "^2.4.2",
Expand Down