asterisk · yamajun · Jun 1, 2025 · Jun 2, 2025 · Jun 25, 2025
diff --git a/README.md b/README.md
@@ -11,10 +11,15 @@ the audio from a bridge using the Google Speech APIs.
 * A functional Asterisk 16.6.0+ installation.
 * A conference bridge or phone configured.
 * Node.JS version 10 or greater.
-* Google Speech API credentials set in environment variable GOOGLE_APPLICATION_CREDENTIALS.  
-See https://cloud.google.com/speech-to-text/docs/ for more information.
-
-Run `npm install` from the top of the source tree.
+* Google Speech API credentials set in environment variable `GOOGLE_APPLICATION_CREDENTIALS`.
+  * See https://cloud.google.com/speech-to-text/docs/ for more information.
+* For Amazon Transcribe: AWS credentials and region set in environment variables.
+  * See https://docs.aws.amazon.com/sdkref/latest/guide/environment-variables.html
+    - `AWS_ACCESS_KEY_ID`
+    - `AWS_SECRET_ACCESS_KEY`
+    - `AWS_REGION`
+
+Run `npm install` and `npm audit fix` from the top of the source tree.
 This will install the required npm packages including `node-ari-client` and `@google-cloud/speech`.
 You can then run the transcriber as `bin/ari-transcriber`.  If you add the `-g`
 option to `npm install` to install system wide, you can just run `ari-transcriber`. 
@@ -36,7 +41,8 @@ Incoming Audio Server
 
 Speech
   --speechModel         Google Speech API model                  [string] [choices: "phone_call", "video", "default"] [default: "default"]
-  --speechLang          BCP-47 Language code.  en-US, fr-CA, etc.                  [string] [choices: "en-US", "fr-CA"] [default: "en-US"]
+  --speechProvider      Speech engine provider                                     [string] [choices: "google", "aws"] [default: "google"]
+  --speechLang          BCP-47 Language code.  en-US, fr-CA, etc.  [string] [choices: Language supported by Google/AWS] [default: "en-US"]
   --speakerDiarization  Outputs words associated to speaker index to the console.                               [boolean] [default: false]
 
 ARI
@@ -85,7 +91,21 @@ wouldn't need the Local channel and mixing bridge.
 You don't need the WebSocket transcription server to try this.
 Just a phone to call.
 
-```
+```sh
 $ export GOOGLE_APPLICATION_CREDENTIALS=<path to Google API credentials>
 $ ari-transcriber --format=slin16 'Local/1234'
-````
+```
+
+Amazon Transcribe mode:
+
+```sh
+$ cat > env.sh
+export AWS_ACCESS_KEY_ID=<AWS IAM access key>
+export AWS_SECRET_ACCESS_KEY=<AWS IAM secret key>
+export AWS_REGION=<AWS region>
+(Type Ctrl+D)
+
+$ source env.sh
+$ ari-transcriber --format=slin16 --speechProvider=aws 'Local/1234'
+```
+
diff --git a/bin/ari-transcriber b/bin/ari-transcriber
@@ -69,13 +69,21 @@ require(`yargs`)
 		},
 		speechLang: {
 			default: 'en-US',
-			choices: ['en-US', 'fr-CA'],
 			global: true,
 			requiresArg: true,
 			description: "BCP-47 Language code.  en-US, fr-CA, etc.",
 			group: "Speech",
 			type: 'string',
 		},
+		speechProvider: {
+			default: 'google',
+			choices: ['google', 'aws', ],
+			global: true,
+			requiresArg: true,
+			description: "Speech Provider",
+			group: "Speech",
+			type: 'string',
+		},
 		speakerDiarization: {
 			default: false,
 			global: true,

diff --git a/lib/amazon-transcribe-provider.js b/lib/amazon-transcribe-provider.js
@@ -0,0 +1,124 @@
+/*
+ * Amazon Transcribe speech engine provider
+ * for Asterisk External Media Sample
+ */
+
+const chalk = require('chalk');
+// Note: the "node:stream" specifier does not work on Node.js 10.0 to 14.17.6, so plain "stream" is required here.
+const { Transform, PassThrough } = require('stream');
+const {
+	TranscribeStreamingClient,
+	StartStreamTranscriptionCommand,
+} = require('@aws-sdk/client-transcribe-streaming');
+
+/**
+ * Amazon Transcribe Provider
+ *
+ * @class
+ */
+class AmazonTranscribeProvider {
+
+	/**
+	 * Pipe the audio socket into an internal buffer stream and start transcription.
+	 * @param {Object} config	Passed to TranscribeStreamingClient(); also read for the StartStreamTranscriptionCommand fields
+	 * @param {RtpUdpServerSocket.server(extended from node:dgram.Stream)} socket	Raw-RTP audio data socket
+	 * @param {transcriptCallback} transcriptCallback	Stored on the instance; not invoked anywhere in this class
+	 * @param {resultsCallback} resultsCallback	Called with the Results list of every transcript event, when set
+	 */
+	constructor(config, socket, transcriptCallback, resultsCallback) {
+
+		this.config = config;
+		this.socket = socket;
+		this.transcriptCallback = transcriptCallback;
+		this.resultsCallback = resultsCallback;
+
+		this.audioInputPayloadStream = new PassThrough({ highWaterMark: 1 * 1024 }); // Stream chunk less than 1 KB
+
+		this.audioInputStreamTransform = new Transform({
+			readableHighWaterMark: 1 * 1024,
+			transform: (chunk, encoding, callback) => {
+				this.transformer(chunk, encoding, callback);
+			},
+		});
+
+		this.socket.pipe(this.audioInputStreamTransform);
+
+		// Initialize Amazon Transcribe
+		this.transcribeClient = new TranscribeStreamingClient(this.config);
+
+		this.startTranscribe(); // NOTE(review): promise is not awaited, so a rejection here is unhandled
+	}
+
+	/**
+	 * Forward one audio chunk into the payload stream consumed by audioGenerator().
+	 *
+	 * @param {Buffer} chunk		Audio chunk from Asterisk
+	 * @param {String} encoding		Unused
+	 * @param {Function} done_callback	Invoked after the chunk has been written
+	 */
+	transformer(chunk, encoding, done_callback) {
+		this.audioInputPayloadStream.write(chunk);
+
+		done_callback();
+	}
+
+	/**
+	 * Async generator that wraps each buffered chunk as an AudioEvent; stream errors are logged and rethrown.
+	 */
+	async* audioGenerator() {
+		try {
+			for await (const chunk of this.audioInputPayloadStream) {
+				yield { AudioEvent: { AudioChunk: chunk } };
+			}
+
+		} catch (error) {
+			console.error("Exception at Audio stream: ", error);
+			throw error; // rethrow the caught error so the consumer sees the real stream failure
+		}
+	}
+
+	/**
+	 * Send the streaming request, then print every transcript event (final in green, partial in yellow).
+	 */
+	async startTranscribe() {
+		const command = new StartStreamTranscriptionCommand({
+			AudioStream: this.audioGenerator(),
+			LanguageCode: this.config.LanguageCode,
+			LanguageModelName: this.config.LanguageModelName,
+			MediaEncoding: this.config.MediaEncoding,
+			MediaSampleRateHertz: this.config.MediaSampleRateHertz,
+			ShowSpeakerLabel: this.config.ShowSpeakerLabel,
+		});
+
+		const awsResponse = await this.transcribeClient.send(command);
+
+		for await (const event of awsResponse.TranscriptResultStream) {
+			const results = event.TranscriptEvent.Transcript.Results;
+
+			if (this.resultsCallback) {
+				this.resultsCallback(results);
+			}
+
+			if (0 < results.length && 0 < results[0].Alternatives.length) {
+				const isFinal = !results[0].IsPartial;
+				let stdoutText = results[0].Alternatives[0].Transcript;
+
+				process.stdout.clearLine();
+				process.stdout.cursorTo(0);
+
+				if (isFinal) {
+					process.stdout.write(chalk.green(`${stdoutText}\n`));
+				} else {
+					// Make sure transcript does not exceed console character length
+					if (stdoutText.length > process.stdout.columns) {
+						stdoutText = stdoutText.substring(0, process.stdout.columns - 4) + '...';
+					}
+					process.stdout.write(chalk.yellow(`${stdoutText}`));
+				}
+			}
+		}
+	}
+
+}
+
+module.exports.AmazonTranscribeProvider = AmazonTranscribeProvider;
diff --git a/lib/ari-transcriber.js b/lib/ari-transcriber.js
@@ -17,6 +17,8 @@
 
 const rtp = require('./rtp-udp-server');
 const provider = require('./google-speech-provider');
+const awsprovider = require('./amazon-transcribe-provider');
+const { LanguageCode } = require('@aws-sdk/client-transcribe-streaming');
 const ari = require('./ari-controller');
 const WebSocket = require('ws'); 
 const fs = require('fs');
@@ -78,12 +80,60 @@ class AriTranscriber {
 		}
 	}
 
+	/**
+	 * Output Amazon Transcribe results text with speaker information
+	 * (Only works with awsConfig.ShowSpeakerLabel: true)
+	 *
+	 * @callback resultsCallback
+	 * @param {ResultList} results	Response data from Amazon Transcribe
+	 */
+	awsResultsCallback(results) {
+		// A missing payload is the "no response" case; `!results[0] === undefined` compared a boolean and never fired.
+		if (results === undefined || results === null) {
+			console.log("Error: No response");
+			return;
+		}
+
+		if (0 < results.length && 0 < results[0].Alternatives.length) {
+			const isFinal = !results[0].IsPartial;
+			const speakers = { "unknown_speaker": "" };
+
+			if (!isFinal) {
+				return;
+			}
+
+			// Combine words by speaker ID
+			for (const item of results[0].Alternatives[0].Items) {
+				if (undefined === item.Speaker) {
+					speakers.unknown_speaker += item.Content;
+					continue;
+				}
+				if (undefined === speakers[item.Speaker]) {
+					speakers[item.Speaker] = "";
+				}
+				speakers[item.Speaker] += item.Content;
+			}
+
+			console.log(""); // Output newline
+			for (const key in speakers) {
+				// Output transcribe with speaker ID
+				console.log(` word: ${speakers[key]}, speakerTag: ${key}`);
+			}
+		}
+	}
+
 	// The main wrapper
 	async transcriber() {
 		let speechEncoding;
 		let speechRate;
 		let swap16 = false;
 
+		// https://docs.aws.amazon.com/transcribe/latest/dg/how-input.html
+		if (this.opts.format != "slin16" && this.opts.speechProvider == "aws") {
+			this.opts.format = "slin16";
+			console.warn('WARNING: Format replaced forcefully to "slin16".  For Amazon Transcribe.');
+		}
+
 		switch(this.opts.format) {
 		case "ulaw":
 			speechEncoding = "MULAW";
@@ -150,18 +200,66 @@ class AriTranscriber {
 			config.enableSpeakerDiarization = true;
 			config.diarizationSpeakerCount = 5;
 		}
+
+		const awsConfig = {
+			region: process.env.AWS_REGION,
+			credentials: {
+				accessKeyId: process.env.AWS_ACCESS_KEY_ID,
+				secretAccessKey: process.env.AWS_SECRET_ACCESS_KEY,
+			},
+			LanguageCode: this.opts.speechLang,
+			LanguageModelName: undefined,
+			MediaEncoding: "pcm",
+			MediaSampleRateHertz: speechRate,
+			ShowSpeakerLabel: false,
+		};
+		if (this.opts.speakerDiarization) {
+			awsConfig.ShowSpeakerLabel = true;
+		}
+		if (this.opts.speechModel != "default") {
+			awsConfig.LanguageModelName = this.opts.speechModel;
+		}
 
-		// Start the speech provider passing in the audio server socket.
-		this.speechProvider = new provider.GoogleSpeechProvider(config, this.audioServer,
+		switch(this.opts.speechProvider) {
+		case "aws": // Amazon Transcribe
+			if (!Object.values(LanguageCode).find((item) => item === awsConfig.LanguageCode)) {
+				console.error("Invalid Value: This language is NOT supported by AWS-SDK: " + awsConfig.LanguageCode);
+				process.exit(1);
+			}
+
+			this.speechProvider = new awsprovider.AmazonTranscribeProvider(awsConfig, this.audioServer,
 				(text, isFinal) => {
-					this.transcriptCallback(text, isFinal);			
+					this.transcriptCallback(text, isFinal);
 				},
 				(results) => {
 					if (this.opts.speakerDiarization) {
-						this.resultsCallback(results);
+						this.awsResultsCallback(results);
 					}
+				}
+			);
+			break;
+
+		case "google": // FALLTHROUGH
+		default: // Google Speech-to-Text
+
+			if ( !provider.GoogleSupportedLanguages.find((item) => item === config.languageCode) ) {
+				console.error("Invalid Value: This language is NOT supported by Google Speech-to-Text: " + config.languageCode);
+				process.exit(1);
+			}
+
+			// Start the speech provider passing in the audio server socket.
+			this.speechProvider = new provider.GoogleSpeechProvider(config, this.audioServer,
+				(text, isFinal) => {
+					this.transcriptCallback(text, isFinal);
 				},
-		);
+				(results) => {
+					if (this.opts.speakerDiarization) {
+						this.resultsCallback(results);
+					}
+				}
+			);
+			break;
+		}
 
 		// Kick the whole process off by creating the channels and bridges.
 		console.log("Creating Bridge and Channels");

diff --git a/lib/google-speech-provider.js b/lib/google-speech-provider.js
@@ -221,4 +221,38 @@ class GoogleSpeechProvider {
 	}
 }
 
+// Language codes checked before GoogleSpeechProvider is started. Source: https://cloud.google.com/speech-to-text/docs/speech-to-text-supported-languages
+module.exports.GoogleSupportedLanguages = [
+	"af-ZA", "sq-AL", "am-ET", "ar-DZ", "ar-BH",
+	"ar-EG", "ar-IQ", "ar-IL", "ar-JO", "ar-KW",
+	"ar-LB", "ar-MR", "ar-MA", "ar-OM", "ar-QA",
+	"ar-SA", "ar-PS", "ar-SY", "ar-TN", "ar-AE",
+	"ar-YE", "hy-AM", "az-AZ", "eu-ES", "bn-BD",
+	"bn-IN", "bs-BA", "bg-BG", "my-MM", "ca-ES",
+	"cmn-Hans-CN", "cmn-Hans-HK", "cmn-Hant-TW", "yue-Hant-HK", "hr-HR",
+	"cs-CZ", "da-DK", "nl-BE", "nl-NL", "en-AU",
+	"en-CA", "en-GH", "en-HK", "en-IN", "en-IE",
+	"en-KE", "en-NZ", "en-NG", "en-PK", "en-PH",
+	"en-SG", "en-ZA", "en-TZ", "en-GB", "en-US",
+	"et-EE", "fil-PH", "fi-FI", "fr-BE", "fr-CA",
+	"fr-FR", "fr-CH", "gl-ES", "ka-GE", "de-AT",
+	"de-DE", "de-CH", "el-GR", "gu-IN", "iw-IL",
+	"hi-IN", "hu-HU", "is-IS", "id-ID", "it-IT",
+	"it-CH", "ja-JP", "jv-ID", "kn-IN", "kk-KZ",
+	"km-KH", "rw-RW", "ko-KR", "lo-LA",
+	"lv-LV", "lt-LT", "mk-MK", "ms-MY", "ml-IN",
+	"mr-IN", "mn-MN", "ne-NP", "no-NO", "fa-IR",
+	"pl-PL", "pt-BR", "pt-PT", "pa-Guru-IN", "ro-RO",
+	"ru-RU", "sr-RS", "si-LK", "sk-SK", "sl-SI",
+	"st-ZA", "es-AR", "es-BO", "es-CL", "es-CO",
+	"es-CR", "es-DO", "es-EC", "es-SV", "es-GT",
+	"es-HN", "es-MX", "es-NI", "es-PA", "es-PY",
+	"es-PE", "es-PR", "es-ES", "es-US", "es-UY",
+	"es-VE", "su-ID", "sw-KE", "sw-TZ", "ss-Latn-ZA",
+	"sv-SE", "ta-IN", "ta-MY", "ta-SG", "ta-LK",
+	"te-IN", "th-TH", "ts-ZA", "tn-Latn-ZA", "tr-TR",
+	"uk-UA", "ur-IN", "uz-UZ", "ve-ZA", "vi-VN",
+	"xh-ZA", "zu-ZA"
+];
+
 module.exports.GoogleSpeechProvider = GoogleSpeechProvider; 
diff --git a/package.json b/package.json
@@ -27,6 +27,7 @@
         "node": ">=10"
     },
     "dependencies": {
+        "@aws-sdk/client-transcribe-streaming": "^3.812.0",
         "@google-cloud/speech": "^3.3.1",
         "ari-client": "^2.2.0",
         "chalk": "^2.4.2",