Skip to content
This repository was archived by the owner on May 6, 2022. It is now read-only.

Commit 56415e5

Browse files
authored
Merge pull request #40 from spokestack/jz-azure
ASR via Azure Speech Service
2 parents 8f96cfe + a2ad865 commit 56415e5

File tree

12 files changed

+633
-982
lines changed

12 files changed

+633
-982
lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
Spokestack provides an extensible speech recognition pipeline for the Android
44
platform. It includes a variety of built-in speech processors for Voice
55
Activity Detection (VAD) and Automatic Speech Recognition (ASR) via popular
6-
speech recognition services, such as the Google Speech API and Bing Speech
6+
speech recognition services such as the Google Speech API and Azure Speech
77
API.
88

99
See the [documentation](https://spokestack.io/docs) for a lot more information

pom.xml

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,10 @@
4747
<id>jcenter</id>
4848
<url>https://jcenter.bintray.com/</url>
4949
</repository>
50+
<repository>
51+
<id>microsoft</id>
52+
<url>https://csspeechstorage.blob.core.windows.net/maven/</url>
53+
</repository>
5054
</repositories>
5155

5256
<distributionManagement>
@@ -106,7 +110,16 @@
106110
<scope>provided</scope>
107111
</dependency>
108112

109-
<!-- microsoft speech api / spokestack TTS -->
113+
<!-- azure speech service -->
114+
<dependency>
115+
<groupId>com.microsoft.cognitiveservices.speech</groupId>
116+
<artifactId>client-sdk</artifactId>
117+
<version>1.9.0</version>
118+
<type>aar</type>
119+
<scope>provided</scope>
120+
</dependency>
121+
122+
<!-- spokestack TTS -->
110123
<dependency>
111124
<groupId>com.squareup.okhttp3</groupId>
112125
<artifactId>okhttp</artifactId>
Lines changed: 255 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,255 @@
1+
package io.spokestack.spokestack.microsoft;
2+
3+
import com.microsoft.cognitiveservices.speech.CancellationReason;
4+
import com.microsoft.cognitiveservices.speech.ProfanityOption;
5+
import com.microsoft.cognitiveservices.speech.ResultReason;
6+
import com.microsoft.cognitiveservices.speech.SpeechRecognitionCanceledEventArgs;
7+
import com.microsoft.cognitiveservices.speech.SpeechRecognitionEventArgs;
8+
import com.microsoft.cognitiveservices.speech.SpeechRecognizer;
9+
import com.microsoft.cognitiveservices.speech.audio.AudioConfig;
10+
import com.microsoft.cognitiveservices.speech.audio.AudioInputStream;
11+
import com.microsoft.cognitiveservices.speech.audio.PushAudioInputStream;
12+
import com.microsoft.cognitiveservices.speech.util.EventHandler;
13+
import io.spokestack.spokestack.SpeechConfig;
14+
import io.spokestack.spokestack.SpeechContext;
15+
import io.spokestack.spokestack.SpeechProcessor;
16+
17+
import java.nio.ByteBuffer;
18+
import java.nio.ByteOrder;
19+
20+
/**
21+
* microsoft azure speech service recognizer
22+
*
23+
* <p>
24+
* This component implements the speech processor interface using the Azure
25+
* Speech Service for speech recognition.
26+
* </p>
27+
*
28+
* <p>
29+
* When the speech context is triggered, the recognizer begins streaming
30+
* buffered frames to the API for recognition. Once the speech context becomes
31+
* inactive, the recognizer raises a RECOGNIZE event along with the audio
32+
* transcript. Unfortunately, the Azure Speech SDK currently doesn't return
33+
* confidence values alongside transcripts, so confidence is always set to 1.0.
34+
* </p>
35+
*
36+
* <p>
37+
* Use of the Azure Speech Service implies acceptance of Microsoft's license
38+
* terms, which can be found
39+
* <a href=
40+
* "https://csspeechstorage.blob.core.windows.net/drop/license201809.html">
41+
* here</a>.
42+
* </p>
43+
*
44+
* <p>
45+
* This pipeline component requires the following configuration properties:
46+
* </p>
47+
* <ul>
48+
* <li>
49+
* <b>sample-rate</b> (integer): audio sampling rate, in Hz
50+
* </li>
51+
* <li>
52+
* <b>frame-width</b> (integer): speech frame width, in ms
53+
* </li>
54+
* <li>
55+
* <b>locale</b> (string): language code for speech recognition
56+
* </li>
57+
* <li>
58+
* <b>azure-api-key</b> (string): API key for the Azure Speech
59+
* service
60+
* </li>
61+
* <li>
62+
* <b>azure-region</b> (string): Azure Speech service region
63+
* </li>
64+
* </ul>
65+
*/
66+
public class AzureSpeechRecognizer implements SpeechProcessor {
67+
private final com.microsoft.cognitiveservices.speech.SpeechConfig msConfig;
68+
69+
private SpeechRecognizer recognizer;
70+
private PushAudioInputStream audioStream;
71+
private AudioConfig audioConfig;
72+
private boolean active;
73+
74+
// Azure speech requires little-endian (wav-format) data, so we buffer
75+
// audio frames internally to avoid mutating data coming from the speech
76+
// context
77+
private ByteBuffer buffer;
78+
79+
/**
80+
* initializes a new recognizer instance.
81+
*
82+
* @param speechConfig Spokestack speech configuration
83+
*/
84+
public AzureSpeechRecognizer(SpeechConfig speechConfig) {
85+
String apiKey = speechConfig.getString("azure-api-key");
86+
String region = speechConfig.getString("azure-region");
87+
int sampleRate = speechConfig.getInteger("sample-rate");
88+
89+
if (sampleRate != 16000) {
90+
throw new IllegalArgumentException(
91+
"Azure only supports a 16kHz sample rate; found: "
92+
+ sampleRate);
93+
}
94+
95+
this.buffer = ByteBuffer.allocateDirect(4096)
96+
.order(ByteOrder.LITTLE_ENDIAN);
97+
this.msConfig = createMsConfig(apiKey, region);
98+
}
99+
100+
com.microsoft.cognitiveservices.speech.SpeechConfig createMsConfig(
101+
String apiKey, String region) {
102+
com.microsoft.cognitiveservices.speech.SpeechConfig config =
103+
com.microsoft.cognitiveservices.speech.SpeechConfig
104+
.fromSubscription(apiKey, region);
105+
config.setProfanity(ProfanityOption.Raw);
106+
return config;
107+
}
108+
109+
/**
110+
* releases the resources associated with the recognizer.
111+
*/
112+
public void close() {
113+
if (this.audioStream != null) {
114+
this.audioStream.close();
115+
this.audioStream = null;
116+
}
117+
if (this.recognizer != null) {
118+
this.recognizer.close();
119+
this.recognizer = null;
120+
}
121+
}
122+
123+
/**
124+
* processes a frame of audio.
125+
*
126+
* @param speechContext the current speech context
127+
* @param frame the audio frame to detect
128+
*
129+
* @throws Exception if there is an error performing active recognition.
130+
*/
131+
public void process(SpeechContext speechContext, ByteBuffer frame)
132+
throws Exception {
133+
if (speechContext.isActive() && !this.active) {
134+
begin(speechContext);
135+
} else if (!speechContext.isActive() && this.active) {
136+
commit();
137+
} else if (speechContext.isActive()) {
138+
bufferFrame(frame);
139+
}
140+
}
141+
142+
void begin(SpeechContext speechContext) {
143+
this.audioStream = AudioInputStream.createPushStream();
144+
this.audioConfig = AudioConfig.fromStreamInput(this.audioStream);
145+
this.recognizer = createRecognizer(speechContext);
146+
recognizer.startContinuousRecognitionAsync();
147+
this.active = true;
148+
149+
// send any existing frames into the stream
150+
for (ByteBuffer frame : speechContext.getBuffer()) {
151+
bufferFrame(frame);
152+
}
153+
}
154+
155+
SpeechRecognizer createRecognizer(SpeechContext context) {
156+
// factored into a separate method for testing
157+
SpeechRecognizer rec = new SpeechRecognizer(msConfig, audioConfig);
158+
listen(rec, context);
159+
return rec;
160+
}
161+
162+
private void listen(SpeechRecognizer rec, SpeechContext context) {
163+
RecognitionListener recognitionListener =
164+
new RecognitionListener(context);
165+
rec.recognized.addEventListener(recognitionListener);
166+
167+
CancellationListener cancellationListener =
168+
new CancellationListener(context);
169+
rec.canceled.addEventListener(cancellationListener);
170+
}
171+
172+
void bufferFrame(ByteBuffer frame) {
173+
if (frame != null) {
174+
if (this.buffer.remaining() < frame.capacity()) {
175+
flush();
176+
}
177+
178+
frame.rewind();
179+
this.buffer.put(frame);
180+
}
181+
}
182+
183+
void commit() throws Exception {
184+
// send the end of audio
185+
flush();
186+
this.audioStream.close();
187+
this.recognizer.stopContinuousRecognitionAsync().get();
188+
this.recognizer.close();
189+
this.audioConfig.close();
190+
this.active = false;
191+
}
192+
193+
private void flush() {
194+
if (this.buffer.hasArray()) {
195+
this.buffer.flip();
196+
this.audioStream.write(this.buffer.array());
197+
this.buffer.clear();
198+
}
199+
}
200+
201+
/**
202+
* Listener for Speech SDK recognition events.
203+
*/
204+
static class RecognitionListener
205+
implements EventHandler<SpeechRecognitionEventArgs> {
206+
private SpeechContext speechContext;
207+
208+
RecognitionListener(SpeechContext context) {
209+
this.speechContext = context;
210+
}
211+
212+
@Override
213+
public void onEvent(
214+
Object sender,
215+
SpeechRecognitionEventArgs recognitionArgs) {
216+
if (recognitionArgs.getResult().getReason()
217+
== ResultReason.RecognizedSpeech) {
218+
String transcript = recognitionArgs.getResult().getText();
219+
this.speechContext.setTranscript(transcript);
220+
this.speechContext.setConfidence(1.0);
221+
this.speechContext.dispatch(SpeechContext.Event.RECOGNIZE);
222+
}
223+
}
224+
}
225+
226+
/**
227+
* Listener for Speech SDK cancellation events.
228+
*/
229+
static class CancellationListener
230+
implements EventHandler<SpeechRecognitionCanceledEventArgs> {
231+
232+
private SpeechContext speechContext;
233+
234+
CancellationListener(SpeechContext context) {
235+
this.speechContext = context;
236+
}
237+
238+
@Override
239+
public void onEvent(
240+
Object sender,
241+
SpeechRecognitionCanceledEventArgs cancellationArgs) {
242+
if (cancellationArgs.getReason()
243+
== CancellationReason.Error) {
244+
245+
String message = String.format(
246+
"%s (error code %s)",
247+
cancellationArgs.getErrorDetails(),
248+
cancellationArgs.getErrorCode().name());
249+
250+
this.speechContext.setError(new Exception(message));
251+
this.speechContext.dispatch(SpeechContext.Event.ERROR);
252+
}
253+
}
254+
}
255+
}

0 commit comments

Comments
 (0)