View Javadoc
1   /*
2    * Licensed to The Apereo Foundation under one or more contributor license
3    * agreements. See the NOTICE file distributed with this work for additional
4    * information regarding copyright ownership.
5    *
6    *
7    * The Apereo Foundation licenses this file to you under the Educational
8    * Community License, Version 2.0 (the "License"); you may not use this file
9    * except in compliance with the License. You may obtain a copy of the License
10   * at:
11   *
12   *   http://opensource.org/licenses/ecl2.txt
13   *
14   * Unless required by applicable law or agreed to in writing, software
15   * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
16   * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
17   * License for the specific language governing permissions and limitations under
18   * the License.
19   *
20   */
21  
22  package org.opencastproject.transcription.microsoft.azure;
23  
24  import org.opencastproject.transcription.microsoft.azure.model.MicrosoftAzureSpeechTranscription;
25  import org.opencastproject.transcription.microsoft.azure.model.MicrosoftAzureSpeechTranscriptionFile;
26  import org.opencastproject.transcription.microsoft.azure.model.MicrosoftAzureSpeechTranscriptionFiles;
27  import org.opencastproject.transcription.microsoft.azure.model.MicrosoftAzureSpeechTranscriptionJson;
28  import org.opencastproject.transcription.microsoft.azure.model.MicrosoftAzureSpeechTranscriptions;
29  import org.opencastproject.workspace.api.Workspace;
30  
31  import com.google.gson.Gson;
32  import com.google.gson.GsonBuilder;
33  
34  import org.apache.commons.lang3.StringUtils;
35  import org.apache.http.HttpStatus;
36  import org.apache.http.client.methods.CloseableHttpResponse;
37  import org.apache.http.client.methods.HttpDelete;
38  import org.apache.http.client.methods.HttpGet;
39  import org.apache.http.client.methods.HttpPost;
40  import org.apache.http.entity.ContentType;
41  import org.apache.http.entity.StringEntity;
42  import org.apache.http.impl.client.CloseableHttpClient;
43  import org.apache.http.util.EntityUtils;
44  import org.slf4j.Logger;
45  import org.slf4j.LoggerFactory;
46  
47  import java.io.ByteArrayInputStream;
48  import java.io.IOException;
49  import java.io.InputStream;
50  import java.net.URI;
51  import java.net.URLEncoder;
52  import java.nio.charset.StandardCharsets;
53  import java.util.HashMap;
54  import java.util.List;
55  import java.util.Map;
56  import java.util.UUID;
57  
58  public class MicrosoftAzureSpeechServicesClient {
59  
60    private static final Logger logger = LoggerFactory.getLogger(MicrosoftAzureSpeechServicesClient.class);
61    private static final String WORKSPACE_COLLECTION = "azure-speech-services";
62    private static final String DEFAULT_TRANSCRIPTION_TIME_TO_LIVE = "P14D";
63    private final String azureSpeechServicesEndpoint;
64    private final String azureCognitiveServicesSubscriptionKey;
65  
66    public MicrosoftAzureSpeechServicesClient(String azureSpeechServicesEndpoint,
67        String azureCognitiveServicesSubscriptionKey) {
68      this.azureSpeechServicesEndpoint = StringUtils.trimToEmpty(azureSpeechServicesEndpoint);
69      this.azureCognitiveServicesSubscriptionKey = StringUtils.trimToEmpty(azureCognitiveServicesSubscriptionKey);
70    }
71  
72    public List<MicrosoftAzureSpeechTranscription> getTranscriptions(int skip, int top)
73            throws IOException, MicrosoftAzureNotAllowedException, MicrosoftAzureSpeechClientException {
74      return getTranscriptions(skip, top, null);
75    }
76  
77    public List<MicrosoftAzureSpeechTranscription> getTranscriptions(int skip, int top, String filter)
78            throws IOException, MicrosoftAzureNotAllowedException, MicrosoftAzureSpeechClientException {
79      // Documentation:
80      // https://eastus.dev.cognitive.microsoft.com/docs/services/speech-to-text-api-v3-1/operations/Transcriptions_List
81      StringBuilder url = new StringBuilder(azureSpeechServicesEndpoint + "/speechtotext/v3.1/transcriptions");
82      StringBuilder params = new StringBuilder();
83      if (skip > 0) {
84        params.append("skip=" + skip);
85      }
86      if (top > 0) {
87        params.append("top=" + top);
88      }
89      if (StringUtils.isNotBlank(filter)) {
90        params.append("filter=" + URLEncoder.encode(filter, StandardCharsets.UTF_8));
91      }
92      if (params.length() > 0) {
93        url.append("?");
94        url.append(params);
95      }
96      try (CloseableHttpClient httpClient = HttpUtils.makeHttpClient()) {
97        HttpGet httpGet = new HttpGet(url.toString());
98        httpGet.addHeader("Ocp-Apim-Subscription-Key", azureCognitiveServicesSubscriptionKey);
99        try (CloseableHttpResponse response = httpClient.execute(httpGet)) {
100         int code = response.getStatusLine().getStatusCode();
101         String responseString = "";
102         if (response.getEntity() != null) {
103           responseString = EntityUtils.toString(response.getEntity());
104         }
105         switch (code) {
106           case HttpStatus.SC_OK: // 200
107             break;
108           case HttpStatus.SC_FORBIDDEN: // 403
109             throw new MicrosoftAzureNotAllowedException(String.format("Not allowed to get transcriptions. "
110                 + "Microsoft Azure Speech Services response: %s", responseString));
111           default:
112             throw new MicrosoftAzureSpeechClientException(String.format(
113                 "Getting transcriptions failed with HTTP response code %d. "
114                     + "Microsoft Azure Speech Services response: %s", code, responseString));
115         }
116         Gson gson = new GsonBuilder().create();
117         MicrosoftAzureSpeechTranscriptions transcriptions = gson.fromJson(responseString,
118             MicrosoftAzureSpeechTranscriptions.class);
119         return transcriptions.values;
120       }
121     }
122   }
123 
124   public MicrosoftAzureSpeechTranscription getTranscriptionById(String transcriptionId)
125           throws IOException, MicrosoftAzureNotAllowedException, MicrosoftAzureSpeechClientException,
126           MicrosoftAzureNotFoundException {
127     String transcriptionUrl = azureSpeechServicesEndpoint + "/speechtotext/v3.1/transcriptions/"
128         + StringUtils.trimToEmpty(transcriptionId);
129     return getTranscription(transcriptionUrl);
130   }
131 
132   public MicrosoftAzureSpeechTranscription getTranscription(String transcriptionUrl)
133           throws IOException, MicrosoftAzureNotAllowedException, MicrosoftAzureSpeechClientException,
134           MicrosoftAzureNotFoundException {
135     if (StringUtils.isBlank(transcriptionUrl)) {
136       throw new IllegalArgumentException("Transcription URL not set.");
137     }
138     // Documentation:
139     // https://eastus.dev.cognitive.microsoft.com/docs/services/speech-to-text-api-v3-1/operations/Transcriptions_Get
140     String url = StringUtils.trimToEmpty(transcriptionUrl);
141     try (CloseableHttpClient httpClient = HttpUtils.makeHttpClient()) {
142       HttpGet httpGet = new HttpGet(url);
143       httpGet.addHeader("Ocp-Apim-Subscription-Key", azureCognitiveServicesSubscriptionKey);
144       try (CloseableHttpResponse response = httpClient.execute(httpGet)) {
145         int code = response.getStatusLine().getStatusCode();
146         String responseString = "";
147         if (response.getEntity() != null) {
148           responseString = EntityUtils.toString(response.getEntity());
149         }
150         switch (code) {
151           case HttpStatus.SC_OK: // 200
152             break;
153           case HttpStatus.SC_FORBIDDEN: // 403
154             throw new MicrosoftAzureNotAllowedException(String.format("Not allowed to get transcription '%s'. "
155                     + "Microsoft Azure Speech Services response: %s", transcriptionUrl, responseString));
156           case HttpStatus.SC_NOT_FOUND: // 404
157             throw new MicrosoftAzureNotFoundException(String.format("Transcription '%s' not found.", transcriptionUrl));
158           default:
159             throw new MicrosoftAzureSpeechClientException(String.format(
160                 "Getting transcription '%s' failed with HTTP response code %d. "
161                     + "Microsoft Azure Speech Services  response: %s", transcriptionUrl, code, responseString));
162         }
163         Gson gson = new GsonBuilder().create();
164         return gson.fromJson(responseString, MicrosoftAzureSpeechTranscription.class);
165       }
166     }
167   }
168 
169   public MicrosoftAzureSpeechTranscription createTranscription(List<String> contentUrls, String destinationContainerUrl,
170       String displayName , String locale, List<String> candidateLocales, String timeToLive,
171       Map<String, Object> properties)
172           throws IOException, MicrosoftAzureNotAllowedException, MicrosoftAzureSpeechClientException {
173     // CHECKSTYLE:OFF checkstyle:LineLength
174     // Documentation:
175     // https://eastus.dev.cognitive.microsoft.com/docs/services/speech-to-text-api-v3-1/operations/Transcriptions_Create
176     // https://learn.microsoft.com/en-us/azure/cognitive-services/speech-service/batch-transcription-create?pivots=rest-api
177     // CHECKSTYLE:ON checkstyle:LineLength
178     String url = azureSpeechServicesEndpoint  + "/speechtotext/v3.1/transcriptions";
179     MicrosoftAzureSpeechTranscription requestTranscription = new MicrosoftAzureSpeechTranscription();
180     // required properties
181     requestTranscription.displayName = displayName;
182     requestTranscription.locale = locale;
183     requestTranscription.contentUrls = contentUrls;
184     // optional properties
185     requestTranscription.properties = new HashMap<>();
186     if (properties != null && !properties.isEmpty()) {
187       requestTranscription.properties.putAll(properties);
188     }
189     // TODO produces InvalidUri: Could not access the results container
190 //    if (StringUtils.isNotEmpty(destinationContainerUrl)) {
191 //      requestTranscription.properties.put("destinationContainerUrl", destinationContainerUrl);
192 //    }
193     if (candidateLocales != null && !candidateLocales.isEmpty()) {
194       Map<String, Object> languageIdentification = new HashMap<>();
195       languageIdentification.put("candidateLocales", candidateLocales);
196       requestTranscription.properties.put("languageIdentification",languageIdentification);
197     }
198     if (StringUtils.isNotEmpty(timeToLive)) {
199       requestTranscription.properties.put("timeToLive", timeToLive);
200     } else {
201       requestTranscription.properties.put("timeToLive", DEFAULT_TRANSCRIPTION_TIME_TO_LIVE);
202     }
203     Gson gson = new GsonBuilder().create();
204     try (CloseableHttpClient httpClient = HttpUtils.makeHttpClient()) {
205       HttpPost httpPost = new HttpPost(url);
206       httpPost.addHeader("Ocp-Apim-Subscription-Key", azureCognitiveServicesSubscriptionKey);
207       httpPost.setEntity(new StringEntity(gson.toJson(requestTranscription), ContentType.APPLICATION_JSON));
208       try (CloseableHttpResponse response = httpClient.execute(httpPost)) {
209         int code = response.getStatusLine().getStatusCode();
210         String responseString = "";
211         if (response.getEntity() != null) {
212           responseString = EntityUtils.toString(response.getEntity());
213         }
214         switch (code) {
215           case HttpStatus.SC_OK: // 200
216           case HttpStatus.SC_CREATED: // 201
217             break;
218           case HttpStatus.SC_FORBIDDEN: // 403
219             throw new MicrosoftAzureNotAllowedException(String.format(
220                 "Not allowed to create transcription '%s'. Microsoft Azure Speech Services response: %s",
221                 displayName, responseString));
222           default:
223             throw new MicrosoftAzureSpeechClientException(String.format(
224                 "Creating transcription '%s' failed with HTTP response code %d. "
225                     + "Microsoft Azure Speech Services response: %s", displayName, code, responseString));
226         }
227         return gson.fromJson(responseString, MicrosoftAzureSpeechTranscription.class);
228       }
229     }
230   }
231 
232   public MicrosoftAzureSpeechTranscriptionFiles getTranscriptionFilesById(String transcriptionId)
233           throws IOException, MicrosoftAzureNotAllowedException, MicrosoftAzureSpeechClientException,
234           MicrosoftAzureNotFoundException {
235     String transcriptionUrl = String.format("%s/speechtotext/v3.1/transcriptions/%s/files", azureSpeechServicesEndpoint,
236         StringUtils.trimToEmpty(transcriptionId));
237     return getTranscriptionFiles(transcriptionUrl);
238   }
239 
240   public MicrosoftAzureSpeechTranscriptionFiles getTranscriptionFiles(String transcriptionFilesUrl)
241           throws IOException, MicrosoftAzureNotAllowedException, MicrosoftAzureSpeechClientException,
242           MicrosoftAzureNotFoundException {
243     if (StringUtils.isBlank(transcriptionFilesUrl)) {
244       throw new IllegalArgumentException("Transcription files URL not set.");
245     }
246     // Documentation:
247     // https://eastus.dev.cognitive.microsoft.com/docs/services/speech-to-text-api-v3-1/operations/Transcriptions_Get
248     String url = StringUtils.trimToEmpty(transcriptionFilesUrl);
249     try (CloseableHttpClient httpClient = HttpUtils.makeHttpClient()) {
250       HttpGet httpGet = new HttpGet(url);
251       httpGet.addHeader("Ocp-Apim-Subscription-Key", azureCognitiveServicesSubscriptionKey);
252       try (CloseableHttpResponse response = httpClient.execute(httpGet)) {
253         int code = response.getStatusLine().getStatusCode();
254         String responseString = "";
255         if (response.getEntity() != null) {
256           responseString = EntityUtils.toString(response.getEntity());
257         }
258         switch (code) {
259           case HttpStatus.SC_OK: // 200
260             break;
261           case HttpStatus.SC_FORBIDDEN: // 403
262             throw new MicrosoftAzureNotAllowedException(String.format("Not allowed to get transcription files '%s'. "
263                     + "Microsoft Azure Speech Services response: %s", transcriptionFilesUrl, responseString));
264           case HttpStatus.SC_NOT_FOUND: // 404
265             throw new MicrosoftAzureNotFoundException(String.format("Transcription files '%s' not found. "
266                 + "Microsoft Azure Speech Services response: %s", transcriptionFilesUrl, responseString));
267           default:
268             throw new MicrosoftAzureSpeechClientException(String.format(
269                 "Getting transcription files '%s' failed with HTTP response code %d. "
270                     + "Microsoft Azure Speech Services response: %s", transcriptionFilesUrl, code, responseString));
271         }
272         Gson gson = new GsonBuilder().create();
273         return gson.fromJson(responseString, MicrosoftAzureSpeechTranscriptionFiles.class);
274       }
275     }
276   }
277 
278   public static MicrosoftAzureSpeechTranscriptionJson getTranscriptionJson(
279       MicrosoftAzureSpeechTranscriptionFile transcriptionFile)
280           throws IOException, MicrosoftAzureNotAllowedException, MicrosoftAzureSpeechClientException,
281           MicrosoftAzureNotFoundException {
282     String transcriptionUrl = transcriptionFile.links.contentUrl;
283     try (CloseableHttpClient httpClient = HttpUtils.makeHttpClient()) {
284       HttpGet httpGet = new HttpGet(transcriptionUrl);
285       try (CloseableHttpResponse response = httpClient.execute(httpGet)) {
286         int code = response.getStatusLine().getStatusCode();
287         String responseString = "";
288         if (response.getEntity() != null) {
289           responseString = EntityUtils.toString(response.getEntity());
290         }
291         switch (code) {
292           case HttpStatus.SC_OK: // 200
293             break;
294           case HttpStatus.SC_FORBIDDEN: // 403
295             throw new MicrosoftAzureNotAllowedException(String.format("Not allowed to get transcription file '%s'. "
296                     + "Microsoft Azure Speech Services response: %s",
297                 transcriptionUrl, responseString));
298           case HttpStatus.SC_NOT_FOUND: // 404
299             throw new MicrosoftAzureNotFoundException(String.format("Transcription file '%s' not found. "
300                     + "Microsoft Azure Speech Services response: %s", transcriptionUrl, responseString));
301           default:
302             throw new MicrosoftAzureSpeechClientException(String.format(
303                 "Getting transcription file '%s' failed with HTTP response code %d. "
304                     + "Microsoft Azure Speech Services response: %s", transcriptionUrl, code, responseString));
305         }
306         Gson gson = new GsonBuilder().create();
307         return gson.fromJson(responseString, MicrosoftAzureSpeechTranscriptionJson.class);
308       }
309     }
310   }
311 
312   public static URI writeTranscriptionFile(MicrosoftAzureSpeechTranscriptionJson transcriptionJson,
313       Workspace workspace, String format, float minConfidence, int maxCueLength) throws IOException {
314     boolean formatIsWebVtt;
315     switch (StringUtils.lowerCase(format)) {
316       case "vtt":
317         formatIsWebVtt = true;
318         break;
319       case "srt":
320         formatIsWebVtt = false;
321         break;
322       default:
323         throw new IllegalArgumentException("format should be srt or vtt");
324     }
325     String content;
326     if (formatIsWebVtt) {
327       content = transcriptionJson.toWebVtt(minConfidence, maxCueLength);
328     } else {
329       content = transcriptionJson.toSrt(minConfidence, maxCueLength);
330     }
331     String fileName = UUID.randomUUID().toString();
332     if (formatIsWebVtt) {
333       fileName += ".vtt";
334     } else {
335       fileName += ".srt";
336     }
337     try (InputStream is = new ByteArrayInputStream(content.getBytes(StandardCharsets.UTF_8))) {
338       return workspace.putInCollection(WORKSPACE_COLLECTION, fileName, is);
339     }
340   }
341 
342   public void deleteTranscription(String transcriptionId)
343           throws IOException, MicrosoftAzureNotAllowedException, MicrosoftAzureSpeechClientException {
344     String transcriptionDeleteUrl = azureSpeechServicesEndpoint + "/speechtotext/v3.1/transcriptions/"
345         + StringUtils.trimToEmpty(transcriptionId);
346     try (CloseableHttpClient httpClient = HttpUtils.makeHttpClient()) {
347       HttpDelete httpDelete = new HttpDelete(transcriptionDeleteUrl);
348       httpDelete.addHeader("Ocp-Apim-Subscription-Key", azureCognitiveServicesSubscriptionKey);
349       try (CloseableHttpResponse response = httpClient.execute(httpDelete)) {
350         int code = response.getStatusLine().getStatusCode();
351         String responseString = "";
352         if (response.getEntity() != null) {
353           responseString = EntityUtils.toString(response.getEntity());
354         }
355         switch (code) {
356           case HttpStatus.SC_OK: // 200
357           case HttpStatus.SC_NO_CONTENT: // 204
358           case HttpStatus.SC_NOT_FOUND: // 404
359             break;
360           case HttpStatus.SC_FORBIDDEN: // 403
361             throw new MicrosoftAzureNotAllowedException(String.format("Not allowed to delete transcription '%s'. "
362                     + "Microsoft Azure Speech Services response: %s",
363                 transcriptionId, responseString));
364           default:
365             throw new MicrosoftAzureSpeechClientException(String.format(
366                 "Deleting transcription '%s' failed with HTTP response code %d. "
367                     + "Microsoft Azure Speech Services response: %s", transcriptionId, code, responseString));
368         }
369       }
370     }
371   }
372 }