View Javadoc
1   /*
2    * Licensed to The Apereo Foundation under one or more contributor license
3    * agreements. See the NOTICE file distributed with this work for additional
4    * information regarding copyright ownership.
5    *
6    *
7    * The Apereo Foundation licenses this file to you under the Educational
8    * Community License, Version 2.0 (the "License"); you may not use this file
9    * except in compliance with the License. You may obtain a copy of the License
10   * at:
11   *
12   *   http://opensource.org/licenses/ecl2.txt
13   *
14   * Unless required by applicable law or agreed to in writing, software
15   * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
16   * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
17   * License for the specific language governing permissions and limitations under
18   * the License.
19   *
20   */
21  
22  package org.opencastproject.transcription.microsoft.azure.model;
23  
import org.apache.commons.lang3.StringUtils;

import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.Locale;
import java.util.Optional;
import java.util.TimeZone;
32  
33  public class MicrosoftAzureSpeechTranscriptionJsonRecognizedPhrases {
34  
35    // CHECKSTYLE:OFF checkstyle:LineLength
36  
37    // Documentation:
38    // https://eastus.dev.cognitive.microsoft.com/docs/services/speech-to-text-api-v3-1/operations/Transcriptions_ListFiles
39  
40    // CHECKSTYLE:ON checkstyle:LineLength
41    // CHECKSTYLE:OFF checkstyle:VisibilityModifier
42  
43    public String recognitionStatus;
44    public int channel;
45    public String offset;
46    public String duration;
47    public long offsetInTicks;
48    public long durationInTicks;
49    public List<MicrosoftAzureSpeechTranscriptionJsonRecognizedPhrase> nBest;
50    public String locale;
51  
52    // CHECKSTYLE:ON checkstyle:VisibilityModifier
53  
54    public MicrosoftAzureSpeechTranscriptionJsonRecognizedPhrases() { }
55  
56    public String[] toSrt(float minConfidence, int maxCueLength) {
57      String text = getBestRecognizedText(minConfidence);
58      String[] cueText = splitCueText(text, maxCueLength);
59      return timestampCues(false, cueText);
60    }
61  
62    public String[] toWebVtt(float minConfidence, int maxCueLength) {
63      String text = getBestRecognizedText(minConfidence);
64      String[] cueText = splitCueText(text, maxCueLength);
65      return timestampCues(true, cueText);
66    }
67  
68    String[] timestampCues(boolean formatWebVtt, String[] cueText) {
69      long ticksPerMillisecond = 10000;
70      String format;
71      if (formatWebVtt) {
72        format = "HH:mm:ss.SSS";
73      } else {
74        // SRT format requires ',' as decimal separator rather than '.'.
75        format = "HH:mm:ss,SSS";
76      }
77      SimpleDateFormat formatter = new SimpleDateFormat(format);
78      // If we don't do this, the time is adjusted for our local time zone, which we don't want.
79      formatter.setTimeZone(TimeZone.getTimeZone("GMT"));
80      int cueTextLength = 0;
81      int[] cuesTextLenth = new int[cueText.length];
82      for (int i = 0; i < cueText.length; i++) {
83        cuesTextLenth[i] = StringUtils.length(cueText[i]);
84        cueTextLength += cuesTextLenth[i];
85      }
86      String[] result = new String[cueText.length];
87      long cueOffsetInTicks = 0;
88      for (int i = 0; i < cueText.length; i++) {
89        long cueLengthInTicks = (long)Math.ceil((double)durationInTicks * (double)cuesTextLenth[i]
90            / (double)cueTextLength);
91  
92        Date startTime = new Date((offsetInTicks + cueOffsetInTicks) / ticksPerMillisecond);
93        Date endTime = new Date((offsetInTicks  + cueOffsetInTicks + cueLengthInTicks) / ticksPerMillisecond);
94        cueOffsetInTicks += cueLengthInTicks;
95        result[i] = String.format("%s --> %s\n%s\n", formatter.format(startTime), formatter.format(endTime), cueText[i]);
96      }
97      return result;
98    }
99  
100   public String getBestRecognizedText(float minConfidence) {
101     if (nBest == null) {
102       return null;
103     }
104     Optional<MicrosoftAzureSpeechTranscriptionJsonRecognizedPhrase> bestPhrase;
105     if (minConfidence >= 0 && minConfidence < 1) {
106       bestPhrase = nBest.stream()
107           .filter(phrase -> phrase.confidence >= minConfidence)
108           .sorted((t1, t2) -> Float.compare(t2.confidence, t1.confidence))  // descendant order
109           .findFirst();
110     } else if (minConfidence >= 1) {
111       bestPhrase = nBest.stream().findFirst();
112     } else {
113       bestPhrase = nBest.stream()
114           .sorted((t1, t2) -> Float.compare(t2.confidence, t1.confidence))  // descendant order
115           .findFirst();
116     }
117     return bestPhrase.isPresent() ? bestPhrase.get().display : "";
118   }
119 
120   public static String[] splitCueText(String text, int maxCueLength) {
121     int textLength = StringUtils.length(text);
122     if (textLength == 0) {
123       return new String[0];
124     } else if (textLength <= maxCueLength) {
125       return new String[] { text };
126     }
127     List<String> result = new ArrayList<>();
128     int start = 0;
129     do {
130       if (textLength - start <= maxCueLength) {
131         result.add(StringUtils.trimToEmpty(StringUtils.substring(text, start, textLength)));
132         break;
133       }
134       int end = StringUtils.lastIndexOf(text, " ", start + maxCueLength);
135       if (start >= end) {
136         end = Math.min(textLength, start + maxCueLength);
137       }
138       result.add(StringUtils.trimToEmpty(StringUtils.substring(text, start, end)));
139       start = end;
140     } while (start < textLength);
141     return result.toArray(new String[0]);
142   }
143 }