/*
 * Licensed to The Apereo Foundation under one or more contributor license
 * agreements. See the NOTICE file distributed with this work for additional
 * information regarding copyright ownership.
 *
 *
 * The Apereo Foundation licenses this file to you under the Educational
 * Community License, Version 2.0 (the "License"); you may not use this file
 * except in compliance with the License. You may obtain a copy of the License
 * at:
 *
 *   http://opensource.org/licenses/ecl2.txt
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
 * License for the specific language governing permissions and limitations under
 * the License.
 *
 */
22  
import org.opencastproject.inspection.api.MediaInspectionService;
import org.opencastproject.job.api.Job;
import org.opencastproject.job.api.JobContext;
import org.opencastproject.mediapackage.MediaPackage;
import org.opencastproject.mediapackage.MediaPackageElement;
import org.opencastproject.mediapackage.MediaPackageElementFlavor;
import org.opencastproject.mediapackage.MediaPackageElementParser;
import org.opencastproject.mediapackage.MediaPackageElements;
import org.opencastproject.mediapackage.Track;
import org.opencastproject.mediapackage.attachment.AttachmentImpl;
import org.opencastproject.mediapackage.selector.TrackSelector;
import org.opencastproject.mediapackage.track.TrackImpl;
import org.opencastproject.metadata.api.MediaPackageMetadata;
import org.opencastproject.metadata.dublincore.DublinCoreCatalogService;
import org.opencastproject.serviceregistry.api.ServiceRegistry;
import org.opencastproject.speechtotext.api.SpeechToTextService;
import org.opencastproject.speechtotext.api.SpeechToTextServiceException;
import org.opencastproject.workflow.api.AbstractWorkflowOperationHandler;
import org.opencastproject.workflow.api.ConfiguredTagsAndFlavors;
import org.opencastproject.workflow.api.WorkflowInstance;
import org.opencastproject.workflow.api.WorkflowOperationException;
import org.opencastproject.workflow.api.WorkflowOperationHandler;
import org.opencastproject.workflow.api.WorkflowOperationInstance;
import org.opencastproject.workflow.api.WorkflowOperationResult;
import org.opencastproject.workspace.api.Workspace;

import org.apache.commons.io.FilenameUtils;
import org.apache.commons.lang3.BooleanUtils;
import org.apache.commons.lang3.StringUtils;
import org.osgi.service.component.ComponentContext;
import org.osgi.service.component.annotations.Activate;
import org.osgi.service.component.annotations.Component;
import org.osgi.service.component.annotations.Reference;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.io.InputStream;
import java.net.URI;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Locale;
import java.util.Objects;
import java.util.UUID;
import java.util.stream.Collectors;
68  
69  /**
70   * Workflow operation for the speech-to-text service.
71   */
72  @Component(
73      immediate = true,
74      service = WorkflowOperationHandler.class,
75      property = {
76          "service.description=Speech-to-Text Workflow Operation Handler",
77          "workflow.operation=speechtotext"
78      }
79  )
80  public class
81      SpeechToTextWorkflowOperationHandler extends AbstractWorkflowOperationHandler {
82  
83    private static final Logger logger = LoggerFactory.getLogger(SpeechToTextWorkflowOperationHandler.class);
84  
85    /** Speech to Text language configuration property name. */
86    private static final String LANGUAGE_CODE = "language-code";
87  
88    /** Speech to Text language fallback configuration property name. */
89    private static final String LANGUAGE_FALLBACK = "language-fallback";
90  
91    /** Property name for configuring the place where the subtitles shall be appended. */
92    private static final String TARGET_ELEMENT = "target-element";
93  
94    /** Language placeholder */
95    private static final String PLACEHOLDER_LANG = "#{lang}";
96  
97    /** Translation mode */
98    private static final String TRANSLATE_MODE = "translate";
99  
100   /** Configuration: Track Selection Strategy (Control which tracks shall be transcribed) */
101   private static final String TRACK_SELECTION_STRATEGY = "track-selection-strategy";
102 
103   /** Configuration: Limit to One (If true, max 1 subtitle file will be generated) */
104   private static final String LIMIT_TO_ONE = "limit-to-one";
105 
106   /** Configuration: Synchronous or asynchronous mode */
107   private static final String ASYNCHRONOUS = "async";
108 
109   /** Workflow configuration name to store jobs in */
110   private static final String JOBS_WORKFLOW_CONFIGURATION = "speech-to-text-jobs";
111 
112   private enum TrackSelectionStrategy {
113     PRESENTER_OR_NOTHING,
114     PRESENTATION_OR_NOTHING,
115     TRY_PRESENTER_FIRST,
116     TRY_PRESENTATION_FIRST,
117     EVERYTHING;
118 
119     private static TrackSelectionStrategy fromString(String value) {
120       for (TrackSelectionStrategy strategy : values()) {
121         if (strategy.name().equalsIgnoreCase(value)) {
122           return strategy;
123         }
124       }
125       throw new IllegalArgumentException(
126           "No TrackSelectionStrategy enum constant " + TrackSelectionStrategy.class.getCanonicalName() + "." + value);
127     }
128   }
129 
130   private enum AppendSubtitleAs {
131     attachment, track
132   }
133 
134   /** The speech-to-text service. */
135   private SpeechToTextService speechToTextService = null;
136 
137   /** The workspace service. */
138   private Workspace workspace;
139 
140   /** The inspection service. */
141   private MediaInspectionService mediaInspectionService;
142 
143   /** The dublin core catalog service. */
144   private DublinCoreCatalogService dublinCoreCatalogService;
145 
146   @Override
147   @Activate
148   public void activate(ComponentContext cc) {
149     super.activate(cc);
150     logger.info("Registering speech-to-text workflow operation handler");
151   }
152 
153   /**
154    * {@inheritDoc}
155    *
156    * @see
157    * org.opencastproject.workflow.api.WorkflowOperationHandler#start(org.opencastproject.workflow.api.WorkflowInstance,
158    * org.opencastproject.job.api.JobContext)
159    */
160   @Override
161   public WorkflowOperationResult start(WorkflowInstance workflowInstance, JobContext context)
162           throws WorkflowOperationException {
163 
164     MediaPackage mediaPackage = workflowInstance.getMediaPackage();
165     logger.info("Start speech-to-text workflow operation for media package {}", mediaPackage);
166 
167     // Defaults to `false` if `null`
168     var async = BooleanUtils.toBoolean(workflowInstance.getCurrentOperation().getConfiguration(ASYNCHRONOUS));
169 
170     ConfiguredTagsAndFlavors tagsAndFlavors = getTagsAndFlavors(workflowInstance,
171             Configuration.many, Configuration.one,
172             Configuration.many, Configuration.one);
173     MediaPackageElementFlavor sourceFlavor = tagsAndFlavors.getSingleSrcFlavor();
174     List<String> srcTags = tagsAndFlavors.getSrcTags();
175 
176     TrackSelector trackSelector = new TrackSelector();
177     trackSelector.addFlavor(sourceFlavor);
178     for (String tag : srcTags) {
179       trackSelector.addTag(tag);
180     }
181     Collection<Track> tracks = trackSelector.select(mediaPackage, true);
182 
183     if (tracks.isEmpty()) {
184       throw new WorkflowOperationException(
185               String.format("No tracks with source flavor '%s' found for transcription", sourceFlavor));
186     }
187 
188     logger.info("Found {} track(s) with source flavor '{}'.", tracks.size(), sourceFlavor);
189 
190     // Get the information in which language the audio track should be
191     String languageCode = getMediaPackageLanguage(mediaPackage, workflowInstance);
192 
193     // How to save the subtitle file? (as attachment, as track...)
194     AppendSubtitleAs appendSubtitleAs = howToAppendTheSubtitles(workflowInstance);
195 
196     // Translate to english
197     Boolean translate = getTranslationMode(workflowInstance);
198 
199     // Create sublist that includes only the tracks that has audio
200     List<Track> tracksWithAudio = tracks.stream().filter(Track::hasAudio).collect(Collectors.toList());
201 
202     // Get the track selection strategy from the workflow configuration
203     // If nothing is set, all tracks (with audio) will be transcribed
204     TrackSelectionStrategy trackSelectionStrategy = getTrackSelectionStrategy(mediaPackage, workflowInstance);
205 
206     // Use the selection strategy from the workflow config to get the tracks we want to transcribe
207     List<Track> tracksToTranscribe = filterTracksByStrategy(tracksWithAudio, trackSelectionStrategy);
208     if (tracksToTranscribe.isEmpty()) {
209       logger.info("No subtitles were created for media package {}. "
210           + "Workflow Configuration 'track-selection-strategy' is set to {}", mediaPackage, trackSelectionStrategy);
211       return createResult(mediaPackage, WorkflowOperationResult.Action.SKIP);
212     }
213 
214     // Load the 'limit-to-one' configuration from the workflow operation.
215     // This configuration sets the limit of generated subtitle files to one
216     boolean limitToOne = BooleanUtils.toBoolean(workflowInstance.getCurrentOperation().getConfiguration(LIMIT_TO_ONE));
217     if (limitToOne) {
218       tracksToTranscribe = List.of(tracksToTranscribe.get(0));
219     }
220 
221     if (async) {
222       createSubtitleAsync(workflowInstance, tracksToTranscribe, languageCode, translate);
223     } else {
224       for (Track track : tracksToTranscribe) {
225         createSubtitle(track, languageCode, mediaPackage, tagsAndFlavors, appendSubtitleAs, translate);
226       }
227     }
228 
229     logger.info("Speech-To-Text workflow operation for media package {} completed", mediaPackage);
230     return createResult(mediaPackage, WorkflowOperationResult.Action.CONTINUE);
231   }
232 
233   /**
234    * Filters the tracks by the strategy configured in the workflow operation
235    * @param tracksWithAudio        List of the tracks that includes audio
236    * @param trackSelectionStrategy The strategy configured in the workflow operation
237    * @return The filtered tracks
238    */
239   private List<Track> filterTracksByStrategy(List<Track> tracksWithAudio,
240       TrackSelectionStrategy trackSelectionStrategy) {
241 
242     List<Track> tracksToTranscribe = new ArrayList<>();
243     if (!tracksWithAudio.isEmpty()) {
244 
245       String presenterTypeConstant = MediaPackageElements.PRESENTER_SOURCE.getType();
246       String presentationTypeConstant = MediaPackageElements.PRESENTATION_SOURCE.getType();
247 
248       // Creates a sublist only including the presenter tracks
249       List<Track> presenterTracksWithAudio = tracksWithAudio.stream()
250           .filter(track -> Objects.equals(track.getFlavor().getType(), presenterTypeConstant))
251           .collect(Collectors.toList());
252 
253       // Creates a sublist only including the presentation tracks
254       List<Track> presentationTracksWithAudio = tracksWithAudio.stream()
255           .filter(track -> Objects.equals(track.getFlavor().getType(), presentationTypeConstant))
256           .collect(Collectors.toList());
257 
258       if (TrackSelectionStrategy.PRESENTER_OR_NOTHING.equals(trackSelectionStrategy)) {
259         tracksToTranscribe.addAll(presenterTracksWithAudio);
260       }
261 
262       if (TrackSelectionStrategy.PRESENTATION_OR_NOTHING.equals(trackSelectionStrategy)) {
263         tracksToTranscribe.addAll(presentationTracksWithAudio);
264       }
265 
266       if (TrackSelectionStrategy.TRY_PRESENTER_FIRST.equals(trackSelectionStrategy)) {
267         tracksToTranscribe.addAll(presenterTracksWithAudio);
268         if (tracksToTranscribe.isEmpty()) {
269           tracksToTranscribe.addAll(tracksWithAudio);
270         }
271       }
272 
273       if (TrackSelectionStrategy.TRY_PRESENTATION_FIRST.equals(trackSelectionStrategy)) {
274         tracksToTranscribe.addAll((presentationTracksWithAudio));
275         if (tracksToTranscribe.isEmpty()) {
276           tracksToTranscribe.addAll(tracksWithAudio);
277         }
278       }
279 
280       if (TrackSelectionStrategy.EVERYTHING.equals(trackSelectionStrategy)) {
281         tracksToTranscribe.addAll(tracksWithAudio);
282       }
283     }
284     return tracksToTranscribe;
285   }
286 
287   /**
288    * Creates the subtitle file for a track and appends it to the media package.
289    *
290    * @param track The track from which the subtitles are created.
291    * @param languageCode The language of the track.
292    * @param parentMediaPackage The media package where the track is located.
293    * @param tagsAndFlavors Tags and flavors instance (to get target flavor information)
294    * @param appendSubtitleAs Tells how the subtitles file has to be appended.
295    * @param translate Enable translation to english.
296    * @throws WorkflowOperationException Get thrown if an error occurs.
297    */
298   private void createSubtitle(Track track, String languageCode, MediaPackage parentMediaPackage,
299           ConfiguredTagsAndFlavors tagsAndFlavors, AppendSubtitleAs appendSubtitleAs, Boolean translate)
300           throws WorkflowOperationException {
301 
302     // Start the transcription job, create subtitles file
303     URI trackURI = track.getURI();
304 
305     Job job;
306     logger.info("Generating subtitle for '{}'...", trackURI);
307     try {
308       job = speechToTextService.transcribe(trackURI, languageCode, translate);
309     } catch (SpeechToTextServiceException e) {
310       throw new WorkflowOperationException(
311               String.format("Generating subtitles for '%s' in media package '%s' failed",
312                       trackURI, parentMediaPackage), e);
313     }
314 
315     if (!waitForStatus(job).isSuccess()) {
316       throw new WorkflowOperationException(
317               String.format("Speech-to-Text job for media package '%s' failed", parentMediaPackage));
318     }
319 
320     // subtitles file is generated now, put it into the media package
321     try {
322       String[] jobOutput = job.getPayload().split(",");
323       URI output = new URI(jobOutput[0]);
324       String outputLanguage = jobOutput[1];
325       String engineType = jobOutput[2];
326 
327       String mediaPackageIdentifier = UUID.randomUUID().toString();
328 
329       MediaPackageElement subtitleMediaPackageElement;
330       switch (appendSubtitleAs) {
331         case attachment:
332           subtitleMediaPackageElement = new AttachmentImpl();
333           break;
334         case track:
335         default:
336           subtitleMediaPackageElement = new TrackImpl();
337       }
338 
339       subtitleMediaPackageElement.setIdentifier(mediaPackageIdentifier);
340       try (InputStream in = workspace.read(output)) {
341         URI uri = workspace.put(parentMediaPackage.getIdentifier().toString(), mediaPackageIdentifier,
342                 FilenameUtils.getName(output.getPath()), in);
343         subtitleMediaPackageElement.setURI(uri);
344       }
345       MediaPackageElementFlavor targetFlavor = tagsAndFlavors.getSingleTargetFlavor().applyTo(track.getFlavor());
346       subtitleMediaPackageElement.setFlavor(targetFlavor);
347 
348       List<String> targetTags = tagsAndFlavors.getTargetTags();
349       targetTags.add("lang:" + outputLanguage);
350       targetTags.add("generator-type:auto");
351       targetTags.add("generator:" + engineType.toLowerCase());
352 
353       // this is used to set some values automatically, like the correct mimetype
354       Job inspection = mediaInspectionService.enrich(subtitleMediaPackageElement, true);
355       if (!waitForStatus(inspection).isSuccess()) {
356         throw new SpeechToTextServiceException(String.format(
357                 "Transcription for '%s' failed at enriching process", trackURI));
358       }
359 
360       subtitleMediaPackageElement = MediaPackageElementParser.getFromXml(inspection.getPayload());
361 
362       for (String tag : targetTags) {
363         subtitleMediaPackageElement.addTag(tag);
364       }
365 
366       parentMediaPackage.add(subtitleMediaPackageElement);
367 
368       workspace.delete(output);
369     } catch (Exception e) {
370       throw new WorkflowOperationException("Error handling text-to-speech service output", e);
371     }
372 
373     try {
374       workspace.cleanup(parentMediaPackage.getIdentifier());
375     } catch (IOException e) {
376       throw new WorkflowOperationException(e);
377     }
378   }
379 
380   /**
381    * Start the transcription, but don't actually wait for the process to finish. Instead, let the jobs run
382    * asynchronously and just store the launched jobs in the workflow configuration.
383    * @param workflow Workflow instance to store the jobs in
384    * @param tracks Tracks to run the transcription on
385    * @param languageCode Language to use
386    * @param translate If the transcription should be translated
387    * @throws WorkflowOperationException
388    */
389   private void createSubtitleAsync(WorkflowInstance workflow, List<Track> tracks, String languageCode,
390       Boolean translate) throws WorkflowOperationException {
391 
392     logger.info("Asynchronously generating subtitles");
393     StringBuilder jobs = new StringBuilder();
394     try {
395       for (var track: tracks) {
396         var job = speechToTextService.transcribe(track.getURI(), languageCode, translate);
397         jobs.append(",").append(job.getId());
398       }
399     } catch (SpeechToTextServiceException e) {
400       throw new WorkflowOperationException(
401           String.format("Starting subtitle job in media package '%s' failed",
402               workflow.getMediaPackage().getIdentifier()), e);
403     }
404 
405     var config = Objects.toString(workflow.getConfiguration(JOBS_WORKFLOW_CONFIGURATION), "") + jobs;
406     workflow.setConfiguration(JOBS_WORKFLOW_CONFIGURATION, config.replaceFirst("^,", ""));
407   }
408 
409   /**
410    * Get the config for the "track selection strategy". It's used to determine which tracks shall be transcribed.
411    * If there are 2 Videos and both has audio for example, what audio shall be transcribed?
412    *
413    * @param workflowInstance Contains the workflow configuration.
414    * @return Which strategy to use
415    * @throws WorkflowOperationException Get thrown if an error occurs.
416    */
417   private TrackSelectionStrategy getTrackSelectionStrategy(MediaPackage mediaPackage, WorkflowInstance workflowInstance)
418           throws WorkflowOperationException {
419 
420     WorkflowOperationInstance operation = workflowInstance.getCurrentOperation();
421     String strategyCfg = StringUtils.trimToEmpty(operation.getConfiguration(TRACK_SELECTION_STRATEGY)).toLowerCase();
422 
423     if (strategyCfg.isEmpty()) {
424       return TrackSelectionStrategy.EVERYTHING; // "transcribe everything" is the default/fallback
425     }
426     try {
427       return TrackSelectionStrategy.fromString(strategyCfg);
428     } catch (IllegalArgumentException e) {
429       throw new WorkflowOperationException(String.format(
430           "Speech-to-Text job for media package '%s' failed, because of wrong workflow configuration. "
431               + "track-selection-strategy of type '%s' does not exist.", mediaPackage, strategyCfg));
432     }
433   }
434 
435 
436   /**
437    * Get the information how to append the subtitles file to the media package.
438    *
439    * @param workflowInstance Contains the workflow configuration.
440    * @return How to append the subtitles file to the media package.
441    * @throws WorkflowOperationException Get thrown if an error occurs.
442    */
443   private AppendSubtitleAs howToAppendTheSubtitles(WorkflowInstance workflowInstance)
444           throws WorkflowOperationException {
445     WorkflowOperationInstance operation = workflowInstance.getCurrentOperation();
446     String targetElement = StringUtils.trimToEmpty(operation.getConfiguration(TARGET_ELEMENT)).toLowerCase();
447     if (targetElement.isEmpty()) {
448       return AppendSubtitleAs.track;
449     }
450     try {
451       return AppendSubtitleAs.valueOf(targetElement);
452     } catch (IllegalArgumentException e) {
453       throw new WorkflowOperationException(String.format(
454           "Speech-to-Text job for media package '%s' failed, because of wrong workflow configuration. "
455               + "target-element of type '%s' does not exist.", workflowInstance.getMediaPackage(), targetElement));
456     }
457   }
458 
459   /**
460    * Get if the subtitle needs to be translated into english
461    *
462    * @param workflowInstance Contains the workflow configuration
463    * @return Boolean to enable english translation
464    */
465   private Boolean getTranslationMode(WorkflowInstance workflowInstance) {
466     WorkflowOperationInstance operation = workflowInstance.getCurrentOperation();
467     return BooleanUtils.toBoolean(StringUtils.trimToEmpty(operation.getConfiguration(TRANSLATE_MODE)));
468   }
469 
470   /**
471    * Searches some places to get the right language of the media package / track.
472    *
473    * @param mediaPackage The media package from which the subtitles are generated.
474    * @param workflowInstance Contains the workflow configuration.
475    * @return The language of the media package / track.
476    */
477   private String getMediaPackageLanguage(MediaPackage mediaPackage, WorkflowInstance workflowInstance) {
478 
479     // First look if there is a fixed language configured in the operation
480     WorkflowOperationInstance operation = workflowInstance.getCurrentOperation();
481     String language = StringUtils.trimToEmpty(operation.getConfiguration(LANGUAGE_CODE));
482 
483     if (language.isEmpty()) {
484       // If not we look in the dublin core metadata if the language is available
485       MediaPackageMetadata dublinCoreMetadata = dublinCoreCatalogService.getMetadata(mediaPackage);
486       language = StringUtils.trimToEmpty(dublinCoreMetadata.getLanguage());
487     }
488 
489     if (language.isEmpty()) {
490       // If there is still no language, we look in the media package itself
491       language = StringUtils.trimToEmpty(mediaPackage.getLanguage());
492     }
493 
494     if (language.isEmpty()) {
495       // Use the fallback language if the operation configuration defines one
496       language = Objects.toString(operation.getConfiguration(LANGUAGE_FALLBACK), "");
497     }
498 
499     return language;
500   }
501 
502 
503   //================================================================================
504   // OSGi setter
505   //================================================================================
506 
507   @Reference
508   public void setSpeechToTextService(SpeechToTextService speechToTextService) {
509     this.speechToTextService = speechToTextService;
510   }
511 
512   @Reference
513   public void setMediaInspectionService(MediaInspectionService mediaInspectionService) {
514     this.mediaInspectionService = mediaInspectionService;
515   }
516 
517   @Reference
518   public void setWorkspace(Workspace workspace) {
519     this.workspace = workspace;
520   }
521 
522   @Reference
523   public void setDublinCoreCatalogService(DublinCoreCatalogService dublinCoreCatalogService) {
524     this.dublinCoreCatalogService = dublinCoreCatalogService;
525   }
526 
527   @Reference
528   public void setServiceRegistry(ServiceRegistry serviceRegistry) {
529     this.serviceRegistry = serviceRegistry;
530   }
531 }