View Javadoc
1   /*
2    * Licensed to The Apereo Foundation under one or more contributor license
3    * agreements. See the NOTICE file distributed with this work for additional
4    * information regarding copyright ownership.
5    *
6    *
7    * The Apereo Foundation licenses this file to you under the Educational
8    * Community License, Version 2.0 (the "License"); you may not use this file
9    * except in compliance with the License. You may obtain a copy of the License
10   * at:
11   *
12   *   http://opensource.org/licenses/ecl2.txt
13   *
14   * Unless required by applicable law or agreed to in writing, software
15   * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
16   * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
17   * License for the specific language governing permissions and limitations under
18   * the License.
19   *
20   */
21  package org.opencastproject.workflow.handler.speechtotext;
22  
23  import org.opencastproject.inspection.api.MediaInspectionService;
24  import org.opencastproject.job.api.Job;
25  import org.opencastproject.job.api.JobContext;
26  import org.opencastproject.mediapackage.MediaPackage;
27  import org.opencastproject.mediapackage.MediaPackageElement;
28  import org.opencastproject.mediapackage.MediaPackageElementFlavor;
29  import org.opencastproject.mediapackage.MediaPackageElementParser;
30  import org.opencastproject.mediapackage.MediaPackageElements;
31  import org.opencastproject.mediapackage.Track;
32  import org.opencastproject.mediapackage.attachment.AttachmentImpl;
33  import org.opencastproject.mediapackage.selector.TrackSelector;
34  import org.opencastproject.mediapackage.track.TrackImpl;
35  import org.opencastproject.metadata.api.MediaPackageMetadata;
36  import org.opencastproject.metadata.dublincore.DublinCoreCatalogService;
37  import org.opencastproject.serviceregistry.api.ServiceRegistry;
38  import org.opencastproject.speechtotext.api.SpeechToTextService;
39  import org.opencastproject.speechtotext.api.SpeechToTextServiceException;
40  import org.opencastproject.workflow.api.AbstractWorkflowOperationHandler;
41  import org.opencastproject.workflow.api.ConfiguredTagsAndFlavors;
42  import org.opencastproject.workflow.api.WorkflowInstance;
43  import org.opencastproject.workflow.api.WorkflowOperationException;
44  import org.opencastproject.workflow.api.WorkflowOperationHandler;
45  import org.opencastproject.workflow.api.WorkflowOperationInstance;
46  import org.opencastproject.workflow.api.WorkflowOperationResult;
47  import org.opencastproject.workspace.api.Workspace;
48  
49  import org.apache.commons.io.FilenameUtils;
50  import org.apache.commons.lang3.BooleanUtils;
51  import org.apache.commons.lang3.StringUtils;
52  import org.osgi.service.component.ComponentContext;
53  import org.osgi.service.component.annotations.Activate;
54  import org.osgi.service.component.annotations.Component;
55  import org.osgi.service.component.annotations.Reference;
56  import org.slf4j.Logger;
57  import org.slf4j.LoggerFactory;
58  
59  import java.io.IOException;
60  import java.io.InputStream;
61  import java.net.URI;
62  import java.util.ArrayList;
63  import java.util.Collection;
64  import java.util.List;
65  import java.util.Objects;
66  import java.util.UUID;
67  import java.util.stream.Collectors;
68  
69  /**
70   * Workflow operation for the speech-to-text service.
71   */
72  @Component(
73      immediate = true,
74      service = WorkflowOperationHandler.class,
75      property = {
76          "service.description=Speech-to-Text Workflow Operation Handler",
77          "workflow.operation=speechtotext"
78      }
79  )
80  public class
81      SpeechToTextWorkflowOperationHandler extends AbstractWorkflowOperationHandler {
82  
  /** Logger used by this workflow operation handler. */
  private static final Logger logger = LoggerFactory.getLogger(SpeechToTextWorkflowOperationHandler.class);

  /** Operation configuration key: fixed language of the audio; the first place the language is looked up. */
  private static final String LANGUAGE_CODE = "language-code";

  /** Operation configuration key: language to use when no other source provides one. */
  private static final String LANGUAGE_FALLBACK = "language-fallback";

  /** Operation configuration key: how the subtitle file is appended to the media package (attachment or track). */
  private static final String TARGET_ELEMENT = "target-element";

  /** Language placeholder. NOTE(review): not referenced in the visible part of this class. */
  private static final String PLACEHOLDER_LANG = "#{lang}";

  /** Operation configuration key: whether the transcription shall be translated. */
  private static final String TRANSLATE_MODE = "translate";

  /** Operation configuration key: track selection strategy (controls which tracks shall be transcribed). */
  private static final String TRACK_SELECTION_STRATEGY = "track-selection-strategy";

  /** Operation configuration key: limit to one (if true, only the first selected track is transcribed). */
  private static final String LIMIT_TO_ONE = "limit-to-one";

  /** Operation configuration key: run asynchronously (true) or wait for the transcription jobs (false). */
  private static final String ASYNCHRONOUS = "async";

  /** Workflow configuration key under which the ids of asynchronously started jobs are stored. */
  private static final String JOBS_WORKFLOW_CONFIGURATION = "speech-to-text-jobs";
111 
112   private enum TrackSelectionStrategy {
113     PRESENTER_OR_NOTHING,
114     PRESENTATION_OR_NOTHING,
115     TRY_PRESENTER_FIRST,
116     TRY_PRESENTATION_FIRST,
117     EVERYTHING;
118 
119     private static TrackSelectionStrategy fromString(String value) {
120       for (TrackSelectionStrategy strategy : values()) {
121         if (strategy.name().equalsIgnoreCase(value)) {
122           return strategy;
123         }
124       }
125       throw new IllegalArgumentException(
126           "No TrackSelectionStrategy enum constant " + TrackSelectionStrategy.class.getCanonicalName() + "." + value);
127     }
128   }
129 
130   private enum AppendSubtitleAs {
131     attachment, track
132   }
133 
  /** The speech-to-text service; used to start the transcription jobs. */
  private SpeechToTextService speechToTextService = null;

  /** The workspace service; used to read, store and delete the generated subtitle files. */
  private Workspace workspace;

  /** The inspection service; used to enrich the subtitle element (e.g. with the correct mimetype). */
  private MediaInspectionService mediaInspectionService;

  /** The dublin core catalog service; used to look up the language in the media package metadata. */
  private DublinCoreCatalogService dublinCoreCatalogService;
145 
  /**
   * Activates the component: delegates to the superclass activation and logs the registration.
   *
   * @param cc The OSGi component context, passed on to the superclass.
   */
  @Override
  @Activate
  public void activate(ComponentContext cc) {
    super.activate(cc);
    logger.info("Registering speech-to-text workflow operation handler");
  }
152 
153   /**
154    * {@inheritDoc}
155    *
156    * @see
157    * org.opencastproject.workflow.api.WorkflowOperationHandler#start(org.opencastproject.workflow.api.WorkflowInstance,
158    * org.opencastproject.job.api.JobContext)
159    */
160   @Override
161   public WorkflowOperationResult start(WorkflowInstance workflowInstance, JobContext context)
162           throws WorkflowOperationException {
163 
164     MediaPackage mediaPackage = workflowInstance.getMediaPackage();
165     logger.info("Start speech-to-text workflow operation for media package {}", mediaPackage);
166 
167     // Defaults to `false` if `null`
168     var async = BooleanUtils.toBoolean(workflowInstance.getCurrentOperation().getConfiguration(ASYNCHRONOUS));
169 
170     ConfiguredTagsAndFlavors tagsAndFlavors = getTagsAndFlavors(workflowInstance,
171             Configuration.none, Configuration.one,
172             Configuration.many, Configuration.one);
173     MediaPackageElementFlavor sourceFlavor = tagsAndFlavors.getSingleSrcFlavor();
174 
175     TrackSelector trackSelector = new TrackSelector();
176     trackSelector.addFlavor(sourceFlavor);
177     Collection<Track> tracks = trackSelector.select(mediaPackage, false);
178 
179     if (tracks.isEmpty()) {
180       throw new WorkflowOperationException(
181               String.format("No tracks with source flavor '%s' found for transcription", sourceFlavor));
182     }
183 
184     logger.info("Found {} track(s) with source flavor '{}'.", tracks.size(), sourceFlavor);
185 
186     // Get the information in which language the audio track should be
187     String languageCode = getMediaPackageLanguage(mediaPackage, workflowInstance);
188 
189     // How to save the subtitle file? (as attachment, as track...)
190     AppendSubtitleAs appendSubtitleAs = howToAppendTheSubtitles(workflowInstance);
191 
192     // Translate to english
193     Boolean translate = getTranslationMode(workflowInstance);
194 
195     // Create sublist that includes only the tracks that has audio
196     List<Track> tracksWithAudio = tracks.stream().filter(Track::hasAudio).collect(Collectors.toList());
197 
198     // Get the track selection strategy from the workflow configuration
199     // If nothing is set, all tracks (with audio) will be transcribed
200     TrackSelectionStrategy trackSelectionStrategy = getTrackSelectionStrategy(mediaPackage, workflowInstance);
201 
202     // Use the selection strategy from the workflow config to get the tracks we want to transcribe
203     List<Track> tracksToTranscribe = filterTracksByStrategy(tracksWithAudio, trackSelectionStrategy);
204     if (tracksToTranscribe.isEmpty()) {
205       logger.info("No subtitles were created for media package {}. "
206           + "Workflow Configuration 'track-selection-strategy' is set to {}", mediaPackage, trackSelectionStrategy);
207       return createResult(mediaPackage, WorkflowOperationResult.Action.SKIP);
208     }
209 
210     // Load the 'limit-to-one' configuration from the workflow operation.
211     // This configuration sets the limit of generated subtitle files to one
212     boolean limitToOne = BooleanUtils.toBoolean(workflowInstance.getCurrentOperation().getConfiguration(LIMIT_TO_ONE));
213     if (limitToOne) {
214       tracksToTranscribe = List.of(tracksToTranscribe.get(0));
215     }
216 
217     if (async) {
218       createSubtitleAsync(workflowInstance, tracksToTranscribe, languageCode, translate);
219     } else {
220       for (Track track : tracksToTranscribe) {
221         createSubtitle(track, languageCode, mediaPackage, tagsAndFlavors, appendSubtitleAs, translate);
222       }
223     }
224 
225     logger.info("Speech-To-Text workflow operation for media package {} completed", mediaPackage);
226     return createResult(mediaPackage, WorkflowOperationResult.Action.CONTINUE);
227   }
228 
229   /**
230    * Filters the tracks by the strategy configured in the workflow operation
231    * @param tracksWithAudio        List of the tracks that includes audio
232    * @param trackSelectionStrategy The strategy configured in the workflow operation
233    * @return The filtered tracks
234    */
235   private List<Track> filterTracksByStrategy(List<Track> tracksWithAudio,
236       TrackSelectionStrategy trackSelectionStrategy) {
237 
238     List<Track> tracksToTranscribe = new ArrayList<>();
239     if (!tracksWithAudio.isEmpty()) {
240 
241       String presenterTypeConstant = MediaPackageElements.PRESENTER_SOURCE.getType();
242       String presentationTypeConstant = MediaPackageElements.PRESENTATION_SOURCE.getType();
243 
244       // Creates a sublist only including the presenter tracks
245       List<Track> presenterTracksWithAudio = tracksWithAudio.stream()
246           .filter(track -> Objects.equals(track.getFlavor().getType(), presenterTypeConstant))
247           .collect(Collectors.toList());
248 
249       // Creates a sublist only including the presentation tracks
250       List<Track> presentationTracksWithAudio = tracksWithAudio.stream()
251           .filter(track -> Objects.equals(track.getFlavor().getType(), presentationTypeConstant))
252           .collect(Collectors.toList());
253 
254       if (TrackSelectionStrategy.PRESENTER_OR_NOTHING.equals(trackSelectionStrategy)) {
255         tracksToTranscribe.addAll(presenterTracksWithAudio);
256       }
257 
258       if (TrackSelectionStrategy.PRESENTATION_OR_NOTHING.equals(trackSelectionStrategy)) {
259         tracksToTranscribe.addAll(presentationTracksWithAudio);
260       }
261 
262       if (TrackSelectionStrategy.TRY_PRESENTER_FIRST.equals(trackSelectionStrategy)) {
263         tracksToTranscribe.addAll(presenterTracksWithAudio);
264         if (tracksToTranscribe.isEmpty()) {
265           tracksToTranscribe.addAll(tracksWithAudio);
266         }
267       }
268 
269       if (TrackSelectionStrategy.TRY_PRESENTATION_FIRST.equals(trackSelectionStrategy)) {
270         tracksToTranscribe.addAll((presentationTracksWithAudio));
271         if (tracksToTranscribe.isEmpty()) {
272           tracksToTranscribe.addAll(tracksWithAudio);
273         }
274       }
275 
276       if (TrackSelectionStrategy.EVERYTHING.equals(trackSelectionStrategy)) {
277         tracksToTranscribe.addAll(tracksWithAudio);
278       }
279     }
280     return tracksToTranscribe;
281   }
282 
283   /**
284    * Creates the subtitle file for a track and appends it to the media package.
285    *
286    * @param track The track from which the subtitles are created.
287    * @param languageCode The language of the track.
288    * @param parentMediaPackage The media package where the track is located.
289    * @param tagsAndFlavors Tags and flavors instance (to get target flavor information)
290    * @param appendSubtitleAs Tells how the subtitles file has to be appended.
291    * @param translate Enable translation to english.
292    * @throws WorkflowOperationException Get thrown if an error occurs.
293    */
294   private void createSubtitle(Track track, String languageCode, MediaPackage parentMediaPackage,
295           ConfiguredTagsAndFlavors tagsAndFlavors, AppendSubtitleAs appendSubtitleAs, Boolean translate)
296           throws WorkflowOperationException {
297 
298     // Start the transcription job, create subtitles file
299     URI trackURI = track.getURI();
300 
301     Job job;
302     logger.info("Generating subtitle for '{}'...", trackURI);
303     try {
304       job = speechToTextService.transcribe(trackURI, languageCode, translate);
305     } catch (SpeechToTextServiceException e) {
306       throw new WorkflowOperationException(
307               String.format("Generating subtitles for '%s' in media package '%s' failed",
308                       trackURI, parentMediaPackage), e);
309     }
310 
311     if (!waitForStatus(job).isSuccess()) {
312       throw new WorkflowOperationException(
313               String.format("Speech-to-Text job for media package '%s' failed", parentMediaPackage));
314     }
315 
316     // subtitles file is generated now, put it into the media package
317     try {
318       String[] jobOutput = job.getPayload().split(",");
319       URI output = new URI(jobOutput[0]);
320       String outputLanguage = jobOutput[1];
321       String engineType = jobOutput[2];
322 
323       String mediaPackageIdentifier = UUID.randomUUID().toString();
324 
325       MediaPackageElement subtitleMediaPackageElement;
326       switch (appendSubtitleAs) {
327         case attachment:
328           subtitleMediaPackageElement = new AttachmentImpl();
329           break;
330         case track:
331         default:
332           subtitleMediaPackageElement = new TrackImpl();
333       }
334 
335       subtitleMediaPackageElement.setIdentifier(mediaPackageIdentifier);
336       try (InputStream in = workspace.read(output)) {
337         URI uri = workspace.put(parentMediaPackage.getIdentifier().toString(), mediaPackageIdentifier,
338                 FilenameUtils.getName(output.getPath()), in);
339         subtitleMediaPackageElement.setURI(uri);
340       }
341       MediaPackageElementFlavor targetFlavor = tagsAndFlavors.getSingleTargetFlavor().applyTo(track.getFlavor());
342       subtitleMediaPackageElement.setFlavor(targetFlavor);
343 
344       List<String> targetTags = tagsAndFlavors.getTargetTags();
345       targetTags.add("lang:" + outputLanguage);
346       targetTags.add("generator-type:auto");
347       targetTags.add("generator:" + engineType.toLowerCase());
348 
349       // this is used to set some values automatically, like the correct mimetype
350       Job inspection = mediaInspectionService.enrich(subtitleMediaPackageElement, true);
351       if (!waitForStatus(inspection).isSuccess()) {
352         throw new SpeechToTextServiceException(String.format(
353                 "Transcription for '%s' failed at enriching process", trackURI));
354       }
355 
356       subtitleMediaPackageElement = MediaPackageElementParser.getFromXml(inspection.getPayload());
357 
358       for (String tag : targetTags) {
359         subtitleMediaPackageElement.addTag(tag);
360       }
361 
362       parentMediaPackage.add(subtitleMediaPackageElement);
363 
364       workspace.delete(output);
365     } catch (Exception e) {
366       throw new WorkflowOperationException("Error handling text-to-speech service output", e);
367     }
368 
369     try {
370       workspace.cleanup(parentMediaPackage.getIdentifier());
371     } catch (IOException e) {
372       throw new WorkflowOperationException(e);
373     }
374   }
375 
376   /**
377    * Start the transcription, but don't actually wait for the process to finish. Instead, let the jobs run
378    * asynchronously and just store the launched jobs in the workflow configuration.
379    * @param workflow Workflow instance to store the jobs in
380    * @param tracks Tracks to run the transcription on
381    * @param languageCode Language to use
382    * @param translate If the transcription should be translated
383    * @throws WorkflowOperationException
384    */
385   private void createSubtitleAsync(WorkflowInstance workflow, List<Track> tracks, String languageCode,
386       Boolean translate) throws WorkflowOperationException {
387 
388     logger.info("Asynchronously generating subtitles");
389     StringBuilder jobs = new StringBuilder();
390     try {
391       for (var track: tracks) {
392         var job = speechToTextService.transcribe(track.getURI(), languageCode, translate);
393         jobs.append(",").append(job.getId());
394       }
395     } catch (SpeechToTextServiceException e) {
396       throw new WorkflowOperationException(
397           String.format("Starting subtitle job in media package '%s' failed",
398               workflow.getMediaPackage().getIdentifier()), e);
399     }
400 
401     var config = Objects.toString(workflow.getConfiguration(JOBS_WORKFLOW_CONFIGURATION), "") + jobs;
402     workflow.setConfiguration(JOBS_WORKFLOW_CONFIGURATION, config.replaceFirst("^,", ""));
403   }
404 
405   /**
406    * Get the config for the "track selection strategy". It's used to determine which tracks shall be transcribed.
407    * If there are 2 Videos and both has audio for example, what audio shall be transcribed?
408    *
409    * @param workflowInstance Contains the workflow configuration.
410    * @return Which strategy to use
411    * @throws WorkflowOperationException Get thrown if an error occurs.
412    */
413   private TrackSelectionStrategy getTrackSelectionStrategy(MediaPackage mediaPackage, WorkflowInstance workflowInstance)
414           throws WorkflowOperationException {
415 
416     WorkflowOperationInstance operation = workflowInstance.getCurrentOperation();
417     String strategyCfg = StringUtils.trimToEmpty(operation.getConfiguration(TRACK_SELECTION_STRATEGY)).toLowerCase();
418 
419     if (strategyCfg.isEmpty()) {
420       return TrackSelectionStrategy.EVERYTHING; // "transcribe everything" is the default/fallback
421     }
422     try {
423       return TrackSelectionStrategy.fromString(strategyCfg);
424     } catch (IllegalArgumentException e) {
425       throw new WorkflowOperationException(String.format(
426           "Speech-to-Text job for media package '%s' failed, because of wrong workflow configuration. "
427               + "track-selection-strategy of type '%s' does not exist.", mediaPackage, strategyCfg));
428     }
429   }
430 
431 
432   /**
433    * Get the information how to append the subtitles file to the media package.
434    *
435    * @param workflowInstance Contains the workflow configuration.
436    * @return How to append the subtitles file to the media package.
437    * @throws WorkflowOperationException Get thrown if an error occurs.
438    */
439   private AppendSubtitleAs howToAppendTheSubtitles(WorkflowInstance workflowInstance)
440           throws WorkflowOperationException {
441     WorkflowOperationInstance operation = workflowInstance.getCurrentOperation();
442     String targetElement = StringUtils.trimToEmpty(operation.getConfiguration(TARGET_ELEMENT)).toLowerCase();
443     if (targetElement.isEmpty()) {
444       return AppendSubtitleAs.track;
445     }
446     try {
447       return AppendSubtitleAs.valueOf(targetElement);
448     } catch (IllegalArgumentException e) {
449       throw new WorkflowOperationException(String.format(
450           "Speech-to-Text job for media package '%s' failed, because of wrong workflow configuration. "
451               + "target-element of type '%s' does not exist.", workflowInstance.getMediaPackage(), targetElement));
452     }
453   }
454 
455   /**
456    * Get if the subtitle needs to be translated into english
457    *
458    * @param workflowInstance Contains the workflow configuration
459    * @return Boolean to enable english translation
460    */
461   private Boolean getTranslationMode(WorkflowInstance workflowInstance) {
462     WorkflowOperationInstance operation = workflowInstance.getCurrentOperation();
463     return BooleanUtils.toBoolean(StringUtils.trimToEmpty(operation.getConfiguration(TRANSLATE_MODE)));
464   }
465 
466   /**
467    * Searches some places to get the right language of the media package / track.
468    *
469    * @param mediaPackage The media package from which the subtitles are generated.
470    * @param workflowInstance Contains the workflow configuration.
471    * @return The language of the media package / track.
472    */
473   private String getMediaPackageLanguage(MediaPackage mediaPackage, WorkflowInstance workflowInstance) {
474 
475     // First look if there is a fixed language configured in the operation
476     WorkflowOperationInstance operation = workflowInstance.getCurrentOperation();
477     String language = StringUtils.trimToEmpty(operation.getConfiguration(LANGUAGE_CODE));
478 
479     if (language.isEmpty()) {
480       // If not we look in the dublin core metadata if the language is available
481       MediaPackageMetadata dublinCoreMetadata = dublinCoreCatalogService.getMetadata(mediaPackage);
482       language = StringUtils.trimToEmpty(dublinCoreMetadata.getLanguage());
483     }
484 
485     if (language.isEmpty()) {
486       // If there is still no language, we look in the media package itself
487       language = StringUtils.trimToEmpty(mediaPackage.getLanguage());
488     }
489 
490     if (language.isEmpty()) {
491       // Use the fallback language if the operation configuration defines one
492       language = Objects.toString(operation.getConfiguration(LANGUAGE_FALLBACK), "");
493     }
494 
495     return language;
496   }
497 
498 
499   //================================================================================
500   // OSGi setter
501   //================================================================================
502 
  /**
   * OSGi reference setter for the speech-to-text service.
   *
   * @param speechToTextService The service used to start the transcription jobs.
   */
  @Reference
  public void setSpeechToTextService(SpeechToTextService speechToTextService) {
    this.speechToTextService = speechToTextService;
  }
507 
  /**
   * OSGi reference setter for the media inspection service.
   *
   * @param mediaInspectionService The service used to enrich the generated subtitle element.
   */
  @Reference
  public void setMediaInspectionService(MediaInspectionService mediaInspectionService) {
    this.mediaInspectionService = mediaInspectionService;
  }
512 
  /**
   * OSGi reference setter for the workspace.
   *
   * @param workspace The workspace used to read, store and delete the subtitle files.
   */
  @Reference
  public void setWorkspace(Workspace workspace) {
    this.workspace = workspace;
  }
517 
  /**
   * OSGi reference setter for the dublin core catalog service.
   *
   * @param dublinCoreCatalogService The service used to read the language from the media package metadata.
   */
  @Reference
  public void setDublinCoreCatalogService(DublinCoreCatalogService dublinCoreCatalogService) {
    this.dublinCoreCatalogService = dublinCoreCatalogService;
  }
522 
  /**
   * OSGi reference setter for the service registry.
   * NOTE(review): the {@code serviceRegistry} field is not declared in this class; presumably it is inherited
   * from AbstractWorkflowOperationHandler - confirm.
   *
   * @param serviceRegistry The service registry to use.
   */
  @Reference
  public void setServiceRegistry(ServiceRegistry serviceRegistry) {
    this.serviceRegistry = serviceRegistry;
  }
526   }
527 }