View Javadoc
1   /*
2    * Licensed to The Apereo Foundation under one or more contributor license
3    * agreements. See the NOTICE file distributed with this work for additional
4    * information regarding copyright ownership.
5    *
6    *
7    * The Apereo Foundation licenses this file to you under the Educational
8    * Community License, Version 2.0 (the "License"); you may not use this file
9    * except in compliance with the License. You may obtain a copy of the License
10   * at:
11   *
12   *   http://opensource.org/licenses/ecl2.txt
13   *
14   * Unless required by applicable law or agreed to in writing, software
15   * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
16   * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
17   * License for the specific language governing permissions and limitations under
18   * the License.
19   *
20   */
21  package org.opencastproject.workflow.handler.speechtotext;
22  
23  import org.opencastproject.inspection.api.MediaInspectionService;
24  import org.opencastproject.job.api.Job;
25  import org.opencastproject.job.api.JobContext;
26  import org.opencastproject.mediapackage.MediaPackage;
27  import org.opencastproject.mediapackage.MediaPackageElement;
28  import org.opencastproject.mediapackage.MediaPackageElementFlavor;
29  import org.opencastproject.mediapackage.MediaPackageElementParser;
30  import org.opencastproject.mediapackage.MediaPackageElements;
31  import org.opencastproject.mediapackage.Track;
32  import org.opencastproject.mediapackage.attachment.AttachmentImpl;
33  import org.opencastproject.mediapackage.selector.TrackSelector;
34  import org.opencastproject.mediapackage.track.TrackImpl;
35  import org.opencastproject.metadata.api.MediaPackageMetadata;
36  import org.opencastproject.metadata.dublincore.DublinCoreCatalogService;
37  import org.opencastproject.serviceregistry.api.ServiceRegistry;
38  import org.opencastproject.speechtotext.api.SpeechToTextService;
39  import org.opencastproject.speechtotext.api.SpeechToTextServiceException;
40  import org.opencastproject.workflow.api.AbstractWorkflowOperationHandler;
41  import org.opencastproject.workflow.api.ConfiguredTagsAndFlavors;
42  import org.opencastproject.workflow.api.WorkflowInstance;
43  import org.opencastproject.workflow.api.WorkflowOperationException;
44  import org.opencastproject.workflow.api.WorkflowOperationHandler;
45  import org.opencastproject.workflow.api.WorkflowOperationInstance;
46  import org.opencastproject.workflow.api.WorkflowOperationResult;
47  import org.opencastproject.workspace.api.Workspace;
48  
49  import org.apache.commons.io.FilenameUtils;
50  import org.apache.commons.lang3.BooleanUtils;
51  import org.apache.commons.lang3.StringUtils;
52  import org.osgi.service.component.ComponentContext;
53  import org.osgi.service.component.annotations.Activate;
54  import org.osgi.service.component.annotations.Component;
55  import org.osgi.service.component.annotations.Reference;
56  import org.slf4j.Logger;
57  import org.slf4j.LoggerFactory;
58  
59  import java.io.IOException;
60  import java.io.InputStream;
61  import java.net.URI;
62  import java.util.ArrayList;
63  import java.util.Collection;
64  import java.util.List;
65  import java.util.Objects;
66  import java.util.stream.Collectors;
67  
68  /**
69   * Workflow operation for the speech-to-text service.
70   */
71  @Component(
72      immediate = true,
73      service = WorkflowOperationHandler.class,
74      property = {
75          "service.description=Speech-to-Text Workflow Operation Handler",
76          "workflow.operation=speechtotext"
77      }
78  )
79  public class
80      SpeechToTextWorkflowOperationHandler extends AbstractWorkflowOperationHandler {
81  
  /** Logger shared by all instances of this handler. */
  private static final Logger logger = LoggerFactory.getLogger(SpeechToTextWorkflowOperationHandler.class);

  /** Operation configuration key: fixed language code; if set, it wins over every other language source. */
  private static final String LANGUAGE_CODE = "language-code";

  /** Operation configuration key: language used only when no other language source yields a value. */
  private static final String LANGUAGE_FALLBACK = "language-fallback";

  /**
   * Operation configuration key: how the subtitles are appended to the media package.
   * Must name an {@code AppendSubtitleAs} constant; an unset or blank value means {@code track}.
   */
  private static final String TARGET_ELEMENT = "target-element";

  /** Language placeholder. NOTE(review): not referenced anywhere in this class - confirm it is still needed. */
  private static final String PLACEHOLDER_LANG = "#{lang}";

  /** Operation configuration key: whether the transcription shall be translated (parsed as a boolean). */
  private static final String TRANSLATE_MODE = "translate";

  /** Operation configuration key: track selection strategy (controls which tracks shall be transcribed). */
  private static final String TRACK_SELECTION_STRATEGY = "track-selection-strategy";

  /** Operation configuration key: limit to one (if true, at most one track is transcribed). */
  private static final String LIMIT_TO_ONE = "limit-to-one";

  /** Operation configuration key: asynchronous mode (parsed as a boolean; synchronous when unset). */
  private static final String ASYNCHRONOUS = "async";

  /** Workflow configuration key under which the IDs of asynchronously launched jobs are stored. */
  private static final String JOBS_WORKFLOW_CONFIGURATION = "speech-to-text-jobs";
110 
111   private enum TrackSelectionStrategy {
112     PRESENTER_OR_NOTHING,
113     PRESENTATION_OR_NOTHING,
114     TRY_PRESENTER_FIRST,
115     TRY_PRESENTATION_FIRST,
116     EVERYTHING;
117 
118     private static TrackSelectionStrategy fromString(String value) {
119       for (TrackSelectionStrategy strategy : values()) {
120         if (strategy.name().equalsIgnoreCase(value)) {
121           return strategy;
122         }
123       }
124       throw new IllegalArgumentException(
125           "No TrackSelectionStrategy enum constant " + TrackSelectionStrategy.class.getCanonicalName() + "." + value);
126     }
127   }
128 
  /**
   * Kind of media package element the generated subtitle file is stored as.
   * The constant names are lower case because the (lower-cased) configuration value is resolved via
   * {@code valueOf}.
   */
  private enum AppendSubtitleAs {
    attachment, track
  }
132 
  /** The speech-to-text service (injected through its {@code @Reference} setter). */
  private SpeechToTextService speechToTextService = null;

  /** The workspace service, used to read, store and delete the generated subtitle files. */
  private Workspace workspace;

  /** The inspection service, used to enrich the subtitle element (e.g. with its mimetype). */
  private MediaInspectionService mediaInspectionService;

  /** The dublin core catalog service, used to look up the language of the media package. */
  private DublinCoreCatalogService dublinCoreCatalogService;
144 
  /**
   * OSGi activation callback: delegates to the parent handler and logs the registration.
   *
   * @param cc The OSGi component context.
   */
  @Override
  @Activate
  public void activate(ComponentContext cc) {
    super.activate(cc);
    logger.info("Registering speech-to-text workflow operation handler");
  }
151 
152   /**
153    * {@inheritDoc}
154    *
155    * @see
156    * org.opencastproject.workflow.api.WorkflowOperationHandler#start(org.opencastproject.workflow.api.WorkflowInstance,
157    * org.opencastproject.job.api.JobContext)
158    */
159   @Override
160   public WorkflowOperationResult start(WorkflowInstance workflowInstance, JobContext context)
161           throws WorkflowOperationException {
162 
163     MediaPackage mediaPackage = workflowInstance.getMediaPackage();
164     logger.info("Start speech-to-text workflow operation for media package {}", mediaPackage);
165 
166     // Defaults to `false` if `null`
167     var async = BooleanUtils.toBoolean(workflowInstance.getCurrentOperation().getConfiguration(ASYNCHRONOUS));
168 
169     ConfiguredTagsAndFlavors tagsAndFlavors = getTagsAndFlavors(workflowInstance,
170             Configuration.many, Configuration.one,
171             Configuration.many, Configuration.one);
172     MediaPackageElementFlavor sourceFlavor = tagsAndFlavors.getSingleSrcFlavor();
173     List<String> srcTags = tagsAndFlavors.getSrcTags();
174 
175     TrackSelector trackSelector = new TrackSelector();
176     trackSelector.addFlavor(sourceFlavor);
177     for (String tag : srcTags) {
178       trackSelector.addTag(tag);
179     }
180     Collection<Track> tracks = trackSelector.select(mediaPackage, true);
181 
182     if (tracks.isEmpty()) {
183       throw new WorkflowOperationException(
184               String.format("No tracks with source flavor '%s' found for transcription", sourceFlavor));
185     }
186 
187     logger.info("Found {} track(s) with source flavor '{}'.", tracks.size(), sourceFlavor);
188 
189     // Get the information in which language the audio track should be
190     String languageCode = getMediaPackageLanguage(mediaPackage, workflowInstance);
191 
192     // How to save the subtitle file? (as attachment, as track...)
193     AppendSubtitleAs appendSubtitleAs = howToAppendTheSubtitles(workflowInstance);
194 
195     // Translate to english
196     Boolean translate = getTranslationMode(workflowInstance);
197 
198     // Create sublist that includes only the tracks that has audio
199     List<Track> tracksWithAudio = tracks.stream().filter(Track::hasAudio).collect(Collectors.toList());
200 
201     // Get the track selection strategy from the workflow configuration
202     // If nothing is set, all tracks (with audio) will be transcribed
203     TrackSelectionStrategy trackSelectionStrategy = getTrackSelectionStrategy(mediaPackage, workflowInstance);
204 
205     // Use the selection strategy from the workflow config to get the tracks we want to transcribe
206     List<Track> tracksToTranscribe = filterTracksByStrategy(tracksWithAudio, trackSelectionStrategy);
207     if (tracksToTranscribe.isEmpty()) {
208       logger.info("No subtitles were created for media package {}. "
209           + "Workflow Configuration 'track-selection-strategy' is set to {}", mediaPackage, trackSelectionStrategy);
210       return createResult(mediaPackage, WorkflowOperationResult.Action.SKIP);
211     }
212 
213     // Load the 'limit-to-one' configuration from the workflow operation.
214     // This configuration sets the limit of generated subtitle files to one
215     boolean limitToOne = BooleanUtils.toBoolean(workflowInstance.getCurrentOperation().getConfiguration(LIMIT_TO_ONE));
216     if (limitToOne) {
217       tracksToTranscribe = List.of(tracksToTranscribe.get(0));
218     }
219 
220     if (async) {
221       createSubtitleAsync(workflowInstance, tracksToTranscribe, languageCode, translate);
222     } else {
223       for (Track track : tracksToTranscribe) {
224         createSubtitle(track, languageCode, mediaPackage, tagsAndFlavors, appendSubtitleAs, translate);
225       }
226     }
227 
228     logger.info("Speech-To-Text workflow operation for media package {} completed", mediaPackage);
229     return createResult(mediaPackage, WorkflowOperationResult.Action.CONTINUE);
230   }
231 
232   /**
233    * Filters the tracks by the strategy configured in the workflow operation
234    * @param tracksWithAudio        List of the tracks that includes audio
235    * @param trackSelectionStrategy The strategy configured in the workflow operation
236    * @return The filtered tracks
237    */
238   private List<Track> filterTracksByStrategy(List<Track> tracksWithAudio,
239       TrackSelectionStrategy trackSelectionStrategy) {
240 
241     List<Track> tracksToTranscribe = new ArrayList<>();
242     if (!tracksWithAudio.isEmpty()) {
243 
244       String presenterTypeConstant = MediaPackageElements.PRESENTER_SOURCE.getType();
245       String presentationTypeConstant = MediaPackageElements.PRESENTATION_SOURCE.getType();
246 
247       // Creates a sublist only including the presenter tracks
248       List<Track> presenterTracksWithAudio = tracksWithAudio.stream()
249           .filter(track -> Objects.equals(track.getFlavor().getType(), presenterTypeConstant))
250           .collect(Collectors.toList());
251 
252       // Creates a sublist only including the presentation tracks
253       List<Track> presentationTracksWithAudio = tracksWithAudio.stream()
254           .filter(track -> Objects.equals(track.getFlavor().getType(), presentationTypeConstant))
255           .collect(Collectors.toList());
256 
257       if (TrackSelectionStrategy.PRESENTER_OR_NOTHING.equals(trackSelectionStrategy)) {
258         tracksToTranscribe.addAll(presenterTracksWithAudio);
259       }
260 
261       if (TrackSelectionStrategy.PRESENTATION_OR_NOTHING.equals(trackSelectionStrategy)) {
262         tracksToTranscribe.addAll(presentationTracksWithAudio);
263       }
264 
265       if (TrackSelectionStrategy.TRY_PRESENTER_FIRST.equals(trackSelectionStrategy)) {
266         tracksToTranscribe.addAll(presenterTracksWithAudio);
267         if (tracksToTranscribe.isEmpty()) {
268           tracksToTranscribe.addAll(tracksWithAudio);
269         }
270       }
271 
272       if (TrackSelectionStrategy.TRY_PRESENTATION_FIRST.equals(trackSelectionStrategy)) {
273         tracksToTranscribe.addAll((presentationTracksWithAudio));
274         if (tracksToTranscribe.isEmpty()) {
275           tracksToTranscribe.addAll(tracksWithAudio);
276         }
277       }
278 
279       if (TrackSelectionStrategy.EVERYTHING.equals(trackSelectionStrategy)) {
280         tracksToTranscribe.addAll(tracksWithAudio);
281       }
282     }
283     return tracksToTranscribe;
284   }
285 
286   /**
287    * Creates the subtitle file for a track and appends it to the media package.
288    *
289    * @param track The track from which the subtitles are created.
290    * @param languageCode The language of the track.
291    * @param parentMediaPackage The media package where the track is located.
292    * @param tagsAndFlavors Tags and flavors instance (to get target flavor information)
293    * @param appendSubtitleAs Tells how the subtitles file has to be appended.
294    * @param translate Enable translation to english.
295    * @throws WorkflowOperationException Get thrown if an error occurs.
296    */
297   private void createSubtitle(Track track, String languageCode, MediaPackage parentMediaPackage,
298           ConfiguredTagsAndFlavors tagsAndFlavors, AppendSubtitleAs appendSubtitleAs, Boolean translate)
299           throws WorkflowOperationException {
300 
301     // Start the transcription job, create subtitles file
302     URI trackURI = track.getURI();
303 
304     Job job;
305     logger.info("Generating subtitle for '{}'...", trackURI);
306     try {
307       job = speechToTextService.transcribe(trackURI, languageCode, translate);
308     } catch (SpeechToTextServiceException e) {
309       throw new WorkflowOperationException(
310               String.format("Generating subtitles for '%s' in media package '%s' failed",
311                       trackURI, parentMediaPackage), e);
312     }
313 
314     if (!waitForStatus(job).isSuccess()) {
315       throw new WorkflowOperationException(
316               String.format("Speech-to-Text job for media package '%s' failed", parentMediaPackage));
317     }
318 
319     // subtitles file is generated now, put it into the media package
320     try {
321       String[] jobOutput = job.getPayload().split(",");
322       URI output = new URI(jobOutput[0]);
323       String outputLanguage = jobOutput[1];
324       String engineType = jobOutput[2];
325 
326       MediaPackageElement subtitleMediaPackageElement;
327       switch (appendSubtitleAs) {
328         case attachment:
329           subtitleMediaPackageElement = new AttachmentImpl();
330           break;
331         case track:
332         default:
333           subtitleMediaPackageElement = new TrackImpl();
334       }
335 
336       subtitleMediaPackageElement.generateIdentifier();
337       try (InputStream in = workspace.read(output)) {
338         URI uri = workspace.put(parentMediaPackage.getIdentifier().toString(),
339                 subtitleMediaPackageElement.getIdentifier(),
340                 FilenameUtils.getName(output.getPath()), in);
341         subtitleMediaPackageElement.setURI(uri);
342       }
343       MediaPackageElementFlavor targetFlavor = tagsAndFlavors.getSingleTargetFlavor().applyTo(track.getFlavor());
344       subtitleMediaPackageElement.setFlavor(targetFlavor);
345 
346       ConfiguredTagsAndFlavors.TargetTags targetTags = tagsAndFlavors.getTargetTags();
347       targetTags.getOverrideTags().add("lang:" + outputLanguage);
348       targetTags.getOverrideTags().add("generator-type:auto");
349       targetTags.getOverrideTags().add("generator:" + engineType.toLowerCase());
350 
351       // this is used to set some values automatically, like the correct mimetype
352       Job inspection = mediaInspectionService.enrich(subtitleMediaPackageElement, true);
353       if (!waitForStatus(inspection).isSuccess()) {
354         throw new SpeechToTextServiceException(String.format(
355                 "Transcription for '%s' failed at enriching process", trackURI));
356       }
357 
358       subtitleMediaPackageElement = MediaPackageElementParser.getFromXml(inspection.getPayload());
359 
360       applyTargetTagsToElement(targetTags, subtitleMediaPackageElement);
361 
362       parentMediaPackage.add(subtitleMediaPackageElement);
363 
364       workspace.delete(output);
365     } catch (Exception e) {
366       throw new WorkflowOperationException("Error handling text-to-speech service output", e);
367     }
368 
369     try {
370       workspace.cleanup(parentMediaPackage.getIdentifier());
371     } catch (IOException e) {
372       throw new WorkflowOperationException(e);
373     }
374   }
375 
376   /**
377    * Start the transcription, but don't actually wait for the process to finish. Instead, let the jobs run
378    * asynchronously and just store the launched jobs in the workflow configuration.
379    * @param workflow Workflow instance to store the jobs in
380    * @param tracks Tracks to run the transcription on
381    * @param languageCode Language to use
382    * @param translate If the transcription should be translated
383    * @throws WorkflowOperationException
384    */
385   private void createSubtitleAsync(WorkflowInstance workflow, List<Track> tracks, String languageCode,
386       Boolean translate) throws WorkflowOperationException {
387 
388     logger.info("Asynchronously generating subtitles");
389     StringBuilder jobs = new StringBuilder();
390     try {
391       for (var track: tracks) {
392         var job = speechToTextService.transcribe(track.getURI(), languageCode, translate);
393         jobs.append(",").append(job.getId());
394       }
395     } catch (SpeechToTextServiceException e) {
396       throw new WorkflowOperationException(
397           String.format("Starting subtitle job in media package '%s' failed",
398               workflow.getMediaPackage().getIdentifier()), e);
399     }
400 
401     var config = Objects.toString(workflow.getConfiguration(JOBS_WORKFLOW_CONFIGURATION), "") + jobs;
402     workflow.setConfiguration(JOBS_WORKFLOW_CONFIGURATION, config.replaceFirst("^,", ""));
403   }
404 
405   /**
406    * Get the config for the "track selection strategy". It's used to determine which tracks shall be transcribed.
407    * If there are 2 Videos and both has audio for example, what audio shall be transcribed?
408    *
409    * @param workflowInstance Contains the workflow configuration.
410    * @return Which strategy to use
411    * @throws WorkflowOperationException Get thrown if an error occurs.
412    */
413   private TrackSelectionStrategy getTrackSelectionStrategy(MediaPackage mediaPackage, WorkflowInstance workflowInstance)
414           throws WorkflowOperationException {
415 
416     WorkflowOperationInstance operation = workflowInstance.getCurrentOperation();
417     String strategyCfg = StringUtils.trimToEmpty(operation.getConfiguration(TRACK_SELECTION_STRATEGY)).toLowerCase();
418 
419     if (strategyCfg.isEmpty()) {
420       return TrackSelectionStrategy.EVERYTHING; // "transcribe everything" is the default/fallback
421     }
422     try {
423       return TrackSelectionStrategy.fromString(strategyCfg);
424     } catch (IllegalArgumentException e) {
425       throw new WorkflowOperationException(String.format(
426           "Speech-to-Text job for media package '%s' failed, because of wrong workflow configuration. "
427               + "track-selection-strategy of type '%s' does not exist.", mediaPackage, strategyCfg));
428     }
429   }
430 
431 
432   /**
433    * Get the information how to append the subtitles file to the media package.
434    *
435    * @param workflowInstance Contains the workflow configuration.
436    * @return How to append the subtitles file to the media package.
437    * @throws WorkflowOperationException Get thrown if an error occurs.
438    */
439   private AppendSubtitleAs howToAppendTheSubtitles(WorkflowInstance workflowInstance)
440           throws WorkflowOperationException {
441     WorkflowOperationInstance operation = workflowInstance.getCurrentOperation();
442     String targetElement = StringUtils.trimToEmpty(operation.getConfiguration(TARGET_ELEMENT)).toLowerCase();
443     if (targetElement.isEmpty()) {
444       return AppendSubtitleAs.track;
445     }
446     try {
447       return AppendSubtitleAs.valueOf(targetElement);
448     } catch (IllegalArgumentException e) {
449       throw new WorkflowOperationException(String.format(
450           "Speech-to-Text job for media package '%s' failed, because of wrong workflow configuration. "
451               + "target-element of type '%s' does not exist.", workflowInstance.getMediaPackage(), targetElement));
452     }
453   }
454 
455   /**
456    * Get if the subtitle needs to be translated into english
457    *
458    * @param workflowInstance Contains the workflow configuration
459    * @return Boolean to enable english translation
460    */
461   private Boolean getTranslationMode(WorkflowInstance workflowInstance) {
462     WorkflowOperationInstance operation = workflowInstance.getCurrentOperation();
463     return BooleanUtils.toBoolean(StringUtils.trimToEmpty(operation.getConfiguration(TRANSLATE_MODE)));
464   }
465 
466   /**
467    * Searches some places to get the right language of the media package / track.
468    *
469    * @param mediaPackage The media package from which the subtitles are generated.
470    * @param workflowInstance Contains the workflow configuration.
471    * @return The language of the media package / track.
472    */
473   private String getMediaPackageLanguage(MediaPackage mediaPackage, WorkflowInstance workflowInstance) {
474 
475     // First look if there is a fixed language configured in the operation
476     WorkflowOperationInstance operation = workflowInstance.getCurrentOperation();
477     String language = StringUtils.trimToEmpty(operation.getConfiguration(LANGUAGE_CODE));
478 
479     if (language.isEmpty()) {
480       // If not we look in the dublin core metadata if the language is available
481       MediaPackageMetadata dublinCoreMetadata = dublinCoreCatalogService.getMetadata(mediaPackage);
482       language = StringUtils.trimToEmpty(dublinCoreMetadata.getLanguage());
483     }
484 
485     if (language.isEmpty()) {
486       // If there is still no language, we look in the media package itself
487       language = StringUtils.trimToEmpty(mediaPackage.getLanguage());
488     }
489 
490     if (language.isEmpty()) {
491       // Use the fallback language if the operation configuration defines one
492       language = Objects.toString(operation.getConfiguration(LANGUAGE_FALLBACK), "");
493     }
494 
495     return language;
496   }
497 
498 
499   //================================================================================
500   // OSGi setter
501   //================================================================================
502 
  /**
   * OSGi callback to set the speech-to-text service.
   *
   * @param speechToTextService The service used to start transcription jobs.
   */
  @Reference
  public void setSpeechToTextService(SpeechToTextService speechToTextService) {
    this.speechToTextService = speechToTextService;
  }
507 
  /**
   * OSGi callback to set the media inspection service.
   *
   * @param mediaInspectionService The service used to enrich the generated subtitle element.
   */
  @Reference
  public void setMediaInspectionService(MediaInspectionService mediaInspectionService) {
    this.mediaInspectionService = mediaInspectionService;
  }
512 
  /**
   * OSGi callback to set the workspace.
   *
   * @param workspace The workspace used to read, store and delete subtitle files.
   */
  @Reference
  public void setWorkspace(Workspace workspace) {
    this.workspace = workspace;
  }
517 
  /**
   * OSGi callback to set the dublin core catalog service.
   *
   * @param dublinCoreCatalogService The service used to read the language from the dublin core metadata.
   */
  @Reference
  public void setDublinCoreCatalogService(DublinCoreCatalogService dublinCoreCatalogService) {
    this.dublinCoreCatalogService = dublinCoreCatalogService;
  }
522 
  /**
   * OSGi callback to set the service registry.
   * NOTE(review): the {@code serviceRegistry} field is not declared in this class - presumably it is
   * inherited from {@code AbstractWorkflowOperationHandler}; confirm.
   *
   * @param serviceRegistry The service registry.
   */
  @Reference
  public void setServiceRegistry(ServiceRegistry serviceRegistry) {
    this.serviceRegistry = serviceRegistry;
  }
526   }
527 }