1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21 package org.opencastproject.workflow.handler.speechtotext;
22
23 import org.opencastproject.inspection.api.MediaInspectionService;
24 import org.opencastproject.job.api.Job;
25 import org.opencastproject.job.api.JobContext;
26 import org.opencastproject.mediapackage.MediaPackage;
27 import org.opencastproject.mediapackage.MediaPackageElement;
28 import org.opencastproject.mediapackage.MediaPackageElementFlavor;
29 import org.opencastproject.mediapackage.MediaPackageElementParser;
30 import org.opencastproject.mediapackage.MediaPackageElements;
31 import org.opencastproject.mediapackage.Track;
32 import org.opencastproject.mediapackage.attachment.AttachmentImpl;
33 import org.opencastproject.mediapackage.selector.TrackSelector;
34 import org.opencastproject.mediapackage.track.TrackImpl;
35 import org.opencastproject.metadata.api.MediaPackageMetadata;
36 import org.opencastproject.metadata.dublincore.DublinCoreCatalogService;
37 import org.opencastproject.serviceregistry.api.ServiceRegistry;
38 import org.opencastproject.speechtotext.api.SpeechToTextService;
39 import org.opencastproject.speechtotext.api.SpeechToTextServiceException;
40 import org.opencastproject.workflow.api.AbstractWorkflowOperationHandler;
41 import org.opencastproject.workflow.api.ConfiguredTagsAndFlavors;
42 import org.opencastproject.workflow.api.WorkflowInstance;
43 import org.opencastproject.workflow.api.WorkflowOperationException;
44 import org.opencastproject.workflow.api.WorkflowOperationHandler;
45 import org.opencastproject.workflow.api.WorkflowOperationInstance;
46 import org.opencastproject.workflow.api.WorkflowOperationResult;
47 import org.opencastproject.workspace.api.Workspace;
48
49 import org.apache.commons.io.FilenameUtils;
50 import org.apache.commons.lang3.BooleanUtils;
51 import org.apache.commons.lang3.StringUtils;
52 import org.osgi.service.component.ComponentContext;
53 import org.osgi.service.component.annotations.Activate;
54 import org.osgi.service.component.annotations.Component;
55 import org.osgi.service.component.annotations.Reference;
56 import org.slf4j.Logger;
57 import org.slf4j.LoggerFactory;
58
import java.io.IOException;
import java.io.InputStream;
import java.net.URI;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Locale;
import java.util.Objects;
import java.util.stream.Collectors;
67
68
69
70
71 @Component(
72 immediate = true,
73 service = WorkflowOperationHandler.class,
74 property = {
75 "service.description=Speech-to-Text Workflow Operation Handler",
76 "workflow.operation=speechtotext"
77 }
78 )
79 public class
80 SpeechToTextWorkflowOperationHandler extends AbstractWorkflowOperationHandler {
81
/** Logger of this workflow operation handler. */
private static final Logger logger = LoggerFactory.getLogger(SpeechToTextWorkflowOperationHandler.class);

/**
 * Operation configuration key holding the language to transcribe. It is the first source consulted by
 * {@code getMediaPackageLanguage}.
 */
private static final String LANGUAGE_CODE = "language-code";

/**
 * Operation configuration key holding the language that is used when neither {@link #LANGUAGE_CODE}, the
 * Dublin Core metadata nor the media package itself provides one.
 */
private static final String LANGUAGE_FALLBACK = "language-fallback";

/**
 * Operation configuration key that selects how the subtitle is appended to the media package. The value
 * is matched against the {@code AppendSubtitleAs} constants; an empty value means {@code track}.
 */
private static final String TARGET_ELEMENT = "target-element";

/** Language placeholder. NOTE(review): not referenced by the code visible in this file. */
private static final String PLACEHOLDER_LANG = "#{lang}";

/**
 * Operation configuration key whose boolean value is handed to the speech-to-text service as the
 * {@code translate} argument.
 */
private static final String TRANSLATE_MODE = "translate";

/**
 * Operation configuration key naming the {@code TrackSelectionStrategy} to apply (case-insensitive).
 * An empty value means {@code EVERYTHING}.
 */
private static final String TRACK_SELECTION_STRATEGY = "track-selection-strategy";

/** Operation configuration key; when true only the first of the selected tracks is transcribed. */
private static final String LIMIT_TO_ONE = "limit-to-one";

/**
 * Operation configuration key; when true the transcription jobs are only started and this operation
 * does not wait for them.
 */
private static final String ASYNCHRONOUS = "async";

/**
 * Workflow configuration key under which the comma-separated ids of asynchronously started
 * transcription jobs are stored.
 */
private static final String JOBS_WORKFLOW_CONFIGURATION = "speech-to-text-jobs";
110
111 private enum TrackSelectionStrategy {
112 PRESENTER_OR_NOTHING,
113 PRESENTATION_OR_NOTHING,
114 TRY_PRESENTER_FIRST,
115 TRY_PRESENTATION_FIRST,
116 EVERYTHING;
117
118 private static TrackSelectionStrategy fromString(String value) {
119 for (TrackSelectionStrategy strategy : values()) {
120 if (strategy.name().equalsIgnoreCase(value)) {
121 return strategy;
122 }
123 }
124 throw new IllegalArgumentException(
125 "No TrackSelectionStrategy enum constant " + TrackSelectionStrategy.class.getCanonicalName() + "." + value);
126 }
127 }
128
/**
 * The kind of media package element a generated subtitle is stored as. The constants are lower case
 * because the lower-cased configuration value is resolved with {@code valueOf}.
 */
private enum AppendSubtitleAs {
  attachment, track
}
132
133
/** Service that starts the transcription jobs; set via {@code setSpeechToTextService}. */
private SpeechToTextService speechToTextService = null;

/** Workspace used to read the job output and to store the subtitle file; set via {@code setWorkspace}. */
private Workspace workspace;

/** Service used to enrich the new subtitle element; set via {@code setMediaInspectionService}. */
private MediaInspectionService mediaInspectionService;

/** Service used to read the media package language from the Dublin Core metadata. */
private DublinCoreCatalogService dublinCoreCatalogService;
144
/**
 * Activates the component: delegates to the superclass and logs the registration.
 *
 * @param cc the OSGi component context, passed on to the superclass
 */
@Override
@Activate
public void activate(ComponentContext cc) {
  super.activate(cc);
  logger.info("Registering speech-to-text workflow operation handler");
}
151
152
153
154
155
156
157
158
159 @Override
160 public WorkflowOperationResult start(WorkflowInstance workflowInstance, JobContext context)
161 throws WorkflowOperationException {
162
163 MediaPackage mediaPackage = workflowInstance.getMediaPackage();
164 logger.info("Start speech-to-text workflow operation for media package {}", mediaPackage);
165
166
167 var async = BooleanUtils.toBoolean(workflowInstance.getCurrentOperation().getConfiguration(ASYNCHRONOUS));
168
169 ConfiguredTagsAndFlavors tagsAndFlavors = getTagsAndFlavors(workflowInstance,
170 Configuration.many, Configuration.one,
171 Configuration.many, Configuration.one);
172 MediaPackageElementFlavor sourceFlavor = tagsAndFlavors.getSingleSrcFlavor();
173 List<String> srcTags = tagsAndFlavors.getSrcTags();
174
175 TrackSelector trackSelector = new TrackSelector();
176 trackSelector.addFlavor(sourceFlavor);
177 for (String tag : srcTags) {
178 trackSelector.addTag(tag);
179 }
180 Collection<Track> tracks = trackSelector.select(mediaPackage, true);
181
182 if (tracks.isEmpty()) {
183 throw new WorkflowOperationException(
184 String.format("No tracks with source flavor '%s' found for transcription", sourceFlavor));
185 }
186
187 logger.info("Found {} track(s) with source flavor '{}'.", tracks.size(), sourceFlavor);
188
189
190 String languageCode = getMediaPackageLanguage(mediaPackage, workflowInstance);
191
192
193 AppendSubtitleAs appendSubtitleAs = howToAppendTheSubtitles(workflowInstance);
194
195
196 Boolean translate = getTranslationMode(workflowInstance);
197
198
199 List<Track> tracksWithAudio = tracks.stream().filter(Track::hasAudio).collect(Collectors.toList());
200
201
202
203 TrackSelectionStrategy trackSelectionStrategy = getTrackSelectionStrategy(mediaPackage, workflowInstance);
204
205
206 List<Track> tracksToTranscribe = filterTracksByStrategy(tracksWithAudio, trackSelectionStrategy);
207 if (tracksToTranscribe.isEmpty()) {
208 logger.info("No subtitles were created for media package {}. "
209 + "Workflow Configuration 'track-selection-strategy' is set to {}", mediaPackage, trackSelectionStrategy);
210 return createResult(mediaPackage, WorkflowOperationResult.Action.SKIP);
211 }
212
213
214
215 boolean limitToOne = BooleanUtils.toBoolean(workflowInstance.getCurrentOperation().getConfiguration(LIMIT_TO_ONE));
216 if (limitToOne) {
217 tracksToTranscribe = List.of(tracksToTranscribe.get(0));
218 }
219
220 if (async) {
221 createSubtitleAsync(workflowInstance, tracksToTranscribe, languageCode, translate);
222 } else {
223 for (Track track : tracksToTranscribe) {
224 createSubtitle(track, languageCode, mediaPackage, tagsAndFlavors, appendSubtitleAs, translate);
225 }
226 }
227
228 logger.info("Speech-To-Text workflow operation for media package {} completed", mediaPackage);
229 return createResult(mediaPackage, WorkflowOperationResult.Action.CONTINUE);
230 }
231
232
233
234
235
236
237
238 private List<Track> filterTracksByStrategy(List<Track> tracksWithAudio,
239 TrackSelectionStrategy trackSelectionStrategy) {
240
241 List<Track> tracksToTranscribe = new ArrayList<>();
242 if (!tracksWithAudio.isEmpty()) {
243
244 String presenterTypeConstant = MediaPackageElements.PRESENTER_SOURCE.getType();
245 String presentationTypeConstant = MediaPackageElements.PRESENTATION_SOURCE.getType();
246
247
248 List<Track> presenterTracksWithAudio = tracksWithAudio.stream()
249 .filter(track -> Objects.equals(track.getFlavor().getType(), presenterTypeConstant))
250 .collect(Collectors.toList());
251
252
253 List<Track> presentationTracksWithAudio = tracksWithAudio.stream()
254 .filter(track -> Objects.equals(track.getFlavor().getType(), presentationTypeConstant))
255 .collect(Collectors.toList());
256
257 if (TrackSelectionStrategy.PRESENTER_OR_NOTHING.equals(trackSelectionStrategy)) {
258 tracksToTranscribe.addAll(presenterTracksWithAudio);
259 }
260
261 if (TrackSelectionStrategy.PRESENTATION_OR_NOTHING.equals(trackSelectionStrategy)) {
262 tracksToTranscribe.addAll(presentationTracksWithAudio);
263 }
264
265 if (TrackSelectionStrategy.TRY_PRESENTER_FIRST.equals(trackSelectionStrategy)) {
266 tracksToTranscribe.addAll(presenterTracksWithAudio);
267 if (tracksToTranscribe.isEmpty()) {
268 tracksToTranscribe.addAll(tracksWithAudio);
269 }
270 }
271
272 if (TrackSelectionStrategy.TRY_PRESENTATION_FIRST.equals(trackSelectionStrategy)) {
273 tracksToTranscribe.addAll((presentationTracksWithAudio));
274 if (tracksToTranscribe.isEmpty()) {
275 tracksToTranscribe.addAll(tracksWithAudio);
276 }
277 }
278
279 if (TrackSelectionStrategy.EVERYTHING.equals(trackSelectionStrategy)) {
280 tracksToTranscribe.addAll(tracksWithAudio);
281 }
282 }
283 return tracksToTranscribe;
284 }
285
286
287
288
289
290
291
292
293
294
295
296
297 private void createSubtitle(Track track, String languageCode, MediaPackage parentMediaPackage,
298 ConfiguredTagsAndFlavors tagsAndFlavors, AppendSubtitleAs appendSubtitleAs, Boolean translate)
299 throws WorkflowOperationException {
300
301
302 URI trackURI = track.getURI();
303
304 Job job;
305 logger.info("Generating subtitle for '{}'...", trackURI);
306 try {
307 job = speechToTextService.transcribe(trackURI, languageCode, translate);
308 } catch (SpeechToTextServiceException e) {
309 throw new WorkflowOperationException(
310 String.format("Generating subtitles for '%s' in media package '%s' failed",
311 trackURI, parentMediaPackage), e);
312 }
313
314 if (!waitForStatus(job).isSuccess()) {
315 throw new WorkflowOperationException(
316 String.format("Speech-to-Text job for media package '%s' failed", parentMediaPackage));
317 }
318
319
320 try {
321 String[] jobOutput = job.getPayload().split(",");
322 URI output = new URI(jobOutput[0]);
323 String outputLanguage = jobOutput[1];
324 String engineType = jobOutput[2];
325
326 MediaPackageElement subtitleMediaPackageElement;
327 switch (appendSubtitleAs) {
328 case attachment:
329 subtitleMediaPackageElement = new AttachmentImpl();
330 break;
331 case track:
332 default:
333 subtitleMediaPackageElement = new TrackImpl();
334 }
335
336 subtitleMediaPackageElement.generateIdentifier();
337 try (InputStream in = workspace.read(output)) {
338 URI uri = workspace.put(parentMediaPackage.getIdentifier().toString(),
339 subtitleMediaPackageElement.getIdentifier(),
340 FilenameUtils.getName(output.getPath()), in);
341 subtitleMediaPackageElement.setURI(uri);
342 }
343 MediaPackageElementFlavor targetFlavor = tagsAndFlavors.getSingleTargetFlavor().applyTo(track.getFlavor());
344 subtitleMediaPackageElement.setFlavor(targetFlavor);
345
346 ConfiguredTagsAndFlavors.TargetTags targetTags = tagsAndFlavors.getTargetTags();
347 targetTags.getOverrideTags().add("lang:" + outputLanguage);
348 targetTags.getOverrideTags().add("generator-type:auto");
349 targetTags.getOverrideTags().add("generator:" + engineType.toLowerCase());
350
351
352 Job inspection = mediaInspectionService.enrich(subtitleMediaPackageElement, true);
353 if (!waitForStatus(inspection).isSuccess()) {
354 throw new SpeechToTextServiceException(String.format(
355 "Transcription for '%s' failed at enriching process", trackURI));
356 }
357
358 subtitleMediaPackageElement = MediaPackageElementParser.getFromXml(inspection.getPayload());
359
360 applyTargetTagsToElement(targetTags, subtitleMediaPackageElement);
361
362 parentMediaPackage.add(subtitleMediaPackageElement);
363
364 workspace.delete(output);
365 } catch (Exception e) {
366 throw new WorkflowOperationException("Error handling text-to-speech service output", e);
367 }
368
369 try {
370 workspace.cleanup(parentMediaPackage.getIdentifier());
371 } catch (IOException e) {
372 throw new WorkflowOperationException(e);
373 }
374 }
375
376
377
378
379
380
381
382
383
384
385 private void createSubtitleAsync(WorkflowInstance workflow, List<Track> tracks, String languageCode,
386 Boolean translate) throws WorkflowOperationException {
387
388 logger.info("Asynchronously generating subtitles");
389 StringBuilder jobs = new StringBuilder();
390 try {
391 for (var track: tracks) {
392 var job = speechToTextService.transcribe(track.getURI(), languageCode, translate);
393 jobs.append(",").append(job.getId());
394 }
395 } catch (SpeechToTextServiceException e) {
396 throw new WorkflowOperationException(
397 String.format("Starting subtitle job in media package '%s' failed",
398 workflow.getMediaPackage().getIdentifier()), e);
399 }
400
401 var config = Objects.toString(workflow.getConfiguration(JOBS_WORKFLOW_CONFIGURATION), "") + jobs;
402 workflow.setConfiguration(JOBS_WORKFLOW_CONFIGURATION, config.replaceFirst("^,", ""));
403 }
404
405
406
407
408
409
410
411
412
413 private TrackSelectionStrategy getTrackSelectionStrategy(MediaPackage mediaPackage, WorkflowInstance workflowInstance)
414 throws WorkflowOperationException {
415
416 WorkflowOperationInstance operation = workflowInstance.getCurrentOperation();
417 String strategyCfg = StringUtils.trimToEmpty(operation.getConfiguration(TRACK_SELECTION_STRATEGY)).toLowerCase();
418
419 if (strategyCfg.isEmpty()) {
420 return TrackSelectionStrategy.EVERYTHING;
421 }
422 try {
423 return TrackSelectionStrategy.fromString(strategyCfg);
424 } catch (IllegalArgumentException e) {
425 throw new WorkflowOperationException(String.format(
426 "Speech-to-Text job for media package '%s' failed, because of wrong workflow configuration. "
427 + "track-selection-strategy of type '%s' does not exist.", mediaPackage, strategyCfg));
428 }
429 }
430
431
432
433
434
435
436
437
438
439 private AppendSubtitleAs howToAppendTheSubtitles(WorkflowInstance workflowInstance)
440 throws WorkflowOperationException {
441 WorkflowOperationInstance operation = workflowInstance.getCurrentOperation();
442 String targetElement = StringUtils.trimToEmpty(operation.getConfiguration(TARGET_ELEMENT)).toLowerCase();
443 if (targetElement.isEmpty()) {
444 return AppendSubtitleAs.track;
445 }
446 try {
447 return AppendSubtitleAs.valueOf(targetElement);
448 } catch (IllegalArgumentException e) {
449 throw new WorkflowOperationException(String.format(
450 "Speech-to-Text job for media package '%s' failed, because of wrong workflow configuration. "
451 + "target-element of type '%s' does not exist.", workflowInstance.getMediaPackage(), targetElement));
452 }
453 }
454
455
456
457
458
459
460
461 private Boolean getTranslationMode(WorkflowInstance workflowInstance) {
462 WorkflowOperationInstance operation = workflowInstance.getCurrentOperation();
463 return BooleanUtils.toBoolean(StringUtils.trimToEmpty(operation.getConfiguration(TRANSLATE_MODE)));
464 }
465
466
467
468
469
470
471
472
473 private String getMediaPackageLanguage(MediaPackage mediaPackage, WorkflowInstance workflowInstance) {
474
475
476 WorkflowOperationInstance operation = workflowInstance.getCurrentOperation();
477 String language = StringUtils.trimToEmpty(operation.getConfiguration(LANGUAGE_CODE));
478
479 if (language.isEmpty()) {
480
481 MediaPackageMetadata dublinCoreMetadata = dublinCoreCatalogService.getMetadata(mediaPackage);
482 language = StringUtils.trimToEmpty(dublinCoreMetadata.getLanguage());
483 }
484
485 if (language.isEmpty()) {
486
487 language = StringUtils.trimToEmpty(mediaPackage.getLanguage());
488 }
489
490 if (language.isEmpty()) {
491
492 language = Objects.toString(operation.getConfiguration(LANGUAGE_FALLBACK), "");
493 }
494
495 return language;
496 }
497
498
499
500
501
502
/**
 * OSGi reference setter for the speech-to-text service that starts the transcription jobs.
 *
 * @param speechToTextService the service to use
 */
@Reference
public void setSpeechToTextService(SpeechToTextService speechToTextService) {
  this.speechToTextService = speechToTextService;
}
507
/**
 * OSGi reference setter for the media inspection service that enriches the subtitle elements.
 *
 * @param mediaInspectionService the service to use
 */
@Reference
public void setMediaInspectionService(MediaInspectionService mediaInspectionService) {
  this.mediaInspectionService = mediaInspectionService;
}
512
/**
 * OSGi reference setter for the workspace used to read job output and store subtitle files.
 *
 * @param workspace the workspace to use
 */
@Reference
public void setWorkspace(Workspace workspace) {
  this.workspace = workspace;
}
517
/**
 * OSGi reference setter for the Dublin Core catalog service used to look up the media package language.
 *
 * @param dublinCoreCatalogService the service to use
 */
@Reference
public void setDublinCoreCatalogService(DublinCoreCatalogService dublinCoreCatalogService) {
  this.dublinCoreCatalogService = dublinCoreCatalogService;
}
522
/**
 * OSGi reference setter for the service registry.
 *
 * <p>NOTE(review): the assigned field is not declared in this class; presumably it is inherited from
 * {@code AbstractWorkflowOperationHandler} - confirm.</p>
 *
 * @param serviceRegistry the service registry to use
 */
@Reference
public void setServiceRegistry(ServiceRegistry serviceRegistry) {
  this.serviceRegistry = serviceRegistry;
}
527 }