1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21 package org.opencastproject.workflow.handler.speechtotext;
22
23 import org.opencastproject.inspection.api.MediaInspectionService;
24 import org.opencastproject.job.api.Job;
25 import org.opencastproject.job.api.JobContext;
26 import org.opencastproject.mediapackage.MediaPackage;
27 import org.opencastproject.mediapackage.MediaPackageElement;
28 import org.opencastproject.mediapackage.MediaPackageElementFlavor;
29 import org.opencastproject.mediapackage.MediaPackageElementParser;
30 import org.opencastproject.mediapackage.MediaPackageElements;
31 import org.opencastproject.mediapackage.Track;
32 import org.opencastproject.mediapackage.attachment.AttachmentImpl;
33 import org.opencastproject.mediapackage.selector.TrackSelector;
34 import org.opencastproject.mediapackage.track.TrackImpl;
35 import org.opencastproject.metadata.api.MediaPackageMetadata;
36 import org.opencastproject.metadata.dublincore.DublinCoreCatalogService;
37 import org.opencastproject.serviceregistry.api.ServiceRegistry;
38 import org.opencastproject.speechtotext.api.SpeechToTextService;
39 import org.opencastproject.speechtotext.api.SpeechToTextServiceException;
40 import org.opencastproject.workflow.api.AbstractWorkflowOperationHandler;
41 import org.opencastproject.workflow.api.ConfiguredTagsAndFlavors;
42 import org.opencastproject.workflow.api.WorkflowInstance;
43 import org.opencastproject.workflow.api.WorkflowOperationException;
44 import org.opencastproject.workflow.api.WorkflowOperationHandler;
45 import org.opencastproject.workflow.api.WorkflowOperationInstance;
46 import org.opencastproject.workflow.api.WorkflowOperationResult;
47 import org.opencastproject.workspace.api.Workspace;
48
49 import org.apache.commons.io.FilenameUtils;
50 import org.apache.commons.lang3.BooleanUtils;
51 import org.apache.commons.lang3.StringUtils;
52 import org.osgi.service.component.ComponentContext;
53 import org.osgi.service.component.annotations.Activate;
54 import org.osgi.service.component.annotations.Component;
55 import org.osgi.service.component.annotations.Reference;
56 import org.slf4j.Logger;
57 import org.slf4j.LoggerFactory;
58
import java.io.IOException;
import java.io.InputStream;
import java.net.URI;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Locale;
import java.util.Objects;
import java.util.UUID;
import java.util.stream.Collectors;
68
69
70
71
72 @Component(
73 immediate = true,
74 service = WorkflowOperationHandler.class,
75 property = {
76 "service.description=Speech-to-Text Workflow Operation Handler",
77 "workflow.operation=speechtotext"
78 }
79 )
80 public class
81 SpeechToTextWorkflowOperationHandler extends AbstractWorkflowOperationHandler {
82
83 private static final Logger logger = LoggerFactory.getLogger(SpeechToTextWorkflowOperationHandler.class);
84
85
86 private static final String LANGUAGE_CODE = "language-code";
87
88
89 private static final String LANGUAGE_FALLBACK = "language-fallback";
90
91
92 private static final String TARGET_ELEMENT = "target-element";
93
94
95 private static final String PLACEHOLDER_LANG = "#{lang}";
96
97
98 private static final String TRANSLATE_MODE = "translate";
99
100
101 private static final String TRACK_SELECTION_STRATEGY = "track-selection-strategy";
102
103
104 private static final String LIMIT_TO_ONE = "limit-to-one";
105
106
107 private static final String ASYNCHRONOUS = "async";
108
109
110 private static final String JOBS_WORKFLOW_CONFIGURATION = "speech-to-text-jobs";
111
112 private enum TrackSelectionStrategy {
113 PRESENTER_OR_NOTHING,
114 PRESENTATION_OR_NOTHING,
115 TRY_PRESENTER_FIRST,
116 TRY_PRESENTATION_FIRST,
117 EVERYTHING;
118
119 private static TrackSelectionStrategy fromString(String value) {
120 for (TrackSelectionStrategy strategy : values()) {
121 if (strategy.name().equalsIgnoreCase(value)) {
122 return strategy;
123 }
124 }
125 throw new IllegalArgumentException(
126 "No TrackSelectionStrategy enum constant " + TrackSelectionStrategy.class.getCanonicalName() + "." + value);
127 }
128 }
129
130 private enum AppendSubtitleAs {
131 attachment, track
132 }
133
134
135 private SpeechToTextService speechToTextService = null;
136
137
138 private Workspace workspace;
139
140
141 private MediaInspectionService mediaInspectionService;
142
143
144 private DublinCoreCatalogService dublinCoreCatalogService;
145
146 @Override
147 @Activate
148 public void activate(ComponentContext cc) {
149 super.activate(cc);
150 logger.info("Registering speech-to-text workflow operation handler");
151 }
152
153
154
155
156
157
158
159
160 @Override
161 public WorkflowOperationResult start(WorkflowInstance workflowInstance, JobContext context)
162 throws WorkflowOperationException {
163
164 MediaPackage mediaPackage = workflowInstance.getMediaPackage();
165 logger.info("Start speech-to-text workflow operation for media package {}", mediaPackage);
166
167
168 var async = BooleanUtils.toBoolean(workflowInstance.getCurrentOperation().getConfiguration(ASYNCHRONOUS));
169
170 ConfiguredTagsAndFlavors tagsAndFlavors = getTagsAndFlavors(workflowInstance,
171 Configuration.many, Configuration.one,
172 Configuration.many, Configuration.one);
173 MediaPackageElementFlavor sourceFlavor = tagsAndFlavors.getSingleSrcFlavor();
174 List<String> srcTags = tagsAndFlavors.getSrcTags();
175
176 TrackSelector trackSelector = new TrackSelector();
177 trackSelector.addFlavor(sourceFlavor);
178 for (String tag : srcTags) {
179 trackSelector.addTag(tag);
180 }
181 Collection<Track> tracks = trackSelector.select(mediaPackage, true);
182
183 if (tracks.isEmpty()) {
184 throw new WorkflowOperationException(
185 String.format("No tracks with source flavor '%s' found for transcription", sourceFlavor));
186 }
187
188 logger.info("Found {} track(s) with source flavor '{}'.", tracks.size(), sourceFlavor);
189
190
191 String languageCode = getMediaPackageLanguage(mediaPackage, workflowInstance);
192
193
194 AppendSubtitleAs appendSubtitleAs = howToAppendTheSubtitles(workflowInstance);
195
196
197 Boolean translate = getTranslationMode(workflowInstance);
198
199
200 List<Track> tracksWithAudio = tracks.stream().filter(Track::hasAudio).collect(Collectors.toList());
201
202
203
204 TrackSelectionStrategy trackSelectionStrategy = getTrackSelectionStrategy(mediaPackage, workflowInstance);
205
206
207 List<Track> tracksToTranscribe = filterTracksByStrategy(tracksWithAudio, trackSelectionStrategy);
208 if (tracksToTranscribe.isEmpty()) {
209 logger.info("No subtitles were created for media package {}. "
210 + "Workflow Configuration 'track-selection-strategy' is set to {}", mediaPackage, trackSelectionStrategy);
211 return createResult(mediaPackage, WorkflowOperationResult.Action.SKIP);
212 }
213
214
215
216 boolean limitToOne = BooleanUtils.toBoolean(workflowInstance.getCurrentOperation().getConfiguration(LIMIT_TO_ONE));
217 if (limitToOne) {
218 tracksToTranscribe = List.of(tracksToTranscribe.get(0));
219 }
220
221 if (async) {
222 createSubtitleAsync(workflowInstance, tracksToTranscribe, languageCode, translate);
223 } else {
224 for (Track track : tracksToTranscribe) {
225 createSubtitle(track, languageCode, mediaPackage, tagsAndFlavors, appendSubtitleAs, translate);
226 }
227 }
228
229 logger.info("Speech-To-Text workflow operation for media package {} completed", mediaPackage);
230 return createResult(mediaPackage, WorkflowOperationResult.Action.CONTINUE);
231 }
232
233
234
235
236
237
238
239 private List<Track> filterTracksByStrategy(List<Track> tracksWithAudio,
240 TrackSelectionStrategy trackSelectionStrategy) {
241
242 List<Track> tracksToTranscribe = new ArrayList<>();
243 if (!tracksWithAudio.isEmpty()) {
244
245 String presenterTypeConstant = MediaPackageElements.PRESENTER_SOURCE.getType();
246 String presentationTypeConstant = MediaPackageElements.PRESENTATION_SOURCE.getType();
247
248
249 List<Track> presenterTracksWithAudio = tracksWithAudio.stream()
250 .filter(track -> Objects.equals(track.getFlavor().getType(), presenterTypeConstant))
251 .collect(Collectors.toList());
252
253
254 List<Track> presentationTracksWithAudio = tracksWithAudio.stream()
255 .filter(track -> Objects.equals(track.getFlavor().getType(), presentationTypeConstant))
256 .collect(Collectors.toList());
257
258 if (TrackSelectionStrategy.PRESENTER_OR_NOTHING.equals(trackSelectionStrategy)) {
259 tracksToTranscribe.addAll(presenterTracksWithAudio);
260 }
261
262 if (TrackSelectionStrategy.PRESENTATION_OR_NOTHING.equals(trackSelectionStrategy)) {
263 tracksToTranscribe.addAll(presentationTracksWithAudio);
264 }
265
266 if (TrackSelectionStrategy.TRY_PRESENTER_FIRST.equals(trackSelectionStrategy)) {
267 tracksToTranscribe.addAll(presenterTracksWithAudio);
268 if (tracksToTranscribe.isEmpty()) {
269 tracksToTranscribe.addAll(tracksWithAudio);
270 }
271 }
272
273 if (TrackSelectionStrategy.TRY_PRESENTATION_FIRST.equals(trackSelectionStrategy)) {
274 tracksToTranscribe.addAll((presentationTracksWithAudio));
275 if (tracksToTranscribe.isEmpty()) {
276 tracksToTranscribe.addAll(tracksWithAudio);
277 }
278 }
279
280 if (TrackSelectionStrategy.EVERYTHING.equals(trackSelectionStrategy)) {
281 tracksToTranscribe.addAll(tracksWithAudio);
282 }
283 }
284 return tracksToTranscribe;
285 }
286
287
288
289
290
291
292
293
294
295
296
297
298 private void createSubtitle(Track track, String languageCode, MediaPackage parentMediaPackage,
299 ConfiguredTagsAndFlavors tagsAndFlavors, AppendSubtitleAs appendSubtitleAs, Boolean translate)
300 throws WorkflowOperationException {
301
302
303 URI trackURI = track.getURI();
304
305 Job job;
306 logger.info("Generating subtitle for '{}'...", trackURI);
307 try {
308 job = speechToTextService.transcribe(trackURI, languageCode, translate);
309 } catch (SpeechToTextServiceException e) {
310 throw new WorkflowOperationException(
311 String.format("Generating subtitles for '%s' in media package '%s' failed",
312 trackURI, parentMediaPackage), e);
313 }
314
315 if (!waitForStatus(job).isSuccess()) {
316 throw new WorkflowOperationException(
317 String.format("Speech-to-Text job for media package '%s' failed", parentMediaPackage));
318 }
319
320
321 try {
322 String[] jobOutput = job.getPayload().split(",");
323 URI output = new URI(jobOutput[0]);
324 String outputLanguage = jobOutput[1];
325 String engineType = jobOutput[2];
326
327 String mediaPackageIdentifier = UUID.randomUUID().toString();
328
329 MediaPackageElement subtitleMediaPackageElement;
330 switch (appendSubtitleAs) {
331 case attachment:
332 subtitleMediaPackageElement = new AttachmentImpl();
333 break;
334 case track:
335 default:
336 subtitleMediaPackageElement = new TrackImpl();
337 }
338
339 subtitleMediaPackageElement.setIdentifier(mediaPackageIdentifier);
340 try (InputStream in = workspace.read(output)) {
341 URI uri = workspace.put(parentMediaPackage.getIdentifier().toString(), mediaPackageIdentifier,
342 FilenameUtils.getName(output.getPath()), in);
343 subtitleMediaPackageElement.setURI(uri);
344 }
345 MediaPackageElementFlavor targetFlavor = tagsAndFlavors.getSingleTargetFlavor().applyTo(track.getFlavor());
346 subtitleMediaPackageElement.setFlavor(targetFlavor);
347
348 List<String> targetTags = tagsAndFlavors.getTargetTags();
349 targetTags.add("lang:" + outputLanguage);
350 targetTags.add("generator-type:auto");
351 targetTags.add("generator:" + engineType.toLowerCase());
352
353
354 Job inspection = mediaInspectionService.enrich(subtitleMediaPackageElement, true);
355 if (!waitForStatus(inspection).isSuccess()) {
356 throw new SpeechToTextServiceException(String.format(
357 "Transcription for '%s' failed at enriching process", trackURI));
358 }
359
360 subtitleMediaPackageElement = MediaPackageElementParser.getFromXml(inspection.getPayload());
361
362 for (String tag : targetTags) {
363 subtitleMediaPackageElement.addTag(tag);
364 }
365
366 parentMediaPackage.add(subtitleMediaPackageElement);
367
368 workspace.delete(output);
369 } catch (Exception e) {
370 throw new WorkflowOperationException("Error handling text-to-speech service output", e);
371 }
372
373 try {
374 workspace.cleanup(parentMediaPackage.getIdentifier());
375 } catch (IOException e) {
376 throw new WorkflowOperationException(e);
377 }
378 }
379
380
381
382
383
384
385
386
387
388
389 private void createSubtitleAsync(WorkflowInstance workflow, List<Track> tracks, String languageCode,
390 Boolean translate) throws WorkflowOperationException {
391
392 logger.info("Asynchronously generating subtitles");
393 StringBuilder jobs = new StringBuilder();
394 try {
395 for (var track: tracks) {
396 var job = speechToTextService.transcribe(track.getURI(), languageCode, translate);
397 jobs.append(",").append(job.getId());
398 }
399 } catch (SpeechToTextServiceException e) {
400 throw new WorkflowOperationException(
401 String.format("Starting subtitle job in media package '%s' failed",
402 workflow.getMediaPackage().getIdentifier()), e);
403 }
404
405 var config = Objects.toString(workflow.getConfiguration(JOBS_WORKFLOW_CONFIGURATION), "") + jobs;
406 workflow.setConfiguration(JOBS_WORKFLOW_CONFIGURATION, config.replaceFirst("^,", ""));
407 }
408
409
410
411
412
413
414
415
416
417 private TrackSelectionStrategy getTrackSelectionStrategy(MediaPackage mediaPackage, WorkflowInstance workflowInstance)
418 throws WorkflowOperationException {
419
420 WorkflowOperationInstance operation = workflowInstance.getCurrentOperation();
421 String strategyCfg = StringUtils.trimToEmpty(operation.getConfiguration(TRACK_SELECTION_STRATEGY)).toLowerCase();
422
423 if (strategyCfg.isEmpty()) {
424 return TrackSelectionStrategy.EVERYTHING;
425 }
426 try {
427 return TrackSelectionStrategy.fromString(strategyCfg);
428 } catch (IllegalArgumentException e) {
429 throw new WorkflowOperationException(String.format(
430 "Speech-to-Text job for media package '%s' failed, because of wrong workflow configuration. "
431 + "track-selection-strategy of type '%s' does not exist.", mediaPackage, strategyCfg));
432 }
433 }
434
435
436
437
438
439
440
441
442
443 private AppendSubtitleAs howToAppendTheSubtitles(WorkflowInstance workflowInstance)
444 throws WorkflowOperationException {
445 WorkflowOperationInstance operation = workflowInstance.getCurrentOperation();
446 String targetElement = StringUtils.trimToEmpty(operation.getConfiguration(TARGET_ELEMENT)).toLowerCase();
447 if (targetElement.isEmpty()) {
448 return AppendSubtitleAs.track;
449 }
450 try {
451 return AppendSubtitleAs.valueOf(targetElement);
452 } catch (IllegalArgumentException e) {
453 throw new WorkflowOperationException(String.format(
454 "Speech-to-Text job for media package '%s' failed, because of wrong workflow configuration. "
455 + "target-element of type '%s' does not exist.", workflowInstance.getMediaPackage(), targetElement));
456 }
457 }
458
459
460
461
462
463
464
465 private Boolean getTranslationMode(WorkflowInstance workflowInstance) {
466 WorkflowOperationInstance operation = workflowInstance.getCurrentOperation();
467 return BooleanUtils.toBoolean(StringUtils.trimToEmpty(operation.getConfiguration(TRANSLATE_MODE)));
468 }
469
470
471
472
473
474
475
476
477 private String getMediaPackageLanguage(MediaPackage mediaPackage, WorkflowInstance workflowInstance) {
478
479
480 WorkflowOperationInstance operation = workflowInstance.getCurrentOperation();
481 String language = StringUtils.trimToEmpty(operation.getConfiguration(LANGUAGE_CODE));
482
483 if (language.isEmpty()) {
484
485 MediaPackageMetadata dublinCoreMetadata = dublinCoreCatalogService.getMetadata(mediaPackage);
486 language = StringUtils.trimToEmpty(dublinCoreMetadata.getLanguage());
487 }
488
489 if (language.isEmpty()) {
490
491 language = StringUtils.trimToEmpty(mediaPackage.getLanguage());
492 }
493
494 if (language.isEmpty()) {
495
496 language = Objects.toString(operation.getConfiguration(LANGUAGE_FALLBACK), "");
497 }
498
499 return language;
500 }
501
502
503
504
505
506
507 @Reference
508 public void setSpeechToTextService(SpeechToTextService speechToTextService) {
509 this.speechToTextService = speechToTextService;
510 }
511
512 @Reference
513 public void setMediaInspectionService(MediaInspectionService mediaInspectionService) {
514 this.mediaInspectionService = mediaInspectionService;
515 }
516
517 @Reference
518 public void setWorkspace(Workspace workspace) {
519 this.workspace = workspace;
520 }
521
522 @Reference
523 public void setDublinCoreCatalogService(DublinCoreCatalogService dublinCoreCatalogService) {
524 this.dublinCoreCatalogService = dublinCoreCatalogService;
525 }
526
527 @Reference
528 public void setServiceRegistry(ServiceRegistry serviceRegistry) {
529 this.serviceRegistry = serviceRegistry;
530 }
531 }