View Javadoc
1   /*
2    * Licensed to The Apereo Foundation under one or more contributor license
3    * agreements. See the NOTICE file distributed with this work for additional
4    * information regarding copyright ownership.
5    *
6    *
7    * The Apereo Foundation licenses this file to you under the Educational
8    * Community License, Version 2.0 (the "License"); you may not use this file
9    * except in compliance with the License. You may obtain a copy of the License
10   * at:
11   *
12   *   http://opensource.org/licenses/ecl2.txt
13   *
14   * Unless required by applicable law or agreed to in writing, software
15   * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
16   * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
17   * License for the specific language governing permissions and limitations under
18   * the License.
19   *
20   */
21  
22  package org.opencastproject.textanalyzer.impl;
23  
24  import org.opencastproject.dictionary.api.DictionaryService;
25  import org.opencastproject.job.api.AbstractJobProducer;
26  import org.opencastproject.job.api.Job;
27  import org.opencastproject.mediapackage.Attachment;
28  import org.opencastproject.mediapackage.Catalog;
29  import org.opencastproject.mediapackage.MediaPackageElementBuilderFactory;
30  import org.opencastproject.mediapackage.MediaPackageElementParser;
31  import org.opencastproject.mediapackage.MediaPackageElements;
32  import org.opencastproject.mediapackage.MediaPackageException;
33  import org.opencastproject.metadata.mpeg7.MediaTime;
34  import org.opencastproject.metadata.mpeg7.MediaTimeImpl;
35  import org.opencastproject.metadata.mpeg7.Mpeg7CatalogImpl;
36  import org.opencastproject.metadata.mpeg7.Mpeg7CatalogService;
37  import org.opencastproject.metadata.mpeg7.SpatioTemporalDecomposition;
38  import org.opencastproject.metadata.mpeg7.TemporalDecomposition;
39  import org.opencastproject.metadata.mpeg7.Textual;
40  import org.opencastproject.metadata.mpeg7.Video;
41  import org.opencastproject.metadata.mpeg7.VideoSegment;
42  import org.opencastproject.metadata.mpeg7.VideoText;
43  import org.opencastproject.metadata.mpeg7.VideoTextImpl;
44  import org.opencastproject.security.api.OrganizationDirectoryService;
45  import org.opencastproject.security.api.SecurityService;
46  import org.opencastproject.security.api.UserDirectoryService;
47  import org.opencastproject.serviceregistry.api.ServiceRegistry;
48  import org.opencastproject.serviceregistry.api.ServiceRegistryException;
49  import org.opencastproject.textanalyzer.api.TextAnalyzerException;
50  import org.opencastproject.textanalyzer.api.TextAnalyzerService;
51  import org.opencastproject.textextractor.api.TextExtractor;
52  import org.opencastproject.textextractor.api.TextExtractorException;
53  import org.opencastproject.util.LoadUtil;
54  import org.opencastproject.util.NotFoundException;
55  import org.opencastproject.util.ReadinessIndicator;
56  import org.opencastproject.workspace.api.Workspace;
57  
58  import org.osgi.service.cm.ConfigurationException;
59  import org.osgi.service.cm.ManagedService;
60  import org.osgi.service.component.ComponentContext;
61  import org.osgi.service.component.annotations.Activate;
62  import org.osgi.service.component.annotations.Component;
63  import org.osgi.service.component.annotations.Reference;
64  import org.slf4j.Logger;
65  import org.slf4j.LoggerFactory;
66  
67  import java.io.File;
68  import java.io.IOException;
69  import java.io.InputStream;
70  import java.net.URI;
71  import java.util.ArrayList;
72  import java.util.Arrays;
73  import java.util.Dictionary;
74  import java.util.List;
75  
76  /**
77   * Media analysis service that takes an image and returns the text extracted from that image.
78   */
79  @Component(
80      immediate = true,
81      service = { TextAnalyzerService.class,ManagedService.class },
82      property = {
83          "service.description=Text Analysis Service",
84          "service.pid=org.opencastproject.textanalyzer.impl.TextAnalyzerServiceImpl"
85      }
86  )
87  public class TextAnalyzerServiceImpl extends AbstractJobProducer implements TextAnalyzerService, ManagedService {
88  
89    /** The logging facility */
90    private static final Logger logger = LoggerFactory.getLogger(TextAnalyzerServiceImpl.class);
91  
92    /** List of available operations on jobs */
93    private enum Operation {
94      Extract
95    };
96  
97    /** Resulting collection in the working file repository */
98    public static final String COLLECTION_ID = "ocrtext";
99  
100   /** The approximate load placed on the system by creating a text analysis job */
101   public static final float DEFAULT_ANALYSIS_JOB_LOAD = 0.2f;
102 
103   /** The key to look for in the service configuration file to override the {@link #DEFAULT_ANALYSIS_JOB_LOAD} */
104   public static final String ANALYSIS_JOB_LOAD_KEY = "job.load.analysis";
105 
106   /** The approximate load placed on the system by creating a text analysis job */
107   private float analysisJobLoad = DEFAULT_ANALYSIS_JOB_LOAD;
108 
109   /** The text extraction implemenetation */
110   private TextExtractor textExtractor = null;
111 
112   /** Reference to the receipt service */
113   private ServiceRegistry serviceRegistry = null;
114 
115   /** The workspace to ue when retrieving remote media files */
116   private Workspace workspace = null;
117 
118   /** The mpeg-7 service */
119   protected Mpeg7CatalogService mpeg7CatalogService;
120 
121   /** The dictionary service */
122   protected DictionaryService dictionaryService;
123 
124   /** The security service */
125   protected SecurityService securityService = null;
126 
127   /** The user directory service */
128   protected UserDirectoryService userDirectoryService = null;
129 
130   /** The organization directory service */
131   protected OrganizationDirectoryService organizationDirectoryService = null;
132 
133   /**
134    * Creates a new instance of the text analyzer service.
135    */
136   public TextAnalyzerServiceImpl() {
137     super(JOB_TYPE);
138   }
139 
140   /**
141    * OSGi callback on component activation.
142    *
143    * @param cc
144    *          the component context
145    */
146   @Override
147   @Activate
148   public void activate(ComponentContext cc) {
149     logger.info("Activating Text analyser service");
150     super.activate(cc);
151   }
152 
153   /**
154    * {@inheritDoc}
155    *
156    * @see org.opencastproject.textanalyzer.api.TextAnalyzerService#extract(org.opencastproject.mediapackage.Attachment)
157    */
158   @Override
159   public Job extract(Attachment image) throws TextAnalyzerException, MediaPackageException {
160     try {
161       return serviceRegistry.createJob(JOB_TYPE, Operation.Extract.toString(),
162               Arrays.asList(MediaPackageElementParser.getAsXml(image)), analysisJobLoad);
163     } catch (ServiceRegistryException e) {
164       throw new TextAnalyzerException("Unable to create job", e);
165     }
166   }
167 
168   /**
169    * Starts text extraction on the image and returns a receipt containing the final result in the form of an
170    * Mpeg7Catalog.
171    *
172    * @param image
173    *          the element to analyze
174    * @param block
175    *          <code>true</code> to make this operation synchronous
176    * @return a receipt containing the resulting mpeg-7 catalog
177    * @throws TextAnalyzerException
178    */
179   private Catalog extract(Job job, Attachment image) throws TextAnalyzerException, MediaPackageException {
180 
181     final Attachment attachment = image;
182     final URI imageUrl = attachment.getURI();
183 
184     File imageFile = null;
185     try {
186       Mpeg7CatalogImpl mpeg7 = Mpeg7CatalogImpl.newInstance();
187 
188       logger.info("Starting text extraction from {}", imageUrl);
189       try {
190         imageFile = workspace.get(imageUrl);
191       } catch (NotFoundException e) {
192         throw new TextAnalyzerException("Image " + imageUrl + " not found in workspace", e);
193       } catch (IOException e) {
194         throw new TextAnalyzerException("Unable to access " + imageUrl + " in workspace", e);
195       }
196       VideoText[] videoTexts = analyze(imageFile, image.getIdentifier());
197 
198       // Create a temporal decomposition
199       MediaTime mediaTime = new MediaTimeImpl(0, 0);
200       Video avContent = mpeg7.addVideoContent(image.getIdentifier(), mediaTime, null);
201       TemporalDecomposition<VideoSegment> temporalDecomposition = (TemporalDecomposition<VideoSegment>) avContent
202               .getTemporalDecomposition();
203 
204       // Add a segment
205       VideoSegment videoSegment = temporalDecomposition.createSegment("segment-0");
206       videoSegment.setMediaTime(mediaTime);
207 
208       // Add the video text to the spacio temporal decomposition of the segment
209       SpatioTemporalDecomposition spatioTemporalDecomposition = videoSegment.createSpatioTemporalDecomposition(true,
210               false);
211       for (VideoText videoText : videoTexts) {
212         spatioTemporalDecomposition.addVideoText(videoText);
213       }
214 
215       logger.info("Text extraction of {} finished, {} lines found", attachment.getURI(), videoTexts.length);
216 
217       URI uri;
218       InputStream in;
219       try {
220         in = mpeg7CatalogService.serialize(mpeg7);
221       } catch (IOException e) {
222         throw new TextAnalyzerException("Error serializing mpeg7", e);
223       }
224       try {
225         uri = workspace.putInCollection(COLLECTION_ID, job.getId() + ".xml", in);
226       } catch (IOException e) {
227         throw new TextAnalyzerException("Unable to put mpeg7 into the workspace", e);
228       }
229       Catalog catalog = (Catalog) MediaPackageElementBuilderFactory.newInstance().newElementBuilder()
230               .newElement(Catalog.TYPE, MediaPackageElements.TEXTS);
231       catalog.setURI(uri);
232 
233       logger.debug("Created MPEG7 catalog for {}", imageUrl);
234 
235       return catalog;
236     } catch (Exception e) {
237       logger.warn("Error extracting text from " + imageUrl, e);
238       if (e instanceof TextAnalyzerException) {
239         throw (TextAnalyzerException) e;
240       } else {
241         throw new TextAnalyzerException(e);
242       }
243     } finally {
244       try {
245         workspace.delete(imageUrl);
246       } catch (Exception e) {
247         logger.warn("Unable to delete temporary text analysis image {}", imageUrl, e);
248       }
249     }
250   }
251 
252   /**
253    * {@inheritDoc}
254    *
255    * @see org.opencastproject.job.api.AbstractJobProducer#process(org.opencastproject.job.api.Job)
256    */
257   @Override
258   protected String process(Job job) throws Exception {
259     Operation op = null;
260     String operation = job.getOperation();
261     List<String> arguments = job.getArguments();
262     try {
263       op = Operation.valueOf(operation);
264       switch (op) {
265         case Extract:
266           Attachment element = (Attachment) MediaPackageElementParser.getFromXml(arguments.get(0));
267           Catalog catalog = extract(job, element);
268           return MediaPackageElementParser.getAsXml(catalog);
269         default:
270           throw new IllegalStateException("Don't know how to handle operation '" + operation + "'");
271       }
272     } catch (IllegalArgumentException e) {
273       throw new ServiceRegistryException("This service can't handle operations of type '" + op + "'", e);
274     } catch (IndexOutOfBoundsException e) {
275       throw new ServiceRegistryException("This argument list for operation '" + op + "' does not meet expectations", e);
276     } catch (Exception e) {
277       throw new ServiceRegistryException("Error handling operation '" + op + "'", e);
278     }
279   }
280 
281   /**
282    * Returns the video text element for the given image.
283    *
284    * @param imageFile
285    *          the image
286    * @param id
287    *          the video text id
288    * @return the video text found on the image
289    * @throws TextAnalyzerException
290    *           if accessing the image fails
291    */
292   protected VideoText[] analyze(File imageFile, String id) throws TextAnalyzerException {
293 
294     /* Call the text extractor implementation to extract the text from the
295      * provided image file */
296     List<VideoText> videoTexts = new ArrayList<VideoText>();
297     List<String> extractedText;
298     try {
299       extractedText = textExtractor.extract(imageFile);
300     } catch (IOException | TextExtractorException e) {
301       logger.warn("Error extracting text from {}", imageFile, e);
302       throw new TextAnalyzerException(e);
303     }
304 
305     /* Get detected text as raw string */
306     int i = 1;
307     for (String line : extractedText) {
308       VideoText videoText = new VideoTextImpl(id + "-" + i++);
309       Textual text = dictionaryService.cleanUpText(line);
310       if (text != null) {
311         videoText.setText(text);
312         videoTexts.add(videoText);
313       }
314     }
315 
316 
317     return videoTexts.toArray(new VideoText[0]);
318   }
319 
320   /**
321    * Sets the receipt service
322    *
323    * @param serviceRegistry
324    *          the service registry
325    */
326   @Reference
327   protected void setServiceRegistry(ServiceRegistry serviceRegistry) {
328     this.serviceRegistry = serviceRegistry;
329   }
330 
331   /**
332    * {@inheritDoc}
333    *
334    * @see org.opencastproject.job.api.AbstractJobProducer#getServiceRegistry()
335    */
336   @Override
337   protected ServiceRegistry getServiceRegistry() {
338     return serviceRegistry;
339   }
340 
341   /**
342    * Sets the text extractor.
343    *
344    * @param textExtractor
345    *          a text extractor implementation
346    */
347   @Reference
348   protected void setTextExtractor(TextExtractor textExtractor) {
349     this.textExtractor = textExtractor;
350   }
351 
352   /**
353    * Sets the workspace
354    *
355    * @param workspace
356    *          an instance of the workspace
357    */
358   @Reference
359   protected void setWorkspace(Workspace workspace) {
360     this.workspace = workspace;
361   }
362 
363   /**
364    * Sets the mpeg7CatalogService
365    *
366    * @param mpeg7CatalogService
367    *          an instance of the mpeg7 catalog service
368    */
369   @Reference(name = "mpeg7service")
370   protected void setMpeg7CatalogService(Mpeg7CatalogService mpeg7CatalogService) {
371     this.mpeg7CatalogService = mpeg7CatalogService;
372   }
373 
374   /**
375    * Sets the dictionary service
376    *
377    * @param dictionaryService
378    *          an instance of the dicitonary service
379    */
380   @Reference
381   protected void setDictionaryService(DictionaryService dictionaryService) {
382     this.dictionaryService = dictionaryService;
383   }
384 
385   /**
386    * Callback for setting the security service.
387    *
388    * @param securityService
389    *          the securityService to set
390    */
391   @Reference
392   public void setSecurityService(SecurityService securityService) {
393     this.securityService = securityService;
394   }
395 
396   /**
397    * Callback for setting the user directory service.
398    *
399    * @param userDirectoryService
400    *          the userDirectoryService to set
401    */
402   @Reference
403   public void setUserDirectoryService(UserDirectoryService userDirectoryService) {
404     this.userDirectoryService = userDirectoryService;
405   }
406 
407   /**
408    * Sets a reference to the organization directory service.
409    *
410    * @param organizationDirectory
411    *          the organization directory
412    */
413   @Reference
414   public void setOrganizationDirectoryService(OrganizationDirectoryService organizationDirectory) {
415     this.organizationDirectoryService = organizationDirectory;
416   }
417 
418   /**
419    * {@inheritDoc}
420    *
421    * @see org.opencastproject.job.api.AbstractJobProducer#getSecurityService()
422    */
423   @Override
424   protected SecurityService getSecurityService() {
425     return securityService;
426   }
427 
428   /**
429    * {@inheritDoc}
430    *
431    * @see org.opencastproject.job.api.AbstractJobProducer#getUserDirectoryService()
432    */
433   @Override
434   protected UserDirectoryService getUserDirectoryService() {
435     return userDirectoryService;
436   }
437 
438   /**
439    * {@inheritDoc}
440    *
441    * @see org.opencastproject.job.api.AbstractJobProducer#getOrganizationDirectoryService()
442    */
443   @Override
444   protected OrganizationDirectoryService getOrganizationDirectoryService() {
445     return organizationDirectoryService;
446   }
447 
448   @Override
449   public void updated(@SuppressWarnings("rawtypes") Dictionary properties) throws ConfigurationException {
450     analysisJobLoad = LoadUtil.getConfiguredLoadValue(properties, ANALYSIS_JOB_LOAD_KEY, DEFAULT_ANALYSIS_JOB_LOAD,
451             serviceRegistry);
452   }
453 
454   @Reference(target = "(artifact=dictionary)")
455   public void setReadinessIndicator(ReadinessIndicator readinessIndicator) {
456     //Only activate service if ReadinessIndicator is registered.
457   }
458 }