View Javadoc
1   /*
2    * Licensed to The Apereo Foundation under one or more contributor license
3    * agreements. See the NOTICE file distributed with this work for additional
4    * information regarding copyright ownership.
5    *
6    *
7    * The Apereo Foundation licenses this file to you under the Educational
8    * Community License, Version 2.0 (the "License"); you may not use this file
9    * except in compliance with the License. You may obtain a copy of the License
10   * at:
11   *
12   *   http://opensource.org/licenses/ecl2.txt
13   *
14   * Unless required by applicable law or agreed to in writing, software
15   * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
16   * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
17   * License for the specific language governing permissions and limitations under
18   * the License.
19   *
20   */
21  
22  
23  package org.opencastproject.textextractor.tesseract;
24  
25  import static java.nio.charset.StandardCharsets.UTF_8;
26  
27  import org.opencastproject.textextractor.api.TextExtractor;
28  import org.opencastproject.textextractor.api.TextExtractorException;
29  
30  import org.apache.commons.io.FileUtils;
31  import org.apache.commons.io.FilenameUtils;
32  import org.apache.commons.lang3.StringUtils;
33  import org.osgi.service.cm.ManagedService;
34  import org.osgi.service.component.ComponentContext;
35  import org.osgi.service.component.annotations.Component;
36  import org.slf4j.Logger;
37  import org.slf4j.LoggerFactory;
38  
39  import java.io.BufferedReader;
40  import java.io.File;
41  import java.io.FileInputStream;
42  import java.io.IOException;
43  import java.io.InputStreamReader;
44  import java.util.ArrayList;
45  import java.util.Arrays;
46  import java.util.Dictionary;
47  import java.util.List;
48  
49  /**
50   * Commandline wrapper around tesseract' <code>tesseract</code> command.
51   */
52  @Component(
53      immediate = true,
54      service = { TextExtractor.class,ManagedService.class },
55      property = {
56          "service.description=Tesseract Text Extractor"
57      }
58  )
59  public class TesseractTextExtractor implements TextExtractor, ManagedService {
60  
61    /** The logging facility */
62    private static final Logger logger = LoggerFactory.getLogger(TesseractTextExtractor.class);
63  
64    /** Default name of the tesseract binary */
65    public static final String TESSERACT_BINARY_DEFAULT = "tesseract";
66  
67    /** Configuration property that defines the path to the tesseract binary */
68    public static final String TESSERACT_BINARY_CONFIG_KEY =
69        "org.opencastproject.textanalyzer.tesseract.path";
70  
71    /** Configuration property that defines additional tesseract options like the
72     * language or the pagesegmode to use. This is just appended to the command
73     * line when tesseract is called. */
74    public static final String TESSERACT_OPTS_CONFIG_KEY =
75        "org.opencastproject.textanalyzer.tesseract.options";
76  
77    /** Binary of the tesseract command */
78    private String binary;
79  
80    /** Additional options for the tesseract command */
81    private String addOptions = "";
82  
83    /** Tesseract stderr lines not to log */
84    private static final List<String> stderrFilter = java.util.Arrays.asList(
85            "Page",
86            "Tesseract Open Source OCR Engine",
87            "Warning: Invalid resolution 0 dpi. Using 70 instead.",
88            "Estimating resolution as ");
89  
90    /**
91     * Creates a new tesseract command wrapper that will be using the default binary.
92     */
93    public TesseractTextExtractor() {
94      this(TESSERACT_BINARY_DEFAULT);
95    }
96  
97    /**
98     * Creates a new tesseract command wrapper that will be using the given binary.
99     *
100    * @param binary
101    *          the tesseract binary
102    */
103   public TesseractTextExtractor(String binary) {
104     this.binary = binary;
105   }
106 
107   /**
108    * Sets additional options for tesseract calls.
109    *
110    * @param addOptions
111    */
112   public void setAdditionalOptions(String addOptions) {
113     this.addOptions = addOptions;
114   }
115 
116   /**
117    * Returns the additional options for tesseract..
118    *
119    * @return additional options
120    */
121   public String getAdditionalOptions() {
122     return addOptions;
123   }
124 
125   /**
126    * {@inheritDoc}
127    *
128    * @see org.opencastproject.textextractor.api.TextExtractor#extract(java.io.File)
129    */
130   @Override
131   public List<String> extract(File image) throws TextExtractorException {
132     if (binary == null) {
133       throw new IllegalStateException("Binary is not set");
134     }
135 
136     File outputFile = null;
137     File outputFileBase = new File(image.getParentFile(), FilenameUtils.getBaseName(image.getName()));
138     // Run tesseract
139     List<String> command = getTesseractCommand(image, outputFileBase);
140     logger.info("Running Tesseract: {}", command);
141     try {
142       ProcessBuilder processBuilder = new ProcessBuilder(command);
143       processBuilder.redirectErrorStream(true);
144       // Mitigation for new Tesseract 4.x spawning too many threads locking up the system. Limit to one thread.
145       processBuilder.environment().put("OMP_THREAD_LIMIT", "1");
146       Process tesseractProcess = processBuilder.start();
147 
148       // listen to output
149       try (BufferedReader in = new BufferedReader(new InputStreamReader(tesseractProcess.getInputStream()))) {
150         String line;
151         while ((line = in.readLine()) != null) {
152           final String trimmedLine = line.trim();
153           if (stderrFilter.parallelStream().noneMatch(trimmedLine::startsWith)) {
154             logger.info(line);
155           } else {
156             logger.debug(line);
157           }
158         }
159       }
160 
161       // wait until the task is finished
162       int exitCode = tesseractProcess.waitFor();
163       if (exitCode != 0) {
164         throw new TextExtractorException("Tesseract exited abnormally with status " + exitCode);
165       }
166 
167       // Read the tesseract output file
168       outputFile = new File(outputFileBase.getAbsolutePath() + ".txt");
169       ArrayList<String> output = new ArrayList<>();
170       try (BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(outputFile), UTF_8))) {
171         String line;
172         while ((line = reader.readLine()) != null) {
173           final String trimmedLine = line.trim();
174           if (!trimmedLine.isEmpty()) {
175             output.add(trimmedLine);
176           }
177         }
178       }
179       return output;
180     } catch (IOException | InterruptedException e) {
181       throw new TextExtractorException("Error running text extractor " + binary, e);
182     } finally {
183       FileUtils.deleteQuietly(outputFile);
184     }
185   }
186 
187   /**
188    * Generate the command line to run Tesseract
189    *
190    * @param image
191    *          the image file
192    * @param outputFile
193    *          base name of output file. Tesseract will attach <code>.txt</code>
194    * @return the command line to runn Tesseract on the given input file
195    */
196   private List<String> getTesseractCommand(final File image, final File outputFile) {
197     List<String> args = new ArrayList<>();
198     args.add(binary);
199     args.add(image.getAbsolutePath());
200     args.add(outputFile.getAbsolutePath());
201     args.addAll(Arrays.asList(StringUtils.split(addOptions)));
202     return args;
203   }
204 
205   @Override
206   public void updated(Dictionary properties) {
207     String path = (String) properties.get(TESSERACT_BINARY_CONFIG_KEY);
208     if (path != null) {
209       logger.info("Setting Tesseract path to {}", path);
210       this.binary = path;
211     }
212     /* Set additional options for tesseract (i.e. language to use) */
213     String addopts = (String) properties.get(TESSERACT_OPTS_CONFIG_KEY);
214     if (addopts != null) {
215       logger.info("Setting additional options for Tesseract path to '{}'", addopts);
216       this.addOptions = addopts;
217     }
218   }
219 
220   public void activate(ComponentContext cc) {
221     // Configure ffmpeg
222     String path = cc.getBundleContext().getProperty(TESSERACT_BINARY_CONFIG_KEY);
223     if (path == null) {
224       logger.debug("DEFAULT " + TESSERACT_BINARY_CONFIG_KEY + ": " + TESSERACT_BINARY_DEFAULT);
225     } else {
226       this.binary = path;
227       logger.info("Setting Tesseract path to binary from config: {}", path);
228     }
229     /* Set additional options for tesseract (i.e. language to use) */
230     String addopts = cc.getBundleContext().getProperty(TESSERACT_OPTS_CONFIG_KEY);
231     if (addopts != null) {
232       logger.info("Setting additional options for Tesseract to '{}'", addopts);
233       this.addOptions = addopts;
234     } else {
235       logger.info("No additional options for Tesseract");
236       this.addOptions = "";
237     }
238   }
239 
240 }