TesseractTextExtractor.java

/*
 * Licensed to The Apereo Foundation under one or more contributor license
 * agreements. See the NOTICE file distributed with this work for additional
 * information regarding copyright ownership.
 *
 *
 * The Apereo Foundation licenses this file to you under the Educational
 * Community License, Version 2.0 (the "License"); you may not use this file
 * except in compliance with the License. You may obtain a copy of the License
 * at:
 *
 *   http://opensource.org/licenses/ecl2.txt
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
 * License for the specific language governing permissions and limitations under
 * the License.
 *
 */


package org.opencastproject.textextractor.tesseract;

import static java.nio.charset.StandardCharsets.UTF_8;

import org.opencastproject.textextractor.api.TextExtractor;
import org.opencastproject.textextractor.api.TextExtractorException;

import org.apache.commons.io.FileUtils;
import org.apache.commons.io.FilenameUtils;
import org.apache.commons.lang3.StringUtils;
import org.osgi.service.cm.ManagedService;
import org.osgi.service.component.ComponentContext;
import org.osgi.service.component.annotations.Component;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Dictionary;
import java.util.List;

/**
 * Command line wrapper around the <code>tesseract</code> OCR command.
 * <p>
 * The binary and additional command line options can be supplied through the constructor/setter, through bundle
 * context properties read in {@link #activate(ComponentContext)}, or through configuration updates delivered to
 * {@link #updated(Dictionary)}.
 */
@Component(
    immediate = true,
    service = { TextExtractor.class,ManagedService.class },
    property = {
        "service.description=Tesseract Text Extractor"
    }
)
public class TesseractTextExtractor implements TextExtractor, ManagedService {

  /** The logging facility */
  private static final Logger logger = LoggerFactory.getLogger(TesseractTextExtractor.class);

  /** Default name of the tesseract binary */
  public static final String TESSERACT_BINARY_DEFAULT = "tesseract";

  /** Configuration property that defines the path to the tesseract binary */
  public static final String TESSERACT_BINARY_CONFIG_KEY =
      "org.opencastproject.textanalyzer.tesseract.path";

  /** Configuration property that defines additional tesseract options like the
   * language or the pagesegmode to use. This is just appended to the command
   * line when tesseract is called. */
  public static final String TESSERACT_OPTS_CONFIG_KEY =
      "org.opencastproject.textanalyzer.tesseract.options";

  /** Binary of the tesseract command */
  private String binary;

  /** Additional options for the tesseract command */
  private String addOptions = "";

  /**
   * Prefixes of (trimmed) tesseract output lines that are considered noise. Lines starting with one of these are
   * logged at debug level instead of info level.
   */
  private static final List<String> stderrFilter = Arrays.asList(
          "Page",
          "Tesseract Open Source OCR Engine",
          "Warning: Invalid resolution 0 dpi. Using 70 instead.",
          "Estimating resolution as ");

  /**
   * Creates a new tesseract command wrapper that will be using the default binary.
   */
  public TesseractTextExtractor() {
    this(TESSERACT_BINARY_DEFAULT);
  }

  /**
   * Creates a new tesseract command wrapper that will be using the given binary.
   *
   * @param binary
   *          the tesseract binary
   */
  public TesseractTextExtractor(String binary) {
    this.binary = binary;
  }

  /**
   * Sets additional options for tesseract calls. The string is split on whitespace and each token is appended to the
   * tesseract command line as a separate argument.
   *
   * @param addOptions
   *          the additional options; <code>null</code> is treated like no options when the command is built
   */
  public void setAdditionalOptions(String addOptions) {
    this.addOptions = addOptions;
  }

  /**
   * Returns the additional options for tesseract.
   *
   * @return additional options
   */
  public String getAdditionalOptions() {
    return addOptions;
  }

  /**
   * {@inheritDoc}
   * <p>
   * Runs tesseract on the image, logs the output of the process, and returns the non-empty, trimmed lines of the
   * text file tesseract wrote next to the image. That text file is removed again before this method returns.
   *
   * @throws IllegalStateException
   *           if no binary is set
   * @throws TextExtractorException
   *           if tesseract cannot be run, exits with a non-zero status, its output cannot be read, or the calling
   *           thread is interrupted while waiting (the interrupt flag is restored in that case)
   * @see org.opencastproject.textextractor.api.TextExtractor#extract(java.io.File)
   */
  @Override
  public List<String> extract(File image) throws TextExtractorException {
    if (binary == null) {
      throw new IllegalStateException("Binary is not set");
    }

    File outputFile = null;
    Process tesseractProcess = null;
    File outputFileBase = new File(image.getParentFile(), FilenameUtils.getBaseName(image.getName()));
    // Run tesseract
    List<String> command = getTesseractCommand(image, outputFileBase);
    logger.info("Running Tesseract: {}", command);
    try {
      ProcessBuilder processBuilder = new ProcessBuilder(command);
      processBuilder.redirectErrorStream(true);
      // Mitigation for new Tesseract 4.x spawning too many threads locking up the system. Limit to one thread.
      processBuilder.environment().put("OMP_THREAD_LIMIT", "1");
      tesseractProcess = processBuilder.start();

      // Once the process has been started it may write its result file, so remember the file now to make sure it
      // is cleaned up even if tesseract fails or we get interrupted.
      outputFile = new File(outputFileBase.getAbsolutePath() + ".txt");

      // Drain the (merged) process output before waiting, so the process cannot block on a full pipe
      logProcessOutput(tesseractProcess);

      // wait until the task is finished
      int exitCode = tesseractProcess.waitFor();
      if (exitCode != 0) {
        throw new TextExtractorException("Tesseract exited abnormally with status " + exitCode);
      }

      return readOutputFile(outputFile);
    } catch (IOException e) {
      throw new TextExtractorException("Error running text extractor " + binary, e);
    } catch (InterruptedException e) {
      // Restore the interrupt flag so callers further up the stack can still react to the interruption
      Thread.currentThread().interrupt();
      throw new TextExtractorException("Error running text extractor " + binary, e);
    } finally {
      // Do not leave a running child process behind if we bail out early
      if (tesseractProcess != null && tesseractProcess.isAlive()) {
        tesseractProcess.destroy();
      }
      FileUtils.deleteQuietly(outputFile);
    }
  }

  /**
   * Reads the output of the given process line by line and forwards it to the logger. Lines starting with one of the
   * prefixes in {@link #stderrFilter} are logged at debug level, all other lines at info level.
   *
   * @param process
   *          the running tesseract process
   * @throws IOException
   *           if reading the process output fails
   */
  private void logProcessOutput(final Process process) throws IOException {
    try (BufferedReader in = new BufferedReader(new InputStreamReader(process.getInputStream(), UTF_8))) {
      String line;
      while ((line = in.readLine()) != null) {
        final String trimmedLine = line.trim();
        if (stderrFilter.stream().noneMatch(trimmedLine::startsWith)) {
          logger.info(line);
        } else {
          logger.debug(line);
        }
      }
    }
  }

  /**
   * Reads the given text file as UTF-8 and returns all of its lines, trimmed, with empty lines left out.
   *
   * @param outputFile
   *          the text file written by tesseract
   * @return the non-empty, trimmed lines of the file
   * @throws IOException
   *           if the file cannot be read
   */
  private List<String> readOutputFile(final File outputFile) throws IOException {
    List<String> output = new ArrayList<>();
    try (BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(outputFile), UTF_8))) {
      String line;
      while ((line = reader.readLine()) != null) {
        final String trimmedLine = line.trim();
        if (!trimmedLine.isEmpty()) {
          output.add(trimmedLine);
        }
      }
    }
    return output;
  }

  /**
   * Generate the command line to run Tesseract
   *
   * @param image
   *          the image file
   * @param outputFile
   *          base name of output file. Tesseract will attach <code>.txt</code>
   * @return the command line to run Tesseract on the given input file
   */
  private List<String> getTesseractCommand(final File image, final File outputFile) {
    List<String> args = new ArrayList<>();
    args.add(binary);
    args.add(image.getAbsolutePath());
    args.add(outputFile.getAbsolutePath());
    // StringUtils.split returns null for a null input, which Arrays.asList would reject
    String[] extraArgs = StringUtils.split(addOptions);
    if (extraArgs != null) {
      args.addAll(Arrays.asList(extraArgs));
    }
    return args;
  }

  /**
   * Applies an updated configuration. Only keys present in the dictionary are applied; settings for absent keys
   * keep their current value.
   *
   * @param properties
   *          the configuration properties, may be <code>null</code> in which case the current settings are kept
   */
  @Override
  public void updated(Dictionary properties) {
    if (properties == null) {
      logger.debug("No configuration available for Tesseract, keeping current settings");
      return;
    }
    String path = (String) properties.get(TESSERACT_BINARY_CONFIG_KEY);
    if (path != null) {
      logger.info("Setting Tesseract path to {}", path);
      this.binary = path;
    }
    /* Set additional options for tesseract (i.e. language to use) */
    String addopts = (String) properties.get(TESSERACT_OPTS_CONFIG_KEY);
    if (addopts != null) {
      logger.info("Setting additional options for Tesseract to '{}'", addopts);
      this.addOptions = addopts;
    }
  }

  /**
   * Initializes the binary path and the additional options from the bundle context properties. If no path is
   * configured, the binary set at construction time is kept; if no options are configured, they are reset to the
   * empty string.
   *
   * @param cc
   *          the component context
   */
  public void activate(ComponentContext cc) {
    // Configure tesseract
    String path = cc.getBundleContext().getProperty(TESSERACT_BINARY_CONFIG_KEY);
    if (path == null) {
      logger.debug("DEFAULT {}: {}", TESSERACT_BINARY_CONFIG_KEY, TESSERACT_BINARY_DEFAULT);
    } else {
      this.binary = path;
      logger.info("Setting Tesseract path to binary from config: {}", path);
    }
    /* Set additional options for tesseract (i.e. language to use) */
    String addopts = cc.getBundleContext().getProperty(TESSERACT_OPTS_CONFIG_KEY);
    if (addopts != null) {
      logger.info("Setting additional options for Tesseract to '{}'", addopts);
      this.addOptions = addopts;
    } else {
      logger.info("No additional options for Tesseract");
      this.addOptions = "";
    }
  }

}