1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23 package org.opencastproject.textextractor.tesseract;
24
25 import static java.nio.charset.StandardCharsets.UTF_8;
26
27 import org.opencastproject.textextractor.api.TextExtractor;
28 import org.opencastproject.textextractor.api.TextExtractorException;
29
30 import org.apache.commons.io.FileUtils;
31 import org.apache.commons.io.FilenameUtils;
32 import org.apache.commons.lang3.StringUtils;
33 import org.osgi.service.cm.ManagedService;
34 import org.osgi.service.component.ComponentContext;
35 import org.osgi.service.component.annotations.Component;
36 import org.slf4j.Logger;
37 import org.slf4j.LoggerFactory;
38
39 import java.io.BufferedReader;
40 import java.io.File;
41 import java.io.FileInputStream;
42 import java.io.IOException;
43 import java.io.InputStreamReader;
44 import java.util.ArrayList;
45 import java.util.Arrays;
46 import java.util.Dictionary;
47 import java.util.List;
48
49
50
51
52 @Component(
53 immediate = true,
54 service = { TextExtractor.class,ManagedService.class },
55 property = {
56 "service.description=Tesseract Text Extractor"
57 }
58 )
59 public class TesseractTextExtractor implements TextExtractor, ManagedService {
60
61
62 private static final Logger logger = LoggerFactory.getLogger(TesseractTextExtractor.class);
63
64
65 public static final String TESSERACT_BINARY_DEFAULT = "tesseract";
66
67
68 public static final String TESSERACT_BINARY_CONFIG_KEY =
69 "org.opencastproject.textanalyzer.tesseract.path";
70
71
72
73
74 public static final String TESSERACT_OPTS_CONFIG_KEY =
75 "org.opencastproject.textanalyzer.tesseract.options";
76
77
78 private String binary;
79
80
81 private String addOptions = "";
82
83
84 private static final List<String> stderrFilter = java.util.Arrays.asList(
85 "Page",
86 "Tesseract Open Source OCR Engine",
87 "Warning: Invalid resolution 0 dpi. Using 70 instead.",
88 "Estimating resolution as ");
89
90
91
92
93 public TesseractTextExtractor() {
94 this(TESSERACT_BINARY_DEFAULT);
95 }
96
97
98
99
100
101
102
103 public TesseractTextExtractor(String binary) {
104 this.binary = binary;
105 }
106
107
108
109
110
111
112 public void setAdditionalOptions(String addOptions) {
113 this.addOptions = addOptions;
114 }
115
116
117
118
119
120
121 public String getAdditionalOptions() {
122 return addOptions;
123 }
124
125
126
127
128
129
130 @Override
131 public List<String> extract(File image) throws TextExtractorException {
132 if (binary == null) {
133 throw new IllegalStateException("Binary is not set");
134 }
135
136 File outputFile = null;
137 File outputFileBase = new File(image.getParentFile(), FilenameUtils.getBaseName(image.getName()));
138
139 List<String> command = getTesseractCommand(image, outputFileBase);
140 logger.info("Running Tesseract: {}", command);
141 try {
142 ProcessBuilder processBuilder = new ProcessBuilder(command);
143 processBuilder.redirectErrorStream(true);
144
145 processBuilder.environment().put("OMP_THREAD_LIMIT", "1");
146 Process tesseractProcess = processBuilder.start();
147
148
149 try (BufferedReader in = new BufferedReader(new InputStreamReader(tesseractProcess.getInputStream()))) {
150 String line;
151 while ((line = in.readLine()) != null) {
152 final String trimmedLine = line.trim();
153 if (stderrFilter.parallelStream().noneMatch(trimmedLine::startsWith)) {
154 logger.info(line);
155 } else {
156 logger.debug(line);
157 }
158 }
159 }
160
161
162 int exitCode = tesseractProcess.waitFor();
163 if (exitCode != 0) {
164 throw new TextExtractorException("Tesseract exited abnormally with status " + exitCode);
165 }
166
167
168 outputFile = new File(outputFileBase.getAbsolutePath() + ".txt");
169 ArrayList<String> output = new ArrayList<>();
170 try (BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(outputFile), UTF_8))) {
171 String line;
172 while ((line = reader.readLine()) != null) {
173 final String trimmedLine = line.trim();
174 if (!trimmedLine.isEmpty()) {
175 output.add(trimmedLine);
176 }
177 }
178 }
179 return output;
180 } catch (IOException | InterruptedException e) {
181 throw new TextExtractorException("Error running text extractor " + binary, e);
182 } finally {
183 FileUtils.deleteQuietly(outputFile);
184 }
185 }
186
187
188
189
190
191
192
193
194
195
196 private List<String> getTesseractCommand(final File image, final File outputFile) {
197 List<String> args = new ArrayList<>();
198 args.add(binary);
199 args.add(image.getAbsolutePath());
200 args.add(outputFile.getAbsolutePath());
201 args.addAll(Arrays.asList(StringUtils.split(addOptions)));
202 return args;
203 }
204
205 @Override
206 public void updated(Dictionary properties) {
207 String path = (String) properties.get(TESSERACT_BINARY_CONFIG_KEY);
208 if (path != null) {
209 logger.info("Setting Tesseract path to {}", path);
210 this.binary = path;
211 }
212
213 String addopts = (String) properties.get(TESSERACT_OPTS_CONFIG_KEY);
214 if (addopts != null) {
215 logger.info("Setting additional options for Tesseract path to '{}'", addopts);
216 this.addOptions = addopts;
217 }
218 }
219
220 public void activate(ComponentContext cc) {
221
222 String path = cc.getBundleContext().getProperty(TESSERACT_BINARY_CONFIG_KEY);
223 if (path == null) {
224 logger.debug("DEFAULT " + TESSERACT_BINARY_CONFIG_KEY + ": " + TESSERACT_BINARY_DEFAULT);
225 } else {
226 this.binary = path;
227 logger.info("Setting Tesseract path to binary from config: {}", path);
228 }
229
230 String addopts = cc.getBundleContext().getProperty(TESSERACT_OPTS_CONFIG_KEY);
231 if (addopts != null) {
232 logger.info("Setting additional options for Tesseract to '{}'", addopts);
233 this.addOptions = addopts;
234 } else {
235 logger.info("No additional options for Tesseract");
236 this.addOptions = "";
237 }
238 }
239
240 }