// org.apache.tika.parser.ocr.TesseractOCRParser (source listing: Maven / Gradle / Ivy)
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.ocr;
import static java.nio.charset.StandardCharsets.UTF_8;
import static org.apache.tika.sax.XHTMLContentHandler.XHTML;
import java.awt.Image;
import java.awt.image.BufferedImage;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.Reader;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.StandardCopyOption;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.concurrent.TimeUnit;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.imageio.ImageIO;
import org.apache.commons.io.FilenameUtils;
import org.apache.commons.io.IOUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;
import org.xml.sax.helpers.DefaultHandler;
import org.apache.tika.config.Field;
import org.apache.tika.config.Initializable;
import org.apache.tika.config.InitializableProblemHandler;
import org.apache.tika.config.Param;
import org.apache.tika.config.TikaTaskTimeout;
import org.apache.tika.exception.TikaConfigException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TemporaryResources;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Property;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractExternalProcessParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.external.ExternalParser;
import org.apache.tika.sax.XHTMLContentHandler;
import org.apache.tika.utils.StringUtils;
import org.apache.tika.utils.XMLReaderUtils;
/**
 * TesseractOCRParser powered by the tesseract-ocr engine. To enable this parser,
 * create a {@link TesseractOCRConfig} object and pass it through a
 * ParseContext. Tesseract-ocr must be installed and on the system path, or the path
 * to its root folder must be provided:
 * <pre>{@code
 * TesseractOCRConfig config = new TesseractOCRConfig();
 * //Needed if tesseract is not on system path
 * config.setTesseractPath(tesseractFolder);
 * parseContext.set(TesseractOCRConfig.class, config);
 * }</pre>
 */
public class TesseractOCRParser extends AbstractExternalProcessParser implements Initializable {
/** Prefix shared by every metadata key defined in this class. */
public static final String TESS_META = "tess:";
public static final Property IMAGE_ROTATION = Property.externalRealSeq(TESS_META + "rotation");
public static final Property IMAGE_MAGICK =
        Property.externalBooleanSeq(TESS_META + "image_magick_processed");
/** Name of the environment variable set on the tesseract process (see setEnv). */
private static final String TESSDATA_PREFIX = "TESSDATA_PREFIX";
// Metadata keys populated from tesseract's output when the page segmentation mode is "0".
public static final Property
        PSM0_PAGE_NUMBER = Property.externalInteger(TESS_META + "page_number");
public static final Property
        PSM0_ORIENTATION = Property.externalInteger(TESS_META + "orientation");
public static final Property PSM0_ROTATE = Property.externalInteger(TESS_META + "rotate");
public static final Property PSM0_ORIENTATION_CONFIDENCE = Property.externalReal(TESS_META +
        "orientation_confidence");
public static final Property PSM0_SCRIPT = Property.externalText(TESS_META + "script");
public static final Property PSM0_SCRIPT_CONFIDENCE = Property.externalReal(TESS_META +
        "script_confidence");
/** Prefix of the "ocr-" flavoured image subtypes. */
private static final String OCR = "ocr-";
private static final Logger LOG = LoggerFactory.getLogger(TesseractOCRParser.class);
private static final Object[] LOCK = new Object[0];
private static final long serialVersionUID = -8167538283213097265L;
/** Media types offered by getSupportedTypes when tesseract is available. */
private static final Set<MediaType> SUPPORTED_TYPES = Collections.unmodifiableSet(new HashSet<>(
        Arrays.asList(
                MediaType.image(OCR + "png"), MediaType.image(OCR + "jpeg"),
                MediaType.image(OCR + "tiff"), MediaType.image(OCR + "bmp"),
                MediaType.image(OCR + "gif"),
                //these are not currently covered by other parsers
                MediaType.image("jp2"), MediaType.image("jpx"),
                MediaType.image("x-portable-pixmap"),
                //add the ocr- versions as well
                MediaType.image(OCR + "jp2"), MediaType.image(OCR + "jpx"),
                MediaType.image(OCR + "x-portable-pixmap"))));
// Both flags are static, i.e. shared by every parser instance in the JVM.
private static volatile boolean HAS_WARNED = false;
private static volatile boolean HAS_CHECKED_FOR_IMAGE_MAGICK = false;
//if a user specifies a custom tess path or tessdata path
//load the available languages at initialization time
private final Set<String> langs = new HashSet<>();
private final TesseractOCRConfig defaultConfig = new TesseractOCRConfig();
// The three paths are either empty or end with File.separator (enforced by their setters).
private String tesseractPath = "";
private String tessdataPath = "";
private String imageMagickPath = "";
//if set to true, this will run --list-langs
//at initialization and then check langs
//at parse time
private boolean preloadLangs = false;
private boolean hasTesseract;
private boolean hasImageMagick;
// Created in initialize(); null until then.
private ImagePreprocessor imagePreprocessor;
/**
 * Returns the name of the ImageMagick executable for the current platform.
 *
 * @return {@code "magick"} when the {@code os.name} system property starts with
 *         {@code "Windows"}, otherwise {@code "convert"}
 */
public static String getImageMagickProg() {
    boolean onWindows = System.getProperty("os.name").startsWith("Windows");
    if (onWindows) {
        return "magick";
    }
    return "convert";
}
/**
 * Returns the name of the tesseract executable for the current platform.
 *
 * @return {@code "tesseract.exe"} when the {@code os.name} system property starts with
 *         {@code "Windows"}, otherwise {@code "tesseract"}
 */
public static String getTesseractProg() {
    boolean onWindows = System.getProperty("os.name").startsWith("Windows");
    if (onWindows) {
        return "tesseract.exe";
    }
    return "tesseract";
}
/**
 * Advertises the OCR image types only when tesseract was found and OCR is not
 * disabled for this parse.
 *
 * @param context parse context; may carry a per-parse {@link TesseractOCRConfig}
 * @return the supported media types, or an empty set so that other image parsers
 *         can be selected instead
 */
@Override
public Set<MediaType> getSupportedTypes(ParseContext context) {
    TesseractOCRConfig config = context.get(TesseractOCRConfig.class);
    // A missing per-parse config means "don't skip".
    boolean ocrRequested = config == null || !config.isSkipOcr();
    if (hasTesseract && ocrRequested) {
        return SUPPORTED_TYPES;
    }
    // Otherwise don't advertise anything, so the other image parsers
    // can be selected instead
    return Collections.emptySet();
}
/**
 * Sets TESSDATA_PREFIX in the environment of the process to be launched.
 * An explicit tessdata path wins; otherwise, if a tesseract path is set,
 * "tessdata" is appended to it. With neither set the environment is left alone.
 *
 * @param pb builder whose environment is modified in place
 */
private void setEnv(ProcessBuilder pb) {
    Map<String, String> environment = pb.environment();
    String tessdata = getTessdataPath();
    if (!StringUtils.isBlank(tessdata)) {
        environment.put(TESSDATA_PREFIX, tessdata);
        return;
    }
    String tesseractDir = getTesseractPath();
    if (!StringUtils.isBlank(tesseractDir)) {
        //adding tessdata is required for at least >= 4.x
        environment.put(TESSDATA_PREFIX, tesseractDir + "tessdata");
    }
}
/**
 * Probes for a working tesseract executable at the configured location.
 *
 * @return the result of {@code ExternalParser.check} on the tesseract command
 * @throws TikaConfigException if a non-blank tesseractPath is not an existing directory
 */
public boolean hasTesseract() throws TikaConfigException {
    // Command line built from the configured directory plus the OS-specific program name
    String[] probeCommand = new String[]{getTesseractPath() + getTesseractProg()};

    boolean pathConfigured = !StringUtils.isBlank(tesseractPath);
    if (pathConfigured && !Files.isDirectory(Paths.get(tesseractPath))) {
        throw new TikaConfigException("tesseractPath (" + tesseractPath + ") " +
                "doesn't point to an existing directory");
    }

    // Try running Tesseract from there, and see if it exists + works
    boolean found = ExternalParser.check(probeCommand);
    LOG.debug("hasTesseract (path: " + Arrays.toString(probeCommand) + "): " + found);
    return found;
}
/**
 * Probes for a working ImageMagick executable, at most once per JVM.
 * Once the static checked-flag is set, the value of this instance's
 * {@code hasImageMagick} field is returned instead of probing again.
 * This method does not assign that field itself; callers store the result.
 *
 * @return whether ImageMagick could be run
 * @throws TikaConfigException if a non-blank imageMagickPath is not an existing directory
 */
synchronized boolean hasImageMagick() throws TikaConfigException {
    if (HAS_CHECKED_FOR_IMAGE_MAGICK) {
        return hasImageMagick;
    }
    // Configured directory plus the OS-specific program name
    String magickCommand = imageMagickPath + getImageMagickProg();

    //check that directory exists
    boolean pathConfigured = !StringUtils.isBlank(imageMagickPath);
    if (pathConfigured && !Files.isDirectory(Paths.get(imageMagickPath))) {
        throw new TikaConfigException("imageMagickPath (" + imageMagickPath + ") " +
                "doesn't point to an existing directory");
    }

    // Try running ImageMagick program from there, and see if it exists + works
    boolean found = ExternalParser.check(new String[]{magickCommand});
    if (!found) {
        LOG.debug("ImageMagick does not appear to be installed " + "(commandline: " +
                magickCommand + ")");
    }
    HAS_CHECKED_FOR_IMAGE_MAGICK = true;
    return found;
}
/**
 * Runs OCR on an in-memory image by rendering it to a temporary PNG file and
 * delegating to {@link #parse(InputStream, ContentHandler, Metadata, ParseContext)}.
 *
 * @param image    image to OCR; its width and height are read without an observer
 * @param handler  receives the extracted content
 * @param metadata metadata passed through to the stream-based parse
 * @param context  parse context passed through to the stream-based parse
 * @throws IOException   if the temporary file cannot be written or read
 * @throws SAXException  if the handler fails
 * @throws TikaException if OCR fails
 */
public void parse(Image image, ContentHandler handler, Metadata metadata, ParseContext context)
        throws IOException, SAXException, TikaException {
    try (TemporaryResources tmp = new TemporaryResources()) {
        int w = image.getWidth(null);
        int h = image.getHeight(null);
        BufferedImage bImage = new BufferedImage(w, h, BufferedImage.TYPE_INT_RGB);
        // Copy the pixels of the source image into the buffer; without this the
        // PNG written below would be an empty (all black) canvas.
        java.awt.Graphics2D graphics = bImage.createGraphics();
        try {
            graphics.drawImage(image, 0, 0, null);
        } finally {
            graphics.dispose();
        }
        File file = tmp.createTemporaryFile();
        try (OutputStream fos = new FileOutputStream(file)) {
            ImageIO.write(bImage, "png", fos);
        }
        try (TikaInputStream tis = TikaInputStream.get(file)) {
            parse(tis, handler, metadata, context);
        }
    }
}
/**
 * Entry point for stream-based OCR. Merges any per-parse config into the default
 * config, returns without output when tesseract is unavailable or OCR is skipped,
 * and otherwise spools the stream to a file and hands it to the private parse.
 */
@Override
public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
                  ParseContext parseContext) throws IOException, SAXException, TikaException {
    TesseractOCRConfig perParseConfig = parseContext.get(TesseractOCRConfig.class);
    TesseractOCRConfig config = (perParseConfig == null)
            ? defaultConfig
            : defaultConfig.cloneAndUpdate(perParseConfig);

    // If Tesseract is not on the path with the current config, do not try to run OCR
    // getSupportedTypes shouldn't have listed us as handling it, so this should only
    // occur if someone directly calls this parser, not via DefaultParser or similar
    if (!hasTesseract) {
        return;
    }
    if (config != null && config.isSkipOcr()) {
        return;
    }

    //if you haven't checked yet, and a per file config requests imagemagick
    //and if the default is not to use image processing
    if (!HAS_CHECKED_FOR_IMAGE_MAGICK && config.isEnableImagePreprocessing()) {
        hasImageMagick = hasImageMagick();
    }

    try (TemporaryResources resources = new TemporaryResources()) {
        TikaInputStream tis = TikaInputStream.get(stream, resources, metadata);
        //trigger the spooling to a tmp file if the stream wasn't
        //already a TikaInputStream that contained a file
        tis.getPath();

        //this is the text output file name specified on the tesseract
        //commandline. The actual output file name will have a suffix added.
        File ocrOutputBase = resources.createTemporaryFile();

        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();
        parse(tis, ocrOutputBase, xhtml, metadata, parseContext, config);
        xhtml.endDocument();
    }
}
/**
 * Runs tesseract on the spooled input (optionally after ImageMagick preprocessing)
 * and feeds the resulting output file to the matching extractor.
 *
 * Files whose length is outside [minFileSizeToOcr, maxFileSizeToOcr] are ignored.
 * The tesseract output file is always deleted on exit, including when OCR fails.
 *
 * @param tikaInputStream  input that has already been spooled to a file
 * @param tmpOCROutputFile base name given to tesseract; tesseract appends a suffix
 * @param xhtml            receives text/hOCR output
 * @param metadata         receives OSD values and is passed to the preprocessor
 * @param parseContext     passed through to OCR and hOCR extraction
 * @param config           effective configuration for this parse
 */
private void parse(TikaInputStream tikaInputStream, File tmpOCROutputFile,
                   ContentHandler xhtml,
                   Metadata metadata, ParseContext parseContext, TesseractOCRConfig config)
        throws IOException, SAXException, TikaException {
    warnOnFirstParse();
    validateLangString(config.getLanguage());

    File tmpTxtOutput = null;
    try {
        Path input = tikaInputStream.getPath();
        long size = tikaInputStream.getLength();
        if (size < config.getMinFileSizeToOcr() || size > config.getMaxFileSizeToOcr()) {
            return;
        }

        // Tesseract appends the output type (.txt or .hocr or .osd) to output file name.
        // Work the name out before running OCR so the finally block can remove the
        // file even if tesseract fails or times out after creating it.
        boolean osdOnly = "0".equals(config.getPageSegMode());
        String extension = osdOnly ? "osd" :
                config.getOutputType().toString().toLowerCase(Locale.US);
        tmpTxtOutput = new File(tmpOCROutputFile.getAbsolutePath() + "." + extension);

        boolean preprocessingRequested =
                config.isEnableImagePreprocessing() || config.isApplyRotation();
        if (preprocessingRequested && hasImageMagick) {
            // copy the contents of the original input file into a temporary file
            // which will be preprocessed for OCR
            try (TemporaryResources tmp = new TemporaryResources()) {
                Path tmpFile = tmp.createTempFile();
                Files.copy(input, tmpFile, StandardCopyOption.REPLACE_EXISTING);
                imagePreprocessor.process(tmpFile, tmpFile, metadata, config);
                doOCR(tmpFile.toFile(), tmpOCROutputFile, config, parseContext);
            }
        } else {
            if (preprocessingRequested) {
                LOG.warn(
                        "User has selected to preprocess images, " +
                                "but I can't find ImageMagick. " +
                                "Backing off to original file.");
            }
            doOCR(input.toFile(), tmpOCROutputFile, config, parseContext);
        }

        if (!tmpTxtOutput.exists()) {
            return;
        }
        try (InputStream is = new FileInputStream(tmpTxtOutput)) {
            if (osdOnly) {
                extractOSD(is, metadata);
            } else if (config.getOutputType().equals(TesseractOCRConfig.OUTPUT_TYPE.HOCR)) {
                extractHOCROutput(is, parseContext, xhtml);
            } else {
                extractOutput(is, xhtml);
            }
        }
    } finally {
        if (tmpTxtOutput != null) {
            // best effort; the file may never have been created
            tmpTxtOutput.delete();
        }
    }
}
/**
 * Reads tesseract's "key: value" output for page segmentation mode 0 and stores
 * the recognised keys in the metadata. Lines that don't match the pattern are
 * ignored; unknown keys and unparseable numeric values are logged and skipped
 * so that one malformed line does not abort the whole parse.
 *
 * @param is       stream over the .osd output file, decoded as UTF-8; closed on return
 * @param metadata receives the PSM0_* properties
 * @throws IOException if the stream cannot be read
 */
private void extractOSD(InputStream is, Metadata metadata) throws IOException {
    Matcher matcher = Pattern.compile("^([^:]+):\\s+(.*)").matcher("");
    try (BufferedReader reader = new BufferedReader(new InputStreamReader(is,
            UTF_8))) {
        for (String line = reader.readLine(); line != null; line = reader.readLine()) {
            if (!matcher.reset(line).find()) {
                continue;
            }
            String k = matcher.group(1);
            // trailing whitespace would otherwise break the numeric parsing below
            String v = matcher.group(2).trim();
            try {
                switch (k) {
                    case "Page number":
                        metadata.set(PSM0_PAGE_NUMBER, Integer.parseInt(v));
                        break;
                    case "Orientation in degrees":
                        metadata.set(PSM0_ORIENTATION, Integer.parseInt(v));
                        break;
                    case "Rotate":
                        metadata.set(PSM0_ROTATE, Integer.parseInt(v));
                        break;
                    case "Orientation confidence":
                        metadata.set(PSM0_ORIENTATION_CONFIDENCE, Double.parseDouble(v));
                        break;
                    case "Script":
                        metadata.set(PSM0_SCRIPT, v);
                        break;
                    case "Script confidence":
                        metadata.set(PSM0_SCRIPT_CONFIDENCE, Double.parseDouble(v));
                        break;
                    default:
                        LOG.warn("I regret I don't know how to parse {} with value {}", k, v);
                }
            } catch (NumberFormatException e) {
                LOG.warn("Couldn't parse numeric value {} for key {}", v, k);
            }
        }
    }
}
/** Emits the "tesseract is being invoked" notice unless it has already been logged. */
private void warnOnFirstParse() {
    if (hasWarned()) {
        return;
    }
    warn();
}
/**
 * Run external tesseract-ocr process.
 *
 * The process is always destroyed and unregistered on exit, whether it
 * finished, failed or timed out.
 *
 * @param input        File to be ocred
 * @param output       File to collect ocr result (tesseract appends its own suffix)
 * @param config       Configuration of tesseract-ocr engine
 * @param parseContext may carry a {@link TikaTaskTimeout} overriding the config timeout
 * @throws TikaException if the extraction timed out or tesseract exited with an error
 * @throws IOException   if an input error occurred
 */
private void doOCR(File input, File output, TesseractOCRConfig config, ParseContext parseContext)
        throws IOException, TikaException {
    List<String> cmd = new ArrayList<>(
            Arrays.asList(getTesseractPath() + getTesseractProg(), input.getPath(),
                    output.getPath(), "--psm", config.getPageSegMode()));
    //if --psm == 0, don't add anything else to the command line
    if (!"0".equals(config.getPageSegMode())) {
        if (!StringUtils.isBlank(config.getLanguage())) {
            cmd.add("-l");
            cmd.add(config.getLanguage());
        }
        for (Map.Entry<String, String> entry : config.getOtherTesseractConfig().entrySet()) {
            cmd.add("-c");
            cmd.add(entry.getKey() + "=" + entry.getValue());
        }
        cmd.addAll(Arrays.asList("-c", "page_separator=" + config.getPageSeparator(), "-c",
                (config.isPreserveInterwordSpacing()) ? "preserve_interword_spaces=1" :
                        "preserve_interword_spaces=0",
                config.getOutputType().name().toLowerCase(Locale.US)));
    }
    LOG.debug("Tesseract command: " + String.join(" ", cmd));

    ProcessBuilder pb = new ProcessBuilder(cmd);
    setEnv(pb);

    Process process = null;
    String id = null;
    // multiply as long: seconds * 1000 overflows int for large timeouts
    long timeoutMillis = TikaTaskTimeout.getTimeoutMillis(parseContext,
            config.getTimeoutSeconds() * 1000L);
    try {
        process = pb.start();
        id = register(process);
        runOCRProcess(process, timeoutMillis);
    } finally {
        if (process != null) {
            process.destroyForcibly();
        }
        if (id != null) {
            release(id);
        }
    }
}
/**
 * Waits for the tesseract process to finish, consuming its stdout and stderr on
 * the threads returned by {@code logStream} while it runs.
 *
 * @param process       the running tesseract process; its stdin is closed immediately
 * @param timeoutMillis maximum time to wait for the process
 * @throws IOException   if closing the process's stdin fails
 * @throws TikaException on timeout, interruption, or an exit value greater than zero
 *                       (the message then includes whatever stderr was collected)
 */
private void runOCRProcess(Process process, long timeoutMillis) throws IOException,
        TikaException {
    process.getOutputStream().close();
    InputStream out = process.getInputStream();
    InputStream err = process.getErrorStream();
    StringBuilder outBuilder = new StringBuilder();
    StringBuilder errBuilder = new StringBuilder();
    Thread outThread = logStream(out, outBuilder);
    Thread errThread = logStream(err, errBuilder);
    outThread.start();
    errThread.start();

    int exitValue = Integer.MIN_VALUE;
    try {
        boolean finished = process.waitFor(timeoutMillis, TimeUnit.MILLISECONDS);
        if (!finished) {
            throw new TikaException("TesseractOCRParser timeout");
        }
        exitValue = process.exitValue();
    } catch (InterruptedException e) {
        Thread.currentThread().interrupt();
        throw new TikaException("TesseractOCRParser interrupted", e);
    } catch (IllegalThreadStateException e) {
        //this _should_ never be thrown
        throw new TikaException("TesseractOCRParser timeout");
    }
    if (exitValue > 0) {
        try {
            //make sure this thread is actually done
            errThread.join(1000);
        } catch (InterruptedException e) {
            // keep the interrupt visible to callers; the error is still reported below
            Thread.currentThread().interrupt();
        }
        throw new TikaException(
                "TesseractOCRParser bad exit value " + exitValue + " err msg: " +
                        errBuilder.toString());
    }
}
/**
* Reads the contents of the given stream and write it to the given XHTML
* content handler. The stream is closed once fully processed.
*
* @param stream Stream where is the result of ocr
* @param xhtml XHTML content handler
* @throws SAXException if the XHTML SAX events could not be handled
* @throws IOException if an input error occurred
*/
private void extractOutput(InputStream stream, ContentHandler xhtml)
throws SAXException, IOException {
// 0) {
xhtml.characters(buffer, 0, n);
}
}
}
xhtml.endElement(XHTML, "div", "div");
}
private void extractHOCROutput(InputStream is, ParseContext parseContext, ContentHandler xhtml)
throws TikaException, IOException, SAXException {
if (parseContext == null) {
parseContext = new ParseContext();
}
// {
Reader reader = new InputStreamReader(stream, UTF_8);
char[] buffer = new char[1024];
try {
for (int n = reader.read(buffer); n != -1; n = reader.read(buffer)) {
out.append(buffer, 0, n);
}
} catch (IOException e) {
//swallow
} finally {
IOUtils.closeQuietly(stream);
}
LOG.debug("{}", out);
});
}
/**
 * Probes the external tools and prepares the parser for use.
 * Checks for tesseract, checks for ImageMagick only when image preprocessing is
 * enabled by default, optionally preloads and validates languages, and creates
 * the image preprocessor.
 *
 * @param params not read by this implementation
 * @throws TikaConfigException if a configured path is not a directory or the
 *                             default language fails validation
 */
@Override
public void initialize(Map params) throws TikaConfigException {
    hasTesseract = hasTesseract();
    // ImageMagick is only probed when preprocessing is on by default
    hasImageMagick = isEnableImagePreprocessing() && hasImageMagick();

    if (preloadLangs) {
        preloadLangs();
        String defaultLanguage = defaultConfig.getLanguage();
        if (!StringUtils.isBlank(defaultLanguage)) {
            validateLangString(defaultLanguage);
        }
    }

    String magickCommand = getImageMagickPath() + getImageMagickProg();
    imagePreprocessor = new ImagePreprocessor(magickCommand);
}
/**
 * Validates a tesseract language string.
 * The string is split by {@code TesseractOCRConfig.getLangs}; any code it reports
 * as invalid is rejected. If languages were preloaded, every valid code must also
 * be among them.
 *
 * @param language language string as passed to tesseract's {@code -l} option
 * @throws TikaConfigException if a code is invalid, or unavailable in the preloaded set
 */
private void validateLangString(String language) throws TikaConfigException {
    Set<String> invalidLangs = new HashSet<>();
    Set<String> validLangs = new HashSet<>();
    TesseractOCRConfig.getLangs(language, validLangs, invalidLangs);
    if (!invalidLangs.isEmpty()) {
        throw new TikaConfigException("Invalid language code(s): " + invalidLangs);
    }
    if (langs.isEmpty()) {
        // nothing preloaded, so availability can't be checked here
        return;
    }
    for (String lang : validLangs) {
        if (!langs.contains(lang)) {
            throw new TikaConfigException(
                    "tesseract does not have " + lang + " available. I see only: " + langs);
        }
    }
}
/**
 * Verifies that the default language is available when languages were preloaded.
 * Delegates to {@code validateLangString} so that combined language strings are
 * checked code by code instead of being looked up as a single entry.
 *
 * @param problemHandler not used by this implementation
 * @throws TikaConfigException if the default language is invalid or unavailable
 */
@Override
public void checkInitialization(InitializableProblemHandler problemHandler)
        throws TikaConfigException {
    String defaultLanguage = defaultConfig.getLanguage();
    if (langs.size() > 0 && !StringUtils.isBlank(defaultLanguage)) {
        validateLangString(defaultLanguage);
    }
}
/**
 * Returns the set of languages collected by the preload step.
 * This is the parser's own mutable set, not a copy; it is empty if nothing
 * has been preloaded.
 */
public Set getLangs() {
return langs;
}
/**
 * Reports whether the one-time notice has already been logged.
 * Reads the static flag directly first, then once more while holding LOCK.
 */
protected boolean hasWarned() {
    if (!HAS_WARNED) {
        synchronized (LOCK) {
            return HAS_WARNED;
        }
    }
    return true;
}
/** Logs the notice that tesseract is being invoked and sets the static warned flag. */
protected void warn() {
    final String notice = "Tesseract is installed and is being invoked. " +
            "This can add greatly to processing time.  If you do not want tesseract " +
            "to be applied to your files see: " +
            "https://cwiki.apache.org/confluence/display/TIKA/TikaOCR#TikaOCR-disable-ocr";
    LOG.info(notice);
    HAS_WARNED = true;
}
/** Returns the configured tesseract directory; the field's initial value is the empty string. */
public String getTesseractPath() {
return tesseractPath;
}
/**
 * Set the path to the Tesseract executable's directory, needed if it is not on system path.
 * The path is normalized and a trailing file separator is appended if missing.
 *
 * Note that if you set this value, it is highly recommended that you also
 * set the path to (and including) the 'tessdata' folder using {@link #setTessdataPath}.
 *
 * @param tesseractPath directory containing the tesseract executable;
 *                      {@code null} resets the path to empty
 * @throws IllegalArgumentException if the path cannot be normalized
 */
@Field
public void setTesseractPath(String tesseractPath) {
    if (tesseractPath == null) {
        this.tesseractPath = "";
        return;
    }
    // normalize() returns null for paths it considers invalid
    String normalized = FilenameUtils.normalize(tesseractPath);
    if (normalized == null) {
        throw new IllegalArgumentException("Invalid tesseractPath: " + tesseractPath);
    }
    if (!normalized.isEmpty() && !normalized.endsWith(File.separator)) {
        normalized += File.separator;
    }
    this.tesseractPath = normalized;
}
/** Returns the configured 'tessdata' directory; the field's initial value is the empty string. */
public String getTessdataPath() {
return this.tessdataPath;
}
/**
 * Set the path to the 'tessdata' folder, which contains language files and config files. In
 * some cases (such as on Windows), this folder is found in the Tesseract installation,
 * but in other cases (such as when Tesseract is built from source), it may be located
 * elsewhere.
 *
 * Make sure to include the 'tessdata' folder in this path: '/blah/de/blah/tessdata'
 * The path is normalized and a trailing file separator is appended if missing.
 *
 * @param tessdataPath path to (and including) the tessdata folder;
 *                     {@code null} resets the path to empty
 * @throws IllegalArgumentException if the path cannot be normalized
 */
@Field
public void setTessdataPath(String tessdataPath) {
    if (tessdataPath == null) {
        this.tessdataPath = "";
        return;
    }
    // normalize() returns null for paths it considers invalid
    String normalized = FilenameUtils.normalize(tessdataPath);
    if (normalized == null) {
        throw new IllegalArgumentException("Invalid tessdataPath: " + tessdataPath);
    }
    if (!normalized.isEmpty() && !normalized.endsWith(File.separator)) {
        normalized += File.separator;
    }
    this.tessdataPath = normalized;
}
/** Returns the configured ImageMagick directory; the field's initial value is the empty string. */
public String getImageMagickPath() {
return imageMagickPath;
}
/**
 * Set the path to the ImageMagick executable directory, needed if it is not on system path.
 * The path is normalized and a trailing file separator is appended if missing.
 *
 * @param imageMagickPath to ImageMagick executable directory;
 *                        {@code null} resets the path to empty
 * @throws IllegalArgumentException if the path cannot be normalized
 */
@Field
public void setImageMagickPath(String imageMagickPath) {
    if (imageMagickPath == null) {
        this.imageMagickPath = "";
        return;
    }
    // normalize() returns null for paths it considers invalid
    String normalized = FilenameUtils.normalize(imageMagickPath);
    if (normalized == null) {
        throw new IllegalArgumentException("Invalid imageMagickPath: " + imageMagickPath);
    }
    if (!normalized.isEmpty() && !normalized.endsWith(File.separator)) {
        normalized += File.separator;
    }
    this.imageMagickPath = normalized;
}
/**
 * Adds extra tesseract settings to the default config.
 *
 * @param settings entries of the form {@code "key value"}: exactly two tokens
 *                 separated by whitespace
 * @throws TikaConfigException if an entry does not split into exactly two tokens
 */
@Field
public void setOtherTesseractSettings(List<String> settings) throws TikaConfigException {
    for (String setting : settings) {
        String[] bits = setting.trim().split("\\s+");
        if (bits.length != 2) {
            throw new TikaConfigException(
                    "Expected space delimited key value pair." + " However, I found " +
                            bits.length + " bits.");
        }
        defaultConfig.addOtherTesseractConfig(bits[0], bits[1]);
    }
}
/**
 * Returns the extra tesseract settings of the default config as
 * {@code "key value"} strings, ordered by key.
 *
 * @return a new list; modifying it does not affect the config
 */
public List<String> getOtherTesseractSettings() {
    List<String> settings = new ArrayList<>();
    // TreeMap gives a stable, key-sorted order
    Map<String, String> sorted = new TreeMap<>(defaultConfig.getOtherTesseractConfig());
    for (Map.Entry<String, String> e : sorted.entrySet()) {
        settings.add(e.getKey() + " " + e.getValue());
    }
    return settings;
}
/** Sets the skip-OCR flag on the default config. */
@Field
public void setSkipOCR(boolean skipOCR) {
defaultConfig.setSkipOcr(skipOCR);
}
/** Returns the skip-OCR flag of the default config. */
public boolean isSkipOCR() {
return defaultConfig.isSkipOcr();
}
/** Sets the language string on the default config. */
@Field
public void setLanguage(String language) {
defaultConfig.setLanguage(language);
}
/** Returns the language string of the default config. */
public String getLanguage() {
return defaultConfig.getLanguage();
}
/** Sets the page segmentation mode on the default config. */
@Field
public void setPageSegMode(String pageSegMode) {
defaultConfig.setPageSegMode(pageSegMode);
}
/** Returns the page segmentation mode of the default config. */
public String getPageSegMode() {
return defaultConfig.getPageSegMode();
}
/** Sets the maximum file size to OCR on the default config. */
@Field
public void setMaxFileSizeToOcr(long maxFileSizeToOcr) {
defaultConfig.setMaxFileSizeToOcr(maxFileSizeToOcr);
}
/** Returns the maximum file size to OCR from the default config. */
public long getMaxFileSizeToOcr() {
return defaultConfig.getMaxFileSizeToOcr();
}
/** Sets the minimum file size to OCR on the default config. */
@Field
public void setMinFileSizeToOcr(long minFileSizeToOcr) {
defaultConfig.setMinFileSizeToOcr(minFileSizeToOcr);
}
/** Returns the minimum file size to OCR from the default config. */
public long getMinFileSizeToOcr() {
return defaultConfig.getMinFileSizeToOcr();
}
/**
 * Set default timeout in seconds. This can be overridden per parse
 * with {@link TikaTaskTimeout} sent in via the {@link ParseContext}
 * at parse time.
 *
 * @param timeout timeout in seconds, stored on the default config
 */
@Field
public void setTimeout(int timeout) {
defaultConfig.setTimeoutSeconds(timeout);
}
/** Returns the default timeout in seconds from the default config. */
public int getTimeout() {
return defaultConfig.getTimeoutSeconds();
}
/** Sets the output type on the default config from its string form. */
@Field
public void setOutputType(String outputType) {
defaultConfig.setOutputType(outputType);
}
/** Returns the name of the default config's output type. */
public String getOutputType() {
return defaultConfig.getOutputType().name();
}
/** Sets the preserve-interword-spacing flag on the default config. */
@Field
public void setPreserveInterwordSpacing(boolean preserveInterwordSpacing) {
defaultConfig.setPreserveInterwordSpacing(preserveInterwordSpacing);
}
/** Returns the preserve-interword-spacing flag of the default config. */
public boolean isPreserveInterwordSpacing() {
return defaultConfig.isPreserveInterwordSpacing();
}
/** Sets the enable-image-preprocessing flag on the default config. */
@Field
public void setEnableImagePreprocessing(boolean enableImagePreprocessing) {
defaultConfig.setEnableImagePreprocessing(enableImagePreprocessing);
}
/** Returns the enable-image-preprocessing flag of the default config. */
public boolean isEnableImagePreprocessing() {
return defaultConfig.isEnableImagePreprocessing();
}
/** Sets the density value on the default config. */
@Field
public void setDensity(int density) {
defaultConfig.setDensity(density);
}
/** Returns the density value of the default config. */
public int getDensity() {
return defaultConfig.getDensity();
}
/** Sets the depth value on the default config. */
@Field
public void setDepth(int depth) {
defaultConfig.setDepth(depth);
}
/** Returns the depth value of the default config. */
public int getDepth() {
return defaultConfig.getDepth();
}
/** Sets the colorspace value on the default config. */
@Field
public void setColorspace(String colorspace) {
defaultConfig.setColorspace(colorspace);
}
/** Returns the colorspace value of the default config. */
public String getColorspace() {
return defaultConfig.getColorspace();
}
/** Sets the filter value on the default config. */
@Field
public void setFilter(String filter) {
defaultConfig.setFilter(filter);
}
/** Returns the filter value of the default config. */
public String getFilter() {
return defaultConfig.getFilter();
}
/** Sets the resize value on the default config. */
@Field
public void setResize(int resize) {
defaultConfig.setResize(resize);
}
/** Returns the resize value of the default config. */
public int getResize() {
return defaultConfig.getResize();
}
/** Sets the apply-rotation flag on the default config. */
@Field
public void setApplyRotation(boolean applyRotation) {
defaultConfig.setApplyRotation(applyRotation);
}
/** Returns the apply-rotation flag of the default config. */
public boolean isApplyRotation() {
return defaultConfig.isApplyRotation();
}
/**
 * If set to <code>true</code> and if tesseract is found, this will load the
 * langs that result from --list-langs. At parse time, the
 * parser will verify that tesseract has the requested lang
 * available.
 *
 * If set to <code>false</code> (the default) and tesseract is found, if a user
 * requests a language that tesseract does not have data for,
 * a TikaException will be thrown with tesseract's native exception
 * message, which is a bit less readable.
 *
 * @param preloadLangs whether to run --list-langs at initialization
 */
@Field
public void setPreloadLangs(boolean preloadLangs) {
this.preloadLangs = preloadLangs;
}
/** Returns whether languages are preloaded at initialization (false unless set). */
public boolean isPreloadLangs() {
return this.preloadLangs;
}
/**
 * Returns the parser's default configuration.
 * This is the live instance that the setters on this class write to, not a copy.
 */
public TesseractOCRConfig getDefaultConfig() {
return defaultConfig;
}
/**
 * Runs {@code tesseract --list-langs} and records the reported languages.
 * Failures are logged as warnings and otherwise ignored; the process is
 * always destroyed afterwards.
 */
private void preloadLangs() {
    String executable = getTesseractPath() + getTesseractProg();
    ProcessBuilder builder = new ProcessBuilder(executable, "--list-langs");
    setEnv(builder);

    Process listLangs = null;
    try {
        listLangs = builder.start();
        getLangs(listLangs, defaultConfig.getTimeoutSeconds());
    } catch (TikaException | IOException e) {
        LOG.warn("Problem preloading langs", e);
    } finally {
        if (listLangs != null) {
            listLangs.destroyForcibly();
        }
    }
}
/**
 * Waits for a {@code --list-langs} process and adds each reported language to
 * {@code langs}. The "List of available..." header and blank lines are skipped,
 * so empty output leaves {@code langs} untouched instead of adding an empty entry.
 *
 * @param process        the running tesseract process; its stdin is closed immediately
 * @param timeoutSeconds maximum time to wait for the process
 * @throws IOException   if closing the process's stdin fails
 * @throws TikaException on timeout, interruption, or an exit value greater than zero
 */
private void getLangs(Process process, int timeoutSeconds) throws IOException, TikaException {
    process.getOutputStream().close();
    InputStream out = process.getInputStream();
    InputStream err = process.getErrorStream();
    StringBuilder outBuilder = new StringBuilder();
    StringBuilder errBuilder = new StringBuilder();
    Thread outThread = logStream(out, outBuilder);
    Thread errThread = logStream(err, errBuilder);
    outThread.start();
    errThread.start();

    int exitValue = Integer.MIN_VALUE;
    try {
        boolean finished = process.waitFor(timeoutSeconds, TimeUnit.SECONDS);
        if (!finished) {
            throw new TikaException("TesseractOCRParser timeout");
        }
        exitValue = process.exitValue();
        // give the reader thread a moment to finish collecting stdout
        outThread.join(1000);
    } catch (InterruptedException e) {
        Thread.currentThread().interrupt();
        throw new TikaException("TesseractOCRParser interrupted", e);
    } catch (IllegalThreadStateException e) {
        //this _should_ never be thrown
        throw new TikaException("TesseractOCRParser timeout");
    }
    if (exitValue > 0) {
        throw new TikaException(
                "TesseractOCRParser bad exit value " + exitValue + " err msg: " +
                        errBuilder.toString());
    }
    for (String line : outBuilder.toString().split("[\r\n]+")) {
        String lang = line.trim();
        if (lang.isEmpty() || lang.startsWith("List of available")) {
            continue;
        }
        langs.add(lang);
    }
}
/**
 * SAX handler that forwards hOCR events to a wrapped handler while dropping the
 * document-level wrapper elements listed in {@link #IGNORE}. Character data is
 * always forwarded, including text inside ignored elements.
 */
private static class HOCRPassThroughHandler extends DefaultHandler {
    /** Qualified element names whose start/end events are not forwarded. */
    public static final Set<String> IGNORE =
            unmodifiableSet("html", "head", "title", "meta", "body");
    private final ContentHandler xhtml;

    /**
     * @param xhtml handler that receives the forwarded events
     */
    public HOCRPassThroughHandler(ContentHandler xhtml) {
        this.xhtml = xhtml;
    }

    private static Set<String> unmodifiableSet(String... elements) {
        return Collections.unmodifiableSet(new HashSet<>(Arrays.asList(elements)));
    }

    /**
     * Forwards the start of an element unless its qualified name is in {@link #IGNORE}.
     */
    @Override
    public void startElement(String uri, String local, String name, Attributes attributes)
            throws SAXException {
        if (IGNORE.contains(name)) {
            return;
        }
        xhtml.startElement(uri, local, name, attributes);
    }

    /**
     * Forwards the end of an element unless its qualified name is in {@link #IGNORE}.
     */
    @Override
    public void endElement(String uri, String local, String name) throws SAXException {
        if (IGNORE.contains(name)) {
            return;
        }
        xhtml.endElement(uri, local, name);
    }

    /**
     * Forwards character data unchanged.
     *
     * @see <a href="https://issues.apache.org/jira/browse/TIKA-210">TIKA-210</a>
     */
    @Override
    public void characters(char[] ch, int start, int length) throws SAXException {
        xhtml.characters(ch, start, length);
    }
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy