All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.tika.parser.ocr.TesseractOCRParser Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.ocr;

import static java.nio.charset.StandardCharsets.UTF_8;
import static org.apache.tika.sax.XHTMLContentHandler.XHTML;

import java.awt.Image;
import java.awt.image.BufferedImage;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.Reader;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.StandardCopyOption;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.concurrent.TimeUnit;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.imageio.ImageIO;

import org.apache.commons.io.FilenameUtils;
import org.apache.commons.io.IOUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;
import org.xml.sax.helpers.DefaultHandler;

import org.apache.tika.config.Field;
import org.apache.tika.config.Initializable;
import org.apache.tika.config.InitializableProblemHandler;
import org.apache.tika.config.Param;
import org.apache.tika.config.TikaTaskTimeout;
import org.apache.tika.exception.TikaConfigException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TemporaryResources;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Property;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractExternalProcessParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.external.ExternalParser;
import org.apache.tika.sax.XHTMLContentHandler;
import org.apache.tika.utils.StringUtils;
import org.apache.tika.utils.XMLReaderUtils;

/**
 * TesseractOCRParser powered by tesseract-ocr engine. To enable this parser,
 * create a {@link TesseractOCRConfig} object and pass it through a
 * ParseContext. Tesseract-ocr must be installed and on system path or the path
 * to its root folder must be provided:
 * 

* TesseractOCRConfig config = new TesseractOCRConfig();
* //Needed if tesseract is not on system path
* config.setTesseractPath(tesseractFolder);
* parseContext.set(TesseractOCRConfig.class, config);
*

*/ public class TesseractOCRParser extends AbstractExternalProcessParser implements Initializable { public static final String TESS_META = "tess:"; public static final Property IMAGE_ROTATION = Property.externalRealSeq(TESS_META + "rotation"); public static final Property IMAGE_MAGICK = Property.externalBooleanSeq(TESS_META + "image_magick_processed"); private static final String TESSDATA_PREFIX = "TESSDATA_PREFIX"; public static final Property PSM0_PAGE_NUMBER = Property.externalInteger(TESS_META + "page_number"); public static final Property PSM0_ORIENTATION = Property.externalInteger(TESS_META + "orientation"); public static final Property PSM0_ROTATE = Property.externalInteger(TESS_META + "rotate"); public static final Property PSM0_ORIENTATION_CONFIDENCE = Property.externalReal(TESS_META + "orientation_confidence"); public static final Property PSM0_SCRIPT = Property.externalText(TESS_META + "script"); public static final Property PSM0_SCRIPT_CONFIDENCE = Property.externalReal(TESS_META + "script_confidence"); private static final String OCR = "ocr-"; private static final Logger LOG = LoggerFactory.getLogger(TesseractOCRParser.class); private static final Object[] LOCK = new Object[0]; private static final long serialVersionUID = -8167538283213097265L; private static final Set SUPPORTED_TYPES = Collections.unmodifiableSet(new HashSet<>( Arrays.asList( new MediaType[]{MediaType.image(OCR + "png"), MediaType.image(OCR + "jpeg"), MediaType.image(OCR + "tiff"), MediaType.image(OCR + "bmp"), MediaType.image(OCR + "gif"), //these are not currently covered by other parsers MediaType.image("jp2"), MediaType.image("jpx"), MediaType.image("x-portable-pixmap"), //add the ocr- versions as well MediaType.image(OCR + "jp2"), MediaType.image(OCR + "jpx"), MediaType.image(OCR + "x-portable-pixmap"), }))); private static volatile boolean HAS_WARNED = false; private static volatile boolean HAS_CHECKED_FOR_IMAGE_MAGICK = false; //if a user specifies a custom tess path or tessdata path //load the available languages at initialization time private final Set langs = new HashSet<>(); private final TesseractOCRConfig defaultConfig = new TesseractOCRConfig(); private String tesseractPath = ""; private String tessdataPath = ""; private String imageMagickPath = ""; //if set to true, this will run --list-langs //at initialization and then check langs //at parse time private boolean preloadLangs = false; private boolean hasTesseract; private boolean hasImageMagick; private ImagePreprocessor imagePreprocessor; public static String getImageMagickProg() { return System.getProperty("os.name").startsWith("Windows") ? "magick" : "convert"; } public static String getTesseractProg() { return System.getProperty("os.name").startsWith("Windows") ? "tesseract.exe" : "tesseract"; } @Override public Set getSupportedTypes(ParseContext context) { // If Tesseract is installed, offer our supported image types TesseractOCRConfig config = context.get(TesseractOCRConfig.class); if (hasTesseract) { if (config == null || !config.isSkipOcr()) { return SUPPORTED_TYPES; } } // Otherwise don't advertise anything, so the other image parsers // can be selected instead return Collections.emptySet(); } private void setEnv(ProcessBuilder pb) { Map env = pb.environment(); if (!StringUtils.isBlank(getTessdataPath())) { env.put(TESSDATA_PREFIX, getTessdataPath()); } else if (!StringUtils.isBlank(getTesseractPath())) { //adding tessdata is required for at least >= 4.x env.put(TESSDATA_PREFIX, getTesseractPath() + "tessdata"); } } public boolean hasTesseract() throws TikaConfigException { // Fetch where the config says to find Tesseract String tesseract = getTesseractPath() + getTesseractProg(); if (!StringUtils.isBlank(tesseractPath) && !Files.isDirectory(Paths.get(tesseractPath))) { throw new TikaConfigException("tesseractPath (" + tesseractPath + ") " + "doesn't point to an existing directory"); } // Try running Tesseract from there, and see if it exists + works String[] checkCmd = {tesseract}; boolean hasTesseract = ExternalParser.check(checkCmd); LOG.debug("hasTesseract (path: " + Arrays.toString(checkCmd) + "): " + hasTesseract); return hasTesseract; } synchronized boolean hasImageMagick() throws TikaConfigException { if (HAS_CHECKED_FOR_IMAGE_MAGICK) { return hasImageMagick; } // Fetch where the config says to find ImageMagick Program String fullImageMagickPath = imageMagickPath + getImageMagickProg(); //check that directory exists if (!StringUtils.isBlank(imageMagickPath) && !Files.isDirectory(Paths.get(imageMagickPath))) { throw new TikaConfigException("imageMagickPath (" + imageMagickPath + ") " + "doesn't point to an existing directory"); } // Try running ImageMagick program from there, and see if it exists + works String[] checkCmd = {fullImageMagickPath}; boolean hasImageMagick = ExternalParser.check(checkCmd); if (!hasImageMagick) { LOG.debug("ImageMagick does not appear to be installed " + "(commandline: " + fullImageMagickPath + ")"); } HAS_CHECKED_FOR_IMAGE_MAGICK = true; return hasImageMagick; } public void parse(Image image, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { try (TemporaryResources tmp = new TemporaryResources()) { int w = image.getWidth(null); int h = image.getHeight(null); BufferedImage bImage = new BufferedImage(w, h, BufferedImage.TYPE_INT_RGB); File file = tmp.createTemporaryFile(); try (OutputStream fos = new FileOutputStream(file)) { ImageIO.write(bImage, "png", fos); } try (TikaInputStream tis = TikaInputStream.get(file)) { parse(tis, handler, metadata, context); } } } @Override public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext parseContext) throws IOException, SAXException, TikaException { TesseractOCRConfig userConfig = parseContext.get(TesseractOCRConfig.class); TesseractOCRConfig config = defaultConfig; if (userConfig != null) { config = defaultConfig.cloneAndUpdate(userConfig); } // If Tesseract is not on the path with the current config, do not try to run OCR // getSupportedTypes shouldn't have listed us as handling it, so this should only // occur if someone directly calls this parser, not via DefaultParser or similar if (!hasTesseract || (config != null && config.isSkipOcr())) { return; } //if you haven't checked yet, and a per file config requests imagemagick //and if the default is not to use image processing if (! HAS_CHECKED_FOR_IMAGE_MAGICK && config.isEnableImagePreprocessing()) { hasImageMagick = hasImageMagick(); } try (TemporaryResources tmp = new TemporaryResources()) { TikaInputStream tikaStream = TikaInputStream.get(stream, tmp, metadata); //trigger the spooling to a tmp file if the stream wasn't //already a TikaInputStream that contained a file tikaStream.getPath(); //this is the text output file name specified on the tesseract //commandline. The actual output file name will have a suffix added. File tmpOCROutputFile = tmp.createTemporaryFile(); XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); xhtml.startDocument(); parse(tikaStream, tmpOCROutputFile, xhtml, metadata, parseContext, config); xhtml.endDocument(); } } private void parse(TikaInputStream tikaInputStream, File tmpOCROutputFile, ContentHandler xhtml, Metadata metadata, ParseContext parseContext, TesseractOCRConfig config) throws IOException, SAXException, TikaException { warnOnFirstParse(); validateLangString(config.getLanguage()); File tmpTxtOutput = null; try { Path input = tikaInputStream.getPath(); long size = tikaInputStream.getLength(); if (size >= config.getMinFileSizeToOcr() && size <= config.getMaxFileSizeToOcr()) { // Process image if (config.isEnableImagePreprocessing() || config.isApplyRotation()) { if (!hasImageMagick) { LOG.warn( "User has selected to preprocess images, " + "but I can't find ImageMagick." + "Backing off to original file."); doOCR(input.toFile(), tmpOCROutputFile, config, parseContext); } else { // copy the contents of the original input file into a temporary file // which will be preprocessed for OCR try (TemporaryResources tmp = new TemporaryResources()) { Path tmpFile = tmp.createTempFile(); Files.copy(input, tmpFile, StandardCopyOption.REPLACE_EXISTING); imagePreprocessor.process(tmpFile, tmpFile, metadata, config); doOCR(tmpFile.toFile(), tmpOCROutputFile, config, parseContext); } } } else { doOCR(input.toFile(), tmpOCROutputFile, config, parseContext); } String extension = config.getPageSegMode().equals("0") ? "osd" : config.getOutputType().toString().toLowerCase(Locale.US); // Tesseract appends the output type (.txt or .hocr or .osd) to output file name tmpTxtOutput = new File(tmpOCROutputFile.getAbsolutePath() + "." + extension); if (tmpTxtOutput.exists()) { try (InputStream is = new FileInputStream(tmpTxtOutput)) { if (config.getPageSegMode().equals("0")) { extractOSD(is, metadata); } else if (config.getOutputType().equals(TesseractOCRConfig.OUTPUT_TYPE.HOCR)) { extractHOCROutput(is, parseContext, xhtml); } else { extractOutput(is, xhtml); } } } } } finally { if (tmpTxtOutput != null) { tmpTxtOutput.delete(); } } } private void extractOSD(InputStream is, Metadata metadata) throws IOException { Matcher matcher = Pattern.compile("^([^:]+):\\s+(.*)").matcher(""); try (BufferedReader reader = new BufferedReader(new InputStreamReader(is, UTF_8))) { String line = reader.readLine(); while (line != null) { if (matcher.reset(line).find()) { String k = matcher.group(1); String v = matcher.group(2); switch (k) { case "Page number": metadata.set(PSM0_PAGE_NUMBER, Integer.parseInt(v)); break; case "Orientation in degrees": metadata.set(PSM0_ORIENTATION, Integer.parseInt(v)); break; case "Rotate": metadata.set(PSM0_ROTATE, Integer.parseInt(v)); break; case "Orientation confidence": metadata.set(PSM0_ORIENTATION_CONFIDENCE, Double.parseDouble(v)); break; case "Script": metadata.set(PSM0_SCRIPT, v); break; case "Script confidence": metadata.set(PSM0_SCRIPT_CONFIDENCE, Double.parseDouble(v)); break; default: LOG.warn("I regret I don't know how to parse {} with value {}", k, v); } } line = reader.readLine(); } } } private void warnOnFirstParse() { if (!hasWarned()) { warn(); } } /** * Run external tesseract-ocr process. * * @param input File to be ocred * @param output File to collect ocr result * @param config Configuration of tesseract-ocr engine * @throws TikaException if the extraction timed out * @throws IOException if an input error occurred */ private void doOCR(File input, File output, TesseractOCRConfig config, ParseContext parseContext) throws IOException, TikaException { ArrayList cmd = new ArrayList<>( Arrays.asList(getTesseractPath() + getTesseractProg(), input.getPath(), output.getPath(), "--psm", config.getPageSegMode())); //if --psm == 0, don't add anything else to the command line if (! "0".equals(config.getPageSegMode())) { if (!StringUtils.isBlank(config.getLanguage())) { cmd.add("-l"); cmd.add(config.getLanguage()); } for (Map.Entry entry : config.getOtherTesseractConfig().entrySet()) { cmd.add("-c"); cmd.add(entry.getKey() + "=" + entry.getValue()); } cmd.addAll(Arrays.asList("-c", "page_separator=" + config.getPageSeparator(), "-c", (config.isPreserveInterwordSpacing()) ? "preserve_interword_spaces=1" : "preserve_interword_spaces=0", config.getOutputType().name().toLowerCase(Locale.US))); } LOG.debug("Tesseract command: " + String.join(" ", cmd)); ProcessBuilder pb = new ProcessBuilder(cmd); setEnv(pb); Process process = null; String id = null; long timeoutMillis = TikaTaskTimeout.getTimeoutMillis(parseContext, config.getTimeoutSeconds() * 1000); try { process = pb.start(); id = register(process); runOCRProcess(process, timeoutMillis); } finally { if (process != null) { process.destroyForcibly(); } if (id != null) { release(id); } } } private void runOCRProcess(Process process, long timeoutMillis) throws IOException, TikaException { process.getOutputStream().close(); InputStream out = process.getInputStream(); InputStream err = process.getErrorStream(); StringBuilder outBuilder = new StringBuilder(); StringBuilder errBuilder = new StringBuilder(); Thread outThread = logStream(out, outBuilder); Thread errThread = logStream(err, errBuilder); outThread.start(); errThread.start(); int exitValue = Integer.MIN_VALUE; try { boolean finished = process.waitFor(timeoutMillis, TimeUnit.MILLISECONDS); if (!finished) { throw new TikaException("TesseractOCRParser timeout"); } exitValue = process.exitValue(); } catch (InterruptedException e) { Thread.currentThread().interrupt(); throw new TikaException("TesseractOCRParser interrupted", e); } catch (IllegalThreadStateException e) { //this _should_ never be thrown throw new TikaException("TesseractOCRParser timeout"); } if (exitValue > 0) { try { //make sure this thread is actually done errThread.join(1000); } catch (InterruptedException e) { //swallow } throw new TikaException( "TesseractOCRParser bad exit value " + exitValue + " err msg: " + errBuilder.toString()); } } /** * Reads the contents of the given stream and write it to the given XHTML * content handler. The stream is closed once fully processed. * * @param stream Stream where is the result of ocr * @param xhtml XHTML content handler * @throws SAXException if the XHTML SAX events could not be handled * @throws IOException if an input error occurred */ private void extractOutput(InputStream stream, ContentHandler xhtml) throws SAXException, IOException { //
0) { xhtml.characters(buffer, 0, n); } } } xhtml.endElement(XHTML, "div", "div"); } private void extractHOCROutput(InputStream is, ParseContext parseContext, ContentHandler xhtml) throws TikaException, IOException, SAXException { if (parseContext == null) { parseContext = new ParseContext(); } //
{ Reader reader = new InputStreamReader(stream, UTF_8); char[] buffer = new char[1024]; try { for (int n = reader.read(buffer); n != -1; n = reader.read(buffer)) { out.append(buffer, 0, n); } } catch (IOException e) { //swallow } finally { IOUtils.closeQuietly(stream); } LOG.debug("{}", out); }); } @Override public void initialize(Map params) throws TikaConfigException { hasTesseract = hasTesseract(); if (isEnableImagePreprocessing()) { hasImageMagick = hasImageMagick(); } else { hasImageMagick = false; } if (preloadLangs) { preloadLangs(); if (!StringUtils.isBlank(defaultConfig.getLanguage())) { validateLangString(defaultConfig.getLanguage()); } } imagePreprocessor = new ImagePreprocessor(getImageMagickPath() + getImageMagickProg()); } private void validateLangString(String language) throws TikaConfigException { Set invalidlangs = new HashSet<>(); Set validLangs = new HashSet<>(); TesseractOCRConfig.getLangs(language, validLangs, invalidlangs); if (invalidlangs.size() > 0) { throw new TikaConfigException("Invalid language code(s): " + invalidlangs); } if (langs.size() > 0) { for (String lang : validLangs) { if (!langs.contains(lang)) { throw new TikaConfigException( "tesseract does not have " + lang + " available. I see only: " + langs); } } } } @Override public void checkInitialization(InitializableProblemHandler problemHandler) throws TikaConfigException { if (langs.size() > 0 && !StringUtils.isBlank(defaultConfig.getLanguage())) { if (!langs.contains(defaultConfig.getLanguage())) { throw new TikaConfigException("It doesn't look like tesseract has lang data for " + defaultConfig.getLanguage() + ". " + "I see only: " + langs); } } } public Set getLangs() { return langs; } protected boolean hasWarned() { if (HAS_WARNED) { return true; } synchronized (LOCK) { if (HAS_WARNED) { return true; } return false; } } protected void warn() { LOG.info("Tesseract is installed and is being invoked. " + "This can add greatly to processing time. If you do not want tesseract " + "to be applied to your files see: " + "https://cwiki.apache.org/confluence/display/TIKA/TikaOCR#TikaOCR-disable-ocr"); HAS_WARNED = true; } public String getTesseractPath() { return tesseractPath; } /** * Set the path to the Tesseract executable's directory, needed if it is not on system path. *

* Note that if you set this value, it is highly recommended that you also * set the path to (and including) the 'tessdata' folder using {@link #setTessdataPath}. *

*/ @Field public void setTesseractPath(String tesseractPath) { tesseractPath = FilenameUtils.normalize(tesseractPath); if (!tesseractPath.isEmpty() && !tesseractPath.endsWith(File.separator)) { tesseractPath += File.separator; } this.tesseractPath = tesseractPath; } public String getTessdataPath() { return this.tessdataPath; } /** * Set the path to the 'tessdata' folder, which contains language files and config files. In * some cases (such * as on Windows), this folder is found in the Tesseract installation, but in other cases * (such as when Tesseract is built from source), it may be located elsewhere. *

* Make sure to include the 'tessdata' folder in this path: '/blah/de/blah/tessdata' */ @Field public void setTessdataPath(String tessdataPath) { tessdataPath = FilenameUtils.normalize(tessdataPath); if (!tessdataPath.isEmpty() && !tessdataPath.endsWith(File.separator)) { tessdataPath += File.separator; } this.tessdataPath = tessdataPath; } public String getImageMagickPath() { return imageMagickPath; } /** * Set the path to the ImageMagick executable directory, needed if it is not on system path. * * @param imageMagickPath to ImageMagick executable directory. */ @Field public void setImageMagickPath(String imageMagickPath) { imageMagickPath = FilenameUtils.normalize(imageMagickPath); if (!imageMagickPath.isEmpty() && !imageMagickPath.endsWith(File.separator)) { imageMagickPath += File.separator; } this.imageMagickPath = imageMagickPath; } @Field public void setOtherTesseractSettings(List settings) throws TikaConfigException { for (String s : settings) { String[] bits = s.trim().split("\\s+"); if (bits.length != 2) { throw new TikaConfigException( "Expected space delimited key value pair." + " However, I found " + bits.length + " bits."); } defaultConfig.addOtherTesseractConfig(bits[0], bits[1]); } } public List getOtherTesseractSettings() { List settings = new ArrayList<>(); Map sorted = new TreeMap<>(defaultConfig.getOtherTesseractConfig()); for (Map.Entry e :sorted.entrySet()) { settings.add(e.getKey() + " " + e.getValue()); } return settings; } @Field public void setSkipOCR(boolean skipOCR) { defaultConfig.setSkipOcr(skipOCR); } public boolean isSkipOCR() { return defaultConfig.isSkipOcr(); } @Field public void setLanguage(String language) { defaultConfig.setLanguage(language); } public String getLanguage() { return defaultConfig.getLanguage(); } @Field public void setPageSegMode(String pageSegMode) { defaultConfig.setPageSegMode(pageSegMode); } public String getPageSegMode() { return defaultConfig.getPageSegMode(); } @Field public void setMaxFileSizeToOcr(long maxFileSizeToOcr) { defaultConfig.setMaxFileSizeToOcr(maxFileSizeToOcr); } public long getMaxFileSizeToOcr() { return defaultConfig.getMaxFileSizeToOcr(); } @Field public void setMinFileSizeToOcr(long minFileSizeToOcr) { defaultConfig.setMinFileSizeToOcr(minFileSizeToOcr); } public long getMinFileSizeToOcr() { return defaultConfig.getMinFileSizeToOcr(); } /** * Set default timeout in seconds. This can be overridden per parse * with {@link TikaTaskTimeout} sent in via the {@link ParseContext} * at parse time. * * @param timeout */ @Field public void setTimeout(int timeout) { defaultConfig.setTimeoutSeconds(timeout); } public int getTimeout() { return defaultConfig.getTimeoutSeconds(); } @Field public void setOutputType(String outputType) { defaultConfig.setOutputType(outputType); } public String getOutputType() { return defaultConfig.getOutputType().name(); } @Field public void setPreserveInterwordSpacing(boolean preserveInterwordSpacing) { defaultConfig.setPreserveInterwordSpacing(preserveInterwordSpacing); } public boolean isPreserveInterwordSpacing() { return defaultConfig.isPreserveInterwordSpacing(); } @Field public void setEnableImagePreprocessing(boolean enableImagePreprocessing) { defaultConfig.setEnableImagePreprocessing(enableImagePreprocessing); } public boolean isEnableImagePreprocessing() { return defaultConfig.isEnableImagePreprocessing(); } @Field public void setDensity(int density) { defaultConfig.setDensity(density); } public int getDensity() { return defaultConfig.getDensity(); } @Field public void setDepth(int depth) { defaultConfig.setDepth(depth); } public int getDepth() { return defaultConfig.getDepth(); } @Field public void setColorspace(String colorspace) { defaultConfig.setColorspace(colorspace); } public String getColorspace() { return defaultConfig.getColorspace(); } @Field public void setFilter(String filter) { defaultConfig.setFilter(filter); } public String getFilter() { return defaultConfig.getFilter(); } @Field public void setResize(int resize) { defaultConfig.setResize(resize); } public int getResize() { return defaultConfig.getResize(); } @Field public void setApplyRotation(boolean applyRotation) { defaultConfig.setApplyRotation(applyRotation); } public boolean isApplyRotation() { return defaultConfig.isApplyRotation(); } /** * If set to true and if tesseract is found, this will load the * langs that result from --list-langs. At parse time, the * parser will verify that tesseract has the requested lang * available. *

* If set to false (the default) and tesseract is found, if a user * requests a language that tesseract does not have data for, * a TikaException will be thrown with tesseract's native exception * message, which is a bit less readable. * * @param preloadLangs */ @Field public void setPreloadLangs(boolean preloadLangs) { this.preloadLangs = preloadLangs; } public boolean isPreloadLangs() { return this.preloadLangs; } public TesseractOCRConfig getDefaultConfig() { return defaultConfig; } private void preloadLangs() { String[] args = new String[]{getTesseractPath() + getTesseractProg(), "--list-langs"}; ProcessBuilder pb = new ProcessBuilder(args); setEnv(pb); Process process = null; try { process = pb.start(); getLangs(process, defaultConfig.getTimeoutSeconds()); } catch (TikaException | IOException e) { LOG.warn("Problem preloading langs", e); } finally { if (process != null) { process.destroyForcibly(); } } } private void getLangs(Process process, int timeoutSeconds) throws IOException, TikaException { process.getOutputStream().close(); InputStream out = process.getInputStream(); InputStream err = process.getErrorStream(); StringBuilder outBuilder = new StringBuilder(); StringBuilder errBuilder = new StringBuilder(); Thread outThread = logStream(out, outBuilder); Thread errThread = logStream(err, errBuilder); outThread.start(); errThread.start(); int exitValue = Integer.MIN_VALUE; try { boolean finished = process.waitFor(timeoutSeconds, TimeUnit.SECONDS); if (!finished) { throw new TikaException("TesseractOCRParser timeout"); } exitValue = process.exitValue(); outThread.join(1000); } catch (InterruptedException e) { Thread.currentThread().interrupt(); throw new TikaException("TesseractOCRParser interrupted", e); } catch (IllegalThreadStateException e) { //this _should_ never be thrown throw new TikaException("TesseractOCRParser timeout"); } if (exitValue > 0) { throw new TikaException( "TesseractOCRParser bad exit value " + exitValue + " err msg: " + errBuilder.toString()); } for (String line : outBuilder.toString().split("[\r\n]+")) { if (line.startsWith("List of available")) { continue; } langs.add(line.trim()); } } private static class HOCRPassThroughHandler extends DefaultHandler { public static final Set IGNORE = unmodifiableSet("html", "head", "title", "meta", "body"); private final ContentHandler xhtml; public HOCRPassThroughHandler(ContentHandler xhtml) { this.xhtml = xhtml; } private static Set unmodifiableSet(String... elements) { return Collections.unmodifiableSet(new HashSet<>(Arrays.asList(elements))); } /** * Starts the given element. Table cells and list items are automatically * indented by emitting a tab character as ignorable whitespace. */ @Override public void startElement(String uri, String local, String name, Attributes attributes) throws SAXException { if (!IGNORE.contains(name)) { xhtml.startElement(uri, local, name, attributes); } } /** * Ends the given element. Block elements are automatically followed * by a newline character. */ @Override public void endElement(String uri, String local, String name) throws SAXException { if (!IGNORE.contains(name)) { xhtml.endElement(uri, local, name); } } /** * @see TIKA-210 */ @Override public void characters(char[] ch, int start, int length) throws SAXException { xhtml.characters(ch, start, length); } } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy