All downloads are free. Search and download functionality uses the official Maven repository.

org.apache.tika.parser.ocr.TesseractOCRConfig Maven / Gradle / Ivy

There is a newer version: 3.0.0-BETA2
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.ocr;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.Serializable;
import java.util.Locale;
import java.util.Properties;

/**
 * Configuration for TesseractOCRParser.
 * <p>
 * This allows you to enable TesseractOCRParser and set its parameters:
 * <pre>
 * TesseractOCRConfig config = new TesseractOCRConfig();
 * config.setTesseractPath(tesseractFolder);
 * parseContext.set(TesseractOCRConfig.class, config);
 * </pre>
 * <p>
 * Parameters can also be set by either editing the existing TesseractOCRConfig.properties file in
 * tika-parser/src/main/resources/org/apache/tika/parser/ocr, or overriding it by creating your own
 * and placing it in the package org/apache/tika/parser/ocr on the classpath.
 */
public class TesseractOCRConfig implements Serializable {

    private static final long serialVersionUID = -4861942486845757891L;

    // Accepted language specification: letters and underscores (e.g. "eng", "chi_sim"),
    // each optionally followed by a '+' separator. Everything else is rejected.
    private static final String LANGUAGE_REGEX = "([A-Za-z_](\\+?))*";

    // Accepted page segmentation modes: 0 through 13.
    private static final String PAGE_SEG_MODE_REGEX = "[0-9]|1[0-3]";

    // Path to tesseract installation folder, if not on system path.
    private String tesseractPath = "";

    // Path to the 'tessdata' folder, which contains language files and config files.
    private String tessdataPath = "";

    // Language dictionary to be used.
    private String language = "eng";

    // Tesseract page segmentation mode.
    private String pageSegMode = "1";

    // Minimum file size to submit file to ocr.
    private int minFileSizeToOcr = 0;

    // Maximum file size to submit file to ocr.
    private int maxFileSizeToOcr = Integer.MAX_VALUE;

    // Maximum time (seconds) to wait for the ocring process termination
    private int timeout = 120;

    /**
     * Default constructor. Reads TesseractOCRConfig.properties from this class's package on the
     * classpath; if the resource is missing, the built-in defaults are kept.
     */
    public TesseractOCRConfig() {
        init(this.getClass().getResourceAsStream("TesseractOCRConfig.properties"));
    }

    /**
     * Loads properties from InputStream and then tries to close InputStream.
     * If there is an IOException, this silently swallows the exception
     * and goes back to the default.
     *
     * @param is stream in {@link Properties} format; may be null, in which case defaults are kept
     * @throws IllegalArgumentException if a loaded language or page segmentation mode is invalid
     * @throws RuntimeException if a loaded integer setting cannot be parsed
     */
    public TesseractOCRConfig(InputStream is) {
        init(is);
    }

    /**
     * Applies the settings found in the given stream on top of the current values and
     * closes the stream. A null stream leaves every setting untouched.
     */
    private void init(InputStream is) {
        if (is == null) {
            return;
        }
        Properties props = new Properties();
        try {
            props.load(is);
        } catch (IOException ignored) {
            // Deliberate best effort: an unreadable stream means we keep the defaults
            // (plus whatever entries were loaded before the failure).
        } finally {
            try {
                is.close();
            } catch (IOException ignored) {
                // Nothing useful can be done if close fails.
            }
        }

        // Every setting falls back to its current value when the property is absent.
        setTesseractPath(getProp(props, "tesseractPath", getTesseractPath()));
        setTessdataPath(getProp(props, "tessdataPath", getTessdataPath()));
        setLanguage(getProp(props, "language", getLanguage()));
        setPageSegMode(getProp(props, "pageSegMode", getPageSegMode()));
        setMinFileSizeToOcr(getProp(props, "minFileSizeToOcr", getMinFileSizeToOcr()));
        setMaxFileSizeToOcr(getProp(props, "maxFileSizeToOcr", getMaxFileSizeToOcr()));
        setTimeout(getProp(props, "timeout", getTimeout()));
    }

    /** @see #setTesseractPath(String tesseractPath) */
    public String getTesseractPath() {
        return tesseractPath;
    }

    /**
     * Set the path to the Tesseract executable, needed if it is not on system path.
     * A trailing file separator is appended when the non-empty path lacks one.
     * <p>
     * Note that if you set this value, it is highly recommended that you also
     * set the path to the 'tessdata' folder using {@link #setTessdataPath}.
     *
     * @throws NullPointerException if tesseractPath is null
     */
    public void setTesseractPath(String tesseractPath) {
        this.tesseractPath = withTrailingSeparator(tesseractPath, "tesseractPath");
    }

    /** @see #setTessdataPath(String tessdataPath) */
    public String getTessdataPath() {
        return tessdataPath;
    }

    /**
     * Set the path to the 'tessdata' folder, which contains language files and config files. In some cases (such
     * as on Windows), this folder is found in the Tesseract installation, but in other cases
     * (such as when Tesseract is built from source), it may be located elsewhere.
     * A trailing file separator is appended when the non-empty path lacks one.
     *
     * @throws NullPointerException if tessdataPath is null
     */
    public void setTessdataPath(String tessdataPath) {
        this.tessdataPath = withTrailingSeparator(tessdataPath, "tessdataPath");
    }

    /** @see #setLanguage(String language) */
    public String getLanguage() {
        return language;
    }

    /**
     * Set tesseract language dictionary to be used. Default is "eng".
     * Multiple languages may be specified, separated by plus characters.
     * Language names may contain letters and underscores (for example "chi_sim").
     *
     * @throws NullPointerException if language is null
     * @throws IllegalArgumentException if language contains any other character
     */
    public void setLanguage(String language) {
        if (language == null) {
            throw new NullPointerException("language must not be null");
        }
        if (!language.matches(LANGUAGE_REGEX)) {
            throw new IllegalArgumentException("Invalid language code");
        }
        this.language = language;
    }

    /** @see #setPageSegMode(String pageSegMode) */
    public String getPageSegMode() {
        return pageSegMode;
    }

    /**
     * Set tesseract page segmentation mode. Accepted values are "0" through "13".
     * Default is 1 = Automatic page segmentation with OSD (Orientation and Script Detection)
     *
     * @throws NullPointerException if pageSegMode is null
     * @throws IllegalArgumentException if pageSegMode is not a supported mode
     */
    public void setPageSegMode(String pageSegMode) {
        if (pageSegMode == null) {
            throw new NullPointerException("pageSegMode must not be null");
        }
        if (!pageSegMode.matches(PAGE_SEG_MODE_REGEX)) {
            throw new IllegalArgumentException("Invalid page segmentation mode");
        }
        this.pageSegMode = pageSegMode;
    }

    /** @see #setMinFileSizeToOcr(int minFileSizeToOcr) */
    public int getMinFileSizeToOcr() {
        return minFileSizeToOcr;
    }

    /**
     * Set minimum file size to submit file to ocr.
     * Default is 0.
     */
    public void setMinFileSizeToOcr(int minFileSizeToOcr) {
        this.minFileSizeToOcr = minFileSizeToOcr;
    }

    /** @see #setMaxFileSizeToOcr(int maxFileSizeToOcr) */
    public int getMaxFileSizeToOcr() {
        return maxFileSizeToOcr;
    }

    /**
     * Set maximum file size to submit file to ocr.
     * Default is Integer.MAX_VALUE.
     */
    public void setMaxFileSizeToOcr(int maxFileSizeToOcr) {
        this.maxFileSizeToOcr = maxFileSizeToOcr;
    }

    /**
     * Set maximum time (seconds) to wait for the ocring process to terminate.
     * Default value is 120s.
     */
    public void setTimeout(int timeout) {
        this.timeout = timeout;
    }

    /** @see #setTimeout(int timeout) */
    public int getTimeout() {
        return timeout;
    }

    /**
     * Returns the path with a trailing {@link File#separator}, unless it is empty
     * or already ends with one.
     *
     * @param path the path to normalize
     * @param name the setting name, used in the error message
     * @throws NullPointerException if path is null
     */
    private static String withTrailingSeparator(String path, String name) {
        if (path == null) {
            throw new NullPointerException(name + " must not be null");
        }
        if (path.isEmpty() || path.endsWith(File.separator)) {
            return path;
        }
        return path + File.separator;
    }

    /**
     * Get property from the properties file passed in.
     * Surrounding whitespace is ignored, because {@link Properties#load} keeps
     * trailing whitespace in values.
     *
     * @param properties     properties file to read from.
     * @param property       the property to fetch.
     * @param defaultMissing default parameter to use.
     * @return the parsed value, or defaultMissing when the property is absent or blank.
     * @throws RuntimeException if the value is present but not a valid integer
     */
    private int getProp(Properties properties, String property, int defaultMissing) {
        String p = properties.getProperty(property);
        if (p == null) {
            return defaultMissing;
        }
        p = p.trim();
        if (p.isEmpty()) {
            return defaultMissing;
        }
        try {
            return Integer.parseInt(p);
        } catch (NumberFormatException ex) {
            throw new RuntimeException(String.format(Locale.ROOT,
                    "Cannot parse TesseractOCRConfig variable %s, invalid integer value",
                    property), ex);
        }
    }

    /**
     * Get property from the properties file passed in.
     *
     * @param properties     properties file to read from.
     * @param property       the property to fetch.
     * @param defaultMissing default parameter to use.
     * @return the value, or defaultMissing when the property is absent.
     */
    private String getProp(Properties properties, String property, String defaultMissing) {
        return properties.getProperty(property, defaultMissing);
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy