// org.apache.tika.parser.pdf.PDFPureJavaParserConfig (Maven / Gradle / Ivy)
//
// Go to download
package org.apache.tika.parser.pdf;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.InputStream;
import java.io.Serializable;
import java.util.Locale;
import java.util.Objects;
import java.util.Properties;

import org.apache.pdfbox.rendering.ImageType;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.tika.config.Field;

/**
 * Config for PDFParser.
 * <p>
 * This allows parameters to be set programmatically:
 * <ol>
 * <li>Calls to PDFParser, i.e. parser.getPDFParserConfig().setEnableAutoSpace() (as before)</li>
 * <li>Constructor of PDFParser</li>
 * <li>Passing to PDFParser through a ParseContext: context.set(PDFParserConfig.class, config);</li>
 * </ol>
 * <p>
 * Parameters can also be set by modifying the {@code PDFParser.properties} file
 * (the resource the no-arg constructor loads from this class's package),
 * which lives in the expected places, in trunk:
 * tika-parsers/src/main/resources/org/apache/tika/parser/pdf
 * <p>
 * Or, in tika-app-x.x.jar or tika-parsers-x.x.jar:
 * org/apache/tika/parser/pdf
 */
public class PDFPureJavaParserConfig implements Serializable {

    /**
     * Strategy controlling whether pages are OCR'd, text-extracted, or both.
     */
    public enum OCR_STRATEGY {
        NO_OCR,
        OCR_ONLY,
        OCR_AND_TEXT_EXTRACTION;

        /**
         * Parses a case-insensitive strategy name.
         *
         * @param s strategy name; {@code null} maps to {@link #NO_OCR}
         * @return the matching strategy
         * @throws IllegalArgumentException if {@code s} is non-null and not recognized
         */
        private static OCR_STRATEGY parse(String s) {
            if (s == null) {
                return NO_OCR;
            }
            // Lower-case once with a fixed locale so matching is locale-independent.
            String lowerCased = s.toLowerCase(Locale.ROOT);
            if ("no_ocr".equals(lowerCased)) {
                return NO_OCR;
            }
            if ("ocr_only".equals(lowerCased)) {
                return OCR_ONLY;
            }
            // Substring match: anything containing "ocr_and_text" is accepted.
            if (lowerCased.contains("ocr_and_text")) {
                return OCR_AND_TEXT_EXTRACTION;
            }
            StringBuilder sb = new StringBuilder();
            sb.append("I regret that I don't recognize '").append(s);
            sb.append("' as an OCR_STRATEGY. I only recognize:");
            int i = 0;
            for (OCR_STRATEGY strategy : OCR_STRATEGY.values()) {
                if (i++ > 0) {
                    sb.append(", ");
                }
                sb.append(strategy.toString());
            }
            throw new IllegalArgumentException(sb.toString());
        }
    }

    private static final long serialVersionUID = 6492570218190936986L;

    // True if we let PDFBox "guess" where spaces should go:
    private boolean enableAutoSpace = true;

    // True if we let PDFBox remove duplicate overlapping text:
    private boolean suppressDuplicateOverlappingText;

    // True if we extract annotation text ourselves
    // (workaround for PDFBOX-1143):
    private boolean extractAnnotationText = true;

    // True if we should sort text tokens by position
    // (necessary for some PDFs, but messes up other PDFs):
    @Field
    private boolean sortByPosition = false;

    //True if acroform content should be extracted
    private boolean extractAcroFormContent = true;

    //True if inline PDXImage objects should be extracted
    private boolean extractInlineImages = false;

    //True if inline images (as identified by their object id within
    //a pdf file) should only be extracted once.
    private boolean extractUniqueInlineImagesOnly = true;

    //The character width-based tolerance value used to estimate where spaces in text should be added.
    //Null means "not configured"; configure() then leaves the stripper's own value untouched.
    private Float averageCharTolerance;

    //The space width-based tolerance value used to estimate where spaces in text should be added.
    //Null means "not configured"; configure() then leaves the stripper's own value untouched.
    private Float spacingTolerance;

    //If the PDF has an XFA element, process only that and skip extracting
    //content from elsewhere in the document.
    private boolean ifXFAExtractOnlyXFA = false;

    private OCR_STRATEGY ocrStrategy = OCR_STRATEGY.NO_OCR;

    private int ocrDPI = 300;
    private ImageType ocrImageType = ImageType.GRAY;
    private String ocrImageFormatName = "png";

    //Defaults to the same checker init() selects when no access-permission
    //properties are set, so the getter is non-null even when no properties
    //resource could be loaded.
    private AccessChecker accessChecker = new AccessChecker();

    //The PDFParser can throw IOExceptions if there is a problem
    //with a stream.  If this is set to true, Tika's
    //parser catches these exceptions, reports them in the metadata
    //and then throws the first stored exception after the parse has completed.
    private boolean catchIntermediateIOExceptions = true;

    private boolean extractActions = false;

    /**
     * Creates a config initialized from the {@code PDFParser.properties}
     * resource located next to this class. If the resource is missing,
     * the built-in defaults are kept.
     */
    public PDFPureJavaParserConfig() {
        init(this.getClass().getResourceAsStream("PDFParser.properties"));
    }

    /**
     * Loads properties from InputStream and then tries to close InputStream.
     * If there is an IOException, this silently swallows the exception
     * and goes back to the default.
     *
     * @param is stream of properties to load; if {@code null}, the defaults are kept
     */
    public PDFPureJavaParserConfig(InputStream is) {
        init(is);
    }

    /**
     * Initializes this object from the given properties stream and then
     * closes the stream. Keys that are absent leave the corresponding
     * default in place.
     *
     * @param is stream of properties; {@code null} leaves all defaults untouched
     * @throws IllegalArgumentException if {@code ocrStrategy} or {@code ocrImageType}
     *                                  is present but not recognized
     * @throws NumberFormatException    if {@code ocrDPI} is present but not an integer
     */
    private void init(InputStream is) {
        if (is == null) {
            return;
        }
        Properties props = new Properties();
        try (InputStream in = is) {
            props.load(in);
        } catch (IOException ignored) {
            // Deliberately best-effort (see the constructor contract): on a read
            // or close failure we continue with whatever was loaded plus defaults.
        }

        setEnableAutoSpace(
                getBooleanProp(props.getProperty("enableAutoSpace"), getEnableAutoSpace()));
        setSuppressDuplicateOverlappingText(
                getBooleanProp(props.getProperty("suppressDuplicateOverlappingText"),
                        getSuppressDuplicateOverlappingText()));
        setExtractAnnotationText(
                getBooleanProp(props.getProperty("extractAnnotationText"),
                        getExtractAnnotationText()));
        setSortByPosition(
                getBooleanProp(props.getProperty("sortByPosition"),
                        getSortByPosition()));
        setExtractAcroFormContent(
                getBooleanProp(props.getProperty("extractAcroFormContent"),
                        getExtractAcroFormContent()));
        setExtractInlineImages(
                getBooleanProp(props.getProperty("extractInlineImages"),
                        getExtractInlineImages()));
        setExtractUniqueInlineImagesOnly(
                getBooleanProp(props.getProperty("extractUniqueInlineImagesOnly"),
                        getExtractUniqueInlineImagesOnly()));
        setIfXFAExtractOnlyXFA(
                getBooleanProp(props.getProperty("ifXFAExtractOnlyXFA"),
                        getIfXFAExtractOnlyXFA()));
        setCatchIntermediateIOExceptions(
                getBooleanProp(props.getProperty("catchIntermediateIOExceptions"),
                        getCatchIntermediateIOExceptions()));

        // parse(null) yields NO_OCR, which is also the field default.
        setOcrStrategy(OCR_STRATEGY.parse(props.getProperty("ocrStrategy")));

        setOcrDPI(getIntProp(props.getProperty("ocrDPI"), getOcrDPI()));

        // Only override the image format/type defaults when the key is present;
        // a missing key must not null out the format name or fail type parsing.
        String formatName = props.getProperty("ocrImageFormatName");
        if (formatName != null) {
            setOcrImageFormatName(formatName);
        }
        String imageTypeName = props.getProperty("ocrImageType");
        if (imageTypeName != null) {
            setOcrImageType(parseImageType(imageTypeName));
        }

        setExtractActions(
                getBooleanProp(props.getProperty("extractActions"), getExtractActions()));

        boolean checkExtractAccessPermission =
                getBooleanProp(props.getProperty("checkExtractAccessPermission"), false);
        boolean allowExtractionForAccessibility =
                getBooleanProp(props.getProperty("allowExtractionForAccessibility"), true);

        if (!checkExtractAccessPermission) {
            //silently ignore the crazy configuration of checkExtractAccessPermission = false,
            //but allowExtractionForAccessibility=false
            accessChecker = new AccessChecker();
        } else {
            accessChecker = new AccessChecker(allowExtractionForAccessibility);
        }
    }

    /**
     * Configures the given pdf2XHTML with the text-extraction settings held
     * by this config (sort order, word separator, tolerances, duplicate
     * suppression). Tolerances are only applied when they have been set.
     *
     * @param pdf2XHTML the extractor to configure
     */
    public void configure(PDF2XHTMLPureJava pdf2XHTML) {
        pdf2XHTML.setSortByPosition(getSortByPosition());
        // Auto-spacing is implemented through the word separator: a single
        // space when enabled, the empty string when disabled.
        pdf2XHTML.setWordSeparator(getEnableAutoSpace() ? " " : "");
        Float charTolerance = getAverageCharTolerance();
        if (charTolerance != null) {
            pdf2XHTML.setAverageCharTolerance(charTolerance);
        }
        Float spaceTolerance = getSpacingTolerance();
        if (spaceTolerance != null) {
            pdf2XHTML.setSpacingTolerance(spaceTolerance);
        }
        pdf2XHTML.setSuppressDuplicateOverlappingText(getSuppressDuplicateOverlappingText());
    }

    /**
     * @return whether AcroForm content should be extracted
     * @see #setExtractAcroFormContent(boolean)
     */
    public boolean getExtractAcroFormContent() {
        return extractAcroFormContent;
    }

    /**
     * If true (the default), extract content from AcroForms
     * at the end of the document.  If an XFA is found,
     * try to process that, otherwise, process the AcroForm.
     *
     * @param extractAcroFormContent whether to extract AcroForm content
     */
    public void setExtractAcroFormContent(boolean extractAcroFormContent) {
        this.extractAcroFormContent = extractAcroFormContent;
    }

    /**
     * @see #setIfXFAExtractOnlyXFA(boolean)
     * @return how to handle XFA data if it exists
     */
    public boolean getIfXFAExtractOnlyXFA() {
        return ifXFAExtractOnlyXFA;
    }

    /**
     * If false (the default), extract content from the full PDF
     * as well as the XFA form.  This will likely lead to some duplicative
     * content.
     *
     * @param ifXFAExtractOnlyXFA whether to extract only the XFA when one exists
     */
    public void setIfXFAExtractOnlyXFA(boolean ifXFAExtractOnlyXFA) {
        this.ifXFAExtractOnlyXFA = ifXFAExtractOnlyXFA;
    }

    /**
     * @return whether inline images should be extracted
     * @see #setExtractInlineImages(boolean)
     */
    public boolean getExtractInlineImages() {
        return extractInlineImages;
    }

    /**
     * If true, extract inline embedded PDXImages.
     * <b>Beware:</b> some PDF documents of modest size (~4MB) can contain
     * thousands of embedded images totaling &gt; 2.5 GB.  Also, at least as of PDFBox 1.8.5,
     * there can be surprisingly large memory consumption and/or out of memory errors.
     * Set to <code>true</code> with caution.
     * <p>
     * The default is <code>false</code>.
     * <p>
     * See also: {@link #setExtractUniqueInlineImagesOnly(boolean)}
     *
     * @param extractInlineImages whether to extract inline images
     */
    public void setExtractInlineImages(boolean extractInlineImages) {
        this.extractInlineImages = extractInlineImages;
    }

    /**
     * @return whether each inline image should be extracted only once per document
     * @see #setExtractUniqueInlineImagesOnly(boolean)
     */
    public boolean getExtractUniqueInlineImagesOnly() {
        return extractUniqueInlineImagesOnly;
    }

    /**
     * Multiple pages within a PDF file might refer to the same underlying image.
     * If {@code extractUniqueInlineImagesOnly} is set to <code>false</code>, the
     * parser will call the EmbeddedExtractor each time the image appears on a page.
     * This might be desired for some use cases.  However, to avoid duplication of
     * extracted images, set this to <code>true</code>.  The default is <code>true</code>.
     * <p>
     * Note that uniqueness is determined only by the underlying PDF COSObject id, not by
     * file hash or similar equality metric.
     * If the PDF actually contains multiple copies of the same image
     * -- all with different object ids -- then all images will be extracted.
     * <p>
     * For this parameter to have any effect, {@link #setExtractInlineImages(boolean)}
     * must be set to <code>true</code>.
     * <p>
     * Because of TIKA-1742 -- to avoid infinite recursion -- no matter the setting
     * of this parameter, the extractor will only pull out one copy of each image per
     * page.  This parameter tries to capture uniqueness across the entire document.
     *
     * @param extractUniqueInlineImagesOnly whether to extract each image only once
     */
    public void setExtractUniqueInlineImagesOnly(boolean extractUniqueInlineImagesOnly) {
        this.extractUniqueInlineImagesOnly = extractUniqueInlineImagesOnly;
    }

    /**
     * @return whether automatic space insertion is enabled
     * @see #setEnableAutoSpace(boolean)
     */
    public boolean getEnableAutoSpace() {
        return enableAutoSpace;
    }

    /**
     * If true (the default), the parser should estimate
     * where spaces should be inserted between words.  For
     * many PDFs this is necessary as they do not include
     * explicit whitespace characters.
     *
     * @param enableAutoSpace whether to estimate word spacing
     */
    public void setEnableAutoSpace(boolean enableAutoSpace) {
        this.enableAutoSpace = enableAutoSpace;
    }

    /**
     * @return whether duplicate overlapping text should be suppressed
     * @see #setSuppressDuplicateOverlappingText(boolean)
     */
    public boolean getSuppressDuplicateOverlappingText() {
        return suppressDuplicateOverlappingText;
    }

    /**
     * If true, the parser should try to remove duplicated
     * text over the same region.  This is needed for some
     * PDFs that achieve bolding by re-writing the same
     * text in the same area.  Note that this can
     * slow down extraction substantially (PDFBOX-956) and
     * sometimes remove characters that were not in fact
     * duplicated (PDFBOX-1155).  By default this is disabled.
     *
     * @param suppressDuplicateOverlappingText whether to suppress duplicated text
     */
    public void setSuppressDuplicateOverlappingText(
            boolean suppressDuplicateOverlappingText) {
        this.suppressDuplicateOverlappingText = suppressDuplicateOverlappingText;
    }

    /**
     * @return whether annotation text should be extracted
     * @see #setExtractAnnotationText(boolean)
     */
    public boolean getExtractAnnotationText() {
        return extractAnnotationText;
    }

    /**
     * If true (the default), text in annotations will be
     * extracted.
     *
     * @param extractAnnotationText whether to extract annotation text
     */
    public void setExtractAnnotationText(boolean extractAnnotationText) {
        this.extractAnnotationText = extractAnnotationText;
    }

    /**
     * @return whether text tokens are sorted by position before extraction
     * @see #setSortByPosition(boolean)
     */
    public boolean getSortByPosition() {
        return sortByPosition;
    }

    /**
     * If true, sort text tokens by their x/y position
     * before extracting text.  This may be necessary for
     * some PDFs (if the text tokens are not rendered "in
     * order"), while for other PDFs it can produce the
     * wrong result (for example if there are 2 columns,
     * the text will be interleaved).  Default is false.
     *
     * @param sortByPosition whether to sort text tokens by position
     */
    public void setSortByPosition(boolean sortByPosition) {
        this.sortByPosition = sortByPosition;
    }

    /**
     * @return the configured average-character tolerance, or {@code null} if unset
     * @see #setAverageCharTolerance(Float)
     */
    public Float getAverageCharTolerance() {
        return averageCharTolerance;
    }

    /**
     * See {@link PDFTextStripper#setAverageCharTolerance(float)}
     *
     * @param averageCharTolerance the tolerance, or {@code null} to leave it unset
     */
    public void setAverageCharTolerance(Float averageCharTolerance) {
        this.averageCharTolerance = averageCharTolerance;
    }

    /**
     * @return the configured spacing tolerance, or {@code null} if unset
     * @see #setSpacingTolerance(Float)
     */
    public Float getSpacingTolerance() {
        return spacingTolerance;
    }

    /**
     * See {@link PDFTextStripper#setSpacingTolerance(float)}
     *
     * @param spacingTolerance the tolerance, or {@code null} to leave it unset
     */
    public void setSpacingTolerance(Float spacingTolerance) {
        this.spacingTolerance = spacingTolerance;
    }

    /**
     * @return the access checker held by this config
     */
    public AccessChecker getAccessChecker() {
        return accessChecker;
    }

    /**
     * @param accessChecker the access checker to store in this config
     */
    public void setAccessChecker(AccessChecker accessChecker) {
        this.accessChecker = accessChecker;
    }

    /**
     * See {@link #setCatchIntermediateIOExceptions(boolean)}
     * @return whether or not to catch IOExceptions
     * @deprecated use {@link #getCatchIntermediateIOExceptions()}
     */
    @Deprecated
    public boolean isCatchIntermediateIOExceptions() {
        return catchIntermediateIOExceptions;
    }

    /**
     * See {@link #setCatchIntermediateIOExceptions(boolean)}
     * @return whether or not to catch IOExceptions
     */
    public boolean getCatchIntermediateIOExceptions() {
        return catchIntermediateIOExceptions;
    }

    /**
     * The PDFBox parser will throw an IOException if there is
     * a problem with a stream.  If this is set to <code>true</code>,
     * Tika's PDFParser will catch these exceptions and try to parse
     * the rest of the document.  After the parse is completed,
     * Tika's PDFParser will throw the first caught exception.
     *
     * @param catchIntermediateIOExceptions whether to catch intermediate IOExceptions
     */
    public void setCatchIntermediateIOExceptions(boolean catchIntermediateIOExceptions) {
        this.catchIntermediateIOExceptions = catchIntermediateIOExceptions;
    }

    /**
     * Which strategy to use for OCR
     *
     * @param ocrStrategy the strategy to use
     */
    public void setOcrStrategy(OCR_STRATEGY ocrStrategy) {
        this.ocrStrategy = ocrStrategy;
    }

    /**
     * Which strategy to use for OCR
     *
     * @param ocrStrategyString case-insensitive strategy name; {@code null} selects
     *                          {@link OCR_STRATEGY#NO_OCR}
     * @throws IllegalArgumentException if the name is not recognized
     */
    public void setOcrStrategy(String ocrStrategyString) {
        this.ocrStrategy = OCR_STRATEGY.parse(ocrStrategyString);
    }

    /**
     * @return strategy to use for OCR
     */
    public OCR_STRATEGY getOcrStrategy() {
        return ocrStrategy;
    }

    /**
     * Interprets a property value as a boolean.
     *
     * @param p              raw property value, may be {@code null}
     * @param defaultMissing value to return when {@code p} is {@code null} or is
     *                       neither "true" nor "false" (case-insensitive)
     * @return the parsed boolean or {@code defaultMissing}
     */
    private static boolean getBooleanProp(String p, boolean defaultMissing) {
        if (p == null) {
            return defaultMissing;
        }
        String lowerCased = p.toLowerCase(Locale.ROOT);
        if (lowerCased.equals("true")) {
            return true;
        }
        if (lowerCased.equals("false")) {
            return false;
        }
        return defaultMissing;
    }

    /**
     * Interprets a property value as an int.
     *
     * @param p              raw property value, may be {@code null}
     * @param defaultMissing value to return when {@code p} is {@code null}
     * @return the parsed int or {@code defaultMissing}
     * @throws NumberFormatException if there's a non-null unparseable string passed in
     */
    private static int getIntProp(String p, int defaultMissing) {
        if (p == null) {
            return defaultMissing;
        }
        return Integer.parseInt(p);
    }

    /**
     * String representation of the image format used to render
     * the page image for OCR (examples: png, tiff, jpeg)
     *
     * @return name of the image format used to render the page image
     */
    public String getOcrImageFormatName() {
        return ocrImageFormatName;
    }

    /**
     * @see #getOcrImageFormatName()
     *
     * @param ocrImageFormatName name of image format used to render
     *                           page image
     */
    public void setOcrImageFormatName(String ocrImageFormatName) {
        this.ocrImageFormatName = ocrImageFormatName;
    }

    /**
     * Image type used to render the page image for OCR.
     * @see #setOcrImageType(ImageType)
     * @return image type
     */
    public ImageType getOcrImageType() {
        return ocrImageType;
    }

    /**
     * Image type used to render the page image for OCR.
     *
     * @param ocrImageType the image type to use
     */
    public void setOcrImageType(ImageType ocrImageType) {
        this.ocrImageType = ocrImageType;
    }

    /**
     * Image type used to render the page image for OCR.
     *
     * @param ocrImageTypeString case-insensitive name of an {@link ImageType} constant
     * @throws IllegalArgumentException if the name is {@code null} or not recognized
     * @see #setOcrImageType(ImageType)
     */
    public void setOcrImageType(String ocrImageTypeString) {
        this.ocrImageType = parseImageType(ocrImageTypeString);
    }

    /**
     * Dots per inch used to render the page image for OCR
     * @return dots per inch
     */
    public int getOcrDPI() {
        return ocrDPI;
    }

    /**
     * Dots per inch used to render the page image for OCR
     *
     * @param ocrDPI dots per inch
     */
    public void setOcrDPI(int ocrDPI) {
        this.ocrDPI = ocrDPI;
    }

    /**
     * Whether or not to extract PDActions from the file.
     * Most Action types are handled inline; javascript macros
     * are processed as embedded documents.
     *
     * @param v whether to extract PDActions
     */
    public void setExtractActions(boolean v) {
        extractActions = v;
    }

    /**
     * @see #setExtractActions(boolean)
     * @return whether or not to extract PDActions
     */
    public boolean getExtractActions() {
        return extractActions;
    }

    /**
     * Resolves an {@link ImageType} from its name, ignoring case.
     *
     * @param ocrImageType name of the image type
     * @return the matching image type
     * @throws IllegalArgumentException if {@code ocrImageType} is {@code null} or
     *                                  matches no {@link ImageType} constant
     */
    private ImageType parseImageType(String ocrImageType) {
        // A null name cannot match anything; fall through to the descriptive
        // IllegalArgumentException instead of failing with a NullPointerException.
        if (ocrImageType != null) {
            for (ImageType t : ImageType.values()) {
                if (ocrImageType.equalsIgnoreCase(t.toString())) {
                    return t;
                }
            }
        }
        StringBuilder sb = new StringBuilder();
        sb.append("I regret that I could not parse '");
        sb.append(ocrImageType);
        sb.append("'. I'm only familiar with: ");
        int i = 0;
        for (ImageType t : ImageType.values()) {
            // Separator goes between entries, i.e. before every entry but the first.
            if (i++ > 0) {
                sb.append(", ");
            }
            sb.append(t.toString());
        }
        throw new IllegalArgumentException(sb.toString());
    }

    /**
     * Two configs are equal when all of their settings are equal.
     * Nullable settings (tolerances, OCR strategy/image type/format name,
     * access checker) are compared null-safely.
     */
    @Override
    public boolean equals(Object o) {
        if (this == o) {
            return true;
        }
        if (!(o instanceof PDFPureJavaParserConfig)) {
            return false;
        }

        PDFPureJavaParserConfig config = (PDFPureJavaParserConfig) o;

        return getEnableAutoSpace() == config.getEnableAutoSpace()
                && getSuppressDuplicateOverlappingText() == config.getSuppressDuplicateOverlappingText()
                && getExtractAnnotationText() == config.getExtractAnnotationText()
                && getSortByPosition() == config.getSortByPosition()
                && getExtractAcroFormContent() == config.getExtractAcroFormContent()
                && getExtractInlineImages() == config.getExtractInlineImages()
                && getExtractUniqueInlineImagesOnly() == config.getExtractUniqueInlineImagesOnly()
                && getIfXFAExtractOnlyXFA() == config.getIfXFAExtractOnlyXFA()
                && getOcrDPI() == config.getOcrDPI()
                && getCatchIntermediateIOExceptions() == config.getCatchIntermediateIOExceptions()
                && Objects.equals(getAverageCharTolerance(), config.getAverageCharTolerance())
                && Objects.equals(getSpacingTolerance(), config.getSpacingTolerance())
                && Objects.equals(getOcrStrategy(), config.getOcrStrategy())
                && getOcrImageType() == config.getOcrImageType()
                && Objects.equals(getOcrImageFormatName(), config.getOcrImageFormatName())
                && getExtractActions() == config.getExtractActions()
                && Objects.equals(getAccessChecker(), config.getAccessChecker());
    }

    /**
     * Hash code consistent with {@link #equals(Object)}; unset (null)
     * settings contribute 0.
     */
    @Override
    public int hashCode() {
        int result = (getEnableAutoSpace() ? 1 : 0);
        result = 31 * result + (getSuppressDuplicateOverlappingText() ? 1 : 0);
        result = 31 * result + (getExtractAnnotationText() ? 1 : 0);
        result = 31 * result + (getSortByPosition() ? 1 : 0);
        result = 31 * result + (getExtractAcroFormContent() ? 1 : 0);
        result = 31 * result + (getExtractInlineImages() ? 1 : 0);
        result = 31 * result + (getExtractUniqueInlineImagesOnly() ? 1 : 0);
        result = 31 * result + Objects.hashCode(getAverageCharTolerance());
        result = 31 * result + Objects.hashCode(getSpacingTolerance());
        result = 31 * result + (getIfXFAExtractOnlyXFA() ? 1 : 0);
        result = 31 * result + Objects.hashCode(ocrStrategy);
        result = 31 * result + getOcrDPI();
        result = 31 * result + Objects.hashCode(getOcrImageType());
        result = 31 * result + Objects.hashCode(getOcrImageFormatName());
        result = 31 * result + Objects.hashCode(getAccessChecker());
        result = 31 * result + (getCatchIntermediateIOExceptions() ? 1 : 0);
        result = 31 * result + (getExtractActions() ? 1 : 0);
        return result;
    }

    @Override
    public String toString() {
        return "PDFParserConfig{" +
                "enableAutoSpace=" + enableAutoSpace +
                ", suppressDuplicateOverlappingText=" + suppressDuplicateOverlappingText +
                ", extractAnnotationText=" + extractAnnotationText +
                ", sortByPosition=" + sortByPosition +
                ", extractAcroFormContent=" + extractAcroFormContent +
                ", extractInlineImages=" + extractInlineImages +
                ", extractUniqueInlineImagesOnly=" + extractUniqueInlineImagesOnly +
                ", averageCharTolerance=" + averageCharTolerance +
                ", spacingTolerance=" + spacingTolerance +
                ", ifXFAExtractOnlyXFA=" + ifXFAExtractOnlyXFA +
                ", ocrStrategy=" + ocrStrategy +
                ", ocrDPI=" + ocrDPI +
                ", ocrImageType=" + ocrImageType +
                ", ocrImageFormatName='" + ocrImageFormatName + '\'' +
                ", accessChecker=" + accessChecker +
                ", extractActions=" + extractActions +
                ", catchIntermediateIOExceptions=" + catchIntermediateIOExceptions +
                '}';
    }
}