// org.apache.tika.parser.pdf.PDFPureJavaParserConfig Maven / Gradle / Ivy
package org.apache.tika.parser.pdf;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.InputStream;
import java.io.Serializable;
import java.util.Locale;
import java.util.Properties;
import org.apache.pdfbox.rendering.ImageType;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.tika.config.Field;
/**
 * Config for PDFParser.
 * <p>
 * This allows parameters to be set programmatically:
 * <ol>
 * <li>Calls to PDFParser, i.e. parser.getPDFParserConfig().setEnableAutoSpace() (as before)</li>
 * <li>Constructor of PDFParser</li>
 * <li>Passing to PDFParser through a ParseContext: context.set(PDFParserConfig.class, config);</li>
 * </ol>
 * <p>
 * Parameters can also be set by modifying the PDFParser.properties file,
 * which lives in the expected places, in trunk:
 * tika-parsers/src/main/resources/org/apache/tika/parser/pdf
 * <p>
 * Or, in tika-app-x.x.jar or tika-parsers-x.x.jar:
 * org/apache/tika/parser/pdf
 */
public class PDFPureJavaParserConfig implements Serializable {

    /**
     * How OCR should be combined with regular text extraction.
     */
    public enum OCR_STRATEGY {
        NO_OCR,
        OCR_ONLY,
        OCR_AND_TEXT_EXTRACTION;

        /**
         * Maps a configuration string onto a strategy, ignoring case.
         * {@code null} maps to {@link #NO_OCR}; any value containing
         * "ocr_and_text" maps to {@link #OCR_AND_TEXT_EXTRACTION}.
         *
         * @param s the configured value, may be {@code null}
         * @return the matching strategy
         * @throws IllegalArgumentException if {@code s} is non-null and not recognized
         */
        private static OCR_STRATEGY parse(String s) {
            if (s == null) {
                return NO_OCR;
            }
            // Lower-case once, locale-independently, and reuse for every comparison.
            String lowerCased = s.toLowerCase(Locale.ROOT);
            if ("no_ocr".equals(lowerCased)) {
                return NO_OCR;
            } else if ("ocr_only".equals(lowerCased)) {
                return OCR_ONLY;
            } else if (lowerCased.contains("ocr_and_text")) {
                return OCR_AND_TEXT_EXTRACTION;
            }
            StringBuilder sb = new StringBuilder();
            sb.append("I regret that I don't recognize '").append(s);
            sb.append("' as an OCR_STRATEGY. I only recognize:");
            int i = 0;
            for (OCR_STRATEGY strategy : OCR_STRATEGY.values()) {
                if (i++ > 0) {
                    sb.append(", ");
                }
                sb.append(strategy.toString());
            }
            throw new IllegalArgumentException(sb.toString());
        }
    }

    private static final long serialVersionUID = 6492570218190936986L;

    // True if we let PDFBox "guess" where spaces should go:
    private boolean enableAutoSpace = true;

    // True if we let PDFBox remove duplicate overlapping text:
    private boolean suppressDuplicateOverlappingText;

    // True if we extract annotation text ourselves
    // (workaround for PDFBOX-1143):
    private boolean extractAnnotationText = true;

    // True if we should sort text tokens by position
    // (necessary for some PDFs, but messes up other PDFs):
    @Field
    private boolean sortByPosition = false;

    // True if acroform content should be extracted
    private boolean extractAcroFormContent = true;

    // True if inline PDXImage objects should be extracted
    private boolean extractInlineImages = false;

    // True if inline images (as identified by their object id within
    // a pdf file) should only be extracted once.
    private boolean extractUniqueInlineImagesOnly = true;

    // The character width-based tolerance value used to estimate where spaces in text
    // should be added. Null (the default) means "do not override" -- see configure().
    private Float averageCharTolerance;

    // The space width-based tolerance value used to estimate where spaces in text
    // should be added. Null (the default) means "do not override" -- see configure().
    private Float spacingTolerance;

    // If the PDF has an XFA element, process only that and skip extracting
    // content from elsewhere in the document.
    private boolean ifXFAExtractOnlyXFA = false;

    private OCR_STRATEGY ocrStrategy = OCR_STRATEGY.NO_OCR;

    private int ocrDPI = 300;

    private ImageType ocrImageType = ImageType.GRAY;

    private String ocrImageFormatName = "png";

    private AccessChecker accessChecker;

    // The PDFParser can throw IOExceptions if there is a problem
    // with a stream. If this is set to true, Tika's
    // parser catches these exceptions, reports them in the metadata
    // and then throws the first stored exception after the parse has completed.
    private boolean catchIntermediateIOExceptions = true;

    private boolean extractActions = false;

    /**
     * Creates a config from the {@code PDFParser.properties} resource resolved
     * relative to this class. If that resource cannot be found, every setting
     * keeps its default.
     *
     * @throws IllegalArgumentException if {@code ocrStrategy} or {@code ocrImageType}
     *                                  is present but not recognized
     * @throws NumberFormatException    if {@code ocrDPI} is present but not an integer
     */
    public PDFPureJavaParserConfig() {
        init(this.getClass().getResourceAsStream("PDFParser.properties"));
    }

    /**
     * Loads properties from the InputStream and then tries to close it.
     * If reading fails with an IOException, the exception is silently swallowed
     * and every setting that was not successfully read keeps its default.
     *
     * @param is stream in {@link Properties} format; {@code null} means "use defaults"
     * @throws IllegalArgumentException if {@code ocrStrategy} or {@code ocrImageType}
     *                                  is present but not recognized
     * @throws NumberFormatException    if {@code ocrDPI} is present but not an integer
     */
    public PDFPureJavaParserConfig(InputStream is) {
        init(is);
    }

    /**
     * Applies the properties read from {@code is} (if any) on top of the field
     * defaults, builds the access checker, and closes the stream.
     * A {@code null} stream is treated as an empty properties file so that the
     * object is always fully initialized (in particular, the access checker is
     * never left {@code null}).
     */
    private void init(InputStream is) {
        Properties props = new Properties();
        if (is != null) {
            try {
                props.load(is);
            } catch (IOException ignored) {
                // Deliberate best effort: fall back to defaults for anything not read.
            } finally {
                try {
                    is.close();
                } catch (IOException ignored) {
                    // Nothing useful can be done if close fails.
                }
            }
        }

        setEnableAutoSpace(
                getBooleanProp(props.getProperty("enableAutoSpace"), getEnableAutoSpace()));
        setSuppressDuplicateOverlappingText(
                getBooleanProp(props.getProperty("suppressDuplicateOverlappingText"),
                        getSuppressDuplicateOverlappingText()));
        setExtractAnnotationText(
                getBooleanProp(props.getProperty("extractAnnotationText"),
                        getExtractAnnotationText()));
        setSortByPosition(
                getBooleanProp(props.getProperty("sortByPosition"),
                        getSortByPosition()));
        setExtractAcroFormContent(
                getBooleanProp(props.getProperty("extractAcroFormContent"),
                        getExtractAcroFormContent()));
        setExtractInlineImages(
                getBooleanProp(props.getProperty("extractInlineImages"),
                        getExtractInlineImages()));
        setExtractUniqueInlineImagesOnly(
                getBooleanProp(props.getProperty("extractUniqueInlineImagesOnly"),
                        getExtractUniqueInlineImagesOnly()));
        setIfXFAExtractOnlyXFA(
                getBooleanProp(props.getProperty("ifXFAExtractOnlyXFA"),
                        getIfXFAExtractOnlyXFA()));
        setCatchIntermediateIOExceptions(
                getBooleanProp(props.getProperty("catchIntermediateIOExceptions"),
                        getCatchIntermediateIOExceptions()));

        // A missing ocrStrategy key parses to NO_OCR, which is also the field default.
        setOcrStrategy(OCR_STRATEGY.parse(props.getProperty("ocrStrategy")));
        setOcrDPI(getIntProp(props.getProperty("ocrDPI"), getOcrDPI()));

        // Only override these when the key is present; otherwise a missing key would
        // wipe the "png" default or hand null to parseImageType.
        String formatName = props.getProperty("ocrImageFormatName");
        if (formatName != null) {
            setOcrImageFormatName(formatName);
        }
        String imageTypeName = props.getProperty("ocrImageType");
        if (imageTypeName != null) {
            setOcrImageType(parseImageType(imageTypeName));
        }

        setExtractActions(
                getBooleanProp(props.getProperty("extractActions"), getExtractActions()));

        boolean checkExtractAccessPermission =
                getBooleanProp(props.getProperty("checkExtractAccessPermission"), false);
        boolean allowExtractionForAccessibility =
                getBooleanProp(props.getProperty("allowExtractionForAccessibility"), true);
        if (!checkExtractAccessPermission) {
            // silently ignore the crazy configuration of checkExtractAccessPermission = false,
            // but allowExtractionForAccessibility=false
            accessChecker = new AccessChecker();
        } else {
            accessChecker = new AccessChecker(allowExtractionForAccessibility);
        }
    }

    /**
     * Configures the given pdf2XHTML: copies the sort-by-position flag, the word
     * separator implied by {@link #getEnableAutoSpace()} (a single space when
     * enabled, the empty string otherwise), the duplicate-text suppression flag
     * and, when they have been set, the two spacing tolerances.
     *
     * @param pdf2XHTML the handler to configure
     */
    public void configure(PDF2XHTMLPureJava pdf2XHTML) {
        pdf2XHTML.setSortByPosition(getSortByPosition());
        if (getEnableAutoSpace()) {
            pdf2XHTML.setWordSeparator(" ");
        } else {
            pdf2XHTML.setWordSeparator("");
        }
        // Null tolerances mean "leave the handler's own value untouched".
        if (getAverageCharTolerance() != null) {
            pdf2XHTML.setAverageCharTolerance(getAverageCharTolerance());
        }
        if (getSpacingTolerance() != null) {
            pdf2XHTML.setSpacingTolerance(getSpacingTolerance());
        }
        pdf2XHTML.setSuppressDuplicateOverlappingText(getSuppressDuplicateOverlappingText());
    }

    /**
     * @see #setExtractAcroFormContent(boolean)
     */
    public boolean getExtractAcroFormContent() {
        return extractAcroFormContent;
    }

    /**
     * If true (the default), extract content from AcroForms
     * at the end of the document. If an XFA is found,
     * try to process that, otherwise, process the AcroForm.
     *
     * @param extractAcroFormContent whether to extract AcroForm content
     */
    public void setExtractAcroFormContent(boolean extractAcroFormContent) {
        this.extractAcroFormContent = extractAcroFormContent;
    }

    /**
     * @see #setIfXFAExtractOnlyXFA(boolean)
     * @return how to handle XFA data if it exists
     */
    public boolean getIfXFAExtractOnlyXFA() {
        return ifXFAExtractOnlyXFA;
    }

    /**
     * If false (the default), extract content from the full PDF
     * as well as the XFA form. This will likely lead to some duplicative
     * content.
     *
     * @param ifXFAExtractOnlyXFA whether to extract only the XFA when one exists
     */
    public void setIfXFAExtractOnlyXFA(boolean ifXFAExtractOnlyXFA) {
        this.ifXFAExtractOnlyXFA = ifXFAExtractOnlyXFA;
    }

    /**
     * @see #setExtractInlineImages(boolean)
     */
    public boolean getExtractInlineImages() {
        return extractInlineImages;
    }

    /**
     * If true, extract inline embedded images (PDXImage objects).
     * <b>Beware:</b> some PDF documents of modest size (~4MB) can contain
     * thousands of embedded images totaling &gt; 2.5 GB. Also, at least as of PDFBox 1.8.5,
     * there can be surprisingly large memory consumption and/or out of memory errors.
     * Set to {@code true} with caution.
     * <p>
     * The default is {@code false}.
     * <p>
     * See also: {@link #setExtractUniqueInlineImagesOnly(boolean)}.
     *
     * @param extractInlineImages whether to extract inline images
     */
    public void setExtractInlineImages(boolean extractInlineImages) {
        this.extractInlineImages = extractInlineImages;
    }

    /**
     * @see #setExtractUniqueInlineImagesOnly(boolean)
     */
    public boolean getExtractUniqueInlineImagesOnly() {
        return extractUniqueInlineImagesOnly;
    }

    /**
     * Multiple pages within a PDF file might refer to the same underlying image.
     * If {@code extractUniqueInlineImagesOnly} is set to {@code false}, the
     * parser will call the EmbeddedExtractor each time the image appears on a page.
     * This might be desired for some use cases. However, to avoid duplication of
     * extracted images, set this to {@code true}. The default is {@code true}.
     * <p>
     * Note that uniqueness is determined only by the underlying PDF COSObject id, not by
     * file hash or similar equality metric.
     * If the PDF actually contains multiple copies of the same image
     * -- all with different object ids -- then all images will be extracted.
     * <p>
     * For this parameter to have any effect, {@code extractInlineImages} must be
     * set to {@code true}.
     * <p>
     * Because of TIKA-1742 -- to avoid infinite recursion -- no matter the setting
     * of this parameter, the extractor will only pull out one copy of each image per
     * page. This parameter tries to capture uniqueness across the entire document.
     *
     * @param extractUniqueInlineImagesOnly whether each inline image is extracted only once
     */
    public void setExtractUniqueInlineImagesOnly(boolean extractUniqueInlineImagesOnly) {
        this.extractUniqueInlineImagesOnly = extractUniqueInlineImagesOnly;
    }

    /**
     * @see #setEnableAutoSpace(boolean)
     */
    public boolean getEnableAutoSpace() {
        return enableAutoSpace;
    }

    /**
     * If true (the default), the parser should estimate
     * where spaces should be inserted between words. For
     * many PDFs this is necessary as they do not include
     * explicit whitespace characters.
     */
    public void setEnableAutoSpace(boolean enableAutoSpace) {
        this.enableAutoSpace = enableAutoSpace;
    }

    /**
     * @see #setSuppressDuplicateOverlappingText(boolean)
     */
    public boolean getSuppressDuplicateOverlappingText() {
        return suppressDuplicateOverlappingText;
    }

    /**
     * If true, the parser should try to remove duplicated
     * text over the same region. This is needed for some
     * PDFs that achieve bolding by re-writing the same
     * text in the same area. Note that this can
     * slow down extraction substantially (PDFBOX-956) and
     * sometimes remove characters that were not in fact
     * duplicated (PDFBOX-1155). By default this is disabled.
     */
    public void setSuppressDuplicateOverlappingText(
            boolean suppressDuplicateOverlappingText) {
        this.suppressDuplicateOverlappingText = suppressDuplicateOverlappingText;
    }

    /**
     * @see #setExtractAnnotationText(boolean)
     */
    public boolean getExtractAnnotationText() {
        return extractAnnotationText;
    }

    /**
     * If true (the default), text in annotations will be
     * extracted.
     */
    public void setExtractAnnotationText(boolean extractAnnotationText) {
        this.extractAnnotationText = extractAnnotationText;
    }

    /**
     * @see #setSortByPosition(boolean)
     */
    public boolean getSortByPosition() {
        return sortByPosition;
    }

    /**
     * If true, sort text tokens by their x/y position
     * before extracting text. This may be necessary for
     * some PDFs (if the text tokens are not rendered "in
     * order"), while for other PDFs it can produce the
     * wrong result (for example if there are 2 columns,
     * the text will be interleaved). Default is false.
     */
    public void setSortByPosition(boolean sortByPosition) {
        this.sortByPosition = sortByPosition;
    }

    /**
     * @see #setAverageCharTolerance(Float)
     * @return the configured tolerance, or {@code null} if none has been set
     */
    public Float getAverageCharTolerance() {
        return averageCharTolerance;
    }

    /**
     * See {@link PDFTextStripper#setAverageCharTolerance(float)}
     */
    public void setAverageCharTolerance(Float averageCharTolerance) {
        this.averageCharTolerance = averageCharTolerance;
    }

    /**
     * @see #setSpacingTolerance(Float)
     * @return the configured tolerance, or {@code null} if none has been set
     */
    public Float getSpacingTolerance() {
        return spacingTolerance;
    }

    /**
     * See {@link PDFTextStripper#setSpacingTolerance(float)}
     */
    public void setSpacingTolerance(Float spacingTolerance) {
        this.spacingTolerance = spacingTolerance;
    }

    /**
     * @return the access checker built from the loaded properties, or whatever was
     * last passed to {@link #setAccessChecker(AccessChecker)}
     */
    public AccessChecker getAccessChecker() {
        return accessChecker;
    }

    /**
     * @param accessChecker the access checker to use
     */
    public void setAccessChecker(AccessChecker accessChecker) {
        this.accessChecker = accessChecker;
    }

    /**
     * See {@link #setCatchIntermediateIOExceptions(boolean)}
     * @return whether or not to catch IOExceptions
     * @deprecated use {@link #getCatchIntermediateIOExceptions()}
     */
    @Deprecated
    public boolean isCatchIntermediateIOExceptions() {
        return catchIntermediateIOExceptions;
    }

    /**
     * See {@link #setCatchIntermediateIOExceptions(boolean)}
     * @return whether or not to catch IOExceptions
     */
    public boolean getCatchIntermediateIOExceptions() {
        return catchIntermediateIOExceptions;
    }

    /**
     * The PDFBox parser will throw an IOException if there is
     * a problem with a stream. If this is set to {@code true},
     * Tika's PDFParser will catch these exceptions and try to parse
     * the rest of the document. After the parse is completed,
     * Tika's PDFParser will throw the first caught exception.
     *
     * @param catchIntermediateIOExceptions whether to catch intermediate IOExceptions
     */
    public void setCatchIntermediateIOExceptions(boolean catchIntermediateIOExceptions) {
        this.catchIntermediateIOExceptions = catchIntermediateIOExceptions;
    }

    /**
     * Which strategy to use for OCR
     *
     * @param ocrStrategy the strategy
     */
    public void setOcrStrategy(OCR_STRATEGY ocrStrategy) {
        this.ocrStrategy = ocrStrategy;
    }

    /**
     * Which strategy to use for OCR
     *
     * @param ocrStrategyString case-insensitive strategy name; {@code null} selects
     *                          {@link OCR_STRATEGY#NO_OCR}
     * @throws IllegalArgumentException if the name is not recognized
     */
    public void setOcrStrategy(String ocrStrategyString) {
        this.ocrStrategy = OCR_STRATEGY.parse(ocrStrategyString);
    }

    /**
     * @return strategy to use for OCR
     */
    public OCR_STRATEGY getOcrStrategy() {
        return ocrStrategy;
    }

    /**
     * Interprets {@code p} as a case-insensitive "true"/"false".
     *
     * @param p              raw property value, may be {@code null}
     * @param defaultMissing value returned when {@code p} is {@code null} or is
     *                       neither "true" nor "false"
     */
    private static boolean getBooleanProp(String p, boolean defaultMissing) {
        if (p == null) {
            return defaultMissing;
        }
        String lowerCased = p.toLowerCase(Locale.ROOT);
        if (lowerCased.equals("true")) {
            return true;
        } else if (lowerCased.equals("false")) {
            return false;
        } else {
            return defaultMissing;
        }
    }

    /**
     * Interprets {@code p} as an int.
     *
     * @param p              raw property value, may be {@code null}
     * @param defaultMissing value returned when {@code p} is {@code null}
     * @throws NumberFormatException if there's a non-null unparseable string passed in
     */
    private static int getIntProp(String p, int defaultMissing) {
        if (p == null) {
            return defaultMissing;
        }
        return Integer.parseInt(p);
    }

    /**
     * String representation of the image format used to render
     * the page image for OCR (examples: png, tiff, jpeg)
     *
     * @return the image format name
     */
    public String getOcrImageFormatName() {
        return ocrImageFormatName;
    }

    /**
     * @see #getOcrImageFormatName()
     *
     * @param ocrImageFormatName name of image format used to render
     *                           page image
     */
    public void setOcrImageFormatName(String ocrImageFormatName) {
        this.ocrImageFormatName = ocrImageFormatName;
    }

    /**
     * Image type used to render the page image for OCR.
     * @see #setOcrImageType(ImageType)
     * @return image type
     */
    public ImageType getOcrImageType() {
        return ocrImageType;
    }

    /**
     * Image type used to render the page image for OCR.
     *
     * @param ocrImageType the image type
     */
    public void setOcrImageType(ImageType ocrImageType) {
        this.ocrImageType = ocrImageType;
    }

    /**
     * Image type used to render the page image for OCR.
     * @see #setOcrImageType(ImageType)
     *
     * @param ocrImageTypeString case-insensitive name of an {@link ImageType} constant
     * @throws IllegalArgumentException if the name is {@code null} or not recognized
     */
    public void setOcrImageType(String ocrImageTypeString) {
        this.ocrImageType = parseImageType(ocrImageTypeString);
    }

    /**
     * Dots per inch used to render the page image for OCR
     * @return dots per inch
     */
    public int getOcrDPI() {
        return ocrDPI;
    }

    /**
     * Dots per inch used to render the page image for OCR
     *
     * @param ocrDPI dots per inch
     */
    public void setOcrDPI(int ocrDPI) {
        this.ocrDPI = ocrDPI;
    }

    /**
     * Whether or not to extract PDActions from the file.
     * Most Action types are handled inline; javascript macros
     * are processed as embedded documents.
     *
     * @param v whether to extract PDActions
     */
    public void setExtractActions(boolean v) {
        extractActions = v;
    }

    /**
     * @see #setExtractActions(boolean)
     * @return whether or not to extract PDActions
     */
    public boolean getExtractActions() {
        return extractActions;
    }

    /**
     * Finds the {@link ImageType} whose name matches {@code ocrImageType},
     * ignoring case.
     *
     * @throws IllegalArgumentException if {@code ocrImageType} is {@code null} or
     *                                  matches no constant; the message lists the
     *                                  known constants
     */
    private static ImageType parseImageType(String ocrImageType) {
        if (ocrImageType != null) {
            for (ImageType t : ImageType.values()) {
                if (ocrImageType.equalsIgnoreCase(t.toString())) {
                    return t;
                }
            }
        }
        StringBuilder sb = new StringBuilder();
        sb.append("I regret that I could not parse '");
        sb.append(ocrImageType);
        sb.append("'. I'm only familiar with: ");
        int i = 0;
        for (ImageType t : ImageType.values()) {
            // Separator goes before every element except the first.
            if (i++ > 0) {
                sb.append(", ");
            }
            sb.append(t.toString());
        }
        throw new IllegalArgumentException(sb.toString());
    }

    // Null-tolerant equality for the fields that may legitimately be null.
    private static boolean nullSafeEquals(Object a, Object b) {
        return a == null ? b == null : a.equals(b);
    }

    // Null-tolerant hash; null contributes 0.
    private static int nullSafeHash(Object o) {
        return o == null ? 0 : o.hashCode();
    }

    /**
     * Two configs are equal when every setting matches. Settings that may be
     * {@code null} (the tolerances, the access checker, the OCR image settings)
     * are compared null-safely.
     */
    @Override
    public boolean equals(Object o) {
        if (this == o) {
            return true;
        }
        if (!(o instanceof PDFPureJavaParserConfig)) {
            return false;
        }
        PDFPureJavaParserConfig other = (PDFPureJavaParserConfig) o;
        return getEnableAutoSpace() == other.getEnableAutoSpace()
                && getSuppressDuplicateOverlappingText() == other.getSuppressDuplicateOverlappingText()
                && getExtractAnnotationText() == other.getExtractAnnotationText()
                && getSortByPosition() == other.getSortByPosition()
                && getExtractAcroFormContent() == other.getExtractAcroFormContent()
                && getExtractInlineImages() == other.getExtractInlineImages()
                && getExtractUniqueInlineImagesOnly() == other.getExtractUniqueInlineImagesOnly()
                && getIfXFAExtractOnlyXFA() == other.getIfXFAExtractOnlyXFA()
                && getOcrDPI() == other.getOcrDPI()
                && getCatchIntermediateIOExceptions() == other.getCatchIntermediateIOExceptions()
                && nullSafeEquals(getAverageCharTolerance(), other.getAverageCharTolerance())
                && nullSafeEquals(getSpacingTolerance(), other.getSpacingTolerance())
                && nullSafeEquals(getOcrStrategy(), other.getOcrStrategy())
                && getOcrImageType() == other.getOcrImageType()
                && nullSafeEquals(getOcrImageFormatName(), other.getOcrImageFormatName())
                && getExtractActions() == other.getExtractActions()
                && nullSafeEquals(getAccessChecker(), other.getAccessChecker());
    }

    /**
     * Hash consistent with {@link #equals(Object)}; {@code null} settings
     * contribute 0 instead of throwing.
     */
    @Override
    public int hashCode() {
        int result = (getEnableAutoSpace() ? 1 : 0);
        result = 31 * result + (getSuppressDuplicateOverlappingText() ? 1 : 0);
        result = 31 * result + (getExtractAnnotationText() ? 1 : 0);
        result = 31 * result + (getSortByPosition() ? 1 : 0);
        result = 31 * result + (getExtractAcroFormContent() ? 1 : 0);
        result = 31 * result + (getExtractInlineImages() ? 1 : 0);
        result = 31 * result + (getExtractUniqueInlineImagesOnly() ? 1 : 0);
        result = 31 * result + nullSafeHash(getAverageCharTolerance());
        result = 31 * result + nullSafeHash(getSpacingTolerance());
        result = 31 * result + (getIfXFAExtractOnlyXFA() ? 1 : 0);
        result = 31 * result + nullSafeHash(ocrStrategy);
        result = 31 * result + getOcrDPI();
        result = 31 * result + nullSafeHash(getOcrImageType());
        result = 31 * result + nullSafeHash(getOcrImageFormatName());
        result = 31 * result + nullSafeHash(getAccessChecker());
        result = 31 * result + (getCatchIntermediateIOExceptions() ? 1 : 0);
        result = 31 * result + (getExtractActions() ? 1 : 0);
        return result;
    }

    @Override
    public String toString() {
        return "PDFParserConfig{" +
                "enableAutoSpace=" + enableAutoSpace +
                ", suppressDuplicateOverlappingText=" + suppressDuplicateOverlappingText +
                ", extractAnnotationText=" + extractAnnotationText +
                ", sortByPosition=" + sortByPosition +
                ", extractAcroFormContent=" + extractAcroFormContent +
                ", extractInlineImages=" + extractInlineImages +
                ", extractUniqueInlineImagesOnly=" + extractUniqueInlineImagesOnly +
                ", averageCharTolerance=" + averageCharTolerance +
                ", spacingTolerance=" + spacingTolerance +
                ", ifXFAExtractOnlyXFA=" + ifXFAExtractOnlyXFA +
                ", ocrStrategy=" + ocrStrategy +
                ", ocrDPI=" + ocrDPI +
                ", ocrImageType=" + ocrImageType +
                ", ocrImageFormatName='" + ocrImageFormatName + '\'' +
                ", accessChecker=" + accessChecker +
                ", extractActions=" + extractActions +
                ", catchIntermediateIOExceptions=" + catchIntermediateIOExceptions +
                '}';
    }
}