org.apache.tika.parser.pdf.PDFParser.properties Maven / Gradle / Ivy

Go to download
#  Licensed to the Apache Software Foundation (ASF) under one or more
#  contributor license agreements.  See the NOTICE file distributed with
#  this work for additional information regarding copyright ownership.
#  The ASF licenses this file to You under the Apache License, Version 2.0
#  (the "License"); you may not use this file except in compliance with
#  the License.  You may obtain a copy of the License at
#
#       http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.

# Default settings for the PDF parser, one "<key> <value>" pair per line.
# Key and value are separated by whitespace (some lines use a space, others a tab).
# NOTE(review): the meaning of each key is defined by the code that loads this
# file, which is not visible here -- confirm against that loader before changing values.
# Boolean switches (true/false):
enableAutoSpace true
extractAnnotationText true
sortByPosition	false
suppressDuplicateOverlappingText	false
extractAcroFormContent	true
extractBookmarksText true
extractInlineImages false
extractUniqueInlineImagesOnly true
checkExtractAccessPermission false
allowExtractionForAccessibility true
ifXFAExtractOnlyXFA false
catchIntermediateIOExceptions true
# OCR settings:
# ocrStrategy options: no_ocr, ocr_only, ocr_and_text_extraction
ocrStrategy no_ocr
# Dots per inch for the OCR rendering of the page image
ocrDPI 300
# If you request tif, make sure you have imageio jars on your classpath!
ocrImageFormatName png
# ocrImageType options: argb, binary, gray, rgb
ocrImageType gray
# Scale to use when rendering a page image for OCR
ocrImageScale 2.0
# Use up to 500 MB (500 * 1024 * 1024 = 524288000 bytes) when loading a PDF into a PDDocument
maxMainMemoryBytes 524288000