# org.apache.tika.parser.pdf.PDFParser.properties
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
enableAutoSpace true
extractAnnotationText true
sortByPosition false
suppressDuplicateOverlappingText false
extractAcroFormContent true
extractBookmarksText true
extractInlineImages false
extractUniqueInlineImagesOnly true
checkExtractAccessPermission false
allowExtractionForAccessibility true
ifXFAExtractOnlyXFA false
catchIntermediateIOExceptions true
#options: no_ocr, ocr_only, ocr_and_text_extraction
ocrStrategy no_ocr
#dots per inch for the OCR rendering of the page image
ocrDPI 300
#if you request tif, make sure you have imageio jars on your classpath!
ocrImageFormatName png
#options: argb, binary, gray, rgb
ocrImageType gray
#scale to use when rendering a page image for OCR
ocrImageScale 2.0
# Use up to 500 MiB (500 * 1024 * 1024 bytes) of main memory when loading a PDF into a PDDocument
maxMainMemoryBytes 524288000
#whether or not to set KCMS for faster (but legacy/unsupported) image rendering
setKCMS false