All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.tika.parser.pdf.PDFParser.properties Maven / Gradle / Ivy

There is a newer version: 2024.11.18751.20241128T090041Z-241100
Show newest version
#  Licensed to the Apache Software Foundation (ASF) under one or more
#  contributor license agreements.  See the NOTICE file distributed with
#  this work for additional information regarding copyright ownership.
#  The ASF licenses this file to You under the Apache License, Version 2.0
#  (the "License"); you may not use this file except in compliance with
#  the License.  You may obtain a copy of the License at
#
#       http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.

# Configuration values for the Tika PDF parser (PDFParser.properties).
# Format: one "key value" pair per line; the key and value are separated by
# whitespace (spaces on most lines, a tab on a few).
#
# NOTE(review): the boolean flags below carry no documentation in this file;
# their exact semantics are only suggested by the key names -- confirm against
# the code that reads this file before changing any of them.
enableAutoSpace true
extractAnnotationText true
sortByPosition	false
suppressDuplicateOverlappingText	false
extractAcroFormContent	true
extractBookmarksText true
extractInlineImages false
extractUniqueInlineImagesOnly true
checkExtractAccessPermission false
allowExtractionForAccessibility true
ifXFAExtractOnlyXFA false
catchIntermediateIOExceptions true
# OCR strategy. Options: no_ocr, ocr_only, ocr_and_text_extraction, auto
ocrStrategy no_ocr
# Dots per inch used when rendering the page image for OCR
ocrDPI 300
# Image format of the page image rendered for OCR.
# If you request tif, make sure you have the imageio jars on your classpath!
ocrImageFormatName png
# Image type of the page image rendered for OCR. Options: argb, binary, gray, rgb
ocrImageType gray
# Scale to use when rendering a page image for OCR.
# As of Tika 1.23 this is no longer used; use ocrDPI instead.
ocrImageScale 2.0
# Use up to 500 MB (500 * 1024 * 1024 = 524288000 bytes) of main memory
# when loading a PDF into a PDDocument
maxMainMemoryBytes 524288000
# Whether or not to set KCMS for faster (but legacy/unsupported) image rendering
setKCMS false
# Whether or not to add processing to detect angles and extract
# text accordingly (see PDFBOX-4371)
detectAngles false




© 2015 - 2024 Weber Informatics LLC | Privacy Policy