// org.apache.tika.parser.microsoft.OfficeParserConfig Maven / Gradle / Ivy
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.microsoft;
import java.io.Serializable;
/**
 * Holder for the boolean switches that control what the Microsoft Office
 * parsers extract. Every option is exposed through a plain setter/getter
 * pair; the defaults are the field initializers below.
 *
 * NOTE(review): the class is {@link Serializable} but declares no explicit
 * {@code serialVersionUID}. Introducing one would change compatibility with
 * instances serialized by earlier builds, so confirm before adding it.
 */
public class OfficeParserConfig implements Serializable {

    private boolean extractMacros = false;
    private boolean includeDeletedContent = false;
    private boolean includeMoveFromContent = false;
    private boolean includeShapeBasedContent = true;
    private boolean includeHeadersAndFooters = true;
    private boolean includeMissingRows = false;
    private boolean includeSlideNotes = true;
    private boolean includeSlideMasterContent = true;
    private boolean concatenatePhoneticRuns = true;
    private boolean useSAXDocxExtractor = false;
    private boolean useSAXPptxExtractor = false;
    // Spelled out for consistency with the other options (same value as the implicit default).
    private boolean extractAllAlternativesFromMSG = false;

    /**
     * Sets whether or not MSOffice parsers should extract macros.
     * As of Tika 1.15, the default is {@code false}.
     *
     * @param extractMacros whether or not to extract macros
     */
    public void setExtractMacros(boolean extractMacros) {
        this.extractMacros = extractMacros;
    }

    /**
     * @return whether or not to extract macros
     */
    public boolean getExtractMacros() {
        return extractMacros;
    }

    /**
     * Sets whether or not the parser should include deleted content.
     * <p>
     * Default: {@code false}
     * <p>
     * This has only been implemented in the streaming docx parser
     * ({@link org.apache.tika.parser.microsoft.ooxml.SXWPFWordExtractorDecorator}) so far!!!
     *
     * @param includeDeletedContent whether or not to include deleted content
     */
    public void setIncludeDeletedContent(boolean includeDeletedContent) {
        this.includeDeletedContent = includeDeletedContent;
    }

    /**
     * @return whether or not to include deleted content
     */
    public boolean getIncludeDeletedContent() {
        return includeDeletedContent;
    }

    /**
     * With track changes on, when a section is moved, the content
     * is stored in both the "moveFrom" section and in the "moveTo" section.
     * <p>
     * If you'd like to include the section both in its original location (moveFrom)
     * and in its new location (moveTo), set this to {@code true}.
     * <p>
     * Default: {@code false}
     * <p>
     * This has only been implemented in the streaming docx parser
     * ({@link org.apache.tika.parser.microsoft.ooxml.SXWPFWordExtractorDecorator}) so far!!!
     *
     * @param includeMoveFromContent whether or not to also include content at its moveFrom location
     */
    public void setIncludeMoveFromContent(boolean includeMoveFromContent) {
        this.includeMoveFromContent = includeMoveFromContent;
    }

    /**
     * @return whether or not to also include content at its moveFrom location
     */
    public boolean getIncludeMoveFromContent() {
        return includeMoveFromContent;
    }

    /**
     * In Excel and Word, there can be text stored within drawing shapes.
     * (In PowerPoint everything is in a Shape)
     * <p>
     * If you'd like to skip processing these to look for text, set this to
     * {@code false}.
     * <p>
     * Default: {@code true}
     *
     * @param includeShapeBasedContent whether or not to process shapes for text
     */
    public void setIncludeShapeBasedContent(boolean includeShapeBasedContent) {
        this.includeShapeBasedContent = includeShapeBasedContent;
    }

    /**
     * @return whether or not to process shapes for text
     */
    public boolean getIncludeShapeBasedContent() {
        return includeShapeBasedContent;
    }

    /**
     * Whether or not to include headers and footers.
     * <p>
     * This only operates on headers and footers in Word and Excel,
     * not master slide content in Powerpoint.
     * <p>
     * Default: {@code true}
     *
     * @param includeHeadersAndFooters whether or not to include headers and footers
     */
    public void setIncludeHeadersAndFooters(boolean includeHeadersAndFooters) {
        this.includeHeadersAndFooters = includeHeadersAndFooters;
    }

    /**
     * @return whether or not to include headers and footers
     */
    public boolean getIncludeHeadersAndFooters() {
        return includeHeadersAndFooters;
    }

    /**
     * Use the experimental SAX-based streaming DOCX parser?
     * If set to {@code false}, the classic parser will be used; if {@code true},
     * the new experimental parser will be used.
     * <p>
     * Default: {@code false} (classic DOM parser)
     *
     * @param useSAXDocxExtractor whether or not to use the SAX-based DOCX parser
     */
    public void setUseSAXDocxExtractor(boolean useSAXDocxExtractor) {
        this.useSAXDocxExtractor = useSAXDocxExtractor;
    }

    /**
     * @return whether or not to use the SAX-based DOCX parser
     */
    public boolean getUseSAXDocxExtractor() {
        return useSAXDocxExtractor;
    }

    /**
     * Use the experimental SAX-based streaming PPTX parser?
     * If set to {@code false}, the classic parser will be used; if {@code true},
     * the new experimental parser will be used.
     * <p>
     * Default: {@code false} (classic parser)
     *
     * @param useSAXPptxExtractor whether or not to use the SAX-based PPTX parser
     */
    public void setUseSAXPptxExtractor(boolean useSAXPptxExtractor) {
        this.useSAXPptxExtractor = useSAXPptxExtractor;
    }

    /**
     * @return whether or not to use the SAX-based PPTX parser
     */
    public boolean getUseSAXPptxExtractor() {
        return useSAXPptxExtractor;
    }

    /**
     * Microsoft Excel files can sometimes contain phonetic (furigana) strings.
     * See {@code PHONETIC}.
     * This sets whether or not the parser will concatenate the phonetic runs to the original text.
     * <p>
     * This is currently only supported by the xls and xlsx parsers (not the xlsb parser),
     * and the default is {@code true}.
     *
     * @param concatenatePhoneticRuns whether or not to concatenate phonetic runs to the original text
     */
    public void setConcatenatePhoneticRuns(boolean concatenatePhoneticRuns) {
        this.concatenatePhoneticRuns = concatenatePhoneticRuns;
    }

    /**
     * @return whether or not to concatenate phonetic runs to the original text
     */
    public boolean getConcatenatePhoneticRuns() {
        return concatenatePhoneticRuns;
    }

    /**
     * Some .msg files can contain body content in html, rtf and/or text.
     * The default behavior is to pick the first non-null value and include only that.
     * If you'd like to extract all non-null body content, which is likely duplicative,
     * set this value to {@code true}.
     *
     * @param extractAllAlternativesFromMSG whether or not to extract all alternative parts
     * @since 1.17
     */
    public void setExtractAllAlternativesFromMSG(boolean extractAllAlternativesFromMSG) {
        this.extractAllAlternativesFromMSG = extractAllAlternativesFromMSG;
    }

    /**
     * @return whether or not to extract all alternative parts
     */
    public boolean getExtractAllAlternativesFromMSG() {
        return extractAllAlternativesFromMSG;
    }

    /**
     * For table-like formats, and tables within other formats, should
     * missing rows in sparse tables be output where detected?
     * The default ({@code false}) is to only output rows defined within the file,
     * which avoids lots of blank lines, but means layout isn't preserved.
     *
     * @param includeMissingRows whether or not to output missing rows
     */
    public void setIncludeMissingRows(boolean includeMissingRows) {
        this.includeMissingRows = includeMissingRows;
    }

    /**
     * @return whether or not to output missing rows
     */
    public boolean getIncludeMissingRows() {
        return includeMissingRows;
    }

    /**
     * Whether or not to process slide notes content. If set
     * to {@code false}, the parser will skip the text content
     * and all embedded objects from the slide notes in ppt and ppt[xm].
     * The default is {@code true}.
     *
     * @param includeSlideNotes whether or not to process slide notes
     * @since 1.19.1
     */
    public void setIncludeSlideNotes(boolean includeSlideNotes) {
        this.includeSlideNotes = includeSlideNotes;
    }

    /**
     * @return whether or not to process slide notes
     */
    public boolean getIncludeSlideNotes() {
        return includeSlideNotes;
    }

    /**
     * Whether or not to include contents from any of the three
     * types of masters -- slide, notes, handout -- in a .ppt or ppt[xm] file.
     * If set to {@code false}, the parser will not extract
     * text or embedded objects from any of the masters.
     * <p>
     * Default: {@code true}
     *
     * @param includeSlideMasterContent whether or not to process content in slide masters
     * @since 1.19.1
     */
    public void setIncludeSlideMasterContent(boolean includeSlideMasterContent) {
        this.includeSlideMasterContent = includeSlideMasterContent;
    }

    /**
     * @return whether or not to process content in slide masters
     * @since 1.19.1
     */
    public boolean getIncludeSlideMasterContent() {
        return includeSlideMasterContent;
    }
}