// org.apache.tika.parser.iwork.PagesContentHandler (Maven / Gradle / Ivy)
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.iwork;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Property;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
class PagesContentHandler extends DefaultHandler {
private final XHTMLContentHandler xhtml;
private final Metadata metadata;
/** The (interesting) part of the document we're in. Should be more structured... */
private enum DocumentPart {
METADATA, PARSABLE_TEXT,
HEADERS, HEADER_ODD, HEADER_EVEN, HEADER_FIRST,
FOOTERS, FOOTER_ODD, FOOTER_EVEN, FOOTER_FIRST,
FOOTNOTES, ANNOTATIONS;
}
private DocumentPart inPart = null;
private boolean ghostText;
private static String alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
private boolean parseProperty = false;
private int pageCount = 0;
private int slPageCount = 0;
private HeaderFooter headers = null;
private HeaderFooter footers = null;
private Footnotes footnotes = null;
private Annotations annotations = null;
private Map>> tableData =
new HashMap>>();
private String activeTableId;
private int numberOfColumns = 0;
private List activeRow = new ArrayList();
private String metaDataLocalName;
private String metaDataQName;
/**
 * Creates a handler that writes the extracted content to the given XHTML
 * handler and stores document properties in the given metadata object.
 *
 * @param xhtml    receiver of the generated XHTML events
 * @param metadata metadata object that is filled in while parsing
 */
PagesContentHandler(XHTMLContentHandler xhtml, Metadata metadata) {
    this.metadata = metadata;
    this.xhtml = xhtml;
}
/**
 * Finishes the document: stores the page count in the metadata and, if a
 * page {@code div} is still open, emits the footer and closes that div.
 *
 * @throws SAXException if the downstream XHTML handler fails
 */
@Override
public void endDocument() throws SAXException {
    // Only sf:page-start pages are counted in pageCount, so that is what is reported.
    metadata.set(Metadata.PAGE_COUNT, String.valueOf(pageCount));

    // startElement opens a div for sl:page-group as well as for sf:page-start,
    // so the closing tag must be emitted when either kind of page was seen;
    // checking pageCount alone leaves the div of layout-mode documents open.
    if (pageCount + slPageCount > 0) {
        doFooter();
        xhtml.endElement("div");
    }
}
@Override
public void startElement(
String uri, String localName, String qName, Attributes attributes)
throws SAXException {
if (parseProperty) {
String value = parsePrimitiveElementValue(qName, attributes);
if (value != null) {
Object metaDataKey = resolveMetaDataKey(metaDataLocalName);
if(metaDataKey instanceof Property) {
metadata.set((Property)metaDataKey, value);
} else {
metadata.add((String)metaDataKey, value);
}
}
}
if ("sl:publication-info".equals(qName)) {
inPart = DocumentPart.METADATA;
} else if ("sf:metadata".equals(qName)) {
inPart = DocumentPart.METADATA;
} else if ("sf:page-start".equals(qName) || "sl:page-group".equals(qName)) {
if (pageCount > 0) {
doFooter();
xhtml.endElement("div");
}
xhtml.startElement("div");
if ("sl:page-group".equals(qName)) {
slPageCount++;
} else {
pageCount++;
}
doHeader();
} else if ("sf:p".equals(qName)) {
if (pageCount+slPageCount > 0) {
inPart = DocumentPart.PARSABLE_TEXT;
xhtml.startElement("p");
}
} else if ("sf:attachment".equals(qName)) {
String kind = attributes.getValue("sf:kind");
if ("tabular-attachment".equals(kind)) {
activeTableId = attributes.getValue("sfa:ID");
tableData.put(activeTableId, new ArrayList>());
}
} else if ("sf:attachment-ref".equals(qName)) {
String idRef = attributes.getValue("sfa:IDREF");
outputTable(idRef);
} else if ("sf:headers".equals(qName)) {
headers = new HeaderFooter(qName);
inPart = DocumentPart.HEADERS;
} else if ("sf:footers".equals(qName)) {
footers = new HeaderFooter(qName);
inPart = DocumentPart.FOOTERS;
} else if ("sf:header".equals(qName)) {
inPart = headers.identifyPart(attributes.getValue("sf:name"));
} else if ("sf:footer".equals(qName)) {
inPart = footers.identifyPart(attributes.getValue("sf:name"));
} else if ("sf:page-number".equals(qName)) {
if (inPart == DocumentPart.FOOTER_ODD
|| inPart == DocumentPart.FOOTER_FIRST
|| inPart == DocumentPart.FOOTER_EVEN) {
// We are in a footer
footers.hasAutoPageNumber = true;
footers.autoPageNumberFormat = attributes.getValue("sf:format");
} else {
headers.hasAutoPageNumber = true;
headers.autoPageNumberFormat = attributes.getValue("sf:format");
}
xhtml.characters(Integer.toString(this.pageCount));
} else if ("sf:footnotes".equals(qName)) {
footnotes = new Footnotes();
inPart = DocumentPart.FOOTNOTES;
} else if ("sf:footnote-mark".equals(qName)) {
footnotes.recordMark(attributes.getValue("sf:mark"));
} else if ("sf:footnote".equals(qName) && inPart == DocumentPart.PARSABLE_TEXT) {
// What about non auto-numbered?
String footnoteMark = attributes.getValue("sf:autonumber");
if (footnotes != null) {
String footnoteText = footnotes.footnotes.get(footnoteMark);
if (footnoteText != null) {
xhtml.startElement("div", "style", "footnote");
xhtml.characters("Footnote:" ); // As shown in Pages
xhtml.characters(footnoteText);
xhtml.endElement("div");
}
}
} else if ("sf:annotations".equals(qName)) {
annotations = new Annotations();
inPart = DocumentPart.ANNOTATIONS;
} else if ("sf:annotation".equals(qName) && inPart == DocumentPart.ANNOTATIONS) {
annotations.start(attributes.getValue("sf:target"));
} else if ("sf:annotation-field".equals(qName) && inPart == DocumentPart.PARSABLE_TEXT) {
xhtml.startElement("div", "style", "annotated");
String annotationText = annotations.annotations.get(attributes.getValue("sfa:ID"));
if (annotationText != null) {
xhtml.startElement("div", "style", "annotation");
xhtml.characters(annotationText);
xhtml.endElement("div");
}
} else if ("sf:ghost-text".equals(qName)) {
ghostText = true;
}
if (activeTableId != null) {
parseTableData(qName, attributes);
}
if (inPart == DocumentPart.METADATA) {
metaDataLocalName = localName;
metaDataQName = qName;
parseProperty = true;
}
}
/**
 * Handles the end of an element: stops metadata value parsing when the
 * remembered metadata element closes, leaves the metadata / paragraph
 * parts, closes paragraph and annotation wrappers, ends table and
 * annotation collection and clears the ghost-text flag.
 *
 * @param uri       namespace URI of the element (not used)
 * @param localName local name, compared with the remembered metadata element
 * @param qName     qualified name used to recognise the element
 * @throws SAXException if the downstream XHTML handler fails
 */
@Override
public void endElement(String uri, String localName, String qName)
        throws SAXException {
    boolean closesMetaDataElement =
            metaDataLocalName != null && metaDataLocalName.equals(localName);
    if (closesMetaDataElement) {
        parseProperty = false;
        metaDataLocalName = null;
    }

    if ("sl:publication-info".equals(qName) || "sf:metadata".equals(qName)) {
        inPart = null;
        return;
    }

    if ("sf:p".equals(qName)) {
        // Mirrors startElement: a <p> was only opened once a page had started.
        if (pageCount + slPageCount > 0) {
            inPart = null;
            xhtml.endElement("p");
        }
        return;
    }

    if ("sf:attachment".equals(qName)) {
        activeTableId = null;
        return;
    }

    if ("sf:annotation".equals(qName)) {
        if (inPart == DocumentPart.ANNOTATIONS) {
            annotations.end();
        }
        return;
    }

    if ("sf:annotation-field".equals(qName)) {
        if (inPart == DocumentPart.PARSABLE_TEXT) {
            xhtml.endElement("div");
        }
        return;
    }

    if ("sf:ghost-text".equals(qName)) {
        ghostText = false;
    }
}
/**
 * Routes character data according to the current document part: body text
 * goes to the XHTML output unless it is ghost text, header and footer text
 * replaces the stored default for the matching variant, and footnote and
 * annotation text is handed to the respective collector.
 *
 * @param ch     buffer holding the characters
 * @param start  index of the first character
 * @param length number of characters
 * @throws SAXException if the downstream XHTML handler fails
 */
@Override
public void characters(char[] ch, int start, int length) throws SAXException {
    if (length <= 0 || inPart == null) {
        return;
    }

    if (inPart == DocumentPart.PARSABLE_TEXT) {
        if (!ghostText) {
            xhtml.characters(ch, start, length);
        }
        return;
    }

    String chunk = new String(ch, start, length);
    switch (inPart) {
        case HEADER_FIRST:
            headers.defaultFirst = chunk;
            break;
        case HEADER_EVEN:
            headers.defaultEven = chunk;
            break;
        case HEADER_ODD:
            headers.defaultOdd = chunk;
            break;
        case FOOTER_FIRST:
            footers.defaultFirst = chunk;
            break;
        case FOOTER_EVEN:
            footers.defaultEven = chunk;
            break;
        case FOOTER_ODD:
            footers.defaultOdd = chunk;
            break;
        case FOOTNOTES:
            footnotes.text(chunk);
            break;
        case ANNOTATIONS:
            annotations.text(chunk);
            break;
        default:
            // METADATA, HEADERS and FOOTERS carry no text of interest here.
            break;
    }
}
/** Row width used when no usable sf:numcols value has been seen. */
private static final int DEFAULT_COLUMNS = 3;

/**
 * Collects table data from an element inside the active tabular attachment.
 * <p>
 * An sf:grid element supplies the column count through sf:numcols. Each
 * sf:ct element contributes one cell (its sfa:s attribute) to the row being
 * assembled; once the row holds as many cells as the grid has columns it is
 * appended to the active table and a new row is started. If the column
 * count is missing or not a positive integer, rows fall back to
 * {@link #DEFAULT_COLUMNS} cells, the width that used to be hard-coded.
 *
 * @param qName      qualified name of the element that was started
 * @param attributes attributes of that element
 */
private void parseTableData(String qName, Attributes attributes) {
    if ("sf:grid".equals(qName)) {
        String declaredColumns = attributes.getValue("sf:numcols");
        try {
            // Integer.parseInt also rejects null with a NumberFormatException.
            this.numberOfColumns = Integer.parseInt(declaredColumns);
        } catch (NumberFormatException ignored) {
            // Missing or malformed count: use the default width rather than
            // aborting the parse with an unchecked exception.
            this.numberOfColumns = 0;
        }
    } else if ("sf:ct".equals(qName)) {
        activeRow.add(attributes.getValue("sfa:s"));

        int rowWidth = numberOfColumns > 0 ? numberOfColumns : DEFAULT_COLUMNS;
        if (activeRow.size() >= rowWidth) {
            List<List<String>> rows = tableData.get(activeTableId);
            if (rows != null) {
                rows.add(activeRow);
            }
            activeRow = new ArrayList<String>();
        }
    }
}
/**
 * Writes the table registered under the given ID as an XHTML
 * {@code table} with one {@code tr} per row and one {@code td} per cell.
 * Nothing is written when no table is registered under that ID.
 *
 * @param idRef ID of the table to output
 * @throws SAXException if the downstream XHTML handler fails
 */
private void outputTable(String idRef) throws SAXException {
    List<List<String>> rows = tableData.get(idRef);
    if (rows == null) {
        return;
    }

    xhtml.startElement("table");
    for (List<String> row : rows) {
        xhtml.startElement("tr");
        for (String cell : row) {
            xhtml.element("td", cell);
        }
        xhtml.endElement("tr");
    }
    xhtml.endElement("table");
}
/**
 * Maps the current metadata element to a key shared with other document
 * types. The decision is based on the qualified name of the current
 * metadata element; when that name is not one of the known ones, the
 * supplied local name is returned unchanged.
 *
 * @param metaDataLocalName The localname of the element containing metadata
 * @return a {@link Property} for known elements, otherwise the given
 *         local name as a plain String key
 */
private Object resolveMetaDataKey(String metaDataLocalName) {
    if ("sf:authors".equals(metaDataQName)) {
        return TikaCoreProperties.CREATOR;
    }
    if ("sf:title".equals(metaDataQName)) {
        return TikaCoreProperties.TITLE;
    }
    if ("sl:SLCreationDateProperty".equals(metaDataQName)) {
        return TikaCoreProperties.CREATED;
    }
    if ("sl:SLLastModifiedDateProperty".equals(metaDataQName)) {
        return Metadata.LAST_MODIFIED;
    }
    if ("sl:language".equals(metaDataQName)) {
        return TikaCoreProperties.LANGUAGE;
    }
    // No common key known: fall back to the element's own local name.
    return metaDataLocalName;
}
/**
 * Returns the value of a primitive element, e.g.:
 * <ul>
 * <li>{@code <sl:number sfa:number="0" sfa:type="f"/>} - the number attribute</li>
 * <li>{@code <sl:string sfa:string="en"/>} - the string attribute</li>
 * <li>{@code sl:date} - the {@code sf:val} attribute</li>
 * </ul>
 *
 * Returns {@code null} if the element is not one of the primitive
 * elements above, or if the value attribute is not present in the list of
 * attributes.
 *
 * @param qName The fully qualified name of the element containing
 *        the value to extract
 * @param attributes The list of attributes of which one contains the
 *        value to be extracted
 * @return the value of a primitive element, or {@code null}
 */
private String parsePrimitiveElementValue(
        String qName, Attributes attributes) {
    // Pick the attribute that holds the value for this kind of element.
    String valueAttribute = null;
    if ("sl:string".equals(qName) || "sf:string".equals(qName)) {
        valueAttribute = "sfa:string";
    } else if ("sl:number".equals(qName)) {
        valueAttribute = "sfa:number";
    } else if ("sl:date".equals(qName)) {
        valueAttribute = "sf:val";
    }

    if (valueAttribute == null) {
        return null;
    }
    return attributes.getValue(valueAttribute);
}
/**
 * Outputs the header for the current page, if header definitions were read.
 *
 * @throws SAXException if the downstream XHTML handler fails
 */
private void doHeader() throws SAXException {
    if (headers == null) {
        return;
    }
    headers.output("header");
}
/**
 * Outputs the footer for the current page, if footer definitions were read.
 *
 * @throws SAXException if the downstream XHTML handler fails
 */
private void doFooter() throws SAXException {
    if (footers == null) {
        return;
    }
    footers.output("footer");
}
/**
 * Represents the Headers or Footers in a document: the default text for
 * first, even and odd pages plus the automatic page number settings.
 */
private class HeaderFooter {
    private final String type; // sf:headers or sf:footers
    private String defaultOdd;
    private String defaultEven;
    private String defaultFirst;
    private boolean hasAutoPageNumber;
    /** Value of the sf:format attribute of sf:page-number; null means a raw number. */
    private String autoPageNumberFormat;
    // TODO Can there be custom ones?

    private HeaderFooter(String type) {
        this.type = type;
    }

    /**
     * Maps the sf:name of a header or footer element to the document part
     * it defines.
     *
     * @param name value of the sf:name attribute, may be null
     * @return the matching part, or null if the name is not a known default
     */
    private DocumentPart identifyPart(String name) {
        if ("SFWPDefaultOddHeaderIdentifier".equals(name)) {
            return DocumentPart.HEADER_ODD;
        }
        if ("SFWPDefaultEvenHeaderIdentifier".equals(name)) {
            return DocumentPart.HEADER_EVEN;
        }
        if ("SFWPDefaultFirstHeaderIdentifier".equals(name)) {
            return DocumentPart.HEADER_FIRST;
        }
        if ("SFWPDefaultOddFooterIdentifier".equals(name)) {
            return DocumentPart.FOOTER_ODD;
        }
        if ("SFWPDefaultEvenFooterIdentifier".equals(name)) {
            return DocumentPart.FOOTER_EVEN;
        }
        if ("SFWPDefaultFirstFooterIdentifier".equals(name)) {
            return DocumentPart.FOOTER_FIRST;
        }
        return null;
    }

    /**
     * Writes the text that applies to the current page as a {@code div}
     * whose class attribute is {@code what}. The first-page text is used on
     * page 1 and the even-page text on even pages when they are defined;
     * otherwise the odd-page text is used. Nothing is written when the
     * selected text is missing. If an automatic page number was declared,
     * a tab and the formatted page number follow the text.
     *
     * @param what class attribute value for the div ("header" or "footer")
     * @throws SAXException if the downstream XHTML handler fails
     */
    private void output(String what) throws SAXException {
        String text;
        if (pageCount == 1 && defaultFirst != null) {
            text = defaultFirst;
        } else if (pageCount % 2 == 0 && defaultEven != null) {
            text = defaultEven;
        } else {
            text = defaultOdd;
        }
        if (text == null) {
            return;
        }

        // Use the caller-supplied class; previously "header" was hard-coded
        // here, so footers were emitted as class="header" as well.
        xhtml.startElement("div", "class", what);
        xhtml.characters(text);
        if (hasAutoPageNumber) {
            String pageNumber = pageNumberSuffix();
            if (pageNumber != null) {
                xhtml.characters(pageNumber);
            }
        }
        xhtml.endElement("div");
    }

    /**
     * Formats the current page number, prefixed with a tab, according to
     * the declared format.
     *
     * @return the text to append, or null for a format that is not supported
     */
    private String pageNumberSuffix() {
        if (autoPageNumberFormat == null) { // raw number
            return "\t" + pageCount;
        }
        if (autoPageNumberFormat.equals("upper-roman")) {
            return "\t" + AutoPageNumberUtils.asRomanNumerals(pageCount);
        }
        if (autoPageNumberFormat.equals("lower-roman")) {
            return "\t" + AutoPageNumberUtils.asRomanNumeralsLower(pageCount);
        }
        if (autoPageNumberFormat.equals("upper-alpha")) {
            return "\t" + AutoPageNumberUtils.asAlphaNumeric(pageCount);
        }
        if (autoPageNumberFormat.equals("lower-alpha")) {
            return "\t" + AutoPageNumberUtils.asAlphaNumericLower(pageCount);
        }
        return null;
    }
}
/**
 * Represents Footnotes in a document. The way these work
 * in the file format isn't very clean...
 * <p>
 * Text is accumulated per footnote mark: every piece of text is attached to
 * the mark that was recorded most recently.
 */
private static class Footnotes {
    /** Mark -> Text */
    Map<String, String> footnotes = new HashMap<String, String>();
    /** The mark that subsequent text belongs to; null until a mark was recorded. */
    String lastSeenMark = null;

    /**
     * Normally happens before the text of the mark
     *
     * @param mark the footnote mark that following text belongs to
     */
    private void recordMark(String mark) {
        lastSeenMark = mark;
    }

    /**
     * Appends text to the footnote of the last recorded mark. Text that
     * arrives before any mark was recorded is dropped.
     *
     * @param text the text to append
     */
    private void text(String text) {
        if (lastSeenMark == null) {
            return;
        }
        // Text can arrive in several pieces, so extend what is already stored.
        if (footnotes.containsKey(lastSeenMark)) {
            text = footnotes.get(lastSeenMark) + text;
        }
        footnotes.put(lastSeenMark, text);
    }
}
/**
 * Represents Annotations in a document. We currently
 * just grab all the sf:p text in each one
 * <p>
 * Usage is {@code start(id)}, any number of {@code text(...)} calls, then
 * {@code end()}, which stores the collected text under the ID.
 */
private class Annotations {
    /** ID -> Text */
    Map<String, String> annotations = new HashMap<String, String>();
    /** ID of the annotation being collected, or null when none is open. */
    String currentID = null;
    /** Text collected for the open annotation, or null when none is open. */
    StringBuilder currentText = null;

    /**
     * Begins collecting text for the annotation with the given ID.
     *
     * @param id the ID the collected text will be stored under
     */
    private void start(String id) {
        currentID = id;
        currentText = new StringBuilder();
    }

    /**
     * Appends text to the open annotation; ignored when the text is empty
     * or no annotation is open.
     *
     * @param text the text to append
     */
    private void text(String text) {
        if (text != null && text.length() > 0 && currentText != null) {
            currentText.append(text);
        }
    }

    /**
     * Finishes the open annotation, storing its text if any was collected.
     * Safe to call when no annotation is open.
     */
    private void end() {
        if (currentText != null && currentText.length() > 0) {
            annotations.put(currentID, currentText.toString());
        }
        // Always reset, so stray text after an empty annotation is not
        // appended to a stale buffer.
        currentID = null;
        currentText = null;
    }
}
}