// org.apache.tika.parser.microsoft.xml.WordMLParser (Maven / Gradle / Ivy)
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.microsoft.xml;
import javax.xml.namespace.QName;
import java.io.IOException;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import org.apache.commons.codec.binary.Base64;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.TeeContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;
import org.xml.sax.helpers.DefaultHandler;
/**
* Parses WordML 2003 format Word files. These are single XML files
* that predate OOXML.
*
* @see <a href="https://en.wikipedia.org/wiki/Microsoft_Office_XML_formats">Microsoft Office XML formats</a>
*/
public class WordMLParser extends AbstractXML2003Parser {

    /** Maps lower-cased WordML element local names to the XHTML element emitted for them. */
    private static final Map<String, String> WORDML_TO_XHTML = new ConcurrentHashMap<>();

    /** Qualified names of elements whose character content {@link WordMLHandler} must not emit. */
    private static final Set<QName> IGNORE_CHARACTERS =
            Collections.newSetFromMap(new ConcurrentHashMap<QName, Boolean>());

    private static final MediaType MEDIA_TYPE = MediaType.application("vnd.ms-wordml");

    private static final Set<MediaType> SUPPORTED_TYPES =
            Collections.unmodifiableSet(new HashSet<>(Arrays.asList(MEDIA_TYPE)));

    static {
        WORDML_TO_XHTML.put(P, P);
        WORDML_TO_XHTML.put("tbl", TABLE);
        WORDML_TO_XHTML.put(TR, TR);
        WORDML_TO_XHTML.put("tc", TD); //not a typo -- table cell -> tc

        IGNORE_CHARACTERS.add(new QName(WORD_ML_URL, HLINK));
        IGNORE_CHARACTERS.add(new QName(WORD_ML_URL, PICT));
        IGNORE_CHARACTERS.add(new QName(WORD_ML_URL, BIN_DATA));
        IGNORE_CHARACTERS.add(new QName(MS_OFFICE_PROPERTIES_URN, DOCUMENT_PROPERTIES));
    }

    /**
     * Returns the media types handled by this parser.
     *
     * @param context the parse context (not consulted)
     * @return an unmodifiable set containing only {@code application/vnd.ms-wordml}
     */
    @Override
    public Set<MediaType> getSupportedTypes(ParseContext context) {
        return SUPPORTED_TYPES;
    }

    /**
     * Builds the SAX handler chain: the handler supplied by the superclass plus the
     * WordML-specific handlers for body text, hyperlinks and pictures/binary data.
     * Every handler receives every SAX event via a {@link TeeContentHandler}.
     *
     * @param ch       the downstream handler that receives the generated XHTML events
     * @param metadata the document metadata, passed through to the superclass
     * @param context  the parse context, used to look up the embedded document extractor
     * @return a handler that fans events out to all of the above handlers
     */
    @Override
    protected ContentHandler getContentHandler(ContentHandler ch,
                                               Metadata metadata, ParseContext context) {
        EmbeddedDocumentExtractor extractor =
                EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
        return new TeeContentHandler(
                super.getContentHandler(ch, metadata, context),
                new WordMLHandler(ch),
                new HyperlinkHandler(ch, WORD_ML_URL),
                new PictHandler(ch, extractor));
    }

    /**
     * Sets the content type of the given metadata to the WordML media type.
     *
     * @param metadata the metadata object to update
     */
    @Override
    public void setContentType(Metadata metadata) {
        metadata.set(Metadata.CONTENT_TYPE, MEDIA_TYPE.toString());
    }

    /**
     * Translates WordML structural elements (paragraphs, tables, rows, cells, breaks)
     * into XHTML events and forwards body text, skipping text inside the elements
     * listed in {@link #IGNORE_CHARACTERS}.
     */
    private static class WordMLHandler extends DefaultHandler {
        private final ContentHandler handler;

        //number of currently open elements from IGNORE_CHARACTERS; a counter
        //rather than a flag so that the end of a nested ignored element does not
        //re-enable output while an outer ignored element is still open
        private int ignoreDepth = 0;

        //set once the WordML body element has been opened; never cleared
        private boolean inBody = false;

        //use inP to keep track of whether the handler is in a paragraph or not.
        //The handler may see a paragraph start while one is already open;
        //this flag is used to prevent emitting a p within a p.
        private boolean inP = false;

        public WordMLHandler(ContentHandler handler) {
            this.handler = handler;
        }

        @Override
        public void startElement(String uri, String localName, String qName, Attributes attrs)
                throws SAXException {
            String lowered = localName.toLowerCase(Locale.US);
            if (WORD_ML_URL.equals(uri)) {
                if (BODY.equals(lowered)) {
                    inBody = true;
                    return;
                }
                String html = WORDML_TO_XHTML.get(lowered);
                if (html != null) {
                    if (P.equals(lowered)) {
                        //close p if already in a p to prevent nested <p>
                        if (inP) {
                            handler.endElement(XHTMLContentHandler.XHTML, P, P);
                        }
                        inP = true;
                    }
                    handler.startElement(XHTMLContentHandler.XHTML, html, html, EMPTY_ATTRS);
                    if (html.equals(TABLE)) {
                        handler.startElement(XHTMLContentHandler.XHTML, TBODY, TBODY, EMPTY_ATTRS);
                    }
                }
                if (BR.equals(lowered)) {
                    handler.characters(NEWLINE, 0, 1);
                }
            }
            if (isIgnored(uri, localName, lowered)) {
                ignoreDepth++;
            }
        }

        @Override
        public void characters(char[] str, int offset, int len) throws SAXException {
            if (ignoreDepth == 0 && inBody) {
                handler.characters(str, offset, len);
            }
        }

        @Override
        public void endElement(String uri, String localName, String qName) throws SAXException {
            String lowered = localName.toLowerCase(Locale.US);
            if (WORD_ML_URL.equals(uri)) {
                //for now, don't bother checking for end of body...if there's any text
                //after the close of body, we should extract it
                String html = WORDML_TO_XHTML.get(lowered);
                if (html != null) {
                    if (html.equals(TABLE)) {
                        handler.endElement(XHTMLContentHandler.XHTML, TBODY, TBODY);
                    }
                    if (P.equals(html) && !inP) {
                        //start p if not already in one to prevent a non-matching </p>
                        handler.startElement(XHTMLContentHandler.XHTML, P, P, EMPTY_ATTRS);
                    }
                    handler.endElement(XHTMLContentHandler.XHTML, html, html);
                    if (P.equals(html)) {
                        inP = false;
                    }
                }
            }
            //guard against going negative so an unmatched end can never
            //leave the handler permanently out of balance
            if (ignoreDepth > 0 && isIgnored(uri, localName, lowered)) {
                ignoreDepth--;
            }
        }

        /**
         * Tests whether character content inside the given element is to be skipped.
         * Both the name as reported by the parser and its lower-cased form are tried,
         * so that start and end events always agree regardless of the letter case
         * of the names registered in {@link #IGNORE_CHARACTERS}.
         */
        private static boolean isIgnored(String uri, String localName, String loweredName) {
            return IGNORE_CHARACTERS.contains(new QName(uri, localName))
                    || IGNORE_CHARACTERS.contains(new QName(uri, loweredName));
        }
    }

    /**
     * Collects base64 text found in binary-data elements, emits an XHTML {@code img}
     * element for each picture element and hands the decoded bytes to the
     * {@link EmbeddedDocumentExtractor}.
     */
    private static class PictHandler extends DefaultHandler {
        private final StringBuilder buffer = new StringBuilder();
        private final ContentHandler handler;
        private final EmbeddedDocumentExtractor embeddedDocumentExtractor;
        private final Base64 base64 = new Base64();

        //decoded content of the most recently closed binary-data element, if any
        private byte[] rawBytes = null;
        private boolean inPict = false;
        private boolean inBin = false;
        //value of the binary-data element's name attribute, minus "wordml://"
        private String pictName = null;
        //title of the VML imagedata element plus the extension taken from its src
        private String pictSource = null;

        public PictHandler(ContentHandler handler,
                           EmbeddedDocumentExtractor embeddedDocumentExtractor) {
            this.handler = handler;
            this.embeddedDocumentExtractor = embeddedDocumentExtractor;
        }

        @Override
        public void startElement(String uri, String localName, String qName, Attributes attrs)
                throws SAXException {
            if (WORD_ML_URL.equals(uri)) {
                if (PICT.equals(localName)) {
                    inPict = true;
                } else if (BIN_DATA.equals(localName)) {
                    inBin = true;
                    pictName = attrs.getValue(WORD_ML_URL, NAME_ATTR);
                    if (pictName != null) {
                        pictName = pictName.replaceFirst("wordml://", "");
                    }
                }
            } else if (MS_VML_URN.equals(uri)) {
                if ("imagedata".equals(localName)) {
                    //src is an internal designator with an extension
                    String src = attrs.getValue("", "src");
                    //title appears to be the original file name
                    String title = attrs.getValue(MS_OFFICE_PROPERTIES_URN, "title");
                    if (title != null && !title.equals("")) {
                        if (src != null) {
                            //take the extension from the src and append it to the title;
                            //a trailing '.' with nothing after it is not an extension
                            int dot = src.lastIndexOf(".");
                            if (dot > -1 && dot + 1 < src.length()) {
                                title += src.substring(dot);
                            }
                        }
                        pictSource = title;
                    }
                }
            }
        }

        @Override
        public void characters(char[] str, int offset, int len) throws SAXException {
            if (inBin) {
                //base64 payload: buffer it, never send it downstream as text
                buffer.append(str, offset, len);
            } else if (inPict) {
                handler.characters(str, offset, len);
            }
        }

        @Override
        public void endElement(String uri, String localName, String qName) throws SAXException {
            if (!WORD_ML_URL.equals(uri)) {
                return;
            }
            //somewhat tricky...
            //can't just dump the binary data at the end of the binary-data
            //element because there may be metadata after it (such as the VML
            //imagedata title read in startElement) if it is within a pict element.
            //However, if you aren't in a pict (say docOLEdata), then do dump the
            //binary data at the end of the binary-data element.
            if (PICT.equals(localName)) {
                inPict = false;
                AttributesImpl attrs = new AttributesImpl();
                if (pictName != null) {
                    attrs.addAttribute(XHTMLContentHandler.XHTML,
                            HREF, HREF, CDATA, pictName);
                }
                handler.startElement(XHTMLContentHandler.XHTML, IMG, IMG, attrs);
                handler.endElement(XHTMLContentHandler.XHTML, IMG, IMG);
                handleEmbedded();
            } else if (BIN_DATA.equals(localName)) {
                inBin = false;
                rawBytes = base64.decode(buffer.toString());
                //reset
                buffer.setLength(0);
                if (!inPict) {
                    handleEmbedded();
                }
            }
        }

        /**
         * Passes the pending decoded bytes, if any, to the embedded document extractor
         * and then clears the per-picture state (name, source and bytes).
         *
         * @throws SAXException if the extractor fails while writing to the handler
         */
        private void handleEmbedded() throws SAXException {
            try {
                if (rawBytes == null) {
                    return;
                }
                try (TikaInputStream is = TikaInputStream.get(rawBytes)) {
                    Metadata metadata = new Metadata();
                    if (pictName != null) {
                        metadata.set(Metadata.RESOURCE_NAME_KEY, pictName);
                    }
                    if (pictSource != null) {
                        metadata.set(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, pictSource);
                    }
                    if (embeddedDocumentExtractor.shouldParseEmbedded(metadata)) {
                        embeddedDocumentExtractor.parseEmbedded(is, handler, metadata, false);
                    }
                } catch (IOException ignored) {
                    //deliberately best-effort: an unreadable embedded resource must
                    //not abort parsing of the containing document.
                    //TODO: record/log this once a logger or the parent metadata
                    //is available to this handler
                }
            } finally {
                //reset, even when a SAXException propagates
                pictName = null;
                pictSource = null;
                rawBytes = null;
            }
        }
    }
}