org.apache.tika.parser.microsoft.xml.WordMLParser Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of tika-parsers Show documentation
There is a newer version: 1.0.18
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.microsoft.xml;

import javax.xml.namespace.QName;
import java.io.IOException;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;

import org.apache.commons.codec.binary.Base64;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.TeeContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;
import org.xml.sax.helpers.DefaultHandler;

/**
 * Parses wordml 2003 format word files.  These are single xml files
 * that predate ooxml.
 *
 * @see {@url https://en.wikipedia.org/wiki/Microsoft_Office_XML_formats}
 */
public class WordMLParser extends AbstractXML2003Parser {


    //map between wordml and xhtml entities
    private final static Map WORDML_TO_XHTML =
            new ConcurrentHashMap<>();

    //ignore all characters within these elements
    private final static Set IGNORE_CHARACTERS =
            Collections.newSetFromMap(new ConcurrentHashMap());

    private static final MediaType MEDIA_TYPE = MediaType.application("vnd.ms-wordml");
    private static final Set SUPPORTED_TYPES =
            Collections.unmodifiableSet(new HashSet<>(Arrays.asList(
                    MEDIA_TYPE)));

    static {
        WORDML_TO_XHTML.put(P, P);
        WORDML_TO_XHTML.put("tbl", TABLE);
        WORDML_TO_XHTML.put(TR, TR);
        WORDML_TO_XHTML.put("tc", TD);//not a typo -- table cell -> tc

        IGNORE_CHARACTERS.add(new QName(WORD_ML_URL, HLINK));
        IGNORE_CHARACTERS.add(new QName(WORD_ML_URL, PICT));
        IGNORE_CHARACTERS.add(new QName(WORD_ML_URL, BIN_DATA));
        IGNORE_CHARACTERS.add(new QName(MS_OFFICE_PROPERTIES_URN,
                DOCUMENT_PROPERTIES));
    }

    @Override
    public Set getSupportedTypes(ParseContext context) {
        return SUPPORTED_TYPES;
    }

    @Override
    protected ContentHandler getContentHandler(ContentHandler ch,
                                        Metadata metadata, ParseContext context) {

        return new TeeContentHandler(
                super.getContentHandler(ch, metadata, context),
                new WordMLHandler(ch),
                new HyperlinkHandler(ch,
                        WORD_ML_URL),
                new PictHandler(ch,
                        EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context)));
    }

    @Override
    public void setContentType(Metadata metadata) {
        metadata.set(Metadata.CONTENT_TYPE, MEDIA_TYPE.toString());
    }

    private class WordMLHandler extends DefaultHandler {
        private final ContentHandler handler;
        private boolean ignoreCharacters;
        private boolean inBody = false;

        //use inP to keep track of whether the handler is
        //in a paragraph or not. 
 was allowed
        //in wordml. Use this boolean to prevent  within 

        private boolean inP;

        public WordMLHandler(ContentHandler handler) {
            this.handler = handler;
        }

        @Override
        public void startElement(String uri, String localName, String qName, Attributes attrs)
                throws SAXException {
            localName = localName.toLowerCase(Locale.US);
            if (WORD_ML_URL.equals(uri)) {
                if (BODY.equals(localName)) {
                    inBody = true;
                    return;
                }
                String html = WORDML_TO_XHTML.get(localName);
                if (html != null) {
                    if (P.equals(localName)) {
                        //close p if already in a p to prevent nested 

                        if (inP) {
                            handler.endElement(XHTMLContentHandler.XHTML, P, P);
                        }
                        inP = true;
                    }
                    handler.startElement(XHTMLContentHandler.XHTML, html, html, EMPTY_ATTRS);
                    if (html.equals(TABLE)) {
                        handler.startElement(XHTMLContentHandler.XHTML, TBODY, TBODY, EMPTY_ATTRS);
                    }
                }
                if (BR.equals(localName)) {
                    handler.characters(NEWLINE, 0, 1);
                }

            }
            if (IGNORE_CHARACTERS.contains(new QName(uri, localName))) {
                ignoreCharacters = true;
            }
        }

        @Override
        public void characters(char[] str , int offset, int len) throws SAXException {
            if (!ignoreCharacters && inBody) {
                handler.characters(str, offset, len);
            }
        }
        
        @Override
        public void endElement(String uri, String localName, String qName) throws SAXException {
            if (WORD_ML_URL.equals(uri)) {
                //for now, don't bother checking for end of body...if there's any text
                //after the close of body, we should extract it
                localName = localName.toLowerCase(Locale.US);
                String html = WORDML_TO_XHTML.get(localName);
                if (html != null) {
                    if (html.equals(TABLE)) {
                        handler.endElement(XHTMLContentHandler.XHTML, TBODY, TBODY);
                    }
                    if (P.equals(html) && !inP) {
                        //start p if not already in one to prevent non-matching 
                        handler.startElement(XHTMLContentHandler.XHTML, P, P, EMPTY_ATTRS);
                    }
                    handler.endElement(XHTMLContentHandler.XHTML, html, html);

                    if (P.equals(html)) {
                        inP = false;
                    }
                }
            }
            if (IGNORE_CHARACTERS.contains(new QName(uri, localName))) {
                ignoreCharacters = false;
            }

        }
    }

    private class PictHandler extends DefaultHandler {
        final StringBuilder buffer = new StringBuilder();
        final ContentHandler handler;
        byte[] rawBytes = null;
        EmbeddedDocumentExtractor embeddedDocumentExtractor;
        boolean inPict = false;
        boolean inBin = false;
        String pictName = null;
        String pictSource = null;
        final Base64 base64 = new Base64();

        public PictHandler(ContentHandler handler, EmbeddedDocumentExtractor embeddedDocumentExtractor) {
            this.handler = handler;
            this.embeddedDocumentExtractor = embeddedDocumentExtractor;
        }

        @Override
        public void startElement(String uri, String localName, String qName, Attributes attrs)
                throws SAXException {
            if (WORD_ML_URL.equals(uri)) {
                if (PICT.equals(localName)) {
                    inPict = true;
                } else if (BIN_DATA.equals(localName)) {
                    inBin = true;
                    pictName = attrs.getValue(WORD_ML_URL, NAME_ATTR);
                    if (pictName != null) {
                        pictName = pictName.replaceFirst("wordml://", "");
                    }
                }
            } else if (MS_VML_URN.equals(uri)) {
                if (localName.equals("imagedata")) {
                    //src is an internal designator with an extension
                    String src = attrs.getValue("", "src");
                    //title appears to be the original file name
                    String title = attrs.getValue(MS_OFFICE_PROPERTIES_URN, "title");
                    if (title != null && ! title.equals("")) {
                        if (src != null) {
                            //take the extention from the src and append it to the title
                            int i = src.lastIndexOf(".");
                            if (i > -1 && i +1 < src.length()) {
                                String ext = src.substring(i);
                                title += ext;
                            }
                        }
                        pictSource = title;
                    }
                }
            }
        }

        @Override
        public void characters(char[] str , int offset, int len) throws SAXException {
            if (inBin) {
                buffer.append(str, offset, len);
            } else if (inPict){
                handler.characters(str, offset, len);
            }
        }

        @Override
        public void endElement(String uri, String localName, String qName) throws SAXException {
            if (!WORD_ML_URL.equals(uri)) {
                return;
            }
            //somewhat tricky...
            //can't just dump bin_data at the end of the
            //bin_data element because there may be metadata
            //after it, if it is within a pict element
            //.
            //However, if you aren't in a pict (say docOLEdata), then do dump binary
            //data at the end of the bin data.
            if (PICT.equals(localName)) {
                inPict = false;
                AttributesImpl attrs = new AttributesImpl();
                if (pictName != null) {
                    attrs.addAttribute(XHTMLContentHandler.XHTML,
                            HREF, HREF, CDATA, pictName);
                }
                handler.startElement(XHTMLContentHandler.XHTML,
                        IMG, IMG, attrs);
                handler.endElement(
                        XHTMLContentHandler.XHTML, IMG, IMG);
                handleEmbedded();
            } else if (BIN_DATA.equals(localName)) {
                inBin = false;
                rawBytes = base64.decode(buffer.toString());
                //reset
                buffer.setLength(0);

                if (! inPict) {
                    handleEmbedded();
                }
            }
        }

        private void handleEmbedded() throws SAXException {
            if (rawBytes != null) {
                try (TikaInputStream is = TikaInputStream.get(rawBytes)) {
                    Metadata metadata = new Metadata();
                    if (pictName != null) {
                        metadata.set(Metadata.RESOURCE_NAME_KEY, pictName);
                    }
                    if (pictSource != null) {
                        metadata.set(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, pictSource);
                    }
                    if (embeddedDocumentExtractor.shouldParseEmbedded(metadata)) {
                        embeddedDocumentExtractor.parseEmbedded(is,
                                handler, metadata, false);
                    }
                } catch (IOException e) {
                    //log
                }
            }
            //reset
            pictName = null;
            pictSource = null;
            rawBytes = null;
        }
    }
}