All downloads are free. Search and download functionality uses the official Maven repository.

org.apache.tika.parser.microsoft.xml.WordMLParser Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.microsoft.xml;

import javax.xml.namespace.QName;
import java.io.IOException;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;

import org.apache.commons.codec.binary.Base64;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.TeeContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;
import org.xml.sax.helpers.DefaultHandler;

/**
 * Parses wordml 2003 format word files.  These are single xml files
 * that predate ooxml.
 *
 * @see {@url https://en.wikipedia.org/wiki/Microsoft_Office_XML_formats}
 */
public class WordMLParser extends AbstractXML2003Parser {


    //map between wordml and xhtml entities
    private final static Map WORDML_TO_XHTML =
            new ConcurrentHashMap<>();

    //ignore all characters within these elements
    private final static Set IGNORE_CHARACTERS =
            Collections.newSetFromMap(new ConcurrentHashMap());

    private static final MediaType MEDIA_TYPE = MediaType.application("vnd.ms-wordml");
    private static final Set SUPPORTED_TYPES =
            Collections.unmodifiableSet(new HashSet<>(Arrays.asList(
                    MEDIA_TYPE)));

    static {
        WORDML_TO_XHTML.put(P, P);
        WORDML_TO_XHTML.put("tbl", TABLE);
        WORDML_TO_XHTML.put(TR, TR);
        WORDML_TO_XHTML.put("tc", TD);//not a typo -- table cell -> tc

        IGNORE_CHARACTERS.add(new QName(WORD_ML_URL, HLINK));
        IGNORE_CHARACTERS.add(new QName(WORD_ML_URL, PICT));
        IGNORE_CHARACTERS.add(new QName(WORD_ML_URL, BIN_DATA));
        IGNORE_CHARACTERS.add(new QName(MS_OFFICE_PROPERTIES_URN,
                DOCUMENT_PROPERTIES));
    }

    @Override
    public Set getSupportedTypes(ParseContext context) {
        return SUPPORTED_TYPES;
    }

    @Override
    protected ContentHandler getContentHandler(ContentHandler ch,
                                        Metadata metadata, ParseContext context) {

        return new TeeContentHandler(
                super.getContentHandler(ch, metadata, context),
                new WordMLHandler(ch),
                new HyperlinkHandler(ch,
                        WORD_ML_URL),
                new PictHandler(ch,
                        EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context)));
    }

    @Override
    public void setContentType(Metadata metadata) {
        metadata.set(Metadata.CONTENT_TYPE, MEDIA_TYPE.toString());
    }

    private class WordMLHandler extends DefaultHandler {
        private final ContentHandler handler;
        private boolean ignoreCharacters;
        private boolean inBody = false;

        //use inP to keep track of whether the handler is
        //in a paragraph or not. 

was allowed //in wordml. Use this boolean to prevent

within

private boolean inP; public WordMLHandler(ContentHandler handler) { this.handler = handler; } @Override public void startElement(String uri, String localName, String qName, Attributes attrs) throws SAXException { localName = localName.toLowerCase(Locale.US); if (WORD_ML_URL.equals(uri)) { if (BODY.equals(localName)) { inBody = true; return; } String html = WORDML_TO_XHTML.get(localName); if (html != null) { if (P.equals(localName)) { //close p if already in a p to prevent nested

if (inP) { handler.endElement(XHTMLContentHandler.XHTML, P, P); } inP = true; } handler.startElement(XHTMLContentHandler.XHTML, html, html, EMPTY_ATTRS); if (html.equals(TABLE)) { handler.startElement(XHTMLContentHandler.XHTML, TBODY, TBODY, EMPTY_ATTRS); } } if (BR.equals(localName)) { handler.characters(NEWLINE, 0, 1); } } if (IGNORE_CHARACTERS.contains(new QName(uri, localName))) { ignoreCharacters = true; } } @Override public void characters(char[] str , int offset, int len) throws SAXException { if (!ignoreCharacters && inBody) { handler.characters(str, offset, len); } } @Override public void endElement(String uri, String localName, String qName) throws SAXException { if (WORD_ML_URL.equals(uri)) { //for now, don't bother checking for end of body...if there's any text //after the close of body, we should extract it localName = localName.toLowerCase(Locale.US); String html = WORDML_TO_XHTML.get(localName); if (html != null) { if (html.equals(TABLE)) { handler.endElement(XHTMLContentHandler.XHTML, TBODY, TBODY); } if (P.equals(html) && !inP) { //start p if not already in one to prevent non-matching

handler.startElement(XHTMLContentHandler.XHTML, P, P, EMPTY_ATTRS); } handler.endElement(XHTMLContentHandler.XHTML, html, html); if (P.equals(html)) { inP = false; } } } if (IGNORE_CHARACTERS.contains(new QName(uri, localName))) { ignoreCharacters = false; } } } private class PictHandler extends DefaultHandler { final StringBuilder buffer = new StringBuilder(); final ContentHandler handler; byte[] rawBytes = null; EmbeddedDocumentExtractor embeddedDocumentExtractor; boolean inPict = false; boolean inBin = false; String pictName = null; String pictSource = null; final Base64 base64 = new Base64(); public PictHandler(ContentHandler handler, EmbeddedDocumentExtractor embeddedDocumentExtractor) { this.handler = handler; this.embeddedDocumentExtractor = embeddedDocumentExtractor; } @Override public void startElement(String uri, String localName, String qName, Attributes attrs) throws SAXException { if (WORD_ML_URL.equals(uri)) { if (PICT.equals(localName)) { inPict = true; } else if (BIN_DATA.equals(localName)) { inBin = true; pictName = attrs.getValue(WORD_ML_URL, NAME_ATTR); if (pictName != null) { pictName = pictName.replaceFirst("wordml://", ""); } } } else if (MS_VML_URN.equals(uri)) { if (localName.equals("imagedata")) { //src is an internal designator with an extension String src = attrs.getValue("", "src"); //title appears to be the original file name String title = attrs.getValue(MS_OFFICE_PROPERTIES_URN, "title"); if (title != null && ! 
title.equals("")) { if (src != null) { //take the extention from the src and append it to the title int i = src.lastIndexOf("."); if (i > -1 && i +1 < src.length()) { String ext = src.substring(i); title += ext; } } pictSource = title; } } } } @Override public void characters(char[] str , int offset, int len) throws SAXException { if (inBin) { buffer.append(str, offset, len); } else if (inPict){ handler.characters(str, offset, len); } } @Override public void endElement(String uri, String localName, String qName) throws SAXException { if (!WORD_ML_URL.equals(uri)) { return; } //somewhat tricky... //can't just dump bin_data at the end of the //bin_data element because there may be metadata //after it, if it is within a pict element //. //However, if you aren't in a pict (say docOLEdata), then do dump binary //data at the end of the bin data. if (PICT.equals(localName)) { inPict = false; AttributesImpl attrs = new AttributesImpl(); if (pictName != null) { attrs.addAttribute(XHTMLContentHandler.XHTML, HREF, HREF, CDATA, pictName); } handler.startElement(XHTMLContentHandler.XHTML, IMG, IMG, attrs); handler.endElement( XHTMLContentHandler.XHTML, IMG, IMG); handleEmbedded(); } else if (BIN_DATA.equals(localName)) { inBin = false; rawBytes = base64.decode(buffer.toString()); //reset buffer.setLength(0); if (! inPict) { handleEmbedded(); } } } private void handleEmbedded() throws SAXException { if (rawBytes != null) { try (TikaInputStream is = TikaInputStream.get(rawBytes)) { Metadata metadata = new Metadata(); if (pictName != null) { metadata.set(Metadata.RESOURCE_NAME_KEY, pictName); } if (pictSource != null) { metadata.set(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, pictSource); } if (embeddedDocumentExtractor.shouldParseEmbedded(metadata)) { embeddedDocumentExtractor.parseEmbedded(is, handler, metadata, false); } } catch (IOException e) { //log } } //reset pictName = null; pictSource = null; rawBytes = null; } } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy