All downloads are free. The search and download functionality uses the official Maven repository.

org.apache.tika.parser.odf.FlatOpenDocumentParser Maven / Gradle / Ivy

There is a newer version: 2024.11.18751.20241128T090041Z-241100
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.odf;

import org.apache.commons.io.input.CloseShieldInputStream;
import org.apache.tika.config.Field;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.ContentHandlerDecorator;
import org.apache.tika.sax.EmbeddedContentHandler;
import org.apache.tika.sax.OfflineContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
import org.apache.tika.utils.XMLReaderUtils;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;

import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.Set;

/**
 * Parser for "flat" OpenDocument files, i.e. a single XML document rather than a zip package.
 *
 * <p>The parser advertises the generic {@link #FLAT_OD} type plus the three flat subtypes.
 * The concrete subtype (text / presentation / spreadsheet) is only known once the
 * {@code mimetype} attribute of the {@code document} element has been read, so the
 * {@link Metadata#CONTENT_TYPE} entry is updated after the SAX parse has completed.
 */
public class FlatOpenDocumentParser extends AbstractParser {

    private static final long serialVersionUID = -8739250869531737584L;

    /** Generic flat OpenDocument type, used before the concrete subtype is known. */
    static final MediaType FLAT_OD = MediaType.application("vnd.oasis.opendocument.tika.flat.document");

    static final MediaType FLAT_ODT = MediaType.application("vnd.oasis.opendocument.flat.text");
    static final MediaType FLAT_ODP = MediaType.application("vnd.oasis.opendocument.flat.presentation");
    static final MediaType FLAT_ODS = MediaType.application("vnd.oasis.opendocument.flat.spreadsheet");

    // Non-flat types: these are the values compared against the document's "mimetype" attribute.
    static final MediaType ODT = MediaType.application("vnd.oasis.opendocument.text");
    static final MediaType ODP = MediaType.application("vnd.oasis.opendocument.presentation");
    static final MediaType ODS = MediaType.application("vnd.oasis.opendocument.spreadsheet");

    /** Immutable set of the media types handled by this parser. */
    private static final Set<MediaType> SUPPORTED_TYPES =
            Collections.unmodifiableSet(new HashSet<>(Arrays.asList(
                    FLAT_OD, FLAT_ODT, FLAT_ODP, FLAT_ODS
                  )));

    /** Whether content of {@code scripts} elements is routed to the macro handler. Off by default. */
    private boolean extractMacros = false;

    /**
     * Returns the media types supported by this parser.
     *
     * @param context the parse context (not consulted)
     * @return an unmodifiable set containing the generic flat type and the three flat subtypes
     */
    @Override
    public Set<MediaType> getSupportedTypes(ParseContext context) {
        return SUPPORTED_TYPES;
    }

    /**
     * Parses a flat OpenDocument stream, emitting XHTML SAX events to {@code handler} and
     * populating {@code metadata}.
     *
     * <p>{@code startDocument()} is issued before parsing and {@code endDocument()} is issued
     * in a {@code finally} block, so the XHTML document is closed even when parsing fails.
     * The content type in {@code metadata} is overwritten only if the parse completed and a
     * recognized subtype was detected.
     *
     * @param stream   the document stream; it is shielded so that the SAX parse does not close it
     * @param handler  receiver of the XHTML output
     * @param metadata metadata to populate; {@link Metadata#CONTENT_TYPE} may be updated
     * @param context  the parse context handed to the SAX utilities and the sub-handlers
     * @throws IOException   if the stream cannot be read
     * @throws SAXException  if the XML cannot be processed
     * @throws TikaException if the parse fails for another reason
     */
    @Override
    public void parse(InputStream stream, ContentHandler handler,
                      Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
        final XHTMLContentHandler xhtml =
                new XHTMLContentHandler(handler, metadata);

        xhtml.startDocument();
        try {
            FlatOpenDocumentParserHandler fodHandler = getContentHandler(xhtml, metadata, context);
            XMLReaderUtils.parseSAX(
                    new CloseShieldInputStream(stream),
                    new OfflineContentHandler(
                            new EmbeddedContentHandler(fodHandler)
                    ), context);
            //can only detect subtype (text/pres/sheet) during parse.
            //update it here.
            MediaType detected = fodHandler.getDetectedType();
            if (detected != null) {
                metadata.set(Metadata.CONTENT_TYPE, detected.toString());
            }
        } finally {
            xhtml.endDocument();
        }
    }

    /**
     * Enables or disables macro extraction.
     *
     * @param extractMacros {@code true} to route {@code scripts} content to the macro handler
     */
    @Field
    public void setExtractMacros(boolean extractMacros) {
        this.extractMacros = extractMacros;
    }

    /**
     * Creates the dispatching handler for a single parse.
     *
     * <p>The concrete type is returned so that the caller can read the detected subtype
     * afterwards without a downcast.
     */
    private FlatOpenDocumentParserHandler getContentHandler(ContentHandler handler,
                                                            Metadata metadata, ParseContext context) {
        return new FlatOpenDocumentParserHandler(handler, metadata, context, extractMacros);
    }

    /**
     * SAX handler that routes events to a section-specific handler depending on which
     * top-level section ({@code meta}, {@code body}, {@code scripts}) is currently open,
     * and records the subtype announced by the {@code document} element.
     *
     * <p>Only {@code startElement}, {@code characters} and {@code endElement} are routed;
     * elements are matched on their local name only, without checking the namespace.
     */
    private static class FlatOpenDocumentParserHandler extends ContentHandlerDecorator {
        private static final String META = "meta";
        private static final String BODY = "body";
        private static final String SCRIPTS = "scripts";
        private static final String DOCUMENT = "document";


        /** No-op sink used while outside of any routed section. */
        private final ContentHandler defaultHandler = new DefaultHandler();

        private final ContentHandler bodyHandler;
        private final ContentHandler metadataHandler;
        /** {@code null} when macro extraction is disabled. */
        private final ContentHandler macroHandler;

        private ContentHandler currentHandler = defaultHandler;

        /** Flat subtype derived from the {@code mimetype} attribute; {@code null} until recognized. */
        private MediaType detectedType = null;
        private final boolean extractMacros;

        /**
         * @param baseHandler   receiver of body (and, if enabled, macro) output
         * @param metadata      metadata object handed to the OpenDocument meta handler
         * @param parseContext  parse context handed to the sub-handlers
         * @param extractMacros whether to create and use a macro handler
         */
        private FlatOpenDocumentParserHandler(ContentHandler baseHandler,
                                              Metadata metadata,
                                              ParseContext parseContext, boolean extractMacros) {
            this.extractMacros = extractMacros;

            this.bodyHandler =
                    new OfflineContentHandler(
                            new OpenDocumentBodyHandler(
                                    new NSNormalizerContentHandler(baseHandler), parseContext));

            this.metadataHandler = new OfflineContentHandler(
                    new NSNormalizerContentHandler(
                            OpenDocumentMetaParser.getContentHandler(metadata, parseContext)
                    )
            );

            if (extractMacros) {
                this.macroHandler = new OfflineContentHandler(
                        new FlatOpenDocumentMacroHandler(baseHandler, parseContext)
                );
            } else {
                this.macroHandler = null;
            }
        }

        /**
         * @return the flat subtype detected so far, or {@code null} if the {@code document}
         *         element carried no recognized {@code mimetype} attribute
         */
        MediaType getDetectedType() {
            return detectedType;
        }

        /**
         * Switches the active handler when a routed section starts, records the subtype from
         * the {@code document} element, then forwards the event. Because the switch happens
         * before forwarding, a section handler receives the start event of its own section.
         */
        @Override
        public void startElement(
                String namespaceURI, String localName, String qName,
                Attributes attrs) throws SAXException {

            if (META.equals(localName)) {
                currentHandler = metadataHandler;
            } else if (BODY.equals(localName)) {
                currentHandler = bodyHandler;
            } else if (extractMacros && SCRIPTS.equals(localName)) {
                currentHandler = macroHandler;
            }

            //trust the mimetype element if it exists for the subtype
            if (DOCUMENT.equals(localName)) {
                String mime = XMLReaderUtils.getAttrValue("mimetype", attrs);
                if (mime != null) {
                    // The attribute carries the non-flat type; map it to the flat counterpart.
                    if (mime.equals(ODT.toString())) {
                        detectedType = FLAT_ODT;
                    } else if (mime.equals(ODP.toString())) {
                        detectedType = FLAT_ODP;
                    } else if (mime.equals(ODS.toString())) {
                        detectedType = FLAT_ODS;
                    }
                }
            }
            currentHandler.startElement(namespaceURI, localName, qName, attrs);
        }

        /** Forwards character data to whichever handler is currently active. */
        @Override
        public void characters(char[] ch, int start, int length)
                throws SAXException {
            currentHandler.characters(ch, start, length);
        }

        /**
         * Switches back to the no-op handler when a routed section ends, then forwards the
         * event. Because the switch happens before forwarding, the end event of a routed
         * section goes to the no-op handler and not to the section handler.
         */
        @Override
        public void endElement(
                String namespaceURI, String localName, String qName) throws SAXException {
            boolean sectionClosed = META.equals(localName)
                    || BODY.equals(localName)
                    || (extractMacros && SCRIPTS.equals(localName));
            if (sectionClosed) {
                currentHandler = defaultHandler;
            }
            currentHandler.endElement(namespaceURI, localName, qName);
        }
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy