All Downloads are FREE. The search and download functionality uses the official Maven repository.

org.apache.tika.parser.mif.MIFExtractor Maven / Gradle / Ivy

There is a newer version: 3.1.0
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.mif;

import java.io.IOException;
import java.util.ArrayList;
import java.util.EmptyStackException;
import java.util.List;
import java.util.Stack;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.lang3.StringEscapeUtils;
import org.apache.commons.lang3.StringUtils;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;

import org.apache.tika.detect.AutoDetectReader;

/**
 * Helper class to parse and extract Adobe MIF files.
 *
 * <p>The input is read line by line. Each line is classified as one of:
 * <ul>
 *   <li>an <em>open</em> line: contains {@code <} but no {@code >}; starts a nested element;</li>
 *   <li>a <em>close</em> line: starts with {@code >} after trimming; ends the innermost
 *       open element;</li>
 *   <li>a <em>self-contained</em> line: anything else that matches
 *       {@code <name value>}; emitted as an element with character content.</li>
 * </ul>
 * Lines that match none of these are ignored.
 */
public class MIFExtractor {

    /** Matches an opening line and captures the tag name in group 1. */
    private static final Pattern OPEN_TAG_PATTERN = Pattern.compile("<(\\S*).*");
    /** Matches {@code <name value>}; group 2 is the name, group 4 the raw value. */
    private static final Pattern SELF_TAG_PATTERN =
            Pattern.compile("(<(\\S*))(\\s)(.*)(\\>).*");
    private static final String OPEN_TAG_MARKER = "<";
    private static final String CLOSE_TAG_MARKER = ">";
    /** Leading quote character stripped from a tag value before it is emitted. */
    private static final String START_TAG_VALUE = "`";
    /** Trailing quote character stripped from a tag value before it is emitted. */
    private static final String END_TAG_VALUE = "'";

    /**
     * Parses the file supplied through the reader and emits events to the supplied content
     * handler.
     *
     * @param reader  the reader to use.
     * @param handler the content handler to use.
     * @throws IOException  on any IO error.
     * @throws SAXException on any SAX error.
     */
    static void parse(AutoDetectReader reader, ContentHandler handler)
            throws IOException, SAXException {
        handler.startDocument();
        // Currently open (not yet closed) tags, innermost on top.
        Stack<Tag> parents = new Stack<>();
        String line;
        while ((line = reader.readLine()) != null) {
            String trimmed = line.trim();
            if (line.contains(OPEN_TAG_MARKER) && !line.contains(CLOSE_TAG_MARKER)) {
                openTag(handler, parents, trimmed);
            } else if (trimmed.startsWith(CLOSE_TAG_MARKER)) {
                closeTag(handler, parents);
            } else {
                selfContainedTag(handler, parents, trimmed);
            }
        }
        handler.endDocument();
    }

    /**
     * Handles an opening line: pushes a new tag and emits its start element.
     *
     * @param handler the content handler.
     * @param parents the stack of currently open tags.
     * @param trimmed the trimmed input line.
     * @throws SAXException on any SAX error.
     */
    private static void openTag(ContentHandler handler, Stack<Tag> parents, String trimmed)
            throws SAXException {
        Matcher matcher = OPEN_TAG_PATTERN.matcher(trimmed);
        if (!matcher.matches()) {
            return;
        }
        String name = matcher.group(1);
        // Always start from a fresh Tag so a previously closed tag is never renamed/reused.
        Tag tag = new Tag();
        tag.setName(name);
        if (!parents.empty()) {
            tag.setParent(parents.peek());
        }
        parents.push(tag);
        handler.startElement(StringUtils.EMPTY, name, name, new AttributesImpl());
    }

    /**
     * Handles a closing line: pops the innermost open tag, attaches it to its parent and
     * emits its end element.
     *
     * <p>The innermost tag is closed regardless of any trailing text on the line, so that
     * every emitted start element receives exactly one matching end element.
     *
     * @param handler the content handler.
     * @param parents the stack of currently open tags.
     * @throws SAXException on any SAX error.
     */
    private static void closeTag(ContentHandler handler, Stack<Tag> parents)
            throws SAXException {
        Tag closed;
        try {
            closed = parents.pop();
        } catch (EmptyStackException ignored) {
            // Stray close marker with nothing open: skip it and keep parsing.
            return;
        }
        if (!parents.empty()) {
            parents.peek().addChild(closed);
        }
        handler.endElement(StringUtils.EMPTY, closed.getName(), closed.getName());
    }

    /**
     * Handles a {@code <name value>} line: builds the tag, attaches it to the innermost open
     * tag (if any) and emits it. Lines that do not match the pattern are ignored.
     *
     * @param handler the content handler.
     * @param parents the stack of currently open tags.
     * @param trimmed the trimmed input line.
     * @throws SAXException on any SAX error.
     */
    private static void selfContainedTag(ContentHandler handler, Stack<Tag> parents,
                                         String trimmed) throws SAXException {
        Matcher matcher = SELF_TAG_PATTERN.matcher(trimmed);
        if (!matcher.matches()) {
            return;
        }
        Tag tag = new Tag();
        tag.setName(matcher.group(2));
        tag.setValue(matcher.group(4));
        if (!parents.empty()) {
            // Attach to the tag that is actually open, keeping parent/child links consistent.
            Tag parent = parents.peek();
            tag.setParent(parent);
            parent.addChild(tag);
        }
        processTag(handler, tag);
    }

    /**
     * Process a tag and emit events to Content Handler.
     *
     * <p>Emits a start element, the tag value as character data (with the surrounding
     * {@code `} and {@code '} quote characters removed), and an end element.
     *
     * @param handler the content handler.
     * @param tag     the tag to process.
     * @throws SAXException on any SAX error.
     */
    private static void processTag(ContentHandler handler, Tag tag) throws SAXException {
        Attributes attrs = new AttributesImpl();
        handler.startElement(StringUtils.EMPTY, tag.getName(), tag.getName(), attrs);
        String value = StringUtils.removeStart(tag.getValue(), START_TAG_VALUE);
        value = StringUtils.removeEnd(value, END_TAG_VALUE);
        // NOTE(review): the value is XML-escaped before being handed to the handler; confirm
        // that downstream handlers do not escape it a second time.
        String content = StringEscapeUtils.escapeXml(value);
        handler.characters(content.toCharArray(), 0, content.length());
        handler.endElement(StringUtils.EMPTY, tag.getName(), tag.getName());
    }

    /**
     * Helper model class for a MIF Tag to support parsing.
     */
    private static final class Tag {

        private List<Tag> children = new ArrayList<>();
        private Tag parent;
        private String name;
        private String value;

        public void addChild(Tag child) {
            children.add(child);
        }

        public List<Tag> getChildren() {
            return children;
        }

        public void setChildren(List<Tag> children) {
            this.children = children;
        }

        public String getName() {
            return name;
        }

        public void setName(String name) {
            this.name = name;
        }

        public String getValue() {
            return value;
        }

        public void setValue(String value) {
            this.value = value;
        }

        @Override
        public String toString() {
            return name + ":" + value;
        }

        public Tag getParent() {
            return parent;
        }

        public void setParent(Tag parent) {
            this.parent = parent;
        }

    }

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy