All downloads are free. Search and download functionality uses the official Maven repository.

org.apache.tika.parser.microsoft.FormattingUtils Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.microsoft;

import java.util.Deque;
import java.util.EnumSet;
import java.util.Locale;

import org.apache.poi.wp.usermodel.CharacterRun;
import org.apache.poi.xwpf.usermodel.UnderlinePatterns;
import org.apache.poi.xwpf.usermodel.XWPFRun;
import org.xml.sax.SAXException;

import org.apache.tika.sax.XHTMLContentHandler;

/**
 * Static helpers that translate character-run formatting (bold, italic, strike-through,
 * underline) into a stack of open XHTML formatting elements and keep that stack in sync
 * with the formatting required by the text being emitted.
 */
public class FormattingUtils {
    private FormattingUtils() {
        // static utility class, not meant to be instantiated
    }

    /**
     * Closes tags until {@code currentState} contains only tags from the {@code desired} set,
     * then opens every desired tag that is not already open.
     * <p>
     * Tags are closed from the top of the stack, so a tag that is still desired is closed too
     * when it sits above an undesired one; it is then reopened. Tags are opened in the
     * declaration order of {@link Tag}. The {@code desired} set is not modified.
     *
     * @param xhtml        handler
     * @param desired      desired formatting state
     * @param currentState current formatting state (stack of open formatting tags);
     *                     updated in place to reflect the tags closed and opened
     * @throws SAXException pass underlying handler exception
     */
    public static void ensureFormattingState(XHTMLContentHandler xhtml, EnumSet<Tag> desired,
                                             Deque<Tag> currentState) throws SAXException {
        EnumSet<Tag> undesired = EnumSet.complementOf(desired);

        // Unwind the stack until no undesired tag remains open anywhere in it.
        while (!currentState.isEmpty() && currentState.stream().anyMatch(undesired::contains)) {
            xhtml.endElement(currentState.pop().tagName());
        }

        // Work on a copy so the caller's set is left untouched.
        EnumSet<Tag> toOpen = EnumSet.copyOf(desired);
        toOpen.removeAll(currentState);
        for (Tag tag : toOpen) {
            currentState.push(tag);
            xhtml.startElement(tag.tagName());
        }
    }

    /**
     * Closes all formatting tags.
     *
     * @param xhtml           handler
     * @param formattingState current formatting state (stack of open formatting tags)
     * @throws SAXException pass underlying handler exception
     */
    public static void closeStyleTags(XHTMLContentHandler xhtml, Deque<Tag> formattingState)
            throws SAXException {
        // An empty desired set makes every open tag undesired, so the whole stack is closed.
        ensureFormattingState(xhtml, EnumSet.noneOf(Tag.class), formattingState);
    }

    /**
     * Collects the formatting tags that apply to the given character run.
     * <p>
     * Bold, italic and strike-through are read through the common {@link CharacterRun}
     * interface. Underline is only detected for {@link XWPFRun} (underline pattern other than
     * {@link UnderlinePatterns#NONE}) and for HWPF character runs (non-zero underline code);
     * any other implementation never yields {@link Tag#U}.
     *
     * @param run character run to inspect
     * @return a new, mutable set of the tags matching the run's formatting
     */
    public static EnumSet<Tag> toTags(CharacterRun run) {
        EnumSet<Tag> tags = EnumSet.noneOf(Tag.class);
        if (run.isBold()) {
            tags.add(Tag.B);
        }
        if (run.isItalic()) {
            tags.add(Tag.I);
        }
        if (run.isStrikeThrough()) {
            tags.add(Tag.S);
        }
        if (run instanceof XWPFRun) {
            XWPFRun xwpfRun = (XWPFRun) run;
            if (xwpfRun.getUnderline() != UnderlinePatterns.NONE) {
                tags.add(Tag.U);
            }
        } else if (run instanceof org.apache.poi.hwpf.usermodel.CharacterRun) {
            org.apache.poi.hwpf.usermodel.CharacterRun hwpfRun =
                    (org.apache.poi.hwpf.usermodel.CharacterRun) run;
            if (hwpfRun.getUnderlineCode() != 0) {
                tags.add(Tag.U);
            }
        }
        return tags;
    }

    /**
     * Formatting tags that can be opened around a run of text.
     */
    public enum Tag {
        // Do NOT reorder these constants: an EnumSet iterates in declaration order, which
        // determines the order in which tags are opened, and tests depend on that order.
        B, I, S, U;

        /**
         * Returns the element name for this tag.
         *
         * @return the constant's name in lower case (locale-independent)
         */
        public String tagName() {
            return name().toLowerCase(Locale.ROOT);
        }
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy