All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.james.jmap.utils.JsoupHtmlTextExtractor Maven / Gradle / Ivy

/****************************************************************
 O * Licensed to the Apache Software Foundation (ASF) under one   *
 * or more contributor license agreements.  See the NOTICE file *
 * distributed with this work for additional information        *
 * regarding copyright ownership.  The ASF licenses this file   *
 * to you under the Apache License, Version 2.0 (the            *
 * "License"); you may not use this file except in compliance   *
 * with the License.  You may obtain a copy of the License at   *
 *                                                              *
 *   http://www.apache.org/licenses/LICENSE-2.0                 *
 *                                                              *
 * Unless required by applicable law or agreed to in writing,   *
 * software distributed under the License is distributed on an  *
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY       *
 * KIND, either express or implied.  See the License for the    *
 * specific language governing permissions and limitations      *
 * under the License.                                           *
 ****************************************************************/

package org.apache.james.jmap.utils;

import java.util.Optional;
import java.util.stream.Stream;

import org.apache.commons.lang.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class JsoupHtmlTextExtractor implements HtmlTextExtractor {

    private static final Logger LOGGER = LoggerFactory.getLogger(JsoupHtmlTextExtractor.class);
    public static final String BR_TAG = "br";
    public static final String UL_TAG = "ul";
    public static final String OL_TAG = "ol";
    public static final String LI_TAG = "li";
    public static final String P_TAG = "p";
    public static final String IMG_TAG = "img";
    public static final String ALT_TAG = "alt";
    public static final int INITIAL_LIST_NESTED_LEVEL = 0;

    @Override
    public String toPlainText(String html) {
        try {
            Document document = Jsoup.parse(html);

            Element body = Optional.ofNullable(document.body()).orElse(document);

            return flatten(body, INITIAL_LIST_NESTED_LEVEL)
                .map(this::convertNodeToText)
                .reduce("", (s1, s2) -> s1 + s2);
        } catch (Exception e) {
            LOGGER.warn("Failed extracting text from html", e);
            return html;
        }
    }

    private String convertNodeToText(HTMLNode htmlNode) {
        Node node = htmlNode.underlyingNode;
        if (node instanceof TextNode) {
            TextNode textNode = (TextNode) node;
            return textNode.getWholeText();
        }
        if (node instanceof Element) {
            Element element = (Element) node;
            if (element.tagName().equals(BR_TAG)) {
                return "\n";
            }
            if (isList(element)) {
                return convertListElement(htmlNode.listNestedLevel);
            }
            if (element.tagName().equals(OL_TAG)) {
                return "\n\n";
            }
            if (element.tagName().equals(LI_TAG)) {
                return "\n" + StringUtils.repeat(" ", htmlNode.listNestedLevel) + "- ";
            }
            if (element.tagName().equals(P_TAG)) {
                return "\n\n";
            }
            if (element.tagName().equals(IMG_TAG)) {
                return generateImageAlternativeText(element);
            }
        }
        return "";
    }

    private String generateImageAlternativeText(Element element) {
        return Optional.ofNullable(element.attributes().get(ALT_TAG))
            .map(StringUtils::normalizeSpace)
            .filter(s -> !s.isEmpty())
            .map(s -> "[" + s + "]")
            .orElse("");
    }

    private String convertListElement(int nestedLevel) {
        if (nestedLevel == 0) {
            return "\n\n";
        } else {
            return "";
        }
    }

    Stream flatten(Node base, int listNestedLevel) {
        Position position = getPosition(base);
        int nextElementLevel = getNewNestedLevel(listNestedLevel, base);

        Stream baseStream = Stream.of(new HTMLNode(base, listNestedLevel));
        Stream flatChildren = base.childNodes()
            .stream()
            .flatMap(node -> flatten(node, nextElementLevel));
        
        switch (position) {
            case PREFIX:
                return Stream.concat(baseStream, flatChildren);
            case SUFFIX:
                return Stream.concat(flatChildren, baseStream);
            default:
                throw new RuntimeException("Unexpected POSITION for node element: " + position);
        }
    }

    private int getNewNestedLevel(int listNestedLevel, Node node) {
        if (node instanceof Element) {
            Element element = (Element) node;
            if (isList(element)) {
                return listNestedLevel + 1;
            }
        }
        return listNestedLevel;
    }

    private boolean isList(Element element) {
        return element.tagName().equals(UL_TAG) || element.tagName().equals(OL_TAG);
    }

    private enum Position {
        PREFIX,
        SUFFIX
    }

    private Position getPosition(Node node) {
        if (node instanceof Element) {
            Element element = (Element) node;
            if (element.tagName().equals(LI_TAG)) {
                return Position.PREFIX;
            }
        }
        return Position.SUFFIX;
    }

    private static class HTMLNode {
        private final Node underlyingNode;
        private final int listNestedLevel;

        public HTMLNode(Node underlyingNode, int listNestedLevel) {
            this.underlyingNode = underlyingNode;
            this.listNestedLevel = listNestedLevel;
        }
    }

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy