com.metreeca.xml.actions.Extract Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of metreeca-xml Show documentation
Connector kit for XML data.
The newest version!
/*
 * Copyright © 2013-2022 Metreeca srl
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.metreeca.xml.actions;

import com.metreeca.rest.Xtream;
import com.metreeca.rest.actions.Clean;

import org.w3c.dom.*;

import java.util.*;
import java.util.function.Function;

import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;

import static java.util.Arrays.asList;
import static java.util.Comparator.comparingDouble;

/**
 * Main X/HTMl content extraction.
 *
 * Identifies the X/HTML node containing the main textual content of a complex page.
 */
public final class Extract implements Function> {

	private static final Collection textual=new HashSet<>(asList(
			"h1", "h2", "h3", "h4", "h5", "h6",
			"p", "blockquote", "pre",
			"ul", "ol", "dl", "li", "dt", "dd",
			"table", "th", "td"
	));

	private static final Collection ignored=new HashSet<>(asList(
			"style", "script"
	));


	////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

	@Override public Optional apply(final Node root) {
		if ( root == null ) { return Optional.empty(); } else {
			return Xtream

					.of(annotate(root))

					.flatMap(new XPath<>(x -> x.nodes("//*")))

					.max(comparingDouble(value -> get(value, "echars", 0.0)))

					.map((node -> {

						try {

							// create a new document to provide a root for xpath queries

							final Document document=DocumentBuilderFactory
									.newInstance()
									.newDocumentBuilder()
									.newDocument();

							document.setDocumentURI(node.getBaseURI());
							document.appendChild(document.adoptNode(node.cloneNode(true)));
							document.normalizeDocument();

							return document;

						} catch ( final ParserConfigurationException unexpected ) {
							throw new RuntimeException(unexpected);
						}

					}));
		}
	}


	////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

	private  T annotate(final T node) {

		if ( node instanceof Document ) {

			((Document)node).normalizeDocument();

			annotate(((Document)node).getDocumentElement());

		} else if ( node instanceof Element && !ignored.contains(node.getNodeName()) ) {

			double xchars=0;
			double echars=0;

			int nodes=0;
			int blobs=0;

			final NodeList children=node.getChildNodes();

			for (int i=0, n=children.getLength(); i < n; ++i) {

				final Node child=annotate(children.item(i));

				xchars+=get(child, "xchars", 0.0);
				echars+=get(child, "echars", 0.0);

				if ( child instanceof Element ) { ++nodes; }
				if ( textual.contains(child.getNodeName()) ) { ++blobs; }

			}

			final boolean text=textual.contains(node.getNodeName()) && echars == 0;

			set(node, "xchars", xchars);
			set(node, "echars", text ? xchars : echars*(blobs+1)/(nodes+1));

			((Element)node).setAttribute("chars", String.format("%.1f/%.0f",
					get(node, "echars", 0.0),
					get(node, "xchars", 0.0)
			));

		} else if ( node instanceof Text ) {

			final double length=Clean.normalize(node.getTextContent()).length();

			set(node, "xchars", length*length);

		}

		return node;

	}


	////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

	@SuppressWarnings("unchecked") private  T get(final Node node, final String label, final T value) {
		return Optional.ofNullable((T)node.getUserData(label)).orElse(value);
	}

	private  void set(final Node node, final String label, final T value) {
		node.setUserData(label, value, null);
	}

}