/*
 * Copyright © 2013-2022 Metreeca srl
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.metreeca.text.linkers;

import com.metreeca.json.Frame;
import com.metreeca.rest.Xtream;
import com.metreeca.rest.actions.Clean;
import com.metreeca.rest.services.Logger;
import com.metreeca.text.*;

import org.eclipse.rdf4j.model.Statement;
import org.eclipse.rdf4j.model.Value;
import org.eclipse.rdf4j.model.vocabulary.RDF;
import org.eclipse.rdf4j.model.vocabulary.RDFS;

import java.util.AbstractMap.SimpleImmutableEntry;
import java.util.*;
import java.util.function.DoubleUnaryOperator;
import java.util.function.Function;
import java.util.stream.IntStream;
import java.util.stream.Stream;

import static com.metreeca.json.Frame.frame;
import static com.metreeca.rest.Toolbox.service;
import static com.metreeca.rest.services.Logger.logger;
import static com.metreeca.rest.services.Logger.time;

import static java.lang.String.format;
import static java.util.Collections.emptySet;
import static java.util.stream.Collectors.*;

public final class GraphLinker implements Function>> {

	private Function> finder=chunk -> chunk.tokens().stream().map(Chunk::new); // !!! default?

	private Function reader=token -> token.text(token.isUpper());
	private Function normalizer=new Clean().space(true).marks(true).smart(true);

	private Function, Stream>> matcher=anchors -> Stream.empty(); // !!! default?


	private final Logger logger=service(logger());


	////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

	/**
	 * Configures anchor extraction.
	 *
	 * @param finder a function extracting a stream of anchor chunks from a phrase chunk
	 *
	 * @return this linker
	 *
	 * @throws NullPointerException if {@code finder} is {@code null}
	 */
	public GraphLinker finder(final Function> finder) {

		if ( finder == null ) {
			throw new NullPointerException("null finder");
		}

		this.finder=finder;

		return this;
	}

	/**
	 * Configures token to text conversion.
	 *
	 * @param reader a function mapping tokens to textual representations to be fed to the {@linkplain #matcher}
	 *
	 * @return this linker
	 *
	 * @throws NullPointerException if {@code reader} is {@code null}
	 */
	public GraphLinker reader(final Function reader) {

		if ( reader == null ) {
			throw new NullPointerException("null reader");
		}

		this.reader=reader;

		return this;
	}

	/**
	 * Configures text normalization.
	 *
	 * @param normalizer a text normalization function
	 *
	 * @return this linker
	 *
	 * @throws NullPointerException if {@code normalizer} is {@code null}
	 */
	public GraphLinker normalizer(final Function normalizer) {

		if ( normalizer == null ) {
			throw new NullPointerException("null normalizer");
		}

		this.normalizer=normalizer;

		return this;
	}

	/**
	 * Configures candidate identification.
	 *
	 * @param matcher a function converting a stream of anchors to a stream of candidate entity descriptions
	 *
	 * @return this linker
	 *
	 * @throws NullPointerException if {@code matcher} is {@code null}
	 */
	public GraphLinker matcher(final Function, Stream>> matcher) {

		if ( matcher == null ) {
			throw new NullPointerException("null matcher");
		}

		this.matcher=matcher;

		return this;
	}


	////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

	@Override public Stream> apply(final Chunk chunk) {
		if ( chunk == null || chunk.tokens().isEmpty() ) { return Stream.empty(); } else {

			return time(() -> {

				final List targets=chunk
						.map(finder)
						.collect(toList());

				final List tokens=chunk.tokens().stream()
						.map(token -> token.root(reader.andThen(normalizer).apply(token)))
						.collect(toList());

				return Xtream

						.from(targets)

						.map(Chunk::text)// extract target anchors
						.map(normalizer)
						.distinct()

						.pipe(matcher) // match targets to candidates

						.map(match -> match.source(normalizer.apply(match.source()))) // normalize labels
						.distinct()

						.flatMap(match -> // look for complete anchors
								anchor(match.source(), tokens).map(match::source)
						)

						// make sure the complete anchor includes at least one of the original partial anchors
						// to prevent matching tokens like 'is' if the text includes 'Iceland'

						.filter(match -> targets.stream().anyMatch(target -> match.source().contains(target)))

						.sorted(Comparator // prefer longer anchors
								.>comparingInt(match -> match.source().length())
								.reversed()
						)

						.prune((x, y) -> // remove partially overlapping anchors
								!x.source().matches(y.source()) && x.source().intersects(y.source())
						)

						.batch(toList())

						.bagMap(this::weight) // compute local rank

						.sorted(Comparator // prefer weightier anchors
								.>comparingDouble(Match::weight)
								.reversed()
						)

						.prune((x, y) -> // retain only the weightiest match for each anchor
								x.source().intersects(y.source())
						)

						// retain only minimal identifying infos

						.map(match -> match.target(frame(match.target().focus())
								.values(RDF.TYPE, match.target().values(RDF.TYPE))
								.values(RDFS.LABEL, match.target().values(RDFS.LABEL))
								.values(RDFS.COMMENT, match.target().values(RDFS.COMMENT))
						));

			}).apply((t, v) -> logger.info(this, format(
					"processed <%,d> chars in <%,d> ms (<%,d> chars/s)", chunk.length(), t, 1000L*chunk.length()/t
			)));

		}
	}


	////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

	private Stream anchor(final CharSequence label, final List tokens) {
		return IntStream.range(0, tokens.size())

				.mapToObj(i -> {

					final StringBuilder anchor=new StringBuilder(label.length());

					int j=i;

					while ( j < tokens.size() && anchor.length() < label.length() ) {

						if ( j > i && !tokens.get(j-1).borders(tokens.get(j)) ) {
							anchor.append(' ');
						}

						anchor.append(tokens.get(j++).root()); // use the extracted/normalized text
					}

					return i < j && anchor.toString().contentEquals(label) ? tokens.subList(i, j) : null;

				})

				.filter(Objects::nonNull)

				.map(Chunk::new);
	}

	private Collection> weight(final Collection> matches) {

		final double min=matches.parallelStream().mapToDouble(Match::weight).min().orElse(0);
		final double max=matches.parallelStream().mapToDouble(Match::weight).max().orElse(0);

		final DoubleUnaryOperator weight=transform(min, max, 0.0, 0.1);

		final Map> resources=Xtream // resources to alternatives for the same anchor

				.from(matches)

				.groupBy(match -> match.source().text(), mapping(match -> match.target().focus(), toSet()))

				.map(Map.Entry::getValue)

				.flatMap(entities -> entities.stream().flatMap(x ->
						entities.stream().map(y ->
								new SimpleImmutableEntry<>(x, y)
						)
				))

				.collect(groupingBy(Map.Entry::getKey, mapping(SimpleImmutableEntry::getValue, toSet())));

		final Collection connections=matches.stream()

				.map(Match::target)
				.flatMap(Frame::model)

				// consider only links to other candidates

				.filter(statement -> resources.containsKey(statement.getSubject()))
				.filter(statement -> resources.containsKey(statement.getObject()))

				// ignore links to alternatives for the same anchor

				.filter(statement -> !resources.getOrDefault(statement.getSubject(), emptySet())
						.contains(statement.getObject())
				)

				.collect(toSet());

		final Map connectivity=resources.keySet().stream().collect(toMap(
				value -> value,
				value -> Stream
						.concat(
								connections.stream()
										.filter(statement -> statement.getSubject().equals(value))
										.map(Statement::getObject),
								connections.stream()
										.filter(statement -> statement.getObject().equals(value))
										.map(Statement::getSubject)
						)
						.distinct()
						.count()
		));

		return matches.parallelStream()

				.map(match -> match.weight(weight.applyAsDouble(match.weight())
						+connectivity.getOrDefault(match.target().focus(), 0L) // !!! scale
				))

				.collect(toList());
	}


	private DoubleUnaryOperator transform(final double xmin, final double xmax, final double ymin, final double ymax) {

		final double p=(xmax > xmin) ? (ymax-ymin)/(xmax-xmin) : 0;
		final double q=(xmax > xmin) ? ymin-p*xmin : ymax;

		return v -> p*v+q;

	}

}



