
com.metreeca.text.linkers.GraphLinker Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of metreeca-text Show documentation
Show all versions of metreeca-text Show documentation
A lightweight NLP framework.
The newest version!
/*
* Copyright © 2013-2022 Metreeca srl
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.metreeca.text.linkers;
import com.metreeca.json.Frame;
import com.metreeca.rest.Xtream;
import com.metreeca.rest.actions.Clean;
import com.metreeca.rest.services.Logger;
import com.metreeca.text.*;
import org.eclipse.rdf4j.model.Statement;
import org.eclipse.rdf4j.model.Value;
import org.eclipse.rdf4j.model.vocabulary.RDF;
import org.eclipse.rdf4j.model.vocabulary.RDFS;
import java.util.AbstractMap.SimpleImmutableEntry;
import java.util.*;
import java.util.function.DoubleUnaryOperator;
import java.util.function.Function;
import java.util.stream.IntStream;
import java.util.stream.Stream;
import static com.metreeca.json.Frame.frame;
import static com.metreeca.rest.Toolbox.service;
import static com.metreeca.rest.services.Logger.logger;
import static com.metreeca.rest.services.Logger.time;
import static java.lang.String.format;
import static java.util.Collections.emptySet;
import static java.util.stream.Collectors.*;
public final class GraphLinker implements Function>> {
private Function> finder=chunk -> chunk.tokens().stream().map(Chunk::new); // !!! default?
private Function reader=token -> token.text(token.isUpper());
private Function normalizer=new Clean().space(true).marks(true).smart(true);
private Function, Stream>> matcher=anchors -> Stream.empty(); // !!! default?
private final Logger logger=service(logger());
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/**
* Configures anchor extraction.
*
* @param finder a function extracting a stream of anchor chunks from a phrase chunk
*
* @return this linker
*
* @throws NullPointerException if {@code finder} is {@code null}
*/
public GraphLinker finder(final Function> finder) {
if ( finder == null ) {
throw new NullPointerException("null finder");
}
this.finder=finder;
return this;
}
/**
* Configures token to text conversion.
*
* @param reader a function mapping tokens to textual representations to be fed to the {@linkplain #matcher}
*
* @return this linker
*
* @throws NullPointerException if {@code reader} is {@code null}
*/
public GraphLinker reader(final Function reader) {
if ( reader == null ) {
throw new NullPointerException("null reader");
}
this.reader=reader;
return this;
}
/**
* Configures text normalization.
*
* @param normalizer a text normalization function
*
* @return this linker
*
* @throws NullPointerException if {@code normalizer} is {@code null}
*/
public GraphLinker normalizer(final Function normalizer) {
if ( normalizer == null ) {
throw new NullPointerException("null normalizer");
}
this.normalizer=normalizer;
return this;
}
/**
* Configures candidate identification.
*
* @param matcher a function converting a stream of anchors to a stream of candidate entity descriptions
*
* @return this linker
*
* @throws NullPointerException if {@code matcher} is {@code null}
*/
public GraphLinker matcher(final Function, Stream>> matcher) {
if ( matcher == null ) {
throw new NullPointerException("null matcher");
}
this.matcher=matcher;
return this;
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
@Override public Stream> apply(final Chunk chunk) {
if ( chunk == null || chunk.tokens().isEmpty() ) { return Stream.empty(); } else {
return time(() -> {
final List targets=chunk
.map(finder)
.collect(toList());
final List tokens=chunk.tokens().stream()
.map(token -> token.root(reader.andThen(normalizer).apply(token)))
.collect(toList());
return Xtream
.from(targets)
.map(Chunk::text)// extract target anchors
.map(normalizer)
.distinct()
.pipe(matcher) // match targets to candidates
.map(match -> match.source(normalizer.apply(match.source()))) // normalize labels
.distinct()
.flatMap(match -> // look for complete anchors
anchor(match.source(), tokens).map(match::source)
)
// make sure the complete anchor includes at least one of the original partial anchors
// to prevent matching tokens like 'is' if the text includes 'Iceland'
.filter(match -> targets.stream().anyMatch(target -> match.source().contains(target)))
.sorted(Comparator // prefer longer anchors
.>comparingInt(match -> match.source().length())
.reversed()
)
.prune((x, y) -> // remove partially overlapping anchors
!x.source().matches(y.source()) && x.source().intersects(y.source())
)
.batch(toList())
.bagMap(this::weight) // compute local rank
.sorted(Comparator // prefer weightier anchors
.>comparingDouble(Match::weight)
.reversed()
)
.prune((x, y) -> // retain only the weightiest match for each anchor
x.source().intersects(y.source())
)
// retain only minimal identifying infos
.map(match -> match.target(frame(match.target().focus())
.values(RDF.TYPE, match.target().values(RDF.TYPE))
.values(RDFS.LABEL, match.target().values(RDFS.LABEL))
.values(RDFS.COMMENT, match.target().values(RDFS.COMMENT))
));
}).apply((t, v) -> logger.info(this, format(
"processed <%,d> chars in <%,d> ms (<%,d> chars/s)", chunk.length(), t, 1000L*chunk.length()/t
)));
}
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
private Stream anchor(final CharSequence label, final List tokens) {
return IntStream.range(0, tokens.size())
.mapToObj(i -> {
final StringBuilder anchor=new StringBuilder(label.length());
int j=i;
while ( j < tokens.size() && anchor.length() < label.length() ) {
if ( j > i && !tokens.get(j-1).borders(tokens.get(j)) ) {
anchor.append(' ');
}
anchor.append(tokens.get(j++).root()); // use the extracted/normalized text
}
return i < j && anchor.toString().contentEquals(label) ? tokens.subList(i, j) : null;
})
.filter(Objects::nonNull)
.map(Chunk::new);
}
private Collection> weight(final Collection> matches) {
final double min=matches.parallelStream().mapToDouble(Match::weight).min().orElse(0);
final double max=matches.parallelStream().mapToDouble(Match::weight).max().orElse(0);
final DoubleUnaryOperator weight=transform(min, max, 0.0, 0.1);
final Map> resources=Xtream // resources to alternatives for the same anchor
.from(matches)
.groupBy(match -> match.source().text(), mapping(match -> match.target().focus(), toSet()))
.map(Map.Entry::getValue)
.flatMap(entities -> entities.stream().flatMap(x ->
entities.stream().map(y ->
new SimpleImmutableEntry<>(x, y)
)
))
.collect(groupingBy(Map.Entry::getKey, mapping(SimpleImmutableEntry::getValue, toSet())));
final Collection connections=matches.stream()
.map(Match::target)
.flatMap(Frame::model)
// consider only links to other candidates
.filter(statement -> resources.containsKey(statement.getSubject()))
.filter(statement -> resources.containsKey(statement.getObject()))
// ignore links to alternatives for the same anchor
.filter(statement -> !resources.getOrDefault(statement.getSubject(), emptySet())
.contains(statement.getObject())
)
.collect(toSet());
final Map connectivity=resources.keySet().stream().collect(toMap(
value -> value,
value -> Stream
.concat(
connections.stream()
.filter(statement -> statement.getSubject().equals(value))
.map(Statement::getObject),
connections.stream()
.filter(statement -> statement.getObject().equals(value))
.map(Statement::getSubject)
)
.distinct()
.count()
));
return matches.parallelStream()
.map(match -> match.weight(weight.applyAsDouble(match.weight())
+connectivity.getOrDefault(match.target().focus(), 0L) // !!! scale
))
.collect(toList());
}
private DoubleUnaryOperator transform(final double xmin, final double xmax, final double ymin, final double ymax) {
final double p=(xmax > xmin) ? (ymax-ymin)/(xmax-xmin) : 0;
final double q=(xmax > xmin) ? ymin-p*xmin : ymax;
return v -> p*v+q;
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy