
com.metreeca.text.matchers.SPARQLMatcher Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of metreeca-text Show documentation
Show all versions of metreeca-text Show documentation
A lightweight NLP framework.
The newest version!
/*
* Copyright © 2013-2022 Metreeca srl
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.metreeca.text.matchers;
import com.metreeca.json.Frame;
import com.metreeca.json.Values;
import com.metreeca.rdf4j.actions.*;
import com.metreeca.rdf4j.services.Graph;
import com.metreeca.rest.Xtream;
import com.metreeca.rest.actions.Fill;
import com.metreeca.rest.services.Logger;
import com.metreeca.text.*;
import com.metreeca.text.tokenizers.PatternTokenizer;
import org.eclipse.rdf4j.model.*;
import org.eclipse.rdf4j.model.vocabulary.RDF;
import org.eclipse.rdf4j.repository.RepositoryConnection;
import java.math.BigDecimal;
import java.util.*;
import java.util.function.Function;
import java.util.stream.Stream;
import static com.metreeca.json.Frame.frame;
import static com.metreeca.json.Values.literal;
import static com.metreeca.rdf4j.services.Graph.graph;
import static com.metreeca.rest.Toolbox.service;
import static com.metreeca.rest.services.Logger.logger;
import static java.util.Arrays.asList;
import static java.util.stream.Collectors.joining;
/**
 * SPARQL-based anchor matcher.
 *
 * <p>Maps a stream of anchor strings to a stream of {@link Match} objects by querying the shared {@link Graph}
 * service for resources whose {@code base:anchor} value equals one of the supplied anchors; anchors and weights are
 * expected to have been generated beforehand by an {@link Indexer}.</p>
 *
 * <p>NOTE(review): in this copy of the file the generic type arguments (e.g. on the {@code implements} clause, on
 * the {@code labels}/{@code languages} fields and on {@code Fill}/{@code Stream} expressions) and the IRIs of the
 * SPARQL {@code prefix} declarations look truncated; restore them from version control before compiling.</p>
 */
public final class SPARQLMatcher implements Function, Stream>> {

    private final int size=1_000; // number of anchors submitted to the graph with each query

    private Collection labels=Notes.Labels; // predicates whose values are reported as match labels
    private Set languages=Notes.Languages; // language tags accepted for literal values

    private final IRI context=RDF.NIL; // !!! NOTE(review): not read anywhere in this class

    private final Graph graph=service(graph());
    private final Logger logger=service(logger()); // NOTE(review): not read anywhere in this class


    /**
     * Configures the label predicates.
     *
     * @param labels the predicates whose values are to be reported as match labels
     *
     * @return this matcher
     *
     * @throws NullPointerException if {@code labels} is null or contains null elements
     */
    public SPARQLMatcher labels(final IRI... labels) {

        if ( labels == null ) {
            throw new NullPointerException("null labels");
        }

        return labels(asList(labels));
    }

    /**
     * Configures the label predicates.
     *
     * @param labels the predicates whose values are to be reported as match labels; copied into a private set
     *
     * @return this matcher
     *
     * @throws NullPointerException if {@code labels} is null or contains null elements
     */
    public SPARQLMatcher labels(final Collection labels) {

        if ( labels == null || labels.stream().anyMatch(Objects::isNull) ) {
            throw new NullPointerException("null labels");
        }

        this.labels=new HashSet<>(labels);

        return this;
    }


    /**
     * Configures the accepted languages.
     *
     * @param languages the language tags literal values must carry to be retrieved
     *
     * @return this matcher
     *
     * @throws NullPointerException if {@code languages} is null or contains null elements
     */
    public SPARQLMatcher languages(final String... languages) {

        if ( languages == null ) {
            throw new NullPointerException("null languages");
        }

        return languages(asList(languages));
    }

    /**
     * Configures the accepted languages.
     *
     * @param languages the language tags literal values must carry to be retrieved; copied into a private set
     *
     * @return this matcher
     *
     * @throws NullPointerException if {@code languages} is null or contains null elements
     */
    public SPARQLMatcher languages(final Collection languages) {

        if ( languages == null || languages.stream().anyMatch(Objects::isNull) ) {
            throw new NullPointerException("null languages");
        }

        this.languages=new HashSet<>(languages);

        return this;
    }


    /**
     * Matches anchors against the indexed graph.
     *
     * <p>Anchors are processed in batches of {@code size}: for each batch a {@code construct} query retrieves the
     * description and the {@code base:weight} of every subject having one of the anchors as {@code base:anchor};
     * the resulting statements are grouped by subject into frames and one match is generated for each statement
     * whose predicate is one of the configured labels.</p>
     *
     * @param anchors the anchor strings to be looked up
     *
     * @return a stream of matches, each pairing a label value with the frame of the matched subject and its weight
     * ({@code 0} if the frame has no {@code Notes.weight} decimal value)
     */
    @Override public Stream> apply(final Stream anchors) {
        return Xtream.from(anchors)

                .batch(size)

                .flatMap(new Fill>() // keep aligned with index definition

                        // in SPARQL '&&' binds tighter than '||': IRI objects are always retained, whereas other
                        // objects are retained only if tagged with an accepted language and linked through one of
                        // the listed predicates

                        .model("prefix base: \n"
                                +"prefix rdfs: \n"
                                +"\n"
                                +"construct { ?s ?p ?o; base:weight ?w } where {\n"
                                +"\n"
                                +"\tvalues ?a {\n"
                                +"\t\t{anchors}\n"
                                +"\t}\n"
                                +"\n"
                                +"\t?s ?p ?o; \n"
                                +"\t\tbase:anchor ?a;\n"
                                +"\t\tbase:weight ?w.\t\n"
                                +"\n"
                                +"\tfilter (isIRI(?o) || lang(?o) in ({languages}) && ?p in (\n"
                                +"\t\t{labels}\n"
                                +"\t))\n"
                                +"\n"
                                +"}"
                        )

                        .value("anchors", batch -> batch.stream()
                                .map(Values::quote)
                                .collect(joining("\n\t\t"))
                        )

                        .value("languages", languages.stream()
                                .map(Values::quote)
                                .collect(joining(", "))
                        )

                        // rdfs:description is appended to the joined list, rather than hard-coded in the template
                        // after a comma, so that the predicate list stays well-formed even when no label predicate
                        // is configured
                        // NOTE(review): RDF Schema defines rdfs:comment but no rdfs:description term; confirm
                        // that this predicate is actually what the indexed data uses

                        .value("labels", Stream
                                .concat(labels.stream().map(Values::format), Stream.of("rdfs:description"))
                                .collect(joining(",\n\t\t"))
                        )

                )

                .flatMap(new GraphQuery().graph(graph))

                .groupBy(Statement::getSubject)

                .map(entry -> frame(entry.getKey(), entry.getValue()))

                .flatMap(frame -> {

                    final double weight=frame
                            .decimal(Notes.weight)
                            .orElse(BigDecimal.ZERO) // subjects without a weight rank lowest
                            .doubleValue();

                    return frame.model()
                            .filter(s -> labels.contains(s.getPredicate())) // only configured labels yield matches
                            .map(Statement::getObject)
                            .map(Value::stringValue)
                            .map(label -> new Match<>(label, frame, weight));

                });
    }


    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

    /**
     * Anchor indexer.
     *
     * <p>Regenerates, inside a single {@link Graph} update task, the {@code base:anchor} and {@code base:weight}
     * annotations of {@code base:Entity} resources: existing annotations are deleted, anchors are extracted from
     * entity labels using the configured analyzer and weights are computed from incoming links.</p>
     */
    public static final class Indexer implements Runnable {

        private Collection labels=Notes.Labels; // predicates whose values are tokenized into anchors
        private Set languages=Notes.Languages; // language tags accepted for label values

        private IRI context=RDF.NIL; // !!! graph name passed to the update actions and to added anchor statements

        private Function analyzer=new PatternTokenizer().defaults(); // splits a label token into anchor tokens

        private final Graph graph=service(graph());
        private final Logger logger=service(logger());


        /**
         * Configures the label predicates.
         *
         * @param labels the predicates whose values are to be indexed
         *
         * @return this indexer
         *
         * @throws NullPointerException if {@code labels} is null or contains null elements
         */
        public Indexer labels(final IRI... labels) {

            if ( labels == null ) {
                throw new NullPointerException("null labels");
            }

            return labels(asList(labels));
        }

        /**
         * Configures the label predicates.
         *
         * @param labels the predicates whose values are to be indexed; copied into a private set
         *
         * @return this indexer
         *
         * @throws NullPointerException if {@code labels} is null or contains null elements
         */
        public Indexer labels(final Collection labels) {

            if ( labels == null || labels.stream().anyMatch(Objects::isNull) ) {
                throw new NullPointerException("null labels");
            }

            this.labels=new HashSet<>(labels);

            return this;
        }


        /**
         * Configures the accepted languages.
         *
         * @param languages the language tags label values must carry to be indexed
         *
         * @return this indexer
         *
         * @throws NullPointerException if {@code languages} is null or contains null elements
         */
        public Indexer languages(final String... languages) {

            if ( languages == null ) {
                throw new NullPointerException("null languages");
            }

            return languages(asList(languages));
        }

        /**
         * Configures the accepted languages.
         *
         * @param languages the language tags label values must carry to be indexed; copied into a private set
         *
         * @return this indexer
         *
         * @throws NullPointerException if {@code languages} is null or contains null elements
         */
        public Indexer languages(final Collection languages) {

            if ( languages == null || languages.stream().anyMatch(Objects::isNull) ) {
                throw new NullPointerException("null languages");
            }

            this.languages=new HashSet<>(languages);

            return this;
        }


        /**
         * Configures the target context.
         *
         * @param context the context passed to the update actions and used for the generated anchor statements
         *
         * @return this indexer
         *
         * @throws NullPointerException if {@code context} is null
         */
        public Indexer context(final IRI context) {

            if ( context == null ) {
                throw new NullPointerException("null context");
            }

            this.context=context;

            return this;
        }

        /**
         * Configures the label analyzer.
         *
         * @param analyzer the function applied to each label token to split it into anchor tokens
         *
         * @return this indexer
         *
         * @throws NullPointerException if {@code analyzer} is null
         */
        public Indexer analyzer(final Function analyzer) {

            if ( analyzer == null ) {
                throw new NullPointerException("null analyzer");
            }

            this.analyzer=analyzer;

            return this;
        }


        /**
         * Rebuilds the index: cleans stale annotations, extracts anchors and computes weights, in that order, within
         * a single graph update task.
         */
        @Override public void run() {
            graph.update(connection -> {

                clean();
                anchors(connection);
                weights();

                return this;

            });
        }


        /**
         * Deletes the {@code base:weight} and {@code base:anchor} annotations of {@code base:Entity} resources.
         */
        private void clean() {

            logger.info(this, "cleaning");

            Stream.of("prefix base: \n"
                            +"\n"
                            +"delete { ?e base:weight ?w; base:anchor ?a }\n"
                            +"where { ?e a base:Entity; base:weight ?w; base:anchor ?a }")

                    .forEach(new Update()
                            .graph(graph)
                            .remove(context) // !!! context
                    );
        }

        /**
         * Extracts anchors from entity labels.
         *
         * <p>Selects the values of the configured label predicates on {@code base:Entity} resources, limited to the
         * configured languages, runs each value through the analyzer and adds one {@code Notes.anchor} literal per
         * resulting token to the entity, in the configured context.</p>
         *
         * @param connection the repository connection the anchor statements are added to
         */
        private void anchors(final RepositoryConnection connection) {

            logger.info(this, "extracting anchors"); // shared logger field, as in clean()

            Stream

                    .of("") // single dummy item driving one execution of the fixed query

                    .flatMap(new Fill<>()

                            .model("prefix base: \n"
                                    +"\n"
                                    +"select ?e ?l {\n"
                                    +"\n"
                                    +"\tvalues ?p {\n"
                                    +"\t\t{labels}\n"
                                    +"\t}\n"
                                    +"\n"
                                    +"\t?e a base:Entity; ?p ?l filter (lang(?l) in ({languages}))\n"
                                    +"\n"
                                    +"}")

                            .value("labels", labels.stream()
                                    .map(Values::format)
                                    .collect(joining("\n\t\t"))
                            )

                            .value("languages", languages.stream()
                                    .map(Values::literal)
                                    .map(Values::format)
                                    .collect(joining(", "))
                            )

                    )

                    .flatMap(new TupleQuery() // !!! context / include inferred
                            .graph(graph)
                    )

                    .forEach(bindings -> {

                        final Resource entity=(Resource)bindings.getValue("e");
                        final Value label=bindings.getValue("l");

                        // NOTE(review): the meaning of the 'true' flag of text() is not visible here; presumably it
                        // requests a normalized form of the token text — confirm against Token

                        analyzer.apply(new Token(label.stringValue())).tokens().forEach(anchor ->
                                connection.add(entity, Notes.anchor, literal(anchor.text(true)), context)
                        );

                    });
        }

        /**
         * Computes entity weights.
         *
         * <p>Assigns to each {@code base:Entity} resource a {@code base:weight} equal to the number of statements
         * having the entity as object.</p>
         */
        private void weights() {

            logger.info(this, "computing weights"); // shared logger field, as in clean()

            Stream

                    .of("prefix base: \n"
                            +"\n"
                            +"insert { ?e base:weight ?w } where {\n"
                            +"\n"
                            +"\tselect ?e (count(?c) as ?w) {\n"
                            +"\n"
                            +"\t\t?e a base:Entity. ?c ?p ?e.\n"
                            +"\n"
                            +"\t} group by ?e\n"
                            +"\n"
                            +"}"
                    )

                    .forEach(new Update()
                            .graph(graph)
                            .insert(context) // !!! review
                    );
        }

    }

}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy