org.molgenis.semanticsearch.explain.service.ExplainServiceHelper Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of molgenis-semantic-search Show documentation
Show all versions of molgenis-semantic-search Show documentation
Semantic data search service functionality.
package org.molgenis.semanticsearch.explain.service;
import static com.google.common.collect.Lists.newArrayList;
import com.google.common.base.Splitter;
import com.google.common.collect.FluentIterable;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils;
import org.apache.lucene.search.Explanation;
import org.molgenis.data.MolgenisDataAccessException;
import org.molgenis.semanticsearch.string.NGramDistanceAlgorithm;
/**
 * Helper that interprets {@link Explanation} trees and relates the words found in them back to the
 * queries that could have produced the match.
 */
public class ExplainServiceHelper {
  /**
   * Matches a description of the form {@code weight(<field>:<word>... } and captures {@code <word>}
   * (a run of {@code \w} characters directly after the first colon) as group 1.
   */
  private static final Pattern REGEXR_PATTERN =
      Pattern.compile("^weight\\(\\w*:(\\w*)(.*|)\\s.*");

  /** Splits text on every run of non-alphabetic characters. */
  private static final Splitter TERM_SPLITTER = Splitter.onPattern("[^\\p{IsAlphabetic}]+");

  /** Description prefixes that {@link #findMatchedWords(Explanation)} knows how to handle. */
  public enum Options {
    /** elasticsearch description that is product of: */
    PRODUCT_OF("product of:"),
    /** elasticsearch description that is max of: */
    MAX_OF("max of:"),
    /** elasticsearch description that is sum of: */
    SUM_OF("sum of:"),
    /** elasticsearch description that starts with weight: */
    WEIGHT("weight");

    private final String label;

    Options(String label) {
      this.label = label;
    }

    /** Returns the description prefix this option stands for. */
    @Override
    public String toString() {
      return label;
    }
  }

  /**
   * Recursively collects the matched words from an explanation tree.
   *
   * <ul>
   *   <li>"sum of:" / "product of:" nodes whose details are all "weight" nodes contribute ONE
   *       entry: the words of those details joined by a single space.
   *   <li>Other "sum of:" / "product of:" nodes contribute whatever their details contribute.
   *   <li>"max of:" nodes contribute only what their highest-valued detail contributes.
   *   <li>"weight" nodes contribute the single word extracted from their description.
   *   <li>Nodes with any other description contribute nothing.
   * </ul>
   *
   * @param explanation root of the (sub)tree to inspect
   * @return a set of matched words that are matched to different ontology terms
   * @throws IllegalStateException if a "max of:" node has no details
   * @throws MolgenisDataAccessException if a "weight" description cannot be parsed
   */
  Set<String> findMatchedWords(Explanation explanation) {
    Set<String> words = new HashSet<>();
    String description = explanation.getDescription();
    if (description.startsWith(Options.SUM_OF.toString())
        || description.startsWith(Options.PRODUCT_OF.toString())) {
      if (newArrayList(explanation.getDetails()).stream().allMatch(this::reachLastLevel)) {
        // All children are leaves: they form one multi-word match, kept together as one entry.
        words.add(extractMatchedWords(explanation.getDetails()));
      } else {
        for (Explanation subExplanation : explanation.getDetails()) {
          words.addAll(findMatchedWords(subExplanation));
        }
      }
    } else if (description.startsWith(Options.MAX_OF.toString())) {
      // Only the best scoring alternative is followed.
      Explanation maxExplanation =
          newArrayList(explanation.getDetails()).stream()
              .max(
                  (explanation1, explanation2) ->
                      Float.compare(explanation1.getValue(), explanation2.getValue()))
              .orElseThrow(
                  () ->
                      new IllegalStateException(
                          "explanation.getDetails() shouldn't return an empty array"));
      words.addAll(findMatchedWords(maxExplanation));
    } else if (description.startsWith(Options.WEIGHT.toString())) {
      words.add(getMatchedWord(description));
    }
    return words;
  }

  /**
   * Extracts the matched word of every given explanation and joins them, in array order, with a
   * single space.
   */
  private String extractMatchedWords(Explanation[] explanations) {
    return newArrayList(explanations).stream()
        .map(explanation -> getMatchedWord(explanation.getDescription()))
        .collect(Collectors.joining(" "));
  }

  /** Returns whether the explanation is a leaf, i.e. its description starts with "weight". */
  private boolean reachLastLevel(Explanation explanation) {
    return explanation.getDescription().startsWith(Options.WEIGHT.toString());
  }

  /**
   * Finds the queries that may have been used in the matching. Only queries (the keys of the given
   * map) whose lower-cased terms contain all lower-cased terms of the matched words qualify; the
   * map values are not inspected.
   *
   * @param matchedWordsString the matched words, separated by non-alphabetic characters
   * @param collectExpandedQueryMap map whose keys are the candidate queries
   * @return a map of potential queries and their matching scores, as computed by {@link
   *     NGramDistanceAlgorithm#stringMatching} between the matched words and the query
   */
  Map<String, Double> findMatchQueries(
      String matchedWordsString, Map<String, ?> collectExpandedQueryMap) {
    Map<String, Double> qualifiedQueries = new HashMap<>();
    Set<String> matchedWords = splitIntoTerms(matchedWordsString);
    for (String query : collectExpandedQueryMap.keySet()) {
      if (splitIntoTerms(query).containsAll(matchedWords)) {
        qualifiedQueries.put(
            query, NGramDistanceAlgorithm.stringMatching(matchedWordsString, query));
      }
    }
    return qualifiedQueries;
  }

  /**
   * Removes every boost suffix, i.e. a caret followed by an integer or decimal number (for example
   * {@code ^2}, {@code ^0.5} or {@code ^.5}), from the given text.
   *
   * @param description text that may contain boost suffixes
   * @return the text without boost suffixes
   */
  String removeBoostFromQuery(String description) {
    return description.replaceAll("\\^\\d*\\.{0,1}\\d+", "");
  }

  /**
   * Extracts the matched word from a "weight" description.
   *
   * @param description description of the form {@code weight(<field>:<word>... }
   * @return the run of word characters that directly follows the first colon
   * @throws MolgenisDataAccessException if the description does not have the expected form
   */
  String getMatchedWord(String description) {
    Matcher matcher = REGEXR_PATTERN.matcher(description);
    if (matcher.find()) {
      return matcher.group(1);
    }
    throw new MolgenisDataAccessException("Failed to find matched word in : " + description);
  }

  /**
   * Splits the text on non-alphabetic characters and returns the distinct, lower-cased, non-empty
   * terms.
   */
  private Set<String> splitIntoTerms(String description) {
    return FluentIterable.from(TERM_SPLITTER.split(description))
        .transform(String::toLowerCase)
        .filter(StringUtils::isNotEmpty)
        .toSet();
  }
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy