org.molgenis.ontology.roc.InformationContentService Maven / Gradle / Ivy
package org.molgenis.ontology.roc;
import static java.util.Objects.requireNonNull;
import static org.molgenis.ontology.core.meta.OntologyMetadata.ONTOLOGY;
import static org.molgenis.ontology.core.meta.OntologyTermMetadata.ONTOLOGY_TERM;
import com.google.common.cache.CacheBuilder;
import com.google.common.cache.CacheLoader;
import com.google.common.cache.LoadingCache;
import com.google.common.collect.Sets;
import com.google.common.util.concurrent.UncheckedExecutionException;
import java.math.BigDecimal;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.TimeUnit;
import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils;
import org.molgenis.data.DataService;
import org.molgenis.data.Entity;
import org.molgenis.data.QueryRule;
import org.molgenis.data.QueryRule.Operator;
import org.molgenis.data.support.QueryImpl;
import org.molgenis.ontology.core.meta.OntologyMetadata;
import org.molgenis.ontology.core.meta.OntologyTermMetadata;
import org.molgenis.semanticsearch.string.NGramDistanceAlgorithm;
import org.molgenis.semanticsearch.string.Stemmer;
public class InformationContentService {
private static final String NON_WORD_SEPARATOR = "[^a-zA-Z0-9]";
private static final String SINGLE_WHITESPACE = " ";
private final LoadingCache cachedTotalWordCount =
CacheBuilder.newBuilder()
.maximumSize(Integer.MAX_VALUE)
.expireAfterWrite(1, TimeUnit.DAYS)
.build(
new CacheLoader() {
@Override
public Long load(String ontologyIri) {
Entity ontologyEntity =
dataService.findOne(
ONTOLOGY,
new QueryImpl<>().eq(OntologyMetadata.ONTOLOGY_IRI, ontologyIri));
if (ontologyEntity != null) {
return dataService.count(
ONTOLOGY_TERM,
new QueryImpl<>().eq(OntologyTermMetadata.ONTOLOGY, ontologyEntity));
}
return (long) 0;
}
});
private final LoadingCache cachedInverseDocumentFreq =
CacheBuilder.newBuilder()
.maximumSize(Integer.MAX_VALUE)
.expireAfterWrite(1, TimeUnit.DAYS)
.build(
new CacheLoader() {
public Double load(OntologyWord key) throws ExecutionException {
String ontologyIri = key.getOntologyIri();
Entity ontologyEntity =
dataService.findOne(
ONTOLOGY,
new QueryImpl<>().eq(OntologyMetadata.ONTOLOGY_IRI, ontologyIri));
if (ontologyEntity != null) {
QueryRule queryRule =
new QueryRule(
Arrays.asList(
new QueryRule(
OntologyTermMetadata.ONTOLOGY_TERM_SYNONYM,
Operator.FUZZY_MATCH,
key.getWord())));
queryRule.setOperator(Operator.DIS_MAX);
QueryRule finalQuery =
new QueryRule(
Arrays.asList(
new QueryRule(
OntologyTermMetadata.ONTOLOGY, Operator.EQUALS, ontologyEntity),
new QueryRule(Operator.AND),
queryRule));
long wordCount = dataService.count(ONTOLOGY_TERM, new QueryImpl<>(finalQuery));
Long total = cachedTotalWordCount.get(ontologyIri);
BigDecimal idfValue =
BigDecimal.valueOf(
total == null ? 0 : (1 + Math.log((double) total / (wordCount + 1))));
return idfValue.doubleValue();
}
return (double) 0;
}
});
private final DataService dataService;
public InformationContentService(DataService dataService) {
this.dataService = requireNonNull(dataService);
}
public Map redistributedNGramScore(String queryString, String ontologyIri) {
Map wordIDFMap = createWordIDF(queryString, ontologyIri);
Map wordWeightedSimilarity = new HashMap<>();
if (wordIDFMap.size() > 0) {
double averageIDFValue =
wordIDFMap.values().stream().mapToDouble(Double::doubleValue).average().getAsDouble();
double queryStringLength =
StringUtils.join(createStemmedWordSet(queryString), SINGLE_WHITESPACE).trim().length();
double totalContribution = 0;
double totalDenominator = 0;
for (Entry entry : wordIDFMap.entrySet()) {
double diff = entry.getValue() - averageIDFValue;
if (diff < 0) {
Double contributedSimilarity =
(entry.getKey().length() / queryStringLength * 100) * (diff / averageIDFValue);
totalContribution += Math.abs(contributedSimilarity);
wordWeightedSimilarity.put(entry.getKey(), contributedSimilarity);
} else {
totalDenominator += diff;
}
}
for (Entry entry : wordIDFMap.entrySet()) {
double diff = entry.getValue() - averageIDFValue;
if (diff > 0) {
if (totalDenominator == 0) {
throw new IllegalStateException("totalDenominator is 0");
}
wordWeightedSimilarity.put(
entry.getKey(), ((diff / totalDenominator) * totalContribution));
}
}
}
return wordWeightedSimilarity;
}
Map createWordIDF(String queryString, String ontologyIri) {
Map wordFreqMap = new HashMap<>();
Set wordsInQueryString = createStemmedWordSet(queryString);
wordsInQueryString.stream()
.forEach(
word -> {
Double wordIDF = null;
try {
wordIDF = cachedInverseDocumentFreq.get(new OntologyWord(ontologyIri, word));
} catch (ExecutionException e) {
throw new UncheckedExecutionException(e);
}
if (wordIDF != null && wordIDF != 0) {
wordFreqMap.put(word, wordIDF);
}
});
return wordFreqMap;
}
public Set createStemmedWordSet(String queryString) {
Set uniqueTerms =
Sets.newHashSet(queryString.toLowerCase().trim().split(NON_WORD_SEPARATOR)).stream()
.filter(term -> !NGramDistanceAlgorithm.STOPWORDSLIST.contains(term))
.map(Stemmer::stem)
.filter(StringUtils::isNotBlank)
.collect(Collectors.toSet());
return Sets.newHashSet(uniqueTerms);
}
}