All downloads are free. The search and download functionality uses the official Maven repository.
Please wait; this can take a few minutes.
Many resources are needed to download a project. Please understand that we have to cover our server costs. Thank you in advance.
Project price: only $1.
You can buy this project and download/modify it as often as you want.
org.molgenis.data.semanticsearch.service.impl.SemanticSearchServiceHelper Maven / Gradle / Ivy
package org.molgenis.data.semanticsearch.service.impl;
import static java.util.Arrays.stream;
import static java.util.Objects.requireNonNull;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Comparator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.molgenis.MolgenisFieldTypes;
import org.molgenis.data.DataService;
import org.molgenis.data.Entity;
import org.molgenis.data.EntityMetaData;
import org.molgenis.data.MolgenisDataAccessException;
import org.molgenis.data.QueryRule;
import org.molgenis.data.QueryRule.Operator;
import org.molgenis.data.meta.AttributeMetaDataMetaData;
import org.molgenis.data.meta.EntityMetaDataMetaData;
import org.molgenis.data.semanticsearch.string.NGramDistanceAlgorithm;
import org.molgenis.data.semanticsearch.string.Stemmer;
import org.molgenis.data.support.QueryImpl;
import org.molgenis.ontology.core.model.OntologyTerm;
import org.molgenis.ontology.core.service.OntologyService;
import org.molgenis.ontology.ic.TermFrequencyService;
import org.springframework.beans.factory.annotation.Autowired;
import com.google.common.collect.Sets;
/**
 * Helper for semantic search: builds {@link QueryRule} trees that match attribute labels and
 * descriptions against free-text search terms and ontology term information (labels, synonyms and
 * child terms), and builds expansion maps that trace how a query was expanded.
 */
public class SemanticSearchServiceHelper
{
	private final TermFrequencyService termFrequencyService;
	private final DataService dataService;
	private final OntologyService ontologyService;
	private final Stemmer stemmer = new Stemmer();

	/** Maximum number of ontology term tags returned by {@link #findTags(String, List)}. */
	public static final int MAX_NUM_TAGS = 3;

	private static final char SPACE_CHAR = ' ';
	private static final String COMMA_CHAR = ",";
	private static final String CARET_CHARACTER = "^";
	private static final String ESCAPED_CARET_CHARACTER = "\\^";
	// Matches runs of characters that are not letters, digits, apostrophes, dots or tildes
	private static final String ILLEGAL_CHARS_REGEX = "[^\\p{L}'a-zA-Z0-9\\.~]+";

	@Autowired
	public SemanticSearchServiceHelper(DataService dataService, OntologyService ontologyService,
			TermFrequencyService termFrequencyService)
	{
		this.dataService = requireNonNull(dataService);
		this.ontologyService = requireNonNull(ontologyService);
		this.termFrequencyService = requireNonNull(termFrequencyService);
	}

	/**
	 * Create a disMax query rule based on the given search terms as well as the information from the given
	 * ontology terms.
	 *
	 * @param searchTerms free-text search terms, may be null
	 * @param ontologyTerms ontology terms tagged to the attribute; composite tags carry comma-separated IRIs
	 * @return disMax queryRule
	 */
	public QueryRule createDisMaxQueryRuleForAttribute(Set<String> searchTerms, Collection<OntologyTerm> ontologyTerms)
	{
		List<String> queryTerms = new ArrayList<>();

		if (searchTerms != null)
		{
			queryTerms.addAll(searchTerms.stream().filter(StringUtils::isNotBlank).map(this::processQueryString)
					.collect(Collectors.toList()));
		}

		// Tags backed by a single ontology term contribute their label/synonym/child queries directly
		ontologyTerms.stream().filter(ontologyTerm -> !ontologyTerm.getIRI().contains(COMMA_CHAR))
				.forEach(ot -> queryTerms.addAll(parseOntologyTermQueries(ot)));

		QueryRule disMaxQueryRule = createDisMaxQueryRuleForTerms(queryTerms);

		// Tags backed by multiple ontology terms (comma-separated IRIs) become nested boolean SHOULD rules
		ontologyTerms.stream().filter(ontologyTerm -> ontologyTerm.getIRI().contains(COMMA_CHAR))
				.forEach(ot -> disMaxQueryRule.getNestedRules().add(createShouldQueryRule(ot.getIRI())));

		return disMaxQueryRule;
	}

	/**
	 * Create a disMax query rule based on a list of query terms, matching each non-empty term against both
	 * the attribute label and description fields with fuzzy matching.
	 *
	 * @param queryTerms query strings; empty entries are skipped
	 * @return disMax queryRule
	 */
	public QueryRule createDisMaxQueryRuleForTerms(List<String> queryTerms)
	{
		List<QueryRule> rules = new ArrayList<>();
		queryTerms.stream().filter(StringUtils::isNotEmpty).map(this::escapeCharsExcludingCaretChar).forEach(query -> {
			rules.add(new QueryRule(AttributeMetaDataMetaData.LABEL, Operator.FUZZY_MATCH, query));
			rules.add(new QueryRule(AttributeMetaDataMetaData.DESCRIPTION, Operator.FUZZY_MATCH, query));
		});
		QueryRule finalDisMaxQuery = new QueryRule(rules);
		finalDisMaxQuery.setOperator(Operator.DIS_MAX);
		return finalDisMaxQuery;
	}

	/**
	 * Create a disMax query rule with a corresponding boost value.
	 *
	 * @param queryTerms query strings
	 * @param boostValue boost to apply; skipped when null or when its integer part is zero.
	 *            NOTE(review): fractional boosts below 1.0 are therefore never applied — confirm intended.
	 * @return a disMax queryRule with the boost value set
	 */
	public QueryRule createBoostedDisMaxQueryRuleForTerms(List<String> queryTerms, Double boostValue)
	{
		QueryRule finalDisMaxQuery = createDisMaxQueryRuleForTerms(queryTerms);
		if (boostValue != null && boostValue.intValue() != 0)
		{
			finalDisMaxQuery.setValue(boostValue);
		}
		return finalDisMaxQuery;
	}

	/**
	 * Create a boolean SHOULD query for a composite tag containing multiple ontology terms.
	 *
	 * @param multiOntologyTermIri comma-separated ontology term IRIs
	 * @return a boolean SHOULD queryRule with one boosted disMax rule per ontology term
	 */
	public QueryRule createShouldQueryRule(String multiOntologyTermIri)
	{
		QueryRule shouldQueryRule = new QueryRule(new ArrayList<QueryRule>());
		shouldQueryRule.setOperator(Operator.SHOULD);
		for (String ontologyTermIri : multiOntologyTermIri.split(COMMA_CHAR))
		{
			OntologyTerm ontologyTerm = ontologyService.getOntologyTerm(ontologyTermIri);
			List<String> queryTerms = parseOntologyTermQueries(ontologyTerm);
			Double termFrequency = getBestInverseDocumentFrequency(queryTerms);
			shouldQueryRule.getNestedRules().add(createBoostedDisMaxQueryRuleForTerms(queryTerms, termFrequency));
		}
		return shouldQueryRule;
	}

	/**
	 * Create a list of string queries based on the information collected from the given ontology term,
	 * including its label, synonyms and child ontology terms. Child term queries are boosted by
	 * 0.5^distance so that more distant descendants contribute less.
	 *
	 * @param ontologyTerm the ontology term to expand
	 * @return list of processed query strings
	 */
	public List<String> parseOntologyTermQueries(OntologyTerm ontologyTerm)
	{
		List<String> queryTerms = getOtLabelAndSynonyms(ontologyTerm).stream().map(this::processQueryString)
				.collect(Collectors.toList());

		for (OntologyTerm childOt : ontologyService.getChildren(ontologyTerm))
		{
			double boostedNumber = Math.pow(0.5, ontologyService.getOntologyTermDistance(ontologyTerm, childOt));
			getOtLabelAndSynonyms(childOt)
					.forEach(synonym -> queryTerms.add(parseBoostQueryString(synonym, boostedNumber)));
		}
		return queryTerms;
	}

	/**
	 * A helper function to collect the synonyms as well as the label of an ontology term.
	 *
	 * @param ontologyTerm
	 * @return a set containing the synonyms plus the label, in insertion order
	 */
	public Set<String> getOtLabelAndSynonyms(OntologyTerm ontologyTerm)
	{
		Set<String> allTerms = Sets.newLinkedHashSet(ontologyTerm.getSynonyms());
		allTerms.add(ontologyTerm.getLabel());
		return allTerms;
	}

	/**
	 * Build a map from stemmed query phrases to the original query term or ontology term label they came
	 * from, used to trace how a query was expanded.
	 *
	 * @param queryTerms free-text search terms; blank entries are skipped
	 * @param ontologyTerms tagged ontology terms; composite tags carry comma-separated IRIs
	 * @return map of stemmed phrase to originating term/label
	 */
	public Map<String, String> collectExpandedQueryMap(Set<String> queryTerms, Collection<OntologyTerm> ontologyTerms)
	{
		Map<String, String> expandedQueryMap = new LinkedHashMap<>();

		queryTerms.stream().filter(StringUtils::isNotBlank)
				.forEach(queryTerm -> expandedQueryMap.put(stemmer.cleanStemPhrase(queryTerm), queryTerm));

		for (OntologyTerm ontologyTerm : ontologyTerms)
		{
			if (!ontologyTerm.getIRI().contains(COMMA_CHAR))
			{
				collectOntologyTermQueryMap(expandedQueryMap, ontologyTerm);
			}
			else
			{
				// Composite tag: resolve and expand each comma-separated ontology term IRI individually
				for (String ontologyTermIri : ontologyTerm.getIRI().split(COMMA_CHAR))
				{
					collectOntologyTermQueryMap(expandedQueryMap, ontologyService.getOntologyTerm(ontologyTermIri));
				}
			}
		}
		return expandedQueryMap;
	}

	/**
	 * Add the stemmed label/synonyms of an ontology term and of all its children to the expanded query map,
	 * all mapping back to the parent term's label.
	 *
	 * @param expandedQueryMap map of stemmed phrase to ontology term label, extended in place
	 * @param ontologyTerm the ontology term to expand; ignored when null
	 */
	public void collectOntologyTermQueryMap(Map<String, String> expandedQueryMap, OntologyTerm ontologyTerm)
	{
		if (ontologyTerm != null)
		{
			getOtLabelAndSynonyms(ontologyTerm)
					.forEach(term -> expandedQueryMap.put(stemmer.cleanStemPhrase(term), ontologyTerm.getLabel()));

			for (OntologyTerm childOntologyTerm : ontologyService.getChildren(ontologyTerm))
			{
				getOtLabelAndSynonyms(childOntologyTerm)
						.forEach(term -> expandedQueryMap.put(stemmer.cleanStemPhrase(term), ontologyTerm.getLabel()));
			}
		}
	}

	/**
	 * A helper function that gets the identifiers of all the attributes from one entityMetaData, descending
	 * into compound attributes (the compounds themselves are excluded).
	 *
	 * @param sourceEntityMetaData
	 * @return list of attribute identifiers
	 * @throws MolgenisDataAccessException when no entity metadata entity with the given name exists
	 */
	public List<String> getAttributeIdentifiers(EntityMetaData sourceEntityMetaData)
	{
		Entity entityMetaDataEntity = dataService.findOne(EntityMetaDataMetaData.ENTITY_NAME,
				new QueryImpl().eq(EntityMetaDataMetaData.FULL_NAME, sourceEntityMetaData.getName()));

		if (entityMetaDataEntity == null)
		{
			throw new MolgenisDataAccessException(
					"Could not find EntityMetaDataEntity by the name of " + sourceEntityMetaData.getName());
		}

		List<String> attributeIdentifiers = new ArrayList<>();
		recursivelyCollectAttributeIdentifiers(entityMetaDataEntity.getEntities(EntityMetaDataMetaData.ATTRIBUTES),
				attributeIdentifiers);
		return attributeIdentifiers;
	}

	/**
	 * Collects the identifiers of the given attribute entities, recursing into compound attributes' parts.
	 * Compound attributes themselves are not collected, only their non-compound descendants.
	 */
	private void recursivelyCollectAttributeIdentifiers(Iterable<Entity> attributeEntities,
			List<String> attributeIdentifiers)
	{
		for (Entity attributeEntity : attributeEntities)
		{
			if (!attributeEntity.getString(AttributeMetaDataMetaData.DATA_TYPE)
					.equals(MolgenisFieldTypes.COMPOUND.toString()))
			{
				attributeIdentifiers.add(attributeEntity.getString(AttributeMetaDataMetaData.IDENTIFIER));
			}
			Iterable<Entity> entities = attributeEntity.getEntities(AttributeMetaDataMetaData.PARTS);
			if (entities != null)
			{
				recursivelyCollectAttributeIdentifiers(entities, attributeIdentifiers);
			}
		}
	}

	/**
	 * Find at most {@link #MAX_NUM_TAGS} ontology terms matching the significant words of a description.
	 *
	 * @param description free-text description
	 * @param ontologyIds ids of the ontologies to search in
	 * @return matching ontology terms
	 */
	public List<OntologyTerm> findTags(String description, List<String> ontologyIds)
	{
		Set<String> searchTerms = removeStopWords(description);
		return ontologyService.findOntologyTerms(ontologyIds, searchTerms, MAX_NUM_TAGS);
	}

	/**
	 * Lower-cases a query string, removes stop words and joins the remaining words with spaces.
	 */
	public String processQueryString(String queryString)
	{
		return StringUtils.join(removeStopWords(queryString), SPACE_CHAR);
	}

	/**
	 * Like {@link #processQueryString(String)}, but appends a Lucene boost suffix ("word^boost") to every
	 * remaining word.
	 */
	public String parseBoostQueryString(String queryString, double boost)
	{
		return StringUtils.join(removeStopWords(queryString).stream().map(word -> word + CARET_CHARACTER + boost)
				.collect(Collectors.toSet()), SPACE_CHAR);
	}

	/**
	 * Escapes Lucene query syntax characters but keeps '^' intact so boost suffixes survive escaping.
	 */
	public String escapeCharsExcludingCaretChar(String string)
	{
		return QueryParser.escape(string).replace(ESCAPED_CARET_CHARACTER, CARET_CHARACTER);
	}

	/**
	 * Splits a description into lower-cased words (on illegal characters) and removes stop words.
	 *
	 * @param description
	 * @return set of significant lower-cased words
	 */
	public Set<String> removeStopWords(String description)
	{
		return stream(description.split(ILLEGAL_CHARS_REGEX)).map(String::toLowerCase)
				.filter(w -> !NGramDistanceAlgorithm.STOPWORDSLIST.contains(w) && StringUtils.isNotEmpty(w))
				.collect(Collectors.toSet());
	}

	/**
	 * Returns the term frequency of the shortest query term (shortest acting as a proxy for the most
	 * general word), or null when the list is empty.
	 */
	private Double getBestInverseDocumentFrequency(List<String> terms)
	{
		Optional<String> shortest = terms.stream().min(Comparator.comparingInt(String::length));
		return shortest.isPresent() ? termFrequencyService.getTermFrequency(shortest.get()) : null;
	}
}