All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.molgenis.data.semanticsearch.service.impl.SemanticSearchServiceImpl Maven / Gradle / Ivy

package org.molgenis.data.semanticsearch.service.impl;

import static java.util.Objects.requireNonNull;

import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.Collectors;
import java.util.stream.Stream;

import org.apache.commons.lang3.StringUtils;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.spell.StringDistance;
import org.elasticsearch.common.base.Joiner;
import org.elasticsearch.common.collect.Lists;
import org.molgenis.data.AttributeMetaData;
import org.molgenis.data.DataService;
import org.molgenis.data.Entity;
import org.molgenis.data.EntityMetaData;
import org.molgenis.data.MolgenisDataAccessException;
import org.molgenis.data.QueryRule;
import org.molgenis.data.QueryRule.Operator;
import org.molgenis.data.meta.AttributeMetaDataMetaData;
import org.molgenis.data.meta.MetaDataService;
import org.molgenis.data.semanticsearch.explain.bean.ExplainedAttributeMetaData;
import org.molgenis.data.semanticsearch.explain.bean.ExplainedQueryString;
import org.molgenis.data.semanticsearch.explain.service.ElasticSearchExplainService;
import org.molgenis.data.semanticsearch.semantic.Hit;
import org.molgenis.data.semanticsearch.service.SemanticSearchService;
import org.molgenis.data.semanticsearch.string.NGramDistanceAlgorithm;
import org.molgenis.data.semanticsearch.string.Stemmer;
import org.molgenis.data.support.QueryImpl;
import org.molgenis.ontology.core.model.Ontology;
import org.molgenis.ontology.core.model.OntologyTerm;
import org.molgenis.ontology.core.service.OntologyService;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;

import com.google.common.base.Splitter;
import com.google.common.collect.FluentIterable;
import com.google.common.collect.Ordering;

import autovalue.shaded.com.google.common.common.collect.Sets;

public class SemanticSearchServiceImpl implements SemanticSearchService
{
	private static final Logger LOG = LoggerFactory.getLogger(SemanticSearchServiceImpl.class);

	private final DataService dataService;
	private final OntologyService ontologyService;
	private final MetaDataService metaDataService;
	private final SemanticSearchServiceHelper semanticSearchServiceHelper;
	private final ElasticSearchExplainService elasticSearchExplainService;

	public static final int MAX_NUM_TAGS = 100;
	private static final float CUTOFF = 0.4f;
	private Splitter termSplitter = Splitter.onPattern("[^\\p{IsAlphabetic}]+");
	private Joiner termJoiner = Joiner.on(' ');
	private static final String UNIT_ONTOLOGY_IRI = "http://purl.obolibrary.org/obo/uo.owl";

	// We only explain the top 10 suggested attributes because beyond that the attributes are not high quliaty anymore
	private static final int MAX_NUMBER_EXPLAINED_ATTRIBUTES = 10;

	@Autowired
	public SemanticSearchServiceImpl(DataService dataService, OntologyService ontologyService,
			MetaDataService metaDataService, SemanticSearchServiceHelper semanticSearchServiceHelper,
			ElasticSearchExplainService elasticSearchExplainService)
	{
		this.dataService = requireNonNull(dataService);
		this.ontologyService = requireNonNull(ontologyService);
		this.metaDataService = requireNonNull(metaDataService);
		this.semanticSearchServiceHelper = requireNonNull(semanticSearchServiceHelper);
		this.elasticSearchExplainService = requireNonNull(elasticSearchExplainService);
	}

	@Override
	public Map findAttributes(EntityMetaData sourceEntityMetaData,
			Set queryTerms, Collection ontologyTerms)
	{
		Iterable attributeIdentifiers = semanticSearchServiceHelper
				.getAttributeIdentifiers(sourceEntityMetaData);

		QueryRule disMaxQueryRule = semanticSearchServiceHelper.createDisMaxQueryRuleForAttribute(queryTerms,
				ontologyTerms);

		List finalQueryRules = Lists
				.newArrayList(new QueryRule(AttributeMetaDataMetaData.IDENTIFIER, Operator.IN, attributeIdentifiers));

		if (disMaxQueryRule.getNestedRules().size() > 0)
		{
			finalQueryRules.addAll(Arrays.asList(new QueryRule(Operator.AND), disMaxQueryRule));
		}

		Stream attributeMetaDataEntities = dataService.findAll(AttributeMetaDataMetaData.ENTITY_NAME,
				new QueryImpl(finalQueryRules));

		Map collectExpanedQueryMap = semanticSearchServiceHelper.collectExpandedQueryMap(queryTerms,
				ontologyTerms);

		// Because the explain-API can be computationally expensive we limit the explanation to the top 10 attributes
		Map explainedAttributes = new LinkedHashMap<>();
		AtomicInteger count = new AtomicInteger(0);
		attributeMetaDataEntities.forEach(attributeEntity ->
		// for (Entity attributeEntity : attributeMetaDataEntities)
		{
			AttributeMetaData attribute = sourceEntityMetaData
					.getAttribute(attributeEntity.getString(AttributeMetaDataMetaData.NAME));
			if (count.get() < MAX_NUMBER_EXPLAINED_ATTRIBUTES)
			{
				Set explanations = convertAttributeEntityToExplainedAttribute(attributeEntity,
						sourceEntityMetaData, collectExpanedQueryMap, finalQueryRules);

				boolean singleMatchHighQuality = isSingleMatchHighQuality(queryTerms,
						Sets.newHashSet(collectExpanedQueryMap.values()), explanations);

				explainedAttributes.put(attribute,
						ExplainedAttributeMetaData.create(attribute, explanations, singleMatchHighQuality));
			}
			else
			{
				explainedAttributes.put(attribute, ExplainedAttributeMetaData.create(attribute));
			}
			count.incrementAndGet();
		});

		return explainedAttributes;
	}

	boolean isSingleMatchHighQuality(Collection queryTerms, Collection ontologyTermQueries,
			Iterable explanations)
	{
		Map matchedTags = new HashMap<>();

		for (ExplainedQueryString explanation : explanations)
		{
			matchedTags.put(explanation.getTagName().toLowerCase(), explanation.getScore());
		}

		ontologyTermQueries.removeAll(queryTerms);

		if (queryTerms.size() > 0 && queryTerms.stream().anyMatch(token -> isGoodMatch(matchedTags, token)))
			return true;

		if (ontologyTermQueries.size() > 0
				&& ontologyTermQueries.stream().allMatch(token -> isGoodMatch(matchedTags, token)))
			return true;

		return false;
	}

	boolean isGoodMatch(Map matchedTags, String label)
	{
		label = label.toLowerCase();
		return matchedTags.containsKey(label) && matchedTags.get(label).intValue() == 100
				|| Sets.newHashSet(label.split(" ")).stream()
						.allMatch(word -> matchedTags.containsKey(word) && matchedTags.get(word).intValue() == 100);
	}

	@Override
	public Map decisionTreeToFindRelevantAttributes(
			EntityMetaData sourceEntityMetaData, AttributeMetaData targetAttribute,
			Collection ontologyTermsFromTags, Set searchTerms)
	{
		Set queryTerms = createLexicalSearchQueryTerms(targetAttribute, searchTerms);

		Collection ontologyTerms = ontologyTermsFromTags;

		if (null != searchTerms && !searchTerms.isEmpty())
		{
			Set escapedSearchTerms = searchTerms.stream().filter(StringUtils::isNotBlank)
					.map(QueryParser::escape).collect(Collectors.toSet());
			ontologyTerms = ontologyService.findExcatOntologyTerms(ontologyService.getAllOntologiesIds(),
					escapedSearchTerms, MAX_NUM_TAGS);
		}
		else if (null == ontologyTerms || ontologyTerms.size() == 0)
		{
			List allOntologiesIds = ontologyService.getAllOntologiesIds();
			Ontology unitOntology = ontologyService.getOntology(UNIT_ONTOLOGY_IRI);
			if (unitOntology != null)
			{
				allOntologiesIds.remove(unitOntology.getId());
			}
			Hit ontologyTermHit = findTags(targetAttribute, allOntologiesIds);
			ontologyTerms = ontologyTermHit != null ? Arrays.asList(ontologyTermHit.getResult())
					: Collections.emptyList();
		}

		return findAttributes(sourceEntityMetaData, queryTerms, ontologyTerms);
	}

	/**
	 * A helper function to create a list of queryTerms based on the information from the targetAttribute as well as
	 * user defined searchTerms. If the user defined searchTerms exist, the targetAttribute information will not be
	 * used.
	 * 
	 * @param targetAttribute
	 * @param searchTerms
	 * @return list of queryTerms
	 */
	public Set createLexicalSearchQueryTerms(AttributeMetaData targetAttribute, Set searchTerms)
	{
		Set queryTerms = new HashSet<>();

		if (searchTerms != null && !searchTerms.isEmpty())
		{
			queryTerms.addAll(searchTerms);
		}

		if (queryTerms.size() == 0)
		{
			if (StringUtils.isNotBlank(targetAttribute.getLabel()))
			{
				queryTerms.add(targetAttribute.getLabel());
			}

			if (StringUtils.isNotBlank(targetAttribute.getDescription()))
			{
				queryTerms.add(targetAttribute.getDescription());
			}
		}

		return queryTerms;
	}

	/**
	 * A helper function to explain each of the matched attributes returned by the explain-API
	 * 
	 * @param attributeEntity
	 * @param sourceEntityMetaData
	 * @param collectExpanedQueryMap
	 * @param finalQueryRules
	 * @return
	 */
	public Set convertAttributeEntityToExplainedAttribute(Entity attributeEntity,
			EntityMetaData sourceEntityMetaData, Map collectExpanedQueryMap,
			List finalQueryRules)
	{
		String attributeId = attributeEntity.getString(AttributeMetaDataMetaData.IDENTIFIER);
		String attributeName = attributeEntity.getString(AttributeMetaDataMetaData.NAME);
		AttributeMetaData attribute = sourceEntityMetaData.getAttribute(attributeName);
		if (attribute == null)
		{
			throw new MolgenisDataAccessException("The attributeMetaData : " + attributeName
					+ " does not exsit in EntityMetaData : " + sourceEntityMetaData.getName());
		}
		Explanation explanation = elasticSearchExplainService.explain(new QueryImpl(finalQueryRules),
				dataService.getEntityMetaData(AttributeMetaDataMetaData.ENTITY_NAME), attributeId);

		Set detectedQueryStrings = elasticSearchExplainService
				.findQueriesFromExplanation(collectExpanedQueryMap, explanation);

		return detectedQueryStrings;
	}

	@Override
	public Map> findTags(String entity, List ontologyIds)
	{
		Map> result = new LinkedHashMap>();
		EntityMetaData emd = metaDataService.getEntityMetaData(entity);
		for (AttributeMetaData amd : emd.getAtomicAttributes())
		{
			Hit tag = findTags(amd, ontologyIds);
			if (tag != null)
			{
				result.put(amd, tag);
			}
		}
		return result;
	}

	@Override
	public Hit findTags(AttributeMetaData attribute, List ontologyIds)
	{
		String description = attribute.getDescription() == null ? attribute.getLabel() : attribute.getDescription();
		Set searchTerms = splitIntoTerms(description);
		Stemmer stemmer = new Stemmer();

		if (LOG.isDebugEnabled())
		{
			LOG.debug("findOntologyTerms({},{},{})", ontologyIds, searchTerms, MAX_NUM_TAGS);
		}

		List candidates = ontologyService.findOntologyTerms(ontologyIds, searchTerms, MAX_NUM_TAGS);

		if (LOG.isDebugEnabled())
		{
			LOG.debug("Candidates: {}", candidates);
		}

		List> hits = candidates.stream()
				.filter(ontologyTerm -> filterOntologyTerm(splitIntoTerms(stemmer.stemAndJoin(searchTerms)),
						ontologyTerm, stemmer))
				.map(ontolgoyTerm -> Hit. create(ontolgoyTerm,
						bestMatchingSynonym(ontolgoyTerm, searchTerms).getScore()))
				.sorted(Ordering.natural().reverse()).collect(Collectors.toList());

		if (LOG.isDebugEnabled())
		{
			LOG.debug("Hits: {}", hits);
		}

		Hit result = null;
		String bestMatchingSynonym = null;
		for (Hit hit : hits)
		{
			String bestMatchingSynonymForHit = bestMatchingSynonym(hit.getResult(), searchTerms).getResult();
			if (result == null)
			{
				result = hit;
				bestMatchingSynonym = bestMatchingSynonymForHit;
			}
			else
			{
				Set jointTerms = Sets.union(splitIntoTerms(bestMatchingSynonym),
						splitIntoTerms(bestMatchingSynonymForHit));
				String joinedSynonyms = termJoiner.join(jointTerms);
				Hit joinedHit = Hit.create(OntologyTerm.and(result.getResult(), hit.getResult()),
						distanceFrom(joinedSynonyms, searchTerms, stemmer));
				if (joinedHit.compareTo(result) > 0)
				{
					result = joinedHit;
					bestMatchingSynonym = bestMatchingSynonym + " " + bestMatchingSynonymForHit;
				}
			}

			if (LOG.isDebugEnabled())
			{
				LOG.debug("result: {}", result);
			}
		}
		if (result != null && result.getScore() >= CUTOFF)
		{
			if (LOG.isDebugEnabled())
			{
				LOG.debug("Tag {} with {}", attribute, result);
			}
			return result;
		}
		return null;
	}

	private boolean filterOntologyTerm(Set keywordsFromAttribute, OntologyTerm ontologyTerm, Stemmer stemmer)
	{
		Set ontologyTermSynonyms = semanticSearchServiceHelper.getOtLabelAndSynonyms(ontologyTerm);

		for (String synonym : ontologyTermSynonyms)
		{
			Set splitIntoTerms = splitIntoTerms(stemmer.stemAndJoin(splitIntoTerms(synonym)));
			if (splitIntoTerms.size() != 0 && keywordsFromAttribute.containsAll(splitIntoTerms)) return true;
		}

		return false;
	}

	/**
	 * Computes the best matching synonym which is closest to a set of search terms.
* Will stem the {@link OntologyTerm} 's synonyms and the search terms, and then compute the maximum * {@link StringDistance} between them. 0 means disjunct, 1 means identical * * @param ontologyTerm * the {@link OntologyTerm} * @param searchTerms * the search terms * @return the maximum {@link StringDistance} between the ontologyterm and the search terms */ public Hit bestMatchingSynonym(OntologyTerm ontologyTerm, Set searchTerms) { Stemmer stemmer = new Stemmer(); Optional> bestSynonym = ontologyTerm.getSynonyms().stream() .map(synonym -> Hit. create(synonym, distanceFrom(synonym, searchTerms, stemmer))) .max(Comparator.naturalOrder()); return bestSynonym.get(); } float distanceFrom(String synonym, Set searchTerms, Stemmer stemmer) { String s1 = stemmer.stemAndJoin(splitIntoTerms(synonym)); String s2 = stemmer.stemAndJoin(searchTerms); float distance = (float) NGramDistanceAlgorithm.stringMatching(s1, s2) / 100; LOG.debug("Similarity between: {} and {} is {}", s1, s2, distance); return distance; } private Set splitIntoTerms(String description) { return FluentIterable.from(termSplitter.split(description)).transform(String::toLowerCase) .filter(w -> !NGramDistanceAlgorithm.STOPWORDSLIST.contains(w)).filter(StringUtils::isNotEmpty).toSet(); } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy