package org.molgenis.semanticsearch.service.impl;

import static java.util.Arrays.stream;
import static java.util.Objects.requireNonNull;
import static org.molgenis.data.meta.AttributeType.COMPOUND;
import static org.molgenis.data.meta.model.EntityTypeMetadata.ENTITY_TYPE_META_DATA;

import com.google.common.collect.Sets;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Comparator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.molgenis.data.DataService;
import org.molgenis.data.Entity;
import org.molgenis.data.MolgenisDataAccessException;
import org.molgenis.data.QueryRule;
import org.molgenis.data.QueryRule.Operator;
import org.molgenis.data.meta.model.AttributeMetadata;
import org.molgenis.data.meta.model.EntityType;
import org.molgenis.data.meta.model.EntityTypeMetadata;
import org.molgenis.data.support.QueryImpl;
import org.molgenis.ontology.core.ic.TermFrequencyService;
import org.molgenis.ontology.core.model.OntologyTerm;
import org.molgenis.ontology.core.service.OntologyService;
import org.molgenis.semanticsearch.string.NGramDistanceAlgorithm;
import org.molgenis.semanticsearch.string.Stemmer;

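/**
 * Helper that expands free-text search terms and ontology terms (labels, synonyms and child
 * terms) into dis-max, should and boosted query rules for semantic search, using Lucene-style
 * caret boosting.
 *
 * <p>A minimal usage sketch; the {@code dataService}, {@code ontologyService} and {@code
 * termFrequencyService} instances are assumed to be supplied by the surrounding application (for
 * example as Spring beans), and {@code hypertensionOntologyTerm} is an illustrative ontology term:
 *
 * <pre>{@code
 * SemanticSearchServiceHelper helper =
 *     new SemanticSearchServiceHelper(dataService, ontologyService, termFrequencyService);
 * QueryRule rule =
 *     helper.createDisMaxQueryRuleForAttribute(
 *         Sets.newHashSet("blood pressure"), singletonList(hypertensionOntologyTerm));
 * }</pre>
 */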
public class SemanticSearchServiceHelper {
  private final TermFrequencyService termFrequencyService;

  private final DataService dataService;

  private final OntologyService ontologyService;

  public static final int MAX_NUM_TAGS = 3;

  private static final char SPACE_CHAR = ' ';
  private static final String COMMA_CHAR = ",";
  private static final String CARET_CHARACTER = "^";
  private static final String ESCAPED_CARET_CHARACTER = "\\^";
  private static final String ILLEGAL_CHARS_REGEX = "[^\\p{L}'a-zA-Z0-9\\.~]+";

  public SemanticSearchServiceHelper(
      DataService dataService,
      OntologyService ontologyService,
      TermFrequencyService termFrequencyService) {
    this.dataService = requireNonNull(dataService);
    this.ontologyService = requireNonNull(ontologyService);
    this.termFrequencyService = requireNonNull(termFrequencyService);
  }

  /**
   * Create a disjunction-max (dis-max) query rule based on the given search terms as well as the
   * information from the given ontology terms.
   *
   * @return dis-max query rule
   */
  public QueryRule createDisMaxQueryRuleForAttribute(
      Set<String> searchTerms, Collection<OntologyTerm> ontologyTerms) {
    List<String> queryTerms = new ArrayList<>();

    if (searchTerms != null) {
      queryTerms.addAll(
          searchTerms.stream()
              .filter(StringUtils::isNotBlank)
              .map(this::processQueryString)
              .collect(Collectors.toList()));
    }

    // Handle tags with only one ontology term
    ontologyTerms.stream()
        .filter(ontologyTerm -> !ontologyTerm.getIRI().contains(COMMA_CHAR))
        .forEach(ot -> queryTerms.addAll(parseOntologyTermQueries(ot)));

    QueryRule disMaxQueryRule = createDisMaxQueryRuleForTerms(queryTerms);

    // Handle tags with multiple ontology terms
    ontologyTerms.stream()
        .filter(ontologyTerm -> ontologyTerm.getIRI().contains(COMMA_CHAR))
        .forEach(ot -> disMaxQueryRule.getNestedRules().add(createShouldQueryRule(ot.getIRI())));

    return disMaxQueryRule;
  }

  /**
   * Create a dis-max query rule based on a list of query terms. Empty query terms are filtered
   * out, Lucene special characters (except the caret) are escaped, and each term is fuzzy-matched
   * against the attribute label and description.
   *
   * @return dis-max query rule
   */
  public QueryRule createDisMaxQueryRuleForTerms(List<String> queryTerms) {
    List<QueryRule> rules = new ArrayList<>();
    queryTerms.stream()
        .filter(StringUtils::isNotEmpty)
        .map(this::escapeCharsExcludingCaretChar)
        .forEach(
            query -> {
              rules.add(new QueryRule(AttributeMetadata.LABEL, Operator.FUZZY_MATCH, query));
              rules.add(new QueryRule(AttributeMetadata.DESCRIPTION, Operator.FUZZY_MATCH, query));
            });
    QueryRule finalDisMaxQuery = new QueryRule(rules);
    finalDisMaxQuery.setOperator(Operator.DIS_MAX);
    return finalDisMaxQuery;
  }

  /**
   * Create a dis-max query rule with the corresponding boost value applied.
   *
   * @return a dis-max query rule with a boost value
   */
  public QueryRule createBoostedDisMaxQueryRuleForTerms(
      List<String> queryTerms, Double boostValue) {
    QueryRule finalDisMaxQuery = createDisMaxQueryRuleForTerms(queryTerms);
    if (boostValue != null && boostValue.intValue() != 0) {
      finalDisMaxQuery.setValue(boostValue);
    }
    return finalDisMaxQuery;
  }

  /**
   * Create a boolean should query for composite tags containing multiple ontology terms.
   *
   * @return a boolean should query rule
   */
  public QueryRule createShouldQueryRule(String multiOntologyTermIri) {
    QueryRule shouldQueryRule = new QueryRule(new ArrayList<>());
    shouldQueryRule.setOperator(Operator.SHOULD);
    for (String ontologyTermIri : multiOntologyTermIri.split(COMMA_CHAR)) {
      OntologyTerm ontologyTerm = ontologyService.getOntologyTerm(ontologyTermIri);
      List<String> queryTerms = parseOntologyTermQueries(ontologyTerm);
      Double termFrequency = getBestInverseDocumentFrequency(queryTerms);
      shouldQueryRule
          .getNestedRules()
          .add(createBoostedDisMaxQueryRuleForTerms(queryTerms, termFrequency));
    }
    return shouldQueryRule;
  }

  /**
   * Create a list of string queries based on the information collected from the current ontology
   * term, including its label, synonyms and child ontology terms.
   */
  public List<String> parseOntologyTermQueries(OntologyTerm ontologyTerm) {
    List<String> queryTerms =
        getOtLabelAndSynonyms(ontologyTerm).stream()
            .map(this::processQueryString)
            .collect(Collectors.toList());

    for (OntologyTerm childOt : ontologyService.getChildren(ontologyTerm)) {
      double boostedNumber =
          Math.pow(0.5, ontologyService.getOntologyTermDistance(ontologyTerm, childOt));
      getOtLabelAndSynonyms(childOt)
          .forEach(synonym -> queryTerms.add(parseBoostQueryString(synonym, boostedNumber)));
    }
    return queryTerms;
  }

  /**
   * A helper function that collects the synonyms as well as the label of an ontology term.
   *
   * @return a set of synonyms plus the label
   */
  public Set<String> getOtLabelAndSynonyms(OntologyTerm ontologyTerm) {
    Set<String> allTerms = Sets.newLinkedHashSet(ontologyTerm.getSynonyms());
    allTerms.add(ontologyTerm.getLabel());
    return allTerms;
  }

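  /**
   * Create a map from the stemmed form of each query term to the original string it was derived
   * from: the raw search terms map to themselves, while the labels and synonyms of the given
   * ontology terms (and of their children) map to the ontology term label.
   *
   * @return a map from stemmed query term to original query string or ontology term label
   */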
  public Map<String, String> collectExpandedQueryMap(
      Set<String> queryTerms, Collection<OntologyTerm> ontologyTerms) {
    Map<String, String> expandedQueryMap = new LinkedHashMap<>();

    queryTerms.stream()
        .filter(StringUtils::isNotBlank)
        .forEach(queryTerm -> expandedQueryMap.put(Stemmer.cleanStemPhrase(queryTerm), queryTerm));

    for (OntologyTerm ontologyTerm : ontologyTerms) {
      if (!ontologyTerm.getIRI().contains(COMMA_CHAR)) {
        collectOntologyTermQueryMap(expandedQueryMap, ontologyTerm);
      } else {
        for (String ontologyTermIri : ontologyTerm.getIRI().split(COMMA_CHAR)) {
          collectOntologyTermQueryMap(
              expandedQueryMap, ontologyService.getOntologyTerm(ontologyTermIri));
        }
      }
    }
    return expandedQueryMap;
  }

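  /**
   * Add the label and synonyms of the given ontology term, and of all of its child ontology
   * terms, to the expanded query map, keyed by their stemmed form and mapped to the ontology term
   * label.
   */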
  public void collectOntologyTermQueryMap(
      Map<String, String> expandedQueryMap, OntologyTerm ontologyTerm) {
    if (ontologyTerm != null) {
      getOtLabelAndSynonyms(ontologyTerm)
          .forEach(
              term ->
                  expandedQueryMap.put(Stemmer.cleanStemPhrase(term), ontologyTerm.getLabel()));

      for (OntologyTerm childOntologyTerm : ontologyService.getChildren(ontologyTerm)) {
        getOtLabelAndSynonyms(childOntologyTerm)
            .forEach(
                term ->
                    expandedQueryMap.put(Stemmer.cleanStemPhrase(term), ontologyTerm.getLabel()));
      }
    }
  }

  /** A helper function that gets identifiers of all the attributes from one EntityType */
  public List<String> getAttributeIdentifiers(EntityType sourceEntityType) {
    Entity entityTypeEntity =
        dataService.findOne(
            ENTITY_TYPE_META_DATA,
            new QueryImpl<>().eq(EntityTypeMetadata.ID, sourceEntityType.getId()));

    if (entityTypeEntity == null)
      throw new MolgenisDataAccessException(
          "Could not find EntityTypeEntity by the name of " + sourceEntityType.getId());

    List<String> attributeIdentifiers = new ArrayList<>();

    recursivelyCollectAttributeIdentifiers(
        entityTypeEntity.getEntities(EntityTypeMetadata.ATTRIBUTES), attributeIdentifiers);

    return attributeIdentifiers;
  }

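  /**
   * Depth-first walk over the attribute entities that collects the identifiers of all
   * non-compound attributes, descending into the children of compound attributes.
   */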
  private void recursivelyCollectAttributeIdentifiers(
      Iterable<Entity> attributeEntities, List<String> attributeIdentifiers) {
    for (Entity attributeEntity : attributeEntities) {
      if (!attributeEntity.getString(AttributeMetadata.TYPE).equals(COMPOUND.toString())) {
        attributeIdentifiers.add(attributeEntity.getString(AttributeMetadata.ID));
      }
      Iterable<Entity> entities = attributeEntity.getEntities(AttributeMetadata.CHILDREN);

      if (entities != null) {
        recursivelyCollectAttributeIdentifiers(entities, attributeIdentifiers);
      }
    }
  }

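  /**
   * Find at most {@link #MAX_NUM_TAGS} candidate ontology terms for a free-text description by
   * matching its stop-word-filtered terms against the given ontologies.
   */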
  public List<OntologyTerm> findTags(String description, List<String> ontologyIds) {
    Set<String> searchTerms = removeStopWords(description);

    return ontologyService.findOntologyTerms(ontologyIds, searchTerms, MAX_NUM_TAGS);
  }

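  /** Lower-case the query string, remove stop words and join the remaining words with spaces. */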
  public String processQueryString(String queryString) {
    return StringUtils.join(removeStopWords(queryString), SPACE_CHAR);
  }

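  /**
   * Append a Lucene boost to every remaining word of the query string. For example, assuming
   * neither word is a stop word, a boost of 0.5 turns {@code "Blood Pressure"} into
   * {@code "blood^0.5 pressure^0.5"} (word order may vary because the words are collected into a
   * set).
   */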
  public String parseBoostQueryString(String queryString, double boost) {
    return StringUtils.join(
        removeStopWords(queryString).stream()
            .map(word -> word + CARET_CHARACTER + boost)
            .collect(Collectors.toSet()),
        SPACE_CHAR);
  }

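  /**
   * Escape Lucene query syntax characters in the given string, except the caret character, which
   * is preserved so that boost suffixes such as {@code ^0.5} survive escaping.
   */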
  public String escapeCharsExcludingCaretChar(String string) {
    return QueryParser.escape(string).replace(ESCAPED_CARET_CHARACTER, CARET_CHARACTER);
  }

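  /**
   * Split the description on illegal characters, lower-case the resulting words and drop stop
   * words and empty strings.
   */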
  public Set<String> removeStopWords(String description) {
    return stream(description.split(ILLEGAL_CHARS_REGEX))
        .map(String::toLowerCase)
        .filter(w -> !NGramDistanceAlgorithm.STOPWORDSLIST.contains(w) && StringUtils.isNotEmpty(w))
        .collect(Collectors.toSet());
  }

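  /**
   * Look up the term frequency of the shortest term in the list, or return {@code null} when the
   * list is empty.
   */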
  private Double getBestInverseDocumentFrequency(List<String> terms) {
    Optional<String> findFirst =
        terms.stream().sorted(Comparator.comparingInt(String::length)).findFirst();

    return findFirst.map(termFrequencyService::getTermFrequency).orElse(null);
  }
}