org.sonarsource.sonarlint.core.client.api.util.TextSearchIndex Maven / Gradle / Ivy

/*
 * SonarLint Core - Implementation
 * Copyright (C) 2016-2021 SonarSource SA
 * mailto:info AT sonarsource DOT com
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 3 of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
 */
package org.sonarsource.sonarlint.core.client.api.util;

import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.SortedMap;
import java.util.TreeMap;
import java.util.stream.Collectors;

/**
 * Indexes text associated to objects, and performs full text search to find matching objects.
 * It is a positional index, so it supports queries consisted of multiple terms, in which case it will find partial term matches in sequence (distance = 1).
 * The result is sorted by score. The score of each term matches is the ratio of the term matches (1 for exact match), 
 * and the global score is the sum of the term's scores in the object divided by the total term frequency in the object.
 * 
 * The generic type should properly implement equals and hashCode.
 * An object cannot be indexed twice. 
 * 
 * Performance of indexing: O(N)
 * Performance of search: O(log N) on the number of indexed terms + O(N) on the number of results
 */
public class TextSearchIndex {
  private static final String SPLIT_PATTERN = "\\W";
  private TreeMap> termToObj;
  private Map objToWordFrequency;

  public TextSearchIndex() {
    clear();
  }

  public int size() {
    return objToWordFrequency.size();
  }

  public boolean isEmpty() {
    return objToWordFrequency.isEmpty();
  }

  public void index(T obj, String text) {
    if (objToWordFrequency.containsKey(obj)) {
      throw new IllegalArgumentException("Already indexed");
    }
    List terms = tokenize(text);
    objToWordFrequency.put(obj, terms.size());

    int i = 0;
    for (String s : terms) {
      addToDictionary(s, i, obj);
      i++;
    }
  }

  /**
   * Search for indexed objects based on a query. Results will be sorted by score (highest first).
   * Score is in the interval ]0,1].
   * 
   * @return A map of results reverse-sorted by value (score). Can be empty, but never null
   */
  public Map search(String query) {
    List terms = tokenize(query);

    if (terms.isEmpty()) {
      return Collections.emptyMap();
    }

    List matched;

    // positional search
    Iterator it = terms.iterator();
    matched = searchTerm(it.next());

    while (it.hasNext()) {
      List termMatches = searchTerm(it.next());
      matched = matchPositional(matched, termMatches, 1);

      if (matched.isEmpty()) {
        break;
      }
    }

    // convert results and calc score
    return prepareResult(matched);
  }

  private List matchPositional(List previousMatches, List termMatches, int maxDistance) {
    List matches = new LinkedList<>();

    for (SearchResult e1 : previousMatches) {
      for (SearchResult e2 : termMatches) {
        if (!e1.obj.equals(e2.obj)) {
          continue;
        }

        int dist = e2.lastIdx - e1.lastIdx;
        if (dist > 0 && dist <= maxDistance) {
          e2.score += e1.score;
          matches.add(e2);
        }
      }
    }
    return matches;
  }

  private Map prepareResult(List entries) {
    Map objToScore = new HashMap<>();

    for (SearchResult e : entries) {
      double score = e.score / objToWordFrequency.get(e.obj);
      Double previousScore = objToScore.get(e.obj);

      if (previousScore == null || previousScore < score) {
        objToScore.put(e.obj, score);
      }
    }

    return objToScore.entrySet().stream()
      .sorted(Map.Entry.comparingByValue().reversed())
      .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue, (e1, e2) -> e1, LinkedHashMap::new));
  }

  /**
   * Returns any term prefixed by the given text
   */
  private List searchTerm(String termPrefix) {
    List entries = new LinkedList<>();

    SortedMap> tailMap = termToObj.tailMap(termPrefix);
    for (Entry> e : tailMap.entrySet()) {
      if (!e.getKey().startsWith(termPrefix)) {
        break;
      }
      double score = ((double) termPrefix.length()) / e.getKey().length();
      e.getValue().stream()
        .map(v -> new SearchResult(score, v.obj, v.tokenIndex))
        .forEach(entries::add);
    }

    return entries;
  }

  public void clear() {
    termToObj = new TreeMap<>();
    objToWordFrequency = new HashMap<>();
  }

  /**
   * @return Can be empty, but never null
   */
  public Set getTokens() {
    return Collections.unmodifiableSet(termToObj.keySet());
  }

  private void addToDictionary(String token, int tokenIndex, T obj) {
    List entries = termToObj.get(token);

    if (entries == null) {
      entries = new LinkedList<>();
      termToObj.put(token, entries);
    }

    entries.add(new DictEntry(obj, tokenIndex));
  }

  private static List tokenize(String text) {
    String[] split = text.split(SPLIT_PATTERN);
    List terms = new ArrayList<>(split.length);

    for (String s : split) {
      if (!s.isEmpty()) {
        terms.add(s.toLowerCase(Locale.ENGLISH));
      }
    }

    return terms;
  }

  private class SearchResult {
    private double score;
    private T obj;
    private int lastIdx;

    public SearchResult(double score, T obj, int lastIdx) {
      this.score = score;
      this.obj = obj;
      this.lastIdx = lastIdx;
    }
  }

  private class DictEntry {
    T obj;
    int tokenIndex;

    public DictEntry(T obj, int tokenIndex) {
      this.obj = obj;
      this.tokenIndex = tokenIndex;
    }
  }
}