All downloads are free. Search and download functionality uses the official Maven repository.

org.apache.solr.spelling.PossibilityIterator Maven / Gradle / Ivy

There is a newer version: 9.7.0
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.spelling;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.Objects;
import java.util.PriorityQueue;
import java.util.Set;

/**
 * Given a list of possible Spelling Corrections for multiple mis-spelled words in a query, This
 * iterator returns Possible Correction combinations ordered by reasonable probability that such a
 * combination will return actual hits if re-queried. This implementation simply ranks the Possible
 * Combinations by the sum of their component ranks.
 */
public class PossibilityIterator implements Iterator {
  private List> possibilityList = new ArrayList<>();
  private Iterator rankedPossibilityIterator = null;
  private int correctionIndex[];
  private boolean done = false;
  private Iterator> nextOnes = null;
  private int nextOnesRank = 0;
  private int nextOnesIndex = 0;
  private boolean suggestionsMayOverlap = false;

  @SuppressWarnings("unused")
  private PossibilityIterator() {
    throw new AssertionError("You shan't go here.");
  }

  /**
   * We assume here that the passed-in inner LinkedHashMaps are already sorted in order of "Best
   * Possible Correction".
   */
  public PossibilityIterator(
      Map> suggestions,
      int maximumRequiredSuggestions,
      int maxEvaluations,
      boolean overlap) {
    this.suggestionsMayOverlap = overlap;
    for (Map.Entry> entry : suggestions.entrySet()) {
      Token token = entry.getKey();
      if (entry.getValue().size() == 0) {
        continue;
      }
      List possibleCorrections = new ArrayList<>();
      for (Map.Entry entry1 : entry.getValue().entrySet()) {
        SpellCheckCorrection correction = new SpellCheckCorrection();
        correction.setOriginal(token);
        correction.setCorrection(entry1.getKey());
        correction.setNumberOfOccurences(entry1.getValue());
        possibleCorrections.add(correction);
      }
      possibilityList.add(possibleCorrections);
    }

    int wrapSize = possibilityList.size();
    if (wrapSize == 0) {
      done = true;
    } else {
      correctionIndex = new int[wrapSize];
      for (int i = 0; i < wrapSize; i++) {
        int suggestSize = possibilityList.get(i).size();
        if (suggestSize == 0) {
          done = true;
          break;
        }
        correctionIndex[i] = 0;
      }
    }
    PriorityQueue rankedPossibilities =
        new PriorityQueue<>(11, new RankComparator());
    Set removeDuplicates = null;
    if (suggestionsMayOverlap) {
      removeDuplicates = new HashSet<>();
    }
    long numEvaluations = 0;
    while (numEvaluations < maxEvaluations && internalHasNext()) {
      RankedSpellPossibility rsp = internalNext();
      numEvaluations++;
      if (rankedPossibilities.size() >= maximumRequiredSuggestions
          && rsp.rank >= rankedPossibilities.peek().rank) {
        continue;
      }
      if (!isSuggestionForReal(rsp)) {
        continue;
      }
      if (removeDuplicates == null) {
        rankedPossibilities.offer(rsp);
      } else {
        // Needs to be in token-offset order so that the match-and-replace
        // option for collations can work.
        rsp.corrections.sort(new StartOffsetComparator());
        if (removeDuplicates.add(rsp)) {
          rankedPossibilities.offer(rsp);
        }
      }
      if (rankedPossibilities.size() > maximumRequiredSuggestions) {
        RankedSpellPossibility removed = rankedPossibilities.poll();
        if (removeDuplicates != null) {
          removeDuplicates.remove(removed);
        }
      }
    }

    RankedSpellPossibility[] rpArr = new RankedSpellPossibility[rankedPossibilities.size()];
    for (int i = rankedPossibilities.size() - 1; i >= 0; i--) {
      rpArr[i] = rankedPossibilities.remove();
    }
    rankedPossibilityIterator = Arrays.asList(rpArr).iterator();
  }

  private boolean isSuggestionForReal(RankedSpellPossibility rsp) {
    for (SpellCheckCorrection corr : rsp.corrections) {
      if (!corr.getOriginalAsString().equals(corr.getCorrection())) {
        return true;
      }
    }
    return false;
  }

  private boolean internalHasNext() {
    if (nextOnes != null && nextOnes.hasNext()) {
      return true;
    }
    if (done) {
      return false;
    }
    internalNextAdvance();
    if (nextOnes != null && nextOnes.hasNext()) {
      return true;
    }
    return false;
  }

  /**
   * This method is converting the independent LinkHashMaps containing various (silo'ed) suggestions
   * for each mis-spelled word into individual "holistic query corrections", aka. "Spell Check
   * Possibility"
   *
   * 

Rank here is the sum of each selected term's position in its respective LinkedHashMap. */ private RankedSpellPossibility internalNext() { if (nextOnes != null && nextOnes.hasNext()) { RankedSpellPossibility rsl = new RankedSpellPossibility(); rsl.corrections = nextOnes.next(); rsl.rank = nextOnesRank; rsl.index = nextOnesIndex++; return rsl; } if (done) { throw new NoSuchElementException(); } internalNextAdvance(); if (nextOnes != null && nextOnes.hasNext()) { RankedSpellPossibility rsl = new RankedSpellPossibility(); rsl.corrections = nextOnes.next(); rsl.rank = nextOnesRank; rsl.index = nextOnesIndex++; return rsl; } throw new NoSuchElementException(); } private void internalNextAdvance() { List possibleCorrection = null; if (nextOnes != null && nextOnes.hasNext()) { possibleCorrection = nextOnes.next(); } else { if (done) { throw new NoSuchElementException(); } possibleCorrection = new ArrayList<>(); List> possibleCorrections = null; int rank = 0; while (!done && (possibleCorrections == null || possibleCorrections.size() == 0)) { rank = 0; for (int i = 0; i < correctionIndex.length; i++) { List singleWordPossibilities = possibilityList.get(i); SpellCheckCorrection singleWordPossibility = singleWordPossibilities.get(correctionIndex[i]); rank += correctionIndex[i]; if (i == correctionIndex.length - 1) { correctionIndex[i]++; if (correctionIndex[i] == singleWordPossibilities.size()) { correctionIndex[i] = 0; if (correctionIndex.length == 1) { done = true; } for (int ii = i - 1; ii >= 0; ii--) { correctionIndex[ii]++; if (correctionIndex[ii] >= possibilityList.get(ii).size() && ii > 0) { correctionIndex[ii] = 0; } else { break; } } } } possibleCorrection.add(singleWordPossibility); } if (correctionIndex[0] == possibilityList.get(0).size()) { done = true; } if (suggestionsMayOverlap) { possibleCorrections = separateOverlappingTokens(possibleCorrection); } else { possibleCorrections = new ArrayList<>(1); possibleCorrections.add(possibleCorrection); } } nextOnes = 
possibleCorrections.iterator(); nextOnesRank = rank; nextOnesIndex = 0; } } private List> separateOverlappingTokens( List possibleCorrection) { List> ret = null; if (possibleCorrection.size() == 1) { ret = new ArrayList<>(1); ret.add(possibleCorrection); return ret; } ret = new ArrayList<>(); for (int i = 0; i < possibleCorrection.size(); i++) { List c = compatible(possibleCorrection, i); ret.add(c); } return ret; } private List compatible(List all, int pos) { List priorPassCompatibles = null; { List firstPassCompatibles = new ArrayList<>(all.size()); SpellCheckCorrection sacred = all.get(pos); firstPassCompatibles.add(sacred); int index = pos; boolean gotOne = false; for (int i = 0; i < all.size() - 1; i++) { index++; if (index == all.size()) { index = 0; } SpellCheckCorrection disposable = all.get(index); if (!conflicts(sacred, disposable)) { firstPassCompatibles.add(disposable); gotOne = true; } } if (!gotOne) { return firstPassCompatibles; } priorPassCompatibles = firstPassCompatibles; } { pos = 1; while (true) { if (pos == priorPassCompatibles.size() - 1) { return priorPassCompatibles; } List subsequentPassCompatibles = new ArrayList<>(priorPassCompatibles.size()); SpellCheckCorrection sacred = null; for (int i = 0; i <= pos; i++) { sacred = priorPassCompatibles.get(i); subsequentPassCompatibles.add(sacred); } int index = pos; boolean gotOne = false; for (int i = 0; i < priorPassCompatibles.size() - 1; i++) { index++; if (index == priorPassCompatibles.size()) { break; } SpellCheckCorrection disposable = priorPassCompatibles.get(index); if (!conflicts(sacred, disposable)) { subsequentPassCompatibles.add(disposable); gotOne = true; } } if (!gotOne || pos == priorPassCompatibles.size() - 1) { return subsequentPassCompatibles; } priorPassCompatibles = subsequentPassCompatibles; pos++; } } } private boolean conflicts(SpellCheckCorrection c1, SpellCheckCorrection c2) { int s1 = c1.getOriginal().startOffset(); int e1 = c1.getOriginal().endOffset(); int s2 = 
c2.getOriginal().startOffset(); int e2 = c2.getOriginal().endOffset(); if (s2 >= s1 && s2 <= e1) { return true; } if (s1 >= s2 && s1 <= e2) { return true; } return false; } @Override public boolean hasNext() { return rankedPossibilityIterator.hasNext(); } @Override public PossibilityIterator.RankedSpellPossibility next() { return rankedPossibilityIterator.next(); } @Override public void remove() { throw new UnsupportedOperationException(); } public static class RankedSpellPossibility { public List corrections; public int rank; public int index; @Override // hashCode() and equals() only consider the actual correction, not the rank // or index. public int hashCode() { final int prime = 31; int result = 1; result = prime * result + ((corrections == null) ? 0 : corrections.hashCode()); return result; } @Override // hashCode() and equals() only consider the actual correction, not the rank // or index. public boolean equals(Object obj) { if (this == obj) return true; if (obj == null) return false; if (!(obj instanceof RankedSpellPossibility)) return false; RankedSpellPossibility other = (RankedSpellPossibility) obj; return Objects.equals(corrections, other.corrections); } @Override public String toString() { StringBuilder sb = new StringBuilder(); sb.append("rank=").append(rank).append(" (").append(index).append(")"); if (corrections != null) { for (SpellCheckCorrection corr : corrections) { sb.append(" "); sb.append(corr.getOriginal()) .append(">") .append(corr.getCorrection()) .append(" (") .append(corr.getNumberOfOccurences()) .append(")"); } } return sb.toString(); } } private static class StartOffsetComparator implements Comparator { @Override public int compare(SpellCheckCorrection o1, SpellCheckCorrection o2) { return o1.getOriginal().startOffset() - o2.getOriginal().startOffset(); } } private static class RankComparator implements Comparator { // Rank poorer suggestions ahead of better ones for use with a PriorityQueue @Override public int 
compare(RankedSpellPossibility r1, RankedSpellPossibility r2) { int retval = r2.rank - r1.rank; if (retval == 0) { retval = r2.index - r1.index; } return retval; } } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy