org.apache.solr.analysis.CommonGramsFilter Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of solr-core Show documentation
Apache Solr Server
The newest version!
/*
 * Licensed under the Apache License, 
 * Version 2.0 (the "License"); you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
 * Unless required by applicable law or agreed to in writing, software distributed under the License 
 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
 * See the License for the specific language governing permissions and limitations under the License. 
 */

package org.apache.solr.analysis;

import java.io.IOException;
import java.util.Arrays;
import java.util.Set;

import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;

/*
 * TODO: Rewrite to use new TokenStream api from lucene 2.9 when BufferedTokenStream uses it.
 * TODO: Consider implementing https://issues.apache.org/jira/browse/LUCENE-1688 changes to stop list and
 * associated constructors 
 */

/**
 * Construct bigrams for frequently occurring terms while indexing. Single terms
 * are still indexed too, with bigrams overlaid. This is achieved through the
 * use of {@link Token#setPositionIncrement(int)}. Bigrams have a type
 * of "gram".
 * <p>
 * Example (with "the" as a common word):
 * <pre>
 * input:  "the quick brown fox"
 * output: |"the","the_quick"|"quick"|"brown"|"fox"|
 * </pre>
 * "the_quick" has a position increment of 0 so it is in the same position
 * as "the". "the_quick" has a term.type() of "gram".
 */

/*
 * Constructors and makeCommonSet based on similar code in StopFilter
 */

public class CommonGramsFilter extends BufferedTokenStream {

  /** Character placed between the two terms of a bigram, e.g. "the_quick". */
  private static final char SEPARATOR = '_';

  /** Token type assigned to every bigram emitted by this filter. */
  private static final String GRAM_TYPE = "gram";

  /** Common words; a bigram is built when either neighbouring term is in this set. */
  private final CharArraySet commonWords;

  /** Scratch buffer reused across calls to assemble the bigram text. */
  private final StringBuilder buffer = new StringBuilder();

  /**
   * Construct a token stream filtering the given input using a Set of common
   * words to create bigrams. Outputs both unigrams with position increment and
   * bigrams with position increment 0 type=gram where one or both of the words
   * in a potential bigram are in the set of common words.
   * Equivalent to calling the three-argument constructor with
   * ignoreCase set to false.
   * 
   * @param input TokenStream input in filter chain
   * @param commonWords The set of common words; must not be null.
   */
  public CommonGramsFilter(TokenStream input, Set commonWords) {
    this(input, commonWords, false);
  }

  /**
   * Construct a token stream filtering the given input using a Set of common
   * words to create bigrams, case-sensitive if ignoreCase is false (unless Set
   * is CharArraySet). If commonWords is an instance of
   * {@link CharArraySet} (true if makeCommonSet() was used to
   * construct the set) it will be directly used and ignoreCase
   * will be ignored since CharArraySet directly controls case
   * sensitivity.
   * 
   * If commonWords is not an instance of {@link CharArraySet}, a
   * new CharArraySet will be constructed and ignoreCase will be
   * used to specify the case sensitivity of that set.
   * 
   * @param input TokenStream input in filter chain.
   * @param commonWords The set of common words; must not be null.
   * @param ignoreCase Ignore case when constructing bigrams for common words.
   */
  public CommonGramsFilter(TokenStream input, Set commonWords,
      boolean ignoreCase) {
    super(input);
    if (commonWords instanceof CharArraySet) {
      // Reuse the caller's set as-is; its own case handling wins over ignoreCase.
      this.commonWords = (CharArraySet) commonWords;
    } else {
      this.commonWords = new CharArraySet(commonWords.size(), ignoreCase);
      this.commonWords.addAll(commonWords);
    }
    init();
  }

  /**
   * Construct a token stream filtering the given input using an Array of common
   * words to create bigrams. Equivalent to calling the three-argument
   * constructor with ignoreCase set to false.
   * 
   * @param input Tokenstream in filter chain
   * @param commonWords words to be used in constructing bigrams
   */
  public CommonGramsFilter(TokenStream input, String[] commonWords) {
    // The delegated constructor already calls init(); calling it here again
    // would run the initialisation hook twice.
    this(input, commonWords, false);
  }

  /**
   * Construct a token stream filtering the given input using an Array of common
   * words to create bigrams and is case-sensitive if ignoreCase is false.
   * 
   * @param input Tokenstream in filter chain
   * @param commonWords words to be used in constructing bigrams
   * @param ignoreCase Ignore case when constructing bigrams for common words.
   */
  public CommonGramsFilter(TokenStream input, String[] commonWords,
      boolean ignoreCase) {
    super(input);
    this.commonWords = makeCommonSet(commonWords, ignoreCase);
    init();
  }

  /**
   * Initialisation hook invoked once at the end of construction. Currently a
   * no-op; it is here for a future move to the Lucene 2.9 attribute API (see
   * the StopFilter code).
   */
  public void init() {
    // Placeholder for the 2.9 attribute-based API:
    //   termAtt = (TermAttribute) addAttribute(TermAttribute.class);
    //   posIncrAtt = (PositionIncrementAttribute)
    //       addAttribute(PositionIncrementAttribute.class);
    //   typeAdd = (TypeAttribute) addAttribute(TypeAttribute.class);
  }

  /**
   * Build a CharArraySet from an array of common words, appropriate for passing
   * into the CommonGramsFilter constructor. This permits this commonWords
   * construction to be cached once when an Analyzer is constructed.
   * 
   * @param commonWords words to place in the set
   * @return a case-sensitive set containing the words
   * @see #makeCommonSet(java.lang.String[], boolean) passing false to
   *      ignoreCase
   */
  public static final CharArraySet makeCommonSet(String[] commonWords) {
    return makeCommonSet(commonWords, false);
  }

  /**
   * Build a CharArraySet from an array of common words, appropriate for passing
   * into the CommonGramsFilter constructor, case-sensitive if ignoreCase is
   * false.
   * 
   * @param commonWords words to place in the set
   * @param ignoreCase If true, all words are lower cased first.
   * @return a Set containing the words
   */
  public static final CharArraySet makeCommonSet(String[] commonWords,
      boolean ignoreCase) {
    CharArraySet commonSet = new CharArraySet(commonWords.length, ignoreCase);
    commonSet.addAll(Arrays.asList(commonWords));
    return commonSet;
  }

  /*
   * TODO: implement new lucene 2.9 API incrementToken() instead of deprecated
   * Token.next()
   * TODO: Consider adding an option to not emit unigram stopwords
   * as in CDL XTF BigramStopFilter, CommonGramsQueryFilter would need to be
   * changed to work with this.
   * TODO: Consider optimizing for the case of three
   * commongrams i.e "man of the year" normally produces 3 bigrams: "man-of",
   * "of-the", "the-year" but with proper management of positions we could
   * eliminate the middle bigram "of-the" and save a disk seek and a whole set of
   * position lookups.
   */

  /**
   * Inserts bigrams for common words into a token stream. For each input token,
   * output the token. If the token and/or the following token are in the list
   * of common words also output a bigram with position increment 0 and
   * type="gram".
   * 
   * @param token the current token from the input stream
   * @return the same token, unchanged; the unigram is always emitted
   * @throws IOException declared by the overridden method; the calls made here
   *         are assumed to be able to raise it while reading ahead
   */
  public Token process(Token token) throws IOException {
    Token next = peek(1);
    // If this is the last token just spit it out. Any commongram would have
    // been output in the previous call.
    if (next == null) {
      return token;
    }

    // If this token or the next one is common, construct a bigram with
    // type="gram" and position increment 0 and put it in the output queue.
    // It will be returned before this method gets called with a new token
    // from the input stream; see the implementation of next() in
    // BufferedTokenStream.
    if (isCommon(token) || isCommon(next)) {
      write(gramToken(token, next));
    }
    // We always return the unigram token.
    return token;
  }

  /** True if token is for a common term. */
  private boolean isCommon(Token token) {
    return commonWords != null
        && commonWords.contains(token.termBuffer(), 0, token.termLength());
  }

  /**
   * Construct a compound token whose text is the two terms joined by
   * {@link #SEPARATOR}. The result spans from the start offset of the first
   * token to the end offset of the second, has type "gram" and a position
   * increment of 0.
   */
  private Token gramToken(Token first, Token second) {
    buffer.setLength(0);
    // Copy straight from the term buffers: avoids the deprecated termText()
    // and the two temporary Strings it would allocate per bigram.
    buffer.append(first.termBuffer(), 0, first.termLength());
    buffer.append(SEPARATOR);
    buffer.append(second.termBuffer(), 0, second.termLength());
    Token result = new Token(buffer.toString(), first.startOffset(), second
        .endOffset(), GRAM_TYPE);
    result.setPositionIncrement(0);
    return result;
  }

  /**
   * Resets the superclass and then clears the bigram scratch buffer.
   */
  public void reset() throws IOException {
    super.reset();
    buffer.setLength(0);
  }
}