All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.solr.analysis.CommonGramsQueryFilter Maven / Gradle / Ivy

The newest version!
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.analysis;

import java.io.IOException;

import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.Token;

/**
 * Wrap a CommonGramsFilter optimizing phrase queries by only returning single
 * words when they are not a member of a bigram.
 * 
 * Example:
 * 
    *
  • query input to CommonGramsFilter: "the rain in spain falls mainly" *
  • output of CommomGramsFilter/input to CommonGramsQueryFilter: * |"the, "the-rain"|"rain" "rain-in"|"in, "in-spain"|"spain"|"falls"|"mainly" *
  • output of CommonGramsQueryFilter:"the-rain", "rain-in" ,"in-spain", * "falls", "mainly" *
*/ /* * TODO: When org.apache.solr.analysis.BufferedTokenStream is changed to use the * 2.9 lucene TokenStream api, make necessary changes here. * See:http://hudson.zones * .apache.org/hudson/job/Lucene-trunk/javadoc//all/org/apache * /lucene/analysis/TokenStream.html and * http://svn.apache.org/viewvc/lucene/java * /trunk/src/java/org/apache/lucene/analysis/package.html?revision=718798 */ public class CommonGramsQueryFilter extends BufferedTokenStream { //private CharArraySet commonWords; private Token prev; /** * Constructor * * @param input must be a CommonGramsFilter! * */ public CommonGramsQueryFilter(CommonGramsFilter input) { super(input); prev = new Token(); } public void reset() throws IOException { super.reset(); prev = new Token(); } /** * Output bigrams whenever possible to optimize queries. Only output unigrams * when they are not a member of a bigram. Example: *
    *
  • input: "the rain in spain falls mainly" *
  • output:"the-rain", "rain-in" ,"in-spain", "falls", "mainly" */ public Token process(Token token) throws IOException { Token next = peek(1); /* * Deal with last token (next=null when current token is the last word) Last * token will be a unigram. If previous token was a bigram, then we already * output the last token as part of the unigram and should not additionally * output the unigram.

    Example: If the end of the input to the * CommonGramsFilter is "...the plain"

    • current token = "plain"
    • *
    • next token = null
    • previous token = "the-plain" (bigram)
    • *
    • Since the word "plain" was already output as part of the bigram we * don't output it.
    Example: If the end of the input to the * CommonGramsFilter is "falls mainly"
    • current token = * "mainly"
    • next token = null
    • previous token = "falls" * (unigram)
    • Since we haven't yet output the current token, we * output it
    */ // Deal with special case of last token if (next == null) { if (prev == null) { // This is the first and only token i.e. one word query return token; } if (prev != null && prev.type() != "gram") { // If previous token was a unigram, output the current token return token; } else { // If previous token was a bigram, we already output it and this token // was output as part of the bigram so we are done. return null; } } /* * Possible cases are: |token |next 1|word |gram 2|word |word The * CommonGramsFilter we are wrapping always outputs the unigram word prior * to outputting an optional bigram: "the sound of" gets output as |"the", * "the_sound"|"sound", "sound_of" For case 1 we consume the gram from the * input stream and output it rather than the current token This means that * the call to super.next() which reads a token from input and passes it on * to this process method will always get a token of type word */ if (next != null && next.type() == "gram") { // consume "next" token from list and output it token = read(); // use this to clone the token because clone requires all these args but // won't take the token.type // see // http://hudson.zones.apache.org/hudson/job/Lucene-trunk/javadoc//all/org/apache/lucene/analysis/Token.html prev.reinit(token.termBuffer(), 0, token.termLength(), token .startOffset(), token.endOffset(), token.type()); token.setPositionIncrement(1); return token; } // if the next token is not a bigram, then output the token // see note above regarding this method of copying token to prev prev.reinit(token.termBuffer(), 0, token.termLength(), token.startOffset(), token.endOffset(), token.type()); assert token.type() == "word"; return token; } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy