All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.solr.analysis.CommonGramsFilter Maven / Gradle / Ivy

The newest version!
/*
 * Licensed under the Apache License, 
 * Version 2.0 (the "License"); you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
 * Unless required by applicable law or agreed to in writing, software distributed under the License 
 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
 * See the License for the specific language governing permissions and limitations under the License. 
 */

package org.apache.solr.analysis;

import java.io.IOException;
import java.util.Arrays;
import java.util.Set;

import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;

/*
 * TODO: Rewrite to use new TokenStream api from lucene 2.9 when BufferedTokenStream uses it.
 * TODO: Consider implementing https://issues.apache.org/jira/browse/LUCENE-1688 changes to stop list and
 * associated constructors 
 */

/**
 * Construct bigrams for frequently occurring terms while indexing. Single terms
 * are still indexed too, with bigrams overlaid. This is achieved through the
 * use of {@link Token#setPositionIncrement(int)}. Bigrams have a type
 * of "gram" Example
 * 
    *
  • input:"the quick brown fox"
  • *
  • output:|"the","the-quick"|"brown"|"fox"|
  • *
  • "the-quick" has a position increment of 0 so it is in the same position * as "the" "the-quick" has a term.type() of "gram"
  • * *
*/ /* * Constructors and makeCommonSet based on similar code in StopFilter */ public class CommonGramsFilter extends BufferedTokenStream { private static final char SEPARATOR = '_'; private final CharArraySet commonWords; private StringBuilder buffer = new StringBuilder(); /** * Construct a token stream filtering the given input using a Set of common * words to create bigrams. Outputs both unigrams with position increment and * bigrams with position increment 0 type=gram where one or both of the words * in a potential bigram are in the set of common words . * * @param input TokenStream input in filter chain * @param commonWords The set of common words. * */ public CommonGramsFilter(TokenStream input, Set commonWords) { this(input, commonWords, false); } /** * Construct a token stream filtering the given input using a Set of common * words to create bigrams, case-sensitive if ignoreCase is false (unless Set * is CharArraySet). If commonWords is an instance of * {@link CharArraySet} (true if makeCommonSet() was used to * construct the set) it will be directly used and ignoreCase * will be ignored since CharArraySet directly controls case * sensitivity. *

* If commonWords is not an instance of {@link CharArraySet}, a * new CharArraySet will be constructed and ignoreCase will be * used to specify the case sensitivity of that set. * * @param input TokenStream input in filter chain. * @param commonWords The set of common words. * @param ignoreCase -Ignore case when constructing bigrams for common words. */ public CommonGramsFilter(TokenStream input, Set commonWords, boolean ignoreCase) { super(input); if (commonWords instanceof CharArraySet) { this.commonWords = (CharArraySet) commonWords; } else { this.commonWords = new CharArraySet(commonWords.size(), ignoreCase); this.commonWords.addAll(commonWords); } init(); } /** * Construct a token stream filtering the given input using an Array of common * words to create bigrams. * * @param input Tokenstream in filter chain * @param commonWords words to be used in constructing bigrams */ public CommonGramsFilter(TokenStream input, String[] commonWords) { this(input, commonWords, false); init(); } /** * Construct a token stream filtering the given input using an Array of common * words to create bigrams and is case-sensitive if ignoreCase is false. * * @param input Tokenstream in filter chain * @param commonWords words to be used in constructing bigrams * @param ignoreCase -Ignore case when constructing bigrams for common words. */ public CommonGramsFilter(TokenStream input, String[] commonWords, boolean ignoreCase) { super(input); this.commonWords = (CharArraySet) makeCommonSet(commonWords, ignoreCase); init(); } // Here for future moving to 2.9 api See StopFilter code public void init() { /** * termAtt = (TermAttribute) addAttribute(TermAttribute.class); posIncrAtt * =(PositionIncrementAttribute) * addAttribute(PositionIncrementAttribute.class); typeAdd =(TypeAttribute) * addAttribute(TypeAttribute.class); */ } /** * Build a CharArraySet from an array of common words, appropriate for passing * into the CommonGramsFilter constructor. This permits this commonWords * construction to be cached once when an Analyzer is constructed. * * @see #makeCommonSet(java.lang.String[], boolean) passing false to * ignoreCase */ public static final CharArraySet makeCommonSet(String[] commonWords) { return makeCommonSet(commonWords, false); } /** * Build a CharArraySet from an array of common words, appropriate for passing * into the CommonGramsFilter constructor,case-sensitive if ignoreCase is * false. * * @param commonWords * @param ignoreCase If true, all words are lower cased first. * @return a Set containing the words */ public static final CharArraySet makeCommonSet(String[] commonWords, boolean ignoreCase) { CharArraySet commonSet = new CharArraySet(commonWords.length, ignoreCase); commonSet.addAll(Arrays.asList(commonWords)); return commonSet; } /** * Inserts bigrams for common words into a token stream. For each input token, * output the token. If the token and/or the following token are in the list * of common words also output a bigram with position increment 0 and * type="gram" */ /* * TODO: implement new lucene 2.9 API incrementToken() instead of deprecated * Token.next() TODO:Consider adding an option to not emit unigram stopwords * as in CDL XTF BigramStopFilter, CommonGramsQueryFilter would need to be * changed to work with this. TODO: Consider optimizing for the case of three * commongrams i.e "man of the year" normally produces 3 bigrams: "man-of", * "of-the", "the-year" but with proper management of positions we could * eliminate the middle bigram "of-the"and save a disk seek and a whole set of * position lookups. */ public Token process(Token token) throws IOException { Token next = peek(1); // if this is the last token just spit it out. Any commongram would have // been output in the previous call if (next == null) { return token; } /** * if this token or next are common then construct a bigram with type="gram" * position increment = 0, and put it in the output queue. It will be * returned when super.next() is called, before this method gets called with * a new token from the input stream See implementation of next() in * BufferedTokenStream */ if (isCommon(token) || isCommon(next)) { Token gram = gramToken(token, next); write(gram); } // we always return the unigram token return token; } /** True if token is for a common term. */ private boolean isCommon(Token token) { return commonWords != null && commonWords.contains(token.termBuffer(), 0, token.termLength()); } /** Construct a compound token. */ private Token gramToken(Token first, Token second) { buffer.setLength(0); buffer.append(first.termText()); buffer.append(SEPARATOR); buffer.append(second.termText()); Token result = new Token(buffer.toString(), first.startOffset(), second .endOffset(), "gram"); result.setPositionIncrement(0); return result; } public void reset() throws IOException { super.reset(); buffer.setLength(0); } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy