// org.apache.lucene.analysis.miscellaneous.PatternAnalyzer Maven / Gradle / Ivy
//
// Go to download
// Show more of this group Show more artifacts with this name
// Show all versions of lucene-analyzers Show documentation
// Additional Analyzers
// There is a newer version: 3.6.2
package org.apache.lucene.analysis.miscellaneous;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.Arrays;
import java.util.Locale;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.StopAnalyzer;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.Version;

/**
 * Efficient Lucene analyzer/tokenizer that preferably operates on a String rather than a
 * {@link java.io.Reader}, that can flexibly separate text into terms via a regular expression {@link Pattern}
 * (with behaviour identical to {@link String#split(String)}),
 * and that combines the functionality of
 * {@link org.apache.lucene.analysis.LetterTokenizer},
 * {@link org.apache.lucene.analysis.LowerCaseTokenizer},
 * {@link org.apache.lucene.analysis.WhitespaceTokenizer},
 * {@link org.apache.lucene.analysis.StopFilter} into a single efficient
 * multi-purpose class.
 * 
 * <p>If you are unsure what exactly a regular expression should look like, consider 
 * prototyping by simply trying various expressions on some test texts via
 * {@link String#split(String)}. Once you are satisfied, give that regex to 
 * PatternAnalyzer. Also see the Java Regular Expression Tutorial.
 * 
 * <p>This class can be considerably faster than the "normal" Lucene tokenizers. 
 * It can also serve as a building block in a compound Lucene
 * {@link org.apache.lucene.analysis.TokenFilter} chain. For example as in this 
 * stemming example:
 * <pre>
 * PatternAnalyzer pat = ...
 * TokenStream tokenStream = new SnowballFilter(
 *     pat.tokenStream("content", "James is running round in the woods"), 
 *     "English");
 * </pre>
 *
 */
public class PatternAnalyzer extends Analyzer {
  
  /**
   * The regex {@code "\\W+"}: one or more non-word characters, where a word
   * character is {@code [a-zA-Z_0-9]} as defined by {@link Pattern}.
   * <p>
   * NOTE: when an analyzer is built with this pattern, tokenStream(String, String)
   * does not run the regex. It takes a fast path through FastStringTokenizer,
   * which treats every char for which {@code Character.isLetter(c)} is false as
   * a separator. On that path digits and underscores therefore split tokens,
   * unlike a literal {@code \W+} split.
   */
  public static final Pattern NON_WORD_PATTERN = Pattern.compile("\\W+");
  
  /**
   * The regex {@code "\\s+"}: one or more whitespace characters.
   * <p>
   * NOTE: when an analyzer is built with this pattern, tokenStream(String, String)
   * takes a fast path through FastStringTokenizer, which splits on
   * {@code Character.isWhitespace(c)} rather than evaluating the regex.
   */
  public static final Pattern WHITESPACE_PATTERN = Pattern.compile("\\s+");
  
  /**
   * Extended English stop word list backing {@link #EXTENDED_ANALYZER}; wrapped
   * as an unmodifiable set so the shared instance cannot be altered.
   * <p>
   * Every entry must be a single token without embedded or surrounding
   * whitespace: the tokenizers in this class never emit a term containing a
   * separator character, so an entry such as {@code "once one"} or
   * {@code "xother "} could never match anything.
   * <p>
   * The second constructor argument ({@code true}) is CharArraySet's
   * ignoreCase flag.
   */
  private static final CharArraySet EXTENDED_ENGLISH_STOP_WORDS =
    CharArraySet.unmodifiableSet(new CharArraySet(Arrays.asList(
      "a", "about", "above", "across", "adj", "after", "afterwards",
      "again", "against", "albeit", "all", "almost", "alone", "along",
      "already", "also", "although", "always", "among", "amongst", "an",
      "and", "another", "any", "anyhow", "anyone", "anything",
      "anywhere", "are", "around", "as", "at", "be", "became", "because",
      "become", "becomes", "becoming", "been", "before", "beforehand",
      "behind", "being", "below", "beside", "besides", "between",
      "beyond", "both", "but", "by", "can", "cannot", "co", "could",
      "down", "during", "each", "eg", "either", "else", "elsewhere",
      "enough", "etc", "even", "ever", "every", "everyone", "everything",
      "everywhere", "except", "few", "first", "for", "former",
      "formerly", "from", "further", "had", "has", "have", "he", "hence",
      "her", "here", "hereafter", "hereby", "herein", "hereupon", "hers",
      "herself", "him", "himself", "his", "how", "however", "i", "ie", "if",
      "in", "inc", "indeed", "into", "is", "it", "its", "itself", "last",
      "latter", "latterly", "least", "less", "ltd", "many", "may", "me",
      "meanwhile", "might", "more", "moreover", "most", "mostly", "much",
      "must", "my", "myself", "namely", "neither", "never",
      "nevertheless", "next", "no", "nobody", "none", "noone", "nor",
      "not", "nothing", "now", "nowhere", "of", "off", "often", "on",
      "once", "one", "only", "onto", "or", "other", "others", "otherwise",
      "our", "ours", "ourselves", "out", "over", "own", "per", "perhaps",
      "rather", "s", "same", "seem", "seemed", "seeming", "seems",
      "several", "she", "should", "since", "so", "some", "somehow",
      "someone", "something", "sometime", "sometimes", "somewhere",
      "still", "such", "t", "than", "that", "the", "their", "them",
      "themselves", "then", "thence", "there", "thereafter", "thereby",
      "therefor", "therein", "thereupon", "these", "they", "this",
      "those", "though", "through", "throughout", "thru", "thus", "to",
      "together", "too", "toward", "towards", "under", "until", "up",
      "upon", "us", "very", "via", "was", "we", "well", "were", "what",
      "whatever", "whatsoever", "when", "whence", "whenever",
      "whensoever", "where", "whereafter", "whereas", "whereat",
      "whereby", "wherefrom", "wherein", "whereinto", "whereof",
      "whereon", "whereto", "whereunto", "whereupon", "wherever",
      "wherewith", "whether", "which", "whichever", "whichsoever",
      "while", "whilst", "whither", "who", "whoever", "whole", "whom",
      "whomever", "whomsoever", "whose", "whosoever", "why", "will",
      "with", "within", "without", "would", "xsubj", "xcal", "xauthor",
      "xother", "xnote", "yet", "you", "your", "yours", "yourself",
      "yourselves"
    ), true));
    
  /**
   * A lower-casing word analyzer with English stop words (can be shared
   * freely across threads without harm); global per class loader.
   * Built from {@link #NON_WORD_PATTERN} and
   * {@link StopAnalyzer#ENGLISH_STOP_WORDS_SET}.
   */
  public static final PatternAnalyzer DEFAULT_ANALYZER = new PatternAnalyzer(
    Version.LUCENE_CURRENT, NON_WORD_PATTERN, true, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
    
  /**
   * A lower-casing word analyzer with extended English stop words
   * (can be shared freely across threads without harm); global per class
   * loader. The stop words are borrowed from
   * http://thomas.loc.gov/home/stopwords.html, see
   * http://thomas.loc.gov/home/all.about.inquery.html
   */
  public static final PatternAnalyzer EXTENDED_ANALYZER = new PatternAnalyzer(
    Version.LUCENE_CURRENT, NON_WORD_PATTERN, true, EXTENDED_ENGLISH_STOP_WORDS);
    
  /** Token delimiter; the constructor replaces it with the shared constant when it equals one of them. */
  private final Pattern pattern;
  /** Whether emitted terms are lower-cased. */
  private final boolean toLowerCase;
  /** Terms to drop, or null for no stop-word filtering (the constructor maps an empty set to null). */
  private final Set stopWords;

  /** Passed to StopFilter.getEnablePositionIncrementsVersionDefault on the general regex path. */
  private final Version matchVersion;
  
  /**
   * Builds an analyzer that splits text wherever the given pattern matches.
   * 
   * @param matchVersion
   *            Lucene compatibility version; on the general regex path it is
   *            handed to
   *            {@link StopFilter#getEnablePositionIncrementsVersionDefault(Version)}
   *            to choose the StopFilter position-increment setting
   * @param pattern
   *            a regular expression delimiting tokens; must not be null
   * @param toLowerCase
   *            if true, tokens are lower-cased before being returned
   * @param stopWords
   *            if non-null and non-empty, tokens contained in this set are
   *            dropped (the check happens after lower-casing, when that is
   *            enabled). Such a set can for example be created via
   *            {@link StopFilter#makeStopSet(String[])} and/or
   *            {@link org.apache.lucene.analysis.WordlistLoader}, as in
   *            WordlistLoader.getWordSet(new File("samples/fulltext/stopwords.txt")),
   *            or from any other stop word list.
   * @throws IllegalArgumentException if pattern is null
   */
  public PatternAnalyzer(Version matchVersion, Pattern pattern, boolean toLowerCase, Set stopWords) {
    if (pattern == null) {
      throw new IllegalArgumentException("pattern must not be null");
    }

    // Swap in the shared constant for an equivalent pattern so that
    // tokenStream(String, String) can detect the fast paths by identity.
    Pattern canonical = pattern;
    if (eqPattern(NON_WORD_PATTERN, pattern)) {
      canonical = NON_WORD_PATTERN;
    } else if (eqPattern(WHITESPACE_PATTERN, pattern)) {
      canonical = WHITESPACE_PATTERN;
    }

    this.matchVersion = matchVersion;
    this.pattern = canonical;
    this.toLowerCase = toLowerCase;
    // An empty stop set filters nothing, so store it as "no stop set".
    this.stopWords = (stopWords == null || stopWords.size() == 0) ? null : stopWords;
  }
  
  /**
   * Tokenizes the given string into terms (aka words) and returns them as a
   * token stream.
   * 
   * @param fieldName
   *            the name of the field to tokenize (currently ignored).
   * @param text
   *            the string to tokenize; must not be null
   * @return a new token stream
   * @throws IllegalArgumentException if text is null
   */
  public TokenStream tokenStream(String fieldName, String text) {
    // Ideally the Analyzer superclass should have a method with the same signature, 
    // with a default impl that simply delegates to the StringReader flavour. 
    if (text == null) {
      throw new IllegalArgumentException("text must not be null");
    }

    // Identity checks are sufficient here: the constructor stores the shared
    // constant whenever the supplied pattern is equivalent to it.
    if (pattern == NON_WORD_PATTERN) {
      // fast path: letter-based splitting, stop words handled by the tokenizer
      return new FastStringTokenizer(text, true, toLowerCase, stopWords);
    }
    if (pattern == WHITESPACE_PATTERN) {
      // fast path: whitespace-based splitting, stop words handled by the tokenizer
      return new FastStringTokenizer(text, false, toLowerCase, stopWords);
    }

    // general path: regex tokenizer, optionally wrapped in a StopFilter
    TokenStream tokens = new PatternTokenizer(text, pattern, toLowerCase);
    if (stopWords == null) {
      return tokens;
    }
    return new StopFilter(
        StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion),
        tokens,
        stopWords);
  }
  
  /**
   * Tokenizes all the text delivered by the given Reader. The reader is
   * drained into a String and the work is handed to
   * tokenStream(String, String), which makes this variant less efficient
   * than calling tokenStream(String, String) directly.
   * 
   * @param fieldName
   *            the name of the field to tokenize (currently ignored).
   * @param reader
   *            the reader delivering the text
   * @return a new token stream
   * @throws RuntimeException wrapping any IOException raised while reading
   */
  @Override
  public TokenStream tokenStream(String fieldName, Reader reader) {
    String text;
    if (reader instanceof FastStringReader) {
      // fast path: the reader can hand over its backing string directly
      text = ((FastStringReader) reader).getString();
    } else {
      try {
        text = toString(reader);
      } catch (IOException e) {
        throw new RuntimeException(e);
      }
    }
    return tokenStream(fieldName, text);
  }
  
  /**
   * Compares this analyzer with another object. Two PatternAnalyzers are
   * equal when their lower-casing flag, their pattern (regex string and
   * flags) and their stop word set all agree.
   * 
   * @param other
   *            the reference object with which to compare.
   * @return true if equal, false otherwise
   */
  @Override
  public boolean equals(Object other) {
    if (other == this) {
      return true;
    }
    // shortcut: the two shared singletons are known to differ
    if (this == DEFAULT_ANALYZER && other == EXTENDED_ANALYZER) {
      return false;
    }
    if (other == DEFAULT_ANALYZER && this == EXTENDED_ANALYZER) {
      return false;
    }
    if (!(other instanceof PatternAnalyzer)) {
      return false;
    }

    PatternAnalyzer that = (PatternAnalyzer) other;
    if (toLowerCase != that.toLowerCase) {
      return false;
    }
    if (!eqPattern(pattern, that.pattern)) {
      return false;
    }
    return eq(stopWords, that.stopWords);
  }
  
  /**
   * Returns a hash code derived from exactly the state that
   * {@link #equals(Object)} compares: the pattern's regex string and flags,
   * the lower-casing flag and the stop word set.
   * <p>
   * The value is always computed from the fields, including for the shared
   * singletons. A hard-coded per-instance constant would not be tied to this
   * formula, so an equal analyzer built separately could hash differently.
   * 
   * @return the hash code.
   */
  @Override
  public int hashCode() {
    int h = 1;
    h = 31*h + pattern.pattern().hashCode();
    h = 31*h + pattern.flags();
    h = 31*h + (toLowerCase ? 1231 : 1237); // same constants as Boolean.hashCode()
    h = 31*h + (stopWords != null ? stopWords.hashCode() : 0);
    return h;
  }
  
  /** null-tolerant equality: true if both are the same reference (incl. both null) or o1.equals(o2) */
  private static boolean eq(Object o1, Object o2) {
    if (o1 == o2) {
      return true;
    }
    if (o1 == null) {
      return false;
    }
    return o1.equals(o2);
  }
  
  /** compares two patterns by flags and regex string; neither p1 nor p2 may be null */
  private static boolean eqPattern(Pattern p1, Pattern p2) {
    if (p1 == p2) {
      return true;
    }
    if (p1.flags() != p2.flags()) {
      return false;
    }
    return p1.pattern().equals(p2.pattern());
  }
    
  /**
   * Drains the reader until end-of-stream, returns everything that was read,
   * and closes the reader in all cases.
   * 
   * @param input the input stream
   * @return the complete contents of the reader
   * @throws IOException if an I/O error occurs while reading the stream
   */
  private static String toString(Reader input) throws IOException {
    try {
      char[] chunk = new char[256];     // target of each bulk read
      char[] collected = new char[256]; // everything read so far
      int size = 0;                     // number of valid chars in collected

      for (int read = input.read(chunk); read >= 0; read = input.read(chunk)) {
        final int needed = size + read;
        if (needed <= collected.length) {
          System.arraycopy(chunk, 0, collected, size, read);
        } else {
          // grow capacity: at least double, or as much as is needed right now
          char[] grown = new char[Math.max(collected.length << 1, needed)];
          System.arraycopy(collected, 0, grown, 0, size);
          System.arraycopy(chunk, 0, grown, size, read);
          chunk = collected; // recycle the old array as a larger read buffer
          collected = grown;
        }
        size = needed;
      }

      return new String(collected, 0, size);
    } finally {
      if (input != null) input.close();
    }
  }
  
  
  ///////////////////////////////////////////////////////////////////////////////
  // Nested classes:
  ///////////////////////////////////////////////////////////////////////////////
  /**
   * General-purpose tokenizer: the regex matches are the delimiters and the
   * text between consecutive matches becomes the tokens. Performance isn't
   * fantastic, but it's not nearly as bad as one might think - kudos to the
   * Sun regex developers.
   */
  private static final class PatternTokenizer extends TokenStream {
    
    private final String input;
    private final boolean lowerCase;
    /** Delimiter matcher; set to null once the input is exhausted. */
    private Matcher matcher;
    /** Index just past the most recent delimiter, i.e. where the next token starts. */
    private int cursor = 0;
    private static final Locale locale = Locale.getDefault();
    private TermAttribute termAtt = addAttribute(TermAttribute.class);
    private OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
    
    public PatternTokenizer(String str, Pattern pattern, boolean toLowerCase) {
      this.input = str;
      this.matcher = pattern.matcher(str);
      this.lowerCase = toLowerCase;
    }

    /**
     * Advances to the next non-empty stretch of text between delimiters.
     *
     * @return true if a token was produced, false once the input is exhausted
     */
    @Override
    public final boolean incrementToken() {
      if (matcher == null) {
        return false; // a previous call already reached the end
      }
      clearAttributes();

      // Loop because leading, trailing and adjacent delimiters yield empty
      // stretches that must be skipped.
      boolean delimiterFound;
      do {
        final int tokenStart = cursor;
        final int tokenEnd;
        delimiterFound = matcher.find();
        if (delimiterFound) {
          tokenEnd = matcher.start();
          cursor = matcher.end();
        } else {
          tokenEnd = input.length(); // the remainder of the input is the last token
          matcher = null;            // we're finished
        }

        if (tokenStart != tokenEnd) {
          String term = input.substring(tokenStart, tokenEnd);
          if (lowerCase) {
            term = term.toLowerCase(locale);
          }
          termAtt.setTermBuffer(term);
          offsetAtt.setOffset(tokenStart, tokenEnd);
          return true;
        }
      } while (delimiterFound);

      return false;
    }
    
    /** Sets the final offset to the length of the input string. */
    @Override
    public final void end() {
      final int finalOffset = input.length();
      this.offsetAtt.setOffset(finalOffset, finalOffset);
    }    
  } 
  
  
  ///////////////////////////////////////////////////////////////////////////////
  // Nested classes:
  ///////////////////////////////////////////////////////////////////////////////
  /**
   * Hand-rolled tokenizer used for the two built-in patterns. It scans the
   * string char by char instead of running a regex and drops stop words
   * itself. It exists purely for speed in the common cases; this class is
   * otherwise unnecessary.
   */
  private static final class FastStringTokenizer extends TokenStream {
    
    private final String str;
    /** Index at which the next scan starts. */
    private int pos;
    /** true: token chars are letters; false: token chars are non-whitespace. */
    private final boolean isLetter;
    private final boolean toLowerCase;
    /** Terms to skip, or null for none. */
    private final Set stopWords;
    private static final Locale locale = Locale.getDefault();
    private TermAttribute termAtt = addAttribute(TermAttribute.class);
    private OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
    
    public FastStringTokenizer(String str, boolean isLetter, boolean toLowerCase, Set stopWords) {
      this.str = str;
      this.isLetter = isLetter;
      this.toLowerCase = toLowerCase;
      this.stopWords = stopWords;
    }

    /**
     * Advances to the next token that is not a stop word.
     *
     * @return true if a token was produced, false once the input is exhausted
     */
    @Override
    public boolean incrementToken() {
      clearAttributes();
      // copy instance state into locals for the scan loops (performance)
      final String input = str;
      final int length = input.length();
      final boolean lettersOnly = isLetter;
      int cursor = pos;

      while (true) {
        // skip separator chars to find the beginning of the next token
        while (cursor < length && !isTokenChar(input.charAt(cursor), lettersOnly)) {
          cursor++;
        }
        if (cursor >= length) { // nothing left
          pos = cursor;
          return false;
        }

        // found beginning; now find end of token
        final int tokenStart = cursor;
        while (cursor < length && isTokenChar(input.charAt(cursor), lettersOnly)) {
          cursor++;
        }

        // substring + String.toLowerCase is used deliberately; see
        // http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=6265809 for the
        // JDK 1.5 toLowerCase() performance regression discussed historically.
        String term = input.substring(tokenStart, cursor);
        if (toLowerCase) {
          term = term.toLowerCase(locale);
        }

        if (!isStopWord(term)) {
          pos = cursor;
          termAtt.setTermBuffer(term);
          offsetAtt.setOffset(tokenStart, cursor);
          return true;
        }
        // stop word: keep scanning from the current position
      }
    }
    
    /** Sets the final offset to the length of the input string. */
    @Override
    public final void end() {
      final int finalOffset = str.length();
      this.offsetAtt.setOffset(finalOffset, finalOffset);
    }    
    
    /** A token char is a letter in letter mode, otherwise any non-whitespace char. */
    private boolean isTokenChar(char c, boolean isLetter) {
      if (isLetter) {
        return Character.isLetter(c);
      }
      return !Character.isWhitespace(c);
    }
    
    /** True if a stop set is configured and contains the given term. */
    private boolean isStopWord(String text) {
      if (stopWords == null) {
        return false;
      }
      return stopWords.contains(text);
    }
    
  }

  
  ///////////////////////////////////////////////////////////////////////////////
  // Nested classes:
  ///////////////////////////////////////////////////////////////////////////////
  /**
   * A StringReader that exposes its contained string for fast direct access,
   * so callers holding the reader can retrieve the text without reading it.
   * Might make sense to generalize this to CharSequence and make it public?
   */
  static final class FastStringReader extends StringReader {

    /** The string this reader was created from. */
    private final String contents;
    
    FastStringReader(String s) {
      super(s);
      contents = s;
    }
    
    /** Returns the string this reader was created from. */
    String getString() {
      return contents;
    }
  }
  
}