// org.apache.lucene.analysis.miscellaneous.PatternAnalyzer Maven / Gradle / Ivy
//
// Go to download
// Show more of this group Show more artifacts with this name
// Show all versions of lucene-analyzers Show documentation
// Additional Analyzers
// The newest version!
package org.apache.lucene.analysis.miscellaneous;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.Arrays;
import java.util.Locale;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.lucene.analysis.ReusableAnalyzerBase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.StopAnalyzer;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.util.Version;

/**
 * Efficient Lucene analyzer/tokenizer that preferably operates on a String rather than a
 * {@link java.io.Reader}, that can flexibly separate text into terms via a regular expression {@link Pattern}
 * (with behaviour identical to {@link String#split(String)}),
 * and that combines the functionality of
 * {@link org.apache.lucene.analysis.LetterTokenizer},
 * {@link org.apache.lucene.analysis.LowerCaseTokenizer},
 * {@link org.apache.lucene.analysis.WhitespaceTokenizer},
 * {@link org.apache.lucene.analysis.StopFilter} into a single efficient
 * multi-purpose class.
 * 
 * If you are unsure what exactly a regular expression should look like, consider 
 * prototyping by simply trying various expressions on some test texts via
 * {@link String#split(String)}. Once you are satisfied, give that regex to 
 * PatternAnalyzer. Also see the Java Regular Expression Tutorial.
 * 

 * This class can be considerably faster than the "normal" Lucene tokenizers. 
 * It can also serve as a building block in a compound Lucene
 * {@link org.apache.lucene.analysis.TokenFilter} chain. For example as in this 
 * stemming example:
 * 
 * PatternAnalyzer pat = ...
 * TokenStream tokenStream = new SnowballFilter(
 *     pat.tokenStream("content", "James is running round in the woods"), 
 *     "English");
 * 
 */
public final class PatternAnalyzer extends ReusableAnalyzerBase {
  
  /**
   * "\\W+"; Divides text at non-letters (NOT Character.isLetter(c)).
   * When an analyzer uses this pattern, token creation bypasses the regex and
   * scans the text with Character.isLetter(c) instead, so digits and
   * underscores act as separators even though the regex class \W would keep
   * them inside a token.
   */
  public static final Pattern NON_WORD_PATTERN = Pattern.compile("\\W+");
  
  /**
   * "\\s+"; Divides text at whitespaces (Character.isWhitespace(c)).
   * When an analyzer uses this pattern, token creation bypasses the regex and
   * scans the text with Character.isWhitespace(c) instead; for a few
   * characters that predicate may differ from the regex class \s.
   */
  public static final Pattern WHITESPACE_PATTERN = Pattern.compile("\\s+");
  
  /**
   * Extended English stop word list used by {@link #EXTENDED_ANALYZER},
   * wrapped as an unmodifiable set because the instance is shared.
   * 
   * Every entry must be a single token without whitespace: the tokenizers
   * never emit a term containing a separator, so an entry such as
   * "once one" or "xother " (trailing blank) could never match anything.
   */
  private static final CharArraySet EXTENDED_ENGLISH_STOP_WORDS =
    CharArraySet.unmodifiableSet(new CharArraySet(Version.LUCENE_CURRENT, 
        Arrays.asList(
      "a", "about", "above", "across", "adj", "after", "afterwards",
      "again", "against", "albeit", "all", "almost", "alone", "along",
      "already", "also", "although", "always", "among", "amongst", "an",
      "and", "another", "any", "anyhow", "anyone", "anything",
      "anywhere", "are", "around", "as", "at", "be", "became", "because",
      "become", "becomes", "becoming", "been", "before", "beforehand",
      "behind", "being", "below", "beside", "besides", "between",
      "beyond", "both", "but", "by", "can", "cannot", "co", "could",
      "down", "during", "each", "eg", "either", "else", "elsewhere",
      "enough", "etc", "even", "ever", "every", "everyone", "everything",
      "everywhere", "except", "few", "first", "for", "former",
      "formerly", "from", "further", "had", "has", "have", "he", "hence",
      "her", "here", "hereafter", "hereby", "herein", "hereupon", "hers",
      "herself", "him", "himself", "his", "how", "however", "i", "ie", "if",
      "in", "inc", "indeed", "into", "is", "it", "its", "itself", "last",
      "latter", "latterly", "least", "less", "ltd", "many", "may", "me",
      "meanwhile", "might", "more", "moreover", "most", "mostly", "much",
      "must", "my", "myself", "namely", "neither", "never",
      "nevertheless", "next", "no", "nobody", "none", "noone", "nor",
      "not", "nothing", "now", "nowhere", "of", "off", "often", "on",
      "once", "one", "only", "onto", "or", "other", "others", "otherwise",
      "our", "ours", "ourselves", "out", "over", "own", "per", "perhaps",
      "rather", "s", "same", "seem", "seemed", "seeming", "seems",
      "several", "she", "should", "since", "so", "some", "somehow",
      "someone", "something", "sometime", "sometimes", "somewhere",
      "still", "such", "t", "than", "that", "the", "their", "them",
      "themselves", "then", "thence", "there", "thereafter", "thereby",
      "therefor", "therein", "thereupon", "these", "they", "this",
      "those", "though", "through", "throughout", "thru", "thus", "to",
      "together", "too", "toward", "towards", "under", "until", "up",
      "upon", "us", "very", "via", "was", "we", "well", "were", "what",
      "whatever", "whatsoever", "when", "whence", "whenever",
      "whensoever", "where", "whereafter", "whereas", "whereat",
      "whereby", "wherefrom", "wherein", "whereinto", "whereof",
      "whereon", "whereto", "whereunto", "whereupon", "wherever",
      "wherewith", "whether", "which", "whichever", "whichsoever",
      "while", "whilst", "whither", "who", "whoever", "whole", "whom",
      "whomever", "whomsoever", "whose", "whosoever", "why", "will",
      "with", "within", "without", "would", "xsubj", "xcal", "xauthor",
      "xother", "xnote", "yet", "you", "your", "yours", "yourself",
      "yourselves"
    ), true));
    
  /**
   * A lower-casing word analyzer with English stop words (can be shared
   * freely across threads without harm); global per class loader.
   * Built from {@link #NON_WORD_PATTERN} and
   * {@link StopAnalyzer#ENGLISH_STOP_WORDS_SET}.
   */
  public static final PatternAnalyzer DEFAULT_ANALYZER = new PatternAnalyzer(
    Version.LUCENE_CURRENT, NON_WORD_PATTERN, true, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
    
  /**
   * A lower-casing word analyzer with extended English stop words
   * (can be shared freely across threads without harm); global per class
   * loader. Built from {@link #NON_WORD_PATTERN} and this class's private
   * extended stop word list. The stop words are borrowed from
   * http://thomas.loc.gov/home/stopwords.html, see
   * http://thomas.loc.gov/home/all.about.inquery.html
   */
  public static final PatternAnalyzer EXTENDED_ANALYZER = new PatternAnalyzer(
    Version.LUCENE_CURRENT, NON_WORD_PATTERN, true, EXTENDED_ENGLISH_STOP_WORDS);
    
  /** Regular expression that delimits tokens; never null (checked by the constructor). */
  private final Pattern pattern;
  /** Whether each token is lower-cased before it is emitted. */
  private final boolean toLowerCase;
  /** Stop words to drop, or null when no stop word filtering takes place. */
  private final Set stopWords;

  /** Lucene version handed to the StopFilter created for non-fast-path patterns. */
  private final Version matchVersion;
  
  /**
   * Constructs a new instance with the given parameters.
   * 
   * @param matchVersion currently does nothing
   * @param pattern
   *            a regular expression delimiting tokens
   * @param toLowerCase
   *            if true returns tokens after applying
   *            String.toLowerCase()
   * @param stopWords
   *            if non-null, ignores all tokens that are contained in the
   *            given stop set (after previously having applied toLowerCase()
   *            if applicable). For example, created via
   *            {@link StopFilter#makeStopSet(Version, String[])}and/or
   *            {@link org.apache.lucene.analysis.WordlistLoader}as in
   *            WordlistLoader.getWordSet(new File("samples/fulltext/stopwords.txt")
   *            or other stop words
   *            lists .
   */
  public PatternAnalyzer(Version matchVersion, Pattern pattern, boolean toLowerCase, Set stopWords) {
    if (pattern == null) 
      throw new IllegalArgumentException("pattern must not be null");
    
    if (eqPattern(NON_WORD_PATTERN, pattern)) pattern = NON_WORD_PATTERN;
    else if (eqPattern(WHITESPACE_PATTERN, pattern)) pattern = WHITESPACE_PATTERN;
    
    if (stopWords != null && stopWords.size() == 0) stopWords = null;
    
    this.pattern = pattern;
    this.toLowerCase = toLowerCase;
    this.stopWords = stopWords;
    this.matchVersion = matchVersion;
  }
  
  /**
   * Builds the tokenizer chain that splits the given string into token terms
   * (aka words).
   * 
   * @param fieldName
   *            the name of the field to tokenize (currently ignored).
   * @param reader
   *            reader (e.g. charfilter) of the original text. can be null.
   * @param text
   *            the string to tokenize
   * @return the components (tokenizer plus optional stop filter) for the text
   * @throws IllegalArgumentException if text is null
   */
  public TokenStreamComponents createComponents(String fieldName, Reader reader, String text) {
    // Ideally the Analyzer superclass should have a method with the same signature, 
    // with a default impl that simply delegates to the StringReader flavour. 
    if (text == null) {
      throw new IllegalArgumentException("text must not be null");
    }

    // Fast paths: the two shared patterns are handled by a hand-written
    // scanner that also lower-cases and drops stop words itself.
    final boolean splitOnNonLetters = (pattern == NON_WORD_PATTERN);
    if (splitOnNonLetters || pattern == WHITESPACE_PATTERN) {
      return new TokenStreamComponents(
          new FastStringTokenizer(reader, text, splitOnNonLetters, toLowerCase, stopWords));
    }

    // General path: regex tokenizer, wrapped in a stop filter only when needed.
    final Tokenizer source = new PatternTokenizer(reader, text, pattern, toLowerCase);
    if (stopWords == null) {
      return new TokenStreamComponents(source, source);
    }
    return new TokenStreamComponents(source, new StopFilter(matchVersion, source, stopWords));
  }
  
  /**
   * Builds the tokenizer chain for all the text in the given Reader.
   * This implementation first drains the reader into a String and then
   * forwards to createComponents(String, Reader, String); it is therefore
   * less efficient than calling that method with a String directly.
   * 
   * @param fieldName
   *            the name of the field to tokenize (currently ignored).
   * @param reader
   *            the reader delivering the text
   * @return the components (tokenizer plus optional stop filter) for the text
   * @throws RuntimeException wrapping any IOException raised while reading
   */
  @Override
  public TokenStreamComponents createComponents(String fieldName, Reader reader) {
    final String text;
    try {
      text = toString(reader);
    } catch (IOException e) {
      // The overridden signature declares no checked exception.
      throw new RuntimeException(e);
    }
    return createComponents(fieldName, reader, text);
  }
  
  /**
   * Compares this analyzer with another object. Two analyzers are equal when
   * they agree on the lower-casing flag, on the pattern (regex string and
   * flags) and on the stop word set.
   * 
   * @param other
   *            the reference object with which to compare.
   * @return true if equal, false otherwise
   */
  @Override
  public boolean equals(Object other) {
    if (other == this) {
      return true;
    }

    // Shortcut: the two shared singletons are treated as different
    // without comparing their stop word sets.
    if (this == DEFAULT_ANALYZER && other == EXTENDED_ANALYZER) {
      return false;
    }
    if (other == DEFAULT_ANALYZER && this == EXTENDED_ANALYZER) {
      return false;
    }

    if (!(other instanceof PatternAnalyzer)) {
      return false;
    }
    final PatternAnalyzer that = (PatternAnalyzer) other;
    if (toLowerCase != that.toLowerCase) {
      return false;
    }
    if (!eqPattern(pattern, that.pattern)) {
      return false;
    }
    return eq(stopWords, that.stopWords);
  }
  
  /**
   * Returns a hash code value for the object, derived from the same state
   * that {@link #equals(Object)} compares: the pattern's regex string and
   * flags, the lower-casing flag and the stop word set.
   * 
   * The value is always computed from that state. Returning a hard-coded
   * constant for the shared singletons would let an analyzer that is equal
   * to a singleton, but constructed separately, report a different hash
   * code, which violates the equals/hashCode contract.
   * 
   * @return the hash code.
   */
  @Override
  public int hashCode() {
    int h = 1;
    h = 31*h + pattern.pattern().hashCode();
    h = 31*h + pattern.flags();
    h = 31*h + (toLowerCase ? 1231 : 1237); // same constants as Boolean.hashCode()
    h = 31*h + (stopWords != null ? stopWords.hashCode() : 0);
    return h;
  }
  
  /**
   * Null-tolerant equality: true when both references are the same (which
   * includes both being null) or when o1 is non-null and o1.equals(o2).
   */
  private static boolean eq(Object o1, Object o2) {
    if (o1 == o2) {
      return true;
    }
    if (o1 == null) {
      return false;
    }
    return o1.equals(o2);
  }
  
  /**
   * Pattern equality by value: same instance, or same flags and same regex
   * string. Neither argument may be null.
   */
  private static boolean eqPattern(Pattern p1, Pattern p2) {
    if (p1 == p2) {
      return true;
    }
    if (p1.flags() != p2.flags()) {
      return false;
    }
    return p1.pattern().equals(p2.pattern());
  }
    
  /**
   * Reads until end-of-stream and returns all read chars, finally closes the stream.
   * A FastStringReader is not read at all (and not closed); its backing
   * string is returned directly.
   * 
   * @param input the input stream
   * @return everything the reader delivered, as one string
   * @throws IOException if an I/O error occurs while reading the stream
   */
  private static String toString(Reader input) throws IOException {
    if (input instanceof FastStringReader) { // fast path
      return ((FastStringReader) input).getString();
    }

    try {
      char[] chunk = new char[256];     // target of each bulk read
      char[] collected = new char[256]; // everything read so far
      int used = 0;                     // number of valid chars in collected

      for (int count = input.read(chunk); count >= 0; count = input.read(chunk)) {
        final int needed = used + count;
        if (needed <= collected.length) {
          System.arraycopy(chunk, 0, collected, used, count);
        } else { // grow capacity
          final char[] grown = new char[Math.max(collected.length << 1, needed)];
          System.arraycopy(collected, 0, grown, 0, used);
          System.arraycopy(chunk, 0, grown, used, count);
          chunk = collected; // use larger buffer for future larger bulk reads
          collected = grown;
        }
        used = needed;
      }

      return new String(collected, 0, used);
    } finally {
      input.close();
    }
  }
  
  
  ///////////////////////////////////////////////////////////////////////////////
  // Nested classes:
  ///////////////////////////////////////////////////////////////////////////////
  /**
   * The work horse; performance isn't fantastic, but it's not nearly as bad
   * as one might think - kudos to the Sun regex developers.
   * 
   * Emits the non-empty stretches of text between consecutive matches of the
   * delimiter pattern, optionally lower-cased.
   */
  private static final class PatternTokenizer extends Tokenizer {

    private final Pattern pattern;
    /** The complete text being tokenized. */
    private String str;
    private final boolean toLowerCase;
    /** Matcher over str; null once the text has been fully consumed. */
    private Matcher matcher;
    /** Index in str just behind the most recent delimiter match. */
    private int pos = 0;
    /** Default locale, captured once when the class is initialized; used for lower-casing. */
    private static final Locale locale = Locale.getDefault();
    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
    
    /**
     * @param input reader handed to the Tokenizer superclass
     * @param str the text to tokenize
     * @param pattern regular expression delimiting tokens
     * @param toLowerCase whether tokens are lower-cased before being emitted
     */
    public PatternTokenizer(Reader input, String str, Pattern pattern, boolean toLowerCase) {
      super(input);
      this.pattern = pattern;
      this.str = str;
      this.matcher = pattern.matcher(str);
      this.toLowerCase = toLowerCase;
    }

    /**
     * Advances to the next token.
     * 
     * @return true if a token was produced, false when the text is exhausted
     */
    @Override
    public final boolean incrementToken() {
      if (matcher == null) return false;
      clearAttributes();
      while (true) { // loop takes care of leading and trailing boundary cases
        int start = pos;
        int end;
        boolean isMatch = matcher.find();
        if (isMatch) {
          end = matcher.start();
          pos = matcher.end();
        } else { 
          // no further delimiter: the remainder of the text is the last token
          end = str.length();
          matcher = null; // we're finished
        }
        
        if (start != end) { // non-empty match (header/trailer)
          String text = str.substring(start, end);
          if (toLowerCase) text = text.toLowerCase(locale);
          termAtt.setEmpty().append(text);
          offsetAtt.setOffset(correctOffset(start), correctOffset(end));
          return true;
        }
        if (!isMatch) return false;
      }
    }
    
    /** Sets the final offset to the (corrected) end of the text. */
    @Override
    public final void end() {
      // set final offset
      final int finalOffset = correctOffset(str.length());
      this.offsetAtt.setOffset(finalOffset, finalOffset);
    }

    /**
     * Switches to the text delivered by a new reader and restarts scanning
     * from its beginning.
     */
    @Override
    public void reset(Reader input) throws IOException {
      super.reset(input);
      this.str = PatternAnalyzer.toString(input);
      this.matcher = pattern.matcher(this.str);
      // the scan position belongs to the previous text; start over
      this.pos = 0;
    }

    /**
     * Rewinds to the beginning of the current text. The matcher is recreated
     * because it is either exhausted (null) or positioned somewhere inside
     * the text; rewinding pos alone would leave the two out of sync.
     */
    @Override
    public void reset() throws IOException {
      super.reset();
      this.matcher = pattern.matcher(this.str);
      this.pos = 0;
    }
  }
  
  
  ///////////////////////////////////////////////////////////////////////////////
  // Nested classes:
  ///////////////////////////////////////////////////////////////////////////////
  /**
   * Special-case class for best performance in common cases; this class is
   * otherwise unnecessary.
   * 
   * Scans the text character by character instead of running a regex, and
   * applies lower-casing and stop word removal itself.
   */
  private static final class FastStringTokenizer extends Tokenizer {
    
    /** The complete text being tokenized. */
    private String str;
    /** Index in str where the next scan starts. */
    private int pos;
    /** true: token chars are letters; false: token chars are non-whitespace. */
    private final boolean isLetter;
    private final boolean toLowerCase;
    /** Stop words to skip, or null for none. */
    private final Set stopWords;
    /** Default locale, captured once when the class is initialized; used for lower-casing. */
    private static final Locale locale = Locale.getDefault();
    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
    
    /**
     * @param input reader handed to the Tokenizer superclass
     * @param str the text to tokenize
     * @param isLetter if true tokens are runs of letters, otherwise runs of
     *            non-whitespace characters
     * @param toLowerCase whether tokens are lower-cased before being emitted
     * @param stopWords tokens to skip (checked after lower-casing), or null
     */
    public FastStringTokenizer(Reader input, String str, boolean isLetter, boolean toLowerCase, Set stopWords) {
      super(input);
      this.str = str;
      this.isLetter = isLetter;
      this.toLowerCase = toLowerCase;
      this.stopWords = stopWords;
    }

    /**
     * Advances to the next token that is not a stop word.
     * 
     * @return true if a token was produced, false when the text is exhausted
     */
    @Override
    public boolean incrementToken() {
      clearAttributes();
      // cache loop instance vars (performance)
      String s = str;
      int len = s.length();
      int i = pos;
      boolean letter = isLetter;
      
      int start = 0;
      String text;
      do {
        // find beginning of token
        text = null;
        while (i < len && !isTokenChar(s.charAt(i), letter)) {
          i++;
        }
        
        if (i < len) { // found beginning; now find end of token
          start = i;
          while (i < len && isTokenChar(s.charAt(i), letter)) {
            i++;
          }
          
          text = s.substring(start, i);
          if (toLowerCase) text = text.toLowerCase(locale);
        }
      } while (text != null && isStopWord(text)); // skip stop words and keep scanning
      
      pos = i;
      if (text == null) {
        return false; // reached the end of the text without finding a token
      }
      termAtt.setEmpty().append(text);
      offsetAtt.setOffset(correctOffset(start), correctOffset(i));
      return true;
    }
    
    /** Sets the final offset to the (corrected) end of the text. */
    @Override
    public final void end() {
      // set final offset
      final int finalOffset = str.length();
      this.offsetAtt.setOffset(correctOffset(finalOffset), correctOffset(finalOffset));
    }    
    
    /** Tells whether c belongs to a token under the selected character class. */
    private boolean isTokenChar(char c, boolean isLetter) {
      return isLetter ? Character.isLetter(c) : !Character.isWhitespace(c);
    }
    
    /** Tells whether text is contained in the stop set; always false without a stop set. */
    private boolean isStopWord(String text) {
      return stopWords != null && stopWords.contains(text);
    }

    /**
     * Switches to the text delivered by a new reader and restarts scanning
     * from its beginning.
     */
    @Override
    public void reset(Reader input) throws IOException {
      super.reset(input);
      this.str = PatternAnalyzer.toString(input);
      // the scan position belongs to the previous text; start over
      this.pos = 0;
    }

    /** Rewinds to the beginning of the current text. */
    @Override
    public void reset() throws IOException {
      super.reset();
      this.pos = 0;
    }
  }

  
  ///////////////////////////////////////////////////////////////////////////////
  // Nested classes:
  ///////////////////////////////////////////////////////////////////////////////
  /**
   * A StringReader that also keeps a reference to the string it reads, so
   * callers can obtain the full text directly instead of draining the reader.
   * Might make sense to generalize this to CharSequence and make it public?
   */
  static final class FastStringReader extends StringReader {

    /** The string this reader was created from. */
    private final String source;

    /**
     * @param s the string to read from and to expose via getString()
     */
    FastStringReader(String s) {
      super(s);
      source = s;
    }

    /**
     * @return the string passed to the constructor, independent of how much
     *         of it has been read so far
     */
    String getString() {
      return this.source;
    }
  }
  
}