eu.dicodeproject.analysis.lucene.CleansingAnalyzer Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of integration Show documentation
The examples module provides glue code for extracting common phrases, keyword distributions, and more from tweets stored on HDFS/HBase. It builds on Mahout for more sophisticated analysis.
The newest version!
/**
 * Copyright (C) 2010, 2011 Neofonie GmbH
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package eu.dicodeproject.analysis.lucene;

import java.io.IOException;
import java.io.Reader;
import java.util.regex.Pattern;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopAnalyzer;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.Version;

/**
 * Analyzer that tokenizes with Lucene's {@link StandardTokenizer}, applies the
 * standard filter, lower-casing and English stop word removal, and additionally
 * drops tokens that are shorter than a minimum length (default two characters)
 * as well as tokens that consist of the digits 0-9 only.
 */
public final class CleansingAnalyzer extends Analyzer {
  /** Minimum length used by the no-argument constructor. */
  private static final int DEFAULT_LOWER_BOUND = 2;

  /** Min number of characters for non-skipped words. */
  private final int lowerBound;
  /** Track position increments due to skipped tokens. */
  private final boolean enablePositionIncrement;

  /** Default init: lower bound of 2, position increment tracking disabled. */
  public CleansingAnalyzer() {
    this(DEFAULT_LOWER_BOUND, false);
  }

  /**
   * @param lowerBound
   *          minimum number of characters a token must have to be kept
   * @param enablePositionIncrement
   *          whether the position increment of a kept token should also
   *          account for the tokens that were skipped in front of it
   */
  public CleansingAnalyzer(int lowerBound, boolean enablePositionIncrement) {
    this.lowerBound = lowerBound;
    this.enablePositionIncrement = enablePositionIncrement;
  }

  /**
   * Builds the analysis chain: standard tokenizer, standard filter, lower-case
   * filter, English stop word filter and finally the cleansing filter that
   * removes too short and digit-only tokens.
   *
   * @param fieldName
   *          name of the field being analyzed (not used by this analyzer)
   * @param reader
   *          source of the text to tokenize
   * @return the filtered token stream
   */
  @Override
  public TokenStream tokenStream(String fieldName, Reader reader) {
    TokenStream stream = new StandardTokenizer(Version.LUCENE_30, reader);
    stream = new StandardFilter(stream);
    stream = new LowerCaseFilter(stream);
    // Hand the position increment setting to the stop filter as well, so that
    // removed stop words are tracked the same way as tokens removed by the
    // cleansing filter. With the default (false) nothing changes.
    stream = new StopFilter(this.enablePositionIncrement, stream, StopAnalyzer.ENGLISH_STOP_WORDS_SET, true);
    return new CleansingFilter(stream, this.lowerBound, this.enablePositionIncrement);
  }

  /**
   * TokenFilter that throws out tokens of smaller than minimum length as well
   * as digit only terms.
   */
  private static final class CleansingFilter extends TokenFilter {
    /** Min number of characters a token must have to not be skipped. */
    private final int lowerBound;

    /** Whether position increment should be enabled. */
    private final boolean enablePositionIncrement;

    /** Term text of the current token. */
    private final TermAttribute termAtt = addAttribute(TermAttribute.class);
    /** Position increment of the current token. */
    private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);

    /**
     * Construct a token stream filtering the given input.
     *
     * @param input
     *          the stream to filter
     * @param lowerBound
     *          minimum number of characters a token must have to be kept
     * @param enablePositionIncrement
     *          whether skipped positions are added to the next kept token
     */
    protected CleansingFilter(TokenStream input, int lowerBound, boolean enablePositionIncrement) {
      super(input);
      this.lowerBound = lowerBound;
      this.enablePositionIncrement = enablePositionIncrement;
    }

    /**
     * Construct a token stream filtering the given input with a lower bound of
     * two characters and position increment tracking disabled.
     */
    protected CleansingFilter(TokenStream input) {
      this(input, DEFAULT_LOWER_BOUND, false);
    }

    /**
     * Combines the super class hash with the filter configuration and the
     * attribute instances.
     */
    @Override
    public int hashCode() {
      final int prime = 31;
      int result = super.hashCode();
      result = prime * result + (enablePositionIncrement ? 1231 : 1237);
      result = prime * result + lowerBound;
      result = prime * result + posIncrAtt.hashCode();
      result = prime * result + termAtt.hashCode();
      return result;
    }

    /**
     * Two filters are equal if the super class considers them equal, their
     * configuration matches and their attributes are equal.
     */
    @Override
    public boolean equals(Object obj) {
      if (this == obj) {
        return true;
      }
      if (!super.equals(obj)) {
        return false;
      }
      if (!(obj instanceof CleansingFilter)) {
        return false;
      }
      CleansingFilter other = (CleansingFilter) obj;
      return enablePositionIncrement == other.enablePositionIncrement
          && lowerBound == other.lowerBound
          && posIncrAtt.equals(other.posIncrAtt)
          && termAtt.equals(other.termAtt);
    }

    /**
     * Skip all garbage (too short, digits only) tokens. Implementation heavily
     * inspired by Lucene's StopFilter.
     *
     * @return true if a token that passes the filter is available, false once
     *         the input is exhausted
     */
    @Override
    public boolean incrementToken() throws IOException {
      int skippedPositions = 0;
      while (input.incrementToken()) {
        // Inspect the term buffer directly instead of copying it into a String
        // for every token.
        final char[] buffer = termAtt.termBuffer();
        final int length = termAtt.termLength();
        if (!isTooShort(length) && !isDigitOnly(buffer, length)) {
          if (this.enablePositionIncrement) {
            posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + skippedPositions);
          }
          return true;
        }
        skippedPositions += posIncrAtt.getPositionIncrement();
      }
      return false;
    }

    /**
     * Returns true if the token is shorter than lowerBound. A token of exactly
     * lowerBound characters is kept.
     */
    private boolean isTooShort(int length) {
      return length < this.lowerBound;
    }

    /**
     * Returns true if the first {@code length} characters of the buffer are all
     * ASCII digits 0-9. An empty token counts as digit only.
     */
    private static boolean isDigitOnly(char[] buffer, int length) {
      for (int i = 0; i < length; i++) {
        final char c = buffer[i];
        if (c < '0' || c > '9') {
          return false;
        }
      }
      return true;
    }
  }

}