All downloads are free. The search and download functionality uses the official Maven repository.

eu.dicodeproject.analysis.lucene.CleansingAnalyzer Maven / Gradle / Ivy

Go to download

The examples module provides glue-code implementations for extracting common phrases, keyword distributions and more from tweets stored on HDFS/HBase. It builds on Mahout for more sophisticated analysis.

The newest version!
/**
 * Copyright (C) 2010, 2011 Neofonie GmbH
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package eu.dicodeproject.analysis.lucene;

import java.io.IOException;
import java.io.Reader;
import java.util.regex.Pattern;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopAnalyzer;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.Version;

/**
 * In contrast to the Lucene standard analyser this one adds filtering tokens of
 * less than minimum length (default two characters) and tokens that contain
 * only digits.
 */
public final class CleansingAnalyzer extends Analyzer {
  /** Min number of characters for non-skipped words. */
  private final int lowerBound;
  /** Track position increments due to skipped tokens. */
  private final boolean enablePositionIncrement;

  /** Default init: lower bound of 2 characters, position increments not tracked. */
  public CleansingAnalyzer() {
    this(2, false);
  }

  /**
   * Creates an analyzer with an explicit minimum token length.
   *
   * @param lowerBound
   *          minimum number of characters a token must have to be kept; tokens
   *          with fewer characters are dropped
   * @param enablePositionIncrement
   *          if true, the position increment of each emitted token is increased
   *          by the increments of the tokens the cleansing filter dropped
   *          before it
   */
  public CleansingAnalyzer(int lowerBound, boolean enablePositionIncrement) {
    this.lowerBound = lowerBound;
    this.enablePositionIncrement = enablePositionIncrement;
  }

  /**
   * Builds the analysis chain: standard tokenizer, standard filter, lower
   * casing, English stop word removal and finally the cleansing filter that
   * drops too short and digit only tokens.
   *
   * @param fieldName
   *          name of the field being analysed (not used by this analyzer)
   * @param reader
   *          source of the text to tokenize
   * @return the token stream producing the cleansed tokens
   */
  @Override
  public TokenStream tokenStream(String fieldName, Reader reader) {
    TokenStream stream = new StandardTokenizer(Version.LUCENE_30, reader);
    stream = new StandardFilter(stream);
    stream = new LowerCaseFilter(stream);
    // NOTE(review): the stop filter is always created with position increments
    // disabled, independent of this.enablePositionIncrement, so removed stop
    // words never contribute to position increments - confirm this is intended.
    stream = new StopFilter(false, stream, StopAnalyzer.ENGLISH_STOP_WORDS_SET, true);
    return new CleansingFilter(stream, this.lowerBound, this.enablePositionIncrement);
  }

  /**
   * TokenFilter that throws out tokens of smaller than minimum length as well
   * as digit only terms.
   */
  private static final class CleansingFilter extends TokenFilter {
    /** Min number of characters a token must have to not be skipped. */
    private final int lowerBound;

    /** Whether position increment should be enabled. */
    private final boolean enablePositionIncrement;

    private final TermAttribute termAtt = addAttribute(TermAttribute.class);
    private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);

    /**
     * Construct a token stream filtering the given input.
     *
     * @param input
     *          the stream to filter
     * @param lowerBound
     *          minimum number of characters a token must have to be kept
     * @param enablePositionIncrement
     *          whether increments of skipped tokens are added to the next
     *          emitted token
     */
    CleansingFilter(TokenStream input, int lowerBound, boolean enablePositionIncrement) {
      super(input);
      this.lowerBound = lowerBound;
      this.enablePositionIncrement = enablePositionIncrement;
    }

    /**
     * Construct a token stream filtering the given input with a lower bound of
     * 2 and position increments disabled.
     */
    CleansingFilter(TokenStream input) {
      this(input, 2, false);
    }

    /**
     * Skip all garbage (too short, digits only) tokens. Implementation heavily
     * inspired by Lucene's StopWordFilter.
     *
     * @return true if a token that passed the checks is available, false once
     *         the input is exhausted
     * @throws IOException
     *           if the wrapped stream fails to advance
     */
    @Override
    public boolean incrementToken() throws IOException {
      int skippedPositions = 0;
      while (input.incrementToken()) {
        // Inspect the term buffer in place; no per-token String is needed.
        final char[] buffer = termAtt.termBuffer();
        final int length = termAtt.termLength();
        if (!isTooShort(length) && !isDigitOnly(buffer, length)) {
          if (this.enablePositionIncrement) {
            posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + skippedPositions);
          }
          return true;
        }
        skippedPositions += posIncrAtt.getPositionIncrement();
      }
      return false;
    }

    /** Returns true if a token of the given length is shorter than lowerBound. */
    private boolean isTooShort(int length) {
      // Strictly less than: a token of exactly lowerBound characters is kept.
      return length < this.lowerBound;
    }

    /**
     * Returns true if the first {@code length} characters of the buffer are all
     * ASCII digits ('0' to '9'). A zero length term counts as digit only.
     */
    private static boolean isDigitOnly(char[] buffer, int length) {
      for (int i = 0; i < length; i++) {
        final char c = buffer[i];
        if (c < '0' || c > '9') {
          return false;
        }
      }
      return true;
    }

    /*
     * (non-Javadoc)
     *
     * @see java.lang.Object#hashCode()
     */
    @Override
    public int hashCode() {
      final int prime = 31;
      int result = super.hashCode();
      result = prime * result + (enablePositionIncrement ? 1231 : 1237);
      result = prime * result + lowerBound;
      result = prime * result + posIncrAtt.hashCode();
      result = prime * result + termAtt.hashCode();
      return result;
    }

    /*
     * (non-Javadoc)
     *
     * Two filters are equal when the superclass check passes and both the
     * configuration (lowerBound, enablePositionIncrement) and the attribute
     * instances are equal. Only state that has value semantics is compared, so
     * two identically configured filters can actually be equal.
     *
     * @see java.lang.Object#equals(java.lang.Object)
     */
    @Override
    public boolean equals(Object obj) {
      if (this == obj) {
        return true;
      }
      if (!super.equals(obj)) {
        return false;
      }
      if (!(obj instanceof CleansingFilter)) {
        return false;
      }
      final CleansingFilter other = (CleansingFilter) obj;
      return enablePositionIncrement == other.enablePositionIncrement
          && lowerBound == other.lowerBound
          && posIncrAtt.equals(other.posIncrAtt)
          && termAtt.equals(other.termAtt);
    }
  }

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy