// org.apache.solr.analysis.WordDelimiterIterator Maven / Gradle / Ivy
//
// Go to download
// original code from Apache Solr - ported to work with Lucene 3.x and reformatted to Search coding style
package org.apache.solr.analysis;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import static org.apache.solr.analysis.WordDelimiterFilter.ALPHA;
import static org.apache.solr.analysis.WordDelimiterFilter.DIGIT;
import static org.apache.solr.analysis.WordDelimiterFilter.LOWER;
import static org.apache.solr.analysis.WordDelimiterFilter.SUBWORD_DELIM;
import static org.apache.solr.analysis.WordDelimiterFilter.UPPER;
import static org.apache.solr.analysis.WordDelimiterFilter.isAlpha;
import static org.apache.solr.analysis.WordDelimiterFilter.isDigit;
import static org.apache.solr.analysis.WordDelimiterFilter.isSubwordDelim;
import static org.apache.solr.analysis.WordDelimiterFilter.isUpper;

/**
 * A BreakIterator-like API for iterating over subwords in text, according to WordDelimiterFilter rules.
 * <p>
 * Usage: load a term with {@code setText}, then call {@code next()} until it returns {@link #DONE}. After every
 * call that does not return {@code DONE}, the current subword starts at {@code text[current]} (inclusive) and
 * stops at {@code text[end]} (exclusive).
 *
 * @lucene.internal
 */
public final class WordDelimiterIterator {

	/**
	 * Indicates the end of iteration
	 */
	public static final int DONE = -1;

	/**
	 * Default character type table for the code points 0-255: lower case letters are {@code LOWER}, upper case
	 * letters are {@code UPPER}, digits are {@code DIGIT} and everything else is {@code SUBWORD_DELIM}.
	 * <p>
	 * The array is a single shared instance and the constructor does not copy the table it receives, so writing
	 * to this array changes the classification seen by every iterator that was created with it.
	 */
	public static final byte[] DEFAULT_WORD_DELIM_TABLE;

	/**
	 * Text being iterated over; this is the very array handed to {@code setText}, not a copy
	 */
	char[] text;
	/**
	 * Length of the text, as handed to {@code setText}
	 */
	int length;

	/**
	 * start position of text, excluding leading delimiters
	 */
	int startBounds;
	/**
	 * end position of text, excluding trailing delimiters
	 */
	int endBounds;

	/**
	 * Beginning of subword
	 */
	int current;
	/**
	 * End of subword (exclusive), or {@link #DONE} once the iteration is finished
	 */
	int end;

	/**
	 * does this string end with a possessive such as 's
	 */
	private boolean hasFinalPossessive = false;

	/**
	 * If false, causes case changes to be ignored (subwords will only be generated
	 * given SUBWORD_DELIM tokens).
	 */
	final boolean splitOnCaseChange;

	/**
	 * If false, causes numeric changes to be ignored (subwords will only be generated
	 * given SUBWORD_DELIM tokens).
	 */
	final boolean splitOnNumerics;

	/**
	 * If true, causes trailing "'s" to be removed for each subword.
	 * <p>
	 * {@code "O'Neil's" => "O", "Neil"}
	 */
	final boolean stemEnglishPossessive;

	/**
	 * Lookup table from character value to character type; characters beyond its length are classified by
	 * {@link #getType(int)} instead
	 */
	private final byte[] charTypeTable;

	/**
	 * if true, need to skip over a possessive found in the last call to next()
	 */
	private boolean skipPossessive = false;

	// TODO: should there be a WORD_DELIM category for chars that only separate words (no catenation of subwords will be
	// done if separated by these chars?) "," would be an obvious candidate...

	static {
		byte[] tab = new byte[256];
		for ( int i = 0; i < 256; i++ ) {
			byte code = 0;
			if ( Character.isLowerCase( i ) ) {
				code |= LOWER;
			}
			else if ( Character.isUpperCase( i ) ) {
				code |= UPPER;
			}
			else if ( Character.isDigit( i ) ) {
				code |= DIGIT;
			}
			// anything that is neither a letter with case nor a digit separates subwords
			if ( code == 0 ) {
				code = SUBWORD_DELIM;
			}
			tab[i] = code;
		}
		DEFAULT_WORD_DELIM_TABLE = tab;
	}

	/**
	 * Create a new WordDelimiterIterator operating with the supplied rules.
	 *
	 * @param charTypeTable table containing character types; the array is kept by reference, not copied
	 * @param splitOnCaseChange if true, causes "PowerShot" to be two tokens; ("Power-Shot" remains two parts regardless)
	 * @param splitOnNumerics if true, causes "j2se" to be three tokens; "j" "2" "se"
	 * @param stemEnglishPossessive if true, causes trailing "'s" to be removed for each subword:
	 * {@code "O'Neil's" => "O", "Neil"}
	 */
	WordDelimiterIterator(byte[] charTypeTable, boolean splitOnCaseChange, boolean splitOnNumerics, boolean stemEnglishPossessive) {
		this.charTypeTable = charTypeTable;
		this.splitOnCaseChange = splitOnCaseChange;
		this.splitOnNumerics = splitOnNumerics;
		this.stemEnglishPossessive = stemEnglishPossessive;
	}

	/**
	 * Advance to the next subword in the string. On success {@code current} and {@code end} delimit the subword
	 * that was found.
	 *
	 * @return the end position (exclusive) of the subword just found, or {@link #DONE} if all subwords have been
	 * returned
	 */
	int next() {
		// resume right after the previous subword
		current = end;
		if ( current == DONE ) {
			return DONE;
		}

		if ( skipPossessive ) {
			// step over the two characters of the "'s" detected behind the previous subword
			current += 2;
			skipPossessive = false;
		}

		int lastType = 0;

		// skip delimiters; when the loop stops on a non-delimiter, lastType holds that character's type
		while ( current < endBounds && ( isSubwordDelim( lastType = charType( text[current] ) ) ) ) {
			current++;
		}

		if ( current >= endBounds ) {
			return end = DONE;
		}

		// grow the subword until the transition between two neighbouring character types counts as a break
		for ( end = current + 1; end < endBounds; end++ ) {
			int type = charType( text[end] );
			if ( isBreak( lastType, type ) ) {
				break;
			}
			lastType = type;
		}

		// the first condition guarantees end + 2 <= endBounds, which keeps the possessive check in bounds
		if ( end < endBounds - 1 && endsWithPossessive( end + 2 ) ) {
			skipPossessive = true;
		}

		return end;
	}


	/**
	 * Return the type of the current subword.
	 * This currently uses the type of the first character in the subword.
	 *
	 * @return type of the current word ({@code ALPHA} for both lower and upper case letters), or {@code 0} when
	 * there is no current subword
	 */
	int type() {
		// No current subword: either the iteration is finished, or nothing but delimiters (or nothing at all) lies
		// between the bounds. The second check keeps text[current] from being read at or past the end of the word,
		// which would otherwise happen when type() is called before next() on an empty or all-delimiter text.
		if ( end == DONE || current >= endBounds ) {
			return 0;
		}

		int type = charType( text[current] );
		switch ( type ) {
			// return ALPHA word type for both lower and upper
			case LOWER:
			case UPPER:
				return ALPHA;
			default:
				return type;
		}
	}

	/**
	 * Reset the text to a new value, and reset all state
	 *
	 * @param text New text; the array is used as is, not copied
	 * @param length length of the text
	 */
	void setText(char[] text, int length) {
		this.text = text;
		this.length = this.endBounds = length;
		current = startBounds = end = 0;
		skipPossessive = hasFinalPossessive = false;
		setBounds();
	}

	// ================================================= Helper Methods ================================================

	/**
	 * Determines whether the transition from lastType to type indicates a break
	 *
	 * @param lastType Last subword type
	 * @param type Current subword type
	 *
	 * @return {@code true} if the transition indicates a break, {@code false} otherwise
	 */
	private boolean isBreak(int lastType, int type) {
		// both types have at least one bit in common: never a break
		if ( ( type & lastType ) != 0 ) {
			return false;
		}

		// ALPHA->ALPHA: always ignore if case isn't considered.
		if ( !splitOnCaseChange && isAlpha( lastType ) && isAlpha( type ) ) {
			return false;
		}

		// UPPER->letter: Don't split
		if ( isUpper( lastType ) && isAlpha( type ) ) {
			return false;
		}

		// ALPHA->NUMERIC, NUMERIC->ALPHA: Don't split when numeric changes are ignored
		if ( !splitOnNumerics
				&& ( ( isAlpha( lastType ) && isDigit( type ) ) || ( isDigit( lastType ) && isAlpha( type ) ) ) ) {
			return false;
		}

		return true;
	}

	/**
	 * Determines if the current word contains only one subword.  Note, it could be potentially surrounded by delimiters
	 *
	 * @return {@code true} if the current word contains only one subword, {@code false} otherwise
	 */
	boolean isSingleWord() {
		if ( hasFinalPossessive ) {
			// the trailing "'s" (two characters) is not part of the subword
			return current == startBounds && end == endBounds - 2;
		}
		else {
			return current == startBounds && end == endBounds;
		}
	}

	/**
	 * Set the internal word bounds (remove leading and trailing delimiters). Note, if a possessive is found, don't remove
	 * it yet, simply note it.
	 */
	private void setBounds() {
		while ( startBounds < length && ( isSubwordDelim( charType( text[startBounds] ) ) ) ) {
			startBounds++;
		}

		while ( endBounds > startBounds && ( isSubwordDelim( charType( text[endBounds - 1] ) ) ) ) {
			endBounds--;
		}
		if ( endsWithPossessive( endBounds ) ) {
			hasFinalPossessive = true;
		}
		current = startBounds;
	}

	/**
	 * Determines if the text at the given position indicates an English possessive which should be removed.
	 * <p>
	 * That is the case when possessive stemming is enabled, the two characters in front of {@code pos} are
	 * {@code 's} or {@code 'S}, the character in front of those is alphabetic, and {@code pos} is either the end of
	 * the word bounds or a subword delimiter.
	 *
	 * @param pos Position in the text to check if it indicates an English possessive; this is the position right
	 * behind the candidate "'s"
	 *
	 * @return {@code true} if the text at the position indicates an English possessive, {@code false} otherwise
	 */
	private boolean endsWithPossessive(int pos) {
		return ( stemEnglishPossessive &&
				pos > 2 &&
				text[pos - 2] == '\'' &&
				( text[pos - 1] == 's' || text[pos - 1] == 'S' ) &&
				isAlpha( charType( text[pos - 3] ) ) &&
				( pos == endBounds || isSubwordDelim( charType( text[pos] ) ) ) );
	}

	/**
	 * Determines the type of the given character: looked up in the character type table when the table covers the
	 * character, computed by {@link #getType(int)} otherwise.
	 *
	 * @param ch Character whose type is to be determined
	 *
	 * @return Type of the character
	 */
	private int charType(int ch) {
		if ( ch < charTypeTable.length ) {
			return charTypeTable[ch];
		}
		return getType( ch );
	}

	/**
	 * Computes the type of the given character from its Unicode general category, as reported by
	 * {@link Character#getType(int)}
	 *
	 * @param ch Character whose type is to be determined
	 *
	 * @return Type of the character
	 */
	public static byte getType(int ch) {
		switch ( Character.getType( ch ) ) {
			case Character.UPPERCASE_LETTER:
				return UPPER;
			case Character.LOWERCASE_LETTER:
				return LOWER;

			case Character.TITLECASE_LETTER:
			case Character.MODIFIER_LETTER:
			case Character.OTHER_LETTER:
			case Character.NON_SPACING_MARK:
			case Character.ENCLOSING_MARK:  // depends what it encloses?
			case Character.COMBINING_SPACING_MARK:
				return ALPHA;

			case Character.DECIMAL_DIGIT_NUMBER:
			case Character.LETTER_NUMBER:
			case Character.OTHER_NUMBER:
				return DIGIT;

			// The categories listed in comments below are deliberately left to the default branch.

			// case Character.SPACE_SEPARATOR:
			// case Character.LINE_SEPARATOR:
			// case Character.PARAGRAPH_SEPARATOR:
			// case Character.CONTROL:
			// case Character.FORMAT:
			// case Character.PRIVATE_USE:

			case Character.SURROGATE:  // prevent splitting
				return ALPHA | DIGIT;

			// case Character.DASH_PUNCTUATION:
			// case Character.START_PUNCTUATION:
			// case Character.END_PUNCTUATION:
			// case Character.CONNECTOR_PUNCTUATION:
			// case Character.OTHER_PUNCTUATION:
			// case Character.MATH_SYMBOL:
			// case Character.CURRENCY_SYMBOL:
			// case Character.MODIFIER_SYMBOL:
			// case Character.OTHER_SYMBOL:
			// case Character.INITIAL_QUOTE_PUNCTUATION:
			// case Character.FINAL_QUOTE_PUNCTUATION:

			default:
				return SUBWORD_DELIM;
		}
	}
}