All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.solr.analysis.WordDelimiterIterator Maven / Gradle / Ivy

// original code from Apache Solr - ported to work with Lucene 3.x and reformatted to Search coding style
package org.apache.solr.analysis;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import static org.apache.solr.analysis.WordDelimiterFilter.ALPHA;
import static org.apache.solr.analysis.WordDelimiterFilter.DIGIT;
import static org.apache.solr.analysis.WordDelimiterFilter.LOWER;
import static org.apache.solr.analysis.WordDelimiterFilter.SUBWORD_DELIM;
import static org.apache.solr.analysis.WordDelimiterFilter.UPPER;
import static org.apache.solr.analysis.WordDelimiterFilter.isAlpha;
import static org.apache.solr.analysis.WordDelimiterFilter.isDigit;
import static org.apache.solr.analysis.WordDelimiterFilter.isSubwordDelim;
import static org.apache.solr.analysis.WordDelimiterFilter.isUpper;

/**
 * A BreakIterator-like API for iterating over subwords in text, according to WordDelimiterFilter rules.
 *
 * @lucene.internal
 */
public final class WordDelimiterIterator {

	/**
	 * Indicates the end of iteration
	 */
	public static final int DONE = -1;

	public static final byte[] DEFAULT_WORD_DELIM_TABLE;

	char text[];
	int length;

	/**
	 * start position of text, excluding leading delimiters
	 */
	int startBounds;
	/**
	 * end position of text, excluding trailing delimiters
	 */
	int endBounds;

	/**
	 * Beginning of subword
	 */
	int current;
	/**
	 * End of subword
	 */
	int end;

	/* does this string end with a possessive such as 's */
	private boolean hasFinalPossessive = false;

	/**
	 * If false, causes case changes to be ignored (subwords will only be generated
	 * given SUBWORD_DELIM tokens). (Defaults to true)
	 */
	final boolean splitOnCaseChange;

	/**
	 * If false, causes numeric changes to be ignored (subwords will only be generated
	 * given SUBWORD_DELIM tokens). (Defaults to true)
	 */
	final boolean splitOnNumerics;

	/**
	 * If true, causes trailing "'s" to be removed for each subword. (Defaults to true)
	 * 

* "O'Neil's" => "O", "Neil" */ final boolean stemEnglishPossessive; private final byte[] charTypeTable; /** * if true, need to skip over a possessive found in the last call to next() */ private boolean skipPossessive = false; // TODO: should there be a WORD_DELIM category for chars that only separate words (no catenation of subwords will be // done if separated by these chars?) "," would be an obvious candidate... static { byte[] tab = new byte[256]; for ( int i = 0; i < 256; i++ ) { byte code = 0; if ( Character.isLowerCase( i ) ) { code |= LOWER; } else if ( Character.isUpperCase( i ) ) { code |= UPPER; } else if ( Character.isDigit( i ) ) { code |= DIGIT; } if ( code == 0 ) { code = SUBWORD_DELIM; } tab[i] = code; } DEFAULT_WORD_DELIM_TABLE = tab; } /** * Create a new WordDelimiterIterator operating with the supplied rules. * * @param charTypeTable table containing character types * @param splitOnCaseChange if true, causes "PowerShot" to be two tokens; ("Power-Shot" remains two parts regards) * @param splitOnNumerics if true, causes "j2se" to be three tokens; "j" "2" "se" * @param stemEnglishPossessive if true, causes trailing "'s" to be removed for each subword: "O'Neil's" => "O", "Neil" */ WordDelimiterIterator(byte[] charTypeTable, boolean splitOnCaseChange, boolean splitOnNumerics, boolean stemEnglishPossessive) { this.charTypeTable = charTypeTable; this.splitOnCaseChange = splitOnCaseChange; this.splitOnNumerics = splitOnNumerics; this.stemEnglishPossessive = stemEnglishPossessive; } /** * Advance to the next subword in the string. * * @return index of the next subword, or {@link #DONE} if all subwords have been returned */ int next() { current = end; if ( current == DONE ) { return DONE; } if ( skipPossessive ) { current += 2; skipPossessive = false; } int lastType = 0; while ( current < endBounds && ( isSubwordDelim( lastType = charType( text[current] ) ) ) ) { current++; } if ( current >= endBounds ) { return end = DONE; } for ( end = current + 1; end < endBounds; end++ ) { int type = charType( text[end] ); if ( isBreak( lastType, type ) ) { break; } lastType = type; } if ( end < endBounds - 1 && endsWithPossessive( end + 2 ) ) { skipPossessive = true; } return end; } /** * Return the type of the current subword. * This currently uses the type of the first character in the subword. * * @return type of the current word */ int type() { if ( end == DONE ) { return 0; } int type = charType( text[current] ); switch ( type ) { // return ALPHA word type for both lower and upper case LOWER: case UPPER: return ALPHA; default: return type; } } /** * Reset the text to a new value, and reset all state * * @param text New text * @param length length of the text */ void setText(char text[], int length) { this.text = text; this.length = this.endBounds = length; current = startBounds = end = 0; skipPossessive = hasFinalPossessive = false; setBounds(); } // ================================================= Helper Methods ================================================ /** * Determines whether the transition from lastType to type indicates a break * * @param lastType Last subword type * @param type Current subword type * * @return {@code true} if the transition indicates a break, {@code false} otherwise */ private boolean isBreak(int lastType, int type) { if ( ( type & lastType ) != 0 ) { return false; } if ( !splitOnCaseChange && isAlpha( lastType ) && isAlpha( type ) ) { // ALPHA->ALPHA: always ignore if case isn't considered. return false; } else if ( isUpper( lastType ) && isAlpha( type ) ) { // UPPER->letter: Don't split return false; } else if ( !splitOnNumerics && ( ( isAlpha( lastType ) && isDigit( type ) ) || ( isDigit( lastType ) && isAlpha( type ) ) ) ) { // ALPHA->NUMERIC, NUMERIC->ALPHA :Don't split return false; } return true; } /** * Determines if the current word contains only one subword. Note, it could be potentially surrounded by delimiters * * @return {@code true} if the current word contains only one subword, {@code false} otherwise */ boolean isSingleWord() { if ( hasFinalPossessive ) { return current == startBounds && end == endBounds - 2; } else { return current == startBounds && end == endBounds; } } /** * Set the internal word bounds (remove leading and trailing delimiters). Note, if a possessive is found, don't remove * it yet, simply note it. */ private void setBounds() { while ( startBounds < length && ( isSubwordDelim( charType( text[startBounds] ) ) ) ) { startBounds++; } while ( endBounds > startBounds && ( isSubwordDelim( charType( text[endBounds - 1] ) ) ) ) { endBounds--; } if ( endsWithPossessive( endBounds ) ) { hasFinalPossessive = true; } current = startBounds; } /** * Determines if the text at the given position indicates an English possessive which should be removed * * @param pos Position in the text to check if it indicates an English possessive * * @return {@code true} if the text at the position indicates an English posessive, {@code false} otherwise */ private boolean endsWithPossessive(int pos) { return ( stemEnglishPossessive && pos > 2 && text[pos - 2] == '\'' && ( text[pos - 1] == 's' || text[pos - 1] == 'S' ) && isAlpha( charType( text[pos - 3] ) ) && ( pos == endBounds || isSubwordDelim( charType( text[pos] ) ) ) ); } /** * Determines the type of the given character * * @param ch Character whose type is to be determined * * @return Type of the character */ private int charType(int ch) { if ( ch < charTypeTable.length ) { return charTypeTable[ch]; } return getType( ch ); } /** * Computes the type of the given character * * @param ch Character whose type is to be determined * * @return Type of the character */ public static byte getType(int ch) { switch ( Character.getType( ch ) ) { case Character.UPPERCASE_LETTER: return UPPER; case Character.LOWERCASE_LETTER: return LOWER; case Character.TITLECASE_LETTER: case Character.MODIFIER_LETTER: case Character.OTHER_LETTER: case Character.NON_SPACING_MARK: case Character.ENCLOSING_MARK: // depends what it encloses? case Character.COMBINING_SPACING_MARK: return ALPHA; case Character.DECIMAL_DIGIT_NUMBER: case Character.LETTER_NUMBER: case Character.OTHER_NUMBER: return DIGIT; // case Character.SPACE_SEPARATOR: // case Character.LINE_SEPARATOR: // case Character.PARAGRAPH_SEPARATOR: // case Character.CONTROL: // case Character.FORMAT: // case Character.PRIVATE_USE: case Character.SURROGATE: // prevent splitting return ALPHA | DIGIT; // case Character.DASH_PUNCTUATION: // case Character.START_PUNCTUATION: // case Character.END_PUNCTUATION: // case Character.CONNECTOR_PUNCTUATION: // case Character.OTHER_PUNCTUATION: // case Character.MATH_SYMBOL: // case Character.CURRENCY_SYMBOL: // case Character.MODIFIER_SYMBOL: // case Character.OTHER_SYMBOL: // case Character.INITIAL_QUOTE_PUNCTUATION: // case Character.FINAL_QUOTE_PUNCTUATION: default: return SUBWORD_DELIM; } } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy