com.aliasi.tokenizer.IndoEuropeanTokenizer Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of aliasi-lingpipe Show documentation
This is the original LingPipe distribution: http://alias-i.com/lingpipe/web/download.html No changes were made to the source code.
There is a newer version: 4.1.2-JL1.0
Show newest version
/*
 * LingPipe v. 4.1.0
 * Copyright (C) 2003-2011 Alias-i
 *
 * This program is licensed under the Alias-i Royalty Free License
 * Version 1 WITHOUT ANY WARRANTY, without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the Alias-i
 * Royalty Free License Version 1 for more details.
 *
 * You should have received a copy of the Alias-i Royalty Free License
 * Version 1 along with this program; if not, visit
 * http://alias-i.com/lingpipe/licenses/lingpipe-license-1.txt or contact
 * Alias-i, Inc. at 181 North 11th Street, Suite 401, Brooklyn, NY 11211,
 * +1 (718) 290-9170.
 */

package com.aliasi.tokenizer;

/**
 * A tokenizer over a slice of a character array that splits the
 * slice into tokens separated by optional whitespace.
 *
 * <p>Tokens are formed by the following rules, tried in order on
 * the first non-whitespace character:
 *
 * <ul>
 * <li>A maximal run of periods, of hyphens, or of equal signs is a
 *     single token (e.g. {@code "..."}, {@code "--"}).</li>
 * <li>One or two consecutive single quotes, or one or two consecutive
 *     back quotes, form a single token.</li>
 * <li>A letter starts a token extending over all following letters
 *     and digits.</li>
 * <li>A digit starts a numeric token extending over following digits;
 *     a period or comma is included only if it is immediately
 *     followed by a digit.  If a letter is met before any period or
 *     comma, the token continues as a letter/digit token.</li>
 * <li>Any other character is a single-character token.</li>
 * </ul>
 *
 * <p>Characters are tested one {@code char} at a time; a letter is
 * anything accepted by {@link Character#isLetter(char)} or in the
 * range 0x0900 to 0x097F.
 *
 * @author  Bob Carpenter
 * @version 3.8.1
 * @since   LingPipe1.0
 */
class IndoEuropeanTokenizer extends Tokenizer {

    /** Underlying characters; shared with the caller, never copied. */
    private final char[] mChars;

    /** Index one past the last character of the slice. */
    private final int mLastPosition;

    /** Index of the first character of the slice. */
    private final int mStartPosition;

    /** Index of the next character to be consumed. */
    private int mPosition;

    /** Index in {@code mChars} at which the current token begins. */
    private int mTokenStart;

    /** Slice-relative start of the last token returned, or -1. */
    private int mLastTokenStartPosition = -1;

    /** Slice-relative end (exclusive) of the last token returned, or -1. */
    private int mLastTokenEndPosition = -1;

    /**
     * Construct a tokenizer from the specified character range.  The
     * characters are not copied, so they should not be modified during
     * tokenization.
     *
     * @param ch Characters to tokenize.
     * @param offset Index of first character to tokenize.
     * @param length Number of characters to tokenize.
     * @throws IllegalArgumentException If the offset or length is
     * negative, or if the slice extends past the end of the array.
     */
    public IndoEuropeanTokenizer(char[] ch, int offset, int length) {
        // Compare against (ch.length - offset) rather than computing
        // (offset + length), which can overflow for large arguments.
        if (offset < 0 || length < 0 || length > ch.length - offset) {
            String msg = "Illegal slice."
                + " cs.length=" + ch.length
                + " offset=" + offset
                + " length=" + length;
            throw new IllegalArgumentException(msg);
        }
        mChars = ch;
        mStartPosition = offset;
        mPosition = offset;
        mLastPosition = offset + length;
        mTokenStart = -1;
    }

    /**
     * Creates a tokenizer from the specified string.
     *
     * @param chars Characters to tokenize.
     */
    public IndoEuropeanTokenizer(String chars) {
        this(chars.toCharArray(),0,chars.length());
    }

    /**
     * Create a tokenizer from the specified string builder.  The
     * contents of the builder are copied, so modifications to the
     * builder do not affect tokenization.
     *
     * @param chars String builder whose characters are tokenized.
     */
    public IndoEuropeanTokenizer(StringBuilder chars) {
        this(chars.toString());
    }

    /**
     * Returns the start position of the last token returned by
     * {@link #nextToken()}, measured from the start of the slice.
     *
     * @return Start position of the last token, or -1 if no token
     * has been returned yet.
     */
    @Override
    public int lastTokenStartPosition() {
        return mLastTokenStartPosition;
    }

    /**
     * Returns the end position (exclusive) of the last token returned
     * by {@link #nextToken()}, measured from the start of the slice.
     *
     * @return End position of the last token, or -1 if no token
     * has been returned yet.
     */
    @Override
    public int lastTokenEndPosition() {
        return mLastTokenEndPosition;
    }

    /**
     * Returns the whitespace preceding the next token, which may be
     * the empty string.  The whitespace is not consumed, so the same
     * result is returned for subsequent calls without an intervening
     * call to {@link #nextToken()}.
     *
     * @return The next whitespace.
     */
    @Override
    public String nextWhitespace() {
        // Scan ahead with a local index; nextToken() skips this same
        // whitespace itself, so the position need not move here.
        int end = mPosition;
        while (end < mLastPosition && Character.isWhitespace(mChars[end])) {
            ++end;
        }
        return new String(mChars, mPosition, end - mPosition);
    }

    /**
     * Returns the next token in the stream, or null if
     * there are no more tokens.  Flushes any whitespace that has
     * not been returned, and records the token's start and end
     * positions.
     *
     * @return The next token, or null if there are no
     * more tokens.
     */
    @Override
    public String nextToken()  {
        skipWhitespace();
        if (!hasMoreCharacters()) {
            return null;
        }
        mTokenStart = mPosition;
        char startChar = mChars[mPosition++];
        switch (startChar) {
        case '.':
        case '-':
        case '=':
            // a maximal run of the same symbol is one token
            while (currentCharEquals(startChar)) {
                ++mPosition;
            }
            return currentToken();
        case '\'':
        case '`':
            // at most two of the same quote character per token
            if (currentCharEquals(startChar)) {
                ++mPosition;
            }
            return currentToken();
        default:
            break;
        }
        if (isLetter(startChar)) {
            return alphaNumToken();
        }
        if (Character.isDigit(startChar)) {
            return numToken();
        }
        return currentToken(); // other single character symbol
    }

    /**
     * Returns a tokenized version of the specified string, as
     * produced by the inherited {@code tokenize()} method applied
     * to a new tokenizer over the string.
     *
     * @param phrase Characters to tokenize.
     * @return Array of tokens generated by characters.
     */
    public static String[] tokenize(String phrase) {
        return new IndoEuropeanTokenizer(phrase).tokenize();
    }

    /**
     * Returns true if the specified character is a
     * letter as determined by {@link Character#isLetter(char)} or is
     * a Devanagari character in the unicode range 0x0900
     * to 0x097F.
     *
     * @param c Character to test.
     * @return true if the character is a Java letter or
     * a Devanagari character.
     */
    private static boolean isLetter(char c) {
        return Character.isLetter(c) || devanagari(c);
    }

    /**
     * Returns true if the specified character is in the
     * Devanagari range, unicode 0x0900 to
     * 0x097F, inclusive.
     *
     * @param unicode Character to test.
     * @return true if the character lies in the range
     * 0x0900 to 0x097F, inclusive.
     */
    private static boolean devanagari(char unicode) {
        return (unicode >= 0x0900 && unicode <= 0x097F);
    }

    /**
     * Returns true if there are more characters
     * in the input character sequence.
     *
     * @return true if there are more characters
     * to be tokenized.
     */
    private boolean hasMoreCharacters() {
        return mPosition < mLastPosition;
    }

    /**
     * Returns the character in the underlying sequence at
     * the current position.  Callers must first check
     * {@link #hasMoreCharacters()}.
     *
     * @return The character in the underlying sequence at
     * the current position.
     */
    private char currentChar() {
        return mChars[mPosition];
    }

    /**
     * Returns true if there are more characters and the
     * current character is equal to the specified character.
     *
     * @param c Character to test.
     * @return true if the current character is equal to
     * the specified character.
     */
    private boolean currentCharEquals(char c) {
        return hasMoreCharacters() && currentChar() == c;
    }

    /**
     * Advances the position to the first character of the
     * next token, or to the end of the slice if there are
     * no more tokens.
     */
    private void skipWhitespace()  {
        while (hasMoreCharacters()
               && Character.isWhitespace(currentChar())) {
            ++mPosition;
        }
    }

    /**
     * Returns the current token as a string and records its
     * slice-relative start and end positions.  The token spans
     * from the recorded token start up to, but not including,
     * the current position.
     *
     * @return Current token as a string.
     */
    private String currentToken() {
        int length = mPosition-mTokenStart;
        mLastTokenStartPosition = mTokenStart - mStartPosition;
        mLastTokenEndPosition = mLastTokenStartPosition + length;
        return new String(mChars,mTokenStart,length);
    }

    /**
     * Completes and returns a token that begins with the previous
     * letter character, extending it over all following letters
     * and digits.
     *
     * @return Longest token extending the previous character.
     */
    private String alphaNumToken() {
        while (hasMoreCharacters()
               && (isLetter(currentChar())
                   || Character.isDigit(currentChar()))) {
            ++mPosition;
        }
        return currentToken();
    }

    /**
     * Completes and returns a token that begins with the previous
     * digit character.  Digits are consumed until a letter is found
     * (the token then continues as a letter/digit token), a period
     * or comma is found (the token then continues as a punctuated
     * number), or any other character ends the token.
     *
     * @return Token beginning at previous character.
     */
    private String numToken() {
        while (hasMoreCharacters()) {
            char c = currentChar();
            if (isLetter(c)) {
                ++mPosition;
                return alphaNumToken();
            }
            if (Character.isDigit(c)) {
                ++mPosition;
                continue;
            }
            if (c == '.' || c == ',') {
                return numPunctToken();
            }
            return currentToken();
        }
        return currentToken();
    }

    /**
     * Completes and returns a token that begins with previous
     * numbers and commas or periods.  A period or comma is only
     * included when it is immediately followed by a digit;
     * otherwise the token ends before it.
     *
     * @return Token beginning at previous character, and extending
     * to all subsequent digits and digit-followed commas and periods.
     */
    private String numPunctToken() {
        while (hasMoreCharacters()) {
            char c = currentChar();
            if (Character.isDigit(c)) {
                ++mPosition;
            } else if (c == '.' || c == ',') {
                // tentatively consume the separator, then back off
                // if it is not followed by a digit
                ++mPosition;
                if (!hasMoreCharacters() || !Character.isDigit(currentChar())) {
                    --mPosition;
                    return currentToken();
                }
            } else {
                return currentToken();
            }
        }
        return currentToken();
    }

}