
com.code972.hebmorph.Tokenizer

/***************************************************************************
 *   Copyright (C) 2010-2015 by                                            *
 *      Itamar Syn-Hershko                                                 *
 *                                                                         *
 *   This program is free software; you can redistribute it and/or modify  *
 *   it under the terms of the GNU Affero General Public License           *
 *   version 3, as published by the Free Software Foundation.              *
 *                                                                         *
 *   This program is distributed in the hope that it will be useful,       *
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
 *   GNU Affero General Public License for more details.                   *
 *                                                                         *
 *   You should have received a copy of the GNU Affero General Public      *
 *   License along with this program; if not, see                          *
 *   <http://www.gnu.org/licenses/>.                                       *
 **************************************************************************/
package com.code972.hebmorph;

import com.code972.hebmorph.datastructures.DictRadix;

import java.io.IOException;
import java.io.Reader;
import java.util.HashMap;

public class Tokenizer {
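    // Token-type bit flags: nextToken() returns an OR-combination of these, so
    // callers should test membership with a mask, e.g. (type & TokenType.Hebrew) > 0.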

    public static class TokenType {
        public static int Hebrew = 1;
        public static int NonHebrew = 2;
        public static int Numeric = 4;
        public static int Mixed = 8;
        public static int Construct = 16;
        public static int Acronym = 32;
        public static int Exact = 64;
        public static int Custom = 128;
    }
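
    // Unicode variants of the Hebrew punctuation marks Geresh ('), Gershayim (")
    // and Makaf (-). nextToken() normalizes each group to the ASCII character
    // listed first in its array.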

    public static final char[] Geresh = {'\'', '\u05F3', '\u2018', '\u2019', '\u201B', '\uFF07'};
    public static final char[] Gershayim = {'\"', '\u05F4', '\u201C', '\u201D', '\u201F', '\u275E', '\uFF02'};
    public static final char[] Makaf = {'-', '\u2012', '\u2013', '\u2014', '\u2015', '\u05BE'};
    public static final char[] CharsFollowingPrefixes = HebrewUtils.concatenateCharArrays(Geresh, Gershayim, Makaf);
    public static final char[] LettersAcceptingGeresh = {'ז', 'ג', 'ץ', 'צ', 'ח'};

    private Reader input;
    private int dataLen = 0, inputOffset = 0;

    // Both fields are necessary because the tokenizer performs normalization when
    // needed, so the correct end offset cannot always be derived from the length
    // of the returned token string
    private int tokenOffset = 0, tokenLengthInSource = 0;

    public final int getOffset() {
        return tokenOffset;
    }

    public int getLengthInSource() {
        return tokenLengthInSource;
    }
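
    // A token immediately followed by this character is flagged TokenType.Exact;
    // the marker itself is consumed but not included in the returned token text
    // or in its reported length in the source.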

    private Character suffixForExactMatch = null;

    public Character getSuffixForExactMatch() {
        return suffixForExactMatch;
    }

    public void setSuffixForExactMatch(final Character suffixForExactMatch) {
        this.suffixForExactMatch = suffixForExactMatch;
    }

    private final HashMap<String, Integer> hebrewPrefixes;
    private final DictRadix<Byte> specialCases;
    private static final Byte dummyData = (byte) 0;

    public void addSpecialCase(final String token) {
        if (token.length() > TOKENIZATION_EXCEPTION_MAX_LENGTH)
            throw new IllegalArgumentException("Special tokenization rule must be at most "
                    + TOKENIZATION_EXCEPTION_MAX_LENGTH + " characters in length");

        if (token.contains(" "))
            throw new IllegalArgumentException("Special tokenization rule cannot contain spaces");

        specialCases.addNode(token, dummyData);
    }

    public void clearSpecialCases() {
        specialCases.clear();
    }

    public static boolean isLegalPrefix(final String prefix, final HashMap<String, Integer> prefixesTree) {
        return prefixesTree.containsKey(prefix);
    }

    public static boolean isLegalPrefix(final char[] prefix, int length, final HashMap<String, Integer> prefixesTree) {
        return prefixesTree.containsKey(new String(prefix, 0, length));
    }

    private static final int IO_BUFFER_SIZE = 4096;
    private char[] ioBuffer = new char[IO_BUFFER_SIZE];
    private int ioBufferIndex = 0;

    private final char[] wordBuffer = new char[DictionaryLoader.MaxWordLength];
    private byte currentTokenLength = 0;
    private int tokenType = 0;

    public Tokenizer(final Reader input, HashMap<String, Integer> prefixes) {
        this(input, prefixes, null);
    }

    public Tokenizer(final Reader input, HashMap<String, Integer> prefixes, final DictRadix<Byte> specialCases) {
        this.input = input;
        this.specialCases = specialCases != null ? specialCases : new DictRadix<Byte>(false);
        this.hebrewPrefixes = prefixes;
    }

    final static int TOKENIZATION_EXCEPTION_MAX_LENGTH = 25;
    private char[] tokenizationExceptionBuffer = new char[TOKENIZATION_EXCEPTION_MAX_LENGTH];
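
    // The isRecognizedException overloads test whether the collected characters
    // (optionally extended by a candidate next character) match a registered
    // special-case token, after skipping any legal Hebrew prefix letters. A failed
    // DictRadix lookup throws IllegalArgumentException, hence the catch below.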

    private boolean isRecognizedException(char[] prefix, byte length, char c) {
        if (length >= TOKENIZATION_EXCEPTION_MAX_LENGTH)
            return false; // custom tokenization exceptions are limited in length

        System.arraycopy(prefix, 0, tokenizationExceptionBuffer, 0, length);
        tokenizationExceptionBuffer[length] = c;
        return isRecognizedException(tokenizationExceptionBuffer, length + 1, (byte) (length + 1));
    }

    private boolean isRecognizedException(char c) {
        tokenizationExceptionBuffer[0] = c;
        return isRecognizedException(tokenizationExceptionBuffer, 1, (byte) (1));
    }

    private boolean isRecognizedException(char[] token, int tokenLen, byte length) {
        return isRecognizedException(token, tokenLen, length, false);
    }

    private boolean isRecognizedException(char[] token, int tokenLen, byte length, boolean exact) {
        int i = 0;
        while (i < tokenLen && HebrewUtils.isHebrewLetter(token[i])) {
            if (!isLegalPrefix(token, i + 1, hebrewPrefixes)) {
                i = 0;
                break;
            }
            i++;
        }

        try {
            specialCases.lookup(token, i, length - i, i, !exact);
            return true;
        } catch (IllegalArgumentException e) {
            return false;
        }
    }

    // Niqqud is deliberately not removed, so that a future analyzer extension can take
    // advantage of it; stripping it is a normalizer's job anyway
    public int nextToken(final Reference<String> tokenString) throws IOException {
        currentTokenLength = 0;
        tokenOffset = 0; // invalidate
        tokenType = 0;
        boolean avoidTryingCustom = false;
        while (true) {
            if (ioBufferIndex >= dataLen) {
                inputOffset += dataLen;
                dataLen = input.read(ioBuffer, 0, ioBuffer.length);
                if (dataLen <= 0) {
                    dataLen = 0; // so next offset += dataLen won't decrement offset
                    if ((tokenType & TokenType.Custom) > 0 && currentTokenLength > 0
                            && !isRecognizedException(wordBuffer, wordBuffer.length, currentTokenLength, true)) {

                        abortCustomToken();
                    }
                    if (currentTokenLength == 0) {
                        tokenString.ref = "";
                        tokenLengthInSource = 0;
                        tokenOffset = inputOffset;
                        return 0;
                    }
                    break; // return what we have collected so far as a token
                }
                ioBufferIndex = 0;
            }

            char c = ioBuffer[ioBufferIndex++];
            c = HebrewCharacters.collapseAlternate(c);

            boolean appendCurrentChar = false;

            if (currentTokenLength == 0) { // first char, figure out what it is
                if (HebrewUtils.isHebrewLetter(c)) {
                    if (!HebrewUtils.isFinalHebrewLetter(c)) {
                        tokenType |= TokenType.Hebrew;
                        appendCurrentChar = true;
                    }
                } else if (Character.isLetterOrDigit(c)) {
                    tokenType |= TokenType.NonHebrew;
                    if (Character.isDigit(c))
                        tokenType |= TokenType.Numeric;

                    appendCurrentChar = true;
                } else if (!avoidTryingCustom && !Character.isWhitespace(c) && isRecognizedException(c)) {
                    tokenType |= TokenType.Custom;
                    appendCurrentChar = true;
                }
                // Everything else will be ignored
            } else { // we should consume every letter or digit, and tokenize on everything else
                if (!avoidTryingCustom && (tokenType & TokenType.Custom) > 0 && !Character.isSpaceChar(c)) {
                    wordBuffer[currentTokenLength] = c;
                    if (!isRecognizedException(wordBuffer, wordBuffer.length, (byte) (currentTokenLength + 1))) {
                        // Tokenize on non-alphanumeric
                        if (!Character.isLetterOrDigit(c))
                            break;

                        tokenType &= ~TokenType.Custom;
                        avoidTryingCustom = true;
                        ioBufferIndex--;
                        if (ioBufferIndex >= currentTokenLength) {
                            ioBufferIndex -= currentTokenLength;
                            currentTokenLength = 0;
                            continue;
                        } else {
                            abortCustomToken();
                            continue;
                        }
                    }
                    appendCurrentChar = true;
                } else if (HebrewUtils.isHebrewLetter(c) || HebrewUtils.isNiqqudChar(c)) {
                    appendCurrentChar = true;
                } else if (Character.isLetterOrDigit(c)) {
                    if (tokenType == TokenType.Hebrew)
                        tokenType |= TokenType.Mixed;
                    appendCurrentChar = true;
                } else if (HebrewUtils.isOfChars(c, Gershayim)) {
                    c = '"';
                    // Tokenize if previous char wasn't part of a word
                    if (!HebrewUtils.isHebrewLetter(wordBuffer[currentTokenLength - 1]) && !HebrewUtils.isNiqqudChar(wordBuffer[currentTokenLength - 1]))
                        break;

                    // TODO: Is it possible to support cases like שה"שםעצם in the tokenizer?
                    tokenType |= TokenType.Acronym;
                    appendCurrentChar = true;
                } else if (HebrewUtils.isOfChars(c, Geresh)) {
                    c = '\'';
                    // Tokenize if previous char wasn't part of a word or another Geresh (which we handle below)
                    // and only do this for Hebrew tokens
                    if ((tokenType & TokenType.Hebrew) > 0) {
                        // TODO: Is it possible to handle cases which are similar to Merchaot - ה'חלל הפנוי' here?
                        if (!HebrewUtils.isHebrewLetter(wordBuffer[currentTokenLength - 1]) && !HebrewUtils.isNiqqudChar(wordBuffer[currentTokenLength - 1])
                                && !HebrewUtils.isOfChars(wordBuffer[currentTokenLength - 1], Geresh))
                            break;
                    }

                    appendCurrentChar = true;
                } else if (!avoidTryingCustom && !isSuffixForExactMatch(c) && !Character.isSpaceChar(c) && isRecognizedException(wordBuffer, currentTokenLength, c)) {
                    tokenType |= TokenType.Custom;
                    appendCurrentChar = true;
                } else {
                    // Flag makaf connected words as constructs
                    if (HebrewUtils.isOfChars(c, Makaf)) {
                        tokenType |= TokenType.Construct;
                        c = '-';
                        // TODO: Detect words where Makaf is used for shortening a word (א-ל, י-ם and similar), instead of tokenizing on it
                    } else if (suffixForExactMatch != null && suffixForExactMatch.equals(c)) {
                        tokenType |= TokenType.Exact;
                    }

                    // Hit a non-letter while we have buffered chars
                    break; // return them as a token
                }
            }

            if (appendCurrentChar) {
                // Consume normally
                if (currentTokenLength == 0) { // mark the start of a new token
                    tokenOffset = inputOffset + ioBufferIndex - 1;
                } else if (currentTokenLength == wordBuffer.length - 1) { // clip lengthy tokens
                    continue;
                }
                // Note that tokens longer than the word buffer (DictionaryLoader.MaxWordLength chars) get clipped.

                // Fix a common replacement of double-Geresh with Gershayim; call it Gershayim normalization if you wish
                if (HebrewUtils.isOfChars(c, Geresh)) {
                    // Guard against a Geresh appearing as the token's first character
                    if (currentTokenLength > 0 && wordBuffer[currentTokenLength - 1] == c) {
                        wordBuffer[currentTokenLength - 1] = '"';
                        tokenType |= TokenType.Acronym;
                    }
                    //					else if (isOfChars(wordBuffer[length - 1], LettersAcceptingGeresh))
                    //					{
                    //						wordBuffer[length++] = c;
                    //					}
                    else
                        wordBuffer[currentTokenLength++] = c;
                } else {
                    wordBuffer[currentTokenLength++] = c; // TODO: Normalize c
                }
            }
        }

        // Store token's actual length in source (regardless of misc normalizations)
        if (dataLen <= 0)
            tokenLengthInSource = Math.max(inputOffset - tokenOffset, 0);
        else {
            tokenLengthInSource = Math.max(inputOffset + ioBufferIndex - 1 - tokenOffset, 0);
        }

        if (HebrewUtils.isOfChars(wordBuffer[currentTokenLength - 1], Gershayim)) {
            wordBuffer[--currentTokenLength] = '\0';
            tokenLengthInSource = Math.max(tokenLengthInSource - 1, 0); // Don't include Gershayim in the offset calculation
        }
        // Geresh trimming; only try this if it isn't one-char in length (without the Geresh)
        if ((currentTokenLength > 2) && wordBuffer[currentTokenLength - 1] == '\'') {
            // All letters which this Geresh may mean something for
            if (((tokenType & TokenType.Hebrew) == 0) || !HebrewUtils.isOfChars(wordBuffer[currentTokenLength - 2], LettersAcceptingGeresh)) {
                wordBuffer[--currentTokenLength] = '\0';
                tokenLengthInSource = Math.max(tokenLengthInSource - 1, 0); // Don't include this Geresh in the offset calculation
            }
            // TODO: Support marking abbrevations (פרופ') and Hebrew's th (ת')
            // TODO: Handle ה (Hashem)
        }

        tokenString.ref = new String(wordBuffer, 0, currentTokenLength);
        return tokenType;
    }
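
    // Called when characters collected as a potential custom (special-case) token
    // turn out not to match any registered exception: compacts the leading run of
    // ordinary word characters to the front of wordBuffer (normalizing Geresh and
    // Gershayim on the way) and re-derives the token-type flags from it.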

    private void abortCustomToken() {
        int start = 0, pos = 0;
        boolean started = false;
        while (pos + start < currentTokenLength) {
            if (!started && !HebrewUtils.isHebrewLetter(wordBuffer[start]) &&
                    !HebrewUtils.isNiqqudChar(wordBuffer[start]) && !Character.isLetterOrDigit(wordBuffer[start])) {
                start++;
                continue;
            }

            started = true;

            char c = wordBuffer[pos + start];
            if (HebrewUtils.isHebrewLetter(c) || HebrewUtils.isNiqqudChar(c)) {
                tokenType |= TokenType.Hebrew;
            } else if (Character.isLetterOrDigit(c)) {
                if (tokenType == TokenType.Hebrew)
                    tokenType |= TokenType.Mixed;
                else
                    tokenType |= TokenType.NonHebrew;
            } else if (HebrewUtils.isOfChars(c, Gershayim)) {
                c = '"';
                tokenType |= TokenType.Acronym;
            } else if (HebrewUtils.isOfChars(c, Geresh)) {
                c = '\'';
            } else {
                break;
            }

            wordBuffer[pos] = c;
            pos++;
        }

        currentTokenLength = (byte) (pos);
    }

    private boolean isSuffixForExactMatch(char c) {
        if (suffixForExactMatch == null)
            return false;
        return c == suffixForExactMatch;
    }

    public final void reset(final Reader _input) {
        input = _input;
        inputOffset = 0;
        dataLen = 0;
        ioBufferIndex = 0;
        tokenOffset = 0;
        tokenLengthInSource = 0;
        currentTokenLength = 0;
        tokenType = 0;
    }
}
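
For reference, a minimal usage sketch (not part of the library source): it feeds a
Reader to the Tokenizer and iterates nextToken() until it returns 0. The sketch
assumes Reference<String> exposes a public single-argument constructor and a public
ref field, as the assignments in the listing suggest, and it passes an empty prefix
map purely to stay self-contained; real callers would supply a populated Hebrew-prefix
map.

import com.code972.hebmorph.Reference;
import com.code972.hebmorph.Tokenizer;

import java.io.IOException;
import java.io.StringReader;
import java.util.HashMap;

public class TokenizerDemo {
    public static void main(String[] args) throws IOException {
        // An empty prefix map keeps the sketch self-contained; a real setup would
        // pass the Hebrew prefixes the surrounding analyzer was loaded with.
        Tokenizer tokenizer = new Tokenizer(new StringReader("שלום עולם, hello 42"),
                new HashMap<String, Integer>());

        Reference<String> token = new Reference<String>(""); // assumed constructor
        int tokenType;
        while ((tokenType = tokenizer.nextToken(token)) != 0) { // 0 == end of input
            System.out.printf("%s type=%d offset=%d lengthInSource=%d%n",
                    token.ref, tokenType, tokenizer.getOffset(), tokenizer.getLengthInSource());
        }
    }
}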



