All downloads are free. The search and download functionality uses the official Maven repository.

edu.utah.bmi.nlp.fastcner.UnicodeChecker Maven / Gradle / Ivy

package edu.utah.bmi.nlp.fastcner;

import org.apache.commons.lang3.StringUtils;

/**
 * Static helpers that classify a single {@code char} (special, Chinese, punctuation, digit,
 * alphabetic) and test whether a string is formatted like a number.
 *
 * <p>All methods are stateless. Non-ASCII characters are written as Unicode escapes so the
 * class compiles the same way regardless of the source encoding passed to the compiler.
 */
public class UnicodeChecker {

    /**
     * Every character {@link #isPunctuation(char)} reports as punctuation.
     *
     * <p>NOTE(review): the hand-written comparison chain this constant replaces listed many
     * ASCII marks twice. That is what one would expect if fullwidth forms (U+FF01 and
     * following) had been folded to ASCII by a transcoding step somewhere upstream. Confirm
     * against the upstream source before adding them here.
     */
    private static final String PUNCTUATION =
            // the 32 ASCII punctuation marks
            "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"
                    // yen sign, middle dot, multiplication sign
                    + "\u00A5\u00B7\u00D7"
                    // em dash, curly single quotes, curly double quotes, horizontal ellipsis
                    + "\u2014\u2018\u2019\u201C\u201D\u2026"
                    // ideographic comma and full stop, double angle brackets, corner brackets
                    + "\u3001\u3002\u300A\u300B\u300C\u300D";

    /** First fullwidth digit, U+FF10 (decimal 65296). */
    private static final char FULLWIDTH_ZERO = '\uFF10';

    /** Last fullwidth digit, U+FF19 (decimal 65305). */
    private static final char FULLWIDTH_NINE = '\uFF19';

    /**
     * Tells whether {@code c} lies strictly between {@code '~'} (126) and U+2E80 (11904),
     * excluding the no-break space U+00A0 (160).
     *
     * @param c the character to test
     * @return {@code true} if {@code c} is in that range and is not U+00A0
     */
    public static boolean isSpecialChar(char c) {
        return c > '~' && c != '\u00A0' && c < '\u2E80';
    }

    /**
     * Tells whether {@code c} lies in U+4E00..U+9FCB (19968..40907), a range inside the
     * CJK Unified Ideographs block.
     *
     * @param c the character to test
     * @return {@code true} if {@code c} is in that range
     */
    public static boolean isChinese(char c) {
        return c >= '\u4E00' && c <= '\u9FCB';
    }

    /**
     * Tells whether {@code c} is one of the punctuation marks recognised by this class: the
     * 32 ASCII punctuation characters plus a fixed set of Latin-1, general and CJK marks.
     *
     * @param c the character to test
     * @return {@code true} if {@code c} is in the recognised set
     */
    public static boolean isPunctuation(char c) {
        return PUNCTUATION.indexOf(c) >= 0;
    }

    /**
     * Tells whether {@code c} is a digit.
     *
     * <p>{@link Character#isDigit(char)} decides for all characters; the fullwidth digits
     * U+FF10..U+FF19 are additionally accepted explicitly. (The JDK already classifies them as
     * digits, so the second test only documents the intent; its bounds used to describe an
     * empty range.)
     *
     * @param c the character to test
     * @return {@code true} if {@code c} is a digit
     */
    public static boolean isDigit(char c) {
        return Character.isDigit(c) || (c >= FULLWIDTH_ZERO && c <= FULLWIDTH_NINE);
    }

    /**
     * Tells whether {@code c} is an upper-case or lower-case letter.
     *
     * <p>This is narrower than {@link Character#isLetter(char)}: characters whose general
     * category is anything other than {@link Character#UPPERCASE_LETTER} or
     * {@link Character#LOWERCASE_LETTER} (title-case, modifier and "other" letters included)
     * yield {@code false}.
     *
     * @param c the character to test
     * @return {@code true} if the general category of {@code c} is upper- or lower-case letter
     */
    public static boolean isAlphabetic(char c) {
        final int type = Character.getType(c);
        return type == Character.UPPERCASE_LETTER || type == Character.LOWERCASE_LETTER;
    }

    /**
     * Tells whether {@code str} is formatted like a number.
     *
     * <p>Revised from {@code org.apache.commons.lang3.math.NumberUtils} to allow "093" type of
     * numbers. Accepted forms, each with an optional leading {@code '-'}:
     * <ul>
     *   <li>{@code 0x}/{@code 0X} followed by at least one hexadecimal digit;</li>
     *   <li>a leading {@code 0} followed only by the digits 0-9 (for example "093");</li>
     *   <li>decimal digits with at most one {@code '.'}, an optional exponent introduced by
     *       {@code e}/{@code E} with an optional sign, and an optional trailing type
     *       qualifier ({@code d D f F}, or {@code l L} when there is neither a decimal point
     *       nor an exponent).</li>
     * </ul>
     *
     * @param str the string to test, may be {@code null}
     * @return {@code true} if {@code str} matches one of the forms above; {@code false}
     *         otherwise, including for {@code null} and the empty string
     */
    public static boolean isNumber(final String str) {
        if (str == null || str.isEmpty()) {
            return false;
        }
        final char[] chars = str.toCharArray();
        int sz = chars.length;
        boolean hasExp = false;
        boolean hasDecPoint = false;
        boolean allowSigns = false;
        boolean foundDigit = false;
        // deal with any possible sign up front
        final int start = (chars[0] == '-') ? 1 : 0;
        if (sz > start + 1 && chars[start] == '0') { // leading 0
            if (chars[start + 1] == 'x' || chars[start + 1] == 'X') { // leading 0x/0X
                int i = start + 2;
                if (i == sz) {
                    return false; // str == "0x"
                }
                // checking hex (it can't be anything else)
                for (; i < chars.length; i++) {
                    if ((chars[i] < '0' || chars[i] > '9')
                            && (chars[i] < 'a' || chars[i] > 'f')
                            && (chars[i] < 'A' || chars[i] > 'F')) {
                        return false;
                    }
                }
                return true;
            } else if (Character.isDigit(chars[start + 1])) {
                // leading 0 followed by a digit: the rest must be plain digits 0-9
                // (8 and 9 are deliberately allowed, so "093" passes)
                int i = start + 1;
                for (; i < chars.length; i++) {
                    if (chars[i] < '0' || chars[i] > '9') {
                        return false;
                    }
                }
                return true;
            }
        }
        sz--; // don't want to loop to the last char, check it afterwards
        // for type qualifiers
        int i = start;
        // loop to the next to last char or to the last char if we need another digit to
        // make a valid number (e.g. chars[0..5] = "1234E")
        while (i < sz || (i < sz + 1 && allowSigns && !foundDigit)) {
            if (chars[i] >= '0' && chars[i] <= '9') {
                foundDigit = true;
                allowSigns = false;
            } else if (chars[i] == '.') {
                if (hasDecPoint || hasExp) {
                    // two decimal points or dec in exponent
                    return false;
                }
                hasDecPoint = true;
            } else if (chars[i] == 'e' || chars[i] == 'E') {
                // we've already taken care of hex.
                if (hasExp) {
                    // two E's
                    return false;
                }
                if (!foundDigit) {
                    return false;
                }
                hasExp = true;
                allowSigns = true;
            } else if (chars[i] == '+' || chars[i] == '-') {
                if (!allowSigns) {
                    return false;
                }
                allowSigns = false;
                foundDigit = false; // we need a digit after the E
            } else {
                return false;
            }
            i++;
        }
        if (i < chars.length) {
            if (chars[i] >= '0' && chars[i] <= '9') {
                // no type qualifier, OK
                return true;
            }
            if (chars[i] == 'e' || chars[i] == 'E') {
                // can't have an E at the last char
                return false;
            }
            if (chars[i] == '.') {
                if (hasDecPoint || hasExp) {
                    // two decimal points or dec in exponent
                    return false;
                }
                // single trailing decimal point after non-exponent is ok
                return foundDigit;
            }
            if (!allowSigns
                    && (chars[i] == 'd'
                    || chars[i] == 'D'
                    || chars[i] == 'f'
                    || chars[i] == 'F')) {
                return foundDigit;
            }
            if (chars[i] == 'l'
                    || chars[i] == 'L') {
                // not allowing L with an exponent or decimal point
                return foundDigit && !hasExp && !hasDecPoint;
            }
            // last character is illegal
            return false;
        }
        // allowSigns is true iff the val ends in 'E'
        // foundDigit makes sure weird stuff like '.' and '1E-' doesn't pass
        return !allowSigns && foundDigit;
    }

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy