edu.emory.mathcs.nlp.common.util.CharUtils Maven / Gradle / Ivy
The newest version!
/**
* Copyright 2015, Emory University
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package edu.emory.mathcs.nlp.common.util;
import edu.emory.mathcs.nlp.common.constant.CharConst;
/**
* @author Jinho D. Choi ({@code [email protected]})
*/
public class CharUtils
{
private CharUtils() {}
/**
* @param start inclusive
* @param end inclusive
*/
public static boolean isRange(char c, int start, int end)
{
return start <= c && c <= end;
}
/** {@code tIndex + sc.length <= tc.length}. */
public static boolean regionMatches(char[] source, char[] target, int sIndex)
{
if (source.length < sIndex+target.length) return false;
int i, len = target.length;
for (i=0; i=0; i--)
{
c = toUpperCase(cs[i]);
if (cs[i] != c)
{
cs[i] = c;
b = true;
}
}
return b;
}
static public char toUpperCase(char c)
{
if ((97 <= c && c <= 122) || (224 <= c && c <= 254 && c != 247))
return (char)(c-32);
if (c == 154 || c == 156 || c == 158)
return (char)(c-16);
if (c == 255)
return (char)159;
return c;
}
static public boolean toLowerCase(char[] cs)
{
boolean b = false;
char c; int i;
for (i=cs.length-1; i>=0; i--)
{
c = toLowerCase(cs[i]);
if (cs[i] != c)
{
cs[i] = c;
b = true;
}
}
return b;
}
static public char toLowerCase(char c)
{
if ((65 <= c && c <= 90) || (192 <= c && c <= 222 && c != 215))
return (char)(c+32);
if (c == 138 || c == 140 || c == 142)
return (char)(c+16);
if (c == 159)
return (char)255;
return c;
}
// ----------------------------------- Alphabet -----------------------------------
/** {@link CharUtils#isWhiteSpace(char)} */
public static boolean isWhiteSpace(char c)
{
return c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\f' || c == '\u00A0' || c == '\u2028' || c == '\u2029';
}
public static boolean isAlnum(char c)
{
return isAlphabet(c) || isDigit(c);
}
public static boolean isAlphabet(char c)
{
return isUpperCase(c) || isLowerCase(c);
}
/** About 10 times faster than {@link Character#isUpperCase(char)}. */
public static boolean isUpperCase(char c)
{
return isRange(c, 'A', 'Z');
}
/** About 10 times faster than {@link Character#isLowerCase(char)}. */
public static boolean isLowerCase(char c)
{
return isRange(c, 'a', 'z');
}
/** @return {@code true} if {@code c == 'a', 'e', 'i', 'o', 'u'}*/
public static boolean isVowel(char c)
{
return (c == 'a') || (c == 'e') || (c == 'i') || (c == 'o') || (c == 'u') || (c == 'A') || (c == 'E') || (c == 'I') || (c == 'O') || (c == 'U');
}
public static boolean isConsonant(char c)
{
return isAlphabet(c) && !isVowel(c);
}
public static boolean containsOnlyConsonants(String s)
{
return containsOnlyConsonants(s.toCharArray());
}
public static boolean containsOnlyConsonants(char[] cs)
{
for (int i=cs.length-2; i>=0; i--)
{
if (!CharUtils.isConsonant(cs[i]))
return false;
}
// Y being either a vowel or consonant is likely to be language dependent,
// while handling this correctly requires complex heuristics we can handle final 'y' this way.
char last = cs[cs.length-1];
return last != 'y' && last != 'Y' && CharUtils.isConsonant(last);
}
// ----------------------------------- Symbols -----------------------------------
/** Punctuation in the standard ASCII. */
public static boolean isPunctuation(char c)
{
return isRange(c, '!', '/') || isRange(c, ':', '@') || isRange(c, '[', '`') || isRange(c, '{', '~');
}
/** Unicode: 2000 ~ 206F. */
public static boolean isGeneralPunctuation(char c)
{
return isRange(c, '\u2000', '\u206F');
}
/** Unicode: '$', 00A2 ~ 00A5, 20A0 ~ 20CF. */
public static boolean isCurrency(char c)
{
return (c == '$') || isRange(c, '\u00A2', '\u00A5') || isRange(c, '\u20A0', '\u20CF');
}
/** Unicode: 2190 ~ 21FF, 27F0 ~ 27FF, 2900 ~ 297F. */
public static boolean isArrow(char c)
{
return isRange(c, '\u2190', '\u21FF') || isRange(c, '\u27F0', '\u27FF') || isRange(c, '\u2900', '\u297F');
}
/** Unicode: 3001 ~ 3003, 3008 ~ 301F. */
public static boolean isCJKSymbol(char c)
{
return isRange(c, '\u3001', '\u3003') || isRange(c, '\u3008', '\u301F');
}
public static boolean isHyphen(char c)
{
return c == '-' || isRange(c, '\u2010', '\u2014');
}
public static boolean isApostrophe(char c)
{
return c == '\'' || c == '\u2019';
}
public static boolean isListMark(char c)
{
return c == '-' || c == '\u2022' || c == '\u2023' || c == '\u203B' || c == '\u2043';
}
public static boolean isFinalMark(char c)
{
return c == '.' || c == '?' || c == '!' || c == '\u203C' || isRange(c, '\u2047', '\u2049');
}
public static boolean isSeparatorMark(char c)
{
return c == ',' || c == ';' || c == ':' || c == '|' || c == '/' || c == '\\';
}
public static boolean isQuotationMark(char c)
{
return isSingleQuotationMark(c) || isDoubleQuotationMark(c);
}
public static boolean isSingleQuotationMark(char c)
{
return c == '\'' || isRange(c, '\u2018', '\u201B');
}
public static boolean isDoubleQuotationMark(char c)
{
return c == '"' || isRange(c, '\u201C', '\u201F');
}
public static char generalizeSymbol(char c)
{
if (CharUtils.isCurrency(c)) return CharConst.DOLLAR;
if (CharUtils.isSingleQuotationMark(c)) return CharConst.SINGLE_QUOTE;
if (CharUtils.isDoubleQuotationMark(c)) return CharConst.DOUBLE_QUOTE;
if (CharUtils.isApostrophe(c)) return '\'';
if (CharUtils.isListMark(c) || CharUtils.isHyphen(c)) return CharConst.HYPHEN;
return c;
}
// ----------------------------------- Bracket -----------------------------------
public static boolean isBracket(char c)
{
return isLeftBracket(c) || isRightgBracket(c);
}
public static boolean isLeftBracket(char c)
{
return c == '(' || c == '{' ||c == '[' ||c == '<';
}
public static boolean isRightgBracket(char c)
{
return c == ')' || c == '}' ||c == ']' ||c == '>';
}
public static boolean isDigit(char c)
{
return 48 <= c && c <= 57;
}
public static boolean isPreDigitSymbol(char c)
{
return c == '.' || c == '-' || c == '+' || c == '\u00B1' || isRange(c, '\u2212', '\u2213') || isRange(c, '\u221A', '\u221C');
}
// ----------------------------------- Boolean: char[] -----------------------------------
public static boolean containsDigitPunctuationOnly(char[] cs, int beginIndex, int endIndex)
{
int i;
for (i=beginIndex; i
© 2015 - 2024 Weber Informatics LLC | Privacy Policy