// opennlp.tools.util.featuregen.FeatureGeneratorUtil Maven / Gradle / Ivy
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package opennlp.tools.util.featuregen;
import java.util.regex.Pattern;
/**
 * Provides common utilities for feature generation.
 */
public class FeatureGeneratorUtil {

  /**
   * Recognizes a token made of a single capital letter in the range A-Z
   * followed by a period, such as {@code "A."}.
   */
  private static final Pattern CAP_PERIOD = Pattern.compile("^[A-Z]\\.$");

  /**
   * Generates a class name for the specified token.
   * The classes are as follows, where the first matching class is used:
   * <ul>
   * <li>jah - Japanese Hiragana</li>
   * <li>jak - Japanese Katakana</li>
   * <li>lc - lowercase alphabetic</li>
   * <li>2d - two digits</li>
   * <li>4d - four digits</li>
   * <li>an - alpha-numeric</li>
   * <li>dd - digits and dashes</li>
   * <li>ds - digits and slashes</li>
   * <li>dc - digits and commas</li>
   * <li>dp - digits and periods</li>
   * <li>num - digits</li>
   * <li>sc - single capital letter</li>
   * <li>ac - all capital letters</li>
   * <li>cp - capital letter followed by a period</li>
   * <li>ic - initial capital letter</li>
   * <li>other - other</li>
   * </ul>
   *
   * @param token A token or word. It is handed to {@link StringPattern#recognize(String)}
   *     without any null check in this method.
   * @return The class name that the specified token belongs in.
   */
  public static String tokenFeature(String token) {
    StringPattern pattern = StringPattern.recognize(token);

    // The order of the checks below defines the precedence of the classes.
    if (pattern.isAllHiragana()) {
      return "jah";
    }
    if (pattern.isAllKatakana()) {
      return "jak";
    }
    if (pattern.isAllLowerCaseLetter()) {
      return "lc";
    }
    // The fixed-length digit classes take precedence over the generic digit classes.
    if (pattern.digits() == 2) {
      return "2d";
    }
    if (pattern.digits() == 4) {
      return "4d";
    }
    if (pattern.containsDigit()) {
      return digitFeature(pattern);
    }
    if (pattern.isAllCapitalLetter()) {
      return token.length() == 1 ? "sc" : "ac";
    }
    if (CAP_PERIOD.matcher(token).find()) {
      return "cp";
    }
    if (pattern.isInitialCapitalLetter()) {
      return "ic";
    }
    return "other";
  }

  /**
   * Selects the class name for a token already known to contain at least one digit.
   * The first matching class in the order an, dd, ds, dc, dp is used; num is the fallback.
   *
   * @param pattern The recognized pattern of the token.
   * @return One of "an", "dd", "ds", "dc", "dp" or "num".
   */
  private static String digitFeature(StringPattern pattern) {
    if (pattern.containsLetters()) {
      return "an";
    }
    if (pattern.containsHyphen()) {
      return "dd";
    }
    if (pattern.containsSlash()) {
      return "ds";
    }
    if (pattern.containsComma()) {
      return "dc";
    }
    if (pattern.containsPeriod()) {
      return "dp";
    }
    return "num";
  }
}