// opennlp.tools.util.featuregen.TokenPatternFeatureGenerator (Maven / Gradle / Ivy)
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package opennlp.tools.util.featuregen;
import java.util.List;
import java.util.regex.Pattern;
import opennlp.tools.tokenize.SimpleTokenizer;
import opennlp.tools.tokenize.Tokenizer;
import opennlp.tools.util.StringUtil;
/**
 * Partitions tokens into sub-tokens based on character classes and generates
 * class features for each of the sub-tokens and combinations of those sub-tokens.
 * <p>
 * The token at the requested index is split with the configured {@link Tokenizer}.
 * Depending on the number of resulting sub-tokens the following features are added:
 * <ul>
 * <li>exactly one sub-token: a single {@code st=} feature carrying the whole token
 * after {@link StringUtil#toLowerCase}; nothing else is generated;</li>
 * <li>otherwise: {@code stn=} with the number of sub-tokens, {@code pt2=} /
 * {@code pt3=} with the concatenated token classes of every two / three
 * consecutive sub-tokens, {@code st=} for every sub-token that contains no
 * character outside {@code [a-zA-Z]}, and finally {@code pta=} with the
 * concatenated token classes of all sub-tokens.</li>
 * </ul>
 */
public class TokenPatternFeatureGenerator implements AdaptiveFeatureGenerator {

  private static final String SUB_TOKEN_PREFIX = "st=";
  private static final String SUB_TOKEN_COUNT_PREFIX = "stn=";
  private static final String SUB_TOKEN_PART2_PREFIX = "pt2=";
  private static final String SUB_TOKEN_PART3_PREFIX = "pt3=";
  private static final String SUB_TOKEN_ALL_PREFIX = "pta=";

  /** Matches any character that is not an ASCII letter; compiled once for all instances. */
  private static final Pattern NO_LETTERS = Pattern.compile("[^a-zA-Z]");

  private final Tokenizer tokenizer;

  /**
   * Initializes a {@link TokenPatternFeatureGenerator}.
   * For tokenization the {@link SimpleTokenizer} is used.
   */
  public TokenPatternFeatureGenerator() {
    this(SimpleTokenizer.INSTANCE);
  }

  /**
   * Initializes a {@link TokenPatternFeatureGenerator} instance.
   *
   * @param supportTokenizer The {@link Tokenizer} to be used.
   */
  public TokenPatternFeatureGenerator(Tokenizer supportTokenizer) {
    tokenizer = supportTokenizer;
  }

  /**
   * Adds the sub-token pattern features for the token at {@code index} to {@code feats}.
   *
   * @param feats The list the generated features are appended to.
   * @param toks  The tokens of the current sequence.
   * @param index The position in {@code toks} of the token to generate features for.
   * @param preds The previous outcomes; not used by this generator.
   */
  @Override
  public void createFeatures(List<String> feats, String[] toks, int index, String[] preds) {

    String[] tokenized = tokenizer.tokenize(toks[index]);

    // A token that is not split any further only contributes itself.
    if (tokenized.length == 1) {
      feats.add(SUB_TOKEN_PREFIX + StringUtil.toLowerCase(toks[index]));
      return;
    }

    feats.add(SUB_TOKEN_COUNT_PREFIX + tokenized.length);

    // Determine the token class of every sub-token once; each class is reused by the
    // bigram, trigram and full-pattern features below.
    // NOTE(review): assumes FeatureGeneratorUtil.tokenFeature is a pure function.
    String[] tokenClasses = new String[tokenized.length];
    for (int i = 0; i < tokenized.length; i++) {
      tokenClasses[i] = FeatureGeneratorUtil.tokenFeature(tokenized[i]);
    }

    StringBuilder pattern = new StringBuilder();

    for (int i = 0; i < tokenized.length; i++) {

      // Class bigram starting at sub-token i.
      if (i < tokenized.length - 1) {
        feats.add(SUB_TOKEN_PART2_PREFIX + tokenClasses[i] + tokenClasses[i + 1]);
      }

      // Class trigram starting at sub-token i.
      if (i < tokenized.length - 2) {
        feats.add(SUB_TOKEN_PART3_PREFIX + tokenClasses[i] + tokenClasses[i + 1]
            + tokenClasses[i + 2]);
      }

      pattern.append(tokenClasses[i]);

      // Only sub-tokens without any non-letter character are emitted literally.
      if (!NO_LETTERS.matcher(tokenized[i]).find()) {
        feats.add(SUB_TOKEN_PREFIX + StringUtil.toLowerCase(tokenized[i]));
      }
    }

    feats.add(SUB_TOKEN_ALL_PREFIX + pattern);
  }
}