com.cybozu.labs.langdetect.util.NGram Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of language-detector Show documentation
Show all versions of language-detector Show documentation
Language Detection Library for Java.
/*
* Copyright 2011 Nakatani Shuyo
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.cybozu.labs.langdetect.util;
import org.jetbrains.annotations.Nullable;
import java.util.HashMap;
import java.util.Map;
/**
 * Sliding window over the most recently added characters, from which n-grams of
 * length 1 up to {@link #N_GRAM} can be read after each {@link #addChar(char)} call.
 *
 * <p>Every character is passed through {@code CharNormalizer.normalize} before it enters
 * the window. A space acts as a word boundary: when the previously stored character is a
 * space, the window is restarted as a single space before the next character is stored.
 *
 * <p>Upper-case handling: as soon as two consecutive upper-case characters have been added,
 * {@link #get(int)} returns {@code null} until a non-upper-case character is added or the
 * window is restarted at a word boundary. The rationale for this rule is not documented in
 * this file.
 *
 * <p>Users don't use this class directly.
 *
 * @author Nakatani Shuyo
 */
public class NGram {

    /**
     * Longest n-gram that is produced; n-grams are created from 1-gram up to this length.
     */
    public static final int N_GRAM = 3;

    /** Holds the current window; starts out as a single space. */
    private final StringBuilder window = new StringBuilder();

    /** True while the latest characters form a run of two or more upper-case characters. */
    private boolean upperCaseRun;

    /**
     * Creates an empty n-gram window positioned at a word boundary.
     */
    public NGram() {
        restartWord();
    }

    /**
     * Normalizes the given character and pushes it into the window.
     *
     * <p>If the previously stored character is a space, the window is restarted first, and a
     * (normalized) space is then dropped so that consecutive spaces collapse into one.
     * Otherwise the oldest character is discarded once the window already holds
     * {@link #N_GRAM} characters.
     *
     * @param ch the raw character to add; it is normalized before being stored
     */
    public void addChar(char ch) {
        final char normalized = CharNormalizer.normalize(ch);
        final char previous = window.charAt(window.length() - 1);

        if (previous == ' ') {
            restartWord();
            if (normalized == ' ') {
                return;
            }
        } else if (window.length() >= N_GRAM) {
            // Window is full: make room by dropping the oldest character.
            window.deleteCharAt(0);
        }
        window.append(normalized);

        // The run starts once two upper-case characters are adjacent, continues while
        // upper-case characters keep arriving, and ends on any other character.
        upperCaseRun = Character.isUpperCase(normalized)
                && (upperCaseRun || Character.isUpperCase(previous));
    }

    /**
     * Returns the n-gram formed by the last {@code n} characters of the window.
     *
     * @param n length of the requested n-gram, expected to be between 1 and {@link #N_GRAM}
     * @return the n-gram, or {@code null} if an upper-case run is in progress, if {@code n}
     *         is out of range, if fewer than {@code n} characters are available, or if a
     *         1-gram is requested while the last character is a space
     */
    @Nullable
    public String get(int n) {
        final int len = window.length();
        if (upperCaseRun || n < 1 || n > N_GRAM || n > len) {
            return null;
        }
        if (n > 1) {
            return window.substring(len - n);
        }
        // A lone space carries no information, so it is never reported as a 1-gram.
        final char last = window.charAt(len - 1);
        return last == ' ' ? null : String.valueOf(last);
    }

    /** Resets the window to a single space and clears the upper-case run marker. */
    private void restartWord() {
        window.setLength(0);
        window.append(" ");
        upperCaseRun = false;
    }
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy