// org.unlaxer.jaddress.normalizer.VariantNormalizer Maven / Gradle / Ivy
package org.unlaxer.jaddress.normalizer;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import org.unlaxer.jaddress.util.normalize.WordReplacer;
import com.atilika.kuromoji.TokenBase;
import com.atilika.kuromoji.ipadic.Token;
import com.atilika.kuromoji.ipadic.Tokenizer;
import com.ibm.icu.text.Transliterator;
public class VariantNormalizer {
private Map numberMap;
private Map digitMap;
private Tokenizer tokenizer;
private Transliterator transliterator;
private Transliterator transliterator2;
private WordReplacer wordReplacer;
public VariantNormalizer() {
tokenizer = new Tokenizer();
wordReplacer =WordReplacer.SINGLETON.get();
String baseRule = Transliterator.getInstance("Any-NFKD;Hiragana-Katakana;Any-Upper;Fullwidth-Halfwidth;")
.toRules(true);
String hyphen = "[\\-~~―-ーー/⁄/\のノ之乃]>-;";
String space = "[ \\t \\\\n]+>' ';^' '>;' '$>;";
transliterator = Transliterator.createFromRules("full", baseRule + hyphen + space,
Transliterator.FORWARD);
transliterator2 = Transliterator.getInstance("Fullwidth-Halfwidth;");
initdigiMap();
initNumberMap();
}
public String normalize(String value) {
return wordReplacer.apply(transliterator.transliterate(value));
}
public String normalizeForAddress(CharSequence charSequence) {
List numlist = new ArrayList<>();
StringBuilder sb = new StringBuilder();
Boolean isNumBlock = false;
Boolean isSum = false;
for (int i = 0; i < charSequence.length(); i++) {
char c = charSequence.charAt(i);
c = numberMap.getOrDefault(c, c);
if (digitMap.containsKey(c)) {
int num = digitMap.get(c);
if (num >= 10) {
isSum = true;
int listSize = numlist.size();
if (listSize > 0) {
if (numlist.get(listSize - 1) < 10) {
num = numlist.get(listSize - 1) * num;
numlist.remove(listSize - 1);
}
}
}
numlist.add(num);
isNumBlock = true;
} else {
if (isNumBlock) {
numFunc(numlist, sb, isSum);
isSum = false;
}
sb.append(c);
isNumBlock = false;
}
}
if (isNumBlock) {
numFunc(numlist, sb, isSum);
}
return wordReplacer.apply(sb.toString());
}
private void numFunc(List numlist, StringBuilder sb, boolean isSum) {
if (numlist.size() != 0) {
if (isSum) {
int sum = numlist.stream().mapToInt(Integer::intValue).sum();
sb.append(transliterator2.transliterate(String.valueOf(sum)));
} else {
StringBuilder val = new StringBuilder();
numlist.stream().forEach(s -> val.append(s));
sb.append(transliterator2.transliterate(val.toString()));
}
numlist.clear();
}
}
public String normalizeKuromoji(String value) {
List tokens = tokenizer.tokenize(value);
// for (Token token : tokens) {
// System.out.println(token.getSurface() + "\t" + token.getAllFeatures());
// }
return tokens.stream()
.map(TokenBase::getSurface)
.collect(Collectors.joining("_"));
}
private void initNumberMap() {
numberMap = new HashMap<>();
numberMap.put('拾', '十');
numberMap.put('什', '十');
numberMap.put('陌', '百');
numberMap.put('佰', '百');
numberMap.put('阡', '千');
numberMap.put('仟', '千');
numberMap.put('萬', '万');
numberMap.put('一', '1');
numberMap.put('壱', '1');
numberMap.put('壹', '1');
numberMap.put('弌', '1');
numberMap.put('二', '2');
numberMap.put('弐', '2');
numberMap.put('貳', '2');
numberMap.put('貮', '2');
numberMap.put('三', '3');
numberMap.put('参', '3');
numberMap.put('參', '3');
numberMap.put('弎', '3');
numberMap.put('四', '4');
numberMap.put('肆', '4');
numberMap.put('五', '5');
numberMap.put('伍', '5');
numberMap.put('六', '6');
numberMap.put('陸', '6');
numberMap.put('七', '7');
numberMap.put('漆', '7');
numberMap.put('質', '7');
numberMap.put('柒', '7');
numberMap.put('八', '8');
numberMap.put('捌', '8');
numberMap.put('九', '9');
numberMap.put('玖', '9');
numberMap.put('〇', '0');
numberMap.put('零', '0');
}
private void initdigiMap() {
digitMap = new HashMap<>();
digitMap.put('十', 10);
digitMap.put('百', 100);
digitMap.put('千', 1000);
digitMap.put('万', 10000);
// digitMap.put('廿', 20);
// digitMap.put('卄', 20);
// digitMap.put('卅', 30);
// digitMap.put('丗', 30);
// digitMap.put('卌', 40);
// digitMap.put('皕', 200);
digitMap.put('1', 1);
digitMap.put('2', 2);
digitMap.put('3', 3);
digitMap.put('4', 4);
digitMap.put('5', 5);
digitMap.put('6', 6);
digitMap.put('7', 7);
digitMap.put('8', 8);
digitMap.put('9', 9);
digitMap.put('0', 0);
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy