All downloads are free. Search and download functionality uses the official Maven repository.

org.unlaxer.jaddress.normalizer.VariantNormalizer Maven / Gradle / Ivy

package org.unlaxer.jaddress.normalizer;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

import org.unlaxer.jaddress.util.normalize.WordReplacer;

import com.atilika.kuromoji.TokenBase;
import com.atilika.kuromoji.ipadic.Token;
import com.atilika.kuromoji.ipadic.Tokenizer;
import com.ibm.icu.text.Transliterator;

public class VariantNormalizer {
	private Map numberMap;
	private Map digitMap;
	private Tokenizer tokenizer;
	private Transliterator transliterator;
	private Transliterator transliterator2;
	private WordReplacer wordReplacer;

	public VariantNormalizer() {
		
		tokenizer = new Tokenizer();
		wordReplacer =WordReplacer.SINGLETON.get();

		String baseRule = Transliterator.getInstance("Any-NFKD;Hiragana-Katakana;Any-Upper;Fullwidth-Halfwidth;")
				.toRules(true);
		String hyphen = "[\\-~~―-ーー/⁄/\のノ之乃]>-;";
		String space = "[ \\t \\\\n]+>' ';^' '>;' '$>;";

		transliterator = Transliterator.createFromRules("full", baseRule + hyphen + space,
				Transliterator.FORWARD);

		transliterator2 = Transliterator.getInstance("Fullwidth-Halfwidth;");

		initdigiMap();
		initNumberMap();
	}

	public String normalize(String value) {
		return wordReplacer.apply(transliterator.transliterate(value));
	}

	public String normalizeForAddress(CharSequence charSequence) {
		List numlist = new ArrayList<>();
		StringBuilder sb = new StringBuilder();
		Boolean isNumBlock = false;
		Boolean isSum = false;

		for (int i = 0; i < charSequence.length(); i++) {
			char c = charSequence.charAt(i);
			c = numberMap.getOrDefault(c, c);

			if (digitMap.containsKey(c)) {
				int num = digitMap.get(c);

				if (num >= 10) {
					isSum = true;
					int listSize = numlist.size();
					if (listSize > 0) {
						if (numlist.get(listSize - 1) < 10) {
							num = numlist.get(listSize - 1) * num;
							numlist.remove(listSize - 1);
						}
					}
				}
				numlist.add(num);
				isNumBlock = true;
			} else {
				if (isNumBlock) {
					numFunc(numlist, sb, isSum);
					isSum = false;
				}
				sb.append(c);
				isNumBlock = false;
			}
		}
		if (isNumBlock) {
			numFunc(numlist, sb, isSum);
		}

		return wordReplacer.apply(sb.toString());
	}

	private void numFunc(List numlist, StringBuilder sb, boolean isSum) {
		if (numlist.size() != 0) {
			if (isSum) {
				int sum = numlist.stream().mapToInt(Integer::intValue).sum();
				sb.append(transliterator2.transliterate(String.valueOf(sum)));
			} else {
				StringBuilder val = new StringBuilder(); 
				numlist.stream().forEach(s -> val.append(s));
				sb.append(transliterator2.transliterate(val.toString()));
			}
			numlist.clear();
		}
	}

	public String normalizeKuromoji(String value) {
		List tokens = tokenizer.tokenize(value);
//	        for (Token token : tokens) {
//	            System.out.println(token.getSurface() + "\t" + token.getAllFeatures());
//	        }

		return tokens.stream()
				.map(TokenBase::getSurface)
				.collect(Collectors.joining("_"));
	}

	private void initNumberMap() {
		numberMap = new HashMap<>();
		numberMap.put('拾', '十');
		numberMap.put('什', '十');

		numberMap.put('陌', '百');
		numberMap.put('佰', '百');

		numberMap.put('阡', '千');
		numberMap.put('仟', '千');
		numberMap.put('萬', '万');

		numberMap.put('一', '1');
		numberMap.put('壱', '1');
		numberMap.put('壹', '1');
		numberMap.put('弌', '1');

		numberMap.put('二', '2');
		numberMap.put('弐', '2');
		numberMap.put('貳', '2');
		numberMap.put('貮', '2');

		numberMap.put('三', '3');
		numberMap.put('参', '3');
		numberMap.put('參', '3');
		numberMap.put('弎', '3');

		numberMap.put('四', '4');
		numberMap.put('肆', '4');

		numberMap.put('五', '5');
		numberMap.put('伍', '5');

		numberMap.put('六', '6');
		numberMap.put('陸', '6');

		numberMap.put('七', '7');
		numberMap.put('漆', '7');
		numberMap.put('質', '7');
		numberMap.put('柒', '7');

		numberMap.put('八', '8');
		numberMap.put('捌', '8');

		numberMap.put('九', '9');
		numberMap.put('玖', '9');

		numberMap.put('〇', '0');
		numberMap.put('零', '0');
	}

	private void initdigiMap() {
		digitMap = new HashMap<>();
		digitMap.put('十', 10);
		digitMap.put('百', 100);
		digitMap.put('千', 1000);
		digitMap.put('万', 10000);

//		digitMap.put('廿', 20);
//		digitMap.put('卄', 20);
//		digitMap.put('卅', 30);
//		digitMap.put('丗', 30);
//		digitMap.put('卌', 40);
//		digitMap.put('皕', 200);

		digitMap.put('1', 1);
		digitMap.put('2', 2);
		digitMap.put('3', 3);
		digitMap.put('4', 4);
		digitMap.put('5', 5);
		digitMap.put('6', 6);
		digitMap.put('7', 7);
		digitMap.put('8', 8);
		digitMap.put('9', 9);
		digitMap.put('0', 0);
	}

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy