All downloads are free. The search and download functionality uses the official Maven repository.

org.unlaxer.jaddress.normalizer.VariantNormalizerImpl Maven / Gradle / Ivy

package org.unlaxer.jaddress.normalizer;

import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

import org.apache.lucene.analysis.ja.ModifiedJapaneseIterationMarkCharFilter;
import org.unlaxer.jaddress.util.normalize.WordReplacer;

import com.atilika.kuromoji.TokenBase;
import com.atilika.kuromoji.ipadic.Token;
import com.atilika.kuromoji.ipadic.Tokenizer;
import com.ibm.icu.text.Transliterator;

public class VariantNormalizerImpl implements VariantNormalizer {
	private Map numberMap;
	private Map digitMap;
	private Tokenizer tokenizer;
	private Transliterator transliterator;
	private Transliterator transliterator2;
	private WordReplacer wordReplacer;

	VariantNormalizerImpl() {
		
		tokenizer = new Tokenizer();
		wordReplacer =WordReplacer.SINGLETON.get();

		String baseRule = Transliterator.getInstance("Any-NFKD;Hiragana-Katakana;Any-Upper;Fullwidth-Halfwidth;")
				.toRules(true);
		String hyphen = "[\\-~~―-ーー/⁄/\のノ之乃]>-;";
		String space = "[ \\t \\\\n]+>' ';^' '>;' '$>;";

		transliterator = Transliterator.createFromRules("full", baseRule + hyphen + space,
				Transliterator.FORWARD);

		transliterator2 = Transliterator.getInstance("Fullwidth-Halfwidth;");

		initdigiMap();
		initNumberMap();
	}

	@Override
	public String normalize(String value) {
		return value == null ? 
				null:
				踊り字(wordReplacer.apply(transliterator.transliterate(value)));
	}
	
	@Override
	public String normalizeForAddressAsString(CharSequence charSequence) {
		return charSequence == null ?
				null:
				normalizeForAddress(charSequence).normalized;
	}

	@Override
	public NormalizeResult normalizeForAddress(CharSequence charSequence) {
		NormalizeResult result = new NormalizeResult();
		if(charSequence == null) {
			return result;
		}
		Numberd numberd = new Numberd(-1);

		Boolean isNumBlock = false;

		for (int i = 0; i < charSequence.length(); i++) {
			char c = charSequence.charAt(i);
			c = numberMap.getOrDefault(c, c);

			if (digitMap.containsKey(c)) {
				if(!isNumBlock) {
					numberd = new Numberd(i);
				}
				numberd.addValue(digitMap.get(c), c);

				isNumBlock = true;
			} else {
				if (isNumBlock) {
					numFunc(numberd);
					if(numberd.hasValue()) {
						result.sb.append(numberd.getNormalized());
						result.numberdList.add(numberd);
					}
				}
				result.sb.append(c);
				
				isNumBlock = false;
			}
		}
		if (isNumBlock) {
			numFunc(numberd);
			if(numberd.hasValue()) {
				result.sb.append(numberd.getNormalized());
				result.numberdList.add(numberd);
			}
		}

		result.normalized = 踊り字(wordReplacer.apply(result.sb.toString()));

//		System.out.println(charSequence.toString());
//		System.out.println(result);
		return result;
	}

	private void numFunc(Numberd numberd) {
		if (numberd.numlist.size() != 0) {
			if (numberd.isSum) {
				int sum = numberd.numlist.stream().mapToInt(Integer::intValue).sum();
				numberd.setNormalized(transliterator2.transliterate(String.valueOf(sum)));;
			} else {
				StringBuilder val = new StringBuilder(); 
				numberd.numlist.stream().forEach(s -> val.append(s));
				numberd.setNormalized(transliterator2.transliterate(val.toString()));
			}
		}
	}

	@Override
	public String normalizeKuromoji(String value) {
		List tokens = tokenizer.tokenize(value);
//	        for (Token token : tokens) {
//	            System.out.println(token.getSurface() + "\t" + token.getAllFeatures());
//	        }

		return 踊り字(tokens.stream()
				.map(TokenBase::getSurface)
				.collect(Collectors.joining("_")));
	}

	private void initNumberMap() {
		numberMap = new HashMap<>();
		numberMap.put('拾', '十');
		numberMap.put('什', '十');

		numberMap.put('陌', '百');
		numberMap.put('佰', '百');

		numberMap.put('阡', '千');
		numberMap.put('仟', '千');
		numberMap.put('萬', '万');

		numberMap.put('一', '1');
		numberMap.put('壱', '1');
		numberMap.put('壹', '1');
		numberMap.put('弌', '1');

		numberMap.put('二', '2');
		numberMap.put('弐', '2');
		numberMap.put('貳', '2');
		numberMap.put('貮', '2');

		numberMap.put('三', '3');
		numberMap.put('参', '3');
		numberMap.put('參', '3');
		numberMap.put('弎', '3');

		numberMap.put('四', '4');
		numberMap.put('肆', '4');

		numberMap.put('五', '5');
		numberMap.put('伍', '5');

		numberMap.put('六', '6');
		numberMap.put('陸', '6');

		numberMap.put('七', '7');
		numberMap.put('漆', '7');
		numberMap.put('質', '7');
		numberMap.put('柒', '7');

		numberMap.put('八', '8');
		numberMap.put('捌', '8');

		numberMap.put('九', '9');
		numberMap.put('玖', '9');

		numberMap.put('〇', '0');
		numberMap.put('零', '0');
	}

	private void initdigiMap() {
		digitMap = new HashMap<>();
		digitMap.put('十', 10);
		digitMap.put('百', 100);
		digitMap.put('千', 1000);
		digitMap.put('万', 10000);

//		digitMap.put('廿', 20);
//		digitMap.put('卄', 20);
//		digitMap.put('卅', 30);
//		digitMap.put('丗', 30);
//		digitMap.put('卌', 40);
//		digitMap.put('皕', 200);

		digitMap.put('1', 1);
		digitMap.put('2', 2);
		digitMap.put('3', 3);
		digitMap.put('4', 4);
		digitMap.put('5', 5);
		digitMap.put('6', 6);
		digitMap.put('7', 7);
		digitMap.put('8', 8);
		digitMap.put('9', 9);
		digitMap.put('0', 0);
	}
	
	String 踊り字(String source) {
		return ModifiedJapaneseIterationMarkCharFilter.normalize(source, false,true);
	}
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy