// org.unlaxer.jaddress.parser.CharacterKind (Maven / Gradle / Ivy)
//
// Go to download
package org.unlaxer.jaddress.parser;

import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.ListIterator;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import java.util.stream.Stream;

import org.unlaxer.jaddress.UserHomeContext;
import org.unlaxer.util.collection.Comparators;
import org.unlaxer.util.function.Unchecked;

import io.vavr.Tuple2;
import io.vavr.control.Try;

/**
 * Kinds of characters (or short words) recognised by the address parser.
 *
 * <p>Each constant is built from inclusive code-point ranges, from a set of single code
 * points, or from a set of multi-code-point words, plus a flag saying whether the kind is
 * a delimiter.
 *
 * <p>Note on the {@code toCodePointSet} overloads used below: a call with exactly one
 * {@code String} argument treats every character of that string as a separate entry;
 * a call with several strings (or a {@code List}) treats each argument as one word.
 */
public enum CharacterKind implements CombinedCharacterKindAllMatch{
	
	// Empty dictionary, flagged as a delimiter.
	terminator(Collections.emptySet(),true),
	
	// https://hydrocul.github.io/wiki/blog/2014/1101-hyphen-minus-wave-tilde.html
	/*
	Hyphen-like characters (character, UTF-8 bytes, code point, description):
	- 	2D	U+002D	ASCII hyphen
	‐	E28090	U+2010	another hyphen
	‑	E28091	U+2011	non-breaking hyphen
	–	E28093	U+2013	EN dash
	—	E28094	U+2014	EM dash
	―	E28095	U+2015	full-width dash
	−	E28892	U+2212	full-width minus
	
	ー	E383BC	U+30FC	full-width prolonged sound mark
	ｰ	EFBDB0	U+FF70	half-width katakana prolonged sound mark
	*/
	
	// ASCII and full-width Latin letters, upper and lower case.
	alphabet(
		new RangeBothInclusive("A".codePointAt(0), "Z".codePointAt(0)),
		new RangeBothInclusive("a".codePointAt(0), "z".codePointAt(0)),
		new RangeBothInclusive("Ａ".codePointAt(0), "Ｚ".codePointAt(0)),
		new RangeBothInclusive("ａ".codePointAt(0), "ｚ".codePointAt(0))
	),
	// ASCII and full-width digits.
	arabicNumber(
		new RangeBothInclusive("0".codePointAt(0), "9".codePointAt(0)),
		new RangeBothInclusive("０".codePointAt(0), "９".codePointAt(0))
	),
	// Kanji numerals (including variant forms) and the zero signs 〇 / ○.
	japaneseAddressNumber(
			toCodePointSet("一壱弌壹二弍弐貳三弎参參四亖五六七八九十拾什百佰陌千〇○")// the characters 仟阡万零 are not in the set
	),
	hiragana(new RangeBothInclusive("ぁ".codePointAt(0), "ん".codePointAt(0))),
	katakana(new RangeBothInclusive("ァ".codePointAt(0), "ヶ".codePointAt(0))),// UTF-8 bytes E382A1 - E383B6
	// Prolonged sound marks: half-width U+FF70 and full-width U+30FC.
	cyouon(toCodePointSet("ｰー")),
	
	//TODO separates hyphen / dash /wave
	delimitorHyphen(
			/*
			Hyphen-like characters (character, UTF-8 bytes, code point, description):
			- 	2D	U+002D	ASCII hyphen
			‐	E28090	U+2010	another hyphen
			‑	E28091	U+2011	non-breaking hyphen
			–	E28093	U+2013	EN dash
			—	E28094	U+2014	EM dash
			―	E28095	U+2015	full-width dash
			−	E28892	U+2212	full-width minus
			
			ー	E383BC	U+30FC	full-width prolonged sound mark
			ｰ	EFBDB0	U+FF70	half-width katakana prolonged sound mark
			*/
			toCodePointSet("-‐‑–—―−－"),// original note: "ASCII minus added at the end". NOTE(review): the last character appears to be the full-width hyphen-minus (U+FF0D); the ASCII one is first - confirm
			true
	),//  hyphen
	
	delimitorSlash(
			toCodePointSet("/⁄／＼"),
			true
	),//  slash
 
	delimitorSpace(
			
			toCodePointSet(" 	　"),// space, tab, full-width (zenkaku) space
			true
	),// space ,
	
	delimitorComma(
			toCodePointSet(".,;:、。：；"),
			true
	),// comma
	delimitorJapaneseSymbol(
			toCodePointSet("・·~～"), // ｰー are not listed here: they are prolonged sound marks (see cyouon)
			true
	),// japanese delimitor
	
	//TODO set to before hiragana?
	delimitorJapanese(
			toCodePointSet("のノ之乃"),
			true
	),// japanese suffix
	suffix丁目(
			toCodePointSet(
					"丁目",
					"丁" /* "丁" as used in Sakai City, Osaka Prefecture */,
					"番町", /* "番町" as used in Nabari City (Mie Prefecture) and Tokushima City (Tokushima Prefecture). Nabari City has no town names of the form "○○△町域Top1" (sic in the original comment; presumably "丁目" - confirm); names of the form "○○△番町" are used instead. */
					"町目" /* "町目" as used in Iwaki City and Koriyama City (Fukushima Prefecture): parts of those cities use "町目" where the original comment says "町域Top1" (presumably "丁目" - confirm), e.g. いわき市平一町目, 郡山市西田町三町目. */
			),
			false
	),
	suffix地番(
			toCodePointSet("番地","番"),
			false
	),
	suffix号室(
			toCodePointSet(
				List.of("号室")),
			false
	),
	suffix号(
			toCodePointSet("号"),
			false
	),
	
	
	// Floor suffix: "階" or the letter F (ASCII / full-width).
	suffix階(
			toCodePointSet("階","F","Ｆ"),
			false
	),
	
	// Building / wing suffixes.
	suffix棟(
			toCodePointSet("号棟","号館","番館","番棟","棟"),
			false
	),
	
	_堂(
			toCodePointSet("堂"),
			false
	),
	// The ten characters 甲 through 癸.
	十干(
			toCodePointSet("甲","乙","丙","丁","戊","己","庚","辛","壬","癸"),
			false
	),
	
	// "No" / "NO" prefixes, with and without a trailing dot, ASCII and full-width.
	prefixNo(
			toCodePointSet(
				"No","Ｎｏ","No.","Ｎｏ．",
				"NO","ＮＯ","NO.","ＮＯ．"
			),
			false
	),
	prefix第(
			toCodePointSet(
				"第"
			),
			false
	),




	// japanese delimitor
	
	// ASCII punctuation followed by full-width counterparts.
	// NOTE(review): the full-width half contains an ASCII '_' where '＿' might be expected - confirm.
	symbol(
			toCodePointSet(
				"!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~！”＃＄％＆’（）＊＋，－．／：；＜＝＞？＠［￥］＾_｀｛｜｝～"
			),
			false
	),// symbol
	
	// No ranges and no dictionary; matched() accepts any single code point for this kind.
	normal,
	;

	// fields
	
	final boolean isDelimitor;
	
	final RangeBothInclusive[] ranges;
	
	final Set dictionary;
	final Set> multiCharacterDictionary;
	
	final List  strings;

	
	// constructors

	private CharacterKind(RangeBothInclusive... ranges) {
		this.ranges = ranges;
		dictionary = Collections.emptySet();
		multiCharacterDictionary = Collections.emptySet();
		isDelimitor = false;
		strings = Stream.of(ranges)
				.map(RangeBothInclusive::stream)
				.flatMap(IntStream::boxed)
				.map(codePoint-> new String(new int[] {codePoint},0,1))
				.collect(Collectors.toList());
		strings.sort(Comparators.longerIsFirst);
	}

	/**
	 * Builds a non-delimiter kind from a set of single code points.
	 *
	 * @param dictonary the accepted code points
	 */
	private CharacterKind(Set<Integer> dictonary) {
		this(dictonary, false);
	}
	
	/**
	 * Builds a kind from a set of single code points.
	 * No ranges and no multi-character words are registered.
	 *
	 * @param dictonary   the accepted code points
	 * @param isDelimitor whether this kind is reported as a delimiter
	 */
	private CharacterKind(Set<Integer> dictonary , boolean isDelimitor) {
		this.ranges = new RangeBothInclusive[] {};
		this.dictionary = dictonary;
		this.multiCharacterDictionary = Collections.emptySet();
		this.isDelimitor = isDelimitor;
		
		// The list is sorted in place below, so collect into an explicitly mutable list
		// (Collectors.toList() makes no promise about mutability).
		this.strings = dictonary.stream()
				.map(codePoint -> new String(new int[] {codePoint}, 0, 1))
				.collect(Collectors.toCollection(ArrayList::new));
		this.strings.sort(Comparators.longerIsFirst);
	}
	
	/**
	 * Builds a kind from the pair produced by the word-based {@code toCodePointSet}
	 * overloads: {@code _1} holds the multi-code-point words, {@code _2} the single
	 * code points. Delegates to the three-argument constructor, which performs the
	 * same initialisation this constructor used to duplicate.
	 *
	 * @param dictonaries (multi-character words, single code points)
	 * @param isDelimitor whether this kind is reported as a delimiter
	 */
	private CharacterKind(
			Tuple2<Set<List<Integer>>,Set<Integer>> dictonaries , 
			boolean isDelimitor) {
		this(dictonaries._1, dictonaries._2, isDelimitor);
	}

	/**
	 * Builds a kind from a multi-character word dictionary and a single-code-point
	 * dictionary. No ranges are registered. {@code strings} receives every entry of
	 * both dictionaries rendered as a String, sorted with
	 * {@code Comparators.longerIsFirst}.
	 *
	 * @param multiCharacterDictionary words, each as its code points in order
	 * @param dictonary                single code points
	 * @param isDelimitor              whether this kind is reported as a delimiter
	 */
	private CharacterKind(
			Set<List<Integer>> multiCharacterDictionary ,
			Set<Integer> dictonary , 
			boolean isDelimitor) {
		
		this.ranges = new RangeBothInclusive[] {};
		this.dictionary = dictonary;
		this.multiCharacterDictionary = multiCharacterDictionary;
		this.isDelimitor = isDelimitor;
		
		// Intermediate sets are kept so the order in which entries reach the
		// list (before the stable sort) is the same as before.
		Set<String> singles = dictonary.stream()
			.map(codePoint -> new String(new int[] {codePoint}, 0, 1))
			.collect(Collectors.toSet());
		
		Set<String> words = multiCharacterDictionary.stream()
			.map(word -> {
				int[] codePoints = word.stream().mapToInt(Integer::intValue).toArray();
				return new String(codePoints, 0, codePoints.length);
			})
			.collect(Collectors.toSet());
		
		this.strings = new ArrayList<>();
		this.strings.addAll(singles);
		this.strings.addAll(words);
		this.strings.sort(Comparators.longerIsFirst);
	}

	
	/**
	 * Reports whether this kind is one of the kinds this enum counts as Japanese:
	 * normal, katakana, hiragana, the two Japanese delimiter kinds, the
	 * 丁目 / 地番 / 号 suffixes and the kanji address numbers.
	 */
	@Override
	public boolean isJapanese() {
		switch (this) {
			case normal:
			case katakana:
			case hiragana:
			case delimitorJapaneseSymbol:
			case delimitorJapanese:
			case suffix丁目:
			case suffix地番:
			case suffix号:
			case japaneseAddressNumber:
				return true;
			default:
				return false;
		}
	}
	
	/** Returns the delimiter flag this constant was constructed with. */
	@Override
	public boolean isDelimitor() {
		return this.isDelimitor;
	}
	
	/** True for the three address suffix kinds: 丁目, 地番 and 号. */
	@Override
	public boolean  isJapanesAddressDelimitor(){
		switch (this) {
			case suffix丁目:
			case suffix地番:
			case suffix号:
				return true;
			default:
				return false;
		}
	}
	
	/** True for either numeric kind: Arabic digits or kanji address numbers. */
	@Override
	public boolean isNumber() {
		final boolean arabic = (arabicNumber == this);
		final boolean kanji = (japaneseAddressNumber == this);
		return arabic || kanji;
	}
	
	
	/**
	 * Returns the built-in entries of this kind as Strings, in the order established
	 * at construction time.
	 * The returned list is the internal instance, not a copy.
	 */
	public List<String> strings(){
		return strings;
	}
	
	/**
	 * Returns the single-code-point dictionary for this kind, preferring a user-supplied
	 * file (see {@code fromFile()}) over the built-in {@code dictionary} field.
	 * The result is cached per constant in {@code dictionaryByCharacterKind}, so the
	 * file is consulted at most once through this accessor.
	 */
	final Set<Integer> dictionary(){
		// One lookup instead of containsKey + put + get; the fallback is never null.
		return dictionaryByCharacterKind.computeIfAbsent(
			this,
			kind -> kind.fromFile()
				.map(Tuple2::_2)
				.orElse(kind.dictionary));
	}
	
	final Set> multiCharacterDictionary(){
		
		if(multiCharacterDictionaryByCharacterKind.containsKey(this)) {
			return multiCharacterDictionaryByCharacterKind.get(this);
		}
		
		Set> dictionaryFromFile =
				fromFile()
					.map(Tuple2::_1)
					.orElse(multiCharacterDictionary);
		multiCharacterDictionaryByCharacterKind.put(this, dictionaryFromFile);
		return multiCharacterDictionaryByCharacterKind.get(this);
	}

	Optional>, Set>> fromFile(){
		
		Path pathWithFolderAndFile = 
			UserHomeContext.getPathWithFolderAndFile("unlaxer-JAP", name()+".txt");
		
		if(Files.exists(pathWithFolderAndFile)) {
			
			return Try.ofSupplier(()->{
				List words;
				words = Unchecked.of(
						()->Files.newBufferedReader(pathWithFolderAndFile).lines()
							.collect(Collectors.toList())).get();
				Tuple2>, Set> codePointSet = toCodePointSet(words);
				return codePointSet;
			}).toJavaOptional();
		
		}else {
			return Optional.empty();
		}
	}
	

	
	
	static Tuple2> ,Set> toCodePointSet(String... words){
		return toCodePointSet(List.of(words));
	}
	
	static Tuple2> ,Set> toCodePointSet(Collection words){
		
		Set single = words.stream()
			.filter(word->word.length() == 1)
			.map(character->character.codePointAt(0))
			.collect(Collectors.toSet());
		
		Set> multi = words.stream()
				.filter(word->word.length() > 1)
				.map(CharacterKind::toCodePoints)
				.collect(Collectors.toSet());
		
		return new Tuple2<>(multi , single);
	}

	
	/**
	 * Returns the set of code points occurring in the given string; every character
	 * of the string is treated as a separate entry.
	 * The string is read by code point, so surrogate pairs stay intact, and an empty
	 * string yields an empty set.
	 *
	 * @param words a string whose characters are the entries
	 * @return the distinct code points of {@code words}
	 */
	static Set<Integer> toCodePointSet(String words){
		return words.codePoints()
			.boxed()
			.collect(Collectors.toSet());
	}
	
	/**
	 * Returns the code points of the given word, in order.
	 *
	 * @param word the word to decompose
	 * @return one entry per code point of {@code word}
	 */
	static List<Integer> toCodePoints(String word){
		return word.codePoints()
			.boxed()
			.collect(Collectors.toList());
	}
	
	
	/**
	 * Reports whether this kind has at least one built-in multi-code-point word.
	 * Only the built-in field is consulted, not the file-backed accessor.
	 */
	public boolean hasMultipleCharacter() {
		return !multiCharacterDictionary.isEmpty();
	}

	/** True only for {@link #arabicNumber}. */
	@Override
	public boolean isArabicNumber() {
		return arabicNumber == this;
	}

	/** True only for {@link #symbol}. */
	@Override
	public boolean isSymbol() {
		return symbol == this;
	}

	/** True only for {@link #alphabet}. */
	@Override
	public boolean isAlphabet() {
		return alphabet == this;
	}

	/** True only for {@link #japaneseAddressNumber}. */
	@Override
	public boolean isJapaneseNumber() {
		return japaneseAddressNumber == this;
	}

	/** True only for {@link #hiragana}. */
	@Override
	public boolean isHiragana() {
		return hiragana == this;
	}

	/** True only for {@link #katakana}. */
	@Override
	public boolean isKatakana() {
		return katakana == this;
	}

	/** True only for {@link #delimitorHyphen}. */
	@Override
	public boolean isDelimitorHyphen() {
		return delimitorHyphen == this;
	}

	/** True only for {@link #delimitorSlash}. */
	@Override
	public boolean isDelimitorSlash() {
		return delimitorSlash == this;
	}

	/** True only for {@link #delimitorSpace}. */
	@Override
	public boolean isDelimitorSpace() {
		return delimitorSpace == this;
	}

	/** True only for {@link #delimitorComma}. */
	@Override
	public boolean isDelimitorComma() {
		return delimitorComma == this;
	}

	/** True only for {@link #delimitorJapanese}. */
	@Override
	public boolean isDelimitorJapanese() {
		return delimitorJapanese == this;
	}

	/** True only for the 丁目 suffix kind. */
	@Override
	public boolean isDelimitorJapaneseCyoumeAddress() {
		return suffix丁目 == this;
	}

	/** True only for the 地番 suffix kind. */
	@Override
	public boolean isDelimitorJapaneseBanchiAddress() {
		return suffix地番 == this;
	}

	/** True only for the 号 suffix kind. */
	@Override
	public boolean isDelimitorJapaneseGouAddress() {
		return suffix号 == this;
	}

	/** True only for {@link #normal}. */
	@Override
	public boolean isNormal() {
		return normal == this;
	}

	/** Always returns {@code true}, for every constant of this enum. */
	@Override
	public boolean isAllKind() {
		return true;
	}
	
	static Map>> multiCharacterDictionaryByCharacterKind = new HashMap<>();
	
	static Map> dictionaryByCharacterKind = new HashMap<>();
	
	/**
	 * Tries to match this kind at the iterator's current position.
	 *
	 * <ul>
	 * <li>{@code normal} always consumes and returns the next code point.</li>
	 * <li>Otherwise each built-in multi-character word is tried in the set's iteration
	 *     order; the first word that matches completely is consumed and returned.</li>
	 * <li>Otherwise the next code point is consumed and returned if it lies in one of
	 *     the ranges or in the built-in single-code-point dictionary.</li>
	 * <li>If nothing matches, the iterator is left where it started and an empty list
	 *     is returned.</li>
	 * </ul>
	 *
	 * NOTE(review): words are tried in set iteration order, not longest first, so when
	 * one word is a prefix of another (e.g. "No" and "No.") the shorter one may win -
	 * confirm whether that is intended.
	 * NOTE(review): only the built-in fields are consulted here, not the file-backed
	 * {@code dictionary()} / {@code multiCharacterDictionary()} accessors - confirm.
	 *
	 * @param codePointIterator the input code points; must have a next element unless a
	 *                          multi-character word decides the outcome
	 * @return the matched code points, or an empty list when this kind does not match
	 * @throws java.util.NoSuchElementException if a single code point has to be read
	 *         and the iterator is exhausted
	 */
	List<Integer> matched(ListIterator<Integer> codePointIterator){
		
		if(this == normal) {
			return List.of(codePointIterator.next());
		}
		
		for(List<Integer> wordOfDictionary : multiCharacterDictionary) {
			if(consumeWord(codePointIterator, wordOfDictionary)) {
				// same values the iterator just produced, in a fresh mutable list
				return new ArrayList<>(wordOfDictionary);
			}
		}
		
		int codePoint = codePointIterator.next();
		
		for (RangeBothInclusive range : ranges) {
			if(range.in(codePoint)) {
				return List.of(codePoint);
			}
		}
		
		if(dictionary.contains(codePoint)) {
			return List.of(codePoint);
		}
		
		// no match: give the code point back
		codePointIterator.previous();
		return Collections.emptyList();
	}
	
	/**
	 * Consumes {@code word} from the iterator if the upcoming code points equal it.
	 * On any mismatch, or if the input ends first, the iterator is rewound to where
	 * it started and {@code false} is returned.
	 */
	private static boolean consumeWord(ListIterator<Integer> codePointIterator, List<Integer> word) {
		int consumed = 0;
		for (Integer expected : word) {
			if(false == codePointIterator.hasNext()) {
				resetIterator(codePointIterator, consumed);
				return false;
			}
			Integer actual = codePointIterator.next();
			consumed++;
			if(actual.intValue() != expected.intValue()) {
				resetIterator(codePointIterator, consumed);
				return false;
			}
		}
		return true;
	}
	
	/**
	 * Moves the iterator back by {@code consumed} positions.
	 * Does nothing when {@code consumed} is zero or negative.
	 *
	 * @param iterator the iterator to rewind
	 * @param consumed how many {@code previous()} steps to take
	 */
	static void resetIterator(ListIterator<?> iterator , int consumed) {
		int remaining = consumed;
		while (remaining-- > 0) {
			iterator.previous();
		}
	}

	/** True only for {@link #terminator}. */
	@Override
	public boolean isTerminator() {
		return terminator == this;
	}

}