All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.unlaxer.jaddress.parser.CharacterKind Maven / Gradle / Ivy

package org.unlaxer.jaddress.parser;

import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.ListIterator;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import java.util.stream.Stream;

import org.unlaxer.jaddress.UserHomeContext;
import org.unlaxer.util.function.Unchecked;

import io.vavr.Tuple2;
import io.vavr.control.Try;

public enum CharacterKind implements CombinedCharacterKindAllMatch{
	
	alphabet(
		new RangeBothInclusive("A".codePointAt(0), "Z".codePointAt(0)),
		new RangeBothInclusive("a".codePointAt(0), "z".codePointAt(0)),
		new RangeBothInclusive("A".codePointAt(0), "Z".codePointAt(0)),
		new RangeBothInclusive("a".codePointAt(0), "z".codePointAt(0))
	),
	arabicNumber(
		new RangeBothInclusive("0".codePointAt(0), "9".codePointAt(0)),
		new RangeBothInclusive("0".codePointAt(0), "9".codePointAt(0))
	),
	japaneseNumber(
			toCodePointSet("一二三四五六七八九十百千万")
	),
	hiragana(new RangeBothInclusive("ぁ".codePointAt(0), "ん".codePointAt(0))),
	katakana(new RangeBothInclusive("ァ".codePointAt(0), "ヶ".codePointAt(0))),// E382A1 - E383B6
	delimitorHyphen(
			toCodePointSet("-―-")
	),//  hyphen
	
	delimitorSlash(
			toCodePointSet("/⁄/\")
	),//  splash

	delimitorSpace(
			
			toCodePointSet(" 	 ")//space tab zenkaku-space
	),// space ,
	
	delimitorComma(
			toCodePointSet(".,;:、。:;"),
			true
	),// comma
	delimitorJapaneseSymbol(
			toCodePointSet("・·ーー~~"),
			true
	),// japanese delimitor
	
	delimitorJapanese(
			toCodePointSet("のノ之乃"),
			true
	),// japanese delimitor
	delimitorJapaneseCyoumeAddress(
			toCodePointSet(
					"丁目",
					"丁" /*大阪府堺市の「丁」*/,
					"番町", /*		三重県名張市・徳島県徳島市の「番町」 三重県名張市では「○○△丁目」という町名は見られない。かわりに「○○△番町」という町名が使われている */
					"町目" /* 福島県いわき市・郡山市の「町目」 福島県いわき市や郡山市の一部では、「丁目」のかわりに「町目」が使われている。いわき市平一町目、郡山市西田町三町目など。*/
			),
			true
	),
	delimitorJapaneseBanchiAddress(
			toCodePointSet("番地","番"),
			true
	),
	delimitorJapaneseGouAddress(
			toCodePointSet("号"),
			true
	),
	// japanese delimitor
	
	symbol(
			toCodePointSet(
				"!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~!”#$%&’()*+,-./:;<=>?@[¥]^_`{|}~"
			),
			true
	),// symbol
	
	normal,
	;
	
	@Override
	public boolean isJapanese() {
		return 
			this == normal ||
			this == katakana ||
			this == hiragana ||
			this == delimitorJapaneseSymbol ||
			this == delimitorJapanese ||
			this == delimitorJapaneseCyoumeAddress ||
			this == delimitorJapaneseBanchiAddress ||
			this == delimitorJapaneseGouAddress ||
			this == japaneseNumber ;
	}
	
	@Override
	public boolean isDelimitor() {
		return isDelimitor;
	}
	
	@Override
	public boolean  isJapanesAddressDelimitor(){
		return 
				this == delimitorJapaneseCyoumeAddress ||
				this == delimitorJapaneseBanchiAddress || 
				this == delimitorJapaneseGouAddress ; 
	}
	
	@Override
	public boolean isNumber() {
		return this == arabicNumber || this == japaneseNumber;
	}
	
	final RangeBothInclusive[] ranges;
	
	final Set dictionary;
	final Set> multiCharacterDictionary;
	
	final Set  strings;
	
	public Set  strings(){
		return strings;
	}
	
	final Set dictionary(){
		
		if(dictionaryByCharacterKind.containsKey(this)) {
			return dictionaryByCharacterKind.get(this);
		}
		
		Set dictionaryFromFile =
			fromFile()
				.map(Tuple2::_2)
				.orElse(dictionary);
			dictionaryByCharacterKind.put(this, dictionaryFromFile);
		return dictionaryByCharacterKind.get(this);
	}
	
	final Set> multiCharacterDictionary(){
		
		if(multiCharacterDictionaryByCharacterKind.containsKey(this)) {
			return multiCharacterDictionaryByCharacterKind.get(this);
		}
		
		Set> dictionaryFromFile =
				fromFile()
					.map(Tuple2::_1)
					.orElse(multiCharacterDictionary);
		multiCharacterDictionaryByCharacterKind.put(this, dictionaryFromFile);
		return multiCharacterDictionaryByCharacterKind.get(this);
	}

	Optional>, Set>> fromFile(){
		
		Path pathWithFolderAndFile = UserHomeContext.getPathWithFolderAndFile("unlaxer-JAP", name()+".txt");
		
		if(Files.exists(pathWithFolderAndFile)) {
			
					return Try.ofSupplier(()->{
							List words;
							words = Unchecked.of(()->Files.newBufferedReader(pathWithFolderAndFile).lines().collect(Collectors.toList())).get();
							Tuple2>, Set> codePointSet = toCodePointSet(words);
							return codePointSet;
					}).toJavaOptional();
			
		}else {
			return Optional.empty();
		}
	}
	

	
	final boolean isDelimitor;
	
	static Tuple2> ,Set> toCodePointSet(String... words){
		return toCodePointSet(List.of(words));
	}
	
	static Tuple2> ,Set> toCodePointSet(Collection words){
		
		Set single = words.stream()
			.filter(word->word.length() == 1)
			.map(character->character.codePointAt(0))
			.collect(Collectors.toSet());
		
		Set> multi = words.stream()
				.filter(word->word.length() > 1)
				.map(CharacterKind::toCodePoints)
				.collect(Collectors.toSet());
		
		return new Tuple2<>(multi , single);
	}

	
	static Set toCodePointSet(String words){
		return Stream.of(words.split(""))
			.map(character->{
				assert character.codePointCount(0, character.length())==1;
				return character.codePointAt(0);
				})
			.collect(Collectors.toSet());
	}
	
	static List toCodePoints(String word){
		return word.codePoints().boxed()
			.collect(Collectors.toList());
	}
	
	
	private CharacterKind(RangeBothInclusive... ranges) {
		this.ranges = ranges;
		dictionary = Collections.emptySet();
		multiCharacterDictionary = Collections.emptySet();
		isDelimitor = false;
		strings = Stream.of(ranges)
				.map(RangeBothInclusive::stream)
				.flatMap(IntStream::boxed)
				.map(codePoint-> new String(new int[] {codePoint},0,1))
				.collect(Collectors.toSet());
	}

	private CharacterKind(Set dictonary) {
		this(dictonary,false);
	}
	
	private CharacterKind(Set dictonary , boolean isDelimitor) {
		ranges = new RangeBothInclusive[] {};
		this.dictionary = dictonary;
		multiCharacterDictionary = Collections.emptySet();
		this.isDelimitor = isDelimitor;
		
		strings = dictonary.stream()
				.map(codePoint-> new String(new int[] {codePoint},0,1))
				.collect(Collectors.toSet());
	}
	
	private CharacterKind(
			Tuple2>,Set> dictonaries , 
			boolean isDelimitor) {
		ranges = new RangeBothInclusive[] {};
		this.dictionary = dictonaries._2;
		this.multiCharacterDictionary = dictonaries._1;
		this.isDelimitor = isDelimitor;
		
		strings = new HashSet<>();
		
		Set collect = dictonaries._2().stream()
			.map(codePoint-> new String(new int[] {codePoint},0,1))
			.collect(Collectors.toSet());
		
		
		Set collect2 = dictonaries._1().stream()
			.map(list->{
				int[] codePoints = new int[list.size()];
				int i =0 ;
				for(Integer codePoint : list) {
					codePoints[i++] = codePoint;
				}
				return new String(codePoints , 0 , codePoints.length);
			})
			.collect(Collectors.toSet());
		
		strings.addAll(collect);
		strings.addAll(collect2);
	}

	
	private CharacterKind(
			Set> multiCharacterDictionary ,
			Set dictonary , 
			boolean isDelimitor) {
		
		ranges = new RangeBothInclusive[] {};
		this.dictionary = dictonary;
		this.multiCharacterDictionary = multiCharacterDictionary;
		this.isDelimitor = isDelimitor;
		
		strings = new HashSet<>();
		
		Set collect = dictonary.stream()
			.map(codePoint-> new String(new int[] {codePoint},0,1))
			.collect(Collectors.toSet());
		
		
		Set collect2 = multiCharacterDictionary.stream()
			.map(list->{
				int[] codePoints = new int[list.size()];
				int i =0 ;
				for(Integer codePoint : list) {
					codePoints[i++] = codePoint;
				}
				return new String(codePoints , 0 , codePoints.length);
			})
			.collect(Collectors.toSet());
		
		strings.addAll(collect);
		strings.addAll(collect2);

	}

	List matched(ListIterator codePointIterator){
		
		if(this == normal) {
			return List.of(codePointIterator.next());
		}
		
		if(hasMultipleCharacter()) {
			
			List matched = new ArrayList<>();
			
			for(List wordOfDictionary : multiCharacterDictionary) {
				
				int consumed = 0;
				boolean match = true;
				
				for (Integer codePointOfDictionary : wordOfDictionary) {
					
					if(false == codePointIterator.hasNext()) {
						resetIterator(codePointIterator, consumed);
						match = false;
						matched.clear();
						break;
					}
					
					Integer codePoint = codePointIterator.next();
					consumed++;
					
					if(codePoint.intValue() != codePointOfDictionary.intValue()) {
						
						resetIterator(codePointIterator, consumed);
						match = false;
						matched.clear();
						break;
					}
					matched.add(codePoint);
				}
				if(match) {
					return matched;
				}
			}
		}
		
		int codePoint = codePointIterator.next();
		
		for (RangeBothInclusive range : ranges) {
			
			if(range.in(codePoint)) {
				
				return List.of(codePoint);
			}
		}
		boolean contains = dictionary.contains(codePoint);
		
		if(contains) {
			return List.of(codePoint);
		}
		codePointIterator.previous();
		return Collections.emptyList();
	}
	
	static void resetIterator(ListIterator iterator , int consumed) {
		for(int i = 0 ; i < consumed ; i++) {
			iterator.previous();
		}
	}
	public static StringAndCharacterKinds stringAndCharacterKindsOf(String string) {
		return stringAndCharacterKindsOf(string , true);
	}
	
	/**
	 * @param string
	 * @param concatJapaneseSymbolToNormal
	 * @return StringAndCharacterKinds
	 * 
	 * concatJapaneseSymbolToNormal true -> "肉ー" is [normal("肉ー")]
	 * concatJapaneseSymbolToNormal false -> "肉ー" is [normal("肉"),delimitorJapaneseSymbol("ー")]
	 */
	public static StringAndCharacterKinds stringAndCharacterKindsOf(
			String string , 
			boolean concatJapaneseSymbolToNormal) {
		
		if(string == null || string.isEmpty()) {
			return StringAndCharacterKinds.empty();
		}
		
		List collect = characterKindsOf(string);
		
		List> rans = new ArrayList>();
		
		CharacterKind last = null;
		
		var ran = new ArrayList();
		
		for (CodePointAndCharacterKind codePointAndCharacterKind : collect) {
			
			CharacterKind characterKind = codePointAndCharacterKind.characterKind;
			
			if(concatJapaneseSymbolToNormal && last != null && last.isJapanese() &&  characterKind.isJapanese()) {
				
				codePointAndCharacterKind = codePointAndCharacterKind.as(CharacterKind.normal);
				
				ran.add(codePointAndCharacterKind);
				
			}else  if(last != characterKind) {
				
				if(false == ran.isEmpty()) {
					rans.add(new ArrayList<>(ran));
					ran.clear();
				}
				ran.add(codePointAndCharacterKind);
				last = characterKind;
				
			}else {
				ran.add(codePointAndCharacterKind);
			}
		}
		if(false == ran.isEmpty()) {
			rans.add(ran);
		}
		
		List results = new ArrayList<>();
		
		for (List sameKindList : rans) {
			
			CharacterKind characterKind = sameKindList.get(0).characterKind;
			
			int[] codePoints = new int[sameKindList.size()];

			int index =0;
			
			for (CodePointAndCharacterKind codePointAndCharacterKind : sameKindList) {
				
				codePoints[index++] = codePointAndCharacterKind.codePoint;
			}
			
			String ranString = new String(codePoints , 0 , codePoints.length);
			StringAndCharacterKind stringAndCharacterKind = new StringAndCharacterKind(characterKind, ranString);
			
			results.add(stringAndCharacterKind);
		}
		return new StringAndCharacterKinds(results);
	}
		
	static List codePointAndCharacterKinds(ListIterator codePointIterator){

		
		for(CharacterKind characterKind : CharacterKind.values()) {
			
			List matched = characterKind.matched(codePointIterator);
			if(matched.isEmpty()) {
				continue;
			}
			return create(matched , characterKind);
		}
		throw new IllegalArgumentException();
	}

	static List create(List wordOfDictionary,
			CharacterKind characterKind) {
		
		List collect = wordOfDictionary.stream()
			.map(codePoint->new CodePointAndCharacterKind(characterKind, codePoint))
			.collect(Collectors.toList());
		
		return collect;
	}

	public boolean hasMultipleCharacter() {
		return false == multiCharacterDictionary.isEmpty();
	}


	public static List characterKindsOf(String string) {
		
		ListIterator listIterator = string.codePoints().boxed().collect(Collectors.toList()).listIterator();

		List results = new ArrayList();
		while (listIterator.hasNext()) {
			List codePointAndCharacterKinds = codePointAndCharacterKinds(listIterator);
			results.addAll(codePointAndCharacterKinds);
		}
		return results;
	}

	@Override
	public boolean isArabicNumber() {
		return this == arabicNumber;
	}

	@Override
	public boolean isSymbol() {
		return this == symbol;
	}

	@Override
	public boolean isAlphabet() {
		return this == alphabet;
	}

	@Override
	public boolean isJapaneseNumber() {
		return this == japaneseNumber;
	}

	@Override
	public boolean isHiragana() {
		return this == hiragana;
	}

	@Override
	public boolean isKatakana() {
		return this == katakana;
	}

	@Override
	public boolean isDelimitorHyphen() {
		return this == delimitorHyphen;
	}

	@Override
	public boolean isDelimitorSlash() {
		return this == delimitorSlash;
	}

	@Override
	public boolean isDelimitorSpace() {
		return this == delimitorSpace;
	}

	@Override
	public boolean isDelimitorComma() {
		return this == delimitorComma;
	}

	@Override
	public boolean isDelimitorJapanese() {
		return this == delimitorJapanese;
	}

	@Override
	public boolean isDelimitorJapaneseCyoumeAddress() {
		return this == delimitorJapaneseCyoumeAddress;
	}

	@Override
	public boolean isDelimitorJapaneseBanchiAddress() {
		return this == delimitorJapaneseBanchiAddress;
	}

	@Override
	public boolean isDelimitorJapaneseGouAddress() {
		return this == delimitorJapaneseGouAddress;
	}

	@Override
	public boolean isNormal() {
		return this == normal;
	}

	@Override
	public boolean isAllKind() {
		return true;
	}
	
	static Map>> multiCharacterDictionaryByCharacterKind = new HashMap<>();
	
	static Map> dictionaryByCharacterKind = new HashMap<>();
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy