// org.unlaxer.jaddress.parser.CharacterKind (Maven / Gradle / Ivy artifact listing)
package org.unlaxer.jaddress.parser;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.ListIterator;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import java.util.stream.Stream;
import org.unlaxer.jaddress.UserHomeContext;
import org.unlaxer.util.function.Unchecked;
import io.vavr.Tuple2;
import io.vavr.control.Try;
public enum CharacterKind implements CombinedCharacterKindAllMatch{
alphabet(
new RangeBothInclusive("A".codePointAt(0), "Z".codePointAt(0)),
new RangeBothInclusive("a".codePointAt(0), "z".codePointAt(0)),
new RangeBothInclusive("A".codePointAt(0), "Z".codePointAt(0)),
new RangeBothInclusive("a".codePointAt(0), "z".codePointAt(0))
),
arabicNumber(
new RangeBothInclusive("0".codePointAt(0), "9".codePointAt(0)),
new RangeBothInclusive("0".codePointAt(0), "9".codePointAt(0))
),
japaneseNumber(
toCodePointSet("一二三四五六七八九十百千万")
),
hiragana(new RangeBothInclusive("ぁ".codePointAt(0), "ん".codePointAt(0))),
katakana(new RangeBothInclusive("ァ".codePointAt(0), "ヶ".codePointAt(0))),// E382A1 - E383B6
delimitorHyphen(
toCodePointSet("-―-")
),// hyphen
delimitorSlash(
toCodePointSet("/⁄/\")
),// splash
delimitorSpace(
toCodePointSet(" ")//space tab zenkaku-space
),// space ,
delimitorComma(
toCodePointSet(".,;:、。:;"),
true
),// comma
delimitorJapaneseSymbol(
toCodePointSet("・·ーー~~"),
true
),// japanese delimitor
delimitorJapanese(
toCodePointSet("のノ之乃"),
true
),// japanese delimitor
delimitorJapaneseCyoumeAddress(
toCodePointSet(
"丁目",
"丁" /*大阪府堺市の「丁」*/,
"番町", /* 三重県名張市・徳島県徳島市の「番町」 三重県名張市では「○○△丁目」という町名は見られない。かわりに「○○△番町」という町名が使われている */
"町目" /* 福島県いわき市・郡山市の「町目」 福島県いわき市や郡山市の一部では、「丁目」のかわりに「町目」が使われている。いわき市平一町目、郡山市西田町三町目など。*/
),
true
),
delimitorJapaneseBanchiAddress(
toCodePointSet("番地","番"),
true
),
delimitorJapaneseGouAddress(
toCodePointSet("号"),
true
),
// japanese delimitor
symbol(
toCodePointSet(
"!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~!”#$%&’()*+,-./:;<=>?@[¥]^_`{|}~"
),
true
),// symbol
normal,
;
/**
 * Reports whether this kind counts as Japanese text: {@code normal}, the kana
 * kinds, the Japanese delimiter kinds and {@code japaneseNumber}.
 */
@Override
public boolean isJapanese() {
    switch (this) {
        case normal:
        case katakana:
        case hiragana:
        case delimitorJapaneseSymbol:
        case delimitorJapanese:
        case delimitorJapaneseCyoumeAddress:
        case delimitorJapaneseBanchiAddress:
        case delimitorJapaneseGouAddress:
        case japaneseNumber:
            return true;
        default:
            return false;
    }
}
/** Returns the delimiter flag that was fixed when this constant was constructed. */
@Override
public boolean isDelimitor() {
    return this.isDelimitor;
}
/**
 * Reports whether this kind is one of the three Japanese address delimiters
 * (cyoume, banchi, gou).
 */
@Override
public boolean isJapanesAddressDelimitor(){
    switch (this) {
        case delimitorJapaneseCyoumeAddress:
        case delimitorJapaneseBanchiAddress:
        case delimitorJapaneseGouAddress:
            return true;
        default:
            return false;
    }
}
/** Reports whether this kind is a number kind, either Arabic or Japanese. */
@Override
public boolean isNumber() {
    if (arabicNumber == this) {
        return true;
    }
    return japaneseNumber == this;
}
final RangeBothInclusive[] ranges;
final Set dictionary;
final Set> multiCharacterDictionary;
final Set strings;
/**
 * Returns the string form of every code point and word of this kind.
 *
 * @return the set held by this constant (not a copy)
 */
public Set<String> strings(){
    return strings;
}
/**
 * Returns the single-code-point dictionary of this kind, preferring the
 * user-supplied file (see {@link #fromFile()}) over the built-in set.
 * The result is cached per constant in {@code dictionaryByCharacterKind}.
 *
 * @return the cached or freshly resolved dictionary, never {@code null}
 */
final Set<Integer> dictionary(){
    Set<Integer> cached = dictionaryByCharacterKind.get(this);
    if (cached != null) {
        return cached;
    }
    // The second tuple element carries the single code points.
    Set<Integer> resolved = fromFile()
        .map(Tuple2::_2)
        .orElse(dictionary);
    dictionaryByCharacterKind.put(this, resolved);
    return resolved;
}
final Set> multiCharacterDictionary(){
if(multiCharacterDictionaryByCharacterKind.containsKey(this)) {
return multiCharacterDictionaryByCharacterKind.get(this);
}
Set> dictionaryFromFile =
fromFile()
.map(Tuple2::_1)
.orElse(multiCharacterDictionary);
multiCharacterDictionaryByCharacterKind.put(this, dictionaryFromFile);
return multiCharacterDictionaryByCharacterKind.get(this);
}
Optional>, Set>> fromFile(){
Path pathWithFolderAndFile = UserHomeContext.getPathWithFolderAndFile("unlaxer-JAP", name()+".txt");
if(Files.exists(pathWithFolderAndFile)) {
return Try.ofSupplier(()->{
List words;
words = Unchecked.of(()->Files.newBufferedReader(pathWithFolderAndFile).lines().collect(Collectors.toList())).get();
Tuple2>, Set> codePointSet = toCodePointSet(words);
return codePointSet;
}).toJavaOptional();
}else {
return Optional.empty();
}
}
/** Delimiter flag of this kind; assigned once by each constructor and returned by isDelimitor(). */
final boolean isDelimitor;
static Tuple2> ,Set> toCodePointSet(String... words){
return toCodePointSet(List.of(words));
}
static Tuple2> ,Set> toCodePointSet(Collection words){
Set single = words.stream()
.filter(word->word.length() == 1)
.map(character->character.codePointAt(0))
.collect(Collectors.toSet());
Set> multi = words.stream()
.filter(word->word.length() > 1)
.map(CharacterKind::toCodePoints)
.collect(Collectors.toSet());
return new Tuple2<>(multi , single);
}
/**
 * Returns the distinct code points of the given string.
 * Iterates by code point, so surrogate pairs yield one supplementary code
 * point and the empty string yields an empty set.
 *
 * @param words the characters to collect
 * @return the set of code points occurring in {@code words}
 */
static Set<Integer> toCodePointSet(String words){
    return words.codePoints()
        .boxed()
        .collect(Collectors.toSet());
}
/**
 * Converts a word into the ordered list of its code points.
 *
 * @param word the word to convert
 * @return the code points of {@code word} in order of appearance
 */
static List<Integer> toCodePoints(String word){
    return word.codePoints()
        .boxed()
        .collect(Collectors.toList());
}
/**
 * Creates a range-based, non-delimiter kind. Both dictionaries are empty and
 * {@code strings} holds one string per code point of the given ranges.
 *
 * @param ranges the inclusive code point ranges of this kind (may be empty)
 */
private CharacterKind(RangeBothInclusive... ranges) {
    this.isDelimitor = false;
    this.ranges = ranges;
    this.dictionary = Collections.emptySet();
    this.multiCharacterDictionary = Collections.emptySet();
    this.strings = Stream.of(ranges)
        .flatMap(range -> range.stream().boxed())
        .map(codePoint -> new String(Character.toChars(codePoint)))
        .collect(Collectors.toSet());
}
/**
 * Creates a dictionary-based kind that is not flagged as a delimiter.
 *
 * @param dictonary the single code points of this kind
 */
private CharacterKind(Set<Integer> dictonary) {
    this(dictonary,false);
}
/**
 * Creates a dictionary-based kind made of single code points only.
 *
 * @param dictonary   the single code points of this kind
 * @param isDelimitor the value later returned by {@code isDelimitor()}
 */
private CharacterKind(Set<Integer> dictonary , boolean isDelimitor) {
    this.ranges = new RangeBothInclusive[] {};
    this.dictionary = dictonary;
    this.multiCharacterDictionary = Collections.emptySet();
    this.isDelimitor = isDelimitor;
    this.strings = dictonary.stream()
        .map(codePoint -> new String(Character.toChars(codePoint)))
        .collect(Collectors.toSet());
}
/**
 * Creates a dictionary-based kind from the pair produced by the word-based
 * {@code toCodePointSet} overloads.
 *
 * @param dictonaries first element: multi-code-point words; second element:
 *                    single code points
 * @param isDelimitor the value later returned by {@code isDelimitor()}
 */
private CharacterKind(
        Tuple2<Set<List<Integer>>, Set<Integer>> dictonaries ,
        boolean isDelimitor) {
    // Same initialisation as the three-argument constructor, so delegate to it.
    this(dictonaries._1, dictonaries._2, isDelimitor);
}
/**
 * Creates a dictionary-based kind with both multi-code-point words and single
 * code points. {@code strings} receives the string form of every entry of
 * both dictionaries.
 *
 * @param multiCharacterDictionary the words of this kind as code point lists
 * @param dictonary                the single code points of this kind
 * @param isDelimitor              the value later returned by {@code isDelimitor()}
 */
private CharacterKind(
        Set<List<Integer>> multiCharacterDictionary ,
        Set<Integer> dictonary ,
        boolean isDelimitor) {
    this.ranges = new RangeBothInclusive[] {};
    this.dictionary = dictonary;
    this.multiCharacterDictionary = multiCharacterDictionary;
    this.isDelimitor = isDelimitor;

    Set<String> allStrings = new HashSet<>();
    for (Integer codePoint : dictonary) {
        allStrings.add(new String(Character.toChars(codePoint)));
    }
    for (List<Integer> word : multiCharacterDictionary) {
        int[] codePoints = word.stream().mapToInt(Integer::intValue).toArray();
        allStrings.add(new String(codePoints , 0 , codePoints.length));
    }
    this.strings = allStrings;
}
/**
 * Tries to match this kind at the iterator's current position.
 * <p>
 * {@code normal} accepts any single code point. Other kinds first try each
 * word of {@code multiCharacterDictionary} (in the set's iteration order),
 * then test the next code point against {@code ranges} and {@code dictionary}.
 * On success the iterator is left after the matched code points; on failure
 * it is restored to where it started.
 *
 * @param codePointIterator iterator positioned at the code point to classify;
 *                          the caller must ensure it has a next element
 * @return the matched code points, or an empty list when this kind does not match
 */
List<Integer> matched(ListIterator<Integer> codePointIterator){
    if(this == normal) {
        return List.of(codePointIterator.next());
    }

    // Multi-code-point words take precedence over single code points.
    for (List<Integer> wordOfDictionary : multiCharacterDictionary) {
        if (consumeWord(codePointIterator, wordOfDictionary)) {
            return new ArrayList<>(wordOfDictionary);
        }
    }

    int codePoint = codePointIterator.next();
    for (RangeBothInclusive range : ranges) {
        if(range.in(codePoint)) {
            return List.of(codePoint);
        }
    }
    if(dictionary.contains(codePoint)) {
        return List.of(codePoint);
    }
    // No match: give the code point back.
    codePointIterator.previous();
    return Collections.emptyList();
}

/** Advances the iterator over {@code word} if it is next in the input; otherwise rewinds and returns false. */
private static boolean consumeWord(ListIterator<Integer> codePointIterator, List<Integer> word) {
    int consumed = 0;
    for (Integer expected : word) {
        if (false == codePointIterator.hasNext()) {
            resetIterator(codePointIterator, consumed);
            return false;
        }
        Integer actual = codePointIterator.next();
        consumed++;
        if (actual.intValue() != expected.intValue()) {
            resetIterator(codePointIterator, consumed);
            return false;
        }
    }
    return true;
}
/**
 * Moves the iterator back by the given number of elements.
 *
 * @param iterator the iterator to rewind
 * @param consumed how many {@code previous()} calls to make; zero or less is a no-op
 */
static void resetIterator(ListIterator<?> iterator , int consumed) {
    for(int remaining = consumed ; remaining > 0 ; remaining--) {
        iterator.previous();
    }
}
/**
 * Splits the string into runs of character kinds, with Japanese kinds
 * concatenated (delegates with {@code concatJapaneseSymbolToNormal = true}).
 *
 * @param string the text to analyse
 * @return the runs found in {@code string}
 */
public static StringAndCharacterKinds stringAndCharacterKindsOf(String string) {
    final boolean concatJapaneseSymbolToNormal = true;
    return stringAndCharacterKindsOf(string , concatJapaneseSymbolToNormal);
}
/**
* @param string
* @param concatJapaneseSymbolToNormal
* @return StringAndCharacterKinds
*
* concatJapaneseSymbolToNormal true -> "肉ー" is [normal("肉ー")]
* concatJapaneseSymbolToNormal false -> "肉ー" is [normal("肉"),delimitorJapaneseSymbol("ー")]
*/
public static StringAndCharacterKinds stringAndCharacterKindsOf(
String string ,
boolean concatJapaneseSymbolToNormal) {
if(string == null || string.isEmpty()) {
return StringAndCharacterKinds.empty();
}
List collect = characterKindsOf(string);
List> rans = new ArrayList>();
CharacterKind last = null;
var ran = new ArrayList();
for (CodePointAndCharacterKind codePointAndCharacterKind : collect) {
CharacterKind characterKind = codePointAndCharacterKind.characterKind;
if(concatJapaneseSymbolToNormal && last != null && last.isJapanese() && characterKind.isJapanese()) {
codePointAndCharacterKind = codePointAndCharacterKind.as(CharacterKind.normal);
ran.add(codePointAndCharacterKind);
}else if(last != characterKind) {
if(false == ran.isEmpty()) {
rans.add(new ArrayList<>(ran));
ran.clear();
}
ran.add(codePointAndCharacterKind);
last = characterKind;
}else {
ran.add(codePointAndCharacterKind);
}
}
if(false == ran.isEmpty()) {
rans.add(ran);
}
List results = new ArrayList<>();
for (List sameKindList : rans) {
CharacterKind characterKind = sameKindList.get(0).characterKind;
int[] codePoints = new int[sameKindList.size()];
int index =0;
for (CodePointAndCharacterKind codePointAndCharacterKind : sameKindList) {
codePoints[index++] = codePointAndCharacterKind.codePoint;
}
String ranString = new String(codePoints , 0 , codePoints.length);
StringAndCharacterKind stringAndCharacterKind = new StringAndCharacterKind(characterKind, ranString);
results.add(stringAndCharacterKind);
}
return new StringAndCharacterKinds(results);
}
/**
 * Classifies the code point(s) at the iterator's current position by trying
 * every kind in declaration order and taking the first that matches.
 *
 * @param codePointIterator iterator positioned at the code point to classify
 * @return the matched code points, each tagged with the matching kind
 * @throws IllegalArgumentException if no kind matches
 */
static List<CodePointAndCharacterKind> codePointAndCharacterKinds(ListIterator<Integer> codePointIterator){
    for(CharacterKind characterKind : CharacterKind.values()) {
        List<Integer> matched = characterKind.matched(codePointIterator);
        if(false == matched.isEmpty()) {
            return create(matched , characterKind);
        }
    }
    throw new IllegalArgumentException(
        "no CharacterKind matched at code point index " + codePointIterator.nextIndex());
}
/**
 * Tags every code point of a matched word with the given kind.
 *
 * @param wordOfDictionary the matched code points
 * @param characterKind    the kind to attach to each code point
 * @return one {@code CodePointAndCharacterKind} per code point, in order
 */
static List<CodePointAndCharacterKind> create(List<Integer> wordOfDictionary,
        CharacterKind characterKind) {
    return wordOfDictionary.stream()
        .map(codePoint -> new CodePointAndCharacterKind(characterKind, codePoint))
        .collect(Collectors.toList());
}
/** Reports whether this kind has at least one multi-code-point word in its built-in dictionary. */
public boolean hasMultipleCharacter() {
    return !multiCharacterDictionary.isEmpty();
}
/**
 * Classifies every code point of the string.
 *
 * @param string the text to analyse; must not be {@code null}
 * @return the code points of {@code string} in order, each tagged with its kind
 */
public static List<CodePointAndCharacterKind> characterKindsOf(String string) {
    ListIterator<Integer> codePointIterator = string.codePoints()
        .boxed()
        .collect(Collectors.toList())
        .listIterator();
    List<CodePointAndCharacterKind> results = new ArrayList<>();
    // Each call consumes one code point or one whole multi-code-point word.
    while (codePointIterator.hasNext()) {
        results.addAll(codePointAndCharacterKinds(codePointIterator));
    }
    return results;
}
/** Returns true only for the {@code arabicNumber} constant. */
@Override
public boolean isArabicNumber() {
    return arabicNumber == this;
}
/** Returns true only for the {@code symbol} constant. */
@Override
public boolean isSymbol() {
    return symbol == this;
}
/** Returns true only for the {@code alphabet} constant. */
@Override
public boolean isAlphabet() {
    return alphabet == this;
}
/** Returns true only for the {@code japaneseNumber} constant. */
@Override
public boolean isJapaneseNumber() {
    return japaneseNumber == this;
}
/** Returns true only for the {@code hiragana} constant. */
@Override
public boolean isHiragana() {
    return hiragana == this;
}
/** Returns true only for the {@code katakana} constant. */
@Override
public boolean isKatakana() {
    return katakana == this;
}
/** Returns true only for the {@code delimitorHyphen} constant. */
@Override
public boolean isDelimitorHyphen() {
    return delimitorHyphen == this;
}
/** Returns true only for the {@code delimitorSlash} constant. */
@Override
public boolean isDelimitorSlash() {
    return delimitorSlash == this;
}
/** Returns true only for the {@code delimitorSpace} constant. */
@Override
public boolean isDelimitorSpace() {
    return delimitorSpace == this;
}
/** Returns true only for the {@code delimitorComma} constant. */
@Override
public boolean isDelimitorComma() {
    return delimitorComma == this;
}
/** Returns true only for the {@code delimitorJapanese} constant. */
@Override
public boolean isDelimitorJapanese() {
    return delimitorJapanese == this;
}
/** Returns true only for the {@code delimitorJapaneseCyoumeAddress} constant. */
@Override
public boolean isDelimitorJapaneseCyoumeAddress() {
    return delimitorJapaneseCyoumeAddress == this;
}
/** Returns true only for the {@code delimitorJapaneseBanchiAddress} constant. */
@Override
public boolean isDelimitorJapaneseBanchiAddress() {
    return delimitorJapaneseBanchiAddress == this;
}
/** Returns true only for the {@code delimitorJapaneseGouAddress} constant. */
@Override
public boolean isDelimitorJapaneseGouAddress() {
    return delimitorJapaneseGouAddress == this;
}
/** Returns true only for the {@code normal} constant. */
@Override
public boolean isNormal() {
    return normal == this;
}
/** Always returns true, whatever the constant. */
@Override
public boolean isAllKind() {
    final boolean allKind = true;
    return allKind;
}
static Map>> multiCharacterDictionaryByCharacterKind = new HashMap<>();
static Map> dictionaryByCharacterKind = new HashMap<>();
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy