// org.unlaxer.jaddress.parser.CharacterKind - Maven / Gradle / Ivy
package org.unlaxer.jaddress.parser;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.ListIterator;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import java.util.stream.Stream;
import org.unlaxer.jaddress.UserHomeContext;
import org.unlaxer.util.collection.Comparators;
import org.unlaxer.util.function.Unchecked;
import io.vavr.Tuple2;
import io.vavr.control.Try;
/**
 * Kinds of characters recognised by the address parser.
 *
 * Each constant is defined by inclusive code-point ranges, by a dictionary of
 * single code points, and/or by a dictionary of multi-code-point words, and
 * carries a flag telling whether the kind acts as a delimiter.
 */
public enum CharacterKind implements CombinedCharacterKindAllMatch{
// empty dictionary, flagged as a delimiter
terminator(Collections.emptySet(),true),
// https://hydrocul.github.io/wiki/blog/2014/1101-hyphen-minus-wave-tilde.html
/*
Reference table of hyphen-like characters (glyph, UTF-8 bytes, code point, description):
- 2D U+002D ASCII hyphen
‐ E28090 U+2010 another hyphen
‑ E28091 U+2011 non-breaking hyphen
– E28093 U+2013 EN dash
— E28094 U+2014 EM dash
― E28095 U+2015 full-width dash
− E28892 U+2212 full-width minus
ー E383BC U+30FC full-width prolonged sound mark
ー EFBDB0 U+FF70 half-width katakana prolonged sound mark
*/
// NOTE(review): the third and fourth ranges repeat the first two exactly as written here;
// possibly full-width letter ranges were intended - confirm against the upstream source.
alphabet(
new RangeBothInclusive("A".codePointAt(0), "Z".codePointAt(0)),
new RangeBothInclusive("a".codePointAt(0), "z".codePointAt(0)),
new RangeBothInclusive("A".codePointAt(0), "Z".codePointAt(0)),
new RangeBothInclusive("a".codePointAt(0), "z".codePointAt(0))
),
// NOTE(review): the second range repeats the first as written here;
// possibly full-width digits were intended - confirm.
arabicNumber(
new RangeBothInclusive("0".codePointAt(0), "9".codePointAt(0)),
new RangeBothInclusive("0".codePointAt(0), "9".codePointAt(0))
),
// numerals written with kanji and circle characters; every character of the literal is one entry
japaneseAddressNumber(
toCodePointSet("一壱弌壹二弍弐貳三弎参參四亖五六七八九十拾什百佰陌千〇○")// not included: 仟阡万零
),
// range from small "ぁ" to "ん"
hiragana(new RangeBothInclusive("ぁ".codePointAt(0), "ん".codePointAt(0))),
// range from small "ァ" to "ヶ"
katakana(new RangeBothInclusive("ァ".codePointAt(0), "ヶ".codePointAt(0))),// UTF-8 bytes E382A1 - E383B6
// prolonged sound marks
// NOTE(review): both characters of the literal look identical here; the second was
// presumably the half-width form (U+FF70) listed in the table above - confirm.
cyouon(toCodePointSet("ーー")),
//TODO separates hyphen / dash /wave
// Hyphen-like delimiters; every character of the literal is one dictionary entry.
delimitorHyphen(
/*
Reference table of hyphen-like characters (glyph, UTF-8 bytes, code point, description):
- 2D U+002D ASCII hyphen
‐ E28090 U+2010 another hyphen
‑ E28091 U+2011 non-breaking hyphen
– E28093 U+2013 EN dash
— E28094 U+2014 EM dash
― E28095 U+2015 full-width dash
− E28892 U+2212 full-width minus
ー E383BC U+30FC full-width prolonged sound mark
ー EFBDB0 U+FF70 half-width katakana prolonged sound mark
*/
toCodePointSet("-‐‑–—―−-"),//<- an ASCII minus was appended at the end
true
),// hyphen
delimitorSlash(
toCodePointSet("/⁄/\"),
true
),// splash
// Whitespace delimiters.
// NOTE(review): the trailing comment names space, tab and full-width space, but the
// literal as written here holds a single character - confirm against upstream.
delimitorSpace(
toCodePointSet(" "),//space tab zenkaku-space
true
),// space ,
// Punctuation delimiters (period, comma, semicolon, colon and Japanese equivalents).
delimitorComma(
toCodePointSet(".,;:、。:;"),
true
),// comma
// Japanese symbol delimiters (middle dots and wave/tilde characters).
delimitorJapaneseSymbol(
toCodePointSet("・·~~"), // ーー <- these are prolonged sound marks (not part of this set)
true
),// japanese delimitor
//TODO set to before hiragana?
// Connective characters flagged as delimiters.
delimitorJapanese(
toCodePointSet("のノ之乃"),
true
),// japanese suffix
// Suffix words for the chome level; multi-argument toCodePointSet yields both
// single-code-point and multi-code-point dictionaries. Not flagged as a delimiter.
suffix丁目(
toCodePointSet(
"丁目",
"丁" /* the "丁" used in Sakai, Osaka */,
"番町", /* the "番町" of Nabari (Mie) and Tokushima (Tokushima): in Nabari, town names of the form "○○△町域Top1" are not found; "○○△番町" is used instead */
"町目" /* the "町目" of Iwaki and Koriyama (Fukushima): in parts of these cities "町目" is used instead of "町域Top1", e.g. いわき市平一町目, 郡山市西田町三町目 */
),
false
),
// Suffix words for the lot-number level.
suffix地番(
toCodePointSet("番地","番"),
false
),
// Room-number suffix; passed as a List, so the Collection overload is used.
suffix号室(
toCodePointSet(
List.of("号室")),
false
),
// Single-character suffix "号".
suffix号(
toCodePointSet("号"),
false
),
// Floor suffixes.
// NOTE(review): "F" appears twice as written here; the second was possibly a full-width F - confirm.
suffix階(
toCodePointSet("階","F","F"),
false
),
// Building / wing suffixes.
suffix棟(
toCodePointSet("号棟","号館","番館","番棟","棟"),
false
),
// The single character "堂".
_堂(
toCodePointSet("堂"),
false
),
// The ten characters 甲 through 癸.
十干(
toCodePointSet("甲","乙","丙","丁","戊","己","庚","辛","壬","癸"),
false
),
// "No" style number prefixes.
// NOTE(review): every spelling appears twice as written here; the duplicates were
// possibly full-width variants - confirm.
prefixNo(
toCodePointSet(
"No","No","No.","No.",
"NO","NO","NO.","NO."
),
false
),
// The ordinal prefix "第".
prefix第(
toCodePointSet(
"第"
),
false
),
// Symbol characters; every character of the literal is one dictionary entry.
// NOTE(review): the second half of the literal largely repeats the ASCII symbols of the
// first half; it was possibly the full-width symbol set - confirm.
symbol(
toCodePointSet(
"!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~!”#$%&’()*+,-./:;<=>?@[¥]^_`{|}~"
),
false
),// symbol
// No ranges and no dictionaries; matched() accepts any single code point for this kind.
normal,
;
// fields
final boolean isDelimitor;
final RangeBothInclusive[] ranges;
final Set dictionary;
final Set> multiCharacterDictionary;
final List strings;
// constructors
private CharacterKind(RangeBothInclusive... ranges) {
this.ranges = ranges;
dictionary = Collections.emptySet();
multiCharacterDictionary = Collections.emptySet();
isDelimitor = false;
strings = Stream.of(ranges)
.map(RangeBothInclusive::stream)
.flatMap(IntStream::boxed)
.map(codePoint-> new String(new int[] {codePoint},0,1))
.collect(Collectors.toList());
strings.sort(Comparators.longerIsFirst);
}
/**
 * Creates a non-delimiter kind from a dictionary of single code points.
 *
 * @param dictonary the code points that belong to this kind
 */
private CharacterKind(Set<Integer> dictonary) {
    this(dictonary, false);
}
/**
 * Creates a kind from a dictionary of single code points.
 * The kind has no ranges and no multi-code-point words.
 *
 * @param dictonary   the code points that belong to this kind
 * @param isDelimitor whether this kind acts as a delimiter
 */
private CharacterKind(Set<Integer> dictonary, boolean isDelimitor) {
    this.ranges = new RangeBothInclusive[] {};
    this.dictionary = dictonary;
    this.multiCharacterDictionary = Collections.emptySet();
    this.isDelimitor = isDelimitor;
    // one string per code point
    this.strings = dictonary.stream()
        .map(codePoint -> new String(new int[] {codePoint}, 0, 1))
        .collect(Collectors.toList());
    this.strings.sort(Comparators.longerIsFirst);
}
/**
 * Creates a kind from the pair produced by the multi-word {@code toCodePointSet}
 * overloads: element 1 holds the multi-code-point words, element 2 the single code points.
 * Delegates to the three-argument constructor, which performs the same initialisation.
 *
 * @param dictonaries the (multi-code-point words, single code points) pair
 * @param isDelimitor whether this kind acts as a delimiter
 */
private CharacterKind(
        Tuple2<Set<List<Integer>>, Set<Integer>> dictonaries,
        boolean isDelimitor) {
    this(dictonaries._1, dictonaries._2, isDelimitor);
}
/**
 * Creates a kind from a multi-code-point dictionary and a single-code-point dictionary.
 * The kind has no ranges.
 *
 * @param multiCharacterDictionary words, each given as its list of code points
 * @param dictonary                single code points that belong to this kind
 * @param isDelimitor              whether this kind acts as a delimiter
 */
private CharacterKind(
        Set<List<Integer>> multiCharacterDictionary,
        Set<Integer> dictonary,
        boolean isDelimitor) {
    this.ranges = new RangeBothInclusive[] {};
    this.dictionary = dictonary;
    this.multiCharacterDictionary = multiCharacterDictionary;
    this.isDelimitor = isDelimitor;
    // string form of every single code point
    Set<String> singles = dictonary.stream()
        .map(codePoint -> new String(new int[] {codePoint}, 0, 1))
        .collect(Collectors.toSet());
    // string form of every multi-code-point word
    Set<String> words = multiCharacterDictionary.stream()
        .map(codePointsOfWord -> {
            int[] codePoints = codePointsOfWord.stream()
                .mapToInt(Integer::intValue)
                .toArray();
            return new String(codePoints, 0, codePoints.length);
        })
        .collect(Collectors.toSet());
    this.strings = new ArrayList<>();
    this.strings.addAll(singles);
    this.strings.addAll(words);
    this.strings.sort(Comparators.longerIsFirst);
}
/**
 * Returns whether this kind is one of the kinds treated as Japanese:
 * normal, katakana, hiragana, the two Japanese delimiter kinds, the
 * 丁目 / 地番 / 号 suffixes and the Japanese address numerals.
 */
@Override
public boolean isJapanese() {
    switch (this) {
        case normal:
        case katakana:
        case hiragana:
        case delimitorJapaneseSymbol:
        case delimitorJapanese:
        case suffix丁目:
        case suffix地番:
        case suffix号:
        case japaneseAddressNumber:
            return true;
        default:
            return false;
    }
}
/** Returns the delimiter flag given to this kind at construction. */
@Override
public boolean isDelimitor() {
    return this.isDelimitor;
}
/** Returns whether this kind is one of the suffixes 丁目, 地番 or 号. */
@Override
public boolean isJapanesAddressDelimitor(){
    switch (this) {
        case suffix丁目:
        case suffix地番:
        case suffix号:
            return true;
        default:
            return false;
    }
}
/** Returns whether this kind is an Arabic numeral or a Japanese address numeral. */
@Override
public boolean isNumber() {
    if (this == arabicNumber) {
        return true;
    }
    return this == japaneseAddressNumber;
}
/**
 * Returns the string forms of this kind's entries, in the order established
 * by the constructor (sorted with {@code Comparators.longerIsFirst}).
 * The returned list is the internal list, not a copy.
 */
public List<String> strings(){
    return strings;
}
/**
 * Returns the single-code-point dictionary for this kind, preferring the one
 * loaded by {@link #fromFile()} and falling back to the built-in dictionary.
 * The result is cached per kind in {@code dictionaryByCharacterKind}.
 */
final Set<Integer> dictionary(){
    return dictionaryByCharacterKind.computeIfAbsent(
        this,
        kind -> kind.fromFile()
            .map(Tuple2::_2)
            .orElse(kind.dictionary));
}
final Set> multiCharacterDictionary(){
if(multiCharacterDictionaryByCharacterKind.containsKey(this)) {
return multiCharacterDictionaryByCharacterKind.get(this);
}
Set> dictionaryFromFile =
fromFile()
.map(Tuple2::_1)
.orElse(multiCharacterDictionary);
multiCharacterDictionaryByCharacterKind.put(this, dictionaryFromFile);
return multiCharacterDictionaryByCharacterKind.get(this);
}
Optional>, Set>> fromFile(){
Path pathWithFolderAndFile =
UserHomeContext.getPathWithFolderAndFile("unlaxer-JAP", name()+".txt");
if(Files.exists(pathWithFolderAndFile)) {
return Try.ofSupplier(()->{
List words;
words = Unchecked.of(
()->Files.newBufferedReader(pathWithFolderAndFile).lines()
.collect(Collectors.toList())).get();
Tuple2>, Set> codePointSet = toCodePointSet(words);
return codePointSet;
}).toJavaOptional();
}else {
return Optional.empty();
}
}
static Tuple2> ,Set> toCodePointSet(String... words){
return toCodePointSet(List.of(words));
}
static Tuple2> ,Set> toCodePointSet(Collection words){
Set single = words.stream()
.filter(word->word.length() == 1)
.map(character->character.codePointAt(0))
.collect(Collectors.toSet());
Set> multi = words.stream()
.filter(word->word.length() > 1)
.map(CharacterKind::toCodePoints)
.collect(Collectors.toSet());
return new Tuple2<>(multi , single);
}
/**
 * Returns the set of code points contained in the given string, treating every
 * code point as one entry. Surrogate pairs are kept together and the empty
 * string yields an empty set.
 *
 * @param words a string whose individual characters are the entries
 * @return the distinct code points of {@code words}
 */
static Set<Integer> toCodePointSet(String words){
    return words.codePoints()
        .boxed()
        .collect(Collectors.toSet());
}
/**
 * Returns the code points of the given word in order.
 *
 * @param word the word to decompose
 * @return the code points of {@code word}
 */
static List<Integer> toCodePoints(String word){
    return word.codePoints()
        .boxed()
        .collect(Collectors.toList());
}
/** Returns whether this kind has at least one multi-code-point word. */
public boolean hasMultipleCharacter() {
    return !multiCharacterDictionary.isEmpty();
}
/** Returns whether this kind is {@code arabicNumber}. */
@Override
public boolean isArabicNumber() {
    return arabicNumber == this;
}
/** Returns whether this kind is {@code symbol}. */
@Override
public boolean isSymbol() {
    return symbol == this;
}
/** Returns whether this kind is {@code alphabet}. */
@Override
public boolean isAlphabet() {
    return alphabet == this;
}
/** Returns whether this kind is {@code japaneseAddressNumber}. */
@Override
public boolean isJapaneseNumber() {
    return japaneseAddressNumber == this;
}
/** Returns whether this kind is {@code hiragana}. */
@Override
public boolean isHiragana() {
    return hiragana == this;
}
/** Returns whether this kind is {@code katakana}. */
@Override
public boolean isKatakana() {
    return katakana == this;
}
/** Returns whether this kind is {@code delimitorHyphen}. */
@Override
public boolean isDelimitorHyphen() {
    return delimitorHyphen == this;
}
/** Returns whether this kind is {@code delimitorSlash}. */
@Override
public boolean isDelimitorSlash() {
    return delimitorSlash == this;
}
/** Returns whether this kind is {@code delimitorSpace}. */
@Override
public boolean isDelimitorSpace() {
    return delimitorSpace == this;
}
/** Returns whether this kind is {@code delimitorComma}. */
@Override
public boolean isDelimitorComma() {
    return delimitorComma == this;
}
/** Returns whether this kind is {@code delimitorJapanese}. */
@Override
public boolean isDelimitorJapanese() {
    return delimitorJapanese == this;
}
/** Returns whether this kind is {@code suffix丁目}. */
@Override
public boolean isDelimitorJapaneseCyoumeAddress() {
    return suffix丁目 == this;
}
/** Returns whether this kind is {@code suffix地番}. */
@Override
public boolean isDelimitorJapaneseBanchiAddress() {
    return suffix地番 == this;
}
/** Returns whether this kind is {@code suffix号}. */
@Override
public boolean isDelimitorJapaneseGouAddress() {
    return suffix号 == this;
}
/** Returns whether this kind is {@code normal}. */
@Override
public boolean isNormal() {
    return normal == this;
}
/** Always returns {@code true} for every constant of this enum. */
@Override
public boolean isAllKind() {
return true;
}
static Map>> multiCharacterDictionaryByCharacterKind = new HashMap<>();
static Map> dictionaryByCharacterKind = new HashMap<>();
/**
 * Tries to match this kind at the iterator's current position.
 *
 * For {@code normal} the next code point is always consumed and returned.
 * Otherwise the multi-code-point words are tried first, then the ranges and
 * the single-code-point dictionary. On success the matched code points have
 * been consumed from the iterator; on failure the iterator is restored to the
 * position it had on entry.
 *
 * NOTE(review): the words are tried in the iteration order of the set, not
 * longest first; if one word is a prefix of another the shorter one may win.
 * NOTE(review): this reads the built-in fields rather than the file-backed
 * {@code dictionary()} / {@code multiCharacterDictionary()} accessors - confirm intent.
 *
 * @param codePointIterator iterator over the input code points; must have a next
 *                          element unless a multi-code-point word matches
 * @return the matched code points, or an empty list when nothing matched
 */
List<Integer> matched(ListIterator<Integer> codePointIterator){
    if (this == normal) {
        return List.of(codePointIterator.next());
    }
    if (hasMultipleCharacter()) {
        List<Integer> result = new ArrayList<>();
        for (List<Integer> wordOfDictionary : multiCharacterDictionary) {
            int consumed = 0;
            boolean match = true;
            for (Integer codePointOfDictionary : wordOfDictionary) {
                boolean mismatch = true;
                if (codePointIterator.hasNext()) {
                    Integer codePoint = codePointIterator.next();
                    consumed++;
                    mismatch = codePoint.intValue() != codePointOfDictionary.intValue();
                    if (false == mismatch) {
                        result.add(codePoint);
                    }
                }
                if (mismatch) {
                    // input exhausted or different code point: undo what this word consumed
                    resetIterator(codePointIterator, consumed);
                    match = false;
                    result.clear();
                    break;
                }
            }
            if (match) {
                return result;
            }
        }
    }
    int codePoint = codePointIterator.next();
    for (RangeBothInclusive range : ranges) {
        if (range.in(codePoint)) {
            return List.of(codePoint);
        }
    }
    if (dictionary.contains(codePoint)) {
        return List.of(codePoint);
    }
    // nothing matched: give the code point back
    codePointIterator.previous();
    return Collections.emptyList();
}
/**
 * Moves the iterator back by the given number of elements.
 *
 * @param iterator the iterator to rewind
 * @param consumed how many {@code previous()} steps to take; zero or less does nothing
 */
static void resetIterator(ListIterator<?> iterator , int consumed) {
    for (int remaining = consumed; remaining > 0; remaining--) {
        iterator.previous();
    }
}
/** Returns whether this kind is {@code terminator}. */
@Override
public boolean isTerminator() {
    return terminator == this;
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy