// All downloads are free. The search and download functionality uses the official Maven repository.

// Source of org.unlaxer.jaddress.parser.processor.HeuristicBlockTokenizer (Maven / Gradle / Ivy)

package org.unlaxer.jaddress.parser.processor;

import static org.unlaxer.jaddress.parser.ResolverResultKindOfBoolean.actual丁目;
import static org.unlaxer.jaddress.parser.ResolverResultKindOfBoolean.actual地番;
import static org.unlaxer.jaddress.parser.ResolverResultKindOfBoolean.actual支号;
import static org.unlaxer.jaddress.parser.ResolverResultKindOfBoolean.actual枝番号;
import static org.unlaxer.jaddress.parser.ResolverResultKindOfBoolean.expects丁目;
import static org.unlaxer.jaddress.parser.ResolverResultKindOfBoolean.expects地番;
import static org.unlaxer.jaddress.parser.ResolverResultKindOfBoolean.expects支号;
import static org.unlaxer.jaddress.parser.ResolverResultKindOfBoolean.expects枝番号;
import static org.unlaxer.jaddress.parser.ResolverResultKindOfBoolean.丁目が最大値を超えた;

import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.SortedMap;
import java.util.stream.Collectors;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.unlaxer.jaddress.entity.standard.定義済みRange階層要素;
import org.unlaxer.jaddress.entity.standard.確定済み階層要素;
import org.unlaxer.jaddress.entity.standard.階層要素;
import org.unlaxer.jaddress.parser.AddressContext;
import org.unlaxer.jaddress.parser.AddressElement;
import org.unlaxer.jaddress.parser.AddressElementFactory;
import org.unlaxer.jaddress.parser.AddressProcessor;
import org.unlaxer.jaddress.parser.AddressToken;
import org.unlaxer.jaddress.parser.CharacterKind;
import org.unlaxer.jaddress.parser.HeuristicBlockMatcher;
import org.unlaxer.jaddress.parser.HeuristicBlockMatcher.MatchResult;
import org.unlaxer.jaddress.parser.NumberParser;
import org.unlaxer.jaddress.parser.ParsingState;
import org.unlaxer.jaddress.parser.ParsingTarget;
import org.unlaxer.jaddress.parser.ResolverResult;
import org.unlaxer.jaddress.parser.ResolverResultKindOfBoolean;
import org.unlaxer.jaddress.parser.SeparatorKind;
import org.unlaxer.jaddress.parser.StringAndCharacterKind;
import org.unlaxer.jaddress.parser.StringAndCharacterKinds;
import org.unlaxer.jaddress.parser.StringIndex;
import org.unlaxer.jaddress.parser.TargetStateAndElement;
import org.unlaxer.jaddress.parser.TripletAddressToken;
import org.unlaxer.jaddress.parser.processor.BlockHierarchyResolver.BlockPatternResolverResult;
import org.unlaxer.util.Singletons;
import org.unlaxer.util.collection.TreeNode;

import io.vavr.Tuple2;

/**
 * {@link AddressProcessor} for the {@link ParsingState#丁目以降を分割する} state.
 *
 * <p>Starting from the target node's address element, it walks the hierarchy levels listed in
 * {@code BlockPatternResolverResult.characterKindsBy階層要素}, asks {@link NumberParser} for the
 * next numeric token at each level, and — when {@link HeuristicBlockMatcher} accepts the token —
 * adds a new {@link AddressElement} for that level as a child of the target node. Whatever text
 * remains after the last accepted token is added as a {@code 定義済みRange階層要素.建物以降} element.
 */
public class HeuristicBlockTokenizer implements AddressProcessor{
	
	/**
	 * Upper bound for a number to be accepted as 町域Top1 (丁目). Per the original author's note,
	 * the largest 丁目 in Japan is reportedly 42.
	 */
	private static final int MAX_CHOME = 42;
	
	Logger logger = LoggerFactory.getLogger(getClass());
	
	/** @return the parsing state this processor handles. */
	@Override
	public ParsingState targetState() {
		return ParsingState.丁目以降を分割する;
	}

	static NumberParser numberParser = Singletons.get(NumberParser.class);
	
	/**
	 * Splits the target block into per-level address elements.
	 *
	 * <p>For every level an "expects" resolver result is recorded (even for levels that are then
	 * skipped); for every level that actually matched, an "actual" resolver result is recorded and
	 * a child element is attached to the target node.
	 *
	 * @param parsingTarget the parsing target whose address context and intermediate result are
	 *        read and updated
	 * @return the next state ({@code 都道府県から枝番までで建物階層と建物名をDBを用いて求める}) together
	 *         with {@code 定義済みRange階層要素.建物以降}
	 */
	@Override
	public TargetStateAndElement process(ParsingTarget parsingTarget) {
		
		AddressContext addressContext = parsingTarget.addressContext();
		
		BlockPatternResolverResult blockPatternResolverResult = 
				parsingTarget.intermediateResult().blockPatternResolverResult();
		
		TreeNode<AddressElement> targetNode = targetNode(parsingTarget);
		AddressElement addressElement = targetNode.get();
		
		logger.debug("target block address {}" , addressElement.asString());
		
		SortedMap<階層要素, TripletCharacterKinds> characterKindsBy階層要素 = 
			blockPatternResolverResult.characterKindsBy階層要素;
		
		// Running cursor: after each accepted level it holds the not-yet-consumed remainder.
		TripletAddressToken matchWithSuccessor = TripletAddressToken.onlySuccessorOf(addressElement);
		
		// Levels that an earlier iteration has already consumed and that must not be matched again.
		Map<階層要素,Boolean> skipBy階層要素 = new HashMap<>();
		
		for (Map.Entry<階層要素, TripletCharacterKinds> entry : characterKindsBy階層要素.entrySet()) {
			
			階層要素 _階層要素 = entry.getKey();
			TripletCharacterKinds tripletCharacterKinds = entry.getValue();

			// The expectation is recorded before the skip check, so skipped levels are reported too.
			addResolverResult(_階層要素, expectsResolverResultBy階層要素, parsingTarget);
			
			logger.debug("current is {}" , _階層要素);
			
			if(Boolean.TRUE.equals(skipBy階層要素.get(_階層要素))) {
				continue;
			}
			
			TripletAddressToken matchWithSuccessorPrevious = matchWithSuccessor;
			matchWithSuccessor = numberParser.matchWithSuccessor(matchWithSuccessor);
			
			MatchResult matchResult = HeuristicBlockMatcher.isMatch(parsingTarget, matchWithSuccessor, _階層要素);
			boolean matched = 
					matchWithSuccessor.isMatched() && 
					matchResult.isMatch();
			
			if(false == matched) {
				// FIXME (original author): clarify how this branch relates to the ★1 step below
				// (is the result overwritten by the successor?). This branch is reached when the
				// block is shorter than expected, e.g. 丁目/番地/号 were expected but only 丁目 is
				// present. Earlier, now disabled, code created a 建物以降 element from the original
				// token here and left the loop; ★1 creates the 建物以降 element regardless.
				// Current behaviour: restore the cursor and try the next level.
				matchWithSuccessor = matchWithSuccessorPrevious;
				continue;
			}
			
			// Level actually used for the created element; may differ from the map key (see below).
			階層要素 effective階層要素 = _階層要素;
			
			if(_階層要素 == 階層要素.町域Top1 && value(matchWithSuccessor) > MAX_CHOME) {
				
				parsingTarget.addResolverResult(new ResolverResult(丁目が最大値を超えた));
				
				// Too large for a 丁目: treat the number as 町域Top2 instead and make sure the
				// later iteration for 町域Top2 is skipped.
				// NOTE(review): tripletCharacterKinds still belongs to 町域Top1 here, so the suffix
				// lookup in create(...) uses the 町域Top1 suffixes — confirm this is intended.
				effective階層要素 = 階層要素.町域Top2;
				skipBy階層要素.put(階層要素.町域Top2, true);
			}
			
			Tuple2<AddressElement, TripletAddressToken> created = 
					create(matchWithSuccessor, effective階層要素 , tripletCharacterKinds , matchResult);
			
			addResolverResult(effective階層要素, actualResolverResultBy階層要素, parsingTarget);
			addressContext.addChild(targetNode,created._1());
			
			matchWithSuccessor = created._2();
		}
		
		// ★1: whatever is left after the last accepted level becomes the 建物以降 element.
		boolean hasSuccessor = false;
		if(matchWithSuccessor != null && matchWithSuccessor.successor().isPresent()) {
			AddressElement rest = 
				AddressElementFactory.of(matchWithSuccessor.successor(), 定義済みRange階層要素.建物以降);
			addressContext.addChild(targetNode,rest);
			hasSuccessor = true;
		}
		
		parsingTarget.intermediateResult().setHas建物以降(hasSuccessor);
		
		return 
				new TargetStateAndElement(
					ParsingState.都道府県から枝番までで建物階層と建物名をDBを用いて求める, 
					定義済みRange階層要素.建物以降);
	}
	
	/**
	 * Parses the matched part of the triplet with {@link Integer#parseInt(String)}.
	 *
	 * @param matchWithSuccessor triplet whose matched part is read
	 * @return the parsed value, or {@code -1} when any exception is raised while reading or
	 *         parsing the matched text
	 */
	int value(TripletAddressToken matchWithSuccessor) {
		//FIXME (original author): kanji numerals
		try {
			return Integer.parseInt(matchWithSuccessor.matched().asString());
		}catch (Exception ignored) {
			// Deliberately best-effort: a non-parsable token is reported as -1.
			return -1;
		}
	}
	
	/**
	 * Builds the address element for one level and determines the remaining cursor.
	 *
	 * <p>If one of the level's suffix strings is found in the triplet's successor, either at index
	 * 0 or preceded only by space delimiters, the suffix is stored on the element and the returned
	 * cursor starts right after that suffix. Otherwise the given triplet is returned unchanged as
	 * the cursor.
	 *
	 * @param triplet the matched triplet for this level
	 * @param _階層要素 the level the element is created for
	 * @param tripletCharacterKinds character kinds whose suffix strings are searched for
	 * @param matchResult result of the heuristic match; on {@code MatchResult.withSuffix} the
	 *        element's 確定済み階層要素 is set when one exists for the level
	 * @return tuple of the created element and the cursor to continue from
	 */
	Tuple2<AddressElement, TripletAddressToken> create(
			TripletAddressToken triplet ,階層要素 _階層要素 , 
			TripletCharacterKinds tripletCharacterKinds,MatchResult matchResult) {
		
		AddressElement element = create(triplet, _階層要素);
		
		if(matchResult == MatchResult.withSuffix) {
			確定済み階層要素.from(_階層要素).ifPresent(element::set確定済み階層要素);
		}
		
		AddressToken successor = triplet.successor();
		
		List<String> suffixes = tripletCharacterKinds.suffix().collection().stream()
				.map(CharacterKind::strings)
				.flatMap(Collection::stream)
				.collect(Collectors.toList());
		
		for (String suffix : suffixes) {
			
			StringIndex indexOf = successor.indexOf(suffix);
			
			boolean suffixLeadsSuccessor = 
					indexOf.isValid() && 
					(indexOf.value == 0 || isHeaderSpace(indexOf, successor));
			
			if(suffixLeadsSuccessor) {
				
				element.setSuffix(suffix);
				
				// Continue after the suffix.
				AddressToken afterSuffix = successor.substring(
					StringIndex.of(indexOf.value + suffix.length()),  
					SeparatorKind.domainSpecificSeparator, 
					SeparatorKind.domainSpecificSeparator);
				
				return new Tuple2<>(element, TripletAddressToken.onlySuccessorOf(afterSuffix));
			}
		}
		return new Tuple2<>(element, triplet);
	}
	
	/**
	 * Placeholder: not implemented yet, always returns an empty {@link Optional}.
	 * NOTE(review): the type argument {@code MatchedSuffix} is a reconstruction — confirm.
	 *
	 * @param successor currently unused
	 * @param _階層要素 currently unused
	 * @param characterKindsBy階層要素 currently unused
	 * @return always {@code Optional.empty()}
	 */
	Optional<MatchedSuffix> searchSuffix(
			AddressToken successor ,
			階層要素 _階層要素 ,
			SortedMap<階層要素, TripletCharacterKinds> characterKindsBy階層要素
			){
		
		return Optional.empty();
	}
	
	/**
	 * Tells whether everything in {@code successor} before {@code indexOf} is a space delimiter.
	 *
	 * @param indexOf exclusive end index of the inspected prefix
	 * @param successor token whose prefix is inspected
	 * @return {@code true} when every character kind in the prefix satisfies
	 *         {@link CharacterKind#isDelimitorSpace}
	 */
	boolean isHeaderSpace(StringIndex indexOf  , AddressToken successor) {
		AddressToken header = successor.substring(
				StringIndex.of(0), 
				indexOf, 
				SeparatorKind.domainSpecificSeparator, 
				SeparatorKind.domainSpecificSeparator);
		
		return header.stringAndCharacterKinds().stream()
			.map(StringAndCharacterKind::characterKind)
			.allMatch(CharacterKind::isDelimitorSpace);
	}
	
	/**
	 * Creates the address element for the given level from a matched triplet.
	 *
	 * <p>For 町域Top1 the predecessor and the matched part are joined as they are. For
	 * 町域Top2 to 町域Top4 the predecessor text is first passed through
	 * {@code cutFilterchracterKindIndexOf} with the level's entry of {@link #filterKindBy階層要素}
	 * (presumably dropping the previous level's suffix and delimiters — confirm against
	 * {@code StringAndCharacterKinds}) and then joined with the matched part.
	 *
	 * @param triplet the matched triplet
	 * @param _階層要素 the level to create the element for
	 * @return the created element
	 * @throws IllegalArgumentException for any level other than 町域Top1 to 町域Top4
	 */
	AddressElement create(TripletAddressToken triplet ,階層要素 _階層要素) {
		switch (_階層要素) {
			
		case 町域Top1:
			return AddressElementFactory.of(
					triplet.joinPredecessorAndMatched(),
					_階層要素 
			); 
			
		case 町域Top2:
		case 町域Top3:
		case 町域Top4:
			StringAndCharacterKinds predecessorKinds = 
				StringAndCharacterKinds.of(triplet.predecessor().asString() , false);
			
			StringAndCharacterKinds filteredPredecessor = 
				predecessorKinds.cutFilterchracterKindIndexOf(filterKindBy階層要素.get(_階層要素));
			
			StringAndCharacterKinds joined = 
				filteredPredecessor.join(triplet.matched().stringAndCharacterKinds());
			
			return AddressElementFactory.of(
				joined,
				_階層要素,
				triplet.predecessor().separatorKindOfLeading(),
				triplet.matched().separatorKindOfTailing()
			); 
			
		default:
			throw new IllegalArgumentException("Unexpected value: " + _階層要素);
		}
	}
	
	/** Character kinds handed to {@code cutFilterchracterKindIndexOf} for each level in {@link #create(TripletAddressToken, 階層要素)}. */
	static Map<階層要素, CharacterKind[]> filterKindBy階層要素 = Map.of(
			
		階層要素.町域Top2,createWithDefaultDelimitor(CharacterKind.suffix丁目),
		階層要素.町域Top3,createWithDefaultDelimitor(CharacterKind.suffix地番),
		階層要素.町域Top4,createWithDefaultDelimitor(CharacterKind.suffix号)
	);
	
	/** Suffix strings per level. Not referenced inside this class. */
	static Map<階層要素, List<String>> suffixBy階層要素 = Map.of(
		階層要素.町域Top2,CharacterKind.suffix丁目.strings(),
		階層要素.町域Top3,CharacterKind.suffix地番.strings(),
		階層要素.町域Top4,CharacterKind.suffix号.strings(),
		階層要素.建物Bottom3, CharacterKind.suffix号室.strings(),
		階層要素.建物Bottom2,CharacterKind.suffix階.strings(),
		階層要素.建物Bottom1,CharacterKind.suffix棟.strings()
	);

	/**
	 * Adds the resolver result mapped to the given level, if the map has one.
	 *
	 * @param _階層要素 level to look up
	 * @param resultBy階層要素 lookup table from level to result kind
	 * @param parsingTarget target that receives the result; untouched when the level is unmapped
	 */
	void addResolverResult(
			階層要素 _階層要素,
			Map<階層要素 ,ResolverResultKindOfBoolean> resultBy階層要素 , 
			ParsingTarget parsingTarget){
		
		ResolverResultKindOfBoolean kind = resultBy階層要素.get(_階層要素);
		if(kind == null) {
			return;
		}
		parsingTarget.addResolverResult(new ResolverResult(kind));
	}
	
	/** Result kind recorded for every level that is visited by {@link #process(ParsingTarget)}. */
	static Map<階層要素 ,ResolverResultKindOfBoolean> expectsResolverResultBy階層要素 = 
		Map.of(
			階層要素.町域Top1 , expects丁目, 
			階層要素.町域Top2 , expects地番,
			階層要素.町域Top3 , expects支号,
			階層要素.町域Top4 , expects枝番号
	    );
	
	/** Result kind recorded for every level that actually matched in {@link #process(ParsingTarget)}. */
	static Map<階層要素 ,ResolverResultKindOfBoolean> actualResolverResultBy階層要素 = 
		Map.of(
			階層要素.町域Top1 , actual丁目, 
			階層要素.町域Top2 , actual地番,
			階層要素.町域Top3 , actual支号,
			階層要素.町域Top4 , actual枝番号
	    );
	
	/**
	 * Builds a six-element filter array: the given kind first, followed by the comma, space,
	 * hyphen, Japanese and slash delimiter kinds, in that order.
	 *
	 * @param mainCharacterKind kind placed at index 0
	 * @return a new array on every call
	 */
	static CharacterKind[] createWithDefaultDelimitor(CharacterKind mainCharacterKind) {
		
		return new CharacterKind[] {
			mainCharacterKind, 
			CharacterKind.delimitorComma , 
			CharacterKind.delimitorSpace, 
			CharacterKind.delimitorHyphen , 
			CharacterKind.delimitorJapanese , 
			CharacterKind.delimitorSlash
		};
	}
	
	/** Immutable holder for a suffix string, the level it belongs to, and its index. Not referenced by the methods of this class yet. */
	static class MatchedSuffix{
		public final 階層要素 _階層要素;
		public final String suffix;
		public final StringIndex index;
		
		public MatchedSuffix(階層要素 _階層要素, String suffix, StringIndex index) {
			this._階層要素 = _階層要素;
			this.suffix = suffix;
			this.index = index;
		}
	}

}




// © 2015 - 2025 Weber Informatics LLC | Privacy Policy