All downloads are free. Search and download functionality uses the official Maven repository.

net.amygdalum.stringsearchalgorithms.patternsearch.chars.GlushkovFactorExtender Maven / Gradle / Ivy

Go to download

Searching and Matching Strings with efficient algorithms: - Knuth-Morris-Pratt - Shift-And/Or - Boyer-Moore-Horspool - Sunday (QuickSearch) - BNDM - BOM - Aho-Corasick - Set-Horspool - Wu-Manber - Set-BOM

There is a newer version: 0.4.4
Show newest version
package net.amygdalum.stringsearchalgorithms.patternsearch.chars;

import static net.amygdalum.stringsearchalgorithms.patternsearch.chars.GlushkovAnalyzerOption.FACTORS;

import java.util.ArrayList;
import java.util.LinkedHashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;
import java.util.SortedSet;

import net.amygdalum.regexparser.RegexNode;
import net.amygdalum.regexparser.RegexParser;
import net.amygdalum.regexparser.RegexParserOption;
import net.amygdalum.stringsearchalgorithms.search.StringMatch;
import net.amygdalum.util.bits.BitSet;
import net.amygdalum.util.io.CharProvider;
import net.amygdalum.util.io.ReverseCharProvider;
import net.amygdalum.util.io.StringCharProvider;

/**
 * {@link FactorExtender} backed by two Glushkov automata built from one regex pattern.
 * <ul>
 * <li>{@code factors}: the automaton obtained from {@code buildReverseAutomaton(FACTORS)}. It is
 * fed characters in reverse order, first over a factor string and then over the text preceding
 * the factor, to collect candidate match start positions.</li>
 * <li>{@code automaton}: the automaton obtained from {@code buildAutomaton()}. It is run forward
 * from every candidate start to report matches.</li>
 * </ul>
 * Instances created by the public constructor are not bound to a factor ({@code factorInitial}
 * is {@code null}); {@link #forFactor(String)} returns a bound copy. Calling
 * {@link #extendFactor(CharProvider, boolean)} on an unbound instance dereferences that
 * {@code null} state.
 */
public class GlushkovFactorExtender implements FactorExtender {

	private final String pattern;
	private final Set<String> bestFactors;
	private final DualGlushkovAutomaton factors;
	private final GlushkovAutomaton automaton;
	private final int minLength;

	/** Length of the bound factor; 0 for instances not created via {@link #forFactor(String)}. */
	private final int factorLength;
	/** State of {@code factors} after consuming the bound factor backwards; {@code null} if unbound. */
	private final BitSet factorInitial;

	/**
	 * Parses and normalizes the pattern, then derives best factors, both automata and the
	 * minimum length from it.
	 *
	 * @param pattern the regex pattern
	 * @param options options passed through to the {@link RegexParser}
	 */
	public GlushkovFactorExtender(String pattern, RegexParserOption... options) {
		RegexNode root = parseAndNormalizeRegex(pattern, options);
		BestFactorAnalyzer bestFactorAnalyzer = new BestFactorAnalyzer(root).analyze();
		GlushkovAnalyzer analyzer = new GlushkovAnalyzer(root).analyze();
		this.pattern = pattern;
		this.bestFactors = bestFactorAnalyzer.getBestFactors(asStrings(analyzer.firstChars()), asStrings(analyzer.lastChars()));
		this.factors = analyzer.buildReverseAutomaton(FACTORS);
		this.automaton = analyzer.buildAutomaton();
		this.minLength = analyzer.minLength();
		this.factorLength = 0;
		this.factorInitial = null;
	}

	/**
	 * Copy constructor used by {@link #forFactor(String)}; shares the already built automata
	 * and best factors and adds the factor binding.
	 */
	private GlushkovFactorExtender(String pattern, Set<String> bestFactors, DualGlushkovAutomaton factors, GlushkovAutomaton automaton, int minLength, int factorLength, BitSet factorInitial) {
		this.pattern = pattern;
		// carried over so that getBestFactors also works on factor-bound instances
		this.bestFactors = bestFactors;
		this.factors = factors;
		this.automaton = automaton;
		this.minLength = minLength;
		this.factorLength = factorLength;
		this.factorInitial = factorInitial;
	}

	/** Parses the pattern and returns the tree produced by applying a {@link GlushkovNormalizer}. */
	private static RegexNode parseAndNormalizeRegex(String pattern, RegexParserOption... options) {
		RegexParser parser = new RegexParser(pattern, options);
		RegexNode root = parser.parse();
		return root.accept(new GlushkovNormalizer());
	}

	/**
	 * Creates an extender bound to the given factor. The factor is consumed backwards by the
	 * {@code factors} automaton and the resulting state, together with the factor length, is
	 * stored in the returned instance.
	 *
	 * @param factor the factor to bind
	 * @return a new extender sharing this instance's pattern, automata and best factors
	 */
	public GlushkovFactorExtender forFactor(String factor) {
		BitSet stateAfterFactor = backTrack(factors.getInitial(), factor);
		return new GlushkovFactorExtender(pattern, bestFactors, factors, automaton, minLength, factor.length(), stateAfterFactor);
	}

	/** Converts each character to a one-character string, keeping iteration order. */
	private static Set<String> asStrings(Set<Character> chars) {
		Set<String> strings = new LinkedHashSet<>();
		for (Character c : chars) {
			strings.add(c.toString());
		}
		return strings;
	}

	@Override
	public String getPattern() {
		return pattern;
	}

	/**
	 * @return the minimum length reported by the {@link GlushkovAnalyzer} for the pattern
	 */
	@Override
	public int getPatternLength() {
		return minLength;
	}

	/**
	 * Returns the best factors, each truncated to at most {@code max} leading characters.
	 * Factors that become equal after truncation are reported once.
	 *
	 * @param max maximum length of a returned factor
	 * @return the (possibly truncated) best factors in their original order, without duplicates
	 */
	@Override
	public List<String> getBestFactors(int max) {
		Set<String> bestFactorsMax = new LinkedHashSet<>();
		for (String factor : bestFactors) {
			if (factor.length() <= max) {
				bestFactorsMax.add(factor);
			} else {
				bestFactorsMax.add(factor.substring(0, max));
			}
		}
		return new ArrayList<>(bestFactorsMax);
	}

	/**
	 * @param factor the candidate factor
	 * @return {@code true} if the {@code factors} automaton still has a non-empty state after
	 *         consuming the factor backwards
	 */
	@Override
	public boolean hasFactor(String factor) {
		BitSet stateAfterFactor = backTrack(factors.getInitial(), factor);
		return !stateAfterFactor.isEmpty();
	}

	/**
	 * Collects the matches around the bound factor: candidate starts are searched backwards
	 * from {@code chars.current() - factorLength}, then the forward automaton is run from each
	 * of them. The provider is moved back to its initial position before returning.
	 * <p>
	 * NOTE(review): presumably the caller positions {@code chars} directly behind an occurrence
	 * of the bound factor - confirm against the callers. Requires an instance created via
	 * {@link #forFactor(String)}.
	 *
	 * @param chars the text to search in
	 * @param longest passed to the {@link MatchBuilder} that collects the matches
	 * @return the matches collected by the {@link MatchBuilder}
	 */
	@Override
	public SortedSet<StringMatch> extendFactor(CharProvider chars, boolean longest) {
		long pos = chars.current();
		List<Long> starts = findStarts(chars);
		MatchBuilder listener = new MatchBuilder(longest);
		match(starts, chars, listener);
		chars.move(pos);
		return listener.getMatches();
	}

	/**
	 * Feeds the characters delivered by a {@link ReverseCharProvider} over the factor into the
	 * {@code factors} automaton, starting from {@code state}. Stops early once the state is empty.
	 *
	 * @return the state reached
	 */
	private BitSet backTrack(BitSet state, String factor) {
		CharProvider reverse = new ReverseCharProvider(new StringCharProvider(factor, factor.length()));
		while (!reverse.finished() && !state.isEmpty()) {
			char c = reverse.next();
			state = factors.next(state, c);
		}
		return state;
	}

	/**
	 * Moves {@code chars} to {@code current() - factorLength} and continues the {@code factors}
	 * automaton from {@code factorInitial} through a {@link ReverseCharProvider} over
	 * {@code chars}. Every position at which the automaton is in a final state is recorded.
	 *
	 * @return the recorded positions; later findings are inserted in front of earlier ones
	 */
	private List<Long> findStarts(CharProvider chars) {
		long factorStart = chars.current() - factorLength;
		chars.move(factorStart);
		LinkedList<Long> starts = new LinkedList<>();
		BitSet state = factorInitial;
		CharProvider reverse = new ReverseCharProvider(chars);
		while (!reverse.finished() && !state.isEmpty()) {
			if (factors.isFinal(state)) {
				long position = chars.current();
				starts.addFirst(position);
			}
			char c = reverse.next();
			state = factors.next(state, c);
		}
		// the loop checks finality only before consuming a char, so the state reached
		// when the reverse provider is exhausted has to be checked separately
		if (reverse.finished() && factors.isFinal(state)) {
			long position = chars.current();
			starts.addFirst(position);
		}
		return starts;
	}

	/**
	 * Runs the forward automaton from every start position and notifies the listeners about
	 * each position at which the automaton is in a final state.
	 *
	 * @param starts positions to start matching at
	 * @param chars the text; it is moved to each start and left wherever the last run ended
	 * @param listeners receivers of (start, end) notifications; nothing is reported if absent
	 */
	private void match(List<Long> starts, CharProvider chars, MatchListener... listeners) {
		boolean notify = listeners != null && listeners.length > 0;
		for (long start : starts) {
			chars.move(start);
			BitSet state = automaton.getInitial();
			while (!chars.finished() && !state.isEmpty()) {
				if (notify && automaton.isFinal(state)) {
					notifyListeners(start, chars, listeners);
				}
				char c = chars.next();
				state = automaton.next(state, c);
			}
			// final state reached exactly when the input is exhausted
			if (notify && chars.finished() && automaton.isFinal(state)) {
				notifyListeners(start, chars, listeners);
			}
		}
	}

	/** Reports the span from {@code start} to the current position of {@code chars} to each listener. */
	private static void notifyListeners(long start, CharProvider chars, MatchListener[] listeners) {
		long end = chars.current();
		for (MatchListener listener : listeners) {
			listener.notify(start, end, chars);
		}
	}

	@Override
	public String toString() {
		return getClass().getSimpleName();
	}

	/**
	 * {@link FactorExtenderFactory} creating {@link GlushkovFactorExtender} instances that all
	 * use the parser options given at construction.
	 */
	public static class Factory implements FactorExtenderFactory {

		private final RegexParserOption[] options;

		/**
		 * @param options parser options applied to every pattern passed to {@link #of(String)}
		 */
		public Factory(RegexParserOption... options) {
			this.options = options;
		}

		@Override
		public FactorExtender of(String pattern) {
			return new GlushkovFactorExtender(pattern, options);
		}

	}
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy