All downloads are free. The search and download functionality uses the official Maven repository.

net.amygdalum.stringsearchalgorithms.patternsearch.chars.GlushkovAnalyzer Maven / Gradle / Ivy

Go to download

Searching and matching strings with efficient algorithms: Knuth-Morris-Pratt, Shift-And/Or, Boyer-Moore-Horspool, Sunday (QuickSearch), BNDM, BOM, Aho-Corasick, Set-Horspool, Wu-Manber and Set-BOM.

There is a newer version: 0.4.4
Show newest version
package net.amygdalum.stringsearchalgorithms.patternsearch.chars;

import static java.util.Arrays.asList;
import static net.amygdalum.stringsearchalgorithms.patternsearch.chars.GlushkovAnalyzerOption.FACTORS;
import static net.amygdalum.stringsearchalgorithms.patternsearch.chars.GlushkovAnalyzerOption.SELF_LOOP;

import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Queue;
import java.util.Set;

import net.amygdalum.regexparser.AlternativesNode;
import net.amygdalum.regexparser.AnyCharNode;
import net.amygdalum.regexparser.BoundedLoopNode;
import net.amygdalum.regexparser.CharClassNode;
import net.amygdalum.regexparser.CompClassNode;
import net.amygdalum.regexparser.ConcatNode;
import net.amygdalum.regexparser.DefinedCharNode;
import net.amygdalum.regexparser.EmptyNode;
import net.amygdalum.regexparser.GroupNode;
import net.amygdalum.regexparser.OptionalNode;
import net.amygdalum.regexparser.RangeCharNode;
import net.amygdalum.regexparser.RegexNode;
import net.amygdalum.regexparser.RegexNodeVisitor;
import net.amygdalum.regexparser.SingleCharNode;
import net.amygdalum.regexparser.SpecialCharClassNode;
import net.amygdalum.regexparser.StringNode;
import net.amygdalum.regexparser.UnboundedLoopNode;
import net.amygdalum.util.bits.BitSet;
import net.amygdalum.util.io.BitMaskCharClassMapper;
import net.amygdalum.util.io.CharClassMapper;
import net.amygdalum.util.io.LowByteCharClassMapper;
import net.amygdalum.util.io.SmallRangeCharClassMapper;
import net.amygdalum.util.map.BitSetObjectMap;
import net.amygdalum.util.map.CharObjectMap;
import net.amygdalum.util.text.CharRange;
import net.amygdalum.util.text.CharRangeAccumulator;
import net.amygdalum.util.worklist.WorkSet;

public class GlushkovAnalyzer implements RegexNodeVisitor {

	private RegexNode root;
	private List charCollector;
	private Map> first;
	private Map> last;
	private Map> follow;
	private Map> precede;
	private Map minLength;
	private DefinedCharNode[] chars;
	private int len;
	private CharClassMapper mapper;
	private char[] alphabet;

	public GlushkovAnalyzer(RegexNode root) {
		this.root = root;
		this.first = new LinkedHashMap<>();
		this.last = new LinkedHashMap<>();
		this.follow = new LinkedHashMap<>();
		this.precede = new LinkedHashMap<>();
		this.minLength = new LinkedHashMap<>();
		this.charCollector = new ArrayList<>();
		this.charCollector.add(null);
	}

	public CharClassMapper mapper() {
		return mapper;
	}

	private DefinedCharNode[] characters() {
		return charCollector.toArray(new DefinedCharNode[0]);
	}

	public Set firstChars() {
		Set firstChars = new LinkedHashSet<>();
		for (int index : first(root)) {
			for (char c : chars[index].chars()) {
				firstChars.add(c);
			}
		}
		return firstChars;
	}

	public Set lastChars() {
		Set lastChars = new LinkedHashSet<>();
		for (int index : last(root)) {
			for (char c : chars[index].chars()) {
				lastChars.add(c);
			}
		}
		return lastChars;
	}

	private void first(RegexNode node, Integer... value) {
		first(node, new LinkedHashSet<>(asList(value)));
	}

	private void first(RegexNode node, Set value) {
		first.put(node, value);
	}

	private Set first(RegexNode node) {
		return first.get(node);
	}

	private List> first(List nodes) {
		List> result = new ArrayList<>(nodes.size());
		for (RegexNode node : nodes) {
			result.add(first(node));
		}
		return result;
	}

	private void last(RegexNode node, Integer... value) {
		last(node, new LinkedHashSet<>(asList(value)));
	}

	private void last(RegexNode node, Set value) {
		last.put(node, value);
	}

	private Set last(RegexNode node) {
		return last.get(node);
	}

	private List> last(List nodes) {
		List> result = new ArrayList<>(nodes.size());
		for (RegexNode node : nodes) {
			result.add(last(node));
		}
		return result;
	}

	private void appendFollow(int key, Collection append) {
		Set followSet = follow.get(key);
		if (followSet == null) {
			followSet = new LinkedHashSet();
			follow.put(key, followSet);
		}
		followSet.addAll(append);
	}

	private Set follow(Integer i) {
		Set set = follow.get(i);
		if (set == null) {
			return Collections.emptySet();
		} else {
			return set;
		}
	}

	private void appendPrecede(int key, Collection append) {
		Set precedeSet = precede.get(key);
		if (precedeSet == null) {
			precedeSet = new LinkedHashSet();
			precede.put(key, precedeSet);
		}
		precedeSet.addAll(append);
	}

	private void minLength(RegexNode node, Integer value) {
		minLength.put(node, value);
	}

	private List minLength(List nodes) {
		List result = new ArrayList<>(nodes.size());
		for (RegexNode node : nodes) {
			result.add(minLength(node));
		}
		return result;
	}

	private Integer minLength(RegexNode node) {
		return minLength.get(node);
	}

	public GlushkovAnalyzer analyze() {
		root.accept(this);
		appendFollow(0, first(root));
		for (int f : first(root)) {
			appendPrecede(f, asList(0));
		}
		chars = characters();
		len = chars.length;
		mapper = computeMapper(chars);
		alphabet = mapper.representatives();
		return this;
	}

	private CharClassMapper computeMapper(DefinedCharNode[] nodes) {
		CharRangeAccumulator acc = new CharRangeAccumulator();

		for (DefinedCharNode node : nodes) {
			if (node != null) {
				acc.split(node.getFrom(), node.getTo());
			}
		}

		List liveRanges = acc.getRanges();

		boolean lowByte = computeLowByte(liveRanges);
		boolean smallRange = computeSmallRange(liveRanges, lowByte);
		if (smallRange) {
			return new SmallRangeCharClassMapper(liveRanges);
		} else if (lowByte) {
			return new LowByteCharClassMapper(liveRanges);
		} else {
			return new BitMaskCharClassMapper(liveRanges);
		}
	}

	public boolean computeLowByte(List liveRanges) {
		Set highbytes = new HashSet<>();
		for (CharRange range : liveRanges) {
			highbytes.add(range.from & 0xff00);
			highbytes.add(range.to & 0xff00);
		}
		return highbytes.size() <= 1;
	}

	public boolean computeSmallRange(List liveRanges, boolean lowByte) {
		if (liveRanges.isEmpty()) {
			return true;
		} else {
			char min = liveRanges.get(0).from;
			char max = liveRanges.get(liveRanges.size() - 1).to;
			if (lowByte) {
				return max - min <= 64;
			} else {
				return max - min <= 256;
			}
		}
	}

	public GlushkovAutomaton buildAutomaton(GlushkovAnalyzerOption... options) {
		BitSet initial = FACTORS.in(options) ? all() : initial();

		BitSet finals = finals();

		CharObjectMap reachableByChar = reachableByChar(options);

		BitSetObjectMap reachableByState = reachableByState(reachableByChar, options);

		return new GlushkovAutomaton(initial, finals, reachableByChar, reachableByState);
	}

	public DualGlushkovAutomaton buildReverseAutomaton(GlushkovAnalyzerOption... options) {
		BitSet initial = FACTORS.in(options) ? all() : finals();

		BitSet finals = initial();

		CharObjectMap reachableByChar = reachableByChar(options);

		BitSetObjectMap reachableByState = sourceableByState(reachableByChar, options);

		return new DualGlushkovAutomaton(initial, finals, reachableByChar, reachableByState);
	}

	public int minLength() {
		return minLength(root);
	}

	private BitSet initial() {
		return BitSet.bits(len, 0);
	}

	private CharObjectMap reachableByChar(GlushkovAnalyzerOption... options) {
		BitSet defaultValue = SELF_LOOP.in(options) ? initial() : BitSet.empty(len);

		CharObjectMap reachable = new CharObjectMap(defaultValue);
		for (int i = 1; i < len; i++) {
			for (char c : chars[i].chars()) {
				BitSet b = reachable.get(c);
				if (b == defaultValue) {
					b = defaultValue.clone();
					reachable.put(c, b);
				}
				b.set(i);
			}
		}
		return reachable;
	}

	private BitSetObjectMap reachableByState(CharObjectMap reachableByChar, GlushkovAnalyzerOption... options) {
		BitSet defaultValue = SELF_LOOP.in(options) ? initial() : BitSet.empty(len);
		BitSet start = FACTORS.in(options) ? all() : initial();

		return new Collector(len, alphabet, follow, reachableByChar, defaultValue)
			.collect(start);
	}

	private BitSetObjectMap sourceableByState(CharObjectMap reachableByChar, GlushkovAnalyzerOption... options) {
		BitSet defaultValue = SELF_LOOP.in(options) ? finals() : BitSet.empty(len);
		BitSet start = FACTORS.in(options) ? all() : finals();

		return new Collector(len, alphabet, precede, reachableByChar, defaultValue)
			.collect(allFinals(start, reachableByChar, options));
	}

	private List allFinals(BitSet initial, CharObjectMap reachableByChar, GlushkovAnalyzerOption... options) {
		BitSet start = FACTORS.in(options) ? all() : initial();
		BitSet defaultValue = SELF_LOOP.in(options) ? initial() : BitSet.empty(len);

		Collection possible = possibleStartsByState(start, reachableByChar, defaultValue);

		return filterPossiblesStartsByChar(initial, reachableByChar, possible);
	}

	private Collection possibleStartsByState(BitSet next, CharObjectMap reachableByChar, BitSet defaultValue) {
		Map possible = new LinkedHashMap<>();
		possibleStartsByState(possible, next, reachableByChar, defaultValue);
		return possible.values();
	}

	private void possibleStartsByState(Map possible, BitSet start, CharObjectMap reachableByChar, BitSet defaultValue) {
		Queue nexts = new WorkSet<>();
		nexts.add(start);
		nexts.add(start.or(initial()));
		while (!nexts.isEmpty()) {
			BitSet next = nexts.remove();
			BitSet td = possible.get(next);
			if (td == null) {
				td = (BitSet) defaultValue.clone();
				for (int i = 0; i < len; i++) {
					if (next.get(i)) {
						td = td.or(bits(len, follow(i)));
					}
				}
				possible.put(next, td);
				BitSet n = (BitSet) td.clone();
				for (char c : alphabet) {
					BitSet cand = n.and(reachableByChar.get(c));
					if (!nexts.contains(cand)) {
						nexts.add(cand);
					}
					cand = cand.or(initial());
					if (!nexts.contains(cand)) {
						nexts.add(cand);
					}
				}
			}
		}
	}

	private List filterPossiblesStartsByChar(BitSet initial, CharObjectMap reachableByChar, Collection possible) {
		Set filteredPossible = new LinkedHashSet<>();
		for (BitSet value : possible) {
			BitSet finalValue = (BitSet) initial.and(value);
			for (char c : alphabet) {
				BitSet charFilter = reachableByChar.get(c);
				BitSet state = finalValue.and(charFilter);
				if (!state.isEmpty()) {
					filteredPossible.add(state);
				}
			}
		}
		return new ArrayList<>(filteredPossible);
	}

	private static BitSet bits(int len, Set ints) {
		BitSet bits = BitSet.empty(len);
		for (int i : ints) {
			bits.set(i);
		}
		return bits;
	}

	private BitSet finals() {
		BitSet finals = BitSet.empty(len);
		for (int x : last(root)) {
			finals.set(x);
		}
		if (minLength.get(root) == 0) {
			finals.set(0);
		}
		return finals;
	}

	private BitSet all() {
		return BitSet.all(len);
	}

	@Override
	public Void visitAlternatives(AlternativesNode node) {
		List subNodes = node.getSubNodes();
		for (RegexNode subNode : subNodes) {
			subNode.accept(this);
		}

		first(node, union(first(subNodes)));

		last(node, union(last(subNodes)));

		minLength(node, minimum(minLength(subNodes)));

		return null;
	}

	@Override
	public Void visitAnyChar(AnyCharNode node) {
		throw new UnsupportedOperationException("decomposed normal from does not contain char classes");
	}

	@Override
	public Void visitCharClass(CharClassNode node) {
		throw new UnsupportedOperationException("decomposed normal from does not contain char classes");
	}

	@Override
	public Void visitCompClass(CompClassNode node) {
		throw new UnsupportedOperationException("decomposed normal from does not contain char classes");
	}

	@Override
	public Void visitConcat(ConcatNode node) {
		List subNodes = node.getSubNodes();
		for (RegexNode subNode : subNodes) {
			subNode.accept(this);
		}

		first(node, union(concatFirst(subNodes)));

		last(node, union(concatLast(subNodes)));

		minLength(node, sum(minLength(subNodes)));

		for (int i = 0; i < subNodes.size() - 1; i++) {
			RegexNode current = subNodes.get(i);
			for (int j = i + 1; j < subNodes.size(); j++) {
				RegexNode next = subNodes.get(j);
				for (int x : last(current)) {
					appendFollow(x, first(next));
				}
				if (minLength(next) > 0) {
					break;
				}
			}
		}
		for (int i = subNodes.size() - 1; i >= 1; i--) {
			RegexNode current = subNodes.get(i);
			for (int j = i - 1; j >= 0; j--) {
				RegexNode prev = subNodes.get(j);
				for (int y : first(current)) {
					appendPrecede(y, last(prev));
				}
				if (minLength(prev) > 0) {
					break;
				}
			}
		}
		return null;
	}

	private List> concatFirst(List subNodes) {
		List> result = new ArrayList<>();
		int minLength = 0;
		for (RegexNode subNode : subNodes) {
			if (minLength > 0) {
				break;
			}
			result.add(first(subNode));
			minLength += minLength(subNode);
		}
		return result;
	}

	private List> concatLast(List subNodes) {
		List reverseSubNodes = new ArrayList<>(subNodes);
		Collections.reverse(reverseSubNodes);
		List> result = new ArrayList<>();
		int minLength = 0;
		for (RegexNode subNode : reverseSubNodes) {
			if (minLength > 0) {
				break;
			}
			result.add(last(subNode));
			minLength += minLength(subNode);
		}
		return result;
	}

	@Override
	public Void visitEmpty(EmptyNode node) {
		first(node, new HashSet());

		last(node, new HashSet());

		minLength(node, 0);

		return null;
	}

	@Override
	public Void visitGroup(GroupNode node) {
		RegexNode subNode = node.getSubNode();
		subNode.accept(this);

		first(node, first(subNode));

		last(node, last(subNode));

		minLength(node, minLength(subNode));

		return null;
	}

	@Override
	public Void visitBoundedLoop(BoundedLoopNode node) {
		throw new UnsupportedOperationException("decomposed normal from does not contain bounded loops");
	}

	@Override
	public Void visitUnboundedLoop(UnboundedLoopNode node) {
		if (node.getFrom() > 0) {
			throw new UnsupportedOperationException("decomposed normal from does not contain plus loops");
		}
		RegexNode subNode = node.getSubNode();
		subNode.accept(this);

		first(node, first(subNode));

		last(node, last(subNode));

		minLength(node, 0);

		RegexNode current = subNode;
		RegexNode next = subNode;
		for (int x : last(current)) {
			appendFollow(x, first(next));
		}
		for (int y : first(next)) {
			appendPrecede(y, last(current));
		}
		return null;
	}

	@Override
	public Void visitOptional(OptionalNode node) {
		RegexNode subNode = node.getSubNode();
		subNode.accept(this);

		first(node, first(subNode));

		last(node, last(subNode));

		minLength(node, 0);

		return null;
	}

	@Override
	public Void visitRangeChar(RangeCharNode node) {
		int pos = charCollector.size();
		charCollector.add(node);

		first(node, pos);

		last(node, pos);

		minLength(node, 1);

		return null;
	}

	@Override
	public Void visitSingleChar(SingleCharNode node) {
		int pos = charCollector.size();
		charCollector.add(node);

		first(node, pos);

		last(node, pos);

		minLength(node, 1);

		return null;
	}

	@Override
	public Void visitSpecialCharClass(SpecialCharClassNode node) {
		throw new UnsupportedOperationException("decomposed normal from does not contain char classes");
	}

	@Override
	public Void visitString(StringNode node) {
		throw new UnsupportedOperationException("decomposed normal from does not contain strings");
	}

	private Set union(List> values) {
		Set result = new LinkedHashSet<>();
		for (Set value : values) {
			result.addAll(value);
		}
		return result;
	}

	private Integer minimum(List values) {
		int min = Integer.MAX_VALUE;
		for (Integer value : values) {
			if (value < min) {
				min = value;
			}
		}
		return min;
	}

	private Integer sum(List values) {
		int sum = 0;
		for (Integer value : values) {
			sum += value;
		}
		return sum;
	}

	private static class Collector {

		private int len;
		private char[] alphabet;
		private Map> next;
		private BitSetObjectMap accumulator;
		private CharObjectMap reachableByChar;
		private BitSet defaultValue;
		private WorkSet todo;

		public Collector(int len, char[] alphabet, Map> next, CharObjectMap reachableByChar, BitSet defaultValue) {
			this.len = len;
			this.alphabet = alphabet;
			this.next = next;
			this.accumulator = new BitSetObjectMap(defaultValue);
			this.reachableByChar = reachableByChar;
			this.defaultValue = defaultValue;
			this.todo = new WorkSet<>();
		}

		public BitSetObjectMap collect(BitSet... start) {
			return collect(asList(start));
		}

		public BitSetObjectMap collect(Collection start) {
			todo.addAll(start);

			while (!todo.isEmpty()) {
				BitSet current = todo.remove();
				computeReachables(current);
			}

			return accumulator;
		}

		private void computeReachables(BitSet d) {
			BitSet td = accumulator.get(d);
			if (td == defaultValue) {
				td = (BitSet) defaultValue.clone();
			}
			for (int i = 0; i < len; i++) {
				if (d.get(i)) {
					td = td.or(bits(len, next(i)));
				}
			}
			accumulator.put(d, td);

			BitSet n = (BitSet) td.clone();
			for (char c : alphabet) {
				BitSet next = n.and(reachableByChar.get(c));
				if (accumulator.get(next) == defaultValue) {
					todo.add(next);
				}
			}

		}

		private Set next(Integer i) {
			Set set = next.get(i);
			if (set == null) {
				return Collections.emptySet();
			} else {
				return set;
			}
		}

	}

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy