net.amygdalum.stringsearchalgorithms.patternsearch.chars.GlushkovAnalyzer Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of stringsearchalgorithms Show documentation
Show all versions of stringsearchalgorithms Show documentation
Searching and Matching Strings with efficient algorithms:
- Knuth-Morris-Pratt
- Shift-And/Or
- Boyer-Moore-Horspool
- Sunday (QuickSearch)
- BNDM
- BOM
- Aho-Corasick
- Set-Horspool
- Wu-Manber
- Set-BOM
package net.amygdalum.stringsearchalgorithms.patternsearch.chars;
import static java.util.Arrays.asList;
import static net.amygdalum.stringsearchalgorithms.patternsearch.chars.GlushkovAnalyzerOption.FACTORS;
import static net.amygdalum.stringsearchalgorithms.patternsearch.chars.GlushkovAnalyzerOption.SELF_LOOP;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Queue;
import java.util.Set;
import net.amygdalum.regexparser.AlternativesNode;
import net.amygdalum.regexparser.AnyCharNode;
import net.amygdalum.regexparser.BoundedLoopNode;
import net.amygdalum.regexparser.CharClassNode;
import net.amygdalum.regexparser.CompClassNode;
import net.amygdalum.regexparser.ConcatNode;
import net.amygdalum.regexparser.DefinedCharNode;
import net.amygdalum.regexparser.EmptyNode;
import net.amygdalum.regexparser.GroupNode;
import net.amygdalum.regexparser.OptionalNode;
import net.amygdalum.regexparser.RangeCharNode;
import net.amygdalum.regexparser.RegexNode;
import net.amygdalum.regexparser.RegexNodeVisitor;
import net.amygdalum.regexparser.SingleCharNode;
import net.amygdalum.regexparser.SpecialCharClassNode;
import net.amygdalum.regexparser.StringNode;
import net.amygdalum.regexparser.UnboundedLoopNode;
import net.amygdalum.util.bits.BitSet;
import net.amygdalum.util.io.BitMaskCharClassMapper;
import net.amygdalum.util.io.CharClassMapper;
import net.amygdalum.util.io.LowByteCharClassMapper;
import net.amygdalum.util.io.SmallRangeCharClassMapper;
import net.amygdalum.util.map.BitSetObjectMap;
import net.amygdalum.util.map.CharObjectMap;
import net.amygdalum.util.text.CharRange;
import net.amygdalum.util.text.CharRangeAccumulator;
import net.amygdalum.util.worklist.WorkSet;
public class GlushkovAnalyzer implements RegexNodeVisitor {
private RegexNode root;
private List charCollector;
private Map> first;
private Map> last;
private Map> follow;
private Map> precede;
private Map minLength;
private DefinedCharNode[] chars;
private int len;
private CharClassMapper mapper;
private char[] alphabet;
public GlushkovAnalyzer(RegexNode root) {
this.root = root;
this.first = new LinkedHashMap<>();
this.last = new LinkedHashMap<>();
this.follow = new LinkedHashMap<>();
this.precede = new LinkedHashMap<>();
this.minLength = new LinkedHashMap<>();
this.charCollector = new ArrayList<>();
this.charCollector.add(null);
}
public CharClassMapper mapper() {
return mapper;
}
private DefinedCharNode[] characters() {
return charCollector.toArray(new DefinedCharNode[0]);
}
public Set firstChars() {
Set firstChars = new LinkedHashSet<>();
for (int index : first(root)) {
for (char c : chars[index].chars()) {
firstChars.add(c);
}
}
return firstChars;
}
public Set lastChars() {
Set lastChars = new LinkedHashSet<>();
for (int index : last(root)) {
for (char c : chars[index].chars()) {
lastChars.add(c);
}
}
return lastChars;
}
private void first(RegexNode node, Integer... value) {
first(node, new LinkedHashSet<>(asList(value)));
}
private void first(RegexNode node, Set value) {
first.put(node, value);
}
private Set first(RegexNode node) {
return first.get(node);
}
private List> first(List nodes) {
List> result = new ArrayList<>(nodes.size());
for (RegexNode node : nodes) {
result.add(first(node));
}
return result;
}
private void last(RegexNode node, Integer... value) {
last(node, new LinkedHashSet<>(asList(value)));
}
private void last(RegexNode node, Set value) {
last.put(node, value);
}
private Set last(RegexNode node) {
return last.get(node);
}
private List> last(List nodes) {
List> result = new ArrayList<>(nodes.size());
for (RegexNode node : nodes) {
result.add(last(node));
}
return result;
}
private void appendFollow(int key, Collection append) {
Set followSet = follow.get(key);
if (followSet == null) {
followSet = new LinkedHashSet();
follow.put(key, followSet);
}
followSet.addAll(append);
}
private Set follow(Integer i) {
Set set = follow.get(i);
if (set == null) {
return Collections.emptySet();
} else {
return set;
}
}
private void appendPrecede(int key, Collection append) {
Set precedeSet = precede.get(key);
if (precedeSet == null) {
precedeSet = new LinkedHashSet();
precede.put(key, precedeSet);
}
precedeSet.addAll(append);
}
private void minLength(RegexNode node, Integer value) {
minLength.put(node, value);
}
private List minLength(List nodes) {
List result = new ArrayList<>(nodes.size());
for (RegexNode node : nodes) {
result.add(minLength(node));
}
return result;
}
private Integer minLength(RegexNode node) {
return minLength.get(node);
}
public GlushkovAnalyzer analyze() {
root.accept(this);
appendFollow(0, first(root));
for (int f : first(root)) {
appendPrecede(f, asList(0));
}
chars = characters();
len = chars.length;
mapper = computeMapper(chars);
alphabet = mapper.representatives();
return this;
}
private CharClassMapper computeMapper(DefinedCharNode[] nodes) {
CharRangeAccumulator acc = new CharRangeAccumulator();
for (DefinedCharNode node : nodes) {
if (node != null) {
acc.split(node.getFrom(), node.getTo());
}
}
List liveRanges = acc.getRanges();
boolean lowByte = computeLowByte(liveRanges);
boolean smallRange = computeSmallRange(liveRanges, lowByte);
if (smallRange) {
return new SmallRangeCharClassMapper(liveRanges);
} else if (lowByte) {
return new LowByteCharClassMapper(liveRanges);
} else {
return new BitMaskCharClassMapper(liveRanges);
}
}
public boolean computeLowByte(List liveRanges) {
Set highbytes = new HashSet<>();
for (CharRange range : liveRanges) {
highbytes.add(range.from & 0xff00);
highbytes.add(range.to & 0xff00);
}
return highbytes.size() <= 1;
}
public boolean computeSmallRange(List liveRanges, boolean lowByte) {
if (liveRanges.isEmpty()) {
return true;
} else {
char min = liveRanges.get(0).from;
char max = liveRanges.get(liveRanges.size() - 1).to;
if (lowByte) {
return max - min <= 64;
} else {
return max - min <= 256;
}
}
}
public GlushkovAutomaton buildAutomaton(GlushkovAnalyzerOption... options) {
BitSet initial = FACTORS.in(options) ? all() : initial();
BitSet finals = finals();
CharObjectMap reachableByChar = reachableByChar(options);
BitSetObjectMap reachableByState = reachableByState(reachableByChar, options);
return new GlushkovAutomaton(initial, finals, reachableByChar, reachableByState);
}
public DualGlushkovAutomaton buildReverseAutomaton(GlushkovAnalyzerOption... options) {
BitSet initial = FACTORS.in(options) ? all() : finals();
BitSet finals = initial();
CharObjectMap reachableByChar = reachableByChar(options);
BitSetObjectMap reachableByState = sourceableByState(reachableByChar, options);
return new DualGlushkovAutomaton(initial, finals, reachableByChar, reachableByState);
}
public int minLength() {
return minLength(root);
}
private BitSet initial() {
return BitSet.bits(len, 0);
}
private CharObjectMap reachableByChar(GlushkovAnalyzerOption... options) {
BitSet defaultValue = SELF_LOOP.in(options) ? initial() : BitSet.empty(len);
CharObjectMap reachable = new CharObjectMap(defaultValue);
for (int i = 1; i < len; i++) {
for (char c : chars[i].chars()) {
BitSet b = reachable.get(c);
if (b == defaultValue) {
b = defaultValue.clone();
reachable.put(c, b);
}
b.set(i);
}
}
return reachable;
}
private BitSetObjectMap reachableByState(CharObjectMap reachableByChar, GlushkovAnalyzerOption... options) {
BitSet defaultValue = SELF_LOOP.in(options) ? initial() : BitSet.empty(len);
BitSet start = FACTORS.in(options) ? all() : initial();
return new Collector(len, alphabet, follow, reachableByChar, defaultValue)
.collect(start);
}
private BitSetObjectMap sourceableByState(CharObjectMap reachableByChar, GlushkovAnalyzerOption... options) {
BitSet defaultValue = SELF_LOOP.in(options) ? finals() : BitSet.empty(len);
BitSet start = FACTORS.in(options) ? all() : finals();
return new Collector(len, alphabet, precede, reachableByChar, defaultValue)
.collect(allFinals(start, reachableByChar, options));
}
private List allFinals(BitSet initial, CharObjectMap reachableByChar, GlushkovAnalyzerOption... options) {
BitSet start = FACTORS.in(options) ? all() : initial();
BitSet defaultValue = SELF_LOOP.in(options) ? initial() : BitSet.empty(len);
Collection possible = possibleStartsByState(start, reachableByChar, defaultValue);
return filterPossiblesStartsByChar(initial, reachableByChar, possible);
}
private Collection possibleStartsByState(BitSet next, CharObjectMap reachableByChar, BitSet defaultValue) {
Map possible = new LinkedHashMap<>();
possibleStartsByState(possible, next, reachableByChar, defaultValue);
return possible.values();
}
private void possibleStartsByState(Map possible, BitSet start, CharObjectMap reachableByChar, BitSet defaultValue) {
Queue nexts = new WorkSet<>();
nexts.add(start);
nexts.add(start.or(initial()));
while (!nexts.isEmpty()) {
BitSet next = nexts.remove();
BitSet td = possible.get(next);
if (td == null) {
td = (BitSet) defaultValue.clone();
for (int i = 0; i < len; i++) {
if (next.get(i)) {
td = td.or(bits(len, follow(i)));
}
}
possible.put(next, td);
BitSet n = (BitSet) td.clone();
for (char c : alphabet) {
BitSet cand = n.and(reachableByChar.get(c));
if (!nexts.contains(cand)) {
nexts.add(cand);
}
cand = cand.or(initial());
if (!nexts.contains(cand)) {
nexts.add(cand);
}
}
}
}
}
private List filterPossiblesStartsByChar(BitSet initial, CharObjectMap reachableByChar, Collection possible) {
Set filteredPossible = new LinkedHashSet<>();
for (BitSet value : possible) {
BitSet finalValue = (BitSet) initial.and(value);
for (char c : alphabet) {
BitSet charFilter = reachableByChar.get(c);
BitSet state = finalValue.and(charFilter);
if (!state.isEmpty()) {
filteredPossible.add(state);
}
}
}
return new ArrayList<>(filteredPossible);
}
private static BitSet bits(int len, Set ints) {
BitSet bits = BitSet.empty(len);
for (int i : ints) {
bits.set(i);
}
return bits;
}
private BitSet finals() {
BitSet finals = BitSet.empty(len);
for (int x : last(root)) {
finals.set(x);
}
if (minLength.get(root) == 0) {
finals.set(0);
}
return finals;
}
private BitSet all() {
return BitSet.all(len);
}
@Override
public Void visitAlternatives(AlternativesNode node) {
List subNodes = node.getSubNodes();
for (RegexNode subNode : subNodes) {
subNode.accept(this);
}
first(node, union(first(subNodes)));
last(node, union(last(subNodes)));
minLength(node, minimum(minLength(subNodes)));
return null;
}
@Override
public Void visitAnyChar(AnyCharNode node) {
throw new UnsupportedOperationException("decomposed normal from does not contain char classes");
}
@Override
public Void visitCharClass(CharClassNode node) {
throw new UnsupportedOperationException("decomposed normal from does not contain char classes");
}
@Override
public Void visitCompClass(CompClassNode node) {
throw new UnsupportedOperationException("decomposed normal from does not contain char classes");
}
@Override
public Void visitConcat(ConcatNode node) {
List subNodes = node.getSubNodes();
for (RegexNode subNode : subNodes) {
subNode.accept(this);
}
first(node, union(concatFirst(subNodes)));
last(node, union(concatLast(subNodes)));
minLength(node, sum(minLength(subNodes)));
for (int i = 0; i < subNodes.size() - 1; i++) {
RegexNode current = subNodes.get(i);
for (int j = i + 1; j < subNodes.size(); j++) {
RegexNode next = subNodes.get(j);
for (int x : last(current)) {
appendFollow(x, first(next));
}
if (minLength(next) > 0) {
break;
}
}
}
for (int i = subNodes.size() - 1; i >= 1; i--) {
RegexNode current = subNodes.get(i);
for (int j = i - 1; j >= 0; j--) {
RegexNode prev = subNodes.get(j);
for (int y : first(current)) {
appendPrecede(y, last(prev));
}
if (minLength(prev) > 0) {
break;
}
}
}
return null;
}
private List> concatFirst(List subNodes) {
List> result = new ArrayList<>();
int minLength = 0;
for (RegexNode subNode : subNodes) {
if (minLength > 0) {
break;
}
result.add(first(subNode));
minLength += minLength(subNode);
}
return result;
}
private List> concatLast(List subNodes) {
List reverseSubNodes = new ArrayList<>(subNodes);
Collections.reverse(reverseSubNodes);
List> result = new ArrayList<>();
int minLength = 0;
for (RegexNode subNode : reverseSubNodes) {
if (minLength > 0) {
break;
}
result.add(last(subNode));
minLength += minLength(subNode);
}
return result;
}
@Override
public Void visitEmpty(EmptyNode node) {
first(node, new HashSet());
last(node, new HashSet());
minLength(node, 0);
return null;
}
@Override
public Void visitGroup(GroupNode node) {
RegexNode subNode = node.getSubNode();
subNode.accept(this);
first(node, first(subNode));
last(node, last(subNode));
minLength(node, minLength(subNode));
return null;
}
@Override
public Void visitBoundedLoop(BoundedLoopNode node) {
throw new UnsupportedOperationException("decomposed normal from does not contain bounded loops");
}
@Override
public Void visitUnboundedLoop(UnboundedLoopNode node) {
if (node.getFrom() > 0) {
throw new UnsupportedOperationException("decomposed normal from does not contain plus loops");
}
RegexNode subNode = node.getSubNode();
subNode.accept(this);
first(node, first(subNode));
last(node, last(subNode));
minLength(node, 0);
RegexNode current = subNode;
RegexNode next = subNode;
for (int x : last(current)) {
appendFollow(x, first(next));
}
for (int y : first(next)) {
appendPrecede(y, last(current));
}
return null;
}
@Override
public Void visitOptional(OptionalNode node) {
RegexNode subNode = node.getSubNode();
subNode.accept(this);
first(node, first(subNode));
last(node, last(subNode));
minLength(node, 0);
return null;
}
@Override
public Void visitRangeChar(RangeCharNode node) {
int pos = charCollector.size();
charCollector.add(node);
first(node, pos);
last(node, pos);
minLength(node, 1);
return null;
}
@Override
public Void visitSingleChar(SingleCharNode node) {
int pos = charCollector.size();
charCollector.add(node);
first(node, pos);
last(node, pos);
minLength(node, 1);
return null;
}
@Override
public Void visitSpecialCharClass(SpecialCharClassNode node) {
throw new UnsupportedOperationException("decomposed normal from does not contain char classes");
}
@Override
public Void visitString(StringNode node) {
throw new UnsupportedOperationException("decomposed normal from does not contain strings");
}
private Set union(List> values) {
Set result = new LinkedHashSet<>();
for (Set value : values) {
result.addAll(value);
}
return result;
}
private Integer minimum(List values) {
int min = Integer.MAX_VALUE;
for (Integer value : values) {
if (value < min) {
min = value;
}
}
return min;
}
private Integer sum(List values) {
int sum = 0;
for (Integer value : values) {
sum += value;
}
return sum;
}
private static class Collector {
private int len;
private char[] alphabet;
private Map> next;
private BitSetObjectMap accumulator;
private CharObjectMap reachableByChar;
private BitSet defaultValue;
private WorkSet todo;
public Collector(int len, char[] alphabet, Map> next, CharObjectMap reachableByChar, BitSet defaultValue) {
this.len = len;
this.alphabet = alphabet;
this.next = next;
this.accumulator = new BitSetObjectMap(defaultValue);
this.reachableByChar = reachableByChar;
this.defaultValue = defaultValue;
this.todo = new WorkSet<>();
}
public BitSetObjectMap collect(BitSet... start) {
return collect(asList(start));
}
public BitSetObjectMap collect(Collection start) {
todo.addAll(start);
while (!todo.isEmpty()) {
BitSet current = todo.remove();
computeReachables(current);
}
return accumulator;
}
private void computeReachables(BitSet d) {
BitSet td = accumulator.get(d);
if (td == defaultValue) {
td = (BitSet) defaultValue.clone();
}
for (int i = 0; i < len; i++) {
if (d.get(i)) {
td = td.or(bits(len, next(i)));
}
}
accumulator.put(d, td);
BitSet n = (BitSet) td.clone();
for (char c : alphabet) {
BitSet next = n.and(reachableByChar.get(c));
if (accumulator.get(next) == defaultValue) {
todo.add(next);
}
}
}
private Set next(Integer i) {
Set set = next.get(i);
if (set == null) {
return Collections.emptySet();
} else {
return set;
}
}
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy