it.unive.lisa.analysis.string.tarsis.RegexAutomaton Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of lisa-analyses Show documentation
Show all versions of lisa-analyses Show documentation
A library for static analysis
The newest version!
package it.unive.lisa.analysis.string.tarsis;
import it.unive.lisa.util.datastructures.automaton.Automaton;
import it.unive.lisa.util.datastructures.automaton.CyclicAutomatonException;
import it.unive.lisa.util.datastructures.automaton.State;
import it.unive.lisa.util.datastructures.automaton.Transition;
import it.unive.lisa.util.datastructures.regex.Atom;
import it.unive.lisa.util.datastructures.regex.RegularExpression;
import it.unive.lisa.util.datastructures.regex.TopAtom;
import it.unive.lisa.util.datastructures.regex.symbolic.SymbolicChar;
import it.unive.lisa.util.datastructures.regex.symbolic.SymbolicString;
import it.unive.lisa.util.datastructures.regex.symbolic.UnknownSymbolicChar;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.SortedSet;
import java.util.TreeSet;
import java.util.Vector;
import org.apache.commons.lang3.tuple.Pair;
/**
* A class that describes a generic automaton (dfa, nfa, epsilon nfa) using an
* alphabet of strings, extended with a special symbol for statically unknown
* ones. Transition symbols are {@link RegularExpression}s.
*
* @author Luca Negrini
*/
public class RegexAutomaton extends Automaton {
/**
 * Builds a {@link RegexAutomaton} recognizing the top string, that is, with
 * a single transition recognizing {@link TopAtom}.
 *
 * The automaton has two states, {@code q0} (initial and final) and
 * {@code q1} (final), connected by one {@link TopAtom} transition. The
 * result is flagged as both deterministic and minimized.
 *
 * @return the automaton
 */
public static RegexAutomaton topString() {
    // State arguments are (id, isInitial, isFinal)
    State q0 = new State(0, true, true);
    State q1 = new State(1, false, true);

    SortedSet<State> states = new TreeSet<>();
    states.add(q0);
    states.add(q1);

    SortedSet<Transition<RegularExpression>> delta = new TreeSet<>();
    delta.add(new Transition<>(q0, q1, TopAtom.INSTANCE));

    RegexAutomaton result = new RegexAutomaton(states, delta);
    // cache the structural properties so they are not recomputed later
    result.deterministic = Optional.of(true);
    result.minimized = Optional.of(true);
    return result;
}
/**
 * Builds a {@link RegexAutomaton} recognizing the empty language.
 *
 * The automaton is made of a single initial, non-final state and has no
 * transitions. The result is flagged as both deterministic and minimized.
 *
 * @return the automaton
 */
public static RegexAutomaton emptyLang() {
    SortedSet<State> newStates = new TreeSet<>();
    // initial but not final: no string can be accepted
    State initialState = new State(0, true, false);
    newStates.add(initialState);

    RegexAutomaton result = new RegexAutomaton(newStates, Collections.emptySortedSet());
    result.deterministic = Optional.of(true);
    result.minimized = Optional.of(true);
    return result;
}
/**
 * Builds a {@link RegexAutomaton} recognizing the given string.
 *
 * The automaton has an initial state and a final state connected by a
 * single transition whose symbol is an {@link Atom} wrapping the whole
 * string. The result is flagged as both deterministic and minimized.
 *
 * @param string the string to recognize
 *
 * @return the automaton
 */
public static RegexAutomaton string(
        String string) {
    State q0 = new State(0, true, false);
    State q1 = new State(1, false, true);

    SortedSet<State> states = new TreeSet<>();
    states.add(q0);
    states.add(q1);

    SortedSet<Transition<RegularExpression>> delta = new TreeSet<>();
    delta.add(new Transition<>(q0, q1, new Atom(string)));

    RegexAutomaton result = new RegexAutomaton(states, delta);
    result.deterministic = Optional.of(true);
    result.minimized = Optional.of(true);
    return result;
}
/**
 * Builds a {@link RegexAutomaton} recognizing the given symbolic string.
 *
 * The characters yielded by {@code s.collapseTopChars()} are scanned in
 * order: each maximal run of known characters becomes the automaton built
 * by {@link #string(String)}, while each {@link UnknownSymbolicChar}
 * becomes the automaton built by {@link #topString()}. The pieces are then
 * concatenated in order. If no piece is produced, the automaton built by
 * {@link #emptyStr()} is returned.
 *
 * @param s the string to recognize
 *
 * @return the automaton
 */
public static RegexAutomaton string(
        SymbolicString s) {
    List<RegexAutomaton> pieces = new ArrayList<>();
    StringBuilder collector = new StringBuilder();

    for (SymbolicChar ch : s.collapseTopChars()) {
        if (ch instanceof UnknownSymbolicChar) {
            // flush the known characters collected so far, then add top
            if (collector.length() > 0)
                pieces.add(string(collector.toString()));
            collector.setLength(0);
            pieces.add(topString());
        } else
            collector.append(ch.asChar());
    }

    // trailing run of known characters
    if (collector.length() > 0)
        pieces.add(string(collector.toString()));

    if (pieces.isEmpty())
        return emptyStr();

    RegexAutomaton r = pieces.get(0);
    for (int i = 1; i < pieces.size(); i++)
        r = r.concat(pieces.get(i));
    return r;
}
/**
 * Builds a {@link RegexAutomaton} recognizing the empty string.
 *
 * The automaton has an initial state and a final state connected by a
 * single {@link Atom#EPSILON} transition. The result is flagged as both
 * deterministic and minimized.
 *
 * @return the automaton
 */
public static RegexAutomaton emptyStr() {
    State q0 = new State(0, true, false);
    State q1 = new State(1, false, true);

    SortedSet<State> states = new TreeSet<>();
    states.add(q0);
    states.add(q1);

    SortedSet<Transition<RegularExpression>> delta = new TreeSet<>();
    delta.add(new Transition<>(q0, q1, Atom.EPSILON));

    RegexAutomaton result = new RegexAutomaton(states, delta);
    result.deterministic = Optional.of(true);
    result.minimized = Optional.of(true);
    return result;
}
/**
 * Builds a {@link RegexAutomaton} recognizing all of the given strings.
 *
 * Starting from the automaton of the empty language, the automaton of each
 * string is joined in through {@code union}, in the order the strings are
 * given.
 *
 * @param strings the strings to recognize
 *
 * @return the automaton
 */
public static RegexAutomaton strings(
        String... strings) {
    RegexAutomaton accumulated = emptyLang();
    for (int i = 0; i < strings.length; i++) {
        RegexAutomaton single = string(strings[i]);
        accumulated = accumulated.union(single);
    }
    return accumulated;
}
/**
 * Yields the automaton built by {@link #string(String)} for the given
 * string.
 */
@Override
public RegexAutomaton singleString(
        String string) {
    return RegexAutomaton.string(string);
}
/**
 * Yields the automaton built by {@link #topString()}.
 */
@Override
public RegexAutomaton unknownString() {
    return RegexAutomaton.topString();
}
/**
 * Yields the automaton built by {@link #emptyLang()}.
 */
@Override
public RegexAutomaton emptyLanguage() {
    return RegexAutomaton.emptyLang();
}
/**
 * Yields the automaton built by {@link #emptyStr()}.
 */
@Override
public RegexAutomaton emptyString() {
    return RegexAutomaton.emptyStr();
}
@Override
public RegexAutomaton from(
SortedSet states,
SortedSet> transitions) {
return new RegexAutomaton(states, transitions);
}
/**
 * Yields {@link Atom#EPSILON} as the epsilon symbol of this automaton.
 */
@Override
public RegularExpression epsilon() {
    return Atom.EPSILON;
}
/**
 * Joins two transition symbols, yielding the result of
 * {@code first.comp(second)}.
 */
@Override
public RegularExpression concat(
        RegularExpression first,
        RegularExpression second) {
    RegularExpression joined = first.comp(second);
    return joined;
}
/**
 * Converts a transition symbol to a regular expression. Symbols of this
 * automaton already are {@link RegularExpression}s, so the given one is
 * returned unchanged.
 */
@Override
public RegularExpression symbolToRegex(
        RegularExpression symbol) {
    return symbol;
}
/**
 * Builds a new automaton with given {@code states} and {@code transitions},
 * forwarding both to the superclass constructor.
 *
 * @param states      the set of states of the new automaton
 * @param transitions the set of the transitions of the new automaton
 */
public RegexAutomaton(
        SortedSet<State> states,
        SortedSet<Transition<RegularExpression>> transitions) {
    super(states, transitions);
}
/**
 * Yields {@code true} if and only if there is at least one transition in
 * this automaton that recognizes a top string.
 *
 * {@code removeUnreachableStates()} is invoked on this automaton before
 * the transitions are inspected. NOTE(review): that call presumably
 * modifies this automaton in place; confirm against its definition.
 *
 * @return {@code true} if that condition holds
 */
public boolean acceptsTopEventually() {
    removeUnreachableStates();
    for (Transition<RegularExpression> t : transitions) {
        if (t.getSymbol() instanceof TopAtom) {
            return true;
        }
    }
    return false;
}
/**
 * Yields a new automaton that is built by exploding this one, that is, by
 * ensuring that each transition recognizes regular expressions of at most
 * one character (excluding the ones recognizing the top string).
 *
 * Every state of this automaton is mapped to a fresh state with the same
 * initial/final flags. Transitions whose symbol has {@code maxLength()}
 * lower than 2 are copied as they are; the others are replaced by a chain
 * of transitions, one for each regular expression returned by
 * {@code getSymbol().explode()}, going through fresh intermediate states
 * that are neither initial nor final. The result is minimized.
 *
 * This automaton is never modified by this method.
 *
 * @return the exploded automaton
 */
public RegexAutomaton explode() {
    SortedSet<State> exStates = new TreeSet<>();
    SortedSet<Transition<RegularExpression>> exTransitions = new TreeSet<>();
    int counter = 0;
    Map<State, State> mapping = new HashMap<>();

    for (State origin : states) {
        // the candidate is discarded if origin was already mapped
        State st = new State(counter++, origin.isInitial(), origin.isFinal());
        State replaced = mapping.computeIfAbsent(origin, s -> st);
        exStates.add(replaced);

        for (Transition<RegularExpression> t : getOutgoingTransitionsFrom(origin)) {
            State st1 = new State(counter++, t.getDestination().isInitial(), t.getDestination().isFinal());
            State dest = mapping.computeIfAbsent(t.getDestination(), s -> st1);
            exStates.add(dest);

            if (t.getSymbol().maxLength() < 2)
                exTransitions.add(new Transition<>(replaced, dest, t.getSymbol()));
            else {
                RegularExpression[] regexes = t.getSymbol().explode();
                State last = replaced;
                // the last piece is detected by position rather than by
                // reference, so that an instance appearing more than once
                // in the array cannot close the chain early
                for (int i = 0; i < regexes.length; i++)
                    if (i == regexes.length - 1)
                        exTransitions.add(new Transition<>(last, dest, regexes[i]));
                    else {
                        State temp = new State(counter++, false, false);
                        exStates.add(temp);
                        exTransitions.add(new Transition<>(last, temp, regexes[i]));
                        last = temp;
                    }
            }
        }
    }

    return new RegexAutomaton(exStates, exTransitions).minimize();
}
@Override
public RegexAutomaton intersection(
RegexAutomaton other) {
if (this == other)
return this;
int code = 0;
Map> stateMapping = new HashMap<>();
SortedSet newStates = new TreeSet<>();
SortedSet> newDelta = new TreeSet>();
for (State s1 : states)
for (State s2 : other.states) {
State s = new State(code++, s1.isInitial() && s2.isInitial(), s1.isFinal() && s2.isFinal());
stateMapping.put(s, Pair.of(s1, s2));
newStates.add(s);
}
for (Transition t1 : getTransitions()) {
for (Transition t2 : other.getTransitions()) {
State from = getStateFromPair(stateMapping, Pair.of(t1.getSource(), t2.getSource()));
State to = getStateFromPair(stateMapping, Pair.of(t1.getDestination(), t2.getDestination()));
if (t1.getSymbol().equals(t2.getSymbol()))
newDelta.add(new Transition(from, to, t1.getSymbol()));
else if (t1.getSymbol() == TopAtom.INSTANCE && t2.getSymbol() != TopAtom.INSTANCE)
newDelta.add(new Transition(from, to, t2.getSymbol()));
else if (t1.getSymbol() != TopAtom.INSTANCE && t2.getSymbol() == TopAtom.INSTANCE)
newDelta.add(new Transition(from, to, t1.getSymbol()));
}
}
RegexAutomaton result = from(newStates, newDelta).minimize();
return result;
}
/**
 * Yields a new automaton that is built by collapsing {@code this}, that is,
 * by merging together subsequent states that are never the root of a
 * branch, the destination of a loop, or that have at least one outgoing
 * transition recognizing the top string.
 *
 * For each sequence of mergeable states found along the paths of this
 * automaton, the symbols of the transition entering the sequence, of the
 * transitions inside it, and of the transition leaving it are concatenated
 * into a single {@link Atom}, which labels a new transition that bypasses
 * the sequence. The bypassed states and transitions are then removed, and
 * the result is minimized.
 *
 * {@code this} is never modified by this method.
 *
 * @return the collapsed automaton, or {@code this} if there are no paths
 *             or no mergeable states
 */
public RegexAutomaton collapse() {
    Set<Vector<State>> collected = new HashSet<>();

    Set<List<State>> paths = getAllPaths();
    if (paths.isEmpty())
        return this;

    for (List<State> v : paths)
        collected.addAll(findMergableStatesInPath(v));
    if (collected.isEmpty())
        return this;

    RegexAutomaton collapsed = copy();
    Set<Transition<RegularExpression>> edgesToRemove = new HashSet<>();
    Set<State> statesToRemove = new HashSet<>();

    for (Vector<State> v : collected) {
        String accumulated = "";
        if (v.size() == 1)
            statesToRemove.add(v.firstElement());
        else
            for (int i = 0; i < v.size() - 1; i++) {
                State from = v.get(i);
                State to = v.get(i + 1);
                Transition<RegularExpression> t = getAllTransitionsConnecting(from, to).iterator().next();
                accumulated += ((Atom) t.getSymbol()).toString();
                edgesToRemove.add(t);
                statesToRemove.add(from);
                statesToRemove.add(to);
            }

        // transition entering the sequence: its symbol goes first
        Transition<RegularExpression> in = collapsed.getIngoingTransitionsFrom(v.firstElement()).iterator()
                .next();
        edgesToRemove.add(in);
        accumulated = ((Atom) in.getSymbol()).toString() + accumulated;

        // transition leaving the sequence: its symbol goes last
        Transition<RegularExpression> out = collapsed.getOutgoingTransitionsFrom(v.lastElement()).iterator()
                .next();
        edgesToRemove.add(out);
        accumulated += ((Atom) out.getSymbol()).toString();

        collapsed.addTransition(in.getSource(), out.getDestination(), new Atom(accumulated));
    }

    collapsed.removeTransitions(edgesToRemove);
    collapsed.removeStates(statesToRemove);
    return collapsed.minimize();
}
private Set> findMergableStatesInPath(
List v) {
Set> collected = new HashSet<>();
if (v.size() == 1)
return collected;
Vector sequence = new Vector<>();
boolean collecting = false;
Set> tmp;
for (int i = 0; i < v.size() - 1; i++) {
State from = v.get(i);
State to = v.get(i + 1);
if (getAllTransitionsConnecting(from, to).size() != 1) {
// more than one edge connecting the nodes: this is an or
if (collecting) {
collecting = false;
collected.add(sequence);
sequence = new Vector<>();
}
} else if (getIngoingTransitionsFrom(to).size() != 1) {
// more than one edge reaching `to`: this is the join of an or
if (collecting) {
collecting = false;
collected.add(sequence);
sequence = new Vector<>();
}
} else if ((tmp = getOutgoingTransitionsFrom(to)).size() == 1
&& !(tmp.iterator().next().getSymbol() instanceof TopAtom)) {
// reading just a symbol that is not top!
sequence.add(to);
if (!collecting)
collecting = true;
} else if (collecting) {
collecting = false;
collected.add(sequence);
sequence = new Vector<>();
}
}
return collected;
}
/**
 * Yields a new automaton where all occurrences of strings recognized by
 * {@code toReplace} are replaced with the automaton {@code str}, assuming
 * that {@code toReplace} is finite (i.e., no loops nor top-transitions).
 * The resulting automaton is then collapsed.
 *
 * If {@code toReplace} recognizes a single string, then this method
 * performs a must-replacement, meaning that the string recognized by
 * {@code toReplace} will effectively be replaced. Otherwise, occurrences of
 * strings of {@code toReplace} are not replaced in the resulting automaton:
 * instead, a branching will be introduced to model an or between the
 * original string of {@code toReplace} and the whole {@code str}.
 *
 * One replaced-and-collapsed automaton is built for each string in the
 * language of {@code toReplace}, and the results are joined through union.
 * NOTE(review): if the language of {@code toReplace} is empty, the result
 * is the automaton of the empty language rather than {@code this}; confirm
 * that callers expect this.
 *
 * {@code this} is never modified by this method.
 *
 * @param toReplace the automaton recognizing the strings to replace
 * @param str       the automaton that must be used as replacement
 *
 * @return the replaced automaton
 *
 * @throws CyclicAutomatonException if {@code toReplace} contains loops
 */
public RegexAutomaton replace(
        RegexAutomaton toReplace,
        RegexAutomaton str)
        throws CyclicAutomatonException {
    // compute the language once: it is needed for both the size and the loop
    Collection<String> language = toReplace.getLanguage();
    boolean isSingleString = language.size() == 1;

    Collection<RegexAutomaton> automata = new ArrayList<>();
    for (String s : language)
        automata.add(new StringReplacer(this).replace(s, str, isSingleString).collapse());

    if (automata.size() == 1)
        return automata.iterator().next();

    return union(automata.toArray(new RegexAutomaton[automata.size()]));
}
/**
 * Joins the given automata through union, starting from the automaton of
 * the empty language. Each automaton is united with the result accumulated
 * so far, in the order they are given.
 *
 * @param automata the automata to join
 *
 * @return the union of all the given automata
 */
private RegexAutomaton union(
        RegexAutomaton... automata) {
    RegexAutomaton accumulated = emptyLanguage();
    for (int i = 0; i < automata.length; i++) {
        RegexAutomaton current = automata[i];
        accumulated = current.union(accumulated);
    }
    return accumulated;
}
/**
 * Yields an automaton that corresponds to the {@code n}-time concatenation
 * of {@code this}.
 *
 * When {@code n} is zero, the automaton of the empty string is returned.
 * Otherwise this automaton is converted to a regular expression, which is
 * simplified, repeated {@code n} times, converted back to an automaton and
 * minimized.
 *
 * @param n the number of repetitions
 *
 * @return an automaton that corresponds to the {@code n}-time concatenation
 *             of {@code this}
 */
public RegexAutomaton repeat(
        long n) {
    if (n != 0) {
        return toRegex()
                .simplify()
                .repeat(n)
                .toAutomaton(this)
                .minimize();
    }
    return emptyString();
}
/**
 * Yields a new automaton where leading whitespaces have been removed from
 * {@code this}.
 *
 * The trimming happens on the regular expression of this automaton, which
 * is then simplified and converted back to an automaton.
 *
 * @return a new automaton where leading whitespaces have been removed from
 *             {@code this}
 */
public RegexAutomaton trimLeft() {
    return toRegex()
            .trimLeft()
            .simplify()
            .toAutomaton(this);
}
/**
 * Yields a new automaton where trailing whitespaces have been removed from
 * {@code this}.
 *
 * The trimming happens on the regular expression of this automaton, which
 * is then simplified and converted back to an automaton.
 *
 * @return a new automaton where trailing whitespaces have been removed from
 *             {@code this}
 */
public RegexAutomaton trimRight() {
    return toRegex()
            .trimRight()
            .simplify()
            .toAutomaton(this);
}
/**
 * Yields a new automaton where trailing and leading whitespaces have been
 * removed from {@code this}.
 *
 * The regular expression of this automaton is trimmed on the right first
 * and on the left afterwards, simplifying after each step, and is finally
 * converted back to an automaton.
 *
 * @return a new automaton where trailing and leading whitespaces have been
 *             removed from {@code this}
 */
public RegexAutomaton trim() {
    return toRegex()
            .trimRight()
            .simplify()
            .trimLeft()
            .simplify()
            .toAutomaton(this);
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy