it.unive.lisa.analysis.string.tarsis.RegexAutomaton Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of lisa-analyses Show documentation
A library for static analysis
The newest version!
package it.unive.lisa.analysis.string.tarsis;

import it.unive.lisa.util.datastructures.automaton.Automaton;
import it.unive.lisa.util.datastructures.automaton.CyclicAutomatonException;
import it.unive.lisa.util.datastructures.automaton.State;
import it.unive.lisa.util.datastructures.automaton.Transition;
import it.unive.lisa.util.datastructures.regex.Atom;
import it.unive.lisa.util.datastructures.regex.RegularExpression;
import it.unive.lisa.util.datastructures.regex.TopAtom;
import it.unive.lisa.util.datastructures.regex.symbolic.SymbolicChar;
import it.unive.lisa.util.datastructures.regex.symbolic.SymbolicString;
import it.unive.lisa.util.datastructures.regex.symbolic.UnknownSymbolicChar;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.SortedSet;
import java.util.TreeSet;
import java.util.Vector;
import org.apache.commons.lang3.tuple.Pair;

/**
 * A class that describes a generic automaton (DFA, NFA, epsilon-NFA) using an
 * alphabet of strings, extended with a special symbol for statically unknown
 * ones. Transition symbols are {@link RegularExpression}s.
 *
 * @author Luca Negrini
 */
public class RegexAutomaton extends Automaton {

	/**
	 * Builds a {@link RegexAutomaton} recognizing the top string, that is, with
	 * a single transition recognizing {@link TopAtom}.
	 * <p>
	 * The automaton is made of a non-final initial state and a final state,
	 * connected by the only transition. The result is flagged as both
	 * deterministic and minimized.
	 * 
	 * @return the automaton
	 */
	public static RegexAutomaton topString() {
		// same two-state shape used by string(String) and emptyStr(): the
		// initial state is not accepting, so the only accepted path is the
		// one crossing the top transition
		State q0 = new State(0, true, false);
		State q1 = new State(1, false, true);

		SortedSet<State> states = new TreeSet<>();
		states.add(q0);
		states.add(q1);

		SortedSet<Transition<RegularExpression>> delta = new TreeSet<>();
		delta.add(new Transition<>(q0, q1, TopAtom.INSTANCE));

		RegexAutomaton result = new RegexAutomaton(states, delta);
		result.deterministic = Optional.of(true);
		result.minimized = Optional.of(true);
		return result;
	}

	/**
	 * Builds a {@link RegexAutomaton} recognizing the empty language.
	 * <p>
	 * The automaton has a single initial, non-final state and no transitions.
	 * The result is flagged as both deterministic and minimized.
	 * 
	 * @return the automaton
	 */
	public static RegexAutomaton emptyLang() {
		SortedSet<State> newStates = new TreeSet<>();
		newStates.add(new State(0, true, false));

		RegexAutomaton result = new RegexAutomaton(newStates, Collections.emptySortedSet());
		result.deterministic = Optional.of(true);
		result.minimized = Optional.of(true);
		return result;
	}

	/**
	 * Builds a {@link RegexAutomaton} recognizing the given string.
	 * <p>
	 * The automaton is made of a non-final initial state and a final state,
	 * connected by a single transition whose symbol is an {@link Atom} built
	 * from {@code string}. The result is flagged as both deterministic and
	 * minimized.
	 * 
	 * @param string the string to recognize
	 * 
	 * @return the automaton
	 */
	public static RegexAutomaton string(
			String string) {
		State q0 = new State(0, true, false);
		State q1 = new State(1, false, true);

		SortedSet<State> states = new TreeSet<>();
		states.add(q0);
		states.add(q1);

		SortedSet<Transition<RegularExpression>> delta = new TreeSet<>();
		delta.add(new Transition<>(q0, q1, new Atom(string)));

		RegexAutomaton result = new RegexAutomaton(states, delta);
		result.deterministic = Optional.of(true);
		result.minimized = Optional.of(true);
		return result;
	}

	/**
	 * Builds a {@link RegexAutomaton} recognizing the given symbolic string.
	 * <p>
	 * The characters yielded by {@code s.collapseTopChars()} are scanned in
	 * order: each maximal run of characters that are not
	 * {@link UnknownSymbolicChar}s becomes an automaton built through
	 * {@link #string(String)}, while each {@link UnknownSymbolicChar} becomes
	 * an automaton built through {@link #topString()}. The pieces are then
	 * concatenated from left to right. If no piece is produced, the automaton
	 * recognizing the empty string is returned.
	 * 
	 * @param s the string to recognize
	 * 
	 * @return the automaton
	 */
	public static RegexAutomaton string(
			SymbolicString s) {
		List<RegexAutomaton> pieces = new ArrayList<>();
		// accumulates the current run of known characters
		StringBuilder collector = new StringBuilder();
		for (SymbolicChar ch : s.collapseTopChars()) {
			if (ch instanceof UnknownSymbolicChar) {
				// an unknown character closes the current run, if any
				if (collector.length() > 0) {
					pieces.add(string(collector.toString()));
					collector.setLength(0);
				}
				pieces.add(topString());
			} else
				collector.append(ch.asChar());
		}

		// trailing run of known characters
		if (collector.length() > 0)
			pieces.add(string(collector.toString()));

		if (pieces.isEmpty())
			return emptyStr();

		// with a single piece the loop does not execute and the piece itself
		// is returned
		RegexAutomaton r = pieces.get(0);
		for (int i = 1; i < pieces.size(); i++)
			r = r.concat(pieces.get(i));

		return r;
	}

	/**
	 * Builds a {@link RegexAutomaton} recognizing the empty string.
	 * <p>
	 * The automaton is made of a non-final initial state and a final state,
	 * connected by a single transition recognizing {@link Atom#EPSILON}. The
	 * result is flagged as both deterministic and minimized.
	 * 
	 * @return the automaton
	 */
	public static RegexAutomaton emptyStr() {
		State q0 = new State(0, true, false);
		State q1 = new State(1, false, true);

		SortedSet<State> states = new TreeSet<>();
		states.add(q0);
		states.add(q1);

		SortedSet<Transition<RegularExpression>> delta = new TreeSet<>();
		delta.add(new Transition<>(q0, q1, Atom.EPSILON));

		RegexAutomaton result = new RegexAutomaton(states, delta);
		result.deterministic = Optional.of(true);
		result.minimized = Optional.of(true);
		return result;
	}

	/**
	 * Builds a {@link RegexAutomaton} recognizing all of the given strings.
	 * The result is obtained by starting from the automaton recognizing the
	 * empty language and joining it, one string at a time, with the automaton
	 * built by {@link #string(String)}.
	 * 
	 * @param strings the strings to recognize
	 * 
	 * @return the automaton
	 */
	public static RegexAutomaton strings(
			String... strings) {
		RegexAutomaton accumulator = emptyLang();
		for (int i = 0; i < strings.length; i++) {
			RegexAutomaton single = string(strings[i]);
			accumulator = accumulator.union(single);
		}
		return accumulator;
	}

	/**
	 * Yields the automaton recognizing only the given string, delegating to
	 * the static factory {@link #string(String)}.
	 */
	@Override
	public RegexAutomaton singleString(
			String string) {
		return RegexAutomaton.string(string);
	}

	/**
	 * Yields the automaton recognizing the top string, delegating to the
	 * static factory {@link #topString()}.
	 */
	@Override
	public RegexAutomaton unknownString() {
		return RegexAutomaton.topString();
	}

	/**
	 * Yields the automaton recognizing the empty language, delegating to the
	 * static factory {@link #emptyLang()}.
	 */
	@Override
	public RegexAutomaton emptyLanguage() {
		return RegexAutomaton.emptyLang();
	}

	/**
	 * Yields the automaton recognizing the empty string, delegating to the
	 * static factory {@link #emptyStr()}.
	 */
	@Override
	public RegexAutomaton emptyString() {
		return RegexAutomaton.emptyStr();
	}

	@Override
	public RegexAutomaton from(
			SortedSet states,
			SortedSet> transitions) {
		return new RegexAutomaton(states, transitions);
	}

	/**
	 * Yields the symbol used for epsilon transitions, that is,
	 * {@link Atom#EPSILON}.
	 */
	@Override
	public RegularExpression epsilon() {
		RegularExpression eps = Atom.EPSILON;
		return eps;
	}

	/**
	 * Yields the concatenation of the two given symbols, computed as
	 * {@code first.comp(second)}.
	 */
	@Override
	public RegularExpression concat(
			RegularExpression first,
			RegularExpression second) {
		RegularExpression composed = first.comp(second);
		return composed;
	}

	/**
	 * Converts a transition symbol to a regular expression. Since symbols of
	 * this automaton already are {@link RegularExpression}s, the given symbol
	 * is returned unchanged.
	 */
	@Override
	public RegularExpression symbolToRegex(
			RegularExpression symbol) {
		// identity conversion: nothing to translate
		return symbol;
	}

	/**
	 * Builds a new automaton with given {@code states} and {@code transitions}.
	 * Both sets are forwarded as they are to the superclass constructor.
	 *
	 * @param states      the set of states of the new automaton
	 * @param transitions the set of the transitions of the new automaton
	 */
	public RegexAutomaton(
			SortedSet<State> states,
			SortedSet<Transition<RegularExpression>> transitions) {
		super(states, transitions);
	}

	/**
	 * Yields {@code true} if and only if there is at least one transition in
	 * this automaton that recognizes a top string, that is, whose symbol is a
	 * {@link TopAtom}.
	 * <p>
	 * Note that {@code removeUnreachableStates()} is invoked on this automaton
	 * before inspecting its transitions.
	 * 
	 * @return {@code true} if that condition holds
	 */
	public boolean acceptsTopEventually() {
		removeUnreachableStates();
		for (Transition<RegularExpression> t : transitions)
			if (t.getSymbol() instanceof TopAtom)
				return true;

		return false;
	}

	/**
	 * Yields a new automaton that is built by exploding this one, that is, by
	 * ensuring that each transition recognizes regular expressions of at most
	 * one character (excluding the ones recognizing the top string).
	 * <p>
	 * Each transition whose symbol has a maximum length of at least 2 is
	 * replaced by a chain of transitions, one for each element returned by
	 * {@link RegularExpression#explode()}, going through fresh states that are
	 * neither initial nor final. The resulting automaton is minimized before
	 * being returned.
	 * <p>
	 * This automaton is never modified by this method.
	 * 
	 * @return the exploded automaton
	 */
	public RegexAutomaton explode() {
		SortedSet<State> exStates = new TreeSet<>();
		SortedSet<Transition<RegularExpression>> exTransitions = new TreeSet<>();
		int counter = 0;
		// maps each state of this automaton to its copy in the exploded one
		Map<State, State> mapping = new HashMap<>();

		for (State origin : states) {
			State st = new State(counter++, origin.isInitial(), origin.isFinal());
			State replaced = mapping.computeIfAbsent(origin, s -> st);
			exStates.add(replaced);
			for (Transition<RegularExpression> t : getOutgoingTransitionsFrom(origin)) {
				State target = t.getDestination();
				State st1 = new State(counter++, target.isInitial(), target.isFinal());
				State dest = mapping.computeIfAbsent(target, s -> st1);
				exStates.add(dest);

				RegularExpression symbol = t.getSymbol();
				if (symbol.maxLength() < 2) {
					// already short enough: keep the transition as it is
					exTransitions.add(new Transition<>(replaced, dest, symbol));
					continue;
				}

				RegularExpression[] regexes = symbol.explode();
				State last = replaced;
				// the last piece is detected by position and not by
				// reference, so that an instance appearing more than once in
				// the array cannot be mistaken for the final one
				for (int i = 0; i < regexes.length; i++)
					if (i == regexes.length - 1)
						exTransitions.add(new Transition<>(last, dest, regexes[i]));
					else {
						State temp = new State(counter++, false, false);
						exStates.add(temp);
						exTransitions.add(new Transition<>(last, temp, regexes[i]));
						last = temp;
					}
			}
		}

		return new RegexAutomaton(exStates, exTransitions).minimize();
	}

	@Override
	public RegexAutomaton intersection(
			RegexAutomaton other) {
		if (this == other)
			return this;

		int code = 0;
		Map> stateMapping = new HashMap<>();
		SortedSet newStates = new TreeSet<>();
		SortedSet> newDelta = new TreeSet>();

		for (State s1 : states)
			for (State s2 : other.states) {
				State s = new State(code++, s1.isInitial() && s2.isInitial(), s1.isFinal() && s2.isFinal());
				stateMapping.put(s, Pair.of(s1, s2));
				newStates.add(s);
			}

		for (Transition t1 : getTransitions()) {
			for (Transition t2 : other.getTransitions()) {
				State from = getStateFromPair(stateMapping, Pair.of(t1.getSource(), t2.getSource()));
				State to = getStateFromPair(stateMapping, Pair.of(t1.getDestination(), t2.getDestination()));

				if (t1.getSymbol().equals(t2.getSymbol()))
					newDelta.add(new Transition(from, to, t1.getSymbol()));
				else if (t1.getSymbol() == TopAtom.INSTANCE && t2.getSymbol() != TopAtom.INSTANCE)
					newDelta.add(new Transition(from, to, t2.getSymbol()));
				else if (t1.getSymbol() != TopAtom.INSTANCE && t2.getSymbol() == TopAtom.INSTANCE)
					newDelta.add(new Transition(from, to, t1.getSymbol()));
			}
		}

		RegexAutomaton result = from(newStates, newDelta).minimize();
		return result;
	}

	/**
	 * Yields a new automaton that is built by collapsing {@code this}, that is,
	 * by merging together subsequent states that are never the root of a
	 * branch, the destination of a loop, or that have at least one outgoing
	 * transition recognizing the top string.
	 * <p>
	 * For each sequence of mergeable states, the symbols of the transitions
	 * entering, crossing and leaving the sequence are cast to {@link Atom} and
	 * their string representations are joined into a single new {@link Atom},
	 * used for a transition that bypasses the sequence. The result is
	 * minimized before being returned. If this automaton has no paths, or no
	 * mergeable sequence is found, {@code this} is returned.
	 * <p>
	 * {@code this} is never modified by this method.
	 * 
	 * @return the collapsed automaton
	 */
	public RegexAutomaton collapse() {
		Set<Vector<State>> collected = new HashSet<>();

		Set<List<State>> paths = getAllPaths();
		if (paths.isEmpty())
			return this;

		for (List<State> v : paths)
			collected.addAll(findMergableStatesInPath(v));

		if (collected.isEmpty())
			return this;

		RegexAutomaton collapsed = copy();
		Set<Transition<RegularExpression>> edgesToRemove = new HashSet<>();
		Set<State> statesToRemove = new HashSet<>();
		for (Vector<State> v : collected) {
			String accumulated = "";
			if (v.size() == 1)
				statesToRemove.add(v.firstElement());
			else
				// join the symbols of the transitions inside the sequence
				for (int i = 0; i < v.size() - 1; i++) {
					State from = v.get(i);
					State to = v.get(i + 1);
					Transition<RegularExpression> t = getAllTransitionsConnecting(from, to).iterator().next();
					accumulated += ((Atom) t.getSymbol()).toString();
					edgesToRemove.add(t);
					statesToRemove.add(from);
					statesToRemove.add(to);
				}

			// prepend the symbol entering the sequence and append the one
			// leaving it
			Transition<RegularExpression> in = collapsed.getIngoingTransitionsFrom(v.firstElement()).iterator()
					.next();
			edgesToRemove.add(in);
			accumulated = ((Atom) in.getSymbol()).toString() + accumulated;
			Transition<RegularExpression> out = collapsed.getOutgoingTransitionsFrom(v.lastElement()).iterator()
					.next();
			edgesToRemove.add(out);
			accumulated += ((Atom) out.getSymbol()).toString();

			collapsed.addTransition(in.getSource(), out.getDestination(), new Atom(accumulated));
		}

		collapsed.removeTransitions(edgesToRemove);
		collapsed.removeStates(statesToRemove);
		return collapsed.minimize();
	}

	private Set> findMergableStatesInPath(
			List v) {
		Set> collected = new HashSet<>();
		if (v.size() == 1)
			return collected;

		Vector sequence = new Vector<>();
		boolean collecting = false;
		Set> tmp;
		for (int i = 0; i < v.size() - 1; i++) {
			State from = v.get(i);
			State to = v.get(i + 1);
			if (getAllTransitionsConnecting(from, to).size() != 1) {
				// more than one edge connecting the nodes: this is an or
				if (collecting) {
					collecting = false;
					collected.add(sequence);
					sequence = new Vector<>();
				}
			} else if (getIngoingTransitionsFrom(to).size() != 1) {
				// more than one edge reaching `to`: this is the join of an or
				if (collecting) {
					collecting = false;
					collected.add(sequence);
					sequence = new Vector<>();
				}
			} else if ((tmp = getOutgoingTransitionsFrom(to)).size() == 1
					&& !(tmp.iterator().next().getSymbol() instanceof TopAtom)) {
				// reading just a symbol that is not top!
				sequence.add(to);
				if (!collecting)
					collecting = true;
			} else if (collecting) {
				collecting = false;
				collected.add(sequence);
				sequence = new Vector<>();
			}
		}

		return collected;
	}

	/**
	 * Yields a new automaton where all occurrences of strings recognized by
	 * {@code toReplace} are replaced with the automaton {@code str}, assuming
	 * that {@code toReplace} is finite (i.e., no loops nor top-transitions).
	 * The resulting automaton is then collapsed.
	 * <p>
	 * If {@code toReplace} recognizes a single string, then this method
	 * performs a must-replacement, meaning that the string recognized by
	 * {@code toReplace} will effectively be replaced. Otherwise, occurrences of
	 * strings of {@code toReplace} are not replaced in the resulting automaton:
	 * instead, a branching will be introduced to model an or between the
	 * original string of {@code toReplace} and the whole {@code str}.
	 * <p>
	 * {@code this} is never modified by this method.
	 * 
	 * @param toReplace the automaton recognizing the strings to replace
	 * @param str       the automaton that must be used as replacement
	 * 
	 * @return the replaced automaton
	 * 
	 * @throws CyclicAutomatonException if {@code toReplace} contains loops
	 */
	public RegexAutomaton replace(
			RegexAutomaton toReplace,
			RegexAutomaton str)
			throws CyclicAutomatonException {
		// the language is computed once and reused for both the size check
		// and the iteration
		Collection<String> language = toReplace.getLanguage();
		boolean isSingleString = language.size() == 1;

		Collection<RegexAutomaton> automata = new ArrayList<>();
		for (String s : language)
			automata.add(new StringReplacer(this).replace(s, str, isSingleString).collapse());

		if (automata.size() == 1)
			return automata.iterator().next();

		// one automaton per string to replace: join all of them
		return union(automata.toArray(new RegexAutomaton[automata.size()]));
	}

	/**
	 * Joins all the given automata together. The computation starts from the
	 * automaton yielded by {@link #emptyLanguage()}, and each of the given
	 * automata is united, in order, with the result accumulated so far.
	 * 
	 * @param automata the automata to join
	 * 
	 * @return the union of all the given automata
	 */
	private RegexAutomaton union(
			RegexAutomaton... automata) {
		RegexAutomaton accumulator = emptyLanguage();
		for (int i = 0; i < automata.length; i++) {
			RegexAutomaton current = automata[i];
			accumulator = current.union(accumulator);
		}
		return accumulator;
	}

	/**
	 * Yields an automaton that corresponds to the {@code n}-time concatenation
	 * of {@code this}. When {@code n} is 0, the automaton yielded by
	 * {@link #emptyString()} is returned. Otherwise, this automaton is
	 * converted to a regular expression that is simplified, repeated
	 * {@code n} times, converted back to an automaton and minimized.
	 * 
	 * @param n the number of repetitions
	 * 
	 * @return an automaton that corresponds to the {@code n}-time concatenation
	 *             of {@code this}
	 */
	public RegexAutomaton repeat(
			long n) {
		if (n != 0)
			return toRegex()
					.simplify()
					.repeat(n)
					.toAutomaton(this)
					.minimize();

		// zero repetitions
		return emptyString();
	}

	/**
	 * Yields a new automaton where leading whitespaces have been removed from
	 * {@code this}. The removal is performed on the regular expression
	 * equivalent to this automaton, which is then simplified and converted
	 * back to an automaton.
	 * 
	 * @return a new automaton where leading whitespaces have been removed from
	 *             {@code this}
	 */
	public RegexAutomaton trimLeft() {
		RegularExpression regex = toRegex();
		return regex.trimLeft()
				.simplify()
				.toAutomaton(this);
	}

	/**
	 * Yields a new automaton where trailing whitespaces have been removed from
	 * {@code this}. The removal is performed on the regular expression
	 * equivalent to this automaton, which is then simplified and converted
	 * back to an automaton.
	 * 
	 * @return a new automaton where trailing whitespaces have been removed from
	 *             {@code this}
	 */
	public RegexAutomaton trimRight() {
		RegularExpression regex = toRegex();
		return regex.trimRight()
				.simplify()
				.toAutomaton(this);
	}

	/**
	 * Yields a new automaton where trailing and leading whitespaces have been
	 * removed from {@code this}. Working on the regular expression equivalent
	 * to this automaton, trailing whitespaces are removed first, then leading
	 * ones, simplifying the expression after each step; the result is then
	 * converted back to an automaton.
	 * 
	 * @return a new automaton where trailing and leading whitespaces have been
	 *             removed from {@code this}
	 */
	public RegexAutomaton trim() {
		RegularExpression regex = toRegex();
		return regex.trimRight()
				.simplify()
				.trimLeft()
				.simplify()
				.toAutomaton(this);
	}
}