// org.evosuite.utils.RegexDistanceUtils Maven / Gradle / Ivy
/**
* Copyright (C) 2010-2018 Gordon Fraser, Andrea Arcuri and EvoSuite
* contributors
*
* This file is part of EvoSuite.
*
* EvoSuite is free software: you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License as published
* by the Free Software Foundation, either version 3.0 of the License, or
* (at your option) any later version.
*
* EvoSuite is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with EvoSuite. If not, see <http://www.gnu.org/licenses/>.
*/
package org.evosuite.utils;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Queue;
import java.util.Set;
import java.util.regex.Pattern;
import org.jgrapht.DirectedGraph;
import org.jgrapht.alg.CycleDetector;
import org.jgrapht.graph.DefaultDirectedGraph;
import org.jgrapht.graph.DefaultEdge;
import org.jgrapht.traverse.TopologicalOrderIterator;
import dk.brics.automaton.Automaton;
import dk.brics.automaton.RegExp;
import dk.brics.automaton.State;
import dk.brics.automaton.Transition;
/**
* Class used to define the distance between a string and a regex
*/
public class RegexDistanceUtils {
/*
* Automatons for regex can be expensive to build. So we cache them,
* as we might need to access to them several times during the search
*/
private static Map> regexStateCache = new HashMap>();
private static Map regexAutomatonCache = new HashMap();
/**
 * Returns the automaton associated with the given regex, building and caching
 * it first if it is not in the cache yet.
 *
 * @param regex the regular expression to look up
 * @return the cached automaton for {@code regex}
 */
public static Automaton getRegexAutomaton(String regex) {
    // Same "populate on miss, then read" sequence, shared with the other accessors.
    return getAndCacheAutomaton(regex);
}
/**
 * Returns the automaton's shortest example for the given regex, requested
 * with {@code getShortestExample(true)}.
 *
 * @param regex the regular expression
 * @return the string returned by the automaton for the "accepted" case
 */
public static String getRegexInstance(String regex) {
    // getRegexAutomaton populates the cache on a miss before reading from it.
    Automaton cached = getRegexAutomaton(regex);
    final boolean accepted = true;
    return cached.getShortestExample(accepted);
}
/**
 * Returns the automaton's shortest example for the given regex, requested
 * with {@code getShortestExample(false)}.
 *
 * @param regex the regular expression
 * @return the string returned by the automaton for the "not accepted" case
 */
public static String getNonMatchingRegexInstance(String regex) {
    // getRegexAutomaton populates the cache on a miss before reading from it.
    Automaton cached = getRegexAutomaton(regex);
    final boolean accepted = false;
    return cached.getShortestExample(accepted);
}
/**
 * Immutable description of one incoming edge of a node in the distance graph:
 * where the edge comes from (row and automaton state), what kind of edit
 * operation it stands for, and how much taking it costs.
 */
private static class GraphTransition {

    /** Kind of edit operation an edge represents. */
    public enum TransitionType {
        INSERTION,
        DELETION,
        REPLACEMENT,
        /**
         * A phantom transition is an artificial transition from the sink/final states
         * to a single artificial sink/state. This is used to simplify the recursion
         * calculation of the subpath costs.
         */
        PHANTOM
    }

    /** Cost of following this edge. */
    public final double cost;
    /** Row of the node this edge starts from. */
    public final int fromRow;
    /** Automaton state of the node this edge starts from. */
    public final State fromState;
    /** Edit operation represented by this edge. */
    public final TransitionType type;

    public GraphTransition(double cost, int fromRow, State fromState, TransitionType type) {
        this.type = type;
        this.fromState = fromState;
        this.fromRow = fromRow;
        this.cost = cost;
    }
}
/**
 * Squashes a value with the formula {@code x / (x + 1)}.
 * For non-negative inputs the result lies in [0,1): 0 maps to 0 and
 * larger values approach 1.
 *
 * @param x the value to normalize
 * @return {@code x / (x + 1.0)}
 */
private static double normalize(double x) {
    final double denominator = x + 1.0;
    return x / denominator;
}
/**
 * Java regular expressions contain predefined character classes which the
 * regex parser cannot handle. This method rewrites them into explicit
 * character classes, drops a leading {@code ^} and a trailing unescaped
 * {@code $} anchor, and removes embedded flag expressions and reluctant
 * quantifiers.
 *
 * @param regex a Java regular expression
 * @return the rewritten regular expression
 */
public static String expandRegex(String regex) {
    // . Any character (may or may not match line terminators)
    // \d A digit: [0-9]
    String newRegex = regex.replaceAll("\\\\d", "[0-9]");
    // \D A non-digit: [^0-9]
    newRegex = newRegex.replaceAll("\\\\D", "[^0-9]");
    // \s A whitespace character: [ \t\n\x0B\f\r]
    // The replacement holds the REAL control characters. In a replaceAll
    // replacement string a backslash escapes the following character, so a
    // textual "\t" would be emitted as the plain letter 't'.
    newRegex = newRegex.replaceAll("\\\\s", "[ \t\n\f\r]");
    // \S A non-whitespace character: [^\s]
    newRegex = newRegex.replaceAll("\\\\S", "[^ \t\n\f\r]");
    // \w A word character: [a-zA-Z_0-9]
    newRegex = newRegex.replaceAll("\\\\w", "[a-zA-Z_0-9]");
    // \W A non-word character: [^\w]
    newRegex = newRegex.replaceAll("\\\\W", "[^a-zA-Z_0-9]");

    if (newRegex.startsWith("^")) {
        newRegex = newRegex.substring(1);
    }
    // Only strip a real end anchor: "\$" is a literal dollar sign, and removing
    // it would leave a dangling backslash at the end of the expression.
    if (endsWithUnescapedDollar(newRegex)) {
        newRegex = newRegex.substring(0, newRegex.length() - 1);
    }

    // TODO: Some of these should be handled, not just ignored!
    newRegex = removeFlagExpressions(newRegex);
    newRegex = removeReluctantOperators(newRegex);
    return newRegex;
}

/**
 * Tells whether the expression ends with a {@code $} that is not escaped,
 * i.e. that is preceded by an even number (possibly zero) of backslashes.
 */
private static boolean endsWithUnescapedDollar(String regex) {
    if (!regex.endsWith("$")) {
        return false;
    }
    int precedingBackslashes = 0;
    for (int i = regex.length() - 2; i >= 0 && regex.charAt(i) == '\\'; i--) {
        precedingBackslashes++;
    }
    return precedingBackslashes % 2 == 0;
}

/**
 * Removes the single-flag expressions {@code (?i)}, {@code (?d)}, {@code (?x)},
 * {@code (?m)}, {@code (?s)} and {@code (?u)} from the regex. The flags are
 * dropped, not emulated.
 *
 * @param regex the regular expression
 * @return the regex without those flag expressions
 */
protected static String removeFlagExpressions(String regex) {
    // Case insensitive
    regex = regex.replaceAll("\\(\\?i\\)", "");
    // Unix lines mode
    regex = regex.replaceAll("\\(\\?d\\)", "");
    // Permit comments and whitespace in pattern
    regex = regex.replaceAll("\\(\\?x\\)", "");
    // Multiline mode
    regex = regex.replaceAll("\\(\\?m\\)", "");
    // Dotall
    regex = regex.replaceAll("\\(\\?s\\)", "");
    // Unicode case
    regex = regex.replaceAll("\\(\\?u\\)", "");
    return regex;
}

/**
 * Turns the reluctant quantifiers {@code +?}, {@code *?} and {@code ??} into
 * their plain counterparts {@code +}, {@code *} and {@code ?}.
 *
 * @param regex the regular expression
 * @return the regex with reluctant quantifiers replaced
 */
protected static String removeReluctantOperators(String regex) {
    regex = regex.replaceAll("\\+\\?", "+");
    regex = regex.replaceAll("\\*\\?", "*");
    regex = regex.replaceAll("\\?\\?", "?");
    return regex;
}
/**
* Ensure that each row has the full data structures containing the target state
*
* @param transitions
* @param state
* @param numRows
*/
private static void ensureState(
Map>> transitions, State state,
int numRows) {
for (int row = 0; row <= numRows; row++) {
if (!transitions.containsKey(row))
transitions.put(row, new HashMap>());
if (!transitions.get(row).containsKey(state))
transitions.get(row).put(state, new HashSet());
}
}
/**
 * Builds the automaton for the given regex and stores it, together with a
 * topological ordering of its states, in the two caches. Both cache entries
 * are keyed by the regex as passed in, while the automaton itself is built
 * from the result of {@link #expandRegex(String)}.
 *
 * @param regex the regular expression to compile and cache
 */
private static void cacheRegex(String regex) {
    String r = expandRegex(regex);
    Automaton automaton = new RegExp(r, RegExp.NONE).toAutomaton();
    automaton.expandSingleton();

    // We convert this to a graph without self-loops in order to determine the topological order
    DirectedGraph<State, DefaultEdge> regexGraph = new DefaultDirectedGraph<State, DefaultEdge>(
            DefaultEdge.class);
    Set<State> visitedStates = new HashSet<State>();
    Queue<State> states = new LinkedList<State>();
    State initialState = automaton.getInitialState();
    states.add(initialState);

    // Breadth-first walk over the states reachable from the initial state
    while (!states.isEmpty()) {
        State currentState = states.poll();
        if (visitedStates.contains(currentState)) {
            continue;
        }
        if (!regexGraph.containsVertex(currentState)) {
            regexGraph.addVertex(currentState);
        }
        for (Transition t : currentState.getTransitions()) {
            // Need to get rid of back edges, otherwise there is no topological order!
            if (!t.getDest().equals(currentState)) {
                regexGraph.addVertex(t.getDest());
                regexGraph.addEdge(currentState, t.getDest());
                states.add(t.getDest());
                // Undo the edge just added if it closed a cycle
                CycleDetector<State, DefaultEdge> det = new CycleDetector<State, DefaultEdge>(
                        regexGraph);
                if (det.detectCycles()) {
                    regexGraph.removeEdge(currentState, t.getDest());
                }
            }
        }
        visitedStates.add(currentState);
    }

    TopologicalOrderIterator<State, DefaultEdge> iterator = new TopologicalOrderIterator<State, DefaultEdge>(
            regexGraph);
    List<State> topologicalOrder = new ArrayList<State>();
    while (iterator.hasNext()) {
        topologicalOrder.add(iterator.next());
    }

    regexStateCache.put(regex, topologicalOrder);
    regexAutomatonCache.put(regex, automaton);
}
/**
 * Computes the distance between {@code arg} and the given regex.
 * All operations (insertion/deletion/replacement) cost 1, and there is no
 * assumption on where and how the operations can be done (ie all sequences
 * are valid).
 *
 * When the regex is reported as unsupported, the default distance is
 * returned instead of the graph-based one.
 *
 * @param arg the string to compare against the regex
 * @param regex the regular expression
 * @return the computed distance
 */
public static int getStandardDistance(String arg, String regex) {
    if (isSupportedRegex(regex)) {
        RegexGraph regexGraph = new RegexGraph(arg, regex);
        CostMatrix costMatrix = new CostMatrix();
        return costMatrix.calculateStandardCost(regexGraph);
    }
    // Unsupported feature in the regex: fall back to the default distance
    return getDefaultDistance(arg, regex);
}
/**
 * Coarse distance based on {@link java.util.regex.Pattern}: 0 when the whole
 * of {@code arg} matches the regex, 1 otherwise.
 *
 * @param arg the string to test
 * @param regex the regular expression
 * @return 0 on a full match, 1 otherwise
 */
private static int getDefaultDistance(String arg, String regex) {
    boolean fullMatch = Pattern.compile(regex).matcher(arg).matches();
    return fullMatch ? 0 : 1;
}
/**
 * Determine whether the regex requires features that are
 * not supported by the regex automaton library.
 * The only check performed is for the character sequence {@code \b}.
 *
 * @param regex the regular expression to inspect
 * @return {@code false} if the regex contains {@code \b}, {@code true} otherwise
 */
private static boolean isSupportedRegex(String regex) {
    boolean containsBackslashB = regex.contains("\\b");
    return !containsBackslashB;
}
/**
 * Computes the distance between {@code arg} and the given regex.
 * Insertion/deletion cost 1, whereas replacement is in [0,1] depending
 * on the actual character values.
 *
 * Note: the distance is tailored for the StringAVM algorithm,
 * in which characters are only inserted/appended at the end.
 *
 * @param arg the string to compare against the regex
 * @param regex the regular expression
 * @return the computed distance
 */
public static double getDistanceTailoredForStringAVM(String arg, String regex) {
    RegexGraph regexGraph = new RegexGraph(arg, regex);
    CostMatrix costMatrix = new CostMatrix();
    double distance = costMatrix.calculateCostForStringAVM(regexGraph);
    return distance;
}
/**
 * Looks up the automaton for the given regex, creating and caching it the
 * first time the regex is seen.
 *
 * @param regex the regular expression
 * @return the automaton stored in the cache for {@code regex}
 */
protected static Automaton getAndCacheAutomaton(String regex) {
    boolean firstTime = !regexAutomatonCache.containsKey(regex);
    if (firstTime) {
        // Create an automaton representing the regex and store it in the cache
        cacheRegex(regex);
    }
    return regexAutomatonCache.get(regex);
}
/**
* A graph created based on an "arg" that is matched against a "regex".
* There is going to be arg.length+1 copies of the regex automaton.
* Each copy represents a "row".
* Each automaton state, in topological order, represents a "column".
* The graph can be considered as a "rows"x"columns" matrix.
*
* @author arcuri
*
*/
private static class RegexGraph {
private Map>> transitions;
private Map intToStateMap;
private Map stateToIntMap;
/**
* Build the graph
* @param arg
* @param regex
*/
public RegexGraph(String arg, String regex){
transitions = createGraph(arg,regex);
}
public int getNumberOfRows(){
return transitions.keySet().size();
}
public int getNumberOfColumns(){
return stateToIntMap.size();
}
/**
* Get all the incoming transitions to the node located at coordinate "row" and "column"
* @param row
* @param column
* @return
*/
public Set getIncomingTransitions(int row, int column){
State state = intToStateMap.get(column);
return transitions.get(row).get(state);
}
public int getColumn(State state){
return stateToIntMap.get(state);
}
private Map>> createGraph(String arg, String regex){
/*
* Create a graph to calculate the distance. The algorithm is based on what discussed in:
*
* Mohammad Alshraideh and Leonardo Bottaci
* Search-based software test data generation for string data using program-specific search operators
* http://neo.lcc.uma.es/mase/attachments/085_TestDataGenerationForStringData.pdf
*
* and
*
* EUGENE W. MYERS and WEBB MILLER
* APPROXIMATE MATCHING OF REGULAR EXPRESSIONS
* http://www.cs.mun.ca/~harold/Courses/Old/Ling6800.W06/Diary/reg.aprox.pdf
*/
Automaton automaton = getAndCacheAutomaton(regex);
final int NUM_CHARS = arg.length();
List topologicalOrder = regexStateCache.get(regex);
Map>> transitions = new HashMap>>();
intToStateMap = new HashMap();
stateToIntMap = new HashMap();
int numState = 0;
for (State currentState : topologicalOrder) {
/*
* Init data structure to quickly map/access state/index
*/
stateToIntMap.put(currentState, numState);
intToStateMap.put(numState, currentState);
numState++;
for (Transition t : currentState.getTransitions()) {
State destination = t.getDest();
ensureState(transitions, destination , NUM_CHARS);
for (int row = 0; row <= NUM_CHARS; row++) {
/*
* add an insertion edge from currentState in row to target state in same row
*/
transitions.get(row).get(destination).add(new GraphTransition(1.0, row, currentState, GraphTransition.TransitionType.INSERTION));
}
for (int row = 0; row < NUM_CHARS; row++) {
/*
* Add a replacement edge from currentState in row to t.getDest in row+1
* if charAt row+1 == the parameter of this transition, this is a zero-cost edge
*/
double cost = 0.0;
if (arg.charAt(row) < t.getMin() || arg.charAt(row) > t.getMax()) {
int distMin = Math.abs(arg.charAt(row) - t.getMin());
int distMax = Math.abs(arg.charAt(row) - t.getMax());
cost = normalize(Math.min(distMin, distMax));
}
/*
* Important: even if the cost is 0 (eg match on the arg/regex in which we replace char X with X), we CANNOT
* use a PHANTOM transition. Even if we do not replace anything, we still need to consider it as a replacement
* transition. Consider the case
*
* "ac".matches("abc")
*
* If we used a phantom transition to represent the alignment c/c, then it would be possible to insert 'b' in the
* middle of "abc". On the other hand, if we use a replacement c/c, then inserting 'b' would not be allowed, as an
* insertion cannot be followed by a replacement.
*/
transitions.get(row + 1).get(destination).add(new GraphTransition(cost, row, currentState, GraphTransition.TransitionType.REPLACEMENT));
}
}
ensureState(transitions, currentState, NUM_CHARS);
for (int row = 0; row < NUM_CHARS; row++) {
/*
* add a deletion edge with cost 1 from currentState to currentState in next row
*/
transitions.get(row + 1).get(currentState).add(new GraphTransition(1.0, row, currentState, GraphTransition.TransitionType.DELETION));
}
}
// Add zero-cost transitions from accepting states to final state
State finalState = new State();
ensureState(transitions, finalState, NUM_CHARS);
for (State s : automaton.getStates()) {
if (s.isAccept()) {
transitions.get(NUM_CHARS).get(finalState).add(new GraphTransition(0, NUM_CHARS, s, GraphTransition.TransitionType.PHANTOM));
}
}
intToStateMap.put(numState, finalState);
stateToIntMap.put(finalState, numState);
return transitions;
}
}
/**
* Class used to calculate the cost, ie the actual distance, based on a RegexGraph.
*
* @author arcuri
*/
private static class CostMatrix{
private final int DEL = 0;
private final int REP = 1;
private final int INS = 2;
public CostMatrix() {
super();
}
public int calculateStandardCost(RegexGraph graph){
final int ROWS = graph.getNumberOfRows();
final int COLUMNS = graph.getNumberOfColumns();
final double[][] matrix = new double[ROWS][COLUMNS];
// First row is cost of matching empty sequence on regex
final int FIRST_ROW = 0;
/*
* init first starting state with 0 costs
*/
matrix[FIRST_ROW][0] = 0;
//look at first row (which is special)
for (int col = 1; col < graph.getNumberOfColumns(); col++) {
double min = Double.MAX_VALUE;
for (GraphTransition t : graph.getIncomingTransitions(FIRST_ROW, col)) {
int otherCol = graph.getColumn(t.fromState);
//self transition
if (col == otherCol){
continue;
}
double otherCost = matrix[FIRST_ROW][otherCol];
min = Math.min(min, getSubPathCost(otherCost, Math.ceil(t.cost)));
}
matrix[FIRST_ROW][col] = min;
}
//then look at the other rows
for(int i=1; i