org.evosuite.utils.RegexDistanceUtils Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of evosuite-client Show documentation
The newest version!
/**
 * Copyright (C) 2010-2018 Gordon Fraser, Andrea Arcuri and EvoSuite
 * contributors
 *
 * This file is part of EvoSuite.
 *
 * EvoSuite is free software: you can redistribute it and/or modify it
 * under the terms of the GNU Lesser General Public License as published
 * by the Free Software Foundation, either version 3.0 of the License, or
 * (at your option) any later version.
 *
 * EvoSuite is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with EvoSuite. If not, see <http://www.gnu.org/licenses/>.
 */
package org.evosuite.utils;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Queue;
import java.util.Set;
import java.util.regex.Pattern;

import org.jgrapht.DirectedGraph;
import org.jgrapht.alg.CycleDetector;
import org.jgrapht.graph.DefaultDirectedGraph;
import org.jgrapht.graph.DefaultEdge;
import org.jgrapht.traverse.TopologicalOrderIterator;

import dk.brics.automaton.Automaton;
import dk.brics.automaton.RegExp;
import dk.brics.automaton.State;
import dk.brics.automaton.Transition;

/**
 *  Class used to define the distance between a string and a regex
 */
public class RegexDistanceUtils {

	/*
	 * Automatons for regex can be expensive to build. So we cache them,
	 * as we might need to access to them several times during the search
	 */
	private static Map> regexStateCache = new HashMap>();
	private static Map regexAutomatonCache = new HashMap();

	/**
	 * Return the automaton built for the given regex, building and caching
	 * it first if this regex has not been seen before.
	 *
	 * @param regex the regular expression used as cache key
	 * @return the cached automaton for {@code regex}
	 */
	public static Automaton getRegexAutomaton(String regex) {
		final boolean alreadyCached = regexAutomatonCache.containsKey(regex);
		if (!alreadyCached) {
			cacheRegex(regex);
		}
		return regexAutomatonCache.get(regex);
	}

	/**
	 * Return the automaton's shortest example string requested with the
	 * "accepted" flag set to {@code true}, i.e. a string for the matching case.
	 * The automaton is built and cached on first use.
	 *
	 * @param regex the regular expression
	 * @return the shortest example reported by the automaton for the accepting case
	 */
	public static String getRegexInstance(String regex) {
		final boolean missing = !regexAutomatonCache.containsKey(regex);
		if (missing) {
			cacheRegex(regex);
		}
		final Automaton cached = regexAutomatonCache.get(regex);
		final String shortestAccepted = cached.getShortestExample(true);
		return shortestAccepted;
	}

	/**
	 * Return the automaton's shortest example string requested with the
	 * "accepted" flag set to {@code false}, i.e. a string for the non-matching case.
	 * The automaton is built and cached on first use.
	 *
	 * @param regex the regular expression
	 * @return the shortest example reported by the automaton for the rejecting case
	 */
	public static String getNonMatchingRegexInstance(String regex) {
		final boolean missing = !regexAutomatonCache.containsKey(regex);
		if (missing) {
			cacheRegex(regex);
		}
		final Automaton cached = regexAutomatonCache.get(regex);
		final String shortestRejected = cached.getShortestExample(false);
		return shortestRejected;
	}

	/**
	 * Immutable value object describing one incoming edge of a node in the
	 * distance graph: where the edge starts (row and automaton state), what
	 * kind of edit it represents and how much it costs.
	 */
	private static class GraphTransition {

		/** The kind of edit an edge stands for. */
		public enum TransitionType {
			INSERTION,
			DELETION,
			REPLACEMENT,
			/**
			 * A phantom transition is an artificial transition from the sink/final states to a single artificial sink/state.
			 * This is used to simplify the recursion calculation of the subpath costs.
			 */
			PHANTOM
		}

		/** Kind of edit represented by this edge. */
		public final TransitionType type;
		/** Automaton state the edge starts from. */
		public final State fromState;
		/** Row (copy of the automaton) the edge starts from. */
		public final int fromRow;
		/** Cost of taking this edge. */
		public final double cost;

		public GraphTransition(double cost, int fromRow, State fromState, TransitionType type) {
			this.type = type;
			this.fromState = fromState;
			this.fromRow = fromRow;
			this.cost = cost;
		}
	}

	/**
	 * Squash a value with the function x / (x + 1).
	 * For non-negative input the result lies in [0,1]; 0 maps to 0.
	 *
	 * @param x the value to normalize
	 * @return {@code x / (x + 1.0)}
	 */
	private static double normalize(double x) {
		final double denominator = x + 1.0;
		return x / denominator;
	}

	/**
	 * Rewrite a Java regular expression into a form the automaton regex
	 * parser can handle.
	 * <p>
	 * Java regular expressions contain predefined character classes, inline
	 * flag expressions, anchors and reluctant quantifiers which the regex
	 * parser cannot handle. This method
	 * <ul>
	 * <li>expands the predefined classes (digit, whitespace and word classes
	 * and their negations) into explicit character classes,</li>
	 * <li>drops inline flag expressions (see {@link #removeFlagExpressions}),</li>
	 * <li>drops a leading {@code ^} and a trailing, unescaped {@code $},</li>
	 * <li>turns reluctant quantifiers into greedy ones (see
	 * {@link #removeReluctantOperators}).</li>
	 * </ul>
	 * NOTE(review): the expansion is purely textual, so a predefined class
	 * written inside a character class, or an escaped backslash followed by
	 * one of the class letters, is still rewritten.
	 *
	 * @param regex a Java regular expression; must not be null
	 * @return the rewritten regular expression
	 */
	public static String expandRegex(String regex) {
		// Literal replacement (String.replace) is used on purpose: with
		// replaceAll a backslash in the replacement escapes the next
		// character, which silently turned the whitespace class into the
		// letters 't', 'n', 'f' and 'r'.

		// .	Any character (may or may not match line terminators)
		// \d	A digit: [0-9]
		String newRegex = regex.replace("\\d", "[0-9]");

		// \D	A non-digit: [^0-9]
		newRegex = newRegex.replace("\\D", "[^0-9]");

		// \s	A whitespace character: [ \t\n\x0B\f\r]
		// The class contains the actual control characters because the
		// automaton parser treats a backslash as "next character is literal".
		newRegex = newRegex.replace("\\s", "[ \t\n\u000B\f\r]");

		// \S	A non-whitespace character: [^\s]
		newRegex = newRegex.replace("\\S", "[^ \t\n\u000B\f\r]");

		// \w	A word character: [a-zA-Z_0-9]
		newRegex = newRegex.replace("\\w", "[a-zA-Z_0-9]");

		// \W	A non-word character: [^\w]
		newRegex = newRegex.replace("\\W", "[^a-zA-Z_0-9]");

		// Flags are removed before the anchors, otherwise a pattern such as
		// "(?i)^abc" would keep its leading '^'.
		// TODO: Some of these should be handled, not just ignored!
		newRegex = removeFlagExpressions(newRegex);

		if (newRegex.startsWith("^")) {
			newRegex = newRegex.substring(1);
		}

		if (newRegex.endsWith("$")) {
			// Only strip the '$' when it is an anchor: an odd number of
			// backslashes right before it means it is an escaped literal.
			final int dollarIndex = newRegex.length() - 1;
			int precedingBackslashes = 0;
			while (precedingBackslashes < dollarIndex
					&& newRegex.charAt(dollarIndex - 1 - precedingBackslashes) == '\\') {
				precedingBackslashes++;
			}
			if (precedingBackslashes % 2 == 0) {
				newRegex = newRegex.substring(0, dollarIndex);
			}
		}

		newRegex = removeReluctantOperators(newRegex);

		return newRegex;
	}
	
	/**
	 * Remove inline flag expressions from a regex, since they are not
	 * understood by the automaton regex parser.
	 * <p>
	 * Handles single flags such as {@code (?i)}, combined flags such as
	 * {@code (?is)} and flag groups that switch flags off such as
	 * {@code (?-i)} or {@code (?i-s)}. The recognised flag letters are
	 * i (case insensitive), d (unix lines), m (multiline), s (dotall),
	 * u (unicode case), x (comments) and U (unicode character class).
	 * Scoped groups such as {@code (?i:abc)} and non-capturing groups
	 * {@code (?:abc)} are left untouched.
	 * <p>
	 * The flags are only dropped, their effect is not emulated.
	 *
	 * @param regex the regex to clean; must not be null
	 * @return the regex without inline flag expressions
	 */
	protected static String removeFlagExpressions(String regex) {
		// "(?" + optional flags-on + optional "-" flags-off + ")".
		// The lookahead requires at least one flag letter or '-' so that
		// a bare "(?)" is not touched.
		return regex.replaceAll("\\(\\?(?=[idmsuxU-])[idmsuxU]*(?:-[idmsuxU]+)?\\)", "");
	}
	
	/**
	 * Turn reluctant quantifiers into their greedy counterparts by dropping
	 * the trailing question mark: "+?" becomes "+", "*?" becomes "*" and
	 * "??" becomes "?". The three substitutions are applied in that order.
	 *
	 * @param regex the regex to rewrite; must not be null
	 * @return the regex with the reluctant markers removed
	 */
	protected static String removeReluctantOperators(String regex) {
		final String[] quantifiers = { "+", "*", "?" };
		String greedy = regex;
		for (String quantifier : quantifiers) {
			// literal replacement of e.g. "+?" by "+"
			greedy = greedy.replace(quantifier + "?", quantifier);
		}
		return greedy;
	}

	/**
	 * Ensure that each row has the full data structures containing the target state
	 * 
	 * @param transitions 
	 * @param state
	 * @param numRows
	 */
	private static void ensureState(
			Map>> transitions, State state,
			int numRows) {
		for (int row = 0; row <= numRows; row++) {
			if (!transitions.containsKey(row))
				transitions.put(row, new HashMap>());
			if (!transitions.get(row).containsKey(state))
				transitions.get(row).put(state, new HashSet());
		}
	}

	/**
	 * Build the automaton for the given regex and store it, together with a
	 * topological ordering of its states, in the two caches.
	 * <p>
	 * The automaton is built from the expanded regex (see
	 * {@link #expandRegex(String)}), but both cache entries are keyed by the
	 * original {@code regex} string.
	 *
	 * @param regex the regular expression to compile and cache
	 */
	private static void cacheRegex(String regex) {
		String r = expandRegex(regex);
		Automaton automaton = new RegExp(r, RegExp.NONE).toAutomaton();
		automaton.expandSingleton();

		// We convert this to a graph without self-loops in order to determine the topological order
		DirectedGraph<State, DefaultEdge> regexGraph = new DefaultDirectedGraph<State, DefaultEdge>(
				DefaultEdge.class);
		Set<State> visitedStates = new HashSet<State>();
		Queue<State> states = new LinkedList<State>();
		State initialState = automaton.getInitialState();
		states.add(initialState);

		// Breadth-first walk over the automaton starting at the initial state
		while (!states.isEmpty()) {
			State currentState = states.poll();
			if (visitedStates.contains(currentState))
				continue;
			if (!regexGraph.containsVertex(currentState))
				regexGraph.addVertex(currentState);
			for (Transition t : currentState.getTransitions()) {
				// Need to get rid of back edges, otherwise there is no topological order!
				if (!t.getDest().equals(currentState)) {
					regexGraph.addVertex(t.getDest());
					regexGraph.addEdge(currentState, t.getDest());
					states.add(t.getDest());
					// If the edge just added closes a cycle, take it out again
					CycleDetector<State, DefaultEdge> det = new CycleDetector<State, DefaultEdge>(
							regexGraph);
					if (det.detectCycles()) {
						regexGraph.removeEdge(currentState, t.getDest());
					}
				}
			}
			visitedStates.add(currentState);
		}

		TopologicalOrderIterator<State, DefaultEdge> iterator = new TopologicalOrderIterator<State, DefaultEdge>(
				regexGraph);
		List<State> topologicalOrder = new ArrayList<State>();
		while (iterator.hasNext()) {
			topologicalOrder.add(iterator.next());
		}

		regexStateCache.put(regex, topologicalOrder);
		regexAutomatonCache.put(regex, automaton);
	}

	/**
	 * Compute the distance between {@code arg} and {@code regex}.
	 * <p>
	 * All operations (insertion/deletion/replacement) cost 1, and there is
	 * no assumption on where and how the operations can be done (ie all
	 * sequences are valid).
	 * <p>
	 * For a regex that is not supported by the automaton library the result
	 * falls back to a plain match check: 0 on a match, 1 otherwise.
	 *
	 * @param arg the string to measure
	 * @param regex the regular expression to match against
	 * @return the distance between {@code arg} and {@code regex}
	 */
	public static int getStandardDistance(String arg, String regex) {
		if (isSupportedRegex(regex)) {
			final RegexGraph regexGraph = new RegexGraph(arg, regex);
			final CostMatrix costMatrix = new CostMatrix();
			return costMatrix.calculateStandardCost(regexGraph);
		}

		return getDefaultDistance(arg, regex);
	}
	
	/**
	 * Coarse fallback distance based on {@link Pattern} only.
	 *
	 * @param arg the string to test
	 * @param regex the regular expression the whole string must match
	 * @return 0 if {@code arg} fully matches {@code regex}, 1 otherwise
	 */
	private static int getDefaultDistance(String arg, String regex) {
		final boolean fullMatch = Pattern.compile(regex).matcher(arg).matches();
		return fullMatch ? 0 : 1;
	}
	
	/**
	 * Tell whether the regex can be handled by the regex automaton library.
	 * Currently the only construct treated as unsupported is the
	 * backslash-b sequence (word boundary).
	 *
	 * @param regex the regular expression to inspect
	 * @return {@code false} if the regex contains a backslash followed by 'b',
	 *         {@code true} otherwise
	 */
	private static boolean isSupportedRegex(String regex) {
		final boolean hasWordBoundary = regex.contains("\\b");
		return !hasWordBoundary;
	}

	/**
	 * Compute the distance between {@code arg} and {@code regex} in the
	 * variant meant for the StringAVM algorithm.
	 * <p>
	 * Insertion/deletion cost 1, whereas replacement is in [0,1] depending
	 * on the actual character values.
	 * <p>
	 * Note: the distance is tailored for the StringAVM algorithm,
	 * in which characters are only inserted/appended at the end.
	 *
	 * @param arg the string to measure
	 * @param regex the regular expression to match against
	 * @return the distance between {@code arg} and {@code regex}
	 */
	public static double getDistanceTailoredForStringAVM(String arg, String regex) {
		final RegexGraph regexGraph = new RegexGraph(arg, regex);
		final CostMatrix costMatrix = new CostMatrix();
		final double distance = costMatrix.calculateCostForStringAVM(regexGraph);
		return distance;
	}
	
	/**
	 * Look up the automaton representing the regex, creating and caching it
	 * the first time the regex is requested.
	 *
	 * @param regex the regular expression used as cache key
	 * @return the cached automaton for {@code regex}
	 */
	protected static Automaton getAndCacheAutomaton(String regex){
		final boolean firstTime = !regexAutomatonCache.containsKey(regex);
		if (firstTime) {
			// not built yet: create the automaton and store it in the cache
			cacheRegex(regex);
		}
		return regexAutomatonCache.get(regex);
	}

	
	/**
	 * A graph created based on an "arg" that is matched against a "regex".
	 * There is going to be arg.length+1 copies of the regex automaton. 
	 * Each copy represents a "row".
	 * Each automaton state, in topological order, represents a "column".
	 * The graph can be considered as a "rows"x"columns" matrix.
	 * 
	 * @author arcuri
	 *
	 */
	private static class RegexGraph {

		private  Map>> transitions;
		private  Map intToStateMap;
		private  Map stateToIntMap;
		
		/**
		 * Build the graph
		 * @param arg
		 * @param regex
		 */
		public RegexGraph(String arg, String regex){
			transitions = createGraph(arg,regex);
		}
		
		public int getNumberOfRows(){
			return transitions.keySet().size();
		}
		
		public int getNumberOfColumns(){
			return stateToIntMap.size();
		}
		
		/**
		 * Get all the incoming transitions to the node located at coordinate "row" and "column"
		 * @param row
		 * @param column
		 * @return
		 */
		public Set getIncomingTransitions(int row, int column){
			State state = intToStateMap.get(column);
			return transitions.get(row).get(state);
		}
		
		public int getColumn(State state){
			return stateToIntMap.get(state);
		}
		
		private  Map>> createGraph(String arg, String regex){

			/*
			 * Create a graph to calculate the distance. The algorithm is based on what discussed in:
			 * 
			 * Mohammad Alshraideh and Leonardo Bottaci
			 * Search-based software test data generation for string data using program-specific search operators
			 * http://neo.lcc.uma.es/mase/attachments/085_TestDataGenerationForStringData.pdf
			 * 
			 * and 
			 * 
			 * EUGENE W. MYERS and WEBB MILLER
			 * APPROXIMATE MATCHING OF REGULAR EXPRESSIONS
			 * http://www.cs.mun.ca/~harold/Courses/Old/Ling6800.W06/Diary/reg.aprox.pdf
			 */

			Automaton automaton = getAndCacheAutomaton(regex);
			final int NUM_CHARS = arg.length();


			List topologicalOrder = regexStateCache.get(regex);

			Map>> transitions = new HashMap>>();

			intToStateMap = new HashMap();
			stateToIntMap = new HashMap();
			int numState = 0;

			for (State currentState : topologicalOrder) {

				/*
				 * Init data structure to quickly map/access state/index
				 */
				stateToIntMap.put(currentState, numState);
				intToStateMap.put(numState, currentState);
				numState++;

				for (Transition t : currentState.getTransitions()) {

					State destination = t.getDest();
					ensureState(transitions, destination , NUM_CHARS);

					for (int row = 0; row <= NUM_CHARS; row++) {
						/*
						 *  add an insertion edge from currentState in row to target state in same row
						 */

						transitions.get(row).get(destination).add(new GraphTransition(1.0, row, currentState, GraphTransition.TransitionType.INSERTION));
					}

					for (int row = 0; row < NUM_CHARS; row++) {
						/*
						 *  Add a replacement edge from currentState in row to t.getDest in row+1
						 *  if charAt row+1 == the parameter of this transition, this is a zero-cost edge
						 */

						double cost = 0.0;

						if (arg.charAt(row) < t.getMin() || arg.charAt(row) > t.getMax()) {					
							int distMin = Math.abs(arg.charAt(row) - t.getMin());
							int distMax = Math.abs(arg.charAt(row) - t.getMax());
							cost = normalize(Math.min(distMin, distMax));
						}

						/*
						 * Important: even if the cost is 0 (eg match on the arg/regex in which we replace char X with X), we CANNOT
						 * use a PHANTOM transition. Even if we do not replace anything, we still need to consider it as a replacement 
						 * transition. Consider the case
						 * 
						 *  "ac".matches("abc")
						 *  
						 *  If we used a phantom transition to represent the alignment c/c, then it would be possible to insert 'b' in the 
						 *  middle of "abc". On the other hand, if we use a replacement c/c, then inserting 'b' would not be allowed, as an
						 *  insertion cannot be followed by a replacement.  
						 */

						transitions.get(row + 1).get(destination).add(new GraphTransition(cost, row, currentState, GraphTransition.TransitionType.REPLACEMENT));
					}
				}

				ensureState(transitions, currentState, NUM_CHARS);

				for (int row = 0; row < NUM_CHARS; row++) {

					/*
					 * add a deletion edge with cost 1 from currentState to currentState in next row
					 */

					transitions.get(row + 1).get(currentState).add(new GraphTransition(1.0, row, currentState,  GraphTransition.TransitionType.DELETION));
				}			
			}

			// Add zero-cost transitions from accepting states to final state
			State finalState = new State();
			ensureState(transitions, finalState, NUM_CHARS);
			for (State s : automaton.getStates()) {
				if (s.isAccept()) {
					transitions.get(NUM_CHARS).get(finalState).add(new GraphTransition(0, NUM_CHARS, s, GraphTransition.TransitionType.PHANTOM));
				}
			}
			intToStateMap.put(numState, finalState); 
			stateToIntMap.put(finalState, numState);	

			return transitions;
		}
	}

	/**
	 * Class used to calculate the cost, ie the actual distance, based on a RegexGraph.	
	 * 
	 * @author arcuri
	 */
	private static class CostMatrix{
		
		private final int DEL = 0;
		private final int REP = 1;
		private final int INS = 2;
		
		public CostMatrix() {
			super();			
		}

		public int calculateStandardCost(RegexGraph graph){
			final int ROWS = graph.getNumberOfRows();
			final int COLUMNS = graph.getNumberOfColumns();
			
			final double[][] matrix = new double[ROWS][COLUMNS]; 
			
			// First row is cost of matching empty sequence on regex
			final int FIRST_ROW = 0;
			
			/*
			 * init first starting state with 0 costs
			 */
			matrix[FIRST_ROW][0] = 0;
			
			//look at first row (which is special)
			for (int col = 1; col < graph.getNumberOfColumns(); col++) {

				double min = Double.MAX_VALUE;
				
				for (GraphTransition t :  graph.getIncomingTransitions(FIRST_ROW, col)) {

					int otherCol = graph.getColumn(t.fromState);

					//self transition
					if (col == otherCol){
						continue;
					}
					
					double otherCost = matrix[FIRST_ROW][otherCol];

					min = Math.min(min, getSubPathCost(otherCost, Math.ceil(t.cost)));
				}
				
				matrix[FIRST_ROW][col] = min;
			}
		
			//then look at the other rows
			for(int i=1; i