All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.evosuite.utils.RegexDistanceUtils Maven / Gradle / Ivy

/**
 * Copyright (C) 2010-2018 Gordon Fraser, Andrea Arcuri and EvoSuite
 * contributors
 *
 * This file is part of EvoSuite.
 *
 * EvoSuite is free software: you can redistribute it and/or modify it
 * under the terms of the GNU Lesser General Public License as published
 * by the Free Software Foundation, either version 3.0 of the License, or
 * (at your option) any later version.
 *
 * EvoSuite is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with EvoSuite. If not, see <http://www.gnu.org/licenses/>.
 */
package org.evosuite.utils;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Queue;
import java.util.Set;
import java.util.regex.Pattern;

import org.jgrapht.DirectedGraph;
import org.jgrapht.alg.CycleDetector;
import org.jgrapht.graph.DefaultDirectedGraph;
import org.jgrapht.graph.DefaultEdge;
import org.jgrapht.traverse.TopologicalOrderIterator;

import dk.brics.automaton.Automaton;
import dk.brics.automaton.RegExp;
import dk.brics.automaton.State;
import dk.brics.automaton.Transition;

/**
 *  Class used to define the distance between a string and a regex
 */
public class RegexDistanceUtils {

	/*
	 * Automatons for regex can be expensive to build. So we cache them,
	 * as we might need to access to them several times during the search
	 */
	private static Map> regexStateCache = new HashMap>();
	private static Map regexAutomatonCache = new HashMap();

	/**
	 * Return the automaton associated with the given regex, building and
	 * caching it first if the regex is not yet present in the automaton cache.
	 *
	 * @param regex the regular expression, used verbatim as the cache key
	 * @return the automaton stored in the cache for {@code regex}
	 */
	public static Automaton getRegexAutomaton(String regex) {
		final boolean alreadyCached = regexAutomatonCache.containsKey(regex);
		if (!alreadyCached) {
			cacheRegex(regex);
		}
		final Automaton cached = regexAutomatonCache.get(regex);
		return cached;
	}

	/**
	 * Produce an example string for the given regex by asking its (cached)
	 * automaton for the shortest example with the "accepted" flag set to
	 * {@code true}.
	 *
	 * @param regex the regular expression, used verbatim as the cache key
	 * @return the result of {@code getShortestExample(true)} on the automaton
	 */
	public static String getRegexInstance(String regex) {
		final boolean alreadyCached = regexAutomatonCache.containsKey(regex);
		if (!alreadyCached) {
			// first time we see this regex: build and store its automaton
			cacheRegex(regex);
		}
		return regexAutomatonCache.get(regex).getShortestExample(true);
	}

	/**
	 * Produce a counter-example string for the given regex by asking its
	 * (cached) automaton for the shortest example with the "accepted" flag set
	 * to {@code false}.
	 *
	 * @param regex the regular expression, used verbatim as the cache key
	 * @return the result of {@code getShortestExample(false)} on the automaton
	 */
	public static String getNonMatchingRegexInstance(String regex) {
		final boolean alreadyCached = regexAutomatonCache.containsKey(regex);
		if (!alreadyCached) {
			// first time we see this regex: build and store its automaton
			cacheRegex(regex);
		}
		return regexAutomatonCache.get(regex).getShortestExample(false);
	}

	/**
	 * Immutable record of one incoming edge of the distance graph: where the
	 * edge starts (row and automaton state), what it costs, and which kind of
	 * edit operation it stands for.
	 */
	private static class GraphTransition {

		/** The kind of edit operation an edge represents. */
		public enum TransitionType {
			INSERTION,
			DELETION,
			REPLACEMENT,
			/**
			 * A phantom transition is an artificial transition from the sink/final states to a single artificial sink/state.
			 * This is used to simplify the recursion calculation of the subpath costs.
			 */
			PHANTOM
		}

		/** Cost of following this edge. */
		public final double cost;
		/** Row of the node this edge starts from. */
		public final int fromRow;
		/** Automaton state of the node this edge starts from. */
		public final State fromState;
		/** Edit operation represented by this edge. */
		public final TransitionType type;

		public GraphTransition(double cost, int fromRow, State fromState, TransitionType type) {
			this.type = type;
			this.fromState = fromState;
			this.fromRow = fromRow;
			this.cost = cost;
		}
	}

	/**
	 * Squash a value with the formula {@code x / (x + 1)}. For any
	 * non-negative {@code x} the result lies in [0,1): 0 maps to 0 and larger
	 * inputs get closer to 1.
	 *
	 * @param x the value to normalize
	 * @return {@code x / (x + 1.0)}
	 */
	private static double normalize(double x) {
		final double denominator = x + 1.0;
		return x / denominator;
	}

	/**
	 * Rewrite a Java regular expression into a form the automaton regex parser
	 * can handle.
	 *
	 * <p>
	 * The rewrite is purely textual and performs, in this order:
	 * <ol>
	 * <li>expansion of the predefined character classes {@code \d \D \s \S \w \W}
	 * into explicit bracket expressions;</li>
	 * <li>removal of a leading {@code ^} and of a trailing, unescaped {@code $};</li>
	 * <li>removal of single-flag expressions such as {@code (?i)};</li>
	 * <li>replacement of the reluctant quantifiers {@code +? *? ??} by their
	 * greedy counterparts.</li>
	 * </ol>
	 *
	 * <p>
	 * The whitespace classes are emitted with the actual whitespace characters
	 * (not with backslash escapes such as {@code \t}): a backslash in a
	 * {@code replaceAll} replacement string merely escapes the next character,
	 * so {@code "\\t"} would have inserted the letter {@code t}.
	 *
	 * @param regex the Java regular expression
	 * @return the rewritten regular expression
	 */
	public static String expandRegex(String regex) {
		// Same set as java.util.regex's \s: [ \t\n\x0B\f\r], as real characters
		final String whitespace = " \t\n\u000B\f\r";

		// .	Any character (may or may not match line terminators)
		// \d	A digit: [0-9]
		String newRegex = regex.replaceAll("\\\\d", "[0-9]");

		// \D	A non-digit: [^0-9]
		newRegex = newRegex.replaceAll("\\\\D", "[^0-9]");

		// \s	A whitespace character: [ \t\n\x0B\f\r]
		newRegex = newRegex.replaceAll("\\\\s", "[" + whitespace + "]");

		// \S	A non-whitespace character: [^\s]
		newRegex = newRegex.replaceAll("\\\\S", "[^" + whitespace + "]");

		// \w	A word character: [a-zA-Z_0-9]
		newRegex = newRegex.replaceAll("\\\\w", "[a-zA-Z_0-9]");

		// \W	A non-word character: [^\w]
		newRegex = newRegex.replaceAll("\\\\W", "[^a-zA-Z_0-9]");

		if (newRegex.startsWith("^")) {
			newRegex = newRegex.substring(1);
		}

		// Only drop '$' when it is the end anchor; "\$" is a literal dollar and
		// stripping it would leave a dangling backslash behind.
		if (endsWithUnescapedDollar(newRegex)) {
			newRegex = newRegex.substring(0, newRegex.length() - 1);
		}

		// TODO: Some of these should be handled, not just ignored!
		newRegex = removeFlagExpressions(newRegex);

		newRegex = removeReluctantOperators(newRegex);

		return newRegex;
	}

	/**
	 * Tell whether the regex ends with a {@code $} that is not escaped, ie that
	 * is preceded by an even number (possibly zero) of backslashes.
	 */
	private static boolean endsWithUnescapedDollar(String regex) {
		if (!regex.endsWith("$")) {
			return false;
		}
		int backslashes = 0;
		for (int i = regex.length() - 2; i >= 0 && regex.charAt(i) == '\\'; i--) {
			backslashes++;
		}
		return backslashes % 2 == 0;
	}

	/**
	 * Delete the single-flag expressions {@code (?i)}, {@code (?d)},
	 * {@code (?x)}, {@code (?m)}, {@code (?s)} and {@code (?u)} from the regex.
	 * The flags are removed one after the other, in that order; combined forms
	 * such as {@code (?im)} are left untouched.
	 *
	 * @param regex the regular expression
	 * @return the regular expression without those flag expressions
	 */
	protected static String removeFlagExpressions(String regex) {
		final String[] flags = {
				"i", // Case insensitive
				"d", // Unix lines mode
				"x", // Permit comments and whitespace in pattern
				"m", // Multiline mode
				"s", // Dotall
				"u"  // Unicode case
		};
		for (String flag : flags) {
			regex = regex.replace("(?" + flag + ")", "");
		}
		return regex;
	}

	/**
	 * Turn the reluctant quantifiers {@code +?}, {@code *?} and {@code ??}
	 * into the plain quantifiers {@code +}, {@code *} and {@code ?}.
	 *
	 * @param regex the regular expression
	 * @return the regular expression with the reluctant markers dropped
	 */
	protected static String removeReluctantOperators(String regex) {
		regex = regex.replace("+?", "+");
		regex = regex.replace("*?", "*");
		regex = regex.replace("??", "?");

		return regex;
	}

	/**
	 * Ensure that each row has the full data structures containing the target state
	 * 
	 * @param transitions 
	 * @param state
	 * @param numRows
	 */
	private static void ensureState(
			Map>> transitions, State state,
			int numRows) {
		for (int row = 0; row <= numRows; row++) {
			if (!transitions.containsKey(row))
				transitions.put(row, new HashMap>());
			if (!transitions.get(row).containsKey(state))
				transitions.get(row).put(state, new HashSet());
		}
	}

	/**
	 * Build the automaton for the given regex and store, under the original
	 * regex string, both the automaton and its states in topological order.
	 *
	 * <p>
	 * The regex is first rewritten with {@link #expandRegex(String)} and then
	 * parsed with {@code RegExp.NONE}. To obtain a topological order, the
	 * states reachable from the initial state are copied into a directed
	 * graph from which self-loops and every edge that would close a cycle are
	 * left out.
	 *
	 * @param regex the regular expression; also used as the cache key
	 */
	private static void cacheRegex(String regex) {
		String r = expandRegex(regex);
		Automaton automaton = new RegExp(r, RegExp.NONE).toAutomaton();
		automaton.expandSingleton();

		// We convert this to a graph without self-loops in order to determine the topological order
		DirectedGraph<State, DefaultEdge> regexGraph = new DefaultDirectedGraph<State, DefaultEdge>(
				DefaultEdge.class);
		Set<State> visitedStates = new HashSet<State>();
		Queue<State> states = new LinkedList<State>();
		State initialState = automaton.getInitialState();
		states.add(initialState);

		while (!states.isEmpty()) {
			State currentState = states.poll();
			if (visitedStates.contains(currentState))
				continue;
			if (!regexGraph.containsVertex(currentState))
				regexGraph.addVertex(currentState);
			for (Transition t : currentState.getTransitions()) {
				// Need to get rid of back edges, otherwise there is no topological order!
				if (!t.getDest().equals(currentState)) {
					regexGraph.addVertex(t.getDest());
					regexGraph.addEdge(currentState, t.getDest());
					states.add(t.getDest());
					// Tentatively keep the edge; undo it if it closed a cycle
					CycleDetector<State, DefaultEdge> det = new CycleDetector<State, DefaultEdge>(
							regexGraph);
					if (det.detectCycles()) {
						regexGraph.removeEdge(currentState, t.getDest());
					}
				}
			}
			visitedStates.add(currentState);
		}

		TopologicalOrderIterator<State, DefaultEdge> iterator = new TopologicalOrderIterator<State, DefaultEdge>(
				regexGraph);
		List<State> topologicalOrder = new ArrayList<State>();
		while (iterator.hasNext()) {
			topologicalOrder.add(iterator.next());
		}

		regexStateCache.put(regex, topologicalOrder);
		regexAutomatonCache.put(regex, automaton);
	}

	/**
	 * 

* Get the distance between the arg and the given regex. * All operations (insertion/deletion/replacement) cost 1. * There is no assumption on where and how the operations * can be done (ie all sequences are valid). *

*/ public static int getStandardDistance(String arg, String regex) { if(!isSupportedRegex(regex)) { return getDefaultDistance(arg, regex); } RegexGraph graph = new RegexGraph(arg, regex); CostMatrix matrix = new CostMatrix(); return matrix.calculateStandardCost(graph); } private static int getDefaultDistance(String arg, String regex) { Pattern p = Pattern.compile(regex); if (p.matcher(arg).matches()) return 0; else return 1; } /** * Determine whether the regex requires features that are * not supported by the regex automaton library * * @param regex * @return */ private static boolean isSupportedRegex(String regex) { if(regex.contains("\\b")) return false; return true; } /** *

Get the distance between the arg and the given regex. * Insertion/deletion cost 1, whereas replacement is in [0,1] depending * on the actual character values.

* *

Note: the distance is tailored for the StringAVM algorithm, * in which characters are only inserted/appended at the end.

* * @param arg * @param regex * @return */ public static double getDistanceTailoredForStringAVM(String arg, String regex) { RegexGraph graph = new RegexGraph(arg, regex); CostMatrix matrix = new CostMatrix(); return matrix.calculateCostForStringAVM(graph); } protected static Automaton getAndCacheAutomaton(String regex){ /* * Cache it if first time we build it */ if (!regexAutomatonCache.containsKey(regex)) { /* * Create an automaton representing the regex */ cacheRegex(regex); } Automaton automaton = regexAutomatonCache.get(regex); return automaton; } /** * A graph created based on an "arg" that is matched against a "regex". * There is going to be arg.length+1 copies of the regex automaton. * Each copy represents a "row". * Each automaton state, in topological order, represents a "column". * The graph can be considered as a "rows"x"columns" matrix. * * @author arcuri * */ private static class RegexGraph { private Map>> transitions; private Map intToStateMap; private Map stateToIntMap; /** * Build the graph * @param arg * @param regex */ public RegexGraph(String arg, String regex){ transitions = createGraph(arg,regex); } public int getNumberOfRows(){ return transitions.keySet().size(); } public int getNumberOfColumns(){ return stateToIntMap.size(); } /** * Get all the incoming transitions to the node located at coordinate "row" and "column" * @param row * @param column * @return */ public Set getIncomingTransitions(int row, int column){ State state = intToStateMap.get(column); return transitions.get(row).get(state); } public int getColumn(State state){ return stateToIntMap.get(state); } private Map>> createGraph(String arg, String regex){ /* * Create a graph to calculate the distance. 
The algorithm is based on what discussed in: * * Mohammad Alshraideh and Leonardo Bottaci * Search-based software test data generation for string data using program-specific search operators * http://neo.lcc.uma.es/mase/attachments/085_TestDataGenerationForStringData.pdf * * and * * EUGENE W. MYERS and WEBB MILLER * APPROXIMATE MATCHING OF REGULAR EXPRESSIONS * http://www.cs.mun.ca/~harold/Courses/Old/Ling6800.W06/Diary/reg.aprox.pdf */ Automaton automaton = getAndCacheAutomaton(regex); final int NUM_CHARS = arg.length(); List topologicalOrder = regexStateCache.get(regex); Map>> transitions = new HashMap>>(); intToStateMap = new HashMap(); stateToIntMap = new HashMap(); int numState = 0; for (State currentState : topologicalOrder) { /* * Init data structure to quickly map/access state/index */ stateToIntMap.put(currentState, numState); intToStateMap.put(numState, currentState); numState++; for (Transition t : currentState.getTransitions()) { State destination = t.getDest(); ensureState(transitions, destination , NUM_CHARS); for (int row = 0; row <= NUM_CHARS; row++) { /* * add an insertion edge from currentState in row to target state in same row */ transitions.get(row).get(destination).add(new GraphTransition(1.0, row, currentState, GraphTransition.TransitionType.INSERTION)); } for (int row = 0; row < NUM_CHARS; row++) { /* * Add a replacement edge from currentState in row to t.getDest in row+1 * if charAt row+1 == the parameter of this transition, this is a zero-cost edge */ double cost = 0.0; if (arg.charAt(row) < t.getMin() || arg.charAt(row) > t.getMax()) { int distMin = Math.abs(arg.charAt(row) - t.getMin()); int distMax = Math.abs(arg.charAt(row) - t.getMax()); cost = normalize(Math.min(distMin, distMax)); } /* * Important: even if the cost is 0 (eg match on the arg/regex in which we replace char X with X), we CANNOT * use a PHANTOM transition. Even if we do not replace anything, we still need to consider it as a replacement * transition. 
Consider the case * * "ac".matches("abc") * * If we used a phantom transition to represent the alignment c/c, then it would be possible to insert 'b' in the * middle of "abc". On the other hand, if we use a replacement c/c, then inserting 'b' would not be allowed, as an * insertion cannot be followed by a replacement. */ transitions.get(row + 1).get(destination).add(new GraphTransition(cost, row, currentState, GraphTransition.TransitionType.REPLACEMENT)); } } ensureState(transitions, currentState, NUM_CHARS); for (int row = 0; row < NUM_CHARS; row++) { /* * add a deletion edge with cost 1 from currentState to currentState in next row */ transitions.get(row + 1).get(currentState).add(new GraphTransition(1.0, row, currentState, GraphTransition.TransitionType.DELETION)); } } // Add zero-cost transitions from accepting states to final state State finalState = new State(); ensureState(transitions, finalState, NUM_CHARS); for (State s : automaton.getStates()) { if (s.isAccept()) { transitions.get(NUM_CHARS).get(finalState).add(new GraphTransition(0, NUM_CHARS, s, GraphTransition.TransitionType.PHANTOM)); } } intToStateMap.put(numState, finalState); stateToIntMap.put(finalState, numState); return transitions; } } /** * Class used to calculate the cost, ie the actual distance, based on a RegexGraph. 
* * @author arcuri */ private static class CostMatrix{ private final int DEL = 0; private final int REP = 1; private final int INS = 2; public CostMatrix() { super(); } public int calculateStandardCost(RegexGraph graph){ final int ROWS = graph.getNumberOfRows(); final int COLUMNS = graph.getNumberOfColumns(); final double[][] matrix = new double[ROWS][COLUMNS]; // First row is cost of matching empty sequence on regex final int FIRST_ROW = 0; /* * init first starting state with 0 costs */ matrix[FIRST_ROW][0] = 0; //look at first row (which is special) for (int col = 1; col < graph.getNumberOfColumns(); col++) { double min = Double.MAX_VALUE; for (GraphTransition t : graph.getIncomingTransitions(FIRST_ROW, col)) { int otherCol = graph.getColumn(t.fromState); //self transition if (col == otherCol){ continue; } double otherCost = matrix[FIRST_ROW][otherCol]; min = Math.min(min, getSubPathCost(otherCost, Math.ceil(t.cost))); } matrix[FIRST_ROW][col] = min; } //then look at the other rows for(int i=1; i