// com.mifmif.common.regex.Generex Maven / Gradle / Ivy
// Show all versions of generex Show documentation
/*
* Copyright 2014 y.mifrah
*
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.mifmif.common.regex;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Random;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import com.mifmif.common.regex.util.Iterable;
import com.mifmif.common.regex.util.Iterator;
import dk.brics.automaton.Automaton;
import dk.brics.automaton.RegExp;
import dk.brics.automaton.State;
import dk.brics.automaton.Transition;
/**
* A Java utility class that helps generate string values that match a given regular expression. It can generate all values that are matched by the regex, a random
* value, or only a specific string based on its lexicographical order.
*
* @author y.mifrah
*
*/
public class Generex implements Iterable {
/**
 * The predefined character classes supported by {@code Generex}.
 *
 * An immutable map containing as keys the character classes (written as regular expressions that match the
 * literal escape, e.g. the two characters backslash + d) and as values the equivalent regular expression syntax.
 *
 * @see #createRegExp(String)
 */
private static final Map<String, String> PREDEFINED_CHARACTER_CLASSES;
/** Parsed form of the pattern; only assigned by the {@code String} constructor. */
private RegExp regExp;
/** Automaton that every generation method walks. */
private Automaton automaton;
/** Accumulator filled by {@code generate}; replaced with a fresh list by the public listing methods. */
private List<String> matchedStrings = new ArrayList<String>();
/** Root of the node tree used for index-based lookups; created lazily by {@code buildRootNode()}. */
private Node rootNode;
/** Guard flag checked by {@code buildRootNode()} so the node tree is only built once. */
private boolean isTransactionNodeBuilt;
static {
    Map<String, String> characterClasses = new HashMap<String, String>();
    // Keys are used as the regex argument of String.replaceAll, hence the doubled escaping.
    characterClasses.put("\\\\d", "[0-9]");
    characterClasses.put("\\\\D", "[^0-9]");
    characterClasses.put("\\\\s", "[ \t\n\f\r]");
    characterClasses.put("\\\\S", "[^ \t\n\f\r]");
    characterClasses.put("\\\\w", "[a-zA-Z_0-9]");
    characterClasses.put("\\\\W", "[^a-zA-Z_0-9]");
    PREDEFINED_CHARACTER_CLASSES = Collections.unmodifiableMap(characterClasses);
}
public Generex(String regex) {
regex=requote(regex);
regExp = createRegExp(regex);
automaton = regExp.toAutomaton();
random = new Random();
}
/**
 * Creates a generator that works directly on the given automaton; no regular expression is parsed.
 * A fresh, unseeded {@link Random} is installed.
 *
 * @param automaton
 *            the automaton whose accepted strings will be generated
 */
public Generex(Automaton automaton) {
    this.random = new Random();
    this.automaton = automaton;
}
/**
 * Creates a {@code RegExp} instance from the given regular expression.
 *
 * Predefined character classes are replaced with equivalent regular expression syntax prior to creating the instance.
 *
 * @param regex
 *            the regular expression used to build the {@code RegExp} instance
 * @return a {@code RegExp} instance for the given regular expression
 * @throws NullPointerException
 *             if the given regular expression is {@code null}
 * @throws IllegalArgumentException
 *             if an error occurred while parsing the given regular expression
 * @throws StackOverflowError
 *             if the regular expression has too many transitions
 * @see #PREDEFINED_CHARACTER_CLASSES
 * @see #isValidPattern(String)
 */
private static RegExp createRegExp(String regex) {
    // A typed view of the constant so that keys and values are Strings for replaceAll.
    @SuppressWarnings("unchecked")
    Map<String, String> characterClasses = PREDEFINED_CHARACTER_CLASSES;
    String finalRegex = regex;
    for (Entry<String, String> charClass : characterClasses.entrySet()) {
        // The key is a regex matching the literal escape (e.g. "\d"); the value is its expanded form.
        finalRegex = finalRegex.replaceAll(charClass.getKey(), charClass.getValue());
    }
    return new RegExp(finalRegex);
}
/**
 * Replaces the random source of this instance with one initialised from the given seed, so that the
 * sequence of strings produced by the {@code random(...)} methods is pseudo random and driven by that seed.
 *
 * @param seed
 *            the seed handed to the new {@link Random}
 */
public void setSeed(long seed) {
    this.random = new Random(seed);
}
/**
 * Returns the matched string found at the given position in the sorted list of matched strings.
 *
 * @param indexOrder
 *            1-based position ( 1 &lt;= indexOrder &lt;= n ), where n is the number of matched strings;
 *            0 is treated as 1
 * @return the string at position {@code indexOrder}, or an empty string if {@code indexOrder} is greater than n
 * @throws StackOverflowError
 *             if there is an infinite number of strings that match the pattern
 */
public String getMatchedString(int indexOrder) {
    buildRootNode();
    if (indexOrder == 0)
        indexOrder = 1;
    // Past the last string there is nothing to return; without this guard the lookup below yields a
    // wrong string or fails in substring.
    if (indexOrder > rootNode.getNbrMatchedString()) {
        return "";
    }
    String result = buildStringFromNode(rootNode, indexOrder);
    if (result.length() < 2) {
        return "";
    }
    // Drop the first and last characters: presumably the placeholders contributed by the synthetic
    // root node and by the terminal (accept) node rather than by the pattern itself.
    return result.substring(1, result.length() - 1);
}
/**
 * Builds the part of the {@code indexOrder}-th string that is contributed by {@code node} and its descendants.
 *
 * First a character of the node's range is chosen: every character accounts for {@code step} strings, and the
 * first character whose cumulative count reaches {@code indexOrder} is taken. Then the child node whose
 * cumulative count reaches the remaining index is chosen and processed recursively.
 *
 * @param node
 *            the node to start from
 * @param indexOrder
 *            1-based position of the wanted string among the strings below {@code node}
 * @return the selected character followed by the string built from the selected child; may be empty if no
 *         character or child qualifies
 */
private String buildStringFromNode(Node node, int indexOrder) {
    String result = "";
    long passedStringNbr = 0;
    long step = node.getNbrMatchedString() / node.getNbrChar();
    // An int counter is used on purpose: a char counter wraps around after '\uffff', so the condition
    // "<= maxChar" could never become false for a range ending at the last character.
    final int maxChar = node.getMaxChar();
    for (int usedChar = node.getMinChar(); usedChar <= maxChar; ++usedChar) {
        passedStringNbr += step;
        if (passedStringNbr >= indexOrder) {
            // Step back to the number of strings skipped before this character.
            passedStringNbr -= step;
            indexOrder -= passedStringNbr;
            result = result.concat(String.valueOf((char) usedChar));
            break;
        }
    }
    long passedStringNbrInChildNode = 0;
    // No character selected: keep counting from what was already skipped at this level.
    if (result.length() == 0)
        passedStringNbrInChildNode = passedStringNbr;
    for (Node childN : node.getNextNodes()) {
        passedStringNbrInChildNode += childN.getNbrMatchedString();
        if (passedStringNbrInChildNode >= indexOrder) {
            passedStringNbrInChildNode -= childN.getNbrMatchedString();
            indexOrder -= passedStringNbrInChildNode;
            result = result.concat(buildStringFromNode(childN, indexOrder));
            break;
        }
    }
    return result;
}
/**
 * Tells whether or not the given pattern (or {@code Automaton}) is infinite, that is, generates an infinite number of strings.
 *
 * For example, the pattern "a+" generates an infinite number of strings whereas "a{5}" does not.
 *
 * @return {@code true} if the pattern (or {@code Automaton}) generates an infinite number of strings, {@code false} otherwise
 */
public boolean isInfinite() {
    final boolean finite = automaton.isFinite();
    return !finite;
}
/**
 * Returns the first string, in lexicographical order, that is matched by the given pattern.
 *
 * The node tree is walked from the root, always following the first child and collecting the smallest
 * character of each visited node that still has children.
 *
 * @return the first matched string, or an empty string if the root node has no child nodes
 */
public String getFirstMatch() {
    buildRootNode();
    Node node = rootNode;
    StringBuilder result = new StringBuilder();
    while (!node.getNextNodes().isEmpty()) {
        result.append(node.getMinChar());
        node = node.getNextNodes().get(0);
    }
    // Nothing collected means there is no leading root character to strip; substring(1) would throw.
    if (result.length() == 0) {
        return "";
    }
    // The first collected character comes from the root node, not from the pattern.
    return result.substring(1);
}
/**
 * Counts the strings matched by the given pattern. The node tree is built on first use and the count
 * is read from its root.
 *
 * @return the number of strings that are matched by the given pattern.
 * @throws StackOverflowError
 *             if the given pattern generates a large, possibly infinite, number of strings.
 */
public long matchedStringsSize() {
    buildRootNode();
    final long total = rootNode.getNbrMatchedString();
    return total;
}
/**
 * Prepares the rootNode and its child nodes so that a matched string can be looked up by index.
 *
 * The tree is built at most once. It is assembled in a local variable and only published (together with
 * the "built" flag) after the build completed, so a build that fails part-way (for example with a
 * {@code StackOverflowError}) does not leave a half-initialised root behind for later calls.
 */
private void buildRootNode() {
    if (isTransactionNodeBuilt) {
        return;
    }
    Node root = new Node();
    root.setNbrChar(1);
    @SuppressWarnings("unchecked")
    List<Node> nextNodes = prepareTransactionNodes(automaton.getInitialState());
    root.setNextNodes(nextNodes);
    root.updateNbrMatchedString();
    rootNode = root;
    isTransactionNodeBuilt = true;
}
/** Number of {@code generate} invocations that did work so far; compared against the limit. */
private int matchedStringCounter = 0;

/**
 * Depth-first walk of the automaton that appends every accepted string to {@code matchedStrings}.
 *
 * The walk stops doing work once {@code matchedStringCounter} equals {@code limit}. Note that the counter is
 * incremented once per visited state (per call), not once per collected string.
 *
 * @param strMatch
 *            the characters collected on the path from the initial state to {@code state}
 * @param state
 *            the state to continue from
 * @param limit
 *            the counter value at which the walk stops
 */
private void generate(String strMatch, State state, int limit) {
    if (matchedStringCounter == limit) {
        return;
    }
    ++matchedStringCounter;
    List<Transition> transitions = state.getSortedTransitions(true);
    if (transitions.isEmpty()) {
        matchedStrings.add(strMatch);
        return;
    }
    if (state.isAccept()) {
        matchedStrings.add(strMatch);
    }
    for (Transition transition : transitions) {
        // An int counter is required here: with a char counter, "c <= max" is always true when max is
        // '\uffff' (e.g. for negated classes), because ++c wraps around to 0 and the loop never ends.
        final int max = transition.getMax();
        for (int c = transition.getMin(); c <= max; ++c) {
            if (matchedStringCounter == limit) {
                // Every further call would return immediately; stop scanning the range.
                return;
            }
            generate(strMatch + (char) c, transition.getDest(), limit);
        }
    }
}
/**
 * Builds the list of nodes that represent the possible transitions from the given state.
 *
 * An accepting state contributes an extra node with a single character and no children. Every outgoing
 * transition contributes one node that covers the transition's character range and whose children are built
 * recursively from the destination state. Building stops (an empty list is returned) once the number of
 * invocations reaches {@code Integer.MAX_VALUE / 2}.
 *
 * @param state
 *            the state whose outgoing transitions are converted
 * @return the nodes for {@code state}; never {@code null}
 */
private List<Node> prepareTransactionNodes(State state) {
    List<Node> transactionNodes = new ArrayList<Node>();
    if (preparedTransactionNode == Integer.MAX_VALUE / 2) {
        return transactionNodes;
    }
    ++preparedTransactionNode;
    if (state.isAccept()) {
        Node acceptedNode = new Node();
        acceptedNode.setNbrChar(1);
        transactionNodes.add(acceptedNode);
    }
    List<Transition> transitions = state.getSortedTransitions(true);
    for (Transition transition : transitions) {
        Node trsNode = new Node();
        // Number of characters in the inclusive range [min, max].
        int nbrChar = transition.getMax() - transition.getMin() + 1;
        trsNode.setNbrChar(nbrChar);
        trsNode.setMaxChar(transition.getMax());
        trsNode.setMinChar(transition.getMin());
        List<Node> nextNodes = prepareTransactionNodes(transition.getDest());
        trsNode.setNextNodes(nextNodes);
        transactionNodes.add(trsNode);
    }
    return transactionNodes;
}
// Counts invocations of prepareTransactionNodes; node building stops once it reaches Integer.MAX_VALUE / 2.
private int preparedTransactionNode;
// Random source used by the random(...) methods; created by the constructors and replaced by setSeed(long).
private Random random;
/**
 * Generates all strings that match the given regex.
 *
 * Every call starts a fresh walk: both the result list and the visit counter used by {@code generate} are
 * reset, so the result does not depend on earlier calls.
 *
 * @return the list of matched strings collected by the walk
 */
public List<String> getAllMatchedStrings() {
    List<String> collected = new ArrayList<String>();
    matchedStrings = collected;
    // Without this reset the counter keeps the value left behind by a previous call.
    matchedStringCounter = 0;
    generate("", automaton.getInitialState(), Integer.MAX_VALUE);
    return collected;
}
/**
 * Generates a list of at most {@code limit} strings that match the given regex, in the order in which the
 * automaton walk encounters them (transitions are visited in sorted order).
 *
 * Every call starts a fresh walk: both the result list and the visit counter used by {@code generate} are
 * reset. Previously a second call with the same limit returned an empty list because the counter was
 * still at the limit.
 *
 * @param limit
 *            the bound handed to {@code generate}; it caps the number of visited states and therefore the
 *            number of returned strings
 * @return the matched strings collected before the bound was reached
 */
public List<String> getMatchedStrings(int limit) {
    List<String> collected = new ArrayList<String>();
    matchedStrings = collected;
    matchedStringCounter = 0;
    generate("", automaton.getInitialState(), limit);
    return collected;
}
/**
 * Generates a random string that matches the pattern used in this Generex. The walk starts at the
 * automaton's initial state with a minimum length of 1 and a maximum length of {@code Integer.MAX_VALUE}.
 *
 * @return a randomly generated string
 */
public String random() {
    final State start = automaton.getInitialState();
    return prepareRandom("", start, 1, Integer.MAX_VALUE);
}
/**
 * Generates a random string that matches the pattern used in this Generex and whose length is requested to
 * be at least {@code minLength}. The maximum length passed on is {@code Integer.MAX_VALUE}.
 *
 * @param minLength
 *            the requested minimum length
 * @return a randomly generated string
 */
public String random(int minLength) {
    final State start = automaton.getInitialState();
    return prepareRandom("", start, minLength, Integer.MAX_VALUE);
}
/**
 * Generates a random string that matches the pattern used in this Generex and whose length is requested to
 * be &gt;= {@code minLength} and &lt;= {@code maxLength}.
 *
 * @param minLength
 *            the requested minimum length
 * @param maxLength
 *            the requested maximum length
 * @return a randomly generated string
 */
public String random(int minLength, int maxLength) {
    final State start = automaton.getInitialState();
    return prepareRandom("", start, minLength, maxLength);
}
/**
 * Performs a random walk through the automaton, starting at {@code state}, and returns the characters
 * collected on the way appended to {@code strMatch}.
 *
 * At an accepting state the walk ends when the string has reached {@code maxLength}, or - with the
 * probability given by the random check below - when it is at least {@code minLength} long. It also ends at
 * any state without outgoing transitions. The walk is iterative, so long strings neither exhaust the stack
 * nor pay for repeated string concatenation.
 *
 * @param strMatch
 *            the prefix generated so far
 * @param state
 *            the state to start from
 * @param minLength
 *            minimum length before the walk may stop voluntarily at an accepting state
 * @param maxLength
 *            length at (or beyond) which the walk stops at the next accepting state
 * @return the generated string
 */
private String prepareRandom(String strMatch, State state, int minLength, int maxLength) {
    StringBuilder built = new StringBuilder(strMatch);
    State current = state;
    while (true) {
        List<Transition> transitions = current.getSortedTransitions(false);
        if (current.isAccept()) {
            // ">=" rather than "==": a string that overshot maxLength while passing through non-accepting
            // states must stop at the first accepting state instead of growing further.
            if (built.length() >= maxLength) {
                return built.toString();
            }
            // The random draw is made first, exactly once per accepting state, as before.
            if (random.nextInt() > 0.3 * Integer.MAX_VALUE && built.length() >= minLength) {
                return built.toString();
            }
        }
        if (transitions.isEmpty()) {
            return built.toString();
        }
        Transition randomTransition = transitions.get(random.nextInt(transitions.size()));
        // Size of the inclusive character range covered by the chosen transition.
        int diff = randomTransition.getMax() - randomTransition.getMin() + 1;
        int randomOffset = diff > 0 ? random.nextInt(diff) : diff;
        char randomChar = (char) (randomOffset + randomTransition.getMin());
        built.append(randomChar);
        current = randomTransition.getDest();
    }
}
/**
 * Creates an iterator that starts at the initial state of the automaton.
 *
 * @return a new {@code GenerexIterator}
 */
public Iterator iterator() {
    final State initialState = automaton.getInitialState();
    return new GenerexIterator(initialState);
}
/**
 * Tells whether or not the given regular expression is a valid pattern (for {@code Generex}).
 *
 * The expression goes through the same preparation as in the {@code String} constructor (requoting of
 * {@code \Q...\E} sections, then parsing), so a pattern accepted by the constructor's parser is reported as
 * valid here as well.
 *
 * @param regex
 *            the regular expression that will be validated
 * @return {@code true} if the regular expression is valid, {@code false} otherwise
 * @throws NullPointerException
 *             if the given regular expression is {@code null}
 */
public static boolean isValidPattern(String regex) {
    try {
        createRegExp(requote(regex));
        return true;
    } catch (IllegalArgumentException ignore) { // NOPMD - Not valid.
    } catch (StackOverflowError ignore) { // NOPMD - Possibly valid but stack not big enough to handle it.
    }
    return false;
}
/** Matches one quoted section; group 1 is the text between the markers (reluctant, so sections stay separate). */
private static final Pattern REQUOTED_SECTION = Pattern.compile("\\\\Q(.*?)\\\\E");
// http://stackoverflow.com/questions/399078/what-special-characters-must-be-escaped-in-regular-expressions
// adding "@" prevents StackOverflowError inside generex: https://github.com/mifmif/Generex/issues/21
/** Characters that get a backslash prefix inside a quoted section. */
private static final Pattern SPECIAL_CHARACTER = Pattern.compile("[.^$*+?(){|\\[\\\\@]");

/**
 * Requotes a regular expression: parts placed between \Q and \E are treated as normal characters, so their
 * meta-characters are escaped here instead of having to be escaped one by one by the caller, ex :
 *
 * minion_\d{3}\Q@gru.evil\E
 *
 * will be transformed to :
 *
 * minion_\d{3}\@gru\.evil
 *
 * The result is assembled in a separate buffer while the matcher scans the unmodified input. Editing the
 * scanned text in place shifts the positions the matcher relies on, which makes it miss later quoted
 * sections or read past the end of the text.
 *
 * @param regex
 *            the regular expression that may contain \Q...\E sections
 * @return the expression with every quoted section replaced by its escaped content
 * @throws NullPointerException
 *             if {@code regex} is {@code null}
 */
private static String requote(String regex) {
    Matcher matcher = REQUOTED_SECTION.matcher(regex);
    StringBuffer sb = new StringBuffer(regex.length());
    while (matcher.find()) {
        String escaped = SPECIAL_CHARACTER.matcher(matcher.group(1)).replaceAll("\\\\$0");
        // quoteReplacement keeps the backslashes just added from being read as replacement syntax.
        matcher.appendReplacement(sb, Matcher.quoteReplacement(escaped));
    }
    matcher.appendTail(sb);
    return sb.toString();
}
}