All downloads are free. Search and download functionality uses the official Maven repository.

preprocessor.ParsingPreprocessor Maven / Gradle / Ivy

Go to download

A tool to perform static analysis on regexes to determine whether they are vulnerable to ReDoS.

There is a newer version: 1.0.8
Show newest version
package preprocessor;

import java.util.LinkedList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;

import preprocessor.ParsingPreprocessor.CountClosureOperator.BoundsType;
import preprocessor.ParsingPreprocessor.EscapeFactor.EscapeType;

public abstract class ParsingPreprocessor implements Preprocessor {
	
	/** When {@code false}, tokenizing a lookahead or lookbehind group fails with a RuntimeException. */
	protected static boolean ALLOW_LOOKAROUND = false;
	/**
	 * When {@code false}, {@code ^} is only accepted as the first character and
	 * {@code $} only as the last character of the top-level regex; anywhere
	 * else tokenizing fails with a RuntimeException.
	 */
	protected static boolean ALLOW_LINE_BOUNDARY = false;
	/** When {@code false}, tokenizing the {@code ?} operator fails with a RuntimeException. */
	protected static boolean ALLOW_ZERO_ONCE = false;

	/** Largest repetition count accepted inside a counted closure such as {@code {2,5}}. */
	private final static int MAX_REPETITION = 256;

	/** Maximum regex length tolerated after a rule has been applied (1 &lt;&lt; 23 characters). */
	private static final int MAX_REGEX_LENGTH = 1 << 23;

	/** Registered rules, applied in insertion order. */
	private LinkedList<PreprocessorRule> rules;
	
	/**
	 * Applies every registered rule to the regex, in registration order.
	 * <p>
	 * The regex is tokenized afresh before each rule, because the preceding
	 * rule returns a rewritten regex string rather than a token stream.
	 *
	 * @param regex the regular expression to preprocess
	 * @return the regex produced by the last rule, or {@code regex} itself when
	 *         no rules are registered
	 * @throws RuntimeException if a rule produces a regex longer than the
	 *         maximum length, or if tokenizing fails
	 */
	@Override
	public String applyRules(String regex) {
		String current = regex;
		for (PreprocessorRule rule : rules) {
			List<RegexToken> tokenStream = tokenize(current, 0);

			current = rule.process(tokenStream);
			if (current.length() > MAX_REGEX_LENGTH) {
				throw new RegexException("Regular expression length exceeded.");
			}
		}
		return current;
	}

	/* Counted-closure shapes, compiled once instead of once per '{' token. */
	private static final Pattern BOUNDED_CLOSURE_PATTERN = Pattern.compile("\\{(\\d+),(\\d+)\\}");
	private static final Pattern UNBOUNDED_CLOSURE_PATTERN = Pattern.compile("\\{(\\d+),\\}");
	private static final Pattern CONSTANT_CLOSURE_PATTERN = Pattern.compile("\\{(\\d+)\\}");

	/**
	 * Splits a regex into a flat list of factor and operator tokens. Groups are
	 * tokenized recursively and stored as a single {@link GroupFactor}.
	 *
	 * @param regex the regex (or group body) to tokenize
	 * @param currentLevel nesting depth of {@code regex}; 0 for the top-level
	 *        regex, incremented by one for every enclosing group
	 * @return the tokens in source order
	 * @throws PatternSyntaxException if the regex is malformed (unbalanced
	 *         brackets, bad escape, illegal repetition range, ...)
	 * @throws RuntimeException if the regex uses a feature whose ALLOW_* flag
	 *         is {@code false}
	 */
	private static List<RegexToken> tokenize(String regex, int currentLevel) {
		List<RegexToken> tokenStream = new LinkedList<RegexToken>();
		char[] regexArr = regex.toCharArray();

		int i = 0;
		while (i < regexArr.length) {
			switch (regexArr[i]) {
			case '(': {
				GroupFactor.GroupType groupType = GroupFactor.GroupType.NORMAL;

				/* consume the opening bracket together with any "(?x" prefix */
				if (i < regexArr.length - 2 && regexArr[i + 1] == '?') {
					if (regexArr[i + 2] == '<') {
						checkAllowedFunctionality(ALLOW_LOOKAROUND, "Lookaround symbol");

						if (i >= regexArr.length - 3) {
							throw new PatternSyntaxException("Unkown look-behind group", regex, i);
						}
						switch (regexArr[i + 3]) {
						case '=':
							groupType = GroupFactor.GroupType.POSLOOKBEHIND;
							break;
						case '!':
							groupType = GroupFactor.GroupType.NEGLOOKBEHIND;
							break;
						default:
							throw new PatternSyntaxException("Unkown look-behind group", regex, i);
						}
						i += 4;
					} else {
						/* non-capturing (?: , lookahead (?= , negative lookahead (?! */
						switch (regexArr[i + 2]) {
						case ':':
							groupType = GroupFactor.GroupType.NONCAPTURING;
							break;
						case '=':
							checkAllowedFunctionality(ALLOW_LOOKAROUND, "Positive lookahead");
							groupType = GroupFactor.GroupType.POSLOOKAHEAD;
							break;
						case '!':
							checkAllowedFunctionality(ALLOW_LOOKAROUND, "Negative lookahead");
							groupType = GroupFactor.GroupType.NEGLOOKAHEAD;
							break;
						default:
							throw new PatternSyntaxException("Unkown inline modifier", regex, i);
						}
						i += 3;
					}
				} else {
					i++;
				}

				/* copy everything up to the matching ')' and tokenize it recursively */
				StringBuilder groupBuilder = new StringBuilder();
				int level = 1;
				do {
					if (i >= regexArr.length) {
						throw new PatternSyntaxException("Unmatched '('", regex, i);
					}

					boolean escaped = false;
					if (regexArr[i] == '\\') {
						/* keep the backslash; the escaped character is copied below */
						groupBuilder.append(regexArr[i]);
						i++;
						escaped = true;
					} else if (regexArr[i] == '[') {
						/*
						 * Brackets inside a character class do not nest groups:
						 * copy the class up to (not including) its ']'. An
						 * escaped ']' does not end the class.
						 */
						while (i < regexArr.length && regexArr[i] != ']') {
							if (regexArr[i] == '\\' && i + 1 < regexArr.length) {
								groupBuilder.append(regexArr[i]);
								i++;
							}
							groupBuilder.append(regexArr[i]);
							i++;
						}
					}

					/* a trailing backslash or unclosed class ran off the end */
					if (i >= regexArr.length) {
						throw new PatternSyntaxException("Unmatched '('", regex, i);
					}

					char current = regexArr[i];
					if (!escaped && current == '(') {
						groupBuilder.append('(');
						level++;
					} else if (!escaped && current == ')') {
						level--;
						/* the ')' closing this group itself is not part of its body */
						if (level != 0) {
							groupBuilder.append(')');
						}
					} else {
						groupBuilder.append(current);
					}
					i++;
				} while (level != 0);

				List<RegexToken> groupTokens = tokenize(groupBuilder.toString(), currentLevel + 1);
				tokenStream.add(new GroupFactor(groupTokens, groupType));
				break;
			}
			case '[': {
				StringBuilder classBuilder = new StringBuilder();
				int level = 1;
				i++;

				while (level != 0) {
					if (i >= regexArr.length) {
						throw new PatternSyntaxException("Unclosed character class", regex, i);
					}
					boolean escaped = false;
					if (regexArr[i] == '\\') {
						/* keep the backslash; the escaped character is copied below */
						classBuilder.append(regexArr[i]);
						i++;
						escaped = true;
						if (i >= regexArr.length) {
							throw new PatternSyntaxException("Unclosed character class", regex, i);
						}
					}

					if (!escaped && regexArr[i] == '[') {
						level++;
					} else if (!escaped && regexArr[i] == ']') {
						level--;
					}

					/* the outermost ']' is not part of the class content */
					if (level != 0) {
						classBuilder.append(regexArr[i]);
					}
					i++;
				}
				tokenStream.add(new CharacterClassFactor(classBuilder.toString()));
				break;
			}
			case '\\':
				i = tokenizeEscape(regex, regexArr, i, tokenStream);
				break;
			case '{':
				i = tokenizeCountedClosure(regex, regexArr, i, tokenStream);
				break;
			case '*':
				i = addQuantifiedOperator("*", RegexOperator.OperatorType.STAR, regexArr, i, tokenStream);
				break;
			case '+':
				i = addQuantifiedOperator("+", RegexOperator.OperatorType.PLUS, regexArr, i, tokenStream);
				break;
			case '?':
				checkAllowedFunctionality(ALLOW_ZERO_ONCE, "? operator");
				i = addQuantifiedOperator("?", RegexOperator.OperatorType.QM, regexArr, i, tokenStream);
				break;
			case '|':
				i++;
				tokenStream.add(new RegexOperator("|", RegexOperator.OperatorType.OR));
				break;
			case '.':
				i++;
				tokenStream.add(new WildCardFactor());
				break;
			case '^':
				/* a leading '^' of the top-level regex is silently ignored */
				if (!(i == 0 && currentLevel == 0)) {
					checkAllowedFunctionality(ALLOW_LINE_BOUNDARY, "Line Boundary ^");
				}
				/* always advance: without this an allowed inner '^' looped forever */
				i++;
				break;
			case '$':
				/* a trailing '$' of the top-level regex is silently ignored */
				if (!(i == regexArr.length - 1 && currentLevel == 0)) {
					checkAllowedFunctionality(ALLOW_LINE_BOUNDARY, "Line Boundary $");
				}
				/* always advance: without this an allowed inner '$' looped forever */
				i++;
				break;
			default:
				tokenStream.add(new SingleFactor("" + regexArr[i]));
				i++;
			}
		}

		return tokenStream;
	}

	/**
	 * Tokenizes the escape sequence starting at the backslash at
	 * {@code backslashIndex} and returns the index of the first character after it.
	 */
	private static int tokenizeEscape(String regex, char[] regexArr, int backslashIndex,
			List<RegexToken> tokenStream) {
		int i = backslashIndex + 1;
		if (i >= regexArr.length) {
			/* a lone backslash at the very end is kept as a bare escape token */
			tokenStream.add(new EscapeFactor("\\", EscapeType.CHARACTER));
			return i + 1;
		}

		StringBuilder escapeBuilder;
		switch (regexArr[i]) {
		case 'Q': {
			/*
			 * Quoted section: every character up to the first "\E" (or the
			 * end of the regex) is literal, including backslashes.
			 */
			i++;
			int quoteEnd = regex.indexOf("\\E", i);
			int contentEnd = (quoteEnd < 0) ? regexArr.length : quoteEnd;
			while (i < contentEnd) {
				tokenStream.add(verbatimToken(regexArr[i]));
				i++;
			}
			return (quoteEnd < 0) ? regexArr.length : quoteEnd + 2;
		}
		case 'x': {
			escapeBuilder = new StringBuilder("\\x");
			i++;
			if (i >= regexArr.length) {
				throw new PatternSyntaxException("Illegal hexadecimal escape sequence", regex, i);
			}
			if (regexArr[i] == '{') {
				/* \x{h...h}; a missing '}' is tolerated and added */
				while (i < regexArr.length && regexArr[i] != '}') {
					escapeBuilder.append(regexArr[i]);
					i++;
				}
				escapeBuilder.append("}");
				i++;
			} else {
				/* \xhh: exactly two characters follow */
				escapeBuilder.append(regexArr[i]);
				i++;
				if (i >= regexArr.length) {
					throw new PatternSyntaxException("Illegal hexadecimal escape sequence", regex, i);
				}
				escapeBuilder.append(regexArr[i]);
				i++;
			}
			tokenStream.add(new EscapeFactor(escapeBuilder.toString(), EscapeType.HEX));
			return i;
		}
		case '0': {
			/*
			 * Up to three octal digits, but never more than the value 0377:
			 * a digit that would push the value past that limit is left for
			 * the next token.
			 */
			escapeBuilder = new StringBuilder("\\0");
			int value = 0;
			int digitCount = 0;
			i++;
			while (i < regexArr.length && digitCount < 3 && '0' <= regexArr[i] && regexArr[i] <= '7') {
				int candidate = value * 8 + (regexArr[i] - '0');
				if (candidate > 0377) {
					break;
				}
				value = candidate;
				escapeBuilder.append(regexArr[i]);
				i++;
				digitCount++;
			}
			tokenStream.add(new EscapeFactor(escapeBuilder.toString(), EscapeType.OCTAL));
			return i;
		}
		case 'u': {
			escapeBuilder = new StringBuilder("\\u");
			i++;
			if (i >= regexArr.length - 3) {
				throw new PatternSyntaxException("Illegal unicode escape sequence", regex, i);
			}
			for (int j = 0; j < 4; j++) {
				escapeBuilder.append(regexArr[i]);
				i++;
			}
			tokenStream.add(new EscapeFactor(escapeBuilder.toString(), EscapeType.UNICODE));
			/*
			 * NOTE(review): unicode escapes are always rejected here, even
			 * when well-formed. Kept as found; confirm whether this throw is
			 * intentional or a leftover.
			 */
			throw new PatternSyntaxException("Illegal/unsupported escape sequence", regex, i);
		}
		case 'p': {
			escapeBuilder = new StringBuilder("\\p");
			i++;
			if (i >= regexArr.length) {
				throw new PatternSyntaxException("Illegal character property escape sequence", regex, i);
			}
			if (regexArr[i] == '{') {
				while (regexArr[i] != '}') {
					escapeBuilder.append(regexArr[i]);
					i++;
					if (i >= regexArr.length) {
						throw new PatternSyntaxException("Unclosed character family near index", regex, i);
					}
				}
				escapeBuilder.append('}');
				i++;
			} else {
				/* single-letter property such as \pL */
				escapeBuilder.append(regexArr[i]);
				i++;
			}
			tokenStream.add(new EscapeFactor(escapeBuilder.toString(), EscapeType.CHARACTER_PROPERTY));
			return i;
		}
		default:
			tokenStream.add(new EscapeFactor("\\" + regexArr[i], EscapeType.CHARACTER));
			return i + 1;
		}
	}

	/**
	 * Tokenizes the counted closure starting at the '{' at {@code braceIndex},
	 * including an optional '?' / '+' suffix, and returns the index of the
	 * first character after it.
	 */
	private static int tokenizeCountedClosure(String regex, char[] regexArr, int braceIndex,
			List<RegexToken> tokenStream) {
		int i = braceIndex;
		StringBuilder closureBuilder = new StringBuilder("{");
		while (regexArr[i] != '}') {
			i++;
			if (i >= regexArr.length) {
				throw new PatternSyntaxException("Unclosed counted closure", regex, i);
			}
			closureBuilder.append(regexArr[i]);
		}
		i++;

		QuantifiableOperator.Quantifier quantifier = quantifierAt(regexArr, i);
		if (quantifier != QuantifiableOperator.Quantifier.GREEDY) {
			i++;
		}

		String operatorSymbol = closureBuilder.toString();
		Matcher boundedMatcher = BOUNDED_CLOSURE_PATTERN.matcher(operatorSymbol);
		Matcher unboundedMatcher = UNBOUNDED_CLOSURE_PATTERN.matcher(operatorSymbol);
		Matcher constantMatcher = CONSTANT_CLOSURE_PATTERN.matcher(operatorSymbol);

		RegexToken operatorToken;
		if (boundedMatcher.find()) {
			int low = parseRepetitionCount(boundedMatcher.group(1), regex, i);
			int high = parseRepetitionCount(boundedMatcher.group(2), regex, i);
			if (high < low || high > MAX_REPETITION) {
				throw new PatternSyntaxException("Illegal repetition range", regex, i);
			}
			operatorToken = new CountClosureOperator(operatorSymbol, quantifier, low, high);
		} else if (unboundedMatcher.find()) {
			int low = parseRepetitionCount(unboundedMatcher.group(1), regex, i);
			if (low > MAX_REPETITION) {
				throw new PatternSyntaxException("Illegal repetition range", regex, i);
			}
			operatorToken = new CountClosureOperator(operatorSymbol, quantifier, low, BoundsType.UNBOUNDED);
		} else if (constantMatcher.find()) {
			int low = parseRepetitionCount(constantMatcher.group(1), regex, i);
			if (low > MAX_REPETITION) {
				throw new PatternSyntaxException("Illegal repetition range", regex, i);
			}
			operatorToken = new CountClosureOperator(operatorSymbol, quantifier, low,
					BoundsType.CONSTANT_REPETITION);
		} else {
			throw new PatternSyntaxException("Illegal repetition range", regex, i);
		}
		tokenStream.add(operatorToken);
		return i;
	}

	/**
	 * Adds the operator found at {@code operatorIndex}, together with an
	 * optional '?' / '+' suffix, and returns the index of the first character
	 * after it.
	 */
	private static int addQuantifiedOperator(String symbol, RegexOperator.OperatorType type, char[] regexArr,
			int operatorIndex, List<RegexToken> tokenStream) {
		int next = operatorIndex + 1;
		QuantifiableOperator.Quantifier quantifier = quantifierAt(regexArr, next);
		if (quantifier != QuantifiableOperator.Quantifier.GREEDY) {
			next++;
		}
		tokenStream.add(new QuantifiableOperator(symbol, type, quantifier));
		return next;
	}

	/**
	 * Reads the quantifier suffix at index {@code i} without consuming it:
	 * '?' is reluctant, '+' is possessive, anything else (or the end of the
	 * regex) is greedy.
	 */
	private static QuantifiableOperator.Quantifier quantifierAt(char[] regexArr, int i) {
		if (i < regexArr.length) {
			if (regexArr[i] == '?') {
				return QuantifiableOperator.Quantifier.RELUCTANT;
			}
			if (regexArr[i] == '+') {
				return QuantifiableOperator.Quantifier.POSSESIVE;
			}
		}
		return QuantifiableOperator.Quantifier.GREEDY;
	}

	/** Builds the token for one literal character of a \Q...\E section. */
	private static RegexToken verbatimToken(char symbol) {
		String currentSymbol = "" + symbol;
		if (canEscapeVerbatim(currentSymbol)) {
			return new EscapeFactor("\\" + currentSymbol, EscapeType.VERBATIM);
		}
		return new SingleFactor(currentSymbol);
	}

	/**
	 * Parses a repetition count, reporting a count too large for an int as a
	 * pattern syntax error instead of a NumberFormatException.
	 */
	private static int parseRepetitionCount(String digits, String regex, int index) {
		try {
			return Integer.parseInt(digits);
		} catch (NumberFormatException e) {
			PatternSyntaxException syntaxError = new PatternSyntaxException("Illegal repetition range", regex, index);
			syntaxError.initCause(e);
			throw syntaxError;
		}
	}

	/**
	 * Registers a rule at the end of the rule list.
	 *
	 * @param rule the rule to append
	 */
	protected void addRule(PreprocessorRule rule) {
		this.rules.add(rule);
	}

	/** Creates a preprocessor with an empty rule list. */
	public ParsingPreprocessor() {
		rules = new LinkedList<PreprocessorRule>();
	}

	/**
	 * Tells whether a symbol can be turned into its literal form by prefixing
	 * it with a backslash. That holds for everything except a single ASCII
	 * letter.
	 *
	 * @param symbol the symbol to check
	 * @return {@code false} if {@code symbol} is exactly one character in
	 *         a-z or A-Z, {@code true} otherwise
	 */
	private static boolean canEscapeVerbatim(String symbol) {
		/* plain character test: this runs once per quoted character */
		if (symbol.length() != 1) {
			return true;
		}
		char c = symbol.charAt(0);
		boolean asciiLetter = (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
		return !asciiLetter;
	}
	
	/**
	 * Guards a regex feature behind its switch.
	 *
	 * @param isHandled whether the feature is enabled
	 * @param message name of the feature, used in the error message
	 * @throws RuntimeException if {@code isHandled} is {@code false}
	 */
	private static void checkAllowedFunctionality(boolean isHandled, String message) {
		if (isHandled) {
			return;
		}
		throw new RuntimeException("Unhandled Functionality: " + message);
	}
	
	/** Common contract of the factor and operator tokens. */
	static interface RegexToken {

		/** The two broad categories a token can belong to. */
		enum TokenType {
			REGEX_FACTOR,
			REGEX_OPERATOR
		}

		/** @return the category of this token */
		TokenType getTokenType();

		/** @return the textual form of this token */
		String getRepresentation();

	}

	/**
	 * Base class of all factor tokens.
	 *
	 * @param <FactorContentType> type of the content carried by the factor
	 */
	static abstract class RegexFactor<FactorContentType> implements RegexToken {
		/** The kinds of factor a subclass can report. */
		public enum FactorType {
			CHARACTER_CLASS, SINGLE_CHARACTER, ESCAPED_CHARACTER, GROUP, WILD_CARD
		}

		/** Content of the factor, exactly as passed to the constructor. */
		protected FactorContentType factorContent;

		/** @return the content of this factor */
		public FactorContentType getFactorContent() {
			return factorContent;
		}

		/**
		 * @param factorContent the content this factor carries
		 */
		public RegexFactor(FactorContentType factorContent) {
			this.factorContent = factorContent;
		}

		/** @return always {@link TokenType#REGEX_FACTOR} */
		@Override
		public TokenType getTokenType() {
			return TokenType.REGEX_FACTOR;
		}

		/** @return the concrete kind of this factor */
		public abstract FactorType getFactorType();

	}

	/**
	 * An operator token. Two operators are equal when their operator text is
	 * equal; the operator type is not part of the comparison.
	 */
	static class RegexOperator implements RegexToken {

		/** The kinds of operator. */
		public enum OperatorType {
			PLUS, STAR, QM, COUNT, OR
		}

		/** Text of the operator. */
		protected final String operatorSequence;

		private final OperatorType operatorType;

		/**
		 * @param operatorSequence text of the operator
		 * @param operatorType kind of the operator
		 */
		public RegexOperator(String operatorSequence, OperatorType operatorType) {
			this.operatorType = operatorType;
			this.operatorSequence = operatorSequence;
		}

		/** @return the text of the operator */
		public String getOperator() {
			return operatorSequence;
		}

		/** @return the kind of the operator */
		public OperatorType getOperatorType() {
			return operatorType;
		}

		/** @return {@code false}; overridden by quantifiable operators */
		public boolean getIsQuantifiable() {
			return false;
		}

		/** @return always {@link TokenType#REGEX_OPERATOR} */
		@Override
		public TokenType getTokenType() {
			return TokenType.REGEX_OPERATOR;
		}

		@Override
		public String getRepresentation() {
			return operatorSequence;
		}

		@Override
		public String toString() {
			return "O( " + operatorSequence + " )";
		}

		@Override
		public boolean equals(Object o) {
			if (this == o) {
				return true;
			}
			/* instanceof is false for null as well */
			if (!(o instanceof RegexOperator)) {
				return false;
			}
			RegexOperator other = (RegexOperator) o;
			return other.operatorSequence.equals(operatorSequence);
		}

		@Override
		public int hashCode() {
			return operatorSequence.hashCode();
		}
	}

	/** An operator that carries a greedy, possessive or reluctant quantifier. */
	static class QuantifiableOperator extends RegexOperator {

		/** Matching mode of the operator. */
		public enum Quantifier {
			GREEDY, POSSESIVE, RELUCTANT
		}

		private Quantifier quantifier;

		/**
		 * @param operatorSequence text of the operator
		 * @param operatorType kind of the operator
		 * @param quantifier matching mode of the operator
		 */
		public QuantifiableOperator(String operatorSequence, OperatorType operatorType, Quantifier quantifier) {
			super(operatorSequence, operatorType);
			this.quantifier = quantifier;
		}

		/**
		 * Creates a greedy operator.
		 *
		 * @param operatorSequence text of the operator
		 * @param operatorType kind of the operator
		 */
		public QuantifiableOperator(String operatorSequence, OperatorType operatorType) {
			this(operatorSequence, operatorType, Quantifier.GREEDY);
		}

		/** @return the matching mode of this operator */
		public Quantifier getOperatorQuantifier() {
			return quantifier;
		}

		/** @param quantifier the new matching mode */
		public void setOperatorQuantifier(Quantifier quantifier) {
			this.quantifier = quantifier;
		}

		/** @return always {@code true} */
		@Override
		public boolean getIsQuantifiable() {
			return true;
		}

		/** Prefixes the operator with G, P or R according to its quantifier. */
		@Override
		public String toString() {
			final String marker;
			switch (quantifier) {
			case GREEDY:
				marker = "G";
				break;
			case POSSESIVE:
				marker = "P";
				break;
			case RELUCTANT:
				marker = "R";
				break;
			default:
				marker = "";
				break;
			}
			return marker + "QO( " + operatorSequence + " )";
		}

	}
	
	/** A counted closure with its lower and upper repetition bound. */
	static class CountClosureOperator extends QuantifiableOperator {

		/** How the upper bound of the closure was given. */
		public enum BoundsType {
			BOUNDED, UNBOUNDED, CONSTANT_REPETITION
		}

		private final int low;
		private final int high;
		private final BoundsType boundsType;

		/**
		 * Creates a closure with both bounds given explicitly; its bounds type
		 * is {@link BoundsType#BOUNDED}.
		 */
		public CountClosureOperator(String operatorSequence, Quantifier quantifier, int low, int high) {
			super(operatorSequence, OperatorType.COUNT, quantifier);
			this.boundsType = BoundsType.BOUNDED;
			this.low = low;
			this.high = high;
		}

		/**
		 * Creates a closure whose upper bound follows from its bounds type.
		 *
		 * @throws IllegalArgumentException if {@code boundsType} is
		 *         {@link BoundsType#BOUNDED}, which needs an explicit upper bound
		 */
		public CountClosureOperator(String operatorSequence, Quantifier quantifier, int low, BoundsType boundsType) {
			super(operatorSequence, OperatorType.COUNT, quantifier);
			this.high = impliedUpperBound(low, boundsType);
			this.low = low;
			this.boundsType = boundsType;
		}

		/* Upper bound implied by the bounds type: unlimited, or equal to the lower bound. */
		private static int impliedUpperBound(int low, BoundsType boundsType) {
			switch (boundsType) {
			case UNBOUNDED:
				return Integer.MAX_VALUE;
			case CONSTANT_REPETITION:
				return low;
			default:
				throw new IllegalArgumentException("Upper bounds needs to be specified.");
			}
		}

		/** @return the lower repetition bound */
		public int getLow() {
			return low;
		}

		/** @return the upper repetition bound; {@link Integer#MAX_VALUE} when unbounded */
		public int getHigh() {
			return high;
		}

		/** @return how the upper bound was given */
		public BoundsType getBoundsType() {
			return boundsType;
		}

	}
	
	/** The wildcard factor; its content is always the string ".". */
	static class WildCardFactor extends RegexFactor<String> {
		public WildCardFactor() {
			super(".");
		}

		/** @return always {@link FactorType#WILD_CARD} */
		@Override
		public FactorType getFactorType() {
			return FactorType.WILD_CARD;
		}

		@Override
		public String toString() {
			return factorContent;
		}

		@Override
		public String getRepresentation() {
			return factorContent;
		}
	}

	/**
	 * A character class. The content is stored without the enclosing square
	 * brackets, which are added back when the factor is printed.
	 */
	static class CharacterClassFactor extends RegexFactor<String> {

		/**
		 * @param factorContent the text between the enclosing square brackets
		 */
		public CharacterClassFactor(String factorContent) {
			super(factorContent);
		}

		/** @return always {@link FactorType#CHARACTER_CLASS} */
		@Override
		public FactorType getFactorType() {
			return FactorType.CHARACTER_CLASS;
		}

		@Override
		public String toString() {
			return getRepresentation();
		}

		/** @return the content wrapped in square brackets */
		@Override
		public String getRepresentation() {
			return "[" + factorContent + "]";
		}
	}

	static class GroupFactor extends RegexFactor> {
		
		
		private int level;
		public int getLevel() {
			return level;
		}

		public GroupFactor(List factorContent, GroupType groupType) {
			super(factorContent);
			this.groupType = groupType;
		}

		public GroupFactor(String processedContent, GroupType groupType, int level) {
			super(ParsingPreprocessor.tokenize(processedContent, level + 1));
			this.level = level;
			this.groupType = groupType;
		}

		@Override
		public FactorType getFactorType() {
			return FactorType.GROUP;
		}

		private GroupType groupType;

		public GroupType getGroupType() {
			return groupType;
		}
		
		private String groupPrefix() {
			switch (groupType) {
			case NORMAL:
				return "";
			case NONCAPTURING:
				return "?:";
			case POSLOOKAHEAD:
				return "?=";
			case POSLOOKBEHIND:
				return "?<=";
			case NEGLOOKAHEAD:
				return "?!";
			case NEGLOOKBEHIND:
				return "? {
		
		public enum EscapeType {
			CHARACTER, OCTAL, UNICODE, HEX, VERBATIM, CHARACTER_PROPERTY
		}
		
		private final EscapeType type;
		public EscapeType getEscapeType() {
			return type;
		}

		public EscapeFactor(String factorContent, EscapeType type) {
			super(factorContent);
			this.type = type;
		}

		@Override
		public FactorType getFactorType() {
			return FactorType.ESCAPED_CHARACTER;
		}

		@Override
		public String toString() {
			return factorContent;
		}
		
		@Override
		public String getRepresentation() {
			return factorContent;
		}
		
		@Override
		public boolean equals(Object o) {
			if (o == null) {
				return false;
			}
			
			if (o == this) {
				return true;
			}
			
			if (!(o instanceof EscapeFactor)) {
				return false;
			}
			
			EscapeFactor ef = (EscapeFactor) o;
			return ef.factorContent.equals(factorContent);
		}

		@Override
		public int hashCode() {
			return type.hashCode() * 13 + factorContent.hashCode();
		}
	}

	/**
	 * A factor holding one piece of literal text. Two single factors are equal
	 * when their text is equal.
	 */
	static class SingleFactor extends RegexFactor<String> {

		/**
		 * @param factorContent the literal text of this factor
		 */
		public SingleFactor(String factorContent) {
			super(factorContent);
		}

		/** @return always {@link FactorType#SINGLE_CHARACTER} */
		@Override
		public FactorType getFactorType() {
			return FactorType.SINGLE_CHARACTER;
		}

		@Override
		public String toString() {
			return factorContent;
		}

		@Override
		public String getRepresentation() {
			return factorContent;
		}

		@Override
		public boolean equals(Object o) {
			if (o == this) {
				return true;
			}

			/* instanceof is false for null as well */
			if (!(o instanceof SingleFactor)) {
				return false;
			}

			SingleFactor sf = (SingleFactor) o;
			return sf.factorContent.equals(factorContent);
		}

		@Override
		public int hashCode() {
			return factorContent.hashCode() * 13;
		}

	}
	
	/**
	 * Small manual check: tokenizes a fixed sample regex, prints the token
	 * list and then the representation of every token.
	 *
	 * @param args ignored
	 */
	public static void main(String [] args) {
		List<RegexToken> tokenStream = tokenize("\\x{FFFF}+", 0);
		System.out.println(tokenStream);
		for (RegexToken rt : tokenStream) {
			System.out.print(rt.getRepresentation());
		}
	}
	
	/** Unchecked exception thrown when a processed regex exceeds the length limit. */
	private static class RegexException extends RuntimeException {

		private static final long serialVersionUID = 1L;

		/**
		 * @param message description of the failure
		 */
		public RegexException(String message) {
			super(message);
		}

	}

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy