All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.sonarsource.analyzer.commons.regex.RegexParser Maven / Gradle / Ivy

The newest version!
/*
 * SonarSource Analyzers Regex Parsing Commons
 * Copyright (C) 2009-2024 SonarSource SA
 * mailto:info AT sonarsource DOT com
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 3 of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
 */
package org.sonarsource.analyzer.commons.regex;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.StringJoiner;
import java.util.function.Function;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import javax.annotation.CheckForNull;
import javax.annotation.Nullable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.sonarsource.analyzer.commons.regex.ast.AtomicGroupTree;
import org.sonarsource.analyzer.commons.regex.ast.BackReferenceTree;
import org.sonarsource.analyzer.commons.regex.ast.BoundaryTree;
import org.sonarsource.analyzer.commons.regex.ast.CapturingGroupTree;
import org.sonarsource.analyzer.commons.regex.ast.CharacterClassElementTree;
import org.sonarsource.analyzer.commons.regex.ast.CharacterClassIntersectionTree;
import org.sonarsource.analyzer.commons.regex.ast.CharacterClassTree;
import org.sonarsource.analyzer.commons.regex.ast.CharacterClassUnionTree;
import org.sonarsource.analyzer.commons.regex.ast.CharacterRangeTree;
import org.sonarsource.analyzer.commons.regex.ast.CharacterTree;
import org.sonarsource.analyzer.commons.regex.ast.ConditionalSubpatternTree;
import org.sonarsource.analyzer.commons.regex.ast.CurlyBraceQuantifier;
import org.sonarsource.analyzer.commons.regex.ast.DisjunctionTree;
import org.sonarsource.analyzer.commons.regex.ast.DotTree;
import org.sonarsource.analyzer.commons.regex.ast.EscapedCharacterClassTree;
import org.sonarsource.analyzer.commons.regex.ast.FinalState;
import org.sonarsource.analyzer.commons.regex.ast.FlagSet;
import org.sonarsource.analyzer.commons.regex.ast.GroupTree;
import org.sonarsource.analyzer.commons.regex.ast.IndexRange;
import org.sonarsource.analyzer.commons.regex.ast.LookAroundTree;
import org.sonarsource.analyzer.commons.regex.ast.MiscEscapeSequenceTree;
import org.sonarsource.analyzer.commons.regex.ast.NonCapturingGroupTree;
import org.sonarsource.analyzer.commons.regex.ast.PosixCharacterClassElementTree;
import org.sonarsource.analyzer.commons.regex.ast.Quantifier;
import org.sonarsource.analyzer.commons.regex.ast.ReferenceConditionTree;
import org.sonarsource.analyzer.commons.regex.ast.RegexSyntaxElement;
import org.sonarsource.analyzer.commons.regex.ast.RegexToken;
import org.sonarsource.analyzer.commons.regex.ast.RegexTree;
import org.sonarsource.analyzer.commons.regex.ast.RepetitionTree;
import org.sonarsource.analyzer.commons.regex.ast.SequenceTree;
import org.sonarsource.analyzer.commons.regex.ast.SimpleQuantifier;
import org.sonarsource.analyzer.commons.regex.ast.SourceCharacter;
import org.sonarsource.analyzer.commons.regex.ast.StartState;

import static org.sonarsource.analyzer.commons.regex.RegexLexer.EOF;

public class RegexParser {

  private static final Logger LOG = LoggerFactory.getLogger(RegexParser.class);

  private static final String HEX_DIGIT = "hexadecimal digit";

  private static final String POSIX_CHARACTER_CLASS_PATTERN = "[:%s%s:]";

  private static final Set POSIX_CHARACTER_CLASSES = new HashSet<>(Arrays.asList(
    "alnum", "alpha", "ascii", "blank", "cntrl", "digit", "graph", "lower", "print", "punct", "space", "upper", "word", "xdigit", "<", ">"
  ));

  private static final Map POSIX_CHARACTER_CLASS_LOOKUP = posixCharacterClassMap(false);

  private static final Map POSIX_CHARACTER_CLASS_NEGATION_LOOKUP = posixCharacterClassMap(true);

  private static Map posixCharacterClassMap(boolean negative) {
    return POSIX_CHARACTER_CLASSES.stream()
      .collect(Collectors.toMap(posix -> String.format(POSIX_CHARACTER_CLASS_PATTERN, negative ? "^" : "", posix), posix -> posix));
  }

  protected final RegexSource source;

  protected final RegexLexer characters;

  protected FlagSet activeFlags;

  protected final List backReferences = new ArrayList<>();

  protected final Map capturingGroups = new HashMap<>();

  protected final List errors = new ArrayList<>();

  protected int groupNumber = 1;

  public RegexParser(RegexSource source, FlagSet initialFlags) {
    this.source = source;
    this.characters = source.createLexer();
    this.characters.setFreeSpacingMode(initialFlags.contains(Pattern.COMMENTS));
    this.activeFlags = initialFlags;
  }

  public RegexParseResult parse() {
    FlagSet initialFlags = activeFlags;
    List results = new ArrayList<>();
    do {
      RegexTree result = parseDisjunction();
      results.add(result);
      if (characters.isNotAtEnd()) {
        error("Unexpected '" + characters.getCurrent().getCharacter() + "'");
        characters.moveNext();
      }
    } while (characters.isNotAtEnd());
    if (characters.isInQuotingMode()) {
      expected("'\\E'");
    }
    RegexTree result = combineTrees(results, (range, elements) -> new SequenceTree(source, range, elements, initialFlags));
    StartState startState = new StartState(result, initialFlags);
    FinalState finalState = new FinalState(activeFlags);
    result.setContinuation(finalState);
    backReferences.forEach(reference -> reference.setGroup(capturingGroups.get(reference.groupName())));
    return new RegexParseResult(result, startState, finalState, errors, characters.hasComments());
  }

  protected RegexTree parseDisjunction() {
    FlagSet disjunctionFlags = activeFlags;
    List alternatives = new ArrayList<>();
    List orOperators = new ArrayList<>();
    RegexTree first = parseSequence();
    alternatives.add(first);
    while (characters.currentIs('|')) {
      orOperators.add(characters.getCurrent());
      characters.moveNext();
      RegexTree next = parseSequence();
      alternatives.add(next);
    }
    return combineTrees(alternatives, (range, elements) -> new DisjunctionTree(source, range, elements, orOperators, disjunctionFlags));
  }

  protected RegexTree parseSequence() {
    FlagSet sequenceFlags = activeFlags;
    List elements = new ArrayList<>();
    RegexTree element = parseRepetition();
    while (element != null) {
      elements.add(element);
      element = parseRepetition();
    }
    if (elements.isEmpty()) {
      int index = characters.getCurrentStartIndex();
      return new SequenceTree(source, new IndexRange(index, index), elements, sequenceFlags);
    } else {
      return combineTrees(elements, (range, items) -> new SequenceTree(source, range, items, sequenceFlags));
    }
  }

  @CheckForNull
  protected RegexTree parseRepetition() {
    FlagSet repetitionFlags = activeFlags;
    RegexTree element = parsePrimaryExpression();
    if (characters.isInQuotingMode()) {
      return element;
    }
    Quantifier quantifier = parseQuantifier();
    if (element == null) {
      if (quantifier != null) {
        errors.add(new SyntaxError(quantifier, "Unexpected quantifier '" + quantifier.getText() + "'"));
      }
      return null;
    }
    if (quantifier == null) {
      return element;
    } else {
      return new RepetitionTree(source, element.getRange().merge(quantifier.getRange()), element, quantifier, repetitionFlags);
    }
  }

  @CheckForNull
  protected Quantifier parseQuantifier() {
    SimpleQuantifier.Kind kind;
    switch (characters.getCurrentChar()) {
      case '*':
        kind = SimpleQuantifier.Kind.STAR;
        break;
      case '+':
        kind = SimpleQuantifier.Kind.PLUS;
        break;
      case '?':
        kind = SimpleQuantifier.Kind.QUESTION_MARK;
        break;
      case '{':
        return parseCurlyBraceQuantifier();
      default:
        return null;
    }
    SourceCharacter current = characters.getCurrent();
    characters.moveNext();
    Quantifier.Modifier modifier = parseQuantifierModifier();
    IndexRange range = current.getRange().extendTo(characters.getCurrentStartIndex());
    return new SimpleQuantifier(source, range, modifier, kind);
  }

  CurlyBraceQuantifier parseCurlyBraceQuantifier() {
    if (supportsAnyOfFeatures(RegexFeature.UNESCAPED_CURLY_BRACKET) && !isCurlyBraceQuantifier()) {
      return null;
    }
    SourceCharacter openingBrace = characters.getCurrent();
    characters.moveNext();
    RegexToken lowerBound = parseInteger();
    if (lowerBound == null && !supportsAnyOfFeatures(RegexFeature.ONLY_UPPER_BOUND_QUANTIFIER)) {
      expected("integer");
      return null;
    }
    RegexToken comma = null;
    RegexToken upperBound = null;
    if (characters.currentIs(',')) {
      comma = new RegexToken(source, characters.getCurrent().getRange());
      characters.moveNext();
      upperBound = parseInteger();
    }
    Quantifier.Modifier modifier;
    if (characters.currentIs('}')) {
      characters.moveNext();
    } else {
      if (comma == null) {
        expected("',' or '}'");
      } else if (upperBound == null) {
        expected("integer or '}'");
      } else {
        expected("'}'");
      }
    }
    modifier = parseQuantifierModifier();
    IndexRange range = openingBrace.getRange().extendTo(characters.getCurrentStartIndex());
    return new CurlyBraceQuantifier(source, range, modifier, lowerBound, comma, upperBound);
  }

  private boolean isCurlyBraceQuantifier() {
    int index = 1;
    if (!isAsciiDigit(characters.lookAhead(index)) && !supportsAnyOfFeatures(RegexFeature.ONLY_UPPER_BOUND_QUANTIFIER)) {
      return false;
    }
    do {
      index++;
    } while (isAsciiDigit(characters.lookAhead(index)));
    if (characters.lookAhead(index) == '}') {
      return true;
    }
    if (characters.lookAhead(index) != ',') {
      return false;
    }
    do {
      index++;
    } while (isAsciiDigit(characters.lookAhead(index)));
    return characters.lookAhead(index) == '}';
  }

  Quantifier.Modifier parseQuantifierModifier() {
    if (characters.currentIs('?')) {
      characters.moveNext();
      return Quantifier.Modifier.RELUCTANT;
    } else if (characters.currentIs('+') && supportsAnyOfFeatures(RegexFeature.POSSESSIVE_QUANTIFIER)) {
      characters.moveNext();
      return Quantifier.Modifier.POSSESSIVE;
    } else {
      return Quantifier.Modifier.GREEDY;
    }
  }

  @CheckForNull
  protected RegexToken parseInteger() {
    int startIndex = characters.getCurrentStartIndex();
    if (!isAsciiDigit(characters.getCurrentChar())) {
      return null;
    }
    while(isAsciiDigit(characters.getCurrentChar())) {
      characters.moveNext();
    }
    IndexRange range = new IndexRange(startIndex, characters.getCurrentStartIndex());
    return new RegexToken(source, range);
  }

  @CheckForNull
  protected RegexTree parsePrimaryExpression() {
    if (characters.isInQuotingMode() && characters.isNotAtEnd()) {
      return readCharacter();
    }
    switch (characters.getCurrentChar()) {
      case '(':
        if (characters.currentIs("(?P=") && supportsAnyOfFeatures(RegexFeature.PYTHON_SYNTAX_GROUP_NAME)) {
          return parsePythonBackReference();
        }
        return parseGroup();
      case '\\':
        return parseEscapeSequence();
      case '[':
        return parseCharacterClass();
      case '.':
        DotTree tree = new DotTree(source, characters.getCurrentIndexRange(), activeFlags);
        characters.moveNext();
        return tree;
      case '^':
        BoundaryTree lineStart = new BoundaryTree(source, BoundaryTree.Type.LINE_START, characters.getCurrentIndexRange(), activeFlags);
        characters.moveNext();
        return lineStart;
      case '$':
        BoundaryTree lineEnd = new BoundaryTree(source, BoundaryTree.Type.LINE_END, characters.getCurrentIndexRange(), activeFlags);
        characters.moveNext();
        return lineEnd;
      default:
        if (isPlainTextCharacter(characters.getCurrentChar())) {
          return readCharacter();
        } else {
          return null;
        }
    }
  }

  private RegexTree parsePythonBackReference() {
    SourceCharacter openingParen = characters.getCurrent();
    // Discard '(?'
    characters.moveNext(2);
    return parseEscapedSequence('=', ')', "a group name",
      dh -> collect(new BackReferenceTree(source, openingParen, null, dh.opener, dh.closer, activeFlags)));
  }

  protected CharacterTree readCharacter() {
    SourceCharacter character = characters.getCurrent();
    characters.moveNext();
    return characterTree(character);
  }

  protected GroupTree parseGroup() {
    SourceCharacter openingParen = characters.getCurrent();
    characters.moveNext();
    if (characters.currentIs("?=")) {
      characters.moveNext(2);
      return finishGroup(openingParen, (range, inner) -> LookAroundTree.positiveLookAhead(source, range, inner, activeFlags));
    } else if (characters.currentIs("?<=")) {
      characters.moveNext(3);
      return finishGroup(openingParen, (range, inner) -> LookAroundTree.positiveLookBehind(source, range, inner, activeFlags));
    } else if (characters.currentIs("?!")) {
      characters.moveNext(2);
      return finishGroup(openingParen, (range, inner) -> LookAroundTree.negativeLookAhead(source, range, inner, activeFlags));
    } else if (characters.currentIs("? LookAroundTree.negativeLookBehind(source, range, inner, activeFlags));
    } else if (characters.currentIs("?>") && supportsAnyOfFeatures(RegexFeature.ATOMIC_GROUP)) {
      characters.moveNext(2);
      return finishGroup(openingParen, (range, inner) -> new AtomicGroupTree(source, range, inner, activeFlags));
    } else if (characters.currentIs("?<") && supportsAnyOfFeatures(RegexFeature.JAVA_SYNTAX_GROUP_NAME, RegexFeature.DOTNET_SYNTAX_GROUP_NAME)) {
      return finishGroup(openingParen, newNamedCapturingGroup(2, '>'));
    } else if (characters.currentIs("?'") && supportsAnyOfFeatures(RegexFeature.DOTNET_SYNTAX_GROUP_NAME)) {
      return finishGroup(openingParen, newNamedCapturingGroup(2, '\''));
    } else if (characters.currentIs("?P<") && supportsAnyOfFeatures(RegexFeature.PYTHON_SYNTAX_GROUP_NAME)) {
      return finishGroup(openingParen, newNamedCapturingGroup(3, '>'));
    } else if (characters.currentIs("?")) {
      return parseNonCapturingGroup(openingParen);
    } else {
      return finishGroup(openingParen, newCapturingGroup(null));
    }
  }

  protected GroupConstructor newNamedCapturingGroup(int namePrefixLength, char nameDelimiter) {
    characters.moveNext(namePrefixLength);
    String name = parseGroupName(nameDelimiter);
    if (characters.currentIs(nameDelimiter)) {
      characters.moveNext();
    } else {
      expected("'" + nameDelimiter + "'");
    }
    return newCapturingGroup(name);
  }

  protected GroupConstructor newCapturingGroup(@Nullable String name) {
    int index = groupNumber;
    groupNumber++;
    return (range, inner) -> index(new CapturingGroupTree(source, range, name, index, inner, activeFlags));
  }

  protected String parseGroupName(char nameDelimiter) {
    StringBuilder sb = new StringBuilder();
    while (characters.isNotAtEnd() && !characters.currentIs(nameDelimiter)) {
      sb.append(characters.getCurrent().getCharacter());
      characters.moveNext();
    }
    String name = sb.toString();
    if (name.isEmpty()) {
      expected("a name for the group");
    }
    return name;
  }

  protected GroupTree parseNonCapturingGroup(SourceCharacter openingParen) {
    // Discard '?'
    characters.moveNext();
    if (characters.currentIs("R)") && source.supportsFeature(RegexFeature.RECURSION)) {
      return parseRecursion(openingParen);
    }
    if (characters.currentIs("(") && source.supportsFeature(RegexFeature.CONDITIONAL_SUBPATTERN)) {
      return parseConditionalSubpattern(openingParen);
    }

    FlagSet enabledFlags = parseFlags();
    FlagSet disabledFlags;
    if (characters.currentIs('-')) {
      characters.moveNext();
      disabledFlags = parseFlags();
    } else {
      disabledFlags = new FlagSet();
    }

    boolean previousFreeSpacingMode = characters.getFreeSpacingMode();
    if (disabledFlags.contains(Pattern.COMMENTS)) {
      characters.setFreeSpacingMode(false);
    } else if (enabledFlags.contains(Pattern.COMMENTS)) {
      characters.setFreeSpacingMode(true);
    }

    FlagSet previousFlags = activeFlags;
    if (!enabledFlags.isEmpty() || !disabledFlags.isEmpty()) {
      activeFlags = new FlagSet(activeFlags);
      activeFlags.addAll(enabledFlags);
      activeFlags.removeAll(disabledFlags);
    }
    if (characters.currentIs(')')) {
      SourceCharacter closingParen = characters.getCurrent();
      characters.moveNext();
      IndexRange range = openingParen.getRange().merge(closingParen.getRange());
      return new NonCapturingGroupTree(source, range, enabledFlags, disabledFlags, null, activeFlags);
    }
    if (characters.currentIs(':')) {
      characters.moveNext();
    } else {
      expected("flag or ':' or ')'");
    }
    GroupTree group = finishGroup(previousFreeSpacingMode, openingParen, (range, inner) ->
      new NonCapturingGroupTree(source, range, enabledFlags, disabledFlags, inner, activeFlags)
    );
    activeFlags = previousFlags;
    return group;
  }

  private GroupTree parseConditionalSubpattern(SourceCharacter openingParen) {
    GroupTree condition = parseCondition();
    RegexTree subpattern = parseDisjunction();
    SourceCharacter closingParen = characters.getCurrent();
    characters.moveNext();
    if (subpattern.is(RegexTree.Kind.DISJUNCTION)) {
      if (((DisjunctionTree) subpattern).getAlternatives().size() > 2) {
        error("More than two alternatives in the subpattern");
      }
      DisjunctionTree disjunction = (DisjunctionTree) subpattern;
      return new ConditionalSubpatternTree(source, openingParen, closingParen, condition, disjunction.getAlternatives().get(0),
        disjunction.getOrOperators().get(0), disjunction.getAlternatives().get(1), activeFlags);
    } else {
      return new ConditionalSubpatternTree(source, openingParen, closingParen, condition, subpattern, activeFlags);
    }
  }

  private GroupTree parseCondition() {
    SourceCharacter openingParen = characters.getCurrent();
    characters.moveNext();
    if (characters.currentIs("?=")) {
      characters.moveNext(2);
      return finishGroup(openingParen, (range, inner) -> LookAroundTree.positiveLookAhead(source, range, inner, activeFlags));
    } else if (characters.currentIs("?<=")) {
      characters.moveNext(3);
      return finishGroup(openingParen, (range, inner) -> LookAroundTree.positiveLookBehind(source, range, inner, activeFlags));
    } else if (characters.currentIs("?!")) {
      characters.moveNext(2);
      return finishGroup(openingParen, (range, inner) -> LookAroundTree.negativeLookAhead(source, range, inner, activeFlags));
    } else if (characters.currentIs("? LookAroundTree.negativeLookBehind(source, range, inner, activeFlags));
    } else if (characters.currentIs("+")) {
      // Skip '+' as first character since it would be identified as quantifier at the beginning of a sequence
      CharacterTree plus = readCharacter();
      return finishGroup(openingParen, (range, inner) -> conditionGroupReference(source, range, plus, inner, activeFlags));
    } else {
      // TODO Allow only valid conditions: signed sequence of digits or 'R'
      return finishGroup(openingParen, (range, inner) -> conditionGroupReference(source, range, null, inner, activeFlags));
    }
  }

  public ReferenceConditionTree conditionGroupReference(RegexSource source, IndexRange range, @Nullable CharacterTree plus, RegexTree inner, FlagSet activeFlags) {
    StringBuilder reference = new StringBuilder();
    if (plus != null) {
      reference.append('+');
    }
    if (inner.is(RegexTree.Kind.CHARACTER)) {
      reference.append(((CharacterTree) inner).characterAsString());
    } else if (inner.is(RegexTree.Kind.SEQUENCE)){
      ((SequenceTree) inner).getItems().stream()
        .filter(CharacterTree.class::isInstance)
        .map(i -> ((CharacterTree) i).characterAsString())
        .forEach(reference::append);
    } else {
      error("Conditional subpattern has invalid condition.");
    }
    return new ReferenceConditionTree(source, range, reference.toString(), activeFlags);
  }

  private GroupTree parseRecursion(SourceCharacter openingParen) {
    // Discard 'R'
    characters.moveNext();
    SourceCharacter closingParen = characters.getCurrent();
    characters.moveNext();
    IndexRange range = openingParen.getRange().merge(closingParen.getRange());
    return new NonCapturingGroupTree(source, range, new FlagSet(), new FlagSet(), null, activeFlags);
  }

  protected FlagSet parseFlags() {
    FlagSet flags = new FlagSet();
    while (characters.isNotAtEnd()) {
      Integer flag = parseFlag(characters.getCurrent().getCharacter());
      if (flag == null) {
        break;
      }
      flags.add(flag, characters.getCurrent());
      characters.moveNext();
    }
    return flags;
  }

  @CheckForNull
  protected static Integer parseFlag(char ch) {
    switch (ch) {
      case 'i':
        return Pattern.CASE_INSENSITIVE;
      case 'd':
        return Pattern.UNIX_LINES;
      case 'm':
        return Pattern.MULTILINE;
      case 's':
        return Pattern.DOTALL;
      case 'u':
        return Pattern.UNICODE_CASE;
      case 'x':
        return Pattern.COMMENTS;
      case 'U':
        return Pattern.UNICODE_CHARACTER_CLASS;
      default:
        return null;
    }
  }

  protected GroupTree finishGroup(SourceCharacter openingParen, GroupConstructor groupConstructor) {
    return finishGroup(characters.getFreeSpacingMode(), openingParen, groupConstructor);
  }

  protected GroupTree finishGroup(boolean previousFreeSpacingMode, SourceCharacter openingParen, GroupConstructor groupConstructor) {
    FlagSet previousFlagSet = activeFlags;
    RegexTree inner = parseDisjunction();
    activeFlags = previousFlagSet;
    characters.setFreeSpacingMode(previousFreeSpacingMode);
    if (characters.currentIs(')')) {
      characters.moveNext();
    } else {
      expected("')'");
    }
    IndexRange range = openingParen.getRange().extendTo(characters.getCurrentStartIndex());
    return groupConstructor.construct(range, inner);
  }

  protected RegexTree parseEscapeSequence() {
    SourceCharacter backslash = characters.getCurrent();
    characters.moveNext();
    if (characters.isAtEnd()) {
      expected("any character");
      return characterTree(backslash);
    } else if (isEscapedCharacterClass()) {
      return parseEscapedProperty(backslash);
    } else if (isEscapedBackReference()) {
      return parseNamedBackReference(backslash);
    } else {
      SourceCharacter character = characters.getCurrent();
      switch (character.getCharacter()) {
        case '0':
          if (source.supportsFeature(RegexFeature.PHP_BINARY_ZERO)) {
            return parsePhpOctalEscapeOrBinaryZero(backslash);
          } else {
            return parseOctalEscape(backslash);
          }
        case '1':
        case '2':
        case '3':
          if (source.supportsFeature(RegexFeature.PYTHON_OCTAL_ESCAPE)) {
            return parsePythonOctalEscapeOrNumericalBackReference(backslash);
          } else {
            return parseNumericalBackReference(backslash);
          }
        case '4':
        case '5':
        case '6':
        case '7':
        case '8':
        case '9':
          return parseNumericalBackReference(backslash);
        case 'b':
        case 'B':
        case 'A':
        case 'G':
        case 'Z':
        case 'z':
          return parseBoundary(backslash);
        case 'w':
        case 'W':
        case 'd':
        case 'D':
        case 'S':
        case 's':
        case 'h':
        case 'H':
        case 'v':
        case 'V':
          return parseEscapedCharacterClass(backslash);
        case 'u':
          return parseUnicodeEscape(backslash);
        case 'x':
          return parseHexEscape(backslash);
        case 't':
        case 'n':
        case 'r':
        case 'f':
        case 'a':
        case 'e':
          characters.moveNext();
          char c = simpleEscapeToCharacter(character.getCharacter());
          IndexRange range = backslash.getRange().extendTo(characters.getCurrentStartIndex());
          return characterTree(new SourceCharacter(source, range, c, true));
        case 'c':
          return parseControlSequence(backslash);
        case 'N':
          return parseNamedUnicodeCharacter(backslash);
        case 'R':
        case 'X':
          characters.moveNext();
          return new MiscEscapeSequenceTree(source, backslash.getRange().extendTo(characters.getCurrentStartIndex()), activeFlags);
        case 'E':
          error("\\E used without \\Q");
          // Fallthrough
        default:
          characters.moveNext();
          return new CharacterTree(source, backslash.getRange().merge(character.getRange()), character.getCharacter(),
            character.isEscapeSequence(), activeFlags);
      }
    }
  }

  private boolean isEscapedCharacterClass() {
    return (characters.currentIs('p') || characters.currentIs('P')) && supportsAnyOfFeatures(RegexFeature.ESCAPED_CHARACTER_CLASS);
  }

  private boolean isEscapedBackReference() {
    return (characters.currentIs('k') && supportsAnyOfFeatures(RegexFeature.DOTNET_SYNTAX_GROUP_NAME, RegexFeature.JAVA_SYNTAX_GROUP_NAME, RegexFeature.PERL_SYNTAX_GROUP_NAME))
      || (characters.currentIs('g') && supportsAnyOfFeatures(RegexFeature.PERL_SYNTAX_GROUP_NAME));
  }

  protected RegexTree parseNamedUnicodeCharacter(SourceCharacter backslash) {
    return parseEscapedSequence('{', '}', "a Unicode character name", content ->
      // TODO: Once we move to Java 9+, use Character.codePointOf to produce a CharacterTree with the named Unicode
      //       character instead of a MiscEscapeSequenceTree and produce a syntax error for illegal character names
      new MiscEscapeSequenceTree(source, backslash.getRange().merge(content.closer.getRange()), activeFlags)
    );
  }

  protected RegexTree parseControlSequence(SourceCharacter backslash) {
    SourceCharacter c = characters.getCurrent();
    characters.moveNext();
    if (characters.isAtEnd()) {
      expected("any character");
      return characterTree(c);
    }
    char controlCharacter = (char) (0x40 ^ characters.getCurrentChar());
    characters.moveNext();
    IndexRange range = backslash.getRange().extendTo(characters.getCurrentStartIndex());
    return characterTree(new SourceCharacter(source, range, controlCharacter, true));
  }

  protected static char simpleEscapeToCharacter(char escapeCharacter) {
    switch (escapeCharacter) {
      case 't':
        return '\t';
      case 'n':
        return '\n';
      case 'r':
        return '\r';
      case 'f':
        return '\f';
      case 'a':
        return '\u0007';
      case 'e':
        return '\u001B';
      default:
        throw new IllegalArgumentException("Unsupported argument for simpleEscapeToCharacter: " + escapeCharacter);
    }
  }

  protected RegexTree parseUnicodeEscape(SourceCharacter backslash) {
    // Discard 'u'
    characters.moveNext();
    char codeUnit = (char) parseFixedAmountOfHexDigits(4);
    return characterTree(new SourceCharacter(source, backslash.getRange().extendTo(characters.getCurrentStartIndex()), codeUnit, true));
  }

  protected RegexTree parseHexEscape(SourceCharacter backslash) {
    // Discard 'x'
    characters.moveNext();
    int codePoint = 0;
    if (characters.currentIs('{')) {
      // Discard '{'
      characters.moveNext();
      if (!isHexDigit(characters.getCurrentChar())) {
        expected(HEX_DIGIT);
      }
      while (isHexDigit(characters.getCurrentChar())) {
        codePoint *= 16;
        codePoint += parseHexDigit();
      }
      if (characters.currentIs('}')) {
        characters.moveNext();
      } else {
        expected(HEX_DIGIT + " or '}'");
      }
    } else {
      codePoint = parseFixedAmountOfHexDigits(2);
    }
    IndexRange range = backslash.getRange().extendTo(characters.getCurrentStartIndex());
    CharacterTree tree = new CharacterTree(source, range, codePoint, true, activeFlags);
    if (!Character.isValidCodePoint(codePoint)) {
      errors.add(new SyntaxError(tree, "Invalid Unicode code point"));
    }
    return tree;
  }

  protected int parseFixedAmountOfHexDigits(int amount) {
    int i = 0;
    char result = 0;
    while (i < amount && isHexDigit(characters.getCurrentChar())) {
      result *= 16;
      result += parseHexDigit();
      i++;
    }
    if (i < amount) {
      expected(HEX_DIGIT);
    }
    return result;
  }

  protected int parseHexDigit() {
    int value = Integer.parseInt("" + characters.getCurrent().getCharacter(), 16);
    characters.moveNext();
    return value;
  }

  protected RegexTree parseEscapedCharacterClass(SourceCharacter backslash) {
    RegexTree result = new EscapedCharacterClassTree(source, backslash, characters.getCurrent(), activeFlags);
    characters.moveNext();
    return result;
  }

  protected RegexTree parseEscapedProperty(SourceCharacter backslash) {
    return parseEscapedSequence('{', '}', "a property name",
      dh -> new EscapedCharacterClassTree(source, backslash, dh.marker, dh.opener, dh.closer, activeFlags));
  }

  protected RegexTree parseNamedBackReference(SourceCharacter backslash) {
    if(characters.currentIs("k<") && supportsAnyOfFeatures(RegexFeature.DOTNET_SYNTAX_GROUP_NAME, RegexFeature.JAVA_SYNTAX_GROUP_NAME)) {
      return parseNamedBackReference(backslash, '<', '>');
    } else if(characters.currentIs("k'") && supportsAnyOfFeatures(RegexFeature.DOTNET_SYNTAX_GROUP_NAME)) {
      return parseNamedBackReference(backslash, '\'', '\'');
    } else if((characters.currentIs("k{") || characters.currentIs("g{")) && supportsAnyOfFeatures(RegexFeature.PERL_SYNTAX_GROUP_NAME)) {
      return parseNamedBackReference(backslash, '{', '}');
    }
    characters.moveNext();
    expectedNamedBackReferenceOpener();
    return characterTree(backslash);
  }

  protected RegexTree parseNamedBackReference(SourceCharacter backslash, char opener, char closer) {
    return parseEscapedSequence(opener, closer, "a group name",
      dh -> collect(new BackReferenceTree(source, backslash, dh.marker, dh.opener, dh.closer, activeFlags)));
  }

  private void expectedNamedBackReferenceOpener() {
    StringJoiner joiner = new StringJoiner(" or ");
    joiner.setEmptyValue("valid name opener");
    if (source.supportsFeature(RegexFeature.DOTNET_SYNTAX_GROUP_NAME)) {
      joiner.add("'<'");
      joiner.add("'''");
    } else if (source.supportsFeature(RegexFeature.JAVA_SYNTAX_GROUP_NAME)) {
      joiner.add("'<'");
    }
    if (source.supportsFeature(RegexFeature.PERL_SYNTAX_GROUP_NAME)) {
      joiner.add("'{'");
    }
    expected(joiner.toString());
  }

  protected BackReferenceTree collect(BackReferenceTree backReference) {
    backReferences.add(backReference);
    return backReference;
  }

  protected CapturingGroupTree index(CapturingGroupTree capturingGroup) {
    capturingGroups.put(Integer.toString(capturingGroup.getGroupNumber()), capturingGroup);
    capturingGroup.getName().ifPresent(name -> capturingGroups.put(name, capturingGroup));
    return capturingGroup;
  }

  protected RegexTree parseEscapedSequence(char opener, char closer, String expected, Function builder) {
    SourceCharacter marker = characters.getCurrent();
    characters.moveNext();

    if (!characters.currentIs(opener)) {
      expected(("'" + opener + "'"));
      return characterTree(marker);
    }
    SourceCharacter openerChar = characters.getCurrent();
    boolean atLeastOneChar = false;
    do {
      characters.moveNext();
      if (characters.isAtEnd()) {
        expected(atLeastOneChar ? ("'" + closer + "'") : expected);
        return characterTree(openerChar);
      }
      if (!atLeastOneChar && characters.currentIs(closer)) {
        expected(expected);
        return characterTree(openerChar);
      }
      atLeastOneChar = true;
    } while (!characters.currentIs(closer));
    SourceCharacter closerChar = characters.getCurrent();
    characters.moveNext();
    return builder.apply(new EscapedSequenceDataHolder(marker, openerChar, closerChar));
  }

  protected static final class EscapedSequenceDataHolder {
    private final SourceCharacter marker;
    private final SourceCharacter opener;
    private final SourceCharacter closer;

    private EscapedSequenceDataHolder(SourceCharacter marker, SourceCharacter opener, SourceCharacter closer) {
      this.marker = marker;
      this.opener = opener;
      this.closer = closer;
    }
  }

  /**
   * Parses a numerical back reference greedily, taking as many numbers as it can. The first digit is always treated
   * as a back reference, but multi digit numbers are only treated as a back reference if at least that many back
   * references exist at this point in the regex. See {@link java.util.regex.Pattern#ref(int refNum)}
   */
  protected RegexTree parseNumericalBackReference(SourceCharacter backslash) {
    SourceCharacter firstDigit = characters.getCurrent();
    SourceCharacter lastDigit = firstDigit;
    int referenceNumber = firstDigit.getCharacter() - '0';
    do {
      characters.moveNext();
      if (!characters.isAtEnd()) {
        SourceCharacter currentChar = characters.getCurrent();
        char asChar = currentChar.getCharacter();
        int newReferenceNumber = (referenceNumber * 10) + (asChar - '0');
        boolean matchingGroupExistsAtThisPoint = newReferenceNumber < groupNumber;
        if (isAsciiDigit(asChar) && matchingGroupExistsAtThisPoint) {
          lastDigit = currentChar;
          referenceNumber = newReferenceNumber;
        } else {
          break;
        }
      }
    } while (!characters.isAtEnd());
    return collect(new BackReferenceTree(source, backslash, null, firstDigit, lastDigit, activeFlags));
  }

  protected RegexTree parseOctalEscape(SourceCharacter backslash) {
    // Discard '0'
    characters.moveNext();
    char byteValue = 0;
    int i = 0;
    while (i < 3 && isOctalDigit(characters.getCurrentChar())) {
      int newValue = byteValue * 8 + characters.getCurrentChar() - '0';
      if (newValue > 0xFF) {
        break;
      }
      byteValue = (char) newValue;
      characters.moveNext();
      i++;
    }
    if (i == 0) {
      expected("octal digit");
    }
    IndexRange range = backslash.getRange().extendTo(characters.getCurrentStartIndex());
    return characterTree(new SourceCharacter(source, range, byteValue, true));
  }

  protected RegexTree parsePythonOctalEscapeOrNumericalBackReference(SourceCharacter backslash) {
    var firstDigit = characters.getCurrent();
    var lastDigit = firstDigit;
    var escapedNumberSb = new StringBuilder();
    do {
      if (!isAsciiDigit(characters.getCurrent().getCharacter())) {
        break;
      }

      lastDigit = characters.getCurrent();
      escapedNumberSb.append(lastDigit.getCharacter());
      characters.moveNext();
    } while (!characters.isAtEnd() && !isOctalEscape(escapedNumberSb));

    if (isOctalEscape(escapedNumberSb)) {
      var range = backslash.getRange().extendTo(characters.getCurrentStartIndex());
      char escapedChar = (char) Integer.parseInt(escapedNumberSb, 0, escapedNumberSb.length(), 8);
      return characterTree(new SourceCharacter(source, range, escapedChar, true));
    } else {
      return collect(new BackReferenceTree(source, backslash, null, firstDigit, lastDigit, activeFlags));
    }
  }

  protected RegexTree parsePhpOctalEscapeOrBinaryZero(SourceCharacter backslash) {
    int nextCharacter = characters.lookAhead(1);
    if (nextCharacter == '\\' || nextCharacter == EOF) {
      // \0 is followed by another escape sequence or is at the end of the regex. Should be treated as a binary zero.
      characters.moveNext();
      return characterTree(new SourceCharacter(source, backslash.getRange().extendTo(characters.getCurrentStartIndex()), '\0', true));
    }
    return parseOctalEscape(backslash);
  }

  private static boolean isOctalEscape(CharSequence escapedDigit) {
    try {
      return escapedDigit.length() == 3 && Integer.parseInt(escapedDigit, 0, escapedDigit.length(), 8) <= 255;
    } catch (NumberFormatException e) {
      return false;
    }
  }

  protected RegexTree parseBoundary(SourceCharacter backslash) {
    if (characters.currentIs("b{")) {
      return parseEscapedSequence(
        '{',
        '}',
        "an Unicode extended grapheme cluster",
        dh -> new BoundaryTree(source, BoundaryTree.Type.UNICODE_EXTENDED_GRAPHEME_CLUSTER, backslash.getRange().merge(dh.closer.getRange()), activeFlags));
    }
    SourceCharacter boundary = characters.getCurrent();
    characters.moveNext();
    return new BoundaryTree(source, BoundaryTree.Type.forKey(boundary.getCharacter()), backslash.getRange().merge(boundary.getRange()), activeFlags);
  }

  protected CharacterClassTree parseCharacterClass() {
    SourceCharacter openingBracket = characters.getCurrent();
    characters.moveNext();
    boolean negated = false;
    if (characters.currentIs('^')) {
      characters.moveNext();
      negated = true;
    }
    CharacterClassElementTree contents = parseCharacterClassIntersection();
    if (characters.currentIs(']')) {
      characters.moveNext();
    } else {
      expected("']'");
    }
    IndexRange range = openingBracket.getRange().extendTo(characters.getCurrentStartIndex());
    return new CharacterClassTree(source, range, openingBracket, negated, contents, activeFlags);
  }

  protected CharacterClassElementTree parseCharacterClassIntersection() {
    FlagSet characterClassFlags = activeFlags;
    List elements = new ArrayList<>();
    List andOperators = new ArrayList<>();
    elements.add(parseCharacterClassUnion(true));
    while (characters.currentIs("&&")) {
      SourceCharacter firstAnd = characters.getCurrent();
      characters.moveNext();
      SourceCharacter secondAnd = characters.getCurrent();
      characters.moveNext();
      andOperators.add(new RegexToken(source, firstAnd.getRange().merge(secondAnd.getRange())));
      elements.add(parseCharacterClassUnion(false));
    }
    return combineTrees(elements, (range, items) -> new CharacterClassIntersectionTree(source, range, items, andOperators, characterClassFlags));
  }

  protected CharacterClassElementTree parseCharacterClassUnion(boolean isAtBeginning) {
    FlagSet characterClassFlags = activeFlags;
    List elements = new ArrayList<>();
    CharacterClassElementTree element = parseCharacterClassElement(isAtBeginning);
    while (element != null) {
      elements.add(element);
      element = parseCharacterClassElement(false);
    }
    if (elements.isEmpty()) {
      IndexRange range = new IndexRange(characters.getCurrentStartIndex(), characters.getCurrentStartIndex());
      return new CharacterClassUnionTree(source, range, elements, characterClassFlags);
    } else {
      return combineTrees(elements, (range, items) -> new CharacterClassUnionTree(source, range, items, characterClassFlags));
    }
  }

  @CheckForNull
  protected CharacterClassElementTree parseCharacterClassElement(boolean isAtBeginning) {
    if (characters.currentIs("[:") && source.supportsFeature(RegexFeature.POSIX_CHARACTER_CLASS)) {
      PosixCharacterClassElementTree tree = parsePosixCharacterClass();
      if (tree != null) return tree;
    }
    if (characters.isInQuotingMode() && characters.isNotAtEnd()) {
      return readCharacter();
    }
    if (characters.isAtEnd() || characters.currentIs("&&")) {
      return null;
    }
    SourceCharacter startCharacter = characters.getCurrent();
    switch (startCharacter.getCharacter()) {
      case '\\':
        RegexTree escape = parseEscapeSequence();
        if (escape.is(RegexTree.Kind.CHARACTER)) {
          return parseCharacterRange((CharacterTree) escape);
        } else if (escape instanceof CharacterClassElementTree) {
          return (CharacterClassElementTree) escape;
        } else {
          errors.add(new SyntaxError(escape, "Invalid escape sequence inside character class"));
          // Produce dummy AST and keep parsing to catch more errors.
          // The 'x' here doesn't matter because we're not going to actually use the AST when there are syntax errors.
          return characterTree(new SourceCharacter(source, escape.getRange(), 'x'));
        }
      case ']':
        if (isAtBeginning) {
          characters.moveNext();
          return parseCharacterRange(characterTree(startCharacter));
        } else {
          return null;
        }
      case '[':
        if (supportsAnyOfFeatures(RegexFeature.NESTED_CHARTER_CLASS)) {
          return parseCharacterClass();
        }
        // no break is expected
      default:
        characters.moveNext();
        return parseCharacterRange(characterTree(startCharacter));
    }
  }

  @CheckForNull
  protected PosixCharacterClassElementTree parsePosixCharacterClass() {
    SourceCharacter openingBracket = characters.getCurrent();
    boolean isNegation = characters.lookAhead(2) == '^';
    Map posixLookup = isNegation ? POSIX_CHARACTER_CLASS_NEGATION_LOOKUP : POSIX_CHARACTER_CLASS_LOOKUP;
    Optional> posixClass = posixLookup.entrySet().stream()
      .filter(posix -> characters.currentIs(posix.getKey())).findFirst();
    if (posixClass.isPresent()) {
      characters.moveNext(posixClass.get().getKey().length());
      return new PosixCharacterClassElementTree(source, openingBracket, characters.getCurrent(), isNegation, posixClass.get().getValue(), activeFlags);
    }
    return null;
  }

  protected CharacterClassElementTree parseCharacterRange(CharacterTree startCharacter) {
    if (characters.currentIs('-') && !characters.isInQuotingMode()) {
      int lookAhead = characters.lookAhead(1);
      if (lookAhead == EOF || lookAhead == ']') {
        return startCharacter;
      } else if (lookAhead == '\\') {
        characters.moveNext();
        SourceCharacter backslash = characters.getCurrent();
        RegexTree escape = parseEscapeSequence();
        if (escape.is(RegexTree.Kind.CHARACTER)) {
          return characterRange(startCharacter, (CharacterTree) escape);
        } else {
          expected("simple character", escape);
          return characterRange(startCharacter, characterTree(backslash));
        }
      } else {
        characters.moveNext();
        SourceCharacter endCharacter = characters.getCurrent();
        characters.moveNext();
        return characterRange(startCharacter, characterTree(endCharacter));
      }
    } else {
      return startCharacter;
    }
  }

  protected CharacterTree characterTree(SourceCharacter character) {
    char c1 = character.getCharacter();
    if (Character.isHighSurrogate(c1)) {
      // c1 is in the range from '\uD800' to '\uDBFF', it should be the first char of a series of two,
      // and it is one 'Supplementary Multilingual Plane' character encoded using UTF-16
      char c2 = (char) characters.getCurrentChar();
      if (c2 == '\\') {
        // skip '\\u'
        characters.moveNext(2);
        int codePoint = parseFixedAmountOfHexDigits(4);
        IndexRange newRange = new IndexRange(character.getRange().getBeginningOffset(), character.getRange().getEndingOffset() + 1);
        return new CharacterTree(character.getSource(), newRange, Character.toCodePoint(c1, (char) codePoint), true, activeFlags);
      } else if (Character.isLowSurrogate(c2)) {
        characters.moveNext();
        // c2 is in the range from '\uDC00' to '\uDFFF' it's the second part of the UTF-16 code point
        IndexRange newRange = new IndexRange(character.getRange().getBeginningOffset(), character.getRange().getEndingOffset() + 1);
        return new CharacterTree(character.getSource(), newRange, Character.toCodePoint(c1, c2), true, activeFlags);
      } else {
        LOG.warn("Couldn't parse '{}{}', two high surrogate characters in a row. Please check your encoding.", c1, c2);
      }
    }
    return new CharacterTree(source, character.getRange(), character.getCharacter(), character.isEscapeSequence(), activeFlags);
  }

  protected CharacterRangeTree characterRange(CharacterTree startCharacter, CharacterTree endCharacter) {
    IndexRange range = startCharacter.getRange().merge(endCharacter.getRange());
    CharacterRangeTree characterRange = new CharacterRangeTree(source, range, startCharacter, endCharacter, activeFlags);
    if (startCharacter.codePointOrUnit() > endCharacter.codePointOrUnit()) {
      errors.add(new SyntaxError(characterRange, "Illegal character range"));
    }
    return characterRange;
  }

  protected void expected(String expectedToken, String actual) {
    error("Expected " + expectedToken + ", but found " + actual);
  }

  protected void expected(String expectedToken, RegexSyntaxElement actual) {
    expected(expectedToken, "'" + actual.getText() + "'");
  }

  protected void expected(String expectedToken) {
    String actual = characters.isAtEnd() ? "the end of the regex" : ("'" + characters.getCurrent().getCharacter() + "'");
    expected(expectedToken, actual);
  }

  protected void error(String message) {
    IndexRange range = characters.getCurrentIndexRange();
    RegexToken offendingToken = new RegexToken(source, range);
    errors.add(new SyntaxError(offendingToken, message));
  }

  protected boolean supportsAnyOfFeatures(RegexFeature... features) {
    return Arrays.stream(features).anyMatch(source::supportsFeature);
  }

  protected static  T combineTrees(List elements, TreeConstructor treeConstructor) {
    if (elements.size() == 1) {
      return elements.get(0);
    } else {
      IndexRange range = elements.get(0).getRange().merge(elements.get(elements.size() - 1).getRange());
      return treeConstructor.construct(range, elements);
    }
  }

  protected interface TreeConstructor {
    T construct(IndexRange range, List elements);
  }

  protected interface GroupConstructor {
    GroupTree construct(IndexRange range, RegexTree element);
  }

  protected static boolean isAsciiDigit(int c) {
    return '0' <= c && c <= '9';
  }

  protected static boolean isOctalDigit(int c) {
    return '0' <= c && c <= '7';
  }

  protected static boolean isHexDigit(int c) {
    return ('0' <= c && c <= '9') || ('a' <= c && c <= 'f') || ('A' <= c && c <= 'F');
  }

  protected boolean isPlainTextCharacter(int c) {
    if (c == '{') {
      return supportsAnyOfFeatures(RegexFeature.UNESCAPED_CURLY_BRACKET);
    }

    switch (c) {
      case EOF:
      case '(':
      case ')':
      case '\\':
      case '*':
      case '+':
      case '?':
      case '|':
      case '[':
      case '.':
        return false;
      default:
        return true;
    }
  }

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy