org.sonarsource.analyzer.commons.regex.finders.UnicodeUnawareCharClassFinder Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of sonar-regex-parsing Show documentation
Logic useful to read and analyze regular expressions
The newest version!
/*
 * SonarSource Analyzers Regex Parsing Commons
 * Copyright (C) 2009-2024 SonarSource SA
 * mailto:info AT sonarsource DOT com
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 3 of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
 */
package org.sonarsource.analyzer.commons.regex.finders;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import org.sonarsource.analyzer.commons.regex.RegexIssueLocation;
import org.sonarsource.analyzer.commons.regex.RegexIssueReporter;
import org.sonarsource.analyzer.commons.regex.RegexParseResult;
import org.sonarsource.analyzer.commons.regex.ast.CharacterRangeTree;
import org.sonarsource.analyzer.commons.regex.ast.EscapedCharacterClassTree;
import org.sonarsource.analyzer.commons.regex.ast.NonCapturingGroupTree;
import org.sonarsource.analyzer.commons.regex.ast.RegexBaseVisitor;
import org.sonarsource.analyzer.commons.regex.ast.RegexTree;

public class UnicodeUnawareCharClassFinder extends RegexBaseVisitor {

  private static final List unicodeAwareClassesWithFlag = Arrays.asList('s', 'S', 'w', 'W');
  private static final Set unicodeAwarePropertiesWithFlag = new HashSet<>(Arrays.asList(
    "Lower", "Upper", "Alpha", "Alnum", "Punct", "Graph", "Print", "Blank", "Space"));

  private static final Map unicodeUnawareCharacterRanges = new HashMap<>();
  static {
    unicodeUnawareCharacterRanges.put('a', 'z');
    unicodeUnawareCharacterRanges.put('A', 'Z');
  }

  private final RegexIssueReporter.ElementIssue regexElementIssueReporter;
  private final RegexIssueReporter.InvocationIssue invocationIssueReporter;

  private final List unicodeUnawareRanges = new ArrayList<>();
  private final List unicodeAwareWithFlag = new ArrayList<>();
  private boolean containsUnicodeCharacterFlag = false;

  public UnicodeUnawareCharClassFinder(RegexIssueReporter.ElementIssue regexElementIssueReporter, RegexIssueReporter.InvocationIssue invocationIssueReporter) {
    this.regexElementIssueReporter = regexElementIssueReporter;
    this.invocationIssueReporter = invocationIssueReporter;
  }

  @Override
  protected void before(RegexParseResult regexParseResult) {
    containsUnicodeCharacterFlag |= regexParseResult.getInitialFlags().contains(Pattern.UNICODE_CHARACTER_CLASS);
  }

  @Override
  protected void after(RegexParseResult regexParseResult) {
    int unicodeUnawareRangeSize = unicodeUnawareRanges.size();
    if (unicodeUnawareRangeSize == 1) {
      regexElementIssueReporter.report(unicodeUnawareRanges.get(0), "Replace this character range with a Unicode-aware character class.", null, Collections.emptyList());
    } else if (unicodeUnawareRangeSize > 1) {
      List secondaries = unicodeUnawareRanges.stream()
        .map(tree -> new RegexIssueLocation(tree, "Character range"))
        .collect(Collectors.toList());
      regexElementIssueReporter.report(regexParseResult.getResult(), "Replace these character ranges with Unicode-aware character classes.", null, secondaries);
    }


    if (!unicodeAwareWithFlag.isEmpty() && !containsUnicodeCharacterFlag) {
      List secondaries = unicodeAwareWithFlag.stream()
        .map(tree -> new RegexIssueLocation(tree, "Predefined/POSIX character class"))
        .collect(Collectors.toList());
      invocationIssueReporter.report("Enable the \"u\" flag or use a Unicode-aware alternative.", null, secondaries);
    }
  }

  @Override
  public void visitCharacterRange(CharacterRangeTree tree) {
    int lowerBound = tree.getLowerBound().codePointOrUnit();
    if (lowerBound < 0xFFFF) {
      Character expectedUpperBoundChar = unicodeUnawareCharacterRanges.get((char) lowerBound);
      if (expectedUpperBoundChar != null && expectedUpperBoundChar == tree.getUpperBound().codePointOrUnit()) {
        unicodeUnawareRanges.add(tree);
      }
    }
  }

  @Override
  public void visitEscapedCharacterClass(EscapedCharacterClassTree tree) {
    String property = tree.property();
    if ((property != null && unicodeAwarePropertiesWithFlag.contains(property)) ||
      unicodeAwareClassesWithFlag.contains(tree.getType())) {

      unicodeAwareWithFlag.add(tree);
    }
  }

  @Override
  public void visitNonCapturingGroup(NonCapturingGroupTree tree) {
    containsUnicodeCharacterFlag |= tree.activeFlags().contains(Pattern.UNICODE_CHARACTER_CLASS);
    super.visitNonCapturingGroup(tree);
  }
}