All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.sonarsource.analyzer.commons.regex.helpers.SimplifiedRegexCharacterClass Maven / Gradle / Ivy

The newest version!
/*
 * SonarSource Analyzers Regex Parsing Commons
 * Copyright (C) 2009-2024 SonarSource SA
 * mailto:info AT sonarsource DOT com
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 3 of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
 */
package org.sonarsource.analyzer.commons.regex.helpers;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.TreeMap;
import java.util.regex.Pattern;
import javax.annotation.Nullable;
import org.sonarsource.analyzer.commons.regex.ast.AutomatonState;
import org.sonarsource.analyzer.commons.regex.ast.CharacterClassElementTree;
import org.sonarsource.analyzer.commons.regex.ast.CharacterClassIntersectionTree;
import org.sonarsource.analyzer.commons.regex.ast.CharacterClassTree;
import org.sonarsource.analyzer.commons.regex.ast.CharacterRangeTree;
import org.sonarsource.analyzer.commons.regex.ast.CharacterTree;
import org.sonarsource.analyzer.commons.regex.ast.DotTree;
import org.sonarsource.analyzer.commons.regex.ast.EscapedCharacterClassTree;
import org.sonarsource.analyzer.commons.regex.ast.MiscEscapeSequenceTree;
import org.sonarsource.analyzer.commons.regex.ast.RegexBaseVisitor;
import org.sonarsource.analyzer.commons.regex.ast.RegexSyntaxElement;

public class SimplifiedRegexCharacterClass {

  /**
   * This map defines the contents of the character class in the following way:
* For any entry {@code codepoint -> tree}, all the codepoints from {@code codepoint} up to (and excluding) the next * entry are in the character class and belong to the given tree.
* For any entry {@code codepoint -> null}, all the codepoints from {@code codepoint} up to (and excluding) the next * entry are not part of the character class.
* So a codepoint is contained in this class if and only if {@code contents.floorEntry(codePoint).getValue()} is * non-null and the tree returned by {@code getValue} will be the element of the character class which matches that * code point. */ private TreeMap contents = new TreeMap<>(); private boolean containsUnknownCharacters = false; public SimplifiedRegexCharacterClass() { } public SimplifiedRegexCharacterClass(CharacterClassElementTree tree) { add(tree); } public SimplifiedRegexCharacterClass(DotTree tree) { add(tree); } @Nullable public static SimplifiedRegexCharacterClass of(AutomatonState tree) { if (tree instanceof CharacterClassElementTree) { return new SimplifiedRegexCharacterClass((CharacterClassElementTree) tree); } else if (tree instanceof DotTree) { return new SimplifiedRegexCharacterClass((DotTree) tree); } else { return null; } } public boolean isEmpty() { return contents.isEmpty() && !containsUnknownCharacters; } public void add(CharacterClassElementTree tree) { new Builder(this).visitInCharClass(tree); } public void add(DotTree tree) { char[] orderedExcludedCharacters; if (tree.activeFlags().contains(Pattern.DOTALL)) { orderedExcludedCharacters = new char[] {}; } else if (tree.activeFlags().contains(Pattern.UNIX_LINES)) { orderedExcludedCharacters = new char[] {'\n'}; } else { orderedExcludedCharacters = new char[] {'\n', '\r', '\u0085', '\u2028', '\u2029'}; } int from = 0; for (char excludedCharacter : orderedExcludedCharacters) { int to = excludedCharacter - 1; if (to > from) { addRange(from, to, tree); } from = excludedCharacter + 1; } addRange(from, Character.MAX_CODE_POINT, tree); } public boolean matchesAnyCharacter() { return contents.containsKey(0) && !contents.containsValue(null); } public boolean intersects(SimplifiedRegexCharacterClass that, boolean defaultAnswer) { if (defaultAnswer && ((containsUnknownCharacters && !that.isEmpty()) || (!isEmpty() && that.containsUnknownCharacters))) { return true; } return !findIntersections(that, true).isEmpty(); } public List findIntersections(SimplifiedRegexCharacterClass that) { return findIntersections(that, false); } private List findIntersections(SimplifiedRegexCharacterClass that, boolean stopAtFirst) { Iterator> iter = that.contents.entrySet().iterator(); List intersections = new ArrayList<>(); if (!iter.hasNext()) { return intersections; } Map.Entry entry = iter.next(); while (iter.hasNext()) { Map.Entry nextEntry = iter.next(); int to = (nextEntry.getValue() == null) ? (nextEntry.getKey() - 1) : nextEntry.getKey(); RegexSyntaxElement value = entry.getValue(); if (value != null && hasEntryBetween(entry.getKey(), to)) { intersections.add(value); if (stopAtFirst) { return intersections; } } entry = nextEntry; } RegexSyntaxElement value = entry.getValue(); if (value != null && hasEntryBetween(entry.getKey(), Character.MAX_CODE_POINT)) { intersections.add(value); } return intersections; } /** * @param from inclusive * @param to inclusive */ private boolean hasEntryBetween(int from, int to) { Map.Entry before = contents.floorEntry(from); return ((before != null && before.getValue() != null) || !contents.subMap(from, false, to, true).isEmpty()); } public boolean supersetOf(SimplifiedRegexCharacterClass that, boolean defaultAnswer) { if ((isEmpty() && !that.isEmpty()) || (that.containsUnknownCharacters && !defaultAnswer)) { return false; } Iterator> thatIter = that.contents.entrySet().iterator(); if (!thatIter.hasNext()) { // that.contents is empty, any set is a superset of it return true; } Map.Entry thatEntry = thatIter.next(); while (thatIter.hasNext()) { Map.Entry thatNextEntry = thatIter.next(); if (notSupersetOfEntries(thatEntry, thatNextEntry)) { return false; } thatEntry = thatNextEntry; } if (thatEntry.getValue() == null) { return true; } Map.Entry lastEntry = contents.lastEntry(); return lastEntry.getValue() != null && lastEntry.getKey() <= thatEntry.getKey(); } private boolean notSupersetOfEntries(Map.Entry thatEntry, Map.Entry thatNextEntry) { if (thatEntry.getValue() != null) { Map.Entry thisBefore = contents.floorEntry(thatEntry.getKey()); if (thisBefore == null || thisBefore.getValue() == null) { return true; } int to = (thatNextEntry.getValue() == null) ? (thatNextEntry.getKey() - 1) : thatNextEntry.getKey(); return contents.subMap(thatEntry.getKey(), false, to, true).values().stream() .anyMatch(Objects::isNull); } return false; } public void addRange(int from, int to, RegexSyntaxElement tree) { Map.Entry oldEntry = contents.floorEntry(to); Integer oldEnd = oldEntry == null ? null : contents.higherKey(oldEntry.getKey()); contents.put(from, tree); for (Map.Entry entry : contents.subMap(from, false, to, true).entrySet()) { if (entry.getValue() == null) { entry.setValue(tree); } } int next = to + 1; if (next <= Character.MAX_CODE_POINT) { if (oldEntry != null && oldEntry.getValue() != null && (oldEnd == null || oldEnd > next)) { contents.put(next, oldEntry.getValue()); } else if (!contents.containsKey(next)) { contents.put(next, null); } } } private static class Builder extends RegexBaseVisitor { private SimplifiedRegexCharacterClass characters; public Builder(SimplifiedRegexCharacterClass characters) { this.characters = characters; } @Override public void visitCharacter(CharacterTree tree) { addRange(tree.codePointOrUnit(), tree.codePointOrUnit(), tree); } @Override public void visitCharacterRange(CharacterRangeTree tree) { addRange(tree.getLowerBound().codePointOrUnit(), tree.getUpperBound().codePointOrUnit(), tree); } @Override public void visitMiscEscapeSequence(MiscEscapeSequenceTree tree) { characters.containsUnknownCharacters = true; } @Override public void visitCharacterClass(CharacterClassTree tree) { if (tree.isNegated()) { SimplifiedRegexCharacterClass old = characters; SimplifiedRegexCharacterClass inner = new SimplifiedRegexCharacterClass(); characters = inner; super.visitCharacterClass(tree); characters = old; if (inner.containsUnknownCharacters) { // When negating a class that contains unknown characters, we can't know for sure whether any character is in the // class, so we don't add any known characters to it characters.containsUnknownCharacters = true; characters.contents = new TreeMap<>(); return; } boolean lastInsertedIsNotNull = false; if (inner.contents.get(0) == null) { characters.contents.put(0, tree); lastInsertedIsNotNull = true; } for (Map.Entry entry : inner.contents.entrySet()) { if (entry.getValue() == null) { characters.contents.put(entry.getKey(), tree); lastInsertedIsNotNull = true; } else if (lastInsertedIsNotNull) { characters.contents.put(entry.getKey(), null); lastInsertedIsNotNull = false; } } } else { super.visitCharacterClass(tree); } } @Override public void visitCharacterClassIntersection(CharacterClassIntersectionTree tree) { characters.containsUnknownCharacters = true; } @Override public void visitEscapedCharacterClass(EscapedCharacterClassTree tree) { switch (tree.getType()) { case 'd': characters.addRange('0', '9', tree); if (tree.activeFlags().contains(Pattern.UNICODE_CHARACTER_CLASS)) { characters.containsUnknownCharacters = true; } break; case 'D': characters.addRange(0x00, '0' - 1, tree); if (tree.activeFlags().contains(Pattern.UNICODE_CHARACTER_CLASS)) { characters.addRange('9' + 1, 0xff, tree); characters.containsUnknownCharacters = true; } else { characters.addRange('9' + 1, Character.MAX_CODE_POINT, tree); } break; case 'w': characters.addRange('0', '9', tree); characters.addRange('A', 'Z', tree); characters.addRange('_', '_', tree); characters.addRange('a', 'z', tree); if (tree.activeFlags().contains(Pattern.UNICODE_CHARACTER_CLASS)) { characters.containsUnknownCharacters = true; } break; case 'W': characters.addRange(0x00, '0' - 1, tree); characters.addRange('9' + 1, 'A' - 1, tree); characters.addRange('Z'+1, '_' - 1, tree); characters.addRange('`', '`', tree); if (tree.activeFlags().contains(Pattern.UNICODE_CHARACTER_CLASS)) { characters.addRange('z' + 1, 'µ' - 1, tree); characters.containsUnknownCharacters = true; } else { characters.addRange('z' + 1, Character.MAX_CODE_POINT, tree); } break; case 's': characters.addRange('\t', '\r', tree); characters.addRange(' ', ' ', tree); if (tree.activeFlags().contains(Pattern.UNICODE_CHARACTER_CLASS)) { characters.addRange(0x85, 0x85, tree); characters.addRange(0xA0, 0xA0, tree); characters.addRange(0x1680, 0x1680, tree); characters.addRange(0x2000, 0x200A, tree); characters.addRange(0x2028, 0x2029, tree); characters.addRange(0x202F, 0x202F, tree); characters.addRange(0x205F, 0x205F, tree); characters.addRange(0x3000, 0x3000, tree); } break; case 'S': characters.addRange(0x00, '\t' - 1, tree); characters.addRange('\r' + 1, ' ' - 1, tree); if (tree.activeFlags().contains(Pattern.UNICODE_CHARACTER_CLASS)) { characters.addRange(' ' + 1, 0x84, tree); characters.addRange(0x86, 0x9F, tree); characters.addRange(0xA1, 0x167F, tree); characters.addRange(0x1681, 0x1FFF, tree); characters.addRange(0x200B, 0x2027, tree); characters.addRange(0x202A, 0x202E, tree); characters.addRange(0x2030, 0x205E, tree); characters.addRange(0x2060, 0x2FFF, tree); characters.addRange(0x3001, Character.MAX_CODE_POINT, tree); } else { characters.addRange(' ' + 1, Character.MAX_CODE_POINT, tree); } break; default: characters.containsUnknownCharacters = true; break; } } private void addRange(int from, int to, CharacterClassElementTree tree) { characters.addRange(from, to, tree); if (tree.activeFlags().contains(Pattern.CASE_INSENSITIVE)) { addCaseInsensitiveRangeFor(from, to, tree, 'A', 'Z', 'a', 'z'); if (tree.activeFlags().contains(Pattern.UNICODE_CASE)) { addCaseInsensitiveRangeFor(from, to, tree, 'À', 'Þ', 'à', 'þ'); } } } private void addCaseInsensitiveRangeFor(int from, int to, CharacterClassElementTree tree, char upperStart, char upperEnd, char lowerStart, char lowerEnd) { final int lowerCaseShift = lowerStart - upperStart; if (from <= upperEnd && to >= upperStart) { characters.addRange(Math.max(from, upperStart) + lowerCaseShift, Math.min(to, upperEnd) + lowerCaseShift, tree); } if (from <= lowerEnd && to >= lowerStart) { characters.addRange(Math.max(from, lowerStart) - lowerCaseShift, Math.min(to, lowerEnd) - lowerCaseShift, tree); } } } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy