org.antlr.v4.misc.EscapeSequenceParsing Maven / Gradle / Ivy
/*
* Copyright (c) 2012-2017 The ANTLR Project. All rights reserved.
* Use of this file is governed by the BSD 3-clause license that
* can be found in the LICENSE.txt file in the project root.
*/
package org.antlr.v4.misc;
import org.antlr.v4.runtime.misc.IntervalSet;
import org.antlr.v4.unicode.UnicodeData;
import java.util.Objects;
/**
 * Utility class to parse escapes like:
 *   \\n
 *   \\uABCD
 *   \\u{10ABCD}
 *   \\p{Foo}
 *   \\P{Bar}
 *   \\p{Baz=Blech}
 *   \\P{Baz=Blech}
 *
 * <p>The single entry point is {@link #parseEscape(String, int)}; it never
 * returns {@code null} and reports failures through a {@link Result} whose
 * type is {@link Result.Type#INVALID}.
 */
public abstract class EscapeSequenceParsing {
    /**
     * Immutable outcome of parsing one escape sequence.
     *
     * <p>Note that {@link #equals(Object)}, {@link #hashCode()} and
     * {@link #toString()} deliberately mirror each other and do not take
     * {@link #startOffset} into account.
     */
    public static class Result {
        /** Kind of result produced by {@link EscapeSequenceParsing#parseEscape(String, int)}. */
        public enum Type {
            /** No valid escape sequence was recognized. */
            INVALID,
            /** The escape denotes a single code point, stored in {@link Result#codePoint}. */
            CODE_POINT,
            /** The escape denotes a property set, stored in {@link Result#propertyIntervalSet}. */
            PROPERTY
        }

        /** What kind of escape (if any) was recognized. */
        public final Type type;
        /** The code point for CODE_POINT results; the parser stores -1 for PROPERTY and 0 for INVALID. */
        public final int codePoint;
        /** The code points of the property for PROPERTY results; the parser stores the empty set otherwise. */
        public final IntervalSet propertyIntervalSet;
        /** Offset in the input at which parsing started. */
        public final int startOffset;
        /** Number of chars covered by the escape (or by the invalid span), counted from {@link #startOffset}. */
        public final int parseLength;

        public Result(Type type, int codePoint, IntervalSet propertyIntervalSet, int startOffset, int parseLength) {
            this.type = type;
            this.codePoint = codePoint;
            this.propertyIntervalSet = propertyIntervalSet;
            this.startOffset = startOffset;
            this.parseLength = parseLength;
        }

        @Override
        public String toString() {
            return String.format(
                    "%s type=%s codePoint=%d propertyIntervalSet=%s parseLength=%d",
                    super.toString(),
                    type,
                    codePoint,
                    propertyIntervalSet,
                    parseLength);
        }

        @Override
        public boolean equals(Object other) {
            if (this == other) {
                return true;
            }
            if (!(other instanceof Result)) {
                return false;
            }
            Result that = (Result) other;
            // Primitive fields are compared directly; no boxing is needed.
            return Objects.equals(this.type, that.type) &&
                    this.codePoint == that.codePoint &&
                    Objects.equals(this.propertyIntervalSet, that.propertyIntervalSet) &&
                    this.parseLength == that.parseLength;
        }

        @Override
        public int hashCode() {
            return Objects.hash(type, codePoint, propertyIntervalSet, parseLength);
        }
    }

    /**
     * Parses a single escape sequence starting at {@code startOff}.
     *
     * @param s the text containing the escape sequence
     * @param startOff offset in {@code s} of the backslash that introduces the escape
     * @return a Result of type INVALID if no valid escape sequence was found,
     *         a CODE_POINT or PROPERTY Result otherwise; never {@code null}
     */
    public static Result parseEscape(String s, int startOff) {
        // An escape needs at least the backslash plus one escaped char.
        if (startOff + 2 > s.length() || s.codePointAt(startOff) != '\\') {
            return invalid(startOff, s.length() - 1);
        }
        // Move past backslash
        int offset = startOff + 1;
        int escaped = s.codePointAt(offset);
        // Move past escaped code point
        offset += Character.charCount(escaped);

        if (escaped == 'u') {
            return parseUnicodeEscape(s, startOff, offset);
        }
        if (escaped == 'p' || escaped == 'P') {
            return parsePropertyEscape(s, startOff, offset, escaped == 'P');
        }
        if (escaped < CharSupport.ANTLRLiteralEscapedCharValue.length) {
            return parseSimpleEscape(escaped, startOff, offset);
        }
        return invalid(startOff, s.length() - 1);
    }

    /**
     * Parses the remainder of a \\uXXXX or \\u{X...} escape.
     *
     * @param offset offset of the first char after the 'u'
     */
    private static Result parseUnicodeEscape(String s, int startOff, int offset) {
        // \\u{1} is the shortest we support
        if (offset + 3 > s.length()) {
            return invalid(startOff, s.length() - 1);
        }
        int hexStartOffset;
        int hexEndOffset; // exclusive
        if (s.codePointAt(offset) == '{') {
            hexStartOffset = offset + 1;
            hexEndOffset = s.indexOf('}', hexStartOffset);
            if (hexEndOffset == -1) {
                return invalid(startOff, s.length() - 1);
            }
            offset = hexEndOffset + 1;
        }
        else {
            // The non-braced form always has exactly four hex digits.
            if (offset + 4 > s.length()) {
                return invalid(startOff, s.length() - 1);
            }
            hexStartOffset = offset;
            hexEndOffset = offset + 4;
            offset = hexEndOffset;
        }
        int codePointValue = CharSupport.parseHexValue(s, hexStartOffset, hexEndOffset);
        // Reject every negative value, not just a -1 sentinel: a negative
        // number must never be handed back as a code point.
        if (codePointValue < 0 || codePointValue > Character.MAX_CODE_POINT) {
            // 'offset' is one past the escape, so the invalid span is exactly
            // the escape that was consumed. A fixed length of six is only
            // right for the four-digit form and can run past the end of the
            // input for a short braced escape.
            return invalid(startOff, offset - 1);
        }
        return new Result(
                Result.Type.CODE_POINT,
                codePointValue,
                IntervalSet.EMPTY_SET,
                startOff,
                offset - startOff);
    }

    /**
     * Parses the remainder of a \p{Name} or \P{Name} escape.
     *
     * @param offset offset of the first char after the 'p' / 'P'
     * @param negated {@code true} for the upper-case form, whose result is the
     *        complement of the property within {@code IntervalSet.COMPLETE_CHAR_SET}
     */
    private static Result parsePropertyEscape(String s, int startOff, int offset, boolean negated) {
        // \p{L} is the shortest we support
        if (offset + 3 > s.length()) {
            return invalid(startOff, s.length() - 1);
        }
        if (s.codePointAt(offset) != '{') {
            return invalid(startOff, offset);
        }
        int openBraceOffset = offset;
        int closeBraceOffset = s.indexOf('}', openBraceOffset);
        if (closeBraceOffset == -1) {
            return invalid(startOff, s.length() - 1);
        }
        String propertyName = s.substring(openBraceOffset + 1, closeBraceOffset);
        IntervalSet propertyIntervalSet = UnicodeData.getPropertyCodePoints(propertyName);
        if (propertyIntervalSet == null) {
            // Unknown property name: flag everything up to the closing brace.
            return invalid(startOff, closeBraceOffset);
        }
        if (negated) {
            propertyIntervalSet = propertyIntervalSet.complement(IntervalSet.COMPLETE_CHAR_SET);
        }
        return new Result(
                Result.Type.PROPERTY,
                -1,
                propertyIntervalSet,
                startOff,
                closeBraceOffset + 1 - startOff);
    }

    /**
     * Resolves a single-character escape through
     * {@code CharSupport.ANTLRLiteralEscapedCharValue}.
     *
     * @param escaped the escaped code point; the caller guarantees it is a valid index into the table
     * @param offset offset of the first char after the escaped code point
     */
    private static Result parseSimpleEscape(int escaped, int startOff, int offset) {
        int codePoint = CharSupport.ANTLRLiteralEscapedCharValue[escaped];
        if (codePoint == 0) {
            // A zero table entry is rejected, except for the two chars below.
            if (escaped != ']' && escaped != '-') { // escape ']' and '-' only in char sets.
                return invalid(startOff, startOff + 1);
            }
            codePoint = escaped;
        }
        return new Result(
                Result.Type.CODE_POINT,
                codePoint,
                IntervalSet.EMPTY_SET,
                startOff,
                offset - startOff);
    }

    /**
     * Builds an INVALID result covering {@code start..stop}.
     *
     * @param start first offset of the invalid span
     * @param stop last offset of the invalid span (inclusive)
     */
    private static Result invalid(int start, int stop) { // start..stop is inclusive
        return new Result(
                Result.Type.INVALID,
                0,
                IntervalSet.EMPTY_SET,
                start,
                stop - start + 1);
    }
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy