org.antlr.v4.misc.CharSupport Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of antlr4 Show documentation
The ANTLR 4 grammar compiler.
The newest version!
/*
 * Copyright (c) 2012-2017 The ANTLR Project. All rights reserved.
 * Use of this file is governed by the BSD 3-clause license that
 * can be found in the LICENSE.txt file in the project root.
 */

package org.antlr.v4.misc;

import org.antlr.v4.runtime.Lexer;
import org.antlr.v4.runtime.misc.Interval;
import org.antlr.v4.runtime.misc.IntervalSet;

import java.util.Iterator;

/** */
public class CharSupport {
	/** When converting ANTLR char and string literals, here is the
	 *  value set of escape chars.
	 */
	public final static int[] ANTLRLiteralEscapedCharValue = new int[255];

	/** Given a char, we need to be able to show as an ANTLR literal.
	 */
	public final static String[] ANTLRLiteralCharValueEscape = new String[255];

	static {
		ANTLRLiteralEscapedCharValue['n'] = '\n';
		ANTLRLiteralEscapedCharValue['r'] = '\r';
		ANTLRLiteralEscapedCharValue['t'] = '\t';
		ANTLRLiteralEscapedCharValue['b'] = '\b';
		ANTLRLiteralEscapedCharValue['f'] = '\f';
		ANTLRLiteralEscapedCharValue['\\'] = '\\';
		ANTLRLiteralCharValueEscape['\n'] = "\\n";
		ANTLRLiteralCharValueEscape['\r'] = "\\r";
		ANTLRLiteralCharValueEscape['\t'] = "\\t";
		ANTLRLiteralCharValueEscape['\b'] = "\\b";
		ANTLRLiteralCharValueEscape['\f'] = "\\f";
		ANTLRLiteralCharValueEscape['\\'] = "\\\\";
	}

	/** Return a string representing the escaped char for code c.  E.g., If c
	 *  has value 0x100, you will get "\\u0100".  ASCII gets the usual
	 *  char (non-hex) representation.  Non-ASCII characters are spit out
	 *  as \\uXXXX or \\u{XXXXXX} escapes.
	 */
	public static String getANTLRCharLiteralForChar(int c) {
		String result;
		if ( c < Lexer.MIN_CHAR_VALUE ) {
			result = "";
		}
		else {
			String charValueEscape = c < ANTLRLiteralCharValueEscape.length ? ANTLRLiteralCharValueEscape[c] : null;
			if (charValueEscape != null) {
				result = charValueEscape;
			}
			else if (Character.UnicodeBlock.of((char) c) == Character.UnicodeBlock.BASIC_LATIN &&
					!Character.isISOControl((char) c)) {
				if (c == '\\') {
					result = "\\\\";
				}
				else if (c == '\'') {
					result = "\\'";
				}
				else {
					result = Character.toString((char) c);
				}
			}
			else if (c <= 0xFFFF) {
				result = String.format("\\u%04X", c);
			} else {
				result = String.format("\\u{%06X}", c);
			}
		}
		return '\'' + result + '\'';
	}

	/** Given a literal like (the 3 char sequence with single quotes) 'a',
	 *  return the int value of 'a'. Convert escape sequences here also.
	 *  Return -1 if not single char.
	 */
	public static int getCharValueFromGrammarCharLiteral(String literal) {
		if ( literal==null || literal.length()<3 ) return -1;
		return getCharValueFromCharInGrammarLiteral(literal.substring(1,literal.length()-1));
	}

	public static String getStringFromGrammarStringLiteral(String literal) {
		StringBuilder buf = new StringBuilder();
		int i = 1; // skip first quote
		int n = literal.length()-1; // skip last quote
		while ( i < n ) { // scan all but last quote
			int end = i+1;
			if ( literal.charAt(i) == '\\' ) {
				end = i+2;
				if ( i+1 < n && literal.charAt(i+1) == 'u' ) {
					if ( i+2 < n && literal.charAt(i+2) == '{' ) { // extended escape sequence
						end = i + 3;
						while (true) {
							if ( end + 1 > n ) return null; // invalid escape sequence.
							char charAt = literal.charAt(end++);
							if (charAt == '}') {
								break;
							}
							if (!Character.isDigit(charAt) && !(charAt >= 'a' && charAt <= 'f') && !(charAt >= 'A' && charAt <= 'F')) {
								return null; // invalid escape sequence.
							}
						}
					}
					else {
						for (end = i + 2; end < i + 6; end++) {
							if ( end>n ) return null; // invalid escape sequence.
							char charAt = literal.charAt(end);
							if (!Character.isDigit(charAt) && !(charAt >= 'a' && charAt <= 'f') && !(charAt >= 'A' && charAt <= 'F')) {
								return null; // invalid escape sequence.
							}
						}
					}
				}
			}
			if ( end>n ) return null; // invalid escape sequence.
			String esc = literal.substring(i, end);
			int c = getCharValueFromCharInGrammarLiteral(esc);
			if ( c==-1 ) {
				return null; // invalid escape sequence.
			}
			else buf.appendCodePoint(c);
			i = end;
		}
		return buf.toString();
	}

	/** Given char x or \\t or \\u1234 return the char value;
	 *  Unnecessary escapes like '\{' yield -1.
	 */
	public static int getCharValueFromCharInGrammarLiteral(String cstr) {
		switch ( cstr.length() ) {
			case 1:
				// 'x'
				return cstr.charAt(0); // no escape char
			case 2:
				if ( cstr.charAt(0)!='\\' ) return -1;
				// '\x'  (antlr lexer will catch invalid char)
				char escChar = cstr.charAt(1);
				if (escChar == '\'') return escChar; // escape quote only in string literals.
				int charVal = ANTLRLiteralEscapedCharValue[escChar];
				if (charVal == 0) return -1;
				return charVal;
			case 6:
				// '\\u1234' or '\\u{12}'
				if ( !cstr.startsWith("\\u") ) return -1;
				int startOff;
				int endOff;
				if ( cstr.charAt(2) == '{' ) {
					startOff = 3;
					endOff = cstr.indexOf('}');
				}
				else {
					startOff = 2;
					endOff = cstr.length();
				}
				return parseHexValue(cstr, startOff, endOff);
			default:
				if ( cstr.startsWith("\\u{") ) {
					return parseHexValue(cstr, 3, cstr.indexOf('}'));
				}
				return -1;
		}
	}

	public static int parseHexValue(String cstr, int startOff, int endOff) {
		if (startOff < 0 || endOff < 0) {
			return -1;
		}
		String unicodeChars = cstr.substring(startOff, endOff);
		int result = -1;
		try {
			result = Integer.parseInt(unicodeChars, 16);
		}
		catch (NumberFormatException e) {
		}
		return result;
	}

	public static String capitalize(String s) {
		return Character.toUpperCase(s.charAt(0)) + s.substring(1);
	}

	public static String getIntervalSetEscapedString(IntervalSet intervalSet) {
		StringBuilder buf = new StringBuilder();
		Iterator iter = intervalSet.getIntervals().iterator();
		while (iter.hasNext()) {
			Interval interval = iter.next();
			buf.append(getRangeEscapedString(interval.a, interval.b));
			if (iter.hasNext()) {
				buf.append(" | ");
			}
		}
		return buf.toString();
	}

	public static String getRangeEscapedString(int codePointStart, int codePointEnd) {
		return codePointStart != codePointEnd
				? getANTLRCharLiteralForChar(codePointStart) + ".." + getANTLRCharLiteralForChar(codePointEnd)
				: getANTLRCharLiteralForChar(codePointStart);
	}
}