org.anarres.cpp.LexerSource Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of jcpp Show documentation
Show all versions of jcpp Show documentation
An embeddable C Preprocessor for the JVM.
/*
* Anarres C Preprocessor
* Copyright (c) 2007-2008, Shevek
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing
* permissions and limitations under the License.
*/
package org.anarres.cpp;
import java.io.IOException;
import java.io.Reader;
import javax.annotation.Nonnull;
import static org.anarres.cpp.Token.*;
/** Does not handle digraphs. */
public class LexerSource extends Source {
private static final boolean DEBUG = false;
private JoinReader reader;
private final boolean ppvalid;
private boolean bol;
private boolean include;
private boolean digraphs;
/* Unread. */
private int u0, u1;
private int ucount;
private int line;
private int column;
private int lastcolumn;
private boolean cr;
/* ppvalid is:
* false in StringLexerSource,
* true in FileLexerSource */
public LexerSource(Reader r, boolean ppvalid) {
this.reader = new JoinReader(r);
this.ppvalid = ppvalid;
this.bol = true;
this.include = false;
this.digraphs = true;
this.ucount = 0;
this.line = 1;
this.column = 0;
this.lastcolumn = -1;
this.cr = false;
}
@Override
/* pp */ void init(Preprocessor pp) {
super.init(pp);
this.digraphs = pp.getFeature(Feature.DIGRAPHS);
this.reader.init(pp, this);
}
@Override
public int getLine() {
return line;
}
@Override
public int getColumn() {
return column;
}
@Override
/* pp */ boolean isNumbered() {
return true;
}
/* Error handling. */
private void _error(String msg, boolean error)
throws LexerException {
int _l = line;
int _c = column;
if (_c == 0) {
_c = lastcolumn;
_l--;
} else {
_c--;
}
if (error)
super.error(_l, _c, msg);
else
super.warning(_l, _c, msg);
}
/* Allow JoinReader to call this. */
/* pp */ final void error(String msg)
throws LexerException {
_error(msg, true);
}
/* Allow JoinReader to call this. */
/* pp */ final void warning(String msg)
throws LexerException {
_error(msg, false);
}
/* A flag for string handling. */
/* pp */ void setInclude(boolean b) {
this.include = b;
}
/*
* private boolean _isLineSeparator(int c) {
* return Character.getType(c) == Character.LINE_SEPARATOR
* || c == -1;
* }
*/
/* XXX Move to JoinReader and canonicalise newlines. */
private static boolean isLineSeparator(int c) {
switch ((char) c) {
case '\r':
case '\n':
case '\u2028':
case '\u2029':
case '\u000B':
case '\u000C':
case '\u0085':
return true;
default:
return (c == -1);
}
}
private int read()
throws IOException,
LexerException {
int c;
assert ucount <= 2 : "Illegal ucount: " + ucount;
switch (ucount) {
case 2:
ucount = 1;
c = u1;
break;
case 1:
ucount = 0;
c = u0;
break;
default:
if (reader == null)
c = -1;
else
c = reader.read();
break;
}
switch (c) {
case '\r':
cr = true;
line++;
lastcolumn = column;
column = 0;
break;
case '\n':
if (cr) {
cr = false;
break;
}
/* fallthrough */
case '\u2028':
case '\u2029':
case '\u000B':
case '\u000C':
case '\u0085':
cr = false;
line++;
lastcolumn = column;
column = 0;
break;
case -1:
cr = false;
break;
default:
cr = false;
column++;
break;
}
/*
* if (isLineSeparator(c)) {
* line++;
* lastcolumn = column;
* column = 0;
* }
* else {
* column++;
* }
*/
return c;
}
/* You can unget AT MOST one newline. */
private void unread(int c)
throws IOException {
/* XXX Must unread newlines. */
if (c != -1) {
if (isLineSeparator(c)) {
line--;
column = lastcolumn;
cr = false;
} else {
column--;
}
switch (ucount) {
case 0:
u0 = c;
ucount = 1;
break;
case 1:
u1 = c;
ucount = 2;
break;
default:
throw new IllegalStateException(
"Cannot unget another character!"
);
}
// reader.unread(c);
}
}
/* Consumes the rest of the current line into an invalid. */
@Nonnull
private Token invalid(StringBuilder text, String reason)
throws IOException,
LexerException {
int d = read();
while (!isLineSeparator(d)) {
text.append((char) d);
d = read();
}
unread(d);
return new Token(INVALID, text.toString(), reason);
}
@Nonnull
private Token ccomment()
throws IOException,
LexerException {
StringBuilder text = new StringBuilder("/*");
int d;
do {
do {
d = read();
text.append((char) d);
} while (d != '*');
do {
d = read();
text.append((char) d);
} while (d == '*');
} while (d != '/');
return new Token(CCOMMENT, text.toString());
}
@Nonnull
private Token cppcomment()
throws IOException,
LexerException {
StringBuilder text = new StringBuilder("//");
int d = read();
while (!isLineSeparator(d)) {
text.append((char) d);
d = read();
}
unread(d);
return new Token(CPPCOMMENT, text.toString());
}
private int escape(StringBuilder text)
throws IOException,
LexerException {
int d = read();
switch (d) {
case 'a':
text.append('a');
return 0x07;
case 'b':
text.append('b');
return '\b';
case 'f':
text.append('f');
return '\f';
case 'n':
text.append('n');
return '\n';
case 'r':
text.append('r');
return '\r';
case 't':
text.append('t');
return '\t';
case 'v':
text.append('v');
return 0x0b;
case '\\':
text.append('\\');
return '\\';
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
int len = 0;
int val = 0;
do {
val = (val << 3) + Character.digit(d, 8);
text.append((char) d);
d = read();
} while (++len < 3 && Character.digit(d, 8) != -1);
unread(d);
return val;
case 'x':
text.append((char) d);
len = 0;
val = 0;
while (len++ < 2) {
d = read();
if (Character.digit(d, 16) == -1) {
unread(d);
break;
}
val = (val << 4) + Character.digit(d, 16);
text.append((char) d);
}
return val;
/* Exclude two cases from the warning. */
case '"':
text.append('"');
return '"';
case '\'':
text.append('\'');
return '\'';
default:
warning("Unnecessary escape character " + (char) d);
text.append((char) d);
return d;
}
}
@Nonnull
private Token character()
throws IOException,
LexerException {
StringBuilder text = new StringBuilder("'");
int d = read();
if (d == '\\') {
text.append('\\');
d = escape(text);
} else if (isLineSeparator(d)) {
unread(d);
return new Token(INVALID, text.toString(),
"Unterminated character literal");
} else if (d == '\'') {
text.append('\'');
return new Token(INVALID, text.toString(),
"Empty character literal");
} else if (!Character.isDefined(d)) {
text.append('?');
return invalid(text, "Illegal unicode character literal");
} else {
text.append((char) d);
}
int e = read();
if (e != '\'') {
// error("Illegal character constant");
/* We consume up to the next ' or the rest of the line. */
for (;;) {
if (isLineSeparator(e)) {
unread(e);
break;
}
text.append((char) e);
if (e == '\'')
break;
e = read();
}
return new Token(INVALID, text.toString(),
"Illegal character constant " + text);
}
text.append('\'');
/* XXX It this a bad cast? */
return new Token(CHARACTER,
text.toString(), Character.valueOf((char) d));
}
@Nonnull
private Token string(char open, char close)
throws IOException,
LexerException {
StringBuilder text = new StringBuilder();
text.append(open);
StringBuilder buf = new StringBuilder();
for (;;) {
int c = read();
if (c == close) {
break;
} else if (c == '\\') {
text.append('\\');
if (!include) {
char d = (char) escape(text);
buf.append(d);
}
} else if (c == -1) {
unread(c);
// error("End of file in string literal after " + buf);
return new Token(INVALID, text.toString(),
"End of file in string literal after " + buf);
} else if (isLineSeparator(c)) {
unread(c);
// error("Unterminated string literal after " + buf);
return new Token(INVALID, text.toString(),
"Unterminated string literal after " + buf);
} else {
text.append((char) c);
buf.append((char) c);
}
}
text.append(close);
switch (close) {
case '"':
return new Token(STRING,
text.toString(), buf.toString());
case '>':
return new Token(HEADER,
text.toString(), buf.toString());
case '\'':
if (buf.length() == 1)
return new Token(CHARACTER,
text.toString(), buf.toString());
return new Token(SQSTRING,
text.toString(), buf.toString());
default:
throw new IllegalStateException(
"Unknown closing character " + String.valueOf(close));
}
}
@Nonnull
private Token _number_suffix(StringBuilder text, NumericValue value, int d)
throws IOException,
LexerException {
int flags = 0; // U, I, L, LL, F, D, MSB
for (;;) {
if (d == 'U' || d == 'u') {
if ((flags & NumericValue.F_UNSIGNED) != 0)
warning("Duplicate unsigned suffix " + d);
flags |= NumericValue.F_UNSIGNED;
text.append((char) d);
d = read();
} else if (d == 'L' || d == 'l') {
if ((flags & NumericValue.FF_SIZE) != 0)
warning("Multiple length suffixes after " + text);
text.append((char) d);
int e = read();
if (e == d) { // Case must match. Ll is Welsh.
flags |= NumericValue.F_LONGLONG;
text.append((char) e);
d = read();
} else {
flags |= NumericValue.F_LONG;
d = e;
}
} else if (d == 'I' || d == 'i') {
if ((flags & NumericValue.FF_SIZE) != 0)
warning("Multiple length suffixes after " + text);
flags |= NumericValue.F_INT;
text.append((char) d);
d = read();
} else if (d == 'F' || d == 'f') {
if ((flags & NumericValue.FF_SIZE) != 0)
warning("Multiple length suffixes after " + text);
flags |= NumericValue.F_FLOAT;
text.append((char) d);
d = read();
} else if (d == 'D' || d == 'd') {
if ((flags & NumericValue.FF_SIZE) != 0)
warning("Multiple length suffixes after " + text);
flags |= NumericValue.F_DOUBLE;
text.append((char) d);
d = read();
} // This should probably be isPunct() || isWhite().
else if (Character.isLetter(d) || d == '_') {
unread(d);
value.setFlags(flags);
return invalid(text,
"Invalid suffix \"" + (char) d
+ "\" on numeric constant");
} else {
unread(d);
value.setFlags(flags);
return new Token(NUMBER,
text.toString(), value);
}
}
}
/* Either a decimal part, or a hex exponent. */
@Nonnull
private String _number_part(StringBuilder text, int base, boolean sign)
throws IOException,
LexerException {
StringBuilder part = new StringBuilder();
int d = read();
if (sign && d == '-') {
text.append((char) d);
part.append((char) d);
d = read();
}
while (Character.digit(d, base) != -1) {
text.append((char) d);
part.append((char) d);
d = read();
}
unread(d);
return part.toString();
}
/* We do not know whether know the first digit is valid. */
@Nonnull
private Token number_hex(char x, boolean negative)
throws IOException,
LexerException {
StringBuilder text = new StringBuilder(negative ? "-0" : "0");
text.append(x);
String integer = _number_part(text, 16, false);
NumericValue value = new NumericValue(16, negative, integer);
int d = read();
if (d == '.') {
text.append((char) d);
String fraction = _number_part(text, 16, false);
value.setFractionalPart(fraction);
d = read();
}
if (d == 'P' || d == 'p') {
text.append((char) d);
String exponent = _number_part(text, 10, true);
value.setExponent(2, exponent);
d = read();
}
// XXX Make sure it's got enough parts
return _number_suffix(text, value, d);
}
private static boolean is_octal(@Nonnull String text) {
if (!text.startsWith("0"))
return false;
for (int i = 0; i < text.length(); i++)
if (Character.digit(text.charAt(i), 8) == -1)
return false;
return true;
}
/* We know we have at least one valid digit, but empty is not
* fine. */
@Nonnull
private Token number_decimal(boolean negative)
throws IOException,
LexerException {
StringBuilder text = new StringBuilder(negative ? "-" : "");
String integer = _number_part(text, 10, false);
String fraction = null;
String exponent = null;
int d = read();
if (d == '.') {
text.append((char) d);
fraction = _number_part(text, 10, false);
d = read();
}
if (d == 'E' || d == 'e') {
text.append((char) d);
exponent = _number_part(text, 10, true);
d = read();
}
int base = 10;
if (fraction == null && exponent == null && integer.startsWith("0")) {
if (!is_octal(integer))
warning("Decimal constant starts with 0, but not octal: " + integer);
else
base = 8;
}
NumericValue value = new NumericValue(base, negative, integer);
if (fraction != null)
value.setFractionalPart(fraction);
if (exponent != null)
value.setExponent(10, exponent);
// XXX Make sure it's got enough parts
return _number_suffix(text, value, d);
}
/**
* Section 6.4.4.2 of C99
*
* A floating constant has a significand part that may be followed
* by an exponent part and a suffix that specifies its type. The
* components of the significand part may include a digit sequence
* representing the whole-number part, followed by a period (.),
* followed by a digit sequence representing the fraction part.
*
* The components of the exponent part are an e, E, p, or P
* followed by an exponent consisting of an optionally signed digit
* sequence. Either the whole-number part or the fraction part has to
* be present; for decimal floating constants, either the period or
* the exponent part has to be present.
*
* The significand part is interpreted as a (decimal or hexadecimal)
* rational number; the digit sequence in the exponent part is
* interpreted as a decimal integer. For decimal floating constants,
* the exponent indicates the power of 10 by which the significand
* part is to be scaled. For hexadecimal floating constants, the
* exponent indicates the power of 2 by which the significand part is
* to be scaled.
*
* For decimal floating constants, and also for hexadecimal
* floating constants when FLT_RADIX is not a power of 2, the result
* is either the nearest representable value, or the larger or smaller
* representable value immediately adjacent to the nearest representable
* value, chosen in an implementation-defined manner. For hexadecimal
* floating constants when FLT_RADIX is a power of 2, the result is
* correctly rounded.
*/
@Nonnull
private Token number()
throws IOException,
LexerException {
boolean negative = false;
Token tok;
int c = read();
if (c == '-') {
negative = true;
c = read();
}
if (c == '0') {
int d = read();
if (d == 'x' || d == 'X') {
tok = number_hex((char) d, negative);
} else {
unread(d);
unread(c);
tok = number_decimal(negative);
}
} else if (Character.isDigit(c) || c == '.') {
unread(c);
tok = number_decimal(negative);
} else {
throw new LexerException("Asked to parse something as a number which isn't: " + (char) c);
}
return tok;
}
@Nonnull
private Token identifier(int c)
throws IOException,
LexerException {
StringBuilder text = new StringBuilder();
int d;
text.append((char) c);
for (;;) {
d = read();
if (Character.isIdentifierIgnorable(d))
; else if (Character.isJavaIdentifierPart(d))
text.append((char) d);
else
break;
}
unread(d);
return new Token(IDENTIFIER, text.toString());
}
@Nonnull
private Token whitespace(int c)
throws IOException,
LexerException {
StringBuilder text = new StringBuilder();
int d;
text.append((char) c);
for (;;) {
d = read();
if (ppvalid && isLineSeparator(d)) /* XXX Ugly. */
break;
if (Character.isWhitespace(d))
text.append((char) d);
else
break;
}
unread(d);
return new Token(WHITESPACE, text.toString());
}
/* No token processed by cond() contains a newline. */
@Nonnull
private Token cond(char c, int yes, int no)
throws IOException,
LexerException {
int d = read();
if (c == d)
return new Token(yes);
unread(d);
return new Token(no);
}
@Override
public Token token()
throws IOException,
LexerException {
Token tok = null;
int _l = line;
int _c = column;
int c = read();
int d;
switch (c) {
case '\n':
if (ppvalid) {
bol = true;
if (include) {
tok = new Token(NL, _l, _c, "\n");
} else {
int nls = 0;
do {
nls++;
d = read();
} while (d == '\n');
unread(d);
char[] text = new char[nls];
for (int i = 0; i < text.length; i++)
text[i] = '\n';
// Skip the bol = false below.
tok = new Token(NL, _l, _c, new String(text));
}
if (DEBUG)
System.out.println("lx: Returning NL: " + tok);
return tok;
}
/* Let it be handled as whitespace. */
break;
case '!':
tok = cond('=', NE, '!');
break;
case '#':
if (bol)
tok = new Token(HASH);
else
tok = cond('#', PASTE, '#');
break;
case '+':
d = read();
if (d == '+')
tok = new Token(INC);
else if (d == '=')
tok = new Token(PLUS_EQ);
else
unread(d);
break;
case '-':
d = read();
if (d == '-')
tok = new Token(DEC);
else if (d == '=')
tok = new Token(SUB_EQ);
else if (d == '>')
tok = new Token(ARROW);
else
unread(d);
if (Character.isDigit(d)) {
unread('-');
tok = number();
}
break;
case '*':
tok = cond('=', MULT_EQ, '*');
break;
case '/':
d = read();
if (d == '*')
tok = ccomment();
else if (d == '/')
tok = cppcomment();
else if (d == '=')
tok = new Token(DIV_EQ);
else
unread(d);
break;
case '%':
d = read();
if (d == '=')
tok = new Token(MOD_EQ);
else if (digraphs && d == '>')
tok = new Token('}'); // digraph
else if (digraphs && d == ':')
PASTE:
{
d = read();
if (d != '%') {
unread(d);
tok = new Token('#'); // digraph
break PASTE;
}
d = read();
if (d != ':') {
unread(d); // Unread 2 chars here.
unread('%');
tok = new Token('#'); // digraph
break PASTE;
}
tok = new Token(PASTE); // digraph
}
else
unread(d);
break;
case ':':
/* :: */
d = read();
if (digraphs && d == '>')
tok = new Token(']'); // digraph
else
unread(d);
break;
case '<':
if (include) {
tok = string('<', '>');
} else {
d = read();
if (d == '=')
tok = new Token(LE);
else if (d == '<')
tok = cond('=', LSH_EQ, LSH);
else if (digraphs && d == ':')
tok = new Token('['); // digraph
else if (digraphs && d == '%')
tok = new Token('{'); // digraph
else
unread(d);
}
break;
case '=':
tok = cond('=', EQ, '=');
break;
case '>':
d = read();
if (d == '=')
tok = new Token(GE);
else if (d == '>')
tok = cond('=', RSH_EQ, RSH);
else
unread(d);
break;
case '^':
tok = cond('=', XOR_EQ, '^');
break;
case '|':
d = read();
if (d == '=')
tok = new Token(OR_EQ);
else if (d == '|')
tok = cond('=', LOR_EQ, LOR);
else
unread(d);
break;
case '&':
d = read();
if (d == '&')
tok = cond('=', LAND_EQ, LAND);
else if (d == '=')
tok = new Token(AND_EQ);
else
unread(d);
break;
case '.':
d = read();
if (d == '.')
tok = cond('.', ELLIPSIS, RANGE);
else
unread(d);
if (Character.isDigit(d)) {
unread('.');
tok = number();
}
/* XXX decimal fraction */
break;
case '\'':
tok = string('\'', '\'');
break;
case '"':
tok = string('"', '"');
break;
case -1:
close();
tok = new Token(EOF, _l, _c, "");
break;
}
if (tok == null) {
if (Character.isWhitespace(c)) {
tok = whitespace(c);
} else if (Character.isDigit(c)) {
unread(c);
tok = number();
} else if (Character.isJavaIdentifierStart(c)) {
tok = identifier(c);
} else {
tok = new Token(c);
}
}
if (bol) {
switch (tok.getType()) {
case WHITESPACE:
case CCOMMENT:
break;
default:
bol = false;
break;
}
}
tok.setLocation(_l, _c);
if (DEBUG)
System.out.println("lx: Returning " + tok);
// (new Exception("here")).printStackTrace(System.out);
return tok;
}
@Override
public void close()
throws IOException {
if (reader != null) {
reader.close();
reader = null;
}
super.close();
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy