org.anarres.cpp.LexerSource Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of jcpp Show documentation
An embeddable C Preprocessor for the JVM.
There is a newer version: 1.4.14
/*
 * Anarres C Preprocessor
 * Copyright (c) 2007-2008, Shevek
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied.  See the License for the specific language governing
 * permissions and limitations under the License.
 */
package org.anarres.cpp;

import java.io.IOException;
import java.io.Reader;

import javax.annotation.Nonnull;
import static org.anarres.cpp.Token.*;

/** Does not handle digraphs. */
public class LexerSource extends Source {

    private static final boolean DEBUG = false;

    private JoinReader reader;
    private final boolean ppvalid;
    private boolean bol;
    private boolean include;

    private boolean digraphs;

    /* Unread. */
    private int u0, u1;
    private int ucount;

    private int line;
    private int column;
    private int lastcolumn;
    private boolean cr;

    /* ppvalid is:
     * false in StringLexerSource,
     * true in FileLexerSource */
    public LexerSource(Reader r, boolean ppvalid) {
        this.reader = new JoinReader(r);
        this.ppvalid = ppvalid;
        this.bol = true;
        this.include = false;

        this.digraphs = true;

        this.ucount = 0;

        this.line = 1;
        this.column = 0;
        this.lastcolumn = -1;
        this.cr = false;
    }

    @Override
    /* pp */ void init(Preprocessor pp) {
        super.init(pp);
        this.digraphs = pp.getFeature(Feature.DIGRAPHS);
        this.reader.init(pp, this);
    }

    @Override
    public int getLine() {
        return line;
    }

    @Override
    public int getColumn() {
        return column;
    }

    @Override
    /* pp */ boolean isNumbered() {
        return true;
    }

    /* Error handling. */
    private void _error(String msg, boolean error)
            throws LexerException {
        int _l = line;
        int _c = column;
        if (_c == 0) {
            _c = lastcolumn;
            _l--;
        } else {
            _c--;
        }
        if (error)
            super.error(_l, _c, msg);
        else
            super.warning(_l, _c, msg);
    }

    /* Allow JoinReader to call this. */
    /* pp */ final void error(String msg)
            throws LexerException {
        _error(msg, true);
    }

    /* Allow JoinReader to call this. */
    /* pp */ final void warning(String msg)
            throws LexerException {
        _error(msg, false);
    }

    /* A flag for string handling. */

    /* pp */ void setInclude(boolean b) {
        this.include = b;
    }

    /*
     * private boolean _isLineSeparator(int c) {
     * return Character.getType(c) == Character.LINE_SEPARATOR
     * || c == -1;
     * }
     */

    /* XXX Move to JoinReader and canonicalise newlines. */
    private static boolean isLineSeparator(int c) {
        switch ((char) c) {
            case '\r':
            case '\n':
            case '\u2028':
            case '\u2029':
            case '\u000B':
            case '\u000C':
            case '\u0085':
                return true;
            default:
                return (c == -1);
        }
    }

    private int read()
            throws IOException,
            LexerException {
        int c;
        assert ucount <= 2 : "Illegal ucount: " + ucount;
        switch (ucount) {
            case 2:
                ucount = 1;
                c = u1;
                break;
            case 1:
                ucount = 0;
                c = u0;
                break;
            default:
                if (reader == null)
                    c = -1;
                else
                    c = reader.read();
                break;
        }

        switch (c) {
            case '\r':
                cr = true;
                line++;
                lastcolumn = column;
                column = 0;
                break;
            case '\n':
                if (cr) {
                    cr = false;
                    break;
                }
            /* fallthrough */
            case '\u2028':
            case '\u2029':
            case '\u000B':
            case '\u000C':
            case '\u0085':
                cr = false;
                line++;
                lastcolumn = column;
                column = 0;
                break;
            case -1:
                cr = false;
                break;
            default:
                cr = false;
                column++;
                break;
        }

        /*
         * if (isLineSeparator(c)) {
         * line++;
         * lastcolumn = column;
         * column = 0;
         * }
         * else {
         * column++;
         * }
         */
        return c;
    }

    /* You can unget AT MOST one newline. */
    private void unread(int c)
            throws IOException {
        /* XXX Must unread newlines. */
        if (c != -1) {
            if (isLineSeparator(c)) {
                line--;
                column = lastcolumn;
                cr = false;
            } else {
                column--;
            }
            switch (ucount) {
                case 0:
                    u0 = c;
                    ucount = 1;
                    break;
                case 1:
                    u1 = c;
                    ucount = 2;
                    break;
                default:
                    throw new IllegalStateException(
                            "Cannot unget another character!"
                    );
            }
            // reader.unread(c);
        }
    }

    /* Consumes the rest of the current line into an invalid. */
    @Nonnull
    private Token invalid(StringBuilder text, String reason)
            throws IOException,
            LexerException {
        int d = read();
        while (!isLineSeparator(d)) {
            text.append((char) d);
            d = read();
        }
        unread(d);
        return new Token(INVALID, text.toString(), reason);
    }

    @Nonnull
    private Token ccomment()
            throws IOException,
            LexerException {
        StringBuilder text = new StringBuilder("/*");
        int d;
        do {
            do {
                d = read();
                text.append((char) d);
            } while (d != '*');
            do {
                d = read();
                text.append((char) d);
            } while (d == '*');
        } while (d != '/');
        return new Token(CCOMMENT, text.toString());
    }

    @Nonnull
    private Token cppcomment()
            throws IOException,
            LexerException {
        StringBuilder text = new StringBuilder("//");
        int d = read();
        while (!isLineSeparator(d)) {
            text.append((char) d);
            d = read();
        }
        unread(d);
        return new Token(CPPCOMMENT, text.toString());
    }

    private int escape(StringBuilder text)
            throws IOException,
            LexerException {
        int d = read();
        switch (d) {
            case 'a':
                text.append('a');
                return 0x07;
            case 'b':
                text.append('b');
                return '\b';
            case 'f':
                text.append('f');
                return '\f';
            case 'n':
                text.append('n');
                return '\n';
            case 'r':
                text.append('r');
                return '\r';
            case 't':
                text.append('t');
                return '\t';
            case 'v':
                text.append('v');
                return 0x0b;
            case '\\':
                text.append('\\');
                return '\\';

            case '0':
            case '1':
            case '2':
            case '3':
            case '4':
            case '5':
            case '6':
            case '7':
                int len = 0;
                int val = 0;
                do {
                    val = (val << 3) + Character.digit(d, 8);
                    text.append((char) d);
                    d = read();
                } while (++len < 3 && Character.digit(d, 8) != -1);
                unread(d);
                return val;

            case 'x':
                text.append((char) d);
                len = 0;
                val = 0;
                while (len++ < 2) {
                    d = read();
                    if (Character.digit(d, 16) == -1) {
                        unread(d);
                        break;
                    }
                    val = (val << 4) + Character.digit(d, 16);
                    text.append((char) d);
                }
                return val;

            /* Exclude two cases from the warning. */
            case '"':
                text.append('"');
                return '"';
            case '\'':
                text.append('\'');
                return '\'';

            default:
                warning("Unnecessary escape character " + (char) d);
                text.append((char) d);
                return d;
        }
    }

    @Nonnull
    private Token character()
            throws IOException,
            LexerException {
        StringBuilder text = new StringBuilder("'");
        int d = read();
        if (d == '\\') {
            text.append('\\');
            d = escape(text);
        } else if (isLineSeparator(d)) {
            unread(d);
            return new Token(INVALID, text.toString(),
                    "Unterminated character literal");
        } else if (d == '\'') {
            text.append('\'');
            return new Token(INVALID, text.toString(),
                    "Empty character literal");
        } else if (!Character.isDefined(d)) {
            text.append('?');
            return invalid(text, "Illegal unicode character literal");
        } else {
            text.append((char) d);
        }

        int e = read();
        if (e != '\'') {
            // error("Illegal character constant");
			/* We consume up to the next ' or the rest of the line. */
            for (;;) {
                if (isLineSeparator(e)) {
                    unread(e);
                    break;
                }
                text.append((char) e);
                if (e == '\'')
                    break;
                e = read();
            }
            return new Token(INVALID, text.toString(),
                    "Illegal character constant " + text);
        }
        text.append('\'');
        /* XXX It this a bad cast? */
        return new Token(CHARACTER,
                text.toString(), Character.valueOf((char) d));
    }

    @Nonnull
    private Token string(char open, char close)
            throws IOException,
            LexerException {
        StringBuilder text = new StringBuilder();
        text.append(open);

        StringBuilder buf = new StringBuilder();

        for (;;) {
            int c = read();
            if (c == close) {
                break;
            } else if (c == '\\') {
                text.append('\\');
                if (!include) {
                    char d = (char) escape(text);
                    buf.append(d);
                }
            } else if (c == -1) {
                unread(c);
                // error("End of file in string literal after " + buf);
                return new Token(INVALID, text.toString(),
                        "End of file in string literal after " + buf);
            } else if (isLineSeparator(c)) {
                unread(c);
                // error("Unterminated string literal after " + buf);
                return new Token(INVALID, text.toString(),
                        "Unterminated string literal after " + buf);
            } else {
                text.append((char) c);
                buf.append((char) c);
            }
        }
        text.append(close);
        switch (close) {
            case '"':
                return new Token(STRING,
                        text.toString(), buf.toString());
            case '>':
                return new Token(HEADER,
                        text.toString(), buf.toString());
            case '\'':
                if (buf.length() == 1)
                    return new Token(CHARACTER,
                            text.toString(), buf.toString());
                return new Token(SQSTRING,
                        text.toString(), buf.toString());
            default:
                throw new IllegalStateException(
                        "Unknown closing character " + String.valueOf(close));
        }
    }

    @Nonnull
    private Token _number_suffix(StringBuilder text, NumericValue value, int d)
            throws IOException,
            LexerException {
        int flags = 0;	// U, I, L, LL, F, D, MSB
        for (;;) {
            if (d == 'U' || d == 'u') {
                if ((flags & NumericValue.F_UNSIGNED) != 0)
                    warning("Duplicate unsigned suffix " + d);
                flags |= NumericValue.F_UNSIGNED;
                text.append((char) d);
                d = read();
            } else if (d == 'L' || d == 'l') {
                if ((flags & NumericValue.FF_SIZE) != 0)
                    warning("Multiple length suffixes after " + text);
                text.append((char) d);
                int e = read();
                if (e == d) {	// Case must match. Ll is Welsh.
                    flags |= NumericValue.F_LONGLONG;
                    text.append((char) e);
                    d = read();
                } else {
                    flags |= NumericValue.F_LONG;
                    d = e;
                }
            } else if (d == 'I' || d == 'i') {
                if ((flags & NumericValue.FF_SIZE) != 0)
                    warning("Multiple length suffixes after " + text);
                flags |= NumericValue.F_INT;
                text.append((char) d);
                d = read();
            } else if (d == 'F' || d == 'f') {
                if ((flags & NumericValue.FF_SIZE) != 0)
                    warning("Multiple length suffixes after " + text);
                flags |= NumericValue.F_FLOAT;
                text.append((char) d);
                d = read();
            } else if (d == 'D' || d == 'd') {
                if ((flags & NumericValue.FF_SIZE) != 0)
                    warning("Multiple length suffixes after " + text);
                flags |= NumericValue.F_DOUBLE;
                text.append((char) d);
                d = read();
            } // This should probably be isPunct() || isWhite().
            else if (Character.isLetter(d) || d == '_') {
                unread(d);
                value.setFlags(flags);
                return invalid(text,
                        "Invalid suffix \"" + (char) d
                        + "\" on numeric constant");
            } else {
                unread(d);
                value.setFlags(flags);
                return new Token(NUMBER,
                        text.toString(), value);
            }
        }
    }

    /* Either a decimal part, or a hex exponent. */
    @Nonnull
    private String _number_part(StringBuilder text, int base, boolean sign)
            throws IOException,
            LexerException {
        StringBuilder part = new StringBuilder();
        int d = read();
        if (sign && d == '-') {
            text.append((char) d);
            part.append((char) d);
            d = read();
        }
        while (Character.digit(d, base) != -1) {
            text.append((char) d);
            part.append((char) d);
            d = read();
        }
        unread(d);
        return part.toString();
    }

    /* We do not know whether know the first digit is valid. */
    @Nonnull
    private Token number_hex(char x)
            throws IOException,
            LexerException {
        StringBuilder text = new StringBuilder("0");
        text.append(x);
        String integer = _number_part(text, 16, false);
        NumericValue value = new NumericValue(16, integer);
        int d = read();
        if (d == '.') {
            text.append((char) d);
            String fraction = _number_part(text, 16, false);
            value.setFractionalPart(fraction);
            d = read();
        }
        if (d == 'P' || d == 'p') {
            text.append((char) d);
            String exponent = _number_part(text, 10, true);
            value.setExponent(2, exponent);
            d = read();
        }
        // XXX Make sure it's got enough parts
        return _number_suffix(text, value, d);
    }

    private static boolean is_octal(@Nonnull String text) {
        if (!text.startsWith("0"))
            return false;
        for (int i = 0; i < text.length(); i++)
            if (Character.digit(text.charAt(i), 8) == -1)
                return false;
        return true;
    }

    /* We know we have at least one valid digit, but empty is not
     * fine. */
    @Nonnull
    private Token number_decimal()
            throws IOException,
            LexerException {
        StringBuilder text = new StringBuilder();
        String integer = _number_part(text, 10, false);
        String fraction = null;
        String exponent = null;
        int d = read();
        if (d == '.') {
            text.append((char) d);
            fraction = _number_part(text, 10, false);
            d = read();
        }
        if (d == 'E' || d == 'e') {
            text.append((char) d);
            exponent = _number_part(text, 10, true);
            d = read();
        }
        int base = 10;
        if (fraction == null && exponent == null && integer.startsWith("0")) {
            if (!is_octal(integer))
                warning("Decimal constant starts with 0, but not octal: " + integer);
            else
                base = 8;
        }
        NumericValue value = new NumericValue(base, integer);
        if (fraction != null)
            value.setFractionalPart(fraction);
        if (exponent != null)
            value.setExponent(10, exponent);
        // XXX Make sure it's got enough parts
        return _number_suffix(text, value, d);
    }

    /**
     * Section 6.4.4.1 of C99
     * 
     * (Not pasted here, but says that the initial negation is a separate token.)
     * 
     * Section 6.4.4.2 of C99
     *
     * A floating constant has a significand part that may be followed
     * by an exponent part and a suffix that specifies its type. The
     * components of the significand part may include a digit sequence
     * representing the whole-number part, followed by a period (.),
     * followed by a digit sequence representing the fraction part.
     *
     * The components of the exponent part are an e, E, p, or P
     * followed by an exponent consisting of an optionally signed digit
     * sequence. Either the whole-number part or the fraction part has to
     * be present; for decimal floating constants, either the period or
     * the exponent part has to be present.
     *
     * The significand part is interpreted as a (decimal or hexadecimal)
     * rational number; the digit sequence in the exponent part is
     * interpreted as a decimal integer. For decimal floating constants,
     * the exponent indicates the power of 10 by which the significand
     * part is to be scaled. For hexadecimal floating constants, the
     * exponent indicates the power of 2 by which the significand part is
     * to be scaled.
     *
     * For decimal floating constants, and also for hexadecimal
     * floating constants when FLT_RADIX is not a power of 2, the result
     * is either the nearest representable value, or the larger or smaller
     * representable value immediately adjacent to the nearest representable
     * value, chosen in an implementation-defined manner. For hexadecimal
     * floating constants when FLT_RADIX is a power of 2, the result is
     * correctly rounded.
     */
    @Nonnull
    private Token number()
            throws IOException,
            LexerException {
        Token tok;
        int c = read();
        if (c == '0') {
            int d = read();
            if (d == 'x' || d == 'X') {
                tok = number_hex((char) d);
            } else {
                unread(d);
                unread(c);
                tok = number_decimal();
            }
        } else if (Character.isDigit(c) || c == '.') {
            unread(c);
            tok = number_decimal();
        } else {
            throw new LexerException("Asked to parse something as a number which isn't: " + (char) c);
        }
        return tok;
    }

    @Nonnull
    private Token identifier(int c)
            throws IOException,
            LexerException {
        StringBuilder text = new StringBuilder();
        int d;
        text.append((char) c);
        for (;;) {
            d = read();
            if (Character.isIdentifierIgnorable(d))
				; else if (Character.isJavaIdentifierPart(d))
                text.append((char) d);
            else
                break;
        }
        unread(d);
        return new Token(IDENTIFIER, text.toString());
    }

    @Nonnull
    private Token whitespace(int c)
            throws IOException,
            LexerException {
        StringBuilder text = new StringBuilder();
        int d;
        text.append((char) c);
        for (;;) {
            d = read();
            if (ppvalid && isLineSeparator(d))	/* XXX Ugly. */

                break;
            if (Character.isWhitespace(d))
                text.append((char) d);
            else
                break;
        }
        unread(d);
        return new Token(WHITESPACE, text.toString());
    }

    /* No token processed by cond() contains a newline. */
    @Nonnull
    private Token cond(char c, int yes, int no)
            throws IOException,
            LexerException {
        int d = read();
        if (c == d)
            return new Token(yes);
        unread(d);
        return new Token(no);
    }

    @Override
    public Token token()
            throws IOException,
            LexerException {
        Token tok = null;

        int _l = line;
        int _c = column;

        int c = read();
        int d;

        switch (c) {
            case '\n':
                if (ppvalid) {
                    bol = true;
                    if (include) {
                        tok = new Token(NL, _l, _c, "\n");
                    } else {
                        int nls = 0;
                        do {
                            nls++;
                            d = read();
                        } while (d == '\n');
                        unread(d);
                        char[] text = new char[nls];
                        for (int i = 0; i < text.length; i++)
                            text[i] = '\n';
                        // Skip the bol = false below.
                        tok = new Token(NL, _l, _c, new String(text));
                    }
                    if (DEBUG)
                        System.out.println("lx: Returning NL: " + tok);
                    return tok;
                }
                /* Let it be handled as whitespace. */
                break;

            case '!':
                tok = cond('=', NE, '!');
                break;

            case '#':
                if (bol)
                    tok = new Token(HASH);
                else
                    tok = cond('#', PASTE, '#');
                break;

            case '+':
                d = read();
                if (d == '+')
                    tok = new Token(INC);
                else if (d == '=')
                    tok = new Token(PLUS_EQ);
                else
                    unread(d);
                break;
            case '-':
                d = read();
                if (d == '-')
                    tok = new Token(DEC);
                else if (d == '=')
                    tok = new Token(SUB_EQ);
                else if (d == '>')
                    tok = new Token(ARROW);
                else
                    unread(d);
                break;

            case '*':
                tok = cond('=', MULT_EQ, '*');
                break;
            case '/':
                d = read();
                if (d == '*')
                    tok = ccomment();
                else if (d == '/')
                    tok = cppcomment();
                else if (d == '=')
                    tok = new Token(DIV_EQ);
                else
                    unread(d);
                break;

            case '%':
                d = read();
                if (d == '=')
                    tok = new Token(MOD_EQ);
                else if (digraphs && d == '>')
                    tok = new Token('}');	// digraph
                else if (digraphs && d == ':')
                    PASTE:
                    {
                        d = read();
                        if (d != '%') {
                            unread(d);
                            tok = new Token('#');	// digraph
                            break PASTE;
                        }
                        d = read();
                        if (d != ':') {
                            unread(d);	// Unread 2 chars here.
                            unread('%');
                            tok = new Token('#');	// digraph
                            break PASTE;
                        }
                        tok = new Token(PASTE);	// digraph
                    }
                else
                    unread(d);
                break;

            case ':':
                /* :: */
                d = read();
                if (digraphs && d == '>')
                    tok = new Token(']');	// digraph
                else
                    unread(d);
                break;

            case '<':
                if (include) {
                    tok = string('<', '>');
                } else {
                    d = read();
                    if (d == '=')
                        tok = new Token(LE);
                    else if (d == '<')
                        tok = cond('=', LSH_EQ, LSH);
                    else if (digraphs && d == ':')
                        tok = new Token('[');	// digraph
                    else if (digraphs && d == '%')
                        tok = new Token('{');	// digraph
                    else
                        unread(d);
                }
                break;

            case '=':
                tok = cond('=', EQ, '=');
                break;

            case '>':
                d = read();
                if (d == '=')
                    tok = new Token(GE);
                else if (d == '>')
                    tok = cond('=', RSH_EQ, RSH);
                else
                    unread(d);
                break;

            case '^':
                tok = cond('=', XOR_EQ, '^');
                break;

            case '|':
                d = read();
                if (d == '=')
                    tok = new Token(OR_EQ);
                else if (d == '|')
                    tok = cond('=', LOR_EQ, LOR);
                else
                    unread(d);
                break;
            case '&':
                d = read();
                if (d == '&')
                    tok = cond('=', LAND_EQ, LAND);
                else if (d == '=')
                    tok = new Token(AND_EQ);
                else
                    unread(d);
                break;

            case '.':
                d = read();
                if (d == '.')
                    tok = cond('.', ELLIPSIS, RANGE);
                else
                    unread(d);
                if (Character.isDigit(d)) {
                    unread('.');
                    tok = number();
                }
                /* XXX decimal fraction */
                break;

            case '\'':
                tok = string('\'', '\'');
                break;

            case '"':
                tok = string('"', '"');
                break;

            case -1:
                close();
                tok = new Token(EOF, _l, _c, "");
                break;
        }

        if (tok == null) {
            if (Character.isWhitespace(c)) {
                tok = whitespace(c);
            } else if (Character.isDigit(c)) {
                unread(c);
                tok = number();
            } else if (Character.isJavaIdentifierStart(c)) {
                tok = identifier(c);
            } else {
                tok = new Token(c);
            }
        }

        if (bol) {
            switch (tok.getType()) {
                case WHITESPACE:
                case CCOMMENT:
                    break;
                default:
                    bol = false;
                    break;
            }
        }

        tok.setLocation(_l, _c);
        if (DEBUG)
            System.out.println("lx: Returning " + tok);
        // (new Exception("here")).printStackTrace(System.out);
        return tok;
    }

    @Override
    public void close()
            throws IOException {
        if (reader != null) {
            reader.close();
            reader = null;
        }
        super.close();
    }

}