com.palantir.javaformat.java.JavaInput Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of palantir-java-format Show documentation
Palantir open source project
There is a newer version: 2.50.0
/*
 * Copyright 2015 Google Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 * in compliance with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed under the License
 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied. See the License for the specific language governing permissions and limitations under
 * the License.
 */

package com.palantir.javaformat.java;

import static com.google.common.base.Preconditions.checkNotNull;
import static com.google.common.collect.Iterables.getLast;
import static java.nio.charset.StandardCharsets.UTF_8;

import com.google.common.base.MoreObjects;
import com.google.common.base.Suppliers;
import com.google.common.base.Verify;
import com.google.common.collect.DiscreteDomain;
import com.google.common.collect.ImmutableCollection;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableRangeMap;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Iterators;
import com.google.common.collect.Range;
import com.google.common.collect.RangeSet;
import com.google.common.collect.TreeRangeSet;
import com.google.errorprone.annotations.Immutable;
import com.palantir.javaformat.Input;
import com.palantir.javaformat.Newlines;
import com.palantir.javaformat.java.JavacTokens.RawTok;
import com.sun.tools.javac.file.JavacFileManager;
import com.sun.tools.javac.parser.Tokens.TokenKind;
import com.sun.tools.javac.tree.JCTree.JCCompilationUnit;
import com.sun.tools.javac.util.Context;
import com.sun.tools.javac.util.Log;
import com.sun.tools.javac.util.Log.DeferredDiagnosticHandler;
import com.sun.tools.javac.util.Options;
import java.io.IOException;
import java.net.URI;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Iterator;
import java.util.List;
import java.util.function.Supplier;
import java.util.regex.Pattern;
import javax.tools.Diagnostic;
import javax.tools.DiagnosticCollector;
import javax.tools.DiagnosticListener;
import javax.tools.JavaFileObject;
import javax.tools.JavaFileObject.Kind;
import javax.tools.SimpleJavaFileObject;

/** {@code JavaInput} extends {@link Input} to represent a Java input document. */
public final class JavaInput extends Input {
    /**
     * A {@code JavaInput} is a sequence of {@link Tok}s that cover the Java input. A {@link Tok} is either a token (if
     * {@code isToken()}), or a non-token, which is a comment (if {@code isComment()}) or a newline (if
     * {@code isNewline()}) or a maximal sequence of other whitespace characters (if {@code isSpaces()}). Each
     * {@link Tok} contains a sequence of characters, an index (sequential starting at {@code 0} for tokens and
     * comments, else {@code -1}), and a ({@code 0}-origin) position in the input. The concatenation of the texts of all
     * the {@link Tok}s equals the input. Each Input ends with a token EOF {@link Tok}, with empty text.
     *
     * A {@code /*} comment possibly contains newlines; a {@code //} comment does not contain the terminating newline
     * character, but is followed by a newline {@link Tok}.
     */
    @Immutable
    static final class Tok implements Input.Tok {
        private final int index;
        private final String originalText;
        private final String text;
        private final int position;
        private final int columnI;
        private final boolean isToken;
        private final TokenKind kind;

        /**
         * The {@code Tok} constructor.
         *
         * @param index its index
         * @param originalText its original text, before removing Unicode escapes
         * @param text its text after removing Unicode escapes
         * @param position its {@code 0}-origin position in the input
         * @param columnI its {@code 0}-origin column number in the input
         * @param isToken whether the {@code Tok} is a token
         * @param kind the token kind
         */
        Tok(int index, String originalText, String text, int position, int columnI, boolean isToken, TokenKind kind) {
            this.index = index;
            this.originalText = originalText;
            this.text = text;
            this.position = position;
            this.columnI = columnI;
            this.isToken = isToken;
            this.kind = kind;
        }

        @Override
        public int getIndex() {
            return index;
        }

        @Override
        public String getText() {
            return text;
        }

        @Override
        public String getOriginalText() {
            return originalText;
        }

        @Override
        public int length() {
            return originalText.length();
        }

        @Override
        public int getPosition() {
            return position;
        }

        @Override
        public int getColumn() {
            return columnI;
        }

        boolean isToken() {
            return isToken;
        }

        @Override
        public boolean isNewline() {
            return Newlines.isNewline(text);
        }

        @Override
        public boolean isSlashSlashComment() {
            return text.startsWith("//");
        }

        @Override
        public boolean isSlashStarComment() {
            return text.startsWith("/*");
        }

        @Override
        public boolean isJavadocComment() {
            return text.startsWith("/**") && text.length() > 4;
        }

        @Override
        public boolean isComment() {
            return isSlashSlashComment() || isSlashStarComment();
        }

        @Override
        public String toString() {
            return MoreObjects.toStringHelper(this)
                    .add("index", index)
                    .add("text", text)
                    .add("position", position)
                    .add("columnI", columnI)
                    .add("isToken", isToken)
                    .toString();
        }

        public TokenKind kind() {
            return kind;
        }
    }

    /**
     * A {@link Token} contains a token {@link Tok} and its associated non-tokens; each non-token {@link Tok} belongs to
     * one {@link Token}. Each {@link Token} has an immutable list of its non-tokens that appear before it, and another
     * list of its non-tokens that appear after it. The concatenation of the texts of all the {@link Token}s'
     * {@link Tok}s, each preceded by the texts of its {@code toksBefore} and followed by the texts of its
     * {@code toksAfter}, equals the input.
     */
    @Immutable
    static final class Token implements Input.Token {
        private final Tok tok;
        private final ImmutableList<Tok> toksBefore;
        private final ImmutableList<Tok> toksAfter;

        /**
         * Token constructor. Both lists are defensively copied into immutable lists.
         *
         * @param toksBefore the earlier non-token {@link Tok}s assigned to this {@code Token}
         * @param tok this token {@link Tok}
         * @param toksAfter the later non-token {@link Tok}s assigned to this {@code Token}
         */
        Token(List<Tok> toksBefore, Tok tok, List<Tok> toksAfter) {
            this.toksBefore = ImmutableList.copyOf(toksBefore);
            this.tok = tok;
            this.toksAfter = ImmutableList.copyOf(toksAfter);
        }

        /**
         * Get the token's {@link Tok}.
         *
         * @return the token's {@link Tok}
         */
        @Override
        public Tok getTok() {
            return tok;
        }

        /**
         * Get the earlier {@link Tok}s assigned to this {@code Token}.
         *
         * @return the earlier {@link Tok}s assigned to this {@code Token}
         */
        @Override
        public ImmutableList<Tok> getToksBefore() {
            return toksBefore;
        }

        /**
         * Get the later {@link Tok}s assigned to this {@code Token}.
         *
         * @return the later {@link Tok}s assigned to this {@code Token}
         */
        @Override
        public ImmutableList<Tok> getToksAfter() {
            return toksAfter;
        }

        @Override
        public String toString() {
            return MoreObjects.toStringHelper(this)
                    .add("tok", tok)
                    .add("toksBefore", toksBefore)
                    .add("toksAfter", toksAfter)
                    .toString();
        }
    }

    private final String text; // The input.
    // Line separator guessed from the input text; computed on first use and memoized.
    private final Supplier<String> lineSeparator = Suppliers.memoize(() -> Newlines.guessLineSeparator(getText()));
    private int kN; // The number of numbered toks (tokens or comments), excluding the EOF.

    /*
     * The following lists record the sequential indices of the {@code Tok}s on each input line. (Only
     * tokens and comments have sequential indices.) Tokens and {@code //} comments lie on just one
     * line; {@code /*} comments can lie on multiple lines. These data structures (along with
     * equivalent ones for the formatted output) let us compute correspondences between the input and
     * output.
     *
     * NOTE(review): no per-line index lists are declared below; this paragraph may describe state that
     * now lives elsewhere (presumably in the Input superclass) - confirm.
     */

    private final ImmutableMap<Integer, Integer> positionToColumnMap; // Map Tok position to column.
    private final ImmutableList<Token> tokens; // The Tokens for this input.
    private final ImmutableRangeMap<Integer, Token> positionTokenMap; // Map position to Token.

    /** Map from Tok index to the associated Token. */
    private final Token[] kToToken;

    /**
     * Input constructor. Lexes the text into toks, groups them into {@link Token}s, and builds the lookup
     * structures from character position and from tok index to the owning {@link Token}.
     *
     * @param text the input text
     * @throws FormatterException if the input cannot be parsed
     */
    public JavaInput(String text) throws FormatterException {
        this.text = checkNotNull(text);
        setLines(ImmutableList.copyOf(Newlines.lineIterator(text)));
        ImmutableList<Tok> toks = buildToks(text);
        positionToColumnMap = makePositionToColumnMap(toks);
        ImmutableList<Token> builtTokens = buildTokens(toks);
        tokens = builtTokens;

        // Each Token covers the closed character range from the start of its first tok to the last
        // character of its last tok; an empty last tok (the EOF) contributes only its own position.
        ImmutableRangeMap.Builder<Integer, Token> tokenLocations = ImmutableRangeMap.builder();
        for (Token token : builtTokens) {
            Input.Tok end = JavaOutput.endTok(token);
            int upper = end.getPosition();
            if (!end.getText().isEmpty()) {
                upper += end.length() - 1;
            }
            tokenLocations.put(Range.closed(JavaOutput.startTok(token).getPosition(), upper), token);
        }
        positionTokenMap = tokenLocations.build();

        // adjust kN for EOF: the EOF tok has index kN, so kN + 1 slots are needed
        kToToken = new Token[kN + 1];
        for (Token token : builtTokens) {
            for (Input.Tok tok : token.getToksBefore()) {
                // Unnumbered toks (whitespace, newlines) have index -1 and no slot.
                if (tok.getIndex() < 0) {
                    continue;
                }
                kToToken[tok.getIndex()] = token;
            }
            kToToken[token.getTok().getIndex()] = token;
            for (Input.Tok tok : token.getToksAfter()) {
                if (tok.getIndex() < 0) {
                    continue;
                }
                kToToken[tok.getIndex()] = token;
            }
        }
    }

    /**
     * Builds the map from each tok's position to its column.
     *
     * @param toks the toks of the input
     * @return an immutable map keyed by tok position; {@code buildOrThrow} rejects duplicate positions
     */
    private static ImmutableMap<Integer, Integer> makePositionToColumnMap(List<Tok> toks) {
        ImmutableMap.Builder<Integer, Integer> builder = ImmutableMap.builder();
        for (Tok tok : toks) {
            builder.put(tok.getPosition(), tok.getColumn());
        }
        return builder.buildOrThrow();
    }

    /**
     * Get the input text.
     *
     * @return the input text, exactly as passed to the constructor
     */
    @Override
    public String getText() {
        return text;
    }

    /**
     * Get the map from tok position to tok column.
     *
     * @return the immutable map from each tok's position to its column
     */
    @Override
    public ImmutableMap<Integer, Integer> getPositionToColumnMap() {
        return positionToColumnMap;
    }

    /**
     * Get the line separator guessed from the input text. The guess is computed on the first call and memoized.
     *
     * @return the line separator reported by {@link Newlines#guessLineSeparator} for the input text
     */
    public String getLineSeparator() {
        return lineSeparator.get();
    }

    /**
     * Lex the input and build the list of toks. As side effects, records the index of the final (EOF) tok in
     * {@code kN} and passes the toks to {@code computeRanges}.
     *
     * @param text the text to be lexed
     * @return the toks covering {@code text}
     * @throws FormatterException if lexing fails
     */
    private ImmutableList<Tok> buildToks(String text) throws FormatterException {
        ImmutableList<Tok> toks = buildToks(text, ImmutableSet.of());
        kN = getLast(toks).getIndex();
        computeRanges(toks);
        return toks;
    }

    /**
     * Lex the input and build the list of toks.
     *
     * <p>Whitespace is split into one tok per run of spaces and one per line ending; operators and punctuation
     * are split into one-character toks. The returned list always ends with an EOF tok whose text is empty. If the
     * lexer reports an error diagnostic, the returned list contains only that EOF tok.
     *
     * @param text the text to be lexed.
     * @param stopTokens a set of tokens which should cause lexing to stop. If one of these is found, the returned list
     *     will include tokens up to but not including that token.
     * @return the toks covering the lexed part of {@code text}, ending with the EOF tok
     * @throws FormatterException if a tok that has to be split differs from its original text (reported as Unicode
     *     escapes in whitespace or multi-character operators)
     */
    static ImmutableList<Tok> buildToks(String text, ImmutableSet<TokenKind> stopTokens) throws FormatterException {
        stopTokens = ImmutableSet.<TokenKind>builder()
                .addAll(stopTokens)
                .add(TokenKind.EOF)
                .build();
        Context context = new Context();
        Options.instance(context).put("--enable-preview", "true");
        // The instance itself is not needed here; it is created for its effect on the context.
        new JavacFileManager(context, true, UTF_8);
        DiagnosticCollector<JavaFileObject> diagnosticCollector = new DiagnosticCollector<>();
        context.put(DiagnosticListener.class, diagnosticCollector);
        Log log = Log.instance(context);
        log.useSource(new SimpleJavaFileObject(URI.create("Source.java"), Kind.SOURCE) {
            @Override
            public CharSequence getCharContent(boolean ignoreEncodingErrors) throws IOException {
                return text;
            }
        });
        DeferredDiagnosticHandler diagnostics = new DeferredDiagnosticHandler(log);
        ImmutableList<RawTok> rawToks = JavacTokens.getTokens(text, context, stopTokens);
        if (diagnostics.getDiagnostics().stream().anyMatch(d -> d.getKind() == Diagnostic.Kind.ERROR)) {
            return ImmutableList.of(new Tok(0, "", "", 0, 0, true, null)); // EOF
        }
        int kN = 0; // Next index to hand out to a numbered tok.
        List<Tok> toks = new ArrayList<>();
        int charI = 0; // Position of the next tok in the input.
        int columnI = 0; // Column of the next tok in the input.
        for (RawTok t : rawToks) {
            if (stopTokens.contains(t.kind())) {
                break;
            }
            int charI0 = t.pos();
            // Get string, possibly with Unicode escapes.
            String originalTokText = text.substring(charI0, t.endPos());
            String tokText = t.kind() == TokenKind.STRINGLITERAL
                    ? t.stringVal() // Unicode escapes removed.
                    : originalTokText;
            char tokText0 = tokText.charAt(0); // The token's first character.
            final boolean isToken; // Is this tok a token?
            final boolean isNumbered; // Is this tok numbered? (tokens and comments)
            String extraNewline = null; // Extra newline at end?
            List<String> strings = new ArrayList<>(); // The pieces this raw tok is split into.
            if (Character.isWhitespace(tokText0)) {
                // Whitespace: one piece per run of spaces, one piece per line ending.
                isToken = false;
                isNumbered = false;
                Iterator<String> it = Newlines.lineIterator(originalTokText);
                while (it.hasNext()) {
                    String line = it.next();
                    String newline = Newlines.getLineEnding(line);
                    if (newline != null) {
                        String spaces = line.substring(0, line.length() - newline.length());
                        if (!spaces.isEmpty()) {
                            strings.add(spaces);
                        }
                        strings.add(newline);
                    } else if (!line.isEmpty()) {
                        strings.add(line);
                    }
                }
            } else if (tokText.startsWith("'") || tokText.startsWith("\"")) {
                // Character or string literal: kept whole.
                isToken = true;
                isNumbered = true;
                strings.add(originalTokText);
            } else if (tokText.startsWith("//") || tokText.startsWith("/*")) {
                // For compatibility with an earlier lexer, the newline after a // comment is its own tok.
                if (tokText.startsWith("//") && (originalTokText.endsWith("\n") || originalTokText.endsWith("\r"))) {
                    extraNewline = Newlines.getLineEnding(originalTokText);
                    tokText = tokText.substring(0, tokText.length() - extraNewline.length());
                    originalTokText = originalTokText.substring(0, originalTokText.length() - extraNewline.length());
                }
                isToken = false;
                isNumbered = true;
                strings.add(originalTokText);
            } else if (Character.isJavaIdentifierStart(tokText0)
                    || Character.isDigit(tokText0)
                    || (tokText0 == '.' && tokText.length() > 1 && Character.isDigit(tokText.charAt(1)))) {
                // Identifier, keyword, or numeric literal (a dot may begin a number, as in .2D).
                isToken = true;
                isNumbered = true;
                strings.add(tokText);
            } else {
                // Other tokens ("+" or "++" or ">>" are broken into one-character toks, because ">>"
                // cannot be lexed without syntactic knowledge. This implementation fails if the token
                // contains Unicode escapes.
                isToken = true;
                isNumbered = true;
                for (int i = 0; i < tokText.length(); i++) {
                    char c = tokText.charAt(i);
                    strings.add(String.valueOf(c));
                }
            }
            if (strings.size() == 1) {
                toks.add(new Tok(isNumbered ? kN++ : -1, originalTokText, tokText, charI, columnI, isToken, t.kind()));
                charI += originalTokText.length();
                columnI = updateColumn(columnI, originalTokText);

            } else {
                if (!tokText.equals(originalTokText)) {
                    throw new FormatterException(
                            "Unicode escapes not allowed in whitespace or multi-character operators");
                }
                for (String str : strings) {
                    toks.add(new Tok(isNumbered ? kN++ : -1, str, str, charI, columnI, isToken, null));
                    charI += str.length();
                    // Advance the column by this piece only. Advancing by the whole raw tok for every
                    // piece would count its text once per piece and shift the column of every later
                    // tok on the line.
                    columnI = updateColumn(columnI, str);
                }
            }
            if (extraNewline != null) {
                toks.add(new Tok(-1, extraNewline, extraNewline, charI, columnI, false, null));
                columnI = 0;
                charI += extraNewline.length();
            }
        }
        toks.add(new Tok(kN, "", "", charI, columnI, true, null)); // EOF tok.
        return ImmutableList.copyOf(toks);
    }

    /**
     * Computes the column reached after the given text, starting from {@code columnI}.
     *
     * @param columnI the column at which the text starts
     * @param originalTokText the text being stepped over
     * @return the text length minus the last offset reported by {@link Newlines#lineOffsetIterator} when that
     *     offset is positive; otherwise {@code columnI} plus the text length
     */
    private static int updateColumn(int columnI, String originalTokText) {
        // Presumably the start offset of the last line within the text; 0 when there is no line break.
        int lastLineOffset = Iterators.getLast(Newlines.lineOffsetIterator(originalTokText));
        int textLength = originalTokText.length();
        return lastLineOffset > 0 ? textLength - lastLineOffset : columnI + textLength;
    }

    /**
     * Groups the flat list of toks into {@link Token}s. Each token tok receives the non-token toks that precede
     * it, plus the non-token toks that follow it up to and including the first one containing a line break,
     * subject to the comment special cases noted in the body.
     *
     * @param toks the toks of the input; the loops rely on the list ending with a token tok (the EOF tok)
     * @return the {@link Token}s in input order
     */
    private static ImmutableList<Token> buildTokens(List<Tok> toks) {
        ImmutableList.Builder<Token> tokens = ImmutableList.builder();
        int k = 0;
        int kN = toks.size();

        // Remaining non-tokens before the token go here.
        ImmutableList.Builder<Tok> toksBefore = ImmutableList.builder();

        OUTERMOST:
        while (k < kN) {
            while (!toks.get(k).isToken()) {
                Tok tok = toks.get(k++);
                toksBefore.add(tok);
                if (isParamComment(tok)) {
                    while (toks.get(k).isNewline()) {
                        // drop newlines after parameter comments
                        k++;
                    }
                }
            }
            Tok tok = toks.get(k++);

            // Non-tokens starting on the same line go here too.
            ImmutableList.Builder<Tok> toksAfter = ImmutableList.builder();
            OUTER:
            while (k < kN && !toks.get(k).isToken()) {
                // Don't attach inline comments to certain leading tokens, e.g. for `f(/*flag1=*/true).
                //
                // Attaching inline comments to the right token is hard, and this barely
                // scratches the surface. But it's enough to do a better job with parameter
                // name comments.
                //
                // TODO(cushon): find a better strategy.
                if (toks.get(k).isSlashStarComment()) {
                    switch (tok.getText()) {
                        case "(":
                        case "<":
                        case ".":
                            break OUTER;
                        default:
                            break;
                    }
                }
                if (toks.get(k).isJavadocComment()) {
                    switch (tok.getText()) {
                        case ";":
                            break OUTER;
                        default:
                            break;
                    }
                }
                if (isParamComment(toks.get(k))) {
                    // Close the current Token and make the parameter comment the first
                    // "before" tok of the next one.
                    tokens.add(new Token(toksBefore.build(), tok, toksAfter.build()));
                    toksBefore = ImmutableList.<Tok>builder().add(toks.get(k++));
                    // drop newlines after parameter comments
                    while (toks.get(k).isNewline()) {
                        k++;
                    }
                    continue OUTERMOST;
                }
                Tok nonTokenAfter = toks.get(k++);
                toksAfter.add(nonTokenAfter);
                if (Newlines.containsBreaks(nonTokenAfter.getText())) {
                    break;
                }
            }
            tokens.add(new Token(toksBefore.build(), tok, toksAfter.build()));
            toksBefore = ImmutableList.builder();
        }
        return tokens.build();
    }

    // Whole-text pattern for a parameter-name comment such as /*flag=*/ : a block comment whose body is
    // letters, digits, whitespace, '_' or '-', followed by '=' and optional whitespace.
    // Compiled once because isParamComment runs for every non-token tok while tokens are built.
    private static final Pattern PARAM_COMMENT = Pattern.compile("\\/\\*[A-Za-z0-9\\s_\\-]+=\\s*\\*\\/");

    // Returns whether the tok is a block comment whose entire text matches PARAM_COMMENT.
    private static boolean isParamComment(Tok tok) {
        return tok.isSlashStarComment() && PARAM_COMMENT.matcher(tok.getText()).matches();
    }

    /**
     * Convert from an offset and length flag pair to a token range.
     *
     * @param offset the {@code 0}-based offset in characters
     * @param length the length in characters; {@code 0} is treated as a length of {@code 1}
     * @return the {@code 0}-based {@link Range} of tokens, or {@code EMPTY_RANGE} if {@code length} is negative
     *     or no token overlaps the character range
     * @throws FormatterException if {@code offset + length} is past the end of the input text
     */
    Range<Integer> characterRangeToTokenRange(int offset, int length) throws FormatterException {
        int requiredLength = offset + length;
        if (requiredLength > text.length()) {
            throw new FormatterException(String.format(
                    "error: invalid length %d, offset + length (%d) is outside the file", length, requiredLength));
        }
        if (length < 0) {
            return EMPTY_RANGE;
        }
        if (length == 0) {
            // 0 stands for "format the line under the cursor"
            length = 1;
        }
        // Tokens whose character range intersects [offset, offset + length).
        ImmutableCollection<? extends Input.Token> enclosed = getPositionTokenMap()
                .subRangeMap(Range.closedOpen(offset, offset + length))
                .asMapOfRanges()
                .values();
        if (enclosed.isEmpty()) {
            return EMPTY_RANGE;
        }
        return Range.closedOpen(
                enclosed.iterator().next().getTok().getIndex(),
                getLast(enclosed).getTok().getIndex() + 1);
    }

    /**
     * Get the number of numbered toks (tokens and comments).
     *
     * @return the number of numbered toks, not counting the EOF tok; this is also the index of the EOF tok
     */
    int getkN() {
        return kN;
    }

    /**
     * Get the Token by index.
     *
     * @param k the index of a numbered tok (a token or a comment)
     * @return the {@link Token} that the tok with index {@code k} was assigned to
     */
    Token getToken(int k) {
        return kToToken[k];
    }

    /**
     * Get the input tokens.
     *
     * @return the input tokens, in input order
     */
    @Override
    public ImmutableList<Token> getTokens() {
        return tokens;
    }

    /**
     * Get the navigable map from position to {@link Token}. Used to look for tokens following a given one, and to
     * implement the --offset and --length flags to reformat a character range in the input file.
     *
     * @return the navigable map from position to {@link Token}, keyed by closed ranges of character positions
     */
    @Override
    public ImmutableRangeMap<Integer, Token> getPositionTokenMap() {
        return positionTokenMap;
    }

    /** Describes this input by its tokens followed by the superclass's own description. */
    @Override
    public String toString() {
        MoreObjects.ToStringHelper description = MoreObjects.toStringHelper(this);
        description.add("tokens", tokens);
        description.add("super", super.toString());
        return description.toString();
    }

    // Compilation unit used for line/column lookups; not assigned by the constructor, so it stays null
    // until setCompilationUnit is called.
    private JCCompilationUnit unit;

    /**
     * Looks up the line number of an input position in the compilation unit's line map.
     *
     * @param inputPosition the position in the input
     * @return the line number reported by the line map
     */
    @Override
    public int getLineNumber(int inputPosition) {
        // Fails verification if no compilation unit has been set.
        JCCompilationUnit checkedUnit = Verify.verifyNotNull(unit, "Expected compilation unit to be set.");
        return checkedUnit.getLineMap().getLineNumber(inputPosition);
    }

    /**
     * Looks up the column number of an input position in the compilation unit's line map.
     *
     * @param inputPosition the position in the input
     * @return the column number reported by the line map
     */
    @Override
    public int getColumnNumber(int inputPosition) {
        // Fails verification if no compilation unit has been set.
        JCCompilationUnit checkedUnit = Verify.verifyNotNull(unit, "Expected compilation unit to be set.");
        return checkedUnit.getLineMap().getColumnNumber(inputPosition);
    }

    // TODO(cushon): refactor JavaInput so the CompilationUnit can be passed into
    // the constructor.
    /**
     * Sets the compilation unit whose line map backs {@link #getLineNumber} and {@link #getColumnNumber}. Those
     * methods fail verification until a non-null unit has been set.
     *
     * @param unit the compilation unit for this input
     */
    public void setCompilationUnit(JCCompilationUnit unit) {
        this.unit = unit;
    }

    public RangeSet characterRangesToTokenRanges(Collection> characterRanges)
            throws FormatterException {
        RangeSet tokenRangeSet = TreeRangeSet.create();
        for (Range characterRange0 : characterRanges) {
            Range characterRange = characterRange0.canonical(DiscreteDomain.integers());
            tokenRangeSet.add(characterRangeToTokenRange(
                    characterRange.lowerEndpoint(), characterRange.upperEndpoint() - characterRange.lowerEndpoint()));
        }
        return tokenRangeSet;
    }
}