org.neo4j.cypher.internal.parser.lexer.UnicodeEscapeReplacementReader Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of cypher-antlr-ast-common Show documentation
Common files used by all cypher ast antlr parsers
There is a newer version: 5.26.1
Show newest version
/*
 * Copyright (c) "Neo4j"
 * Neo4j Sweden AB [https://neo4j.com]
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.neo4j.cypher.internal.parser.lexer;

import static java.lang.Character.isHighSurrogate;
import static java.lang.Character.isLowSurrogate;
import static java.lang.Math.min;
import static org.antlr.v4.runtime.CodePointCharStream.fromBuffer;
import static org.antlr.v4.runtime.IntStream.UNKNOWN_SOURCE_NAME;

import java.io.IOException;
import java.io.Reader;
import java.nio.CharBuffer;
import org.antlr.v4.runtime.CharStream;
import org.antlr.v4.runtime.CodePointBuffer;
import org.eclipse.collections.impl.list.mutable.primitive.IntArrayList;
import org.neo4j.cypher.internal.util.InputPosition;

/** Reader that replace cypher unicode escape codes and keep track of query offsets. */
public class UnicodeEscapeReplacementReader extends Reader {
    public static int DEFAULT_BUFFER_SIZE = 4096;
    private final String cypher;
    private int srcPos;
    private OffsetTableBuilder offsetTable = null;
    boolean escaped = false; // If true last character was a backslash

    private UnicodeEscapeReplacementReader(String cypher) {
        this.cypher = cypher;
    }

    public static Result read(String cypher) throws IOException {
        return read(cypher, DEFAULT_BUFFER_SIZE);
    }

    public static Result read(String cypher, int maxBuffer) throws IOException {
        final var antlrBuffer = CodePointBuffer.builder(cypher.length());
        final var cb = CharBuffer.allocate(min(maxBuffer, cypher.length()));

        try (final var reader = new UnicodeEscapeReplacementReader(cypher)) {
            while (reader.read(cb.clear()) != -1) antlrBuffer.append(cb.flip());
            final var charStream = fromBuffer(antlrBuffer.build(), UNKNOWN_SOURCE_NAME);
            return new Result(charStream, reader.offsetTable());
        }
    }

    @Override
    public int read(char[] cbuf, int off, int len) {
        if (srcPos >= cypher.length()) return -1;
        return doRead(cbuf, off, len);
    }

    private int doRead(char[] cbuf, int off, int len) {
        final String src = cypher;
        int srcPos = this.srcPos, srcEnd = src.length(), destPos = off, destEnd = off + len;
        boolean escaped = this.escaped;
        var offsetTable = this.offsetTable;
        while (srcPos < srcEnd && destPos < destEnd) {
            char c = src.charAt(srcPos);

            boolean isReplacement = !escaped && c == '\\' && peek(srcPos + 1) == 'u';
            if (isReplacement) c = parseUnicodeReplacement(srcPos);

            boolean isHighSurrogate = isHighSurrogate(c);
            // Antlr do not like cut off surrogates, so we do our best to avoid that here
            if (isHighSurrogate && destPos + 1 == destEnd && srcPos + 1 < srcEnd && len > 1) break;

            // If input positions in the result (relative to codepoints) is different from input positions
            // in the input (relative to chars) we need to update the offset table. Most queries don't need this.
            if (offsetTable != null || isReplacement || isHighSurrogate) offsetTable = updateOffsetTable(srcPos, c);

            escaped = !escaped && !isReplacement && c == '\\';
            cbuf[destPos++] = c;
            srcPos += isReplacement ? 6 : 1;
        }
        this.srcPos = srcPos;
        this.escaped = escaped;
        return destPos - off;
    }

    // Peeks the character at the specified offset or return 0(!) if none is found
    private char peek(int pos) {
        return pos < cypher.length() ? cypher.charAt(pos) : 0;
    }

    private OffsetTableBuilder updateOffsetTable(int charPos, char destChar) {
        if (offsetTable == null) {
            offsetTable = new OffsetTableBuilder(inputPositionAt(charPos), cypher.charAt(charPos), destChar);
        } else {
            offsetTable.updateOffsets(charPos, cypher.charAt(charPos), destChar);
        }
        return offsetTable;
    }

    private InputPosition inputPositionAt(int pos) {
        int line = 1, col = 1;
        for (int i = 0; i < pos && i < cypher.length(); ++i) {
            char c = cypher.charAt(i);
            if (c == '\n' || (c == '\r' && peek(i + 1) != '\n')) {
                line += 1;
                col = 0;
            }
            col += 1;
        }
        return InputPosition.apply(pos, line, col);
    }

    private char parseUnicodeReplacement(int charPos) {
        final var hexString = cypher.substring(min(charPos + 2, cypher.length()), min(charPos + 6, cypher.length()));
        try {
            return (char) Integer.parseInt(hexString, 16);
        } catch (Exception e) {
            var pos = inputPositionAt(charPos + 2);
            var m = "Invalid input '%s': expected four hexadecimal digits specifying a unicode character";
            throw new InvalidUnicodeLiteral(m.formatted(hexString), charPos, pos.line(), pos.column());
        }
    }

    public int[] offsetTable() {
        return offsetTable != null ? offsetTable.offsets.toArray() : null;
    }

    @Override
    public void close() {}

    private static class OffsetTableBuilder {
        IntArrayList offsets = new IntArrayList();
        char lastSrcChar, lastDestChar;

        OffsetTableBuilder(InputPosition start, char srcChar, char destChar) {
            this.offsets.addAll(start.offset(), start.line(), start.column());
            this.lastSrcChar = srcChar;
            this.lastDestChar = destChar;
        }

        void updateOffsets(int charPos, char srcChar, char destChar) {
            int line = offsets.get(offsets.size() - 2);
            int col = offsets.getLast() + charPos - offsets.get(offsets.size() - 3);
            if (lastSrcChar == '\n' || (lastSrcChar == '\r' && srcChar != '\n')) {
                line += 1;
                col = 1;
            }
            if (!(isHighSurrogate(lastDestChar) && isLowSurrogate(destChar))) offsets.addAll(charPos, line, col);
            lastSrcChar = srcChar;
            lastDestChar = destChar;
        }
    }

    public record Result(CharStream charStream, int[] offsetTable) {}

    public static class InvalidUnicodeLiteral extends RuntimeException {
        public final int offset, line, column;

        private InvalidUnicodeLiteral(String message, int offset, int line, int column) {
            super(message);
            this.offset = offset;
            this.line = line;
            this.column = column;
        }
    }
}