All Downloads are FREE. Search and download functionalities are using the official Maven repository.

cn.wjybxx.dson.text.DsonScanner Maven / Gradle / Ivy

The newest version!
/*
 * Copyright 2023-2024 wjybxx([email protected])
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package cn.wjybxx.dson.text;

import cn.wjybxx.base.ObjectUtils;
import cn.wjybxx.base.pool.ConcurrentObjectPool;
import cn.wjybxx.base.pool.ObjectPool;

import java.util.HexFormat;
import java.util.List;
import java.util.Objects;

/**
 * @author wjybxx
 * date - 2023/6/2
 */
public final class DsonScanner implements AutoCloseable {

    /** Token types accepted where a string value is required; reported as "expected" in error messages. */
    private static final List<DsonTokenType> STRING_TOKEN_TYPES = List.of(DsonTokenType.STRING, DsonTokenType.UNQUOTE_STRING);
    /** Pool from which every scanner borrows its scratch {@link StringBuilder}. */
    private static final ObjectPool<StringBuilder> STRING_BUILDER_POOL = ConcurrentObjectPool.SHARED_STRING_BUILDER_POOL;

    /** Character source; set to null once the scanner is closed. */
    private DsonCharStream charStream;
    /** Scratch builder borrowed from the pool; set to null once it has been returned. */
    private StringBuilder pooledStringBuilder;
    /** Scratch buffer created with size argument 4, for the hex digits of a unicode escape. */
    private final CharBuffer hexBuffer = new CharBuffer(4);

    /**
     * Creates a scanner over in-memory Dson text by wrapping it in a {@link StringCharStream}.
     *
     * @param dson the Dson text to scan
     */
    public DsonScanner(CharSequence dson) {
        this(new StringCharStream(dson));
    }

    /**
     * Creates a scanner that reads from the given character stream and borrows a scratch
     * builder from the string-builder pool.
     *
     * @param charStream the character source; must not be null
     * @throws NullPointerException if {@code charStream} is null
     */
    public DsonScanner(DsonCharStream charStream) {
        // Validate first so that nothing is borrowed from the pool for an invalid argument.
        Objects.requireNonNull(charStream);
        this.charStream = charStream;
        this.pooledStringBuilder = STRING_BUILDER_POOL.acquire();
    }

    /**
     * Closes the underlying character stream and returns the scratch builder to the pool.
     * Calling this method more than once has no further effect.
     */
    @Override
    public void close() {
        try {
            final DsonCharStream stream = charStream;
            if (stream != null) {
                // Clear the field first so a failing close() is never retried on the same stream.
                charStream = null;
                stream.close();
            }
        } finally {
            // Always hand the builder back, even when closing the stream failed.
            final StringBuilder sb = pooledStringBuilder;
            if (sb != null) {
                pooledStringBuilder = null;
                STRING_BUILDER_POOL.release(sb);
            }
        }
    }

    /**
     * Scans the next token with value parsing enabled; equivalent to {@code nextToken(false)}.
     *
     * @return the next token
     */
    public DsonToken nextToken() {
        return nextToken(false);
    }

    /**
     * Scans and returns the next token, skipping leading whitespace and comments.
     *
     * @param skipValue whether to skip value parsing; when true the input is only scanned and the
     *                  content is not extracted for parsing, which is particularly useful for
     *                  quickly scanning ahead to locate a position
     * @return the next token; a token of type {@link DsonTokenType#EOF} at the end of input
     * @throws DsonParseException if the scanner has been closed or the input is malformed
     */
    public DsonToken nextToken(boolean skipValue) {
        final DsonCharStream stream = charStream;
        if (stream == null) {
            throw new DsonParseException("Scanner closed");
        }
        while (true) {
            final int ch = skipWhitespace();
            if (ch == -1) {
                return new DsonToken(DsonTokenType.EOF, "eof", getPosition());
            }
            switch (ch) {
                case '{' -> {
                    return new DsonToken(DsonTokenType.BEGIN_OBJECT, "{", getPosition());
                }
                case '}' -> {
                    return new DsonToken(DsonTokenType.END_OBJECT, "}", getPosition());
                }
                case '[' -> {
                    return new DsonToken(DsonTokenType.BEGIN_ARRAY, "[", getPosition());
                }
                case ']' -> {
                    return new DsonToken(DsonTokenType.END_ARRAY, "]", getPosition());
                }
                case ':' -> {
                    return new DsonToken(DsonTokenType.COLON, ":", getPosition());
                }
                case ',' -> {
                    return new DsonToken(DsonTokenType.COMMA, ",", getPosition());
                }
                case '@' -> {
                    return parseTypeToken(skipValue);
                }
                case '"' -> {
                    // Either a quoted string or, with three quotes, a plain text block.
                    final int indent = stream.getColumn() - 1;
                    final int second = stream.read();
                    if (second == '"') {
                        final int third = stream.read();
                        if (third == '"') {
                            // -2 is the line-break marker: the opening quotes must end their line.
                            if (stream.read() != -2) {
                                throw new DsonParseException("Illegal text block start: missing new line after opening quotes, position: " + getPosition());
                            }
                            return new DsonToken(DsonTokenType.STRING, scanSimpleText(indent, skipValue), getPosition());
                        }
                        // Exactly two quotes: push both look-ahead reads back so that the
                        // string scan below sees the closing quote of an empty string.
                        stream.unread();
                        stream.unread();
                    } else {
                        stream.unread();
                    }
                    return new DsonToken(DsonTokenType.STRING, scanString(skipValue), getPosition());
                }
                case '/' -> skipComment(); // then loop around for the next token
                default -> {
                    return new DsonToken(DsonTokenType.UNQUOTE_STRING, scanUnquotedString((char) ch, skipValue), getPosition());
                }
            }
        }
    }

    // region common

    /**
     * Verifies that a token is a quoted or unquoted string.
     *
     * @param tokenType the type of the token that was read
     * @param position  the position reported in the error message
     * @throws DsonParseException if the token is not a string token
     */
    private static void ensureStringToken(DsonTokenType tokenType, int position) {
        if (tokenType == DsonTokenType.UNQUOTE_STRING || tokenType == DsonTokenType.STRING) {
            return;
        }
        throw invalidTokenType(STRING_TOKEN_TYPES, tokenType, position);
    }

    /**
     * Builds the exception reported when a token of an unexpected type is found.
     *
     * @param expected  the token types that would have been accepted
     * @param tokenType the token type actually found
     * @param position  the position reported in the message
     * @return the exception to throw
     */
    private static DsonParseException invalidTokenType(List<DsonTokenType> expected, DsonTokenType tokenType, int position) {
        return new DsonParseException(String.format("Invalid Dson Token. Position: %d. Expected: %s. Found: '%s'.",
                position, expected, tokenType));
    }

    /**
     * Builds the exception reported for an invalid class name.
     *
     * @param c        the offending class-name text
     * @param position the position reported in the message
     * @return the exception to throw
     */
    private static DsonParseException invalidClassName(String c, int position) {
        final String message = "Invalid className. Position: %d. ClassName: '%s'.".formatted(position, c);
        return new DsonParseException(message);
    }

    /**
     * Builds the exception reported for an invalid escape sequence.
     *
     * @param c        the character that followed the backslash
     * @param position the position reported in the message
     * @return the exception to throw
     */
    private static DsonParseException invalidEscapeSequence(int c, int position) {
        final String message = "Invalid escape sequence. Position: %d. Character: '\\%c'.".formatted(position, c);
        return new DsonParseException(message);
    }

    /**
     * Builds the exception reported when a mandatory space is missing.
     *
     * @param position the position reported in the message
     * @return the exception to throw
     */
    private static DsonParseException spaceRequired(int position) {
        final String message = "Space is required. Position: %d.".formatted(position);
        return new DsonParseException(message);
    }

    /**
     * Returns the scanner's scratch builder, emptied and ready for reuse. Every call hands out
     * the same instance, so earlier content is lost.
     */
    private StringBuilder getCachedStringBuilder() {
        final StringBuilder scratch = pooledStringBuilder;
        scratch.setLength(0);
        return scratch;
    }

    /** Returns the current position as reported by the underlying character stream. */
    private int getPosition() {
        return charStream.getPosition();
    }

    // endregion

    // region header

    /**
     * Handles the input that follows an '@' character: a header, a Dson text block or a
     * built-in value label.
     *
     * @param skipValue whether value parsing is skipped
     * @return the scanned token
     * @throws DsonParseException if the input ends right after '@' or is malformed
     */
    private DsonToken parseTypeToken(boolean skipValue) {
        final DsonCharStream stream = charStream;
        final int head = stream.read();
        if (head < 0) {
            throw invalidClassName("@", getPosition());
        }
        switch (head) {
            case '{':
                // '@{' starts a header, which is either {k: v} or the @{clsName} shorthand;
                // the header scan tells the two apart.
                return scanHeader();
            case '"': {
                // '@"""' starts a Dson text block; indentation is determined by the line heads.
                final boolean tripleQuoted = stream.read() == '"' && stream.read() == '"';
                if (!tripleQuoted) {
                    throw new DsonParseException("Illegal text block start: missing quotes, position: " + getPosition());
                }
                if (stream.read() != -2) {
                    throw new DsonParseException("Illegal text block start: missing new line after opening quotes, position: " + getPosition());
                }
                // Push the line break back: the text scan consumes it to find the first line head.
                stream.unread();
                return new DsonToken(DsonTokenType.STRING, scanDsonText(skipValue), getPosition());
            }
            default:
                // '@' followed by a label marks a built-in value type: @i, @L, ...
                return scanBuiltinValue(head, skipValue);
        }
    }

    /**
     * Scans the content that follows '@{' and decides between the two header forms.
     * <p>
     * Headers are never scanned in skip mode: header information is important, and headers
     * make up only a small share of the input.
     *
     * @return a {@code SIMPLE_HEADER} token carrying the class name for the {@code @{clsName}}
     * shorthand, or a {@code BEGIN_HEADER} token for the object form {@code @{k: v}}; in the
     * latter case the stream is rewound to just after the '{'
     * @throws DsonParseException if the input ends prematurely or the class name is invalid
     */
    private DsonToken scanHeader() {
        DsonCharStream buffer = this.charStream;
        final int beginPos = buffer.getPosition();
        int firstChar = skipWhitespace(); // whitespace is skipped inside {}
        if (firstChar < 0) {
            throw invalidClassName("@{", getPosition());
        }
        String className;
        if (firstChar == '"') {
            className = scanString(false);
        } else {
            // Without double quotes the name may consist of safe characters only.
            if (DsonTexts.isUnsafeStringChar(firstChar)) {
                throw invalidClassName(Character.toString((char) firstChar), getPosition());
            }
            StringBuilder sb = getCachedStringBuilder();
            sb.append((char) firstChar);
            if (!isClsNameHeader(buffer, sb, beginPos)) {
                // The look-ahead failed: the stream is back at beginPos and sb holds the
                // look-ahead text. Start over, otherwise leading whitespace after '{' would
                // stop the name scan at once and the header would be rejected.
                sb.setLength(0);
                skipWhitespace(); // consumes the same leading whitespace and firstChar again
                sb.append((char) firstChar);
                int c;
                while ((c = buffer.read()) >= 0) {
                    if (DsonTexts.isUnsafeStringChar(c)) {
                        break;
                    }
                    sb.append((char) c);
                }
                // The loop only ends on a negative marker or an unsafe character; push it back.
                buffer.unread();
            }
            className = sb.toString();
        }
        // Inside {} the next character must be ':' or '}'.
        int nextChar = skipWhitespace();
        if (nextChar == ':') { // @{k: v} object form: rewind so the caller parses it as an object
            while (buffer.getPosition() > beginPos) {
                buffer.unread();
            }
            return new DsonToken(DsonTokenType.BEGIN_HEADER, "{", beginPos);
        } else if (nextChar == '}') { // @{clsName} shorthand form
            return new DsonToken(DsonTokenType.SIMPLE_HEADER, className, getPosition());
        } else {
            throw invalidClassName(className, getPosition());
        }
    }

    /**
     * Looks ahead to decide whether the header is the class-name shorthand: if no ':' appears
     * before the closing '}', the content is treated as a class name.
     *
     * @param buffer   the character stream, positioned just after the first name character
     * @param sb       receives the characters read; on success it holds the class name with
     *                 trailing indentation removed
     * @param beginPos the position to rewind to when this is not a class-name header
     * @return true if '}' was reached first (the stream is left just before it); false if ':'
     * or the end of input was reached first (the stream is rewound to {@code beginPos})
     */
    private boolean isClsNameHeader(DsonCharStream buffer, StringBuilder sb, int beginPos) {
        int c;
        while ((c = buffer.read()) != -1) {
            if (c == ':' || c == '}') { // @{k: v} structure, or @{clsName} shorthand
                break;
            }
            if (c < 0) {
                // Line-break marker (-2): it is not a character and must not end up in the name.
                continue;
            }
            sb.append((char) c);
        }
        buffer.unread(); // push back the terminating character
        if (c == '}') {
            // Strip trailing indentation.
            int length = sb.length();
            while (length > 0 && DsonTexts.isIndentChar(sb.charAt(length - 1))) {
                length--;
            }
            sb.setLength(length);
            return true;
        }
        // Not a shorthand header (c == ':' or c == -1): rewind.
        while (buffer.getPosition() > beginPos) {
            buffer.unread();
        }
        return false;
    }

    /**
     * Scans a built-in value label and then the value it introduces. Labels are unquoted, and
     * the label must be followed by a space or a line break.
     *
     * @param firstChar the first character after '@'
     * @param skipValue whether value parsing is skipped
     * @return the token produced for the labelled value
     * @throws DsonParseException if the label is invalid or is not followed by a space or line break
     */
    private DsonToken scanBuiltinValue(int firstChar, boolean skipValue) {
        assert firstChar != '"';
        // Without double quotes the label may consist of safe characters only.
        if (DsonTexts.isUnsafeStringChar(firstChar)) {
            throw invalidClassName(Character.toString((char) firstChar), getPosition());
        }
        final DsonCharStream stream = this.charStream;
        final StringBuilder label = getCachedStringBuilder();
        label.append((char) firstChar);

        int next = stream.read();
        while (next >= 0 && !DsonTexts.isUnsafeStringChar(next)) {
            label.append((char) next);
            next = stream.read();
        }
        if (next == -2) {
            stream.unread(); // keep the line break for whoever reads next
        } else if (next != ' ') {
            throw spaceRequired(getPosition());
        }

        final String className = label.toString();
        if (ObjectUtils.isBlank(className)) {
            throw invalidClassName(className, getPosition());
        }
        return onReadClassName(className, skipValue);
    }

    /**
     * Produces the token for a value introduced by a built-in label. Known labels consume the
     * value that follows; any other label yields a {@code BUILTIN_STRUCT} token carrying the
     * label itself.
     *
     * @param className the label that was read after '@'
     * @param skipValue whether value parsing is skipped; numeric, boolean and null values are
     *                  then reported with a null value
     * @return the resulting token
     */
    private DsonToken onReadClassName(String className, boolean skipValue) {
        final int labelPosition = getPosition();
        switch (className) {
            case DsonTexts.LABEL_INT32: {
                final DsonToken valueToken = nextStringToken(skipValue, labelPosition);
                return skipValue
                        ? new DsonToken(DsonTokenType.INT32, null, getPosition())
                        : new DsonToken(DsonTokenType.INT32, DsonTexts.parseInt32(valueToken.stringValue()), getPosition());
            }
            case DsonTexts.LABEL_INT64: {
                final DsonToken valueToken = nextStringToken(skipValue, labelPosition);
                return skipValue
                        ? new DsonToken(DsonTokenType.INT64, null, getPosition())
                        : new DsonToken(DsonTokenType.INT64, DsonTexts.parseInt64(valueToken.stringValue()), getPosition());
            }
            case DsonTexts.LABEL_FLOAT: {
                final DsonToken valueToken = nextStringToken(skipValue, labelPosition);
                return skipValue
                        ? new DsonToken(DsonTokenType.FLOAT, null, getPosition())
                        : new DsonToken(DsonTokenType.FLOAT, DsonTexts.parseFloat(valueToken.stringValue()), getPosition());
            }
            case DsonTexts.LABEL_DOUBLE: {
                final DsonToken valueToken = nextStringToken(skipValue, labelPosition);
                return skipValue
                        ? new DsonToken(DsonTokenType.DOUBLE, null, getPosition())
                        : new DsonToken(DsonTokenType.DOUBLE, DsonTexts.parseDouble(valueToken.stringValue()), getPosition());
            }
            case DsonTexts.LABEL_BOOL: {
                final DsonToken valueToken = nextStringToken(skipValue, labelPosition);
                return skipValue
                        ? new DsonToken(DsonTokenType.BOOL, null, getPosition())
                        : new DsonToken(DsonTokenType.BOOL, DsonTexts.parseBool(valueToken.stringValue()), getPosition());
            }
            case DsonTexts.LABEL_NULL: {
                final DsonToken valueToken = nextStringToken(skipValue, labelPosition);
                if (!skipValue) {
                    DsonTexts.checkNullString(valueToken.stringValue());
                }
                return new DsonToken(DsonTokenType.NULL, null, getPosition());
            }
            case DsonTexts.LABEL_STRING: {
                final DsonToken valueToken = nextStringToken(skipValue, labelPosition);
                return new DsonToken(DsonTokenType.STRING, valueToken.stringValue(), getPosition());
            }
            case DsonTexts.LABEL_STRING_LINE:
                return new DsonToken(DsonTokenType.STRING, scanSingleLineText(skipValue), getPosition());
            case DsonTexts.LABEL_BINARY:
                return new DsonToken(DsonTokenType.BINARY, scanBinary(skipValue), getPosition());
            default:
                return new DsonToken(DsonTokenType.BUILTIN_STRUCT, className, getPosition());
        }
    }

    /** Reads the next token and verifies that it is a quoted or unquoted string. */
    private DsonToken nextStringToken(boolean skipValue, int position) {
        final DsonToken valueToken = nextToken(skipValue);
        ensureStringToken(valueToken.type, position);
        return valueToken;
    }

    // endregion

    // region strings

    /**
     * Skips line breaks, indentation characters and comments.
     *
     * @return the first character that is none of those, or -1 if the end of input was reached
     */
    private int skipWhitespace() {
        final DsonCharStream stream = this.charStream;
        while (true) {
            final int ch = stream.read();
            if (ch == -1) {
                return -1;
            }
            if (ch == -2) { // line break
                continue;
            }
            if (ch == '/') {
                skipComment();
                continue;
            }
            if (!DsonTexts.isIndentChar(ch)) {
                return ch;
            }
        }
    }

    /**
     * Skips a double-slash ('//') comment. The first slash has already been read by the caller.
     *
     * @throws DsonParseException if the next character is not a second slash
     */
    private void skipComment() {
        final DsonCharStream stream = this.charStream;
        if (stream.read() == '/') {
            stream.skipLine();
            return;
        }
        throw new DsonParseException("invalid comment format: Single slash, position: " + getPosition());
    }

    /**
     * Scans a byte array written as a double-quoted hex string.
     *
     * @param skipValue whether value parsing is skipped
     * @return the decoded bytes, or null when {@code skipValue} is true
     * @throws DsonParseException if the value does not start with a double quote
     */
    private byte[] scanBinary(boolean skipValue) {
        final StringBuilder hex = getCachedStringBuilder();
        if (skipWhitespace() != '"') {
            throw new DsonParseException("invalid binary format, position: " + getPosition());
        }
        scanString(hex);
        if (skipValue) {
            return null;
        }
        // Decode straight from the builder; no intermediate String is created.
        return HexFormat.of().parseHex(hex);
    }

    /**
     * Scans an unquoted string; an unquoted string cannot continue onto a separate line.
     * (This method is meant for scanning elements only; it is not suitable for scanning labels.)
     *
     * @param firstChar the first non-whitespace character
     * @param skipValue whether value parsing is skipped
     * @return the scanned text, or null when {@code skipValue} is true
     */
    private String scanUnquotedString(final char firstChar, boolean skipValue) {
        if (!skipValue) {
            final StringBuilder text = getCachedStringBuilder();
            scanUnquotedString(firstChar, text);
            return text.toString();
        }
        skipUnquotedString();
        return null;
    }

    /**
     * Skips the rest of an unquoted string without collecting it. Unquoted strings are expected
     * to make up a very large share of the input, so a dedicated skip path is worthwhile.
     */
    private void skipUnquotedString() {
        final DsonCharStream stream = this.charStream;
        int ch = stream.read();
        while (ch >= 0 && !DsonTexts.isUnsafeStringChar(ch)) {
            ch = stream.read();
        }
        stream.unread(); // push back the terminating character or marker
    }

    /**
     * Appends an unquoted string to {@code sb}: the given first character followed by every
     * character up to, but excluding, the next unsafe character or negative stream marker.
     *
     * @param firstChar the first character of the string, already consumed by the caller
     * @param sb        the builder receiving the text
     */
    private void scanUnquotedString(char firstChar, StringBuilder sb) {
        final DsonCharStream stream = this.charStream;
        sb.append(firstChar);
        int ch = stream.read();
        while (ch >= 0 && !DsonTexts.isUnsafeStringChar(ch)) {
            sb.append((char) ch);
            ch = stream.read();
        }
        stream.unread(); // push back the terminating character or marker
    }

    /**
     * Scans a double-quoted string whose opening quote has already been consumed.
     *
     * @param skipValue whether value parsing is skipped
     * @return the string content, or null when {@code skipValue} is true
     */
    private String scanString(boolean skipValue) {
        final StringBuilder text = getCachedStringBuilder();
        scanString(text);
        if (skipValue) {
            return null;
        }
        return text.toString();
    }

    /**
     * Appends the content of a double-quoted string to {@code sb}, up to the closing quote.
     * Line breaks inside the string are dropped and backslash escapes are decoded.
     *
     * @param sb the builder receiving the content
     * @throws DsonParseException if the input ends before the closing quote
     */
    private void scanString(StringBuilder sb) {
        final DsonCharStream stream = this.charStream;
        for (int ch = stream.read(); ch != -1; ch = stream.read()) {
            switch (ch) {
                case -2 -> {
                    // line break: ignored inside a quoted string
                }
                case '"' -> {
                    return; // closing quote
                }
                case '\\' -> doEscape(stream, sb); // escape sequence
                default -> sb.append((char) ch);
            }
        }
        throw new DsonParseException("End of file in Dson string.");
    }

    /**
     * Scans plain text that runs to the end of the current line.
     *
     * @param skipValue whether value parsing is skipped
     * @return the rest of the line, or null when {@code skipValue} is true
     */
    private String scanSingleLineText(boolean skipValue) {
        if (!skipValue) {
            final StringBuilder text = getCachedStringBuilder();
            scanSingleLineText(text);
            return text.toString();
        }
        charStream.skipLine();
        return null;
    }

    /**
     * Appends every character up to the next negative stream marker (line break or end of
     * input) to {@code sb}, then pushes that marker back.
     *
     * @param sb the builder receiving the text
     */
    private void scanSingleLineText(StringBuilder sb) {
        final DsonCharStream stream = this.charStream;
        for (int ch = stream.read(); ch >= 0; ch = stream.read()) {
            sb.append((char) ch);
        }
        stream.unread();
    }

    /**
     * Scans a plain text block -- the one delimited by """.
     *
     * @param indent    the number of indentation columns
     * @param skipValue whether value parsing is skipped
     * @return the text content, or null when {@code skipValue} is true
     */
    private String scanSimpleText(int indent, boolean skipValue) {
        if (!skipValue) {
            final StringBuilder text = getCachedStringBuilder();
            scanSimpleText(text, indent);
            return text.toString();
        }
        skipSimpleText(indent);
        return null;
    }

    /**
     * Skips a plain text block (""") without collecting its content, applying the same
     * indentation and terminator checks as the collecting scan.
     *
     * @param indent the number of indentation columns
     * @throws DsonParseException if a line is not indented like the opening line, if the closing
     *                            delimiter has more than three quotes, or if the input ends
     *                            before the closing delimiter
     */
    private void skipSimpleText(int indent) {
        DsonCharStream buffer = this.charStream;
        int c;
        while ((c = buffer.read()) != -1) {
            if (c == -2) { // empty line
                continue;
            }
            // Consume the indentation.
            do {
                if (buffer.getColumn() > indent) {
                    break;
                }
                if (!DsonTexts.isIndentChar(c)) {
                    throw new DsonParseException("Line does not start with the same whitespace as the opening line of the raw string literal, position: " + getPosition());
                }
            } while ((c = buffer.read()) >= 0);
            // The line held nothing but indentation.
            if (c < 0) {
                if (c == -1) {
                    break; // eof
                }
                continue;
            }
            // Check for the closing delimiter.
            int position = buffer.getPosition();
            if (c == '"'
                    && buffer.read() == '"'
                    && buffer.read() == '"') {
                if (buffer.read() == '"') { // more than 3 quotes
                    throw new DsonParseException("Illegal text block end: excessive quotes, position: " + getPosition());
                }
                buffer.unread();
                return; // end of the text block
            }
            // Rewind to the position of c.
            while (buffer.getPosition() > position) {
                buffer.unread();
            }
            // Skip the rest of the line.
            buffer.skipLine();
        }
        // An unterminated block is an error in skip mode too, as it is when the text is collected.
        throw new DsonParseException("End of file in Dson string.");
    }

    /**
     * Appends the content of a plain text block (""") to {@code sb}. Each content line is added
     * without its indentation, lines are joined with '\n', and the line break before the closing
     * delimiter is not part of the result.
     *
     * @param sb     the builder receiving the content
     * @param indent the number of indentation columns
     * @throws DsonParseException if a line is not indented like the opening line, if the closing
     *                            delimiter has more than three quotes, or if the input ends
     *                            before the closing delimiter
     */
    private void scanSimpleText(StringBuilder sb, int indent) {
        DsonCharStream buffer = this.charStream;
        int c;
        while ((c = buffer.read()) != -1) {
            if (c == -2) { // empty line
                sb.append('\n');
                continue;
            }
            // Consume the indentation.
            do {
                if (buffer.getColumn() > indent) {
                    break;
                }
                if (!DsonTexts.isIndentChar(c)) {
                    throw new DsonParseException("Line does not start with the same whitespace as the opening line of the raw string literal, position: " + getPosition());
                }
            } while ((c = buffer.read()) >= 0);
            // The line held nothing but indentation.
            if (c < 0) {
                if (c == -1) {
                    break; // eof
                }
                sb.append('\n');
                continue;
            }
            // Check for the closing delimiter.
            int position = buffer.getPosition();
            if (c == '"'
                    && buffer.read() == '"'
                    && buffer.read() == '"') {
                if (buffer.read() == '"') { // more than 3 quotes
                    throw new DsonParseException("Illegal text block end: excessive quotes, position: " + getPosition());
                }
                buffer.unread();
                // Drop the final line break. An empty block has none, and shrinking an empty
                // builder would throw an index exception.
                if (sb.length() > 0) {
                    sb.setLength(sb.length() - 1);
                }
                return; // end of the text block
            }
            // Rewind to the position of c.
            while (buffer.getPosition() > position) {
                buffer.unread();
            }
            sb.append((char) c);
            while ((c = buffer.read()) >= 0) {
                sb.append((char) c);
            }
            // c < 0
            if (c == -1) {
                break; // eof
            }
            sb.append('\n');
        }
        throw new DsonParseException("End of file in Dson string.");
    }


    /**
     * Scans a Dson text block -- the one opened with @""".
     *
     * @param skipValue whether value parsing is skipped
     * @return the text content, or null when {@code skipValue} is true
     */
    private String scanDsonText(boolean skipValue) {
        if (!skipValue) {
            final StringBuilder text = getCachedStringBuilder();
            scanDsonText(text);
            return text.toString();
        }
        skipDsonText();
        return null;
    }

    /**
     * Skips a Dson text block (@""") without collecting its content.
     *
     * @throws DsonParseException if a line head is invalid or the input ends before the
     *                            closing delimiter
     */
    private void skipDsonText() {
        DsonCharStream buffer = this.charStream;
        int c;
        while ((c = buffer.read()) != -1) {
            // A line break (-2) is followed by a line head; only the end marker matters here.
            if (c == -2 && readLineHead(buffer) == LineHead.END_OF_TEXT) {
                return; // properly terminated -- must not fall through to the EOF error below
            }
        }
        throw new DsonParseException("End of file in Dson string.");
    }

    /**
     * Appends the content of a Dson text block (@""") to {@code sb}. After every line break the
     * line head decides how the following line is handled.
     *
     * @param sb the builder receiving the content
     * @throws DsonParseException if a line head is invalid or the input ends before the
     *                            closing delimiter
     */
    private void scanDsonText(StringBuilder sb) {
        final DsonCharStream stream = this.charStream;
        for (int ch = stream.read(); ch != -1; ch = stream.read()) {
            if (ch != -2) {
                sb.append((char) ch);
                continue;
            }
            // Line break: inspect the head of the next line.
            final LineHead head = readLineHead(stream);
            if (head == LineHead.END_OF_TEXT) {
                return; // finished reading
            }
            if (head == LineHead.COMMENT) {
                stream.skipLine(); // comment line
            } else if (head == LineHead.APPEND_LINE) {
                sb.append('\n'); // start a new line
            } else if (head == LineHead.SWITCH_MODE) {
                switch2EscapeMode(stream, sb); // escape mode for this line
            }
            // Any other head adds nothing here; the line's characters are appended as-is.
        }
        throw new DsonParseException("End of file in Dson string.");
    }

    /**
     * Reads the rest of the current line in escape mode, decoding backslash escapes.
     * The mode is only in effect for a single line.
     *
     * @param buffer the character stream
     * @param sb     the builder receiving the decoded text
     */
    private void switch2EscapeMode(DsonCharStream buffer, StringBuilder sb) {
        for (int ch = buffer.read(); ch >= 0; ch = buffer.read()) {
            if (ch != '\\') {
                sb.append((char) ch);
                continue;
            }
            doEscape(buffer, sb);
        }
        buffer.unread(); // push back the line break or end-of-input marker
    }

    /**
     * Reads the head of a line inside a Dson text block and classifies the line.
     *
     * @param buffer the character stream, positioned at the start of a line
     * @return the kind of line; blank lines and comment lines are both reported as
     * {@code COMMENT}
     * @throws DsonParseException if the line does not start with a valid line head
     */
    private LineHead readLineHead(DsonCharStream buffer) {
        // Skip leading indentation.
        int ch = buffer.read();
        while (ch >= 0 && DsonTexts.isIndentChar(ch)) {
            ch = buffer.read();
        }
        if (ch < 0) {
            buffer.unread(); // blank line
            return LineHead.COMMENT;
        }
        if (ch == '/') { // comment line
            skipComment();
            return LineHead.COMMENT;
        }
        // A text line must start with the '@' marker.
        if (ch != '@') {
            throw new DsonParseException("invalid text line, position: " + getPosition());
        }
        final int marker = buffer.read();
        if (marker < 0) {
            throw new DsonParseException("invalid text line, position: " + getPosition());
        }
        // Closing delimiter.
        if (marker == '"') {
            final boolean tripleQuoted = buffer.read() == '"' && buffer.read() == '"';
            if (!tripleQuoted) {
                throw new DsonParseException("invalid text line, position: " + getPosition());
            }
            if (buffer.read() == '"') { // more than 3 quotes
                throw new DsonParseException("Illegal text block end: excessive quotes, position: " + getPosition());
            }
            buffer.unread();
            return LineHead.END_OF_TEXT;
        }
        final LineHead lineHead;
        switch (marker) {
            case DsonTexts.HEAD_APPEND_LINE:
                lineHead = LineHead.APPEND_LINE;
                break;
            case DsonTexts.HEAD_APPEND:
                lineHead = LineHead.APPEND;
                break;
            case DsonTexts.HEAD_SWITCH_MODE:
                lineHead = LineHead.SWITCH_MODE;
                break;
            default:
                throw new DsonParseException("invalid text line, position: " + getPosition());
        }
        // Unless the line or the input ends here, the head must be followed by a space.
        final int separator = buffer.read();
        if (separator < 0) {
            buffer.unread();
        } else if (separator != ' ') {
            throw spaceRequired(getPosition());
        }
        return lineHead;
    }

    /**
     * Decodes the escape sequence that follows a backslash and appends the resulting character.
     *
     * @param buffer the character stream, positioned just after the backslash
     * @param sb     the builder receiving the decoded character
     * @throws DsonParseException if the sequence is unknown, is cut short by a line break or the
     *                            end of input, or is a unicode escape with a non-hex digit
     */
    private void doEscape(DsonCharStream buffer, StringBuilder sb) {
        final int position = getPosition();
        final int c = readEscapeChar(buffer, position);
        switch (c) {
            case '"' -> sb.append('"'); // inside a double-quoted string the quote must be escaped
            case '\\' -> sb.append('\\');
            case 'b' -> sb.append('\b');
            case 'f' -> sb.append('\f');
            case 'n' -> sb.append('\n');
            case 'r' -> sb.append('\r');
            case 't' -> sb.append('\t');
            case 'u' -> {
                // Unicode escape: a char is 2 bytes, always written as exactly 4 hex digits,
                // most significant first.
                CharBuffer hexBuffer = this.hexBuffer;
                hexBuffer.clear();
                for (int i = 0; i < 4; i++) {
                    final char hex = (char) readEscapeChar(buffer, position);
                    // Check every digit here: Integer.parseInt would also accept a leading
                    // sign, and would report bad digits as NumberFormatException.
                    if (Character.digit(hex, 16) < 0) {
                        throw invalidEscapeSequence(c, position);
                    }
                    hexBuffer.write(hex);
                }
                sb.append((char) Integer.parseInt(hexBuffer, 0, 4, 16));
            }
            default -> throw invalidEscapeSequence(c, position);
        }
    }

    /**
     * Reads the next character that belongs to an escape sequence.
     *
     * @param buffer   the character stream
     * @param position the position reported if the sequence is cut short
     * @return the character read
     * @throws DsonParseException if a line break or the end of input is reached instead
     */
    private static int readEscapeChar(DsonCharStream buffer, int position) {
        final int ch = buffer.read();
        if (ch < 0) {
            throw invalidEscapeSequence('\\', position);
        }
        return ch;
    }

    // endregion

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy