All downloads are free. Search and download functionality uses the official Maven repository.

net.openhft.chronicle.wire.YamlTokeniser Maven / Gradle / Ivy

There is a newer version: 2.27ea1
Show newest version
/*
 * Copyright 2016-2020 chronicle.software
 *
 *       https://chronicle.software
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *       http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package net.openhft.chronicle.wire;

import net.openhft.chronicle.bytes.Bytes;
import net.openhft.chronicle.bytes.BytesIn;
import net.openhft.chronicle.core.pool.StringBuilderPool;
import net.openhft.chronicle.core.scoped.ScopedResource;
import net.openhft.chronicle.core.scoped.ScopedResourcePool;
import org.jetbrains.annotations.NotNull;

import java.util.ArrayList;
import java.util.EnumSet;
import java.util.List;
import java.util.Set;

/**
 * A tokeniser for YAML documents. The YamlTokeniser class converts raw YAML
 * input into a stream of {@link YamlToken}s, each representing a distinct
 * construct or symbol in YAML, which a caller retrieves one at a time via
 * {@link #next()}.
 */
@SuppressWarnings({"this-escape","deprecation"})
public class YamlTokeniser {

    /** Indent value for contexts that carry no indentation of their own (e.g. stream start, directives end). */
    static final int NO_INDENT = -1;

    /** Set of YAML tokens that don't contain any associated text content. */
    static final Set NO_TEXT = EnumSet.of(
            YamlToken.SEQUENCE_START,
            YamlToken.SEQUENCE_ENTRY,
            YamlToken.SEQUENCE_END,
            YamlToken.MAPPING_START,
            YamlToken.MAPPING_KEY,
            YamlToken.MAPPING_END,
            YamlToken.DIRECTIVES_END);

    /** A pool of StringBuilders, created via {@code StringBuilderPool.createThreadLocal(1)}, for reuse instead of fresh allocation. */
    static final ScopedResourcePool SBP = StringBuilderPool.createThreadLocal(1);

    /** Stack of open contexts; the last element is the innermost (top) context. */
    protected final List contexts = new ArrayList<>();

    /** The input source containing the raw YAML content. */
    private final BytesIn in;

    /** Contexts removed from the stack, kept for reuse (presumably by pushContext0 - confirm). */
    private final List freeContexts = new ArrayList<>();

    /** Tokens queued to be returned before more input is read; they are taken from the end of the list. */
    private final List pushed = new ArrayList<>();

    /** Temporary bytes buffer, allocated lazily by temp(); holds the text of block scalars. */
    Bytes temp = null;

    /** Input position of the start of the current line; indentation is measured from it. */
    long lineStart;

    /** Input position where the current token's text starts, or -1 when the text is held in temp. */
    long blockStart;

    /** Input position just past the current token's text, or -1 when the text is held in temp. */
    long blockEnd;

    /** Context-stack size at which flow style began; Integer.MAX_VALUE while not inside a flow collection. */
    int flowDepth = Integer.MAX_VALUE;

    /** Quote character of the scalar last read: '"' or '\'' for quoted scalars, 0 for plain text. */
    char blockQuote = 0;

    /** Flag to indicate if a sequence entry has been encountered; cleared on '[' and ','. */
    boolean hasSequenceEntry;

    /** Input position where the most recent quoted scalar, '?' indicator or plain-text key started; -1 if none. */
    long lastKeyPosition = -1;

    /** The last token returned by next(int); STREAM_START until the first token has been read. */
    private YamlToken last = YamlToken.STREAM_START;

    /**
     * Constructs a new YAML tokenizer with the specified input.
     * <p>
     * The input is assigned before {@link #reset()} runs. reset() seeds the line and
     * block markers from the input's current read position; if it ran first it would
     * only ever see a {@code null} input and start every marker at 0, which skews the
     * indentation of the first line whenever the input does not start at position 0.
     *
     * @param in The input source containing raw YAML content.
     */
    public YamlTokeniser(BytesIn in) {
        // Assign first: reset() reads this.in to initialise lineStart/blockStart/blockEnd.
        this.in = in;
        reset();
    }

    /**
     * Reports how many contexts are currently open on the context stack.
     *
     * @return The number of elements in the {@code contexts} list.
     */
    public int contextSize() {
        return contexts.size();
    }

    /**
     * Returns the tokenizer to its initial state so tokenization can start afresh.
     * <p>
     * The context stack, the pool of spare contexts, the queued tokens and the temporary
     * buffer are emptied; all position markers are moved to the input's current read
     * position (or 0 when no input is set); and a single STREAM_START context is pushed.
     */
    void reset() {
        // Discard structural state from any previous run.
        contexts.clear();
        freeContexts.clear();
        if (temp != null) {
            temp.clear();
        }

        // Every position marker restarts from the same origin.
        final long origin = (in != null) ? in.readPosition() : 0;
        blockEnd = origin;
        blockStart = origin;
        lineStart = origin;

        // Scalar and flow bookkeeping back to their defaults.
        lastKeyPosition = -1;
        hasSequenceEntry = false;
        blockQuote = 0;
        flowDepth = Integer.MAX_VALUE;

        pushed.clear();
        last = YamlToken.STREAM_START;

        // The stack always starts with the stream-start context.
        pushContext0(YamlToken.STREAM_START, NO_INDENT);
    }

    /**
     * Reports the token of the innermost open context.
     *
     * @return The token of the top context, or STREAM_START when the context stack is empty.
     */
    public YamlToken context() {
        if (contexts.isEmpty()) {
            return YamlToken.STREAM_START;
        }
        return topContext().token;
    }

    /**
     * Returns the innermost open context, i.e. the last element of the context stack.
     * The stack must not be empty.
     *
     * @return The top YTContext object from the context stack.
     */
    public YTContext topContext() {
        final int topIndex = contextSize() - 1;
        return contexts.get(topIndex);
    }

    /**
     * Returns the context directly beneath the innermost one.
     * The stack must hold at least two contexts.
     *
     * @return The second top YTContext object from the context stack.
     */
    public YTContext secondTopContext() {
        final int belowTopIndex = contextSize() - 2;
        return contexts.get(belowTopIndex);
    }

    /**
     * Returns the token the tokenizer is currently positioned on.
     * While the last token is still STREAM_START, the next token is read
     * (with {@code NO_INDENT} as the minimum indentation) and returned instead.
     *
     * @return The most recently returned token, reading one first if only STREAM_START has been seen.
     */
    public YamlToken current() {
        final boolean onlyStreamStartSeen = last == YamlToken.STREAM_START;
        return onlyStreamStartSeen ? next(NO_INDENT) : last;
    }

    /**
     * Reads the next token, using the indentation of the top context as the minimum indentation.
     *
     * @return The next YamlToken, as produced by {@link #next(int)}.
     */
    public YamlToken next() {
        final int minIndent = contextIndent();
        return next(minIndent);
    }

    /**
     * Produces the next token and records it as the last token returned.
     * Tokens that were queued earlier are handed out first; only when the queue
     * is empty is more input tokenized via {@code next0}.
     *
     * @param minIndent The minimum indentation to consider while tokenizing.
     * @return The next YamlToken, either a queued one or one freshly read from the input.
     */
    @NotNull
    public YamlToken next(int minIndent) {
        final YamlToken token = pushed.isEmpty()
                ? next0(minIndent)   // nothing queued: tokenize more input
                : popPushed();       // drain queued tokens first
        last = token;
        return token;
    }

    /**
     * Core tokenization step: skips leading whitespace, reads one byte and dispatches on it.
     * <p>
     * Depending on the byte this either returns a token directly, queues several tokens
     * in {@code pushed} and returns the first of them, or falls through to reading plain text.
     * When the token at the current position belongs to an outer context (its indentation
     * is below {@code minIndent} and the tokenizer is not in flow style), the byte is
     * un-read and {@link YamlToken#NONE} is returned.
     *
     * @param minIndent The minimum indentation level to consider while tokenizing.
     *                  {@code Integer.MAX_VALUE} makes '}' and ']' return NONE without being consumed.
     * @return The next {@link YamlToken} in the tokenization sequence.
     */
    YamlToken next0(int minIndent) {
        // Skip whitespace before the token (helper defined elsewhere in this class)
        consumeWhitespace();

        // Until a reader says otherwise, the token has empty text at the current position
        blockStart = blockEnd = in.readPosition();

        // Clearing the temporary buffer if present
        if (temp != null)
            temp.clear();

        // Top context, consulted by the '}' case below
        YTContext context = topContext();

        // Column offset from the line start, doubled. The '-' case uses indent2 + 1, so a
        // sequence entry sorts just above a mapping key that starts in the same column.
        int indent2 = Math.toIntExact(in.readPosition() - lineStart) * 2;

        // Reading the next character from the input stream for processing
        int ch = in.readUnsignedByte();

        // Processing the read character to identify the associated YAML token
        switch (ch) {
            case -1:
                // End of input: close one context per call until minIndent is reached
                if (contextIndent() <= minIndent)
                    return YamlToken.NONE;
                contextPop();
                return popPushed();
            case '#':
                readComment();
                return YamlToken.COMMENT;
            case '"':
                if (wouldChangeContext(minIndent, indent2))
                    return dontRead();
                // Position of the opening quote, in case this scalar turns out to be a key
                lastKeyPosition = in.readPosition() - 1;
                readDoublyQuoted();
                if (isFieldEnd())
                    return indent(YamlToken.MAPPING_START, YamlToken.MAPPING_KEY, YamlToken.TEXT, indent2);
                return YamlToken.TEXT;
            case '\'':
                if (wouldChangeContext(minIndent, indent2))
                    return dontRead();
                // Position of the opening quote, in case this scalar turns out to be a key
                lastKeyPosition = in.readPosition() - 1;
                readSinglyQuoted();
                if (isFieldEnd())
                    return indent(YamlToken.MAPPING_START, YamlToken.MAPPING_KEY, YamlToken.TEXT, indent2);

                return YamlToken.TEXT;

            case '?': {
                // Explicit key indicator: emit MAPPING_KEY and open a MAPPING_KEY context,
                // which the ':' case pops again
                if (wouldChangeContext(minIndent, indent2))
                    return dontRead();
                lastKeyPosition = in.readPosition() - 1;
                YamlToken indentB = indent(YamlToken.MAPPING_START, YamlToken.MAPPING_KEY, YamlToken.STREAM_START, indent2);
                contextPush(YamlToken.MAPPING_KEY, indent2);
                return indentB;
            }

            case '-': {
                int next = in.peekUnsignedByte();
                // "- " : a block sequence entry
                if (next <= ' ') {
                    if (wouldChangeContext(minIndent, indent2 + 1))
                        return dontRead();

                    hasSequenceEntry = true;
                    return indent(YamlToken.SEQUENCE_START, YamlToken.SEQUENCE_ENTRY, YamlToken.STREAM_START, indent2 + 1);
                }
                // "---" at column 0 followed by a byte <= ' ' : start of a new document
                if (indent2 == 0 && next == '-' && in.peekUnsignedByte(in.readPosition() + 1) == '-' && in.peekUnsignedByte(in.readPosition() + 2) <= ' ') {
                    if (contextIndent() <= minIndent && minIndent >= 0)
                        return dontRead();
                    in.readSkip(2);
                    // pushed is drained from the end, so DIRECTIVES_END comes out after the
                    // end tokens that popAll queues for the contexts it closes
                    pushed.add(YamlToken.DIRECTIVES_END);
                    popAll(1);
                    contextPush(YamlToken.DIRECTIVES_END, NO_INDENT);
                    return popPushed();
                }
                // Any other '-' starts plain text
                unreadLast();
                return readText(indent2);
            }
            case '.': {
                int next = in.peekUnsignedByte();
                // "..." at column 0 followed by a byte <= ' ' : close every context above the first
                if (indent2 == 0 && next == '.') {
                    if (in.peekUnsignedByte(in.readPosition() + 1) == '.' &&
                            in.peekUnsignedByte(in.readPosition() + 2) <= ' ') {
                        if (contextIndent() <= minIndent)
                            return dontRead();
                        in.readSkip(2);
                        popAll(1);
                        return popPushed();
                    }
                }
                // Any other '.' starts plain text
                unreadLast();
                return readText(indent2);
            }
            case '&':
                // '&' directly followed by a non-blank byte is an anchor name
                if (in.peekUnsignedByte() > ' ') {
                    readWord();
                    return YamlToken.ANCHOR;
                }
                break;
            case '*':
                // '*' directly followed by a non-blank byte is an alias name
                if (in.peekUnsignedByte() > ' ') {
                    readWord();
                    return YamlToken.ALIAS;
                }
                break;
            case '|':
                // Literal block scalar: line breaks are kept
                if (in.peekUnsignedByte() <= ' ') {
                    readLiteral();
                    return seq(YamlToken.LITERAL);
                }
                break;
            case '>':
                // Folded block scalar: also reported as LITERAL, with its text already folded
                if (in.peekUnsignedByte() <= ' ') {
                    readFolded();
                    return seq(YamlToken.LITERAL);
                }
                break;
            case '%':
                readDirective();
                return YamlToken.DIRECTIVE;
            case '@':
            case '`':
                // readReserved() currently always throws UnsupportedOperationException
                readReserved();
                return seq(YamlToken.RESERVED);
            case '!':
                readWord();
                push(seq(YamlToken.TAG));
                // A tag seen at stream start also opens a DIRECTIVES_END context and queues that token
                if (context() == YamlToken.STREAM_START) {
                    pushContext0(YamlToken.DIRECTIVES_END, NO_INDENT);
                    push(YamlToken.DIRECTIVES_END);
                }
                return popPushed();
            case '{':
                return flow(YamlToken.MAPPING_START);
            case '}':
                if (minIndent == Integer.MAX_VALUE || context.keysCount() > 0) {
                    return dontRead();
                }
                return flowPop(YamlToken.MAPPING_START, '}');
            case '[':
                hasSequenceEntry = false;
                return flow(YamlToken.SEQUENCE_START);
            case ']':
                if (minIndent == Integer.MAX_VALUE)
                    return dontRead();
                return flowPop(YamlToken.SEQUENCE_START, ']');
            case ',':
                // A separator marks the current stack size as being in flow style (lowering
                // flowDepth if needed), re-arms sequence-entry detection, and is otherwise skipped
                if (flowDepth >= contextSize())
                    flowDepth = contextSize();
                hasSequenceEntry = false;
                // CHECK in a LIST or MAPPING.
                return next0(minIndent);

            case ':':
                // ": " ends a key: close contexts down to (and including) the nearest MAPPING_KEY
                if (in.peekUnsignedByte() <= ' ') {
                    int pos = pushed.size();
                    while (context() != YamlToken.MAPPING_KEY && contextSize() > 1) {
                        contextPop();
                    }
                    if (context() == YamlToken.MAPPING_KEY)
                        contextPop();
                    // Queued end tokens must come out in the order the contexts were closed
                    reversePushed(pos);
                    return pushed.isEmpty() ? next0(minIndent) : popPushed();
                }
                break;
        // Other symbols: no statements, so they leave the switch and are read as plain text below
            case '+':
            case '$':
            case '(':
            case ')':
            case '/':
            case ';':
            case '<':
            case '=':
            case '\\':
            case '^':
            case '_':
            case '~':
        }

        // If changing context, don't read the symbol
        if (wouldChangeContext(minIndent, indent2))
            return dontRead();

        // Step back so the byte just read becomes the first byte of the text
        unreadLast();

        // Everything not handled above is tokenized as plain text
        return readText(indent2);
    }

    /**
     * Tells whether a token at the given indentation falls outside the context being read.
     * Indentation is never significant inside a flow collection, so the answer is always
     * {@code false} there.
     *
     * @param minIndent The minimum indentation to consider.
     * @param indent The current indentation level.
     * @return True if not in flow style and {@code indent} is below {@code minIndent}.
     */
    private boolean wouldChangeContext(int minIndent, int indent) {
        return !isInFlow() && indent < minIndent;
    }

    /**
     * Declines to tokenize the byte that was just read: the read position is moved
     * back by one so the byte can be read again later, and NONE is reported.
     *
     * @return The {@link YamlToken#NONE} token.
     */
    private YamlToken dontRead() {
        in.readSkip(-1); // give the byte back to the input
        return YamlToken.NONE;
    }

    /**
     * Closes a flow collection: contexts are popped until the one matching {@code start}
     * is on top, then that context is popped as well. The end tokens queued by those pops
     * are reordered so that they are returned in the order the contexts were closed.
     *
     * @param start The token to identify the start of the flow construct.
     * @param end The character representing the end of the flow construct (used in the error message).
     * @return The first of the queued end tokens.
     * @throws IllegalArgumentException if no context with the {@code start} token is found
     *                                  before only one context is left.
     */
    private YamlToken flowPop(YamlToken start, char end) {
        final int mark = pushed.size();
        for (;;) {
            if (context() == start) {
                break;
            }
            if (contextSize() <= 1) {
                throw new IllegalArgumentException("Unexpected '" + end + '\'');
            }
            contextPop();
        }
        // Pop the matching start context itself.
        contextPop();
        reversePushed(mark);
        return popPushed();
    }

    /**
     * Opens a flow collection ('{' or '[').
     * <p>
     * The start token is queued and a context with no indentation is pushed for it.
     * A flow mapping opened directly inside a sequence that has no entry yet also gets a
     * SEQUENCE_ENTRY queued; since queued tokens are taken from the end, that entry is
     * returned ahead of the start token. Finally {@code flowDepth} is lowered to the new
     * stack size if it was larger.
     *
     * @param token The {@link YamlToken} to be processed (MAPPING_START or SEQUENCE_START).
     * @return The next token in the sequence.
     */
    private YamlToken flow(YamlToken token) {
        pushed.add(token);

        final boolean needsSequenceEntry = !hasSequenceEntry
                && token != YamlToken.SEQUENCE_START
                && context() == YamlToken.SEQUENCE_START;
        if (needsSequenceEntry) {
            hasSequenceEntry = true;
            pushed.add(YamlToken.SEQUENCE_ENTRY);
        }

        contextPush(token, NO_INDENT);

        // Flow style is in effect from this stack size upwards.
        flowDepth = Math.min(flowDepth, contextSize());
        return popPushed();
    }

    /**
     * Placeholder for the reserved YAML indicators ('@' and '`').
     * Reading them is not supported, so this method always fails.
     *
     * @throws UnsupportedOperationException always; the message names the unsupported indicators.
     */
    private void readReserved() {
        // A message makes the failure self-explanatory when it surfaces in a stack trace.
        throw new UnsupportedOperationException("Reserved YAML indicators '@' and '`' are not supported");
    }

    /**
     * Reads the text of a directive (a line introduced by '%').
     * Delegates to readWords(), so the directive text is delimited the same way as plain text.
     */
    private void readDirective() {
        readWords();
    }

    /**
     * Reads a folded block scalar (introduced by '>').
     * Delegates to readLiteral(false), which does not copy the line breaks and
     * separates the lines with a space instead.
     */
    private void readFolded() {
        readLiteral(false);
    }

    /**
     * Provides the temporary buffer in an empty state, allocating it
     * (elastic, on heap, 32 bytes initially) the first time it is needed.
     *
     * @return The temporary buffer as {@link Bytes}, cleared.
     */
    private Bytes temp() {
        Bytes buffer = temp;
        if (buffer == null) {
            buffer = Bytes.allocateElasticOnHeap(32);
            temp = buffer;
        }
        buffer.clear();
        return buffer;
    }

    /**
     * Reads a literal block scalar (introduced by '|').
     * Delegates to readLiteral(true), which copies the line breaks into the text.
     */
    private void readLiteral() {
        readLiteral(true);
    }

    /**
     * Reads a block scalar (literal '|' or folded '&gt;') into the temporary buffer.
     * <p>
     * The indentation of the first content line fixes the block's indentation. Each
     * following line is copied, minus that indentation, until the input ends or a
     * line with less indentation is found. {@code blockStart} and {@code blockEnd}
     * are set to -1 because the text lives in the temporary buffer, not in the input.
     *
     * @param withNewLines A flag indicating if newlines should be preserved (true)
     * or converted to spaces (false) during the read process.
     */
    private void readLiteral(boolean withNewLines) {
        readNewline(); // consume the control characters (line break) that follow the block indicator
        readIndent();
        // Column of the first content line; unlike next0's indent2 this value is not doubled
        int indent2 = Math.toIntExact(in.readPosition() - lineStart);
        blockStart = blockEnd = -1;

        // Initialize or reset the temporary buffer
        final Bytes temp = temp();
        // Start of the not-yet-copied part of the current line
        long start = in.readPosition();

        // Process characters until reaching the end of the input
        while (true) {
            int ch = in.readUnsignedByte();
            if (ch < 0) {
                // Reached end of input, write any remaining content to temp buffer.
                temp.write(in, start, in.readPosition() - start);
                break;
            }
            if (ch == '\r' || ch == '\n') {
                // Reached end of line: step back onto the line break so it is only
                // included in the copy when newlines are being preserved.
                unreadLast();
                if (withNewLines)
                    readNewline();
                temp.write(in, start, in.readPosition() - start);

                // Measure the indentation of the next non-blank line
                readIndent();
                int indent3 = Math.toIntExact(in.readPosition() - lineStart);
                // A line indented less than the block ends the scalar
                if (indent3 < indent2)
                    return;

                // If not preserving newlines, add space as separator if previous character isn't whitespace.
                if (!withNewLines)
                    if (temp.peekUnsignedByte(temp.writePosition() - 1) > ' ')
                        temp.append(' ');

                // Extra indentation beyond the block's own is content: rewind so it gets copied
                if (indent3 > indent2)
                    in.readPosition(lineStart + indent2);
                start = in.readPosition();
            }
        }
    }

    /**
     * Skips over whitespace, i.e. every byte up to and including the space character,
     * stopping at end of input or at the first byte above ' '.
     * Each CR or LF that is skipped moves {@code lineStart} to the position right after it,
     * so afterwards the distance from {@code lineStart} is the indentation of the current line.
     */
    private void readIndent() {
        for (int ch = in.peekUnsignedByte(); ch >= 0 && ch <= ' '; ch = in.peekUnsignedByte()) {
            in.readSkip(1); // consume the whitespace byte

            final boolean lineBreak = ch == '\n' || ch == '\r';
            if (lineBreak) {
                lineStart = in.readPosition();
            }
        }
    }

    /**
     * Consumes control characters, i.e. bytes below the space character (this includes
     * CR and LF), moving {@code lineStart} to the position after each one.
     * Stops at end of input or at the first byte that is ' ' or above; a space is not consumed.
     */
    private void readNewline() {
        for (int ch = in.peekUnsignedByte(); ch >= 0 && ch < ' '; ch = in.peekUnsignedByte()) {
            in.readSkip(1);                // consume the control character
            lineStart = in.readPosition(); // the line now starts after it
        }
    }

    /**
     * Works out which structural tokens a key or sequence entry at the given indentation
     * implies, queues them, and returns the first.
     * <p>
     * In flow style indentation is ignored and {@code key} is returned at once.
     * Otherwise contexts indented deeper than {@code indent} are closed, {@code indented}
     * is emitted if the indentation differs from the remaining top context, and a new
     * context is opened when the indentation is deeper than that context.
     *
     * @param indented The token that starts a new indented collection, or null for none.
     * @param key The key token type.
     * @param push The token to queue so that it is returned after the others;
     *             STREAM_START means there is nothing to queue.
     * @param indent The current indentation level.
     * @return The next token after processing the current input.
     */
    private YamlToken indent(
            YamlToken indented,
            @NotNull YamlToken key,
            @NotNull YamlToken push,
            int indent) {
        if (push != YamlToken.STREAM_START) {
            pushed.add(push);
        }
        if (isInFlow()) {
            // Indentation carries no meaning inside a flow collection.
            return key;
        }
        final int mark = pushed.size();

        // Close every context that is indented deeper than this token.
        while (contextIndent() > indent) {
            contextPop();
        }
        final int enclosingIndent = contextIndent();
        final boolean hasStartToken = indented != null;

        // A different indentation from the enclosing context starts a new collection.
        if (hasStartToken && indent != enclosingIndent) {
            pushed.add(indented);
        }
        pushed.add(key);

        // Tokens are taken from the end of the queue, so flip them into emission order.
        reversePushed(mark);

        // Only a deeper indentation opens a new context.
        if (hasStartToken && indent > enclosingIndent) {
            contextPush(indented, indent);
        }
        return popPushed();
    }

    /**
     * Reads a plain (unquoted) scalar and decides whether it is a mapping key or a value.
     * The scalar counts as a key when it is followed by a field end; its start position is
     * then recorded in {@code lastKeyPosition}. Unless the top context is already a
     * MAPPING_KEY, a key goes through the indentation logic; everything else is plain TEXT.
     *
     * @param indent2 The current indentation level.
     * @return The token after processing the text.
     */
    private YamlToken readText(int indent2) {
        final long textStart = in.readPosition();

        blockQuote = 0; // plain text carries no quote character
        readWords();

        if (!isFieldEnd()) {
            // Not a key: plain text, possibly preceded by a flow sequence entry.
            return seq(YamlToken.TEXT);
        }

        lastKeyPosition = textStart;
        if (topContext().token == YamlToken.MAPPING_KEY) {
            // Already inside an explicit key, so this is just its text.
            return seq(YamlToken.TEXT);
        }
        return indent(YamlToken.MAPPING_START, YamlToken.MAPPING_KEY, YamlToken.TEXT, indent2);
    }

    /**
     * Inserts a SEQUENCE_ENTRY ahead of a token that is the first item seen for the
     * current entry of a flow sequence. In that case the token itself is queued and
     * SEQUENCE_ENTRY is returned in its place; in every other case the token passes through.
     *
     * @param token The current token being processed.
     * @return SEQUENCE_ENTRY if one had to be inserted, otherwise {@code token}.
     */
    private YamlToken seq(YamlToken token) {
        final boolean entryAlreadyEmitted = hasSequenceEntry;
        if (entryAlreadyEmitted || context() != YamlToken.SEQUENCE_START || !isInFlow()) {
            return token;
        }
        hasSequenceEntry = true;
        pushed.add(token); // returned by the following call
        return YamlToken.SEQUENCE_ENTRY;
    }

    /**
     * Moves the input's read position back by one, so that the byte read last
     * is returned again by the next read.
     */
    private void unreadLast() {
        in.readSkip(-1); // Go back by one character.
    }

    /**
     * @return The indentation recorded in the top context, or 0 if the context stack is empty.
     */
    private int contextIndent() {
        if (contexts.isEmpty()) {
            return 0;
        }
        return topContext().indent;
    }

    /**
     * Checks if the tokenizer is inside a flow collection, which is the case when the
     * context stack is at least as deep as {@code flowDepth}.
     *
     * @return {@code true} if inside a flow context, {@code false} otherwise.
     */
    private boolean isInFlow() {
        return flowDepth <= contextSize();
    }

    /**
     * Closes contexts until at most {@code downTo} remain on the stack. The end tokens
     * queued along the way are reordered so they are returned in the order the contexts
     * were closed.
     *
     * @param downTo The number of contexts to leave on the stack.
     */
    void popAll(int downTo) {
        final int mark = pushed.size();
        for (; downTo < contextSize(); ) {
            contextPop();
        }
        // Queued tokens are taken from the end, so flip the newly added ones.
        reversePushed(mark);
    }

    /**
     * Reverses, in place, the part of the pushed list that runs from {@code pos} to the end.
     *
     * @param pos The index of the first element to include in the reversal.
     */
    private void reversePushed(int pos) {
        int lo = pos;
        int hi = pushed.size() - 1;
        while (lo < hi) {
            // set(hi, x) returns the element previously at hi, which then goes to lo.
            pushed.set(lo, pushed.set(hi, pushed.get(lo)));
            lo++;
            hi--;
        }
    }

    /**
     * Takes the token at the end of the pushed list. If nothing is queued, the next token
     * is read from the input instead, with {@code Integer.MIN_VALUE} as the minimum indentation.
     *
     * @return The retrieved {@link YamlToken}.
     */
    private YamlToken popPushed() {
        if (pushed.isEmpty()) {
            return next(Integer.MIN_VALUE);
        }
        return pushed.remove(pushed.size() - 1);
    }

    /**
     * Reads a single word and records its extent in {@code blockStart}/{@code blockEnd}.
     * <p>
     * A word normally ends at whitespace or at one of the characters {@code ,{}:?'"#};
     * the terminating byte is left unread. A word that begins with '&lt;' is instead read
     * up to the matching '&gt;', and both angle brackets are excluded from the recorded extent.
     */
    private void readWord() {
        blockStart = in.readPosition();
        final boolean angleQuoted = in.peekUnsignedByte() == '<';
        int ch = in.readUnsignedByte();
        while (true) {
            final boolean terminator = ch <= ' ' || (!angleQuoted && ",{}:?'\"#".indexOf(ch) >= 0);
            if (terminator) {
                unreadLast(); // leave the terminator for the next read
                return;
            }
            blockEnd = in.readPosition(); // the word extends through this byte
            if (angleQuoted && ch == '>') {
                // Strip the enclosing '<' and '>'.
                blockStart++;
                blockEnd--;
                return;
            }
            ch = in.readUnsignedByte();
            if (ch == -1) {
                return; // end of input ends the word
            }
        }
    }

    /**
     * Reads a run of plain text, possibly several words, and records its extent in
     * {@code blockStart}/{@code blockEnd}. Reading stops, with the terminating byte left
     * unread, at a ':' followed by a byte &lt;= ' ', at a ',' inside a sequence or mapping
     * context, at any of {@code [ ] { } #}, at a line break, or at the end of the input.
     * {@code blockEnd} only advances past bytes above ' ', so trailing blanks are not
     * part of the text.
     */
    private void readWords() {
        blockStart = in.readPosition(); // Mark the start position.
        while (in.readRemaining() > 0) { // Continue until the end of the input.
            int ch = in.readUnsignedByte(); // Read the next character.
            switch (ch) {
                case ':':
                    // If the character following ':' is not whitespace, treat it as part of the current word.
                    // (continue skips the blockEnd update below; the following byte advances it.)
                    if (in.peekUnsignedByte() > ' ')
                        continue;
                    // is a field.
                    unreadLast();
                    return;
                case ',':
                    // If the current context is not a sequence start or a mapping start, treat ',' as part of the current word.
                    if (context() != YamlToken.SEQUENCE_START && context() != YamlToken.MAPPING_START)
                        continue;
                    unreadLast();
                    return;

                case '[': {
                    // pos is the position just after the '['
                    long pos = in.readPosition();
                    // "[]" directly attached to the preceding text (no blank before '[') is kept as part of the text.
                    if (in.peekUnsignedByte(pos - 2) > ' ' &&
                            in.peekUnsignedByte() == ']') {
                        in.readSkip(1); // Skip the next character.
                        blockEnd = pos + 1; // Mark the end position, just past the ']'.
                        return;
                    }
                    unreadLast(); // Move back to the '[' character.
                    return;
                }
                case ']':
                case '{':
                case '}':
                case '#':
                case '\n':
                case '\r':
                    // Structural characters, comments and line breaks end the text.
                    unreadLast();
                    return;
            }
            // Only non-blank bytes extend the text.
            if (ch > ' ')
                blockEnd = in.readPosition();
        }
    }

    /**
     * Closes the top context: it is taken off the stack, its end token (if it has one
     * other than NONE) is queued, and the context object is kept for reuse. Flow style
     * is switched off when the stack becomes shallower than {@code flowDepth}.
     *
     * @throws IllegalStateException if the context's token has no end token defined.
     */
    private void contextPop() {
        final YTContext closed = contexts.remove(contextSize() - 1);

        // Dropping below the flow depth means the flow collection has been left.
        if (contextSize() < flowDepth) {
            flowDepth = Integer.MAX_VALUE;
        }

        final YamlToken endToken = closed.token.toEnd;
        if (endToken == null) {
            throw new IllegalStateException("context: " + closed);
        }
        if (endToken != YamlToken.NONE) {
            pushed.add(endToken);
        }

        // Keep the context object so it can be reused later.
        freeContexts.add(closed);
    }

    /**
     * Rolls the context stack back to the given size without emitting any end tokens:
     * all queued tokens are discarded and the surplus contexts are moved to the reuse pool.
     *
     * @param contextSize The desired context level.
     */
    void revertToContext(int contextSize) {
        pushed.clear();
        while (contextSize < contextSize()) {
            final int top = contextSize() - 1;
            final YTContext discarded = contexts.remove(top);
            // NOTE(review): contextPop() resets flowDepth when flowDepth > contextSize(),
            // whereas this compares for equality after the removal - confirm the difference is intended.
            if (contextSize() == flowDepth) {
                flowDepth = Integer.MAX_VALUE;
            }
            freeContexts.add(discarded);
        }
    }

    /**
     * Opens a new context. When the stack is still at STREAM_START and the new context is
     * not itself DIRECTIVES_END, a DIRECTIVES_END context is pushed underneath first and a
     * DIRECTIVES_END token is pushed as well.
     *
     * @param context The YAML token representing the context.
     * @param indent  The indentation level for this context.
     */
    private void contextPush(YamlToken context, int indent) {
        final boolean needsDirectivesEnd =
                context != YamlToken.DIRECTIVES_END && context() == YamlToken.STREAM_START;

        if (needsDirectivesEnd) {
            pushContext0(YamlToken.DIRECTIVES_END, NO_INDENT);
        }
        pushContext0(context, indent);
        if (needsDirectivesEnd) {
            push(YamlToken.DIRECTIVES_END);
        }
    }

    /**
     * Scans a double-quoted value, starting at the current read position.
     * <p>
     * Sets {@code blockQuote} to {@code '"'} and {@code blockStart} to the position at which
     * scanning begins. When the closing quote is found, {@code blockEnd} is set to the position
     * of that quote and the quote itself is consumed. A backslash causes the byte that follows
     * it to be skipped without interpretation, so an escaped quote does not end the value.
     * <p>
     * If the input simply runs out before a closing quote is seen, the method returns without
     * updating {@code blockEnd}.
     *
     * @throws IllegalStateException if the byte following a backslash cannot be read.
     */
    private void readDoublyQuoted() {
        blockQuote = '"';
        blockStart = in.readPosition();
        while (in.readRemaining() > 0) {
            int ch = in.readUnsignedByte();
            if (ch == '\\') {
                // Skip whatever is escaped; a negative value means there was nothing left to read.
                ch = in.readUnsignedByte();
            } else if (ch == blockQuote) {
                blockEnd = in.readPosition() - 1;
                return;
            }
            if (ch < 0) {
                // Presumably blockStart - 1 is the opening quote, so the message shows the quoted text so far.
                // subBytes takes (start, length), so pass the length rather than the absolute read position.
                final long from = blockStart - 1;
                throw new IllegalStateException("Unterminated quotes " + in.subBytes(from, in.readPosition() - from));
            }
        }
    }

    /**
     * Scans a single-quoted value, starting at the current read position.
     * <p>
     * Sets {@code blockQuote} to {@code '\''} and {@code blockStart} to the position at which
     * scanning begins. Two consecutive single quotes are treated as an escaped quote and do not
     * end the value. When a lone closing quote is found, {@code blockEnd} is set to the position
     * of that quote. If the input runs out first, the method returns without updating
     * {@code blockEnd}.
     */
    private void readSinglyQuoted() {
        blockQuote = '\'';
        blockStart = in.readPosition();
        while (in.readRemaining() > 0) {
            final int current = in.readUnsignedByte();
            if (current != blockQuote) {
                // NOTE(review): the loop guard ensures a byte is available, so this looks unreachable - confirm.
                if (current < 0) {
                    throw new IllegalStateException("Unterminated quotes " + in.subBytes(blockStart - 1, in.readPosition()));
                }
                continue;
            }
            // A quote was read: a second quote right behind it is an escape, anything else ends the value.
            if (in.peekUnsignedByte() != blockQuote) {
                blockEnd = in.readPosition() - 1;
                return;
            }
            in.readSkip(1);
        }
    }

    /**
     * Tests whether a field terminator (a colon) comes next, after skipping spaces and tabs.
     * <p>
     * When a colon is found it is consumed, together with one directly following space or tab
     * if there is one. When no colon is found only the leading spaces and tabs are consumed.
     *
     * @return {@code true} if a colon was found and consumed, {@code false} otherwise.
     */
    private boolean isFieldEnd() {
        consumeSpaces();
        if (in.peekUnsignedByte() != ':')
            return false;

        // Look one byte past the colon to decide how much to consume.
        final int following = in.peekUnsignedByte(in.readPosition() + 1);
        final boolean separatorFollows = following == ' ' || following == '\t';
        in.readSkip(separatorFollows ? 2 : 1);
        return true;
    }

    /**
     * Scans a comment up to the end of the line or the end of the input.
     * <p>
     * Leading spaces and tabs are skipped, then {@code blockStart} is set to the first position
     * of the comment. {@code blockEnd} ends just after the last byte greater than a space, so
     * trailing whitespace is excluded. A terminating {@code '\n'} or {@code '\r'} is handed back
     * via {@code unreadLast()}.
     */
    private void readComment() {
        consumeSpaces();
        final long start = in.readPosition();
        blockEnd = start;
        blockStart = start;
        // A negative value signals that nothing more could be read.
        for (int ch = in.readUnsignedByte(); ch >= 0; ch = in.readUnsignedByte()) {
            if (ch == '\r' || ch == '\n') {
                unreadLast();
                return;
            }
            if (ch > ' ')
                blockEnd = in.readPosition();
        }
    }

    /**
     * Skips any run of spaces and tabs at the current read position.
     * The first byte that is neither is left unread.
     */
    private void consumeSpaces() {
        for (int next = in.peekUnsignedByte(); next == ' ' || next == '\t'; next = in.peekUnsignedByte())
            in.readSkip(1);
    }

    /**
     * Skips every byte whose value is between 0 and {@code ' '} inclusive, which covers spaces,
     * tabs, line terminators and the other control characters.
     * <p>
     * After each {@code '\n'} or {@code '\r'} that is skipped, {@code lineStart} is moved to the
     * position that follows it.
     */
    private void consumeWhitespace() {
        int next;
        while ((next = in.peekUnsignedByte()) >= 0 && next <= ' ') {
            in.readSkip(1);
            if (next == '\r' || next == '\n')
                lineStart = in.readPosition();
        }
    }

    /**
     * Returns the recorded position at which the current line begins.
     *
     * @return the start position of the current line.
     */
    public long lineStart() {
        return this.lineStart;
    }

    /**
     * Overrides the recorded position at which the current line begins.
     * Only the stored value is changed; this method does not touch the input itself.
     *
     * @param lineStart the new start position for the current line.
     */
    public void lineStart(long lineStart) {
        this.lineStart = lineStart;
    }

    /**
     * Returns the position at which the current block of text begins.
     *
     * @return the start position of the current block.
     */
    public long blockStart() {
        return this.blockStart;
    }

    /**
     * Returns the position at which the current block of text ends.
     *
     * @return the end position of the current block.
     */
    public long blockEnd() {
        return this.blockEnd;
    }

    /**
     * Places a context on top of the stack, recycling a previously released instance when one
     * is available and allocating a new one otherwise.
     * <p>
     * If the instance already carries a keys object, that object is reset rather than replaced.
     *
     * @param token  the token identifying the context.
     * @param indent the indentation level recorded for the context.
     */
    private void pushContext0(YamlToken token, int indent) {
        final YTContext entry;
        if (freeContexts.isEmpty()) {
            entry = new YTContext();
        } else {
            // Take the most recently released instance.
            entry = freeContexts.remove(freeContexts.size() - 1);
        }
        entry.indent = indent;
        entry.token = token;

        final YamlKeys existingKeys = entry.keys;
        if (existingKeys != null)
            existingKeys.reset();

        contexts.add(entry);
    }

    @Override
    public String toString() {
        // Layout: "<token> [<quote> ]<text>"; the quote is omitted for *_START / *_END tokens
        // and when no quote character is recorded.
        final String tokenName = last.name();
        final boolean structural = tokenName.endsWith("_START") || tokenName.endsWith("_END");

        final StringBuilder out = new StringBuilder();
        out.append(last).append(" ");
        if (blockQuote != 0 && !structural)
            out.append(blockQuote).append(" ");
        out.append(text());
        return out.toString();
    }

    /**
     * Returns the quote character recorded for the current block.
     *
     * @return the quote character of the current block, or 0 if none is recorded.
     */
    public char blockQuote() {
        return this.blockQuote;
    }

    /**
     * Returns the current block's text as a String, using a pooled StringBuilder as scratch space.
     * Used primarily for testing purposes.
     *
     * @return the text of the current block, or an empty string if there is none.
     */
    public String text() {
        // The type argument is required: with a raw ScopedResource, get() yields Object
        // and the assignment below does not compile.
        try (ScopedResource<StringBuilder> sbTl = SBP.get()) {
            final StringBuilder sb = sbTl.get();
            text(sb);
            // Reuse the constant for the empty case instead of allocating a new String.
            return sb.length() == 0 ? "" : sb.toString();
        }
    }

    /**
     * Copies the text of the current block into the provided StringBuilder, replacing
     * whatever it held before.
     * <p>
     * When {@code blockEnd} is negative and a temporary value exists, that value is used.
     * Otherwise the bytes between {@code blockStart} and {@code blockEnd} are decoded as UTF-8.
     * The builder is left empty when the block is empty or the last token is one of
     * {@code NO_TEXT}. The input's read position is restored before returning.
     *
     * @param sb the StringBuilder that receives the block's text; its previous content is discarded.
     * @throws ArithmeticException if the block length does not fit in an {@code int}.
     */
    public void text(StringBuilder sb) {
        // Clear first on every path, so the temp path cannot append to stale content.
        sb.setLength(0);

        // If blockEnd is not set and a temporary value exists, use that.
        if (blockEnd < 0 && temp != null) {
            sb.append(temp);
            return;
        }

        // Nothing to extract for an empty block or a token that carries no text.
        if (blockStart == blockEnd || NO_TEXT.contains(last))
            return;

        final long pos = in.readPosition();
        try {
            in.readPosition(blockStart);
            in.parseUtf8(sb, Math.toIntExact(blockEnd - blockStart));
        } finally {
            // Put the read position back where the caller left it.
            in.readPosition(pos);
        }
    }

    /**
     * Interprets the current block as a double.
     * <p>
     * When {@code blockEnd} is negative and a temporary value exists, the temporary value is
     * parsed instead. Otherwise parsing starts at {@code blockStart}, and the input's read
     * position is restored afterwards.
     *
     * @return the parsed value, or {@code -0.0} when the block is empty or the last token is
     * one of {@code NO_TEXT}.
     */
    public double parseDouble() {
        if (temp != null && blockEnd < 0)
            return temp.parseDouble();

        final boolean emptyBlock = blockStart == blockEnd;
        if (emptyBlock || NO_TEXT.contains(last))
            return -0.0;

        final long savedPosition = in.readPosition();
        try {
            in.readPosition(blockStart);
            final double parsed = in.parseDouble();
            return parsed;
        } finally {
            // Leave the input where the caller had it.
            in.readPosition(savedPosition);
        }
    }

    /**
     * Interprets the current block as a long.
     * <p>
     * When {@code blockEnd} is negative and a temporary value exists, the temporary value is
     * parsed instead. A block that starts with {@code '0'} followed by a digit, or with
     * {@code "0o"}, is parsed as an octal number; everything else is handed to the input's own
     * long parser. The input's read position is restored before returning.
     *
     * @return the parsed value, or {@code 0} when the block is empty or the last token is one
     * of {@code NO_TEXT}.
     * @throws NumberFormatException if an octal-prefixed block contains characters that are not
     *                               valid in base 8.
     * @throws ArithmeticException   if the block length does not fit in an {@code int}.
     */
    public long parseLong() {
        if (blockEnd < 0 && temp != null) {
            return temp.parseLong();
        }
        if (blockStart == blockEnd || NO_TEXT.contains(last))
            return 0;

        final long pos = in.readPosition();
        try {
            in.readPosition(blockStart);
            if (in.peekUnsignedByte() == '0') {
                final int next = in.peekUnsignedByte(in.readPosition() + 1);
                // "0NNN" has a one-character prefix, "0oNNN" a two-character prefix.
                final int prefixLength = Character.isDigit(next) ? 1 : (next == 'o' ? 2 : 0);
                if (prefixLength > 0) {
                    // The type argument is required: with a raw ScopedResource, get() yields Object.
                    try (ScopedResource<StringBuilder> sbTl = SBP.get()) {
                        final StringBuilder sb = sbTl.get();
                        in.readSkip(prefixLength);
                        in.parseUtf8(sb, Math.toIntExact(blockEnd - blockStart) - prefixLength);
                        return Long.parseLong(sb.toString(), 8);
                    }
                }
            }
            return in.parseLong();
        } finally {
            // Reset the reading position.
            in.readPosition(pos);
        }
    }

    /**
     * Queues a token by appending it to the list of pushed tokens.
     *
     * @param token the token to queue.
     */
    public void push(YamlToken token) {
        this.pushed.add(token);
    }

    /**
     * Compares the current block's text with the given string.
     *
     * @param s the string to compare against.
     * @return {@code true} if the current block's text equals {@code s}, {@code false} otherwise.
     */
    public boolean isText(String s) {
        // TODO: This method can potentially be optimized for efficiency.
        final String current = text();
        return current.equals(s);
    }

    /**
     * Returns the keys object of the top context, creating and storing one on first use.
     *
     * @return the YamlKeys belonging to the top context.
     */
    public YamlKeys keys() {
        final YTContext top = topContext();
        if (top.keys == null)
            top.keys = new YamlKeys();
        return top.keys;
    }

    /**
     * Prepares the tokeniser for reading again from the given offset: any queued tokens are
     * discarded and the offset is recorded as the start of the current line.
     * <p>
     * This method does not itself move the input's read position.
     *
     * @param offset the position to record as the start of the current line.
     */
    public void rereadFrom(long offset) {
        this.pushed.clear();
        this.lineStart = offset;
    }

    /**
     * One entry of the tokeniser's context stack: the token that opened the context, the
     * indentation recorded for it, and an optional set of keys.
     */
    static class YTContext extends SelfDescribingMarshallable {
        /** The token that identifies this context. */
        YamlToken token;
        /** The indentation level recorded for this context. */
        int indent;
        /** Keys attached to this context; stays {@code null} until something assigns it. */
        YamlKeys keys;

        /**
         * Reports how many keys are attached to this context.
         *
         * @return the count held by {@code keys}, or 0 when no keys object is present.
         */
        int keysCount() {
            if (keys == null)
                return 0;
            return keys.count;
        }
    }
}




// © 2015 - 2024 Weber Informatics LLC | Privacy Policy