 
                        
        
                        
// org.yaml.snakeyaml.scanner.ScannerImpl (Maven / Gradle / Ivy)
/**
 * Copyright (c) 2008, SnakeYAML
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.yaml.snakeyaml.scanner;
import java.nio.ByteBuffer;
import java.nio.charset.CharacterCodingException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Pattern;
import org.yaml.snakeyaml.DumperOptions;
import org.yaml.snakeyaml.comments.CommentType;
import org.yaml.snakeyaml.error.Mark;
import org.yaml.snakeyaml.error.YAMLException;
import org.yaml.snakeyaml.reader.StreamReader;
import org.yaml.snakeyaml.tokens.AliasToken;
import org.yaml.snakeyaml.tokens.AnchorToken;
import org.yaml.snakeyaml.tokens.BlockEndToken;
import org.yaml.snakeyaml.tokens.BlockEntryToken;
import org.yaml.snakeyaml.tokens.BlockMappingStartToken;
import org.yaml.snakeyaml.tokens.BlockSequenceStartToken;
import org.yaml.snakeyaml.tokens.CommentToken;
import org.yaml.snakeyaml.tokens.DirectiveToken;
import org.yaml.snakeyaml.tokens.DocumentEndToken;
import org.yaml.snakeyaml.tokens.DocumentStartToken;
import org.yaml.snakeyaml.tokens.FlowEntryToken;
import org.yaml.snakeyaml.tokens.FlowMappingEndToken;
import org.yaml.snakeyaml.tokens.FlowMappingStartToken;
import org.yaml.snakeyaml.tokens.FlowSequenceEndToken;
import org.yaml.snakeyaml.tokens.FlowSequenceStartToken;
import org.yaml.snakeyaml.tokens.KeyToken;
import org.yaml.snakeyaml.tokens.ScalarToken;
import org.yaml.snakeyaml.tokens.StreamEndToken;
import org.yaml.snakeyaml.tokens.StreamStartToken;
import org.yaml.snakeyaml.tokens.TagToken;
import org.yaml.snakeyaml.tokens.TagTuple;
import org.yaml.snakeyaml.tokens.Token;
import org.yaml.snakeyaml.tokens.ValueToken;
import org.yaml.snakeyaml.util.ArrayStack;
import org.yaml.snakeyaml.util.UriEncoder;
/**
 * <pre>
 * Scanner produces tokens of the following types:
 * STREAM-START
 * STREAM-END
 * COMMENT
 * DIRECTIVE(name, value)
 * DOCUMENT-START
 * DOCUMENT-END
 * BLOCK-SEQUENCE-START
 * BLOCK-MAPPING-START
 * BLOCK-END
 * FLOW-SEQUENCE-START
 * FLOW-MAPPING-START
 * FLOW-SEQUENCE-END
 * FLOW-MAPPING-END
 * BLOCK-ENTRY
 * FLOW-ENTRY
 * KEY
 * VALUE
 * ALIAS(value)
 * ANCHOR(value)
 * TAG(value)
 * SCALAR(value, plain, style)
 * Read comments in the Scanner code for more details.
 * </pre>
 */
public final class ScannerImpl implements Scanner {
    /**
     * A regular expression matching characters which are not in the hexadecimal
     * set (0-9, A-F, a-f).
     */
    private static final Pattern NOT_HEXA = Pattern.compile("[^0-9A-Fa-f]");

    /**
     * A mapping from the character that follows a backslash in the input stream
     * to the text it should be replaced with.
     *
     * YAML defines several common and a few uncommon escape sequences.
     *
     * @see "YAML specification, 4.1.6. Escape Sequences"
     */
    public static final Map<Character, String> ESCAPE_REPLACEMENTS = new HashMap<Character, String>();

    /**
     * A mapping from an escape character to the number of hexadecimal digits to
     * read ahead for that escape sequence. These escape sequences are used to
     * handle unicode escaping in the following formats, where H is a
     * hexadecimal character:
     *
     * <pre>
     * \xHH         : escaped 8-bit Unicode character
     * \uHHHH       : escaped 16-bit Unicode character
     * \UHHHHHHHH   : escaped 32-bit Unicode character
     * </pre>
     *
     * @see "YAML specification, 5.6. Escape Sequences"
     */
    public static final Map<Character, Integer> ESCAPE_CODES = new HashMap<Character, Integer>();

    static {
        // ASCII null
        ESCAPE_REPLACEMENTS.put(Character.valueOf('0'), "\0");
        // ASCII bell
        ESCAPE_REPLACEMENTS.put(Character.valueOf('a'), "\u0007");
        // ASCII backspace
        ESCAPE_REPLACEMENTS.put(Character.valueOf('b'), "\u0008");
        // ASCII horizontal tab
        ESCAPE_REPLACEMENTS.put(Character.valueOf('t'), "\u0009");
        // ASCII newline (line feed; \n maps to 0x0A)
        ESCAPE_REPLACEMENTS.put(Character.valueOf('n'), "\n");
        // ASCII vertical tab
        ESCAPE_REPLACEMENTS.put(Character.valueOf('v'), "\u000B");
        // ASCII form-feed
        ESCAPE_REPLACEMENTS.put(Character.valueOf('f'), "\u000C");
        // carriage-return (\r maps to 0x0D)
        ESCAPE_REPLACEMENTS.put(Character.valueOf('r'), "\r");
        // ASCII escape character (Esc)
        ESCAPE_REPLACEMENTS.put(Character.valueOf('e'), "\u001B");
        // ASCII space
        ESCAPE_REPLACEMENTS.put(Character.valueOf(' '), "\u0020");
        // ASCII double-quote
        ESCAPE_REPLACEMENTS.put(Character.valueOf('"'), "\"");
        // ASCII backslash
        ESCAPE_REPLACEMENTS.put(Character.valueOf('\\'), "\\");
        // Unicode next line
        ESCAPE_REPLACEMENTS.put(Character.valueOf('N'), "\u0085");
        // Unicode non-breaking-space
        ESCAPE_REPLACEMENTS.put(Character.valueOf('_'), "\u00A0");
        // Unicode line-separator
        ESCAPE_REPLACEMENTS.put(Character.valueOf('L'), "\u2028");
        // Unicode paragraph separator
        ESCAPE_REPLACEMENTS.put(Character.valueOf('P'), "\u2029");

        // 8-bit Unicode: two hexadecimal digits
        ESCAPE_CODES.put(Character.valueOf('x'), 2);
        // 16-bit Unicode: four hexadecimal digits
        ESCAPE_CODES.put(Character.valueOf('u'), 4);
        // 32-bit Unicode: eight hexadecimal digits (supplementary characters
        // are supported)
        ESCAPE_CODES.put(Character.valueOf('U'), 8);
    }
    // Source of the characters being scanned.
    private final StreamReader reader;
    // Had we reached the end of the stream?
    private boolean done = false;
    // The number of unclosed '{' and '['. `flowLevel == 0` means block
    // context.
    private int flowLevel = 0;
    // List of processed tokens that are not yet emitted.
    private List<Token> tokens;
    // The last added token
    private Token lastToken;
    // Number of tokens that were emitted through the `getToken` method.
    private int tokensTaken = 0;
    // The current indentation level.
    private int indent = -1;
    // Past indentation levels.
    private ArrayStack<Integer> indents;
    // A flag that indicates if comments should be parsed
    private boolean parseComments;
    // Variables related to simple keys treatment. See PyYAML.
    /**
     * <pre>
     * A simple key is a key that is not denoted by the '?' indicator.
     * Example of simple keys:
     *   ---
     *   block simple key: value
     *   ? not a simple key:
     *   : { flow simple key: value }
     * We emit the KEY token before all keys, so when we find a potential
     * simple key, we try to locate the corresponding ':' indicator.
     * Simple keys should be limited to a single line and 1024 characters.
     *
     * Can a simple key start at the current position? A simple key may
     * start:
     * - at the beginning of the line, not counting indentation spaces
     *       (in block context),
     * - after '{', '[', ',' (in the flow context),
     * - after '?', ':', '-' (in the block context).
     * In the block context, this flag also signifies if a block collection
     * may start at the current position.
     * </pre>
     */
    private boolean allowSimpleKey = true;
    /*
     * Keep track of possible simple keys. This is a dictionary. The key is
     * `flowLevel`; there can be no more than one possible simple key for each
     * level. The value is a SimpleKey record: (token_number, required, index,
     * line, column, mark) A simple key may start with ALIAS, ANCHOR, TAG,
     * SCALAR(flow), '[', or '{' tokens.
     */
    private Map<Integer, SimpleKey> possibleSimpleKeys;
    /**
     * Create a scanner over the given reader. Comment parsing starts switched
     * off, and the STREAM-START token is queued immediately.
     *
     * @param reader the source of characters to scan
     */
    public ScannerImpl(StreamReader reader) {
        this.parseComments = false;
        this.reader = reader;
        this.tokens = new ArrayList<Token>(100);
        this.indents = new ArrayStack<Integer>(10);
        // The order in possibleSimpleKeys is kept for nextPossibleSimpleKey()
        this.possibleSimpleKeys = new LinkedHashMap<Integer, SimpleKey>();
        fetchStreamStart();// Add the STREAM-START token.
    }
    /**
     * Choose whether comments are queued as comment tokens or dropped while
     * scanning.
     *
     * @param parseComments {@code true} to queue comment tokens; {@code false}
     *            to ignore comments
     * @return this scanner, so that calls can be chained
     */
    public ScannerImpl setParseComments(boolean parseComments) {
        final ScannerImpl self = this;
        self.parseComments = parseComments;
        return self;
    }
    /**
     * Report the current comment-parsing setting.
     *
     * @return {@code true} when comments are queued as tokens, {@code false}
     *         when they are ignored
     */
    public boolean isParseComments() {
        return this.parseComments;
    }
    /**
     * Check whether the next token is one of the given types. Scans ahead
     * first for as long as more tokens are needed.
     *
     * @param choices token ids to test against; when none are given, the
     *            result only says whether any token is queued
     * @return {@code true} if a token is queued and it matches one of the
     *         choices (or no choices were given)
     */
    public boolean checkToken(Token.ID... choices) {
        while (needMoreTokens()) {
            fetchMoreTokens();
        }
        if (this.tokens.isEmpty()) {
            return false;
        }
        if (choices.length == 0) {
            return true;
        }
        // This method is hot according to the profiler, so walk the array by
        // index instead of using 'foreach'.
        final Token.ID head = this.tokens.get(0).getTokenId();
        int position = 0;
        while (position < choices.length) {
            if (choices[position] == head) {
                return true;
            }
            position++;
        }
        return false;
    }
    /**
     * Return the next token without taking it off the queue. Scans ahead first
     * for as long as more tokens are needed.
     *
     * @return the token at the head of the queue
     */
    public Token peekToken() {
        for (; needMoreTokens();) {
            fetchMoreTokens();
        }
        return this.tokens.get(0);
    }
    /**
     * Return the next token, removing it from the queue.
     *
     * The emitted-token counter is only advanced after the removal has
     * succeeded, so a failed call on an empty queue leaves the scanner state
     * untouched.
     *
     * @return the token that was at the head of the queue
     * @throws IndexOutOfBoundsException if no token is queued
     */
    public Token getToken() {
        Token taken = this.tokens.remove(0);
        this.tokensTaken++;
        return taken;
    }
    // Private methods.
    /**
     * Append a token to the end of the queue and remember it as the most
     * recently added token.
     *
     * @param token the token to queue
     */
    private void addToken(Token token) {
        this.tokens.add(token);
        this.lastToken = token;
    }
    /**
     * Insert a token at the given queue position. The token only becomes the
     * most recently added token when it is inserted at the very end.
     *
     * @param index position in the queue at which to insert
     * @param token the token to insert
     */
    private void addToken(int index, Token token) {
        final boolean appendsAtEnd = (this.tokens.size() == index);
        if (appendsAtEnd) {
            this.lastToken = token;
        }
        this.tokens.add(index, token);
    }
    /**
     * Append all given tokens to the end of the queue; the last of them becomes
     * the most recently added token. An empty list is a no-op and leaves the
     * most recently added token unchanged.
     *
     * @param tokens the tokens to queue, in order
     */
    private void addAllTokens(List<Token> tokens) {
        if (tokens.isEmpty()) {
            // Nothing to queue; there is no last element to take.
            return;
        }
        lastToken = tokens.get(tokens.size() - 1);
        this.tokens.addAll(tokens);
    }
    /**
     * Decide whether more tokens should be scanned before the head of the
     * queue is handed out.
     *
     * @return {@code false} once the stream is done; {@code true} when the
     *         queue is empty, or when the next token to be taken is still a
     *         possible simple key
     */
    private boolean needMoreTokens() {
        // Once the stream is finished nothing more can be produced.
        if (this.done) {
            return false;
        }
        // Not done and nothing queued: scanning is required.
        boolean scanFurther = this.tokens.isEmpty();
        if (!scanFurther) {
            // The head token may still turn out to be a simple key, in which
            // case a KEY token would have to go in front of it.
            stalePossibleSimpleKeys();
            scanFurther = (this.tokensTaken == nextPossibleSimpleKey());
        }
        return scanFurther;
    }
    /**
     * Fetch one or more tokens from the StreamReader, choosing the fetcher by
     * the first code point of the upcoming token.
     *
     * @throws ScannerException if the code point cannot start any token
     */
    private void fetchMoreTokens() {
        // Skip whitespace and handle comments in front of the next token.
        scanToNextToken();
        // Discard possible simple keys that are no longer valid.
        stalePossibleSimpleKeys();
        // Compare the current indentation with the column; this may add some
        // tokens and decrease the current indentation level.
        unwindIndent(reader.getColumn());
        // The next code point decides which group of tokens follows.
        final int c = reader.peek();
        switch (c) {
        // ---- Code points that always select one fetcher ----
        case '\0':
            // End of stream.
            fetchStreamEnd();
            return;
        // TODO support for BOM within a stream. (also not implemented in PyYAML)
        case '[':
            // Flow sequence start indicator.
            fetchFlowSequenceStart();
            return;
        case '{':
            // Flow mapping start indicator.
            fetchFlowMappingStart();
            return;
        case ']':
            // Flow sequence end indicator.
            fetchFlowSequenceEnd();
            return;
        case '}':
            // Flow mapping end indicator.
            fetchFlowMappingEnd();
            return;
        case ',':
            // Flow entry indicator.
            fetchFlowEntry();
            return;
        case '*':
            // Alias.
            fetchAlias();
            return;
        case '&':
            // Anchor.
            fetchAnchor();
            return;
        case '!':
            // Tag.
            fetchTag();
            return;
        case '\'':
            // Single quoted scalar.
            fetchSingle();
            return;
        case '"':
            // Double quoted scalar.
            fetchDouble();
            return;
        // ---- Code points that need a further check; on failure they fall
        // through to the plain scalar test below ----
        case '%':
            // Directive?
            if (checkDirective()) {
                fetchDirective();
                return;
            }
            break;
        case '-':
            // Document start?
            if (checkDocumentStart()) {
                fetchDocumentStart();
                return;
            }
            // Block entry indicator?
            if (checkBlockEntry()) {
                fetchBlockEntry();
                return;
            }
            break;
        case '.':
            // Document end?
            if (checkDocumentEnd()) {
                fetchDocumentEnd();
                return;
            }
            break;
        case '?':
            // Key indicator?
            if (checkKey()) {
                fetchKey();
                return;
            }
            break;
        case ':':
            // Value indicator?
            if (checkValue()) {
                fetchValue();
                return;
            }
            break;
        case '|':
            // Literal scalar, only in block context.
            if (this.flowLevel == 0) {
                fetchLiteral();
                return;
            }
            break;
        case '>':
            // Folded scalar, only in block context.
            if (this.flowLevel == 0) {
                fetchFolded();
                return;
            }
            break;
        default:
            break;
        }
        // Anything left has to be a plain scalar.
        if (checkPlain()) {
            fetchPlain();
            return;
        }
        // Otherwise it is an error. For a readable message, show the
        // character as its escape sequence when it has one, by looking it up
        // among the values of ESCAPE_REPLACEMENTS.
        String shown = String.valueOf(Character.toChars(c));
        for (Map.Entry<Character, String> escape : ESCAPE_REPLACEMENTS.entrySet()) {
            if (escape.getValue().equals(shown)) {
                shown = "\\" + escape.getKey();
                break;
            }
        }
        if (c == '\t') {
            shown += "(TAB)";
        }
        final String problem = String
                .format("found character '%s' that cannot start any token. (Do not use %s for indentation)",
                        shown, shown);
        throw new ScannerException("while scanning for the next token", null, problem,
                reader.getMark());
    }
    // Simple keys treatment.
    /**
     * Return the token number of the nearest possible simple key.
     *
     * @return the token number of the first entry in possibleSimpleKeys, or -1
     *         when there is none
     */
    private int nextPossibleSimpleKey() {
        /*
         * Unlike PyYAML there is no need to loop over the whole dictionary:
         * possibleSimpleKeys is ordered, so the first value is the one wanted.
         */
        if (this.possibleSimpleKeys.isEmpty()) {
            return -1;
        }
        final Iterator<SimpleKey> candidates = this.possibleSimpleKeys.values().iterator();
        return candidates.next().getTokenNumber();
    }
    /**
     * <pre>
     * Remove entries that are no longer possible simple keys. According to
     * the YAML specification, simple keys
     * - should be limited to a single line,
     * - should be no longer than 1024 characters.
     * Disabling this procedure will allow simple keys of any length and
     * height (may cause problems if indentation is broken though).
     * </pre>
     *
     * @throws ScannerException if a required simple key has become stale
     */
    private void stalePossibleSimpleKeys() {
        if (this.possibleSimpleKeys.isEmpty()) {
            return;
        }
        final Iterator<SimpleKey> candidates = this.possibleSimpleKeys.values().iterator();
        while (candidates.hasNext()) {
            final SimpleKey candidate = candidates.next();
            // Still on the reader's current line and at most 1024 index
            // positions behind it: the candidate stays.
            if (candidate.getLine() == reader.getLine()
                    && reader.getIndex() - candidate.getIndex() <= 1024) {
                continue;
            }
            // A stale candidate that was required is an error.
            if (candidate.isRequired()) {
                throw new ScannerException("while scanning a simple key", candidate.getMark(),
                        "could not find expected ':'", reader.getMark());
            }
            candidates.remove();
        }
    }
    /**
     * Record the current position as a possible simple key, when simple keys
     * are allowed here. This function is called for ALIAS, ANCHOR, TAG,
     * SCALAR(flow), '[', and '{'.
     *
     * @throws YAMLException if a simple key is required at this position but
     *             simple keys are not allowed
     */
    private void savePossibleSimpleKey() {
        // A simple key is required when we are in block context (flow level
        // 0) and the current column equals the current indentation level.
        final boolean required = (this.flowLevel == 0) && (this.indent == this.reader.getColumn());
        // A simple key is required only if it is the first token in the
        // current line, so it should always be allowed there.
        if (!this.allowSimpleKey && required) {
            throw new YAMLException(
                    "A simple key is required only if it is the first token in the current line");
        }
        if (!this.allowSimpleKey) {
            return;
        }
        // The next token might be a simple key: replace any candidate saved
        // for this flow level with the token's number and position.
        removePossibleSimpleKey();
        final int tokenNumber = this.tokensTaken + this.tokens.size();
        final SimpleKey candidate = new SimpleKey(tokenNumber, required, reader.getIndex(),
                reader.getLine(), this.reader.getColumn(), this.reader.getMark());
        this.possibleSimpleKeys.put(this.flowLevel, candidate);
    }
    /**
     * Remove the saved possible key position at the current flow level.
     *
     * @throws ScannerException if the removed key was required
     */
    private void removePossibleSimpleKey() {
        SimpleKey removed = possibleSimpleKeys.remove(flowLevel);
        if (removed == null || !removed.isRequired()) {
            return;
        }
        throw new ScannerException("while scanning a simple key", removed.getMark(),
                "could not find expected ':'", reader.getMark());
    }
    // Indentation functions.
    /**
     * Handle implicitly ending multiple levels of block nodes by decreased
     * indentation. This function becomes important on lines 4 and 7 of this
     * example:
     *
     * <pre>
     * 1) book one:
     * 2)   part one:
     * 3)     chapter one
     * 4)   part two:
     * 5)     chapter one
     * 6)     chapter two
     * 7) book two:
     * </pre>
     *
     * In flow context, tokens should respect indentation. Actually the
     * condition should be {@code self.indent >= column} according to the spec.
     * But this condition will prohibit intuitively correct constructions such
     * as
     *
     * <pre>
     * key : { }
     * </pre>
     *
     * (The paragraphs above describe indentation unwinding; the text below
     * describes the alias fetcher that follows.)
     *
     * Fetch an alias. Aliases take the form:
     *
     * <pre>
     * *(anchor name)
     * </pre>
     *
     * @see "3.2.2.2. Anchors and Aliases"
     */
    private void fetchAlias() {
        // An ALIAS may be the start of a simple key.
        savePossibleSimpleKey();
        // A simple key cannot start right after an ALIAS.
        this.allowSimpleKey = false;
        // Scan the ALIAS and queue it.
        addToken(scanAnchor(false));
    }
    /**
     * Fetch an anchor. Anchors take the form:
     *
     * <pre>
     * &amp;(anchor name)
     * </pre>
     *
     * @see "3.2.2.2. Anchors and Aliases"
     */
    private void fetchAnchor() {
        // An ANCHOR may be the start of a simple key.
        savePossibleSimpleKey();
        // A simple key cannot start right after an ANCHOR.
        this.allowSimpleKey = false;
        // Scan the ANCHOR and queue it.
        addToken(scanAnchor(true));
    }
    /**
     * Fetch a tag. Tags take a complex form.
     *
     * @see "3.2.1.2. Tags"
     */
    private void fetchTag() {
        // A TAG may be the start of a simple key.
        savePossibleSimpleKey();
        // A simple key cannot start right after a TAG.
        this.allowSimpleKey = false;
        // Scan the TAG and queue it.
        addToken(scanTag());
    }
    /**
     * Fetch a literal scalar, denoted with a vertical-bar. This is the type
     * best used for source code and other content, such as binary data, which
     * must be included verbatim.
     *
     * @see "3.2.3.1. Node Styles"
     */
    private void fetchLiteral() {
        final char literalIndicator = '|';
        fetchBlockScalar(literalIndicator);
    }
    /**
     * Fetch a folded scalar, denoted with a greater-than sign. This is the type
     * best used for long content, such as the text of a chapter or description.
     *
     * @see "3.2.3.1. Node Styles"
     */
    private void fetchFolded() {
        final char foldedIndicator = '>';
        fetchBlockScalar(foldedIndicator);
    }
    /**
     * Fetch a block scalar (literal or folded).
     * 
     * @see 3.2.3.1. Node Styles
     * 
     * @param style
     */
    private void fetchBlockScalar(char style) {
        // A simple key may follow a block scalar.
        this.allowSimpleKey = true;
        // Reset possible simple key on the current level.
        removePossibleSimpleKey();
        // Scan and add SCALAR.
        List
         * A plain scalar may start with any non-space character except:
         *   '-', '?', ':', ',', '[', ']', '{', '}',
         *   '#', '&', '*', '!', '|', '>', '\'', '\"',
         *   '%', '@', '`'.
         * 
         * It may also start with
         *   '-', '?', ':'
         * if it is followed by a non-space character.
         * 
         * Note that we limit the last rule to the block context (except the
         * '-' character) because we want the flow context to be space
         * independent.
         * 
         */
        int c = reader.peek();
        // If the next char is NOT one of the forbidden chars above or
        // whitespace, then this is the start of a plain scalar.
        return Constant.NULL_BL_T_LINEBR.hasNo(c, "-?:,[]{}#&*!|>\'\"%@`")
                || (Constant.NULL_BL_T_LINEBR.hasNo(reader.peek(1)) && (c == '-' || (this.flowLevel == 0 && "?:"
                        .indexOf(c) != -1)));
    }
    // Scanners.
    /**
     * Advance the reader to the first character of the next token.
     *
     * <pre>
     * We ignore spaces, line breaks and comments.
     * If we find a line break in the block context, we set the flag
     * `allow_simple_key` on.
     * The byte order mark is stripped if it's the first character in the
     * stream. We do not yet support BOM inside the stream as the
     * specification requires. Any such mark will be considered as a part
     * of the document.
     * TODO: We need to make tab handling rules more sane. A good rule is
     *   Tabs cannot precede tokens
     *   BLOCK-SEQUENCE-START, BLOCK-MAPPING-START, BLOCK-END,
     *   KEY(block), VALUE(block), BLOCK-ENTRY
     * So the checking code is
     *   if &lt;TAB&gt;:
     *       self.allow_simple_keys = False
     * We also need to add the check for `allow_simple_keys == True` to
     * `unwind_indent` before issuing BLOCK-END.
     * Scanners for block, flow, and plain scalars need to be modified.
     * </pre>
     *
     * Comments are always scanned; they are only queued as tokens when
     * comment parsing is switched on. In that mode a line break that starts at
     * column 0 with no comment in front of it is queued as a BLANK_LINE
     * comment token.
     */
    private void scanToNextToken() {
        // If there is a byte order mark (BOM) at the beginning of the stream,
        // forward past it.
        if (reader.getIndex() == 0 && reader.peek() == 0xFEFF) {
            reader.forward();
        }
        boolean found = false;
        // Column of the most recent in-line comment, or -1 when there is none.
        // It lets a comment on a following line at the same column be
        // classified as IN_LINE too.
        int inlineStartColumn = -1;
        // Each iteration handles one stretch of spaces, an optional comment
        // and an optional line break; the loop ends when no line break follows.
        while (!found) {
            Mark startMark = reader.getMark();
            // Column at the start of this iteration, i.e. before any spaces
            // are skipped.
            int columnBeforeComment = reader.getColumn();
            boolean commentSeen = false;
            int ff = 0;
            // Peek ahead until we find the first non-space character, then
            // move forward directly to that character.
            while (reader.peek(ff) == ' ') {
                ff++;
            }
            if (ff > 0) {
                reader.forward(ff);
            }
            // If the character we have skipped forward to is a comment (#),
            // then peek ahead until we find the next end of line. YAML
            // comments are from a # to the next new-line. We then forward
            // past the comment.
            if (reader.peek() == '#') {
                commentSeen = true;
                CommentType type;
                // IN_LINE: the iteration did not start at column 0 and the
                // last added token is not a block entry; the comment's
                // column is remembered.
                if(columnBeforeComment != 0 && !(lastToken != null && lastToken.getTokenId() == Token.ID.BlockEntry)) {
                    type = CommentType.IN_LINE;
                    inlineStartColumn = reader.getColumn();
                // IN_LINE as well: the comment sits at the remembered column
                // of the previous in-line comment.
                } else if(inlineStartColumn == reader.getColumn()) {
                    type = CommentType.IN_LINE;
                // Anything else is a BLOCK comment and forgets the remembered
                // column.
                } else {
                    inlineStartColumn = -1;
                    type = CommentType.BLOCK;
                }
                // The comment is consumed in any case, but only queued when
                // comment parsing is on.
                CommentToken token = scanComment(type);
                if (parseComments) {
                    addToken(token);
                }
            }
            // If we scanned a line break, then (depending on flow level),
            // simple keys may be allowed.
            String breaks = scanLineBreak();
            if (breaks.length() != 0) {// found a line-break
                // With comment parsing on, a break with no comment before it
                // on an iteration that started at column 0 is recorded as a
                // BLANK_LINE comment token.
                if (parseComments && ! commentSeen) {
                    if (columnBeforeComment == 0) {
                        Mark endMark = reader.getMark();
                        addToken(new CommentToken(CommentType.BLANK_LINE, breaks, startMark, endMark));
                    }
                }
                if (this.flowLevel == 0) {
                    // Simple keys are allowed at flow-level 0 after a line
                    // break
                    this.allowSimpleKey = true;
                }
            } else {
                // No line break: the reader now stands at the next token.
                found = true;
            }
        }
    }
    /**
     * Scan a comment. The reader moves forward by one character first, and
     * everything after that up to (not including) the next character matched
     * by {@code Constant.NULL_OR_LINEBR} becomes the comment value.
     *
     * @param type the comment type to store in the token
     * @return the comment token; its start mark is taken before the first
     *         character is skipped
     */
    private CommentToken scanComment(CommentType type) {
        // See the specification for details.
        final Mark begin = reader.getMark();
        reader.forward();
        // Measure the comment text before consuming it in one go.
        int size = 0;
        for (; Constant.NULL_OR_LINEBR.hasNo(reader.peek(size)); size++) {
            // counting only
        }
        final String text = reader.prefixForward(size);
        final Mark end = reader.getMark();
        return new CommentToken(type, text, begin, end);
    }
    @SuppressWarnings({ "unchecked", "rawtypes" })
    private List* Read a %TAG directive value: * *
     * s-ignored-space+ c-tag-handle s-ignored-space+ ns-tag-prefix s-l-comments
     * 
     * 
     * 
     * 
     * @see 7.1.2. “TAG” Directive
     */
    private List
     * The YAML 1.1 specification does not restrict characters for anchors and
     * aliases. This may lead to problems.
     * see https://bitbucket.org/snakeyaml/snakeyaml/issues/485/alias-names-are-too-permissive-compared-to
     * This implementation tries to follow https://github.com/yaml/yaml-spec/blob/master/rfc/RFC-0003.md
     * 
     */
    private Token scanAnchor(boolean isAnchor) {
        final Mark begin = reader.getMark();
        // The indicator only decides the wording of error messages; the kind
        // of token produced is decided by isAnchor.
        final String kind = (reader.peek() == '*') ? "alias" : "anchor";
        reader.forward();
        // Measure the name: it runs up to the first character that is matched
        // by NULL_BL_T_LINEBR or is one of ":,[]{}/.*&".
        int size = 0;
        int ch = reader.peek(size);
        while (Constant.NULL_BL_T_LINEBR.hasNo(ch, ":,[]{}/.*&")) {
            size++;
            ch = reader.peek(size);
        }
        if (size == 0) {
            // An empty name is not allowed.
            final String shown = String.valueOf(Character.toChars(ch));
            throw new ScannerException("while scanning an " + kind, begin,
                    "unexpected character found " + shown + "(" + ch + ")", reader.getMark());
        }
        final String value = reader.prefixForward(size);
        // Validate the character that follows the name.
        ch = reader.peek();
        if (Constant.NULL_BL_T_LINEBR.hasNo(ch, "?:,]}%@`")) {
            final String shown = String.valueOf(Character.toChars(ch));
            throw new ScannerException("while scanning an " + kind, begin,
                    "unexpected character found " + shown + "(" + ch + ")", reader.getMark());
        }
        final Mark end = reader.getMark();
        if (isAnchor) {
            return new AnchorToken(value, begin, end);
        }
        return new AliasToken(value, begin, end);
    }
    /**
     * * Scan a Tag property. A Tag property may be specified in one of three * ways: c-verbatim-tag, c-ns-shorthand-tag, or c-ns-non-specific-tag *
* ** c-verbatim-tag takes the form !<ns-uri-char+> and must be delivered * verbatim (as-is) to the application. In particular, verbatim tags are not * subject to tag resolution. *
* ** c-ns-shorthand-tag is a valid tag handle followed by a non-empty suffix. * If the tag handle is a c-primary-tag-handle ('!') then the suffix must * have all exclamation marks properly URI-escaped (%21); otherwise, the * string will look like a named tag handle: !foo!bar would be interpreted * as (handle="!foo!", suffix="bar"). *
* ** c-ns-non-specific-tag is always a lone '!'; this is only useful for plain * scalars, where its specification means that the scalar MUST be resolved * to have type tag:yaml.org,2002:str. *
* * TODO SnakeYaml incorrectly ignores c-ns-non-specific-tag right now. * * @see 8.2. Node Tags * * TODO Note that this method does not enforce rules about local versus * global tags! */ private Token scanTag() { // See the specification for details. Mark startMark = reader.getMark(); // Determine the type of tag property based on the first character // encountered int c = reader.peek(1); String handle = null; String suffix = null; // Verbatim tag! (c-verbatim-tag) if (c == '<') { // Skip the exclamation mark and >, then read the tag suffix (as // a URI). reader.forward(2); suffix = scanTagUri("tag", startMark); c = reader.peek(); if (c != '>') { // If there are any characters between the end of the tag-suffix // URI and the closing >, then an error has occurred. final String s = String.valueOf(Character.toChars(c)); throw new ScannerException("while scanning a tag", startMark, "expected '>', but found '" + s + "' (" + c + ")", reader.getMark()); } reader.forward(); } else if (Constant.NULL_BL_T_LINEBR.has(c)) { // A NUL, blank, tab, or line-break means that this was a // c-ns-non-specific tag. suffix = "!"; reader.forward(); } else { // Any other character implies c-ns-shorthand-tag type. // Look ahead in the stream to determine whether this tag property // is of the form !foo or !foo!bar. int length = 1; boolean useHandle = false; while (Constant.NULL_BL_LINEBR.hasNo(c)) { if (c == '!') { useHandle = true; break; } length++; c = reader.peek(length); } // If we need to use a handle, scan it in; otherwise, the handle is // presumed to be '!'. if (useHandle) { handle = scanTagHandle("tag", startMark); } else { handle = "!"; reader.forward(); } suffix = scanTagUri("tag", startMark); } c = reader.peek(); // Check that the next character is allowed to follow a tag-property; // if it is not, raise the error. 
if (Constant.NULL_BL_LINEBR.hasNo(c)) { final String s = String.valueOf(Character.toChars(c)); throw new ScannerException("while scanning a tag", startMark, "expected ' ', but found '" + s + "' (" + (c) + ")", reader.getMark()); } TagTuple value = new TagTuple(handle, suffix); Mark endMark = reader.getMark(); return new TagToken(value, startMark, endMark); } private List
     * See the specification for details.
     * Note that we loosen indentation rules for quoted scalars. Quoted
     * scalars don't need to adhere to indentation because " and ' clearly
     * mark the beginning and the end of them. Therefore we are less
     * restrictive than the specification requires. We only need to check
     * that document separators are not included in scalars.
     * 
     */
    private Token scanFlowScalar(char style) {
        // The caller passes the opening quote character; '"' selects the
        // double-quoted flavour, anything else is handled as single-quoted.
        final boolean doubleQuoted = style == '"';
        final StringBuilder text = new StringBuilder();
        final Mark startMark = reader.getMark();

        // Remember the actual opening quote so that the matching closing quote
        // can be recognised, then step over it.
        final int openingQuote = reader.peek();
        reader.forward();

        // The content is a run of non-space characters, followed by any number
        // of (spaces, non-spaces) pairs, until the closing quote is reached.
        text.append(scanFlowScalarNonSpaces(doubleQuoted, startMark));
        while (reader.peek() != openingQuote) {
            final String gap = scanFlowScalarSpaces(startMark);
            text.append(gap);
            final String run = scanFlowScalarNonSpaces(doubleQuoted, startMark);
            text.append(run);
        }

        // Consume the closing quote; the end mark points just past it.
        reader.forward();
        final Mark endMark = reader.getMark();
        final DumperOptions.ScalarStyle scalarStyle = DumperOptions.ScalarStyle.createStyle(style);
        return new ScalarToken(text.toString(), false, startMark, endMark, scalarStyle);
    }
    /**
     * Scan some number of flow-scalar non-space characters.
     */
    private String scanFlowScalarNonSpaces(boolean doubleQuoted, Mark startMark) {
        // Collects the text of a quoted scalar up to (but not including) the
        // next blank, line break, end of stream, or closing quote. Quote
        // doubling ('') and, for double-quoted scalars, backslash escapes are
        // resolved here. Malformed escapes are reported as ScannerException
        // using startMark as the context mark.
        // See the specification for details.
        StringBuilder chunks = new StringBuilder();
        while (true) {
            // Scan through any number of characters which are not: NUL, blank,
            // tabs, line breaks, single-quotes, double-quotes, or backslashes.
            int length = 0;
            while (Constant.NULL_BL_T_LINEBR.hasNo(reader.peek(length), "\'\"\\")) {
                length++;
            }
            if (length != 0) {
                chunks.append(reader.prefixForward(length));
            }
            // Depending on our quoting-type, the characters ', " and \ have
            // differing meanings.
            int c = reader.peek();
            if (!doubleQuoted && c == '\'' && reader.peek(1) == '\'') {
                // Inside a single-quoted scalar, '' stands for one literal quote.
                chunks.append("'");
                reader.forward(2);
            } else if ((doubleQuoted && c == '\'') || (!doubleQuoted && "\"\\".indexOf(c) != -1)) {
                // The "other" quote character (and, in single-quoted scalars,
                // the backslash) has no special meaning: copy it verbatim.
                chunks.appendCodePoint(c);
                reader.forward();
            } else if (doubleQuoted && c == '\\') {
                reader.forward();
                c = reader.peek();
                if (!Character.isSupplementaryCodePoint(c) && ESCAPE_REPLACEMENTS.containsKey(Character.valueOf((char)c))) {
                    // The character is one of the single-replacement
                    // types; these are replaced with a literal character
                    // from the mapping.
                    chunks.append(ESCAPE_REPLACEMENTS.get(Character.valueOf((char)c)));
                    reader.forward();
                } else if (!Character.isSupplementaryCodePoint(c) && ESCAPE_CODES.containsKey(Character.valueOf((char)c))) {
                    // The character is a multi-digit escape sequence, with
                    // length defined by the value in the ESCAPE_CODES map.
                    length = ESCAPE_CODES.get(Character.valueOf((char)c)).intValue();
                    reader.forward();
                    String hex = reader.prefix(length);
                    if (NOT_HEXA.matcher(hex).find()) {
                        throw new ScannerException("while scanning a double-quoted scalar",
                                startMark, "expected escape sequence of " + length
                                        + " hexadecimal numbers, but found: " + hex,
                                reader.getMark());
                    }
                    try {
                        // Well-formed hex digits can still denote a value that
                        // does not fit in an int (e.g. FFFFFFFF) or is not a
                        // valid Unicode code point (e.g. 00110000). Report
                        // those as scanner errors rather than leaking
                        // NumberFormatException / IllegalArgumentException.
                        chunks.appendCodePoint(Integer.parseInt(hex, 16));
                    } catch (IllegalArgumentException e) {
                        throw new ScannerException("while scanning a double-quoted scalar",
                                startMark, "found invalid Unicode escape value: " + hex,
                                reader.getMark());
                    }
                    reader.forward(length);
                } else if (scanLineBreak().length() != 0) {
                    // A backslash directly before a line break: the break is
                    // consumed and only the breaks that follow are kept.
                    chunks.append(scanFlowScalarBreaks(startMark));
                } else {
                    final String s = String.valueOf(Character.toChars(c));
                    throw new ScannerException("while scanning a double-quoted scalar", startMark,
                            "found unknown escape character " + s + "(" + c + ")",
                            reader.getMark());
                }
            } else {
                // Blank, line break, end of stream or the closing quote: the
                // caller decides what to do next.
                return chunks.toString();
            }
        }
    }
    /**
     * Scan the blanks (and any line breaks) that separate two non-space runs
     * inside a quoted scalar.
     *
     * @param startMark mark of the scalar's opening quote, used as the context
     *        mark in error reports
     * @return the text these blanks contribute to the scalar: the blanks
     *         themselves when no line break follows them, otherwise the folded
     *         line breaks
     * @throws ScannerException if the stream ends inside the scalar
     */
    private String scanFlowScalarSpaces(Mark startMark) {
        // See the specification for details.
        // Measure the run of spaces and tabs first, then consume it in one step.
        int blankCount = 0;
        int ahead = reader.peek(blankCount);
        while (ahead == ' ' || ahead == '\t') {
            blankCount++;
            ahead = reader.peek(blankCount);
        }
        final String blanks = reader.prefixForward(blankCount);

        // A flow scalar cannot end with an end-of-stream
        if (reader.peek() == '\0') {
            throw new ScannerException("while scanning a quoted scalar", startMark,
                    "found unexpected end of stream", reader.getMark());
        }

        final String firstBreak = scanLineBreak();
        if (firstBreak.length() == 0) {
            // No line break follows: the blanks are literal content.
            return blanks;
        }

        // A line break follows, so the blanks before it are dropped and the
        // breaks are folded: a single "\n" becomes one space, while additional
        // breaks (or a non-"\n" first break) are kept.
        final String moreBreaks = scanFlowScalarBreaks(startMark);
        final StringBuilder folded = new StringBuilder();
        if (!"\n".equals(firstBreak)) {
            folded.append(firstBreak);
        } else if (moreBreaks.length() == 0) {
            folded.append(" ");
        }
        folded.append(moreBreaks);
        return folded.toString();
    }
    /**
     * Collect consecutive line breaks inside a quoted scalar, skipping the
     * spaces and tabs that lead each line.
     *
     * @param startMark mark of the scalar's opening quote, used as the context
     *        mark in error reports
     * @return the concatenated line breaks (possibly empty)
     * @throws ScannerException if a document separator is found inside the scalar
     */
    private String scanFlowScalarBreaks(Mark startMark) {
        // See the specification for details.
        final StringBuilder collected = new StringBuilder();
        for (;;) {
            // Indentation is not validated for quoted scalars; the only thing
            // rejected is a document separator ("---" or "...") followed by
            // NUL, a blank, a tab or a line break.
            final String head = reader.prefix(3);
            final boolean looksLikeSeparator = "---".equals(head) || "...".equals(head);
            if (looksLikeSeparator && Constant.NULL_BL_T_LINEBR.has(reader.peek(3))) {
                throw new ScannerException("while scanning a quoted scalar", startMark,
                        "found unexpected document separator", reader.getMark());
            }

            // Leading spaces and tabs of the line carry no content.
            int ch = reader.peek();
            while (ch == ' ' || ch == '\t') {
                reader.forward();
                ch = reader.peek();
            }

            // Keep going while each line turns out to be just a line break;
            // the first line with real content ends the collection.
            final String lineBreak = scanLineBreak();
            if (lineBreak.length() == 0) {
                return collected.toString();
            }
            collected.append(lineBreak);
        }
    }
    /**
     * Scan a plain scalar.
     * 
     * 
     * See the specification for details.
     * We add an additional restriction for the flow context:
     *   plain scalars in the flow context cannot contain ',', ':' and '?'.
     * We also keep track of the `allow_simple_key` flag here.
     * Indentation rules are loosened for the flow context.
     * 
     */
    private Token scanPlain() {
        final StringBuilder text = new StringBuilder();
        final Mark startMark = reader.getMark();
        // Until a first chunk is read, the token is empty and ends where it starts.
        Mark endMark = startMark;
        // Continuation lines in block context must not start left of this column.
        final int minColumn = this.indent + 1;
        final boolean inFlow = this.flowLevel != 0;
        // In flow context a ':' also acts as a value indicator when it is
        // directly followed by one of the flow indicators.
        final String extraAfterColon = inFlow ? ",[]{}" : "";
        // Whitespace seen after the previous chunk; it is only emitted once
        // another chunk follows it.
        String pendingSpaces = "";

        // A comment indicates the end of the scalar.
        while (reader.peek() != '#') {
            // Measure the next run of characters that belong to the scalar.
            int runLength = 0;
            while (true) {
                final int ch = reader.peek(runLength);
                if (Constant.NULL_BL_T_LINEBR.has(ch)) {
                    break;
                }
                if (ch == ':' && Constant.NULL_BL_T_LINEBR.has(reader.peek(runLength + 1), extraAfterColon)) {
                    break;
                }
                if (inFlow && ",?[]{}".indexOf(ch) != -1) {
                    break;
                }
                runLength++;
            }
            if (runLength == 0) {
                break;
            }

            this.allowSimpleKey = false;
            text.append(pendingSpaces);
            text.append(reader.prefixForward(runLength));
            endMark = reader.getMark();

            pendingSpaces = scanPlainSpaces();
            // Stop when nothing separates us from the next token, when a
            // comment starts, or (block context only) when the next line is
            // indented less than required.
            if (pendingSpaces.length() == 0 || reader.peek() == '#'
                    || (!inFlow && this.reader.getColumn() < minColumn)) {
                break;
            }
        }
        return new ScalarToken(text.toString(), startMark, endMark, true);
    }
    // Helper for scanPlainSpaces method when comments are enabled.
    // This ensures that blank lines and comments following a multi-line plain token are not swallowed up.
    private boolean atEndOfPlain() {
        // Decides, by look-ahead only (peek/getColumn; nothing is consumed),
        // whether the plain scalar being scanned should stop before the
        // upcoming whitespace. Returns true to stop, false when the
        // whitespace may be folded into the scalar.

        // Peek ahead to find the end of the whitespace run and the column at which it ends.
        int wsLength = 0;
        int wsColumn = this.reader.getColumn();
        {
            int c;
            // Walk over NUL/blank/tab/line-break characters, stopping at end
            // of stream; wsColumn is reset to 0 on a line break and
            // incremented otherwise.
            while ((c = reader.peek(wsLength)) != '\0' && Constant.NULL_BL_T_LINEBR.has(c)) {
                wsLength++;
                // NOTE(review): wsLength has already been incremented here, so
                // peek(wsLength + 1) inspects the character two positions after
                // c rather than the one right after it. Presumably a "\r\n"
                // check was intended; confirm before changing.
                if (!Constant.LINEBR.has(c) && (c != '\r' || reader.peek(wsLength + 1) != '\n') && c != 0xFEFF) {
                    wsColumn++;
                } else {
                    wsColumn = 0;
                }
            }
        }
        // If we see a comment, the end of the stream, or a decrease in indent, we are done.
        // Do not chomp end of lines and blanks, they will be handled by the main loop.
        // NOTE(review): the end-of-stream test looks at peek(wsLength + 1), i.e.
        // one character past the first non-whitespace character. Verify that
        // this offset is intentional.
        if (reader.peek(wsLength) == '#' || reader.peek(wsLength + 1) == '\0'
                || this.flowLevel == 0 && wsColumn < this.indent) {
            return true;
        }
        // If, after the whitespace, we see text followed by a ':' that is itself followed by
        // NUL, a blank, a tab or a line break (i.e. a mapping key), we are done.
        // Do not chomp end of lines and blanks, they will be handled by the main loop.
        // This check is made in block context only (flowLevel == 0).
        if (this.flowLevel == 0) {
            int c;
            // Scan up to the next NUL/blank/tab/line break, starting one
            // character after the first non-whitespace character.
            for(int extra = 1; (c = reader.peek(wsLength + extra)) != 0 && !Constant.NULL_BL_T_LINEBR.has(c); extra++) {
                if (c == ':' && Constant.NULL_BL_T_LINEBR.has(reader.peek(wsLength + extra + 1))) {
                    return true;
                }
            }
        }
        // None of the above so safe to chomp the spaces.
        return false;
    }
    /**
     * See the specification for details. SnakeYAML and libyaml allow tabs
     * inside plain scalar
     */
    private String scanPlainSpaces() {
        // Consumes the whitespace that follows a chunk of a plain scalar and
        // returns the text it contributes if the scalar continues:
        //   - the blanks themselves when no line break follows them;
        //   - "" when the scalar must end here (document separator on the next
        //     line, or atEndOfPlain() says so while comments are parsed);
        //   - otherwise the folded line breaks (a single "\n" becomes " ").

        // Spaces and tabs directly after the chunk.
        int length = 0;
        while (reader.peek(length) == ' ' || reader.peek(length) == '\t') {
            length++;
        }
        String whitespaces = reader.prefixForward(length);
        String lineBreak = scanLineBreak();
        if (lineBreak.length() == 0) {
            return whitespaces;
        }

        // A new line has started, so a simple key may begin here.
        this.allowSimpleKey = true;
        String prefix = reader.prefix(3);
        // "---" and "..." only act as document separators when followed by
        // NUL, a blank, a tab or a line break. The parentheses matter: without
        // them '&&' binds tighter than '||', and any line starting with "---"
        // (e.g. "---abc") would wrongly terminate the scalar.
        if (("---".equals(prefix) || "...".equals(prefix))
                && Constant.NULL_BL_T_LINEBR.has(reader.peek(3))) {
            return "";
        }
        if (parseComments && atEndOfPlain()) {
            return "";
        }

        // Collect further line breaks, skipping the spaces that lead each line.
        StringBuilder breaks = new StringBuilder();
        while (true) {
            if (reader.peek() == ' ') {
                reader.forward();
            } else {
                String lb = scanLineBreak();
                if (lb.length() != 0) {
                    breaks.append(lb);
                    prefix = reader.prefix(3);
                    // Same separator rule as above, applied at each new line.
                    if (("---".equals(prefix) || "...".equals(prefix))
                            && Constant.NULL_BL_T_LINEBR.has(reader.peek(3))) {
                        return "";
                    }
                } else {
                    break;
                }
            }
        }

        // Line folding: a non-"\n" first break is kept as-is, a lone "\n"
        // becomes a single space, and with additional breaks only those
        // additional breaks are kept.
        if (!"\n".equals(lineBreak)) {
            return lineBreak + breaks;
        } else if (breaks.length() == 0) {
            return " ";
        }
        return breaks.toString();
    }
    /**
     * Scan a Tag handle. A Tag handle takes one of three forms:
     *
     * <pre>
     * "!" (c-primary-tag-handle)
     * "!!" (ns-secondary-tag-handle)
     * "!(name)!" (c-named-tag-handle)
     * </pre>
     *
     * Where (name) must be formatted as an ns-word-char.
     *
     * <pre>
     * See the specification for details.
     * For some strange reason, the specification does not allow '_' in
     * tag handles. I have allowed it anyway.
     * </pre>
     */
    private String scanTagHandle(String name, Mark startMark) {
        // A handle always opens with '!'.
        final int first = reader.peek();
        if (first != '!') {
            final String found = String.valueOf(Character.toChars(first));
            throw new ScannerException("while scanning a " + name, startMark,
                    "expected '!', but found " + found + "(" + (first) + ")", reader.getMark());
        }

        // A '!' directly followed by a space is the primary handle on its own.
        int handleLength = 1;
        int next = reader.peek(handleLength);
        if (next == ' ') {
            return reader.prefixForward(handleLength);
        }

        // Otherwise skip over the (possibly empty) run of word characters.
        // FIXME According to the specification, these should be
        // ns-word-char only, which prohibits '_'. This might be a
        // candidate for a configuration option.
        while (Constant.ALPHA.has(next)) {
            handleLength++;
            next = reader.peek(handleLength);
        }

        // The run must be closed by a second '!'; anything else is an error.
        // The reader is advanced first so that the problem mark points at the
        // offending character.
        if (next != '!') {
            reader.forward(handleLength);
            final String found = String.valueOf(Character.toChars(next));
            throw new ScannerException("while scanning a " + name, startMark,
                    "expected '!', but found " + found + "(" + (next) + ")", reader.getMark());
        }

        // Include the closing '!' in the returned handle.
        return reader.prefixForward(handleLength + 1);
    }
    /**
     * * Scan a Tag URI. This scanning is valid for both local and global tag * directives, because both appear to be valid URIs as far as scanning is * concerned. The difference may be distinguished later, in parsing. This * method will scan for ns-uri-char*, which covers both cases. *
* ** This method performs no verification that the scanned URI conforms to any * particular kind of URI specification. *
* * @see */ private String scanTagUri(String name, Mark startMark) { // See the specification for details. // Note: we do not check if URI is well-formed. StringBuilder chunks = new StringBuilder(); // Scan through accepted URI characters, which includes the standard // URI characters, plus the start-escape character ('%'). When we get // to a start-escape, scan the escaped sequence, then return. int length = 0; int c = reader.peek(length); while (Constant.URI_CHARS.has(c)) { if (c == '%') { chunks.append(reader.prefixForward(length)); length = 0; chunks.append(scanUriEscapes(name, startMark)); } else { length++; } c = reader.peek(length); } // Consume the last "chunk", which would not otherwise be consumed by // the loop above. if (length != 0) { chunks.append(reader.prefixForward(length)); } if (chunks.length() == 0) { // If no URI was found, an error has occurred. final String s = String.valueOf(Character.toChars(c)); throw new ScannerException("while scanning a " + name, startMark, "expected URI, but found " + s + "(" + (c) + ")", reader.getMark()); } return chunks.toString(); } /** ** Scan a sequence of %-escaped URI escape codes and convert them into a * String representing the unescaped values. *
* * FIXME This method fails for more than 256 bytes' worth of URI-encoded * characters in a row. Is this possible? Is this a use-case? * * @see section 2.4, Escaped Encoding */ private String scanUriEscapes(String name, Mark startMark) { // First, look ahead to see how many URI-escaped characters we should // expect, so we can use the correct buffer size. int length = 1; while (reader.peek(length * 3) == '%') { length++; } // See the specification for details. // URIs containing 16 and 32 bit Unicode characters are // encoded in UTF-8, and then each octet is written as a // separate character. Mark beginningMark = reader.getMark(); ByteBuffer buff = ByteBuffer.allocate(length); while (reader.peek() == '%') { reader.forward(); try { byte code = (byte) Integer.parseInt(reader.prefix(2), 16); buff.put(code); } catch (NumberFormatException nfe) { int c1 = reader.peek(); final String s1 = String.valueOf(Character.toChars(c1)); int c2 = reader.peek(1); final String s2 = String.valueOf(Character.toChars(c2)); throw new ScannerException("while scanning a " + name, startMark, "expected URI escape sequence of 2 hexadecimal numbers, but found " + s1 + "(" + c1 + ") and " + s2 + "(" + c2 + ")", reader.getMark()); } reader.forward(2); } buff.flip(); try { return UriEncoder.decode(buff); } catch (CharacterCodingException e) { throw new ScannerException("while scanning a " + name, startMark, "expected URI in UTF-8: " + e.getMessage(), beginningMark); } } /** * Scan a line break, transforming: * *
     * '\r\n' : '\n'
     * '\r' : '\n'
     * '\n' : '\n'
     * '\x85' : '\n'
     * default : ''
     * 
     */
    private String scanLineBreak() {
        // Consumes at most one line break at the current position and returns
        // its normalised form; returns "" (consuming nothing) when the reader
        // is not positioned at a line break.
        final int c = reader.peek();
        switch (c) {
            case '\r':
                // "\r\n" counts as a single break; a lone '\r' is a break too.
                if (reader.peek(1) == '\n') {
                    reader.forward(2);
                } else {
                    reader.forward();
                }
                return "\n";
            case '\n':
            case '\u0085':
                reader.forward();
                return "\n";
            case '\u2028':
            case '\u2029':
                // Line separator / paragraph separator are returned unchanged.
                reader.forward();
                return String.valueOf(Character.toChars(c));
            default:
                return "";
        }
    }
    private List