org.yaml.snakeyaml.scanner.ScannerImpl Maven / Gradle / Ivy
/**
* Copyright (c) 2008, SnakeYAML
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.yaml.snakeyaml.scanner;
import java.nio.ByteBuffer;
import java.nio.charset.CharacterCodingException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Pattern;
import org.yaml.snakeyaml.DumperOptions;
import org.yaml.snakeyaml.comments.CommentType;
import org.yaml.snakeyaml.error.Mark;
import org.yaml.snakeyaml.error.YAMLException;
import org.yaml.snakeyaml.reader.StreamReader;
import org.yaml.snakeyaml.tokens.AliasToken;
import org.yaml.snakeyaml.tokens.AnchorToken;
import org.yaml.snakeyaml.tokens.BlockEndToken;
import org.yaml.snakeyaml.tokens.BlockEntryToken;
import org.yaml.snakeyaml.tokens.BlockMappingStartToken;
import org.yaml.snakeyaml.tokens.BlockSequenceStartToken;
import org.yaml.snakeyaml.tokens.CommentToken;
import org.yaml.snakeyaml.tokens.DirectiveToken;
import org.yaml.snakeyaml.tokens.DocumentEndToken;
import org.yaml.snakeyaml.tokens.DocumentStartToken;
import org.yaml.snakeyaml.tokens.FlowEntryToken;
import org.yaml.snakeyaml.tokens.FlowMappingEndToken;
import org.yaml.snakeyaml.tokens.FlowMappingStartToken;
import org.yaml.snakeyaml.tokens.FlowSequenceEndToken;
import org.yaml.snakeyaml.tokens.FlowSequenceStartToken;
import org.yaml.snakeyaml.tokens.KeyToken;
import org.yaml.snakeyaml.tokens.ScalarToken;
import org.yaml.snakeyaml.tokens.StreamEndToken;
import org.yaml.snakeyaml.tokens.StreamStartToken;
import org.yaml.snakeyaml.tokens.TagToken;
import org.yaml.snakeyaml.tokens.TagTuple;
import org.yaml.snakeyaml.tokens.Token;
import org.yaml.snakeyaml.tokens.ValueToken;
import org.yaml.snakeyaml.util.ArrayStack;
import org.yaml.snakeyaml.util.UriEncoder;
/**
*
* Scanner produces tokens of the following types:
* STREAM-START
* STREAM-END
* COMMENT
* DIRECTIVE(name, value)
* DOCUMENT-START
* DOCUMENT-END
* BLOCK-SEQUENCE-START
* BLOCK-MAPPING-START
* BLOCK-END
* FLOW-SEQUENCE-START
* FLOW-MAPPING-START
* FLOW-SEQUENCE-END
* FLOW-MAPPING-END
* BLOCK-ENTRY
* FLOW-ENTRY
* KEY
* VALUE
* ALIAS(value)
* ANCHOR(value)
* TAG(value)
* SCALAR(value, plain, style)
* Read comments in the Scanner code for more details.
*
*/
public final class ScannerImpl implements Scanner {
/**
* A regular expression matching characters which are not in the hexadecimal
* set (0-9, A-F, a-f).
*/
private final static Pattern NOT_HEXA = Pattern.compile("[^0-9A-Fa-f]");
/**
* A mapping from an escaped character in the input stream to the character
* that they should be replaced with.
*
* YAML defines several common and a few uncommon escape sequences.
*
* @see 4.1.6.
* Escape Sequences
*/
public final static Map ESCAPE_REPLACEMENTS = new HashMap();
/**
* A mapping from a character to a number of bytes to read-ahead for that
* escape sequence. These escape sequences are used to handle unicode
* escaping in the following formats, where H is a hexadecimal character:
*
*
* \xHH : escaped 8-bit Unicode character
* \uHHHH : escaped 16-bit Unicode character
* \UHHHHHHHH : escaped 32-bit Unicode character
*
*
* @see 5.6. Escape
* Sequences
*/
public final static Map ESCAPE_CODES = new HashMap();
static {
// ASCII null
ESCAPE_REPLACEMENTS.put(Character.valueOf('0'), "\0");
// ASCII bell
ESCAPE_REPLACEMENTS.put(Character.valueOf('a'), "\u0007");
// ASCII backspace
ESCAPE_REPLACEMENTS.put(Character.valueOf('b'), "\u0008");
// ASCII horizontal tab
ESCAPE_REPLACEMENTS.put(Character.valueOf('t'), "\u0009");
// ASCII newline (line feed; \n maps to 0x0A)
ESCAPE_REPLACEMENTS.put(Character.valueOf('n'), "\n");
// ASCII vertical tab
ESCAPE_REPLACEMENTS.put(Character.valueOf('v'), "\u000B");
// ASCII form-feed
ESCAPE_REPLACEMENTS.put(Character.valueOf('f'), "\u000C");
// carriage-return (\r maps to 0x0D)
ESCAPE_REPLACEMENTS.put(Character.valueOf('r'), "\r");
// ASCII escape character (Esc)
ESCAPE_REPLACEMENTS.put(Character.valueOf('e'), "\u001B");
// ASCII space
ESCAPE_REPLACEMENTS.put(Character.valueOf(' '), "\u0020");
// ASCII double-quote
ESCAPE_REPLACEMENTS.put(Character.valueOf('"'), "\"");
// ASCII backslash
ESCAPE_REPLACEMENTS.put(Character.valueOf('\\'), "\\");
// Unicode next line
ESCAPE_REPLACEMENTS.put(Character.valueOf('N'), "\u0085");
// Unicode non-breaking-space
ESCAPE_REPLACEMENTS.put(Character.valueOf('_'), "\u00A0");
// Unicode line-separator
ESCAPE_REPLACEMENTS.put(Character.valueOf('L'), "\u2028");
// Unicode paragraph separator
ESCAPE_REPLACEMENTS.put(Character.valueOf('P'), "\u2029");
// 8-bit Unicode
ESCAPE_CODES.put(Character.valueOf('x'), 2);
// 16-bit Unicode
ESCAPE_CODES.put(Character.valueOf('u'), 4);
// 32-bit Unicode (Supplementary characters are supported)
ESCAPE_CODES.put(Character.valueOf('U'), 8);
}
private final StreamReader reader;
// Had we reached the end of the stream?
private boolean done = false;
// The number of unclosed '{' and '['. `flow_level == 0` means block
// context.
private int flowLevel = 0;
// List of processed tokens that are not yet emitted.
private List tokens;
// The last added token
private Token lastToken;
// Number of tokens that were emitted through the `get_token` method.
private int tokensTaken = 0;
// The current indentation level.
private int indent = -1;
// Past indentation levels.
private ArrayStack indents;
// A flag that indicates if comments should be parsed
private boolean parseComments;
// Variables related to simple keys treatment. See PyYAML.
/**
*
* A simple key is a key that is not denoted by the '?' indicator.
* Example of simple keys:
* ---
* block simple key: value
* ? not a simple key:
* : { flow simple key: value }
* We emit the KEY token before all keys, so when we find a potential
* simple key, we try to locate the corresponding ':' indicator.
* Simple keys should be limited to a single line and 1024 characters.
*
* Can a simple key start at the current position? A simple key may
* start:
* - at the beginning of the line, not counting indentation spaces
* (in block context),
* - after '{', '[', ',' (in the flow context),
* - after '?', ':', '-' (in the block context).
* In the block context, this flag also signifies if a block collection
* may start at the current position.
*
*/
private boolean allowSimpleKey = true;
/*
* Keep track of possible simple keys. This is a dictionary. The key is
* `flow_level`; there can be no more that one possible simple key for each
* level. The value is a SimpleKey record: (token_number, required, index,
* line, column, mark) A simple key may start with ALIAS, ANCHOR, TAG,
* SCALAR(flow), '[', or '{' tokens.
*/
private Map possibleSimpleKeys;
public ScannerImpl(StreamReader reader) {
this.parseComments = false;
this.reader = reader;
this.tokens = new ArrayList(100);
this.indents = new ArrayStack(10);
// The order in possibleSimpleKeys is kept for nextPossibleSimpleKey()
this.possibleSimpleKeys = new LinkedHashMap();
fetchStreamStart();// Add the STREAM-START token.
}
/**
* Set the scanner to ignore comments or parse them as a CommentToken.
*
* @param parseComments true
to parse; false
to ignore
*/
public ScannerImpl setParseComments(boolean parseComments) {
this.parseComments = parseComments;
return this;
}
public boolean isParseComments() {
return parseComments;
}
/**
* Check whether the next token is one of the given types.
*/
public boolean checkToken(Token.ID... choices) {
while (needMoreTokens()) {
fetchMoreTokens();
}
if (!this.tokens.isEmpty()) {
if (choices.length == 0) {
return true;
}
// since profiler puts this method on top (it is used a lot), we
// should not use 'foreach' here because of the performance reasons
Token.ID first = this.tokens.get(0).getTokenId();
for (int i = 0; i < choices.length; i++) {
if (first == choices[i]) {
return true;
}
}
}
return false;
}
/**
* Return the next token, but do not delete it from the queue.
*/
public Token peekToken() {
while (needMoreTokens()) {
fetchMoreTokens();
}
return this.tokens.get(0);
}
/**
* Return the next token, removing it from the queue.
*/
public Token getToken() {
this.tokensTaken++;
return this.tokens.remove(0);
}
// Private methods.
private void addToken(Token token) {
lastToken = token;
this.tokens.add(token);
}
private void addToken(int index, Token token) {
if(index == this.tokens.size()) {
lastToken = token;
}
this.tokens.add(index, token);
}
private void addAllTokens(List tokens) {
lastToken = tokens.get(tokens.size()-1);
this.tokens.addAll(tokens);
}
/**
* Returns true if more tokens should be scanned.
*/
private boolean needMoreTokens() {
// If we are done, we do not require more tokens.
if (this.done) {
return false;
}
// If we aren't done, but we have no tokens, we need to scan more.
if (this.tokens.isEmpty()) {
return true;
}
// The current token may be a potential simple key, so we
// need to look further.
stalePossibleSimpleKeys();
return nextPossibleSimpleKey() == this.tokensTaken;
}
/**
* Fetch one or more tokens from the StreamReader.
*/
private void fetchMoreTokens() {
// Eat whitespaces and process comments until we reach the next token.
scanToNextToken();
// Remove obsolete possible simple keys.
stalePossibleSimpleKeys();
// Compare the current indentation and column. It may add some tokens
// and decrease the current indentation level.
unwindIndent(reader.getColumn());
// Peek the next code point, to decide what the next group of tokens
// will look like.
int c = reader.peek();
switch (c) {
case '\0':
// Is it the end of stream?
fetchStreamEnd();
return;
case '%':
// Is it a directive?
if (checkDirective()) {
fetchDirective();
return;
}
break;
case '-':
// Is it the document start?
if (checkDocumentStart()) {
fetchDocumentStart();
return;
// Is it the block entry indicator?
} else if (checkBlockEntry()) {
fetchBlockEntry();
return;
}
break;
case '.':
// Is it the document end?
if (checkDocumentEnd()) {
fetchDocumentEnd();
return;
}
break;
// TODO support for BOM within a stream. (also not implemented in PyYAML)
case '[':
// Is it the flow sequence start indicator?
fetchFlowSequenceStart();
return;
case '{':
// Is it the flow mapping start indicator?
fetchFlowMappingStart();
return;
case ']':
// Is it the flow sequence end indicator?
fetchFlowSequenceEnd();
return;
case '}':
// Is it the flow mapping end indicator?
fetchFlowMappingEnd();
return;
case ',':
// Is it the flow entry indicator?
fetchFlowEntry();
return;
// see block entry indicator above
case '?':
// Is it the key indicator?
if (checkKey()) {
fetchKey();
return;
}
break;
case ':':
// Is it the value indicator?
if (checkValue()) {
fetchValue();
return;
}
break;
case '*':
// Is it an alias?
fetchAlias();
return;
case '&':
// Is it an anchor?
fetchAnchor();
return;
case '!':
// Is it a tag?
fetchTag();
return;
case '|':
// Is it a literal scalar?
if (this.flowLevel == 0) {
fetchLiteral();
return;
}
break;
case '>':
// Is it a folded scalar?
if (this.flowLevel == 0) {
fetchFolded();
return;
}
break;
case '\'':
// Is it a single quoted scalar?
fetchSingle();
return;
case '"':
// Is it a double quoted scalar?
fetchDouble();
return;
}
// It must be a plain scalar then.
if (checkPlain()) {
fetchPlain();
return;
}
// No? It's an error. Let's produce a nice error message.We do this by
// converting escaped characters into their escape sequences. This is a
// backwards use of the ESCAPE_REPLACEMENTS map.
String chRepresentation = String.valueOf(Character.toChars(c));
for (Character s : ESCAPE_REPLACEMENTS.keySet()) {
String v = ESCAPE_REPLACEMENTS.get(s);
if (v.equals(chRepresentation)) {
chRepresentation = "\\" + s;// ' ' -> '\t'
break;
}
}
if (c == '\t')
chRepresentation += "(TAB)";
String text = String
.format("found character '%s' that cannot start any token. (Do not use %s for indentation)",
chRepresentation, chRepresentation);
throw new ScannerException("while scanning for the next token", null, text,
reader.getMark());
}
// Simple keys treatment.
/**
* Return the number of the nearest possible simple key. Actually we don't
* need to loop through the whole dictionary.
*/
private int nextPossibleSimpleKey() {
/*
* the implementation is not as in PyYAML. Because
* this.possibleSimpleKeys is ordered we can simply take the first key
*/
if (!this.possibleSimpleKeys.isEmpty()) {
return this.possibleSimpleKeys.values().iterator().next().getTokenNumber();
}
return -1;
}
/**
*
* Remove entries that are no longer possible simple keys. According to
* the YAML specification, simple keys
* - should be limited to a single line,
* - should be no longer than 1024 characters.
* Disabling this procedure will allow simple keys of any length and
* height (may cause problems if indentation is broken though).
*
*/
private void stalePossibleSimpleKeys() {
if (!this.possibleSimpleKeys.isEmpty()) {
for (Iterator iterator = this.possibleSimpleKeys.values().iterator(); iterator
.hasNext();) {
SimpleKey key = iterator.next();
if ((key.getLine() != reader.getLine())
|| (reader.getIndex() - key.getIndex() > 1024)) {
// If the key is not on the same line as the current
// position OR the difference in column between the token
// start and the current position is more than the maximum
// simple key length, then this cannot be a simple key.
if (key.isRequired()) {
// If the key was required, this implies an error
// condition.
throw new ScannerException("while scanning a simple key", key.getMark(),
"could not find expected ':'", reader.getMark());
}
iterator.remove();
}
}
}
}
/**
* The next token may start a simple key. We check if it's possible and save
* its position. This function is called for ALIAS, ANCHOR, TAG,
* SCALAR(flow), '[', and '{'.
*/
private void savePossibleSimpleKey() {
// The next token may start a simple key. We check if it's possible
// and save its position. This function is called for
// ALIAS, ANCHOR, TAG, SCALAR(flow), '[', and '{'.
// Check if a simple key is required at the current position.
// A simple key is required if this position is the root flowLevel, AND
// the current indentation level is the same as the last indent-level.
boolean required = (this.flowLevel == 0) && (this.indent == this.reader.getColumn());
if (allowSimpleKey || !required) {
// A simple key is required only if it is the first token in the
// current line. Therefore it is always allowed.
} else {
throw new YAMLException(
"A simple key is required only if it is the first token in the current line");
}
// The next token might be a simple key. Let's save it's number and
// position.
if (this.allowSimpleKey) {
removePossibleSimpleKey();
int tokenNumber = this.tokensTaken + this.tokens.size();
SimpleKey key = new SimpleKey(tokenNumber, required, reader.getIndex(),
reader.getLine(), this.reader.getColumn(), this.reader.getMark());
this.possibleSimpleKeys.put(this.flowLevel, key);
}
}
/**
* Remove the saved possible key position at the current flow level.
*/
private void removePossibleSimpleKey() {
SimpleKey key = possibleSimpleKeys.remove(flowLevel);
if (key != null && key.isRequired()) {
throw new ScannerException("while scanning a simple key", key.getMark(),
"could not find expected ':'", reader.getMark());
}
}
// Indentation functions.
/**
* * Handle implicitly ending multiple levels of block nodes by decreased
* indentation. This function becomes important on lines 4 and 7 of this
* example:
*
*
* 1) book one:
* 2) part one:
* 3) chapter one
* 4) part two:
* 5) chapter one
* 6) chapter two
* 7) book two:
*
*
* In flow context, tokens should respect indentation. Actually the
* condition should be `self.indent >= column` according to the spec. But
* this condition will prohibit intuitively correct constructions such as
* key : { }
*/
private void unwindIndent(int col) {
// In the flow context, indentation is ignored. We make the scanner less
// restrictive than specification requires.
if (this.flowLevel != 0) {
return;
}
// In block context, we may need to issue the BLOCK-END tokens.
while (this.indent > col) {
Mark mark = reader.getMark();
this.indent = this.indents.pop();
addToken(new BlockEndToken(mark, mark));
}
}
/**
* Check if we need to increase indentation.
*/
private boolean addIndent(int column) {
if (this.indent < column) {
this.indents.push(this.indent);
this.indent = column;
return true;
}
return false;
}
// Fetchers.
/**
* We always add STREAM-START as the first token and STREAM-END as the last
* token.
*/
private void fetchStreamStart() {
// Read the token.
Mark mark = reader.getMark();
// Add STREAM-START.
Token token = new StreamStartToken(mark, mark);
addToken(token);
}
private void fetchStreamEnd() {
// Set the current indentation to -1.
unwindIndent(-1);
// Reset simple keys.
removePossibleSimpleKey();
this.allowSimpleKey = false;
this.possibleSimpleKeys.clear();
// Read the token.
Mark mark = reader.getMark();
// Add STREAM-END.
Token token = new StreamEndToken(mark, mark);
addToken(token);
// The stream is finished.
this.done = true;
}
/**
* Fetch a YAML directive. Directives are presentation details that are
* interpreted as instructions to the processor. YAML defines two kinds of
* directives, YAML and TAG; all other types are reserved for future use.
*
* @see 3.2.3.4. Directives
*/
private void fetchDirective() {
// Set the current indentation to -1.
unwindIndent(-1);
// Reset simple keys.
removePossibleSimpleKey();
this.allowSimpleKey = false;
// Scan and add DIRECTIVE.
List* *(anchor name) ** * @see 3.2.2.2. Anchors and Aliases */ private void fetchAlias() { // ALIAS could be a simple key. savePossibleSimpleKey(); // No simple keys after ALIAS. this.allowSimpleKey = false; // Scan and add ALIAS. Token tok = scanAnchor(false); addToken(tok); } /** * Fetch an anchor. Anchors take the form: * *
* &(anchor name) ** * @see 3.2.2.2. Anchors and Aliases */ private void fetchAnchor() { // ANCHOR could start a simple key. savePossibleSimpleKey(); // No simple keys after ANCHOR. this.allowSimpleKey = false; // Scan and add ANCHOR. Token tok = scanAnchor(true); addToken(tok); } /** * Fetch a tag. Tags take a complex form. * * @see 3.2.1.2. Tags */ private void fetchTag() { // TAG could start a simple key. savePossibleSimpleKey(); // No simple keys after TAG. this.allowSimpleKey = false; // Scan and add TAG. Token tok = scanTag(); addToken(tok); } /** * Fetch a literal scalar, denoted with a vertical-bar. This is the type * best used for source code and other content, such as binary data, which * must be included verbatim. * * @see 3.2.3.1. Node Styles */ private void fetchLiteral() { fetchBlockScalar('|'); } /** * Fetch a folded scalar, denoted with a greater-than sign. This is the type * best used for long content, such as the text of a chapter or description. * * @see 3.2.3.1. Node Styles */ private void fetchFolded() { fetchBlockScalar('>'); } /** * Fetch a block scalar (literal or folded). * * @see 3.2.3.1. Node Styles * * @param style */ private void fetchBlockScalar(char style) { // A simple key may follow a block scalar. this.allowSimpleKey = true; // Reset possible simple key on the current level. removePossibleSimpleKey(); // Scan and add SCALAR. List
* A plain scalar may start with any non-space character except: * '-', '?', ':', ',', '[', ']', '{', '}', * '#', '&', '*', '!', '|', '>', '\'', '\"', * '%', '@', '`'. * * It may also start with * '-', '?', ':' * if it is followed by a non-space character. * * Note that we limit the last rule to the block context (except the * '-' character) because we want the flow context to be space * independent. **/ int c = reader.peek(); // If the next char is NOT one of the forbidden chars above or // whitespace, then this is the start of a plain scalar. return Constant.NULL_BL_T_LINEBR.hasNo(c, "-?:,[]{}#&*!|>\'\"%@`") || (Constant.NULL_BL_T_LINEBR.hasNo(reader.peek(1)) && (c == '-' || (this.flowLevel == 0 && "?:" .indexOf(c) != -1))); } // Scanners. /** *
* We ignore spaces, line breaks and comments. * If we find a line break in the block context, we set the flag * `allow_simple_key` on. * The byte order mark is stripped if it's the first character in the * stream. We do not yet support BOM inside the stream as the * specification requires. Any such mark will be considered as a part * of the document. * TODO: We need to make tab handling rules more sane. A good rule is * Tabs cannot precede tokens * BLOCK-SEQUENCE-START, BLOCK-MAPPING-START, BLOCK-END, * KEY(block), VALUE(block), BLOCK-ENTRY * So the checking code is * if <TAB>: * self.allow_simple_keys = False * We also need to add the check for `allow_simple_keys == True` to * `unwind_indent` before issuing BLOCK-END. * Scanners for block, flow, and plain scalars need to be modified. **/ private void scanToNextToken() { // If there is a byte order mark (BOM) at the beginning of the stream, // forward past it. if (reader.getIndex() == 0 && reader.peek() == 0xFEFF) { reader.forward(); } boolean found = false; int inlineStartColumn = -1; while (!found) { Mark startMark = reader.getMark(); int columnBeforeComment = reader.getColumn(); boolean commentSeen = false; int ff = 0; // Peek ahead until we find the first non-space character, then // move forward directly to that character. while (reader.peek(ff) == ' ') { ff++; } if (ff > 0) { reader.forward(ff); } // If the character we have skipped forward to is a comment (#), // then peek ahead until we find the next end of line. YAML // comments are from a # to the next new-line. We then forward // past the comment. if (reader.peek() == '#') { commentSeen = true; CommentType type; if(columnBeforeComment != 0 && !(lastToken != null && lastToken.getTokenId() == Token.ID.BlockEntry)) { type = CommentType.IN_LINE; inlineStartColumn = reader.getColumn(); } else if(inlineStartColumn == reader.getColumn()) { type = CommentType.IN_LINE; } else { inlineStartColumn = -1; type = CommentType.BLOCK; } CommentToken token = scanComment(type); if (parseComments) { addToken(token); } } // If we scanned a line break, then (depending on flow level), // simple keys may be allowed. String breaks = scanLineBreak(); if (breaks.length() != 0) {// found a line-break if (parseComments && ! commentSeen) { if (columnBeforeComment == 0) { Mark endMark = reader.getMark(); addToken(new CommentToken(CommentType.BLANK_LINE, breaks, startMark, endMark)); } } if (this.flowLevel == 0) { // Simple keys are allowed at flow-level 0 after a line // break this.allowSimpleKey = true; } } else { found = true; } } } private CommentToken scanComment(CommentType type) { // See the specification for details. Mark startMark = reader.getMark(); reader.forward(); int length = 0; while (Constant.NULL_OR_LINEBR.hasNo(reader.peek(length))) { length++; } String value = reader.prefixForward(length); Mark endMark = reader.getMark(); return new CommentToken(type, value, startMark, endMark); } @SuppressWarnings({ "unchecked", "rawtypes" }) private List
* Read a %TAG directive value: * *
* s-ignored-space+ c-tag-handle s-ignored-space+ ns-tag-prefix s-l-comments ** * * * @see 7.1.2. “TAG” Directive */ private List
* The YAML 1.1 specification does not restrict characters for anchors and * aliases. This may lead to problems. * see https://bitbucket.org/snakeyaml/snakeyaml/issues/485/alias-names-are-too-permissive-compared-to * This implementation tries to follow https://github.com/yaml/yaml-spec/blob/master/rfc/RFC-0003.md **/ private Token scanAnchor(boolean isAnchor) { Mark startMark = reader.getMark(); int indicator = reader.peek(); String name = indicator == '*' ? "alias" : "anchor"; reader.forward(); int length = 0; int c = reader.peek(length); while (Constant.NULL_BL_T_LINEBR.hasNo(c, ":,[]{}/.*&")) { length++; c = reader.peek(length); } if (length == 0) { final String s = String.valueOf(Character.toChars(c)); throw new ScannerException("while scanning an " + name, startMark, "unexpected character found " + s + "(" + c + ")", reader.getMark()); } String value = reader.prefixForward(length); c = reader.peek(); if (Constant.NULL_BL_T_LINEBR.hasNo(c, "?:,]}%@`")) { final String s = String.valueOf(Character.toChars(c)); throw new ScannerException("while scanning an " + name, startMark, "unexpected character found " + s + "(" + c + ")", reader.getMark()); } Mark endMark = reader.getMark(); Token tok; if (isAnchor) { tok = new AnchorToken(value, startMark, endMark); } else { tok = new AliasToken(value, startMark, endMark); } return tok; } /** *
* Scan a Tag property. A Tag property may be specified in one of three * ways: c-verbatim-tag, c-ns-shorthand-tag, or c-ns-non-specific-tag *
* ** c-verbatim-tag takes the form !<ns-uri-char+> and must be delivered * verbatim (as-is) to the application. In particular, verbatim tags are not * subject to tag resolution. *
* ** c-ns-shorthand-tag is a valid tag handle followed by a non-empty suffix. * If the tag handle is a c-primary-tag-handle ('!') then the suffix must * have all exclamation marks properly URI-escaped (%21); otherwise, the * string will look like a named tag handle: !foo!bar would be interpreted * as (handle="!foo!", suffix="bar"). *
* ** c-ns-non-specific-tag is always a lone '!'; this is only useful for plain * scalars, where its specification means that the scalar MUST be resolved * to have type tag:yaml.org,2002:str. *
* * TODO SnakeYaml incorrectly ignores c-ns-non-specific-tag right now. * * @see 8.2. Node Tags * * TODO Note that this method does not enforce rules about local versus * global tags! */ private Token scanTag() { // See the specification for details. Mark startMark = reader.getMark(); // Determine the type of tag property based on the first character // encountered int c = reader.peek(1); String handle = null; String suffix = null; // Verbatim tag! (c-verbatim-tag) if (c == '<') { // Skip the exclamation mark and >, then read the tag suffix (as // a URI). reader.forward(2); suffix = scanTagUri("tag", startMark); c = reader.peek(); if (c != '>') { // If there are any characters between the end of the tag-suffix // URI and the closing >, then an error has occurred. final String s = String.valueOf(Character.toChars(c)); throw new ScannerException("while scanning a tag", startMark, "expected '>', but found '" + s + "' (" + c + ")", reader.getMark()); } reader.forward(); } else if (Constant.NULL_BL_T_LINEBR.has(c)) { // A NUL, blank, tab, or line-break means that this was a // c-ns-non-specific tag. suffix = "!"; reader.forward(); } else { // Any other character implies c-ns-shorthand-tag type. // Look ahead in the stream to determine whether this tag property // is of the form !foo or !foo!bar. int length = 1; boolean useHandle = false; while (Constant.NULL_BL_LINEBR.hasNo(c)) { if (c == '!') { useHandle = true; break; } length++; c = reader.peek(length); } // If we need to use a handle, scan it in; otherwise, the handle is // presumed to be '!'. if (useHandle) { handle = scanTagHandle("tag", startMark); } else { handle = "!"; reader.forward(); } suffix = scanTagUri("tag", startMark); } c = reader.peek(); // Check that the next character is allowed to follow a tag-property; // if it is not, raise the error. if (Constant.NULL_BL_LINEBR.hasNo(c)) { final String s = String.valueOf(Character.toChars(c)); throw new ScannerException("while scanning a tag", startMark, "expected ' ', but found '" + s + "' (" + (c) + ")", reader.getMark()); } TagTuple value = new TagTuple(handle, suffix); Mark endMark = reader.getMark(); return new TagToken(value, startMark, endMark); } private List* See the specification for details. * Note that we loose indentation rules for quoted scalars. Quoted * scalars don't need to adhere indentation because " and ' clearly * mark the beginning and the end of them. Therefore we are less * restrictive then the specification requires. We only need to check * that document separators are not included in scalars. **/ private Token scanFlowScalar(char style) { boolean _double; // The style will be either single- or double-quoted; we determine this // by the first character in the entry (supplied) if (style == '"') { _double = true; } else { _double = false; } StringBuilder chunks = new StringBuilder(); Mark startMark = reader.getMark(); int quote = reader.peek(); reader.forward(); chunks.append(scanFlowScalarNonSpaces(_double, startMark)); while (reader.peek() != quote) { chunks.append(scanFlowScalarSpaces(startMark)); chunks.append(scanFlowScalarNonSpaces(_double, startMark)); } reader.forward(); Mark endMark = reader.getMark(); return new ScalarToken(chunks.toString(), false, startMark, endMark, DumperOptions.ScalarStyle.createStyle(style)); } /** * Scan some number of flow-scalar non-space characters. */ private String scanFlowScalarNonSpaces(boolean doubleQuoted, Mark startMark) { // See the specification for details. StringBuilder chunks = new StringBuilder(); while (true) { // Scan through any number of characters which are not: NUL, blank, // tabs, line breaks, single-quotes, double-quotes, or backslashes. int length = 0; while (Constant.NULL_BL_T_LINEBR.hasNo(reader.peek(length), "\'\"\\")) { length++; } if (length != 0) { chunks.append(reader.prefixForward(length)); } // Depending on our quoting-type, the characters ', " and \ have // differing meanings. int c = reader.peek(); if (!doubleQuoted && c == '\'' && reader.peek(1) == '\'') { chunks.append("'"); reader.forward(2); } else if ((doubleQuoted && c == '\'') || (!doubleQuoted && "\"\\".indexOf(c) != -1)) { chunks.appendCodePoint(c); reader.forward(); } else if (doubleQuoted && c == '\\') { reader.forward(); c = reader.peek(); if (!Character.isSupplementaryCodePoint(c) && ESCAPE_REPLACEMENTS.containsKey(Character.valueOf((char)c))) { // The character is one of the single-replacement // types; these are replaced with a literal character // from the mapping. chunks.append(ESCAPE_REPLACEMENTS.get(Character.valueOf((char)c))); reader.forward(); } else if (!Character.isSupplementaryCodePoint(c) && ESCAPE_CODES.containsKey(Character.valueOf((char)c))) { // The character is a multi-digit escape sequence, with // length defined by the value in the ESCAPE_CODES map. length = ESCAPE_CODES.get(Character.valueOf((char)c)).intValue(); reader.forward(); String hex = reader.prefix(length); if (NOT_HEXA.matcher(hex).find()) { throw new ScannerException("while scanning a double-quoted scalar", startMark, "expected escape sequence of " + length + " hexadecimal numbers, but found: " + hex, reader.getMark()); } int decimal = Integer.parseInt(hex, 16); String unicode = new String(Character.toChars(decimal)); chunks.append(unicode); reader.forward(length); } else if (scanLineBreak().length() != 0) { chunks.append(scanFlowScalarBreaks(startMark)); } else { final String s = String.valueOf(Character.toChars(c)); throw new ScannerException("while scanning a double-quoted scalar", startMark, "found unknown escape character " + s + "(" + c + ")", reader.getMark()); } } else { return chunks.toString(); } } } private String scanFlowScalarSpaces(Mark startMark) { // See the specification for details. StringBuilder chunks = new StringBuilder(); int length = 0; // Scan through any number of whitespace (space, tab) characters, // consuming them. while (" \t".indexOf(reader.peek(length)) != -1) { length++; } String whitespaces = reader.prefixForward(length); int c = reader.peek(); if (c == '\0') { // A flow scalar cannot end with an end-of-stream throw new ScannerException("while scanning a quoted scalar", startMark, "found unexpected end of stream", reader.getMark()); } // If we encounter a line break, scan it into our assembled string... String lineBreak = scanLineBreak(); if (lineBreak.length() != 0) { String breaks = scanFlowScalarBreaks(startMark); if (!"\n".equals(lineBreak)) { chunks.append(lineBreak); } else if (breaks.length() == 0) { chunks.append(" "); } chunks.append(breaks); } else { chunks.append(whitespaces); } return chunks.toString(); } private String scanFlowScalarBreaks(Mark startMark) { // See the specification for details. StringBuilder chunks = new StringBuilder(); while (true) { // Instead of checking indentation, we check for document // separators. String prefix = reader.prefix(3); if (("---".equals(prefix) || "...".equals(prefix)) && Constant.NULL_BL_T_LINEBR.has(reader.peek(3))) { throw new ScannerException("while scanning a quoted scalar", startMark, "found unexpected document separator", reader.getMark()); } // Scan past any number of spaces and tabs, ignoring them while (" \t".indexOf(reader.peek()) != -1) { reader.forward(); } // If we stopped at a line break, add that; otherwise, return the // assembled set of scalar breaks. String lineBreak = scanLineBreak(); if (lineBreak.length() != 0) { chunks.append(lineBreak); } else { return chunks.toString(); } } } /** * Scan a plain scalar. * *
* See the specification for details. * We add an additional restriction for the flow context: * plain scalars in the flow context cannot contain ',', ':' and '?'. * We also keep track of the `allow_simple_key` flag here. * Indentation rules are loosed for the flow context. **/ private Token scanPlain() { StringBuilder chunks = new StringBuilder(); Mark startMark = reader.getMark(); Mark endMark = startMark; int indent = this.indent + 1; String spaces = ""; while (true) { int c; int length = 0; // A comment indicates the end of the scalar. if (reader.peek() == '#') { break; } while (true) { c = reader.peek(length); if (Constant.NULL_BL_T_LINEBR.has(c) || (c == ':' && Constant.NULL_BL_T_LINEBR.has(reader.peek(length + 1), flowLevel != 0 ? ",[]{}":"")) || (this.flowLevel != 0 && ",?[]{}".indexOf(c) != -1)) { break; } length++; } if (length == 0) { break; } this.allowSimpleKey = false; chunks.append(spaces); chunks.append(reader.prefixForward(length)); endMark = reader.getMark(); spaces = scanPlainSpaces(); // System.out.printf("spaces[%s]\n", spaces); if (spaces.length() == 0 || reader.peek() == '#' || (this.flowLevel == 0 && this.reader.getColumn() < indent)) { break; } } return new ScalarToken(chunks.toString(), startMark, endMark, true); } // Helper for scanPlainSpaces method when comments are enabled. // The ensures that blank lines and comments following a multi-line plain token are not swallowed up private boolean atEndOfPlain() { // peak ahead to find end of whitespaces and the column at which it occurs int wsLength = 0; int wsColumn = this.reader.getColumn(); { int c; while ((c = reader.peek(wsLength)) != '\0' && Constant.NULL_BL_T_LINEBR.has(c)) { wsLength++; if (!Constant.LINEBR.has(c) && (c != '\r' || reader.peek(wsLength + 1) != '\n') && c != 0xFEFF) { wsColumn++; } else { wsColumn = 0; } } } // if we see, a comment or end of string or change decrease in indent, we are done // Do not chomp end of lines and blanks, they will be handled by the main loop. if (reader.peek(wsLength) == '#' || reader.peek(wsLength + 1) == '\0' || this.flowLevel == 0 && wsColumn < this.indent) { return true; } // if we see, after the space, a key-value followed by a ':', we are done // Do not chomp end of lines and blanks, they will be handled by the main loop. if (this.flowLevel == 0) { int c; for(int extra = 1; (c = reader.peek(wsLength + extra)) != 0 && !Constant.NULL_BL_T_LINEBR.has(c); extra++) { if (c == ':' && Constant.NULL_BL_T_LINEBR.has(reader.peek(wsLength + extra + 1))) { return true; } } } // None of the above so safe to chomp the spaces. return false; } /** * See the specification for details. SnakeYAML and libyaml allow tabs * inside plain scalar */ private String scanPlainSpaces() { int length = 0; while (reader.peek(length) == ' ' || reader.peek(length) == '\t') { length++; } String whitespaces = reader.prefixForward(length); String lineBreak = scanLineBreak(); if (lineBreak.length() != 0) { this.allowSimpleKey = true; String prefix = reader.prefix(3); if ("---".equals(prefix) || "...".equals(prefix) && Constant.NULL_BL_T_LINEBR.has(reader.peek(3))) { return ""; } if(parseComments && atEndOfPlain()) { return ""; } StringBuilder breaks = new StringBuilder(); while (true) { if (reader.peek() == ' ') { reader.forward(); } else { String lb = scanLineBreak(); if (lb.length() != 0) { breaks.append(lb); prefix = reader.prefix(3); if ("---".equals(prefix) || "...".equals(prefix) && Constant.NULL_BL_T_LINEBR.has(reader.peek(3))) { return ""; } } else { break; } } } if (!"\n".equals(lineBreak)) { return lineBreak + breaks; } else if (breaks.length() == 0) { return " "; } return breaks.toString(); } return whitespaces; } /** *
* Scan a Tag handle. A Tag handle takes one of three forms: * *
* "!" (c-primary-tag-handle) * "!!" (ns-secondary-tag-handle) * "!(name)!" (c-named-tag-handle) ** * Where (name) must be formatted as an ns-word-char. * * * @see * @see * *
* See the specification for details. * For some strange reasons, the specification does not allow '_' in * tag handles. I have allowed it anyway. **/ private String scanTagHandle(String name, Mark startMark) { int c = reader.peek(); if (c != '!') { final String s = String.valueOf(Character.toChars(c)); throw new ScannerException("while scanning a " + name, startMark, "expected '!', but found " + s + "(" + (c) + ")", reader.getMark()); } // Look for the next '!' in the stream, stopping if we hit a // non-word-character. If the first character is a space, then the // tag-handle is a c-primary-tag-handle ('!'). int length = 1; c = reader.peek(length); if (c != ' ') { // Scan through 0+ alphabetic characters. // FIXME According to the specification, these should be // ns-word-char only, which prohibits '_'. This might be a // candidate for a configuration option. while (Constant.ALPHA.has(c)) { length++; c = reader.peek(length); } // Found the next non-word-char. If this is not a space and not an // '!', then this is an error, as the tag-handle was specified as: // !(name) or similar; the trailing '!' is missing. if (c != '!') { reader.forward(length); final String s = String.valueOf(Character.toChars(c)); throw new ScannerException("while scanning a " + name, startMark, "expected '!', but found " + s + "(" + (c) + ")", reader.getMark()); } length++; } String value = reader.prefixForward(length); return value; } /** *
* Scan a Tag URI. This scanning is valid for both local and global tag * directives, because both appear to be valid URIs as far as scanning is * concerned. The difference may be distinguished later, in parsing. This * method will scan for ns-uri-char*, which covers both cases. *
* ** This method performs no verification that the scanned URI conforms to any * particular kind of URI specification. *
* * @see */ private String scanTagUri(String name, Mark startMark) { // See the specification for details. // Note: we do not check if URI is well-formed. StringBuilder chunks = new StringBuilder(); // Scan through accepted URI characters, which includes the standard // URI characters, plus the start-escape character ('%'). When we get // to a start-escape, scan the escaped sequence, then return. int length = 0; int c = reader.peek(length); while (Constant.URI_CHARS.has(c)) { if (c == '%') { chunks.append(reader.prefixForward(length)); length = 0; chunks.append(scanUriEscapes(name, startMark)); } else { length++; } c = reader.peek(length); } // Consume the last "chunk", which would not otherwise be consumed by // the loop above. if (length != 0) { chunks.append(reader.prefixForward(length)); } if (chunks.length() == 0) { // If no URI was found, an error has occurred. final String s = String.valueOf(Character.toChars(c)); throw new ScannerException("while scanning a " + name, startMark, "expected URI, but found " + s + "(" + (c) + ")", reader.getMark()); } return chunks.toString(); } /** ** Scan a sequence of %-escaped URI escape codes and convert them into a * String representing the unescaped values. *
* * FIXME This method fails for more than 256 bytes' worth of URI-encoded * characters in a row. Is this possible? Is this a use-case? * * @see section 2.4, Escaped Encoding */ private String scanUriEscapes(String name, Mark startMark) { // First, look ahead to see how many URI-escaped characters we should // expect, so we can use the correct buffer size. int length = 1; while (reader.peek(length * 3) == '%') { length++; } // See the specification for details. // URIs containing 16 and 32 bit Unicode characters are // encoded in UTF-8, and then each octet is written as a // separate character. Mark beginningMark = reader.getMark(); ByteBuffer buff = ByteBuffer.allocate(length); while (reader.peek() == '%') { reader.forward(); try { byte code = (byte) Integer.parseInt(reader.prefix(2), 16); buff.put(code); } catch (NumberFormatException nfe) { int c1 = reader.peek(); final String s1 = String.valueOf(Character.toChars(c1)); int c2 = reader.peek(1); final String s2 = String.valueOf(Character.toChars(c2)); throw new ScannerException("while scanning a " + name, startMark, "expected URI escape sequence of 2 hexadecimal numbers, but found " + s1 + "(" + c1 + ") and " + s2 + "(" + c2 + ")", reader.getMark()); } reader.forward(2); } buff.flip(); try { return UriEncoder.decode(buff); } catch (CharacterCodingException e) { throw new ScannerException("while scanning a " + name, startMark, "expected URI in UTF-8: " + e.getMessage(), beginningMark); } } /** * Scan a line break, transforming: * ** '\r\n' : '\n' * '\r' : '\n' * '\n' : '\n' * '\x85' : '\n' * default : '' **/ private String scanLineBreak() { // Transforms: // '\r\n' : '\n' // '\r' : '\n' // '\n' : '\n' // '\x85' : '\n' // default : '' int c = reader.peek(); if (c == '\r' || c == '\n' || c == '\u0085') { if (c == '\r' && '\n' == reader.peek(1)) { reader.forward(2); } else { reader.forward(); } return "\n"; } else if (c == '\u2028' || c == '\u2029') { reader.forward(); return String.valueOf(Character.toChars(c)); } return ""; } private List