/*
 * Copyright (c) 2008 Nathan Sweet, Copyright (c) 2006 Ola Bini
 * 
 * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
 * files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy,
 * modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software
 * is furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
 * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
 * IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

package com.esotericsoftware.yamlbeans.tokenizer;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/** Interprets a YAML document as a stream of tokens.
 * @author Nathan Sweet
 * @author Ola Bini */
public class Tokenizer {
	private final static String LINEBR = "\n\u0085\u2028\u2029";
	private final static String NULL_BL_LINEBR = "\0 \r\n\u0085";
	private final static String NULL_BL_T_LINEBR = "\0 \t\r\n\u0085";
	private final static String NULL_OR_OTHER = NULL_BL_T_LINEBR;
	private final static String NULL_OR_LINEBR = "\0\r\n\u0085";
	private final static String FULL_LINEBR = "\r\n\u0085";
	private final static String BLANK_OR_LINEBR = " \r\n\u0085";
	private final static String S4 = "\0 \t\r\n\u0085[]{}";
	private final static String ALPHA = "abcdefghijklmnopqrstuvwxyz0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ-_";
	private final static String STRANGE_CHAR = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-#;/?:@&=+$,_.!~*'()[]";
	private final static String RN = "\r\n";
	private final static String BLANK_T = " \t";
	private final static String SPACES_AND_STUFF = "'\"\\\0 \t\r\n\u0085";
	private final static String DOUBLE_ESC = "\"\\";
	private final static String NON_ALPHA_OR_NUM = "\0 \t\r\n\u0085?:,]}%@`";
	private final static Pattern NON_PRINTABLE = Pattern.compile("[^\u0009\n\r\u0020-\u007E\u0085\u00A0-\u00FF]");
	private final static Pattern NOT_HEXA = Pattern.compile("[^0-9A-Fa-f]");
	private final static Pattern NON_ALPHA = Pattern.compile("[^-0-9A-Za-z_]");
	private final static Pattern R_FLOWZERO = Pattern.compile("[\0 \t\r\n\u0085]|(:[\0 \t\r\n\u0085])");
	private final static Pattern R_FLOWNONZERO = Pattern.compile("[\0 \t\r\n\u0085\\[\\]{},:?]");
	private final static Pattern END_OR_START = Pattern.compile("^(---|\\.\\.\\.)[\0 \t\r\n\u0085]$");
	private final static Pattern ENDING = Pattern.compile("^---[\0 \t\r\n\u0085]$");
	private final static Pattern START = Pattern.compile("^\\.\\.\\.[\0 \t\r\n\u0085]$");
	private final static Pattern BEG = Pattern
		.compile("^([^\0 \t\r\n\u0085\\-?:,\\[\\]{}#&*!|>'\"%@]|([\\-?:][^\0 \t\r\n\u0085]))");

	private final static Map<Character, String> ESCAPE_REPLACEMENTS = new HashMap();
	private final static Map<Character, Integer> ESCAPE_CODES = new HashMap();

	static {
		ESCAPE_REPLACEMENTS.put('0', "\0");
		ESCAPE_REPLACEMENTS.put('a', "\u0007");
		ESCAPE_REPLACEMENTS.put('b', "\u0008");
		ESCAPE_REPLACEMENTS.put('t', "\u0009");
		ESCAPE_REPLACEMENTS.put('\t', "\u0009");
		ESCAPE_REPLACEMENTS.put('n', "\n");
		ESCAPE_REPLACEMENTS.put('v', "\u000B");
		ESCAPE_REPLACEMENTS.put('f', "\u000C");
		ESCAPE_REPLACEMENTS.put('r', "\r");
		ESCAPE_REPLACEMENTS.put('e', "\u001B");
		ESCAPE_REPLACEMENTS.put(' ', "\u0020");
		ESCAPE_REPLACEMENTS.put('"', "\"");
		ESCAPE_REPLACEMENTS.put('\\', "\\");
		ESCAPE_REPLACEMENTS.put('N', "\u0085");
		ESCAPE_REPLACEMENTS.put('_', "\u00A0");
		ESCAPE_REPLACEMENTS.put('L', "\u2028");
		ESCAPE_REPLACEMENTS.put('P', "\u2029");

		ESCAPE_CODES.put('x', 2);
		ESCAPE_CODES.put('u', 4);
		ESCAPE_CODES.put('U', 8);
	}

	private boolean done = false;
	private int flowLevel = 0;
	private int tokensTaken = 0;
	private int indent = -1;
	private boolean allowSimpleKey = true;
	private boolean eof;
	private int lineNumber = 0;
	private int column = 0;
	private int pointer = 0;
	private final StringBuilder buffer;
	private final Reader reader;
	private final List<Token> tokens = new LinkedList();
	private final List<Integer> indents = new LinkedList();
	private final Map<Integer, SimpleKey> possibleSimpleKeys = new HashMap();
	private boolean docStart = false;

	public Tokenizer (Reader reader) {
		if (reader == null) throw new IllegalArgumentException("reader cannot be null.");
		if (!(reader instanceof BufferedReader)) reader = new BufferedReader(reader);
		this.reader = reader;
		buffer = new StringBuilder();
		eof = false;
		fetchStreamStart();
	}

	public Tokenizer (String yaml) {
		this(new StringReader(yaml));
	}
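
	// Typical usage, a minimal sketch ("key: value" is just an example document; the loop mirrors the main() method at the
	// bottom of this file):
	//
	//   Tokenizer tokenizer = new Tokenizer("key: value");
	//   for (Iterator iter = tokenizer.iterator(); iter.hasNext();)
	//       System.out.println(iter.next());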

	public Token peekNextToken () throws TokenizerException {
		while (needMoreTokens())
			fetchMoreTokens();
		return tokens.isEmpty() ? null : tokens.get(0);
	}

	public TokenType peekNextTokenType () throws TokenizerException {
		Token token = peekNextToken();
		if (token == null) return null;
		return token.type;
	}

	public Token getNextToken () throws TokenizerException {
		while (needMoreTokens())
			fetchMoreTokens();
		if (!tokens.isEmpty()) {
			tokensTaken++;
			Token token = tokens.remove(0);
			return token;
		}
		return null;
	}

	public Iterator<Token> iterator () {
		return new Iterator<Token>() {
			public boolean hasNext () {
				return null != peekNextToken();
			}

			public Token next () {
				return getNextToken();
			}

			public void remove () {
				throw new UnsupportedOperationException();
			}
		};
	}

	public int getLineNumber () {
		return lineNumber;
	}

	public int getColumn () {
		return column;
	}

	public void close () throws IOException {
		reader.close();
	}

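	/** Returns the character at the current read position without consuming it, refilling the buffer from the reader first if
	 * necessary. */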
	private char peek () {
		if (pointer + 1 > buffer.length()) update(1);
		return buffer.charAt(pointer);
	}

	private char peek (int index) {
		if (pointer + index + 1 > buffer.length()) update(index + 1);
		return buffer.charAt(pointer + index);
	}

	private String prefix (int length) {
		if (pointer + length >= buffer.length()) update(length);
		if (pointer + length > buffer.length()) return buffer.substring(pointer, buffer.length());
		return buffer.substring(pointer, pointer + length);
	}

	private String prefixForward (int length) {
		if (pointer + length + 1 >= buffer.length()) update(length + 1);
		String buff = null;
		if (pointer + length > buffer.length())
			buff = buffer.substring(pointer, buffer.length());
		else
			buff = buffer.substring(pointer, pointer + length);
		char ch = 0;
		for (int i = 0, j = buff.length(); i < j; i++) {
			ch = buff.charAt(i);
			pointer++;
			if (LINEBR.indexOf(ch) != -1 || ch == '\r' && buff.charAt(i + 1) != '\n') {
				column = 0;
				lineNumber++;
			} else if (ch != '\uFEFF') column++;
		}
		return buff;
	}

	private void forward () {
		if (pointer + 2 >= buffer.length()) update(2);
		char ch1 = buffer.charAt(pointer);
		pointer++;
		if (ch1 == '\n' || ch1 == '\u0085' || ch1 == '\r' && buffer.charAt(pointer) != '\n') {
			column = 0;
			lineNumber++;
		} else
			column++;
	}

	private void forward (int length) {
		if (pointer + length + 1 >= buffer.length()) update(length + 1);
		char ch = 0;
		for (int i = 0; i < length; i++) {
			ch = buffer.charAt(pointer);
			pointer++;
			if (LINEBR.indexOf(ch) != -1 || ch == '\r' && buffer.charAt(pointer) != '\n') {
				column = 0;
				lineNumber++;
			} else if (ch != '\uFEFF') column++;
		}
	}

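	/** Discards already consumed characters, then reads from the underlying reader until the buffer holds at least the requested
	 * number of characters. A '\0' is appended as a sentinel once the end of the stream is reached. */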
	private void update (int length) {
		buffer.delete(0, pointer);
		pointer = 0;
		while (buffer.length() < length) {
			String rawData = "";
			if (!eof) {
				char[] data = new char[1024];
				int converted = -2;
				try {
					converted = reader.read(data);
				} catch (IOException ioe) {
					throw new TokenizerException("Error reading from stream.", ioe);
				}
				if (converted == -1)
					eof = true;
				else
					rawData = String.valueOf(data, 0, converted);
			}
			buffer.append(rawData);
			if (eof) {
				buffer.append('\0');
				break;
			}
		}
	}

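	/** True when no tokens are queued, or when the next token to be handed out could still turn out to be a simple key, in which
	 * case more of the stream must be scanned before it can be returned. */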
	private boolean needMoreTokens () {
		if (done) return false;
		return tokens.isEmpty() || nextPossibleSimpleKey() == tokensTaken;
	}

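	/** Skips whitespace and comments, unwinds the indentation stack to the current column, then dispatches on the next character
	 * to the fetch method for the token it begins. */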
	private Token fetchMoreTokens () {
		scanToNextToken();
		unwindIndent(column);
		char ch = peek();
		boolean colz = column == 0;
		switch (ch) {
		case '\0':
			return fetchStreamEnd();
		case '\'':
			return fetchSingle();
		case '"':
			return fetchDouble();
		case '?':
			if (flowLevel != 0 || NULL_OR_OTHER.indexOf(peek(1)) != -1) return fetchKey();
			break;
		case ':':
			if (flowLevel != 0 || NULL_OR_OTHER.indexOf(peek(1)) != -1) return fetchValue();
			break;
		case '%':
			if (colz) return fetchDirective();
			break;
		case '-':
			if ((colz || docStart) && ENDING.matcher(prefix(4)).matches())
				return fetchDocumentStart();
			else if (NULL_OR_OTHER.indexOf(peek(1)) != -1) return fetchBlockEntry();
			break;
		case '.':
			if (colz && START.matcher(prefix(4)).matches()) return fetchDocumentEnd();
			break;
		case '[':
			return fetchFlowSequenceStart();
		case '{':
			return fetchFlowMappingStart();
		case ']':
			return fetchFlowSequenceEnd();
		case '}':
			return fetchFlowMappingEnd();
		case ',':
			return fetchFlowEntry();
		case '*':
			return fetchAlias();
		case '&':
			return fetchAnchor();
		case '!':
			return fetchTag();
		case '|':
			if (flowLevel == 0) return fetchLiteral();
			break;
		case '>':
			if (flowLevel == 0) return fetchFolded();
			break;
		}
		if (BEG.matcher(prefix(2)).find()) return fetchPlain();
		if (ch == '\t') throw new TokenizerException("Tabs cannot be used for indentation.");
		throw new TokenizerException(
			"While scanning for the next token, a character that cannot begin a token was found: " + ch(ch));
	}

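	/** Returns the token number of a pending possible simple key, or -1 if there is none. */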
	private int nextPossibleSimpleKey () {
		for (Iterator iter = possibleSimpleKeys.values().iterator(); iter.hasNext();) {
			SimpleKey key = (SimpleKey)iter.next();
			if (key.tokenNumber > 0) return key.tokenNumber;
		}
		return -1;
	}

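	/** Records the current position as a possible simple key for the current flow level, if a simple key is allowed here. */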
	private void savePossibleSimpleKey () {
		if (allowSimpleKey) possibleSimpleKeys.put(flowLevel, new SimpleKey(tokensTaken + tokens.size(), column));
	}

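	/** In block context, pops indentation levels deeper than the given column and emits a BLOCK_END token for each. Flow
	 * collections ignore indentation, so nothing is done while inside one. */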
	private void unwindIndent (int col) {
		if (flowLevel != 0) return;

		while (indent > col) {
			indent = indents.remove(0);
			tokens.add(Token.BLOCK_END);
		}
	}

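	/** Pushes the current indentation level and adopts the given column if it is deeper, returning true if a new level was
	 * opened. */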
	private boolean addIndent (int col) {
		if (indent < col) {
			indents.add(0, indent);
			indent = col;
			return true;
		}
		return false;
	}

	private Token fetchStreamStart () {
		docStart = true;
		tokens.add(Token.STREAM_START);
		return Token.STREAM_START;
	}

	private Token fetchStreamEnd () {
		unwindIndent(-1);
		allowSimpleKey = false;
		possibleSimpleKeys.clear();
		tokens.add(Token.STREAM_END);
		done = true;
		return Token.STREAM_END;
	}

	private Token fetchDirective () {
		unwindIndent(-1);
		allowSimpleKey = false;
		Token tok = scanDirective();
		tokens.add(tok);
		return tok;
	}

	private Token fetchDocumentStart () {
		docStart = false;
		return fetchDocumentIndicator(Token.DOCUMENT_START);
	}

	private Token fetchDocumentEnd () {
		return fetchDocumentIndicator(Token.DOCUMENT_END);
	}

	private Token fetchDocumentIndicator (Token tok) {
		unwindIndent(-1);
		allowSimpleKey = false;
		forward(3);
		tokens.add(tok);
		return tok;
	}

	private Token fetchFlowSequenceStart () {
		return fetchFlowCollectionStart(Token.FLOW_SEQUENCE_START);
	}

	private Token fetchFlowMappingStart () {
		return fetchFlowCollectionStart(Token.FLOW_MAPPING_START);
	}

	private Token fetchFlowCollectionStart (Token tok) {
		savePossibleSimpleKey();
		flowLevel++;
		allowSimpleKey = true;
		forward(1);
		tokens.add(tok);
		return tok;
	}

	private Token fetchFlowSequenceEnd () {
		return fetchFlowCollectionEnd(Token.FLOW_SEQUENCE_END);
	}

	private Token fetchFlowMappingEnd () {
		return fetchFlowCollectionEnd(Token.FLOW_MAPPING_END);
	}

	private Token fetchFlowCollectionEnd (Token tok) {
		flowLevel--;
		allowSimpleKey = false;
		forward(1);
		tokens.add(tok);
		return tok;
	}

	private Token fetchFlowEntry () {
		allowSimpleKey = true;
		forward(1);
		tokens.add(Token.FLOW_ENTRY);
		return Token.FLOW_ENTRY;
	}

	private Token fetchBlockEntry () {
		if (flowLevel == 0) {
			if (!allowSimpleKey) throw new TokenizerException("Found a sequence entry where it is not allowed.");
			if (addIndent(column)) tokens.add(Token.BLOCK_SEQUENCE_START);
		}
		allowSimpleKey = true;
		forward();
		tokens.add(Token.BLOCK_ENTRY);
		return Token.BLOCK_ENTRY;
	}

	private Token fetchKey () {
		if (flowLevel == 0) {
			if (!allowSimpleKey) throw new TokenizerException("Found a mapping key where it is not allowed.");
			if (addIndent(column)) tokens.add(Token.BLOCK_MAPPING_START);
		}
		allowSimpleKey = flowLevel == 0;
		forward();
		tokens.add(Token.KEY);
		return Token.KEY;
	}

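	/** Handles ':'. If a possible simple key was recorded, a KEY token (and, in block context, a BLOCK_MAPPING_START) is inserted
	 * into the queue just before the key's tokens; otherwise the ':' introduces a value directly. */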
	private Token fetchValue () {
		SimpleKey key = possibleSimpleKeys.get(flowLevel);
		if (null == key) {
			if (flowLevel == 0 && !allowSimpleKey) throw new TokenizerException("Found a mapping value where it is not allowed.");
		} else {
			possibleSimpleKeys.remove(flowLevel);
			tokens.add(key.tokenNumber - tokensTaken, Token.KEY);
			if (flowLevel == 0 && addIndent(key.column)) tokens.add(key.tokenNumber - tokensTaken, Token.BLOCK_MAPPING_START);
			allowSimpleKey = false;
		}
		forward();
		tokens.add(Token.VALUE);
		return Token.VALUE;
	}

	private Token fetchAlias () {
		savePossibleSimpleKey();
		allowSimpleKey = false;
		Token tok = scanAnchor(new AliasToken());
		tokens.add(tok);
		return tok;
	}

	private Token fetchAnchor () {
		savePossibleSimpleKey();
		allowSimpleKey = false;
		Token tok = scanAnchor(new AnchorToken());
		tokens.add(tok);
		return tok;
	}

	private Token fetchTag () {
		savePossibleSimpleKey();
		allowSimpleKey = false;
		Token tok = scanTag();
		tokens.add(tok);
		return tok;
	}

	private Token fetchLiteral () {
		return fetchBlockScalar('|');
	}

	private Token fetchFolded () {
		return fetchBlockScalar('>');
	}

	private Token fetchBlockScalar (char style) {
		allowSimpleKey = true;
		Token tok = scanBlockScalar(style);
		tokens.add(tok);
		return tok;
	}

	private Token fetchSingle () {
		return fetchFlowScalar('\'');
	}

	private Token fetchDouble () {
		return fetchFlowScalar('"');
	}

	private Token fetchFlowScalar (char style) {
		savePossibleSimpleKey();
		allowSimpleKey = false;
		Token tok = scanFlowScalar(style);
		tokens.add(tok);
		return tok;
	}

	private Token fetchPlain () {
		savePossibleSimpleKey();
		allowSimpleKey = false;
		Token tok = scanPlain();
		tokens.add(tok);
		return tok;
	}

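	/** Skips spaces, comments and line breaks until the start of the next token. Crossing a line break in block context makes a
	 * simple key legal again. */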
	private void scanToNextToken () {
		for (;;) {
			while (peek() == ' ')
				forward();
			if (peek() == '#') while (NULL_OR_LINEBR.indexOf(peek()) == -1)
				forward();
			if (scanLineBreak().length() != 0) {
				if (flowLevel == 0) allowSimpleKey = true;
			} else
				break;
		}
	}

	private Token scanDirective () {
		forward();
		String name = scanDirectiveName();
		String value = null;
		if (name.equals("YAML"))
			value = scanYamlDirectiveValue();
		else if (name.equals("TAG"))
			value = scanTagDirectiveValue();
		else {
			StringBuilder buffer = new StringBuilder();
			while (true) {
				char ch = peek();
				if (NULL_OR_LINEBR.indexOf(ch) != -1) break;
				buffer.append(ch);
				forward();
			}
			value = buffer.toString().trim();
		}
		scanDirectiveIgnoredLine();
		return new DirectiveToken(name, value);
	}

	private String scanDirectiveName () {
		int length = 0;
		char ch = peek(length);
		boolean zlen = true;
		while (ALPHA.indexOf(ch) != -1) {
			zlen = false;
			length++;
			ch = peek(length);
		}
		if (zlen) throw new TokenizerException(
			"While scanning for a directive name, expected an alpha or numeric character but found: " + ch(ch));
		String value = prefixForward(length);
		// forward(length);
		if (NULL_BL_LINEBR.indexOf(peek()) == -1) throw new TokenizerException(
			"While scanning for a directive name, expected a blank or line break after the name but found: " + ch(peek()));
		return value;
	}

	private String scanYamlDirectiveValue () {
		while (peek() == ' ')
			forward();
		String major = scanYamlDirectiveNumber();
		if (peek() != '.')
			throw new TokenizerException("While scanning for a directive value, expected a digit or '.' but found: " + ch(peek()));
		forward();
		String minor = scanYamlDirectiveNumber();
		if (NULL_BL_LINEBR.indexOf(peek()) == -1)
			throw new TokenizerException("While scanning for a directive value, expected a digit or '.' but found: " + ch(peek()));
		return major + "." + minor;
	}

	private String scanYamlDirectiveNumber () {
		char ch = peek();
		if (!Character.isDigit(ch))
			throw new TokenizerException("While scanning for a directive number, expected a digit but found: " + ch(ch));
		int length = 0;
		while (Character.isDigit(peek(length)))
			length++;
		String value = prefixForward(length);
		// forward(length);
		return value;
	}

	private String scanTagDirectiveValue () {
		while (peek() == ' ')
			forward();
		String handle = scanTagDirectiveHandle();
		while (peek() == ' ')
			forward();
		String prefix = scanTagDirectivePrefix();
		return handle + " " + prefix;
	}

	private String scanTagDirectiveHandle () {
		String value = scanTagHandle("directive");
		if (peek() != ' ')
			throw new TokenizerException("While scanning for a directive tag handle, expected ' ' but found: " + ch(peek()));
		return value;
	}

	private String scanTagDirectivePrefix () {
		String value = scanTagUri("directive");
		if (NULL_BL_LINEBR.indexOf(peek()) == -1)
			throw new TokenizerException("While scanning for a directive tag prefix, expected ' ' but found: " + ch(peek()));
		return value;
	}

	private String scanDirectiveIgnoredLine () {
		while (peek() == ' ')
			forward();
		if (peek() == '"') while (NULL_OR_LINEBR.indexOf(peek()) == -1)
			forward();
		char ch = peek();
		if (NULL_OR_LINEBR.indexOf(ch) == -1)
			throw new TokenizerException("While scanning a directive, expected a comment or line break but found: " + ch(peek()));
		return scanLineBreak();
	}

	private Token scanAnchor (Token tok) {
		char indicator = peek();
		String name = indicator == '*' ? "alias" : "anchor";
		forward();
		int length = 0;
		int chunkSize = 16;
		Matcher m = null;
		for (;;) {
			String chunk = prefix(chunkSize);
			if ((m = NON_ALPHA.matcher(chunk)).find()) break;
			chunkSize += 16;
		}
		length = m.start();
		if (length == 0)
			throw new TokenizerException("While scanning an " + name + ", a non-alpha, non-numeric character was found.");
		String value = prefixForward(length);
		// forward(length);
		if (NON_ALPHA_OR_NUM.indexOf(peek()) == -1) throw new TokenizerException(
			"While scanning an " + name + ", expected a character that terminates the name but found: " + ch(peek()));
		if (tok instanceof AnchorToken)
			((AnchorToken)tok).setInstanceName(value);
		else
			((AliasToken)tok).setInstanceName(value);
		return tok;
	}

	private Token scanTag () {
		char ch = peek(1);
		String handle = null;
		String suffix = null;
		if (ch == '<') {
			forward(2);
			suffix = scanTagUri("tag");
			if (peek() != '>') throw new TokenizerException("While scanning a tag, expected '>' but found: " + ch(peek()));
			forward();
		} else if (NULL_BL_T_LINEBR.indexOf(ch) != -1) {
			suffix = "!";
			forward();
		} else {
			int length = 1;
			boolean useHandle = false;
			while (NULL_BL_T_LINEBR.indexOf(ch) == -1) {
				if (ch == '!') {
					useHandle = true;
					break;
				}
				length++;
				ch = peek(length);
			}
			handle = "!";
			if (useHandle)
				handle = scanTagHandle("tag");
			else {
				handle = "!";
				forward();
			}
			suffix = scanTagUri("tag");
		}
		if (NULL_BL_LINEBR.indexOf(peek()) == -1)
			throw new TokenizerException("While scanning a tag, expected ' ' but found: " + ch(peek()));
		return new TagToken(handle, suffix);
	}

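	/** Scans a literal ('|') or folded ('>') block scalar: reads the optional chomping and indentation indicators, then collects
	 * the following lines at the detected indentation, joining them with spaces where the folded style allows it. */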
	private Token scanBlockScalar (char style) {
		boolean folded = style == '>';
		StringBuilder chunks = new StringBuilder();
		forward();
		Object[] chompi = scanBlockScalarIndicators();
		int chomping = ((Integer)chompi[0]).intValue();
		int increment = ((Integer)chompi[1]).intValue();
		scanBlockScalarIgnoredLine();
		int minIndent = indent + 1;
		if (minIndent < 1) minIndent = 1;
		String breaks = null;
		int maxIndent = 0;
		int ind = 0;
		if (increment == -1) {
			Object[] brme = scanBlockScalarIndentation();
			breaks = (String)brme[0];
			maxIndent = ((Integer)brme[1]).intValue();
			if (minIndent > maxIndent)
				ind = minIndent;
			else
				ind = maxIndent;
		} else {
			ind = minIndent + increment - 1;
			breaks = scanBlockScalarBreaks(ind);
		}

		String lineBreak = "";
		while (column == ind && peek() != '\0') {
			chunks.append(breaks);
			boolean leadingNonSpace = BLANK_T.indexOf(peek()) == -1;
			int length = 0;
			while (NULL_OR_LINEBR.indexOf(peek(length)) == -1)
				length++;
			chunks.append(prefixForward(length));
			// forward(length);
			lineBreak = scanLineBreak();
			breaks = scanBlockScalarBreaks(ind);
			if (column == ind && peek() != '\0') {
				if (folded && lineBreak.equals("\n") && leadingNonSpace && BLANK_T.indexOf(peek()) == -1) {
					if (breaks.length() == 0) chunks.append(" ");
				} else
					chunks.append(lineBreak);
			} else
				break;
		}

		if (chomping == 0) {
			chunks.append(lineBreak);
		} else if (chomping == 2) {
			chunks.append(lineBreak);
			chunks.append(breaks);
		}

		return new ScalarToken(chunks.toString(), false, style);
	}

	private Object[] scanBlockScalarIndicators () {
		int chomping = 0;  // 0 = clip, 1 = strip, 2 = keep
		int increment = -1;
		char ch = peek();
		if (ch == '-' || ch == '+') {
			chomping = ch == '-' ? 1 : 2;
			forward();
			ch = peek();
			if (Character.isDigit(ch)) {
				increment = Integer.parseInt(("" + ch));
				if (increment == 0) throw new TokenizerException(
					"While scanning a black scaler, expected indentation indicator between 1 and 9 but found: 0");
				forward();
			}
		} else if (Character.isDigit(ch)) {
			increment = Integer.parseInt(("" + ch));
			if (increment == 0) throw new TokenizerException(
				"While scanning a black scaler, expected indentation indicator between 1 and 9 but found: 0");
			forward();
			ch = peek();
			if (ch == '-' || ch == '+') {
				chomping = ch == '-' ? 1 : 2;
				forward();
			}
		}
		if (NULL_BL_LINEBR.indexOf(peek()) == -1) throw new TokenizerException(
			"While scanning a block scalar, expected chomping or indentation indicators but found: " + ch(peek()));
		return new Object[] {Integer.valueOf(chomping), increment};
	}

	private String scanBlockScalarIgnoredLine () {
		while (peek() == ' ')
			forward();
		if (peek() == '#') while (NULL_OR_LINEBR.indexOf(peek()) == -1)
			forward();
		if (NULL_OR_LINEBR.indexOf(peek()) == -1)
			throw new TokenizerException("While scanning a block scalar, expected a comment or line break but found: " + ch(peek()));
		return scanLineBreak();
	}

	private Object[] scanBlockScalarIndentation () {
		StringBuilder chunks = new StringBuilder();
		int maxIndent = 0;
		while (BLANK_OR_LINEBR.indexOf(peek()) != -1)
			if (peek() != ' ')
				chunks.append(scanLineBreak());
			else {
				forward();
				if (column > maxIndent) maxIndent = column;
			}
		return new Object[] {chunks.toString(), maxIndent};
	}

	private String scanBlockScalarBreaks (int indent) {
		StringBuilder chunks = new StringBuilder();
		while (column < indent && peek() == ' ')
			forward();
		while (FULL_LINEBR.indexOf(peek()) != -1) {
			chunks.append(scanLineBreak());
			while (column < indent && peek() == ' ')
				forward();
		}
		return chunks.toString();
	}

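	/** Scans a single or double quoted flow scalar, resolving escape sequences in the double quoted style and folding line
	 * breaks inside the scalar. */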
	private Token scanFlowScalar (char style) {
		boolean dbl = style == '"';
		StringBuilder chunks = new StringBuilder();
		char quote = peek();
		forward();
		chunks.append(scanFlowScalarNonSpaces(dbl));
		while (peek() != quote) {
			chunks.append(scanFlowScalarSpaces());
			chunks.append(scanFlowScalarNonSpaces(dbl));
		}
		forward();
		return new ScalarToken(chunks.toString(), false, style);
	}

	private String scanFlowScalarNonSpaces (boolean dbl) {
		StringBuilder chunks = new StringBuilder();
		for (;;) {
			int length = 0;
			while (SPACES_AND_STUFF.indexOf(peek(length)) == -1)
				length++;
			if (length != 0) chunks.append(prefixForward(length));
			// forward(length);
			char ch = peek();
			if (!dbl && ch == '\'' && peek(1) == '\'') {
				chunks.append("'");
				forward(2);
			} else if (dbl && ch == '\'' || !dbl && DOUBLE_ESC.indexOf(ch) != -1) {
				chunks.append(ch);
				forward();
			} else if (dbl && ch == '\\') {
				forward();
				ch = peek();
				if (ESCAPE_REPLACEMENTS.containsKey(ch)) {
					chunks.append(ESCAPE_REPLACEMENTS.get(ch));
					forward();
				} else if (ESCAPE_CODES.containsKey(ch)) {
					length = ESCAPE_CODES.get(ch);
					forward();
					String val = prefix(length);
					if (NOT_HEXA.matcher(val).find())
						throw new TokenizerException("While scanning a double quoted scalar, expected an escape sequence of " + length
							+ " hexadecimal numbers but found: " + ch(peek()));
					chunks.append(Character.toChars(Integer.parseInt(val, 16)));
					forward(length);
				} else if (FULL_LINEBR.indexOf(ch) != -1) {
					scanLineBreak();
					chunks.append(scanFlowScalarBreaks());
				} else
					throw new TokenizerException("While scanning a double quoted scalar, found unknown escape character: " + ch(ch));
			} else
				return chunks.toString();
		}
	}

	private String scanFlowScalarSpaces () {
		StringBuilder chunks = new StringBuilder();
		int length = 0;
		while (BLANK_T.indexOf(peek(length)) != -1)
			length++;
		String whitespaces = prefixForward(length);
		// forward(length);
		char ch = peek();
		if (ch == '\0')
			throw new TokenizerException("While scanning a quoted scalar, found unexpected end of stream.");
		else if (FULL_LINEBR.indexOf(ch) != -1) {
			String lineBreak = scanLineBreak();
			String breaks = scanFlowScalarBreaks();
			if (!lineBreak.equals("\n"))
				chunks.append(lineBreak);
			else if (breaks.length() == 0) chunks.append(" ");
			chunks.append(breaks);
		} else
			chunks.append(whitespaces);
		return chunks.toString();
	}

	private String scanFlowScalarBreaks () {
		StringBuilder chunks = new StringBuilder();
		String pre = null;
		for (;;) {
			pre = prefix(3);
			if ((pre.equals("---") || pre.equals("...")) && NULL_BL_T_LINEBR.indexOf(peek(3)) != -1)
				throw new TokenizerException("While scanning a quoted scalar, found unexpected document separator.");
			while (BLANK_T.indexOf(peek()) != -1)
				forward();
			if (FULL_LINEBR.indexOf(peek()) != -1)
				chunks.append(scanLineBreak());
			else
				return chunks.toString();
		}
	}

	private Token scanPlain () {
		/*
		 * See the specification for details. We add an additional restriction for the flow context: plain scalars in the flow
		 * context cannot contain ',', ':' or '?'. We also keep track of the `allow_simple_key` flag here. Indentation rules are
		 * relaxed in the flow context.
		 */
		StringBuilder chunks = new StringBuilder();
		int ind = indent + 1;
		String spaces = "";
		boolean f_nzero = true;
		Pattern r_check = R_FLOWNONZERO;
		if (flowLevel == 0) {
			f_nzero = false;
			r_check = R_FLOWZERO;
		}
		while (peek() != '#') {
			int length = 0;
			int chunkSize = 32;
			Matcher m = null;
			while (!(m = r_check.matcher(prefix(chunkSize))).find())
				chunkSize += 32;
			length = m.start();
			char ch = peek(length);
			if (f_nzero && ch == ':' && S4.indexOf(peek(length + 1)) == -1) {
				forward(length);
				throw new TokenizerException(
					"While scanning a plain scalar, found unexpected ':'. See: http://pyyaml.org/wiki/YAMLColonInFlowContext");
			}
			if (length == 0) break;
			allowSimpleKey = false;
			chunks.append(spaces);
			chunks.append(prefixForward(length));
			// forward(length);
			spaces = scanPlainSpaces();
			if (spaces.length() == 0 || flowLevel == 0 && column < ind) break;
		}
		return new ScalarToken(chunks.toString(), true);
	}

	private String scanPlainSpaces () {
		StringBuilder chunks = new StringBuilder();
		int length = 0;
		// YAML recognizes two whitespace characters: space and tab.
		while (peek(length) == ' ' || peek(length) == '\t')
			length++;
		String whitespaces = prefixForward(length);
		// forward(length);
		char ch = peek();
		if (FULL_LINEBR.indexOf(ch) != -1) {
			String lineBreak = scanLineBreak();
			allowSimpleKey = true;
			if (END_OR_START.matcher(prefix(4)).matches()) return "";
			StringBuilder breaks = new StringBuilder();
			while (BLANK_OR_LINEBR.indexOf(peek()) != -1)
				if (' ' == peek())
					forward();
				else {
					breaks.append(scanLineBreak());
					if (END_OR_START.matcher(prefix(4)).matches()) return "";
				}
			if (!lineBreak.equals("\n"))
				chunks.append(lineBreak);
			else if (breaks.length() == 0) chunks.append(" ");
			chunks.append(breaks);
		} else
			chunks.append(whitespaces);
		return chunks.toString();
	}

	private String scanTagHandle (String name) {
		char ch = peek();
		if (ch != '!') throw new TokenizerException("While scanning a " + name + ", expected '!' but found: " + ch(ch));
		int length = 1;
		ch = peek(length);
		if (ch != ' ') {
			while (ALPHA.indexOf(ch) != -1) {
				length++;
				ch = peek(length);
			}
			if ('!' != ch) {
				forward(length);
				throw new TokenizerException("While scanning a " + name + ", expected '!' but found: " + ch(ch));
			}
			length++;
		}
		String value = prefixForward(length);
		// forward(length);
		return value;
	}

	private String scanTagUri (String name) {
		StringBuilder chunks = new StringBuilder();
		int length = 0;
		char ch = peek(length);
		while (STRANGE_CHAR.indexOf(ch) != -1) {
			if ('%' == ch) {
				chunks.append(prefixForward(length));
				// forward(length);
				length = 0;
				chunks.append(scanUriEscapes(name));
			} else
				length++;
			ch = peek(length);
		}
		if (length != 0) chunks.append(prefixForward(length));
		// forward(length);

		if (chunks.length() == 0)
			throw new TokenizerException("While scanning a " + name + ", expected a URI but found: " + ch(ch));
		return chunks.toString();
	}

	private String scanUriEscapes (String name) {
		StringBuilder bytes = new StringBuilder();
		while (peek() == '%') {
			forward();
			try {
				bytes.append(Character.toChars(Integer.parseInt(prefix(2), 16)));
			} catch (NumberFormatException nfe) {
				throw new TokenizerException("While scanning a " + name
					+ ", expected a URI escape sequence of 2 hexadecimal numbers but found: " + ch(peek(1)) + " and " + ch(peek(2)));
			}
			forward(2);
		}
		return bytes.toString();
	}

	private String scanLineBreak () {
		// Transforms:
		// '\r\n' : '\n'
		// '\r' : '\n'
		// '\n' : '\n'
		// '\x85' : '\n'
		// default : ''
		char val = peek();
		if (FULL_LINEBR.indexOf(val) != -1) {
			if (RN.equals(prefix(2)))
				forward(2);
			else
				forward();
			return "\n";
		}
		return "";
	}

	private String ch (char ch) {
		return "'" + ch + "' (" + (int)ch + ")";
	}

	public class TokenizerException extends RuntimeException {
		public TokenizerException (String message, Throwable cause) {
			super("Line " + getLineNumber() + ", column " + getColumn() + ": " + message, cause);
		}

		public TokenizerException (String message) {
			this(message, null);
		}
	}

	static class SimpleKey {
		public final int tokenNumber;
		public final int column;

		public SimpleKey (int tokenNumber, int column) {
			this.tokenNumber = tokenNumber;
			this.column = column;
		}
	}

	public static void main (String[] args) throws Exception {
		for (Iterator iter = new Tokenizer(new FileReader("test/test.yml")).iterator(); iter.hasNext();)
			System.out.println(iter.next());
	}
}