// com.gitlab.mvysny.uribuilder.util.Tokenizer Maven / Gradle / Ivy
// Show all versions of uribuilder Show documentation
/*
* ====================================================================
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*
*/
package com.gitlab.mvysny.uribuilder.util;
import java.util.BitSet;
import com.gitlab.mvysny.uribuilder.annotation.Contract;
import com.gitlab.mvysny.uribuilder.annotation.ThreadingBehavior;
/**
 * Low-level tokenizer meant to serve as a building block for more elaborate
 * parsing routines.
 * <p>
 * All parsing methods read directly from the supplied {@link CharSequence} and
 * append straight into a {@link StringBuilder}, so the input is never copied
 * into an intermediate buffer.
 * </p>
 * <p>
 * The tokenizer itself has no instance fields; parsing progress is tracked
 * exclusively by the caller-supplied {@link Cursor}, which is mutable.
 * </p>
 *
 * @since 5.1
 */
@Contract(threading = ThreadingBehavior.IMMUTABLE)
public final class Tokenizer {

    /**
     * Mutable parsing position confined to a fixed index range
     * {@code [lowerBound, upperBound]} that is set at construction time.
     */
    public static class Cursor {

        private final int lowerBound;
        private final int upperBound;
        private int pos;

        /**
         * Creates a cursor positioned at {@code lowerBound}.
         *
         * @param lowerBound first index of the range; validated with
         *                   {@code Args.notNegative}
         * @param upperBound end index of the range; {@code Args.check} is used
         *                   to verify that it is not smaller than
         *                   {@code lowerBound}
         */
        public Cursor(final int lowerBound, final int upperBound) {
            Args.notNegative(lowerBound, "lowerBound");
            Args.check(lowerBound <= upperBound, "lowerBound cannot be greater than upperBound");
            this.lowerBound = lowerBound;
            this.upperBound = upperBound;
            this.pos = lowerBound;
        }

        /**
         * @return the lower bound given at construction time
         */
        public int getLowerBound() {
            return lowerBound;
        }

        /**
         * @return the upper bound given at construction time
         */
        public int getUpperBound() {
            return upperBound;
        }

        /**
         * @return the current position
         */
        public int getPos() {
            return pos;
        }

        /**
         * Moves the cursor to a new position.
         *
         * @param pos the new position; {@code Args.check} is used to verify
         *            that it lies between the lower and the upper bound
         *            (both inclusive)
         */
        public void updatePos(final int pos) {
            Args.check(pos >= lowerBound, "pos: %s < lowerBound: %s", pos, lowerBound);
            Args.check(pos <= upperBound, "pos: %s > upperBound: %s", pos, upperBound);
            this.pos = pos;
        }

        /**
         * @return {@code true} once the position has reached the upper bound
         */
        public boolean atEnd() {
            return pos >= upperBound;
        }

        /**
         * Renders the cursor as {@code [lowerBound>pos>upperBound]}.
         */
        @Override
        public String toString() {
            return "[" + lowerBound + ">" + pos + ">" + upperBound + "]";
        }

    }

    /**
     * Builds a {@link BitSet} in which exactly the given bit indexes are set.
     *
     * @param b bit indexes (typically character codes) to set
     * @return a new bit set containing the given indexes
     */
    public static BitSet INIT_BITSET(final int ... b) {
        final BitSet result = new BitSet();
        for (int i = 0; i < b.length; i++) {
            result.set(b[i]);
        }
        return result;
    }

    /** Double quote */
    public static final char DQUOTE = '"';

    /** Backward slash / escape character */
    public static final char ESCAPE = '\\';

    /** US-ASCII carriage return (13) */
    public static final int CR = 13;

    /** US-ASCII line feed (10) */
    public static final int LF = 10;

    /** US-ASCII space (32) */
    public static final int SP = 32;

    /** US-ASCII horizontal tab (9) */
    public static final int HT = 9;

    /**
     * Tells whether the given character counts as whitespace for this
     * tokenizer, that is whether it is one of {@link #SP}, {@link #HT},
     * {@link #CR} or {@link #LF}.
     *
     * @param ch character to test
     * @return {@code true} for space, horizontal tab, carriage return and
     *         line feed; {@code false} for anything else
     */
    public static boolean isWhitespace(final char ch) {
        switch (ch) {
            case SP:
            case HT:
            case CR:
            case LF:
                return true;
            default:
                return false;
        }
    }

    /** Shared tokenizer instance. */
    public static final Tokenizer INSTANCE = new Tokenizer();

    /**
     * Reads a single run of characters starting at the cursor and ending at
     * the first delimiter, the first whitespace character, or the upper bound
     * of the cursor, whichever comes first. The cursor is left on the
     * character that ended the run.
     *
     * @param buf buffer with the sequence of chars to be parsed
     * @param cursor defines the bounds and current position of the buffer
     * @param delimiters set of delimiting characters. Can be {@code null} if the token
     *  is not delimited by any character.
     * @return the characters that were read; possibly an empty string
     */
    public String parseContent(final CharSequence buf, final Cursor cursor, final BitSet delimiters) {
        Args.notNull(buf, "Char sequence");
        Args.notNull(cursor, "Parser cursor");
        final StringBuilder content = new StringBuilder();
        copyContent(buf, cursor, delimiters, content);
        return content.toString();
    }

    /**
     * Reads a token that ends at the first delimiter or at the upper bound of
     * the cursor. Leading and trailing whitespace is discarded and every run
     * of whitespace between two pieces of content is replaced with a single
     * space. When a delimiter ends the token the cursor is left on that
     * delimiter.
     *
     * @param buf buffer with the sequence of chars to be parsed
     * @param cursor defines the bounds and current position of the buffer
     * @param delimiters set of delimiting characters. Can be {@code null} if the token
     *  is not delimited by any character.
     * @return the token with insignificant whitespace removed; possibly an
     *         empty string
     */
    public String parseToken(final CharSequence buf, final Cursor cursor, final BitSet delimiters) {
        Args.notNull(buf, "Char sequence");
        Args.notNull(cursor, "Parser cursor");
        final StringBuilder token = new StringBuilder();
        // Set after a whitespace run; turned into one space only if more content follows.
        boolean pendingSpace = false;
        while (!cursor.atEnd()) {
            final char ch = buf.charAt(cursor.getPos());
            if (isDelimiter(delimiters, ch)) {
                break;
            }
            if (isWhitespace(ch)) {
                skipWhiteSpace(buf, cursor);
                pendingSpace = true;
                continue;
            }
            if (pendingSpace && token.length() > 0) {
                token.append(' ');
            }
            copyContent(buf, cursor, delimiters, token);
            pendingSpace = false;
        }
        return token.toString();
    }

    /**
     * Reads a value that ends at the first delimiter found outside of quote
     * marks or at the upper bound of the cursor. Unquoted parts are treated
     * as in {@link #parseToken(CharSequence, Cursor, BitSet)}; parts enclosed
     * in double quotes are unwrapped by
     * {@link #copyQuotedContent(CharSequence, Cursor, StringBuilder)}, which
     * does not consult the delimiters. Whitespace outside of quote marks is
     * dropped at both ends and collapsed to a single space in between.
     *
     * @param buf buffer with the sequence of chars to be parsed
     * @param cursor defines the bounds and current position of the buffer
     * @param delimiters set of delimiting characters. Can be {@code null} if the value
     *  is not delimited by any character.
     * @return the parsed value; possibly an empty string
     */
    public String parseValue(final CharSequence buf, final Cursor cursor, final BitSet delimiters) {
        Args.notNull(buf, "Char sequence");
        Args.notNull(cursor, "Parser cursor");
        final StringBuilder value = new StringBuilder();
        // Set after a whitespace run; turned into one space only if more content follows.
        boolean pendingSpace = false;
        while (!cursor.atEnd()) {
            final char ch = buf.charAt(cursor.getPos());
            if (isDelimiter(delimiters, ch)) {
                break;
            }
            if (isWhitespace(ch)) {
                skipWhiteSpace(buf, cursor);
                pendingSpace = true;
                continue;
            }
            if (pendingSpace && value.length() > 0) {
                value.append(' ');
            }
            if (ch == DQUOTE) {
                copyQuotedContent(buf, cursor, value);
            } else {
                copyUnquotedContent(buf, cursor, delimiters, value);
            }
            pendingSpace = false;
        }
        return value.toString();
    }

    /**
     * Advances the cursor past any whitespace characters found at its current
     * position, stopping at the first non-whitespace character or at the
     * upper bound of the cursor.
     *
     * @param buf buffer with the sequence of chars to be parsed
     * @param cursor defines the bounds and current position of the buffer
     */
    public void skipWhiteSpace(final CharSequence buf, final Cursor cursor) {
        Args.notNull(buf, "Char sequence");
        Args.notNull(cursor, "Parser cursor");
        int idx = cursor.getPos();
        final int limit = cursor.getUpperBound();
        while (idx < limit && isWhitespace(buf.charAt(idx))) {
            idx++;
        }
        cursor.updatePos(idx);
    }

    /**
     * Appends characters to the destination buffer until a whitespace
     * character, one of the given delimiters, or the upper bound of the
     * cursor is reached. The cursor is left on the character that stopped
     * the transfer.
     *
     * @param buf buffer with the sequence of chars to be parsed
     * @param cursor defines the bounds and current position of the buffer
     * @param delimiters set of delimiting characters. Can be {@code null} if the value
     *  is delimited by a whitespace only.
     * @param dst destination buffer
     */
    public void copyContent(final CharSequence buf, final Cursor cursor, final BitSet delimiters,
            final StringBuilder dst) {
        Args.notNull(buf, "Char sequence");
        Args.notNull(cursor, "Parser cursor");
        Args.notNull(dst, "String builder");
        copyUntilBoundary(buf, cursor, delimiters, dst, false);
    }

    /**
     * Appends characters to the destination buffer until a whitespace
     * character, a double quote, one of the given delimiters, or the upper
     * bound of the cursor is reached. The cursor is left on the character
     * that stopped the transfer.
     *
     * @param buf buffer with the sequence of chars to be parsed
     * @param cursor defines the bounds and current position of the buffer
     * @param delimiters set of delimiting characters. Can be {@code null} if the value
     *  is delimited by a whitespace or a quote only.
     * @param dst destination buffer
     */
    public void copyUnquotedContent(final CharSequence buf, final Cursor cursor,
            final BitSet delimiters, final StringBuilder dst) {
        Args.notNull(buf, "Char sequence");
        Args.notNull(cursor, "Parser cursor");
        Args.notNull(dst, "String builder");
        copyUntilBoundary(buf, cursor, delimiters, dst, true);
    }

    /**
     * Appends the content of a quoted string to the destination buffer
     * without the enclosing quote marks.
     * <p>
     * Nothing happens if the cursor is at its end or the character at the
     * cursor is not a double quote. Otherwise the opening quote is consumed
     * and characters are transferred up to the closing quote, which is
     * consumed as well, or up to the upper bound of the cursor if the closing
     * quote is missing.
     * </p>
     * <p>
     * A backslash followed by a double quote or by another backslash yields
     * just the second character; a backslash followed by any other character
     * is kept together with that character. Carriage return and line feed
     * characters that are not preceded by a backslash are dropped. A
     * backslash that is the last character before the upper bound is dropped
     * too.
     * </p>
     *
     * @param buf buffer with the sequence of chars to be parsed
     * @param cursor defines the bounds and current position of the buffer
     * @param dst destination buffer
     */
    public void copyQuotedContent(final CharSequence buf, final Cursor cursor,
            final StringBuilder dst) {
        Args.notNull(buf, "Char sequence");
        Args.notNull(cursor, "Parser cursor");
        Args.notNull(dst, "String builder");
        if (cursor.atEnd()) {
            return;
        }
        int idx = cursor.getPos();
        final int limit = cursor.getUpperBound();
        if (buf.charAt(idx) != DQUOTE) {
            return;
        }
        // Step over the opening quote.
        idx++;
        boolean escaped = false;
        while (idx < limit) {
            final char ch = buf.charAt(idx);
            // Every examined character is consumed, including the closing quote.
            idx++;
            if (escaped) {
                // Only \" and \\ lose their backslash; other pairs are kept verbatim.
                if (ch != DQUOTE && ch != ESCAPE) {
                    dst.append(ESCAPE);
                }
                dst.append(ch);
                escaped = false;
            } else if (ch == DQUOTE) {
                break;
            } else if (ch == ESCAPE) {
                escaped = true;
            } else if (ch != CR && ch != LF) {
                dst.append(ch);
            }
        }
        cursor.updatePos(idx);
    }

    /**
     * Tells whether the character is a member of the optional delimiter set.
     */
    private static boolean isDelimiter(final BitSet delimiters, final char ch) {
        return delimiters != null && delimiters.get(ch);
    }

    /**
     * Shared scanner behind {@code copyContent} and {@code copyUnquotedContent}:
     * transfers characters until a delimiter, a whitespace character or,
     * when {@code stopAtQuote} is set, a double quote is found, then stores
     * the index of that character (or the upper bound) in the cursor.
     */
    private static void copyUntilBoundary(final CharSequence buf, final Cursor cursor,
            final BitSet delimiters, final StringBuilder dst, final boolean stopAtQuote) {
        int idx = cursor.getPos();
        final int limit = cursor.getUpperBound();
        while (idx < limit) {
            final char ch = buf.charAt(idx);
            if (isDelimiter(delimiters, ch) || isWhitespace(ch) || (stopAtQuote && ch == DQUOTE)) {
                break;
            }
            dst.append(ch);
            idx++;
        }
        cursor.updatePos(idx);
    }

}