All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.gitlab.mvysny.uribuilder.util.Tokenizer Maven / Gradle / Ivy

The newest version!
/*
 * ====================================================================
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 * ====================================================================
 *
 * This software consists of voluntary contributions made by many
 * individuals on behalf of the Apache Software Foundation.  For more
 * information on the Apache Software Foundation, please see
 * <http://www.apache.org/>.
 *
 */

package com.gitlab.mvysny.uribuilder.util;

import java.util.BitSet;

import com.gitlab.mvysny.uribuilder.annotation.Contract;
import com.gitlab.mvysny.uribuilder.annotation.ThreadingBehavior;

/**
 * Tokenizer that can be used as a foundation for more complex parsing routines.
 * Methods of this class are designed to produce near zero intermediate garbage
 * and make no intermediate copies of input data.
 * 

* This class is immutable and thread safe. * * @since 5.1 */ @Contract(threading = ThreadingBehavior.IMMUTABLE) public final class Tokenizer { public static class Cursor { private final int lowerBound; private final int upperBound; private int pos; public Cursor(final int lowerBound, final int upperBound) { super(); Args.notNegative(lowerBound, "lowerBound"); Args.check(lowerBound <= upperBound, "lowerBound cannot be greater than upperBound"); this.lowerBound = lowerBound; this.upperBound = upperBound; this.pos = lowerBound; } public int getLowerBound() { return this.lowerBound; } public int getUpperBound() { return this.upperBound; } public int getPos() { return this.pos; } public void updatePos(final int pos) { Args.check(pos >= this.lowerBound, "pos: %s < lowerBound: %s", pos, this.lowerBound); Args.check(pos <= this.upperBound, "pos: %s > upperBound: %s", pos, this.upperBound); this.pos = pos; } public boolean atEnd() { return this.pos >= this.upperBound; } @Override public String toString() { final StringBuilder buffer = new StringBuilder(); buffer.append('['); buffer.append(this.lowerBound); buffer.append('>'); buffer.append(this.pos); buffer.append('>'); buffer.append(this.upperBound); buffer.append(']'); return buffer.toString(); } } public static BitSet INIT_BITSET(final int ... b) { final BitSet bitset = new BitSet(); for (final int aB : b) { bitset.set(aB); } return bitset; } /** Double quote */ public static final char DQUOTE = '\"'; /** Backward slash / escape character */ public static final char ESCAPE = '\\'; public static final int CR = 13; // public static final int LF = 10; // public static final int SP = 32; // public static final int HT = 9; // public static boolean isWhitespace(final char ch) { return ch == SP || ch == HT || ch == CR || ch == LF; } public static final Tokenizer INSTANCE = new Tokenizer(); /** * Extracts from the sequence of chars a token terminated with any of the given delimiters * or a whitespace characters. 
* * @param buf buffer with the sequence of chars to be parsed * @param cursor defines the bounds and current position of the buffer * @param delimiters set of delimiting characters. Can be {@code null} if the token * is not delimited by any character. */ public String parseContent(final CharSequence buf, final Cursor cursor, final BitSet delimiters) { Args.notNull(buf, "Char sequence"); Args.notNull(cursor, "Parser cursor"); final StringBuilder dst = new StringBuilder(); copyContent(buf, cursor, delimiters, dst); return dst.toString(); } /** * Extracts from the sequence of chars a token terminated with any of the given delimiters * discarding semantically insignificant whitespace characters. * * @param buf buffer with the sequence of chars to be parsed * @param cursor defines the bounds and current position of the buffer * @param delimiters set of delimiting characters. Can be {@code null} if the token * is not delimited by any character. */ public String parseToken(final CharSequence buf, final Cursor cursor, final BitSet delimiters) { Args.notNull(buf, "Char sequence"); Args.notNull(cursor, "Parser cursor"); final StringBuilder dst = new StringBuilder(); boolean whitespace = false; while (!cursor.atEnd()) { final char current = buf.charAt(cursor.getPos()); if (delimiters != null && delimiters.get(current)) { break; } else if (isWhitespace(current)) { skipWhiteSpace(buf, cursor); whitespace = true; } else { if (whitespace && dst.length() > 0) { dst.append(' '); } copyContent(buf, cursor, delimiters, dst); whitespace = false; } } return dst.toString(); } /** * Extracts from the sequence of chars a value which can be enclosed in quote marks and * terminated with any of the given delimiters discarding semantically insignificant * whitespace characters. * * @param buf buffer with the sequence of chars to be parsed * @param cursor defines the bounds and current position of the buffer * @param delimiters set of delimiting characters. 
Can be {@code null} if the value * is not delimited by any character. */ public String parseValue(final CharSequence buf, final Cursor cursor, final BitSet delimiters) { Args.notNull(buf, "Char sequence"); Args.notNull(cursor, "Parser cursor"); final StringBuilder dst = new StringBuilder(); boolean whitespace = false; while (!cursor.atEnd()) { final char current = buf.charAt(cursor.getPos()); if (delimiters != null && delimiters.get(current)) { break; } else if (isWhitespace(current)) { skipWhiteSpace(buf, cursor); whitespace = true; } else if (current == DQUOTE) { if (whitespace && dst.length() > 0) { dst.append(' '); } copyQuotedContent(buf, cursor, dst); whitespace = false; } else { if (whitespace && dst.length() > 0) { dst.append(' '); } copyUnquotedContent(buf, cursor, delimiters, dst); whitespace = false; } } return dst.toString(); } /** * Skips semantically insignificant whitespace characters and moves the cursor to the closest * non-whitespace character. * * @param buf buffer with the sequence of chars to be parsed * @param cursor defines the bounds and current position of the buffer */ public void skipWhiteSpace(final CharSequence buf, final Cursor cursor) { Args.notNull(buf, "Char sequence"); Args.notNull(cursor, "Parser cursor"); int pos = cursor.getPos(); final int indexFrom = cursor.getPos(); final int indexTo = cursor.getUpperBound(); for (int i = indexFrom; i < indexTo; i++) { final char current = buf.charAt(i); if (!isWhitespace(current)) { break; } pos++; } cursor.updatePos(pos); } /** * Transfers content into the destination buffer until a whitespace character or any of * the given delimiters is encountered. * * @param buf buffer with the sequence of chars to be parsed * @param cursor defines the bounds and current position of the buffer * @param delimiters set of delimiting characters. Can be {@code null} if the value * is delimited by a whitespace only. 
* @param dst destination buffer */ public void copyContent(final CharSequence buf, final Cursor cursor, final BitSet delimiters, final StringBuilder dst) { Args.notNull(buf, "Char sequence"); Args.notNull(cursor, "Parser cursor"); Args.notNull(dst, "String builder"); int pos = cursor.getPos(); final int indexFrom = cursor.getPos(); final int indexTo = cursor.getUpperBound(); for (int i = indexFrom; i < indexTo; i++) { final char current = buf.charAt(i); if ((delimiters != null && delimiters.get(current)) || isWhitespace(current)) { break; } pos++; dst.append(current); } cursor.updatePos(pos); } /** * Transfers content into the destination buffer until a whitespace character, a quote, * or any of the given delimiters is encountered. * * @param buf buffer with the sequence of chars to be parsed * @param cursor defines the bounds and current position of the buffer * @param delimiters set of delimiting characters. Can be {@code null} if the value * is delimited by a whitespace or a quote only. * @param dst destination buffer */ public void copyUnquotedContent(final CharSequence buf, final Cursor cursor, final BitSet delimiters, final StringBuilder dst) { Args.notNull(buf, "Char sequence"); Args.notNull(cursor, "Parser cursor"); Args.notNull(dst, "String builder"); int pos = cursor.getPos(); final int indexFrom = cursor.getPos(); final int indexTo = cursor.getUpperBound(); for (int i = indexFrom; i < indexTo; i++) { final char current = buf.charAt(i); if ((delimiters != null && delimiters.get(current)) || isWhitespace(current) || current == DQUOTE) { break; } pos++; dst.append(current); } cursor.updatePos(pos); } /** * Transfers content enclosed with quote marks into the destination buffer. 
* * @param buf buffer with the sequence of chars to be parsed * @param cursor defines the bounds and current position of the buffer * @param dst destination buffer */ public void copyQuotedContent(final CharSequence buf, final Cursor cursor, final StringBuilder dst) { Args.notNull(buf, "Char sequence"); Args.notNull(cursor, "Parser cursor"); Args.notNull(dst, "String builder"); if (cursor.atEnd()) { return; } int pos = cursor.getPos(); int indexFrom = cursor.getPos(); final int indexTo = cursor.getUpperBound(); char current = buf.charAt(pos); if (current != DQUOTE) { return; } pos++; indexFrom++; boolean escaped = false; for (int i = indexFrom; i < indexTo; i++, pos++) { current = buf.charAt(i); if (escaped) { if (current != DQUOTE && current != ESCAPE) { dst.append(ESCAPE); } dst.append(current); escaped = false; } else { if (current == DQUOTE) { pos++; break; } if (current == ESCAPE) { escaped = true; } else if (current != CR && current != LF) { dst.append(current); } } } cursor.updatePos(pos); } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy