All downloads are free. The search and download functionality uses the official Maven repository.

org.jsoup.parser.CharacterReader Maven / Gradle / Ivy

There is a newer version: 4.0.119
Show newest version
package org.jsoup.parser;

import org.jsoup.helper.Validate;

import java.util.Arrays;
import java.util.Locale;

/**
 CharacterReader consumes tokens off a string. Used internally by jsoup. API subject to changes.
 */
public final class CharacterReader {
    static final char EOF = (char) -1;
    private static final int maxCacheLen = 12;

    private final char[] input;
    private final int length;
    private int pos = 0;
    private int mark = 0;
    private final String[] stringCache = new String[512]; // holds reused strings in this doc, to lessen garbage

    public CharacterReader(String input) {
        Validate.notNull(input);
        this.input = input.toCharArray();
        this.length = this.input.length;
    }

    /**
     * Gets the current cursor position in the content.
     * @return current position
     */
    public int pos() {
        return pos;
    }

    /**
     * Tests if all the content has been read.
     * @return true if nothing left to read.
     */
    public boolean isEmpty() {
        return pos >= length;
    }

    /**
     * Get the char at the current position.
     * @return char
     */
    public char current() {
        return pos >= length ? EOF : input[pos];
    }

    char consume() {
        char val = pos >= length ? EOF : input[pos];
        pos++;
        return val;
    }

    void unconsume() {
        pos--;
    }

    /**
     * Moves the current position by one.
     */
    public void advance() {
        pos++;
    }

    void mark() {
        mark = pos;
    }

    void rewindToMark() {
        pos = mark;
    }

    String consumeAsString() {
        return new String(input, pos++, 1);
    }

    /**
     * Returns the number of characters between the current position and the next instance of the input char
     * @param c scan target
     * @return offset between current position and next instance of target. -1 if not found.
     */
    int nextIndexOf(char c) {
        // doesn't handle scanning for surrogates
        for (int i = pos; i < length; i++) {
            if (c == input[i])
                return i - pos;
        }
        return -1;
    }

    /**
     * Returns the number of characters between the current position and the next instance of the input sequence
     *
     * @param seq scan target
     * @return offset between current position and next instance of target. -1 if not found.
     */
    int nextIndexOf(CharSequence seq) {
        // doesn't handle scanning for surrogates
        char startChar = seq.charAt(0);
        for (int offset = pos; offset < length; offset++) {
            // scan to first instance of startchar:
            if (startChar != input[offset])
                while(++offset < length && startChar != input[offset]) { /* empty */ }
            int i = offset + 1;
            int last = i + seq.length()-1;
            if (offset < length && last <= length) {
                for (int j = 1; i < last && seq.charAt(j) == input[i]; i++, j++) { /* empty */ }
                if (i == last) // found full sequence
                    return offset - pos;
            }
        }
        return -1;
    }

    /**
     * Reads characters up to the specific char.
     * @param c the delimiter
     * @return the chars read
     */
    public String consumeTo(char c) {
        int offset = nextIndexOf(c);
        if (offset != -1) {
            String consumed = cacheString(pos, offset);
            pos += offset;
            return consumed;
        } else {
            return consumeToEnd();
        }
    }

    String consumeTo(String seq) {
        int offset = nextIndexOf(seq);
        if (offset != -1) {
            String consumed = cacheString(pos, offset);
            pos += offset;
            return consumed;
        } else {
            return consumeToEnd();
        }
    }

    /**
     * Read characters until the first of any delimiters is found.
     * @param chars delimiters to scan for
     * @return characters read up to the matched delimiter.
     */
    public String consumeToAny(final char... chars) {
        final int start = pos;
        final int remaining = length;
        final char[] val = input;

        OUTER: while (pos < remaining) {
            for (char c : chars) {
                if (val[pos] == c)
                    break OUTER;
            }
            pos++;
        }

        return pos > start ? cacheString(start, pos-start) : "";
    }

    String consumeToAnySorted(final char... chars) {
        final int start = pos;
        final int remaining = length;
        final char[] val = input;

        while (pos < remaining) {
            if (Arrays.binarySearch(chars, val[pos]) >= 0)
                break;
            pos++;
        }

        return pos > start ? cacheString(start, pos-start) : "";
    }

    String consumeData() {
        // &, <, null
        final int start = pos;
        final int remaining = length;
        final char[] val = input;

        while (pos < remaining) {
            final char c = val[pos];
            if (c == '&'|| c ==  '<' || c ==  TokeniserState.nullChar)
                break;
            pos++;
        }

        return pos > start ? cacheString(start, pos-start) : "";
    }

    String consumeTagName() {
        // '\t', '\n', '\r', '\f', ' ', '/', '>', nullChar
        final int start = pos;
        final int remaining = length;
        final char[] val = input;

        while (pos < remaining) {
            final char c = val[pos];
            if (c == '\t'|| c ==  '\n'|| c ==  '\r'|| c ==  '\f'|| c ==  ' '|| c ==  '/'|| c ==  '>'|| c ==  TokeniserState.nullChar)
                break;
            pos++;
        }

        return pos > start ? cacheString(start, pos-start) : "";
    }

    String consumeToEnd() {
        String data = cacheString(pos, length-pos);
        pos = length;
        return data;
    }

    String consumeLetterSequence() {
        int start = pos;
        while (pos < length) {
            char c = input[pos];
            if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || Character.isLetter(c))
                pos++;
            else
                break;
        }

        return cacheString(start, pos - start);
    }

    String consumeLetterThenDigitSequence() {
        int start = pos;
        while (pos < length) {
            char c = input[pos];
            if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || Character.isLetter(c))
                pos++;
            else
                break;
        }
        while (!isEmpty()) {
            char c = input[pos];
            if (c >= '0' && c <= '9')
                pos++;
            else
                break;
        }

        return cacheString(start, pos - start);
    }

    String consumeHexSequence() {
        int start = pos;
        while (pos < length) {
            char c = input[pos];
            if ((c >= '0' && c <= '9') || (c >= 'A' && c <= 'F') || (c >= 'a' && c <= 'f'))
                pos++;
            else
                break;
        }
        return cacheString(start, pos - start);
    }

    String consumeDigitSequence() {
        int start = pos;
        while (pos < length) {
            char c = input[pos];
            if (c >= '0' && c <= '9')
                pos++;
            else
                break;
        }
        return cacheString(start, pos - start);
    }

    boolean matches(char c) {
        return !isEmpty() && input[pos] == c;

    }

    boolean matches(String seq) {
        int scanLength = seq.length();
        if (scanLength > length - pos)
            return false;

        for (int offset = 0; offset < scanLength; offset++)
            if (seq.charAt(offset) != input[pos+offset])
                return false;
        return true;
    }

    boolean matchesIgnoreCase(String seq) {
        int scanLength = seq.length();
        if (scanLength > length - pos)
            return false;

        for (int offset = 0; offset < scanLength; offset++) {
            char upScan = Character.toUpperCase(seq.charAt(offset));
            char upTarget = Character.toUpperCase(input[pos + offset]);
            if (upScan != upTarget)
                return false;
        }
        return true;
    }

    boolean matchesAny(char... seq) {
        if (isEmpty())
            return false;

        char c = input[pos];
        for (char seek : seq) {
            if (seek == c)
                return true;
        }
        return false;
    }

    boolean matchesAnySorted(char[] seq) {
        return !isEmpty() && Arrays.binarySearch(seq, input[pos]) >= 0;
    }

    boolean matchesLetter() {
        if (isEmpty())
            return false;
        char c = input[pos];
        return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || Character.isLetter(c);
    }

    boolean matchesDigit() {
        if (isEmpty())
            return false;
        char c = input[pos];
        return (c >= '0' && c <= '9');
    }

    boolean matchConsume(String seq) {
        if (matches(seq)) {
            pos += seq.length();
            return true;
        } else {
            return false;
        }
    }

    boolean matchConsumeIgnoreCase(String seq) {
        if (matchesIgnoreCase(seq)) {
            pos += seq.length();
            return true;
        } else {
            return false;
        }
    }

    boolean containsIgnoreCase(String seq) {
        // used to check presence of , . only finds consistent case.
        String loScan = seq.toLowerCase(Locale.ENGLISH);
        String hiScan = seq.toUpperCase(Locale.ENGLISH);
        return (nextIndexOf(loScan) > -1) || (nextIndexOf(hiScan) > -1);
    }

    @Override
    public String toString() {
        return new String(input, pos, length - pos);
    }

    /**
     * Caches short strings, as a flywheel pattern, to reduce GC load. Just for this doc, to prevent leaks.
     * 

* Simplistic, and on hash collisions just falls back to creating a new string, vs a full HashMap with Entry list. * That saves both having to create objects as hash keys, and running through the entry list, at the expense of * some more duplicates. */ private String cacheString(final int start, final int count) { final char[] val = input; final String[] cache = stringCache; // limit (no cache): if (count > maxCacheLen) return new String(val, start, count); // calculate hash: int hash = 0; int offset = start; for (int i = 0; i < count; i++) { hash = 31 * hash + val[offset++]; } // get from cache final int index = hash & cache.length - 1; String cached = cache[index]; if (cached == null) { // miss, add cached = new String(val, start, count); cache[index] = cached; } else { // hashcode hit, check equality if (rangeEquals(start, count, cached)) { // hit return cached; } else { // hashcode conflict cached = new String(val, start, count); cache[index] = cached; // update the cache, as recently used strings are more likely to show up again } } return cached; } /** * Check if the value of the provided range equals the string. */ boolean rangeEquals(final int start, int count, final String cached) { if (count == cached.length()) { char one[] = input; int i = start; int j = 0; while (count-- != 0) { if (one[i++] != cached.charAt(j++)) return false; } return true; } return false; } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy