org.antlr.v4.runtime.UnbufferedCharStream Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of virtdata-lib-realer Show documentation
With inspiration from other libraries
There is a newer version: 2.12.15
/*
 * Copyright (c) 2012-2017 The ANTLR Project. All rights reserved.
 * Use of this file is governed by the BSD 3-clause license that
 * can be found in the LICENSE.txt file in the project root.
 */

package org.antlr.v4.runtime;

import org.antlr.v4.runtime.misc.Interval;

import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;

/** Do not buffer up the entire char stream. It does keep a small buffer
 *  for efficiency and also buffers while a mark exists (set by the
 *  lookahead prediction in parser). "Unbuffered" here refers to fact
 *  that it doesn't buffer all data, not that's it's on demand loading of char.
 *
 *  Before 4.7, this class used the default environment encoding to convert
 *  bytes to UTF-16, and held the UTF-16 bytes in the buffer as chars.
 *
 *  As of 4.7, the class uses UTF-8 by default, and the buffer holds Unicode
 *  code points in the buffer as ints.
 */
public class UnbufferedCharStream implements CharStream {
	/**
	 * A moving window buffer of the data being scanned. While there's a marker,
	 * we keep adding to buffer. Otherwise, {@link #consume consume()} resets so
	 * we start filling at index 0 again.
	 */
	protected int[] data;

	/**
	 * The number of characters currently in {@link #data data}.
	 *
	 * This is not the buffer capacity, that's {@code data.length}.
	 */
   	protected int n;

	/**
	 * 0..n-1 index into {@link #data data} of next character.
	 *
	 * The {@code LA(1)} character is {@code data[p]}. If {@code p == n}, we are
	 * out of buffered characters.
	 */
   	protected int p=0;

	/**
	 * Count up with {@link #mark mark()} and down with
	 * {@link #release release()}. When we {@code release()} the last mark,
	 * {@code numMarkers} reaches 0 and we reset the buffer. Copy
	 * {@code data[p]..data[n-1]} to {@code data[0]..data[(n-1)-p]}.
	 */
	protected int numMarkers = 0;

	/**
	 * This is the {@code LA(-1)} character for the current position.
	 */
	protected int lastChar = -1;

	/**
	 * When {@code numMarkers > 0}, this is the {@code LA(-1)} character for the
	 * first character in {@link #data data}. Otherwise, this is unspecified.
	 */
	protected int lastCharBufferStart;

	/**
	 * Absolute character index. It's the index of the character about to be
	 * read via {@code LA(1)}. Goes from 0 to the number of characters in the
	 * entire stream, although the stream size is unknown before the end is
	 * reached.
	 */
    protected int currentCharIndex = 0;

    protected Reader input;

	/** The name or source of this char stream. */
	public String name;

	/** Useful for subclasses that pull char from other than this.input. */
	public UnbufferedCharStream() {
		this(256);
	}

	/** Useful for subclasses that pull char from other than this.input. */
	public UnbufferedCharStream(int bufferSize) {
		n = 0;
		data = new int[bufferSize];
	}

	public UnbufferedCharStream(InputStream input) {
		this(input, 256);
	}

	public UnbufferedCharStream(Reader input) {
		this(input, 256);
	}

	public UnbufferedCharStream(InputStream input, int bufferSize) {
		this(input, bufferSize, StandardCharsets.UTF_8);
	}

	public UnbufferedCharStream(InputStream input, int bufferSize, Charset charset) {
		this(bufferSize);
		this.input = new InputStreamReader(input, charset);
		fill(1); // prime
	}

	public UnbufferedCharStream(Reader input, int bufferSize) {
		this(bufferSize);
		this.input = input;
		fill(1); // prime
	}

	@Override
	public void consume() {
		if (LA(1) == IntStream.EOF) {
			throw new IllegalStateException("cannot consume EOF");
		}

		// buf always has at least data[p==0] in this method due to ctor
		lastChar = data[p];   // track last char for LA(-1)

		if (p == n-1 && numMarkers==0) {
			n = 0;
			p = -1; // p++ will leave this at 0
			lastCharBufferStart = lastChar;
		}

		p++;
		currentCharIndex++;
		sync(1);
	}

	/**
	 * Make sure we have 'need' elements from current position {@link #p p}.
	 * Last valid {@code p} index is {@code data.length-1}. {@code p+need-1} is
	 * the char index 'need' elements ahead. If we need 1 element,
	 * {@code (p+1-1)==p} must be less than {@code data.length}.
	 */
	protected void sync(int want) {
		int need = (p+want-1) - n + 1; // how many more elements we need?
		if ( need > 0 ) {
			fill(need);
		}
	}

	/**
	 * Add {@code n} characters to the buffer. Returns the number of characters
	 * actually added to the buffer. If the return value is less than {@code n},
	 * then EOF was reached before {@code n} characters could be added.
	 */
	protected int fill(int n) {
		for (int i=0; i 0 && data[this.n - 1] == IntStream.EOF) {
				return i;
			}

			try {
				int c = nextChar();
				if (c > Character.MAX_VALUE || c == IntStream.EOF) {
					add(c);
				}
				else {
					char ch = (char) c;
					if (Character.isLowSurrogate(ch)) {
						throw new RuntimeException("Invalid UTF-16 (low surrogate with no preceding high surrogate)");
					}
					else if (Character.isHighSurrogate(ch)) {
						int lowSurrogate = nextChar();
						if (lowSurrogate > Character.MAX_VALUE) {
							throw new RuntimeException("Invalid UTF-16 (high surrogate followed by code point > U+FFFF");
						}
						else if (lowSurrogate == IntStream.EOF) {
							throw new RuntimeException("Invalid UTF-16 (dangling high surrogate at end of file)");
						}
						else {
							char lowSurrogateChar = (char) lowSurrogate;
							if (Character.isLowSurrogate(lowSurrogateChar)) {
								add(Character.toCodePoint(ch, lowSurrogateChar));
							}
							else {
								throw new RuntimeException("Invalid UTF-16 (dangling high surrogate");
							}
						}
					}
					else {
						add(c);
					}
				}
			}
			catch (IOException ioe) {
				throw new RuntimeException(ioe);
			}
		}

		return n;
	}

	/**
	 * Override to provide different source of characters than
	 * {@link #input input}.
	 */
	protected int nextChar() throws IOException {
		return input.read();
	}

	protected void add(int c) {
		if ( n>=data.length ) {
			data = Arrays.copyOf(data, data.length * 2);
        }
        data[n++] = c;
    }

    @Override
    public int LA(int i) {
		if ( i==-1 ) return lastChar; // special case
        sync(i);
        int index = p + i - 1;
        if ( index < 0 ) throw new IndexOutOfBoundsException();
		if ( index >= n ) return IntStream.EOF;
        return data[index];
    }

	/**
	 * Return a marker that we can release later.
	 *
	 * The specific marker value used for this class allows for some level of
	 * protection against misuse where {@code seek()} is called on a mark or
	 * {@code release()} is called in the wrong order.
	 */
    @Override
    public int mark() {
		if (numMarkers == 0) {
			lastCharBufferStart = lastChar;
		}

		int mark = -numMarkers - 1;
		numMarkers++;
		return mark;
    }

	/** Decrement number of markers, resetting buffer if we hit 0.
	 * @param marker
	 */
    @Override
    public void release(int marker) {
		int expectedMark = -numMarkers;
		if ( marker!=expectedMark ) {
			throw new IllegalStateException("release() called with an invalid marker.");
		}

		numMarkers--;
		if ( numMarkers==0 && p > 0 ) { // release buffer when we can, but don't do unnecessary work
			// Copy data[p]..data[n-1] to data[0]..data[(n-1)-p], reset ptrs
			// p is last valid char; move nothing if p==n as we have no valid char
			System.arraycopy(data, p, data, 0, n - p); // shift n-p char from p to 0
			n = n - p;
			p = 0;
			lastCharBufferStart = lastChar;
		}
    }

    @Override
    public int index() {
		return currentCharIndex;
    }

	/** Seek to absolute character index, which might not be in the current
	 *  sliding window.  Move {@code p} to {@code index-bufferStartIndex}.
	 */
    @Override
    public void seek(int index) {
		if (index == currentCharIndex) {
			return;
		}

		if (index > currentCharIndex) {
			sync(index - currentCharIndex);
			index = Math.min(index, getBufferStartIndex() + n - 1);
		}

        // index == to bufferStartIndex should set p to 0
        int i = index - getBufferStartIndex();
        if ( i < 0 ) {
			throw new IllegalArgumentException("cannot seek to negative index " + index);
		}
		else if (i >= n) {
            throw new UnsupportedOperationException("seek to index outside buffer: "+
                    index+" not in "+getBufferStartIndex()+".."+(getBufferStartIndex()+n));
        }

		p = i;
		currentCharIndex = index;
		if (p == 0) {
			lastChar = lastCharBufferStart;
		}
		else {
			lastChar = data[p-1];
		}
    }

    @Override
    public int size() {
        throw new UnsupportedOperationException("Unbuffered stream cannot know its size");
    }

    @Override
    public String getSourceName() {
		if (name == null || name.isEmpty()) {
			return UNKNOWN_SOURCE_NAME;
		}

		return name;
	}

	@Override
	public String getText(Interval interval) {
		if (interval.a < 0 || interval.b < interval.a - 1) {
			throw new IllegalArgumentException("invalid interval");
		}

		int bufferStartIndex = getBufferStartIndex();
		if (n > 0 && data[n - 1] == Character.MAX_VALUE) {
			if (interval.a + interval.length() > bufferStartIndex + n) {
				throw new IllegalArgumentException("the interval extends past the end of the stream");
			}
		}

		if (interval.a < bufferStartIndex || interval.b >= bufferStartIndex + n) {
			throw new UnsupportedOperationException("interval "+interval+" outside buffer: "+
			                    bufferStartIndex+".."+(bufferStartIndex+n-1));
		}
		// convert from absolute to local index
		int i = interval.a - bufferStartIndex;
		return new String(data, i, interval.length());
	}

	protected final int getBufferStartIndex() {
		return currentCharIndex - p;
	}
}