// org.antlr.v4.runtime.UnbufferedCharStream (Maven / Gradle / Ivy)
/*
* Copyright (c) 2012-2017 The ANTLR Project. All rights reserved.
* Use of this file is governed by the BSD 3-clause license that
* can be found in the LICENSE.txt file in the project root.
*/
package org.antlr.v4.runtime;
import org.antlr.v4.runtime.misc.Interval;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
/** Do not buffer up the entire char stream. It does keep a small buffer
* for efficiency and also buffers while a mark exists (set by the
* lookahead prediction in parser). "Unbuffered" here refers to fact
* that it doesn't buffer all data, not that's it's on demand loading of char.
*
* Before 4.7, this class used the default environment encoding to convert
* bytes to UTF-16, and held the UTF-16 bytes in the buffer as chars.
*
* As of 4.7, the class uses UTF-8 by default, and the buffer holds Unicode
* code points in the buffer as ints.
*/
public class UnbufferedCharStream implements CharStream {
/**
* A moving window buffer of the data being scanned. While there's a marker,
* we keep adding to buffer. Otherwise, {@link #consume consume()} resets so
* we start filling at index 0 again.
*/
protected int[] data;
/**
* The number of characters currently in {@link #data data}.
*
* This is not the buffer capacity, that's {@code data.length}.
*/
protected int n;
/**
* 0..n-1 index into {@link #data data} of next character.
*
* The {@code LA(1)} character is {@code data[p]}. If {@code p == n}, we are
* out of buffered characters.
*/
protected int p=0;
/**
* Count up with {@link #mark mark()} and down with
* {@link #release release()}. When we {@code release()} the last mark,
* {@code numMarkers} reaches 0 and we reset the buffer. Copy
* {@code data[p]..data[n-1]} to {@code data[0]..data[(n-1)-p]}.
*/
protected int numMarkers = 0;
/**
* This is the {@code LA(-1)} character for the current position.
*/
protected int lastChar = -1;
/**
* When {@code numMarkers > 0}, this is the {@code LA(-1)} character for the
* first character in {@link #data data}. Otherwise, this is unspecified.
*/
protected int lastCharBufferStart;
/**
* Absolute character index. It's the index of the character about to be
* read via {@code LA(1)}. Goes from 0 to the number of characters in the
* entire stream, although the stream size is unknown before the end is
* reached.
*/
protected int currentCharIndex = 0;
protected Reader input;
/** The name or source of this char stream. */
public String name;
/** Useful for subclasses that pull char from other than this.input. */
public UnbufferedCharStream() {
this(256);
}
/** Useful for subclasses that pull char from other than this.input. */
public UnbufferedCharStream(int bufferSize) {
n = 0;
data = new int[bufferSize];
}
public UnbufferedCharStream(InputStream input) {
this(input, 256);
}
public UnbufferedCharStream(Reader input) {
this(input, 256);
}
public UnbufferedCharStream(InputStream input, int bufferSize) {
this(input, bufferSize, StandardCharsets.UTF_8);
}
public UnbufferedCharStream(InputStream input, int bufferSize, Charset charset) {
this(bufferSize);
this.input = new InputStreamReader(input, charset);
fill(1); // prime
}
public UnbufferedCharStream(Reader input, int bufferSize) {
this(bufferSize);
this.input = input;
fill(1); // prime
}
@Override
public void consume() {
if (LA(1) == IntStream.EOF) {
throw new IllegalStateException("cannot consume EOF");
}
// buf always has at least data[p==0] in this method due to ctor
lastChar = data[p]; // track last char for LA(-1)
if (p == n-1 && numMarkers==0) {
n = 0;
p = -1; // p++ will leave this at 0
lastCharBufferStart = lastChar;
}
p++;
currentCharIndex++;
sync(1);
}
/**
* Make sure we have 'need' elements from current position {@link #p p}.
* Last valid {@code p} index is {@code data.length-1}. {@code p+need-1} is
* the char index 'need' elements ahead. If we need 1 element,
* {@code (p+1-1)==p} must be less than {@code data.length}.
*/
protected void sync(int want) {
int need = (p+want-1) - n + 1; // how many more elements we need?
if ( need > 0 ) {
fill(need);
}
}
/**
* Add {@code n} characters to the buffer. Returns the number of characters
* actually added to the buffer. If the return value is less than {@code n},
* then EOF was reached before {@code n} characters could be added.
*/
protected int fill(int n) {
for (int i=0; i 0 && data[this.n - 1] == IntStream.EOF) {
return i;
}
try {
int c = nextChar();
if (c > Character.MAX_VALUE || c == IntStream.EOF) {
add(c);
}
else {
char ch = (char) c;
if (Character.isLowSurrogate(ch)) {
throw new RuntimeException("Invalid UTF-16 (low surrogate with no preceding high surrogate)");
}
else if (Character.isHighSurrogate(ch)) {
int lowSurrogate = nextChar();
if (lowSurrogate > Character.MAX_VALUE) {
throw new RuntimeException("Invalid UTF-16 (high surrogate followed by code point > U+FFFF");
}
else if (lowSurrogate == IntStream.EOF) {
throw new RuntimeException("Invalid UTF-16 (dangling high surrogate at end of file)");
}
else {
char lowSurrogateChar = (char) lowSurrogate;
if (Character.isLowSurrogate(lowSurrogateChar)) {
add(Character.toCodePoint(ch, lowSurrogateChar));
}
else {
throw new RuntimeException("Invalid UTF-16 (dangling high surrogate");
}
}
}
else {
add(c);
}
}
}
catch (IOException ioe) {
throw new RuntimeException(ioe);
}
}
return n;
}
/**
* Override to provide different source of characters than
* {@link #input input}.
*/
protected int nextChar() throws IOException {
return input.read();
}
protected void add(int c) {
if ( n>=data.length ) {
data = Arrays.copyOf(data, data.length * 2);
}
data[n++] = c;
}
@Override
public int LA(int i) {
if ( i==-1 ) return lastChar; // special case
sync(i);
int index = p + i - 1;
if ( index < 0 ) throw new IndexOutOfBoundsException();
if ( index >= n ) return IntStream.EOF;
return data[index];
}
/**
* Return a marker that we can release later.
*
* The specific marker value used for this class allows for some level of
* protection against misuse where {@code seek()} is called on a mark or
* {@code release()} is called in the wrong order.
*/
@Override
public int mark() {
if (numMarkers == 0) {
lastCharBufferStart = lastChar;
}
int mark = -numMarkers - 1;
numMarkers++;
return mark;
}
/** Decrement number of markers, resetting buffer if we hit 0.
* @param marker
*/
@Override
public void release(int marker) {
int expectedMark = -numMarkers;
if ( marker!=expectedMark ) {
throw new IllegalStateException("release() called with an invalid marker.");
}
numMarkers--;
if ( numMarkers==0 && p > 0 ) { // release buffer when we can, but don't do unnecessary work
// Copy data[p]..data[n-1] to data[0]..data[(n-1)-p], reset ptrs
// p is last valid char; move nothing if p==n as we have no valid char
System.arraycopy(data, p, data, 0, n - p); // shift n-p char from p to 0
n = n - p;
p = 0;
lastCharBufferStart = lastChar;
}
}
@Override
public int index() {
return currentCharIndex;
}
/** Seek to absolute character index, which might not be in the current
* sliding window. Move {@code p} to {@code index-bufferStartIndex}.
*/
@Override
public void seek(int index) {
if (index == currentCharIndex) {
return;
}
if (index > currentCharIndex) {
sync(index - currentCharIndex);
index = Math.min(index, getBufferStartIndex() + n - 1);
}
// index == to bufferStartIndex should set p to 0
int i = index - getBufferStartIndex();
if ( i < 0 ) {
throw new IllegalArgumentException("cannot seek to negative index " + index);
}
else if (i >= n) {
throw new UnsupportedOperationException("seek to index outside buffer: "+
index+" not in "+getBufferStartIndex()+".."+(getBufferStartIndex()+n));
}
p = i;
currentCharIndex = index;
if (p == 0) {
lastChar = lastCharBufferStart;
}
else {
lastChar = data[p-1];
}
}
@Override
public int size() {
throw new UnsupportedOperationException("Unbuffered stream cannot know its size");
}
@Override
public String getSourceName() {
if (name == null || name.isEmpty()) {
return UNKNOWN_SOURCE_NAME;
}
return name;
}
@Override
public String getText(Interval interval) {
if (interval.a < 0 || interval.b < interval.a - 1) {
throw new IllegalArgumentException("invalid interval");
}
int bufferStartIndex = getBufferStartIndex();
if (n > 0 && data[n - 1] == Character.MAX_VALUE) {
if (interval.a + interval.length() > bufferStartIndex + n) {
throw new IllegalArgumentException("the interval extends past the end of the stream");
}
}
if (interval.a < bufferStartIndex || interval.b >= bufferStartIndex + n) {
throw new UnsupportedOperationException("interval "+interval+" outside buffer: "+
bufferStartIndex+".."+(bufferStartIndex+n-1));
}
// convert from absolute to local index
int i = interval.a - bufferStartIndex;
return new String(data, i, interval.length());
}
protected final int getBufferStartIndex() {
return currentCharIndex - p;
}
}