// org.antlr.v4.runtime.CharStreams (Maven / Gradle / Ivy)
/*
* Copyright (c) 2012-2017 The ANTLR Project. All rights reserved.
* Use of this file is governed by the BSD 3-clause license that
* can be found in the LICENSE.txt file in the project root.
*/
package org.antlr.v4.runtime;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.channels.Channels;
import java.nio.channels.ReadableByteChannel;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CoderResult;
import java.nio.charset.CodingErrorAction;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
/** This class represents the primary interface for creating {@link CharStream}s
* from a variety of sources as of 4.7. The motivation was to support
* Unicode code points > U+FFFF. {@link ANTLRInputStream} and
* {@link ANTLRFileStream} are now deprecated in favor of the streams created
* by this interface.
*
* DEPRECATED: {@code new ANTLRFileStream("myinputfile")}
* NEW: {@code CharStreams.fromFileName("myinputfile")}
*
* WARNING: If you use both the deprecated and the new streams, you will see
* a nontrivial performance degradation. This speed hit is because the
* {@link Lexer}'s internal code goes from a monomorphic to megamorphic
* dynamic dispatch to get characters from the input stream. Java's
* on-the-fly compiler (JIT) is unable to perform the same optimizations
* so stick with either the old or the new streams, if performance is
* a primary concern. See the extreme debugging and spelunking
* needed to identify this issue in our timing rig:
*
* https://github.com/antlr/antlr4/pull/1781
*
* The ANTLR character streams still buffer all the input when you create
* the stream, as they have done for ~20 years. If you need unbuffered
* access, please note that it becomes challenging to create
* parse trees. The parse tree has to point to tokens which will either
* point into a stale location in an unbuffered stream or you have to copy
* the characters out of the buffer into the token. That defeats the purpose
* of unbuffered input. Per the ANTLR book, unbuffered streams are primarily
* useful for processing infinite streams *during the parse.*
*
* The new streams also use 8-bit buffers when possible so this new
* interface supports character streams that use half as much memory
* as the old {@link ANTLRFileStream}, which assumed 16-bit characters.
*
* A big shout out to Ben Hamilton (github bhamiltoncx) for his superhuman
* efforts across all targets to get true Unicode 3.1 support for U+10FFFF.
*
* @since 4.7
*/
public final class CharStreams {
    /**
     * Default capacity of the intermediate read/decode buffers and the
     * initial capacity hint used when the input size is unknown.
     */
    private static final int DEFAULT_BUFFER_SIZE = 4096;

    // Utility class; do not construct.
    private CharStreams() { }

    /**
     * Creates a {@link CharStream} given a path to a UTF-8
     * encoded file on disk.
     *
     * Reads the entire contents of the file into the result before returning.
     *
     * @param path location of the file to read
     * @return a stream over the decoded contents of the file
     * @throws IOException if the file cannot be sized, opened or read
     */
    public static CharStream fromPath(Path path) throws IOException {
        return fromPath(path, StandardCharsets.UTF_8);
    }

    /**
     * Creates a {@link CharStream} given a path to a file on disk and the
     * charset of the bytes contained in the file.
     *
     * Reads the entire contents of the file into the result before returning.
     * Malformed or unmappable input is replaced rather than reported, and
     * {@code path.toString()} is used as the source name.
     *
     * @param path location of the file to read
     * @param charset charset used to decode the file's bytes
     * @return a stream over the decoded contents of the file
     * @throws IOException if the file cannot be sized, opened or read
     */
    public static CharStream fromPath(Path path, Charset charset) throws IOException {
        // The file size is only a capacity hint for the code point buffer.
        long size = Files.size(path);
        try (ReadableByteChannel channel = Files.newByteChannel(path)) {
            return fromChannel(
                channel,
                charset,
                DEFAULT_BUFFER_SIZE,
                CodingErrorAction.REPLACE,
                path.toString(),
                size);
        }
    }

    /**
     * Creates a {@link CharStream} given a string containing a
     * path to a UTF-8 file on disk.
     *
     * Reads the entire contents of the file into the result before returning.
     *
     * @param fileName path of the file to read
     * @return a stream over the decoded contents of the file
     * @throws IOException if the file cannot be sized, opened or read
     */
    public static CharStream fromFileName(String fileName) throws IOException {
        return fromPath(Paths.get(fileName), StandardCharsets.UTF_8);
    }

    /**
     * Creates a {@link CharStream} given a string containing a
     * path to a file on disk and the charset of the bytes
     * contained in the file.
     *
     * Reads the entire contents of the file into the result before returning.
     *
     * @param fileName path of the file to read
     * @param charset charset used to decode the file's bytes
     * @return a stream over the decoded contents of the file
     * @throws IOException if the file cannot be sized, opened or read
     */
    public static CharStream fromFileName(String fileName, Charset charset) throws IOException {
        return fromPath(Paths.get(fileName), charset);
    }

    /**
     * Creates a {@link CharStream} given an opened {@link InputStream}
     * containing UTF-8 bytes.
     *
     * Reads the entire contents of the {@code InputStream} into
     * the result before returning, then closes the {@code InputStream}.
     *
     * @param is stream to read and close
     * @return a stream over the decoded contents of {@code is}
     * @throws IOException if reading fails
     */
    public static CharStream fromStream(InputStream is) throws IOException {
        return fromStream(is, StandardCharsets.UTF_8);
    }

    /**
     * Creates a {@link CharStream} given an opened {@link InputStream} and the
     * charset of the bytes contained in the stream.
     *
     * Reads the entire contents of the {@code InputStream} into
     * the result before returning, then closes the {@code InputStream}.
     *
     * @param is stream to read and close
     * @param charset charset used to decode the stream's bytes
     * @return a stream over the decoded contents of {@code is}
     * @throws IOException if reading fails
     */
    public static CharStream fromStream(InputStream is, Charset charset) throws IOException {
        return fromStream(is, charset, -1);
    }

    /**
     * Creates a {@link CharStream} given an opened {@link InputStream}, the
     * charset of the bytes contained in the stream, and the expected size of
     * the input.
     *
     * Reads the entire contents of the {@code InputStream} into
     * the result before returning, then closes the {@code InputStream}.
     * Malformed or unmappable input is replaced rather than reported, and
     * the source name is {@link IntStream#UNKNOWN_SOURCE_NAME}.
     *
     * @param is stream to read and close
     * @param charset charset used to decode the stream's bytes
     * @param inputSize expected size of the input, used as the initial
     *        capacity of the code point buffer; {@code -1} if unknown
     * @return a stream over the decoded contents of {@code is}
     * @throws IOException if reading fails or {@code inputSize} exceeds
     *         {@link Integer#MAX_VALUE}
     */
    public static CharStream fromStream(InputStream is, Charset charset, long inputSize) throws IOException {
        try (ReadableByteChannel channel = Channels.newChannel(is)) {
            return fromChannel(
                channel,
                charset,
                DEFAULT_BUFFER_SIZE,
                CodingErrorAction.REPLACE,
                IntStream.UNKNOWN_SOURCE_NAME,
                inputSize);
        }
    }

    /**
     * Creates a {@link CharStream} given an opened {@link ReadableByteChannel}
     * containing UTF-8 bytes.
     *
     * Reads the entire contents of the {@code channel} into
     * the result before returning, then closes the {@code channel}.
     *
     * @param channel channel to read and close
     * @return a stream over the decoded contents of {@code channel}
     * @throws IOException if reading fails
     */
    public static CharStream fromChannel(ReadableByteChannel channel) throws IOException {
        return fromChannel(channel, StandardCharsets.UTF_8);
    }

    /**
     * Creates a {@link CharStream} given an opened {@link ReadableByteChannel} and the
     * charset of the bytes contained in the channel.
     *
     * Reads the entire contents of the {@code channel} into
     * the result before returning, then closes the {@code channel}.
     * Malformed or unmappable input is replaced rather than reported, and
     * the source name is {@link IntStream#UNKNOWN_SOURCE_NAME}.
     *
     * @param channel channel to read and close
     * @param charset charset used to decode the channel's bytes
     * @return a stream over the decoded contents of {@code channel}
     * @throws IOException if reading fails
     */
    public static CharStream fromChannel(ReadableByteChannel channel, Charset charset) throws IOException {
        // Delegate to the charset-aware overload; the four-argument overload
        // always decodes as UTF-8 and would silently ignore {@code charset}.
        return fromChannel(
            channel,
            charset,
            DEFAULT_BUFFER_SIZE,
            CodingErrorAction.REPLACE,
            IntStream.UNKNOWN_SOURCE_NAME,
            -1);
    }

    /**
     * Creates a {@link CharStream} given a {@link Reader}. Closes
     * the reader before returning.
     *
     * @param r reader to drain and close
     * @return a stream over everything read from {@code r}, with
     *         {@link IntStream#UNKNOWN_SOURCE_NAME} as its source name
     * @throws IOException if reading fails
     */
    public static CodePointCharStream fromReader(Reader r) throws IOException {
        return fromReader(r, IntStream.UNKNOWN_SOURCE_NAME);
    }

    /**
     * Creates a {@link CharStream} given a {@link Reader} and its
     * source name. Closes the reader before returning.
     *
     * @param r reader to drain and close
     * @param sourceName name reported as the source of the resulting stream
     * @return a stream over everything read from {@code r}
     * @throws IOException if reading fails
     */
    public static CodePointCharStream fromReader(Reader r, String sourceName) throws IOException {
        // try-with-resources closes the reader on every path and, unlike a
        // plain finally block, does not let a failure in close() hide the
        // exception that aborted the read.
        try (Reader reader = r) {
            CodePointBuffer.Builder codePoints = CodePointBuffer.builder(DEFAULT_BUFFER_SIZE);
            CharBuffer chunk = CharBuffer.allocate(DEFAULT_BUFFER_SIZE);
            while (reader.read(chunk) != -1) {
                chunk.flip();
                codePoints.append(chunk);
                // Keep whatever append() left unread and make room for the next read.
                chunk.compact();
            }
            return CodePointCharStream.fromBuffer(codePoints.build(), sourceName);
        }
    }

    /**
     * Creates a {@link CharStream} given a {@link String}.
     *
     * @param s text to expose as a stream
     * @return a stream over {@code s}, with
     *         {@link IntStream#UNKNOWN_SOURCE_NAME} as its source name
     */
    public static CodePointCharStream fromString(String s) {
        return fromString(s, IntStream.UNKNOWN_SOURCE_NAME);
    }

    /**
     * Creates a {@link CharStream} given a {@link String} and the {@code sourceName}
     * from which it came.
     *
     * @param s text to expose as a stream
     * @param sourceName name reported as the source of the resulting stream
     * @return a stream over {@code s}
     */
    public static CodePointCharStream fromString(String s, String sourceName) {
        // Initial guess assumes no code points > U+FFFF: one code
        // point for each code unit in the string
        CodePointBuffer.Builder codePointBufferBuilder = CodePointBuffer.builder(s.length());
        // TODO: CharBuffer.wrap(String) rightfully returns a read-only buffer
        // which doesn't expose its array, so we make a copy.
        CharBuffer cb = CharBuffer.allocate(s.length());
        cb.put(s);
        cb.flip();
        codePointBufferBuilder.append(cb);
        return CodePointCharStream.fromBuffer(codePointBufferBuilder.build(), sourceName);
    }

    /**
     * Creates a {@link CharStream} given an opened {@link ReadableByteChannel}
     * containing UTF-8 bytes.
     *
     * Reads the entire contents of the {@code channel} into
     * the result before returning, then closes the {@code channel}.
     *
     * @param channel channel to read and close
     * @param bufferSize capacity of the intermediate byte and char buffers;
     *        must be positive
     * @param decodingErrorAction how malformed or unmappable input is handled
     * @param sourceName name reported as the source of the resulting stream
     * @return a stream over the decoded contents of {@code channel}
     * @throws IOException if reading fails, or if decoding fails while
     *         {@code decodingErrorAction} is {@link CodingErrorAction#REPORT}
     */
    public static CodePointCharStream fromChannel(
        ReadableByteChannel channel,
        int bufferSize,
        CodingErrorAction decodingErrorAction,
        String sourceName)
        throws IOException
    {
        return fromChannel(channel, StandardCharsets.UTF_8, bufferSize, decodingErrorAction, sourceName, -1);
    }

    /**
     * Creates a {@link CharStream} given an opened {@link ReadableByteChannel},
     * the charset of its bytes, and explicit buffering, error-handling and
     * sizing parameters.
     *
     * Reads the entire contents of the {@code channel} into
     * the result before returning, then closes the {@code channel}.
     *
     * @param channel channel to read and close
     * @param charset charset used to decode the channel's bytes
     * @param bufferSize capacity of the intermediate byte and char buffers;
     *        must be positive
     * @param decodingErrorAction how malformed or unmappable input is handled
     * @param sourceName name reported as the source of the resulting stream
     * @param inputSize expected size of the input, used only as the initial
     *        capacity of the code point buffer; a negative value
     *        (conventionally {@code -1}) means the size is unknown and
     *        {@code bufferSize} is used instead
     * @return a stream over the decoded contents of {@code channel}
     * @throws IOException if reading fails, if {@code inputSize} exceeds
     *         {@link Integer#MAX_VALUE}, or if decoding fails while
     *         {@code decodingErrorAction} is {@link CodingErrorAction#REPORT}
     * @throws IllegalArgumentException if {@code bufferSize} is not positive
     */
    public static CodePointCharStream fromChannel(
        ReadableByteChannel channel,
        Charset charset,
        int bufferSize,
        CodingErrorAction decodingErrorAction,
        String sourceName,
        long inputSize)
        throws IOException
    {
        // try-with-resources closes the channel on every path (including the
        // argument checks below) and keeps a failure in close() from hiding
        // the exception that aborted reading or decoding.
        try (ReadableByteChannel source = channel) {
            if (bufferSize <= 0) {
                // A zero-capacity buffer can never receive data, so the read
                // loop below would make no progress.
                throw new IllegalArgumentException("bufferSize must be positive: " + bufferSize);
            }

            int initialCapacity;
            if (inputSize < 0) {
                // Unknown size: start small; the size is only a capacity hint.
                initialCapacity = bufferSize;
            } else if (inputSize > Integer.MAX_VALUE) {
                // ByteBuffer et al don't support long sizes
                throw new IOException(String.format("inputSize %d larger than max %d", inputSize, Integer.MAX_VALUE));
            } else {
                initialCapacity = (int) inputSize;
            }

            ByteBuffer bytesIn = ByteBuffer.allocate(bufferSize);
            CharBuffer charsOut = CharBuffer.allocate(bufferSize);
            CodePointBuffer.Builder codePointBufferBuilder = CodePointBuffer.builder(initialCapacity);
            CharsetDecoder decoder = charset
                .newDecoder()
                .onMalformedInput(decodingErrorAction)
                .onUnmappableCharacter(decodingErrorAction);

            boolean endOfInput = false;
            while (!endOfInput) {
                int bytesRead = source.read(bytesIn);
                endOfInput = (bytesRead == -1);
                bytesIn.flip();
                CoderResult result = decoder.decode(
                    bytesIn,
                    charsOut,
                    endOfInput);
                if (result.isError() && decodingErrorAction.equals(CodingErrorAction.REPORT)) {
                    result.throwException();
                }
                charsOut.flip();
                codePointBufferBuilder.append(charsOut);
                // compact() carries over bytes the decoder has not consumed yet
                // (e.g. the start of a multi-byte sequence) to the next pass.
                bytesIn.compact();
                charsOut.compact();
            }

            // Handle any bytes at the end of the file which need to
            // be represented as errors or substitution characters.
            CoderResult flushResult = decoder.flush(charsOut);
            if (flushResult.isError() && decodingErrorAction.equals(CodingErrorAction.REPORT)) {
                flushResult.throwException();
            }
            charsOut.flip();
            codePointBufferBuilder.append(charsOut);

            CodePointBuffer codePointBuffer = codePointBufferBuilder.build();
            return CodePointCharStream.fromBuffer(codePointBuffer, sourceName);
        }
    }
}
// © 2015 - 2024 Weber Informatics LLC | Privacy Policy