// org.antlr.v4.runtime.CodePointCharStream Maven / Gradle / Ivy
//
// Go to download
/*
 * Copyright (c) 2012 The ANTLR Project. All rights reserved.
 * Use of this file is governed by the BSD-3-Clause license that
 * can be found in the LICENSE.txt file in the project root.
 */
package org.antlr.v4.runtime;

import org.antlr.v4.runtime.misc.Interval;

import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;

/**
 * Alternative to {@link ANTLRInputStream} which treats the input
 * as a series of Unicode code points, instead of a series of UTF-16
 * code units.
 *
 * <p>Use this if you need to parse input which potentially contains
 * Unicode values &gt; U+FFFF.
 *
 * <p>Instances are obtained from {@link #fromBuffer(CodePointBuffer)} or
 * {@link #fromBuffer(CodePointBuffer, String)}, which select one of three
 * private subclasses according to the storage type of the buffer.
 */
public abstract class CodePointCharStream implements UnicodeCharStream, CharStream {
	/** Number of code points in this stream, as reported by {@link #size()}. */
	protected final int size;

	/** Source name given at construction; may be {@code null} or empty. */
	protected final String name;

	// To avoid lots of virtual method calls, the concrete subclasses
	// read this index and their backing array directly rather than
	// going through the CodePointBuffer.
	/** Index of the code point that {@code LA(1)} returns, as reported by {@link #index()}. */
	protected int position;

	// Use the factory method {@link #fromBuffer(CodePointBuffer)} to
	// construct instances of this type.
	private CodePointCharStream(int position, int remaining, String name) {
		// TODO: only buffers positioned at 0 are supported for now.
		assert position == 0;
		this.size = remaining;
		this.name = name;
		this.position = 0;
	}

	// Visible for testing.
	/** Returns the backing array ({@code byte[]}, {@code char[]} or {@code int[]}). */
	abstract Object getInternalStorage();

	/**
	 * Constructs a {@link CodePointCharStream} which provides access
	 * to the Unicode code points stored in {@code codePointBuffer}.
	 *
	 * <p>The stream is named {@code UNKNOWN_SOURCE_NAME}.
	 *
	 * @param codePointBuffer buffer holding the code points to expose
	 * @return a stream over the code points in {@code codePointBuffer}
	 */
	public static CodePointCharStream fromBuffer(CodePointBuffer codePointBuffer) {
		return fromBuffer(codePointBuffer, UNKNOWN_SOURCE_NAME);
	}

	/**
	 * Constructs a named {@link CodePointCharStream} which provides access
	 * to the Unicode code points stored in {@code codePointBuffer}.
	 *
	 * @param codePointBuffer buffer holding the code points to expose
	 * @param name source name; {@link #getSourceName()} falls back to
	 *        {@code UNKNOWN_SOURCE_NAME} when this is {@code null} or empty
	 * @return a stream over the code points in {@code codePointBuffer}
	 */
	public static CodePointCharStream fromBuffer(CodePointBuffer codePointBuffer, String name) {
		// Java lacks generics on primitive types.
		//
		// To avoid lots of calls to virtual methods in the
		// very hot codepath of LA() below, we construct one
		// of three concrete subclasses.
		//
		// The concrete subclasses directly access the code
		// points stored in the underlying array (byte[],
		// char[], or int[]), so we can avoid lots of virtual
		// method calls to ByteBuffer.get(offset).
		switch (codePointBuffer.getType()) {
			case BYTE:
				return new CodePoint8BitCharStream(
						codePointBuffer.position(),
						codePointBuffer.remaining(),
						name,
						codePointBuffer.byteArray(),
						codePointBuffer.arrayOffset());
			case CHAR:
				return new CodePoint16BitCharStream(
						codePointBuffer.position(),
						codePointBuffer.remaining(),
						name,
						codePointBuffer.charArray(),
						codePointBuffer.arrayOffset());
			case INT:
				return new CodePoint32BitCharStream(
						codePointBuffer.position(),
						codePointBuffer.remaining(),
						name,
						codePointBuffer.intArray(),
						codePointBuffer.arrayOffset());
		}
		throw new UnsupportedOperationException("Not reached");
	}

	/**
	 * Advances the stream by one code point.
	 *
	 * @throws IllegalStateException if the stream is already at EOF
	 */
	@Override
	public final void consume() {
		if (position >= size) {
			assert LA(1) == IntStream.EOF;
			throw new IllegalStateException("cannot consume EOF");
		}
		position = position + 1;
	}

	/** Returns the index of the code point that {@code LA(1)} refers to. */
	@Override
	public final int index() {
		return position;
	}

	/** Returns the total number of code points in this stream. */
	@Override
	public final int size() {
		return size;
	}

	/** mark/release do nothing; we have entire buffer */
	@Override
	public final int mark() {
		return -1;
	}

	/** No-op counterpart of {@link #mark()}; the marker is ignored. */
	@Override
	public final void release(int marker) {
	}

	/**
	 * Moves the stream to {@code index}.
	 *
	 * <p>An index past the end of the stream is treated as the index of
	 * EOF (that is, {@link #size()}), so that {@link #consume()} keeps
	 * rejecting EOF and backwards lookahead stays inside the backing
	 * array. Negative indexes are not validated here.
	 *
	 * @param index absolute code point index to seek to
	 */
	@Override
	public final void seek(int index) {
		position = Math.min(index, size);
	}

	/**
	 * Returns the name given at construction, or
	 * {@code UNKNOWN_SOURCE_NAME} when that name is {@code null} or empty.
	 */
	@Override
	public final String getSourceName() {
		if (name == null || name.isEmpty()) {
			return UNKNOWN_SOURCE_NAME;
		}

		return name;
	}

	/** Returns the text of the whole stream, independent of the current position. */
	@Override
	public final String toString() {
		return getText(Interval.of(0, size - 1));
	}

	@Override
	public final boolean supportsUnicodeCodePoints() {
		return true;
	}

	// 8-bit storage for code points <= U+00FF.
	private static final class CodePoint8BitCharStream extends CodePointCharStream {
		private final byte[] byteArray;

		private CodePoint8BitCharStream(int position, int remaining, String name, byte[] byteArray, int arrayOffset) {
			super(position, remaining, name);
			// TODO: only arrays with a zero offset are supported for now.
			assert arrayOffset == 0;
			this.byteArray = byteArray;
		}

		/**
		 * Return the UTF-16 encoded string for the given interval.
		 * The start is clamped to {@code size} and the length to the
		 * number of code points remaining after the start.
		 */
		@Override
		public String getText(Interval interval) {
			int startIdx = Math.min(interval.a, size);
			int len = Math.min(interval.b - interval.a + 1, size - startIdx);

			// We know the maximum code point in byteArray is U+00FF,
			// so we can treat this as if it were ISO-8859-1, aka Latin-1,
			// which shares the same code points up to 0xFF.
			return new String(byteArray, startIdx, len, StandardCharsets.ISO_8859_1);
		}

		/**
		 * Returns the code point {@code i} positions from the current one:
		 * {@code LA(1)} is the code point at {@code position}, {@code LA(-1)}
		 * the one before it. Returns {@link IntStream#EOF} when the target
		 * lies before the start or at/after {@code size}; {@code LA(0)} is
		 * undefined and returns 0.
		 */
		@Override
		public int LA(int i) {
			int offset;
			switch (Integer.signum(i)) {
				case -1:
					offset = position + i;
					if (offset < 0) {
						return IntStream.EOF;
					}
					// Mask so that bytes >= 0x80 are not sign-extended.
					return byteArray[offset] & 0xFF;
				case 0:
					// Undefined
					return 0;
				case 1:
					offset = position + i - 1;
					if (offset >= size) {
						return IntStream.EOF;
					}
					return byteArray[offset] & 0xFF;
			}
			throw new UnsupportedOperationException("Not reached");
		}

		@Override
		Object getInternalStorage() {
			return byteArray;
		}
	}

	// 16-bit internal storage for code points between U+0100 and U+FFFF.
	private static final class CodePoint16BitCharStream extends CodePointCharStream {
		private final char[] charArray;

		private CodePoint16BitCharStream(int position, int remaining, String name, char[] charArray, int arrayOffset) {
			super(position, remaining, name);
			this.charArray = charArray;
			// TODO: only arrays with a zero offset are supported for now.
			assert arrayOffset == 0;
		}

		/**
		 * Return the UTF-16 encoded string for the given interval.
		 * The start is clamped to {@code size} and the length to the
		 * number of code points remaining after the start.
		 */
		@Override
		public String getText(Interval interval) {
			int startIdx = Math.min(interval.a, size);
			int len = Math.min(interval.b - interval.a + 1, size - startIdx);

			// We know there are no surrogates in this
			// array, since otherwise we would be given a
			// 32-bit int[] array.
			//
			// So, it's safe to treat this as if it were
			// UTF-16.
			return new String(charArray, startIdx, len);
		}

		/**
		 * Returns the code point {@code i} positions from the current one:
		 * {@code LA(1)} is the code point at {@code position}, {@code LA(-1)}
		 * the one before it. Returns {@link IntStream#EOF} when the target
		 * lies before the start or at/after {@code size}; {@code LA(0)} is
		 * undefined and returns 0.
		 */
		@Override
		public int LA(int i) {
			int offset;
			switch (Integer.signum(i)) {
				case -1:
					offset = position + i;
					if (offset < 0) {
						return IntStream.EOF;
					}
					return charArray[offset] & 0xFFFF;
				case 0:
					// Undefined
					return 0;
				case 1:
					offset = position + i - 1;
					if (offset >= size) {
						return IntStream.EOF;
					}
					return charArray[offset] & 0xFFFF;
			}
			throw new UnsupportedOperationException("Not reached");
		}

		@Override
		Object getInternalStorage() {
			return charArray;
		}
	}

	// 32-bit internal storage for code points between U+10000 and U+10FFFF.
	private static final class CodePoint32BitCharStream extends CodePointCharStream {
		private final int[] intArray;

		private CodePoint32BitCharStream(int position, int remaining, String name, int[] intArray, int arrayOffset) {
			super(position, remaining, name);
			this.intArray = intArray;
			// TODO: only arrays with a zero offset are supported for now.
			assert arrayOffset == 0;
		}

		/**
		 * Return the UTF-16 encoded string for the given interval.
		 * The start is clamped to {@code size} and the length to the
		 * number of code points remaining after the start.
		 */
		@Override
		public String getText(Interval interval) {
			int startIdx = Math.min(interval.a, size);
			int len = Math.min(interval.b - interval.a + 1, size - startIdx);

			// Note that we pass the int[] code points to the String constructor --
			// this is supported, and the constructor will convert to UTF-16 internally.
			return new String(intArray, startIdx, len);
		}

		/**
		 * Returns the code point {@code i} positions from the current one:
		 * {@code LA(1)} is the code point at {@code position}, {@code LA(-1)}
		 * the one before it. Returns {@link IntStream#EOF} when the target
		 * lies before the start or at/after {@code size}; {@code LA(0)} is
		 * undefined and returns 0.
		 */
		@Override
		public int LA(int i) {
			int offset;
			switch (Integer.signum(i)) {
				case -1:
					offset = position + i;
					if (offset < 0) {
						return IntStream.EOF;
					}
					return intArray[offset];
				case 0:
					// Undefined
					return 0;
				case 1:
					offset = position + i - 1;
					if (offset >= size) {
						return IntStream.EOF;
					}
					return intArray[offset];
			}
			throw new UnsupportedOperationException("Not reached");
		}

		@Override
		Object getInternalStorage() {
			return intArray;
		}
	}
}