// org.apache.flink.table.dataformat.BinaryString
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.	See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.	You may obtain a copy of the License at
 *
 *		http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.flink.table.dataformat;

import org.apache.flink.api.common.typeinfo.TypeInfo;
import org.apache.flink.api.common.typeutils.TypeSerializer;
import org.apache.flink.core.memory.MemorySegment;
import org.apache.flink.core.memory.MemorySegmentFactory;
import org.apache.flink.table.runtime.typeutils.BinaryStringTypeInfoFactory;
import org.apache.flink.table.runtime.util.SegmentsUtil;
import org.apache.flink.table.runtime.util.StringUtf8Utils;

import javax.annotation.Nonnull;

import java.util.Arrays;

import static org.apache.flink.util.Preconditions.checkArgument;

/**
 * A utf8 string which is backed by {@link MemorySegment} instead of String. Its data may span
 * multiple {@link MemorySegment}s.
 *
 * <p>Used for internal table-level implementation. The built-in operator will use it for comparison,
 * search, and so on.
 *
 * <p>An instance may hold a Java {@code String} ({@code javaObject}), a binary section
 * ({@code binarySection}), or both. Methods that need the bytes call
 * {@link #ensureMaterialized()} first; {@link #toString()} decodes the bytes on demand.
 *
 * <p>{@code BinaryString} are influenced by Apache Spark UTF8String.
 */
@TypeInfo(BinaryStringTypeInfoFactory.class)
public final class BinaryString extends LazyBinaryFormat<String> implements Comparable<BinaryString> {

	/** The empty string, backed by a zero-length byte array. */
	public static final BinaryString EMPTY_UTF8 = BinaryString.fromBytes(StringUtf8Utils.encodeUTF8(""));

	public BinaryString() {}

	private BinaryString(MemorySegment[] segments, int offset, int sizeInBytes) {
		super(segments, offset, sizeInBytes);
	}

	private BinaryString(String javaObject) {
		super(javaObject);
	}

	private BinaryString(MemorySegment[] segments, int offset, int sizeInBytes, String javaObject) {
		super(segments, offset, sizeInBytes, javaObject);
	}

	// ------------------------------------------------------------------------------------------
	// Constructor helper
	// ------------------------------------------------------------------------------------------

	/**
	 * Creates a BinaryString from given address (base and offset) and length.
	 *
	 * @param segments the memory segments holding the UTF-8 bytes
	 * @param offset   the offset of the first byte, counted from the start of {@code segments[0]}
	 * @param numBytes the length of the string in bytes
	 */
	public static BinaryString fromAddress(
			MemorySegment[] segments, int offset, int numBytes) {
		return new BinaryString(segments, offset, numBytes);
	}

	/**
	 * Creates a BinaryString from given java String.
	 *
	 * @return a BinaryString wrapping {@code str}, or {@code null} if {@code str} is {@code null}
	 */
	public static BinaryString fromString(String str) {
		if (str == null) {
			return null;
		} else {
			return new BinaryString(str);
		}
	}

	/**
	 * Creates a BinaryString from given UTF-8 bytes. The array is wrapped, not copied.
	 */
	public static BinaryString fromBytes(byte[] bytes) {
		return fromBytes(bytes, 0, bytes.length);
	}

	/**
	 * Creates a BinaryString from given UTF-8 bytes with offset and number of bytes.
	 * The array is wrapped, not copied.
	 */
	public static BinaryString fromBytes(byte[] bytes, int offset, int numBytes) {
		return new BinaryString(
				new MemorySegment[] {MemorySegmentFactory.wrap(bytes)}, offset, numBytes);
	}

	/**
	 * Creates a BinaryString that contains `length` spaces.
	 */
	public static BinaryString blankString(int length) {
		byte[] spaces = new byte[length];
		Arrays.fill(spaces, (byte) ' ');
		return fromBytes(spaces);
	}

	// ------------------------------------------------------------------------------------------
	// Public methods on BinaryString
	// ------------------------------------------------------------------------------------------

	/**
	 * Returns the number of UTF-8 code points in the string.
	 */
	public int numChars() {
		ensureMaterialized();
		if (inFirstSegment()) {
			int len = 0;
			// Jump from one leading byte to the next; each jump is one code point.
			for (int i = 0; i < binarySection.sizeInBytes; i += numBytesForFirstByte(getByteOneSegment(i))) {
				len++;
			}
			return len;
		} else {
			return numCharsMultiSegs();
		}
	}

	/** Same as {@link #numChars()}, but walks the bytes across segment boundaries. */
	private int numCharsMultiSegs() {
		int len = 0;
		int segSize = binarySection.segments[0].size();
		SegmentAndOffset index = firstSegmentAndOffset(segSize);
		int i = 0;
		while (i < binarySection.sizeInBytes) {
			int charBytes = numBytesForFirstByte(index.value());
			i += charBytes;
			len++;
			index.skipBytes(charBytes, segSize);
		}
		return len;
	}

	/**
	 * Returns the {@code byte} value at the specified index. An index ranges from {@code 0} to
	 * {@code binarySection.sizeInBytes - 1}.
	 *
	 * @param      index   the index of the {@code byte} value.
	 * @return     the {@code byte} value at the specified index of this UTF-8 bytes.
	 * @exception  IndexOutOfBoundsException  if the {@code index}
	 *             argument is negative or not less than the length of this
	 *             UTF-8 bytes.
	 */
	public byte byteAt(int index) {
		ensureMaterialized();
		// Enforce the documented contract explicitly: without this check an out-of-range index
		// could silently read bytes that belong to the segment but not to this string.
		if (index < 0 || index >= binarySection.sizeInBytes) {
			throw new IndexOutOfBoundsException(
				"index: " + index + ", size in bytes: " + binarySection.sizeInBytes);
		}
		int globalOffset = binarySection.offset + index;
		int size = binarySection.segments[0].size();
		if (globalOffset < size) {
			return binarySection.segments[0].get(globalOffset);
		} else {
			// The segment size is taken from segments[0] and applied to every segment.
			return binarySection.segments[globalOffset / size].get(globalOffset % size);
		}
	}

	/**
	 * Get the underlying UTF-8 byte array, the returned bytes may be reused.
	 */
	public byte[] getBytes() {
		ensureMaterialized();
		return SegmentsUtil.getBytes(binarySection.segments, binarySection.offset, binarySection.sizeInBytes);
	}

	/**
	 * Compares this string with another object for equality. When both sides hold a Java
	 * {@code String} the strings are compared directly; otherwise both sides are materialized
	 * and their binary sections are compared.
	 */
	@Override
	public boolean equals(Object o) {
		if (o instanceof BinaryString) {
			BinaryString other = (BinaryString) o;
			if (javaObject != null && other.javaObject != null) {
				return javaObject.equals(other.javaObject);
			}

			ensureMaterialized();
			other.ensureMaterialized();
			return binarySection.equals(other.binarySection);
		} else {
			return false;
		}
	}

	/** Returns the hash code of the binary section, materializing it first if necessary. */
	@Override
	public int hashCode() {
		ensureMaterialized();
		return binarySection.hashCode();
	}

	/**
	 * Returns this string as a Java {@code String}. The UTF-8 bytes are decoded on the first
	 * call and the result is kept in {@code javaObject} for later calls.
	 */
	@Override
	public String toString() {
		if (javaObject == null) {
			// The scratch buffer may be larger than needed, so decode only sizeInBytes of it.
			byte[] bytes = SegmentsUtil.allocateReuseBytes(binarySection.sizeInBytes);
			SegmentsUtil.copyToBytes(binarySection.segments, binarySection.offset, bytes, 0, binarySection.sizeInBytes);
			javaObject = StringUtf8Utils.decodeUTF8(bytes, 0, binarySection.sizeInBytes);
		}
		return javaObject;
	}

	@Override
	public MemorySegment[] getSegments() {
		ensureMaterialized();
		return super.getSegments();
	}

	@Override
	public int getOffset() {
		ensureMaterialized();
		return super.getOffset();
	}

	@Override
	public int getSizeInBytes() {
		ensureMaterialized();
		return super.getSizeInBytes();
	}

	/**
	 * Makes sure the binary representation is available. A string needs no serializer, so
	 * {@code null} is passed on.
	 */
	public void ensureMaterialized() {
		ensureMaterialized(null);
	}

	/**
	 * Encodes the Java {@code String} as UTF-8 into a single, freshly wrapped segment.
	 *
	 * @param serializer must be {@code null}
	 * @throws IllegalArgumentException if a serializer is given
	 */
	@Override
	protected BinarySection materialize(TypeSerializer<String> serializer) {
		if (serializer != null) {
			throw new IllegalArgumentException("BinaryString does not support custom serializers");
		}

		byte[] bytes = StringUtf8Utils.encodeUTF8(javaObject);
		return new BinarySection(
			new MemorySegment[]{MemorySegmentFactory.wrap(bytes)},
			0,
			bytes.length
		);
	}

	/**
	 * Copy a new {@code BinaryString}. The bytes are copied into a new single segment; the
	 * Java {@code String}, if present, is carried over.
	 */
	public BinaryString copy() {
		ensureMaterialized();
		byte[] copy = SegmentsUtil.copyToBytes(binarySection.segments, binarySection.offset, binarySection.sizeInBytes);
		return new BinaryString(new MemorySegment[] {MemorySegmentFactory.wrap(copy)},
				0, binarySection.sizeInBytes, javaObject);
	}

	/**
	 * Compares two strings lexicographically by their UTF-8 bytes, treating each byte as an
	 * unsigned value. If one string is a prefix of the other, the shorter one is smaller.
	 *
	 * <p>The comparison is always done on the bytes, even when both sides hold a Java
	 * {@code String}. {@link String#compareTo(String)} orders by UTF-16 code units, which
	 * disagrees with UTF-8 byte order for supplementary characters versus characters in
	 * U+E000..U+FFFF, so mixing the two would make the order depend on which representation
	 * happens to be present.
	 *
	 * @param other the string to compare with
	 * @return a negative value, zero, or a positive value if this string is less than, equal
	 *         to, or greater than {@code other}
	 */
	@Override
	public int compareTo(@Nonnull BinaryString other) {
		ensureMaterialized();
		other.ensureMaterialized();
		if (binarySection.segments.length == 1 && other.binarySection.segments.length == 1) {

			int len = Math.min(binarySection.sizeInBytes, other.binarySection.sizeInBytes);
			MemorySegment seg1 = binarySection.segments[0];
			MemorySegment seg2 = other.binarySection.segments[0];

			for (int i = 0; i < len; i++) {
				// & 0xFF: compare bytes as unsigned values.
				int res =
					(seg1.get(binarySection.offset + i) & 0xFF) - (seg2.get(other.binarySection.offset + i) & 0xFF);
				if (res != 0) {
					return res;
				}
			}
			return binarySection.sizeInBytes - other.binarySection.sizeInBytes;
		}

		// if there are multi segments.
		return compareMultiSegments(other);
	}

	/**
	 * Find the boundaries of segments, and then compare MemorySegment.
	 *
	 * <p>Both strings are walked in lock-step. Each round compares as many bytes as fit into
	 * the current segment of both strings, then moves whichever side reached its segment end
	 * on to its next segment.
	 */
	private int compareMultiSegments(BinaryString other) {

		if (binarySection.sizeInBytes == 0 || other.binarySection.sizeInBytes == 0) {
			return binarySection.sizeInBytes - other.binarySection.sizeInBytes;
		}

		// Number of bytes still to compare: the length of the shorter string.
		int len = Math.min(binarySection.sizeInBytes, other.binarySection.sizeInBytes);

		MemorySegment seg1 = binarySection.segments[0];
		MemorySegment seg2 = other.binarySection.segments[0];

		int segmentSize = binarySection.segments[0].size();
		int otherSegmentSize = other.binarySection.segments[0].size();

		// Bytes available in the current segment, starting at the string's offset.
		int sizeOfFirst1 = segmentSize - binarySection.offset;
		int sizeOfFirst2 = otherSegmentSize - other.binarySection.offset;

		int varSegIndex1 = 1;
		int varSegIndex2 = 1;

		// find the first segment of this string.
		// (the offset may lie beyond segments[0], in which case the value above is <= 0)
		while (sizeOfFirst1 <= 0) {
			sizeOfFirst1 += segmentSize;
			seg1 = binarySection.segments[varSegIndex1++];
		}

		while (sizeOfFirst2 <= 0) {
			sizeOfFirst2 += otherSegmentSize;
			seg2 = other.binarySection.segments[varSegIndex2++];
		}

		int offset1 = segmentSize - sizeOfFirst1;
		int offset2 = otherSegmentSize - sizeOfFirst2;

		int needCompare = Math.min(Math.min(sizeOfFirst1, sizeOfFirst2), len);

		while (needCompare > 0) {
			// compare in one segment.
			for (int i = 0; i < needCompare; i++) {
				int res = (seg1.get(offset1 + i) & 0xFF) - (seg2.get(offset2 + i) & 0xFF);
				if (res != 0) {
					return res;
				}
			}
			if (needCompare == len) {
				// The whole common prefix has been compared.
				break;
			}
			len -= needCompare;
			// next segment
			if (sizeOfFirst1 < sizeOfFirst2) { //I am smaller
				seg1 = binarySection.segments[varSegIndex1++];
				offset1 = 0;
				offset2 += needCompare;
				sizeOfFirst1 = segmentSize;
				sizeOfFirst2 -= needCompare;
			} else if (sizeOfFirst1 > sizeOfFirst2) { //other is smaller
				seg2 = other.binarySection.segments[varSegIndex2++];
				offset2 = 0;
				offset1 += needCompare;
				sizeOfFirst2 = otherSegmentSize;
				sizeOfFirst1 -= needCompare;
			} else { // same, should go ahead both.
				seg1 = binarySection.segments[varSegIndex1++];
				seg2 = other.binarySection.segments[varSegIndex2++];
				offset1 = 0;
				offset2 = 0;
				sizeOfFirst1 = segmentSize;
				sizeOfFirst2 = otherSegmentSize;
			}
			needCompare = Math.min(Math.min(sizeOfFirst1, sizeOfFirst2), len);
		}

		// Sanity check: the loop must have ended through the break above.
		checkArgument(needCompare == len);

		// All common bytes are equal, so the shorter string is the smaller one.
		return binarySection.sizeInBytes - other.binarySection.sizeInBytes;
	}

	/**
	 * Returns a binary string that is a substring of this binary string. The substring begins at
	 * the specified {@code beginIndex} and extends to the character at index {@code endIndex - 1}.
	 * Both indices count characters (code points), not bytes.
	 *
	 * <p>Examples:
	 * <blockquote><pre>
	 * fromString("hamburger").substring(4, 8) returns binary string "urge"
	 * fromString("smiles").substring(1, 5) returns binary string "mile"
	 * </pre></blockquote>
	 *
	 * @param beginIndex   the beginning index, inclusive.
	 * @param endIndex     the ending index, exclusive.
	 * @return the specified substring, return EMPTY_UTF8 when index out of bounds
	 * instead of StringIndexOutOfBoundsException.
	 */
	public BinaryString substring(int beginIndex, int endIndex) {
		ensureMaterialized();
		// A string never has more characters than bytes, so a begin index at or beyond the
		// byte size is certainly out of range.
		if (endIndex <= beginIndex || beginIndex >= binarySection.sizeInBytes) {
			return EMPTY_UTF8;
		}
		if (inFirstSegment()) {
			MemorySegment segment = binarySection.segments[0];
			// i: byte position, c: character position.
			int i = 0;
			int c = 0;
			while (i < binarySection.sizeInBytes && c < beginIndex) {
				i += numBytesForFirstByte(segment.get(i + binarySection.offset));
				c += 1;
			}

			// j: byte position where the substring starts.
			int j = i;
			while (i < binarySection.sizeInBytes && c < endIndex) {
				i += numBytesForFirstByte(segment.get(i + binarySection.offset));
				c += 1;
			}

			if (i > j) {
				byte[] bytes = new byte[i - j];
				segment.get(binarySection.offset + j, bytes, 0, i - j);
				return fromBytes(bytes);
			} else {
				return EMPTY_UTF8;
			}
		} else {
			return substringMultiSegs(beginIndex, endIndex);
		}
	}

	/** Same as {@link #substring(int, int)}, but walks the bytes across segment boundaries. */
	private BinaryString substringMultiSegs(final int start, final int until) {
		int segSize = binarySection.segments[0].size();
		SegmentAndOffset index = firstSegmentAndOffset(segSize);
		int i = 0;
		int c = 0;
		while (i < binarySection.sizeInBytes && c < start) {
			int charSize = numBytesForFirstByte(index.value());
			i += charSize;
			index.skipBytes(charSize, segSize);
			c += 1;
		}

		int j = i;
		while (i < binarySection.sizeInBytes && c < until) {
			int charSize = numBytesForFirstByte(index.value());
			i += charSize;
			index.skipBytes(charSize, segSize);
			c += 1;
		}

		if (i > j) {
			return fromBytes(SegmentsUtil.copyToBytes(binarySection.segments, binarySection.offset + j, i - j));
		} else {
			return EMPTY_UTF8;
		}
	}

	/**
	 * Returns true if and only if this BinaryString contains the specified
	 * sequence of bytes values.
	 *
	 * @param s the sequence to search for
	 * @return true if this BinaryString contains {@code s}, false otherwise
	 */
	public boolean contains(final BinaryString s) {
		ensureMaterialized();
		s.ensureMaterialized();
		if (s.binarySection.sizeInBytes == 0) {
			return true;
		}
		int find = SegmentsUtil.find(
			binarySection.segments, binarySection.offset, binarySection.sizeInBytes,
			s.binarySection.segments, s.binarySection.offset, s.binarySection.sizeInBytes);
		return find != -1;
	}

	/**
	 * Tests if this BinaryString starts with the specified prefix.
	 *
	 * @param   prefix   the prefix.
	 * @return  {@code true} if the bytes represented by the argument is a prefix of the bytes
	 *          represented by this string; {@code false} otherwise. Note also that {@code true}
	 *          will be returned if the argument is an empty BinaryString or is equal to this
	 *          {@code BinaryString} object as determined by the {@link #equals(Object)} method.
	 */
	public boolean startsWith(final BinaryString prefix) {
		ensureMaterialized();
		prefix.ensureMaterialized();
		return matchAt(prefix, 0);
	}

	/**
	 * Tests if this BinaryString ends with the specified suffix.
	 *
	 * @param   suffix   the suffix.
	 * @return  {@code true} if the bytes represented by the argument is a suffix of the bytes
	 *          represented by this object; {@code false} otherwise. Note that the result will
	 *          be {@code true} if the argument is the empty string or is equal to this
	 *          {@code BinaryString} object as determined by the {@link #equals(Object)} method.
	 */
	public boolean endsWith(final BinaryString suffix) {
		ensureMaterialized();
		suffix.ensureMaterialized();
		// A suffix longer than this string gives a negative position, which matchAt rejects.
		return matchAt(suffix, binarySection.sizeInBytes - suffix.binarySection.sizeInBytes);
	}

	/**
	 * Returns a string whose value is this string, with any leading and trailing
	 * space characters (0x20) removed. Other whitespace is left untouched.
	 *
	 * @return  A new string holding this string's bytes without leading and trailing spaces,
	 *          or {@link #EMPTY_UTF8} if nothing but spaces (or nothing at all) is left.
	 */
	public BinaryString trim() {
		ensureMaterialized();
		if (inFirstSegment()) {
			int s = 0;
			int e = this.binarySection.sizeInBytes - 1;
			// skip all of the space (0x20) in the left side
			while (s < this.binarySection.sizeInBytes && getByteOneSegment(s) == 0x20) {
				s++;
			}
			// skip all of the space (0x20) in the right side
			while (e >= s && getByteOneSegment(e) == 0x20) {
				e--;
			}
			if (s > e) {
				// empty string
				return EMPTY_UTF8;
			} else {
				return copyBinaryStringInOneSeg(s, e - s + 1);
			}
		} else {
			return trimMultiSegs();
		}
	}

	/** Same as {@link #trim()}, but walks the bytes across segment boundaries. */
	private BinaryString trimMultiSegs() {
		int s = 0;
		int e = this.binarySection.sizeInBytes - 1;
		int segSize = binarySection.segments[0].size();
		SegmentAndOffset front = firstSegmentAndOffset(segSize);
		// skip all of the space (0x20) in the left side
		while (s < this.binarySection.sizeInBytes && front.value() == 0x20) {
			s++;
			front.nextByte(segSize);
		}
		SegmentAndOffset behind = lastSegmentAndOffset(segSize);
		// skip all of the space (0x20) in the right side
		while (e >= s && behind.value() == 0x20) {
			e--;
			behind.previousByte(segSize);
		}
		if (s > e) {
			// empty string
			return EMPTY_UTF8;
		} else {
			return copyBinaryString(s, e);
		}
	}

	/**
	 * Returns the index within this string of the first occurrence of the
	 * specified substring, starting at the specified index. Indices count characters
	 * (code points), not bytes.
	 *
	 * <p>Note that an empty {@code str} always yields {@code 0}, regardless of
	 * {@code fromIndex}.
	 *
	 * @param   str         the substring to search for.
	 * @param   fromIndex   the index from which to start the search.
	 * @return  the index of the first occurrence of the specified substring,
	 *          starting at the specified index,
	 *          or {@code -1} if there is no such occurrence.
	 */
	public int indexOf(BinaryString str, int fromIndex) {
		ensureMaterialized();
		str.ensureMaterialized();
		if (str.binarySection.sizeInBytes == 0) {
			return 0;
		}
		if (inFirstSegment()) {
			// position in byte
			int byteIdx = 0;
			// position is char
			int charIdx = 0;
			// Advance to the character at fromIndex.
			while (byteIdx < binarySection.sizeInBytes && charIdx < fromIndex) {
				byteIdx += numBytesForFirstByte(getByteOneSegment(byteIdx));
				charIdx++;
			}
			do {
				// Not enough bytes left for a match.
				if (byteIdx + str.binarySection.sizeInBytes > binarySection.sizeInBytes) {
					return -1;
				}
				if (SegmentsUtil.equals(binarySection.segments, binarySection.offset + byteIdx,
						str.binarySection.segments, str.binarySection.offset, str.binarySection.sizeInBytes)) {
					return charIdx;
				}
				byteIdx += numBytesForFirstByte(getByteOneSegment(byteIdx));
				charIdx++;
			} while (byteIdx < binarySection.sizeInBytes);

			return -1;
		} else {
			return indexOfMultiSegs(str, fromIndex);
		}
	}

	/** Same as {@link #indexOf(BinaryString, int)}, but walks the bytes across segment boundaries. */
	private int indexOfMultiSegs(BinaryString str, int fromIndex) {
		// position in byte
		int byteIdx = 0;
		// position is char
		int charIdx = 0;
		int segSize = binarySection.segments[0].size();
		SegmentAndOffset index = firstSegmentAndOffset(segSize);
		while (byteIdx < binarySection.sizeInBytes && charIdx < fromIndex) {
			int charBytes = numBytesForFirstByte(index.value());
			byteIdx += charBytes;
			charIdx++;
			index.skipBytes(charBytes, segSize);
		}
		do {
			if (byteIdx + str.binarySection.sizeInBytes > binarySection.sizeInBytes) {
				return -1;
			}
			if (SegmentsUtil.equals(binarySection.segments, binarySection.offset + byteIdx,
					str.binarySection.segments, str.binarySection.offset, str.binarySection.sizeInBytes)) {
				return charIdx;
			}
			int charBytes = numBytesForFirstByte(index.value());
			byteIdx += charBytes;
			charIdx++;
			index.skipBytes(charBytes, segSize);
		} while (byteIdx < binarySection.sizeInBytes);

		return -1;
	}

	/**
	 * Converts all of the characters in this {@code BinaryString} to upper case.
	 *
	 * <p>If a Java {@code String} is already present, or any byte is not a plain single-byte
	 * character, the conversion is delegated to {@link String#toUpperCase()}. Otherwise the
	 * bytes are converted one by one.
	 *
	 * @return the {@code BinaryString}, converted to uppercase.
	 */
	public BinaryString toUpperCase() {
		if (javaObject != null) {
			return javaToUpperCase();
		}
		if (binarySection.sizeInBytes == 0) {
			return EMPTY_UTF8;
		}
		int size = binarySection.segments[0].size();
		SegmentAndOffset segmentAndOffset = startSegmentAndOffset(size);
		byte[] bytes = new byte[binarySection.sizeInBytes];
		for (int i = 0; i < binarySection.sizeInBytes; i++) {
			byte b = segmentAndOffset.value();
			if (numBytesForFirstByte(b) != 1) {
				// fallback
				return javaToUpperCase();
			}
			int upper = Character.toUpperCase((int) b);
			if (upper > 127) {
				// fallback
				return javaToUpperCase();
			}
			bytes[i] = (byte) upper;
			segmentAndOffset.nextByte(size);
		}
		return fromBytes(bytes);
	}

	private BinaryString javaToUpperCase() {
		return fromString(toString().toUpperCase());
	}

	/**
	 * Converts all of the characters in this {@code BinaryString} to lower case.
	 *
	 * <p>If a Java {@code String} is already present, or any byte is not a plain single-byte
	 * character, the conversion is delegated to {@link String#toLowerCase()}. Otherwise the
	 * bytes are converted one by one.
	 *
	 * @return the {@code BinaryString}, converted to lowercase.
	 */
	public BinaryString toLowerCase() {
		if (javaObject != null) {
			return javaToLowerCase();
		}
		if (binarySection.sizeInBytes == 0) {
			return EMPTY_UTF8;
		}
		int size = binarySection.segments[0].size();
		SegmentAndOffset segmentAndOffset = startSegmentAndOffset(size);
		byte[] bytes = new byte[binarySection.sizeInBytes];
		for (int i = 0; i < binarySection.sizeInBytes; i++) {
			byte b = segmentAndOffset.value();
			if (numBytesForFirstByte(b) != 1) {
				// fallback
				return javaToLowerCase();
			}
			int lower = Character.toLowerCase((int) b);
			if (lower > 127) {
				// fallback
				return javaToLowerCase();
			}
			bytes[i] = (byte) lower;
			segmentAndOffset.nextByte(size);
		}
		return fromBytes(bytes);
	}

	private BinaryString javaToLowerCase() {
		return fromString(toString().toLowerCase());
	}

	// ------------------------------------------------------------------------------------------
	// Internal methods on BinaryString
	// ------------------------------------------------------------------------------------------

	/** Reads byte {@code i} of this string from the first segment; does not check bounds itself. */
	byte getByteOneSegment(int i) {
		return binarySection.segments[0].get(binarySection.offset + i);
	}

	/** Returns whether all bytes of this string lie within the first segment. */
	boolean inFirstSegment() {
		return binarySection.sizeInBytes + binarySection.offset <= binarySection.segments[0].size();
	}

	/** Tests whether the bytes of {@code s} occur in this string at byte position {@code pos}. */
	private boolean matchAt(final BinaryString s, int pos) {
		return (inFirstSegment() && s.inFirstSegment()) ? matchAtOneSeg(s, pos) : matchAtVarSeg(s, pos);
	}

	private boolean matchAtOneSeg(final BinaryString s, int pos) {
		return s.binarySection.sizeInBytes + pos <= binarySection.sizeInBytes && pos >= 0 &&
			binarySection.segments[0].equalTo(
				s.binarySection.segments[0],
				binarySection.offset + pos,
				s.binarySection.offset,
				s.binarySection.sizeInBytes);
	}

	private boolean matchAtVarSeg(final BinaryString s, int pos) {
		return s.binarySection.sizeInBytes + pos <= binarySection.sizeInBytes && pos >= 0 &&
			SegmentsUtil.equals(
				binarySection.segments,
				binarySection.offset + pos,
				s.binarySection.segments,
				s.binarySection.offset,
				s.binarySection.sizeInBytes);
	}

	/**
	 * Copies {@code len} bytes, starting at byte {@code start} of this string, out of the
	 * first segment into a new BinaryString.
	 */
	BinaryString copyBinaryStringInOneSeg(int start, int len) {
		byte[] newBytes = new byte[len];
		binarySection.segments[0].get(binarySection.offset + start, newBytes, 0, len);
		return fromBytes(newBytes);
	}

	/**
	 * Copies the bytes from {@code start} to {@code end}, both inclusive, into a new
	 * BinaryString. Unlike {@link #copyBinaryStringInOneSeg(int, int)} the second argument is
	 * an end position, not a length.
	 */
	BinaryString copyBinaryString(int start, int end) {
		int len = end - start + 1;
		byte[] newBytes = new byte[len];
		SegmentsUtil.copyToBytes(binarySection.segments, binarySection.offset + start, newBytes, 0, len);
		return fromBytes(newBytes);
	}

	/** Returns a cursor on the first byte of this string. */
	SegmentAndOffset firstSegmentAndOffset(int segSize) {
		int segIndex = binarySection.offset / segSize;
		return new SegmentAndOffset(segIndex, binarySection.offset % segSize);
	}

	/** Returns a cursor on the last byte of this string. */
	SegmentAndOffset lastSegmentAndOffset(int segSize) {
		int lastOffset = binarySection.offset + binarySection.sizeInBytes - 1;
		int segIndex = lastOffset / segSize;
		return new SegmentAndOffset(segIndex, lastOffset % segSize);
	}

	/**
	 * Returns a cursor on the first byte of this string, skipping the division when the
	 * string lies entirely in the first segment.
	 */
	private SegmentAndOffset startSegmentAndOffset(int segSize) {
		return inFirstSegment() ? new SegmentAndOffset(0, binarySection.offset) : firstSegmentAndOffset(segSize);
	}

	/**
	 * CurrentSegment and positionInSegment.
	 *
	 * <p>A cursor over the segments of the enclosing string. Moving past either end of the
	 * segment array sets {@code segment} to {@code null}, so {@link #value()} must not be
	 * called in that state.
	 */
	class SegmentAndOffset {
		int segIndex;
		MemorySegment segment;
		int offset;

		private SegmentAndOffset(int segIndex, int offset) {
			this.segIndex = segIndex;
			this.segment = binarySection.segments[segIndex];
			this.offset = offset;
		}

		/** Re-reads {@code segment} for the current {@code segIndex}; {@code null} if out of range. */
		private void assignSegment() {
			segment = segIndex >= 0 && segIndex < binarySection.segments.length ?
					binarySection.segments[segIndex] : null;
		}

		/** Moves one byte back, stepping to the end of the previous segment when needed. */
		void previousByte(int segSize) {
			offset--;
			if (offset == -1) {
				segIndex--;
				assignSegment();
				offset = segSize - 1;
			}
		}

		/** Moves one byte forward, stepping to the start of the next segment when needed. */
		void nextByte(int segSize) {
			offset++;
			checkAdvance(segSize);
		}

		private void checkAdvance(int segSize) {
			if (offset == segSize) {
				advance();
			}
		}

		private void advance() {
			segIndex++;
			assignSegment();
			offset = 0;
		}

		/** Moves {@code n} bytes forward, crossing as many segment boundaries as necessary. */
		void skipBytes(int n, int segSize) {
			int remaining = segSize - this.offset;
			if (remaining > n) {
				// The target lies within the current segment.
				this.offset += n;
			} else {
				while (true) {
					int toSkip = Math.min(remaining, n);
					n -= toSkip;
					if (n <= 0) {
						this.offset += toSkip;
						checkAdvance(segSize);
						return;
					}
					advance();
					remaining = segSize - this.offset;
				}
			}
		}

		/** Returns the byte at the current position. */
		byte value() {
			return this.segment.get(this.offset);
		}
	}

	/**
	 * Returns the number of bytes for a code point with the first byte as `b`.
	 * @param b The first byte of a code point
	 */
	static int numBytesForFirstByte(final byte b) {
		if (b >= 0) {
			// 1 byte, 7 bits: 0xxxxxxx
			return 1;
		} else if ((b >> 5) == -2 && (b & 0x1e) != 0) {
			// 2 bytes, 11 bits: 110xxxxx 10xxxxxx
			return 2;
		} else if ((b >> 4) == -2) {
			// 3 bytes, 16 bits: 1110xxxx 10xxxxxx 10xxxxxx
			return 3;
		} else if ((b >> 3) == -2) {
			// 4 bytes, 21 bits: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
			return 4;
		} else {
			// Skip the first byte disallowed in UTF-8
			// Handling errors quietly, same semantics to java String.
			return 1;
		}
	}
}