org.apache.parquet.io.api.Binary Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of parquet-column Show documentation
Show all versions of parquet-column Show documentation
Parquet is a columnar storage format that supports nested data. This provides all generated metadata code.
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.parquet.io.api;
import java.io.DataOutput;
import java.io.IOException;
import java.io.ObjectStreamException;
import java.io.OutputStream;
import java.io.Serializable;
import java.io.UnsupportedEncodingException;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import org.apache.parquet.io.ParquetDecodingException;
import org.apache.parquet.io.ParquetEncodingException;
import org.apache.parquet.schema.PrimitiveComparator;
import static org.apache.parquet.bytes.BytesUtils.UTF8;
/**
 * A sequence of bytes backed by a whole {@code byte[]}, a slice of a {@code byte[]},
 * or a {@link ByteBuffer}.
 *
 * <p>Instances are obtained from the static {@code from...} factories. The "reused"
 * factories flag the instance as having backing bytes that the producer still owns
 * (see {@link #isBackingBytesReused()}); the "constant" factories do not.
 * The constructor is private, so only the nested implementations in this file
 * can extend the class.
 */
abstract public class Binary implements Comparable<Binary>, Serializable {

  /** Set by each implementation's constructor; reported by {@link #isBackingBytesReused()}. */
  protected boolean isBackingBytesReused;

  // this isn't really something others should extend
  private Binary() { }

  /** A zero-length Binary backed by an empty constant array. */
  public static final Binary EMPTY = fromConstantByteArray(new byte[0]);

  /** @return the bytes of this Binary decoded as UTF-8 */
  abstract public String toStringUsingUTF8();

  /** @return the number of bytes in this Binary */
  abstract public int length();

  /**
   * Writes the bytes of this Binary to the given stream.
   * @param out the stream to write to
   * @throws IOException if the underlying write fails
   */
  abstract public void writeTo(OutputStream out) throws IOException;

  /**
   * Writes the bytes of this Binary to the given output.
   * @param out the output to write to
   * @throws IOException if the underlying write fails
   */
  abstract public void writeTo(DataOutput out) throws IOException;

  /** @return a byte[] of exactly {@link #length()} bytes holding this Binary's content */
  abstract public byte[] getBytes();

  /**
   * Variant of getBytes() that avoids copying backing data structure by returning
   * backing byte[] of the Binary. Do not modify backing byte[] unless you know what
   * you are doing.
   * @return backing byte[] of correct size, with an offset of 0, if possible, else returns result of getBytes()
   */
  abstract public byte[] getBytesUnsafe();

  /**
   * Creates a Binary covering part of this one.
   * @param start index, relative to the start of this Binary, of the first byte of the slice
   * @param length number of bytes in the slice
   * @return a Binary over the requested range
   */
  abstract public Binary slice(int start, int length);

  // Double-dispatch helpers behind equals(Object): each implementation hands its own
  // backing storage to the other operand through one of these overloads.
  abstract boolean equals(byte[] bytes, int offset, int length);

  abstract boolean equals(ByteBuffer bytes, int offset, int length);

  abstract boolean equals(Binary other);

  /**
   * @deprecated will be removed in 2.0.0. The comparison logic depends on the related logical type therefore this one
   * might not be correct. The {@link java.util.Comparator} implementation for the related type available at
   * {@link org.apache.parquet.schema.PrimitiveType#comparator} should be used instead.
   */
  @Deprecated
  abstract public int compareTo(Binary other);

  /** @return a ByteBuffer positioned over the bytes of this Binary */
  abstract public ByteBuffer toByteBuffer();

  /**
   * Compares byte content with another Binary.
   * @param obj the object to compare with
   * @return true only if {@code obj} is a Binary holding the same bytes
   */
  @Override
  public boolean equals(Object obj) {
    // instanceof is false for null, so no separate null check is needed
    return obj instanceof Binary && equals((Binary) obj);
  }

  /** @return a debug representation listing the length, the reuse flag and every byte */
  @Override
  public String toString() {
    return "Binary{" +
        length() +
        (isBackingBytesReused ? " reused": " constant") +
        " bytes, " +
        Arrays.toString(getBytesUnsafe())
        + "}";
  }

  /**
   * Returns a Binary that is not flagged as reused.
   * @return this instance when it is not flagged as reused, otherwise a new constant
   *         Binary built from {@link #getBytes()}
   */
  public Binary copy() {
    if (isBackingBytesReused) {
      return Binary.fromConstantByteArray(getBytes());
    } else {
      return this;
    }
  }

  /**
   * Signals if backing bytes are owned, and can be modified, by producer of the Binary
   * @return if backing bytes are held on by producer of the Binary
   */
  public boolean isBackingBytesReused() {
    return isBackingBytesReused;
  }

  /** Binary over the range {@code [offset, offset + length)} of a byte array. */
  private static class ByteArraySliceBackedBinary extends Binary {
    private final byte[] value;
    private final int offset;
    private final int length;

    public ByteArraySliceBackedBinary(byte[] value, int offset, int length, boolean isBackingBytesReused) {
      this.value = value;
      this.offset = offset;
      this.length = length;
      this.isBackingBytesReused = isBackingBytesReused;
    }

    @Override
    public String toStringUsingUTF8() {
      // TODO: figure out why the following line was much slower
      // rdb: new String(...) is slower because it instantiates a new Decoder,
      // while Charset#decode uses a thread-local decoder cache
      // return new String(value, offset, length, BytesUtils.UTF8);
      return UTF8.decode(ByteBuffer.wrap(value, offset, length)).toString();
    }

    @Override
    public int length() {
      return length;
    }

    @Override
    public void writeTo(OutputStream out) throws IOException {
      out.write(value, offset, length);
    }

    @Override
    public byte[] getBytes() {
      return Arrays.copyOfRange(value, offset, offset + length);
    }

    @Override
    public byte[] getBytesUnsafe() {
      // Backing array is larger than the slice used for this Binary.
      return getBytes();
    }

    @Override
    public Binary slice(int start, int length) {
      // The new slice shares the backing array, so it keeps the same reuse flag.
      if (isBackingBytesReused) {
        return Binary.fromReusedByteArray(value, offset + start, length);
      } else {
        return Binary.fromConstantByteArray(value, offset + start, length);
      }
    }

    @Override
    public int hashCode() {
      return Binary.hashCode(value, offset, length);
    }

    @Override
    boolean equals(Binary other) {
      return other.equals(value, offset, length);
    }

    @Override
    boolean equals(byte[] other, int otherOffset, int otherLength) {
      return Binary.equals(value, offset, length, other, otherOffset, otherLength);
    }

    @Override
    boolean equals(ByteBuffer bytes, int otherOffset, int otherLength) {
      return Binary.equals(value, offset, length, bytes, otherOffset, otherLength);
    }

    @Override
    public int compareTo(Binary other) {
      return PrimitiveComparator.UNSIGNED_LEXICOGRAPHICAL_BINARY_COMPARATOR.compare(this, other);
    }

    @Override
    public ByteBuffer toByteBuffer() {
      return ByteBuffer.wrap(value, offset, length);
    }

    @Override
    public void writeTo(DataOutput out) throws IOException {
      out.write(value, offset, length);
    }
  }

  /** Binary holding the UTF-8 encoding of a String. */
  private static class FromStringBinary extends ByteBufferBackedBinary {
    public FromStringBinary(String value) {
      // reused is false, because we do not hold on to the buffer after
      // conversion, and nobody else has a handle to it
      super(encodeUTF8(value), false);
    }

    @Override
    public String toString() {
      return "Binary{\"" + toStringUsingUTF8() + "\"}";
    }

    private static ByteBuffer encodeUTF8(String value) {
      // The Charset overload cannot throw UnsupportedEncodingException.
      return ByteBuffer.wrap(value.getBytes(StandardCharsets.UTF_8));
    }
  }

  /** Binary holding the UTF-8 encoding of a CharSequence. */
  private static class FromCharSequenceBinary extends ByteBufferBackedBinary {
    public FromCharSequenceBinary(CharSequence value) {
      // reused is false, because we do not hold on to the buffer after
      // conversion, and nobody else has a handle to it
      super(encodeUTF8(value), false);
    }

    @Override
    public String toString() {
      return "Binary{\"" + toStringUsingUTF8() + "\"}";
    }

    // One encoder per thread: each thread gets its own instance from initialValue().
    private static final ThreadLocal<CharsetEncoder> ENCODER =
        new ThreadLocal<CharsetEncoder>() {
          @Override
          protected CharsetEncoder initialValue() {
            return StandardCharsets.UTF_8.newEncoder();
          }
        };

    private static ByteBuffer encodeUTF8(CharSequence value) {
      try {
        return ENCODER.get().encode(CharBuffer.wrap(value));
      } catch (CharacterCodingException e) {
        // The charset is available; the failure is in the input characters.
        throw new ParquetEncodingException("Cannot encode value as UTF-8.", e);
      }
    }
  }

  /**
   * Creates a Binary over part of an array that the producer may reuse.
   * @param value the backing array (not copied)
   * @param offset index of the first byte
   * @param length number of bytes
   * @return a Binary flagged as having reused backing bytes
   */
  public static Binary fromReusedByteArray(final byte[] value, final int offset, final int length) {
    return new ByteArraySliceBackedBinary(value, offset, length, true);
  }

  /**
   * Creates a Binary over part of an array that is not flagged as reused.
   * @param value the backing array (not copied)
   * @param offset index of the first byte
   * @param length number of bytes
   * @return a Binary flagged as constant
   */
  public static Binary fromConstantByteArray(final byte[] value, final int offset,
                                             final int length) {
    return new ByteArraySliceBackedBinary(value, offset, length, false);
  }

  /**
   * @deprecated Use {@link #fromReusedByteArray(byte[], int, int)} or
   * {@link #fromConstantByteArray(byte[], int, int)} instead
   */
  @Deprecated
  public static Binary fromByteArray(final byte[] value, final int offset, final int length) {
    return fromReusedByteArray(value, offset, length); // Assume producer intends to reuse byte[]
  }

  /** Binary over an entire byte array. */
  private static class ByteArrayBackedBinary extends Binary {
    private final byte[] value;

    public ByteArrayBackedBinary(byte[] value, boolean isBackingBytesReused) {
      this.value = value;
      this.isBackingBytesReused = isBackingBytesReused;
    }

    @Override
    public String toStringUsingUTF8() {
      return UTF8.decode(ByteBuffer.wrap(value)).toString();
    }

    @Override
    public int length() {
      return value.length;
    }

    @Override
    public void writeTo(OutputStream out) throws IOException {
      out.write(value);
    }

    @Override
    public byte[] getBytes() {
      return Arrays.copyOfRange(value, 0, value.length);
    }

    @Override
    public byte[] getBytesUnsafe() {
      // The backing array is exactly this Binary's content, so hand it out directly.
      return value;
    }

    @Override
    public Binary slice(int start, int length) {
      // The new slice shares the backing array, so it keeps the same reuse flag.
      if (isBackingBytesReused) {
        return Binary.fromReusedByteArray(value, start, length);
      } else {
        return Binary.fromConstantByteArray(value, start, length);
      }
    }

    @Override
    public int hashCode() {
      return Binary.hashCode(value, 0, value.length);
    }

    @Override
    boolean equals(Binary other) {
      return other.equals(value, 0, value.length);
    }

    @Override
    boolean equals(byte[] other, int otherOffset, int otherLength) {
      return Binary.equals(value, 0, value.length, other, otherOffset, otherLength);
    }

    @Override
    boolean equals(ByteBuffer bytes, int otherOffset, int otherLength) {
      return Binary.equals(value, 0, value.length, bytes, otherOffset, otherLength);
    }

    @Override
    public int compareTo(Binary other) {
      return PrimitiveComparator.UNSIGNED_LEXICOGRAPHICAL_BINARY_COMPARATOR.compare(this, other);
    }

    @Override
    public ByteBuffer toByteBuffer() {
      return ByteBuffer.wrap(value);
    }

    @Override
    public void writeTo(DataOutput out) throws IOException {
      out.write(value);
    }
  }

  /**
   * Creates a Binary over a whole array that the producer may reuse.
   * @param value the backing array (not copied)
   * @return a Binary flagged as having reused backing bytes
   */
  public static Binary fromReusedByteArray(final byte[] value) {
    return new ByteArrayBackedBinary(value, true);
  }

  /**
   * Creates a Binary over a whole array that is not flagged as reused.
   * @param value the backing array (not copied)
   * @return a Binary flagged as constant
   */
  public static Binary fromConstantByteArray(final byte[] value) {
    return new ByteArrayBackedBinary(value, false);
  }

  /**
   * @deprecated Use {@link #fromReusedByteArray(byte[])} or {@link #fromConstantByteArray(byte[])} instead
   */
  @Deprecated
  public static Binary fromByteArray(final byte[] value) {
    return fromReusedByteArray(value); // Assume producer intends to reuse byte[]
  }

  /**
   * Binary over the absolute range {@code [offset, offset + length)} of a ByteBuffer.
   * Serialization is customized: only the content bytes are written and, on read,
   * they are wrapped in a fresh array-backed buffer.
   */
  private static class ByteBufferBackedBinary extends Binary {
    private ByteBuffer value;
    // Copy of the content made by getBytes(); only kept when the backing bytes are not reused.
    private transient byte[] cachedBytes;
    private int offset;
    private int length;

    public ByteBufferBackedBinary(ByteBuffer value, boolean isBackingBytesReused) {
      this.value = value;
      this.offset = value.position();
      this.length = value.remaining();
      this.isBackingBytesReused = isBackingBytesReused;
    }

    public ByteBufferBackedBinary(ByteBuffer value, int offset, int length, boolean isBackingBytesReused) {
      this.value = value;
      this.offset = offset;
      this.length = length;
      this.isBackingBytesReused = isBackingBytesReused;
    }

    @Override
    public String toStringUsingUTF8() {
      if (value.hasArray()) {
        return new String(value.array(), value.arrayOffset() + offset, length, StandardCharsets.UTF_8);
      }
      // no corresponding interface to read a subset of a buffer, would have to slice it
      // which creates another ByteBuffer object or do what is done here to adjust the
      // limit/offset and set them back after
      // Both values are captured before any change so the original state is what gets
      // restored, and the restore runs even if decoding throws.
      final int savedLimit = value.limit();
      final int savedPosition = value.position();
      try {
        value.limit(offset + length);
        value.position(offset);
        return UTF8.decode(value).toString();
      } finally {
        value.limit(savedLimit);
        value.position(savedPosition);
      }
    }

    @Override
    public int length() {
      return length;
    }

    @Override
    public void writeTo(OutputStream out) throws IOException {
      if (value.hasArray()) {
        out.write(value.array(), value.arrayOffset() + offset, length);
      } else {
        out.write(getBytesUnsafe(), 0, length);
      }
    }

    @Override
    public byte[] getBytes() {
      byte[] bytes = new byte[length];

      // Temporarily narrow the buffer to this Binary's range, then put limit and
      // position back exactly as they were, even if the read throws.
      final int savedLimit = value.limit();
      final int savedPosition = value.position();
      try {
        value.limit(offset + length);
        value.position(offset);
        value.get(bytes);
      } finally {
        value.limit(savedLimit);
        value.position(savedPosition);
      }
      if (!isBackingBytesReused) { // backing buffer might change
        cachedBytes = bytes;
      }
      return bytes;
    }

    @Override
    public byte[] getBytesUnsafe() {
      return cachedBytes != null ? cachedBytes : getBytes();
    }

    @Override
    public Binary slice(int start, int length) {
      return Binary.fromConstantByteArray(getBytesUnsafe(), start, length);
    }

    @Override
    public int hashCode() {
      if (value.hasArray()) {
        return Binary.hashCode(value.array(), value.arrayOffset() + offset, length);
      } else {
        return Binary.hashCode(value, offset, length);
      }
    }

    @Override
    boolean equals(Binary other) {
      if (value.hasArray()) {
        return other.equals(value.array(), value.arrayOffset() + offset, length);
      } else {
        return other.equals(value, offset, length);
      }
    }

    @Override
    boolean equals(byte[] other, int otherOffset, int otherLength) {
      if (value.hasArray()) {
        return Binary.equals(value.array(), value.arrayOffset() + offset, length, other, otherOffset, otherLength);
      } else {
        return Binary.equals(other, otherOffset, otherLength, value, offset, length);
      }
    }

    @Override
    boolean equals(ByteBuffer otherBytes, int otherOffset, int otherLength) {
      // Compare from this Binary's own offset, not from index 0 of the backing buffer.
      return Binary.equals(value, offset, length, otherBytes, otherOffset, otherLength);
    }

    @Override
    public int compareTo(Binary other) {
      return PrimitiveComparator.UNSIGNED_LEXICOGRAPHICAL_BINARY_COMPARATOR.compare(this, other);
    }

    @Override
    public ByteBuffer toByteBuffer() {
      ByteBuffer ret = value.duplicate();
      // Limit first: setting the position first would throw when offset lies beyond
      // the duplicate's current limit.
      ret.limit(offset + length);
      ret.position(offset);
      return ret;
    }

    @Override
    public void writeTo(DataOutput out) throws IOException {
      // TODO: should not have to materialize those bytes
      out.write(getBytesUnsafe());
    }

    // Serialized form: an int byte count followed by that many content bytes.
    private void writeObject(java.io.ObjectOutputStream out) throws IOException {
      byte[] bytes = getBytesUnsafe();
      out.writeInt(bytes.length);
      out.write(bytes);
    }

    private void readObject(java.io.ObjectInputStream in) throws IOException, ClassNotFoundException {
      int length = in.readInt();
      byte[] bytes = new byte[length];
      in.readFully(bytes, 0, length);
      this.value = ByteBuffer.wrap(bytes);
      this.offset = 0;
      this.length = length;
    }

    private void readObjectNoData() throws ObjectStreamException {
      this.value = ByteBuffer.wrap(new byte[0]);
    }
  }

  /**
   * Creates a Binary over part of a buffer that the producer may reuse.
   * @param value the backing buffer (not copied)
   * @param offset absolute index in the buffer of the first byte
   * @param length number of bytes
   * @return a Binary flagged as having reused backing bytes
   */
  public static Binary fromReusedByteBuffer(final ByteBuffer value, int offset, int length) {
    return new ByteBufferBackedBinary(value, offset, length, true);
  }

  /**
   * Creates a Binary over part of a buffer that is not flagged as reused.
   * @param value the backing buffer (not copied)
   * @param offset absolute index in the buffer of the first byte
   * @param length number of bytes
   * @return a Binary flagged as constant
   */
  public static Binary fromConstantByteBuffer(final ByteBuffer value, int offset, int length) {
    return new ByteBufferBackedBinary(value, offset, length, false);
  }

  /**
   * Creates a Binary over the bytes between the buffer's current position and limit;
   * the buffer may be reused by the producer.
   * @param value the backing buffer (not copied)
   * @return a Binary flagged as having reused backing bytes
   */
  public static Binary fromReusedByteBuffer(final ByteBuffer value) {
    return new ByteBufferBackedBinary(value, true);
  }

  /**
   * Creates a Binary over the bytes between the buffer's current position and limit;
   * the Binary is not flagged as reused.
   * @param value the backing buffer (not copied)
   * @return a Binary flagged as constant
   */
  public static Binary fromConstantByteBuffer(final ByteBuffer value) {
    return new ByteBufferBackedBinary(value, false);
  }

  /**
   * @deprecated Use {@link #fromReusedByteBuffer(ByteBuffer)} or {@link #fromConstantByteBuffer(ByteBuffer)} instead
   */
  @Deprecated
  public static Binary fromByteBuffer(final ByteBuffer value) {
    return fromReusedByteBuffer(value); // Assume producer intends to reuse the ByteBuffer
  }

  /**
   * Creates a Binary holding the UTF-8 encoding of a String.
   * @param value the string to encode
   * @return a constant Binary
   */
  public static Binary fromString(String value) {
    return new FromStringBinary(value);
  }

  /**
   * Creates a Binary holding the UTF-8 encoding of a CharSequence.
   * @param value the characters to encode
   * @return a constant Binary
   */
  public static Binary fromCharSequence(CharSequence value) {
    return new FromCharSequenceBinary(value);
  }

  /**
   * Hashes a range of an array with the same recurrence as {@link Arrays#hashCode(byte[])}.
   * @see Arrays#hashCode(byte[])
   * @param array the array to read
   * @param offset index of the first byte
   * @param length number of bytes
   * @return the hash of the range
   */
  private static final int hashCode(byte[] array, int offset, int length) {
    int result = 1;
    for (int i = offset; i < offset + length; i++) {
      byte b = array[i];
      result = 31 * result + b;
    }
    return result;
  }

  /**
   * Hashes a range of a buffer, using absolute reads, with the same recurrence as the
   * byte[] overload so equal content hashes equally whatever the backing storage.
   * @param buf the buffer to read
   * @param offset absolute index of the first byte
   * @param length number of bytes
   * @return the hash of the range
   */
  private static final int hashCode(ByteBuffer buf, int offset, int length) {
    int result = 1;
    for (int i = offset; i < offset + length; i++) {
      byte b = buf.get(i);
      result = 31 * result + b;
    }
    return result;
  }

  /**
   * Compares two buffer ranges byte by byte using absolute reads.
   * @return true if both buffers are null, or both are non-null with equal lengths and equal bytes
   */
  private static final boolean equals(ByteBuffer buf1, int offset1, int length1, ByteBuffer buf2, int offset2, int length2) {
    if (buf1 == null && buf2 == null) return true;
    if (buf1 == null || buf2 == null) return false;
    if (length1 != length2) return false;
    for (int i = 0; i < length1; i++) {
      if (buf1.get(i + offset1) != buf2.get(i + offset2)) {
        return false;
      }
    }
    return true;
  }

  /**
   * Compares an array range with a buffer range byte by byte.
   * @return true if both are null, or both are non-null with equal lengths and equal bytes
   */
  private static final boolean equals(byte[] array1, int offset1, int length1, ByteBuffer buf, int offset2, int length2) {
    if (array1 == null && buf == null) return true;
    if (array1 == null || buf == null) return false;
    if (length1 != length2) return false;
    for (int i = 0; i < length1; i++) {
      if (array1[i + offset1] != buf.get(i + offset2)) {
        return false;
      }
    }
    return true;
  }

  /**
   * Compares two array ranges byte by byte.
   * @see Arrays#equals(byte[], byte[])
   * @param array1 first array
   * @param offset1 index of the first byte in {@code array1}
   * @param length1 number of bytes in the first range
   * @param array2 second array
   * @param offset2 index of the first byte in {@code array2}
   * @param length2 number of bytes in the second range
   * @return true if both are null, or both are non-null with equal lengths and equal bytes
   */
  private static final boolean equals(byte[] array1, int offset1, int length1, byte[] array2, int offset2, int length2) {
    if (array1 == null && array2 == null) return true;
    if (array1 == null || array2 == null) return false;
    if (length1 != length2) return false;
    // Same array, same start, same length: identical range, no need to scan.
    if (array1 == array2 && offset1 == offset2) return true;
    for (int i = 0; i < length1; i++) {
      if (array1[i + offset1] != array2[i + offset2]) {
        return false;
      }
    }
    return true;
  }
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy