// com.clickzetta.platform.operator.KeyEncoder (from the clickzetta-java artifact)
// The Java SDK for ClickZetta's Lakehouse.
package com.clickzetta.platform.operator;
import com.clickzetta.platform.client.PartitionSchema;
import com.clickzetta.platform.common.ColumnSchema;
import com.clickzetta.platform.common.Schema;
import com.clickzetta.platform.common.Type;
import com.clickzetta.platform.util.ByteVec;
import com.clickzetta.platform.util.Bytes; // assumed package for the Bytes helper used by encodeSignedInt
import com.clickzetta.platform.util.DateUtil;
import com.clickzetta.platform.util.DecimalUtil;
import com.clickzetta.platform.util.Pair;
import com.google.common.primitives.Ints;
import com.google.common.primitives.UnsignedLongs;
import com.sangupta.murmur.Murmur2;
import java.math.BigDecimal;
import java.math.BigInteger;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
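/**
 * Encodes and decodes primary keys and partition keys using an
 * order-preserving byte encoding: comparing two encoded keys with an
 * unsigned byte-wise comparison yields the same ordering as comparing
 * the original rows column by column.
 *
 * A minimal usage sketch; the schema construction is assumed, and only
 * methods visible in this file are used:
 *
 * <pre>{@code
 * PartialRow row = schema.newPartialRow();
 * row.addInt(0, 42); // set each primary key column
 * byte[] pk = KeyEncoder.encodePrimaryKey(row);
 * PartialRow decoded = KeyEncoder.decodePrimaryKey(schema, pk);
 * }</pre>
 */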
public class KeyEncoder {
private static final BigInteger MIN_VALUE_128 = BigInteger.valueOf(-2).pow(127);
private KeyEncoder() {
}
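/**
 * Encodes the primary key of the row by concatenating the order-preserving
 * encoding of each primary key column, in schema order.
 *
 * @throws IllegalStateException if any primary key column is unset
 */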
public static byte[] encodePrimaryKey(final PartialRow row) {
ByteVec buf = ByteVec.create();
final Schema schema = row.getSchema();
for (int columnIdx = 0; columnIdx < schema.getPrimaryKeyColumnCount(); columnIdx++) {
final boolean isLast = columnIdx + 1 == schema.getPrimaryKeyColumnCount();
encodeColumn(row, columnIdx, isLast, buf);
}
return buf.toArray();
}
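/**
 * Computes the hash bucket for the row:
 * bucket = murmur2_64(encodedHashColumns, seed) mod numBuckets,
 * where the remainder is taken on the unsigned interpretation of the
 * 64-bit hash so the result always falls in [0, numBuckets).
 */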
public static int getHashBucket(PartialRow row, PartitionSchema.HashBucketSchema hashSchema) {
ByteVec buf = ByteVec.create();
encodeColumns(row, hashSchema.getColumnIds(), buf);
long hash = Murmur2.hash64(buf.data(), buf.len(), hashSchema.getSeed());
return (int) UnsignedLongs.remainder(hash, hashSchema.getNumBuckets());
}
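/**
 * Encodes the partition key of the row: one 4-byte big-endian bucket
 * number per hash bucket schema, followed by the order-preserving
 * encoding of the range partition columns.
 */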
public static byte[] encodePartitionKey(PartialRow row, PartitionSchema partitionSchema) {
ByteVec buf = ByteVec.create();
if (!partitionSchema.getHashBucketSchemas().isEmpty()) {
for (final PartitionSchema.HashBucketSchema hashSchema : partitionSchema.getHashBucketSchemas()) {
encodeHashBucket(getHashBucket(row, hashSchema), buf);
}
}
encodeColumns(row, partitionSchema.getRangeSchema().getColumnIds(), buf);
return buf.toArray();
}
public static byte[] encodeRangePartitionKey(PartialRow row,
PartitionSchema.RangeSchema rangeSchema) {
ByteVec buf = ByteVec.create();
encodeColumns(row, rangeSchema.getColumnIds(), buf);
return buf.toArray();
}
private static void encodeColumns(PartialRow row, List<Integer> columnIds, ByteVec buf) {
for (int i = 0; i < columnIds.size(); i++) {
boolean isLast = i + 1 == columnIds.size();
encodeColumn(row, row.getSchema().getColumnIndex(columnIds.get(i)), isLast, buf);
}
}
private static void encodeColumn(PartialRow row,
int columnIdx,
boolean isLast,
ByteVec buf) {
final Schema schema = row.getSchema();
final ColumnSchema column = schema.getColumnByIndex(columnIdx);
if (!row.isSet(columnIdx)) {
throw new IllegalStateException(String.format("Primary key column %s is not set",
column.getName()));
}
final Type type = column.getType();
if (type == Type.STRING || type == Type.BINARY ||
type == Type.VARCHAR) {
encodeBinary(row.getVarLengthData().get(columnIdx), isLast, buf);
} else {
encodeSignedInt(row.getRowAlloc(),
schema.getColumnOffset(columnIdx),
column.getTypeSize(),
buf);
}
}
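/**
 * Encodes a variable-length value so that encoded keys still sort in
 * byte-wise lexicographic order. Embedded 0x00 bytes are escaped as
 * 0x00 0x01, and non-terminal columns are terminated with 0x00 0x00,
 * which sorts before any escaped byte sequence. For example, the bytes
 * {0x61, 0x00, 0x62} in a non-last column encode to
 * 0x61 0x00 0x01 0x62 0x00 0x00.
 */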
private static void encodeBinary(ByteBuffer value, boolean isLast, ByteVec buf) {
value.reset();
while (value.hasRemaining()) {
byte currentByte = value.get();
buf.push(currentByte);
if (!isLast && currentByte == 0x00) {
buf.push((byte) 0x01);
}
}
if (!isLast) {
buf.push((byte) 0x00);
buf.push((byte) 0x00);
}
}
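/**
 * Encodes a signed integer from the row allocation (stored little-endian)
 * into big-endian byte order with the sign bit flipped, so that unsigned
 * byte comparison of the output matches signed numeric order. For example,
 * the INT32 value 5 (little-endian 05 00 00 00) encodes to 80 00 00 05.
 */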
private static void encodeSignedInt(byte[] value,
int offset,
int len,
ByteVec buf) {
byte lastByte = value[offset + (len - 1)];
lastByte = Bytes.xorLeftMostBit(lastByte);
buf.push(lastByte);
for (int i = len - 2; i >= 0; i--) {
buf.push(value[offset + i]);
}
}
public static void encodeHashBucket(int bucket, ByteVec buf) {
buf.append(Ints.toByteArray(bucket));
}
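/**
 * Decodes a primary key produced by {@link #encodePrimaryKey}.
 *
 * @throws IllegalArgumentException if bytes remain after all primary key
 *         columns have been decoded
 */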
public static PartialRow decodePrimaryKey(Schema schema, byte[] key) {
PartialRow row = schema.newPartialRow();
ByteBuffer buf = ByteBuffer.wrap(key);
buf.order(ByteOrder.BIG_ENDIAN);
for (int idx = 0; idx < schema.getPrimaryKeyColumnCount(); idx++) {
decodeColumn(buf, row, idx, idx + 1 == schema.getPrimaryKeyColumnCount());
}
if (buf.hasRemaining()) {
throw new IllegalArgumentException("Unable to decode all primary key bytes");
}
return row;
}
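/**
 * Decodes a partition key into its hash buckets and range row. Keys may be
 * truncated: missing hash bucket components default to bucket 0, and missing
 * range components are set to their minimum values.
 */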
public static Pair<List<Integer>, PartialRow> decodePartitionKey(Schema schema,
PartitionSchema partitionSchema,
byte[] key) {
ByteBuffer buf = ByteBuffer.wrap(key);
buf.order(ByteOrder.BIG_ENDIAN);
List<Integer> buckets = new ArrayList<>();
for (int i = 0; i < partitionSchema.getHashBucketSchemas().size(); i++) {
if (buf.hasRemaining()) {
buckets.add(buf.getInt());
} else {
buckets.add(0);
}
}
return new Pair<>(buckets, decodeRangePartitionKey(schema, partitionSchema, buf));
}
public static PartialRow decodeRangePartitionKey(Schema schema,
PartitionSchema partitionSchema,
byte[] key) {
ByteBuffer buf = ByteBuffer.wrap(key);
buf.order(ByteOrder.BIG_ENDIAN);
return decodeRangePartitionKey(schema, partitionSchema, buf);
}
private static PartialRow decodeRangePartitionKey(Schema schema,
PartitionSchema partitionSchema,
ByteBuffer buf) {
PartialRow row = schema.newPartialRow();
Iterator<Integer> rangeIds = partitionSchema.getRangeSchema().getColumnIds().iterator();
while (rangeIds.hasNext()) {
int idx = schema.getColumnIndex(rangeIds.next());
if (buf.hasRemaining()) {
decodeColumn(buf, row, idx, !rangeIds.hasNext());
} else {
row.setMin(idx);
}
}
if (buf.hasRemaining()) {
throw new IllegalArgumentException("Unable to decode all partition key bytes");
}
return row;
}
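/**
 * Decodes a single key column. Fixed-size integer types undo the encoder's
 * sign-bit flip by XOR-ing with the type's MIN_VALUE; DECIMAL values are
 * decoded according to their storage size (32, 64, or 128 bits).
 */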
private static void decodeColumn(ByteBuffer buf, PartialRow row, int idx, boolean isLast) {
Schema schema = row.getSchema();
ColumnSchema column = schema.getColumnByIndex(idx);
switch (column.getType()) {
case INT8:
row.addByte(idx, (byte) (buf.get() ^ Byte.MIN_VALUE));
break;
case INT16:
row.addShort(idx, (short) (buf.getShort() ^ Short.MIN_VALUE));
break;
case DATE: {
int days = buf.getInt() ^ Integer.MIN_VALUE;
row.addDate(idx, DateUtil.epochDaysToSqlDate(days));
break;
}
case INT32:
row.addInt(idx, buf.getInt() ^ Integer.MIN_VALUE);
break;
case INT64:
case UNIXTIME_MICROS:
row.addLong(idx, buf.getLong() ^ Long.MIN_VALUE);
break;
case BINARY: {
byte[] binary = decodeBinaryColumn(buf, isLast);
row.addBinary(idx, binary);
break;
}
case VARCHAR: {
byte[] binary = decodeBinaryColumn(buf, isLast);
row.addVarchar(idx, new String(binary, StandardCharsets.UTF_8));
break;
}
case STRING: {
byte[] binary = decodeBinaryColumn(buf, isLast);
row.addStringUtf8(idx, binary);
break;
}
case DECIMAL: {
int scale = column.getTypeAttributes().getScale();
int size = column.getTypeSize();
switch (size) {
case DecimalUtil.DECIMAL32_SIZE:
int intVal = buf.getInt() ^ Integer.MIN_VALUE;
row.addDecimal(idx, BigDecimal.valueOf(intVal, scale));
break;
case DecimalUtil.DECIMAL64_SIZE:
long longVal = buf.getLong() ^ Long.MIN_VALUE;
row.addDecimal(idx, BigDecimal.valueOf(longVal, scale));
break;
case DecimalUtil.DECIMAL128_SIZE:
byte[] bytes = new byte[size];
buf.get(bytes);
BigInteger bigIntVal = new BigInteger(bytes).xor(MIN_VALUE_128);
row.addDecimal(idx, new BigDecimal(bigIntVal, scale));
break;
default:
throw new IllegalArgumentException("Unsupported decimal type size: " + size);
}
break;
}
default:
throw new IllegalArgumentException(String.format(
"The column type %s is not a valid key component type",
schema.getColumnByIndex(idx).getType()));
}
}
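/**
 * Decodes a variable-length column, inverting {@link #encodeBinary}: the
 * last column consumes all remaining bytes verbatim, while earlier columns
 * scan for the 0x00 0x00 terminator and unescape each 0x00 0x01 pair back
 * to a single 0x00 byte.
 */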
private static byte[] decodeBinaryColumn(ByteBuffer key, boolean isLast) {
if (isLast) {
byte[] bytes = Arrays.copyOfRange(key.array(),
key.arrayOffset() + key.position(),
key.arrayOffset() + key.limit());
key.position(key.limit());
return bytes;
}
ByteVec buf = ByteVec.withCapacity(key.remaining());
for (int i = key.position(); i < key.limit(); i++) {
if (key.get(i) == 0) {
switch (key.get(i + 1)) {
case 0: {
buf.append(key.array(),
key.arrayOffset() + key.position(),
i - key.position());
key.position(i + 2);
return buf.toArray();
}
case 1: {
buf.append(key.array(),
key.arrayOffset() + key.position(),
i + 1 - key.position());
i++;
key.position(i + 1);
break;
}
default: throw new IllegalArgumentException("Unexpected binary sequence");
}
}
}
buf.append(key.array(),
key.arrayOffset() + key.position(),
key.remaining());
key.position(key.limit());
return buf.toArray();
}
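/**
 * Renders a partition key range for debugging. Range bounds are printed as
 * a half-open interval [lower, upper); an unbounded side is shown as the
 * {@code <start>} or {@code <end>} placeholder.
 */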
public static String formatPartitionKeyRange(Schema schema,
PartitionSchema partitionSchema,
byte[] lowerBound,
byte[] upperBound) {
if (partitionSchema.getRangeSchema().getColumnIds().isEmpty() &&
partitionSchema.getHashBucketSchemas().isEmpty()) {
assert lowerBound.length == 0 && upperBound.length == 0;
return "";
}
Pair<List<Integer>, PartialRow> lower = decodePartitionKey(schema, partitionSchema, lowerBound);
Pair<List<Integer>, PartialRow> upper = decodePartitionKey(schema, partitionSchema, upperBound);
StringBuilder sb = new StringBuilder();
List<Integer> hashBuckets = lower.getFirst();
if (!hashBuckets.isEmpty()) {
sb.append("hash-partition-buckets: ");
sb.append(hashBuckets);
}
if (!partitionSchema.getRangeSchema().getColumnIds().isEmpty()) {
if (!hashBuckets.isEmpty()) {
sb.append(", ");
}
List<Integer> idxs = new ArrayList<>();
for (int id : partitionSchema.getRangeSchema().getColumnIds()) {
idxs.add(schema.getColumnIndex(id));
}
sb.append("range-partition: [");
if (lowerBound.length > 4 * hashBuckets.size()) {
sb.append('(');
lower.getSecond().appendDebugString(idxs, sb);
sb.append(')');
} else {
sb.append("");
}
sb.append(", ");
if (upperBound.length > 4 * hashBuckets.size()) {
sb.append('(');
upper.getSecond().appendDebugString(idxs, sb);
sb.append(')');
} else {
sb.append("");
}
sb.append(')');
}
return sb.toString();
}
}