/*
 * Copyright (c) 2022 Snowflake Computing Inc. All rights reserved.
 */

package net.snowflake.ingest.streaming.internal;

import java.math.BigDecimal;
import java.math.BigInteger;
import java.math.RoundingMode;
import java.time.ZoneId;
import java.util.Optional;
import javax.annotation.Nullable;
import net.snowflake.ingest.utils.ErrorCode;
import net.snowflake.ingest.utils.SFException;
import net.snowflake.ingest.utils.Utils;
import org.apache.parquet.schema.PrimitiveType;

/** Parses a user column value into Parquet internal representation for buffering. */
class ParquetValueParser {

  // Parquet uses BitPacking to encode boolean, hence 1 bit per value
  public static final float BIT_ENCODING_BYTE_LEN = 1.0f / 8;
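
  // e.g. 8 boolean values bit-pack into a single byte, so each boolean contributes
  // 1.0f / 8 = 0.125 bytes to the estimated buffered size.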

  /**
   * On average parquet needs 2 bytes / 8 values for the RLE+bitpack encoded definition level.
   *
   * <p>There are two cases for how the definition level (0 for null values, 1 for non-null
   * values) is encoded:
   *
   * <ul>
   *   <li>If there are at least 8 repeated values in a row, they are run-length encoded (length +
   *       value itself), e.g. 11111111 -> 8 1.
   *   <li>If there are fewer than 8 repeated values, they are written in a group as part of a
   *       bit-packed run, e.g. 1111 -> 15. A bit-packed run ends when either 64 groups of 8
   *       values have been written or a new RLE run starts.
   * </ul>
   *
   * <p>To distinguish between an RLE and a bit-packed run, 1 extra byte is written as a header
   * when a bit-packed run starts.
   *
   * <p>For more details see ColumnWriterV1#createDLWriter and {@link
   * org.apache.parquet.column.values.rle.RunLengthBitPackingHybridEncoder#writeInt(int)}
   *
   * <p>Since we don't have nested types, the repetition level is always 0 and is not stored at
   * all by Parquet.
   */
  public static final float DEFINITION_LEVEL_ENCODING_BYTE_LEN = 2.0f / 8;
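
  // Worked example of the 2/8 estimate above: a buffered chunk of 1000 values is expected
  // to spend roughly 1000 * 2 / 8 = 250 bytes on definition levels, on top of the encoded
  // data itself. This is an amortized estimate, not an exact on-disk size.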

  // Parquet stores length in 4 bytes before the actual data bytes
  public static final int BYTE_ARRAY_LENGTH_ENCODING_BYTE_LEN = 4;

  /** Parquet internal value representation for buffering. */
  static class ParquetBufferValue {
    private final Object value;
    private final float size;

    ParquetBufferValue(Object value, float size) {
      this.value = value;
      this.size = size;
    }

    Object getValue() {
      return value;
    }

    float getSize() {
      return size;
    }
  }

  /**
   * Parses a user column value into Parquet internal representation for buffering.
   *
   * @param value column value provided by user in a row
   * @param columnMetadata column metadata
   * @param typeName Parquet primitive type name
   * @param stats column stats to update
   * @param insertRowsCurrIndex Row index corresponding to the row to parse (w.r.t. input rows in
   *     the insertRows API, and not the buffered row)
   * @return parsed value and byte size of Parquet internal representation
   */
  static ParquetBufferValue parseColumnValueToParquet(
      Object value,
      ColumnMetadata columnMetadata,
      PrimitiveType.PrimitiveTypeName typeName,
      RowBufferStats stats,
      ZoneId defaultTimezone,
      long insertRowsCurrIndex) {
    Utils.assertNotNull("Parquet column stats", stats);
    float estimatedParquetSize = 0F;
    estimatedParquetSize += DEFINITION_LEVEL_ENCODING_BYTE_LEN;
    if (value != null) {
      AbstractRowBuffer.ColumnLogicalType logicalType =
          AbstractRowBuffer.ColumnLogicalType.valueOf(columnMetadata.getLogicalType());
      AbstractRowBuffer.ColumnPhysicalType physicalType =
          AbstractRowBuffer.ColumnPhysicalType.valueOf(columnMetadata.getPhysicalType());
      switch (typeName) {
        case BOOLEAN:
          int intValue =
              DataValidationUtil.validateAndParseBoolean(
                  columnMetadata.getName(), value, insertRowsCurrIndex);
          value = intValue > 0;
          stats.addIntValue(BigInteger.valueOf(intValue));
          estimatedParquetSize += BIT_ENCODING_BYTE_LEN;
          break;
        case INT32:
          int intVal =
              getInt32Value(
                  columnMetadata.getName(),
                  value,
                  columnMetadata.getScale(),
                  Optional.ofNullable(columnMetadata.getPrecision()).orElse(0),
                  logicalType,
                  physicalType,
                  insertRowsCurrIndex);
          value = intVal;
          stats.addIntValue(BigInteger.valueOf(intVal));
          estimatedParquetSize += 4;
          break;
        case INT64:
          long longValue =
              getInt64Value(
                  columnMetadata.getName(),
                  value,
                  columnMetadata.getScale(),
                  Optional.ofNullable(columnMetadata.getPrecision()).orElse(0),
                  logicalType,
                  physicalType,
                  defaultTimezone,
                  insertRowsCurrIndex);
          value = longValue;
          stats.addIntValue(BigInteger.valueOf(longValue));
          estimatedParquetSize += 8;
          break;
        case DOUBLE:
          double doubleValue =
              DataValidationUtil.validateAndParseReal(
                  columnMetadata.getName(), value, insertRowsCurrIndex);
          value = doubleValue;
          stats.addRealValue(doubleValue);
          estimatedParquetSize += 8;
          break;
        case BINARY:
          int length = 0;
          if (logicalType == AbstractRowBuffer.ColumnLogicalType.BINARY) {
            value =
                getBinaryValueForLogicalBinary(value, stats, columnMetadata, insertRowsCurrIndex);
            length = ((byte[]) value).length;
          } else {
            String str = getBinaryValue(value, stats, columnMetadata, insertRowsCurrIndex);
            value = str;
            if (str != null) {
              length = str.getBytes().length;
            }
          }
          if (value != null) {
            estimatedParquetSize += (BYTE_ARRAY_LENGTH_ENCODING_BYTE_LEN + length);
          }
          break;
        case FIXED_LEN_BYTE_ARRAY:
          BigInteger intRep =
              getSb16Value(
                  columnMetadata.getName(),
                  value,
                  columnMetadata.getScale(),
                  Optional.ofNullable(columnMetadata.getPrecision()).orElse(0),
                  logicalType,
                  physicalType,
                  defaultTimezone,
                  insertRowsCurrIndex);
          stats.addIntValue(intRep);
          value = getSb16Bytes(intRep);
          estimatedParquetSize += 16;
          break;
        default:
          throw new SFException(ErrorCode.UNKNOWN_DATA_TYPE, logicalType, physicalType);
      }
    }

    if (value == null) {
      if (!columnMetadata.getNullable()) {
        throw new SFException(
            ErrorCode.INVALID_FORMAT_ROW,
            columnMetadata.getName(),
            "Passed null to non nullable field");
      }
      stats.incCurrentNullCount();
    }
    return new ParquetBufferValue(value, estimatedParquetSize);
  }
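
  // Illustrative size accounting (derived from the switch above, not a guaranteed on-disk
  // size): a non-null value in an INT64-backed column is estimated at 8 + 2/8 = 8.25 bytes;
  // a 5-byte UTF-8 string in a BINARY-backed column at 4 + 5 + 2/8 = 9.25 bytes
  // (length prefix + data + amortized definition level).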

  /**
   * Parses an int32 value based on Snowflake logical type.
   *
   * @param value column value provided by user in a row
   * @param scale data type scale
   * @param precision data type precision
   * @param logicalType Snowflake logical type
   * @param physicalType Snowflake physical type
   * @param insertRowsCurrIndex Used for logging the index of the row given in the insertRows API
   * @return parsed int32 value
   */
  private static int getInt32Value(
      String columnName,
      Object value,
      @Nullable Integer scale,
      Integer precision,
      AbstractRowBuffer.ColumnLogicalType logicalType,
      AbstractRowBuffer.ColumnPhysicalType physicalType,
      final long insertRowsCurrIndex) {
    int intVal;
    switch (logicalType) {
      case DATE:
        intVal = DataValidationUtil.validateAndParseDate(columnName, value, insertRowsCurrIndex);
        break;
      case TIME:
        Utils.assertNotNull("Unexpected null scale for TIME data type", scale);
        intVal =
            DataValidationUtil.validateAndParseTime(columnName, value, scale, insertRowsCurrIndex)
                .intValue();
        break;
      case FIXED:
        BigDecimal bigDecimalValue =
            DataValidationUtil.validateAndParseBigDecimal(columnName, value, insertRowsCurrIndex);
        bigDecimalValue = bigDecimalValue.setScale(scale, RoundingMode.HALF_UP);
        DataValidationUtil.checkValueInRange(
            bigDecimalValue, scale, precision, insertRowsCurrIndex);
        intVal = bigDecimalValue.intValue();
        break;
      default:
        throw new SFException(ErrorCode.UNKNOWN_DATA_TYPE, logicalType, physicalType);
    }
    return intVal;
  }

  /**
   * Parses an int64 value based on Snowflake logical type.
   *
   * @param value column value provided by user in a row
   * @param scale data type scale
   * @param precision data type precision
   * @param logicalType Snowflake logical type
   * @param physicalType Snowflake physical type
   * @return parsed int64 value
   */
  private static long getInt64Value(
      String columnName,
      Object value,
      int scale,
      int precision,
      AbstractRowBuffer.ColumnLogicalType logicalType,
      AbstractRowBuffer.ColumnPhysicalType physicalType,
      ZoneId defaultTimezone,
      final long insertRowsCurrIndex) {
    long longValue;
    switch (logicalType) {
      case TIME:
        Utils.assertNotNull("Unexpected null scale for TIME data type", scale);
        longValue =
            DataValidationUtil.validateAndParseTime(columnName, value, scale, insertRowsCurrIndex)
                .longValue();
        break;
      case TIMESTAMP_LTZ:
      case TIMESTAMP_NTZ:
        boolean trimTimezone = logicalType == AbstractRowBuffer.ColumnLogicalType.TIMESTAMP_NTZ;
        longValue =
            DataValidationUtil.validateAndParseTimestamp(
                    columnName, value, scale, defaultTimezone, trimTimezone, insertRowsCurrIndex)
                .toBinary(false)
                .longValue();
        break;
      case TIMESTAMP_TZ:
        longValue =
            DataValidationUtil.validateAndParseTimestamp(
                    columnName, value, scale, defaultTimezone, false, insertRowsCurrIndex)
                .toBinary(true)
                .longValue();
        break;
      case FIXED:
        BigDecimal bigDecimalValue =
            DataValidationUtil.validateAndParseBigDecimal(columnName, value, insertRowsCurrIndex);
        bigDecimalValue = bigDecimalValue.setScale(scale, RoundingMode.HALF_UP);
        DataValidationUtil.checkValueInRange(
            bigDecimalValue, scale, precision, insertRowsCurrIndex);
        longValue = bigDecimalValue.longValue();
        break;
      default:
        throw new SFException(ErrorCode.UNKNOWN_DATA_TYPE, logicalType, physicalType);
    }
    return longValue;
  }

  /**
   * Parses an int128 value based on Snowflake logical type.
   *
   * @param value column value provided by user in a row
   * @param scale data type scale
   * @param precision data type precision
   * @param logicalType Snowflake logical type
   * @param physicalType Snowflake physical type
   * @return parsed int128 value
   */
  private static BigInteger getSb16Value(
      String columnName,
      Object value,
      int scale,
      int precision,
      AbstractRowBuffer.ColumnLogicalType logicalType,
      AbstractRowBuffer.ColumnPhysicalType physicalType,
      ZoneId defaultTimezone,
      final long insertRowsCurrIndex) {
    switch (logicalType) {
      case TIMESTAMP_TZ:
        return DataValidationUtil.validateAndParseTimestamp(
                columnName, value, scale, defaultTimezone, false, insertRowsCurrIndex)
            .toBinary(true);
      case TIMESTAMP_LTZ:
      case TIMESTAMP_NTZ:
        boolean trimTimezone = logicalType == AbstractRowBuffer.ColumnLogicalType.TIMESTAMP_NTZ;
        return DataValidationUtil.validateAndParseTimestamp(
                columnName, value, scale, defaultTimezone, trimTimezone, insertRowsCurrIndex)
            .toBinary(false);
      case FIXED:
        BigDecimal bigDecimalValue =
            DataValidationUtil.validateAndParseBigDecimal(columnName, value, insertRowsCurrIndex);
        // explicitly match the BigDecimal input scale with the Snowflake data type scale
        bigDecimalValue = bigDecimalValue.setScale(scale, RoundingMode.HALF_UP);
        DataValidationUtil.checkValueInRange(
            bigDecimalValue, scale, precision, insertRowsCurrIndex);
        return bigDecimalValue.unscaledValue();
      default:
        throw new SFException(ErrorCode.UNKNOWN_DATA_TYPE, logicalType, physicalType);
    }
  }

  /**
   * Converts an int128 value to its byte array representation.
   *
   * @param intRep int128 value
   * @return byte array representation
   */
  static byte[] getSb16Bytes(BigInteger intRep) {
    byte[] bytes = intRep.toByteArray();
    // Sign-extend to 16 bytes: pad with 0xFF for negative values, 0x00 otherwise
    byte padByte = (byte) (bytes[0] < 0 ? -1 : 0);
    byte[] bytesBE = new byte[16];
    for (int i = 0; i < 16 - bytes.length; i++) {
      bytesBE[i] = padByte;
    }
    System.arraycopy(bytes, 0, bytesBE, 16 - bytes.length, bytes.length);
    return bytesBE;
  }
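
  // Worked example for getSb16Bytes: BigInteger.valueOf(-2).toByteArray() is the single
  // byte 0xFE; bytes[0] < 0, so the pad byte is 0xFF and the result is the big-endian
  // two's-complement value FF FF FF FF FF FF FF FF FF FF FF FF FF FF FF FE, i.e. -2.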

  /**
   * Converts an object or string to its string representation for buffering.
   *
   * @param value value to parse
   * @param stats column stats to update
   * @param columnMetadata column metadata
   * @param insertRowsCurrIndex Used for logging the index of the row given in the insertRows API
   * @return string representation
   */
  private static String getBinaryValue(
      Object value,
      RowBufferStats stats,
      ColumnMetadata columnMetadata,
      final long insertRowsCurrIndex) {
    AbstractRowBuffer.ColumnLogicalType logicalType =
        AbstractRowBuffer.ColumnLogicalType.valueOf(columnMetadata.getLogicalType());
    String str;
    if (logicalType.isObject()) {
      switch (logicalType) {
        case OBJECT:
          str =
              DataValidationUtil.validateAndParseObject(
                  columnMetadata.getName(), value, insertRowsCurrIndex);
          break;
        case VARIANT:
          str =
              DataValidationUtil.validateAndParseVariant(
                  columnMetadata.getName(), value, insertRowsCurrIndex);
          break;
        case ARRAY:
          str =
              DataValidationUtil.validateAndParseArray(
                  columnMetadata.getName(), value, insertRowsCurrIndex);
          break;
        default:
          throw new SFException(
              ErrorCode.UNKNOWN_DATA_TYPE, logicalType, columnMetadata.getPhysicalType());
      }
    } else {
      String maxLengthString = columnMetadata.getLength().toString();
      str =
          DataValidationUtil.validateAndParseString(
              columnMetadata.getName(),
              value,
              Optional.of(maxLengthString).map(Integer::parseInt),
              insertRowsCurrIndex);
      stats.addStrValue(str);
    }
    return str;
  }

  /**
   * Converts a binary value to its byte array representation.
   *
   * @param value value to parse
   * @param stats column stats to update
   * @param columnMetadata column metadata
   * @return byte array representation
   */
  private static byte[] getBinaryValueForLogicalBinary(
      Object value,
      RowBufferStats stats,
      ColumnMetadata columnMetadata,
      final long insertRowsCurrIndex) {
    String maxLengthString = columnMetadata.getByteLength().toString();
    byte[] bytes =
        DataValidationUtil.validateAndParseBinary(
            columnMetadata.getName(),
            value,
            Optional.of(maxLengthString).map(Integer::parseInt),
            insertRowsCurrIndex);
    stats.addBinaryValue(bytes);
    return bytes;
  }
}
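
// Usage sketch (hypothetical; this package-private class is normally driven by the SDK's
// internal Parquet row buffer rather than called directly -- columnMetadata and stats are
// assumed in-scope instances of ColumnMetadata and RowBufferStats):
//
//   ParquetValueParser.ParquetBufferValue parsed =
//       ParquetValueParser.parseColumnValueToParquet(
//           "hello", columnMetadata, PrimitiveType.PrimitiveTypeName.BINARY,
//           stats, ZoneOffset.UTC, 0);
//   Object bufferedValue = parsed.getValue(); // e.g. the validated String "hello"
//   float estimatedBytes = parsed.getSize();  // e.g. 4 + 5 + 2/8 = 9.25 bytes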




