
package water.parser.parquet;
import org.apache.parquet.column.Dictionary;
import org.apache.parquet.io.api.Binary;
import org.apache.parquet.io.api.Converter;
import org.apache.parquet.io.api.GroupConverter;
import org.apache.parquet.io.api.PrimitiveConverter;
import org.apache.parquet.schema.*;
import water.fvec.Vec;
import water.parser.BufferedString;
import water.parser.parquet.ext.DecimalUtils;
import water.util.StringUtils;
import java.time.Instant;
/**
 * Implementation of Parquet's GroupConverter for H2O's chunks.
 *
 * ChunkConverter is responsible for converting Parquet data into chunks. As opposed to regular
 * Parquet converters, this converter doesn't actually produce any records; instead it writes the
 * data to chunks using a provided ParseWriter. The (artificial) output of the converter is the
 * number of the record that was written to the chunk.
 *
 * Note: It is meant to be used as a root converter.
 */
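// For orientation, a minimal sketch of how parquet-mr's record materialization is
// expected to drive this root converter (the call sequence is an assumption based on
// the parquet-mr Converter API, not something this file controls):
//
//   ChunkConverter root = new ChunkConverter(schema, types, writer, keepColumns);
//   root.start();                                    // a new record begins
//   root.getConverter(0).asPrimitiveConverter()
//       .addLong(42L);                               // column readers push raw values
//   root.end();                                      // the record is flushed via WriterDelegate
//   long recordIdx = root.getCurrentRecordIdx();     // the converter's "artificial" output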
class ChunkConverter extends GroupConverter {

  private final WriterDelegate _writer; // this guy actually performs the writing.
  private final Converter[] _converters;

  private long _currentRecordIdx = -1;

  ChunkConverter(MessageType parquetSchema, byte[] chunkSchema, WriterDelegate writer, boolean[] keepColumns) {
    _writer = writer;
    int colIdx = 0; // index to columns actually parsed
    _converters = new Converter[chunkSchema.length];
    int trueColumnIndex = 0; // counts all columns, including the skipped ones
    for (Type parquetField : parquetSchema.getFields()) {
      assert parquetField.isPrimitive();
      if (keepColumns[trueColumnIndex]) {
        _converters[trueColumnIndex] = newConverter(colIdx, chunkSchema[trueColumnIndex], parquetField.asPrimitiveType());
        colIdx++;
      } else {
        _converters[trueColumnIndex] = nullConverter(chunkSchema[trueColumnIndex], parquetField.asPrimitiveType());
      }
      trueColumnIndex++;
    }
  }
  @Override
  public Converter getConverter(int fieldIndex) {
    return _converters[fieldIndex];
  }

  @Override
  public void start() {
    _currentRecordIdx++;
    _writer.startLine();
  }

  @Override
  public void end() {
    _writer.endLine();
  }

  long getCurrentRecordIdx() {
    return _currentRecordIdx;
  }
  private PrimitiveConverter nullConverter(byte vecType, PrimitiveType parquetType) {
    switch (vecType) {
      case Vec.T_BAD:
      case Vec.T_CAT:
      case Vec.T_STR:
      case Vec.T_UUID:
      case Vec.T_TIME:
      case Vec.T_NUM:
        // regardless of the vec type, all values of a skipped column are discarded
        boolean dictSupport = parquetType.getOriginalType() == OriginalType.UTF8 || parquetType.getOriginalType() == OriginalType.ENUM;
        return new NullStringConverter(dictSupport);
      default:
        throw new UnsupportedOperationException("Unsupported type " + vecType);
    }
  }
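  // Columns dropped via keepColumns still appear in the Parquet read schema, so the
  // column readers keep pushing values for them. NullStringConverter acts as a sink
  // that accepts every primitive type and discards it (parquet-mr requires a converter
  // for each leaf of the schema being read, so the slot cannot simply be left null).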
  private static class NullStringConverter extends PrimitiveConverter {
    private final boolean _dictionarySupport;

    NullStringConverter(boolean dictionarySupport) {
      _dictionarySupport = dictionarySupport;
    }

    @Override
    public void addBinary(Binary value) { }

    @Override
    public boolean hasDictionarySupport() {
      return _dictionarySupport;
    }

    @Override
    public void setDictionary(Dictionary dictionary) { }

    @Override
    public void addValueFromDictionary(int dictionaryId) { }

    @Override
    public void addBoolean(boolean value) { }

    @Override
    public void addDouble(double value) { }

    @Override
    public void addFloat(float value) { }

    @Override
    public void addInt(int value) { }

    @Override
    public void addLong(long value) { }
  }
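  // Note that even the null converter reports dictionary support for UTF8/ENUM columns,
  // presumably so that parquet-mr hands over bare dictionary ids (which are then ignored)
  // instead of materializing every binary value of a column that is being skipped anyway.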
  private PrimitiveConverter newConverter(int colIdx, byte vecType, PrimitiveType parquetType) {
    switch (vecType) {
      case Vec.T_BAD:
      case Vec.T_CAT:
      case Vec.T_STR:
        if (parquetType.getPrimitiveTypeName().equals(PrimitiveType.PrimitiveTypeName.BOOLEAN)) {
          return new BooleanConverter(_writer, colIdx);
        }
        // intentional fall-through: non-boolean categorical/string columns share the logic below
      case Vec.T_UUID:
      case Vec.T_TIME:
        if (OriginalType.TIMESTAMP_MILLIS.equals(parquetType.getOriginalType()) || parquetType.getPrimitiveTypeName().equals(PrimitiveType.PrimitiveTypeName.INT96)) {
          return new TimestampConverter(colIdx, _writer);
        } else if (OriginalType.DATE.equals(parquetType.getOriginalType()) || parquetType.getPrimitiveTypeName().equals(PrimitiveType.PrimitiveTypeName.INT32)) {
          return new DateConverter(colIdx, _writer);
        } else {
          boolean dictSupport = parquetType.getOriginalType() == OriginalType.UTF8 || parquetType.getOriginalType() == OriginalType.ENUM;
          return new StringConverter(_writer, colIdx, dictSupport);
        }
      case Vec.T_NUM:
        if (OriginalType.DECIMAL.equals(parquetType.getOriginalType()))
          return new DecimalConverter(colIdx, parquetType.getDecimalMetadata(), _writer);
        else
          return new NumberConverter(colIdx, _writer);
      default:
        throw new UnsupportedOperationException("Unsupported type " + vecType);
    }
  }
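  // Summary of the dispatch above, for quick reference (derived directly from the switch):
  //   T_BAD/T_CAT/T_STR backed by BOOLEAN   -> BooleanConverter ("True"/"False" strings)
  //   T_BAD/T_CAT/T_STR/T_UUID/T_TIME       -> Timestamp/Date/String converter, by Parquet type
  //   T_NUM with DECIMAL annotation         -> DecimalConverter
  //   T_NUM otherwise                       -> NumberConverter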
  private static class BooleanConverter extends PrimitiveConverter {
    private final BufferedString TRUE = new BufferedString("True"); // note: this cannot be static - some BS ops are not thread safe!
    private final BufferedString FALSE = new BufferedString("False");

    private final int _colIdx;
    private final WriterDelegate _writer;

    BooleanConverter(WriterDelegate writer, int colIdx) {
      _colIdx = colIdx;
      _writer = writer;
    }

    @Override
    public void addBoolean(boolean value) {
      BufferedString bsValue = value ? TRUE : FALSE;
      _writer.addStrCol(_colIdx, bsValue);
    }
  }
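  // Writing "True"/"False" strings makes a Parquet BOOLEAN column parse as a two-level
  // categorical rather than a numeric 0/1 column, matching the T_CAT/T_STR cases that
  // dispatch to BooleanConverter in newConverter (rationale inferred from the dispatch).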
  private static class StringConverter extends PrimitiveConverter {
    private final BufferedString _bs = new BufferedString();
    private final int _colIdx;
    private final WriterDelegate _writer;
    private final boolean _dictionarySupport;
    private String[] _dict;

    StringConverter(WriterDelegate writer, int colIdx, boolean dictionarySupport) {
      _colIdx = colIdx;
      _writer = writer;
      _dictionarySupport = dictionarySupport;
    }

    @Override
    public void addBinary(Binary value) {
      writeStrCol(StringUtils.bytesOf(value.toStringUsingUTF8()));
    }

    @Override
    public boolean hasDictionarySupport() {
      return _dictionarySupport;
    }

    @Override
    public void setDictionary(Dictionary dictionary) {
      _dict = new String[dictionary.getMaxId() + 1];
      for (int i = 0; i <= dictionary.getMaxId(); i++) {
        _dict[i] = dictionary.decodeToBinary(i).toStringUsingUTF8();
      }
    }

    @Override
    public void addValueFromDictionary(int dictionaryId) {
      writeStrCol(StringUtils.bytesOf(_dict[dictionaryId]));
    }

    private void writeStrCol(byte[] data) {
      _bs.set(data);
      _writer.addStrCol(_colIdx, _bs);
    }
  }
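  // setDictionary eagerly decodes the whole Parquet dictionary page into a String[] so
  // that addValueFromDictionary is a plain array lookup: for a column whose dictionary
  // holds ["low", "high"], addValueFromDictionary(1) writes the string "high".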
  private static class NumberConverter extends PrimitiveConverter {
    private final int _colIdx;
    private final WriterDelegate _writer;
    private final BufferedString _bs = new BufferedString();

    NumberConverter(int colIdx, WriterDelegate writer) {
      _colIdx = colIdx;
      _writer = writer;
    }

    @Override
    public void addBoolean(boolean value) {
      _writer.addNumCol(_colIdx, value ? 1 : 0);
    }

    @Override
    public void addDouble(double value) {
      _writer.addNumCol(_colIdx, value);
    }

    @Override
    public void addFloat(float value) {
      _writer.addNumCol(_colIdx, value);
    }

    @Override
    public void addInt(int value) {
      _writer.addNumCol(_colIdx, value, 0);
    }

    @Override
    public void addLong(long value) {
      _writer.addNumCol(_colIdx, value, 0);
    }

    @Override
    public void addBinary(Binary value) {
      _bs.set(StringUtils.bytesOf(value.toStringUsingUTF8()));
      _writer.addStrCol(_colIdx, _bs);
    }
  }
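  // A note on the two-argument addNumCol calls: judging by their use here and in
  // DecimalConverter, the (value, exponent) overload encodes value * 10^exponent, so
  // addInt/addLong write integers exactly instead of going through a lossy double
  // conversion (interpretation of WriterDelegate's contract, inferred from this file).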
  private static class DecimalConverter extends PrimitiveConverter {
    private final int _colIdx;
    private final WriterDelegate _writer;
    private final int _precision;
    private final int _scale;

    DecimalConverter(int colIdx, DecimalMetadata dm, WriterDelegate writer) {
      _colIdx = colIdx;
      _precision = dm.getPrecision();
      _scale = dm.getScale();
      _writer = writer;
    }

    @Override
    public void addBoolean(boolean value) {
      throw new UnsupportedOperationException("Boolean type is not supported by DecimalConverter");
    }

    @Override
    public void addDouble(double value) {
      throw new UnsupportedOperationException("Double type is not supported by DecimalConverter");
    }

    @Override
    public void addFloat(float value) {
      throw new UnsupportedOperationException("Float type is not supported by DecimalConverter");
    }

    @Override
    public void addInt(int value) {
      _writer.addNumCol(_colIdx, value, -_scale);
    }

    @Override
    public void addLong(long value) {
      _writer.addNumCol(_colIdx, value, -_scale);
    }

    @Override
    public void addBinary(Binary value) {
      _writer.addNumCol(_colIdx, DecimalUtils.binaryToDecimal(value, _precision, _scale).doubleValue());
    }
  }
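  // Worked example (using the mantissa/exponent reading of addNumCol above): for a column
  // declared DECIMAL(9,2) and stored as INT32, the value 123.45 arrives as the unscaled
  // int 12345, and addInt writes (12345, -2), i.e. 12345 * 10^-2 = 123.45 exactly.
  // Binary-backed decimals instead go through DecimalUtils and are narrowed to double.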
  private static class TimestampConverter extends PrimitiveConverter {
    private final int _colIdx;
    private final WriterDelegate _writer;

    TimestampConverter(int colIdx, WriterDelegate writer) {
      _colIdx = colIdx;
      _writer = writer;
    }

    @Override
    public void addLong(long value) {
      // INT64 columns annotated TIMESTAMP_MILLIS already hold milliseconds since the epoch
      _writer.addNumCol(_colIdx, value, 0);
    }

    @Override
    public void addBinary(Binary value) {
      // legacy INT96 timestamps (e.g. written by Impala/Hive) arrive as 12-byte binaries
      final long timestampMillis = ParquetInt96TimestampConverter.getTimestampMillis(value);
      _writer.addNumCol(_colIdx, timestampMillis);
    }
  }
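  // Worked example for DateConverter below: Parquet DATE values are stored as the number
  // of days since the Unix epoch, so addInt(1) writes EPOCH_MILLIS + 1 * 86_400_000 =
  // 86,400,000 ms, i.e. 1970-01-02T00:00:00Z. The multiplication is done in long
  // arithmetic (MILLIS_IN_A_DAY is a long), so large day counts cannot overflow int.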
  private static class DateConverter extends PrimitiveConverter {
    private final static long EPOCH_MILLIS = Instant.EPOCH.toEpochMilli();
    private final static long MILLIS_IN_A_DAY = 24 * 60 * 60 * 1000;

    private final int _colIdx;
    private final WriterDelegate _writer;

    DateConverter(int colIdx, WriterDelegate writer) {
      _colIdx = colIdx;
      _writer = writer;
    }

    @Override
    public void addInt(int numberOfDaysFromUnixEpoch) {
      final long parquetDateEpochMillis = EPOCH_MILLIS + numberOfDaysFromUnixEpoch * MILLIS_IN_A_DAY;
      _writer.addNumCol(_colIdx, parquetDateEpochMillis);
    }
  }
}