/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package parquet.thrift;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.thrift.TException;
import org.apache.thrift.protocol.TField;
import org.apache.thrift.protocol.TList;
import org.apache.thrift.protocol.TMap;
import org.apache.thrift.protocol.TProtocol;
import org.apache.thrift.protocol.TSet;
import org.apache.thrift.protocol.TStruct;
import org.apache.thrift.protocol.TType;
import parquet.io.ParquetDecodingException;
import parquet.io.api.Binary;
import parquet.io.api.Converter;
import parquet.io.api.GroupConverter;
import parquet.io.api.PrimitiveConverter;
import parquet.io.api.RecordMaterializer;
import parquet.schema.GroupType;
import parquet.schema.MessageType;
import parquet.schema.Type;
import parquet.thrift.projection.amend.ProtocolEventsAmender;
import parquet.thrift.struct.ThriftField;
import parquet.thrift.struct.ThriftField.Requirement;
import parquet.thrift.struct.ThriftType;
import parquet.thrift.struct.ThriftType.EnumType;
import parquet.thrift.struct.ThriftType.EnumValue;
import parquet.thrift.struct.ThriftType.ListType;
import parquet.thrift.struct.ThriftType.MapType;
import parquet.thrift.struct.ThriftType.SetType;
import parquet.thrift.struct.ThriftType.StructType;
import parquet.thrift.struct.ThriftTypeID;
/**
* converts the columnar events into a Thrift protocol.
*
* @author Julien Le Dem
*
* @param <T> the type of the Thrift object materialized by this converter
*/
public class ThriftRecordConverter<T> extends RecordMaterializer<T> {
final ParquetProtocol readFieldEnd = new ParquetProtocol("readFieldEnd()") {
@Override
public void readFieldEnd() throws TException {
}
};
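// Overall mechanism: the converters below do not build the Thrift object directly.
// Each converter appends ParquetProtocol "events" (readFieldBegin, readI32,
// readStructEnd, ...) to a list as columnar values arrive; getCurrentRecord()
// then replays the buffered events through the ParquetReadProtocol so that the
// ThriftReader can deserialize the record as if reading a regular Thrift stream.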
private final StructType thriftType;
/**
* Handles the creation of field begin/end events by wrapping the converter for the actual type
*
* @author Julien Le Dem
*
*/
class PrimitiveFieldHandler extends PrimitiveConverter {
private final PrimitiveConverter delegate;
private final List<TProtocol> events;
private final ParquetProtocol readFieldBegin;
private void startField() {
events.add(readFieldBegin);
}
private void endField() {
events.add(readFieldEnd);
}
public PrimitiveFieldHandler(PrimitiveConverter delegate, final ThriftField field, List<TProtocol> events) {
this.delegate = delegate;
this.events = events;
final byte thriftType =
field.getType().getType() == ThriftTypeID.ENUM ?
ThriftTypeID.I32.getThriftType() : // enums are serialized as I32
field.getType().getType().getThriftType();
this.readFieldBegin = new ParquetProtocol("readFieldBegin()") {
@Override
public TField readFieldBegin() throws TException {
return new TField(field.getName(), thriftType, field.getFieldId());
}
};
}
@Override
public void addBinary(Binary value) {
startField();
delegate.addBinary(value);
endField();
}
@Override
public void addBoolean(boolean value) {
startField();
delegate.addBoolean(value);
endField();
}
@Override
public void addDouble(double value) {
startField();
delegate.addDouble(value);
endField();
}
@Override
public void addFloat(float value) {
startField();
delegate.addFloat(value);
endField();
}
@Override
public void addInt(int value) {
startField();
delegate.addInt(value);
endField();
}
@Override
public void addLong(long value) {
startField();
delegate.addLong(value);
endField();
}
}
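// PrimitiveFieldHandler and GroupFieldhandler only add the readFieldBegin/readFieldEnd
// framing around a struct field; the wrapped delegate converter is responsible for
// emitting the value event itself.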
/**
* Handles the creation of field begin/end events by wrapping the converter for the actual type
*
* @author Julien Le Dem
*
*/
class GroupFieldhandler extends GroupConverter {
private final GroupConverter delegate;
private final List<TProtocol> events;
private final ParquetProtocol readFieldBegin;
public GroupFieldhandler(GroupConverter delegate, final ThriftField field, List<TProtocol> events) {
this.delegate = delegate;
this.events = events;
this.readFieldBegin = new ParquetProtocol("readFieldBegin()") {
@Override
public TField readFieldBegin() throws TException {
return new TField(field.getName(), field.getType().getType().getThriftType(), field.getFieldId());
}
};
}
@Override
public Converter getConverter(int fieldIndex) {
return delegate.getConverter(fieldIndex);
}
@Override
public void start() {
events.add(readFieldBegin);
delegate.start();
}
@Override
public void end() {
delegate.end();
events.add(readFieldEnd);
}
}
interface Counter {
void startCounting();
int getCount();
}
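// Thrift's TList, TSet and TMap "begin" events carry the element count up front,
// but that count is only known once all columnar values for the collection have
// been delivered. The Counter wrappers below therefore count elements while the
// child events are buffered, and the collection converters emit
// readListBegin/readSetBegin/readMapBegin with the final count in end().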
/**
* counts the instances added, so that List/Set/Map converters can report the element count in the protocol
*
* @author Julien Le Dem
*
*/
class GroupCounter extends GroupConverter implements Counter {
private final GroupConverter delegate;
private int count;
public GroupCounter(GroupConverter delegate) {
this.delegate = delegate;
}
@Override
public Converter getConverter(int fieldIndex) {
return delegate.getConverter(fieldIndex);
}
@Override
public void start() {
delegate.start();
}
@Override
public void end() {
delegate.end();
++ count;
}
@Override
public void startCounting() {
count = 0;
}
@Override
public int getCount() {
return count;
}
}
/**
* counts the instances added, so that List/Set/Map converters can report the element count in the protocol
*
* @author Julien Le Dem
*
*/
class PrimitiveCounter extends PrimitiveConverter implements Counter {
private final PrimitiveConverter delegate;
private int count;
public PrimitiveCounter(PrimitiveConverter delegate) {
this.delegate = delegate;
}
@Override
public void addBinary(Binary value) {
delegate.addBinary(value);
++ count;
}
@Override
public void addBoolean(boolean value) {
delegate.addBoolean(value);
++ count;
}
@Override
public void addDouble(double value) {
delegate.addDouble(value);
++ count;
}
@Override
public void addFloat(float value) {
delegate.addFloat(value);
++ count;
}
@Override
public void addInt(int value) {
delegate.addInt(value);
++ count;
}
@Override
public void addLong(long value) {
delegate.addLong(value);
++ count;
}
@Override
public void startCounting() {
count = 0;
}
@Override
public int getCount() {
return count;
}
}
/**
* converts primitive values
*
* @author Julien Le Dem
*
*/
class FieldPrimitiveConverter extends PrimitiveConverter {
private final List<TProtocol> events;
private ThriftTypeID type;
public FieldPrimitiveConverter(List<TProtocol> events, ThriftField field) {
this.events = events;
this.type = field.getType().getType();
}
@Override
public void addBoolean(final boolean value) {
events.add(new ParquetProtocol("readBool()") {
@Override
public boolean readBool() throws TException {
return value;
}
});
}
@Override
public void addDouble(final double value) {
events.add(new ParquetProtocol("readDouble()") {
@Override
public double readDouble() throws TException {
return value;
}
});
}
@Override
public void addFloat(final float value) {
// TODO: check thrift has no float
events.add(new ParquetProtocol("readDouble() float") {
@Override
public double readDouble() throws TException {
return value;
}
});
}
@Override
public void addInt(final int value) {
// TODO: make subclass per type
switch (type) {
case BYTE:
events.add(new ParquetProtocol("readByte() int") {
@Override
public byte readByte() throws TException {
return (byte)value;
}
});
break;
case I16:
events.add(new ParquetProtocol("readI16()") {
@Override
public short readI16() throws TException {
return (short)value;
}
});
break;
case I32:
events.add(new ParquetProtocol("readI32()") {
@Override
public int readI32() throws TException {
return value;
}
});
break;
default:
throw new UnsupportedOperationException("not convertible type " + type);
}
}
@Override
public void addLong(final long value) {
events.add(new ParquetProtocol("readI64()") {
@Override
public long readI64() throws TException {
return value;
}
});
}
}
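// Thrift has no 32-bit float type, so a Parquet FLOAT value is widened and
// replayed through readDouble(), as hinted by the TODO above.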
/**
* converts Binary into String
* @author Julien Le Dem
*
*/
class FieldStringConverter extends PrimitiveConverter {
private final List<TProtocol> events;
public FieldStringConverter(List<TProtocol> events, ThriftField field) {
this.events = events;
}
@Override
public void addBinary(final Binary value) {
events.add(new ParquetProtocol("readString() binary") {
@Override
public String readString() throws TException {
return value.toStringUsingUTF8();
}
@Override
public ByteBuffer readBinary() throws TException {
return value.toByteBuffer();
}
});
}
}
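// The event above overrides both readString() and readBinary() so that a
// ThriftReader may consume the field either as a UTF-8 string or as raw bytes;
// both views are backed by the same Binary value.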
/**
* converts Binary into Enum
* @author Julien Le Dem
*
*/
class FieldEnumConverter extends PrimitiveConverter {
private final List<TProtocol> events;
private Map<Binary, Integer> enumLookup = new HashMap<Binary, Integer>();
public FieldEnumConverter(List<TProtocol> events, ThriftField field) {
this.events = events;
final Iterable<EnumValue> values = ((EnumType)field.getType()).getValues();
for (EnumValue enumValue : values) {
enumLookup.put(Binary.fromString(enumValue.getName()), enumValue.getId());
}
}
@Override
public void addBinary(final Binary value) {
final int id = enumLookup.get(value);
events.add(new ParquetProtocol("readI32() enum") {
@Override
public int readI32() throws TException {
return id;
}
});
}
}
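// Enum values are stored in the Parquet file as their names (binary strings);
// the lookup above maps the name back to its numeric id and replays it through
// readI32(), matching the I32 representation used for enums in
// PrimitiveFieldHandler. A name missing from the Thrift enum definition would
// surface here as a NullPointerException when the Integer id is unboxed.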
/**
* converts to Maps
* @author Julien Le Dem
*
*/
class MapConverter extends GroupConverter {
private final GroupCounter child;
private final List<TProtocol> mapEvents = new ArrayList<TProtocol>();
private final List<TProtocol> parentEvents;
private final byte keyType;
private final byte valueType;
MapConverter(List<TProtocol> parentEvents, GroupType parquetSchema, ThriftField field) {
this.parentEvents = parentEvents;
if (parquetSchema.getFieldCount() != 1) {
throw new IllegalArgumentException("maps have only one field. " + parquetSchema + " size = " + parquetSchema.getFieldCount());
}
Type nestedType = parquetSchema.getType(0);
final ThriftField key = ((MapType)field.getType()).getKey();
keyType = key.getType().getType().getThriftType();
final ThriftField value = ((MapType)field.getType()).getValue();
valueType = value.getType().getType().getThriftType();
child = new GroupCounter(new MapKeyValueConverter(mapEvents, nestedType, key, value));
}
@Override
public Converter getConverter(int fieldIndex) {
if (fieldIndex != 0) {
throw new IllegalArgumentException("lists have only one field. can't reach " + fieldIndex);
}
return child;
}
@Override
public void start() {
child.startCounting();
// we'll add the events in the end as we need to count elements
}
final ParquetProtocol readMapEnd = new ParquetProtocol("readMapEnd()") {
@Override
public void readMapEnd() throws TException {
}
};
@Override
public void end() {
final int count = child.getCount();
parentEvents.add(new ParquetProtocol("readMapBegin()") {
@Override
public TMap readMapBegin() throws TException {
return new TMap(keyType, valueType, count);
}
});
parentEvents.addAll(mapEvents);
mapEvents.clear();
parentEvents.add(readMapEnd);
}
}
/**
* converts to a key value pair (in maps)
* @author Julien Le Dem
*
*/
class MapKeyValueConverter extends GroupConverter {
private Converter keyConverter;
private Converter valueConverter;
public MapKeyValueConverter(List<TProtocol> mapEvents, Type nestedType,
ThriftField key, ThriftField value) {
keyConverter = newConverter(mapEvents, nestedType.asGroupType().getType(0), key);
valueConverter = newConverter(mapEvents, nestedType.asGroupType().getType(1), value);
}
@Override
public Converter getConverter(int fieldIndex) {
switch (fieldIndex) {
case 0:
return keyConverter;
case 1:
return valueConverter;
default:
throw new IllegalArgumentException("only key (0) and value (1) are supported. got " + fieldIndex);
}
}
@Override
public void start() {
}
@Override
public void end() {
}
}
/**
* converts to a Set
* @author Julien Le Dem
*
*/
class SetConverter extends CollectionConverter {
final ParquetProtocol readSetEnd = new ParquetProtocol("readSetEnd()") {
@Override
public void readSetEnd() throws TException {
}
};
private final List<TProtocol> parentEvents;
public SetConverter(List<TProtocol> parentEvents, GroupType parquetSchema, ThriftField field) {
super(parentEvents, parquetSchema, ((SetType)field.getType()).getValues());
this.parentEvents = parentEvents;
}
@Override
void collectionStart(final int count, final byte type) {
parentEvents.add(new ParquetProtocol("readSetBegin()") {
@Override
public TSet readSetBegin() throws TException {
return new TSet(type, count);
}
});
}
@Override
void collectionEnd() {
parentEvents.add(readSetEnd);
}
}
/**
* converts to a List
* @author Julien Le Dem
*
*/
class ListConverter extends CollectionConverter {
final ParquetProtocol readListEnd = new ParquetProtocol("readListEnd()") {
@Override
public void readListEnd() throws TException {
}
};
private final List<TProtocol> parentEvents;
ListConverter(List<TProtocol> parentEvents, GroupType parquetSchema, ThriftField field) {
super(parentEvents, parquetSchema, ((ListType)field.getType()).getValues());
this.parentEvents = parentEvents;
}
@Override
void collectionStart(final int count, final byte type) {
parentEvents.add(new ParquetProtocol("readListBegin()") {
@Override
public TList readListBegin() throws TException {
return new TList(type, count);
}
});
}
@Override
void collectionEnd() {
parentEvents.add(readListEnd);
}
}
/**
* Base class for converting Lists and Sets, which work essentially the same way
* @author Julien Le Dem
*
*/
abstract class CollectionConverter extends GroupConverter {
private final Converter child;
private final Counter childCounter;
private List<TProtocol> listEvents = new ArrayList<TProtocol>();
private final List<TProtocol> parentEvents;
private ThriftTypeID valuesType;
private final Type nestedType;
CollectionConverter(List<TProtocol> parentEvents, GroupType parquetSchema, ThriftField values) {
this.parentEvents = parentEvents;
if (parquetSchema.getFieldCount() != 1) {
throw new IllegalArgumentException("lists have only one field. " + parquetSchema + " size = " + parquetSchema.getFieldCount());
}
nestedType = parquetSchema.getType(0);
valuesType = values.getType().getType();
if (nestedType.isPrimitive()) {
PrimitiveCounter counter = new PrimitiveCounter(newConverter(listEvents, nestedType, values).asPrimitiveConverter());
child = counter;
childCounter = counter;
} else {
GroupCounter counter = new GroupCounter(newConverter(listEvents, nestedType, values).asGroupConverter());
child = counter;
childCounter = counter;
}
}
@Override
public Converter getConverter(int fieldIndex) {
if (fieldIndex != 0) {
throw new IllegalArgumentException("lists have only one field. can't reach " + fieldIndex);
}
return child;
}
@Override
public void start() {
childCounter.startCounting();
// we'll add the events in the end as we need to count elements
}
@Override
public void end() {
final int count = childCounter.getCount();
collectionStart(count, valuesType.getThriftType());
parentEvents.addAll(listEvents);
listEvents.clear();
collectionEnd();
}
abstract void collectionStart(int count, byte type);
abstract void collectionEnd();
}
/**
* converts to Struct
* @author Julien Le Dem
*
*/
class StructConverter extends GroupConverter {
private final int schemaSize;
private final Converter[] converters;
private final StructType thriftType;
private final String name;
private final TStruct tStruct;
private final List<TProtocol> events;
private StructConverter(List<TProtocol> events, GroupType parquetSchema, ThriftField field) {
this.events = events;
this.name = field.getName();
this.tStruct = new TStruct(name);
this.thriftType = (StructType)field.getType();
this.schemaSize = parquetSchema.getFieldCount();
this.converters = new Converter[this.schemaSize];
List<ThriftField> thriftChildren = thriftType.getChildren();
for (int i = 0; i < schemaSize; i++) {
Type schemaType = parquetSchema.getType(i);
String fieldName = schemaType.getName();
ThriftField matchingThrift = null;
for (ThriftField childField: thriftChildren) {
String thriftChildName = childField.getName();
if (thriftChildName != null && thriftChildName.equalsIgnoreCase(fieldName)) {
matchingThrift = childField;
break;
}
}
if (matchingThrift == null) {
// this means the file did not contain that field
// it will never be populated in this instance
// other files might populate it
continue;
}
if (schemaType.isPrimitive()) {
converters[i] = new PrimitiveFieldHandler(newConverter(events, schemaType, matchingThrift).asPrimitiveConverter(), matchingThrift, events);
} else {
converters[i] = new GroupFieldhandler(newConverter(events, schemaType, matchingThrift).asGroupConverter(), matchingThrift, events);
}
}
}
@Override
public Converter getConverter(int fieldIndex) {
return converters[fieldIndex];
}
final ParquetProtocol readStructBegin = new ParquetProtocol("readStructBegin()") {
@Override
public TStruct readStructBegin() throws TException {
return tStruct;
}
};
@Override
public void start() {
events.add(readStructBegin);
}
private final ParquetProtocol readFieldStop = new ParquetProtocol("readFieldBegin() => STOP") {
final TField stop = new TField("", TType.STOP, (short)0);
@Override
public TField readFieldBegin() throws TException {
return stop;
}
};
private final ParquetProtocol readStructEnd = new ParquetProtocol("readStructEnd()") {
@Override
public void readStructEnd() throws TException {
}
};
@Override
public void end() {
events.add(readFieldStop);
events.add(readStructEnd);
}
}
private final ThriftReader<T> thriftReader;
private final ParquetReadProtocol protocol;
private final GroupConverter structConverter;
private List<TProtocol> rootEvents = new ArrayList<TProtocol>();
private boolean missingRequiredFieldsInProjection = false;
/**
*
* @param thriftReader the class responsible for instantiating the final object and reading it from the protocol
* @param name the name of that type (the thrift class simple name)
* @param requestedParquetSchema the schema for the incoming columnar events
* @param thriftType the thrift type descriptor
*/
public ThriftRecordConverter(ThriftReader<T> thriftReader, String name, MessageType requestedParquetSchema, ThriftType.StructType thriftType) {
super();
this.thriftReader = thriftReader;
this.protocol = new ParquetReadProtocol();
this.thriftType = thriftType;
MessageType fullSchema = new ThriftSchemaConverter().convert(thriftType);
missingRequiredFieldsInProjection = hasMissingRequiredFieldInGroupType(requestedParquetSchema, fullSchema);
this.structConverter = new StructConverter(rootEvents, requestedParquetSchema, new ThriftField(name, (short)0, Requirement.REQUIRED, thriftType));
}
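// Illustrative usage sketch; MyThriftClass and the two schema variables are
// hypothetical placeholders, not defined in this file, and the anonymous
// ThriftReader assumes only the readOneRecord(TProtocol) method used in
// getCurrentRecord() below. A read path would typically build one materializer
// per requested projection, expose getRootConverter() to the columnar reader,
// and pull records via getCurrentRecord():
//
//   ThriftType.StructType descriptor = ...;  // struct descriptor of MyThriftClass
//   MessageType requestedSchema = ...;       // projected Parquet schema
//   ThriftRecordConverter<MyThriftClass> converter =
//       new ThriftRecordConverter<MyThriftClass>(
//           new ThriftReader<MyThriftClass>() {
//             public MyThriftClass readOneRecord(TProtocol protocol) throws TException {
//               MyThriftClass record = new MyThriftClass();
//               record.read(protocol); // generated Thrift deserialization
//               return record;
//             }
//           },
//           "MyThriftClass", requestedSchema, descriptor);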
private boolean hasMissingRequiredFieldInGroupType(GroupType requested, GroupType fullSchema) {
for (Type field : fullSchema.getFields()) {
if (requested.containsField(field.getName())) {
Type requestedType = requested.getType(field.getName());
// if a field is in requested schema and the type of it is a group type, then do recursive check
if (!field.isPrimitive()) {
if (hasMissingRequiredFieldInGroupType(requestedType.asGroupType(), field.asGroupType())) {
return true;
} else {
continue;// check next field
}
}
} else {
if (field.getRepetition() == Type.Repetition.REQUIRED) {
return true; // if a field is missing in requested schema and it's required
} else {
continue; // the missing field is not required, then continue checking next field
}
}
}
return false;
}
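// For example, given:
//
//   full schema:      message T { required int32 id; optional binary name; }
//   requested schema: message T { optional binary name; }
//
// the required field "id" is absent from the projection, so this method returns
// true and getCurrentRecord() routes the buffered events through
// ProtocolEventsAmender to patch in the missing required fields before the
// ThriftReader consumes them.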
/**
*
* {@inheritDoc}
* @see parquet.io.api.RecordMaterializer#getCurrentRecord()
*/
@Override
public T getCurrentRecord() {
try {
if (missingRequiredFieldsInProjection) {
List<TProtocol> fixedEvents = new ProtocolEventsAmender(rootEvents).amendMissingRequiredFields(thriftType);
protocol.addAll(fixedEvents);
} else {
protocol.addAll(rootEvents);
}
rootEvents.clear();
return thriftReader.readOneRecord(protocol);
} catch (TException e) {
throw new ParquetDecodingException("Could not read thrift object from protocol", e);
}
}
@Override
public void skipCurrentRecord() {
rootEvents.clear();
}
/**
*
* {@inheritDoc}
* @see parquet.io.api.RecordMaterializer#getRootConverter()
*/
@Override
public GroupConverter getRootConverter() {
return structConverter;
}
private Converter newConverter(List<TProtocol> events, Type type, ThriftField field) {
switch (field.getType().getType()) {
case LIST:
return new ListConverter(events, type.asGroupType(), field);
case SET:
return new SetConverter(events, type.asGroupType(), field);
case MAP:
return new MapConverter(events, type.asGroupType(), field);
case STRUCT:
return new StructConverter(events, type.asGroupType(), field);
case STRING:
return new FieldStringConverter(events, field);
case ENUM:
return new FieldEnumConverter(events, field);
default:
return new FieldPrimitiveConverter(events, field);
}
}
}