parquet.pig.convert.TupleConverter Maven / Gradle / Ivy
The newest version!
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package parquet.pig.convert;
import static java.lang.Math.max;
import java.util.ArrayList;
import java.util.List;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.data.DataByteArray;
import org.apache.pig.data.DataType;
import org.apache.pig.data.NonSpillableDataBag;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.apache.pig.impl.logicalLayer.FrontendException;
import org.apache.pig.impl.logicalLayer.schema.Schema;
import org.apache.pig.impl.logicalLayer.schema.Schema.FieldSchema;
import parquet.column.Dictionary;
import parquet.io.ParquetDecodingException;
import parquet.io.api.Binary;
import parquet.io.api.Converter;
import parquet.io.api.GroupConverter;
import parquet.io.api.PrimitiveConverter;
import parquet.pig.TupleConversionException;
import parquet.schema.GroupType;
import parquet.schema.OriginalType;
import parquet.schema.PrimitiveType;
import parquet.schema.Type;
import parquet.schema.Type.Repetition;
/**
* converts a group into a tuple
*
* @author Julien Le Dem
*
*/
public class TupleConverter extends GroupConverter {
// Pig tuple factory shared by this converter and its nested converters to allocate tuples.
private static final TupleFactory TF = TupleFactory.getInstance();
// Number of slots in every produced tuple: the larger of the parquet field count
// and the pig field count (computed in the constructor).
private final int schemaSize;
// Tuple currently being filled; replaced with a fresh tuple on every start().
protected Tuple currentTuple;
// Child converters handed out by getConverter(int); slots that were never
// assigned in the constructor stay null.
private final Converter[] converters;
// Parquet group schema this converter reads from.
private final GroupType parquetSchema;
// When true, start() pre-fills optional primitive fields with zero values and
// pig booleans are converted as integers (see newConverter).
private final boolean elephantBirdCompatible;
/**
 * Creates a converter that assembles Pig tuples from the given parquet group.
 *
 * The tuple size is the larger of the parquet field count and the pig field count.
 * For every index below that size the pig field is looked up; when the parquet
 * schema contains a field with the same alias (or column index access is enabled)
 * and a parquet type can be resolved, a child converter is created that writes its
 * value into the tuple at the pig field's index. Child converters are stored
 * densely, in the order they are created.
 *
 * NOTE(review): the pig field is fetched for every index below the tuple size, so
 * this appears to rely on the pig schema having at least as many fields as the
 * parquet schema - confirm against callers.
 *
 * @param parquetSchema the parquet group type being read
 * @param pigSchema the pig schema describing the tuples to produce
 * @param elephantBirdCompatible whether to apply the elephant-bird compatible conversions
 * @param columnIndexAccess whether parquet fields are matched by position instead of by alias
 * @throws ParquetDecodingException if the pig schema raises a FrontendException
 */
public TupleConverter(GroupType parquetSchema, Schema pigSchema, boolean elephantBirdCompatible, boolean columnIndexAccess) {
  this.parquetSchema = parquetSchema;
  this.elephantBirdCompatible = elephantBirdCompatible;
  try {
    this.schemaSize = max(parquetSchema.getFieldCount(), pigSchema.getFields().size());
    this.converters = new Converter[this.schemaSize];
    int nextSlot = 0;
    for (int pigIndex = 0; pigIndex < schemaSize; pigIndex++) {
      final FieldSchema pigField = pigSchema.getField(pigIndex);
      // Skip pig fields that have no parquet counterpart when matching by name.
      if (!parquetSchema.containsField(pigField.alias) && !columnIndexAccess) {
        continue;
      }
      final Type parquetType = getType(columnIndexAccess, pigField.alias, pigIndex);
      if (parquetType == null) {
        continue;
      }
      // The anonymous container needs an effectively constant copy of the index.
      final int tuplePosition = pigIndex;
      final ParentValueContainer tupleSlot = new ParentValueContainer() {
        @Override
        void add(Object value) {
          TupleConverter.this.set(tuplePosition, value);
        }
      };
      converters[nextSlot++] = newConverter(pigField, parquetType, tupleSlot, elephantBirdCompatible, columnIndexAccess);
    }
  } catch (FrontendException e) {
    throw new ParquetDecodingException("can not initialize pig converter from:\n" + parquetSchema + "\n" + pigSchema, e);
  }
}
/**
 * Resolves the parquet type that corresponds to a pig field.
 *
 * @param columnIndexAccess when true the type is looked up by position, otherwise by alias
 * @param alias the pig field alias, used for the by-name lookup
 * @param index the pig field position, used for the positional lookup
 * @return the parquet type, or null when positional lookup is requested and
 *         {@code index} is not below the parquet field count
 */
private Type getType(boolean columnIndexAccess, String alias, int index) {
  if (!columnIndexAccess) {
    final int positionByName = parquetSchema.getFieldIndex(alias);
    return parquetSchema.getType(positionByName);
  }
  return index < parquetSchema.getFieldCount() ? parquetSchema.getType(index) : null;
}
/**
 * Creates the converter matching the pig type of a field.
 *
 * @param pigField the pig field to produce a value for
 * @param type the parquet type the value is read from
 * @param parent the container receiving every converted value
 * @param elephantBirdCompatible when true, pig booleans are converted with the integer converter
 * @param columnIndexAccess forwarded to nested bag, map and tuple converters
 * @return a converter for the field
 * @throws TupleConversionException if the pig type is not supported or if creating
 *         the converter raises a FrontendException or any RuntimeException
 */
static Converter newConverter(FieldSchema pigField, Type type, final ParentValueContainer parent, boolean elephantBirdCompatible, boolean columnIndexAccess) {
  try {
    switch (pigField.type) {
      case DataType.INTEGER:
        return new FieldIntegerConverter(parent);
      case DataType.LONG:
        return new FieldLongConverter(parent);
      case DataType.FLOAT:
        return new FieldFloatConverter(parent);
      case DataType.DOUBLE:
        return new FieldDoubleConverter(parent);
      case DataType.BOOLEAN:
        if (!elephantBirdCompatible) {
          return new FieldBooleanConverter(parent);
        }
        return new FieldIntegerConverter(parent);
      case DataType.BYTEARRAY:
        return new FieldByteArrayConverter(parent);
      case DataType.CHARARRAY:
        // Dictionary decoding is only enabled for genuine UTF8 strings: any other
        // original type would need its own dictionary handling, so it is disabled.
        final boolean isUtf8 = type.getOriginalType() == OriginalType.UTF8;
        return new FieldStringConverter(parent, isUtf8);
      case DataType.TUPLE:
        // A nested tuple is handed to the parent once the group is complete.
        return new TupleConverter(type.asGroupType(), pigField.schema, elephantBirdCompatible, columnIndexAccess) {
          @Override
          public void end() {
            super.end();
            parent.add(this.currentTuple);
          }
        };
      case DataType.BAG:
        return new BagConverter(type.asGroupType(), pigField, parent, elephantBirdCompatible, columnIndexAccess);
      case DataType.MAP:
        return new MapConverter(type.asGroupType(), pigField, parent, elephantBirdCompatible, columnIndexAccess);
      default:
        throw new TupleConversionException("unsupported pig type: " + pigField);
    }
  } catch (FrontendException e) {
    final String message = "error while preparing converter for:\n" + pigField + "\n" + type;
    throw new TupleConversionException(message, e);
  } catch (RuntimeException e) {
    final String message = "error while preparing converter for:\n" + pigField + "\n" + type;
    throw new TupleConversionException(message, e);
  }
}
/**
 * Returns the child converter stored at the given position.
 *
 * @param fieldIndex position in the converter array
 * @return the converter in that slot; null if the constructor never filled it
 */
@Override
public Converter getConverter(int fieldIndex) {
  final Converter fieldConverter = converters[fieldIndex];
  return fieldConverter;
}
// Boxed zero values that start() writes into optional primitive fields when
// elephantBirdCompatible is set. I32_ZERO is used for both INT32 and BOOLEAN.
private static final Integer I32_ZERO = Integer.valueOf(0);
private static final Long I64_ZERO = Long.valueOf(0);
private static final Float FLOAT_ZERO = Float.valueOf(0);
private static final Double DOUBLE_ZERO = Double.valueOf(0);
/**
 * Begins a new record by allocating a fresh tuple of {@code schemaSize} slots.
 *
 * In elephant-bird compatible mode every optional primitive parquet field is
 * pre-filled with a zero value at its parquet field position: Integer zero for
 * INT32 and BOOLEAN, Long zero for INT64, Float zero for FLOAT and Double zero
 * for DOUBLE. Other primitive types are left untouched.
 *
 * @throws RuntimeException wrapping any ExecException raised while setting a default
 */
@Override
public final void start() {
  currentTuple = TF.newTuple(schemaSize);
  if (!elephantBirdCompatible) {
    return;
  }
  try {
    final List<Type> parquetFields = parquetSchema.getFields();
    for (int position = 0; position < parquetFields.size(); position++) {
      final Type field = parquetFields.get(position);
      if (!field.isPrimitive() || !field.isRepetition(Repetition.OPTIONAL)) {
        continue;
      }
      switch (field.asPrimitiveType().getPrimitiveTypeName()) {
        case INT32:
        case BOOLEAN:
          // Booleans default to the integer zero, like INT32.
          currentTuple.set(position, I32_ZERO);
          break;
        case INT64:
          currentTuple.set(position, I64_ZERO);
          break;
        case FLOAT:
          currentTuple.set(position, FLOAT_ZERO);
          break;
        case DOUBLE:
          currentTuple.set(position, DOUBLE_ZERO);
          break;
        default:
          // No default value for the remaining primitive types.
          break;
      }
    }
  } catch (ExecException e) {
    throw new RuntimeException(e);
  }
}
/**
 * Stores a converted value in the tuple currently being built.
 *
 * @param fieldIndex position in the current tuple
 * @param value the value to store
 * @throws TupleConversionException if the tuple rejects the value with an ExecException
 */
final void set(int fieldIndex, Object value) {
  try {
    currentTuple.set(fieldIndex, value);
  } catch (ExecException cause) {
    final String message = "Could not set " + value + " to current tuple " + currentTuple + " at " + fieldIndex;
    throw new TupleConversionException(message, cause);
  }
}
/**
 * Called when the current group is complete. Does nothing here; the anonymous
 * subclass created for nested tuples in newConverter overrides it to pass the
 * finished tuple to its parent.
 */
@Override
public void end() {
  // Intentionally empty.
}
/**
 * Returns the tuple created by the most recent call to start().
 *
 * @return the current tuple; null if start() has not been called yet
 */
public final Tuple getCurrentTuple() {
  final Tuple tuple = currentTuple;
  return tuple;
}
/**
 * Converts primitive values into Java strings for pig chararray fields.
 * Binary values are decoded as UTF-8; numeric and boolean values use the
 * {@code toString} of their wrapper type. When dictionary support is enabled,
 * the dictionary entries are decoded once in {@link #setDictionary(Dictionary)}
 * and reused for every dictionary-encoded value.
 *
 * @author Julien Le Dem
 *
 */
static final class FieldStringConverter extends PrimitiveConverter {
  private final ParentValueContainer parent;
  // Fixed at construction time; reported through hasDictionarySupport().
  private final boolean dictionarySupport;
  // Decoded dictionary entries indexed by dictionary id; null until setDictionary is called.
  private String[] dict;

  /**
   * @param parent container receiving every converted string
   * @param dictionarySupport value reported by {@link #hasDictionarySupport()}
   */
  public FieldStringConverter(ParentValueContainer parent, boolean dictionarySupport) {
    this.parent = parent;
    this.dictionarySupport = dictionarySupport;
  }

  @Override
  final public void addBinary(Binary value) {
    parent.add(value.toStringUsingUTF8());
  }

  @Override
  public boolean hasDictionarySupport() {
    return dictionarySupport;
  }

  /**
   * Decodes every dictionary entry, ids 0 through {@code getMaxId()}, to a string.
   * The decoded array replaces the previous one only after all entries have been
   * decoded, so a failure part-way through never leaves a partially filled dictionary.
   */
  @Override
  public void setDictionary(Dictionary dictionary) {
    final int maxId = dictionary.getMaxId();
    final String[] decoded = new String[maxId + 1];
    for (int id = 0; id <= maxId; id++) {
      decoded[id] = dictionary.decodeToBinary(id).toStringUsingUTF8();
    }
    dict = decoded;
  }

  @Override
  public void addValueFromDictionary(int dictionaryId) {
    parent.add(dict[dictionaryId]);
  }

  @Override
  public void addLong(long value) {
    parent.add(Long.toString(value));
  }

  @Override
  public void addInt(int value) {
    parent.add(Integer.toString(value));
  }

  @Override
  public void addFloat(float value) {
    parent.add(Float.toString(value));
  }

  @Override
  public void addDouble(double value) {
    parent.add(Double.toString(value));
  }

  @Override
  public void addBoolean(boolean value) {
    parent.add(Boolean.toString(value));
  }
}
/**
 * Converts binary values into Pig DataByteArray instances.
 * Only {@code addBinary} is overridden by this class.
 *
 * @author Julien Le Dem
 *
 */
static final class FieldByteArrayConverter extends PrimitiveConverter {
  private final ParentValueContainer parent;

  /**
   * @param parent container receiving every converted value
   */
  public FieldByteArrayConverter(ParentValueContainer parent) {
    this.parent = parent;
  }

  /** Wraps the bytes of the binary value in a new DataByteArray. */
  @Override
  public final void addBinary(Binary value) {
    final byte[] raw = value.getBytes();
    final DataByteArray wrapped = new DataByteArray(raw);
    parent.add(wrapped);
  }
}
/**
 * Converts primitive values into doubles for pig double fields.
 * Numeric values are cast to double, booleans map to 1.0 or 0.0 and binary
 * values are decoded as UTF-8 and parsed with {@code Double.parseDouble}.
 *
 * @author Julien Le Dem
 *
 */
static final class FieldDoubleConverter extends PrimitiveConverter {
  private final ParentValueContainer parent;

  /**
   * @param parent container receiving every converted value
   */
  public FieldDoubleConverter(ParentValueContainer parent) {
    this.parent = parent;
  }

  /** Boxes the double and hands it to the parent container. */
  private void emit(double converted) {
    parent.add(converted);
  }

  @Override
  public final void addDouble(double value) {
    emit(value);
  }

  @Override
  public void addFloat(float value) {
    emit(value);
  }

  @Override
  public void addInt(int value) {
    emit(value);
  }

  @Override
  public void addLong(long value) {
    emit(value);
  }

  @Override
  public void addBoolean(boolean value) {
    if (value) {
      emit(1.0d);
    } else {
      emit(0.0d);
    }
  }

  @Override
  public void addBinary(Binary value) {
    final String text = value.toStringUsingUTF8();
    emit(Double.parseDouble(text));
  }
}
/**
 * Converts primitive values into floats for pig float fields.
 * Numeric values are cast to float, booleans map to 1.0f or 0.0f and binary
 * values are decoded as UTF-8 and parsed with {@code Float.parseFloat}.
 *
 * @author Julien Le Dem
 *
 */
static final class FieldFloatConverter extends PrimitiveConverter {
  private final ParentValueContainer parent;

  /**
   * @param parent container receiving every converted value
   */
  public FieldFloatConverter(ParentValueContainer parent) {
    this.parent = parent;
  }

  /** Boxes the float and hands it to the parent container. */
  private void emit(float converted) {
    parent.add(converted);
  }

  @Override
  public final void addFloat(float value) {
    emit(value);
  }

  @Override
  public void addDouble(double value) {
    // Narrowing conversion: precision beyond float is dropped.
    emit((float) value);
  }

  @Override
  public void addInt(int value) {
    emit(value);
  }

  @Override
  public void addLong(long value) {
    emit(value);
  }

  @Override
  public void addBoolean(boolean value) {
    if (value) {
      emit(1.0f);
    } else {
      emit(0.0f);
    }
  }

  @Override
  public void addBinary(Binary value) {
    final String text = value.toStringUsingUTF8();
    emit(Float.parseFloat(text));
  }
}
/**
 * Converts primitive values into longs for pig long fields.
 * Integers are widened, floats and doubles are cast to long, booleans map to
 * 1 or 0 and binary values are decoded as UTF-8 and parsed with
 * {@code Long.parseLong}.
 *
 * @author Julien Le Dem
 *
 */
static final class FieldLongConverter extends PrimitiveConverter {
  private final ParentValueContainer parent;

  /**
   * @param parent container receiving every converted value
   */
  public FieldLongConverter(ParentValueContainer parent) {
    this.parent = parent;
  }

  /** Boxes the long and hands it to the parent container. */
  private void emit(long converted) {
    parent.add(converted);
  }

  @Override
  public final void addLong(long value) {
    emit(value);
  }

  @Override
  public void addInt(int value) {
    emit(value);
  }

  @Override
  public void addDouble(double value) {
    emit((long) value);
  }

  @Override
  public void addFloat(float value) {
    emit((long) value);
  }

  @Override
  public void addBoolean(boolean value) {
    if (value) {
      emit(1L);
    } else {
      emit(0L);
    }
  }

  @Override
  public void addBinary(Binary value) {
    final String text = value.toStringUsingUTF8();
    emit(Long.parseLong(text));
  }
}
/**
 * Converts primitive values into integers for pig int fields.
 * Longs, floats and doubles are cast to int, booleans map to 1 or 0 and binary
 * values are decoded as UTF-8 and parsed with {@code Integer.parseInt}.
 *
 * @author Julien Le Dem
 *
 */
static final class FieldIntegerConverter extends PrimitiveConverter {
  private final ParentValueContainer parent;

  /**
   * @param parent container receiving every converted value
   */
  public FieldIntegerConverter(ParentValueContainer parent) {
    this.parent = parent;
  }

  /** Boxes the int and hands it to the parent container. */
  private void emit(int converted) {
    parent.add(converted);
  }

  @Override
  public final void addInt(int value) {
    emit(value);
  }

  @Override
  public final void addBoolean(boolean value) {
    if (value) {
      emit(1);
    } else {
      emit(0);
    }
  }

  @Override
  public void addLong(long value) {
    emit((int) value);
  }

  @Override
  public void addDouble(double value) {
    emit((int) value);
  }

  @Override
  public void addFloat(float value) {
    emit((int) value);
  }

  @Override
  public void addBinary(Binary value) {
    final String text = value.toStringUsingUTF8();
    emit(Integer.parseInt(text));
  }
}
/**
 * Converts primitive values into booleans for pig boolean fields.
 * Numeric values are true when they differ from zero; binary values are
 * decoded as UTF-8 and parsed with {@code Boolean.parseBoolean}.
 *
 * @author Julien Le Dem
 *
 */
static final class FieldBooleanConverter extends PrimitiveConverter {
  private final ParentValueContainer parent;

  /**
   * @param parent container receiving every converted value
   */
  public FieldBooleanConverter(ParentValueContainer parent) {
    this.parent = parent;
  }

  /** Boxes the boolean and hands it to the parent container. */
  private void emit(boolean converted) {
    parent.add(converted);
  }

  @Override
  public final void addBoolean(boolean value) {
    emit(value);
  }

  @Override
  public final void addInt(int value) {
    final boolean nonZero = value != 0;
    emit(nonZero);
  }

  @Override
  public void addLong(long value) {
    final boolean nonZero = value != 0;
    emit(nonZero);
  }

  @Override
  public void addDouble(double value) {
    final boolean nonZero = value != 0;
    emit(nonZero);
  }

  @Override
  public void addFloat(float value) {
    final boolean nonZero = value != 0;
    emit(nonZero);
  }

  @Override
  public void addBinary(Binary value) {
    final String text = value.toStringUsingUTF8();
    emit(Boolean.parseBoolean(text));
  }
}
/**
 * Converts a parquet group with exactly one field into a Pig bag.
 * The tuples produced by the single child converter are collected in a buffer
 * between start() and end(); end() passes a bag holding a copy of that buffer
 * to the parent.
 *
 * @author Julien Le Dem
 *
 */
static class BagConverter extends GroupConverter {
  // Tuples collected for the bag currently being read; cleared on every start().
  private final List<Tuple> buffer = new ArrayList<Tuple>();
  private final Converter child;
  private final ParentValueContainer parent;

  /**
   * @param parquetSchema the parquet group backing the bag; must have exactly one field
   * @param pigSchema the pig field describing the bag
   * @param parent container receiving each finished bag
   * @param numbersDefaultToZero passed to the child converter as its elephant-bird compatibility flag
   * @param columnIndexAccess passed to the child converter
   * @throws IllegalArgumentException if the parquet group does not have exactly one field
   * @throws FrontendException if the pig schema cannot be navigated
   */
  BagConverter(GroupType parquetSchema, FieldSchema pigSchema, ParentValueContainer parent, boolean numbersDefaultToZero, boolean columnIndexAccess) throws FrontendException {
    this.parent = parent;
    if (parquetSchema.getFieldCount() != 1) {
      throw new IllegalArgumentException("bags have only one field. " + parquetSchema + " size = " + parquetSchema.getFieldCount());
    }
    Type nestedType = parquetSchema.getType(0);

    ParentValueContainer childsParent;
    FieldSchema pigField;
    if (nestedType.isPrimitive() || nestedType.getOriginalType() == OriginalType.MAP || nestedType.getOriginalType() == OriginalType.LIST) {
      // Pig bags always contain tuples, so a primitive, map or list element
      // has to be wrapped in an extra single-field tuple.
      childsParent = new ParentValueContainer() {
        @Override
        void add(Object value) {
          buffer.add(TF.newTuple(value));
        }
      };
      // The pig schema nests the element one level deeper than the wrapping tuple.
      pigField = pigSchema.schema.getField(0).schema.getField(0);
    } else {
      // The child already produces tuples; store them as they come.
      childsParent = new ParentValueContainer() {
        @Override
        void add(Object value) {
          buffer.add((Tuple) value);
        }
      };
      pigField = pigSchema.schema.getField(0);
    }
    child = newConverter(pigField, nestedType, childsParent, numbersDefaultToZero, columnIndexAccess);
  }

  /**
   * @param fieldIndex must be 0, the only field of a bag
   * @return the converter for the bag's elements
   * @throws IllegalArgumentException if {@code fieldIndex} is not 0
   */
  @Override
  public Converter getConverter(int fieldIndex) {
    if (fieldIndex != 0) {
      throw new IllegalArgumentException("bags have only one field. can't reach " + fieldIndex);
    }
    return child;
  }

  /** Empties the buffer before the elements of a new bag are read. */
  @Override
  final public void start() {
    buffer.clear();
  }

  /** Hands the parent a bag backed by a copy of the buffer, which is reused for the next bag. */
  @Override
  public void end() {
    parent.add(new NonSpillableDataBag(new ArrayList<Tuple>(buffer)));
  }
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy