/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.parquet.pig;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import org.apache.hadoop.conf.Configuration;
import org.apache.parquet.conf.HadoopParquetConfiguration;
import org.apache.parquet.conf.ParquetConfiguration;
import org.apache.parquet.hadoop.api.WriteSupport;
import org.apache.parquet.io.ParquetEncodingException;
import org.apache.parquet.io.api.Binary;
import org.apache.parquet.io.api.RecordConsumer;
import org.apache.parquet.schema.GroupType;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.Type;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.DataByteArray;
import org.apache.pig.data.DataType;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.apache.pig.impl.logicalLayer.FrontendException;
import org.apache.pig.impl.logicalLayer.schema.Schema;
import org.apache.pig.impl.logicalLayer.schema.Schema.FieldSchema;
import org.apache.pig.impl.util.Utils;
import org.apache.pig.parser.ParserException;
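/**
 * {@link WriteSupport} implementation that converts Pig {@link Tuple}s into
 * Parquet records by replaying each field through a {@link RecordConsumer},
 * using the Parquet schema derived from the Pig schema.
 */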
public class TupleWriteSupport extends WriteSupport<Tuple> {
  private static final TupleFactory TF = TupleFactory.getInstance();
  private static final PigSchemaConverter pigSchemaConverter = new PigSchemaConverter(false);
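
  /** Builds a TupleWriteSupport from a Pig schema string such as "a:int, b:chararray". */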
  public static TupleWriteSupport fromPigSchema(String pigSchemaString) throws ParserException {
    return new TupleWriteSupport(Utils.getSchemaFromString(pigSchemaString));
  }

  private RecordConsumer recordConsumer;
  private MessageType rootSchema;
  private Schema rootPigSchema;
  /**
   * @param pigSchema the Pig schema of the tuples to write
   */
  public TupleWriteSupport(Schema pigSchema) {
    super();
    this.rootSchema = pigSchemaConverter.convert(pigSchema);
    this.rootPigSchema = pigSchema;
  }
  @Override
  public String getName() {
    return "pig";
  }

  public Schema getPigSchema() {
    return rootPigSchema;
  }

  public MessageType getParquetSchema() {
    return rootSchema;
  }
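
  /** Hadoop Configuration overload; delegates to the ParquetConfiguration variant below. */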
  @Override
  public WriteContext init(Configuration configuration) {
    return init(new HadoopParquetConfiguration(configuration));
  }
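
  /**
   * Serializes the Pig schema into the file's key/value metadata so that
   * readers can recover the original Pig types.
   */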
  @Override
  public WriteContext init(ParquetConfiguration configuration) {
    Map<String, String> extraMetaData = new HashMap<String, String>();
    new PigMetaData(rootPigSchema).addToMetaData(extraMetaData);
    return new WriteContext(rootSchema, extraMetaData);
  }
  @Override
  public void prepareForWrite(RecordConsumer recordConsumer) {
    this.recordConsumer = recordConsumer;
  }
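
  /**
   * Writes a single tuple as one Parquet record. startMessage()/endMessage()
   * delimit the record; Pig's checked exceptions are rethrown unchecked.
   */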
  public void write(Tuple t) {
    try {
      recordConsumer.startMessage();
      writeTuple(rootSchema, rootPigSchema, t);
      recordConsumer.endMessage();
    } catch (ExecException | FrontendException e) {
      throw new RuntimeException(e);
    }
  }
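
  /**
   * Walks the Parquet and Pig schemas in lockstep, one field at a time.
   * Null fields are simply skipped: omitting an optional field is how
   * Parquet represents null. Bags and maps get their nested group encoding;
   * everything else is handled by writeValue.
   */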
  private void writeTuple(GroupType schema, Schema pigSchema, Tuple t) throws ExecException, FrontendException {
    List<Type> fields = schema.getFields();
    List<FieldSchema> pigFields = pigSchema.getFields();
    assert fields.size() == pigFields.size();
    for (int i = 0; i < fields.size(); i++) {
      if (t.isNull(i)) {
        continue;
      }
      Type fieldType = fields.get(i);
      recordConsumer.startField(fieldType.getName(), i);
      FieldSchema pigType = pigFields.get(i);
      switch (pigType.type) {
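        // A bag becomes a group wrapping a repeated element; an empty bag is
        // written as an empty group so it stays distinct from null.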
        case DataType.BAG:
          Type bagType = fieldType.asGroupType().getType(0);
          FieldSchema pigBagInnerType = pigType.schema.getField(0);
          DataBag bag = (DataBag) t.get(i);
          recordConsumer.startGroup();
          if (bag.size() > 0) {
            recordConsumer.startField(bagType.getName(), 0);
            for (Tuple tuple : bag) {
              if (bagType.isPrimitive()) {
                writeValue(bagType, pigBagInnerType, tuple, 0);
              } else {
                recordConsumer.startGroup();
                writeTuple(bagType.asGroupType(), pigBagInnerType.schema, tuple);
                recordConsumer.endGroup();
              }
            }
            recordConsumer.endField(bagType.getName(), 0);
          }
          recordConsumer.endGroup();
          break;
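        // A map becomes a group of repeated (key, value) groups; each entry
        // is wrapped in a synthetic two-field tuple so it can be written
        // through the same writeTuple path.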
        case DataType.MAP:
          Type mapType = fieldType.asGroupType().getType(0);
          FieldSchema pigMapInnerType = pigType.schema.getField(0);
          @SuppressWarnings("unchecked") // I know
          Map<String, Object> map = (Map<String, Object>) t.get(i);
          recordConsumer.startGroup();
          if (!map.isEmpty()) {
            recordConsumer.startField(mapType.getName(), 0);
            Set<Entry<String, Object>> entrySet = map.entrySet();
            for (Entry<String, Object> entry : entrySet) {
              recordConsumer.startGroup();
              Schema keyValueSchema = new Schema(Arrays.asList(
                  new FieldSchema("key", DataType.CHARARRAY),
                  new FieldSchema("value", pigMapInnerType.schema, pigMapInnerType.type)));
              writeTuple(
                  mapType.asGroupType(),
                  keyValueSchema,
                  TF.newTuple(Arrays.asList(entry.getKey(), entry.getValue())));
              recordConsumer.endGroup();
            }
            recordConsumer.endField(mapType.getName(), 0);
          }
          recordConsumer.endGroup();
          break;
        default:
          writeValue(fieldType, pigType, t, i);
          break;
      }
      recordConsumer.endField(fieldType.getName(), i);
    }
  }
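
  /**
   * Writes a single value: primitives are dispatched on the Parquet primitive
   * type, with chararray/bytearray both mapping to BINARY; anything else is
   * assumed to be a nested tuple and recursed into as a group.
   */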
  private void writeValue(Type type, FieldSchema pigType, Tuple t, int i) {
    try {
      if (type.isPrimitive()) {
        switch (type.asPrimitiveType().getPrimitiveTypeName()) {
            // TODO: use PrimitiveTuple accessors
          case BINARY:
            byte[] bytes;
            if (pigType.type == DataType.BYTEARRAY) {
              bytes = ((DataByteArray) t.get(i)).get();
            } else if (pigType.type == DataType.CHARARRAY) {
              bytes = ((String) t.get(i)).getBytes("UTF-8");
            } else {
              throw new UnsupportedOperationException(
                  "cannot convert from " + DataType.findTypeName(pigType.type) + " to BINARY");
            }
            recordConsumer.addBinary(Binary.fromReusedByteArray(bytes));
            break;
          case BOOLEAN:
            recordConsumer.addBoolean((Boolean) t.get(i));
            break;
          case INT32:
            recordConsumer.addInteger(((Number) t.get(i)).intValue());
            break;
          case INT64:
            recordConsumer.addLong(((Number) t.get(i)).longValue());
            break;
          case DOUBLE:
            recordConsumer.addDouble(((Number) t.get(i)).doubleValue());
            break;
          case FLOAT:
            recordConsumer.addFloat(((Number) t.get(i)).floatValue());
            break;
          default:
            throw new UnsupportedOperationException(
                type.asPrimitiveType().getPrimitiveTypeName().name());
        }
      } else {
        assert pigType.type == DataType.TUPLE;
        recordConsumer.startGroup();
        writeTuple(type.asGroupType(), pigType.schema, (Tuple) t.get(i));
        recordConsumer.endGroup();
      }
    } catch (Exception e) {
      throw new ParquetEncodingException(
          "cannot write value at " + i + " in tuple " + t + " from type '" + pigType + "' to type '" + type + "'",
          e);
    }
  }
}
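
For reference, a minimal, self-contained usage sketch (not part of the original file). It assumes parquet-hadoop's ParquetWriter(Path, WriteSupport) convenience constructor, which is deprecated in recent releases but still available; the output path and schema string are hypothetical.

// Usage sketch: writing Pig tuples straight to a Parquet file through
// TupleWriteSupport. ParquetWriter calls init() and prepareForWrite()
// internally before delegating each record to write(Tuple).
import java.util.Arrays;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.hadoop.ParquetWriter;
import org.apache.parquet.pig.TupleWriteSupport;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;

public class TupleWriteSupportExample {
  public static void main(String[] args) throws Exception {
    TupleWriteSupport writeSupport = TupleWriteSupport.fromPigSchema("a:int, b:chararray");
    try (ParquetWriter<Tuple> writer =
        new ParquetWriter<Tuple>(new Path("target/example.parquet"), writeSupport)) {
      // one record matching the schema: an int and a chararray
      writer.write(TupleFactory.getInstance().newTuple(Arrays.asList(1, "foo")));
    }
  }
}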