All downloads are free. Search and download functionality uses the official Maven repository.

parquet.pig.convert.TupleConverter Maven / Gradle / Ivy

There is a newer version: 1.6.0
Show newest version
/**
 * Copyright 2012 Twitter, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package parquet.pig.convert;

import static java.lang.Math.max;
import java.util.ArrayList;
import java.util.List;

import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.data.DataByteArray;
import org.apache.pig.data.DataType;
import org.apache.pig.data.NonSpillableDataBag;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.apache.pig.impl.logicalLayer.FrontendException;
import org.apache.pig.impl.logicalLayer.schema.Schema;
import org.apache.pig.impl.logicalLayer.schema.Schema.FieldSchema;

import parquet.column.Dictionary;
import parquet.io.ParquetDecodingException;
import parquet.io.api.Binary;
import parquet.io.api.Converter;
import parquet.io.api.GroupConverter;
import parquet.io.api.PrimitiveConverter;
import parquet.pig.TupleConversionException;
import parquet.schema.GroupType;
import parquet.schema.OriginalType;
import parquet.schema.PrimitiveType;
import parquet.schema.Type;
import parquet.schema.Type.Repetition;

/**
 * Converts a Parquet group into a Pig {@link Tuple}.
 *
 * @author Julien Le Dem
 *
 */

public class TupleConverter extends GroupConverter {

  /** Shared Pig factory used to allocate tuples. */
  private static final TupleFactory TF = TupleFactory.getInstance();

  /** Larger of the Parquet field count and the Pig field count; also the arity of every tuple. */
  private final int schemaSize;

  /** Tuple being filled for the current record; replaced on every call to {@link #start()}. */
  protected Tuple currentTuple;
  /** Child converters handed out by {@link #getConverter(int)}. */
  private final Converter[] converters;

  /** Parquet schema of the group being converted. */
  private final GroupType parquetSchema;

  /** When set, {@link #start()} pre-fills optional numeric and boolean primitive fields with zero. */
  private final boolean elephantBirdCompatible;

  /**
   * Creates the child converters for every Pig field that can be matched to a Parquet field.
   * Converters are stored contiguously from slot 0 in the order they are created, while each
   * one writes its value at the position of its Pig field in the tuple.
   *
   * @param parquetSchema the Parquet group type to convert
   * @param pigSchema the Pig schema describing the tuples to produce
   * @param elephantBirdCompatible enables zero defaults in {@link #start()}; also passed on to child converters
   * @param columnIndexAccess if true, Parquet fields are looked up by position, otherwise by Pig alias
   * @throws ParquetDecodingException if a field lookup on the Pig schema fails
   */
  public TupleConverter(GroupType parquetSchema, Schema pigSchema, boolean elephantBirdCompatible, boolean columnIndexAccess) {
    this.parquetSchema = parquetSchema;
    this.elephantBirdCompatible = elephantBirdCompatible;
    try {
      this.schemaSize = max(parquetSchema.getFieldCount(), pigSchema.getFields().size());
      this.converters = new Converter[this.schemaSize];
      int nextSlot = 0;
      for (int position = 0; position < this.schemaSize; position++) {
        final FieldSchema pigField = pigSchema.getField(position);
        final boolean resolvable = parquetSchema.containsField(pigField.alias) || columnIndexAccess;
        if (!resolvable) {
          continue;
        }
        final Type parquetType = getType(columnIndexAccess, pigField.alias, position);
        if (parquetType == null) {
          // positional access past the end of the Parquet schema: nothing to read
          continue;
        }
        final int tupleIndex = position;
        final ParentValueContainer target = new ParentValueContainer() {
          @Override
          void add(Object value) {
            TupleConverter.this.set(tupleIndex, value);
          }
        };
        this.converters[nextSlot++] = newConverter(pigField, parquetType, target, elephantBirdCompatible, columnIndexAccess);
      }
    } catch (FrontendException e) {
      throw new ParquetDecodingException("can not initialize pig converter from:\n" + parquetSchema + "\n" + pigSchema, e);
    }
  }

  /**
   * Looks up the Parquet type backing a Pig field.
   *
   * @param columnIndexAccess if true the lookup is positional, otherwise it goes through the alias
   * @param alias the Pig field alias, used only for lookup by name
   * @param index the field position, used only for positional lookup
   * @return the matching Parquet type, or {@code null} when a positional lookup is past the
   *         last Parquet field
   */
  private Type getType(boolean columnIndexAccess, String alias, int index) {
    if (!columnIndexAccess) {
      final int fieldIndex = parquetSchema.getFieldIndex(alias);
      return parquetSchema.getType(fieldIndex);
    }
    return index < parquetSchema.getFieldCount() ? parquetSchema.getType(index) : null;
  }
  
  /**
   * Picks the converter implementation matching the Pig type of a field.
   *
   * @param pigField the Pig field to populate
   * @param type the Parquet type the values are read from
   * @param parent receives every converted value
   * @param elephantBirdCompatible if true, Pig booleans are read through the integer converter;
   *        also passed on to nested bag, map and tuple converters
   * @param columnIndexAccess passed on to nested bag, map and tuple converters
   * @return a converter that pushes its values into {@code parent}
   * @throws TupleConversionException if the Pig type is not supported or building the converter fails
   */
  static Converter newConverter(FieldSchema pigField, Type type, final ParentValueContainer parent, boolean elephantBirdCompatible, boolean columnIndexAccess) {
    try {
      switch (pigField.type) {
      case DataType.INTEGER:
        return new FieldIntegerConverter(parent);
      case DataType.LONG:
        return new FieldLongConverter(parent);
      case DataType.FLOAT:
        return new FieldFloatConverter(parent);
      case DataType.DOUBLE:
        return new FieldDoubleConverter(parent);
      case DataType.BOOLEAN:
        return elephantBirdCompatible
            ? new FieldIntegerConverter(parent)
            : new FieldBooleanConverter(parent);
      case DataType.BYTEARRAY:
        return new FieldByteArrayConverter(parent);
      case DataType.CHARARRAY:
        // Dictionary decoding is only enabled for UTF8-annotated columns; any other
        // original type would need its own dictionary handling, so it goes without one.
        return new FieldStringConverter(parent, type.getOriginalType() == OriginalType.UTF8);
      case DataType.TUPLE:
        return new TupleConverter(type.asGroupType(), pigField.schema, elephantBirdCompatible, columnIndexAccess) {
          @Override
          public void end() {
            super.end();
            // a nested tuple is complete: hand it to the enclosing converter
            parent.add(this.currentTuple);
          }
        };
      case DataType.BAG:
        return new BagConverter(type.asGroupType(), pigField, parent, elephantBirdCompatible, columnIndexAccess);
      case DataType.MAP:
        return new MapConverter(type.asGroupType(), pigField, parent, elephantBirdCompatible, columnIndexAccess);
      default:
        throw new TupleConversionException("unsupported pig type: " + pigField);
      }
    } catch (FrontendException e) {
      throw new TupleConversionException(preparationFailureMessage(pigField, type), e);
    } catch (RuntimeException e) {
      throw new TupleConversionException(preparationFailureMessage(pigField, type), e);
    }
  }

  /** Builds the message used when a converter cannot be prepared for a field. */
  private static String preparationFailureMessage(FieldSchema pigField, Type type) {
    return "error while preparing converter for:\n" + pigField + "\n" + type;
  }

  /**
   * Returns the converter stored in the given slot.
   *
   * @param fieldIndex the slot to read
   * @return the converter in that slot; {@code null} if the constructor did not fill it
   */
  @Override
  public Converter getConverter(int fieldIndex) {
    final Converter converter = converters[fieldIndex];
    return converter;
  }

  /** Zero defaults written by {@link #start()} in elephant-bird compatible mode. */
  private static final Integer I32_ZERO = Integer.valueOf(0);
  private static final Long I64_ZERO = Long.valueOf(0L);
  private static final Float FLOAT_ZERO = Float.valueOf(0f);
  private static final Double DOUBLE_ZERO = Double.valueOf(0d);

  /**
   * Allocates a fresh tuple for the record about to be read. In elephant-bird compatible
   * mode every optional primitive field of type INT32, INT64, FLOAT, DOUBLE or BOOLEAN is
   * pre-set to zero (booleans use the integer zero); all other fields are left untouched.
   *
   * @throws RuntimeException wrapping the {@link ExecException} if a default cannot be set
   */
  @Override
  public final void start() {
    currentTuple = TF.newTuple(schemaSize);
    if (!elephantBirdCompatible) {
      return;
    }
    try {
      int nextPosition = 0;
      for (Type field : parquetSchema.getFields()) {
        final int position = nextPosition++;
        if (!field.isPrimitive() || !field.isRepetition(Repetition.OPTIONAL)) {
          continue;
        }
        switch (field.asPrimitiveType().getPrimitiveTypeName()) {
        case INT32:
        case BOOLEAN:
          currentTuple.set(position, I32_ZERO);
          break;
        case INT64:
          currentTuple.set(position, I64_ZERO);
          break;
        case FLOAT:
          currentTuple.set(position, FLOAT_ZERO);
          break;
        case DOUBLE:
          currentTuple.set(position, DOUBLE_ZERO);
          break;
        default:
          // remaining primitive types get no default
          break;
        }
      }
    } catch (ExecException e) {
      throw new RuntimeException(e);
    }
  }

  /**
   * Stores a converted value in the tuple currently being built.
   *
   * @param fieldIndex position in the tuple
   * @param value the value to store
   * @throws TupleConversionException if the tuple rejects the value
   */
  final void set(int fieldIndex, Object value) {
    try {
      currentTuple.set(fieldIndex, value);
    } catch (ExecException e) {
      final String message =
          "Could not set " + value + " to current tuple " + currentTuple + " at " + fieldIndex;
      throw new TupleConversionException(message, e);
    }
  }

  /**
   * No-op in this class. The anonymous subclass that {@code newConverter} creates for
   * nested tuples overrides this method to pass the finished tuple to its parent container.
   */
  @Override
  public void end() {
    // nothing to do: the tuple stays available through getCurrentTuple()
  }

  /**
   * @return the tuple created by the most recent call to {@link #start()}
   */
  public final Tuple getCurrentTuple() {
    return this.currentTuple;
  }

  /**
   * Converter for string fields.
   * Binary values are decoded as UTF-8 and every other primitive is rendered as its
   * decimal/textual form. When dictionary support is enabled, all dictionary entries are
   * decoded up front so that each string is decoded only once.
   * @author Julien Le Dem
   *
   */
  static final class FieldStringConverter extends PrimitiveConverter {

    private final ParentValueContainer parent;

    /** Value reported by {@link #hasDictionarySupport()}. */
    private final boolean dictionarySupport;
    /** Decoded dictionary entries indexed by dictionary id; assigned in {@link #setDictionary}. */
    private String[] dict;

    /**
     * @param parent receives every converted string
     * @param dictionarySupport whether this converter advertises dictionary support
     */
    public FieldStringConverter(ParentValueContainer parent, boolean dictionarySupport) {
      this.parent = parent;
      this.dictionarySupport = dictionarySupport;
    }

    @Override
    public final void addBinary(Binary value) {
      final String decoded = value.toStringUsingUTF8();
      parent.add(decoded);
    }

    @Override
    public boolean hasDictionarySupport() {
      return this.dictionarySupport;
    }

    /** Decodes every dictionary entry, ids 0 through {@code getMaxId()}, into a string. */
    @Override
    public void setDictionary(Dictionary dictionary) {
      dict = new String[dictionary.getMaxId() + 1];
      for (int id = 0; id < dict.length; id++) {
        final Binary entry = dictionary.decodeToBinary(id);
        dict[id] = entry.toStringUsingUTF8();
      }
    }

    @Override
    public void addValueFromDictionary(int dictionaryId) {
      final String decoded = dict[dictionaryId];
      parent.add(decoded);
    }

    @Override
    public void addLong(long value) {
      parent.add(String.valueOf(value));
    }

    @Override
    public void addInt(int value) {
      parent.add(String.valueOf(value));
    }

    @Override
    public void addFloat(float value) {
      parent.add(String.valueOf(value));
    }

    @Override
    public void addDouble(double value) {
      parent.add(String.valueOf(value));
    }

    @Override
    public void addBoolean(boolean value) {
      parent.add(String.valueOf(value));
    }

  }

  /**
   * Converter for byte array fields: wraps the bytes of each binary value in a
   * {@link DataByteArray}.
   * @author Julien Le Dem
   *
   */
  static final class FieldByteArrayConverter extends PrimitiveConverter {

    private final ParentValueContainer parent;

    /**
     * @param parent receives every converted value
     */
    public FieldByteArrayConverter(ParentValueContainer parent) {
      this.parent = parent;
    }

    @Override
    public final void addBinary(Binary value) {
      final byte[] raw = value.getBytes();
      parent.add(new DataByteArray(raw));
    }

  }

  /**
   * Converter for double fields.
   * Other numeric primitives are widened to double, booleans become 1.0 or 0.0 and
   * binary values are parsed from their UTF-8 text.
   * @author Julien Le Dem
   *
   */
  static final class FieldDoubleConverter extends PrimitiveConverter {

    private final ParentValueContainer parent;

    /**
     * @param parent receives every converted value as a {@link Double}
     */
    public FieldDoubleConverter(ParentValueContainer parent) {
      this.parent = parent;
    }

    @Override
    public final void addDouble(double value) {
      parent.add(Double.valueOf(value));
    }

    @Override
    public void addLong(long value) {
      final double widened = value;
      parent.add(widened);
    }

    @Override
    public void addInt(int value) {
      final double widened = value;
      parent.add(widened);
    }

    @Override
    public void addFloat(float value) {
      final double widened = value;
      parent.add(widened);
    }

    @Override
    public void addBoolean(boolean value) {
      if (value) {
        parent.add(1.0d);
      } else {
        parent.add(0.0d);
      }
    }

    @Override
    public void addBinary(Binary value) {
      final double parsed = Double.parseDouble(value.toStringUsingUTF8());
      parent.add(parsed);
    }

  }

  /**
   * Converter for float fields.
   * Other numeric primitives are converted to float (doubles through a narrowing cast),
   * booleans become 1.0 or 0.0 and binary values are parsed from their UTF-8 text.
   * @author Julien Le Dem
   *
   */
  static final class FieldFloatConverter extends PrimitiveConverter {

    private final ParentValueContainer parent;

    /**
     * @param parent receives every converted value as a {@link Float}
     */
    public FieldFloatConverter(ParentValueContainer parent) {
      this.parent = parent;
    }

    @Override
    public final void addFloat(float value) {
      parent.add(Float.valueOf(value));
    }

    @Override
    public void addLong(long value) {
      final float converted = value;
      parent.add(converted);
    }

    @Override
    public void addInt(int value) {
      final float converted = value;
      parent.add(converted);
    }

    @Override
    public void addDouble(double value) {
      final float narrowed = (float) value;
      parent.add(narrowed);
    }

    @Override
    public void addBoolean(boolean value) {
      if (value) {
        parent.add(1.0f);
      } else {
        parent.add(0.0f);
      }
    }

    @Override
    public void addBinary(Binary value) {
      final float parsed = Float.parseFloat(value.toStringUsingUTF8());
      parent.add(parsed);
    }

  }

  /**
   * Converter for long fields.
   * Ints are widened, floats and doubles go through a narrowing cast to long, booleans
   * become 1 or 0 and binary values are parsed from their UTF-8 text.
   *
   * @author Julien Le Dem
   *
   */
  static final class FieldLongConverter extends PrimitiveConverter {

    private final ParentValueContainer parent;

    /**
     * @param parent receives every converted value as a {@link Long}
     */
    public FieldLongConverter(ParentValueContainer parent) {
      this.parent = parent;
    }

    @Override
    public final void addLong(long value) {
      parent.add(Long.valueOf(value));
    }

    @Override
    public void addInt(int value) {
      final long widened = value;
      parent.add(widened);
    }

    @Override
    public void addFloat(float value) {
      final long narrowed = (long) value;
      parent.add(narrowed);
    }

    @Override
    public void addDouble(double value) {
      final long narrowed = (long) value;
      parent.add(narrowed);
    }

    @Override
    public void addBoolean(boolean value) {
      if (value) {
        parent.add(1L);
      } else {
        parent.add(0L);
      }
    }

    @Override
    public void addBinary(Binary value) {
      final long parsed = Long.parseLong(value.toStringUsingUTF8());
      parent.add(parsed);
    }

  }

  /**
   * Converter for integer fields.
   * Longs, floats and doubles go through a plain narrowing cast to int (no range check),
   * booleans become 1 or 0 and binary values are parsed from their UTF-8 text.
   * @author Julien Le Dem
   *
   */
  static final class FieldIntegerConverter extends PrimitiveConverter {

    private final ParentValueContainer parent;

    /**
     * @param parent receives every converted value as an {@link Integer}
     */
    public FieldIntegerConverter(ParentValueContainer parent) {
      this.parent = parent;
    }

    @Override
    public final void addBoolean(boolean value) {
      if (value) {
        parent.add(1);
      } else {
        parent.add(0);
      }
    }

    @Override
    public final void addInt(int value) {
      parent.add(Integer.valueOf(value));
    }

    @Override
    public void addLong(long value) {
      final int narrowed = (int) value;
      parent.add(narrowed);
    }

    @Override
    public void addFloat(float value) {
      final int narrowed = (int) value;
      parent.add(narrowed);
    }

    @Override
    public void addDouble(double value) {
      final int narrowed = (int) value;
      parent.add(narrowed);
    }

    @Override
    public void addBinary(Binary value) {
      final int parsed = Integer.parseInt(value.toStringUsingUTF8());
      parent.add(parsed);
    }

  }

  /**
   * Converter for boolean fields.
   * Numeric primitives are true when they differ from zero; binary values are parsed
   * from their UTF-8 text with {@link Boolean#parseBoolean(String)}.
   * @author Julien Le Dem
   *
   */
  static final class FieldBooleanConverter extends PrimitiveConverter {

    private final ParentValueContainer parent;

    /**
     * @param parent receives every converted value as a {@link Boolean}
     */
    public FieldBooleanConverter(ParentValueContainer parent) {
      this.parent = parent;
    }

    @Override
    public final void addBoolean(boolean value) {
      parent.add(Boolean.valueOf(value));
    }

    @Override
    public final void addInt(int value) {
      final boolean nonZero = value != 0;
      parent.add(nonZero);
    }

    @Override
    public void addLong(long value) {
      final boolean nonZero = value != 0;
      parent.add(nonZero);
    }

    @Override
    public void addFloat(float value) {
      final boolean nonZero = value != 0;
      parent.add(nonZero);
    }

    @Override
    public void addDouble(double value) {
      final boolean nonZero = value != 0;
      parent.add(nonZero);
    }

    @Override
    public void addBinary(Binary value) {
      final boolean parsed = Boolean.parseBoolean(value.toStringUsingUTF8());
      parent.add(parsed);
    }

  }

  /**
   * Converts a Parquet group with a single field into a Pig bag.
   * Every value produced by the child converter is buffered as a tuple, and the buffered
   * tuples are handed to the parent as one bag when the group ends.
   *
   * @author Julien Le Dem
   *
   */
  static class BagConverter extends GroupConverter {

    /** Tuples collected for the bag being read; emptied by {@link #start()}. */
    private final List<Tuple> buffer = new ArrayList<Tuple>();
    /** Converter for the single field of the bag group. */
    private final Converter child;
    /** Receives the finished bag. */
    private final ParentValueContainer parent;

    /**
     * @param parquetSchema the group type of the bag; must have exactly one field
     * @param pigSchema the Pig field describing the bag
     * @param parent receives the finished bag
     * @param numbersDefaultToZero passed to the child converter as its elephant-bird compatibility flag
     * @param columnIndexAccess passed to the child converter
     * @throws IllegalArgumentException if the group does not have exactly one field
     * @throws FrontendException if the Pig schema of the bag content cannot be read
     */
    BagConverter(GroupType parquetSchema, FieldSchema pigSchema, ParentValueContainer parent, boolean numbersDefaultToZero, boolean columnIndexAccess) throws FrontendException {
      this.parent = parent;
      if (parquetSchema.getFieldCount() != 1) {
        throw new IllegalArgumentException("bags have only one field. " + parquetSchema + " size = " + parquetSchema.getFieldCount());
      }
      Type nestedType = parquetSchema.getType(0);

      ParentValueContainer childsParent;
      FieldSchema pigField;
      if (nestedType.isPrimitive() || nestedType.getOriginalType() == OriginalType.MAP || nestedType.getOriginalType() == OriginalType.LIST) {
        // Pig bags always contain tuples
        // In that case we need to wrap the value in an extra tuple
        childsParent = new ParentValueContainer() {
          @Override
          void add(Object value) {
            buffer.add(TF.newTuple(value));
          }};
        // the Pig field to convert is the single field of the tuple inside the bag
        pigField = pigSchema.schema.getField(0).schema.getField(0);
      } else {
        // the child already produces tuples: buffer them as they are
        childsParent = new ParentValueContainer() {
          @Override
          void add(Object value) {
            buffer.add((Tuple)value);
          }};
        pigField = pigSchema.schema.getField(0);
      }
      child = newConverter(pigField, nestedType, childsParent, numbersDefaultToZero, columnIndexAccess);
    }

    /**
     * @param fieldIndex must be 0, as a bag group has a single field
     * @return the converter for the bag content
     * @throws IllegalArgumentException if {@code fieldIndex} is not 0
     */
    @Override
    public Converter getConverter(int fieldIndex) {
      if (fieldIndex != 0) {
        throw new IllegalArgumentException("bags have only one field. can't reach " + fieldIndex);
      }
      return child;
    }


    /** Empties the buffer before the values of a new bag arrive. */
    @Override
    final public void start() {
      buffer.clear();
    }

    /** Hands the buffered tuples to the parent as a bag. */
    @Override
    public void end() {
      // copy the buffer: it is reused and cleared by the next start()
      parent.add(new NonSpillableDataBag(new ArrayList<Tuple>(buffer)));
    }

  }

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy