/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.parquet.pig;
import static org.apache.parquet.pig.PigSchemaConverter.parsePigSchema;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.hadoop.conf.Configuration;
import org.apache.parquet.conf.HadoopParquetConfiguration;
import org.apache.parquet.conf.ParquetConfiguration;
import org.apache.parquet.hadoop.api.InitContext;
import org.apache.parquet.hadoop.api.ReadSupport;
import org.apache.parquet.io.ParquetDecodingException;
import org.apache.parquet.io.api.RecordMaterializer;
import org.apache.parquet.pig.convert.TupleRecordMaterializer;
import org.apache.parquet.schema.IncompatibleSchemaModificationException;
import org.apache.parquet.schema.MessageType;
import org.apache.pig.LoadPushDown.RequiredFieldList;
import org.apache.pig.data.Tuple;
import org.apache.pig.impl.logicalLayer.FrontendException;
import org.apache.pig.impl.logicalLayer.schema.Schema;
import org.apache.pig.impl.logicalLayer.schema.Schema.FieldSchema;
import org.apache.pig.impl.util.ObjectSerializer;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Read support for Pig Tuple.
* A Pig MetaDataBlock is expected in the initialization call.
*/
public class TupleReadSupport extends ReadSupport<Tuple> {
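// Pig schema requested by the user, stored as a Pig schema string.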
static final String PARQUET_PIG_SCHEMA = "parquet.pig.schema";
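// When true, columns are matched by position rather than by name.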
static final String PARQUET_COLUMN_INDEX_ACCESS = "parquet.private.pig.column.index.access";
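// Serialized RequiredFieldList produced by Pig's pushProjection.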
static final String PARQUET_PIG_REQUIRED_FIELDS = "parquet.private.pig.required.fields";
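// Elephant-bird compatibility mode: numbers default to 0 and booleans are read as ints.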
static final String PARQUET_PIG_ELEPHANT_BIRD_COMPATIBLE = "parquet.pig.elephantbird.compatible";
private static final Logger LOG = LoggerFactory.getLogger(TupleReadSupport.class);
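// Converter used to derive a Pig schema from the Parquet file schema when the file metadata carries no Pig schema.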
private static final PigSchemaConverter pigSchemaConverter = new PigSchemaConverter(false);
/**
* @param configuration the configuration for the current job
* @return the pig schema requested by the user or null if none.
*/
static Schema getPigSchema(Configuration configuration) {
return getPigSchema(new HadoopParquetConfiguration(configuration));
}
/**
* @param configuration the configuration
* @return the pig schema requested by the user or null if none.
*/
static Schema getPigSchema(ParquetConfiguration configuration) {
return parsePigSchema(configuration.get(PARQUET_PIG_SCHEMA));
}
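// Illustrative only (assumed value): the property holds a Pig schema string such as
// "a:int, b:chararray", which parsePigSchema turns into a Pig Schema object.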
/**
* @param configuration configuration for the current job
* @return List of required fields from pushProjection
*/
static RequiredFieldList getRequiredFields(Configuration configuration) {
return getRequiredFields(new HadoopParquetConfiguration(configuration));
}
/**
* @param configuration configuration
* @return List of required fields from pushProjection
*/
static RequiredFieldList getRequiredFields(ParquetConfiguration configuration) {
String requiredFieldString = configuration.get(PARQUET_PIG_REQUIRED_FIELDS);
if (requiredFieldString == null) {
return null;
}
try {
return (RequiredFieldList) ObjectSerializer.deserialize(requiredFieldString);
} catch (IOException iOException) {
throw new RuntimeException("Failed to deserialize pushProjection", iOException);
}
}
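// Illustrative only (assumption): the property is expected to hold the output of
// ObjectSerializer.serialize(requiredFieldList), the counterpart of the deserialize call above.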
/**
* @param fileSchema the parquet schema from the file
* @param keyValueMetaData the extra meta data from the files
* @return the pig schema according to the file
*/
static Schema getPigSchemaFromMultipleFiles(MessageType fileSchema, Map<String, Set<String>> keyValueMetaData) {
Set<String> pigSchemas = PigMetaData.getPigSchemas(keyValueMetaData);
if (pigSchemas == null) {
return pigSchemaConverter.convert(fileSchema);
}
Schema mergedPigSchema = null;
for (String pigSchemaString : pigSchemas) {
try {
mergedPigSchema = union(mergedPigSchema, parsePigSchema(pigSchemaString));
} catch (FrontendException e) {
throw new ParquetDecodingException("can not merge " + pigSchemaString + " into " + mergedPigSchema, e);
}
}
return mergedPigSchema;
}
/**
* @param fileSchema the parquet schema from the file
* @param keyValueMetaData the extra meta data from the file
* @return the pig schema according to the file
*/
static Schema getPigSchemaFromFile(MessageType fileSchema, Map<String, String> keyValueMetaData) {
PigMetaData pigMetaData = PigMetaData.fromMetaData(keyValueMetaData);
if (pigMetaData == null) {
return pigSchemaConverter.convert(fileSchema);
}
return parsePigSchema(pigMetaData.getPigSchema());
}
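/**
* Merges two Pig schemas field by field: fields present in both are unioned
* recursively, fields present in only one of them are kept as-is.
*/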
private static Schema union(Schema merged, Schema pigSchema) throws FrontendException {
List<FieldSchema> fields = new ArrayList<FieldSchema>();
if (merged == null) {
return pigSchema;
}
// merging existing fields
for (FieldSchema fieldSchema : merged.getFields()) {
FieldSchema newFieldSchema = pigSchema.getField(fieldSchema.alias);
if (newFieldSchema == null) {
fields.add(fieldSchema);
} else {
fields.add(union(fieldSchema, newFieldSchema));
}
}
// adding new fields
for (FieldSchema newFieldSchema : pigSchema.getFields()) {
FieldSchema oldFieldSchema = merged.getField(newFieldSchema.alias);
if (oldFieldSchema == null) {
fields.add(newFieldSchema);
}
}
return new Schema(fields);
}
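/**
* Merges two field schemas that share the same alias and type by recursively
* unioning their inner schemas; anything else is an incompatible schema change.
*/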
private static FieldSchema union(FieldSchema mergedFieldSchema, FieldSchema newFieldSchema) {
if (!mergedFieldSchema.alias.equals(newFieldSchema.alias) || mergedFieldSchema.type != newFieldSchema.type) {
throw new IncompatibleSchemaModificationException(
"Incompatible Pig schema change: " + mergedFieldSchema + " can not accept");
}
try {
return new FieldSchema(
mergedFieldSchema.alias,
union(mergedFieldSchema.schema, newFieldSchema.schema),
mergedFieldSchema.type);
} catch (FrontendException e) {
throw new SchemaConversionException(e);
}
}
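// Determines the Parquet schema to read: the full file schema when no Pig schema was
// requested, otherwise the file schema filtered by the requested Pig schema and the
// pushed-down required fields.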
@Override
public ReadContext init(InitContext initContext) {
Schema pigSchema = getPigSchema(initContext.getParquetConfiguration());
RequiredFieldList requiredFields = getRequiredFields(initContext.getParquetConfiguration());
boolean columnIndexAccess =
initContext.getParquetConfiguration().getBoolean(PARQUET_COLUMN_INDEX_ACCESS, false);
if (pigSchema == null) {
return new ReadContext(initContext.getFileSchema());
} else {
// project the file schema according to the requested Pig schema
MessageType parquetRequestedSchema = new PigSchemaConverter(columnIndexAccess)
.filter(initContext.getFileSchema(), pigSchema, requiredFields);
return new ReadContext(parquetRequestedSchema);
}
}
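// Hadoop Configuration variant; delegates to the ParquetConfiguration overload below.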
@Override
public RecordMaterializer<Tuple> prepareForRead(
Configuration configuration,
Map<String, String> keyValueMetaData,
MessageType fileSchema,
ReadContext readContext) {
return prepareForRead(new HadoopParquetConfiguration(configuration), keyValueMetaData, fileSchema, readContext);
}
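// Builds the materializer that assembles Parquet records into Pig Tuples, honoring the
// elephant-bird compatibility and column index access settings.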
@Override
public RecordMaterializer<Tuple> prepareForRead(
ParquetConfiguration configuration,
Map<String, String> keyValueMetaData,
MessageType fileSchema,
ReadContext readContext) {
MessageType requestedSchema = readContext.getRequestedSchema();
Schema requestedPigSchema = getPigSchema(configuration);
if (requestedPigSchema == null) {
throw new ParquetDecodingException("Missing Pig schema: ParquetLoader sets the schema in the job conf");
}
boolean elephantBirdCompatible = configuration.getBoolean(PARQUET_PIG_ELEPHANT_BIRD_COMPATIBLE, false);
boolean columnIndexAccess = configuration.getBoolean(PARQUET_COLUMN_INDEX_ACCESS, false);
if (elephantBirdCompatible) {
LOG.info("Numbers will default to 0 instead of NULL; Boolean will be converted to Int");
}
return new TupleRecordMaterializer(
requestedSchema, requestedPigSchema, elephantBirdCompatible, columnIndexAccess);
}
}