/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.parquet.avro;

import org.apache.avro.Schema;
import org.apache.avro.generic.IndexedRecord;
import org.apache.hadoop.mapreduce.Job;
import org.apache.parquet.hadoop.ParquetInputFormat;
import org.apache.parquet.hadoop.util.ContextUtil;

/**
 * A Hadoop {@link org.apache.hadoop.mapreduce.InputFormat} for Parquet files.
 *
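 * <p>A minimal job wiring sketch; the configuration, job name, and input path
 * below are illustrative, not part of this class:
 * <pre>{@code
 * Job job = Job.getInstance(new Configuration(), "read-users");
 * job.setInputFormatClass(AvroParquetInputFormat.class);
 * FileInputFormat.addInputPath(job, new Path("/data/users"));
 * }</pre>
 *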
 * @param <T> the Java type of objects produced by this InputFormat
 */
public class AvroParquetInputFormat<T extends IndexedRecord> extends ParquetInputFormat<T> {
  public AvroParquetInputFormat() {
    super(AvroReadSupport.class);
  }

  /**
   * Set the subset of columns to read (projection pushdown). Specified as an Avro
   * schema, the requested projection is converted into a Parquet schema for Parquet
   * column projection.
   *
   * <p>This is useful if the full schema is large and you only want to read a few
   * columns, since it saves time by not reading unused columns.
   *
   * <p>If a requested projection is set, then the Avro schema used for reading
   * must be compatible with the projection. For instance, if a column is not included
   * in the projection then it must either not be included or be optional in the read
   * schema. Use {@link #setAvroReadSchema(org.apache.hadoop.mapreduce.Job,
   * org.apache.avro.Schema)} to set a read schema, if needed.
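   *
   * <p>For example, a sketch of a projection that reads only two columns; the
   * {@code User} record and its field names are hypothetical:
   * <pre>{@code
   * Schema projection = SchemaBuilder.record("User").fields()
   *     .requiredLong("id")
   *     .requiredString("name")
   *     .endRecord();
   * AvroParquetInputFormat.setRequestedProjection(job, projection);
   * }</pre>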
   * @param job a job
   * @param requestedProjection the requested projection schema
   * @see #setAvroReadSchema(org.apache.hadoop.mapreduce.Job, org.apache.avro.Schema)
   * @see org.apache.parquet.avro.AvroParquetOutputFormat#setSchema(org.apache.hadoop.mapreduce.Job, org.apache.avro.Schema)
   */
  public static void setRequestedProjection(Job job, Schema requestedProjection) {
    AvroReadSupport.setRequestedProjection(ContextUtil.getConfiguration(job),
        requestedProjection);
  }

  /**
   * Override the Avro schema to use for reading. If not set, the Avro schema used
   * for writing is used.
   *
   * <p>Differences between the read and write schemas are resolved using Avro's
   * schema resolution rules.
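   *
   * <p>For example, a sketch of a read schema that adds an optional field on top
   * of the writer schema; the {@code User} record and its field names are
   * hypothetical:
   * <pre>{@code
   * Schema readSchema = SchemaBuilder.record("User").fields()
   *     .requiredLong("id")
   *     .requiredString("name")
   *     .optionalString("email") // missing in older files, resolves to null
   *     .endRecord();
   * AvroParquetInputFormat.setAvroReadSchema(job, readSchema);
   * }</pre>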
   * @param job a job
   * @param avroReadSchema the requested schema
   * @see #setRequestedProjection(org.apache.hadoop.mapreduce.Job, org.apache.avro.Schema)
   * @see org.apache.parquet.avro.AvroParquetOutputFormat#setSchema(org.apache.hadoop.mapreduce.Job, org.apache.avro.Schema)
   */
  public static void setAvroReadSchema(Job job, Schema avroReadSchema) {
    AvroReadSupport.setAvroReadSchema(ContextUtil.getConfiguration(job), avroReadSchema);
  }

  /**
   * Uses an instance of the specified {@link AvroDataSupplier} class to control
   * how the {@link org.apache.avro.specific.SpecificData} instance used to find
   * Avro specific record classes is created.
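   *
   * <p>For example, a sketch of a supplier that swaps in reflection-based record
   * handling; {@code ReflectDataSupplier} is a hypothetical name:
   * <pre>{@code
   * public class ReflectDataSupplier implements AvroDataSupplier {
   *   public SpecificData get() {
   *     return ReflectData.get(); // ReflectData extends SpecificData
   *   }
   * }
   * AvroParquetInputFormat.setAvroDataSupplier(job, ReflectDataSupplier.class);
   * }</pre>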
   * @param job a job
   * @param supplierClass an {@link AvroDataSupplier} class
   */
  public static void setAvroDataSupplier(Job job,
      Class<? extends AvroDataSupplier> supplierClass) {
    AvroReadSupport.setAvroDataSupplier(ContextUtil.getConfiguration(job), supplierClass);
  }
}