org.apache.parquet.hadoop.api.ReadSupport Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of parquet-hadoop Show documentation
There is a newer version: 1.14.4
/* 
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 * 
 *   http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.parquet.hadoop.api;

import java.util.Map;

import org.apache.hadoop.conf.Configuration;

import org.apache.parquet.io.api.RecordMaterializer;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;

/**
 * Abstraction used by the {@link org.apache.parquet.hadoop.ParquetInputFormat} to materialize records
 *
 * @author Julien Le Dem
 *
 * @param <T> the type of the materialized record
 */
public abstract class ReadSupport<T> {

  /**
   * configuration key for a parquet read projection schema
   */
  public static final String PARQUET_READ_SCHEMA = "parquet.read.schema";

  /**
   * attempts to validate and construct a {@link MessageType} from a read projection schema
   *
   * @param fileMessageType         the typed schema of the source
   * @param partialReadSchemaString the requested projection schema; when null no projection
   *                                is applied and {@code fileMessageType} is returned as is
   * @return the typed schema that should be used to read
   */
  public static MessageType getSchemaForRead(MessageType fileMessageType, String partialReadSchemaString) {
    if (partialReadSchemaString == null) {
      // no projection requested: read with the schema of the file
      return fileMessageType;
    }
    MessageType requestedMessageType = MessageTypeParser.parseMessageType(partialReadSchemaString);
    return getSchemaForRead(fileMessageType, requestedMessageType);
  }

  /**
   * validates an already parsed read projection schema against the schema of the file
   * by calling {@code fileMessageType.checkContains(projectedMessageType)}
   *
   * @param fileMessageType      the typed schema of the source
   * @param projectedMessageType the requested projection schema
   * @return {@code projectedMessageType}, unchanged, once the check has passed
   */
  public static MessageType getSchemaForRead(MessageType fileMessageType, MessageType projectedMessageType) {
    fileMessageType.checkContains(projectedMessageType);
    return projectedMessageType;
  }

  /**
   * called in {@link org.apache.hadoop.mapreduce.InputFormat#getSplits(org.apache.hadoop.mapreduce.JobContext)} in the front end
   *
   * @param configuration    the job configuration
   * @param keyValueMetaData the app specific metadata from the file
   * @param fileSchema       the schema of the file
   * @return the readContext that defines how to read the file
   * @throws UnsupportedOperationException always, unless this method is overridden
   *
   * @deprecated override {@link ReadSupport#init(InitContext)} instead
   */
  @Deprecated
  public ReadContext init(
          Configuration configuration,
          Map<String, String> keyValueMetaData,
          MessageType fileSchema) {
    throw new UnsupportedOperationException("Override init(InitContext)");
  }

  /**
   * called in {@link org.apache.hadoop.mapreduce.InputFormat#getSplits(org.apache.hadoop.mapreduce.JobContext)} in the front end
   * <p>
   * The default implementation delegates to the deprecated
   * {@code init(Configuration, Map, MessageType)} so that implementations which only
   * override the deprecated method keep working. An implementation that overrides neither
   * method gets the {@link UnsupportedOperationException} thrown by the deprecated one.
   *
   * @param context the initialisation context
   * @return the readContext that defines how to read the file
   */
  public ReadContext init(InitContext context) {
    return init(context.getConfiguration(), context.getMergedKeyValueMetaData(), context.getFileSchema());
  }

  /**
   * called in {@link org.apache.hadoop.mapreduce.RecordReader#initialize(org.apache.hadoop.mapreduce.InputSplit, org.apache.hadoop.mapreduce.TaskAttemptContext)} in the back end
   * the returned RecordMaterializer will materialize the records and add them to the destination
   *
   * @param configuration    the job configuration
   * @param keyValueMetaData the app specific metadata from the file
   * @param fileSchema       the schema of the file
   * @param readContext      returned by the init method
   * @return the recordMaterializer that will materialize the records
   */
  public abstract RecordMaterializer<T> prepareForRead(
          Configuration configuration,
          Map<String, String> keyValueMetaData,
          MessageType fileSchema,
          ReadContext readContext);

  /**
   * information to read the file
   *
   * @author Julien Le Dem
   *
   */
  public static final class ReadContext {
    private final MessageType requestedSchema;
    // null when the single-argument constructor was used
    private final Map<String, String> readSupportMetadata;

    /**
     * creates a context without any ReadSupport specific metadata
     *
     * @param requestedSchema the schema requested by the user. Can not be null.
     * @throws NullPointerException if requestedSchema is null
     */
    public ReadContext(MessageType requestedSchema) {
      this(requestedSchema, null);
    }

    /**
     * @param requestedSchema the schema requested by the user. Can not be null.
     * @param readSupportMetadata metadata specific to the ReadSupport implementation. Will be available in the prepareForRead phase. May be null.
     * @throws NullPointerException if requestedSchema is null
     */
    public ReadContext(MessageType requestedSchema, Map<String, String> readSupportMetadata) {
      if (requestedSchema == null) {
        throw new NullPointerException("requestedSchema");
      }
      this.requestedSchema = requestedSchema;
      // the map is stored by reference, not copied
      this.readSupportMetadata = readSupportMetadata;
    }

    /**
     * @return the schema requested by the user, never null
     */
    public MessageType getRequestedSchema() {
      return requestedSchema;
    }

    /**
     * @return metadata specific to the ReadSupport implementation, or null if none was provided
     */
    public Map<String, String> getReadSupportMetadata() {
      return readSupportMetadata;
    }
  }
}