parquet.hadoop.api.ReadSupport Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of parquet-hadoop Show documentation
There is a newer version: 1.6.0
/**
 * Copyright 2012 Twitter, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package parquet.hadoop.api;

import java.util.Map;

import org.apache.hadoop.conf.Configuration;

import parquet.io.api.RecordMaterializer;
import parquet.schema.MessageType;
import parquet.schema.MessageTypeParser;

/**
 * Abstraction used by the {@link parquet.hadoop.ParquetInputFormat} to materialize records
 *
 * @author Julien Le Dem
 *
 * @param  the type of the materialized record
 */
abstract public class ReadSupport {

  /**
   * configuration key for a parquet read projection schema
   */
	public static final String PARQUET_READ_SCHEMA = "parquet.read.schema";

  /**
   * attempts to validate and construct a {@link MessageType} from a read projection schema
   *
   * @param fileMessageType         the typed schema of the source
   * @param partialReadSchemaString the requested projection schema
   * @return the typed schema that should be used to read
   */
  public static MessageType getSchemaForRead(MessageType fileMessageType, String partialReadSchemaString) {
    if (partialReadSchemaString == null)
      return fileMessageType;
    MessageType requestedMessageType = MessageTypeParser.parseMessageType(partialReadSchemaString);
    return getSchemaForRead(fileMessageType, requestedMessageType);
  }

  public static MessageType getSchemaForRead(MessageType fileMessageType, MessageType projectedMessageType) {
    fileMessageType.checkContains(projectedMessageType);
    return projectedMessageType;
  }

  /**
   * called in {@link org.apache.hadoop.mapreduce.InputFormat#getSplits(org.apache.hadoop.mapreduce.JobContext)} in the front end
   *
   * @param configuration    the job configuration
   * @param keyValueMetaData the app specific metadata from the file
   * @param fileSchema       the schema of the file
   * @return the readContext that defines how to read the file
   *
   * @deprecated override {@link ReadSupport#init(InitContext)} instead
   */
  @Deprecated
  public ReadContext init(
          Configuration configuration,
          Map keyValueMetaData,
          MessageType fileSchema) {
    throw new UnsupportedOperationException("Override init(InitContext)");
  }

  /**
   * called in {@link org.apache.hadoop.mapreduce.InputFormat#getSplits(org.apache.hadoop.mapreduce.JobContext)} in the front end
   *
   * @param context the initialisation context
   * @return the readContext that defines how to read the file
   */
  public ReadContext init(InitContext context) {
    return init(context.getConfiguration(), context.getMergedKeyValueMetaData(), context.getFileSchema());
  }

  /**
   * called in {@link org.apache.hadoop.mapreduce.RecordReader#initialize(org.apache.hadoop.mapreduce.InputSplit, org.apache.hadoop.mapreduce.TaskAttemptContext)} in the back end
   * the returned RecordConsumer will materialize the records and add them to the destination
   *
   * @param configuration    the job configuration
   * @param keyValueMetaData the app specific metadata from the file
   * @param fileSchema       the schema of the file
   * @param readContext      returned by the init method
   * @return the recordConsumer that will receive the events
   */
  abstract public RecordMaterializer prepareForRead(
          Configuration configuration,
          Map keyValueMetaData,
          MessageType fileSchema,
          ReadContext readContext);

  /**
   * information to read the file
   *
   * @author Julien Le Dem
   *
   */
  public static final class ReadContext {
    private final MessageType requestedSchema;
    private final Map readSupportMetadata;

    /**
     * @param requestedSchema the schema requested by the user. Can not be null.
     */
    public ReadContext(MessageType requestedSchema) {
      this(requestedSchema, null);
    }

    /**
     * @param requestedSchema the schema requested by the user. Can not be null.
     * @param readSupportMetadata metadata specific to the ReadSupport implementation. Will be available in the prepareForRead phase.
     */
    public ReadContext(MessageType requestedSchema, Map readSupportMetadata) {
      super();
      if (requestedSchema == null) {
        throw new NullPointerException("requestedSchema");
      }
      this.requestedSchema = requestedSchema;
      this.readSupportMetadata = readSupportMetadata;
    }

    /**
     * @return the schema of the file
     */
    public MessageType getRequestedSchema() {
      return requestedSchema;
    }

    /**
     * @return metadata specific to the ReadSupport implementation
     */
    public Map getReadSupportMetadata() {
      return readSupportMetadata;
    }
  }
}