All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.avro.mapreduce.AvroRecordReaderBase Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     https://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.  See the License for the specific language governing
 * permissions and limitations under the License.
 */

package org.apache.avro.mapreduce;

import java.io.IOException;

import org.apache.avro.Schema;
import org.apache.avro.file.DataFileReader;
import org.apache.avro.file.SeekableInput;
import org.apache.avro.generic.GenericData;
import org.apache.avro.hadoop.io.AvroSerialization;
import org.apache.avro.io.DatumReader;
import org.apache.avro.mapred.FsInput;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import com.facebook.presto.hive.$internal.org.slf4j.Logger;
import com.facebook.presto.hive.$internal.org.slf4j.LoggerFactory;

/**
 * Abstract base class for RecordReaders that read Avro container
 * files.
 *
 * @param  The type of key the record reader should generate.
 * @param  The type of value the record reader should generate.
 * @param  The type of the entries within the Avro container file being read.
 */
public abstract class AvroRecordReaderBase extends RecordReader {
  private static final Logger LOG = LoggerFactory.getLogger(AvroRecordReaderBase.class);

  /** The reader schema for the records within the input Avro container file. */
  private final Schema mReaderSchema;

  /** The current record from the Avro container file being read. */
  private T mCurrentRecord;

  /** A reader for the Avro container file containing the current input split. */
  private DataFileReader mAvroFileReader;

  /**
   * The byte offset into the Avro container file where the first block that fits
   * completely within the current input split begins.
   */
  private long mStartPosition;

  /**
   * The byte offset into the Avro container file where the current input split
   * ends.
   */
  private long mEndPosition;

  /**
   * Constructor.
   *
   * @param readerSchema The reader schema for the records of the Avro container
   *                     file.
   */
  protected AvroRecordReaderBase(Schema readerSchema) {
    mReaderSchema = readerSchema;
    mCurrentRecord = null;
  }

  /** {@inheritDoc} */
  @Override
  public void initialize(InputSplit inputSplit, TaskAttemptContext context) throws IOException, InterruptedException {
    if (!(inputSplit instanceof FileSplit)) {
      throw new IllegalArgumentException("Only compatible with FileSplits.");
    }
    FileSplit fileSplit = (FileSplit) inputSplit;

    // Open a seekable input stream to the Avro container file.
    SeekableInput seekableFileInput = createSeekableInput(context.getConfiguration(), fileSplit.getPath());

    // Wrap the seekable input stream in an Avro DataFileReader.
    Configuration conf = context.getConfiguration();
    GenericData dataModel = AvroSerialization.createDataModel(conf);
    DatumReader datumReader = dataModel.createDatumReader(mReaderSchema);
    mAvroFileReader = createAvroFileReader(seekableFileInput, datumReader);

    // Initialize the start and end offsets into the file based on the boundaries of
    // the
    // input split we're responsible for. We will read the first block that begins
    // after the input split start boundary. We will read up to but not including
    // the
    // first block that starts after input split end boundary.

    // Sync to the closest block/record boundary just after beginning of our input
    // split.
    mAvroFileReader.sync(fileSplit.getStart());

    // Initialize the start position to the beginning of the first block of the
    // input split.
    mStartPosition = mAvroFileReader.previousSync();

    // Initialize the end position to the end of the input split (this isn't
    // necessarily
    // on a block boundary so using this for reporting progress will be approximate.
    mEndPosition = fileSplit.getStart() + fileSplit.getLength();
  }

  /** {@inheritDoc} */
  @Override
  public boolean nextKeyValue() throws IOException, InterruptedException {
    assert null != mAvroFileReader;

    if (mAvroFileReader.hasNext() && !mAvroFileReader.pastSync(mEndPosition)) {
      mCurrentRecord = mAvroFileReader.next(mCurrentRecord);
      return true;
    }
    return false;
  }

  /** {@inheritDoc} */
  @Override
  public float getProgress() throws IOException, InterruptedException {
    assert null != mAvroFileReader;

    if (mEndPosition == mStartPosition) {
      // Trivial empty input split.
      return 0.0f;
    }
    long bytesRead = mAvroFileReader.previousSync() - mStartPosition;
    long bytesTotal = mEndPosition - mStartPosition;
    LOG.debug("Progress: bytesRead=" + bytesRead + ", bytesTotal=" + bytesTotal);
    return Math.min(1.0f, (float) bytesRead / (float) bytesTotal);
  }

  /** {@inheritDoc} */
  @Override
  public void close() throws IOException {
    if (null != mAvroFileReader) {
      try {
        mAvroFileReader.close();
      } finally {
        mAvroFileReader = null;
      }
    }
  }

  /**
   * Gets the current record read from the Avro container file.
   *
   * 

* Calling nextKeyValue() moves this to the next record. *

* * @return The current Avro record (may be null if no record has been read). */ protected T getCurrentRecord() { return mCurrentRecord; } /** * Creates a seekable input stream to an Avro container file. * * @param conf The hadoop configuration. * @param path The path to the avro container file. * @throws IOException If there is an error reading from the path. */ protected SeekableInput createSeekableInput(Configuration conf, Path path) throws IOException { return new FsInput(path, conf); } /** * Creates an Avro container file reader from a seekable input stream. * * @param input The input containing the Avro container file. * @param datumReader The reader to use for the individual records in the Avro * container file. * @throws IOException If there is an error reading from the input stream. */ protected DataFileReader createAvroFileReader(SeekableInput input, DatumReader datumReader) throws IOException { return new DataFileReader<>(input, datumReader); } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy