// Source listing: org.apache.parquet.hadoop.mapred.DeprecatedParquetInputFormat
// (parquet-hadoop artifact; Maven / Gradle / Ivy).
// Note from the artifact page: there is a newer version, 1.11.9.
/* 
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 * 
 *   http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.parquet.hadoop.mapred;

import static java.lang.Boolean.TRUE;
import static java.util.Arrays.asList;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.List;

import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;

import org.apache.parquet.hadoop.Footer;
import org.apache.parquet.hadoop.ParquetInputFormat;
import org.apache.parquet.hadoop.ParquetInputSplit;
import org.apache.parquet.hadoop.ParquetRecordReader;

public class DeprecatedParquetInputFormat extends org.apache.hadoop.mapred.FileInputFormat> {

  protected ParquetInputFormat realInputFormat = new ParquetInputFormat();

  @Override
  public RecordReader> getRecordReader(InputSplit split, JobConf job,
                  Reporter reporter) throws IOException {
    return new RecordReaderWrapper(split, job, reporter);
  }

  @Override
  public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
    if (isTaskSideMetaData(job)) {
      return super.getSplits(job, numSplits);
    }

    List footers = getFooters(job);
    List splits = realInputFormat.getSplits(job, footers);
    if (splits == null) {
      return null;
    }
    InputSplit[] resultSplits = new InputSplit[splits.size()];
    int i = 0;
    for (ParquetInputSplit split : splits) {
      resultSplits[i++] = new ParquetInputSplitWrapper(split);
    }
    return resultSplits;
  }

  public List getFooters(JobConf job) throws IOException {
    return realInputFormat.getFooters(job, asList(super.listStatus(job)));
  }

  private static class RecordReaderWrapper implements RecordReader> {

    private ParquetRecordReader realReader;
    private long splitLen; // for getPos()

    private Container valueContainer = null;

    private boolean firstRecord = false;
    private boolean eof = false;

    public RecordReaderWrapper(
        InputSplit oldSplit, JobConf oldJobConf, Reporter reporter)
        throws IOException {
      splitLen = oldSplit.getLength();

      try {
        realReader = new ParquetRecordReader(
            ParquetInputFormat.getReadSupportInstance(oldJobConf),
            ParquetInputFormat.getFilter(oldJobConf));

        if (oldSplit instanceof ParquetInputSplitWrapper) {
          realReader.initialize(((ParquetInputSplitWrapper) oldSplit).realSplit, oldJobConf, reporter);
        } else if (oldSplit instanceof FileSplit) {
          realReader.initialize((FileSplit) oldSplit, oldJobConf, reporter);
        } else {
          throw new IllegalArgumentException(
              "Invalid split (not a FileSplit or ParquetInputSplitWrapper): " + oldSplit);
        }

        // read once to gain access to key and value objects
        if (realReader.nextKeyValue()) {
          firstRecord = true;
          valueContainer = new Container();
          valueContainer.set(realReader.getCurrentValue());

        } else {
          eof = true;
        }
      } catch (InterruptedException e) {
        Thread.interrupted();
        throw new IOException(e);
      }
    }

    @Override
    public void close() throws IOException {
      realReader.close();
    }

    @Override
    public Void createKey() {
      return null;
    }

    @Override
    public Container createValue() {
      return valueContainer;
    }

    @Override
    public long getPos() throws IOException {
      return (long) (splitLen * getProgress());
    }

    @Override
    public float getProgress() throws IOException {
      try {
        return realReader.getProgress();
      } catch (InterruptedException e) {
        Thread.interrupted();
        throw new IOException(e);
      }
    }

    @Override
    public boolean next(Void key, Container value) throws IOException {
      if (eof) {
        return false;
      }

      if (firstRecord) { // key & value are already read.
        firstRecord = false;
        return true;
      }

      try {
        if (realReader.nextKeyValue()) {
          if (value != null) value.set(realReader.getCurrentValue());
          return true;
        }
      } catch (InterruptedException e) {
        throw new IOException(e);
      }

      eof = true; // strictly not required, just for consistency
      return false;
    }
  }

  public static boolean isTaskSideMetaData(JobConf job) {
    return job.getBoolean(ParquetInputFormat.TASK_SIDE_METADATA, TRUE);
  }

  private static class ParquetInputSplitWrapper implements InputSplit {

    ParquetInputSplit realSplit;

    @SuppressWarnings("unused") // MapReduce instantiates this.
    public ParquetInputSplitWrapper() {}

    public ParquetInputSplitWrapper(ParquetInputSplit realSplit) {
      this.realSplit = realSplit;
    }

    @Override
    public long getLength() throws IOException {
        return realSplit.getLength();
    }

    @Override
    public String[] getLocations() throws IOException {
        return realSplit.getLocations();
    }

    @Override
    public void readFields(DataInput in) throws IOException {
      realSplit = new ParquetInputSplit();
      realSplit.readFields(in);
    }

    @Override
    public void write(DataOutput out) throws IOException {
      realSplit.write(out);
    }
  }
}