com.facebook.hive.orc.ReaderImpl

//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.facebook.hive.orc;

import java.io.IOException;
import java.io.InputStream;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

import com.facebook.hive.orc.compression.CompressionCodec;
import com.facebook.hive.orc.compression.CompressionKind;
import com.facebook.hive.orc.statistics.ColumnStatistics;
import com.facebook.hive.orc.statistics.ColumnStatisticsImpl;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;

import com.facebook.hive.orc.lazy.OrcLazyRowObjectInspector;
import com.google.protobuf.CodedInputStream;
import org.apache.hadoop.io.IOUtils;

public final class ReaderImpl implements Reader {
  private static final Log LOG = LogFactory.getLog(ReaderImpl.class);
  private static final int DIRECTORY_SIZE_GUESS = 16 * 1024;
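  // Number of bytes speculatively read from the tail of the file when looking
  // for the PostScript; the footer itself is re-read from the file once its
  // exact offset and length are known.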

  private final FileSystem fileSystem;
  private final Path path;
  private final Configuration conf;
  private final CompressionKind compressionKind;
  private final CompressionCodec codec;
  private final int bufferSize;
  private final OrcProto.Footer footer;
  private final ObjectInspector inspector;

  private static class StripeInformationImpl
      implements StripeInformation {
    private final OrcProto.StripeInformation stripe;

    StripeInformationImpl(OrcProto.StripeInformation stripe) {
      this.stripe = stripe;
    }

    @Override
    public long getOffset() {
      return stripe.getOffset();
    }

    @Override
    public long getDataLength() {
      return stripe.getDataLength();
    }

    @Override
    public long getFooterLength() {
      return stripe.getFooterLength();
    }

    @Override
    public long getIndexLength() {
      return stripe.getIndexLength();
    }

    @Override
    public long getNumberOfRows() {
      return stripe.getNumberOfRows();
    }

    @Override
    public long getRawDataSize() {
      return stripe.getRawDataSize();
    }

    @Override
    public String toString() {
      return "offset: " + getOffset() + " data: " + getDataLength() +
        " rows: " + getNumberOfRows() + " tail: " + getFooterLength() +
        " index: " + getIndexLength() + " raw_data: " + getRawDataSize();
    }
  }

  @Override
  public long getNumberOfRows() {
    return footer.getNumberOfRows();
  }

  @Override
  public long getRawDataSize() {
    return footer.getRawDataSize();
  }

  @Override
  public Iterable<String> getMetadataKeys() {
    List<OrcProto.UserMetadataItem> metadata = footer.getMetadataList();
    List<String> result = new ArrayList<String>(metadata.size());

    for(OrcProto.UserMetadataItem item: metadata) {
      result.add(item.getName());
    }
    return result;
  }

  @Override
  public ByteBuffer getMetadataValue(String key) {
    for(OrcProto.UserMetadataItem item: footer.getMetadataList()) {
      if (item.hasName() && item.getName().equals(key)) {
        return item.getValue().asReadOnlyByteBuffer();
      }
    }
    throw new IllegalArgumentException("Can't find user metadata " + key);
  }

  @Override
  public CompressionKind getCompression() {
    return compressionKind;
  }

  @Override
  public int getCompressionSize() {
    return bufferSize;
  }

  @Override
  public Iterable<StripeInformation> getStripes() {
    return new Iterable<com.facebook.hive.orc.StripeInformation>(){

      @Override
      public Iterator<com.facebook.hive.orc.StripeInformation> iterator() {
        return new Iterator<com.facebook.hive.orc.StripeInformation>(){
          private final Iterator<OrcProto.StripeInformation> inner =
            footer.getStripesList().iterator();

          @Override
          public boolean hasNext() {
            return inner.hasNext();
          }

          @Override
          public com.facebook.hive.orc.StripeInformation next() {
            return new StripeInformationImpl(inner.next());
          }

          @Override
          public void remove() {
            throw new UnsupportedOperationException("remove unsupported");
          }
        };
      }
    };
  }

  @Override
  public ObjectInspector getObjectInspector() {
    return inspector;
  }

  @Override
  public long getContentLength() {
    return footer.getContentLength();
  }

  @Override
  public List<OrcProto.Type> getTypes() {
    return footer.getTypesList();
  }

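  /**
   * Number of rows between entries in the row index, or 0 if the file has no row index.
   */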
  @Override
  public int getRowIndexStride() {
    return footer.getRowIndexStride();
  }

  @Override
  public ColumnStatistics[] getStatistics() {
    ColumnStatistics[] result = new ColumnStatistics[footer.getTypesCount()];
    for(int i=0; i < result.length; ++i) {
      result[i] = ColumnStatisticsImpl.deserialize(footer.getStatistics(i));
    }
    return result;
  }

  public ReaderImpl(FileSystem fs, Path path, Configuration conf) throws IOException {
    try {
      this.fileSystem = fs;
      this.path = path;
      this.conf = conf;
      FSDataInputStream file = fs.open(path);
      long size = fs.getFileStatus(path).getLen();
      int readSize = (int) Math.min(size, DIRECTORY_SIZE_GUESS);
      ByteBuffer buffer = ByteBuffer.allocate(readSize);
      InStream.read(
          file, size - readSize, buffer.array(), buffer.arrayOffset() + buffer.position(),
          buffer.remaining());
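      // The final byte of an ORC file holds the length of the PostScript,
      // which sits immediately before it and records the compression kind,
      // compression block size, and footer length.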
      int psLen = buffer.get(readSize - 1);
      int psOffset = readSize - 1 - psLen;
      CodedInputStream in = CodedInputStream.newInstance(
          buffer.array(),
          buffer.arrayOffset() + psOffset, psLen);
      OrcProto.PostScript ps = OrcProto.PostScript.parseFrom(in);
      int footerSize = (int) ps.getFooterLength();
      bufferSize = (int) ps.getCompressionBlockSize();
      switch (ps.getCompression()) {
      case NONE:
        compressionKind = CompressionKind.NONE;
        break;
      case ZLIB:
        compressionKind = CompressionKind.ZLIB;
        break;
      case SNAPPY:
        compressionKind = CompressionKind.SNAPPY;
        break;
      case LZO:
        compressionKind = CompressionKind.LZO;
        break;
      default:
        throw new IllegalArgumentException("Unknown compression");
      }
      codec = WriterImpl.createCodec(compressionKind);

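      // The footer immediately precedes the PostScript, so it begins
      // (1 + psLen + footerSize) bytes before the end of the file.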
      InputStream instream = InStream.create(
          "footer", file, size - 1 - psLen - footerSize, footerSize,
          codec, bufferSize);
      footer = OrcProto.Footer.parseFrom(instream);
      inspector = new OrcLazyRowObjectInspector(0, footer.getTypesList());
      file.close();
    } catch (IndexOutOfBoundsException e) {
      /**
       * When a non-ORC file is read by the ORC reader, an IndexOutOfBoundsException is thrown
       * while creating the reader. Catch that exception and check the file header to see whether
       * the input file is actually ORC. If it is not, throw a NotAnORCFileException naming the
       * file being read (which helps identify the table partition involved).
       */
      checkIfORC(fs, path);
      throw new IOException("Failed to create record reader for file " + path , e);
    } catch (IOException e) {
      throw new IOException("Failed to create record reader for file " + path , e);
    }
  }

  /**
   * Reads the file header (first 40 bytes) and checks whether it starts with the characters
   * 'ORC'. If it does not, the file is not an ORC file and a NotAnORCFileException is thrown;
   * if it does, the original failure was a genuine footer-parsing problem, which is logged.
   */
  public static void checkIfORC(FileSystem fs, Path path) throws IOException {
    // hardcoded to 40 because the RCFile header "SEQ-org.apache.hadoop.hive.ql.io.RCFile" is 40 characters long
    final int buffLen = 40;
    final byte[] header = new byte[buffLen];
    final FSDataInputStream file = fs.open(path);
    final long fileLength = fs.getFileStatus(path).getLen();
    int sizeToBeRead = buffLen;
    if (buffLen > fileLength) {
      sizeToBeRead = (int)fileLength;
    }

    IOUtils.readFully(file, header, 0, sizeToBeRead);
    file.close();

    final String headerString = new String(header, 0, sizeToBeRead);
    if (headerString.startsWith("ORC")) {
      LOG.error("Error while parsing the footer of the file: " + path);
    } else {
      throw new NotAnORCFileException("Input file = " + path + ", header = " + headerString);
    }
  }

  @Override
  public RecordReader rows(boolean[] include) throws IOException {
    return rows(0, Long.MAX_VALUE, include);
  }

  @Override
  public RecordReader rows(long offset, long length, boolean[] include
                           ) throws IOException {
    return new RecordReaderImpl(this.getStripes(), fileSystem, path, offset,
      length, footer.getTypesList(), codec, bufferSize,
      include, footer.getRowIndexStride(), conf);
  }

  @Override
  public StripeReader stripes(long offset, long length) throws IOException {
    return new StripeReader(this.getStripes(), fileSystem, path, offset, length);
  }

}
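
For context, here is a minimal, hypothetical sketch of how this reader is typically driven. The file path is made up, and passing null as the include array (meaning "read all columns") follows the convention of the Hive ORC readers this class mirrors; consult RecordReaderImpl for the authoritative row-iteration API.

// Hypothetical usage sketch for ReaderImpl.
Configuration conf = new Configuration();
Path path = new Path("/tmp/example.orc");   // hypothetical file
FileSystem fs = path.getFileSystem(conf);

Reader reader = new ReaderImpl(fs, path, conf);
System.out.println("rows: " + reader.getNumberOfRows()
    + ", compression: " + reader.getCompression()
    + ", raw data size: " + reader.getRawDataSize());

// Iterate every row; a null include array selects all columns.
RecordReader rows = reader.rows(null);
Object row = null;
while (rows.hasNext()) {
  row = rows.next(row);   // rows are materialized lazily
}
rows.close();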