All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.hadoop.hive.ql.io.orc.OrcFile Maven / Gradle / Ivy

The newest version!
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.hive.ql.io.orc;

import static org.apache.hadoop.hive.conf.HiveConf.ConfVars.HIVE_ORC_DEFAULT_BLOCK_PADDING;
import static org.apache.hadoop.hive.conf.HiveConf.ConfVars.HIVE_ORC_DEFAULT_BLOCK_SIZE;
import static org.apache.hadoop.hive.conf.HiveConf.ConfVars.HIVE_ORC_DEFAULT_BUFFER_SIZE;
import static org.apache.hadoop.hive.conf.HiveConf.ConfVars.HIVE_ORC_DEFAULT_COMPRESS;
import static org.apache.hadoop.hive.conf.HiveConf.ConfVars.HIVE_ORC_DEFAULT_ROW_INDEX_STRIDE;
import static org.apache.hadoop.hive.conf.HiveConf.ConfVars.HIVE_ORC_DEFAULT_STRIPE_SIZE;
import static org.apache.hadoop.hive.conf.HiveConf.ConfVars.HIVE_ORC_WRITE_FORMAT;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.ql.io.filters.BloomFilterIO;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;

/**
 * Contains factory methods to read or write ORC files.
 */
public final class OrcFile {

  public static final String MAGIC = "ORC";

  /**
   * Create a version number for the ORC file format, so that we can add
   * non-forward compatible changes in the future. To make it easier for users
   * to understand the version numbers, we use the Hive release number that
   * first wrote that version of ORC files.
   *
   * Thus, if you add new encodings or other non-forward compatible changes
   * to ORC files, which prevent the old reader from reading the new format,
   * you should change these variable to reflect the next Hive release number.
   * Non-forward compatible changes should never be added in patch releases.
   *
   * Do not make any changes that break backwards compatibility, which would
   * prevent the new reader from reading ORC files generated by any released
   * version of Hive.
   */
  public static enum Version {
    V_0_11("0.11", 0, 11),
      V_0_12("0.12", 0, 12);

    public static final Version CURRENT = V_0_12;

    private final String name;
    private final int major;
    private final int minor;

    private Version(String name, int major, int minor) {
      this.name = name;
      this.major = major;
      this.minor = minor;
    }

    public static Version byName(String name) {
      for(Version version: values()) {
        if (version.name.equals(name)) {
          return version;
        }
      }
      throw new IllegalArgumentException("Unknown ORC version " + name);
    }

    /**
     * Get the human readable name for the version.
     */
    public String getName() {
      return name;
    }

    /**
     * Get the major version number.
     */
    public int getMajor() {
      return major;
    }

    /**
     * Get the minor version number.
     */
    public int getMinor() {
      return minor;
    }
  }

  /**
   * Records the version of the writer in terms of which bugs have been fixed.
   * For bugs in the writer, but the old readers already read the new data
   * correctly, bump this version instead of the Version.
   */
  public static enum WriterVersion {
    ORIGINAL(0),
      HIVE_8732(1); // corrupted stripe/file maximum column statistics

    private final int id;

    public int getId() {
      return id;
    }

    private WriterVersion(int id) {
      this.id = id;
    }
  }

  public static enum EncodingStrategy {
    SPEED, COMPRESSION;
  }

  public static enum CompressionStrategy {
    SPEED, COMPRESSION;
  }

  // Note : these string definitions for table properties are deprecated,
  // and retained only for backward compatibility, please do not add to
  // them, add to OrcTableProperties below instead
  @Deprecated public static final String COMPRESSION = "orc.compress";
  @Deprecated public static final String COMPRESSION_BLOCK_SIZE = "orc.compress.size";
  @Deprecated public static final String STRIPE_SIZE = "orc.stripe.size";
  @Deprecated public static final String ROW_INDEX_STRIDE = "orc.row.index.stride";
  @Deprecated public static final String ENABLE_INDEXES = "orc.create.index";
  @Deprecated public static final String BLOCK_PADDING = "orc.block.padding";

  /**
   * Enum container for all orc table properties.
   * If introducing a new orc-specific table property,
   * add it here.
   */
  public static enum OrcTableProperties {
    COMPRESSION("orc.compress"),
    COMPRESSION_BLOCK_SIZE("orc.compress.size"),
    STRIPE_SIZE("orc.stripe.size"),
    BLOCK_SIZE("orc.block.size"),
    ROW_INDEX_STRIDE("orc.row.index.stride"),
    ENABLE_INDEXES("orc.create.index"),
    BLOCK_PADDING("orc.block.padding"),
    ENCODING_STRATEGY("orc.encoding.strategy"),
    BLOOM_FILTER_COLUMNS("orc.bloom.filter.columns"),
    BLOOM_FILTER_FPP("orc.bloom.filter.fpp");

    private final String propName;

    OrcTableProperties(String propName) {
      this.propName = propName;
    }

    public String getPropName(){
      return this.propName;
    }
  }

  // unused
  private OrcFile() {}

  /**
   * Create an ORC file reader.
   * @param fs file system
   * @param path file name to read from
   * @return a new ORC file reader.
   * @throws IOException
   */
  public static Reader createReader(FileSystem fs, Path path
  ) throws IOException {
    ReaderOptions opts = new ReaderOptions(new Configuration());
    opts.filesystem(fs);
    return new ReaderImpl(path, opts);
  }

  public static class ReaderOptions {
    private final Configuration conf;
    private FileSystem filesystem;
    private ReaderImpl.FileMetaInfo fileMetaInfo;
    private long maxLength = Long.MAX_VALUE;

    public ReaderOptions(Configuration conf) {
      this.conf = conf;
    }
    ReaderOptions fileMetaInfo(ReaderImpl.FileMetaInfo info) {
      fileMetaInfo = info;
      return this;
    }

    public ReaderOptions filesystem(FileSystem fs) {
      this.filesystem = fs;
      return this;
    }

    public ReaderOptions maxLength(long val) {
      maxLength = val;
      return this;
    }

    Configuration getConfiguration() {
      return conf;
    }

    FileSystem getFilesystem() {
      return filesystem;
    }

    ReaderImpl.FileMetaInfo getFileMetaInfo() {
      return fileMetaInfo;
    }

    long getMaxLength() {
      return maxLength;
    }
  }

  public static ReaderOptions readerOptions(Configuration conf) {
    return new ReaderOptions(conf);
  }

  public static Reader createReader(Path path,
                                    ReaderOptions options) throws IOException {
    return new ReaderImpl(path, options);
  }

  public static interface WriterContext {
    Writer getWriter();
  }

  public static interface WriterCallback {
    public void preStripeWrite(WriterContext context) throws IOException;
    public void preFooterWrite(WriterContext context) throws IOException;
  }

  /**
   * Options for creating ORC file writers.
   */
  public static class WriterOptions {
    private final Configuration configuration;
    private FileSystem fileSystemValue = null;
    private ObjectInspector inspectorValue = null;
    private long stripeSizeValue;
    private long blockSizeValue;
    private int rowIndexStrideValue;
    private int bufferSizeValue;
    private boolean blockPaddingValue;
    private CompressionKind compressValue;
    private MemoryManager memoryManagerValue;
    private Version versionValue;
    private WriterCallback callback;
    private EncodingStrategy encodingStrategy;
    private CompressionStrategy compressionStrategy;
    private float paddingTolerance;
    private String bloomFilterColumns;
    private double bloomFilterFpp;

    WriterOptions(Configuration conf) {
      configuration = conf;
      memoryManagerValue = getMemoryManager(conf);
      stripeSizeValue = HiveConf.getLongVar(conf, HIVE_ORC_DEFAULT_STRIPE_SIZE);
      blockSizeValue = HiveConf.getLongVar(conf, HIVE_ORC_DEFAULT_BLOCK_SIZE);
      rowIndexStrideValue = HiveConf.getIntVar(conf, HIVE_ORC_DEFAULT_ROW_INDEX_STRIDE);
      bufferSizeValue = HiveConf.getIntVar(conf, HIVE_ORC_DEFAULT_BUFFER_SIZE);
      blockPaddingValue = HiveConf.getBoolVar(conf, HIVE_ORC_DEFAULT_BLOCK_PADDING);
      compressValue = CompressionKind.valueOf(HiveConf.getVar(conf, HIVE_ORC_DEFAULT_COMPRESS));
      String versionName = HiveConf.getVar(conf, HIVE_ORC_WRITE_FORMAT);
      if (versionName == null) {
        versionValue = Version.CURRENT;
      } else {
        versionValue = Version.byName(versionName);
      }
      String enString =
          conf.get(HiveConf.ConfVars.HIVE_ORC_ENCODING_STRATEGY.varname);
      if (enString == null) {
        encodingStrategy = EncodingStrategy.SPEED;
      } else {
        encodingStrategy = EncodingStrategy.valueOf(enString);
      }

      String compString = conf
          .get(HiveConf.ConfVars.HIVE_ORC_COMPRESSION_STRATEGY.varname);
      if (compString == null) {
        compressionStrategy = CompressionStrategy.SPEED;
      } else {
        compressionStrategy = CompressionStrategy.valueOf(compString);
      }

      paddingTolerance = conf.getFloat(HiveConf.ConfVars.HIVE_ORC_BLOCK_PADDING_TOLERANCE.varname,
          HiveConf.ConfVars.HIVE_ORC_BLOCK_PADDING_TOLERANCE.defaultFloatVal);
      bloomFilterFpp = BloomFilterIO.DEFAULT_FPP;
    }

    /**
     * Provide the filesystem for the path, if the client has it available.
     * If it is not provided, it will be found from the path.
     */
    public WriterOptions fileSystem(FileSystem value) {
      fileSystemValue = value;
      return this;
    }

    /**
     * Set the stripe size for the file. The writer stores the contents of the
     * stripe in memory until this memory limit is reached and the stripe
     * is flushed to the HDFS file and the next stripe started.
     */
    public WriterOptions stripeSize(long value) {
      stripeSizeValue = value;
      return this;
    }

    /**
     * Set the file system block size for the file. For optimal performance,
     * set the block size to be multiple factors of stripe size.
     */
    public WriterOptions blockSize(long value) {
      blockSizeValue = value;
      return this;
    }

    /**
     * Set the distance between entries in the row index. The minimum value is
     * 1000 to prevent the index from overwhelming the data. If the stride is
     * set to 0, no indexes will be included in the file.
     */
    public WriterOptions rowIndexStride(int value) {
      rowIndexStrideValue = value;
      return this;
    }

    /**
     * The size of the memory buffers used for compressing and storing the
     * stripe in memory.
     */
    public WriterOptions bufferSize(int value) {
      bufferSizeValue = value;
      return this;
    }

    /**
     * Sets whether the HDFS blocks are padded to prevent stripes from
     * straddling blocks. Padding improves locality and thus the speed of
     * reading, but costs space.
     */
    public WriterOptions blockPadding(boolean value) {
      blockPaddingValue = value;
      return this;
    }

    /**
     * Sets the encoding strategy that is used to encode the data.
     */
    public WriterOptions encodingStrategy(EncodingStrategy strategy) {
      encodingStrategy = strategy;
      return this;
    }

    /**
     * Sets the tolerance for block padding as a percentage of stripe size.
     */
    public WriterOptions paddingTolerance(float value) {
      paddingTolerance = value;
      return this;
    }

    /**
     * Comma separated values of column names for which bloom filter is to be created.
     */
    public WriterOptions bloomFilterColumns(String columns) {
      bloomFilterColumns = columns;
      return this;
    }

    /**
     * Specify the false positive probability for bloom filter.
     * @param fpp - false positive probability
     * @return
     */
    public WriterOptions bloomFilterFpp(double fpp) {
      bloomFilterFpp = fpp;
      return this;
    }

    /**
     * Sets the generic compression that is used to compress the data.
     */
    public WriterOptions compress(CompressionKind value) {
      compressValue = value;
      return this;
    }

    /**
     * A required option that sets the object inspector for the rows. Used
     * to determine the schema for the file.
     */
    public WriterOptions inspector(ObjectInspector value) {
      inspectorValue = value;
      return this;
    }

    /**
     * Sets the version of the file that will be written.
     */
    public WriterOptions version(Version value) {
      versionValue = value;
      return this;
    }

    /**
     * Add a listener for when the stripe and file are about to be closed.
     * @param callback the object to be called when the stripe is closed
     * @return
     */
    public WriterOptions callback(WriterCallback callback) {
      this.callback = callback;
      return this;
    }

    /**
     * A package local option to set the memory manager.
     */
    WriterOptions memory(MemoryManager value) {
      memoryManagerValue = value;
      return this;
    }

  }

  /**
   * Create a default set of write options that can be modified.
   */
  public static WriterOptions writerOptions(Configuration conf) {
    return new WriterOptions(conf);
  }

  /**
   * Create an ORC file writer. This is the public interface for creating
   * writers going forward and new options will only be added to this method.
   * @param path filename to write to
   * @param opts the options
   * @return a new ORC file writer
   * @throws IOException
   */
  public static Writer createWriter(Path path,
                                    WriterOptions opts
                                    ) throws IOException {
    FileSystem fs = opts.fileSystemValue == null ?
      path.getFileSystem(opts.configuration) : opts.fileSystemValue;

    return new WriterImpl(fs, path, opts.configuration, opts.inspectorValue,
                          opts.stripeSizeValue, opts.compressValue,
                          opts.bufferSizeValue, opts.rowIndexStrideValue,
                          opts.memoryManagerValue, opts.blockPaddingValue,
                          opts.versionValue, opts.callback,
                          opts.encodingStrategy, opts.compressionStrategy,
                          opts.paddingTolerance, opts.blockSizeValue,
                          opts.bloomFilterColumns, opts.bloomFilterFpp);
  }

  /**
   * Create an ORC file writer. This method is provided for API backward
   * compatability with Hive 0.11.
   * @param fs file system
   * @param path filename to write to
   * @param inspector the ObjectInspector that inspects the rows
   * @param stripeSize the number of bytes in a stripe
   * @param compress how to compress the file
   * @param bufferSize the number of bytes to compress at once
   * @param rowIndexStride the number of rows between row index entries or
   *                       0 to suppress all indexes
   * @return a new ORC file writer
   * @throws IOException
   */
  public static Writer createWriter(FileSystem fs,
                                    Path path,
                                    Configuration conf,
                                    ObjectInspector inspector,
                                    long stripeSize,
                                    CompressionKind compress,
                                    int bufferSize,
                                    int rowIndexStride) throws IOException {
    return createWriter(path,
                        writerOptions(conf)
                        .fileSystem(fs)
                        .inspector(inspector)
                        .stripeSize(stripeSize)
                        .compress(compress)
                        .bufferSize(bufferSize)
                        .rowIndexStride(rowIndexStride));
  }

  private static MemoryManager memoryManager = null;

  private static synchronized
  MemoryManager getMemoryManager(Configuration conf) {
    if (memoryManager == null) {
      memoryManager = new MemoryManager(conf);
    }
    return memoryManager;
  }

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy