/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.ql.io.orc;
import com.google.common.annotations.VisibleForTesting;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.common.ValidTxnList;
import org.apache.hadoop.hive.ql.io.AcidInputFormat;
import org.apache.hadoop.hive.ql.io.AcidUtils;
import org.apache.hadoop.hive.ql.io.RecordIdentifier;
import org.apache.hadoop.hive.ql.metadata.VirtualColumn;
import org.apache.hadoop.hive.serde.serdeConstants;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import java.io.IOException;
import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Deque;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
/**
* Merges a base and a list of delta files together into a single stream of
* events.
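 *
 * <p>A minimal usage sketch (illustrative only; the constructor is
 * package-private, and the reader, paths, and options below are assumptions,
 * not part of the original source):
 * <pre>
 *   Reader base = OrcFile.createReader(fs, basePath);
 *   OrcRawRecordMerger merger = new OrcRawRecordMerger(conf, true, base,
 *       false, bucket, validTxnList, new Reader.Options(), deltaDirs);
 *   RecordIdentifier id = merger.createKey();
 *   OrcStruct event = merger.createValue();
 *   while (merger.next(id, event)) {
 *     // each event carries the ACID operation plus the user row
 *   }
 *   merger.close();
 * </pre>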
*/
public class OrcRawRecordMerger implements AcidInputFormat.RawReader<OrcStruct> {
private static final Log LOG = LogFactory.getLog(OrcRawRecordMerger.class);
private final Configuration conf;
private final boolean collapse;
private final RecordReader baseReader;
private final long offset;
private final long length;
private final ValidTxnList validTxnList;
private final int columns;
private ReaderKey prevKey = new ReaderKey();
// this is the key less than the lowest key we need to process
private RecordIdentifier minKey;
// this is the last key we need to process
private RecordIdentifier maxKey;
// an extra value so that we can return it while reading ahead
private OrcStruct extraValue;
/**
* A RecordIdentifier extended with the current transaction id. This is the
* key of our merge sort with the originalTransaction, bucket, and rowId
* ascending and the currentTransaction descending. This means that if the
* reader is collapsing events to just the last update, just the first
* instance of each record is required.
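 *
 * <p>An illustrative ordering sketch (the literal values are made up):
 * <pre>
 *   ReaderKey a = new ReaderKey(7, 0, 3, 9);
 *   ReaderKey b = new ReaderKey(7, 0, 3, 8);
 *   // a.compareTo(b) &lt; 0: the event from the newer transaction (9)
 *   // sorts first, so a collapsing reader keeps the latest update;
 *   // a.compareRow(b) == 0: both keys address the same logical row.
 * </pre>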
*/
  final static class ReaderKey extends RecordIdentifier {
private long currentTransactionId;
public ReaderKey() {
this(-1, -1, -1, -1);
}
public ReaderKey(long originalTransaction, int bucket, long rowId,
long currentTransactionId) {
super(originalTransaction, bucket, rowId);
this.currentTransactionId = currentTransactionId;
}
@Override
public void set(RecordIdentifier other) {
super.set(other);
currentTransactionId = ((ReaderKey) other).currentTransactionId;
}
public void setValues(long originalTransactionId,
int bucket,
long rowId,
long currentTransactionId) {
setValues(originalTransactionId, bucket, rowId);
this.currentTransactionId = currentTransactionId;
}
@Override
public boolean equals(Object other) {
return super.equals(other) &&
currentTransactionId == ((ReaderKey) other).currentTransactionId;
}
@Override
public int compareTo(RecordIdentifier other) {
int sup = compareToInternal(other);
if (sup == 0) {
if (other.getClass() == ReaderKey.class) {
ReaderKey oth = (ReaderKey) other;
if (currentTransactionId != oth.currentTransactionId) {
return currentTransactionId < oth.currentTransactionId ? +1 : -1;
}
} else {
return -1;
}
}
return sup;
}
public long getCurrentTransactionId() {
return currentTransactionId;
}
/**
* Compare rows without considering the currentTransactionId.
* @param other the value to compare to
* @return -1, 0, +1
*/
public int compareRow(RecordIdentifier other) {
return compareToInternal(other);
}
@Override
public String toString() {
return "{originalTxn: " + getTransactionId() + ", bucket: " +
getBucketId() + ", row: " + getRowId() + ", currentTxn: " +
currentTransactionId + "}";
}
}
/**
* A reader and the next record from that reader. The code reads ahead so that
* we can return the lowest ReaderKey from each of the readers. Thus, the
* next available row is nextRecord and only following records are still in
* the reader.
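   *
   * <p>An illustrative driving loop ({@code consume} is a hypothetical
   * callback, not part of this file):
   * <pre>
   *   ReaderPair pair = new ReaderPair(key, reader, bucket, min, max, opts);
   *   while (pair.nextRecord != null) {
   *     consume(pair.key, pair.nextRecord); // the buffered, lowest-key row
   *     pair.next(pair.nextRecord);         // read ahead; closes at maxKey
   *   }
   * </pre>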
*/
static class ReaderPair {
OrcStruct nextRecord;
final Reader reader;
final RecordReader recordReader;
final ReaderKey key;
final RecordIdentifier maxKey;
final int bucket;
/**
* Create a reader that reads from the first key larger than minKey to any
* keys equal to maxKey.
* @param key the key to read into
* @param reader the ORC file reader
* @param bucket the bucket number for the file
* @param minKey only return keys larger than minKey if it is non-null
* @param maxKey only return keys less than or equal to maxKey if it is
* non-null
* @param options options to provide to read the rows.
* @throws IOException
*/
ReaderPair(ReaderKey key, Reader reader, int bucket,
RecordIdentifier minKey, RecordIdentifier maxKey,
ReaderImpl.Options options) throws IOException {
this.reader = reader;
this.key = key;
this.maxKey = maxKey;
this.bucket = bucket;
// TODO use stripe statistics to jump over stripes
recordReader = reader.rowsOptions(options);
// advance the reader until we reach the minimum key
do {
next(nextRecord);
} while (nextRecord != null &&
(minKey != null && key.compareRow(minKey) <= 0));
}
void next(OrcStruct next) throws IOException {
if (recordReader.hasNext()) {
nextRecord = (OrcStruct) recordReader.next(next);
// set the key
key.setValues(OrcRecordUpdater.getOriginalTransaction(nextRecord),
OrcRecordUpdater.getBucket(nextRecord),
OrcRecordUpdater.getRowId(nextRecord),
OrcRecordUpdater.getCurrentTransaction(nextRecord));
// if this record is larger than maxKey, we need to stop
if (maxKey != null && key.compareRow(maxKey) > 0) {
LOG.debug("key " + key + " > maxkey " + maxKey);
nextRecord = null;
recordReader.close();
}
} else {
nextRecord = null;
recordReader.close();
}
}
int getColumns() {
return reader.getTypes().get(OrcRecordUpdater.ROW + 1).getSubtypesCount();
}
}
/**
* A reader that pretends an original base file is a new version base file.
* It wraps the underlying reader's row with an ACID event object and
* makes the relevant translations.
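   *
   * <p>For example (illustrative): a pre-ACID row at row number 42 of
   * bucket 0 is surfaced as an INSERT event with the synthesized key
   * {originalTxn: 0, bucket: 0, row: 42, currentTxn: 0}, with the original
   * columns nested under the event's ROW field.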
*/
static final class OriginalReaderPair extends ReaderPair {
OriginalReaderPair(ReaderKey key, Reader reader, int bucket,
RecordIdentifier minKey, RecordIdentifier maxKey,
Reader.Options options) throws IOException {
super(key, reader, bucket, minKey, maxKey, options);
}
@Override
void next(OrcStruct next) throws IOException {
if (recordReader.hasNext()) {
long nextRowId = recordReader.getRowNumber();
// have to do initialization here, because the super's constructor
// calls next and thus we need to initialize before our constructor
// runs
if (next == null) {
nextRecord = new OrcStruct(OrcRecordUpdater.FIELDS);
IntWritable operation =
new IntWritable(OrcRecordUpdater.INSERT_OPERATION);
nextRecord.setFieldValue(OrcRecordUpdater.OPERATION, operation);
nextRecord.setFieldValue(OrcRecordUpdater.CURRENT_TRANSACTION,
new LongWritable(0));
nextRecord.setFieldValue(OrcRecordUpdater.ORIGINAL_TRANSACTION,
new LongWritable(0));
nextRecord.setFieldValue(OrcRecordUpdater.BUCKET,
new IntWritable(bucket));
nextRecord.setFieldValue(OrcRecordUpdater.ROW_ID,
new LongWritable(nextRowId));
nextRecord.setFieldValue(OrcRecordUpdater.ROW,
recordReader.next(null));
} else {
nextRecord = next;
((IntWritable) next.getFieldValue(OrcRecordUpdater.OPERATION))
.set(OrcRecordUpdater.INSERT_OPERATION);
((LongWritable) next.getFieldValue(OrcRecordUpdater.ORIGINAL_TRANSACTION))
.set(0);
((IntWritable) next.getFieldValue(OrcRecordUpdater.BUCKET))
.set(bucket);
((LongWritable) next.getFieldValue(OrcRecordUpdater.CURRENT_TRANSACTION))
.set(0);
        ((LongWritable) next.getFieldValue(OrcRecordUpdater.ROW_ID))
            .set(nextRowId);
nextRecord.setFieldValue(OrcRecordUpdater.ROW,
recordReader.next(OrcRecordUpdater.getRow(next)));
}
key.setValues(0L, bucket, nextRowId, 0L);
if (maxKey != null && key.compareRow(maxKey) > 0) {
if (LOG.isDebugEnabled()) {
LOG.debug("key " + key + " > maxkey " + maxKey);
}
nextRecord = null;
recordReader.close();
}
} else {
nextRecord = null;
recordReader.close();
}
}
@Override
int getColumns() {
return reader.getTypes().get(0).getSubtypesCount();
}
}
  private final TreeMap<ReaderKey, ReaderPair> readers =
      new TreeMap<ReaderKey, ReaderPair>();
// The reader that currently has the lowest key.
private ReaderPair primary;
// The key of the next lowest reader.
private ReaderKey secondaryKey = null;
/**
* Find the key range for original bucket files.
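   * Original files carry no key index, so the bounds are derived from stripe
   * row counts. For example (illustrative): with three stripes of 1000 rows
   * each and a split covering exactly the middle stripe, minKey becomes
   * (0, bucket, 999) and maxKey becomes (0, bucket, 1999).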
* @param reader the reader
* @param bucket the bucket number we are reading
* @param options the options for reading with
* @throws IOException
*/
private void discoverOriginalKeyBounds(Reader reader, int bucket,
Reader.Options options
) throws IOException {
long rowLength = 0;
long rowOffset = 0;
long offset = options.getOffset();
long maxOffset = options.getMaxOffset();
boolean isTail = true;
for(StripeInformation stripe: reader.getStripes()) {
if (offset > stripe.getOffset()) {
rowOffset += stripe.getNumberOfRows();
} else if (maxOffset > stripe.getOffset()) {
rowLength += stripe.getNumberOfRows();
} else {
isTail = false;
break;
}
}
if (rowOffset > 0) {
minKey = new RecordIdentifier(0, bucket, rowOffset - 1);
}
if (!isTail) {
maxKey = new RecordIdentifier(0, bucket, rowOffset + rowLength - 1);
}
}
/**
* Find the key range for bucket files.
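   * The key index written by OrcRecordUpdater records the last key of each
   * stripe. Illustratively, a split covering the second and third stripes
   * takes minKey from the first stripe's index entry and maxKey from the
   * third's; a split reaching the end of the file leaves maxKey null so no
   * trailing rows are cut off.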
* @param reader the reader
* @param options the options for reading with
* @throws IOException
*/
private void discoverKeyBounds(Reader reader,
Reader.Options options) throws IOException {
RecordIdentifier[] keyIndex = OrcRecordUpdater.parseKeyIndex(reader);
long offset = options.getOffset();
long maxOffset = options.getMaxOffset();
int firstStripe = 0;
int stripeCount = 0;
boolean isTail = true;
    List<StripeInformation> stripes = reader.getStripes();
for(StripeInformation stripe: stripes) {
if (offset > stripe.getOffset()) {
firstStripe += 1;
} else if (maxOffset > stripe.getOffset()) {
stripeCount += 1;
} else {
isTail = false;
break;
}
}
if (firstStripe != 0) {
minKey = keyIndex[firstStripe - 1];
}
if (!isTail) {
maxKey = keyIndex[firstStripe + stripeCount - 1];
}
}
/**
* Convert from the row include/sarg/columnNames to the event equivalent
* for the underlying file.
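   * For example (illustrative): a row include array of {true, false, true}
   * becomes {true, true, true, true, true, true, true, false, true}; the
   * first six entries cover the event struct and its five ACID metadata
   * columns, and the user's flags shift down by OrcRecordUpdater.FIELDS.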
* @param options options for the row reader
* @return a cloned options object that is modified for the event reader
*/
static Reader.Options createEventOptions(Reader.Options options) {
Reader.Options result = options.clone();
result.range(options.getOffset(), Long.MAX_VALUE);
// slide the columns down by 6 for the include array
if (options.getInclude() != null) {
boolean[] orig = options.getInclude();
// we always need the base row
orig[0] = true;
boolean[] include = new boolean[orig.length + OrcRecordUpdater.FIELDS];
Arrays.fill(include, 0, OrcRecordUpdater.FIELDS, true);
      for(int i = 0; i < orig.length; ++i) {
include[i + OrcRecordUpdater.FIELDS] = orig[i];
}
result.include(include);
}
// slide the column names down by 6 for the name array
if (options.getColumnNames() != null) {
String[] orig = options.getColumnNames();
String[] cols = new String[orig.length + OrcRecordUpdater.FIELDS];
for(int i=0; i < orig.length; ++i) {
cols[i + OrcRecordUpdater.FIELDS] = orig[i];
}
result.searchArgument(options.getSearchArgument(), cols);
}
return result;
}
/**
* Create a reader that merge sorts the ACID events together.
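   *
   * <p>Illustratively, for bucket 0 this merges something like
   * {@code base_0000004/bucket_00000} with
   * {@code delta_0000005_0000005/bucket_00000} (the delta bucket paths come
   * from AcidUtils.createBucketFile; the directory names here are made up).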
* @param conf the configuration
* @param collapseEvents should the events on the same row be collapsed
* @param isOriginal is the base file a pre-acid file
* @param bucket the bucket we are reading
* @param options the options to read with
* @param deltaDirectory the list of delta directories to include
* @throws IOException
*/
OrcRawRecordMerger(Configuration conf,
boolean collapseEvents,
Reader reader,
boolean isOriginal,
int bucket,
ValidTxnList validTxnList,
Reader.Options options,
Path[] deltaDirectory) throws IOException {
this.conf = conf;
this.collapse = collapseEvents;
this.offset = options.getOffset();
this.length = options.getLength();
this.validTxnList = validTxnList;
    // modify the options to reflect the event instead of the base row
Reader.Options eventOptions = createEventOptions(options);
if (reader == null) {
baseReader = null;
} else {
// find the min/max based on the offset and length
if (isOriginal) {
discoverOriginalKeyBounds(reader, bucket, options);
} else {
discoverKeyBounds(reader, options);
}
LOG.info("min key = " + minKey + ", max key = " + maxKey);
// use the min/max instead of the byte range
ReaderPair pair;
ReaderKey key = new ReaderKey();
if (isOriginal) {
options = options.clone();
options.range(options.getOffset(), Long.MAX_VALUE);
pair = new OriginalReaderPair(key, reader, bucket, minKey, maxKey,
options);
} else {
pair = new ReaderPair(key, reader, bucket, minKey, maxKey,
eventOptions);
}
// if there is at least one record, put it in the map
if (pair.nextRecord != null) {
readers.put(key, pair);
}
baseReader = pair.recordReader;
}
// we always want to read all of the deltas
eventOptions.range(0, Long.MAX_VALUE);
// Turn off the sarg before pushing it to delta. We never want to push a sarg to a delta as
// it can produce wrong results (if the latest valid version of the record is filtered out by
// the sarg) or ArrayOutOfBounds errors (when the sarg is applied to a delete record)
eventOptions.searchArgument(null, null);
if (deltaDirectory != null) {
for(Path delta: deltaDirectory) {
ReaderKey key = new ReaderKey();
Path deltaFile = AcidUtils.createBucketFile(delta, bucket);
FileSystem fs = deltaFile.getFileSystem(conf);
long length = getLastFlushLength(fs, deltaFile);
if (length != -1 && fs.exists(deltaFile)) {
Reader deltaReader = OrcFile.createReader(deltaFile,
OrcFile.readerOptions(conf).maxLength(length));
ReaderPair deltaPair = new ReaderPair(key, deltaReader, bucket, minKey,
maxKey, eventOptions);
if (deltaPair.nextRecord != null) {
readers.put(key, deltaPair);
}
}
}
}
// get the first record
    Map.Entry<ReaderKey, ReaderPair> entry = readers.pollFirstEntry();
if (entry == null) {
columns = 0;
primary = null;
} else {
primary = entry.getValue();
if (readers.isEmpty()) {
secondaryKey = null;
} else {
secondaryKey = readers.firstKey();
}
// get the number of columns in the user's rows
columns = primary.getColumns();
}
}
/**
* Read the side file to get the last flush length.
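   * The side file is a sequence of longs, one appended per flush, bounding
   * the delta to its last complete footer. For example (illustrative): a
   * side file holding 8192 then 16384 yields 16384; a missing side file
   * yields Long.MAX_VALUE (read the whole file); an empty one yields -1,
   * which makes the caller skip the delta.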
* @param fs the file system to use
* @param deltaFile the path of the delta file
* @return the maximum size of the file to use
* @throws IOException
*/
private static long getLastFlushLength(FileSystem fs,
Path deltaFile) throws IOException {
Path lengths = OrcRecordUpdater.getSideFile(deltaFile);
long result = Long.MAX_VALUE;
try {
FSDataInputStream stream = fs.open(lengths);
result = -1;
while (stream.available() > 0) {
result = stream.readLong();
}
stream.close();
return result;
} catch (IOException ioe) {
return result;
}
}
@VisibleForTesting
RecordIdentifier getMinKey() {
return minKey;
}
@VisibleForTesting
RecordIdentifier getMaxKey() {
return maxKey;
}
@VisibleForTesting
ReaderPair getCurrentReader() {
return primary;
}
@VisibleForTesting
  Map<ReaderKey, ReaderPair> getOtherReaders() {
return readers;
}
@Override
public boolean next(RecordIdentifier recordIdentifier,
OrcStruct prev) throws IOException {
boolean keysSame = true;
while (keysSame && primary != null) {
// The primary's nextRecord is the next value to return
OrcStruct current = primary.nextRecord;
recordIdentifier.set(primary.key);
// Advance the primary reader to the next record
primary.next(extraValue);
// Save the current record as the new extraValue for next time so that
// we minimize allocations
extraValue = current;
// now that the primary reader has advanced, we need to see if we
// continue to read it or move to the secondary.
if (primary.nextRecord == null ||
primary.key.compareTo(secondaryKey) > 0) {
// if the primary isn't done, push it back into the readers
if (primary.nextRecord != null) {
readers.put(primary.key, primary);
}
// update primary and secondaryKey
        Map.Entry<ReaderKey, ReaderPair> entry = readers.pollFirstEntry();
if (entry != null) {
primary = entry.getValue();
if (readers.isEmpty()) {
secondaryKey = null;
} else {
secondaryKey = readers.firstKey();
}
} else {
primary = null;
}
}
// if this transaction isn't ok, skip over it
if (!validTxnList.isTxnValid(
((ReaderKey) recordIdentifier).getCurrentTransactionId())) {
continue;
}
// if we are collapsing, figure out if this is a new row
if (collapse) {
keysSame = prevKey.compareRow(recordIdentifier) == 0;
if (!keysSame) {
prevKey.set(recordIdentifier);
}
} else {
keysSame = false;
}
// set the output record by fiddling with the pointers so that we can
// avoid a copy.
prev.linkFields(current);
}
return !keysSame;
}
@Override
public RecordIdentifier createKey() {
return new ReaderKey();
}
@Override
public OrcStruct createValue() {
return new OrcStruct(OrcRecordUpdater.FIELDS);
}
@Override
public long getPos() throws IOException {
return offset + (long)(getProgress() * length);
}
@Override
public void close() throws IOException {
for(ReaderPair pair: readers.values()) {
pair.recordReader.close();
}
}
@Override
public float getProgress() throws IOException {
return baseReader == null ? 1 : baseReader.getProgress();
}
@Override
public ObjectInspector getObjectInspector() {
// Read the configuration parameters
String columnNameProperty = conf.get(serdeConstants.LIST_COLUMNS);
// NOTE: if "columns.types" is missing, all columns will be of String type
String columnTypeProperty = conf.get(serdeConstants.LIST_COLUMN_TYPES);
// Parse the configuration parameters
    ArrayList<String> columnNames = new ArrayList<String>();
    Deque<Integer> virtualColumns = new ArrayDeque<Integer>();
if (columnNameProperty != null && columnNameProperty.length() > 0) {
String[] colNames = columnNameProperty.split(",");
for (int i = 0; i < colNames.length; i++) {
if (VirtualColumn.VIRTUAL_COLUMN_NAMES.contains(colNames[i])) {
virtualColumns.addLast(i);
} else {
columnNames.add(colNames[i]);
}
}
}
if (columnTypeProperty == null) {
// Default type: all string
StringBuilder sb = new StringBuilder();
for (int i = 0; i < columnNames.size(); i++) {
if (i > 0) {
sb.append(":");
}
sb.append("string");
}
columnTypeProperty = sb.toString();
}
    ArrayList<TypeInfo> fieldTypes =
TypeInfoUtils.getTypeInfosFromTypeString(columnTypeProperty);
while (virtualColumns.size() > 0) {
      fieldTypes.remove(virtualColumns.removeLast().intValue());
}
StructTypeInfo rowType = new StructTypeInfo();
rowType.setAllStructFieldNames(columnNames);
rowType.setAllStructFieldTypeInfos(fieldTypes);
return OrcRecordUpdater.createEventSchema
(OrcStruct.createObjectInspector(rowType));
}
@Override
public boolean isDelete(OrcStruct value) {
return OrcRecordUpdater.getOperation(value) == OrcRecordUpdater.DELETE_OPERATION;
}
/**
* Get the number of columns in the underlying rows.
* @return 0 if there are no base and no deltas.
*/
public int getColumns() {
return columns;
}
}