/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.hive.common.io.encoded;

import java.util.Arrays;
import java.util.List;
import java.util.concurrent.atomic.AtomicInteger;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * A block of data for a given section of a file, similar to a VectorizedRowBatch (VRB) but in
 * encoded form. Stores a set of buffers for each encoded stream that is part of each column.
 */
public class EncodedColumnBatch<BatchKey> {
  private static final Logger LOG = LoggerFactory.getLogger(EncodedColumnBatch.class);

  /**
   * A slice of the data for one stream of some column, stored inside MemoryBuffers.
   * A ColumnStreamData can be reused across many EncodedColumnBatch instances (e.g. a
   * dictionary stream), so it tracks the number of such users via a reference count.
   */
  public static class ColumnStreamData {
    private List<MemoryBuffer> cacheBuffers;
    /** Base offset from the beginning of the indexable unit; for example, for ORC, this is the
     * offset from the compression block (CB) in a compressed file, or from the start of the
     * stream in an uncompressed file. */
    private int indexBaseOffset = 0;

    /** Reference count. */
    private AtomicInteger refCount = new AtomicInteger(0);

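    /** Clears buffer references and state so this object can be reused; note that the
     * underlying buffer memory is not released here and must be managed by the owner. */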
    public void reset() {
      cacheBuffers.clear();
      refCount.set(0);
      indexBaseOffset = 0;
    }

    public void incRef() {
      refCount.incrementAndGet();
    }

    public int decRef() {
      int i = refCount.decrementAndGet();
      assert i >= 0;
      return i;
    }

    public List<MemoryBuffer> getCacheBuffers() {
      return cacheBuffers;
    }

    public void setCacheBuffers(List<MemoryBuffer> cacheBuffers) {
      this.cacheBuffers = cacheBuffers;
    }

    public int getIndexBaseOffset() {
      return indexBaseOffset;
    }

    public void setIndexBaseOffset(int indexBaseOffset) {
      this.indexBaseOffset = indexBaseOffset;
    }

    @Override
    public String toString() {
      StringBuilder bufStr = new StringBuilder();
      if (cacheBuffers != null) {
        for (MemoryBuffer mb : cacheBuffers) {
          bufStr.append(mb.getClass().getSimpleName()).append(" with ")
              .append(mb.getByteBufferRaw().remaining()).append(" bytes, ");
        }
      }
      return "ColumnStreamData [cacheBuffers=[" + bufStr
          + "], indexBaseOffset=" + indexBaseOffset + "]";
    }

  }

  /** The key used to map this batch to its source location in the file. */
  protected BatchKey batchKey;
  /**
   * Stream data for each column whose corresponding hasData entry is true.
   * For each column, streams are indexed by stream kind (for ORC), with missing elements
   * being null.
   */
  protected ColumnStreamData[][] columnData;
  /**
   * Indicates which columns have data. Indexed by the column ID in the ORC file schema;
   * columns that are not included have no data. Entries correspond to columnData elements.
   */
  protected boolean[] hasData;

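  /** Clears the presence flags and drops all per-column stream references so this batch
   * object can be reused. */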
  public void reset() {
    if (hasData != null) {
      Arrays.fill(hasData, false);
    }
    if (columnData == null) return;
    for (int i = 0; i < columnData.length; ++i) {
      if (columnData[i] == null) continue;
      Arrays.fill(columnData[i], null);
    }
  }


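  /** Marks the column as present and (re)allocates its stream array if the stream count
   * has changed. */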
  public void initColumn(int colIx, int streamCount) {
    hasData[colIx] = true;
    if (columnData[colIx] == null || columnData[colIx].length != streamCount) {
      columnData[colIx] = new ColumnStreamData[streamCount];
    }
  }

  public void setStreamData(int colIx, int streamIx, ColumnStreamData csd) {
    assert hasData[colIx];
    columnData[colIx][streamIx] = csd;
  }

  public BatchKey getBatchKey() {
    return batchKey;
  }

  public ColumnStreamData[] getColumnData(int colIx) {
    if (!hasData[colIx]) throw new AssertionError("No data for column " + colIx);
    return columnData[colIx];
  }

  public int getTotalColCount() {
    return columnData.length; // Includes the columns that have no data
  }

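  /** Resizes hasData and columnData for a new column count, reusing the existing arrays
   * where possible and carrying over per-column stream data. */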
  protected void resetColumnArrays(int columnCount) {
    if (hasData != null && columnCount == hasData.length) {
      Arrays.fill(hasData, false);
    } else {
      hasData = new boolean[columnCount];
    }
    ColumnStreamData[][] columnData = new ColumnStreamData[columnCount][];
    if (this.columnData != null) {
      for (int i = 0; i < Math.min(columnData.length, this.columnData.length); ++i) {
        columnData[i] = this.columnData[i];
      }
    }
    this.columnData = columnData;
  }

  public boolean hasData(int colIx) {
    return hasData[colIx];
  }
}
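
/*
 * A minimal usage sketch, not part of the original Hive source: it shows one way a file
 * reader could populate a batch and a consumer could drain it. The subclass name, the
 * String batch key, and the column/stream indices are all hypothetical; a real reader
 * (e.g. LLAP's ORC reader) would attach MemoryBuffer-backed cache data to each stream.
 */
class EncodedColumnBatchUsageSketch extends EncodedColumnBatch<String> {
  void init(String key, int columnCount) {
    this.batchKey = key;            // protected field: identifies the file region covered
    resetColumnArrays(columnCount); // size hasData/columnData for the file schema
  }

  static void example() {
    EncodedColumnBatchUsageSketch batch = new EncodedColumnBatchUsageSketch();
    batch.init("file1/stripe0", 3);

    // Producer side: column 1 has two streams (e.g. PRESENT and DATA in ORC terms).
    batch.initColumn(1, 2);
    ColumnStreamData csd = new ColumnStreamData();
    csd.setCacheBuffers(new java.util.ArrayList<>());
    csd.incRef(); // each batch sharing this stream data takes a reference
    batch.setStreamData(1, 0, csd);

    // Consumer side: check presence before asking for data; release streams when done.
    for (int colIx = 0; colIx < batch.getTotalColCount(); ++colIx) {
      if (!batch.hasData(colIx)) continue;
      for (ColumnStreamData stream : batch.getColumnData(colIx)) {
        if (stream != null && stream.decRef() == 0) {
          stream.reset(); // last user: make the object reusable
        }
      }
    }
  }
}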