/*
 * Copyright (C) 2012 The Regents of The University of California.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package shark.memstore2.column

import java.nio.ByteBuffer
import java.nio.ByteOrder

import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory
import org.apache.hadoop.hive.serde2.objectinspector.StructField

import shark.LogHelper


trait ColumnBuilder[T] {
private[column] def t: ColumnType[T, _]
private[memstore2] def stats: ColumnStats[T]
private var _buffer: ByteBuffer = _
private var _initialSize: Int = _
private var _columnName: String = _
def columnName: String = _columnName
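
  /**
   * Append a field value: extract it through the ObjectInspector, grow the
   * backing buffer if needed, write it, and record it in the column stats.
   */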
def append(o: Object, oi: ObjectInspector) {
val v = t.get(o, oi)
_buffer = growIfNeeded(_buffer, t.actualSize(v))
t.append(v, _buffer)
gatherStats(v)
}
protected def gatherStats(v: T) {
stats.append(v)
}
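
  /**
   * Seal the buffer for reading: cap the limit at the current write position
   * and rewind to the start.
   */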
def build(): ByteBuffer = {
_buffer.limit(_buffer.position())
_buffer.rewind()
_buffer
}
/**
* Initialize with an approximate lower bound on the expected number
* of elements in this column.
*/
def initialize(initialSize: Int, colName: String = ""): ByteBuffer = {
_columnName = colName
    // Default to 10 million elements when no initial size is given
    // (e.g. roughly 40MB of buffer for a 4-byte INT column).
    _initialSize = if (initialSize == 0) 1024 * 1024 * 10 else initialSize
_buffer = ByteBuffer.allocate(_initialSize * t.defaultSize + 4 + 4)
_buffer.order(ByteOrder.nativeOrder())
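    // Note: putInt returns the buffer itself, so writing the type ID header
    // below also provides this method's ByteBuffer return value.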
_buffer.putInt(t.typeID)
}
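
  /**
   * Returns `orig` if it can fit `size` more bytes; otherwise copies it into a
   * larger buffer. Worked example (illustrative numbers): a full 1024-byte
   * buffer asked to fit 64 more bytes grows by max(1024 / 8 + 1, 64) = 129
   * bytes to 1153, while a 4000-byte append onto a full 4096-byte buffer grows
   * by the whole 4000 bytes to 8096.
   */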
protected def growIfNeeded(orig: ByteBuffer, size: Int): ByteBuffer = {
val capacity = orig.capacity()
if (orig.remaining() < size) {
      // Grow by at least one eighth of the current capacity, or by the
      // requested size if that is larger.
      val additionalSize = capacity / 8 + 1
      val newSize =
        if (additionalSize < size) capacity + size else capacity + additionalSize
      val pos = orig.position()
      orig.clear()
      val b = ByteBuffer.allocate(newSize)
      b.order(ByteOrder.nativeOrder())
      // Copy the bytes written so far; put() returns `b`, which becomes the
      // result of this branch.
      b.put(orig.array(), 0, pos)
} else {
orig
}
}
}
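
// A minimal usage sketch, assuming the IntColumnBuilder defined alongside this
// trait and a writable-int ObjectInspector from Hive's serde2 layer:
//
//   import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory
//   import org.apache.hadoop.io.IntWritable
//
//   val builder = new IntColumnBuilder
//   builder.initialize(1000, "price")
//   val oi = PrimitiveObjectInspectorFactory.writableIntObjectInspector
//   Seq(1, 2, 3).foreach(i => builder.append(new IntWritable(i), oi))
//   val buffer = builder.build()  // type ID header followed by encoded values
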
class DefaultColumnBuilder[T](val stats: ColumnStats[T], val t: ColumnType[T, _])
extends CompressedColumnBuilder[T] with NullableColumnBuilder[T]
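
// Trait linearization note: in DefaultColumnBuilder the mixins resolve
// right-to-left, so an append()/build() call reaches NullableColumnBuilder
// (defined elsewhere in this package) first; assuming it overrides those
// methods, its super calls then flow through CompressedColumnBuilder down to
// the base ColumnBuilder implementation.
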
trait CompressedColumnBuilder[T] extends ColumnBuilder[T] with LogHelper {
private var compressionSchemes = Array.empty[CompressionAlgorithm]
/**
* Set the compression schemes to be used for this column. Only schemes that support
* the column's data type will be kept in the list.
*/
def setCompressionSchemes(schemes: CompressionAlgorithm*) {
compressionSchemes = schemes.filter(_.supportsType(t)).toArray
}
/**
* Determines whether a particular compression algorithm should apply given the compression
* ratio. Test code can override this to force specific compression even if the compression
* ratio is not ideal.
*/
def shouldApply(scheme: CompressionAlgorithm): Boolean = {
scheme.compressionRatio < 0.8
}
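
  // Feed every candidate scheme each value so it can estimate its achievable
  // compression ratio, then delegate to the base stats collection.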
override protected def gatherStats(v: T) {
var i = 0
while (i < compressionSchemes.length) {
compressionSchemes(i).gatherStatsForCompressibility(v, t)
i += 1
}
super.gatherStats(v)
}
  override def build(): ByteBuffer = {
    val b = super.build()
    val scheme: CompressionAlgorithm =
      if (compressionSchemes.isEmpty) {
        new NoCompression
      } else {
        val candidate = compressionSchemes.minBy(_.compressionRatio)
        if (shouldApply(candidate)) candidate else new NoCompression
      }
logInfo("Compression scheme chosen for [%s] is %s with ratio %f".format(
columnName, scheme.compressionType.getClass.getSimpleName, scheme.compressionRatio))
scheme.compress(b, t)
}
}
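
// Selection example (ratios illustrative): with RLE at ratio 0.3 and
// DictionaryEncoding at ratio 0.6 registered, minBy picks RLE, and 0.3 < 0.8
// passes the shouldApply() threshold, so the buffer is RLE-compressed. Were
// the best available ratio 0.9 instead, NoCompression would be chosen.
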
object ColumnBuilder {
def create(structField: StructField, shouldCompress: Boolean = true): ColumnBuilder[_] = {
val columnOi = structField.getFieldObjectInspector
val v = columnOi.getCategory match {
case ObjectInspector.Category.PRIMITIVE => {
columnOi.asInstanceOf[PrimitiveObjectInspector].getPrimitiveCategory match {
case PrimitiveCategory.BOOLEAN => new BooleanColumnBuilder
case PrimitiveCategory.INT => new IntColumnBuilder
case PrimitiveCategory.LONG => new LongColumnBuilder
case PrimitiveCategory.FLOAT => new FloatColumnBuilder
case PrimitiveCategory.DOUBLE => new DoubleColumnBuilder
case PrimitiveCategory.STRING => new StringColumnBuilder
case PrimitiveCategory.SHORT => new ShortColumnBuilder
case PrimitiveCategory.BYTE => new ByteColumnBuilder
case PrimitiveCategory.TIMESTAMP => new TimestampColumnBuilder
case PrimitiveCategory.BINARY => new BinaryColumnBuilder
// TODO: add decimal column.
          case _ => throw new MemoryStoreException(
            "Invalid primitive object inspector category: " +
            columnOi.asInstanceOf[PrimitiveObjectInspector].getPrimitiveCategory)
}
}
case _ => new GenericColumnBuilder(columnOi)
}
if (shouldCompress) {
v.setCompressionSchemes(
new NoCompression,
new RLE,
new BooleanBitSetCompression,
new DictionaryEncoding,
new IntDeltaEncoding,
new LongDeltaEncoding)
}
v
}
}
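
// A minimal sketch of creating per-column builders for a two-column schema,
// using Hive's standard struct ObjectInspector factory (field names are
// illustrative):
//
//   import java.util.Arrays
//   import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory
//   import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory
//
//   val structOi = ObjectInspectorFactory.getStandardStructObjectInspector(
//     Arrays.asList("id", "name"),
//     Arrays.asList[ObjectInspector](
//       PrimitiveObjectInspectorFactory.javaIntObjectInspector,
//       PrimitiveObjectInspectorFactory.javaStringObjectInspector))
//
//   val builders = structOi.getAllStructFieldRefs.toArray.map { f =>
//     ColumnBuilder.create(f.asInstanceOf[StructField])
//   }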