All Downloads are FREE. Search and download functionalities are using the official Maven repository.

odkl.analysis.spark.util.collection.CompactBuffer.scala Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*
 * Copy of package org.apache.spark.util.collection.CompactBuffer
 *
 * Modifications:
 * - Added @SerialVersionUID
 * - made public
 */
package odkl.analysis.spark.util.collection

import scala.math.Ordering
import scala.reflect.ClassTag

/**
 * An append-only buffer similar to ArrayBuffer, but more memory-efficient for small buffers.
 * ArrayBuffer always allocates an Object array to store the data, with 16 entries by default,
 * so it has about 80-100 bytes of overhead. In contrast, CompactBuffer can keep up to two
 * elements in fields of the main object, and only allocates an Array[AnyRef] if there are more
 * entries than that. This makes it more efficient for operations like groupBy where we expect
 * some keys to have very few elements.
 */
@SerialVersionUID(1L)
class CompactBuffer[T: ClassTag] extends Seq[T] with Serializable {
  // First two elements
  private var element0: T = _
  private var element1: T = _

  // Number of elements, including our two in the main object
  private var curSize = 0

  // Array for extra elements
  private [CompactBuffer] var otherElements: Array[T] = null

  def this(initialCapacity: Int) = {
    this()
    if (initialCapacity > 2) {
      otherElements = new Array[T](initialCapacity - 2)
    }
  }

  def apply(position: Int): T = {
    if (position < 0 || position >= curSize) {
      throw new IndexOutOfBoundsException
    }
    if (position == 0) {
      element0
    } else if (position == 1) {
      element1
    } else {
      otherElements(position - 2)
    }
  }

  private def update(position: Int, value: T): Unit = {
    if (position < 0 || position >= curSize) {
      throw new IndexOutOfBoundsException
    }
    if (position == 0) {
      element0 = value
    } else if (position == 1) {
      element1 = value
    } else {
      otherElements(position - 2) = value
    }
  }

  def += (value: T): CompactBuffer[T] = {
    val newIndex = curSize
    if (newIndex == 0) {
      element0 = value
      curSize = 1
    } else if (newIndex == 1) {
      element1 = value
      curSize = 2
    } else {
      growToSize(curSize + 1)
      otherElements(newIndex - 2) = value
    }
    this
  }

  def ++= (values: TraversableOnce[T]): CompactBuffer[T] = {
    values match {
      // Optimize merging of CompactBuffers, used in cogroup and groupByKey
      case compactBuf: CompactBuffer[T] =>
        val oldSize = curSize
        // Copy the other buffer's size and elements to local variables in case it is equal to us
        val itsSize = compactBuf.curSize
        val itsElements = compactBuf.otherElements
        growToSize(curSize + itsSize)
        if (itsSize == 1) {
          this(oldSize) = compactBuf.element0
        } else if (itsSize == 2) {
          this(oldSize) = compactBuf.element0
          this(oldSize + 1) = compactBuf.element1
        } else if (itsSize > 2) {
          this(oldSize) = compactBuf.element0
          this(oldSize + 1) = compactBuf.element1
          // At this point our size is also above 2, so just copy its array directly into ours.
          // Note that since we added two elements above, the index in this.otherElements that we
          // should copy to is oldSize.
          System.arraycopy(itsElements, 0, otherElements, oldSize, itsSize - 2)
        }

      case _ =>
        values.foreach(e => this += e)
    }
    this
  }

  override def length: Int = curSize

  override def size: Int = curSize

  override def iterator: Iterator[T] = new Iterator[T] {
    private var pos = 0
    override def hasNext: Boolean = pos < curSize
    override def next(): T = {
      if (!hasNext) {
        throw new NoSuchElementException
      }
      pos += 1
      apply(pos - 1)
    }
  }

  /** Increase our size to newSize and grow the backing array if needed. */
  private def growToSize(newSize: Int): Unit = {
    if (newSize < 0) {
      throw new UnsupportedOperationException("Can't grow buffer past Int.MaxValue elements")
    }
    val capacity = if (otherElements != null) otherElements.length + 2 else 2
    if (newSize > capacity) {
      var newArrayLen = 8
      while (newSize - 2 > newArrayLen) {
        newArrayLen *= 2
        if (newArrayLen == Int.MinValue) {
          // Prevent overflow if we double from 2^30 to 2^31, which will become Int.MinValue.
          // Note that we set the new array length to Int.MaxValue - 2 so that our capacity
          // calculation above still gives a positive integer.
          newArrayLen = Int.MaxValue - 2
        }
      }
      val newArray = new Array[T](newArrayLen)
      if (otherElements != null) {
        System.arraycopy(otherElements, 0, newArray, 0, otherElements.length)
      }
      otherElements = newArray
    }
    curSize = newSize
  }

  /**
   * More efficient implementation of [[scala.collection.SeqLike.sortBy]]
   *
   * @param f     f the transformation function mapping elements
   *              to some other domain `B`.
   * @param ord   the ordering assumed on domain `B`.
   * @tparam B    the target type of the transformation `f`, and the type where
   *              the ordering `ord` is defined.
   * @return      a [[CompactBuffer]] consisting of the elements of this [[CompactBuffer]]
   *               sorted according to the ordering where `x < y` if
   *              `ord.lt(f(x), f(y))`.
   */
  override def sortBy[B](f: T => B)(implicit ord: Ordering[B]): CompactBuffer[T] = sorted(ord on f)

  /**
   * More efficient implementation of [[scala.collection.SeqLike.sorted]]
   *
   * @param ord   the ordering to be used to compare elements.
   * @return      a [[CompactBuffer]] consisting of the elements of this [[CompactBuffer]]
   *              sorted according to the ordering `ord`.
   */
  override def sorted[B >: T](implicit ord: Ordering[B]): CompactBuffer[T] = {
    val len = this.size
    val arr = new Array[T](len)
    for (i <- indices) {
      arr(i) = this(i)
    }
    java.util.Arrays.sort(arr.asInstanceOf[Array[Object]], ord.asInstanceOf[Ordering[Object]])
    val res = new CompactBuffer[T](len)
    for (i <- arr.indices) {
      res += arr(i)
    }
    res
  }

  //  def sortBy
}

private[spark] object CompactBuffer {
  def apply[T: ClassTag](): CompactBuffer[T] = new CompactBuffer[T]

  def apply[T: ClassTag](value: T): CompactBuffer[T] = {
    val buf = new CompactBuffer[T]
    buf += value
  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy