tech.sourced.engine.iterator.ChainableIterator.scala Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of jgit-spark-connector Show documentation
jgit-spark-connector
The newest version!
package tech.sourced.engine.iterator

import org.apache.spark.internal.Logging
import org.apache.spark.sql.Row
import org.eclipse.jgit.api.errors.GitAPIException
import org.eclipse.jgit.errors.{
  IncorrectObjectTypeException,
  MissingObjectException,
  RevWalkException
}
import org.eclipse.jgit.lib.Repository
import tech.sourced.engine.exception.RepositoryException
import tech.sourced.engine.util.CompiledFilter

import scala.annotation.tailrec

/**
  * Iterator that can have a previous iterator to output chained values.
  *
  * For every raw row produced by the previous iterator (if any), a new internal
  * iterator is loaded through `loadIterator`; the columns of the previous row and
  * the columns mapped from the internal iterator's current item are merged to
  * build the resulting row.
  *
  * @param finalColumns   final columns that must be in the resultant row
  * @param prevIter       previous iterator, if the iterator is chained; null otherwise
  * @param filters        filters for the iterator
  * @param repo           repository attached to the [[RepositoryException]]s that are
  *                       logged or thrown when reading fails
  * @param skipReadErrors if true, exceptions not handled specifically are logged and
  *                       skipped; otherwise they are rethrown wrapped in a
  *                       [[RepositoryException]]
  * @tparam T type of data returned by the internal iterator
  */
abstract class ChainableIterator[T](finalColumns: Array[String],
                                    prevIter: ChainableIterator[_],
                                    filters: Seq[CompiledFilter],
                                    val repo: Repository,
                                    skipReadErrors: Boolean
                                   ) extends Iterator[Row] with Logging {

  /** Raw values of the row. */
  type RawRow = Map[String, Any]

  /** Instance of the internal iterator, null until it is loaded for the first time. */
  private var iter: Iterator[T] = _

  /** The current row of the prevIter, null always if there is no prevIter. */
  private var prevIterCurrentRow: RawRow = _

  /** The current row of the internal iterator. */
  private[iterator] var currentRow: T = _

  /**
    * Returns the internal iterator that will return the data used to construct the final row.
    *
    * @param filters filters for the iterator
    * @return internal iterator
    */
  protected def loadIterator(filters: Seq[CompiledFilter]): Iterator[T]

  /**
    * Loads the next internal iterator using the filters given to this iterator.
    *
    * @return internal iterator
    */
  private def loadIterator: Iterator[T] = loadIterator(filters)

  /**
    * Given the object returned by the internal iterator, this method must transform
    * that object into a RawRow.
    *
    * @param obj object returned by the internal iterator
    * @return raw row
    */
  protected def mapColumns(obj: T): RawRow

  /**
    * Keeps calling [[loadNext]] until it gives a definitive answer.
    *
    * NOTE(review): if an error that is logged and skipped in [[loadNext]] is raised
    * again on every retry (for instance `loadIterator` failing when there is no
    * previous iterator), this looks like it would retry indefinitely -- confirm.
    *
    * @return whether there is a next item in the iterator
    */
  @tailrec
  final override def hasNext: Boolean = {
    loadNext match {
      case Some(v) => v
      case None => hasNext
    }
  }

  /**
    * Load the next iterator and returns if there is a next item or not. If
    * it returns some value it means we know for sure there is something or
    * not. If it returns None, it means we don't know and another call to
    * loadNext is required.
    *
    * @return whether there is a next item in the iterator or not, or if we
    *         don't know
    * @throws RepositoryException if an exception that is not handled specifically
    *                             is raised and skipReadErrors is false
    */
  final def loadNext: Option[Boolean] = {
    try {
      if (prevIter == null) {
        // Without a previous iterator the internal iterator is loaded once, on
        // the first pass, and its hasNext is the only source of truth.
        if (iter == null) {
          iter = loadIterator
        }

        Some(iter.hasNext)
      } else if (iter == null && !loadForNextPrevRow()) {
        // First pass of a chained iterator and the previous one has no rows.
        Some(false)
      } else if (iter.hasNext) {
        Some(true)
      } else if (loadForNextPrevRow()) {
        // A fresh internal iterator was loaded for the next previous row, but we
        // don't know yet whether it has items: the caller must ask again.
        None
      } else {
        Some(false)
      }
    } catch {
      case e: IncorrectObjectTypeException =>
        log.debug("incorrect object type", new RepositoryException(repo, e))
        None
      case e: MissingObjectException =>
        log.warn("missing object", new RepositoryException(repo, e))
        None
      case e: RevWalkException =>
        log.warn("rev walk exception", new RepositoryException(repo, e))
        None
      case e: GitAPIException =>
        log.warn("git api exception", new RepositoryException(repo, e))
        None
      // Any other Throwable (errors, control throwables) is deliberately not
      // caught here and propagates untouched.
      case e: Exception =>
        if (skipReadErrors) {
          log.warn("read error skipped", new RepositoryException(repo, e))
          None
        } else {
          throw new RepositoryException(repo, e)
        }
    }
  }

  /**
    * Advances the previous iterator, if it has more rows, and loads the internal
    * iterator for its new current row. Must only be called when prevIter is not null.
    *
    * @return false if the previous iterator is exhausted, true if a new internal
    *         iterator has been loaded
    */
  private def loadForNextPrevRow(): Boolean = {
    if (prevIter.isEmpty) {
      false
    } else {
      prevIterCurrentRow = prevIter.nextRaw
      iter = loadIterator
      true
    }
  }

  /**
    * Takes the next item of the internal iterator, stores it as the current row and
    * merges its mapped columns on top of the columns of the previous iterator's
    * current row, if there is one.
    *
    * @return merged raw row
    * @throws NoSuchElementException if the internal iterator was never loaded and
    *                                there is nothing to iterate over
    */
  private def advanceRaw(): RawRow = {
    // The internal iterator is only loaded by hasNext/loadNext. If the caller
    // skipped that step, load it here instead of dereferencing a null iterator.
    if (iter == null && !hasNext) {
      throw new NoSuchElementException("next on empty iterator")
    }

    currentRow = iter.next
    // FIXME: if there's a repeated column name, value
    // will be the last one added. This could be solved by
    // qualifying all column names with their source.
    val row = mapColumns(currentRow)
    if (prevIterCurrentRow != null) {
      prevIterCurrentRow ++ row
    } else {
      row
    }
  }

  /**
    * Returns the next row, containing only the final columns in the order they were
    * requested.
    *
    * @return next row
    */
  override def next: Row = {
    val mappedValues = advanceRaw()
    val values = finalColumns.map(c => mappedValues(c))
    Row(values: _*)
  }

  /**
    * Returns the next row as a raw row, with all the columns of this iterator and of
    * the previous iterator's current row, without projecting the final columns.
    *
    * @return next raw row
    */
  def nextRaw: RawRow = advanceRaw()
}