package tech.sourced.engine

import java.nio.file.Paths
import java.util.Properties

import org.apache.spark.sql.functions.{lit, when}
import org.apache.spark.SparkException
import org.apache.spark.groupon.metrics.UserMetricsSystem
import org.apache.spark.internal.Logging
import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession}
import tech.sourced.engine.rule._
import tech.sourced.engine.udf.ConcatArrayUDF

import scala.collection.JavaConversions.asScalaBuffer

/**
  * Engine is the main entry point to all usage of the source{d} jgit-spark-connector.
  * It provides methods to configure all the available options as well as
  * methods to start analysing repositories of code.
  *
  * {{{
  * import tech.sourced.engine._
  *
  * val engine = Engine(sparkSession, "/path/to/repositories")
  * }}}
  *
  * NOTE: Keep in mind that you will need to register the UDFs in the session
  * manually if you choose to instantiate this class directly instead of using
  * the companion object.
  *
  * {{{
  * import tech.sourced.engine.{Engine, SessionFunctions}
  *
  * val engine = new Engine(sparkSession, "/path/to/repositories", "siva")
  * sparkSession.registerUDFs()
  * }}}
  *
  * The starting point is getRepositories, which generates a DataFrame of
  * repositories, the very first thing you need in order to analyse
  * repositories of code.
  *
  * @constructor creates an Engine instance with the given Spark session.
  * @param session            Spark session to be used
  * @param repositoriesPath   path where the repositories are stored
  * @param repositoriesFormat format of the repositories: siva, bare or standard
  */
class Engine(val session: SparkSession,
             repositoriesPath: String,
             repositoriesFormat: String) extends Logging {

  UserMetricsSystem.initialize(session.sparkContext, "Engine")

  this.setRepositoriesPath(repositoriesPath)
  this.setRepositoriesFormat(repositoriesFormat)
  session.registerUDFs()
  session.experimental.extraOptimizations = Seq(
    AddSourceToAttributes,
    SquashGitRelationsJoin,
    SquashMetadataRelationsJoin
  )
  registerViews()

  /**
    * Register the initial views with the DefaultSource.
    */
  private def registerViews(): Unit = {
    Sources.orderedSources.foreach(table => {
      session.read.format(DefaultSourceName)
        .option(DefaultSource.TableNameKey, table)
        .load(session.sqlContext.getConf(RepositoriesPathKey))
        .createOrReplaceTempView(table)
    })
  }

  /**
    * Registers in the current session the views of the MetadataSource so the data is obtained
    * from the metadata database instead of reading the repositories with the DefaultSource.
    *
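    * A minimal usage sketch (the path is illustrative and assumes a metadata
    * database was previously written there with [[Engine#saveMetadata]]):
    *
    * {{{
    * engine.saveMetadata("/path/to/metadata")
    * val reposDf = engine.fromMetadata("/path/to/metadata").getRepositories
    * }}}
    *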
    * @param dbPath path to the folder that contains the database.
    * @param dbName name of the database file ("engine_metadata.db" by default).
    * @return the same instance of the engine
    */
  def fromMetadata(dbPath: String, dbName: String = MetadataSource.DefaultDbName): Engine = {
    Seq(RepositoriesTable, ReferencesTable, CommitsTable, TreeEntriesTable).foreach(table => {
      session.read.format(MetadataSourceName)
        .option(DefaultSource.TableNameKey, table)
        .option(MetadataSource.DbPathKey, dbPath)
        .option(MetadataSource.DbNameKey, dbName)
        .load()
        .createOrReplaceTempView(table)
    })
    this
  }

  /**
    * Registers in the current session the views of the DefaultSource so the data is obtained
    * by reading the repositories instead of reading from the MetadataSource. This has no effect
    * if [[Engine#fromMetadata]] has not been called before.
    *
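    * A minimal sketch (the path is illustrative):
    *
    * {{{
    * // read from a previously saved metadata database...
    * engine.fromMetadata("/path/to/metadata")
    *
    * // ...and then switch back to reading the repositories directly
    * engine.fromRepositories()
    * }}}
    *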
    * @return the same instance of the engine
    */
  def fromRepositories(): Engine = {
    registerViews()
    this
  }

  /**
    * Returns a DataFrame with the data about the repositories found at
    * the specified repositories path, in the configured repositories format.
    * Before calling this method the repositories path must be set; you can do
    * so by calling setRepositoriesPath or, preferably, by instantiating the
    * Engine using the companion object.
    *
    * {{{
    * val reposDf = engine.getRepositories
    * }}}
    *
    * @return DataFrame
    */
  def getRepositories: DataFrame = getDataSource("repositories", session)

  /**
    * Retrieves the blobs of a list of repositories, reference names and commit hashes.
    * The result will be a [[org.apache.spark.sql.DataFrame]] of all the blobs in
    * the given commits that are in the given references that belong to the given
    * repositories.
    *
    * {{{
    * val blobsDf = engine.getBlobs(repoIds, refNames, hashes)
    * }}}
    *
    * Calling this function with no arguments is the same as:
    *
    * {{{
    * engine.getRepositories.getReferences.getCommits.getTreeEntries.getBlobs
    * }}}
    *
    * @param repositoryIds  List of the repository ids to filter by (optional)
    * @param referenceNames List of reference names to filter by (optional)
    * @param commitHashes   List of commit hashes to filter by (optional)
    * @return [[org.apache.spark.sql.DataFrame]] with blobs of the given commits, refs and repos.
    */
  def getBlobs(repositoryIds: Seq[String] = Seq(),
               referenceNames: Seq[String] = Seq(),
               commitHashes: Seq[String] = Seq()): DataFrame = {
    val df = getRepositories

    var reposDf = df
    if (repositoryIds.nonEmpty) {
      reposDf = reposDf.filter(reposDf("id").isin(repositoryIds: _*))
    }

    var refsDf = reposDf.getReferences
    if (referenceNames.nonEmpty) {
      refsDf = refsDf.filter(refsDf("name").isin(referenceNames: _*))
    }

    var commitsDf = refsDf.getCommits
    if (commitHashes.nonEmpty) {
      commitsDf = commitsDf.getAllReferenceCommits.filter(commitsDf("hash").isin(commitHashes: _*))
    }

    commitsDf.getTreeEntries.getBlobs
  }

  /**
    * This method is only offered for easier usage from Python.
    */
  private[engine] def getBlobs(repositoryIds: java.util.List[String],
                               referenceNames: java.util.List[String],
                               commitHashes: java.util.List[String]): DataFrame =
    getBlobs(
      asScalaBuffer(repositoryIds),
      asScalaBuffer(referenceNames),
      asScalaBuffer(commitHashes)
    )

  /**
    * Sets the path where the siva files of the repositories are stored.
    * Although this can be called directly, the proper way to use Engine is
    * to instantiate it using the Engine companion object, which already
    * asks for the path in its apply method. If you already instantiated the
    * Engine using the companion object you don't need to call this unless
    * you want to change the repositories path.
    * Note that setting this will affect the session, so any other uses of the
    * session outside the Engine instance will also have that config set.
    *
    * {{{
    * engine.setRepositoriesPath("/path/to/repositories")
    * }}}
    *
    * @param path path of the repositories.
    * @return instance of the engine itself
    */
  def setRepositoriesPath(path: String): Engine = {
    session.conf.set(RepositoriesPathKey, path)
    this
  }

  /**
    * Sets the format of the repositories stored at the specified path.
    *
    * The currently supported formats are:
    *
    * - siva: to read siva files
    * - bare: to read bare repositories
    * - standard: to read standard git repositories (with workspace)
    *
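    * For example, to read siva files from the configured path:
    *
    * {{{
    * engine.setRepositoriesFormat("siva")
    * }}}
    *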
    * @param format format of the repositories.
    * @return instance of the engine itself
    */
  def setRepositoriesFormat(format: String): Engine = {
    session.conf.set(RepositoriesFormatKey, format)
    this
  }

  /**
    * Configures the Engine so it won't clean up the unpacked siva files after
    * it's done with them, to avoid having to unpack them again later.
    *
    * {{{
    * // disable cleanup
    * engine.skipCleanup(true)
    *
    * // enable cleanup again
    * engine.skipCleanup(false)
    * }}}
    *
    * @param skip whether to skip cleanup or not
    * @return instance of the engine itself
    */
  def skipCleanup(skip: Boolean): Engine = {
    session.conf.set(SkipCleanupKey, skip)
    this
  }

  /**
    * Configures the Engine so it will skip all read errors that occur while
    * reading siva files or repositories.
    *
    * {{{
    * engine.skipReadErrors(true)
    * }}}
    *
    * @param skip whether to skip read errors or not
    * @return instance of the engine
    */
  def skipReadErrors(skip: Boolean): Engine = {
    session.conf.set(SkipReadErrorsKey, skip)
    this
  }

  /**
    * Saves all the metadata in a SQLite database at the given path, named
    * "engine_metadata.db" by default.
    * If the database already exists, it will be overwritten. The given path must exist and
    * must be a directory, otherwise it will throw a [[SparkException]].
    * Saved tables are repositories, references, commits and tree_entries. Blobs are not saved.
    *
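    * A usage sketch (the directory is illustrative and must already exist):
    *
    * {{{
    * engine.saveMetadata("/path/to/metadata")
    *
    * // later, serve queries from the saved database instead of the repositories
    * engine.fromMetadata("/path/to/metadata").getRepositories
    * }}}
    *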
    * @param path   where database with the metadata will be stored.
    * @param dbName name of the database file
    * @throws SparkException when the given path is not a folder or does not exist.
    */
  def saveMetadata(path: String, dbName: String = MetadataSource.DefaultDbName): Unit = {
    val folder = Paths.get(path)
    if (!folder.toFile.exists() || !folder.toFile.isDirectory) {
      throw new SparkException("folder given to saveMetadata is not a directory " +
        "or does not exist")
    }

    val dbFile = folder.resolve(dbName)
    if (dbFile.toFile.exists) {
      log.warn(s"metadata file '$dbFile' already exists, it will be deleted")
      dbFile.toFile.delete()
    }

    val properties = new Properties()
    properties.put("driver", "org.sqlite.JDBC")

    val repositoriesDf = getDataSource(RepositoriesTable, session)
    val referencesDf = repositoriesDf.getReferences
    val commitsDf = referencesDf.getAllReferenceCommits
    val treeEntriesDf = commitsDf.getTreeEntries

    import MetadataDataFrameCompat._

    Seq(
      (RepositoriesTable, repositoriesDf
        .withStringArrayColumnAsString("urls")
        .withBooleanColumnAsInt("is_fork")),
      (ReferencesTable, referencesDf.withBooleanColumnAsInt("is_remote")),
      (CommitsTable, commitsDf
        .drop("reference_name", "repository_id", "index")
        .withStringArrayColumnAsString("parents")
        .distinct()),
      (RepositoryHasCommitsTable, commitsDf
        .select("hash", "reference_name", "repository_id", "index")),
      (TreeEntriesTable, treeEntriesDf
        .drop("reference_name", "repository_id").distinct())
    ) foreach {
      case (table, df) =>
        Tables(table).create(dbFile.toString, df.schema)
        df.repartition(session.currentActiveExecutors())
          .write
          .mode(SaveMode.Append)
          .jdbc(s"jdbc:sqlite:$dbFile", Tables.prefix(table), properties)
    }
  }

}

/**
  * Factory for [[tech.sourced.engine.Engine]] instances.
  */
object Engine {
  /**
    * Creates a new Engine instance with the given Spark session and
    * configures the repositories path for that session.
    *
    * {{{
    * import tech.sourced.engine._
    *
    * val engine = Engine(sparkSession, "/path/to/repositories")
    * }}}
    *
    * @param session            spark session to use
    * @param repositoriesPath   the path to the repositories
    * @param repositoriesFormat format of the repositories inside the provided path.
    *                           It can be siva, bare or standard.
    * @return Engine instance
    */
  def apply(session: SparkSession, repositoriesPath: String, repositoriesFormat: String): Engine = {
    new Engine(session, repositoriesPath, repositoriesFormat)
  }
}

/**
  * Contains the Convert implicit class that gives DataFrame some methods to
  * deal with compatibility between the regular DefaultSource DataFrame and the
  * MetadataSource one.
  */
private object MetadataDataFrameCompat {

  implicit class Convert(df: DataFrame) {
    /**
      * Returns a new DataFrame with the given boolean column converted to
      * an int column, with 0 for false and 1 for true.
      *
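      * A usage sketch, assuming the [[Convert]] implicit is in scope and `df`
      * has a boolean `is_fork` column:
      *
      * {{{
      * import MetadataDataFrameCompat._
      *
      * df.withBooleanColumnAsInt("is_fork") // true -> 1, false -> 0, null stays null
      * }}}
      *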
      * @param column column name
      * @return new DataFrame
      */
    def withBooleanColumnAsInt(column: String): DataFrame =
      df.withColumn(column, when(df(column) === false, 0)
        .otherwise(when(df(column) === true, 1).otherwise(null)))

    /**
      * Returns a new DataFrame with the given string array column converted to
      * a column with the content of the array joined by "|".
      *
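      * A usage sketch, assuming the [[Convert]] implicit is in scope and `df`
      * has a string array `urls` column:
      *
      * {{{
      * df.withStringArrayColumnAsString("urls") // Array("a", "b") becomes "a|b"
      * }}}
      *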
      * @param column column name
      * @return new DataFrame
      */
    def withStringArrayColumnAsString(column: String): DataFrame =
      df.withColumn(column, ConcatArrayUDF(df.sparkSession)(df(column), lit("|")))
  }

}