tech.sourced.engine.Engine.scala Maven / Gradle / Ivy
The newest version!
package tech.sourced.engine
import java.nio.file.Paths
import java.util.Properties
import org.apache.spark.sql.functions.{lit, when}
import org.apache.spark.SparkException
import org.apache.spark.groupon.metrics.UserMetricsSystem
import org.apache.spark.internal.Logging
import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession}
import tech.sourced.engine.rule._
import tech.sourced.engine.udf.ConcatArrayUDF
import scala.collection.JavaConversions.asScalaBuffer
* Engine is the main entry point to all usage of the source{d} jgit-spark-connector.
* It has methods to configure all possible configurable options as well as
* the available methods to start analysing repositories of code.
* {{{
* import tech.sourced.engine._
* val engine = Engine(sparkSession, "/path/to/repositories", "siva")
* }}}
* NOTE: Keep in mind that you will need to register the UDFs in the session
* manually if you choose to instantiate this class directly instead of using
* the companion object.
* {{{
* import tech.sourced.engine.{Engine, SessionFunctions}
* val engine = new Engine(sparkSession, "/path/to/repositories", "siva")
* sparkSession.registerUDFs()
* }}}
* The usual starting point is getRepositories, which will generate
* a DataFrame of repositories, which is the very first thing you need to
* analyse repositories of code.
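* @constructor creates an Engine instance with the given Spark session.
* @param session Spark session to be used
* @param repositoriesPath path to the repositories
* @param repositoriesFormat format of the repositories (siva, bare or standard)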
class Engine(val session: SparkSession,
repositoriesPath: String,
repositoriesFormat: String) extends Logging {
// Initialise the user metrics system on this session's Spark context under the name "Engine".
UserMetricsSystem.initialize(session.sparkContext, "Engine")
// Assigns (rather than appends to) the session's experimental extra optimizations.
// NOTE(review): the elements of this Seq are not visible here — presumably the rules
// imported from tech.sourced.engine.rule; confirm.
session.experimental.extraOptimizations = Seq(
* Register the initial views with the DefaultSource.
private def registerViews(): Unit = {
// Visit every entry of Sources.orderedSources, in the order that sequence provides;
// each entry is passed on as the DefaultSource.TableNameKey option.
Sources.orderedSources.foreach(table => {
.option(DefaultSource.TableNameKey, table)
* Registers in the current session the views of the MetadataSource so the data is obtained
* from the metadata database instead of reading the repositories with the DefaultSource.
* @param dbPath path to the folder that contains the database.
* @param dbName name of the database file ("engine_metadata.db" by default).
* @return the same instance of the engine
def fromMetadata(dbPath: String, dbName: String = MetadataSource.DefaultDbName): Engine = {
// One pass per table handled here: repositories, references, commits and tree entries.
// Each pass supplies the table name plus the folder and file name of the metadata database.
Seq(RepositoriesTable, ReferencesTable, CommitsTable, TreeEntriesTable).foreach(table => {
.option(DefaultSource.TableNameKey, table)
.option(MetadataSource.DbPathKey, dbPath)
.option(MetadataSource.DbNameKey, dbName)
* Registers in the current session the views of the DefaultSource so the data is obtained
* by reading the repositories instead of reading from the MetadataSource. This has no effect
* if [[Engine#fromMetadata]] has not been called before.
* @return the same instance of the engine
def fromRepositories(): Engine = {
* Returns a DataFrame with the data about the repositories found at
* the specified repositories path in the form of siva files.
* To call this method the repositories path must have been set beforehand;
* you can do so by calling setRepositoriesPath or, preferably, by instantiating
* the Engine using the companion object.
* {{{
* val reposDf = engine.getRepositories
* }}}
* @return DataFrame
def getRepositories: DataFrame = {
  // Name of the data source that backs the repositories DataFrame.
  val sourceName = "repositories"
  getDataSource(sourceName, session)
}
* Retrieves the blobs of a list of repositories, reference names and commit hashes.
* So the result will be a [[org.apache.spark.sql.DataFrame]] of all the blobs in
* the given commits that are in the given references that belong to the given
* repositories.
* {{{
* val blobsDf = engine.getBlobs(repoIds, refNames, hashes)
* }}}
* Calling this function with no arguments is the same as:
* {{{
* engine.getRepositories.getReferences.getCommits.getTreeEntries.getBlobs
* }}}
* @param repositoryIds List of the repository ids to filter by (optional)
* @param referenceNames List of reference names to filter by (optional)
* @param commitHashes List of commit hashes to filter by (optional)
* @return [[org.apache.spark.sql.DataFrame]] with blobs of the given commits, refs and repos.
def getBlobs(repositoryIds: Seq[String] = Seq(),
referenceNames: Seq[String] = Seq(),
commitHashes: Seq[String] = Seq()): DataFrame = {
// Start from the repositories DataFrame; every filter below is applied only when
// the corresponding list is non-empty, so an empty list means "no filter".
val df = getRepositories
var reposDf = df
// Keep only the repositories whose "id" is one of the requested ids.
if (repositoryIds.nonEmpty) {
reposDf = reposDf.filter(reposDf("id").isin(repositoryIds: _*))
// Derive the references and keep only those whose "name" is one of the requested names.
var refsDf = reposDf.getReferences
if (referenceNames.nonEmpty) {
refsDf = refsDf.filter(refsDf("name").isin(referenceNames: _*))
// Derive the commits; when hashes are requested, switch to all reference commits
// and keep only the rows whose "hash" is one of the requested hashes.
// NOTE(review): the "hash" column below is resolved from commitsDf as it was before
// getAllReferenceCommits was applied — confirm that this is intended.
var commitsDf = refsDf.getCommits
if (commitHashes.nonEmpty) {
commitsDf = commitsDf.getAllReferenceCommits.filter(commitsDf("hash").isin(commitHashes: _*))
* This method is only offered for easier usage from Python.
// Overload taking java.util.List arguments; visible only inside the `engine` package.
// NOTE(review): the body is not visible here. It presumably relies on the implicit
// scala.collection.JavaConversions.asScalaBuffer conversion imported by this file, which is
// deprecated in favour of explicit converters — confirm before touching the import.
private[engine] def getBlobs(repositoryIds: java.util.List[String],
referenceNames: java.util.List[String],
commitHashes: java.util.List[String]): DataFrame =
* Sets the path where the siva files of the repositories are stored.
* Although this can actually be called, the proper way to use Engine is
* to instantiate it using the Engine companion object, which already
* asks for the path in its apply method. If you already instantiated the
* API instance using the Engine companion object you don't need to call
* this unless you want to change the repositories path.
* Note that setting this will affect the session, so any other uses of the
* session outside the Engine instance will also have that config set.
* {{{
* engine.setRepositoriesPath("/path/to/repositories")
* }}}
* @param path of the repositories.
* @return instance of the engine itself
def setRepositoriesPath(path: String): Engine = {
// Stored in the session configuration under RepositoriesPathKey, so the value is
// shared with anything else that uses this session.
session.conf.set(RepositoriesPathKey, path)
* Sets the format of the stored repositories on the specified path.
* Currently supported formats are:
* - siva: to read siva files
* - bare: to read bare repositories
* - standard: to read standard git repositories (with workspace)
* @param format of the repositories.
* @return instance of the engine itself
def setRepositoriesFormat(format: String): Engine = {
// Stored in the session configuration under RepositoriesFormatKey.
// NOTE(review): no validation of `format` is visible here — presumably checked by the reader; confirm.
session.conf.set(RepositoriesFormatKey, format)
* Configures the Engine so it won't cleanup the unpacked siva files after
* it's done with them to avoid having to unpack them afterwards.
* {{{
* // disable cleanup
* engine.skipCleanup(true)
* // enable cleanup again
* engine.skipCleanup(false)
* }}}
* @param skip whether to skip cleanup or not
* @return instance of the engine itself
def skipCleanup(skip: Boolean): Engine = {
// Record the flag in the session configuration under SkipCleanupKey.
session.conf.set(SkipCleanupKey, skip)
* Configures the Engine so it will skip all read errors that occur while
* reading siva files or repositories.
* {{{
* engine.skipReadErrors(true)
* }}}
* @param skip whether to skip read errors or not
* @return instance of the engine
def skipReadErrors(skip: Boolean): Engine = {
// Record the flag in the session configuration under SkipReadErrorsKey.
session.conf.set(SkipReadErrorsKey, skip)
* Saves all the metadata in a SQLite database on the given path as "engine_metadata.db".
* If the database already exists, it will be overwritten. The given path must exist and
* must be a directory, otherwise it will throw a [[SparkException]].
* Saved tables are repositories, references, commits and tree_entries. Blobs are not saved.
* @param path where database with the metadata will be stored.
* @param dbName name of the database file
* @throws SparkException when the given path is not a folder or does not exist.
def saveMetadata(path: String, dbName: String = MetadataSource.DefaultDbName): Unit = {
// The target must be an existing directory; anything else is rejected up front.
val folder = Paths.get(path)
if (!folder.toFile.exists() || !folder.toFile.isDirectory) {
throw new SparkException("folder given to saveMetadata is not a directory " +
"or does not exist")
// Full path of the database file inside the target folder.
val dbFile = folder.resolve(dbName)
// An already existing database file is reported with a warning before being replaced.
if (dbFile.toFile.exists) {
log.warn(s"metadata file '$dbFile' already exists, it will be deleted")
// JDBC connection properties: only the SQLite driver class is set.
val properties = new Properties()
properties.put("driver", "org.sqlite.JDBC")
// Build the chain of DataFrames to persist: repositories -> references ->
// all reference commits -> tree entries.
val repositoriesDf = getDataSource(RepositoriesTable, session)
val referencesDf = repositoriesDf.getReferences
val commitsDf = referencesDf.getAllReferenceCommits
val treeEntriesDf = commitsDf.getTreeEntries
// Brings the DataFrame conversion helpers (e.g. withBooleanColumnAsInt) into scope.
import MetadataDataFrameCompat._
// Pairs of (table name, DataFrame to store in it).
(RepositoriesTable, repositoriesDf
// The boolean "is_remote" column is stored as an int.
(ReferencesTable, referencesDf.withBooleanColumnAsInt("is_remote")),
// Commits are stored without their reference/repository/index columns; those
// columns are stored in the repository-has-commits table instead.
(CommitsTable, commitsDf
.drop("reference_name", "repository_id", "index")
(RepositoryHasCommitsTable, commitsDf
.select("hash", "reference_name", "repository_id", "index")),
// Tree entries lose their reference/repository columns and are de-duplicated.
(TreeEntriesTable, treeEntriesDf
.drop("reference_name", "repository_id").distinct())
) foreach {
case (table, df) =>
// Create the table from the DataFrame's schema, then write the rows over JDBC
// into the table name produced by Tables.prefix.
Tables(table).create(dbFile.toString, df.schema)
.jdbc(s"jdbc:sqlite:$dbFile", Tables.prefix(table), properties)
* Factory for [[tech.sourced.engine.Engine]] instances.
object Engine {
* Creates a new Engine instance with the given Spark session and
* configures the repositories path for that session.
* {{{
* import tech.sourced.engine._
* val engine = Engine(sparkSession, "/path/to/repositories", "siva")
* }}}
* @param session spark session to use
* @param repositoriesPath the path to the repositories
* @param repositoriesFormat format of the repositories inside the provided path.
* It can be siva, bare or standard.
* @return Engine instance
def apply(session: SparkSession, repositoriesPath: String, repositoriesFormat: String): Engine = {
// Forward all three arguments unchanged to the Engine constructor.
new Engine(session, repositoriesPath, repositoriesFormat)
* Contains the Convert implicit class that gives DataFrame some methods to
* deal with compatibility between the regular DefaultSource dataframe and the
* MetadataSource one.
private object MetadataDataFrameCompat {
implicit class Convert(df: DataFrame) {
* Returns a new DataFrame with the given boolean column converted to
* an int column, being 0 the value for false and 1 for true.
* @param column column name
* @return new DataFrame
def withBooleanColumnAsInt(column: String): DataFrame = {
  // Column being converted; referenced by both branches below.
  val source = df(column)
  // Inner branch: true becomes 1, anything else (including null) stays null.
  val trueAsOne = when(source === true, 1).otherwise(null)
  // Outer branch: false becomes 0, everything else falls through to the inner branch.
  val asInt = when(source === false, 0).otherwise(trueAsOne)
  df.withColumn(column, asInt)
}
* Returns a new DataFrame with the given string array column converted to
* a column with the content of the array joined by "|".
* @param column column name
* @return new dataframe
def withStringArrayColumnAsString(column: String): DataFrame = {
  // The array column to flatten and the literal separator placed between its items.
  val source = df(column)
  val separator = lit("|")
  // Apply the concat-array UDF bound to this DataFrame's session.
  val joined = ConcatArrayUDF(df.sparkSession)(source, separator)
  df.withColumn(column, joined)
}