package net.sansa_stack.inference.spark.data.loader

import java.net.URI

import net.sansa_stack.inference.data.{SQLSchema, SQLSchemaDefault}
import net.sansa_stack.inference.spark.data.model.{RDFGraph, RDFGraphDataFrame, RDFGraphDataset, RDFGraphNative}
import net.sansa_stack.rdf.spark.io.NTripleReader
import org.apache.jena.graph.Triple
import org.apache.jena.riot.Lang
import org.apache.jena.vocabulary.RDF
import org.apache.spark.sql.{Dataset, Encoder, SparkSession}
import org.apache.spark.{SparkConf, SparkContext}
import org.slf4j.LoggerFactory

import scala.language.implicitConversions



/**
  * A class that provides methods to load an RDF graph from disk.
  *
  * @author Lorenz Buehmann
  *
  */
object RDFGraphLoader {

  private val logger = com.typesafe.scalalogging.Logger(LoggerFactory.getLogger(this.getClass.getName))

  private implicit def pathURIsConverter(uris: Seq[URI]): String = uris.map(p => p.toString).mkString(",")

  /**
    * Load an RDF graph from a file or directory. The path can also contain multiple paths
    * and even wildcards, e.g.
    * `"/my/dir1,/my/paths/part-00[0-5]*,/another/dir,/a/specific/file"`
    *
    * @param path the absolute path of the file
    * @param session the Spark session
    * @param minPartitions min number of partitions for Hadoop RDDs ([[SparkContext.defaultMinPartitions]])
    * @return an RDF graph
    */
  def loadFromDisk(session: SparkSession, path: String, minPartitions: Int = 2): RDFGraph = {
    RDFGraph(NTripleReader.load(session, path))
  }
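
  // A minimal usage sketch for `loadFromDisk` (not part of the original file; the path,
  // app name, and the assumption that `RDFGraph` exposes a `size()` method are mine):
  //
  //   val session = SparkSession.builder()
  //     .appName("RDFGraphLoader example")
  //     .master("local[2]")
  //     .getOrCreate()
  //   val graph = RDFGraphLoader.loadFromDisk(session, "/data/lubm/Universities.nt")
  //   println(s"loaded ${graph.size()} triples")
  //   session.stop()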

  /**
    * Load an RDF graph from multiple files or directories.
    *
    * @param paths the files or directories
    * @param session the Spark session
    * @param minPartitions min number of partitions for Hadoop RDDs ([[SparkContext.defaultMinPartitions]])
    * @return an RDF graph
    */
  def loadFromDisk(session: SparkSession, paths: Seq[URI], minPartitions: Int): RDFGraph = {
    loadFromDisk(session, paths.mkString(","), minPartitions)
  }

  /**
    * Load an RDF graph from a single file or directory.
    *
    * @param path the path to the file or directory
    * @param session the Spark session
    * @param minPartitions min number of partitions for Hadoop RDDs ([[SparkContext.defaultMinPartitions]])
    * @return an RDF graph
    */
  def loadFromDisk(session: SparkSession, path: URI, minPartitions: Int): RDFGraph = {
    loadFromDisk(session, Seq(path), minPartitions)
  }
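
  // A hedged sketch of the URI-based overloads above (the HDFS paths are made up):
  //
  //   val paths = Seq(URI.create("hdfs:///data/lubm/part-0.nt"), URI.create("hdfs:///data/lubm/part-1.nt"))
  //   val graph = RDFGraphLoader.loadFromDisk(session, paths, minPartitions = 4)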

  /**
    * Load an RDF graph from a file or directory with a Spark RDD as underlying data structure.
    * The path can also contain multiple paths and even wildcards, e.g.
    * "/my/dir1,/my/paths/part-00[0-5]*,/another/dir,/a/specific/file"
    *
    * @param path the absolute path of the file or directory
    * @param session the Spark session
    * @param minPartitions min number of partitions for Hadoop RDDs ([[SparkContext.defaultMinPartitions]])
    * @return an RDF graph
    */
  def loadFromDiskAsRDD(session: SparkSession, path: String, minPartitions: Int): RDFGraphNative = {
    new RDFGraphNative(NTripleReader.load(session, path))
  }
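
  // A minimal sketch of working with the RDD-backed graph (the path is an assumption,
  // and it is assumed that `RDFGraphNative` exposes its underlying RDD[Triple], e.g. via
  // a `triples` member); the rdf:type filter mirrors the demo in `main` below:
  //
  //   val graph = RDFGraphLoader.loadFromDiskAsRDD(session, "/data/lubm/*.nt", minPartitions = 4)
  //   val typeTriples = graph.triples.filter(_.predicateMatches(RDF.`type`.asNode()))
  //   println(typeTriples.count())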

  private case class RDFTriple2(s: String, p: String, o: String) extends Product3[String, String, String] {
    override def _1: String = s
    override def _2: String = p
    override def _3: String = o

    def subject: String = s

    override def toString: String = s + "  " + p + "  " + o
  }
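
  // A hedged sketch of how `RDFTriple2` could back a product-encoded Dataset, mirroring
  // the commented-out alternative loading path below (the input path is an assumption):
  //
  //   import session.implicits._
  //   val ds = session.read.textFile("/data/triples.nt")
  //     .map { line =>
  //       val t = line.split(" ")
  //       RDFTriple2(t(0), t(1), t(2))
  //     }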

  /**
    * Load an RDF graph from multiple files or directories with a Spark Dataset as underlying data structure.
    * The path can also contain multiple paths and even wildcards, e.g.
    * `"/my/dir1,/my/paths/part-00[0-5]*,/another/dir,/a/specific/file"`
    *
    * @param path the absolute path of the file
    * @param session the Spark session
    * @return an RDF graph based on a [[Dataset]]
    */
  def loadFromDiskAsDataset(session: SparkSession, path: String): RDFGraphDataset = {
    logger.info("loading triples from disk...")
    val startTime = System.currentTimeMillis()

    import org.apache.spark.sql.functions._
    // UDF that splits an N-Triples line into its three terms; only referenced by the
    // commented-out DataFrame-based loading variant below
    val splitter = udf((str: String) => {
      val tokens = str.split(" ")
      Array(tokens(0), tokens(1), tokens(2))
    })

    implicit val rdfTripleEncoder: Encoder[Triple] = org.apache.spark.sql.Encoders.kryo[Triple]
    val spark = session.sqlContext


    val triples = session
      .createDataset(NTripleReader.load(session, path))(rdfTripleEncoder)
      .as("triples")
    // (rdfTripleEncoder)
    //    val rowRDD = session.sparkContext
    //      .textFile(paths) // read the text file
    //      .map(s => {
    //      val tokens = s.split(" ")
    //      Row(tokens(0), tokens(1), tokens(2))
    //      //      RDFTriple(tokens(0), tokens(1), tokens(2))
    //    })
    //
    //    val encoder = Encoders.product[RDFTriple]
    //    val schema =
    //      StructType(Array(
    //        StructField("s", StringType, true),
    //        StructField("p", StringType, true),
    //        StructField("o", StringType, true)))
    //    val triplesDF = spark.createDataFrame(rowRDD, schema)
    //     val triples = triplesDF.as[RDFTriple](encoder)
    //    session.read
    //      .textFile(paths) // read the text file
    //      .map(s => {
    //      val tokens = s.split(" ")
    //      RDFTriple2(tokens(0), tokens(1), tokens(2))
    //    }).as[RDFTriple2].show(10)
    //      .select(splitter($"value") as "tokens")
    //      .select($"tokens"(0) as "s", $"tokens"(1) as "p", $"tokens"(2) as "o")
    //      .as[RDFTriple]


    // convert to triple object

    // logger.info("finished loading " + triples.count() + " triples in " +g
    // (System.currentTimeMillis()-startTime) + "ms.")
    new RDFGraphDataset(triples)
  }
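
  // A minimal usage sketch (the path is an assumption). The returned graph wraps a
  // Dataset[Triple] with a Kryo encoder (see above), so it is queried via the typed API
  // rather than column-level SQL:
  //
  //   val graph = RDFGraphLoader.loadFromDiskAsDataset(session, "/data/lubm/Universities.nt")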

  /**
    * Load an RDF graph from multiple files or directories with a Spark Dataset as underlying data structure.
    * The path can also contain multiple paths and even wildcards, e.g.
    * `"/my/dir1,/my/paths/part-00[0-5]*,/another/dir,/a/specific/file"`
    *
    * @param paths the files or directories
    * @param session the Spark session
    * @return an RDF graph based on a [[Dataset]]
    */
  def loadFromDiskAsDataset(session: SparkSession, paths: scala.Seq[URI]): RDFGraphDataset = {
    loadFromDiskAsDataset(session, paths.mkString(","))
  }

  /**
    * Load an RDF graph from a file or directory with a Spark DataFrame as underlying data structure.
    * The path can also contain multiple paths and even wildcards, e.g.
    * `"/my/dir1,/my/paths/part-00[0-5]*,/another/dir,/a/specific/file"`
    *
    * @param path the absolute path of the file
    * @param session the Spark session
    * @param minPartitions min number of partitions for Hadoop RDDs ([[SparkContext.defaultMinPartitions]])
    * @return an RDF graph based on a [[org.apache.spark.sql.DataFrame]]
    */
  def loadFromDiskAsDataFrame(session: SparkSession, path: String, minPartitions: Int = 4, sqlSchema: SQLSchema = SQLSchemaDefault): RDFGraphDataFrame = {
    val df = session
      .read
      .format("net.sansa_stack.inference.spark.data.loader.sql")
      .load(path)

    // register the DataFrame as a table
    df.createOrReplaceTempView(sqlSchema.triplesTable)

    new RDFGraphDataFrame(df)
  }
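
  // A minimal usage sketch (the path is an assumption; the column names s, p, o follow
  // the default SQL schema and are assumed here). Because the DataFrame is registered as
  // a temp view, it can be queried with plain SQL right after loading:
  //
  //   val graph = RDFGraphLoader.loadFromDiskAsDataFrame(session, "/data/lubm/Universities.nt")
  //   session.sql(s"SELECT s, o FROM ${SQLSchemaDefault.triplesTable} " +
  //     "WHERE p = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type'").show(10)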

  def main(args: Array[String]): Unit = {
    import net.sansa_stack.rdf.spark.io._

    val path = args(0)
    val lang = args(1) match {
      case "turtle" => Lang.TURTLE
      case "ntriples" => Lang.NTRIPLES
      case other => throw new IllegalArgumentException(s"unsupported input language: $other")
    }

    val numThreads = if (args.length > 2) args(2).toInt else 4
    val parallelism = if (args.length > 3) args(3).toInt else 4

    val conf = new SparkConf()
    conf.registerKryoClasses(Array(classOf[org.apache.jena.graph.Triple]))
    conf.set("spark.extraListeners", "net.sansa_stack.inference.spark.utils.CustomSparkListener")
    conf.set("textinputformat.record.delimiter", ".\n")
    // the SPARK config
    val session = SparkSession.builder
      .appName(s"SPARK ${lang.getLabel} Loading")
      .master(s"local[$numThreads]")
      .config("spark.eventLog.enabled", "true")
      .config("spark.hadoop.validateOutputSpecs", "false") // override output files
      .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
      .config("spark.default.parallelism", parallelism)
      .config("spark.ui.showConsoleProgress", "false")
      .config("spark.sql.shuffle.partitions", parallelism)
      .config(conf)
      .getOrCreate()


    val triples = session.read.rdf(lang)(path)
    triples.show(10)
    println(triples.count())
    triples
      .filter("p == 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type'")
      .write.mode(org.apache.spark.sql.SaveMode.Append).rdf("/tmp/lubm/out")



    // inspect the RDD-based variant as well
    val triplesRDD = session.rdf(lang)(path)
    triplesRDD.take(10).foreach(println)
    println(triplesRDD.count())
    triplesRDD
      .filter(_.predicateMatches(RDF.`type`.asNode()))
      .saveAsNTriplesFile("/tmp/lubm/out")
//
//    val triples = session.read.ntriples(path)
//
//    import session.implicits._
//
//    triples.show(10)
//    triples.select("s", "o").show(10)
//    println(triples.count())
//    println(triples.distinct().count())
//    triples.as("t1").join(triples.as("t2"), $"t1.o" === $"t2.s", "inner").show(10)

    session.stop()
  }
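
  // A hedged sketch of how the demo in `main` could be launched (the jar name and data
  // path are assumptions; the arguments are <path> <lang: turtle|ntriples> [numThreads] [parallelism]):
  //
  //   spark-submit \
  //     --class net.sansa_stack.inference.spark.data.loader.RDFGraphLoader \
  //     sansa-inference-spark.jar \
  //     /data/lubm/Universities.nt ntriples 4 4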
}



