package net.sansa_stack.inference.spark

import java.net.URI

import net.sansa_stack.inference.rules.ReasoningProfile._
import net.sansa_stack.inference.rules.{RDFSLevel, ReasoningProfile}
import net.sansa_stack.inference.spark.data.loader.RDFGraphLoader
import net.sansa_stack.inference.spark.data.writer.RDFGraphWriter
import net.sansa_stack.inference.spark.forwardchaining.triples.{ForwardRuleReasonerOWLHorst, ForwardRuleReasonerRDFS, TransitiveReasoner}
import org.apache.jena.graph.{Node, NodeFactory}
import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession

/**
  * The main entry class to compute the materialization on an RDF graph.
  * Currently, the RDFS (full and simple), OWL-Horst, and transitive closure reasoning profiles are supported.
  *
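  * A minimal example of the command-line arguments accepted by this class
  * (the file paths are illustrative only):
  * {{{
  *   -i /data/input.nt -o /data/output -p rdfs --single-file --parallelism 4
  * }}}
  *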
  * @author Lorenz Buehmann
  *
  */
object RDFGraphMaterializer {

  def main(args: Array[String]): Unit = {
    parser.parse(args, Config()) match {
      case Some(config) =>
        run(config.in, config.out, config.profile, config.properties, config.writeToSingleFile, config.sortedOutput, config.parallelism)
      case None =>
        // scalastyle:off println
        println(parser.usage)
        // scalastyle:on println
    }
  }

  def run(input: Seq[URI], output: URI, profile: ReasoningProfile, properties: Seq[Node] = Seq(),
          writeToSingleFile: Boolean, sortedOutput: Boolean, parallelism: Int): Unit = {
    // register the custom classes for Kryo serializer
    val conf = new SparkConf()
    conf.registerKryoClasses(Array(classOf[org.apache.jena.graph.Triple]))
    conf.set("spark.extraListeners", "net.sansa_stack.inference.spark.utils.CustomSparkListener")

    // the SPARK config
    val session = SparkSession.builder
      .appName(s"SPARK $profile Reasoning")
      .master("local[4]")
//      .config("spark.eventLog.enabled", "true")
      .config("spark.hadoop.validateOutputSpecs", "false") // allow overwriting existing output files
      .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
      .config("spark.default.parallelism", parallelism)
      .config("spark.ui.showConsoleProgress", "false")
      .config("spark.sql.shuffle.partitions", parallelism)
      .config(conf)
      .getOrCreate()

//    println(session.conf.getAll.mkString("\n"))

//    val g = RDFGraphLoader.loadFromDiskAsDataset(session, input).distinct()
//    val g_inf = new ForwardRuleReasonerRDFSDataset(session).apply(g)
//    println(s"|G_inf| = ${g_inf.size()}")

    // load triples from disk
    val graph = RDFGraphLoader.loadFromDisk(session, input, parallelism)
//    println(s"|G| = ${graph.size()}")

    // create reasoner
    val reasoner = profile match {
      case TRANSITIVE => new TransitiveReasoner(session.sparkContext, properties, parallelism)
      case RDFS => new ForwardRuleReasonerRDFS(session.sparkContext, parallelism)
      case RDFS_SIMPLE =>
        val r = new ForwardRuleReasonerRDFS(session.sparkContext, parallelism)
        r.level = RDFSLevel.SIMPLE
        r
      case OWL_HORST => new ForwardRuleReasonerOWLHorst(session.sparkContext)
    }

    // compute inferred graph
    val inferredGraph = reasoner.apply(graph)
//    println(s"|G_inf| = ${inferredGraph.size()}")

    // write triples to disk
    RDFGraphWriter.writeToDisk(inferredGraph, output.toString, writeToSingleFile, sortedOutput)

    session.stop()
  }

  // the config object
  case class Config(
                     in: Seq[URI] = Seq(),
                     out: URI = new URI("."),
                     properties: Seq[Node] = Seq(),
                     profile: ReasoningProfile = ReasoningProfile.RDFS,
                     writeToSingleFile: Boolean = false,
                     sortedOutput: Boolean = false,
                     parallelism: Int = 4)

  // read ReasoningProfile enum
  implicit val profilesRead: scopt.Read[ReasoningProfile.Value] =
    scopt.Read.reads(s => ReasoningProfile.forName(s.toLowerCase()))

  // read a Jena Node from a URI string
  implicit val nodeRead: scopt.Read[Node] =
    scopt.Read.reads(NodeFactory.createURI(_))

  // the CLI parser
  val parser = new scopt.OptionParser[Config]("RDFGraphMaterializer") {

    head("RDFGraphMaterializer", "0.1.0")

    opt[Seq[URI]]('i', "input").required().valueName("<path1>,<path2>,...").
      action((x, c) => c.copy(in = x)).
      text("path to file or directory that contains the input files (in N-Triples format)")

    opt[URI]('o', "out").required().valueName("<directory>").
      action((x, c) => c.copy(out = x)).
      text("the output directory")

    opt[Seq[Node]]("properties").optional().valueName("<property1>,<property2>,...").
      action((x, c) => c.copy(properties = x)).
      text("list of properties for which the transitive closure will be computed (used only for profile 'transitive')")

    opt[ReasoningProfile]('p', "profile").required().valueName("{rdfs | rdfs-simple | owl-horst | transitive}").
      action((x, c) => c.copy(profile = x)).
      text("the reasoning profile")

    opt[Unit]("single-file").optional().action( (_, c) =>
      c.copy(writeToSingleFile = true)).text("write the output to a single file in the output directory")

    opt[Unit]("sorted").optional().action( (_, c) =>
      c.copy(sortedOutput = true)).text("sorted output of the triples (per file)")

    opt[Int]("parallelism").optional().action( (x, c) =>
      c.copy(parallelism = x)).text("the degree of parallelism, i.e. the number of Spark partitions used in the Spark operations")

    help("help").text("prints this usage text")

    checkConfig( c =>
      if (c.profile == TRANSITIVE && c.properties.isEmpty) failure("Option --properties must not be empty if profile 'transitive' is set")
      else success )
  }
}
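
// A minimal sketch of invoking the materialization programmatically rather than
// via the CLI; the input and output paths below are illustrative only.
//
//   import java.net.URI
//   import net.sansa_stack.inference.rules.ReasoningProfile
//
//   RDFGraphMaterializer.run(
//     input = Seq(URI.create("/data/input.nt")),
//     output = URI.create("/data/inferred"),
//     profile = ReasoningProfile.RDFS,
//     writeToSingleFile = true,
//     sortedOutput = false,
//     parallelism = 4)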



