All Downloads are FREE. Search and download functionalities are using the official Maven repository. Maven / Gradle / Ivy


import java.nio.file.{Files, Paths}
import java.util.UUID
import{CustomErrorHandler, ErrorParseMode, WarningParseMode}
import org.apache.jena.atlas.iterator.Iter
import org.apache.jena.graph.Triple
import org.apache.jena.irix.IRIxResolver
import org.apache.jena.rdf.model.impl.NTripleReader
import org.apache.jena.riot.RIOT
import org.apache.jena.riot.lang.{IteratorParsers, LabelToNode, RiotParsers}
import org.apache.jena.riot.system._
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
import org.slf4j.{Logger, LoggerFactory}

import scala.reflect.ClassTag

/**
 * An N-Triples reader. One triple per line is assumed.
 *
 * @author Lorenz Buehmann
 */
object NTripleReader {

  /**
   * Loads N-Triples data from a file or directory into an RDD.
   *
   * @param session the Spark session
   * @param path    the path to the N-Triples file(s)
   * @return the RDD of triples
   */
  def load(session: SparkSession, path: URI): RDD[Triple] = {
    // Delegate to the String-based overload, which performs the actual reading and parsing
    // with its default error/warning handling.
    load(session, path.toString)
  }

  /**
   * Loads N-Triples data from a set of files or directories into an RDD.
   * The path can also contain multiple paths
   * and even wildcards, e.g.
   * `"/my/dir1,/my/paths/part-00[0-5]*,/another/dir,/a/specific/file"`
   *
   * @param session the Spark session
   * @param paths   the paths to the N-Triples file(s)
   * @return the RDD of triples
   */
  def load(session: SparkSession, paths: Seq[URI]): RDD[Triple] = {
    // Join all URIs into the single comma-separated path string understood by the
    // String-based overload and delegate to it.
    load(session, paths.mkString(","))
  }

  /**
   * Creates an RDD of lines based on the given filename-or-URI, which may also refer to a classpath resource.
   * Files take precedence over classpath resources.
   * Classpath resources are loaded via sparkContext.parallelize; otherwise sparkContext.textFile is used.
   *
   * TODO This method is not NTripleReader-specific and should thus go into a generic SparkIoUtils class
   *
   * @param session       the Spark session
   * @param filenameOrURI the file name, URI or classpath resource
   * @return an RDD of lines from the resource's content (if that resource exists)
   */
  def loadLinesIntoRdd(session: SparkSession, filenameOrURI: String): RDD[String] = {

    val classLoader = classOf[NTripleReader].getClassLoader
    val classPathResource = classLoader.getResource(filenameOrURI)
    val nioPath = Paths.get(filenameOrURI)

    // The if/else expression is the result of the method.
    if (classPathResource != null && !Files.exists(nioPath)) {
      // Load from class path if such resource exists and the path does not map to a file
      val inputStream = classLoader.getResourceAsStream(filenameOrURI)
      try {
        // Load into memory to avoid any potential issues with spark reading items in parallel
        // while this thread closes the underlying inputStream
        val lines = scala.io.Source.fromInputStream(inputStream).getLines().toList
        session.sparkContext.parallelize(lines)
      } finally {
        // All lines have been materialised above, so the stream can be released unconditionally.
        inputStream.close()
      }
    } else {
      // Regular file/URI: let Spark read (and split) the text file itself
      session.sparkContext
        .textFile(filenameOrURI, minPartitions = 20)
    }
  }


  /**
   * Loads N-Triples data from a file or directory into an RDD.
   * The path can also contain multiple paths
   * and even wildcards, e.g.
   * `"/my/dir1,/my/paths/part-00[0-5]*,/another/dir,/a/specific/file"`
   *
   * ===Handling of errors===
   * By default, it stops once a parse error occurs, i.e. a [[org.apache.jena.riot.RiotException]]
   * generated by the underlying parser will be thrown.
   *
   * The following options exist:
   *  - STOP the whole data loading process will be stopped and a [[org.apache.jena.riot.RiotException]] will be thrown
   *  - SKIP the line will be skipped but the data loading process will continue, an error message will be logged
   *
   * ===Handling of warnings===
   * If the additional checking of RDF terms is enabled, warnings during parsing can occur. For example,
   * a wrong lexical form of a literal w.r.t. its datatype will lead to a warning.
   *
   * The following can be done with those warnings:
   *  - IGNORE the warning will just be logged to the configured logger
   *  - STOP similar to the error handling mode, the whole data loading process will be stopped and a
   *  [[org.apache.jena.riot.RiotException]] will be thrown
   *  - SKIP similar to the error handling mode, the line will be skipped but the data loading process will continue
   *
   * ===Checking of RDF terms===
   * Set whether to perform checking of NTriples - defaults to no checking.
   *
   * Checking adds warnings over and above basic syntax errors.
   * This can also be used to turn warnings into exceptions if the option `stopOnWarnings` is set to STOP or SKIP.
   *
   *  - IRIs: whether IRIs conform to all the rules of the IRI scheme
   *  - Literals: whether the lexical form conforms to the rules for the datatype.
   *  - Triples: check slots have a valid kind of RDF term (parsers usually make this a syntax error anyway).
   *
   * See also the optional `errorLog` argument to control the output. The default is to log.
   *
   * @param session        the Spark session
   * @param path           the path to the N-Triples file(s)
   * @param stopOnBadTerm  stop parsing on encountering a bad RDF term
   * @param stopOnWarnings stop parsing on encountering a warning
   * @param checkRDFTerms  run with checking of literals and IRIs either on or off
   * @param errorLog       the logger used for error message handling
   * @return the RDD of triples
   */
  def load(session: SparkSession, path: String,
           stopOnBadTerm: ErrorParseMode.Value = ErrorParseMode.STOP,
           stopOnWarnings: WarningParseMode.Value = WarningParseMode.IGNORE,
           checkRDFTerms: Boolean = false,
           errorLog: Logger = ErrorHandlerFactory.stdLogger): RDD[Triple] = {

    import scala.collection.JavaConverters._

    val rdd = loadLinesIntoRdd(session, path)

    // strict mode: stop on errors as well as on warnings
    val strict = stopOnBadTerm == ErrorParseMode.STOP && stopOnWarnings == WarningParseMode.STOP

    // create the error handler profile
    // It is held in a NonSerializableObjectWrapper, i.e. the profile itself is built on the
    // first `get` rather than being serialized together with the closure below.
    val profileWrapper = NonSerializableObjectWrapper {
      // NOTE(review): handler selection follows the documented modes - warnings that must
      // stop/skip use the strict handler (warnings become exceptions), ignored warnings use
      // the standard handler (warnings are only logged); confirm against upstream.
      val errorHandler: ErrorHandler =
        if (strict) {
          ErrorHandlerFactory.errorHandlerStrict(errorLog)
        } else {
          if (stopOnBadTerm == ErrorParseMode.STOP) {
            if (stopOnWarnings == WarningParseMode.STOP || stopOnWarnings == WarningParseMode.SKIP) {
              ErrorHandlerFactory.errorHandlerStrict(errorLog)
            } else {
              ErrorHandlerFactory.errorHandlerStd(errorLog)
            }
          } else {
            //            ErrorHandlerFactory.errorHandlerWarn
            new CustomErrorHandler()
          }
        }

      // blank node labels are scoped by a seed derived from the path
      val seed = new UUID(path.hashCode, 0)
      new ParserProfileStd(RiotLib.factoryRDF(LabelToNode.createScopeByDocumentHash(seed)),
        errorHandler, IRIxResolver.create().build(), PrefixMapFactory.create(),
        RIOT.getContext.copy(), checkRDFTerms || strict, strict)
    }

    // parse each partition
    rdd.mapPartitions(p => {
      // convert iterator to input stream
      val input = ReadableByteChannelFromIterator.toInputStream(p.asJava)

      // create the parsing iterator
      val it: java.util.Iterator[Triple] =
        if (stopOnBadTerm == ErrorParseMode.STOP || stopOnWarnings == WarningParseMode.STOP) {
          // this is the default behaviour of Jena, i.e. once a parse error occurs the whole process stops
          IteratorParsers.createIteratorNTriples(input, profileWrapper.get)
        } else {
          // here we "simply" skip illegal triples

          // we need a custom tokenizer
          val tokenizer = new TokenizerTextForgiving(PeekReader.makeUTF8(input))

          // which is used by a custom N-Triples iterator
          val skipping = new LangNTriplesSkipBad(tokenizer, profileWrapper.get, null)

          // filter out null values
          Iterators.filter(skipping, Predicates.notNull[Triple]())
        }

      // tie the stream to the iterator via Jena's Iter.onCloseIO, then expose it as a Scala iterator
      Iter.onCloseIO(it, input).asScala
    })
  }

  // Command-line configuration filled in by the option parser in `main`:
  //   in         - URI of the N-Triples input (null until set by the parser)
  //   mode       - selected command, "" until one is chosen
  //   sampleSize - number of triples taken in "sample" mode
  private case class Config(in: URI = null, mode: String = "", sampleSize: Int = 10)

  /**
   * Command-line entry point.
   *
   * Parses `args` into a [[Config]] using scopt, loads the input via [[NTripleReader.load]]
   * in skip mode (errors and warnings) with RDF term checking enabled, and then either
   * prints the number of parsed triples or a sample of them, depending on the chosen mode.
   *
   * @param args the command-line arguments
   */
  def main(args: Array[String]): Unit = {
    val parser = new scopt.OptionParser[Config]("N-Triples Reader") {

      head("N-Triples Reader", "0.7.2")

      // NOTE(review): the receiver of this chain (presumably a scopt `cmd(...)` declaration
      // for the "triples" mode) is not present here - confirm against upstream.
        .text("compute number of triples")
        .action((x, c) => c.copy(mode = "triples"))

      // NOTE(review): same here - presumably a `cmd(...)` declaration for the "sample" mode.
        .text("show sample of triples")
        .action((x, c) => c.copy(mode = "sample"))
      // NOTE(review): the option declaration that feeds `sampleSize` (and the call wrapping
      // these children) is not visible - TODO confirm.
            .action((x, c) => c.copy(sampleSize = x))
            .text("sample size (too high number can be slow or lead to memory issues)"),
      // validation: in "sample" mode the sample size has to be positive
            c =>
              if (c.mode == "sample" && c.sampleSize <= 0) failure("sample size must be > 0")
              else success)

      // NOTE(review): presumably the positional input argument; its declaration is not visible.
        .action((x, c) => c.copy(in = x))
        .text("URI to N-Triples file to process")

    // parser.parse returns Option[C]
    parser.parse(args, Config()) match {
      case Some(config) =>
        // NOTE(review): the app name says "N-Quads" although this is the N-Triples reader;
        // also no terminal builder call (e.g. getOrCreate()) is visible here - confirm.
        val sparkSession = SparkSession.builder
          //                .master("local")
          .appName("N-Quads reader")
          .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")

        // skip lines causing errors or warnings, with checking of RDF terms switched on
        // NOTE(review): the session/path arguments and the end of this call are not visible.
        val rdd = NTripleReader.load(
          stopOnBadTerm = ErrorParseMode.SKIP,
          stopOnWarnings = WarningParseMode.SKIP,
          checkRDFTerms = true,

        // NOTE(review): no default case is visible - a mode other than the two below (e.g. the
        // default "") would raise a MatchError unless the parser rejects it beforehand.
        config.mode match {
          case "triples" => println(s"#parsed triples: ${rdd.count()}")
          // characters in the range \x00-\x1f are replaced by "???" before printing
          case "sample" => println(s"max ${config.sampleSize} sample triples:\n"
            + rdd.take(config.sampleSize).map { _.toString.replaceAll("[\\x00-\\x1f]", "???") }.mkString("\n"))


      case None =>
      // arguments are bad, error message will have been displayed


/**
 * Serializable holder for a lazily created value.
 *
 * Only the by-name `constructor` travels with a serialized wrapper; the created instance is
 * `@transient` and therefore never written. It is (re-)built on the first call to [[get]].
 *
 * @param constructor by-name expression that creates the wrapped value
 * @tparam T the type of the wrapped value
 */
private class NonSerializableObjectWrapper[T: ClassTag](constructor: => T)
  extends AnyRef with Serializable {
  // lazy: `constructor` is evaluated at most once per wrapper instance, on first access
  @transient private lazy val instance: T = constructor

  /** @return the wrapped value, creating it on first access */
  def get: T = instance
}

/** Factory enabling the block syntax `NonSerializableObjectWrapper { ... }`. */
private object NonSerializableObjectWrapper {
  /**
   * @param constructor by-name expression creating the wrapped value; it is not evaluated here
   * @tparam T the type of the wrapped value
   * @return a wrapper that evaluates `constructor` on first access
   */
  def apply[T: ClassTag](constructor: => T): NonSerializableObjectWrapper[T] =
    new NonSerializableObjectWrapper[T](constructor)
}

© 2015 - 2025 Weber Informatics LLC | Privacy Policy