package net.sansa_stack.rdf.spark.io
import java.net.URI
import java.nio.file.{Files, Paths}
import java.util.UUID
import com.google.common.base.Predicates
import com.google.common.collect.Iterators
import net.sansa_stack.rdf.benchmark.io.ReadableByteChannelFromIterator
import net.sansa_stack.rdf.common.io.riot.error.{CustomErrorHandler, ErrorParseMode, WarningParseMode}
import net.sansa_stack.rdf.common.io.riot.lang.LangNTriplesSkipBad
import net.sansa_stack.rdf.common.io.riot.tokens.TokenizerTextForgiving
import org.apache.commons.io.IOUtils
import org.apache.jena.atlas.io.PeekReader
import org.apache.jena.atlas.iterator.Iter
import org.apache.jena.graph.Triple
import org.apache.jena.irix.IRIxResolver
import org.apache.jena.rdf.model.impl.NTripleReader // Jena class; used below only to obtain a class loader
import org.apache.jena.riot.RIOT
import org.apache.jena.riot.lang.{IteratorParsers, LabelToNode}
import org.apache.jena.riot.system._
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
import org.slf4j.{Logger, LoggerFactory}
import scala.io.Source
import scala.reflect.ClassTag
/**
* An N-Triples reader. One triple per line is assumed.
*
* @author Lorenz Buehmann
*/
object NTripleReader {
/**
* Loads N-Triples data from a file or directory into an RDD.
*
* @param session the Spark session
* @param path the path to the N-Triples file(s)
* @return the RDD of triples
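*
* @example A minimal usage sketch (the local master, Kryo config, and file path are illustrative, not required by this method):
* {{{
* val spark = SparkSession.builder()
* .master("local[*]")
* .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
* .appName("example")
* .getOrCreate()
* val triples: RDD[Triple] = NTripleReader.load(spark, URI.create("file:///data/sample.nt"))
* println(s"#triples: ${triples.count()}")
* }}}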
*/
def load(session: SparkSession, path: URI): RDD[Triple] = {
load(session, path.toString)
}
/**
* Loads N-Triples data from a set of files or directories into an RDD.
* The given paths are joined by commas; each path may also refer to a directory
* or contain wildcards, e.g.
* `"/my/dir1,/my/paths/part-00[0-5]*,/another/dir,/a/specific/file"`
*
* @param session the Spark session
* @param paths the paths to the N-Triples file(s)
* @return the RDD of triples
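*
* @example Given a SparkSession `spark` (the HDFS paths are illustrative only):
* {{{
* val paths = Seq(URI.create("hdfs:///data/part1.nt"), URI.create("hdfs:///data/part2.nt"))
* val triples: RDD[Triple] = NTripleReader.load(spark, paths)
* }}}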
*/
def load(session: SparkSession, paths: Seq[URI]): RDD[Triple] = {
load(session, paths.mkString(","))
}
/**
* Creates an RDD of lines from the given filename-or-URI, which may also refer to a classpath resource.
* Files take precedence over classpath resources.
* Classpath resources are loaded via sparkContext.parallelize; otherwise, sparkContext.textFile is used.
*
* TODO This method is not NTripleReader-specific and should thus go into a generic SparkIoUtils class
*
* @param session The spark session
* @param filenameOrURI The file name, URI or classpath resource
* @return An RDD of lines from the resource's content (if that resource exists)
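*
* @example Given a SparkSession `spark` and a resource `data.nt` on the classpath or file system:
* {{{
* val lines: RDD[String] = NTripleReader.loadLinesIntoRdd(spark, "data.nt")
* }}}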
*/
def loadLinesIntoRdd(session: SparkSession, filenameOrURI: String): RDD[String] = {
val classLoader = classOf[NTripleReader].getClassLoader
val classPathResource = classLoader.getResource(filenameOrURI)
val nioPath = Paths.get(filenameOrURI)
val rdd =
if (classPathResource != null && !Files.exists(nioPath)) {
val inputStream = classLoader.getResourceAsStream(filenameOrURI)
try {
// load the classpath resource into memory to avoid any potential issues with Spark
// reading items in parallel while this thread closes the underlying input stream
val lines = Source.fromInputStream(inputStream).getLines().toList
session.sparkContext.parallelize(lines)
} finally {
IOUtils.closeQuietly(inputStream)
}
} else {
// otherwise, read the file(s) line-wise from the (Hadoop-supported) file system
session.sparkContext
.textFile(filenameOrURI, minPartitions = 20)
}
rdd
}
/**
* Loads N-Triples data from a file or directory into an RDD.
* The path can also contain multiple paths
* and even wildcards, e.g.
* `"/my/dir1,/my/paths/part-00[0-5]*,/another/dir,/a/specific/file"`
*
* ===Handling of errors===
*
* By default, parsing stops once a parse error occurs, i.e. a [[org.apache.jena.riot.RiotException]]
* generated by the underlying parser will be thrown.
*
* The following options exist:
* - STOP: the whole data loading process will be stopped and an [[org.apache.jena.riot.RiotException]] will be thrown
* - SKIP: the line will be skipped, but the data loading process will continue; an error message will be logged
*
*
* ===Handling of warnings===
*
* If the additional checking of RDF terms is enabled, warnings can occur during parsing. For example,
* a wrong lexical form of a literal w.r.t. its datatype will lead to a warning.
*
* The following can be done with those warnings:
* - IGNORE: the warning will just be logged to the configured logger
* - STOP: similar to the error handling mode, the whole data loading process will be stopped and a
* [[org.apache.jena.riot.RiotException]] will be thrown
* - SKIP: similar to the error handling mode, the line will be skipped but the data loading process will continue
*
*
* ===Checking of RDF terms===
* Whether to perform checking of N-Triples is controlled by `checkRDFTerms`; it defaults to no checking.
*
* Checking adds warnings over and above basic syntax errors.
* This can also be used to treat warnings as errors if the option `stopOnWarnings` is set to STOP or SKIP.
*
* - IRIs: whether IRIs conform to all the rules of the IRI scheme
* - Literals: whether the lexical form conforms to the rules for the datatype
* - Triples: whether each slot holds a valid kind of RDF term (parsers usually make this a syntax error anyway)
*
*
* See also the optional `errorLog` argument to control the output; the default is to log via `ErrorHandlerFactory.stdLogger`.
*
*
* @param session the Spark session
* @param path the path to the N-Triples file(s)
* @param stopOnBadTerm how to proceed when a bad RDF term is encountered (STOP or SKIP)
* @param stopOnWarnings how to proceed when a warning is raised (IGNORE, STOP, or SKIP)
* @param checkRDFTerms whether to run with checking of literals and IRIs
* @param errorLog the logger used for error message handling
* @return the RDD of triples
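*
* @example A lenient-parsing sketch that skips bad triples and logs warnings (the session `spark` and path are illustrative only):
* {{{
* val triples: RDD[Triple] = NTripleReader.load(
* spark,
* "hdfs:///data/*.nt",
* stopOnBadTerm = ErrorParseMode.SKIP,
* stopOnWarnings = WarningParseMode.SKIP,
* checkRDFTerms = true)
* }}}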
*/
def load(session: SparkSession, path: String,
stopOnBadTerm: ErrorParseMode.Value = ErrorParseMode.STOP,
stopOnWarnings: WarningParseMode.Value = WarningParseMode.IGNORE,
checkRDFTerms: Boolean = false,
errorLog: Logger = ErrorHandlerFactory.stdLogger): RDD[Triple] = {
import scala.collection.JavaConverters._
val rdd = loadLinesIntoRdd(session, path)
val strict = stopOnBadTerm == ErrorParseMode.STOP && stopOnWarnings == WarningParseMode.STOP
// create the error handler profile
val profileWrapper = NonSerializableObjectWrapper {
val errorHandler =
if (strict) {
ErrorHandlerFactory.errorHandlerStrict(errorLog)
} else {
if (stopOnBadTerm == ErrorParseMode.STOP) {
if (stopOnWarnings == WarningParseMode.STOP || stopOnWarnings == WarningParseMode.SKIP) {
ErrorHandlerFactory.errorHandlerStrict(errorLog)
} else {
ErrorHandlerFactory.errorHandlerStd(errorLog)
}
} else {
// ErrorHandlerFactory.errorHandlerWarn
new CustomErrorHandler()
}
}
val seed = new UUID(path.hashCode, 0)
new ParserProfileStd(RiotLib.factoryRDF(LabelToNode.createScopeByDocumentHash(seed)),
errorHandler,
IRIxResolver.create.noBase.allowRelative(true).build,
PrefixMapFactory.create,
RIOT.getContext.copy,
checkRDFTerms || strict, strict)
}
// parse each partition
rdd.mapPartitions(p => {
// convert iterator to input stream
val input = ReadableByteChannelFromIterator.toInputStream(p.asJava)
// create the parsing iterator
val it =
if (stopOnBadTerm == ErrorParseMode.STOP || stopOnWarnings == WarningParseMode.STOP) {
// this is the default behaviour of Jena, i.e. once a parse error occurs the whole process stops
IteratorParsers.createIteratorNTriples(input, profileWrapper.get)
} else {
// here we "simply" skip illegal triples
// we need a custom tokenizer
val tokenizer = new TokenizerTextForgiving(PeekReader.makeUTF8(input))
tokenizer.setErrorHandler(ErrorHandlerFactory.errorHandlerWarn)
// which is used by a custom N-Triples iterator
val it = new LangNTriplesSkipBad(tokenizer, profileWrapper.get, null)
// filter out null values
Iterators.filter(it, Predicates.notNull[Triple]())
}
Iter.onCloseIO(it, input).asScala
})
}
private case class Config(
in: URI = null,
mode: String = "",
sampleSize: Int = 10)
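// Example invocations via spark-submit (jar and file names are illustrative only):
// spark-submit --class net.sansa_stack.rdf.spark.io.NTripleReader sansa-rdf-spark.jar triples hdfs:///data/file.nt
// spark-submit --class net.sansa_stack.rdf.spark.io.NTripleReader sansa-rdf-spark.jar sample -n 20 hdfs:///data/file.nt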
def main(args: Array[String]): Unit = {
val parser = new scopt.OptionParser[Config]("N-Triples Reader") {
head("N-Triples Reader", "0.7.2")
cmd("triples")
.text("compute number of triples")
.action((x, c) => c.copy(mode = "triples"))
cmd("sample")
.text("show sample of triples")
.action((x, c) => c.copy(mode = "sample"))
.children(
opt[Int]("size")
.abbr("n")
.action((x, c) => c.copy(sampleSize = x))
.text("sample size (too high number can be slow or lead to memory issues)"),
checkConfig(
c =>
if (c.mode == "sample" && c.sampleSize <= 0) failure("sample size must be > 0")
else success)
)
arg[URI]("<file>")
.action((x, c) => c.copy(in = x))
.text("URI to N-Triples file to process")
.valueName("<file>")
.required()
}
// parser.parse returns Option[C]
parser.parse(args, Config()) match {
case Some(config) =>
val sparkSession = SparkSession.builder
// .master("local")
.appName("N-Quads reader")
.config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
.getOrCreate()
val rdd = NTripleReader.load(
sparkSession,
config.in.getPath,
stopOnBadTerm = ErrorParseMode.SKIP,
stopOnWarnings = WarningParseMode.SKIP,
checkRDFTerms = true,
LoggerFactory.getLogger("errorLog"))
config.mode match {
case "triples" => println(s"#parsed triples: ${rdd.count()}")
case "sample" => println(s"max ${config.sampleSize} sample triples:\n"
+ rdd.take(config.sampleSize).map { _.toString.replaceAll("[\\x00-\\x1f]", "???") }.mkString("\n"))
}
sparkSession.stop()
case None =>
// arguments are bad, error message will have been displayed
}
}
}
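/**
* Wraps a non-serializable object so that it can be shipped to Spark executors:
* only the by-name constructor closure is serialized, and the wrapped instance is
* created lazily (the field is marked `@transient`) on first access in each deserializing JVM.
*/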
private class NonSerializableObjectWrapper[T: ClassTag](constructor: => T)
extends AnyRef with Serializable {
@transient private lazy val instance: T = constructor
def get: T = instance
}
private object NonSerializableObjectWrapper {
def apply[T: ClassTag](constructor: => T): NonSerializableObjectWrapper[T] = new NonSerializableObjectWrapper[T](constructor)
}