package net.sansa_stack.rdf.spark
import com.typesafe.config.{Config, ConfigFactory}
import net.sansa_stack.rdf.spark.io.nquads.NQuadReader
import net.sansa_stack.rdf.spark.io.stream.RiotFileInputFormat
import net.sansa_stack.rdf.spark.utils.Logging
import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.LongWritable
import org.apache.jena.graph.{Graph, Node, NodeFactory, Triple}
import org.apache.jena.hadoop.rdf.io.input.TriplesInputFormat
import org.apache.jena.hadoop.rdf.io.input.turtle.TurtleInputFormat
import org.apache.jena.hadoop.rdf.io.output.trig.TriGOutputFormat
import org.apache.jena.hadoop.rdf.types.{QuadWritable, TripleWritable}
import org.apache.jena.query.{Dataset => JenaDataset}
import org.apache.jena.rdf.model.Model
import org.apache.jena.riot.{Lang, RDFDataMgr, RDFLanguages}
import org.apache.jena.shared.PrefixMapping
import org.apache.jena.sparql.core.Quad
import org.apache.jena.sparql.util.{FmtUtils, NodeFactoryExtra}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Row, _}
import java.io.ByteArrayOutputStream
import java.util.Collections
import net.sansa_stack.hadoop.format.jena.trig.FileInputFormatRdfTrigDataset
import net.sansa_stack.spark.io.rdf.input.impl.{RdfSourceFactories, RdfSourceFactoryImpl}
import net.sansa_stack.spark.io.rdf.output.RddRdfWriter
import org.aksw.jenax.arq.dataset.api.DatasetOneNg
import org.aksw.jenax.dataaccess.sparql.factory.datasource.RdfDataSources
/**
* Wraps implicit classes/methods to read/write RDF data from N-Triples, Turtle, and other RDF files into either a
* [[DataFrame]] or an [[RDD]].
*/
package object io {
/**
* SaveMode is used to specify the expected behavior of saving an RDF dataset to a path.
*/
object SaveMode extends Enumeration {
type SaveMode = Value
val
/**
* Overwrite mode means that when saving an RDF dataset to a path,
* if path already exists,
* the existing data is expected to be overwritten by the contents of the RDF dataset.
*/
Overwrite,
/**
* ErrorIfExists mode means that when saving an RDF dataset to a path, if path already exists,
* an exception is expected to be thrown.
*/
ErrorIfExists,
/**
* Ignore mode means that when saving an RDF dataset to a path, if path already exists,
* the save operation is expected to not save the contents of the RDF dataset and to not
* change the existing data.
*/
Ignore = Value
}
/**
* Converts a Jena [[Triple]] to a Spark SQL [[Row]] with three columns.
* @param triple the triple
* @return the row
*/
def toRow(triple: org.apache.jena.graph.Triple): Row = {
toRow(Seq(triple.getSubject, triple.getPredicate, triple.getObject))
}
private val pm = PrefixMapping.Factory.create
/**
* Converts a list of Jena [[Node]] objects to a Spark SQL [[Row]].
* The output is always a row of string values with each RDF node type being serialized as follows:
*
* - URI: `http://foo.bar`
* - bnode: `_:123`
* - Literal: `"123"^^<http://www.w3.org/2001/XMLSchema#int>`
*
* @param nodes the nodes
* @return the row
*/
def toRow(nodes: Seq[Node]): Row = {
// we use the Jena rendering; for URIs we omit the wrapping angle brackets, i.e. we return
// http://foo.bar instead of <http://foo.bar>, but for datatype URIs in literals we keep the angle brackets
// TODO this is basically done because of querying, but for simplicity it would be easier to return the same rendering for all node types
Row.fromSeq(nodes.map(n => if (n.isURI) n.toString() else FmtUtils.stringForNode(n, pm)))
}
/**
* Converts a Spark SQL [[Row]] to a Triple.
*
* @note It assumes a row containing exactly 3 columns and the
* order being `s,p,o`.
* Moreover, we assume the following serialization of the RDF entities in the columns which also matches how we read
* N-Triples into a DataFrame:
*
* - URI: `http://foo.bar`
* - bnode: `_:123`
* - Literal: `"123"^^<http://www.w3.org/2001/XMLSchema#int>`
*
*
* @param row the row with columns `s,p,o`
* @return the parsed triple
*/
def fromRow(row: Row): org.apache.jena.graph.Triple = {
val sStr = row.getString(0)
val s = if (sStr.startsWith("_:")) NodeFactory.createBlankNode(sStr)
else NodeFactoryExtra.parseNode(s"<$sStr>")
// as is common in RDF, we assume the predicate is always a URI
val p = NodeFactoryExtra.parseNode("<" + row.getString(1) + ">")
val oStr = row.getString(2)
val o = if (oStr.startsWith("_:")) { // bnode
NodeFactory.createBlankNode(oStr)
} else if (oStr.startsWith("http") && !oStr.contains("^^")) { // URI (heuristic: only http(s) IRIs are detected here)
NodeFactory.createURI(oStr)
} else { // literal
val lit = oStr
// val idx = oStr.indexOf("^^")
// if (idx > 0) {
// val first = oStr.substring(0, idx)
// val second = "<" + oStr.substring(idx + 2).trim + ">"
// lit = first + "^^" + second
// }
NodeFactoryExtra.parseNode(lit)
}
Triple.create(s, p, o)
}
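// Usage sketch: round-tripping between a Jena Triple and a Spark SQL Row via toRow/fromRow;
// the example IRIs are placeholders.
//
//   val t = Triple.create(
//     NodeFactory.createURI("http://example.org/s"),
//     NodeFactory.createURI("http://example.org/p"),
//     NodeFactory.createURI("http://example.org/o"))
//   val row = toRow(t)      // Row("http://example.org/s", "http://example.org/p", "http://example.org/o")
//   val back = fromRow(row) // parses the strings back into a Jena Triple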
// the DataFrame methods
/**
* Adds methods `rdf` and `ntriples` to [[DataFrameWriter]] that allow writing N-Triples files.
*/
implicit class RDFDataFrameWriter[T](writer: DataFrameWriter[T]) {
def rdf: String => Unit = writer.format("ntriples").save
def ntriples: String => Unit = writer.format("ntriples").save
}
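// Usage sketch: writing a DataFrame with columns `s`, `p`, `o` as N-Triples; the path is a placeholder.
//
//   import net.sansa_stack.rdf.spark.io._
//   df.write.ntriples("/tmp/triples-nt")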
/**
* Adds methods `rdf`, `ntriples`, `turtle`, and `rdfxml` to [[DataFrameReader]] that allow reading N-Triples, Turtle,
* and RDF/XML files.
*/
implicit class RDFDataFrameReader(reader: DataFrameReader) extends Logging {
@transient lazy val conf: Config = ConfigFactory.load("rdf_loader")
/**
* Load RDF data into a `DataFrame`.
* @param lang the RDF language
* @return a [[DataFrame]][(String, String, String)]
*/
def rdf(lang: Lang): String => DataFrame = lang match {
case i if lang == Lang.NTRIPLES => ntriples
case _ => reader.format("rdf").option("lang", lang.getLabel).load
}
/**
* Load RDF data in N-Triples syntax into a [[DataFrame]] with columns `s`, `p`, and `o`.
* @return a [[DataFrame]][(String, String, String)]
*/
def ntriples: String => DataFrame = {
logDebug(s"Parsing N-Triples with ${conf.getString("rdf.ntriples.parser")} ...")
reader.format("ntriples").load
}
/**
* Load RDF data in Turtle syntax into a [[DataFrame]] with columns `s`, `p`, and `o`.
* @return a [[DataFrame]][(String, String, String)]
*/
def turtle: String => DataFrame = reader.format("rdf").option("lang", Lang.TURTLE.getLabel).load
/**
* Load RDF data in RDF/XML syntax into a [[DataFrame]] with columns `s`, `p`, and `o`.
* @return a [[DataFrame]][(String, String, String)]
*/
def rdfxml: String => DataFrame = reader.format("rdf").option("lang", Lang.RDFXML.getLabel).load
}
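// Usage sketch: reading RDF files into a DataFrame with columns `s`, `p`, `o`; paths are placeholders.
//
//   import net.sansa_stack.rdf.spark.io._
//   val ntDf = spark.read.ntriples("/data/triples.nt")
//   val ttlDf = spark.read.rdf(Lang.TURTLE)("/data/data.ttl")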
// the RDD methods
/**
* Adds a method `saveAsNTriplesFile` to [[RDD]][Triple] that allows writing N-Triples files.
*/
implicit class RDFWriter[T](triples: RDD[Triple]) {
/** @deprecated Misconfigurations are only detected at the end of a Spark job.
* Use RddRdfWriterFactory instead, which does eager checking. */
def configureSave(): RddRdfWriter[Triple] = RddRdfWriter.createForTriple.setRdd(triples)
// * @param singleFile write to a single file only (internally, this is done by RDD::coalesce(1) function)
// * and is usually not recommended for large dataset because all data has to be moved
// * to a single node).
/**
* Save the data in N-Triples format.
*
* @param path the path where the N-Triples file(s) will be written to
* @param mode the expected behavior of saving the data to a data source
* @param exitOnError whether to stop if an error occurred
*/
def saveAsNTriplesFile(path: String,
mode: io.SaveMode.Value = SaveMode.ErrorIfExists,
exitOnError: Boolean = false): Unit = {
val fsPath = new Path(path)
val fs = fsPath.getFileSystem(triples.sparkContext.hadoopConfiguration)
val doSave = if (fs.exists(fsPath)) {
mode match {
case SaveMode.Overwrite =>
fs.delete(fsPath, true)
true
case SaveMode.ErrorIfExists =>
// fail fast: either terminate the JVM or raise an error before any data is written
if (exitOnError) sys.exit(1)
sys.error(s"Given path $path already exists!")
false
case SaveMode.Ignore => false
case _ =>
throw new IllegalStateException(s"Unsupported save mode $mode ")
}
} else {
true
}
import scala.collection.JavaConverters._
// save only if there was no failure with the path before
if (doSave) triples
.mapPartitions(p => { // process each partition
// check if partition is empty
if (p.hasNext) {
val os = new ByteArrayOutputStream()
RDFDataMgr.writeTriples(os, p.asJava)
Collections.singleton(os.toString("UTF-8").trim).iterator().asScala
} else {
Iterator()
}
})
.saveAsTextFile(path)
}
}
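// Usage sketch: saving an RDD[Triple] as N-Triples, overwriting any existing output; the path is a placeholder.
//
//   import net.sansa_stack.rdf.spark.io._
//   triples.saveAsNTriplesFile("/tmp/out-nt", SaveMode.Overwrite)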
/**
* Adds methods `save` and `saveAsNQuadsFile` to [[RDD]][Quad] that allow writing N-Quads and TriG files.
*/
implicit class RDFQuadsWriter[T](quads: RDD[Quad]) {
def configureSave(): RddRdfWriter[Quad] = RddRdfWriter.createForQuad.setRdd(quads)
/**
* Deprecated; this method does not reuse Jena's RDFFormat/Lang system and also
* does not scale because it writes each partition into a single string.
*
* Save the data in N-Quads format.
*
* @param path the path where the N-Quads file(s) will be written to
* @param mode the expected behavior of saving the data to a data source
* @param exitOnError whether to stop if an error occurred
*/
@deprecated
def saveAsNQuadsFile(path: String,
mode: io.SaveMode.Value = SaveMode.ErrorIfExists,
exitOnError: Boolean = false): Unit = {
val fsPath = new Path(path)
val fs = fsPath.getFileSystem(quads.sparkContext.hadoopConfiguration)
val doSave = if (fs.exists(fsPath)) {
mode match {
case SaveMode.Overwrite =>
fs.delete(fsPath, true)
true
case SaveMode.ErrorIfExists =>
// fail fast: either terminate the JVM or raise an error before any data is written
if (exitOnError) sys.exit(1)
sys.error(s"Given path $path already exists!")
false
case SaveMode.Ignore => false
case _ =>
throw new IllegalStateException(s"Unsupported save mode $mode ")
}
} else {
true
}
import scala.collection.JavaConverters._
// save only if there was no failure with the path before
if (doSave) quads
.mapPartitions(p => { // process each partition
// check if partition is empty
if (p.hasNext) {
val os = new ByteArrayOutputStream()
RDFDataMgr.writeQuads(os, p.asJava)
Collections.singleton(os.toString("UTF-8").trim).iterator().asScala
} else {
Iterator()
}
})
.saveAsTextFile(path)
}
def save(path: String): Unit = {
// determine language based on file extension
val lang = RDFLanguages.filenameToLang(path)
// unknown format
if (!RDFLanguages.isQuads(lang)) {
throw new IllegalArgumentException(s"couldn't determine syntax for RDF quads based on file extension in given path $path")
}
// N-Quads can be handled efficiently via file splits
if (lang == Lang.NQUADS) {
saveAsNQuadsFile(path)
} else { // others can't
val sc = quads.sparkContext
val confHadoop = sc.hadoopConfiguration
// quads.zipWithIndex().map{case (k, v) => (v, k)}
// .saveAsNewAPIHadoopFile(path, classOf[LongWritable], classOf[QuadWritable], classOf[QuadsOutputFormat[LongWritable]], confHadoop)
// quads.zipWithIndex().map{case (k, v) => ( new LongWritable(v), new QuadWritable(k) )}
// .saveAsNewAPIHadoopFile(path, classOf[LongWritable], classOf[QuadWritable], classOf[TriGOutputFormat[LongWritable]], confHadoop)
quads.zipWithIndex().map{case (k, v) => ( new LongWritable(v), new QuadWritable(k) )}
.saveAsNewAPIHadoopFile(path, classOf[LongWritable], classOf[QuadWritable], classOf[TriGOutputFormat[LongWritable]], confHadoop)
}
}
}
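// Usage sketch: writing an RDD[Quad]; paths are placeholders. `save` picks the output syntax from the
// file extension (".trig" here), while the deprecated `saveAsNQuadsFile` writes plain N-Quads.
//
//   import net.sansa_stack.rdf.spark.io._
//   quads.save("/tmp/out.trig")
//   quads.saveAsNQuadsFile("/tmp/out-nq", SaveMode.Overwrite)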
/**
* Adds methods to save RDDs of datasets to a folder or file.
*
*/
/*
implicit class JenaDatasetWriter[T](quads: RDD[JenaDataset]) {
def configureSave(): RddRdfWriter[JenaDataset] = {
RddRdfWriter.createForDataset.setRdd(quads)
}
}
*/
/**
* Adds methods `rdf`, `ntriples`, `nquads`, `turtle`, `rdfxml`, `trix`, and `trig` to [[SparkSession]] that allow
* reading N-Triples, N-Quads, Turtle, RDF/XML, TriX, and TriG files.
*/
implicit class RDFReader(spark: SparkSession) {
/**
* Load RDF data into an [[RDD]][Triple].
*
* The syntax is determined based on the file extension:
* If the URI ends with ".rdf", it is assumed to be RDF/XML.
* If the URI ends with ".nt", it is assumed to be N-Triples.
* If the URI ends with ".ttl", it is assumed to be Turtle.
* If the URI ends with ".owl", it is assumed to be RDF/XML.
*
* @return the [[RDD]] of RDF triples
*/
def rdf(path: String): RDD[Triple] = {
// determine language based on file extension
val lang = RDFLanguages.filenameToLang(path)
// unknown format
if (!RDFLanguages.isTriples(lang)) {
throw new IllegalArgumentException(s"couldn't determine syntax for RDF triples based on file extension in given path $path")
}
// N-Triples can be handled efficiently via file splits
if (lang == Lang.NTRIPLES) {
NTripleReader.load(spark, path)
} else { // others can't
val confHadoop = spark.sparkContext.hadoopConfiguration
// parse the file via Jena's Hadoop TriplesInputFormat into an RDD of Jena triples
val rdd = spark.sparkContext.newAPIHadoopFile(
path, classOf[TriplesInputFormat], classOf[LongWritable], classOf[TripleWritable], confHadoop)
.map { case (_, v) => v.get() }
rdd
}
}
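// Usage sketch: loading triples with the syntax inferred from the file extension; paths are placeholders.
//
//   import net.sansa_stack.rdf.spark.io._
//   val triples: RDD[Triple] = spark.rdf("/data/dataset.nt")
//   // or with an explicit language:
//   val turtleTriples: RDD[Triple] = spark.rdf(Lang.TURTLE)("/data/dataset.ttl")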
/**
* Load RDF data into an [[RDD]][Triple]. Currently, the N-Triples, Turtle, RDF/XML and TriX syntaxes are supported.
* @param lang the RDF language (N-Triples, Turtle, RDF/XML, TriX)
* @return the [[RDD]] of RDF triples
*/
def rdf(lang: Lang): String => RDD[Triple] = lang match {
case i if lang == Lang.NTRIPLES => ntriples()
case j if lang == Lang.TURTLE => turtle
case k if lang == Lang.RDFXML => rdfxml
case l if lang == Lang.TRIX => trix
// case g if lang == Lang.NQUADS => nquads(allowBlankLines)
case _ => throw new IllegalArgumentException(s"${lang.getLabel} syntax not supported yet!")
}
/**
* Loader for datasets from quad-based formats.
*
* @param lang the quad-based RDF language (currently only TriG is supported)
* @return a function that, given a path, loads an [[RDD]] of named-graph datasets
*/
def datasets(lang: Lang): String => RDD[DatasetOneNg] = {
if (!RDFLanguages.isQuads(lang)) {
throw new RuntimeException("Language " + lang + " not a quad-based language according to jena's registry")
}
if (!RDFLanguages.TRIG.equals(lang)) {
throw new RuntimeException("Only the TriG format is supported so far")
}
trig
}
/**
* Load RDF data in N-Triples syntax into an [[RDD]][Triple].
*
* @param allowBlankLines whether blank lines will be allowed and skipped during parsing
* @return the [[RDD]] of triples
*/
def ntriples(allowBlankLines: Boolean = false): String => RDD[Triple] = path => {
NTripleReader.load(spark, path)
}
/**
* Load RDF data in N-Quads syntax into an [[RDD]][Quad].
*
* @param allowBlankLines whether blank lines will be allowed and skipped during parsing
* @return the [[RDD]] of quads
*/
def nquads(allowBlankLines: Boolean = false): String => RDD[Quad] = path => {
NQuadReader.load(spark, path)
}
/**
* Load RDF data in RDF/XML syntax into an [[RDD]][Triple].
*
* Note that the data will not be split and is loaded in a single task because of the nature of XML and
* how Spark handles this format.
*
* @return the [[RDD]] of triples
*/
def rdfxml: String => RDD[Triple] = path => {
val confHadoop = org.apache.hadoop.mapreduce.Job.getInstance().getConfiguration
confHadoop.setBoolean("sansa.rdf.parser.skipinvalid", true)
confHadoop.setInt("sansa.rdf.parser.numthreads", 4)
spark.sparkContext.newAPIHadoopFile(
path, classOf[RiotFileInputFormat], classOf[LongWritable], classOf[Triple], confHadoop)
.map { case (_, v) => v }
}
/**
* Load RDF data in Turtle syntax into an [[RDD]][Triple]
* @return the [[RDD]] of triples
*/
def turtle: String => RDD[Triple] = path => {
RdfSourceFactories.of(spark).get(path, Lang.TURTLE).asTriples()
/*
val confHadoop = spark.sparkContext.hadoopConfiguration
// 1. parse the Turtle file into an RDD[String] with each entry containing a full Turtle snippet
val rdd = spark.sparkContext.newAPIHadoopFile(
path, classOf[TurtleInputFormat], classOf[LongWritable], classOf[TripleWritable], confHadoop)
.map { case (_, v) => v.get() }
rdd
*/
}
/**
* Load RDF data in TRIX syntax into an [[RDD]][Triple]
* @return the [[RDD]] of triples
*/
def trix: String => RDD[Triple] = path => {
val confHadoop = org.apache.hadoop.mapreduce.Job.getInstance().getConfiguration
confHadoop.setBoolean("sansa.rdf.parser.skipinvalid", true)
confHadoop.set("stream.recordreader.begin", "")
confHadoop.set("stream.recordreader.end", " ")
spark.sparkContext.newAPIHadoopFile(
path, classOf[RiotFileInputFormat], classOf[LongWritable], classOf[Triple], confHadoop)
.map { case (_, v) => v }
}
/**
* Load RDF data in TriG syntax into an [[RDD]] of [[DatasetOneNg]]
*
* @return the [[RDD]] of datasets
*/
def trig: String => RDD[DatasetOneNg] = path => {
val confHadoop = spark.sparkContext.hadoopConfiguration
spark.sparkContext.newAPIHadoopFile(path,
classOf[FileInputFormatRdfTrigDataset],
classOf[LongWritable],
classOf[DatasetOneNg], confHadoop)
.map { case (_, v) => v }
}
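// Usage sketch: loading a TriG file into an RDD of per-named-graph datasets; paths are placeholders.
//
//   import net.sansa_stack.rdf.spark.io._
//   val ds: RDD[DatasetOneNg] = spark.trig("/data/dataset.trig")
//   // equivalently, via the generic quad-based loader:
//   val ds2: RDD[DatasetOneNg] = spark.datasets(Lang.TRIG)("/data/dataset.trig")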
/**
* Create an RDD of triples from a model's graph
*
* @author Claus Stadler
*/
def rdf(model: Model): RDD[Triple] = {
rdf(model.getGraph)
}
/**
* Create an RDD of triples from a graph's triples
*
* @author Claus Stadler
*/
def rdf(graph: Graph): RDD[Triple] = {
import collection.JavaConverters._
// val seq = graph.
val it = graph.find
var result: RDD[Triple] = null
try {
val seq = it.asScala.toSeq
result = spark.sparkContext.parallelize(seq)
} finally {
it.close
}
result
}
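// Usage sketch: parallelizing an in-memory Jena model into an RDD[Triple]; the file path is a placeholder.
//
//   import org.apache.jena.rdf.model.ModelFactory
//   val model = ModelFactory.createDefaultModel()
//   model.read("file:/data/small.ttl")
//   val modelTriples: RDD[Triple] = spark.rdf(model)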
}
}