
net.sansa_stack.inference.spark.data.writer.RDFGraphWriter.scala Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of sansa-inference-spark_2.12 Show documentation
Show all versions of sansa-inference-spark_2.12 Show documentation
Apache Spark based inference layer for RDF and OWL
The newest version!
package net.sansa_stack.inference.spark.data.writer
import net.sansa_stack.inference.spark.data.model.RDFGraph
import net.sansa_stack.inference.utils.JenaTripleToNTripleString
import org.aksw.jenax.arq.util.triple.TripleUtils
import org.apache.jena.graph.{GraphUtil, NodeFactory, Triple}
import org.apache.jena.rdf.model.{Model, ModelFactory}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Row}
import org.slf4j.LoggerFactory
import java.util.Comparator
/**
* A utility object that provides methods to write an RDF graph to disk in N-Triples format and to convert it to an in-memory Apache Jena model.
*
* @author Lorenz Buehmann
*
*/
object RDFGraphWriter {

  private val logger = com.typesafe.scalalogging.Logger(LoggerFactory.getLogger(this.getClass.getName))

  /**
    * Write the graph to disk in N-Triples format.
    *
    * Delegates to [[writeTriplesToDisk]] with the triples of the given graph.
    *
    * @param graph the RDF graph
    * @param path the output directory
    * @param singleFile whether to put all data into a single file
    * @param sorted whether to sort the triples by subject, predicate, object
    */
  def writeToDisk(graph: RDFGraph, path: String, singleFile: Boolean = false, sorted: Boolean = false): Unit = {
    writeTriplesToDisk(graph.triples, path, singleFile, sorted)
  }

  /**
    * Write the triples to disk in N-Triples format.
    *
    * If both `sorted` and `singleFile` are set, the triples are sorted directly into a single
    * partition so that the one resulting output file keeps the sort order.
    *
    * @param triples the triples
    * @param path the output directory
    * @param singleFile whether to put all data into a single file
    * @param sorted whether to sort the triples by subject, predicate, object
    */
  def writeTriplesToDisk(triples: RDD[Triple],
                         path: String,
                         singleFile: Boolean = false,
                         sorted: Boolean = false): Unit = {
    logger.info("writing triples to disk...")
    val startTime = System.currentTimeMillis()

    // ordering used by the (optional) sort; delegates to the comparator provided by TripleUtils
    implicit val tripleOrdering: Ordering[Triple] = new Ordering[Triple] {
      val comparator: Comparator[Triple] = TripleUtils.compareRDFTerms
      override def compare(t1: Triple, t2: Triple): Int = comparator.compare(t1, t2)
    }

    // sort triples if enabled
    val orderedTriples: RDD[Triple] =
      if (!sorted) {
        triples
      } else if (singleFile) {
        // Sort straight into one partition: a subsequent `coalesce(1, shuffle = true)` would
        // re-shuffle the data and thereby give no guarantee that the sort order is kept.
        triples.sortBy((t: Triple) => t, numPartitions = 1)
      } else {
        triples.sortBy((t: Triple) => t)
      }

    // convert to N-Triple format
    val ntLines = orderedTriples.map(new JenaTripleToNTripleString())

    // convert to single file, i.e. move all to one partition
    // (might be very expensive and contradicts the Big Data paradigm on Hadoop in general)
    // Not needed when the data was sorted, because the sort above already produced one partition.
    val output = if (singleFile && !sorted) ntLines.coalesce(1, shuffle = true) else ntLines

    // finally, write to disk
    output.saveAsTextFile(path)

    logger.info(s"finished writing triples to disk in ${System.currentTimeMillis() - startTime}ms.")
  }

  /**
    * Write the triples represented by the DataFrame to disk in N-Triples format.
    *
    * The first three columns of each row are read as strings and interpreted as subject,
    * predicate and object. Subject and predicate are always turned into URI nodes. The object
    * becomes a URI node if its string starts with `http:` or `https:`, otherwise a plain literal.
    *
    * @param triples the DataFrame containing the triples
    * @param path the output directory
    * @param singleFile whether to put all data into a single file
    * @param sorted whether to sort the triples by subject, predicate, object
    */
  def writeDataframeToDisk(triples: DataFrame,
                           path: String,
                           singleFile: Boolean = false,
                           sorted: Boolean = false): Unit = {
    val rowToJenaTriple = (row: Row) => {
      val subjectNode = NodeFactory.createURI(row.getString(0))
      val predicateNode = NodeFactory.createURI(row.getString(1))

      // simple prefix heuristic to tell IRIs from literals; `https:` IRIs must not end up as literals
      val objectValue = row.getString(2)
      val objectNode =
        if (objectValue.startsWith("http:") || objectValue.startsWith("https:")) {
          NodeFactory.createURI(objectValue)
        } else {
          NodeFactory.createLiteral(objectValue)
        }

      Triple.create(subjectNode, predicateNode, objectNode)
    }

    writeTriplesToDisk(
      triples.rdd.map(rowToJenaTriple),
      path,
      singleFile,
      sorted
    )
  }

  /**
    * Converts an RDF graph to an Apache Jena in-memory model.
    *
    * All triples are collected to the driver first and then added to a fresh default model.
    *
    * @note For large graphs this can be too expensive
    *       and lead to a OOM exception
    *
    * @param graph the RDF graph
    *
    * @return the in-memory Apache Jena model containing the triples
    */
  def convertToModel(graph: RDFGraph): Model = {
    val triples = graph.triples.collect()
    val model = ModelFactory.createDefaultModel()
    GraphUtil.add(model.getGraph, triples)
    model
  }
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy