package net.sansa_stack.rdf.spark.model.df
import net.sansa_stack.rdf.spark.io.fromRow
import org.apache.jena.graph.Triple
import org.apache.spark.rdd.RDD
import org.apache.spark.sql._
/**
* Spark/DataFrame based implementation of operations on a DataFrame of triples.
*
* @author Gezim Sejdiu
* @author Lorenz Buehmann
*/
object TripleOps {
/**
* Convert a [[DataFrame]] of triples into an RDD[Triple].
*
* @param triples DataFrame of triples (as strings).
* @return an RDD of triples.
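* @example A minimal usage sketch, assuming `df` is a triples DataFrame with
* the string columns `s`, `p`, `o` (e.g. as produced by SANSA's RDF reader):
* {{{
* import org.apache.jena.graph.Triple
* import org.apache.spark.rdd.RDD
*
* val rdd: RDD[Triple] = TripleOps.toRDD(df)
* }}}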
*/
def toRDD(triples: DataFrame): RDD[Triple] = {
triples.rdd.map(fromRow)
}
/**
* Convert a DataFrame of triples into a Dataset[Triple].
*
* @param triples DataFrame of triples.
* @return a Dataset of triples.
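* @example A minimal sketch, assuming `df` is a triples DataFrame:
* {{{
* import org.apache.jena.graph.Triple
* import org.apache.spark.sql.Dataset
*
* val ds: Dataset[Triple] = TripleOps.toDS(df)
* }}}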
*/
def toDS(triples: DataFrame): Dataset[Triple] = {
val spark: SparkSession = SparkSession.builder().getOrCreate()
// Jena's Triple is not a Product type, so a Kryo encoder is required;
// a plain `triples.as[Triple]` would not compile without it
implicit val encoder: Encoder[Triple] = Encoders.kryo[Triple]
spark.createDataset[Triple](toRDD(triples))
}
/**
* Get triples.
*
* @param triples DataFrame of triples.
* @return the DataFrame of triples.
*/
def getTriples(triples: DataFrame): DataFrame =
triples
/**
* Get subjects.
*
* @param triples DataFrame of triples.
* @return DataFrame containing the subjects.
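* @example A sketch of the column projections, assuming a triples DataFrame
* `df` with columns `s`, `p`, `o` (see also `getPredicates` and `getObjects` below):
* {{{
* val subjects = TripleOps.getSubjects(df) // one-column DataFrame "s"
* val predicates = TripleOps.getPredicates(df)
* val objects = TripleOps.getObjects(df)
* }}}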
*/
def getSubjects(triples: DataFrame): DataFrame =
triples.select("s")
/**
* Get predicates.
*
* @param triples DataFrame of triples.
* @return DataFrame containing the predicates.
*/
def getPredicates(triples: DataFrame): DataFrame =
triples.select("p")
/**
* Get objects.
*
* @param triples DataFrame of triples.
* @return DataFrame containing the objects.
*/
def getObjects(triples: DataFrame): DataFrame =
triples.select("o")
/**
* Returns a DataFrame of triples that match the given input.
*
* @param triples DataFrame of triples
* @param subject the subject
* @param predicate the predicate
* @param object the object
* @return DataFrame of triples
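* @example A minimal sketch, assuming `df` is a triples DataFrame; the
* predicate URI is illustrative:
* {{{
* val matches = TripleOps.find(df, predicate = Some("http://xmlns.com/foaf/0.1/name"))
* }}}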
*/
def find(triples: DataFrame, subject: Option[String] = None, predicate: Option[String] = None, `object`: Option[String] = None): DataFrame = {
// the generated SQL refers to a table named TRIPLES, so the DataFrame
// must be registered under that name first
triples.createOrReplaceTempView("TRIPLES")
val sql = getSQL(subject, predicate, `object`)
triples.sqlContext.sql(sql)
}
/**
* Generate the translated SQL statement from the triple pattern.
*
* @param subject the subject
* @param predicate the predicate
* @param object the object
* @return the translated SQL statement as a string
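* @example The statement generated for a subject-object pattern:
* {{{
* TripleOps.getSQL(subject = Some("s1"), `object` = Some("o1"))
* // "SELECT s, p, o FROM TRIPLES WHERE s = 's1' AND o = 'o1'"
* }}}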
*/
def getSQL(subject: Option[String] = None, predicate: Option[String] = None, `object`: Option[String] = None): String = {
// collect one equality condition per bound pattern position
val conditions = Seq(
subject.map(s => s"s = '$s'"),
predicate.map(p => s"p = '$p'"),
`object`.map(o => s"o = '$o'")).flatten
if (conditions.isEmpty) "SELECT s, p, o FROM TRIPLES"
else "SELECT s, p, o FROM TRIPLES WHERE " + conditions.mkString(" AND ")
}
/**
* Returns a DataFrame of triples that match the given triple pattern.
*
* @param triples DataFrame of triples
* @param triple the triple to be checked
* @return DataFrame of triples that match the given input
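* @example A sketch using a Jena triple pattern with variable subject and
* object (the predicate URI is illustrative):
* {{{
* import org.apache.jena.graph.{NodeFactory, Triple}
*
* val pattern = Triple.create(
*   NodeFactory.createVariable("s"),
*   NodeFactory.createURI("http://xmlns.com/foaf/0.1/name"),
*   NodeFactory.createVariable("o"))
* val matches = TripleOps.find(df, pattern)
* }}}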
*/
def find(triples: DataFrame, triple: Triple): DataFrame = {
find(
triples,
if (triple.getSubject.isVariable) None else Option(triple.getSubject.getURI),
if (triple.getPredicate.isVariable) None else Option(triple.getPredicate.getURI),
if (triple.getObject.isVariable) None else {
Option(if (triple.getObject.isLiteral) {
triple.getObject.getLiteralLexicalForm
} else triple.getObject.getURI)
})
}
/**
* Return the union of an RDF graph with a sequence of other RDF graphs.
*
* @param triples DataFrame of the RDF graph
* @param others sequence of DataFrames of the other RDF graphs
* @return the union of all graphs
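* @example A minimal sketch, assuming `df1`, `df2` and `df3` are triples
* DataFrames with the same schema:
* {{{
* val combined = TripleOps.unionAll(df1, Seq(df2, df3))
* }}}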
*/
def unionAll(triples: DataFrame, others: Seq[DataFrame]): DataFrame = {
// union the underlying RDDs in a single step to avoid building a deeply
// nested logical plan from repeated DataFrame unions
val all = triples +: others
val spark = triples.sparkSession
spark.createDataFrame(
spark.sparkContext.union(all.map(_.rdd)),
triples.schema)
}
/**
* Determine whether this RDF graph contains any triples
* with a given (subject, predicate, object) pattern.
*
* @param triples DataFrame of triples
* @param subject the subject (None for any)
* @param predicate the predicate (None for any)
* @param object the object (None for any)
* @return true if there exists within this RDF graph
* a triple with (S, P, O) pattern, false otherwise
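* @example A sketch checking whether any triple has a given predicate (the
* URI is illustrative):
* {{{
* val hasName = TripleOps.contains(df, predicate = Some("http://xmlns.com/foaf/0.1/name"))
* }}}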
*/
def contains(triples: DataFrame, subject: Option[String] = None, predicate: Option[String] = None, `object`: Option[String] = None): Boolean = {
!find(triples, subject, predicate, `object`).isEmpty
}
/**
* Determine if a triple is present in this RDF graph.
*
* @param triples DataFrame of triples
* @param triple the triple to be checked
* @return true if the statement s is in this RDF graph, false otherwise
*/
def contains(triples: DataFrame, triple: Triple): Boolean = {
!find(triples, triple).isEmpty
}
/**
* Determine if any of the triples in an RDF graph are also contained in this RDF graph.
*
* @param triples DataFrame of triples
* @param other the other RDF graph containing the statements to be tested
* @return true if any of the statements in RDF graph are also contained
* in this RDF graph and false otherwise.
*/
def containsAny(triples: DataFrame, other: DataFrame): Boolean = {
// at least one statement is shared iff the intersection is non-empty
!triples.intersect(other).isEmpty
}
/**
* Determine if all of the statements in an RDF graph are also contained in this RDF graph.
*
* @param triples DataFrame of triples
* @param other the other RDF graph containing the statements to be tested
* @return true if all of the statements in RDF graph are also contained
* in this RDF graph and false otherwise.
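* @example A minimal sketch, assuming `df` and `subset` are triples DataFrames:
* {{{
* val allPresent = TripleOps.containsAll(df, subset)
* }}}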
*/
def containsAll(triples: DataFrame, other: DataFrame): Boolean = {
// other is contained in triples iff removing triples from other leaves nothing
other.except(triples).isEmpty
}
/**
* Add a statement to the current RDF graph.
*
* @param triples DataFrame of RDF graph
* @param triple the triple to be added.
* @return new DataFrame of triples containing this statement.
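* @example A sketch adding one concrete triple (the URIs are illustrative):
* {{{
* import org.apache.jena.graph.{NodeFactory, Triple}
*
* val t = Triple.create(
*   NodeFactory.createURI("http://example.org/alice"),
*   NodeFactory.createURI("http://xmlns.com/foaf/0.1/knows"),
*   NodeFactory.createURI("http://example.org/bob"))
* val extended = TripleOps.add(df, t)
* }}}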
*/
def add(triples: DataFrame, triple: Triple): DataFrame = {
import net.sansa_stack.rdf.spark.model._
val statement = triples.sparkSession.sparkContext.parallelize(Seq(triple))
triples.union(statement.toDF())
}
/**
* Add a list of statements to the current RDF graph.
*
* @param triples DataFrame of RDF graph
* @param triple the list of triples to be added.
* @return new DataFrame of triples containing this list of statements.
*/
def addAll(triples: DataFrame, triple: Seq[Triple]): DataFrame = {
import net.sansa_stack.rdf.spark.model._
val statements = triples.sparkSession.sparkContext.parallelize(triple)
triples.union(statements.toDF())
}
/**
* Removes a statement from the current RDF graph.
* The statement with the same subject, predicate and
* object as that supplied will be removed from the model.
*
* @param triples DataFrame of RDF graph
* @param triple the statement to be removed.
* @return new DataFrame of triples without this statement.
*/
def remove(triples: DataFrame, triple: Triple): DataFrame = {
import net.sansa_stack.rdf.spark.model._
val statement = triples.sparkSession.sparkContext.parallelize(Seq(triple))
triples.except(statement.toDF())
}
/**
* Removes all the statements from the current RDF graph.
* The statements with the same subject, predicate and
* object as those supplied will be removed from the model.
*
* @param triples DataFrame of RDF graph
* @param triple the list of statements to be removed.
* @return new DataFrame of triples without these statements.
*/
def removeAll(triples: DataFrame, triple: Seq[Triple]): DataFrame = {
import net.sansa_stack.rdf.spark.model._
val statements = triples.sparkSession.sparkContext.parallelize(triple)
triples.except(statements.toDF())
}
/**
* Write the triples of a given DataFrame to a file in N-Triples format.
*
* @param triples DataFrame of RDF graph
* @param path path to the output N-Triples file
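* @example A minimal sketch (the output path is illustrative):
* {{{
* TripleOps.saveAsNTriplesFile(df, "/tmp/triples.nt")
* }}}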
*/
def saveAsNTriplesFile(triples: DataFrame, path: String): Unit = {
net.sansa_stack.rdf.spark.model.ds.TripleOps.saveAsNTriplesFile(toDS(triples), path)
}
}