
net.sansa_stack.rdf.spark.io.benchmark.SansaBenchRdfParse.scala

package net.sansa_stack.rdf.spark.io.benchmark

import java.io.{ByteArrayInputStream, File, InputStream}
import java.nio.charset.StandardCharsets
import java.util.concurrent.TimeUnit

import scala.collection.JavaConverters._

import com.google.common.base.Stopwatch
import net.sansa_stack.rdf.benchmark.io.ReadableByteChannelFromIterator
import org.apache.jena.graph.Triple
import org.apache.jena.riot.{Lang, RDFDataMgr}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
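
/**
 * Micro-benchmark comparing two strategies for parsing N-Triples into Jena
 * Triples on Spark: parsing each partition as a single stream versus parsing
 * every line individually.
 *
 * Usage: SansaBenchRdfParse &lt;path-to-ntriples-file(s)&gt;
 */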
object SansaBenchRdfParse {

  def main(args: Array[String]): Unit = {
    val tempDirStr = System.getProperty("java.io.tmpdir")
    if (tempDirStr == null) {
      throw new RuntimeException("Could not obtain temporary directory")
    }

    // spark.eventLog.enabled is set below; the default event log location is
    // /tmp/spark-events, so make sure that directory exists.
    val sparkEventsDir = new File(tempDirStr + "/spark-events")
    if (!sparkEventsDir.exists()) {
      sparkEventsDir.mkdirs()
    }
    // File.createTempFile("spark-events")
    val sparkSession = SparkSession.builder
      .master("local")
      .appName("spark session example")
      .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
      // .config("spark.kryo.registrationRequired", "true")
      .config("spark.eventLog.enabled", "true")
      // .config("spark.kryo.registrator", String.join(
      //   ", ",
      //   "net.sansa_stack.rdf.spark.io.JenaKryoRegistrator",
      //   "net.sansa_stack.query.spark.sparqlify.KryoRegistratorSparqlify")
      // )
      .config("spark.default.parallelism", "4")
      .config("spark.sql.shuffle.partitions", "4")
      .getOrCreate()

    sparkSession.conf.set("spark.sql.crossJoin.enabled", "true")
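
    // Kryo is configured because Jena's Triple does not implement
    // java.io.Serializable; with plain Java serialization, any operation that
    // ships Triple records between JVMs (e.g. a shuffle or collect) would
    // fail. The commented-out registrator lines above hint at SANSA's Kryo
    // registrators that would normally be enabled for that purpose.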
    // Sample N-Triples data. The IRIs in angle brackets were stripped from
    // this listing; the <http://example.org/...> terms below are placeholders
    // that preserve the surviving literals.
    val triplesString =
      """<http://example.org/s1> <http://example.org/name> "Guy De" .
        |<http://example.org/s1> <http://example.org/age> "30"^^<http://www.w3.org/2001/XMLSchema#integer> .
        |<http://example.org/s1> <http://example.org/p1> <http://example.org/o1> .
        |<http://example.org/s1> <http://example.org/p2> <http://example.org/o2> .
        |<http://example.org/s1> <http://example.org/p3> <http://example.org/o3> .
        |<http://example.org/s2> <http://example.org/name> "Charles"@en .
        |<http://example.org/s2> <http://example.org/p1> <http://example.org/o4> .
        |<http://example.org/s2> <http://example.org/p2> <http://example.org/o5> .
        |<http://example.org/s2> <http://example.org/p3> <http://example.org/o6> .
        |<http://example.org/s2> <http://example.org/time> "20170731T150000Z"@en .
        |""".stripMargin
    // val it = RDFDataMgr.createIteratorTriples(IOUtils.toInputStream(triplesString, "UTF-8"), Lang.NTRIPLES, "http://example.org/").asScala.toSeq
    // it.foreach { x => println("GOT: " + (if (x.getObject.isLiteral) x.getObject.getLiteralLanguage else "-")) }
    // val graphRdd : RDD[Triple] = sparkSession.sparkContext.parallelize(it)
    // val textRdd : RDD[String] = sparkSession.sparkContext.parallelize(triplesString.split("\n"))
    if (args.length != 1) {
      System.err.println("Please provide the path to the RDF file(s) as an argument")
      sparkSession.stop()
      System.exit(1)
    }
    // Read the input as plain text lines with at least 20 partitions.
    val textRdd: RDD[String] = sparkSession.sparkContext.textFile(args(0), 20)
    println(s"Raw count: ${textRdd.count()}")
    // Variant 1: feed each partition to a single Jena parser as one stream,
    // amortizing parser setup over all lines of the partition.
    measureLoadingTime(
      textRdd
        .mapPartitions(p => RDFDataMgr.createIteratorTriples(toInputStream(p), Lang.NTRIPLES, null).asScala))
    // Variant 2: parse every line individually, paying for a fresh parser
    // and input stream per triple. N-Triples is UTF-8, so the encoding is
    // made explicit rather than relying on the platform default.
    measureLoadingTime(
      textRdd
        .map(line => RDFDataMgr.createIteratorTriples(new ByteArrayInputStream(line.getBytes(StandardCharsets.UTF_8)), Lang.NTRIPLES, null).next()))
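
    // The expectation (not guaranteed on every input) is that variant 1 is
    // faster, since variant 2 re-creates the parser machinery for each line;
    // the two printed timings make the difference visible.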

    sparkSession.stop()
  }

  /** Forces full evaluation of the RDD via `count()` and reports the wall-clock time. */
  def measureLoadingTime(rdd: RDD[Triple]): Unit = {
    val sw = Stopwatch.createStarted()
    val c = rdd.count()
    println(s"Time: ${sw.stop().elapsed(TimeUnit.MILLISECONDS)}ms")
    println(s"Count: $c")
  }

  /** Bridges an iterator of lines into an `InputStream` without materializing it. */
  def toInputStream(it: Iterator[String]): InputStream =
    ReadableByteChannelFromIterator.toInputStream(it.asJava)
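
  // Hypothetical local sanity check (not part of the benchmark): the bridge
  // lets Jena parse an in-memory iterator just like a file stream.
  //   val triples = RDFDataMgr.createIteratorTriples(
  //     toInputStream(Iterator("<http://example.org/s> <http://example.org/p> <http://example.org/o> .")),
  //     Lang.NTRIPLES, null).asScala
  //   triples.foreach(println)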
}