package net.sansa_stack.rdf.spark
import net.sansa_stack.rdf.spark.stats.RDFStatistics.SubclassUsage
import net.sansa_stack.rdf.spark.utils.Logging
import org.apache.jena.graph.{Node, Triple}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
package object stats {
implicit class StatsCriteria(triples: RDD[Triple]) extends Logging {
@transient val spark: SparkSession = SparkSession.builder().getOrCreate()
/**
* Compute distributed RDF dataset statistics.
* @return VoID description of the given dataset
*/
def stats: RDD[String] =
RDFStatistics.run(triples)
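// Usage sketch (illustrative, not part of this file): once this package object
// is imported, any RDD[Triple] gains the stats methods below. The loading step
// is an assumption; any source that yields an RDD[Triple] works.
//
//   import net.sansa_stack.rdf.spark.stats._
//   val triples: RDD[Triple] = ??? // e.g. load N-Triples into an RDD[Triple]
//   triples.stats.voidify(source = "dataset.nt", output = "stats-out")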
/**
* 1. Used Classes Criterion
* Creates an RDD of the classes that are in use by instances of the analyzed dataset.
* An example of a triple accepted by
* the filter is `sda:Gezim rdf:type distLODStats:Developer`.
* Filter rule : `?p=rdf:type && isIRI(?o)`
* Action : `S += ?o`
* @return RDD of classes/instances
*/
def statsUsedClasses(): RDD[Node] =
Used_Classes(triples, spark).Filter()
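// Illustrative check of the filter rule above. The full IRIs are assumptions
// expanded from the `sda:` and `distLODStats:` prefixes in the Scaladoc; only
// triples with predicate rdf:type and an IRI object pass the filter.
//
//   import org.apache.jena.graph.NodeFactory
//   val t = Triple.create(
//     NodeFactory.createURI("http://sda.tech/Gezim"),
//     NodeFactory.createURI("http://www.w3.org/1999/02/22-rdf-syntax-ns#type"),
//     NodeFactory.createURI("http://example.org/distLODStats#Developer"))
//   val classes = spark.sparkContext.parallelize(Seq(t)).statsUsedClasses()
//   // classes holds one node: <http://example.org/distLODStats#Developer>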
/**
* 2. Class Usage Count Criterion
* Count the usage of the respective classes of a dataset;
* the filter rule used to analyze a triple is the
* same as in the first criterion.
* As an action, a map is created with class IRIs as
* keys and their respective usage counts as values.
* If a triple conforms to the filter rule, the respective
* value is increased by one.
* Filter rule : `?p=rdf:type && isIRI(?o)`
* Action : `M[?o]++ `
* @return RDD of classes used in the dataset and their frequencies.
*/
def statsClassUsageCount(): RDD[(Node, Int)] =
Used_Classes(triples, spark).Action()
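// Example (sketch): ranking classes by usage. `triples` is any RDD[Triple]
// with this package object in scope.
//
//   val top10 = triples.statsClassUsageCount()
//     .sortBy(_._2, ascending = false)
//     .take(10)
//   top10.foreach { case (cls, n) => println(s"$cls used $n times") }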
/**
* 3. Classes Defined Criterion
* Gets the set of classes that are defined within the dataset.
* Usually in RDF/S and OWL a class can be defined by a triple
* using the predicate `rdf:type` and either `rdfs:Class` or
* `owl:Class` as object.
* The filter rule illustrates the condition used to analyze the triple.
* If the triple is accepted by the rule, the IRI used as subject is added to the set of classes.
* Filter rule : `?p=rdf:type && isIRI(?s) &&(?o=rdfs:Class||?o=owl:Class)`
* Action : `S += ?s `
* @return RDD of classes defined in the dataset.
*/
def statsClassesDefined(): RDD[Node] =
Classes_Defined(triples, spark).Action()
/**
* 4. Class hierarchy depth criterion
* @return RDD of classes and their depth in the class hierarchy
*/
def statsClassHierarchyDepth(): RDD[(Node, Int)] =
RDFStatistics.ClassHierarchyDepth(triples)
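// Example (sketch): the overall hierarchy depth is the maximum over the
// per-class depths returned above.
//
//   val maxClassDepth = triples.statsClassHierarchyDepth().map(_._2).max()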
/**
* 5. Property Usage Criterion
* Count the usage of properties within triples.
* To this end, an RDD will be created containing all property
* IRIs as identifiers.
* Afterwards, their frequencies will be computed.
* Filter rule : `none`
* Action : `M[?p]++ `
* @return RDD of predicates used in the dataset and their frequencies.
*/
def statsPropertyUsage(): RDD[(Node, Int)] =
PropertyUsage(triples, spark).Action()
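// Example (sketch): turning absolute property counts into relative
// frequencies by dividing by the total triple count.
//
//   val total = triples.statsNumberOfTriples().toDouble
//   val relFreq = triples.statsPropertyUsage()
//     .map { case (p, n) => (p, n / total) }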
/**
* 6. Property usage distinct per subject
* Count the usage of properties within triples based on subjects.
* Filter rule : `none`
* Action : `M[?s] += ?p `
* @return RDD of subject-grouped triples and their distinct property usage counts.
*/
def statsPropertyUsageDistinctPerSubject(): RDD[(Iterable[Triple], Int)] =
RDFStatistics.PropertyUsageDistinctPerSubject(triples)
/**
* 7. Property usage distinct per object
* Count the usage of properties within triples based on objects.
* Filter rule : `none`
* Action : `M[?o] += ?p `
* @return RDD of object-grouped triples and their distinct property usage counts.
*/
def statsPropertyUsageDistinctPerObject(): RDD[(Iterable[Triple], Int)] =
RDFStatistics.PropertyUsageDistinctPerObject(triples)
/**
* 12. Property hierarchy depth criterion
*
* @return RDD of properties and their depth in the property hierarchy
*/
def statsPropertyHierarchyDepth(): RDD[(Node, Int)] =
RDFStatistics.PropertyHierarchyDepth(triples)
/**
* 16. Distinct entities
* Count distinct entities of a dataset by collecting the IRIs of subjects, predicates and objects.
* Filter rule : `S+=iris({?s,?p,?o})`
* Action : `S`
* @return RDD of distinct entities in the dataset.
*/
def statsDistinctEntities(): RDD[Node] =
DistinctEntities(triples, spark).Action()
/**
* 17. Literals criterion
*
* @return RDD of triples that have a literal as object.
*/
def statsLiterals(): RDD[Triple] =
RDFStatistics.Literals(triples)
/**
* 18. Blanks as subject criterion
*
* @return RDD of triples where blank nodes are used as subjects.
*/
def statsBlanksAsSubject(): RDD[Triple] =
RDFStatistics.BlanksAsSubject(triples)
/**
* 19. Blanks as object criterion
*
* @return RDD of triples where blank nodes are used as objects.
*/
def statsBlanksAsObject(): RDD[Triple] =
RDFStatistics.BlanksAsObject(triples)
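// Example (sketch): criteria 18 and 19 return the matching triples, so blank
// node usage counts are obtained by counting the resulting RDDs.
//
//   val blankSubjects = triples.statsBlanksAsSubject().count()
//   val blankObjects = triples.statsBlanksAsObject().count()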
/**
* 20. Datatypes criterion
*
* @return histogram of datatypes used for literals.
*/
def statsDatatypes(): RDD[(String, Int)] =
RDFStatistics.Datatypes(triples)
/**
* 21. Languages criterion
*
* @return histogram of languages used for literals.
*/
def statsLanguages(): RDD[(String, Int)] =
RDFStatistics.Languages(triples)
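// Example (sketch): both histograms are small (one entry per datatype or
// language tag), so collecting them to the driver is safe.
//
//   triples.statsDatatypes().collect().foreach { case (dt, n) => println(s"$dt: $n") }
//   triples.statsLanguages().collect().foreach { case (lang, n) => println(s"$lang: $n") }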
/**
* 22. Average typed string length criterion.
*
* @return the average typed string length used throughout the RDF graph.
*/
def statsAvgTypedStringLength(): Double =
RDFStatistics.AvgTypedStringLength(triples)
/**
* 23. Average untyped string length criterion.
*
* @return the average untyped string length used throughout the RDF graph.
*/
def statsAvgUntypedStringLength(): Double =
RDFStatistics.AvgUntypedStringLength(triples)
/**
* 24. Typed subjects criterion.
*
* @return list of typed subjects.
*/
def statsTypedSubjects(): RDD[Node] =
RDFStatistics.TypedSubjects(triples)
/**
* 24. Labeled subjects criterion.
*
* @return list of labeled subjects.
*/
def statsLabeledSubjects(): RDD[Node] =
RDFStatistics.LabeledSubjects(triples)
/**
* 25. SameAs criterion.
*
* @return list of triples with owl:sameAs as predicate
*/
def statsSameAs(): RDD[Triple] =
RDFStatistics.SameAs(triples)
/**
* 26. Links criterion.
*
* @return list of namespaces and their frequencies.
*/
def statsLinks(): RDD[(String, String, Int)] =
RDFStatistics.Links(triples)
/**
* 28. Maximum per property {int, float, time} criterion
*
* @return entities with their maximum values on the graph
*/
def statsMaxPerProperty(): RDD[(Node, Node)] =
RDFStatistics.MaxPerProperty(triples)
/**
* 29. Average per property {int,float,time} criterion
*
* @return entities with their average values on the graph
*/
def statsAvgPerProperty(): RDD[(Node, Double)] =
RDFStatistics.AvgPerProperty(triples)
/**
* 30. Subject vocabularies
* Compute subject vocabularies/namespaces used throughout the dataset.
* Filter rule : `ns=ns(?s)`
* Action : `M[ns]++`
* @return RDD of distinct subject vocabularies used in the dataset and their frequencies.
*/
def statsSubjectVocabularies(): RDD[(String, Int)] =
SPO_Vocabularies(triples, spark).SubjectVocabulariesPostProc()
/**
* 31. Predicate vocabularies
* Compute predicate vocabularies/namespaces used throughout the dataset.
* Filter rule : `ns=ns(?p)`
* Action : `M[ns]++`
* @return RDD of distinct predicate vocabularies used in the dataset and their frequencies.
*/
def statsPredicateVocabularies(): RDD[(String, Int)] =
SPO_Vocabularies(triples, spark).PredicateVocabulariesPostProc()
/**
* 32. Object vocabularies
* Compute object vocabularies/namespaces used throughout the dataset.
* Filter rule : `ns=ns(?o)`
* Action : `M[ns]++`
* @return RDD of distinct object vocabularies used in the dataset and their frequencies.
*/
def statsObjectVocabularies(): RDD[(String, Int)] =
SPO_Vocabularies(triples, spark).ObjectVocabulariesPostProc()
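// Example (sketch): a combined namespace histogram over all three triple
// positions, merged with reduceByKey.
//
//   val nsHistogram = triples.statsSubjectVocabularies()
//     .union(triples.statsPredicateVocabularies())
//     .union(triples.statsObjectVocabularies())
//     .reduceByKey(_ + _)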
/**
* Distinct Subjects
* Count distinct subjects within triples.
* Filter rule : `isURI(?s)`
* Action : `M[?s]++`
* @return RDD of subjects used in the dataset.
*/
def statsDistinctSubjects(): RDD[Node] =
DistinctSubjects(triples, spark).Action()
/**
* Distinct Objects
* Count distinct objects within triples.
* Filter rule : `isURI(?o)`
* Action : `M[?o]++`
* @return RDD of objects used in the dataset.
*/
def statsDistinctObjects(): RDD[Node] =
DistinctObjects(triples, spark).Action()
/**
* Properties Defined
* Count the defined properties within triples.
* Filter rule : `?p=rdf:type && (?o=owl:ObjectProperty ||
* ?o=rdf:Property) && !isIRI(?s)`
* Action : `M[?p]++`
* @return RDD of predicates defined in the dataset.
*/
def statsPropertiesDefined(): RDD[Node] =
PropertiesDefined(triples, spark).Action()
/**
* Subclass Usage Count
* Count the usage of the rdfs:subClassOf property within triples.
* @return number of rdfs:subClassOf predicates used in the dataset.
*/
def statsSubclassUsage(): Long =
SubclassUsage(triples)
/**
* Triples Count
* Count the number of triples.
* @return number of triples in the dataset.
*/
def statsNumberOfTriples(): Long =
RDFStatistics.Triples(triples)
/**
* 15. Distinct entities Count
* Count the number of distinct entities.
*
* @return the number of distinct entities
*/
def statsEntitiesMentioned(): Long =
RDFStatistics.EntitiesMentioned(triples)
}
implicit class StatsCriteriaVoidify(stats: RDD[String]) extends Logging {
/**
* Voidify RDF dataset based on the Vocabulary of Interlinked Datasets (VoID) [[https://www.w3.org/TR/void/]]
*
* @param source name of the dataset source, usually the file name
* @param output the directory in which to save the RDF dataset summary
*/
def voidify(source: String, output: String): Unit =
RDFStatistics.voidify(stats, source, output)
/**
* Prints the VoID version of the given RDF dataset
*
* @param source name of the dataset source, usually the file name
*/
def print(source: String): Unit =
RDFStatistics.print(stats, source)
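// Example (sketch, assuming an RDD[Triple] named `triples` is in scope):
// `voidify` persists the VoID summary while `print` only prints it; both
// consume the RDD[String] produced by `stats`.
//
//   val summary = triples.stats
//   summary.print("dataset.nt")               // inspect the summary
//   summary.voidify("dataset.nt", "void-out") // persist to a directory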
}
}