/*
* Copyright (c) 2016-2018 "Neo4j, Inc." [https://neo4j.com]
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.opencypher.spark.api.io
import org.apache.spark.sql.DataFrame
import org.apache.spark.storage.StorageLevel
import org.opencypher.okapi.api.io.conversion.{EntityMapping, NodeMapping, RelationshipMapping}
import org.opencypher.okapi.api.schema.Schema
import org.opencypher.okapi.api.table.CypherTable
import org.opencypher.okapi.api.types._
import org.opencypher.okapi.api.value.CypherValue
import org.opencypher.okapi.api.value.CypherValue.CypherValue
import org.opencypher.spark.api.CAPSSession
import org.opencypher.spark.api.io.EntityTable.SparkTable
import org.opencypher.spark.impl.CAPSRecords
import org.opencypher.spark.impl.DataFrameOps._
import org.opencypher.spark.impl.util.Annotation
import org.opencypher.spark.schema.CAPSSchema
import org.opencypher.spark.schema.CAPSSchema._
import scala.collection.JavaConverters._
import scala.reflect.runtime.universe._
/**
* An entity table describes how to map an input data frame to a Cypher entity (i.e., a node or a relationship).
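*
* A minimal construction sketch (assumes a [[CAPSSession]] named `session` is
* in scope; the column names and data are illustrative):
* {{{
* val df = session.sparkSession
*   .createDataFrame(Seq((0L, "Alice"), (1L, "Bob")))
*   .toDF("id", "name")
* val mapping = NodeMapping.on("id").withImpliedLabel("Person").withPropertyKey("name")
* val nodeTable = CAPSNodeTable(mapping, df)
* }}}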
*/
sealed trait EntityTable[T <: CypherTable[String]] {
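// Runs on construction: every entity table must provide an integer id column.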
verify()
def schema: CAPSSchema
def mapping: EntityMapping
def table: T
protected def verify(): Unit = {
table.verifyColumnType(mapping.sourceIdKey, CTInteger, "id key")
}
}
object EntityTable {
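/**
* Wraps a Spark [[DataFrame]] as a [[CypherTable]]: Cypher column types are
* derived from the DataFrame schema via [[DataFrameOps]], and each row is
* exposed as a map from column name to [[CypherValue]].
*/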
implicit class SparkTable(val df: DataFrame) extends CypherTable[String] {
override def columns: Seq[String] = df.columns
override def columnType: Map[String, CypherType] = columns.map(c => c -> df.cypherTypeForColumn(c)).toMap
override def rows: Iterator[String => CypherValue] = df.toLocalIterator.asScala.map { row =>
columns.map(c => c -> CypherValue(row.get(row.fieldIndex(c)))).toMap
}
override def size: Long = df.count()
def cache(): SparkTable = df.cache()
def persist(): SparkTable = df.persist()
def persist(newLevel: StorageLevel): SparkTable = df.persist(newLevel)
def unpersist(): SparkTable = df.unpersist()
def unpersist(blocking: Boolean): SparkTable = df.unpersist(blocking)
}
}
trait CAPSEntityTable extends EntityTable[SparkTable] {
// TODO: create CTEntity type
private[spark] def entityType: CypherType with DefiniteCypherType = mapping.cypherType
private[spark] def records(implicit caps: CAPSSession): CAPSRecords = CAPSRecords.create(this)
}
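/**
* A node table backed by a Spark DataFrame, pairing the input data with the
* [[NodeMapping]] that interprets its columns.
*/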
case class CAPSNodeTable(mapping: NodeMapping, table: SparkTable) extends NodeTable(mapping, table) with CAPSEntityTable
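/**
* Constructs a [[CAPSNodeTable]] from case class instances by reflection: the
* labels are derived from the node class via [[Annotation]], and every column
* except the id is treated as a property.
*
* A usage sketch (the `Person` class is hypothetical):
* {{{
* case class Person(id: Long, name: String) extends Node
* implicit val session: CAPSSession = CAPSSession.local()
* val personTable = CAPSNodeTable(Seq(Person(0L, "Alice"), Person(1L, "Bob")))
* }}}
*/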
object CAPSNodeTable {
def apply[E <: Node : TypeTag](nodes: Seq[E])(implicit caps: CAPSSession): CAPSNodeTable = {
val nodeLabels = Annotation.labels[E]
val nodeDF = caps.sparkSession.createDataFrame(nodes)
val nodeProperties = properties(nodeDF.columns)
val nodeMapping = NodeMapping.create(nodeIdKey = GraphEntity.sourceIdKey, impliedLabels = nodeLabels, propertyKeys = nodeProperties)
CAPSNodeTable(nodeMapping, nodeDF)
}
private def properties(nodeColumnNames: Seq[String]): Set[String] = {
nodeColumnNames.filter(_ != GraphEntity.sourceIdKey).toSet
}
}
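/**
* A relationship table backed by a Spark DataFrame, pairing the input data
* with the [[RelationshipMapping]] that interprets its columns.
*/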
case class CAPSRelationshipTable(mapping: RelationshipMapping, table: SparkTable) extends RelationshipTable(mapping, table) with CAPSEntityTable
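/**
* Constructs a [[CAPSRelationshipTable]] from case class instances by
* reflection: the relationship type is derived from the class via
* [[Annotation]], and every column except id, source and target is treated as
* a property.
*
* A usage sketch (the `Knows` class is hypothetical):
* {{{
* case class Knows(id: Long, source: Long, target: Long, since: Int) extends Relationship
* implicit val session: CAPSSession = CAPSSession.local()
* val knowsTable = CAPSRelationshipTable(Seq(Knows(0L, 0L, 1L, 2017)))
* }}}
*/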
object CAPSRelationshipTable {
def apply[E <: Relationship : TypeTag](relationships: Seq[E])(implicit caps: CAPSSession): CAPSRelationshipTable = {
val relationshipType: String = Annotation.relType[E]
val relationshipDF = caps.sparkSession.createDataFrame(relationships)
val relationshipProperties = properties(relationshipDF.columns.toSet)
val relationshipMapping = RelationshipMapping.create(GraphEntity.sourceIdKey,
Relationship.sourceStartNodeKey,
Relationship.sourceEndNodeKey,
relationshipType,
relationshipProperties)
CAPSRelationshipTable(relationshipMapping, relationshipDF)
}
private def properties(relColumnNames: Set[String]): Set[String] = {
relColumnNames.filter(!Relationship.nonPropertyAttributes.contains(_))
}
}
/**
* A node table describes how to map an input data frame to a Cypher node.
*
* @param mapping mapping from input data description to a Cypher node
* @param table input data frame
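*
* The schema enumerates every combination of optional labels on top of the
* implied labels. For example (a sketch with illustrative column names), the
* mapping
* {{{
* NodeMapping.on("id")
*   .withImpliedLabel("Person")
*   .withOptionalLabel("Employee" -> "is_employee")
*   .withPropertyKey("name")
* }}}
* yields property-key entries for both label sets {Person} and
* {Person, Employee}, and `verify()` checks that each optional label column is
* boolean.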
*/
abstract class NodeTable[T <: CypherTable[String]](mapping: NodeMapping, table: T) extends EntityTable[T] {
override lazy val schema: CAPSSchema = {
val propertyKeys = mapping.propertyMapping.toSeq.map {
case (propertyKey, sourceKey) => propertyKey -> table.columnType(sourceKey)
}
mapping.optionalLabelMapping.keys.toSet.subsets
.map(_.union(mapping.impliedLabels))
.map(combo => Schema.empty.withNodePropertyKeys(combo.toSeq: _*)(propertyKeys: _*))
.reduce(_ ++ _)
.asCaps
}
override protected def verify(): Unit = {
super.verify()
mapping.optionalLabelMapping.values.foreach { optionalLabelKey =>
table.verifyColumnType(optionalLabelKey, CTBoolean, "optional label")
}
}
}
/**
* A relationship table describes how to map an input data frame to a Cypher relationship.
*
* @param mapping mapping from input data description to a Cypher relationship
* @param table input data frame
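*
* The relationship type is either fixed or read from a source column. A sketch
* of the column-based variant (column names are illustrative):
* {{{
* RelationshipMapping.on("id")
*   .from("source")
*   .to("target")
*   .withSourceRelTypeKey("rel_type", Set("KNOWS", "LIKES"))
*   .withPropertyKey("since")
* }}}
* The schema then contains property keys for every possible type, and
* `verify()` checks that the type column is a string.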
*/
abstract class RelationshipTable[T <: CypherTable[String]](mapping: RelationshipMapping, table: T) extends EntityTable[T] {
override lazy val schema: CAPSSchema = {
val relTypes = mapping.relTypeOrSourceRelTypeKey match {
case Left(name) => Set(name)
case Right((_, possibleTypes)) => possibleTypes
}
val propertyKeys = mapping.propertyMapping.toSeq.map {
case (propertyKey, sourceKey) => propertyKey -> table.columnType(sourceKey)
}
relTypes.foldLeft(Schema.empty) {
case (partialSchema, relType) => partialSchema.withRelationshipPropertyKeys(relType)(propertyKeys: _*)
}.asCaps
}
override protected def verify(): Unit = {
super.verify()
table.verifyColumnType(mapping.sourceStartNodeKey, CTInteger, "start node")
table.verifyColumnType(mapping.sourceEndNodeKey, CTInteger, "end node")
mapping.relTypeOrSourceRelTypeKey.right.foreach { key =>
table.verifyColumnType(key._1, CTString, "relationship type")
}
}
}