/*
 * Copyright (c) 2016-2018 "Neo4j, Inc." [https://neo4j.com]
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.opencypher.spark.impl.io.hdfs

import java.net.URI

import org.apache.hadoop.conf.Configuration
import org.apache.spark.sql.types._
import org.apache.spark.sql.{DataFrame, SparkSession, functions}
import org.opencypher.okapi.api.graph.PropertyGraph
import org.opencypher.okapi.api.io.conversion.{NodeMapping, RelationshipMapping}
import org.opencypher.spark.api.CAPSSession
import org.opencypher.spark.api.io.{CAPSNodeTable, CAPSRelationshipTable}
import org.opencypher.spark.impl.DataFrameOps._

/**
  * Loads a graph stored in indexed CSV format from HDFS or the local file system.
  * The CSV files must be stored following this schema (an example layout is shown below):
  * # Nodes
  *   - all files describing nodes are stored in a subfolder called "nodes"
  *   - create one file for each label combination that exists in the data, i.e. entities must not overlap
  * between files (e.g. all nodes with the labels :Person:Employee go in one file and all nodes with only the
  * label :Person in another file)
  *   - for every node CSV file create a schema file called FILE_NAME.csv.SCHEMA
  *   - for information about the structure of the node schema file see [[CsvNodeSchema]]
  * # Relationships
  *   - all files describing relationships are stored in a subfolder called "relationships"
  *   - create one CSV file per relationship type
  *   - for every relationship CSV file create a schema file called FILE_NAME.csv.SCHEMA
  *   - for information about the structure of the relationship schema file see [[CsvRelSchema]]
  *
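  * For illustration, a graph directory following the rules above might look as follows (the file
  * names are examples, not requirements):
  * {{{
  *   my-graph/
  *     nodes/
  *       person.csv
  *       person.csv.SCHEMA
  *       person_employee.csv
  *       person_employee.csv.SCHEMA
  *     relationships/
  *       knows.csv
  *       knows.csv.SCHEMA
  * }}}
  *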
  * @param fileHandler [[CsvFileHandler]] for HDFS or the local file system
  * @param capsSession CAPS session
  */
class CsvGraphLoader(fileHandler: CsvFileHandler)(implicit capsSession: CAPSSession) {

  private val sparkSession: SparkSession = capsSession.sparkSession

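  /** Loads all node and relationship tables and combines them into a single [[PropertyGraph]]. */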
  def load: PropertyGraph = capsSession.readFrom(loadNodes ++ loadRels: _*)

  private def loadNodes: List[CAPSNodeTable] = {
    val csvFiles = fileHandler.listNodeFiles.toList

    csvFiles.map(nodeFile => {
      val schema = parseSchema(nodeFile)(CsvNodeSchema(_))

      val intermediateDF = sparkSession.read
        .schema(schema.toStructType)
        .csv(nodeFile.toString)

      val dataFrame = convertLists(intermediateDF, schema)

      val nodeMapping = NodeMapping.create(schema.idField.name,
        impliedLabels = schema.implicitLabels.toSet,
        optionalLabels = schema.optionalLabels.map(_.name).toSet,
        propertyKeys = schema.propertyFields.map(_.name).toSet)

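      // Mark the id column and the optional label columns as non-nullable; every row is
      // expected to provide values for them.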
      val shouldBeNonNullable = schema.optionalLabels.map(_.name).toSet + schema.idField.name
      val dfWithCorrectNullability = shouldBeNonNullable.foldLeft(dataFrame)(_.setNonNullable(_))

      CAPSNodeTable(nodeMapping, dfWithCorrectNullability)
    })
  }

  private def loadRels: List[CAPSRelationshipTable] = {
    val csvFiles = fileHandler.listRelationshipFiles.toList

    csvFiles.map(relationshipFile => {

      val schema = parseSchema(relationshipFile)(CsvRelSchema(_))

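      // Timestamp-typed properties are parsed with an ISO-8601-like pattern
      // (e.g. 2007-12-03T10:15:30.000).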
      val intermediateDF = sparkSession.read
        .option("timestampFormat", "yyyy-MM-dd'T'HH:mm:ss.SSS")
        .schema(schema.toStructType)
        .csv(relationshipFile.toString)

      val dataFrame = convertLists(intermediateDF, schema)

      val relMapping = RelationshipMapping.create(schema.idField.name,
        schema.startIdField.name,
        schema.endIdField.name,
        schema.relationshipType,
        schema.propertyFields.map(_.name).toSet)

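      // The identity columns (relationship id, start node id, end node id) are mandatory,
      // so mark them non-nullable as well.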
      val shouldBeNonNullable = Set(schema.idField.name, schema.startIdField.name, schema.endIdField.name)
      val dfWithCorrectNullability = shouldBeNonNullable.foldLeft(dataFrame)(_.setNonNullable(_))

      CAPSRelationshipTable(relMapping, dfWithCorrectNullability)
    })
  }

  private def parseSchema[T <: CsvSchema](path: URI)(parser: String => T): T = {
    val text = fileHandler.readSchemaFile(path)
    parser(text)
  }

  /**
    * The Spark CSV reader does not support array fields. Thus all list fields are read as strings
    * and then converted manually.
    *
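    * For example, a raw list value such as `1|2|3` is split on the pipe character and cast to the
    * field's target type, yielding the array `[1, 2, 3]`.
    *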
    * @param dataFrame the input data frame with list columns represented as strings
    * @param schema    the data frame's CSV schema
    * @return data frame with proper array columns
    */
  private def convertLists(dataFrame: DataFrame, schema: CsvSchema): DataFrame = {
    schema.propertyFields
      .filter(field => field.getTargetType.isInstanceOf[ArrayType])
      .foldLeft(dataFrame) {
        case (df, field) =>
          df.safeReplaceColumn(field.name, functions.split(df(field.name), "\\|").cast(field.getTargetType))
      }
  }
}

object CsvGraphLoader {

  val NODES_DIRECTORY = "nodes"

  val RELS_DIRECTORY = "relationships"

  val CSV_SUFFIX = ".CSV"

  val SCHEMA_SUFFIX = ".SCHEMA"

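  /** Creates a loader that reads from HDFS (or another Hadoop-compatible file system) at the given location. */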
  def apply(location: URI, hadoopConfig: Configuration)(implicit caps: CAPSSession): CsvGraphLoader = {
    new CsvGraphLoader(new HadoopFileHandler(location, hadoopConfig))
  }

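  /** Creates a loader that reads from the local file system at the given location. */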
  def apply(location: URI)(implicit caps: CAPSSession): CsvGraphLoader = {
    new CsvGraphLoader(new LocalFileHandler(location))
  }
}
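
// A minimal usage sketch (the local session setup and the graph path are assumptions for
// illustration, not part of this file):
//
//   implicit val caps: CAPSSession = CAPSSession.local()
//   val graph: PropertyGraph = CsvGraphLoader(new URI("file:///tmp/my-graph")).load
//   graph.cypher("MATCH (n) RETURN n").show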