package overflowdb.formats.neo4jcsv
import com.github.tototoshi.csv._
import overflowdb.formats.{ExportResult, Exporter, writeFile}
import overflowdb.{Edge, Node}
import java.nio.file.Path
import scala.collection.mutable
import scala.jdk.CollectionConverters.CollectionHasAsScala
import scala.jdk.OptionConverters.RichOptional
import scala.util.Using
object Neo4jCsvExporter extends Exporter {
override def defaultFileExtension = "csv"
  /** Exports an OverflowDB graph to neo4j csv files, see
    * https://neo4j.com/docs/operations-manual/current/tools/neo4j-admin/neo4j-admin-import/
    *
    * For both nodes and relationships we first write the data file, deriving the property types from their runtime
    * types as we go. We write columns for all declared properties, because we only know which ones are actually in
    * use *after* traversing all elements.
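    *
    * Example usage (a minimal sketch, not from the original docs; assumes an open `overflowdb.Graph` named `graph`
    * whose `nodes()`/`edges()` return java iterators):
    * {{{
    * import scala.jdk.CollectionConverters.IteratorHasAsScala
    * val result = Neo4jCsvExporter.runExport(
    *   graph.nodes().asScala,
    *   graph.edges().asScala,
    *   java.nio.file.Paths.get("/tmp/neo4j-export")
    * )
    * result.additionalInfo.foreach(println)
    * }}}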
*/
override def runExport(nodes: IterableOnce[Node], edges: IterableOnce[Edge], outputRootDirectory: Path) = {
    val nodesByLabel = nodes.iterator.toSeq.groupBy(_.label)
    val CountAndFiles(nodeCount, nodeFiles) = nodesByLabel
      .map { case (label, nodes) =>
        exportNodes(nodes, label, outputRootDirectory)
      }
      .foldLeft(CountAndFiles(0, Seq.empty))(_.plus(_)) // foldLeft rather than reduce, so an empty graph exports cleanly
val CountAndFiles(edgeCount, edgeFiles) = exportEdges(edges, outputRootDirectory)
val outputRootAbsolute = outputRootDirectory.toAbsolutePath
ExportResult(
nodeCount = nodeCount,
edgeCount = edgeCount,
files = nodeFiles ++ edgeFiles,
additionalInfo = Option(s"""Instructions on how to import the exported files into neo4j:
|Prerequisite: ensure you have neo4j community server running (enterprise and desktop may work too)
|e.g. download from https://neo4j.com/download-center/#community and start via `bin/neo4j console`
|
|Then, in a new terminal:
|```
        |cd <neo4j_root>
|
|# if you have a fresh instance, you must first change the initial password
|bin/cypher-shell -u neo4j -p neo4j
|# exit the cypher shell
|
|# copy the data files to the `import` directory, where neo4j will find them
|cp $outputRootAbsolute/*$DataFileSuffix.csv import
|
        |find $outputRootAbsolute -name 'nodes_*_cypher.csv' -exec bin/cypher-shell -u neo4j -p <password> --file {} \\;
        |find $outputRootAbsolute -name 'edges_*_cypher.csv' -exec bin/cypher-shell -u neo4j -p <password> --file {} \\;
|```
|""".stripMargin)
)
}
private def exportNodes(nodes: IterableOnce[Node], label: String, outputRootDirectory: Path): CountAndFiles = {
val dataFile = outputRootDirectory.resolve(s"nodes_$label$DataFileSuffix.csv")
val headerFile = outputRootDirectory.resolve(
s"nodes_$label$HeaderFileSuffix.csv"
    ) // to be written at the very end, once the column definitions are complete
val cypherFile = outputRootDirectory.resolve(s"nodes_$label$CypherFileSuffix.csv")
    // will be initialized with the first node; callers only pass non-empty `nodes`
var columnDefinitions: ColumnDefinitions = null
var nodeCount = 0
Using.resource(CSVWriter.open(dataFile.toFile, append = false)) { writer =>
nodes.iterator.foreach { node =>
if (columnDefinitions == null) columnDefinitions = new ColumnDefinitions(node.propertyKeys.asScala)
val specialColumns = Seq(node.id.toString, node.label)
val propertyValueColumns = columnDefinitions.propertyValues(node.propertyOption(_).toScala)
writer.writeRow(specialColumns ++ propertyValueColumns)
nodeCount += 1
}
}
writeSingleLineCsv(headerFile, Seq(ColumnType.Id, ColumnType.Label) ++ columnDefinitions.propertiesWithTypes)
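    // the header just written pairs the neo4j-admin import markers with typed property columns,
    // e.g. (illustrative, for a node label with a single string property `name`): `:ID,:LABEL,name:string`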
// write cypher file for import into neo4j
// starting with index=2, because 0|1 are taken by 'special' columns Id|Label
val cypherPropertyMappings = columnDefinitions.propertiesMappingsForCypher(startIndex = 2).mkString(",\n")
val cypherQuery =
s"""LOAD CSV FROM 'file:/nodes_${label}_data.csv' AS line
|CREATE (:$label {
|id: toInteger(line[0]),
|$cypherPropertyMappings
|});
|""".stripMargin
writeFile(cypherFile, cypherQuery)
CountAndFiles(nodeCount, Seq(headerFile, dataFile, cypherFile))
}
/** write edges of all labels */
private def exportEdges(edges: IterableOnce[Edge], outputRootDirectory: Path): CountAndFiles = {
val edgeFilesContextByLabel = mutable.Map.empty[String, EdgeFilesContext]
var count = 0
edges.iterator.foreach { edge =>
val label = edge.label
val context = edgeFilesContextByLabel.getOrElseUpdate(
label, {
// first time we encounter an edge of this type - create the columnMapping and write the header file
val headerFile = outputRootDirectory.resolve(
s"edges_$label$HeaderFileSuffix.csv"
        ) // to be written at the very end, once the column definitions are complete
val dataFile = outputRootDirectory.resolve(s"edges_$label$DataFileSuffix.csv")
val cypherFile = outputRootDirectory.resolve(s"edges_$label$CypherFileSuffix.csv")
val dataFileWriter = CSVWriter.open(dataFile.toFile, append = false)
val columnDefinitions = new ColumnDefinitions(edge.propertyKeys.asScala)
EdgeFilesContext(label, headerFile, dataFile, cypherFile, dataFileWriter, columnDefinitions)
}
)
val specialColumns = Seq(edge.outNode.id.toString, edge.inNode.id.toString, edge.label)
val propertyValueColumns = context.columnDefinitions.propertyValues(edge.propertyOption(_).toScala)
context.dataFileWriter.writeRow(specialColumns ++ propertyValueColumns)
count += 1
}
val files = edgeFilesContextByLabel.values.flatMap {
case EdgeFilesContext(label, headerFile, dataFile, cypherFile, dataFileWriter, columnDefinitions) =>
writeSingleLineCsv(
headerFile,
Seq(ColumnType.StartId, ColumnType.EndId, ColumnType.Type) ++ columnDefinitions.propertiesWithTypes
)
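        // the header just written uses the neo4j-admin relationship markers,
        // e.g. (illustrative, for a single int property `weight`): `:START_ID,:END_ID,:TYPE,weight:int`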
dataFileWriter.flush()
dataFileWriter.close()
// write cypher file for import into neo4j
// starting with index=3, because 0|1|2 are taken by 'special' columns StartId|EndId|Type
val cypherPropertyMappings = columnDefinitions.propertiesMappingsForCypher(startIndex = 3).mkString(",\n")
val cypherQuery =
s"""LOAD CSV FROM 'file:/edges_${label}_data.csv' AS line
|MATCH (a), (b)
|WHERE a.id = toInteger(line[0]) AND b.id = toInteger(line[1])
|CREATE (a)-[r:$label {$cypherPropertyMappings}]->(b);
|""".stripMargin
writeFile(cypherFile, cypherQuery)
Seq(headerFile, dataFile, cypherFile)
}.toSeq
CountAndFiles(count, files)
}
private def writeSingleLineCsv(outputFile: Path, entries: Seq[Any]): Unit = {
Using.resource(CSVWriter.open(outputFile.toFile, append = false)) { writer =>
writer.writeRow(entries)
}
}
private case class EdgeFilesContext(
label: String,
headerFile: Path,
dataFile: Path,
cypherFile: Path,
dataFileWriter: CSVWriter,
columnDefinitions: ColumnDefinitions
)
case class CountAndFiles(count: Int, files: Seq[Path]) {
def plus(other: CountAndFiles): CountAndFiles =
CountAndFiles(count + other.count, files ++ other.files)
}
}