All downloads are free. Search and download functionality uses the official Maven repository.

com.atlan.pkg.rab.Importer.kt Maven / Gradle / Ivy

/* SPDX-License-Identifier: Apache-2.0
   Copyright 2023 Atlan Pte. Ltd. */
package com.atlan.pkg.rab

import RelationalAssetsBuilderCfg
import com.atlan.model.assets.Asset
import com.atlan.model.assets.Column
import com.atlan.model.assets.Connection
import com.atlan.model.assets.Database
import com.atlan.model.assets.MaterializedView
import com.atlan.model.assets.Schema
import com.atlan.model.assets.Table
import com.atlan.model.assets.View
import com.atlan.model.enums.AssetCreationHandling
import com.atlan.pkg.Utils
import com.atlan.pkg.cache.ConnectionCache
import com.atlan.pkg.cache.LinkCache
import com.atlan.pkg.cache.TermCache
import com.atlan.pkg.rab.AssetImporter.Companion.getQualifiedNameDetails
import com.atlan.pkg.serde.FieldSerde
import com.atlan.pkg.serde.csv.CSVImporter
import de.siegmar.fastcsv.reader.CsvReader
import de.siegmar.fastcsv.writer.CsvWriter
import mu.KotlinLogging
import java.nio.file.Paths
import java.nio.file.StandardOpenOption
import java.util.concurrent.atomic.AtomicInteger
import kotlin.system.exitProcess

/**
 * Actually run the importer.
 * Note: all parameters should be passed through environment variables.
 */
object Importer {
    private val logger = KotlinLogging.logger {}

    /**
     * Command-line entry point for the importer.
     *
     * Obtains the package configuration via [Utils.setPackageOps] and hands it to [import].
     *
     * @param args optional command-line arguments; when present, the first one is used as the
     *             output directory, otherwise `tmp` is used
     */
    @JvmStatic
    fun main(args: Array<String>) {
        val outputDirectory = args.firstOrNull() ?: "tmp"
        // The expected type is stated explicitly so the configuration resolves to the type that
        // import(...) requires (assumes setPackageOps can produce it — confirm against Utils).
        val config: RelationalAssetsBuilderCfg = Utils.setPackageOps()
        import(config, outputDirectory)
    }

    /**
     * Import the relational assets described by the configured CSV file.
     *
     * The input file is first preprocessed (see [preprocessCSV]), the caches that the file needs
     * are preloaded, and then each asset type is imported in turn: connections, databases,
     * schemas, tables, views, materialized views and finally columns.
     *
     * If no input file has been provided, an error is logged and the process exits with code 1.
     *
     * @param config configuration for this run of the package
     * @param outputDirectory directory handed to [Utils.getInputFile] when resolving the input file
     */
    fun import(config: RelationalAssetsBuilderCfg, outputDirectory: String = "tmp") {
        // Resolve every setting up-front, falling back to its default when not configured
        val maxBatch = Utils.getOrDefault(config.assetsBatchSize, 20).toInt()
        val separator = Utils.getOrDefault(config.assetsFieldSeparator, ",")[0]
        val directUpload = Utils.getOrDefault(config.importType, "DIRECT") == "DIRECT"
        val uploadedFile = Utils.getOrDefault(config.assetsFile, "")
        val cloudSource = Utils.getOrDefault(config.cloudSource, "")
        val objectKey = Utils.getOrDefault(config.assetsKey, "")
        val attrsToClear =
            CSVImporter.attributesToClear(Utils.getOrDefault(config.assetsAttrToOverwrite, listOf()).toMutableList(), "assets", logger)
        val failOnErrors = Utils.getOrDefault(config.assetsFailOnErrors, true)
        val creationHandling = Utils.getCreationHandling(config.assetsUpsertSemantic, AssetCreationHandling.FULL)
        val track = Utils.getOrDefault(config.trackBatches, true)

        // A direct upload needs a filename; any other import type needs both cloud details and a key
        val inputSupplied =
            if (directUpload) {
                uploadedFile.isNotBlank()
            } else {
                cloudSource.isNotBlank() && objectKey.isNotBlank()
            }
        if (!inputSupplied) {
            logger.error { "No input file was provided for assets." }
            exitProcess(1)
        }

        // A first pass over the CSV injects key details into a revised copy of the file,
        // which is what makes the later out-of-order parallel processing possible
        val prefix = Utils.getOrDefault(config.assetsPrefix, "")
        val inputFile = Utils.getInputFile(uploadedFile, outputDirectory, directUpload, prefix, objectKey)
        val preprocessed = preprocessCSV(inputFile, separator)

        // Links and terms are only cached when the CSV actually contains them,
        // since preloading them otherwise would be wasted effort
        if (preprocessed.hasLinks) LinkCache.preload()
        if (preprocessed.hasTermAssignments) TermCache.preload()

        ConnectionCache.preload()

        FieldSerde.FAIL_ON_ERRORS.set(failOnErrors)
        logger.info { "=== Importing assets... ===" }

        logger.info { " --- Importing connections... ---" }
        // Batch tracking is forced on for connections (with a batch size of 1): without tracking,
        // created connections would not be cached, which would break the steps that follow.
        val connections = ConnectionImporter(preprocessed, attrsToClear, creationHandling, 1, true, separator)
        connections.import()

        logger.info { " --- Importing databases... ---" }
        DatabaseImporter(preprocessed, attrsToClear, creationHandling, maxBatch, connections, track, separator).import()

        logger.info { " --- Importing schemas... ---" }
        SchemaImporter(preprocessed, attrsToClear, creationHandling, maxBatch, connections, track, separator).import()

        logger.info { " --- Importing tables... ---" }
        TableImporter(preprocessed, attrsToClear, creationHandling, maxBatch, connections, track, separator).import()

        logger.info { " --- Importing views... ---" }
        ViewImporter(preprocessed, attrsToClear, creationHandling, maxBatch, connections, track, separator).import()

        logger.info { " --- Importing materialized views... ---" }
        MaterializedViewImporter(preprocessed, attrsToClear, creationHandling, maxBatch, connections, track, separator).import()

        logger.info { " --- Importing columns... ---" }
        ColumnImporter(preprocessed, attrsToClear, creationHandling, maxBatch, connections, track, separator).import()
    }

    /**
     * Make a single pass over the assets CSV, writing an enriched copy of it next to the original.
     *
     * The copy is written to `<originalFile>.CSA_RAB.csv` and has two extra trailing columns:
     * the column order and the parent's partial qualifiedName. These are filled in for column
     * rows and left blank for connection, database, schema, table, view and materialized view rows.
     * The order of a column is its position within a run of consecutive rows sharing the same
     * parent, restarting at 1 whenever the parent changes.
     *
     * While streaming, the pass also records whether the header contains links or assigned terms,
     * the type of every table / view / materialized view, and per-parent counts of rows.
     *
     * @param originalFile path of the CSV file to preprocess
     * @param fieldSeparator character that separates fields in the CSV
     * @return the details collected during the pass, including the path of the enriched file
     * @throws IllegalArgumentException if the asset type of a data row cannot be read
     */
    private fun preprocessCSV(originalFile: String, fieldSeparator: Char): PreprocessedCsv {
        // Setup
        val quoteCharacter = '"'
        val inputFile = Paths.get(originalFile)
        val revisedFile = Paths.get("$originalFile.CSA_RAB.csv")

        // Configure the CSV reader and writer (neither is opened until the `use` blocks below)
        val readerBuilder = CsvReader.builder()
            .fieldSeparator(fieldSeparator)
            .quoteCharacter(quoteCharacter)
            .skipEmptyLines(true)
            .ignoreDifferentFieldCount(false)
        val writerBuilder = CsvWriter.builder()
            .fieldSeparator(fieldSeparator)
            .quoteCharacter(quoteCharacter)

        var hasLinks = false
        var hasTermAssignments = false
        val entityQualifiedNameToType = mutableMapOf<String, String>()
        val qualifiedNameToChildCount = mutableMapOf<String, AtomicInteger>()
        val qualifiedNameToTableCount = mutableMapOf<String, AtomicInteger>()
        val qualifiedNameToViewCount = mutableMapOf<String, AtomicInteger>()
        var header: MutableList<String> = mutableListOf()
        var typeIdx = 0
        var lastParentQN = ""
        var columnOrder = 1

        // Both resources are managed with `use`, so the writer is closed (and the output file
        // handle released) even when reading or processing a row throws.
        writerBuilder.build(
            revisedFile,
            StandardOpenOption.CREATE,
            StandardOpenOption.TRUNCATE_EXISTING,
            StandardOpenOption.WRITE,
        ).use { writer ->
            readerBuilder.ofCsvRecord(inputFile).use { records ->
                records.stream().forEach { row ->
                    if (row.startingLineNumber == 1L) {
                        // The record starting on line 1 is treated as the header
                        header = row.fields.toMutableList()
                        // Inject two columns at the end that we need for column assets
                        header.add(Column.ORDER.atlanFieldName)
                        header.add(ColumnImporter.COLUMN_PARENT_QN)
                        hasLinks = header.contains(Asset.LINKS.atlanFieldName)
                        hasTermAssignments = header.contains("assignedTerms")
                        typeIdx = header.indexOf(Asset.TYPE_NAME.atlanFieldName)
                        writer.writeRecord(header)
                    } else {
                        val values = row.fields.toMutableList()
                        // typeIdx is -1 when the header has no type name column: fail with a
                        // descriptive message rather than a bare index-out-of-bounds error.
                        val typeName = values.getOrNull(typeIdx)
                            ?: throw IllegalArgumentException(
                                "Unable to read the asset type for the row starting on line ${row.startingLineNumber} of " +
                                    "$originalFile: is the '${Asset.TYPE_NAME.atlanFieldName}' column present?",
                            )
                        val qnDetails = getQualifiedNameDetails(values, header, typeName)
                        // Tables and views are tallied separately below, so exclude them here
                        if (typeName !in setOf(Table.TYPE_NAME, View.TYPE_NAME, MaterializedView.TYPE_NAME)) {
                            qualifiedNameToChildCount
                                .getOrPut(qnDetails.parentUniqueQN) { AtomicInteger(0) }
                                .incrementAndGet()
                        }
                        // NOTE(review): there is no `else` branch, so a row with any other type is
                        // written without the two extra values and will be shorter than the
                        // header — confirm whether that is intended.
                        when (typeName) {
                            Connection.TYPE_NAME, Database.TYPE_NAME, Schema.TYPE_NAME -> {
                                values.add("")
                                values.add("")
                            }
                            Table.TYPE_NAME -> {
                                qualifiedNameToTableCount
                                    .getOrPut(qnDetails.parentUniqueQN) { AtomicInteger(0) }
                                    .incrementAndGet()
                                entityQualifiedNameToType[qnDetails.uniqueQN] = typeName
                                values.add("")
                                values.add("")
                            }
                            View.TYPE_NAME, MaterializedView.TYPE_NAME -> {
                                qualifiedNameToViewCount
                                    .getOrPut(qnDetails.parentUniqueQN) { AtomicInteger(0) }
                                    .incrementAndGet()
                                entityQualifiedNameToType[qnDetails.uniqueQN] = typeName
                                values.add("")
                                values.add("")
                            }
                            Column.TYPE_NAME -> {
                                // If it is a column, calculate the order and parent qualifiedName and inject them
                                if (qnDetails.parentUniqueQN == lastParentQN) {
                                    columnOrder += 1
                                } else {
                                    lastParentQN = qnDetails.parentUniqueQN
                                    columnOrder = 1
                                }
                                values.add("$columnOrder")
                                values.add(qnDetails.parentPartialQN)
                            }
                        }
                        writer.writeRecord(values)
                    }
                }
            }
        }

        return PreprocessedCsv(
            hasLinks,
            hasTermAssignments,
            revisedFile.toString(),
            entityQualifiedNameToType,
            qualifiedNameToChildCount,
            qualifiedNameToTableCount,
            qualifiedNameToViewCount,
        )
    }

    /**
     * Details collected while preprocessing the assets CSV file.
     *
     * @property hasLinks whether the CSV header contains a links column
     * @property hasTermAssignments whether the CSV header contains an `assignedTerms` column
     * @property preprocessedFile path of the enriched copy of the CSV file
     * @property entityQualifiedNameToType type name of each table, view and materialized view,
     *           keyed by the asset's unique qualifiedName
     * @property qualifiedNameToChildCount number of rows that are not tables, views or
     *           materialized views, keyed by the unique qualifiedName of their parent
     * @property qualifiedNameToTableCount number of table rows, keyed by the unique qualifiedName
     *           of their parent
     * @property qualifiedNameToViewCount number of view and materialized view rows, keyed by the
     *           unique qualifiedName of their parent
     */
    data class PreprocessedCsv(
        val hasLinks: Boolean,
        val hasTermAssignments: Boolean,
        val preprocessedFile: String,
        val entityQualifiedNameToType: Map<String, String>,
        val qualifiedNameToChildCount: Map<String, AtomicInteger>,
        val qualifiedNameToTableCount: Map<String, AtomicInteger>,
        val qualifiedNameToViewCount: Map<String, AtomicInteger>,
    )
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy