// com.atlan.pkg.cab.FieldImporter.kt
/* SPDX-License-Identifier: Apache-2.0
   Copyright 2023 Atlan Pte. Ltd. */
package com.atlan.pkg.cab

import com.atlan.model.assets.Asset
import com.atlan.model.assets.CubeField
import com.atlan.model.enums.AssetCreationHandling
import com.atlan.model.fields.AtlanField
import com.atlan.pkg.serde.RowDeserializer
import com.atlan.pkg.serde.csv.ImportResults
import com.atlan.util.StringUtils
import mu.KotlinLogging
import java.util.concurrent.atomic.AtomicLong
import kotlin.math.max

/**
 * Import cube fields into Atlan from a provided CSV file.
 *
 * Only the cube fields and attributes in the provided CSV file will attempt to be loaded.
 * By default, any blank values in a cell in the CSV file will be ignored. If you would like any
 * particular cube field's blank values to actually overwrite (i.e. remove) existing values for that
 * asset in Atlan, then add that cube field's field to getAttributesToOverwrite.
 *
 * @param preprocessed details of the preprocessed CSV file
 * @param attrsToOverwrite list of fields that should be overwritten in Atlan, if their value is empty in the CSV
 * @param creationHandling what to do with assets that do not exist (create full, partial, or ignore)
 * @param batchSize maximum number of records to save per API request
 * @param connectionImporter that was used to import connections
 * @param trackBatches if true, minimal details about every asset created or updated is tracked (if false, only counts of each are tracked)
 * @param fieldSeparator character to use to separate fields (for example ',' or ';')
 */
class FieldImporter(
    private val preprocessed: Importer.Results,
    private val attrsToOverwrite: List<AtlanField>,
    private val creationHandling: AssetCreationHandling,
    private val batchSize: Int,
    private val connectionImporter: ConnectionImporter,
    trackBatches: Boolean,
    fieldSeparator: Char,
) : AssetImporter(
        preprocessed.preprocessedFile,
        attrsToOverwrite,
        creationHandling,
        batchSize,
        CubeField.TYPE_NAME,
        KotlinLogging.logger {},
        trackBatches,
        fieldSeparator,
    ) {
    // Level assigned to a field that has no children (levels count upwards from the leaves)
    private val leafNodeLevel = 1L

    // Generation currently being loaded; advanced by one on each pass of import()
    private var generationToProcess = 0L

    // Maximum depth of any field in the CSV (overall and by hierarchy)
    private val maxFieldGeneration = AtomicLong(1)
    private val maxLevelByPath: MutableMap<String, AtomicLong> = mutableMapOf()

    /**
     * {@inheritDoc}
     *
     * For rows whose type matches the type name filter, this records the field's path
     * (as a leaf, unless already seen), bubbles the resulting level up through its
     * ancestors and tracks the deepest generation seen so far. The row itself is
     * returned unchanged.
     */
    override fun preprocessRow(
        row: List<String>,
        header: List<String>,
        typeIdx: Int,
        qnIdx: Int,
    ): List<String> {
        if (row[typeIdx] == typeNameFilter) {
            // Only build up the details if this is in fact a field row
            val hierarchyPath = getHierarchyPath(row, header)
            val path = getFieldPath(hierarchyPath, row, header)
            // If path not yet seen, treat it as a leaf (for now); an existing level is left untouched
            maxLevelByPath.getOrPut(path) { AtomicLong(leafNodeLevel) }
            bubbleUpParentLevel(path, hierarchyPath)
            // Raise the maximum depth of fields we need to load, if this row is deeper
            val fieldGeneration = getFieldGeneration(row, header)
            maxFieldGeneration.updateAndGet { current -> max(current, fieldGeneration) }
        }
        return row
    }

    /**
     * Recursively bubble-up setting the parent level(s) based on lower-field level updates.
     *
     * @param path of the field from which to bubble up levels
     * @param hierarchyPath path of the hierarchy for the field
     */
    private fun bubbleUpParentLevel(
        path: String,
        hierarchyPath: String,
    ) {
        // Short-circuit once we reach hierarchy level (no need to bubble up further)
        if (path == hierarchyPath) return

        // The entry for `path` is registered by the caller before we get here (either by
        // preprocessRow or by the previous recursion step); the fallback is purely defensive.
        val levelFromThisChild = (maxLevelByPath[path]?.get() ?: leafNodeLevel) + 1
        val parentPath =
            StringUtils.getParentQualifiedNameFromQualifiedName(path, Importer.QN_DELIMITER)
                ?: return

        // Logic for level calculation:
        //  - If there are no children, level = leafNodeLevel (1)
        //  - Else level = max(child) + 1
        val parentLevel = maxLevelByPath.getOrPut(parentPath) { AtomicLong(leafNodeLevel) }
        if (levelFromThisChild >= parentLevel.get()) {
            parentLevel.set(levelFromThisChild)
        }
        bubbleUpParentLevel(parentPath, hierarchyPath)
    }

    /**
     * {@inheritDoc}
     *
     * Runs one pass of the parent import per generation, from generation 1 up to the
     * maximum generation found during preprocessing, and combines the results of all passes.
     */
    override fun import(columnsToSkip: Set<String>): ImportResults? {
        // Import fields by generation, top-to-bottom, up to the deepest generation seen in preprocessing
        logger.info { "Loading fields in multiple passes, by generation..." }
        var combinedResults: ImportResults? = null
        while (generationToProcess < maxFieldGeneration.get()) {
            generationToProcess += 1
            logger.info { "--- Loading generation $generationToProcess fields... ---" }
            val results = super.import(columnsToSkip)
            if (combinedResults == null) {
                combinedResults = results
            } else if (results != null) {
                combinedResults = combinedResults.combinedWith(results)
            }
        }
        return combinedResults
    }

    /**
     * {@inheritDoc}
     *
     * A row is included only when it is long enough to hold the type, name and parent
     * columns, its type matches the type name filter, and its generation is the one
     * currently being processed.
     */
    override fun includeRow(
        row: List<String>,
        header: List<String>,
        typeIdx: Int,
        qnIdx: Int,
    ): Boolean {
        val nameIdx = header.indexOf(FIELD_NAME)
        val parentIdx = header.indexOf(PARENT_FIELD_QN)

        val maxBound = max(typeIdx, max(nameIdx, parentIdx))
        // Valid indices are 0 until row.size, so an index equal to row.size is already out of bounds
        if (maxBound >= row.size || row[typeIdx] != typeNameFilter) {
            // If any of the columns are beyond the size of the row, or the row
            // represents something other than a field, short-circuit
            return false
        }
        // Only include fields of the generation we are currently processing
        // (the type was already verified by the guard above)
        return getFieldGeneration(row, header) == generationToProcess
    }

    /**
     * {@inheritDoc}
     *
     * Builds a cube field from the row's field name and parent qualified name, and sets
     * its level, its generation (the generation currently being processed) and its
     * sub-field count (looked up from the preprocessed results).
     */
    override fun getBuilder(deserializer: RowDeserializer): Asset.AssetBuilder<*, *> {
        val name = deserializer.getValue(FIELD_NAME)?.let { it as String } ?: ""
        val connectionQN = connectionImporter.getBuilder(deserializer).build().qualifiedName
        val qnDetails = getQualifiedNameDetails(deserializer.row, deserializer.heading, typeNameFilter)
        val parentQN = "$connectionQN/${qnDetails.parentPartialQN}"
        val level = getFieldLevel(deserializer.row, deserializer.heading)
        return CubeField.creator(name, parentQN)
            .cubeFieldLevel(level)
            .cubeFieldGeneration(generationToProcess)
            .cubeSubFieldCount(preprocessed.qualifiedNameToChildCount[qnDetails.uniqueQN]?.toLong())
    }

    /**
     * Calculate the generation of the field in a given row of the CSV.
     *
     * A field with no (or a blank) parent is generation 1; otherwise the generation is
     * one more than the number of delimiter-separated segments in the parent's path.
     *
     * @param row of values in the CSV
     * @param header names of columns for the CSV
     * @return numeric generation of the (nested) field
     */
    private fun getFieldGeneration(
        row: List<String>,
        header: List<String>,
    ): Long {
        val parentIdx = header.indexOf(PARENT_FIELD_QN)
        val parent = if (parentIdx >= 0) row[parentIdx] else ""
        return if (parent.isBlank()) {
            1L
        } else {
            (parent.split(Importer.QN_DELIMITER).size + 1).toLong()
        }
    }

    /**
     * Calculate the level of the field in a given row of the CSV.
     *
     * The level is looked up from the values gathered during preprocessing, falling
     * back to the leaf level when the field's path was never recorded.
     *
     * @param row of values in the CSV
     * @param header names of columns for the CSV
     * @return numeric level of the (nested) field
     */
    private fun getFieldLevel(
        row: List<String>,
        header: List<String>,
    ): Long {
        val path = getFieldPath(getHierarchyPath(row, header), row, header)
        return maxLevelByPath[path]?.get() ?: leafNodeLevel
    }

    /**
     * Calculate the full path for the field on the provided row.
     *
     * @param hierarchyPath path of the hierarchy for the field
     * @param row of values in the CSV
     * @param header names of columns for the CSV
     * @return unique path for the field on the row
     */
    private fun getFieldPath(
        hierarchyPath: String,
        row: List<String>,
        header: List<String>,
    ): String {
        val parentIdx = header.indexOf(PARENT_FIELD_QN)
        val nameIdx = header.indexOf(FIELD_NAME)
        val parentPath = if (parentIdx >= 0) row[parentIdx] else ""
        return if (parentPath.isBlank()) {
            "$hierarchyPath${Importer.QN_DELIMITER}${row[nameIdx]}"
        } else {
            "$hierarchyPath${Importer.QN_DELIMITER}$parentPath${Importer.QN_DELIMITER}${row[nameIdx]}"
        }
    }

    /**
     * Calculate the hierarchy path for the field on the provided row.
     *
     * @param row of values in the CSV
     * @param header names of columns for the CSV
     * @return unique path for the hierarchy of the field on the row
     */
    private fun getHierarchyPath(
        row: List<String>,
        header: List<String>,
    ): String {
        val cubeIdx = header.indexOf(CUBE_NAME)
        val dimIdx = header.indexOf(DIMENSION_NAME)
        val hierIdx = header.indexOf(HIERARCHY_NAME)
        return "${row[cubeIdx]}${Importer.QN_DELIMITER}${row[dimIdx]}${Importer.QN_DELIMITER}${row[hierIdx]}"
    }

    companion object {
        const val PARENT_FIELD_QN = "parentFieldQualifiedName"
        const val FIELD_NAME = "fieldName"
        const val CUBE_NAME = "cubeName"
        const val DIMENSION_NAME = "cubeDimensionName"
        const val HIERARCHY_NAME = "cubeHierarchyName"
    }
}