com.atlan.pkg.serde.csv.CSVWriter.kt Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of package-toolkit-runtime Show documentation
Atlan custom package runtime toolkit
There is a newer version: 4.2.0
Show newest version
/* SPDX-License-Identifier: Apache-2.0
   Copyright 2023 Atlan Pte. Ltd. */
package com.atlan.pkg.serde.csv

import com.atlan.model.assets.Asset
import com.atlan.pkg.Utils
import de.siegmar.fastcsv.writer.CsvWriter
import de.siegmar.fastcsv.writer.LineDelimiter
import de.siegmar.fastcsv.writer.QuoteStrategies
import mu.KLogger
import java.io.Closeable
import java.io.IOException
import java.util.concurrent.ConcurrentHashMap
import java.util.concurrent.atomic.AtomicLong
import java.util.stream.Stream

/**
 * CSV file writer, using a specific field separator character.
 *
 * @param path location and filename of the CSV file to produce
 * @param fieldSeparator character to use to separate fields (for example ',' or ';')
 */
class CSVWriter
    @JvmOverloads
    constructor(path: String, fieldSeparator: Char = ',') : Closeable {
        private val writer =
            CsvWriter.builder()
                .fieldSeparator(fieldSeparator)
                .quoteCharacter('"')
                .quoteStrategy(QuoteStrategies.NON_EMPTY)
                .lineDelimiter(LineDelimiter.PLATFORM)
                .build(ThreadSafeWriter(path))

        private val header = mutableListOf()

        /**
         * Write a header row into the CSV file.
         *
         * @param values to use for the header
         */
        fun writeHeader(values: Iterable) {
            header.addAll(values)
            writeRecord(values)
        }

        /**
         * Write a row of data into the CSV file, where key of the map is the column name and the value
         * is the value to write for that column of the row of data.
         * Note: be sure you have first called {@code writeHeader} to output the header row.
         *
         * @param values map keyed by column name with values for the row of data
         */
        fun writeRecord(values: Map?) {
            if (values != null) {
                val list = mutableListOf()
                header.forEach { name ->
                    list.add(values.getOrDefault(name, "") ?: "")
                }
                writeRecord(list)
            }
        }

        /**
         * Write a row of data into the CSV file, where the values are already sequenced
         * in the same order as the header columns.
         *
         * @param values to use for the row of data
         */
        fun writeRecord(values: Iterable?) {
            if (values != null) {
                synchronized(writer) { writer.writeRecord(values) }
            }
        }

        /**
         * Parallel-write the provided asset stream into the CSV file.
         * (For the highest performance, we recommend sending in a parallel stream of assets.)
         *
         * @param stream of assets, typically from a FluentSearch (parallel stream recommended)
         * @param assetToRow translator from an asset object to a row of CSV values
         * @param totalAssetCount the total number of assets that will be output (used for logging / completion tracking)
         * @param pageSize the page size being used by the asset stream
         * @param logger through which to report the overall progress
         */
        fun streamAssets(
            stream: Stream,
            assetToRow: RowGenerator,
            totalAssetCount: Long,
            pageSize: Int,
            logger: KLogger,
        ) {
            logger.info { "Extracting a total of $totalAssetCount assets..." }
            val count = AtomicLong(0)
            val map = ConcurrentHashMap()
            stream.forEach { a: Asset ->
                writeAsset(a, assetToRow, count, totalAssetCount, pageSize, map, logger)
            }
            logger.info { "Total unique assets extracted: ${map.size}" }
        }

        /**
         * Append assets that have already been retrieved (not being streamed) into the CSV file.
         * This is useful, for example, where information is cached up-front and thus need not be re-retrieved.
         *
         * @param list of assets, pre-retrieved
         * @param assetToRow translator from an asset object into a row of CSV values
         * @param totalAssetCount the total number of assets that will be output (used for logging / completion tracking)
         * @param pageSize the page size to use for periodically logging progress
         * @param logger through which to report the overall progress
         */
        fun appendAssets(
            list: List,
            assetToRow: RowGenerator,
            totalAssetCount: Long,
            pageSize: Int,
            logger: KLogger,
        ) {
            val count = AtomicLong(0)
            val map = ConcurrentHashMap()
            list.forEach { a: Asset ->
                writeAsset(a, assetToRow, count, totalAssetCount, pageSize, map, logger)
            }
        }

        private fun writeAsset(
            a: Asset,
            assetToRow: RowGenerator,
            count: AtomicLong,
            totalAssetCount: Long,
            pageSize: Int,
            map: ConcurrentHashMap,
            logger: KLogger,
        ) {
            val duplicate = map.put(a.guid, a.typeName + "::" + a.guid)
            if (duplicate != null) {
                logger.warn { "Hit a duplicate asset entry — there could be page skew: $duplicate" }
            }
            val values = assetToRow.buildFromAsset(a)
            writeRecord(values)
            Utils.logProgress(count, totalAssetCount, logger, pageSize)
        }

        /** {@inheritDoc}  */
        @Throws(IOException::class)
        override fun close() {
            writer.close()
        }
    }