org.jetbrains.dataframe.ksp.DataSchemaGenerator.kt Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of symbol-processor-all Show documentation
Show all versions of symbol-processor-all Show documentation
Annotation preprocessor for DataFrame
package org.jetbrains.dataframe.ksp
import com.google.devtools.ksp.KspExperimental
import com.google.devtools.ksp.getAnnotationsByType
import com.google.devtools.ksp.processing.Dependencies
import com.google.devtools.ksp.processing.KSPLogger
import com.google.devtools.ksp.processing.Resolver
import com.google.devtools.ksp.symbol.KSFile
import org.jetbrains.dataframe.impl.codeGen.CodeGenerator
import org.jetbrains.kotlinx.dataframe.DataFrame
import org.jetbrains.kotlinx.dataframe.annotations.CsvOptions
import org.jetbrains.kotlinx.dataframe.annotations.DataSchemaVisibility
import org.jetbrains.kotlinx.dataframe.annotations.ImportDataSchema
import org.jetbrains.kotlinx.dataframe.annotations.JdbcOptions
import org.jetbrains.kotlinx.dataframe.annotations.JsonOptions
import org.jetbrains.kotlinx.dataframe.api.JsonPath
import org.jetbrains.kotlinx.dataframe.codeGen.MarkerVisibility
import org.jetbrains.kotlinx.dataframe.codeGen.NameNormalizer
import org.jetbrains.kotlinx.dataframe.impl.codeGen.CodeGenerationReadResult
import org.jetbrains.kotlinx.dataframe.impl.codeGen.DfReadResult
import org.jetbrains.kotlinx.dataframe.impl.codeGen.from
import org.jetbrains.kotlinx.dataframe.impl.codeGen.toStandaloneSnippet
import org.jetbrains.kotlinx.dataframe.impl.codeGen.urlCodeGenReader
import org.jetbrains.kotlinx.dataframe.impl.codeGen.urlDfReader
import org.jetbrains.kotlinx.dataframe.io.ArrowFeather
import org.jetbrains.kotlinx.dataframe.io.CSV
import org.jetbrains.kotlinx.dataframe.io.Excel
import org.jetbrains.kotlinx.dataframe.io.JSON
import org.jetbrains.kotlinx.dataframe.io.OpenApi
import org.jetbrains.kotlinx.dataframe.io.TSV
import org.jetbrains.kotlinx.dataframe.io.databaseCodeGenReader
import org.jetbrains.kotlinx.dataframe.io.db.driverClassNameFromUrl
import org.jetbrains.kotlinx.dataframe.io.getSchemaForSqlQuery
import org.jetbrains.kotlinx.dataframe.io.getSchemaForSqlTable
import org.jetbrains.kotlinx.dataframe.io.isURL
import org.jetbrains.kotlinx.dataframe.schema.DataFrameSchema
import java.io.File
import java.net.MalformedURLException
import java.net.URL
import java.sql.Connection
import java.sql.DriverManager
@OptIn(KspExperimental::class)
class DataSchemaGenerator(
private val resolver: Resolver,
private val resolutionDir: String?,
private val logger: KSPLogger,
private val codeGenerator: com.google.devtools.ksp.processing.CodeGenerator,
) {
fun resolveImportStatements(): List = resolvePathImports(resolver).toList()
class ImportDataSchemaStatement(
val origin: KSFile,
val name: String,
val dataSource: CodeGeneratorDataSource,
val visibility: MarkerVisibility,
val normalizationDelimiters: List,
val withDefaultPath: Boolean,
val csvOptions: CsvOptions,
val jsonOptions: JsonOptions,
val jdbcOptions: JdbcOptions,
val isJdbc: Boolean = false,
)
class CodeGeneratorDataSource(val pathRepresentation: String, val data: URL)
private fun resolvePathImports(resolver: Resolver) =
resolver.getSymbolsWithAnnotation(ImportDataSchema::class.qualifiedName!!)
.filterIsInstance()
.flatMap { file ->
file.getAnnotationsByType(ImportDataSchema::class).mapNotNull { it.toStatement(file, logger) }
}
private fun ImportDataSchema.toStatement(file: KSFile, logger: KSPLogger): ImportDataSchemaStatement? {
val url = if (isURL(path)) {
try {
URL(this.path)
} catch (exception: MalformedURLException) {
logger.error("'${this.path}' is not valid URL: ${exception.message}", file)
return null
}
} else {
// revisit architecture for an addition of the new data source https://github.com/Kotlin/dataframe/issues/450
if (path.startsWith("jdbc")) {
return ImportDataSchemaStatement(
origin = file,
name = name,
// URL better to make nullable or make hierarchy here
dataSource = CodeGeneratorDataSource(this.path, URL("http://example.com/pages/")),
visibility = visibility.toMarkerVisibility(),
normalizationDelimiters = normalizationDelimiters.toList(),
withDefaultPath = withDefaultPath,
csvOptions = csvOptions,
jsonOptions = jsonOptions,
jdbcOptions = jdbcOptions,
isJdbc = true,
)
}
val resolutionDir: String = resolutionDir ?: run {
reportMissingKspArgument(file)
return null
}
val relativeFile = File(resolutionDir, path)
val absoluteFile = File(path)
val data = if (relativeFile.exists()) relativeFile else absoluteFile
try {
data.toURI().toURL() ?: return null
} catch (exception: MalformedURLException) {
logger.error(
"Failed to convert resolved path '${relativeFile.absolutePath}' or '${absoluteFile.absolutePath}' to URL: ${exception.message}",
file,
)
return null
}
}
return ImportDataSchemaStatement(
origin = file,
name = name,
dataSource = CodeGeneratorDataSource(this.path, url),
visibility = visibility.toMarkerVisibility(),
normalizationDelimiters = normalizationDelimiters.toList(),
withDefaultPath = withDefaultPath,
csvOptions = csvOptions,
jsonOptions = jsonOptions,
jdbcOptions = jdbcOptions,
)
}
private fun DataSchemaVisibility.toMarkerVisibility(): MarkerVisibility =
when (this) {
DataSchemaVisibility.INTERNAL -> MarkerVisibility.INTERNAL
DataSchemaVisibility.IMPLICIT_PUBLIC -> MarkerVisibility.IMPLICIT_PUBLIC
DataSchemaVisibility.EXPLICIT_PUBLIC -> MarkerVisibility.EXPLICIT_PUBLIC
}
private fun reportMissingKspArgument(file: KSFile) {
logger.error(
"""
|KSP option with key "dataframe.resolutionDir" must be set in order to use relative path in @${ImportDataSchema::class.simpleName}
|DataFrame Gradle plugin should set it by default to "project.projectDir".
|If you do not use DataFrame Gradle plugin, configure option manually
""".trimMargin(),
symbol = file,
)
}
fun generateDataSchema(importStatement: ImportDataSchemaStatement) {
val packageName = importStatement.origin.packageName.asString()
val name = importStatement.name
val schemaFile =
codeGenerator.createNewFile(Dependencies(true, importStatement.origin), packageName, "$name.Generated")
val formats = listOf(
CSV(delimiter = importStatement.csvOptions.delimiter),
JSON(
typeClashTactic = importStatement.jsonOptions.typeClashTactic,
keyValuePaths = importStatement.jsonOptions.keyValuePaths.map(::JsonPath),
),
Excel(),
TSV(),
ArrowFeather(),
OpenApi(),
)
// revisit architecture for an addition of the new data source https://github.com/Kotlin/dataframe/issues/450
if (importStatement.isJdbc) {
val url = importStatement.dataSource.pathRepresentation
// Force classloading
// TODO: probably will not work for the H2
Class.forName(driverClassNameFromUrl(url))
var userName = importStatement.jdbcOptions.user
var password = importStatement.jdbcOptions.password
// treat the passed userName and password parameters as env variables
if (importStatement.jdbcOptions.extractCredFromEnv) {
userName = System.getenv(userName) ?: userName
password = System.getenv(password) ?: password
}
val connection = DriverManager.getConnection(
url,
userName,
password,
)
connection.use {
val schema = generateSchemaForImport(importStatement, connection)
val codeGenerator = CodeGenerator.create(useFqNames = false)
val additionalImports: List = listOf()
val codeGenResult = codeGenerator.generate(
schema = schema,
name = name,
fields = true,
extensionProperties = false,
isOpen = true,
visibility = importStatement.visibility,
knownMarkers = emptyList(),
readDfMethod = null,
fieldNameNormalizer = NameNormalizer.from(importStatement.normalizationDelimiters.toSet()),
)
val code = codeGenResult.toStandaloneSnippet(packageName, additionalImports)
schemaFile.bufferedWriter().use {
it.write(code)
}
return
}
}
// revisit architecture for an addition of the new data source https://github.com/Kotlin/dataframe/issues/450
// works for JDBC and OpenAPI only
// first try without creating a dataframe
when (
val codeGenResult = if (importStatement.isJdbc) {
CodeGenerator.databaseCodeGenReader(importStatement.dataSource.data, name)
} else {
CodeGenerator.urlCodeGenReader(importStatement.dataSource.data, name, formats, false)
}
) {
is CodeGenerationReadResult.Success -> {
val readDfMethod = codeGenResult.getReadDfMethod(
pathRepresentation = importStatement
.dataSource
.pathRepresentation
.takeIf { importStatement.withDefaultPath },
)
val code = codeGenResult
.code
.toStandaloneSnippet(packageName, readDfMethod.additionalImports)
schemaFile.bufferedWriter().use {
it.write(code)
}
return
}
is CodeGenerationReadResult.Error -> {
// logger.warn("Error while reading types-only from data at ${importStatement.dataSource.pathRepresentation}: ${codeGenResult.reason}")
}
}
// Usually works for others
// on error, try with reading dataframe first
val parsedDf = when (val readResult = CodeGenerator.urlDfReader(importStatement.dataSource.data, formats)) {
is DfReadResult.Error -> {
logger.error(
"Error while reading dataframe from data at ${importStatement.dataSource.pathRepresentation}: ${readResult.reason}",
)
return
}
is DfReadResult.Success -> readResult
}
val readDfMethod =
parsedDf.getReadDfMethod(
importStatement.dataSource.pathRepresentation.takeIf { importStatement.withDefaultPath },
)
val codeGenerator = CodeGenerator.create(useFqNames = false)
val codeGenResult = codeGenerator.generate(
schema = parsedDf.schema,
name = name,
fields = true,
extensionProperties = false,
isOpen = true,
visibility = importStatement.visibility,
knownMarkers = emptyList(),
readDfMethod = readDfMethod,
fieldNameNormalizer = NameNormalizer.from(importStatement.normalizationDelimiters.toSet()),
)
val code = codeGenResult.toStandaloneSnippet(packageName, readDfMethod.additionalImports)
schemaFile.bufferedWriter().use {
it.write(code)
}
}
private fun generateSchemaForImport(
importStatement: ImportDataSchemaStatement,
connection: Connection,
): DataFrameSchema {
logger.info("Table name: ${importStatement.jdbcOptions.tableName}")
logger.info("SQL query: ${importStatement.jdbcOptions.sqlQuery}")
val tableName = importStatement.jdbcOptions.tableName
val sqlQuery = importStatement.jdbcOptions.sqlQuery
return when {
isTableNameNotBlankAndQueryBlank(tableName, sqlQuery) -> generateSchemaForTable(connection, tableName)
isQueryNotBlankAndTableBlank(tableName, sqlQuery) -> generateSchemaForQuery(connection, sqlQuery)
areBothNotBlank(tableName, sqlQuery) -> throwBothFieldsFilledException(tableName, sqlQuery)
else -> throwBothFieldsEmptyException(tableName, sqlQuery)
}
}
private fun isTableNameNotBlankAndQueryBlank(tableName: String, sqlQuery: String) =
tableName.isNotBlank() && sqlQuery.isBlank()
private fun isQueryNotBlankAndTableBlank(tableName: String, sqlQuery: String) =
sqlQuery.isNotBlank() && tableName.isBlank()
private fun areBothNotBlank(tableName: String, sqlQuery: String) = sqlQuery.isNotBlank() && tableName.isNotBlank()
private fun generateSchemaForTable(connection: Connection, tableName: String) =
DataFrame.getSchemaForSqlTable(connection, tableName)
private fun generateSchemaForQuery(connection: Connection, sqlQuery: String) =
DataFrame.getSchemaForSqlQuery(connection, sqlQuery)
private fun throwBothFieldsFilledException(tableName: String, sqlQuery: String): Nothing =
throw RuntimeException(
"Table name '$tableName' and SQL query '$sqlQuery' both are filled! " +
"Clear 'tableName' or 'sqlQuery' properties in jdbcOptions with value to generate schema for SQL table or result of SQL query!",
)
private fun throwBothFieldsEmptyException(tableName: String, sqlQuery: String): Nothing =
throw RuntimeException(
"Table name '$tableName' and SQL query '$sqlQuery' both are empty! " +
"Populate 'tableName' or 'sqlQuery' properties in jdbcOptions with value to generate schema for SQL table or result of SQL query!",
)
}