All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.dimajix.flowman.catalog.ImpalaExternalCatalog.scala Maven / Gradle / Ivy

/*
 * Copyright 2018-2022 Kaya Kupferschmidt
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.dimajix.flowman.catalog

import java.sql.Connection
import java.sql.DriverManager
import java.sql.SQLException
import java.sql.SQLRecoverableException
import java.sql.SQLTransientException
import java.sql.Statement
import java.util.Properties

import com.cloudera.impala.support.exceptions.GeneralException
import org.apache.spark.sql.catalyst.catalog.CatalogTable
import org.apache.spark.sql.catalyst.catalog.CatalogTablePartition
import org.slf4j.LoggerFactory

import com.dimajix.flowman.jdbc.HiveDialect


/**
  * Companion object providing the Impala defaults and the connection settings used by
  * the [[ImpalaExternalCatalog]] class.
  */
object ImpalaExternalCatalog {
    /** Fully qualified class name of the JDBC driver which is used when no driver is configured. */
    val IMPALA_DEFAULT_DRIVER = "com.cloudera.impala.jdbc.Driver"
    /** Default port of a [[Connection]]; it is used when the JDBC URL is derived from `host` and `port`. */
    val IMPALA_DEFAULT_PORT = 21050

    /**
      * Settings describing how to connect to Impala via JDBC.
      *
      * @param url complete JDBC URL. When non-empty it takes precedence over `host` and `port`
      * @param host host name which is used for building a `jdbc:impala://host:port` URL when `url` is empty
      * @param port port which is used together with `host` when `url` is empty
      * @param driver class name of the JDBC driver. An empty value falls back to [[IMPALA_DEFAULT_DRIVER]]
      * @param user optional user name, passed as JDBC property `user`
      * @param password optional password, passed as JDBC property `password`
      * @param properties additional JDBC connection properties
      * @param timeout time in milliseconds to wait before a failed operation is retried
      * @param retries maximum number of attempts for executing an operation
      */
    case class Connection(
         url:String = "",
         host:String = "",
         port:Int = IMPALA_DEFAULT_PORT,
         driver:String = IMPALA_DEFAULT_DRIVER,
         user:Option[String] = None,
         password:Option[String] = None,
         properties: Map[String,String] = Map(),
         timeout: Int = 3000,
         retries: Int = 3
     ) {
        /**
          * Renders the connection in the usual case class format, but with the password replaced by a
          * placeholder, such that the credential does not end up in log files or exception messages.
          * Note that `url` and `properties` are still rendered verbatim.
          */
        override def toString: String = {
            val redactedPassword = password.map(_ => "***")
            s"Connection($url,$host,$port,$driver,$user,$redactedPassword,$properties,$timeout,$retries)"
        }
    }
}


/**
  * External catalog implementation which reacts to changes of tables, partitions and views by
  * sending the corresponding `INVALIDATE METADATA`, `REFRESH` and (optionally) `COMPUTE STATS`
  * statements to Impala via JDBC.
  *
  * Every operation opens a new JDBC connection, executes its statements and closes the connection
  * again. Operations failing with a recoverable or transient SQL exception are retried as configured
  * in the connection settings.
  *
  * @param connection JDBC connection settings, including retry configuration
  * @param computeStats when `true`, statistics are computed after refreshing a table or after adding
  *                     or altering a partition
  */
final case class ImpalaExternalCatalog(
    connection:ImpalaExternalCatalog.Connection,
    computeStats:Boolean
) extends AbstractExternalCatalog {
    private val logger = LoggerFactory.getLogger(classOf[ImpalaExternalCatalog])
    // The JDBC driver class is loaded once at construction time, connections are created lazily per operation
    private val connect = createConnectionFactory(connection)

    /** Invalidates the Impala metadata of a newly created table. */
    override def createTable(table: CatalogTable): Unit = {
        logger.info(s"INVALIDATE Impala metadata for newly created table ${table.identifier}")
        invalidateMetadata(table)
    }

    /** Refreshes the Impala metadata of an altered table. */
    override def alterTable(table: CatalogTable): Unit = {
        logger.info(s"REFRESH Impala metadata for altered table ${table.identifier}")
        refreshMetadata(table)
    }

    /** Refreshes the Impala metadata of a modified table and computes statistics if `computeStats` is set. */
    override def refreshTable(table: CatalogTable): Unit = {
        logger.info(s"REFRESH Impala metadata for modified table ${table.identifier}")
        refreshMetadata(table, withStats = computeStats)
    }

    /** Invalidates the Impala metadata of a dropped table. Errors reporting an unknown table are ignored. */
    override def dropTable(table: CatalogTable): Unit = {
        logger.info(s"INVALIDATE Impala metadata for dropped table ${table.identifier}")
        invalidateMetadata(table, ignoreMissing = true)
    }

    /** Refreshes the Impala metadata of a truncated table. */
    override def truncateTable(table: CatalogTable): Unit = {
        logger.info(s"REFRESH Impala metadata for truncated table ${table.identifier}")
        refreshMetadata(table)
    }

    /** Refreshes the Impala metadata of a new partition and computes incremental statistics if `computeStats` is set. */
    override def addPartition(table: CatalogTable, partition: CatalogTablePartition): Unit = {
        logger.info(s"REFRESH Impala metadata for new partition ${partition.spec} of table ${table.identifier}")
        refreshPartition(table, partition, withStats = computeStats)
    }

    /** Refreshes the Impala metadata of a changed partition and computes incremental statistics if `computeStats` is set. */
    override def alterPartition(table: CatalogTable, partition: CatalogTablePartition): Unit =  {
        logger.info(s"REFRESH Impala metadata for changed partition ${partition.spec} of table ${table.identifier}")
        refreshPartition(table, partition, withStats = computeStats)
    }

    /** Invalidates the Impala metadata of the whole table after one of its partitions has been dropped. */
    override def dropPartition(table: CatalogTable, partition: CatalogTablePartition): Unit = {
        logger.info(s"INVALIDATE Impala metadata for dropped partition ${partition.spec} of table ${table.identifier}")
        invalidateMetadata(table)
    }

    /** Refreshes the Impala metadata of a truncated partition. */
    override def truncatePartition(table: CatalogTable, partition: CatalogTablePartition): Unit =  {
        logger.info(s"REFRESH Impala metadata for truncated partition ${partition.spec} of table ${table.identifier}")
        refreshPartition(table, partition)
    }

    /** Invalidates the Impala metadata of a newly created view. */
    override def createView(table: CatalogTable): Unit = {
        logger.info(s"INVALIDATE Impala metadata for newly created view ${table.identifier}")
        invalidateMetadata(table)
    }

    /** Invalidates the Impala metadata of a modified view. */
    override def alterView(table: CatalogTable): Unit = {
        logger.info(s"INVALIDATE Impala metadata for modified view ${table.identifier}")
        invalidateMetadata(table)
    }

    /** Invalidates the Impala metadata of a dropped view. Errors reporting an unknown table are ignored. */
    override def dropView(table: CatalogTable): Unit = {
        logger.info(s"INVALIDATE Impala metadata for dropped view ${table.identifier}")
        invalidateMetadata(table, ignoreMissing = true)
    }

    /**
      * Executes `INVALIDATE METADATA` for the given table or view.
      *
      * @param table the table or view whose metadata should be invalidated
      * @param ignoreMissing when `true`, SQL errors mentioning a "TableNotFoundException" are silently ignored
      */
    private def invalidateMetadata(table: CatalogTable, ignoreMissing: Boolean = false): Unit = {
        withStatement { stmt =>
            val identifier = HiveDialect.quote(table.identifier)
            try {
                stmt.execute(s"INVALIDATE METADATA $identifier")
            }
            catch {
                // Ignore "TableNotFoundExceptions"
                case ex:SQLException if ignoreMissing && isTableNotFound(ex) =>
            }
        }
    }

    /**
      * Executes `REFRESH` for the given table, optionally followed by `COMPUTE STATS`. Both statements
      * use the same JDBC statement and connection.
      */
    private def refreshMetadata(table: CatalogTable, withStats: Boolean = false): Unit = {
        withStatement { stmt =>
            val identifier = HiveDialect.quote(table.identifier)
            stmt.execute(s"REFRESH $identifier")

            if (withStats) {
                stmt.execute(s"COMPUTE STATS $identifier")
            }
        }
    }

    /**
      * Executes `REFRESH` for a single partition of the given table, optionally followed by
      * `COMPUTE INCREMENTAL STATS` for the same partition.
      */
    private def refreshPartition(table: CatalogTable, partition: CatalogTablePartition, withStats: Boolean = false): Unit = {
        withStatement { stmt =>
            val identifier = HiveDialect.quote(table.identifier)
            val spec = HiveDialect.expr.partition(PartitionSpec(partition.spec))
            stmt.execute(s"REFRESH $identifier $spec")

            if (withStats) {
                stmt.execute(s"COMPUTE INCREMENTAL STATS $identifier $spec")
            }
        }
    }

    /**
      * Checks if the given exception reports a missing table. The message of a SQLException may be
      * `null`, therefore it is wrapped into an Option before being inspected.
      */
    private def isTableNotFound(ex: SQLException): Boolean = {
        Option(ex.getMessage).exists(_.contains("TableNotFoundException"))
    }

    /**
      * Opens a new JDBC connection, passes it to the given function and closes it afterwards. If a
      * recoverable or transient SQL exception is thrown, the whole procedure (including connecting)
      * is repeated after sleeping `connection.timeout` milliseconds, until `connection.retries`
      * attempts have been made. All other exceptions and the failure of the last attempt are propagated.
      */
    private def withConnection[T](fn:Connection => T) : T = {
        def retry[A](n:Int)(body: => A) : A = {
            try {
                body
            } catch {
                case e @(_:SQLRecoverableException|_:SQLTransientException) if n > 1 => {
                    logger.error("Retrying after error while executing SQL: {}", e.getMessage)
                    Thread.sleep(connection.timeout)
                    retry(n - 1)(body)
                }
            }
        }

        retry(connection.retries) {
            val conn = connect()
            try {
                fn(conn)
            }
            finally {
                conn.close()
            }
        }
    }

    /**
      * Creates a JDBC statement on a fresh connection, passes it to the given function and closes the
      * statement (and the connection) afterwards. Retries are handled by [[withConnection]].
      */
    private def withStatement[T](fn:Statement => T) : T = {
        withConnection { con =>
            val statement = con.createStatement()
            try {
                // statement.setQueryTimeout(options.queryTimeout)
                fn(statement)
            }
            finally {
                statement.close()
            }
        }
    }

    /**
      * Loads the JDBC driver class and returns a factory for creating new connections.
      *
      * An empty (or `null`) driver falls back to the default Impala driver. An empty (or `null`) URL
      * is replaced by a URL built from the host and port of the options.
      *
      * @param options the connection settings
      * @return a function creating a new JDBC connection on each invocation
      */
    private def createConnectionFactory(options: ImpalaExternalCatalog.Connection): () => Connection = {
        val driver = Option(options.driver)
            .filter(_.nonEmpty)
            .getOrElse(ImpalaExternalCatalog.IMPALA_DEFAULT_DRIVER)
        val url = Option(options.url)
            .filter(_.nonEmpty)
            .getOrElse(s"jdbc:impala://${options.host}:${options.port}")

        // Make sure the driver is registered with the DriverManager
        Class.forName(driver)

        val properties = new Properties()
        options.user.foreach(properties.setProperty("user", _))
        options.password.foreach(properties.setProperty("password", _))
        options.properties.foreach(kv => properties.setProperty(kv._1, kv._2))
        () => {
            DriverManager.getConnection(url, properties)
        }
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy