/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.metastore;
import static org.apache.commons.lang.StringUtils.join;
import static org.apache.commons.lang.StringUtils.repeat;
import java.sql.Connection;
import java.sql.SQLException;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import javax.jdo.PersistenceManager;
import javax.jdo.Query;
import javax.jdo.Transaction;
import javax.jdo.datastore.JDOConnection;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.conf.HiveConf.ConfVars;
import org.apache.hadoop.hive.metastore.AggregateStatsCache.AggrColStats;
import org.apache.hadoop.hive.metastore.api.AggrStats;
import org.apache.hadoop.hive.metastore.api.ColumnStatistics;
import org.apache.hadoop.hive.metastore.api.ColumnStatisticsData;
import org.apache.hadoop.hive.metastore.api.ColumnStatisticsDesc;
import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj;
import org.apache.hadoop.hive.metastore.api.Database;
import org.apache.hadoop.hive.metastore.api.FieldSchema;
import org.apache.hadoop.hive.metastore.api.MetaException;
import org.apache.hadoop.hive.metastore.api.Order;
import org.apache.hadoop.hive.metastore.api.Partition;
import org.apache.hadoop.hive.metastore.api.PrincipalType;
import org.apache.hadoop.hive.metastore.api.SerDeInfo;
import org.apache.hadoop.hive.metastore.api.SkewedInfo;
import org.apache.hadoop.hive.metastore.api.StorageDescriptor;
import org.apache.hadoop.hive.metastore.api.Table;
import org.apache.hadoop.hive.metastore.model.MDatabase;
import org.apache.hadoop.hive.metastore.model.MPartitionColumnStatistics;
import org.apache.hadoop.hive.metastore.model.MTableColumnStatistics;
import org.apache.hadoop.hive.metastore.parser.ExpressionTree;
import org.apache.hadoop.hive.metastore.parser.ExpressionTree.FilterBuilder;
import org.apache.hadoop.hive.metastore.parser.ExpressionTree.LeafNode;
import org.apache.hadoop.hive.metastore.parser.ExpressionTree.LogicalOperator;
import org.apache.hadoop.hive.metastore.parser.ExpressionTree.Operator;
import org.apache.hadoop.hive.metastore.parser.ExpressionTree.TreeNode;
import org.apache.hadoop.hive.metastore.parser.ExpressionTree.TreeVisitor;
import org.apache.hadoop.hive.serde.serdeConstants;
import org.apache.hive.common.util.BloomFilter;
import org.datanucleus.store.rdbms.query.ForwardQueryResult;
import com.google.common.collect.Lists;
/**
* This class contains the optimizations for MetaStore that rely on direct SQL access to
* the underlying database. It should use ANSI SQL and be compatible with common databases
* such as MySQL (note that MySQL doesn't use full ANSI mode by default), Postgres, etc.
*
* As of now, only the partition retrieval is done this way to improve job startup time;
* JDOQL partition retrieval is still present so as not to limit the ORM solution we have
* to SQL stores only. There's always a way to do without direct SQL.
*/
class MetaStoreDirectSql {
private static enum DB {
MYSQL,
ORACLE,
MSSQL,
DERBY,
OTHER
}
private static final int NO_BATCHING = -1, DETECT_BATCHING = 0;
private static final Log LOG = LogFactory.getLog(MetaStoreDirectSql.class);
private final PersistenceManager pm;
/**
* We want to avoid db-specific code in this class and stick with ANSI SQL. However:
* mysql and postgres are differently ansi-incompatible: mysql by default doesn't support
* quoted identifiers, and postgres contravenes ANSI by coercing unquoted ones to lower case.
* MySQL's way of working around this is simpler (just set ansi quotes mode on), so we
* use that. MySQL detection is done by actually issuing the set-ansi-quotes command.
*
* Use sparingly, we don't want to devolve into another DataNucleus...
*/
private final DB dbType;
private final int batchSize;
private final boolean convertMapNullsToEmptyStrings;
/**
* Whether direct SQL can be used with the current datastore backing {@link #pm}.
*/
private final boolean isCompatibleDatastore;
private final boolean isAggregateStatsCacheEnabled;
private AggregateStatsCache aggrStatsCache;
public MetaStoreDirectSql(PersistenceManager pm, Configuration conf) {
this.pm = pm;
this.dbType = determineDbType();
int batchSize = HiveConf.getIntVar(conf, ConfVars.METASTORE_DIRECT_SQL_PARTITION_BATCH_SIZE);
if (batchSize == DETECT_BATCHING) {
batchSize = (dbType == DB.ORACLE || dbType == DB.MSSQL) ? 1000 : NO_BATCHING;
}
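// Illustrative note (not in the original source): with DETECT_BATCHING (the default, 0),
// Oracle and MSSQL get batches of 1000 ids, since e.g. Oracle rejects IN lists with more
// than 1000 elements (ORA-01795); other databases fetch all ids in one unbatched list.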
this.batchSize = batchSize;
convertMapNullsToEmptyStrings =
HiveConf.getBoolVar(conf, ConfVars.METASTORE_ORM_RETRIEVE_MAPNULLS_AS_EMPTY_STRINGS);
String jdoIdFactory = HiveConf.getVar(conf, ConfVars.METASTORE_IDENTIFIER_FACTORY);
if (! ("datanucleus1".equalsIgnoreCase(jdoIdFactory))){
LOG.warn("Underlying metastore does not use 'datanuclues1' for its ORM naming scheme."
+ " Disabling directSQL as it uses hand-hardcoded SQL with that assumption.");
isCompatibleDatastore = false;
} else {
isCompatibleDatastore = ensureDbInit() && runTestQuery();
if (isCompatibleDatastore) {
LOG.info("Using direct SQL, underlying DB is " + dbType);
}
}
isAggregateStatsCacheEnabled = HiveConf.getBoolVar(conf, ConfVars.METASTORE_AGGREGATE_STATS_CACHE_ENABLED);
if (isAggregateStatsCacheEnabled) {
aggrStatsCache = AggregateStatsCache.getInstance(conf);
}
}
private DB determineDbType() {
DB dbType = DB.OTHER;
if (runDbCheck("SET @@session.sql_mode=ANSI_QUOTES", "MySql")) {
dbType = DB.MYSQL;
} else if (runDbCheck("SELECT version FROM v$instance", "Oracle")) {
dbType = DB.ORACLE;
} else if (runDbCheck("SELECT @@version", "MSSQL")) {
dbType = DB.MSSQL;
} else {
// TODO: maybe we should use getProductName to identify all the DBs
String productName = getProductName();
if (productName != null && productName.toLowerCase().contains("derby")) {
dbType = DB.DERBY;
}
}
return dbType;
}
private String getProductName() {
JDOConnection jdoConn = pm.getDataStoreConnection();
try {
return ((Connection)jdoConn.getNativeConnection()).getMetaData().getDatabaseProductName();
} catch (Throwable t) {
LOG.warn("Error retrieving product name", t);
return null;
} finally {
jdoConn.close(); // We must release the connection before we call other pm methods.
}
}
private boolean ensureDbInit() {
Transaction tx = pm.currentTransaction();
try {
// Force the underlying db to initialize.
pm.newQuery(MDatabase.class, "name == ''").execute();
pm.newQuery(MTableColumnStatistics.class, "dbName == ''").execute();
pm.newQuery(MPartitionColumnStatistics.class, "dbName == ''").execute();
return true;
} catch (Exception ex) {
LOG.warn("Database initialization failed; direct SQL is disabled", ex);
tx.rollback();
return false;
}
}
private boolean runTestQuery() {
Transaction tx = pm.currentTransaction();
// Run a self-test query. If it doesn't work, we will self-disable. What a PITA...
String selfTestQuery = "select \"DB_ID\" from \"DBS\"";
try {
pm.newQuery("javax.jdo.query.SQL", selfTestQuery).execute();
tx.commit();
return true;
} catch (Exception ex) {
LOG.warn("Self-test query [" + selfTestQuery + "] failed; direct SQL is disabled", ex);
tx.rollback();
return false;
}
}
public boolean isCompatibleDatastore() {
return isCompatibleDatastore;
}
/**
* This function is intended to be called by functions before they put together a query.
* Thus, any query-specific initialization that must be done from within the transaction is
* done here - e.g., for MySQL, we signal that we want ANSI SQL quoting behaviour.
*/
private void doDbSpecificInitializationsBeforeQuery() throws MetaException {
if (dbType != DB.MYSQL) return;
try {
assert pm.currentTransaction().isActive(); // must be inside tx together with queries
executeNoResult("SET @@session.sql_mode=ANSI_QUOTES");
} catch (SQLException sqlEx) {
throw new MetaException("Error setting ansi quotes: " + sqlEx.getMessage());
}
}
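// Illustrative sketch (not in the original source): without ANSI_QUOTES, MySQL treats
// double-quoted tokens as string literals, so the hand-written queries below would break:
//   SELECT "DB_ID" FROM DBS;    -- selects the literal string 'DB_ID' for every row
// After SET @@session.sql_mode=ANSI_QUOTES, double quotes delimit identifiers per ANSI:
//   SELECT "DB_ID" FROM "DBS";  -- selects the DB_ID column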
private void executeNoResult(final String queryText) throws SQLException {
JDOConnection jdoConn = pm.getDataStoreConnection();
boolean doTrace = LOG.isDebugEnabled();
try {
long start = doTrace ? System.nanoTime() : 0;
((Connection)jdoConn.getNativeConnection()).createStatement().execute(queryText);
timingTrace(doTrace, queryText, start, doTrace ? System.nanoTime() : 0);
} finally {
jdoConn.close(); // We must release the connection before we call other pm methods.
}
}
private boolean runDbCheck(String queryText, String name) {
Transaction tx = pm.currentTransaction();
if (!tx.isActive()) {
tx.begin();
}
try {
executeNoResult(queryText);
return true;
} catch (Throwable t) {
LOG.debug(name + " check failed, assuming we are not on " + name + ": " + t.getMessage());
tx.rollback();
tx = pm.currentTransaction();
tx.begin();
return false;
}
}
public Database getDatabase(String dbName) throws MetaException {
Query queryDbSelector = null;
Query queryDbParams = null;
try {
dbName = dbName.toLowerCase();
doDbSpecificInitializationsBeforeQuery();
String queryTextDbSelector= "select "
+ "\"DB_ID\", \"NAME\", \"DB_LOCATION_URI\", \"DESC\", "
+ "\"OWNER_NAME\", \"OWNER_TYPE\" "
+ "FROM \"DBS\" where \"NAME\" = ? ";
Object[] params = new Object[] { dbName };
queryDbSelector = pm.newQuery("javax.jdo.query.SQL", queryTextDbSelector);
if (LOG.isTraceEnabled()) {
LOG.trace("getDatabase:query instantiated : " + queryTextDbSelector
+ " with param [" + params[0] + "]");
}
List<Object[]> sqlResult = executeWithArray(
queryDbSelector, params, queryTextDbSelector);
if ((sqlResult == null) || sqlResult.isEmpty()) {
return null;
}
assert(sqlResult.size() == 1);
if (sqlResult.get(0) == null) {
return null;
}
Object[] dbline = sqlResult.get(0);
Long dbid = extractSqlLong(dbline[0]);
String queryTextDbParams = "select \"PARAM_KEY\", \"PARAM_VALUE\" "
+ " FROM \"DATABASE_PARAMS\" "
+ " WHERE \"DB_ID\" = ? "
+ " AND \"PARAM_KEY\" IS NOT NULL";
params[0] = dbid;
queryDbParams = pm.newQuery("javax.jdo.query.SQL", queryTextDbParams);
if (LOG.isTraceEnabled()) {
LOG.trace("getDatabase:query2 instantiated : " + queryTextDbParams
+ " with param [" + params[0] + "]");
}
Map<String, String> dbParams = new HashMap<String, String>();
List<Object[]> sqlResult2 = ensureList(executeWithArray(
queryDbParams, params, queryTextDbParams));
if (!sqlResult2.isEmpty()) {
for (Object[] line : sqlResult2) {
dbParams.put(extractSqlString(line[0]), extractSqlString(line[1]));
}
}
Database db = new Database();
db.setName(extractSqlString(dbline[1]));
db.setLocationUri(extractSqlString(dbline[2]));
db.setDescription(extractSqlString(dbline[3]));
db.setOwnerName(extractSqlString(dbline[4]));
String type = extractSqlString(dbline[5]);
db.setOwnerType(
(null == type || type.trim().isEmpty()) ? null : PrincipalType.valueOf(type));
db.setParameters(MetaStoreUtils.trimMapNulls(dbParams,convertMapNullsToEmptyStrings));
if (LOG.isDebugEnabled()){
LOG.debug("getDatabase: directsql returning db " + db.getName()
+ " locn["+db.getLocationUri() +"] desc [" +db.getDescription()
+ "] owner [" + db.getOwnerName() + "] ownertype ["+ db.getOwnerType() +"]");
}
return db;
} finally {
if (queryDbSelector != null){
queryDbSelector.closeAll();
}
if (queryDbParams != null){
queryDbParams.closeAll();
}
}
}
/**
* Gets partitions by using direct SQL queries.
* Note that batching is not needed for this method - the list of names implies the batch size.
* @param dbName Metastore db name.
* @param tblName Metastore table name.
* @param partNames Partition names to get.
* @return List of partitions.
*/
public List<Partition> getPartitionsViaSqlFilter(
String dbName, String tblName, List<String> partNames) throws MetaException {
if (partNames.isEmpty()) {
return new ArrayList<Partition>();
}
return getPartitionsViaSqlFilterInternal(dbName, tblName, null,
"\"PARTITIONS\".\"PART_NAME\" in (" + makeParams(partNames.size()) + ")",
partNames, new ArrayList<String>(), null);
}
/**
* Gets partitions by using direct SQL queries.
* @param table The table.
* @param tree The expression tree from which the SQL filter will be derived.
* @param max The maximum number of partitions to return.
* @return List of partitions. Null if SQL filter cannot be derived.
*/
public List<Partition> getPartitionsViaSqlFilter(
Table table, ExpressionTree tree, Integer max) throws MetaException {
assert tree != null;
List<Object> params = new ArrayList<Object>();
List<String> joins = new ArrayList<String>();
// Derby and Oracle do not interpret filters ANSI-properly in some cases and need a workaround.
boolean dbHasJoinCastBug = (dbType == DB.DERBY || dbType == DB.ORACLE);
String sqlFilter = PartitionFilterGenerator.generateSqlFilter(
table, tree, params, joins, dbHasJoinCastBug);
if (sqlFilter == null) {
return null; // Cannot make SQL filter to push down.
}
Boolean isViewTable = isViewTable(table);
return getPartitionsViaSqlFilterInternal(table.getDbName(), table.getTableName(),
isViewTable, sqlFilter, params, joins, max);
}
/**
* Gets all partitions of a table by using direct SQL queries.
* @param dbName Metastore db name.
* @param tblName Metastore table name.
* @param max The maximum number of partitions to return.
* @return List of partitions.
*/
public List<Partition> getPartitions(
String dbName, String tblName, Integer max) throws MetaException {
return getPartitionsViaSqlFilterInternal(dbName, tblName, null,
null, new ArrayList<String>(), new ArrayList<String>(), max);
}
private static Boolean isViewTable(Table t) {
return t.isSetTableType() ?
t.getTableType().equals(TableType.VIRTUAL_VIEW.toString()) : null;
}
private boolean isViewTable(String dbName, String tblName) throws MetaException {
String queryText = "select \"TBL_TYPE\" from \"TBLS\"" +
" inner join \"DBS\" on \"TBLS\".\"DB_ID\" = \"DBS\".\"DB_ID\" " +
" where \"TBLS\".\"TBL_NAME\" = ? and \"DBS\".\"NAME\" = ?";
Object[] params = new Object[] { tblName, dbName };
Query query = pm.newQuery("javax.jdo.query.SQL", queryText);
query.setUnique(true);
Object result = executeWithArray(query, params, queryText);
return (result != null) && result.toString().equals(TableType.VIRTUAL_VIEW.toString());
}
/**
* Get partition objects for the query using direct SQL queries, to avoid bazillion
* queries created by DN retrieving stuff for each object individually.
* @param dbName Metastore db name.
* @param tblName Metastore table name.
* @param isView Whether table is a view. Can be passed as null if not immediately
* known, then this method will get it only if necessary.
* @param sqlFilter SQL filter to use. Better be SQL92-compliant.
* @param paramsForFilter params for ?-s in SQL filter text. Params must be in order.
* @param joinsForFilter if the filter needs additional join statement, they must be in
* this list. Better be SQL92-compliant.
* @param max The maximum number of partitions to return.
* @return List of partition objects.
*/
private List<Partition> getPartitionsViaSqlFilterInternal(String dbName, String tblName,
Boolean isView, String sqlFilter, List<? extends Object> paramsForFilter,
List<String> joinsForFilter, Integer max) throws MetaException {
boolean doTrace = LOG.isDebugEnabled();
dbName = dbName.toLowerCase();
tblName = tblName.toLowerCase();
// We have to be mindful of order during filtering if we are not returning all partitions.
String orderForFilter = (max != null) ? " order by \"PART_NAME\" asc" : "";
doDbSpecificInitializationsBeforeQuery();
// Get all simple fields for partitions and related objects, which we can map one-on-one.
// We will do this in 2 queries to use different existing indices for each one.
// We do not get table and DB name, assuming they are the same as we are using to filter.
// TODO: We might want to tune the indexes instead. With the current ones MySQL performs
// poorly, esp. with 'order by' w/o an index on large tables, even if the number of actual
// results is small (a query that returns 8 out of 32k partitions can go from 4sec to 0sec
// just by adding a \"PART_ID\" IN (...) filter that doesn't alter the results), probably
// because MySQL otherwise sorts the entire table, not knowing how selective the filter is.
String queryText =
"select \"PARTITIONS\".\"PART_ID\" from \"PARTITIONS\""
+ " inner join \"TBLS\" on \"PARTITIONS\".\"TBL_ID\" = \"TBLS\".\"TBL_ID\" "
+ " and \"TBLS\".\"TBL_NAME\" = ? "
+ " inner join \"DBS\" on \"TBLS\".\"DB_ID\" = \"DBS\".\"DB_ID\" "
+ " and \"DBS\".\"NAME\" = ? "
+ join(joinsForFilter, ' ')
+ (StringUtils.isBlank(sqlFilter) ? "" : (" where " + sqlFilter)) + orderForFilter;
Object[] params = new Object[paramsForFilter.size() + 2];
params[0] = tblName;
params[1] = dbName;
for (int i = 0; i < paramsForFilter.size(); ++i) {
params[i + 2] = paramsForFilter.get(i);
}
long start = doTrace ? System.nanoTime() : 0;
Query query = pm.newQuery("javax.jdo.query.SQL", queryText);
if (max != null) {
query.setRange(0, max.shortValue());
}
List<Object> sqlResult = executeWithArray(query, params, queryText);
long queryTime = doTrace ? System.nanoTime() : 0;
if (sqlResult.isEmpty()) {
timingTrace(doTrace, queryText, start, queryTime);
return new ArrayList<Partition>(); // no partitions, bail early.
}
// Get full objects. For Oracle, do it in batches.
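// Illustrative example (not in the original source): with batchSize = 1000 and 2500 part
// ids, the loop below issues three lookups covering ids [0, 1000), [1000, 2000), [2000, 2500).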
List<Partition> result = null;
if (batchSize != NO_BATCHING && batchSize < sqlResult.size()) {
result = new ArrayList<Partition>(sqlResult.size());
while (result.size() < sqlResult.size()) {
int toIndex = Math.min(result.size() + batchSize, sqlResult.size());
List<Object> batchedSqlResult = sqlResult.subList(result.size(), toIndex);
result.addAll(getPartitionsFromPartitionIds(dbName, tblName, isView, batchedSqlResult));
}
} else {
result = getPartitionsFromPartitionIds(dbName, tblName, isView, sqlResult);
}
timingTrace(doTrace, queryText, start, queryTime);
query.closeAll();
return result;
}
private List<Partition> getPartitionsFromPartitionIds(String dbName, String tblName,
Boolean isView, List<Object> partIdList) throws MetaException {
boolean doTrace = LOG.isDebugEnabled();
int idStringWidth = (int)Math.ceil(Math.log10(partIdList.size())) + 1; // 1 for comma
int sbCapacity = partIdList.size() * idStringWidth;
// Prepare StringBuilder for "PART_ID in (...)" to use in future queries.
StringBuilder partSb = new StringBuilder(sbCapacity);
for (Object partitionId : partIdList) {
partSb.append(extractSqlLong(partitionId)).append(",");
}
String partIds = trimCommaList(partSb);
// Get most of the fields for the IDs provided.
// Assume db and table names are the same for all partitions, as provided in arguments.
String queryText =
"select \"PARTITIONS\".\"PART_ID\", \"SDS\".\"SD_ID\", \"SDS\".\"CD_ID\","
+ " \"SERDES\".\"SERDE_ID\", \"PARTITIONS\".\"CREATE_TIME\","
+ " \"PARTITIONS\".\"LAST_ACCESS_TIME\", \"SDS\".\"INPUT_FORMAT\", \"SDS\".\"IS_COMPRESSED\","
+ " \"SDS\".\"IS_STOREDASSUBDIRECTORIES\", \"SDS\".\"LOCATION\", \"SDS\".\"NUM_BUCKETS\","
+ " \"SDS\".\"OUTPUT_FORMAT\", \"SERDES\".\"NAME\", \"SERDES\".\"SLIB\" "
+ "from \"PARTITIONS\""
+ " left outer join \"SDS\" on \"PARTITIONS\".\"SD_ID\" = \"SDS\".\"SD_ID\" "
+ " left outer join \"SERDES\" on \"SDS\".\"SERDE_ID\" = \"SERDES\".\"SERDE_ID\" "
+ "where \"PART_ID\" in (" + partIds + ") order by \"PART_NAME\" asc";
long start = doTrace ? System.nanoTime() : 0;
Query query = pm.newQuery("javax.jdo.query.SQL", queryText);
@SuppressWarnings("unchecked")
List<Object[]> sqlResult = executeWithArray(query, null, queryText);
long queryTime = doTrace ? System.nanoTime() : 0;
Deadline.checkTimeout();
// Read all the fields and create partitions, SDs and serdes.
TreeMap<Long, Partition> partitions = new TreeMap<Long, Partition>();
TreeMap<Long, StorageDescriptor> sds = new TreeMap<Long, StorageDescriptor>();
TreeMap<Long, SerDeInfo> serdes = new TreeMap<Long, SerDeInfo>();
TreeMap<Long, List<FieldSchema>> colss = new TreeMap<Long, List<FieldSchema>>();
// Keep order by name, consistent with JDO.
ArrayList<Partition> orderedResult = new ArrayList<Partition>(partIdList.size());
// Prepare StringBuilder-s for "in (...)" lists to use in one-to-many queries.
StringBuilder sdSb = new StringBuilder(sbCapacity), serdeSb = new StringBuilder(sbCapacity);
StringBuilder colsSb = new StringBuilder(7); // We expect that there's only one field schema.
tblName = tblName.toLowerCase();
dbName = dbName.toLowerCase();
for (Object[] fields : sqlResult) {
// Here comes the ugly part...
long partitionId = extractSqlLong(fields[0]);
Long sdId = extractSqlLong(fields[1]);
Long colId = extractSqlLong(fields[2]);
Long serdeId = extractSqlLong(fields[3]);
// A partition must have either everything set, or nothing set if it's a view.
if (sdId == null || colId == null || serdeId == null) {
if (isView == null) {
isView = isViewTable(dbName, tblName);
}
if ((sdId != null || colId != null || serdeId != null) || !isView) {
throw new MetaException("Unexpected null for one of the IDs, SD " + sdId + ", column "
+ colId + ", serde " + serdeId + " for a " + (isView ? "" : "non-") + " view");
}
}
Partition part = new Partition();
orderedResult.add(part);
// Set the collection fields; some code might not check presence before accessing them.
part.setParameters(new HashMap<String, String>());
part.setValues(new ArrayList<String>());
part.setDbName(dbName);
part.setTableName(tblName);
if (fields[4] != null) part.setCreateTime(extractSqlInt(fields[4]));
if (fields[5] != null) part.setLastAccessTime(extractSqlInt(fields[5]));
partitions.put(partitionId, part);
if (sdId == null) continue; // Probably a view.
assert colId != null && serdeId != null;
// We assume each partition has a unique SD.
StorageDescriptor sd = new StorageDescriptor();
StorageDescriptor oldSd = sds.put(sdId, sd);
if (oldSd != null) {
throw new MetaException("Partitions reuse SDs; we don't expect that");
}
// Set the collection fields; some code might not check presence before accessing them.
sd.setSortCols(new ArrayList<Order>());
sd.setBucketCols(new ArrayList<String>());
sd.setParameters(new HashMap<String, String>());
sd.setSkewedInfo(new SkewedInfo(new ArrayList<String>(),
new ArrayList<List<String>>(), new HashMap<List<String>, String>()));
sd.setInputFormat((String)fields[6]);
Boolean tmpBoolean = extractSqlBoolean(fields[7]);
if (tmpBoolean != null) sd.setCompressed(tmpBoolean);
tmpBoolean = extractSqlBoolean(fields[8]);
if (tmpBoolean != null) sd.setStoredAsSubDirectories(tmpBoolean);
sd.setLocation((String)fields[9]);
if (fields[10] != null) sd.setNumBuckets(extractSqlInt(fields[10]));
sd.setOutputFormat((String)fields[11]);
sdSb.append(sdId).append(",");
part.setSd(sd);
List<FieldSchema> cols = colss.get(colId);
// We expect that colId will be the same for all (or many) SDs.
if (cols == null) {
cols = new ArrayList<FieldSchema>();
colss.put(colId, cols);
colsSb.append(colId).append(",");
}
sd.setCols(cols);
// We assume each SD has a unique serde.
SerDeInfo serde = new SerDeInfo();
SerDeInfo oldSerde = serdes.put(serdeId, serde);
if (oldSerde != null) {
throw new MetaException("SDs reuse serdes; we don't expect that");
}
serde.setParameters(new HashMap<String, String>());
serde.setName((String)fields[12]);
serde.setSerializationLib((String)fields[13]);
serdeSb.append(serdeId).append(",");
sd.setSerdeInfo(serde);
Deadline.checkTimeout();
}
query.closeAll();
timingTrace(doTrace, queryText, start, queryTime);
// Now get all the one-to-many things. Start with partitions.
queryText = "select \"PART_ID\", \"PARAM_KEY\", \"PARAM_VALUE\" from \"PARTITION_PARAMS\""
+ " where \"PART_ID\" in (" + partIds + ") and \"PARAM_KEY\" is not null"
+ " order by \"PART_ID\" asc";
loopJoinOrderedResult(partitions, queryText, 0, new ApplyFunc<Partition>() {
@Override
public void apply(Partition t, Object[] fields) {
t.putToParameters((String)fields[1], (String)fields[2]);
}});
queryText = "select \"PART_ID\", \"PART_KEY_VAL\" from \"PARTITION_KEY_VALS\""
+ " where \"PART_ID\" in (" + partIds + ") and \"INTEGER_IDX\" >= 0"
+ " order by \"PART_ID\" asc, \"INTEGER_IDX\" asc";
loopJoinOrderedResult(partitions, queryText, 0, new ApplyFunc<Partition>() {
@Override
public void apply(Partition t, Object[] fields) {
t.addToValues((String)fields[1]);
}});
// Prepare IN (blah) lists for the following queries. Cut off the final ','s.
if (sdSb.length() == 0) {
assert serdeSb.length() == 0 && colsSb.length() == 0;
return orderedResult; // No SDs, probably a view.
}
String sdIds = trimCommaList(sdSb), serdeIds = trimCommaList(serdeSb),
colIds = trimCommaList(colsSb);
// Get all the stuff for SD. Don't do empty-list check - we expect partitions do have SDs.
queryText = "select \"SD_ID\", \"PARAM_KEY\", \"PARAM_VALUE\" from \"SD_PARAMS\""
+ " where \"SD_ID\" in (" + sdIds + ") and \"PARAM_KEY\" is not null"
+ " order by \"SD_ID\" asc";
loopJoinOrderedResult(sds, queryText, 0, new ApplyFunc<StorageDescriptor>() {
@Override
public void apply(StorageDescriptor t, Object[] fields) {
t.putToParameters((String)fields[1], (String)fields[2]);
}});
queryText = "select \"SD_ID\", \"COLUMN_NAME\", \"SORT_COLS\".\"ORDER\" from \"SORT_COLS\""
+ " where \"SD_ID\" in (" + sdIds + ") and \"INTEGER_IDX\" >= 0"
+ " order by \"SD_ID\" asc, \"INTEGER_IDX\" asc";
loopJoinOrderedResult(sds, queryText, 0, new ApplyFunc<StorageDescriptor>() {
@Override
public void apply(StorageDescriptor t, Object[] fields) {
if (fields[2] == null) return;
t.addToSortCols(new Order((String)fields[1], extractSqlInt(fields[2])));
}});
queryText = "select \"SD_ID\", \"BUCKET_COL_NAME\" from \"BUCKETING_COLS\""
+ " where \"SD_ID\" in (" + sdIds + ") and \"INTEGER_IDX\" >= 0"
+ " order by \"SD_ID\" asc, \"INTEGER_IDX\" asc";
loopJoinOrderedResult(sds, queryText, 0, new ApplyFunc<StorageDescriptor>() {
@Override
public void apply(StorageDescriptor t, Object[] fields) {
t.addToBucketCols((String)fields[1]);
}});
// Skewed columns stuff.
queryText = "select \"SD_ID\", \"SKEWED_COL_NAME\" from \"SKEWED_COL_NAMES\""
+ " where \"SD_ID\" in (" + sdIds + ") and \"INTEGER_IDX\" >= 0"
+ " order by \"SD_ID\" asc, \"INTEGER_IDX\" asc";
boolean hasSkewedColumns =
loopJoinOrderedResult(sds, queryText, 0, new ApplyFunc<StorageDescriptor>() {
@Override
public void apply(StorageDescriptor t, Object[] fields) {
if (!t.isSetSkewedInfo()) t.setSkewedInfo(new SkewedInfo());
t.getSkewedInfo().addToSkewedColNames((String)fields[1]);
}}) > 0;
// Assume we don't need to fetch the rest of the skewed column data if we have no columns.
if (hasSkewedColumns) {
// We are skipping the SKEWED_STRING_LIST table here, as it seems to be totally useless.
queryText =
"select \"SKEWED_VALUES\".\"SD_ID_OID\","
+ " \"SKEWED_STRING_LIST_VALUES\".\"STRING_LIST_ID\","
+ " \"SKEWED_STRING_LIST_VALUES\".\"STRING_LIST_VALUE\" "
+ "from \"SKEWED_VALUES\" "
+ " left outer join \"SKEWED_STRING_LIST_VALUES\" on \"SKEWED_VALUES\"."
+ "\"STRING_LIST_ID_EID\" = \"SKEWED_STRING_LIST_VALUES\".\"STRING_LIST_ID\" "
+ "where \"SKEWED_VALUES\".\"SD_ID_OID\" in (" + sdIds + ") "
+ " and \"SKEWED_VALUES\".\"STRING_LIST_ID_EID\" is not null "
+ " and \"SKEWED_VALUES\".\"INTEGER_IDX\" >= 0 "
+ "order by \"SKEWED_VALUES\".\"SD_ID_OID\" asc, \"SKEWED_VALUES\".\"INTEGER_IDX\" asc,"
+ " \"SKEWED_STRING_LIST_VALUES\".\"INTEGER_IDX\" asc";
loopJoinOrderedResult(sds, queryText, 0, new ApplyFunc<StorageDescriptor>() {
private Long currentListId;
private List<String> currentList;
@Override
public void apply(StorageDescriptor t, Object[] fields) throws MetaException {
if (!t.isSetSkewedInfo()) t.setSkewedInfo(new SkewedInfo());
// Note that this is not a typical list accumulator - there's no call to finalize
// the last list. Instead we add list to SD first, as well as locally to add elements.
if (fields[1] == null) {
currentList = null; // left outer join produced a list with no values
currentListId = null;
t.getSkewedInfo().addToSkewedColValues(new ArrayList<String>());
} else {
long fieldsListId = extractSqlLong(fields[1]);
if (currentListId == null || fieldsListId != currentListId) {
currentList = new ArrayList<String>();
currentListId = fieldsListId;
t.getSkewedInfo().addToSkewedColValues(currentList);
}
currentList.add((String)fields[2]);
}
}});
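// Illustrative example (not in the original source): for ordered rows (sd1, list7, "a"),
// (sd1, list7, "b"), (sd1, list9, "x"), the accumulator above builds two skewed-value lists
// on sd1's SkewedInfo: ["a", "b"] and ["x"]. Each list is registered with the SD as soon as
// it is created, which is why no finalization step is needed for the last list.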
// We are skipping the SKEWED_STRING_LIST table here, as it seems to be totally useless.
queryText =
"select \"SKEWED_COL_VALUE_LOC_MAP\".\"SD_ID\","
+ " \"SKEWED_STRING_LIST_VALUES\".STRING_LIST_ID,"
+ " \"SKEWED_COL_VALUE_LOC_MAP\".\"LOCATION\","
+ " \"SKEWED_STRING_LIST_VALUES\".\"STRING_LIST_VALUE\" "
+ "from \"SKEWED_COL_VALUE_LOC_MAP\""
+ " left outer join \"SKEWED_STRING_LIST_VALUES\" on \"SKEWED_COL_VALUE_LOC_MAP\"."
+ "\"STRING_LIST_ID_KID\" = \"SKEWED_STRING_LIST_VALUES\".\"STRING_LIST_ID\" "
+ "where \"SKEWED_COL_VALUE_LOC_MAP\".\"SD_ID\" in (" + sdIds + ")"
+ " and \"SKEWED_COL_VALUE_LOC_MAP\".\"STRING_LIST_ID_KID\" is not null "
+ "order by \"SKEWED_COL_VALUE_LOC_MAP\".\"SD_ID\" asc,"
+ " \"SKEWED_STRING_LIST_VALUES\".\"STRING_LIST_ID\" asc,"
+ " \"SKEWED_STRING_LIST_VALUES\".\"INTEGER_IDX\" asc";
loopJoinOrderedResult(sds, queryText, 0, new ApplyFunc<StorageDescriptor>() {
private Long currentListId;
private List<String> currentList;
@Override
public void apply(StorageDescriptor t, Object[] fields) throws MetaException {
if (!t.isSetSkewedInfo()) {
SkewedInfo skewedInfo = new SkewedInfo();
skewedInfo.setSkewedColValueLocationMaps(new HashMap<List<String>, String>());
t.setSkewedInfo(skewedInfo);
}
Map<List<String>, String> skewMap = t.getSkewedInfo().getSkewedColValueLocationMaps();
// Note that this is not a typical list accumulator - there's no call to finalize
// the last list. Instead we add list to SD first, as well as locally to add elements.
if (fields[1] == null) {
currentList = new ArrayList<String>(); // left outer join produced a list with no values
currentListId = null;
} else {
long fieldsListId = extractSqlLong(fields[1]);
if (currentListId == null || fieldsListId != currentListId) {
currentList = new ArrayList<String>();
currentListId = fieldsListId;
} else {
skewMap.remove(currentList); // value based compare.. remove first
}
currentList.add((String)fields[3]);
}
skewMap.put(currentList, (String)fields[2]);
}});
} // if (hasSkewedColumns)
// Get FieldSchema stuff if any.
if (!colss.isEmpty()) {
// We are skipping the CDS table here, as it seems to be totally useless.
queryText = "select \"CD_ID\", \"COMMENT\", \"COLUMN_NAME\", \"TYPE_NAME\""
+ " from \"COLUMNS_V2\" where \"CD_ID\" in (" + colIds + ") and \"INTEGER_IDX\" >= 0"
+ " order by \"CD_ID\" asc, \"INTEGER_IDX\" asc";
loopJoinOrderedResult(colss, queryText, 0, new ApplyFunc<List<FieldSchema>>() {
@Override
public void apply(List<FieldSchema> t, Object[] fields) {
t.add(new FieldSchema((String)fields[2], (String)fields[3], (String)fields[1]));
}});
}
// Finally, get all the stuff for serdes - just the params.
queryText = "select \"SERDE_ID\", \"PARAM_KEY\", \"PARAM_VALUE\" from \"SERDE_PARAMS\""
+ " where \"SERDE_ID\" in (" + serdeIds + ") and \"PARAM_KEY\" is not null"
+ " order by \"SERDE_ID\" asc";
loopJoinOrderedResult(serdes, queryText, 0, new ApplyFunc<SerDeInfo>() {
@Override
public void apply(SerDeInfo t, Object[] fields) {
t.putToParameters((String)fields[1], (String)fields[2]);
}});
return orderedResult;
}
private void timingTrace(boolean doTrace, String queryText, long start, long queryTime) {
if (!doTrace) return;
LOG.debug("Direct SQL query in " + (queryTime - start) / 1000000.0 + "ms + " +
(System.nanoTime() - queryTime) / 1000000.0 + "ms, the query is [" + queryText + "]");
}
static Long extractSqlLong(Object obj) throws MetaException {
if (obj == null) return null;
if (!(obj instanceof Number)) {
throw new MetaException("Expected numeric type but got " + obj.getClass().getName());
}
return ((Number)obj).longValue();
}
private static Boolean extractSqlBoolean(Object value) throws MetaException {
// MySQL has booleans, but e.g. Derby uses 'Y'/'N' mapping. People using derby probably
// don't care about performance anyway, but let's cover the common case.
if (value == null) return null;
if (value instanceof Boolean) return (Boolean)value;
Character c = null;
if (value instanceof String && ((String)value).length() == 1) {
c = ((String)value).charAt(0);
}
if (c == null) return null;
if (c == 'Y') return true;
if (c == 'N') return false;
throw new MetaException("Cannot extract boolean from column value " + value);
}
private int extractSqlInt(Object field) {
return ((Number)field).intValue();
}
private String extractSqlString(Object value) {
if (value == null) return null;
return value.toString();
}
static Double extractSqlDouble(Object obj) throws MetaException {
if (obj == null)
return null;
if (!(obj instanceof Number)) {
throw new MetaException("Expected numeric type but got " + obj.getClass().getName());
}
return ((Number) obj).doubleValue();
}
private static String trimCommaList(StringBuilder sb) {
if (sb.length() > 0) {
sb.setLength(sb.length() - 1);
}
return sb.toString();
}
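// Illustrative example (not in the original source):
//   trimCommaList(new StringBuilder("1,2,3,")) returns "1,2,3"
//   trimCommaList(new StringBuilder(""))       returns ""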
private abstract class ApplyFunc<Target> {
public abstract void apply(Target t, Object[] fields) throws MetaException;
}
/**
* Merges the result of a PM SQL query into a tree of objects.
* Essentially it's an object join. DN could do this for us, but it issues queries
* separately for every object, which is suboptimal.
* @param tree The object tree, by ID.
* @param queryText The query text.
* @param keyIndex Index of the Long column corresponding to the map ID in query result rows.
* @param func The function that is called on each (object,row) pair with the same id.
* @return the count of results returned from the query.
*/
private <T> int loopJoinOrderedResult(TreeMap<Long, T> tree,
String queryText, int keyIndex, ApplyFunc<T> func) throws MetaException {
boolean doTrace = LOG.isDebugEnabled();
long start = doTrace ? System.nanoTime() : 0;
Query query = pm.newQuery("javax.jdo.query.SQL", queryText);
Object result = query.execute();
long queryTime = doTrace ? System.nanoTime() : 0;
if (result == null) {
query.closeAll();
return 0;
}
List<Object[]> list = ensureList(result);
Iterator<Object[]> iter = list.iterator();
Object[] fields = null;
for (Map.Entry<Long, T> entry : tree.entrySet()) {
if (fields == null && !iter.hasNext()) break;
long id = entry.getKey();
while (fields != null || iter.hasNext()) {
if (fields == null) {
fields = iter.next();
}
long nestedId = extractSqlLong(fields[keyIndex]);
if (nestedId < id) throw new MetaException("Found entries for unknown ID " + nestedId);
if (nestedId > id) break; // fields belong to one of the next entries
func.apply(entry.getValue(), fields);
fields = null;
}
Deadline.checkTimeout();
}
int rv = list.size();
query.closeAll();
timingTrace(doTrace, queryText, start, queryTime);
return rv;
}
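// Illustrative example (not in the original source): given tree = {1: p1, 3: p3} and query
// rows [(1, a), (1, b), (3, c)] ordered by id, the merge above calls func.apply(p1, (1, a)),
// func.apply(p1, (1, b)) and func.apply(p3, (3, c)); a row with id 2 would throw, since an
// entry for an unknown ID indicates an inconsistency.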
private static class PartitionFilterGenerator extends TreeVisitor {
private final Table table;
private final FilterBuilder filterBuffer;
private final List<Object> params;
private final List<String> joins;
private final boolean dbHasJoinCastBug;
private PartitionFilterGenerator(
Table table, List<Object> params, List<String> joins, boolean dbHasJoinCastBug) {
this.table = table;
this.params = params;
this.joins = joins;
this.dbHasJoinCastBug = dbHasJoinCastBug;
this.filterBuffer = new FilterBuilder(false);
}
/**
* Generate the ANSI SQL92 filter for the given expression tree
* @param table the table being queried
* @param tree the expression tree from which the filter is derived
* @param params the ordered parameters for the resulting expression
* @param joins the joins necessary for the resulting expression
* @param dbHasJoinCastBug whether the underlying DB needs the join-cast workaround (Derby, Oracle)
* @return the string representation of the expression tree
*/
private static String generateSqlFilter(Table table, ExpressionTree tree,
List<Object> params, List<String> joins, boolean dbHasJoinCastBug) throws MetaException {
assert table != null;
if (tree.getRoot() == null) {
return "";
}
PartitionFilterGenerator visitor = new PartitionFilterGenerator(
table, params, joins, dbHasJoinCastBug);
tree.accept(visitor);
if (visitor.filterBuffer.hasError()) {
LOG.info("Unable to push down SQL filter: " + visitor.filterBuffer.getErrorMessage());
return null;
}
// Some joins might be null (see processNode for LeafNode), clean them up.
for (int i = 0; i < joins.size(); ++i) {
if (joins.get(i) != null) continue;
joins.remove(i--);
}
return "(" + visitor.filterBuffer.getFilter() + ")";
}
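// Illustrative example (not in the original source): for a filter like ds = '2014-01-01' on
// the first partition column of a string-keyed table, the visitor below produces roughly
//   joins:  inner join "PARTITION_KEY_VALS" "FILTER0" on "FILTER0"."PART_ID" =
//           "PARTITIONS"."PART_ID" and "FILTER0"."INTEGER_IDX" = 0
//   filter: (("FILTER0"."PART_KEY_VAL" = ?))   with params = ['2014-01-01']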
@Override
protected void beginTreeNode(TreeNode node) throws MetaException {
filterBuffer.append(" (");
}
@Override
protected void midTreeNode(TreeNode node) throws MetaException {
filterBuffer.append((node.getAndOr() == LogicalOperator.AND) ? " and " : " or ");
}
@Override
protected void endTreeNode(TreeNode node) throws MetaException {
filterBuffer.append(") ");
}
@Override
protected boolean shouldStop() {
return filterBuffer.hasError();
}
private static enum FilterType {
Integral,
String,
Date,
Invalid;
static FilterType fromType(String colTypeStr) {
if (colTypeStr.equals(serdeConstants.STRING_TYPE_NAME)) {
return FilterType.String;
} else if (colTypeStr.equals(serdeConstants.DATE_TYPE_NAME)) {
return FilterType.Date;
} else if (serdeConstants.IntegralTypes.contains(colTypeStr)) {
return FilterType.Integral;
}
return FilterType.Invalid;
}
public static FilterType fromClass(Object value) {
if (value instanceof String) {
return FilterType.String;
} else if (value instanceof Long) {
return FilterType.Integral;
} else if (value instanceof java.sql.Date) {
return FilterType.Date;
}
return FilterType.Invalid;
}
}
@Override
public void visit(LeafNode node) throws MetaException {
if (node.operator == Operator.LIKE) {
filterBuffer.setError("LIKE is not supported for SQL filter pushdown");
return;
}
int partColCount = table.getPartitionKeys().size();
int partColIndex = node.getPartColIndexForFilter(table, filterBuffer);
if (filterBuffer.hasError()) return;
// We skipped 'like', other ops should all work as long as the types are right.
String colTypeStr = table.getPartitionKeys().get(partColIndex).getType();
FilterType colType = FilterType.fromType(colTypeStr);
if (colType == FilterType.Invalid) {
filterBuffer.setError("Filter pushdown not supported for type " + colTypeStr);
return;
}
FilterType valType = FilterType.fromClass(node.value);
Object nodeValue = node.value;
if (valType == FilterType.Invalid) {
filterBuffer.setError("Filter pushdown not supported for value " + node.value.getClass());
return;
}
// TODO: if Filter.g does date parsing for quoted strings, we'd need to verify there's no
// type mismatch when string col is filtered by a string that looks like date.
if (colType == FilterType.Date && valType == FilterType.String) {
// TODO: Filter.g cannot parse a quoted date; try to parse date here too.
try {
nodeValue = new java.sql.Date(
HiveMetaStore.PARTITION_DATE_FORMAT.get().parse((String)nodeValue).getTime());
valType = FilterType.Date;
} catch (ParseException pe) { // do nothing, handled below - types will mismatch
}
}
if (colType != valType) {
// It's not clear how filtering for e.g. "stringCol > 5" should work (which side is
// to be coerced?). Let the expression evaluation sort this one out, not metastore.
filterBuffer.setError("Cannot push down filter for "
+ colTypeStr + " column and value " + nodeValue.getClass());
return;
}
if (joins.isEmpty()) {
// There's a fixed number of partition cols that we might have filters on. To avoid
// joining multiple times for one column (if there are several filters on it), we will
// keep numCols elements in the list, one for each column; we will fill it with nulls,
// put each join at a corresponding index when necessary, and remove nulls in the end.
for (int i = 0; i < partColCount; ++i) {
joins.add(null);
}
}
if (joins.get(partColIndex) == null) {
joins.set(partColIndex, "inner join \"PARTITION_KEY_VALS\" \"FILTER" + partColIndex
+ "\" on \"FILTER" + partColIndex + "\".\"PART_ID\" = \"PARTITIONS\".\"PART_ID\""
+ " and \"FILTER" + partColIndex + "\".\"INTEGER_IDX\" = " + partColIndex);
}
// Build the filter and add parameters linearly; we are traversing leaf nodes LTR.
String tableValue = "\"FILTER" + partColIndex + "\".\"PART_KEY_VAL\"";
if (node.isReverseOrder) {
params.add(nodeValue);
}
if (colType != FilterType.String) {
// The underlying database field is varchar; we need to compare as numbers.
// Note that this won't work with __HIVE_DEFAULT_PARTITION__. It will fail and fall
// back to JDO. That is by design; we could add an ugly workaround here but didn't.
if (colType == FilterType.Integral) {
tableValue = "cast(" + tableValue + " as decimal(21,0))";
} else if (colType == FilterType.Date) {
tableValue = "cast(" + tableValue + " as date)";
}
if (dbHasJoinCastBug) {
// This is a workaround for DERBY-6358 and Oracle bug; it is pretty horrible.
tableValue = "(case when \"TBLS\".\"TBL_NAME\" = ? and \"DBS\".\"NAME\" = ? and "
+ "\"FILTER" + partColIndex + "\".\"PART_ID\" = \"PARTITIONS\".\"PART_ID\" and "
+ "\"FILTER" + partColIndex + "\".\"INTEGER_IDX\" = " + partColIndex + " then "
+ tableValue + " else null end)";
params.add(table.getTableName().toLowerCase());
params.add(table.getDbName().toLowerCase());
}
}
if (!node.isReverseOrder) {
params.add(nodeValue);
}
filterBuffer.append(node.isReverseOrder
? "(? " + node.operator.getSqlOp() + " " + tableValue + ")"
: "(" + tableValue + " " + node.operator.getSqlOp() + " ?)");
}
}
public ColumnStatistics getTableStats(
String dbName, String tableName, List<String> colNames) throws MetaException {
if (colNames.isEmpty()) {
return null;
}
boolean doTrace = LOG.isDebugEnabled();
long start = doTrace ? System.nanoTime() : 0;
String queryText = "select " + STATS_COLLIST + " from \"TAB_COL_STATS\" "
+ " where \"DB_NAME\" = ? and \"TABLE_NAME\" = ? and \"COLUMN_NAME\" in ("
+ makeParams(colNames.size()) + ")";
Query query = pm.newQuery("javax.jdo.query.SQL", queryText);
Object[] params = new Object[colNames.size() + 2];
params[0] = dbName;
params[1] = tableName;
for (int i = 0; i < colNames.size(); ++i) {
params[i + 2] = colNames.get(i);
}
Object qResult = executeWithArray(query, params, queryText);
long queryTime = doTrace ? System.nanoTime() : 0;
if (qResult == null) {
query.closeAll();
return null;
}
List<Object[]> list = ensureList(qResult);
if (list.isEmpty()) return null;
ColumnStatisticsDesc csd = new ColumnStatisticsDesc(true, dbName, tableName);
ColumnStatistics result = makeColumnStats(list, csd, 0);
timingTrace(doTrace, queryText, start, queryTime);
query.closeAll();
return result;
}
public AggrStats aggrColStatsForPartitions(String dbName, String tableName,
List<String> partNames, List<String> colNames, boolean useDensityFunctionForNDVEstimation)
throws MetaException {
if (colNames.isEmpty() || partNames.isEmpty()) return new AggrStats(); // Nothing to aggregate.
long partsFound = partsFoundForPartitions(dbName, tableName, partNames, colNames);
List<ColumnStatisticsObj> colStatsList;
// Try to read from the cache first
if (isAggregateStatsCacheEnabled) {
AggrColStats colStatsAggrCached;
List<ColumnStatisticsObj> colStatsAggrFromDB;
int maxPartsPerCacheNode = aggrStatsCache.getMaxPartsPerCacheNode();
float fpp = aggrStatsCache.getFalsePositiveProbability();
int partitionsRequested = partNames.size();
if (partitionsRequested > maxPartsPerCacheNode) {
colStatsList =
columnStatisticsObjForPartitions(dbName, tableName, partNames, colNames, partsFound,
useDensityFunctionForNDVEstimation);
} else {
colStatsList = new ArrayList<ColumnStatisticsObj>();
// Bloom filter for the new node that we will eventually add to the cache
BloomFilter bloomFilter = createPartsBloomFilter(maxPartsPerCacheNode, fpp, partNames);
for (String colName : colNames) {
// Check the cache first
colStatsAggrCached = aggrStatsCache.get(dbName, tableName, colName, partNames);
if (colStatsAggrCached != null) {
colStatsList.add(colStatsAggrCached.getColStats());
} else {
List<String> colNamesForDB = new ArrayList<String>();
colNamesForDB.add(colName);
// Read aggregated stats for one column
colStatsAggrFromDB =
columnStatisticsObjForPartitions(dbName, tableName, partNames, colNamesForDB,
partsFound, useDensityFunctionForNDVEstimation);
if (!colStatsAggrFromDB.isEmpty()) {
ColumnStatisticsObj colStatsAggr = colStatsAggrFromDB.get(0);
colStatsList.add(colStatsAggr);
// Update the cache to add this new aggregate node
aggrStatsCache.add(dbName, tableName, colName, partsFound, colStatsAggr, bloomFilter);
}
}
}
}
} else {
colStatsList =
columnStatisticsObjForPartitions(dbName, tableName, partNames, colNames, partsFound,
useDensityFunctionForNDVEstimation);
}
LOG.info("useDensityFunctionForNDVEstimation = " + useDensityFunctionForNDVEstimation
+ "\npartsFound = " + partsFound + "\nColumnStatisticsObj = "
+ Arrays.toString(colStatsList.toArray()));
return new AggrStats(colStatsList, partsFound);
}
private BloomFilter createPartsBloomFilter(int maxPartsPerCacheNode, float fpp,
List<String> partNames) {
BloomFilter bloomFilter = new BloomFilter(maxPartsPerCacheNode, fpp);
for (String partName : partNames) {
bloomFilter.add(partName.getBytes());
}
return bloomFilter;
}
private long partsFoundForPartitions(String dbName, String tableName,
List<String> partNames, List<String> colNames) throws MetaException {
assert !colNames.isEmpty() && !partNames.isEmpty();
long partsFound = 0;
boolean doTrace = LOG.isDebugEnabled();
String queryText = "select count(\"COLUMN_NAME\") from \"PART_COL_STATS\""
+ " where \"DB_NAME\" = ? and \"TABLE_NAME\" = ? "
+ " and \"COLUMN_NAME\" in (" + makeParams(colNames.size()) + ")"
+ " and \"PARTITION_NAME\" in (" + makeParams(partNames.size()) + ")"
+ " group by \"PARTITION_NAME\"";
long start = doTrace ? System.nanoTime() : 0;
Query query = pm.newQuery("javax.jdo.query.SQL", queryText);
Object qResult = executeWithArray(query, prepareParams(
dbName, tableName, partNames, colNames), queryText);
long end = doTrace ? System.nanoTime() : 0;
timingTrace(doTrace, queryText, start, end);
ForwardQueryResult fqr = (ForwardQueryResult) qResult;
Iterator<?> iter = fqr.iterator();
while (iter.hasNext()) {
if (extractSqlLong(iter.next()) == colNames.size()) {
partsFound++;
}
}
return partsFound;
}
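// Illustrative example (not in the original source): in partsFoundForPartitions above, with
// colNames = [c1, c2], the group-by yields one stats-row count per partition; a partition
// counts as "found" only when its count equals colNames.size(), i.e. both columns have stats.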
private List<ColumnStatisticsObj> columnStatisticsObjForPartitions(String dbName,
String tableName, List<String> partNames, List<String> colNames, long partsFound,
boolean useDensityFunctionForNDVEstimation) throws MetaException {
// TODO: all the extrapolation logic should be moved out of this class,
// only mechanical data retrieval should remain here.
String commonPrefix = "select \"COLUMN_NAME\", \"COLUMN_TYPE\", "
+ "min(\"LONG_LOW_VALUE\"), max(\"LONG_HIGH_VALUE\"), min(\"DOUBLE_LOW_VALUE\"), max(\"DOUBLE_HIGH_VALUE\"), "
+ "min(cast(\"BIG_DECIMAL_LOW_VALUE\" as decimal)), max(cast(\"BIG_DECIMAL_HIGH_VALUE\" as decimal)), "
+ "sum(\"NUM_NULLS\"), max(\"NUM_DISTINCTS\"), "
+ "max(\"AVG_COL_LEN\"), max(\"MAX_COL_LEN\"), sum(\"NUM_TRUES\"), sum(\"NUM_FALSES\"), "
// The following data is used to compute a partitioned table's NDV based
// on partitions' NDV when useDensityFunctionForNDVEstimation = true. Global NDVs cannot be
// accurately derived from partition NDVs, because the domains of column values in two
// partitions can overlap. If there is no overlap, then the global NDV is just the sum
// of partition NDVs (UpperBound). But if there is some overlap, then the global NDV can be
// anywhere between the sum of partition NDVs (no overlap) and the largest single partition
// NDV (the domain of column values in every other partition is a subset of the domain in
// one partition) (LowerBound). Under a uniform distribution, we can roughly estimate the
// global NDV by leveraging the min/max values.
// We also guarantee that the estimate makes sense by comparing it to the
// UpperBound (calculated by "sum(\"NUM_DISTINCTS\")")
// and LowerBound (calculated by "max(\"NUM_DISTINCTS\")")
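// Illustrative example (not in the original source): two partitions with NDVs 100 and 150
// give LowerBound = max(100, 150) = 150 and UpperBound = 100 + 150 = 250; any density-based
// estimate of the global NDV is only considered sensible if it falls within [150, 250].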
+ "avg((\"LONG_HIGH_VALUE\"-\"LONG_LOW_VALUE\")/cast(\"NUM_DISTINCTS\" as decimal)),"
+ "avg((\"DOUBLE_HIGH_VALUE\"-\"DOUBLE_LOW_VALUE\")/\"NUM_DISTINCTS\"),"
+ "avg((cast(\"BIG_DECIMAL_HIGH_VALUE\" as decimal)-cast(\"BIG_DECIMAL_LOW_VALUE\" as decimal))/\"NUM_DISTINCTS\"),"
+ "sum(\"NUM_DISTINCTS\")" + " from \"PART_COL_STATS\""
+ " where \"DB_NAME\" = ? and \"TABLE_NAME\" = ? ";
String queryText = null;
long start = 0;
long end = 0;
Query query = null;
boolean doTrace = LOG.isDebugEnabled();
Object qResult = null;
ForwardQueryResult fqr = null;
// Check if stats for all the columns of all the partitions exist.
// If so, extrapolation is not needed.
if (partsFound == partNames.size()) {
queryText = commonPrefix + " and \"COLUMN_NAME\" in (" + makeParams(colNames.size()) + ")"
+ " and \"PARTITION_NAME\" in (" + makeParams(partNames.size()) + ")"
+ " group by \"COLUMN_NAME\", \"COLUMN_TYPE\"";
start = doTrace ? System.nanoTime() : 0;
query = pm.newQuery("javax.jdo.query.SQL", queryText);
qResult = executeWithArray(query, prepareParams(dbName, tableName, partNames, colNames),
queryText);
if (qResult == null) {
query.closeAll();
return Lists.newArrayList();
}
end = doTrace ? System.nanoTime() : 0;
timingTrace(doTrace, queryText, start, end);
List<Object[]> list = ensureList(qResult);
List<ColumnStatisticsObj> colStats = new ArrayList<ColumnStatisticsObj>(list.size());
for (Object[] row : list) {
colStats.add(prepareCSObjWithAdjustedNDV(row, 0, useDensityFunctionForNDVEstimation));
Deadline.checkTimeout();
}
query.closeAll();
return colStats;
} else {
// Extrapolation is needed for some columns.
// In this case, at least one column's stats for some partition are missing.
// We need to extrapolate those based on the partitions that do have stats.
List<ColumnStatisticsObj> colStats = new ArrayList<ColumnStatisticsObj>(colNames.size());
queryText = "select \"COLUMN_NAME\", \"COLUMN_TYPE\", count(\"PARTITION_NAME\") "
+ " from \"PART_COL_STATS\"" + " where \"DB_NAME\" = ? and \"TABLE_NAME\" = ? "
+ " and \"COLUMN_NAME\" in (" + makeParams(colNames.size()) + ")"
+ " and \"PARTITION_NAME\" in (" + makeParams(partNames.size()) + ")"
+ " group by \"COLUMN_NAME\", \"COLUMN_TYPE\"";
start = doTrace ? System.nanoTime() : 0;
query = pm.newQuery("javax.jdo.query.SQL", queryText);
qResult = executeWithArray(query, prepareParams(dbName, tableName, partNames, colNames),
queryText);
end = doTrace ? System.nanoTime() : 0;
timingTrace(doTrace, queryText, start, end);
if (qResult == null) {
query.closeAll();
return Lists.newArrayList();
}
List<String> noExtraColumnNames = new ArrayList<String>();
Map<String, String[]> extraColumnNameTypeParts = new HashMap<String, String[]>();
List<Object[]> list = ensureList(qResult);
for (Object[] row : list) {
String colName = (String) row[0];
String colType = (String) row[1];
// Extrapolation is not needed for this column if
// count(\"PARTITION_NAME\")==partNames.size()
// Or, extrapolation is not possible for this column if
// count(\"PARTITION_NAME\")<2
Long count = extractSqlLong(row[2]);
if (count == partNames.size() || count < 2) {
noExtraColumnNames.add(colName);
} else {
extraColumnNameTypeParts.put(colName, new String[] { colType, String.valueOf(count) });
}
Deadline.checkTimeout();
}
query.closeAll();
// Extrapolation is not needed for the columns in noExtraColumnNames.
if (noExtraColumnNames.size() != 0) {
queryText = commonPrefix + " and \"COLUMN_NAME\" in ("
+ makeParams(noExtraColumnNames.size()) + ")" + " and \"PARTITION_NAME\" in ("
+ makeParams(partNames.size()) + ")" + " group by \"COLUMN_NAME\", \"COLUMN_TYPE\"";
start = doTrace ? System.nanoTime() : 0;
query = pm.newQuery("javax.jdo.query.SQL", queryText);
qResult = executeWithArray(query,
prepareParams(dbName, tableName, partNames, noExtraColumnNames), queryText);
if (qResult == null) {
query.closeAll();
return Lists.newArrayList();
}
list = ensureList(qResult);
for (Object[] row : list) {
colStats.add(prepareCSObjWithAdjustedNDV(row, 0, useDensityFunctionForNDVEstimation));
Deadline.checkTimeout();
}
end = doTrace ? System.nanoTime() : 0;
timingTrace(doTrace, queryText, start, end);
query.closeAll();
}
// Extrapolation is needed for the remaining columns.
// Give a sequence number to all the partitions.
if (extraColumnNameTypeParts.size() != 0) {
Map<String, Integer> indexMap = new HashMap<String, Integer>();
for (int index = 0; index < partNames.size(); index++) {
indexMap.put(partNames.get(index), index);
}
// get sum for all columns to reduce the number of queries
Map<String, Map<Integer, Object>> sumMap = new HashMap<String, Map<Integer, Object>>();
queryText = "select \"COLUMN_NAME\", sum(\"NUM_NULLS\"), sum(\"NUM_TRUES\"), sum(\"NUM_FALSES\"), sum(\"NUM_DISTINCTS\")"
+ " from \"PART_COL_STATS\""
+ " where \"DB_NAME\" = ? and \"TABLE_NAME\" = ? "
+ " and \"COLUMN_NAME\" in ("
+ makeParams(extraColumnNameTypeParts.size())
+ ")"
+ " and \"PARTITION_NAME\" in ("
+ makeParams(partNames.size())
+ ")"
+ " group by \"COLUMN_NAME\"";
start = doTrace ? System.nanoTime() : 0;
query = pm.newQuery("javax.jdo.query.SQL", queryText);
List<String> extraColumnNames = new ArrayList<String>();
extraColumnNames.addAll(extraColumnNameTypeParts.keySet());
qResult = executeWithArray(query,
prepareParams(dbName, tableName, partNames, extraColumnNames), queryText);
if (qResult == null) {
query.closeAll();
return Lists.newArrayList();
}
list = ensureList(qResult);
// see the indexes for colstats in IExtrapolatePartStatus
Integer[] sumIndex = new Integer[] { 6, 10, 11, 15 };
for (Object[] row : list) {
Map<Integer, Object> indexToObject = new HashMap<Integer, Object>();
for (int ind = 1; ind < row.length; ind++) {
indexToObject.put(sumIndex[ind - 1], row[ind]);
}
// row[0] is the column name
sumMap.put((String) row[0], indexToObject);
Deadline.checkTimeout();
}
end = doTrace ? System.nanoTime() : 0;
timingTrace(doTrace, queryText, start, end);
query.closeAll();
for (Map.Entry<String, String[]> entry : extraColumnNameTypeParts.entrySet()) {
Object[] row = new Object[IExtrapolatePartStatus.colStatNames.length + 2];
String colName = entry.getKey();
String colType = entry.getValue()[0];
Long sumVal = Long.parseLong(entry.getValue()[1]);
// fill in colname
row[0] = colName;
// fill in coltype
row[1] = colType;
// Use linear extrapolation; more sophisticated methods can be added in the
// future.
IExtrapolatePartStatus extrapolateMethod = new LinearExtrapolatePartStatus();
// fill in colstatus
Integer[] index = null;
boolean decimal = false;
if (colType.toLowerCase().startsWith("decimal")) {
index = IExtrapolatePartStatus.indexMaps.get("decimal");
decimal = true;
} else {
index = IExtrapolatePartStatus.indexMaps.get(colType.toLowerCase());
}
// If the colType is not a known type (long, double, etc.), then get
// all the indexes.
if (index == null) {
index = IExtrapolatePartStatus.indexMaps.get("default");
}
for (int colStatIndex : index) {
String colStatName = IExtrapolatePartStatus.colStatNames[colStatIndex];
// if the aggregation type is sum, we do a scale-up
if (IExtrapolatePartStatus.aggrTypes[colStatIndex] == IExtrapolatePartStatus.AggrType.Sum) {
Object o = sumMap.get(colName).get(colStatIndex);
if (o == null) {
row[2 + colStatIndex] = null;
} else {
Long val = extractSqlLong(o);
row[2 + colStatIndex] = (Long) (val / sumVal * (partNames.size()));
}
} else if (IExtrapolatePartStatus.aggrTypes[colStatIndex] == IExtrapolatePartStatus.AggrType.Min
|| IExtrapolatePartStatus.aggrTypes[colStatIndex] == IExtrapolatePartStatus.AggrType.Max) {
// if the aggregation type is min/max, we extrapolate from the
// left/right borders
if (!decimal) {
queryText = "select \"" + colStatName
+ "\",\"PARTITION_NAME\" from \"PART_COL_STATS\""
+ " where \"DB_NAME\" = ? and \"TABLE_NAME\" = ?" + " and \"COLUMN_NAME\" = ?"
+ " and \"PARTITION_NAME\" in (" + makeParams(partNames.size()) + ")"
+ " order by \"" + colStatName + "\"";
} else {
queryText = "select \"" + colStatName
+ "\",\"PARTITION_NAME\" from \"PART_COL_STATS\""
+ " where \"DB_NAME\" = ? and \"TABLE_NAME\" = ?" + " and \"COLUMN_NAME\" = ?"
+ " and \"PARTITION_NAME\" in (" + makeParams(partNames.size()) + ")"
+ " order by cast(\"" + colStatName + "\" as decimal)";
}
start = doTrace ? System.nanoTime() : 0;
query = pm.newQuery("javax.jdo.query.SQL", queryText);
qResult = executeWithArray(query,
prepareParams(dbName, tableName, partNames, Arrays.asList(colName)), queryText);
if (qResult == null) {
query.closeAll();
return Lists.newArrayList();
}
fqr = (ForwardQueryResult) qResult;
Object[] min = (Object[]) (fqr.get(0));
Object[] max = (Object[]) (fqr.get(fqr.size() - 1));
end = doTrace ? System.nanoTime() : 0;
timingTrace(doTrace, queryText, start, end);
query.closeAll();
if (min[0] == null || max[0] == null) {
row[2 + colStatIndex] = null;
} else {
row[2 + colStatIndex] = extrapolateMethod.extrapolate(min, max, colStatIndex,
indexMap);
}
} else {
// if the aggregation type is avg, we use the average on the
// existing ones.
queryText = "select "
+ "avg((\"LONG_HIGH_VALUE\"-\"LONG_LOW_VALUE\")/cast(\"NUM_DISTINCTS\" as decimal)),"
+ "avg((\"DOUBLE_HIGH_VALUE\"-\"DOUBLE_LOW_VALUE\")/\"NUM_DISTINCTS\"),"
+ "avg((cast(\"BIG_DECIMAL_HIGH_VALUE\" as decimal)-cast(\"BIG_DECIMAL_LOW_VALUE\" as decimal))/\"NUM_DISTINCTS\")"
+ " from \"PART_COL_STATS\"" + " where \"DB_NAME\" = ? and \"TABLE_NAME\" = ?"
+ " and \"COLUMN_NAME\" = ?" + " and \"PARTITION_NAME\" in ("
+ makeParams(partNames.size()) + ")" + " group by \"COLUMN_NAME\"";
start = doTrace ? System.nanoTime() : 0;
query = pm.newQuery("javax.jdo.query.SQL", queryText);
qResult = executeWithArray(query,
prepareParams(dbName, tableName, partNames, Arrays.asList(colName)), queryText);
if (qResult == null) {
query.closeAll();
return Lists.newArrayList();
}
fqr = (ForwardQueryResult) qResult;
Object[] avg = (Object[]) (fqr.get(0));
// colStatIndex=12,13,14 respond to "AVG_LONG", "AVG_DOUBLE",
// "AVG_DECIMAL"
row[2 + colStatIndex] = avg[colStatIndex - 12];
end = doTrace ? System.nanoTime() : 0;
timingTrace(doTrace, queryText, start, end);
query.closeAll();
}
}
colStats.add(prepareCSObjWithAdjustedNDV(row, 0, useDensityFunctionForNDVEstimation));
Deadline.checkTimeout();
}
}
return colStats;
}
}
private ColumnStatisticsObj prepareCSObj(Object[] row, int i) throws MetaException {
ColumnStatisticsData data = new ColumnStatisticsData();
ColumnStatisticsObj cso = new ColumnStatisticsObj((String)row[i++], (String)row[i++], data);
Object llow = row[i++], lhigh = row[i++], dlow = row[i++], dhigh = row[i++],
declow = row[i++], dechigh = row[i++], nulls = row[i++], dist = row[i++],
avglen = row[i++], maxlen = row[i++], trues = row[i++], falses = row[i++];
StatObjectConverter.fillColumnStatisticsData(cso.getColType(), data,
llow, lhigh, dlow, dhigh, declow, dechigh, nulls, dist, avglen, maxlen, trues, falses);
return cso;
}
private ColumnStatisticsObj prepareCSObjWithAdjustedNDV(Object[] row, int i,
boolean useDensityFunctionForNDVEstimation) throws MetaException {
ColumnStatisticsData data = new ColumnStatisticsData();
ColumnStatisticsObj cso = new ColumnStatisticsObj((String) row[i++], (String) row[i++], data);
Object llow = row[i++], lhigh = row[i++], dlow = row[i++], dhigh = row[i++],
declow = row[i++], dechigh = row[i++], nulls = row[i++], dist = row[i++],
avglen = row[i++], maxlen = row[i++], trues = row[i++], falses = row[i++],
avgLong = row[i++], avgDouble = row[i++], avgDecimal = row[i++], sumDist = row[i++];
StatObjectConverter.fillColumnStatisticsData(cso.getColType(), data, llow, lhigh, dlow, dhigh,
declow, dechigh, nulls, dist, avglen, maxlen, trues, falses, avgLong, avgDouble,
avgDecimal, sumDist, useDensityFunctionForNDVEstimation);
return cso;
}
private Object[] prepareParams(String dbName, String tableName, List<String> partNames,
List<String> colNames) throws MetaException {
Object[] params = new Object[colNames.size() + partNames.size() + 2];
int paramI = 0;
params[paramI++] = dbName;
params[paramI++] = tableName;
for (String colName : colNames) {
params[paramI++] = colName;
}
for (String partName : partNames) {
params[paramI++] = partName;
}
return params;
}
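// Illustrative example (not in the original source): prepareParams("db", "tbl", [p1, p2], [c1])
// returns ["db", "tbl", "c1", "p1", "p2"], matching the ?-order of the stats queries above:
// DB_NAME, TABLE_NAME, then the COLUMN_NAME in (...) list, then the PARTITION_NAME in (...) list.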
public List<ColumnStatistics> getPartitionStats(String dbName, String tableName,
List<String> partNames, List<String> colNames) throws MetaException {
if (colNames.isEmpty() || partNames.isEmpty()) {
return Lists.newArrayList();
}
boolean doTrace = LOG.isDebugEnabled();
long start = doTrace ? System.nanoTime() : 0;
String queryText = "select \"PARTITION_NAME\", " + STATS_COLLIST + " from \"PART_COL_STATS\""
+ " where \"DB_NAME\" = ? and \"TABLE_NAME\" = ? and \"COLUMN_NAME\" in ("
+ makeParams(colNames.size()) + ") AND \"PARTITION_NAME\" in ("
+ makeParams(partNames.size()) + ") order by \"PARTITION_NAME\"";
Query query = pm.newQuery("javax.jdo.query.SQL", queryText);
Object qResult = executeWithArray(query, prepareParams(
dbName, tableName, partNames, colNames), queryText);
long queryTime = doTrace ? System.nanoTime() : 0;
if (qResult == null) {
query.closeAll();
return Lists.newArrayList();
}
List<Object[]> list = ensureList(qResult);
List<ColumnStatistics> result = new ArrayList<ColumnStatistics>(
Math.min(list.size(), partNames.size()));
String lastPartName = null;
int from = 0;
for (int i = 0; i <= list.size(); ++i) {
boolean isLast = i == list.size();
String partName = isLast ? null : (String)list.get(i)[0];
if (!isLast && partName.equals(lastPartName)) {
continue;
} else if (from != i) {
ColumnStatisticsDesc csd = new ColumnStatisticsDesc(false, dbName, tableName);
csd.setPartName(lastPartName);
result.add(makeColumnStats(list.subList(from, i), csd, 1));
}
lastPartName = partName;
from = i;
Deadline.checkTimeout();
}
timingTrace(doTrace, queryText, start, queryTime);
query.closeAll();
return result;
}
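// Illustrative example (not in the original source): because the rows in getPartitionStats
// above are ordered by PARTITION_NAME, the boundary scan turns rows
// [(p1, c1), (p1, c2), (p2, c1)] into two ColumnStatistics objects: one for p1 with two
// columns and one for p2 with one column.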
/** The common query part for table and partition stats */
private static final String STATS_COLLIST =
"\"COLUMN_NAME\", \"COLUMN_TYPE\", \"LONG_LOW_VALUE\", \"LONG_HIGH_VALUE\", "
+ "\"DOUBLE_LOW_VALUE\", \"DOUBLE_HIGH_VALUE\", \"BIG_DECIMAL_LOW_VALUE\", "
+ "\"BIG_DECIMAL_HIGH_VALUE\", \"NUM_NULLS\", \"NUM_DISTINCTS\", \"AVG_COL_LEN\", "
+ "\"MAX_COL_LEN\", \"NUM_TRUES\", \"NUM_FALSES\", \"LAST_ANALYZED\" ";
private ColumnStatistics makeColumnStats(
List<Object[]> list, ColumnStatisticsDesc csd, int offset) throws MetaException {
ColumnStatistics result = new ColumnStatistics();
result.setStatsDesc(csd);
List<ColumnStatisticsObj> csos = new ArrayList<ColumnStatisticsObj>(list.size());
for (Object[] row : list) {
// LastAnalyzed is stored per column, but thrift has it per set of columns;
// get the lowest for now, as nobody actually uses this field.
Object laObj = row[offset + 14];
if (laObj != null && (!csd.isSetLastAnalyzed() || csd.getLastAnalyzed() > extractSqlLong(laObj))) {
csd.setLastAnalyzed(extractSqlLong(laObj));
}
csos.add(prepareCSObj(row, offset));
Deadline.checkTimeout();
}
result.setStatsObj(csos);
return result;
}
@SuppressWarnings("unchecked")
private List<Object[]> ensureList(Object result) throws MetaException {
if (!(result instanceof List)) {
throw new MetaException("Wrong result type " + result.getClass());
}
return (List<Object[]>)result;
}
private String makeParams(int size) {
// W/ size 0, query will fail, but at least we'd get to see the query in debug output.
return (size == 0) ? "" : repeat(",?", size).substring(1);
}
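// Illustrative example (not in the original source): makeParams(3) returns "?,?,?", so
// "\"COLUMN_NAME\" in (" + makeParams(3) + ")" expands to "COLUMN_NAME" in (?,?,?).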
@SuppressWarnings("unchecked")
private <T> T executeWithArray(Query query, Object[] params, String sql) throws MetaException {
try {
return (T)((params == null) ? query.execute() : query.executeWithArray(params));
} catch (Exception ex) {
String error = "Failed to execute [" + sql + "] with parameters [";
if (params != null) {
boolean isFirst = true;
for (Object param : params) {
error += (isFirst ? "" : ", ") + param;
isFirst = false;
}
}
LOG.warn(error + "]", ex);
// We just logged an exception with (in case of JDO) a humongous callstack. Make a new one.
throw new MetaException("See previous errors; " + ex.getMessage());
}
}
}