All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.hadoop.hive.ql.metadata.HiveMetaStoreChecker Maven / Gradle / Ivy

Go to download

Hive is a data warehouse infrastructure built on top of Hadoop see http://wiki.apache.org/hadoop/Hive

There is a newer version: 0.11.0-shark-0.9.1
Show newest version
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hive.ql.metadata;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.metastore.MetaStoreUtils;
import org.apache.hadoop.hive.metastore.Warehouse;
import org.apache.hadoop.hive.metastore.api.MetaException;
import org.apache.hadoop.hive.metastore.api.NoSuchObjectException;
import org.apache.hadoop.hive.ql.metadata.CheckResult.PartitionResult;
import org.apache.thrift.TException;

/**
 * Verify that the information in the metastore matches what is on the
 * filesystem. Return a CheckResult object containing lists of missing and any
 * unexpected tables and partitions.
 */
public class HiveMetaStoreChecker {

  public static final Log LOG = LogFactory.getLog(HiveMetaStoreChecker.class);

  private final Hive hive;
  private final HiveConf conf;

  public HiveMetaStoreChecker(Hive hive) {
    super();
    this.hive = hive;
    conf = hive.getConf();
  }

  /**
   * Check the metastore for inconsistencies, data missing in either the
   * metastore or on the dfs.
   *
   * @param dbName
   *          name of the database, if not specified the default will be used.
   * @param tableName
   *          Table we want to run the check for. If null we'll check all the
   *          tables in the database.
   * @param partitions
   *          List of partition name value pairs, if null or empty check all
   *          partitions
   * @param result
   *          Fill this with the results of the check
   * @throws HiveException
   *           Failed to get required information from the metastore.
   * @throws IOException
   *           Most likely filesystem related
   */
  public void checkMetastore(String dbName, String tableName,
      List> partitions, CheckResult result)
      throws HiveException, IOException {

    if (dbName == null || "".equalsIgnoreCase(dbName)) {
      dbName = MetaStoreUtils.DEFAULT_DATABASE_NAME;
    }

    try {
      if (tableName == null || "".equals(tableName)) {
        // no table specified, check all tables and all partitions.
        List tables = hive.getTablesForDb(dbName, ".*");
        for (String currentTableName : tables) {
          checkTable(dbName, currentTableName, null, result);
        }

        findUnknownTables(dbName, tables, result);
      } else if (partitions == null || partitions.isEmpty()) {
        // only one table, let's check all partitions
        checkTable(dbName, tableName, null, result);
      } else {
        // check the specified partitions
        checkTable(dbName, tableName, partitions, result);
      }
      Collections.sort(result.getPartitionsNotInMs());
      Collections.sort(result.getPartitionsNotOnFs());
      Collections.sort(result.getTablesNotInMs());
      Collections.sort(result.getTablesNotOnFs());
    } catch (MetaException e) {
      throw new HiveException(e);
    } catch (TException e) {
      throw new HiveException(e);
    }
  }

  /**
   * Check for table directories that aren't in the metastore.
   *
   * @param dbName
   *          Name of the database
   * @param tables
   *          List of table names
   * @param result
   *          Add any found tables to this
   * @throws HiveException
   *           Failed to get required information from the metastore.
   * @throws IOException
   *           Most likely filesystem related
   * @throws MetaException
   *           Failed to get required information from the metastore.
   * @throws NoSuchObjectException
   *           Failed to get required information from the metastore.
   * @throws TException
   *           Thrift communication error.
   */
  void findUnknownTables(String dbName, List tables, CheckResult result)
      throws IOException, MetaException, TException, HiveException {

    Set dbPaths = new HashSet();
    Set tableNames = new HashSet(tables);

    for (String tableName : tables) {
      Table table = hive.getTable(dbName, tableName);
      // hack, instead figure out a way to get the db paths
      String isExternal = table.getParameters().get("EXTERNAL");
      if (isExternal == null || !"TRUE".equalsIgnoreCase(isExternal)) {
        dbPaths.add(table.getPath().getParent());
      }
    }

    for (Path dbPath : dbPaths) {
      FileSystem fs = dbPath.getFileSystem(conf);
      FileStatus[] statuses = fs.listStatus(dbPath);
      for (FileStatus status : statuses) {

        if (status.isDir() && !tableNames.contains(status.getPath().getName())) {

          result.getTablesNotInMs().add(status.getPath().getName());
        }
      }
    }
  }

  /**
   * Check the metastore for inconsistencies, data missing in either the
   * metastore or on the dfs.
   *
   * @param dbName
   *          Name of the database
   * @param tableName
   *          Name of the table
   * @param partitions
   *          Partitions to check, if null or empty get all the partitions.
   * @param result
   *          Result object
   * @throws HiveException
   *           Failed to get required information from the metastore.
   * @throws IOException
   *           Most likely filesystem related
   * @throws MetaException
   *           Failed to get required information from the metastore.
   */
  void checkTable(String dbName, String tableName,
      List> partitions, CheckResult result)
      throws MetaException, IOException, HiveException {

    Table table = null;

    try {
      table = hive.getTable(dbName, tableName);
    } catch (HiveException e) {
      result.getTablesNotInMs().add(tableName);
      return;
    }

    List parts = new ArrayList();
    boolean findUnknownPartitions = true;

    if (table.isPartitioned()) {
      if (partitions == null || partitions.isEmpty()) {
        // no partitions specified, let's get all
        parts = hive.getPartitions(table);
      } else {
        // we're interested in specific partitions,
        // don't check for any others
        findUnknownPartitions = false;
        for (Map map : partitions) {
          Partition part = hive.getPartition(table, map, false);
          if (part == null) {
            PartitionResult pr = new PartitionResult();
            pr.setTableName(tableName);
            pr.setPartitionName(Warehouse.makePartPath(map));
            result.getPartitionsNotInMs().add(pr);
          } else {
            parts.add(part);
          }
        }
      }
    }

    checkTable(table, parts, findUnknownPartitions, result);
  }

  /**
   * Check the metastore for inconsistencies, data missing in either the
   * metastore or on the dfs.
   *
   * @param table
   *          Table to check
   * @param parts
   *          Partitions to check
   * @param result
   *          Result object
   * @param findUnknownPartitions
   *          Should we try to find unknown partitions?
   * @throws IOException
   *           Could not get information from filesystem
   * @throws HiveException
   *           Could not create Partition object
   */
  void checkTable(Table table, List parts,
      boolean findUnknownPartitions, CheckResult result) throws IOException,
      HiveException {

    Path tablePath = table.getPath();
    FileSystem fs = tablePath.getFileSystem(conf);
    if (!fs.exists(tablePath)) {
      result.getTablesNotOnFs().add(table.getTableName());
      return;
    }

    Set partPaths = new HashSet();

    // check that the partition folders exist on disk
    for (Partition partition : parts) {
      if (partition == null) {
        // most likely the user specified an invalid partition
        continue;
      }
      Path partPath = partition.getPartitionPath();
      fs = partPath.getFileSystem(conf);
      if (!fs.exists(partPath)) {
        PartitionResult pr = new PartitionResult();
        pr.setPartitionName(partition.getName());
        pr.setTableName(partition.getTable().getTableName());
        result.getPartitionsNotOnFs().add(pr);
      }

      for (int i = 0; i < partition.getSpec().size(); i++) {
        partPaths.add(partPath.makeQualified(fs));
        partPath = partPath.getParent();
      }
    }

    if (findUnknownPartitions) {
      findUnknownPartitions(table, partPaths, result);
    }
  }

  /**
   * Find partitions on the fs that are unknown to the metastore.
   *
   * @param table
   *          Table where the partitions would be located
   * @param partPaths
   *          Paths of the partitions the ms knows about
   * @param result
   *          Result object
   * @throws IOException
   *           Thrown if we fail at fetching listings from the fs.
   */
  void findUnknownPartitions(Table table, Set partPaths,
      CheckResult result) throws IOException {

    Path tablePath = table.getPath();
    // now check the table folder and see if we find anything
    // that isn't in the metastore
    Set allPartDirs = new HashSet();
    getAllLeafDirs(tablePath, allPartDirs);
    // don't want the table dir
    allPartDirs.remove(tablePath);

    // remove the partition paths we know about
    allPartDirs.removeAll(partPaths);

    // we should now only have the unexpected folders left
    for (Path partPath : allPartDirs) {
      FileSystem fs = partPath.getFileSystem(conf);
      String partitionName = getPartitionName(fs.makeQualified(tablePath),
          partPath);

      if (partitionName != null) {
        PartitionResult pr = new PartitionResult();
        pr.setPartitionName(partitionName);
        pr.setTableName(table.getTableName());

        result.getPartitionsNotInMs().add(pr);
      }
    }
  }

  /**
   * Get the partition name from the path.
   *
   * @param tablePath
   *          Path of the table.
   * @param partitionPath
   *          Path of the partition.
   * @return Partition name, for example partitiondate=2008-01-01
   */
  private String getPartitionName(Path tablePath, Path partitionPath) {
    String result = null;
    Path currPath = partitionPath;
    while (currPath != null && !tablePath.equals(currPath)) {
      if (result == null) {
        result = currPath.getName();
      } else {
        result = currPath.getName() + Path.SEPARATOR + result;
      }

      currPath = currPath.getParent();
    }
    return result;
  }

  /**
   * Recursive method to get the leaf directories of a base path. Example:
   * base/dir1/dir2 base/dir3
   *
   * This will return dir2 and dir3 but not dir1.
   *
   * @param basePath
   *          Start directory
   * @param allDirs
   *          This set will contain the leaf paths at the end.
   * @throws IOException
   *           Thrown if we can't get lists from the fs.
   */

  private void getAllLeafDirs(Path basePath, Set allDirs)
      throws IOException {
    getAllLeafDirs(basePath, allDirs, basePath.getFileSystem(conf));
  }

  private void getAllLeafDirs(Path basePath, Set allDirs, FileSystem fs)
      throws IOException {

    FileStatus[] statuses = fs.listStatus(basePath);
    boolean directoryFound=false;

    for (FileStatus status : statuses) {
      if (status.isDir()) {
        directoryFound = true;
        getAllLeafDirs(status.getPath(), allDirs, fs);
      }
    }

    if(!directoryFound){
      allDirs.add(basePath);
    }
  }

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy