org.apache.hudi.org.apache.hadoop_hive.metastore.hbase.HBaseImport

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.hadoop.hive.metastore.hbase;

import com.google.common.annotations.VisibleForTesting;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.metastore.Deadline;
import org.apache.hadoop.hive.metastore.ObjectStore;
import org.apache.hadoop.hive.metastore.RawStore;
import org.apache.hadoop.hive.metastore.api.Database;
import org.apache.hadoop.hive.metastore.api.Function;
import org.apache.hadoop.hive.metastore.api.Index;
import org.apache.hadoop.hive.metastore.api.InvalidObjectException;
import org.apache.hadoop.hive.metastore.api.MetaException;
import org.apache.hadoop.hive.metastore.api.NoSuchObjectException;
import org.apache.hadoop.hive.metastore.api.Partition;
import org.apache.hadoop.hive.metastore.api.Role;
import org.apache.hadoop.hive.metastore.api.SQLForeignKey;
import org.apache.hadoop.hive.metastore.api.SQLPrimaryKey;
import org.apache.hadoop.hive.metastore.api.Table;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.TimeUnit;

/**
 * A tool that takes the contents of an RDBMS-based Hive metastore and imports it into an HBase
 * based one.  To use it, the Hive configuration files set up to work with the RDBMS (that is,
 * including the JDBC connection string, etc.) as well as the HBase configuration files must be
 * on the classpath.  There must not be a hive-site.xml on the classpath that specifies
 * HBaseStore.  This tool will then handle connecting to the RDBMS via the
 * {@link org.apache.hadoop.hive.metastore.ObjectStore} and to HBase via
 * {@link org.apache.hadoop.hive.metastore.hbase.HBaseStore}, and transferring the data.
 *
 * This tool can import an entire metastore or only selected objects.  When selecting objects it
 * is necessary to fully specify the object's name.  For example, if you want to import the table
 * T in the default database it needs to be identified as default.T.  The same is true for
 * functions.  When an object is specified, everything under that object will be imported (e.g.
 * if you select database D, then all tables and functions in that database will be
 * imported as well).
 *
 * At this point only tables and partitions are handled in parallel as it is assumed there are
 * relatively few of everything else.
 *
 * Note that HBaseSchemaTool must have already been used to create the appropriate tables in HBase.
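 *
 * A minimal usage sketch (the launcher command name {@code hbaseimport} is an assumption here,
 * not something this class defines; the flags are the options parsed below):
 * <pre>
 *   hbaseimport -a                    # import the full metastore
 *   hbaseimport -d mydb -p 4 -b 500   # import database mydb with 4 threads, batches of 500 partitions
 *   hbaseimport -t default.T          # import only table T in the default database
 * </pre>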
 */
public class HBaseImport {

  static final private Logger LOG = LoggerFactory.getLogger(HBaseImport.class.getName());

  public static void main(String[] args) {
    int rv = 0;
    try {
      HBaseImport tool = new HBaseImport();
      rv = tool.init(args);
      if (rv == 0) tool.run();
    } catch (Exception e) {
      System.err.println("Caught exception " + e.getClass().getName() + " with message <" +
          e.getMessage() + ">");
      rv = 1;
    }
    // Exit non-zero on failure so calling scripts can detect errors.
    System.exit(rv);
  }

  private ThreadLocal<RawStore> rdbmsStore = new ThreadLocal<RawStore>() {
    @Override
    protected RawStore initialValue() {
      if (rdbmsConf == null) {
        throw new RuntimeException("order violation, need to set rdbms conf first");
      }
      RawStore os = new ObjectStore();
      os.setConf(rdbmsConf);
      return os;
    }
  };

  private ThreadLocal<RawStore> hbaseStore = new ThreadLocal<RawStore>() {
    @Override
    protected RawStore initialValue() {
      if (hbaseConf == null) {
        throw new RuntimeException("order violation, need to set hbase conf first");
      }
      RawStore hs = new HBaseStore();
      hs.setConf(hbaseConf);
      return hs;
    }
  };

  private Configuration rdbmsConf;
  private Configuration hbaseConf;
  private List<Database> dbs;
  private BlockingQueue<Table> partitionedTables;
  private BlockingQueue<String[]> tableNameQueue;
  private BlockingQueue<String[]> indexNameQueue;
  private BlockingQueue<PartQueueEntry> partQueue;
  // Volatile: these flags are written by the coordinating thread and read by the copier threads.
  private volatile boolean writingToQueue, readersFinished;
  private boolean doKerberos, doAll;
  private List<String> rolesToImport, dbsToImport, tablesToImport, functionsToImport;
  private int parallel;
  private int batchSize;

  private HBaseImport() {}

  @VisibleForTesting
  public HBaseImport(String... args) throws ParseException {
    init(args);
  }

  private int init(String... args) throws ParseException {
    Options options = new Options();

    doAll = doKerberos = false;
    parallel = 1;
    batchSize = 1000;

    options.addOption(OptionBuilder
        .withLongOpt("all")
        .withDescription("Import the full metastore")
        .create('a'));

    options.addOption(OptionBuilder
        .withLongOpt("batchsize")
        .withDescription("Number of partitions to read and write in a batch, defaults to 1000")
        .hasArg()
        .create('b'));

    options.addOption(OptionBuilder
        .withLongOpt("database")
        .withDescription("Import a single database")
        .hasArgs()
        .create('d'));

    options.addOption(OptionBuilder
        .withLongOpt("help")
        .withDescription("You're looking at it")
        .create('h'));

    options.addOption(OptionBuilder
        .withLongOpt("function")
        .withDescription("Import a single function")
        .hasArgs()
        .create('f'));

    options.addOption(OptionBuilder
        .withLongOpt("kerberos")
        .withDescription("Import all kerberos related objects (master key, tokens)")
        .create('k'));

    options.addOption(OptionBuilder
        .withLongOpt("parallel")
        .withDescription("Parallel factor for loading (only applied to tables and partitions), " +
            "defaults to 1")
        .hasArg()
        .create('p'));

    options.addOption(OptionBuilder
        .withLongOpt("role")
        .withDescription("Import a single role")
        .hasArgs()
        .create('r'));

    options.addOption(OptionBuilder
        .withLongOpt("tables")
        .withDescription("Import a single table")
        .hasArgs()
        .create('t'));

    CommandLine cli = new GnuParser().parse(options, args);

    // Process help first, if it was asked for
    if (cli.hasOption('h')) {
      printHelp(options);
      return 1;
    }

    boolean hasCmd = false;
    // Now process the other command line args
    if (cli.hasOption('a')) {
      hasCmd = true;
      doAll = true;
    }
    if (cli.hasOption('b')) {
      batchSize = Integer.parseInt(cli.getOptionValue('b'));
    }
    if (cli.hasOption('d')) {
      hasCmd = true;
      dbsToImport = Arrays.asList(cli.getOptionValues('d'));
    }
    if (cli.hasOption('f')) {
      hasCmd = true;
      functionsToImport = Arrays.asList(cli.getOptionValues('f'));
    }
    if (cli.hasOption('p')) {
      parallel = Integer.parseInt(cli.getOptionValue('p'));
    }
    if (cli.hasOption('r')) {
      hasCmd = true;
      rolesToImport = Arrays.asList(cli.getOptionValues('r'));
    }
    if (cli.hasOption('k')) {
      doKerberos = true;
    }
    if (cli.hasOption('t')) {
      hasCmd = true;
      tablesToImport = Arrays.asList(cli.getOptionValues('t'));
    }
    if (!hasCmd) {
      printHelp(options);
      return 1;
    }

    dbs = new ArrayList<>();
    // We don't want to bound the size of the table queue because we keep it all in memory
    partitionedTables = new LinkedBlockingQueue<>();
    tableNameQueue = new LinkedBlockingQueue<>();
    indexNameQueue = new LinkedBlockingQueue<>();

    // Bound the size of this queue so we don't get too much in memory.
    partQueue = new ArrayBlockingQueue<>(parallel * 2);
    return 0;
  }

  private void printHelp(Options options) {
    (new HelpFormatter()).printHelp("hbaseimport", options);
  }

  @VisibleForTesting
  void run() throws MetaException, InstantiationException, IllegalAccessException,
      NoSuchObjectException, InvalidObjectException, InterruptedException {
    // Order here is crucial, as you can't add tables until you've added databases, etc.
    init();
    if (doAll || rolesToImport != null) {
      copyRoles();
    }
    if (doAll || dbsToImport != null) {
      copyDbs();
    }
    if (doAll || dbsToImport != null || tablesToImport != null) {
      copyTables();
      copyPartitions();
      copyIndexes();
    }
    if (doAll || dbsToImport != null || functionsToImport != null) {
      copyFunctions();
    }
    if (doAll || doKerberos) {
      copyKerberos();
    }
  }

  private void init() throws MetaException, IllegalAccessException, InstantiationException {
    if (rdbmsConf != null) {
      // We've been configured for testing, so don't do anything here.
      return;
    }
    // We're depending on having everything properly on the classpath
    rdbmsConf = new HiveConf();
    hbaseConf = new HiveConf();
    HiveConf.setVar(hbaseConf, HiveConf.ConfVars.METASTORE_RAW_STORE_IMPL,
        HBaseStore.class.getName());
    HiveConf.setBoolVar(hbaseConf, HiveConf.ConfVars.METASTORE_FASTPATH, true);

    // First get a connection to the RDBMS based store
    rdbmsStore.get().setConf(rdbmsConf);

    // Get a connection to the HBase based store
    hbaseStore.get().setConf(hbaseConf);
  }

  private void copyRoles() throws NoSuchObjectException, InvalidObjectException, MetaException {
    screen("Copying roles");
    List<String> toCopy = doAll ? rdbmsStore.get().listRoleNames() : rolesToImport;
    for (String roleName : toCopy) {
      Role role = rdbmsStore.get().getRole(roleName);
      screen("Copying role " + roleName);
      hbaseStore.get().addRole(roleName, role.getOwnerName());
    }
  }

  private void copyDbs() throws MetaException, NoSuchObjectException, InvalidObjectException {
    screen("Copying databases");
    List<String> toCopy = doAll ? rdbmsStore.get().getAllDatabases() : dbsToImport;
    for (String dbName : toCopy) {
      Database db = rdbmsStore.get().getDatabase(dbName);
      dbs.add(db);
      screen("Copying database " + dbName);
      hbaseStore.get().createDatabase(db);
    }
  }

  private void copyTables() throws MetaException, InvalidObjectException, InterruptedException {
    screen("Copying tables");

    // Start the parallel threads that will copy the tables
    Thread[] copiers = new Thread[parallel];
    writingToQueue = true;
    for (int i = 0; i < parallel; i++) {
      copiers[i] = new TableCopier();
      copiers[i].start();
    }

    // Put tables from the databases we copied into the queue
    for (Database db : dbs) {
      screen("Copying tables in database " + db.getName());
      for (String tableName : rdbmsStore.get().getAllTables(db.getName())) {
        tableNameQueue.put(new String[]{db.getName(), tableName});
      }
    }

    // Now put any specifically requested tables into the queue
    if (tablesToImport != null) {
      for (String compoundTableName : tablesToImport) {
        String[] tn = compoundTableName.split("\\.");
        if (tn.length != 2) {
          error(compoundTableName + " not in proper form.  Must be in form dbname.tablename.  " +
              "Ignoring this table and continuing.");
        } else {
          tableNameQueue.put(new String[]{tn[0], tn[1]});
        }
      }
    }
    writingToQueue = false;

    // Wait until we've finished adding all the tables
    for (Thread copier : copiers) copier.join();
  }

  private class TableCopier extends Thread {
    @Override
    public void run() {
      while (writingToQueue || tableNameQueue.size() > 0) {
        try {
          String[] name = tableNameQueue.poll(1, TimeUnit.SECONDS);
          if (name != null) {
            Table table = rdbmsStore.get().getTable(name[0], name[1]);
            // If this has partitions, put it in the list to fetch partitions for
            if (table.getPartitionKeys() != null && table.getPartitionKeys().size() > 0) {
              partitionedTables.put(table);
            }
            screen("Copying table " + name[0] + "." + name[1]);
            hbaseStore.get().createTable(table);

            // See if the table has any constraints, and if so copy those as well
            List<SQLPrimaryKey> pk =
                rdbmsStore.get().getPrimaryKeys(table.getDbName(), table.getTableName());
            if (pk != null && pk.size() > 0) {
              LOG.debug("Found primary keys, adding them");
              hbaseStore.get().addPrimaryKeys(pk);
            }

            // Passing null as the target table name results in all of the foreign keys being
            // retrieved.
            List<SQLForeignKey> fks =
                rdbmsStore.get().getForeignKeys(null, null, table.getDbName(), table.getTableName());
            if (fks != null && fks.size() > 0) {
              LOG.debug("Found foreign keys, adding them");
              hbaseStore.get().addForeignKeys(fks);
            }
          }
        } catch (InterruptedException | MetaException | InvalidObjectException e) {
          throw new RuntimeException(e);
        }
      }
    }
  }

  private void copyIndexes() throws MetaException, InvalidObjectException, InterruptedException {
    screen("Copying indexes");

    // Start the parallel threads that will copy the indexes
    Thread[] copiers = new Thread[parallel];
    writingToQueue = true;
    for (int i = 0; i < parallel; i++) {
      copiers[i] = new IndexCopier();
      copiers[i].start();
    }

    // Put indexes from the databases we copied into the queue
    for (Database db : dbs) {
      screen("Copying indexes in database " + db.getName());
      for (String tableName : rdbmsStore.get().getAllTables(db.getName())) {
        for (Index index : rdbmsStore.get().getIndexes(db.getName(), tableName, -1)) {
          indexNameQueue.put(new String[]{db.getName(), tableName, index.getIndexName()});
        }
      }
    }

    // Now put any specifically requested tables into the queue
    if (tablesToImport != null) {
      for (String compoundTableName : tablesToImport) {
        String[] tn = compoundTableName.split("\\.");
        if (tn.length != 2) {
          error(compoundTableName + " not in proper form.  Must be in form dbname.tablename.  " +
              "Ignoring this table and continuing.");
        } else {
          for (Index index : rdbmsStore.get().getIndexes(tn[0], tn[1], -1)) {
            indexNameQueue.put(new String[]{tn[0], tn[1], index.getIndexName()});
          }
        }
      }
    }

    writingToQueue = false;

    // Wait until we've finished copying all the indexes
    for (Thread copier : copiers) copier.join();
  }

  private class IndexCopier extends Thread {
    @Override
    public void run() {
      while (writingToQueue || indexNameQueue.size() > 0) {
        try {
          String[] name = indexNameQueue.poll(1, TimeUnit.SECONDS);
          if (name != null) {
            Index index = rdbmsStore.get().getIndex(name[0], name[1], name[2]);
            screen("Copying index " + name[0] + "." + name[1] + "." + name[2]);
            hbaseStore.get().addIndex(index);
          }
        } catch (InterruptedException | MetaException | InvalidObjectException e) {
          throw new RuntimeException(e);
        }
      }
    }
  }

  /* Partition copying is a little complex.  As we went through and copied the tables we put each
   * partitioned table into a queue.  We will now go through that queue and add partitions for the
   * tables.  We do the finding of partitions and writing of them separately and in parallel.
   * This way, if one table has far more partitions than all of the others, that skew won't
   * hurt us.  To avoid pulling all of the partitions for a table into memory, we batch up
   * partitions (by default in batches of 1000) and copy them over in batches.
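   * For example, with the default batch size of 1000, a table with 2500 partitions is handled as
   * three batches covering partition names [0, 1000), [1000, 2000), and [2000, 2500).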
   */
  private void copyPartitions() throws MetaException, NoSuchObjectException,
      InvalidObjectException, InterruptedException {
    screen("Copying partitions");
    readersFinished = false;
    Thread[] readers = new Thread[parallel];
    Thread[] writers = new Thread[parallel];
    for (int i = 0; i < parallel; i++) {
      readers[i] = new PartitionReader();
      readers[i].start();
      writers[i] = new PartitionWriter();
      writers[i].start();
    }

    for (Thread reader : readers) reader.join();
    readersFinished = true;

    // Wait until we've finished adding all the partitions
    for (Thread writer : writers) writer.join();
  }

  private class PartitionReader extends Thread {
    @Override
    public void run() {
      while (partitionedTables.size() > 0) {
        try {
          Table table = partitionedTables.poll(1, TimeUnit.SECONDS);
          if (table != null) {
            screen("Fetching partitions for table " + table.getDbName() + "." +
                table.getTableName());
            List<String> partNames =
                rdbmsStore.get().listPartitionNames(table.getDbName(), table.getTableName(),
                    (short) -1);
            if (partNames.size() <= batchSize) {
              LOG.debug("Adding all partition names to queue for " + table.getDbName() + "." +
                  table.getTableName());
              partQueue.put(new PartQueueEntry(table.getDbName(), table.getTableName(), partNames));
            } else {
              int goUntil = partNames.size() % batchSize == 0 ? partNames.size() / batchSize :
                  partNames.size() / batchSize + 1;
              for (int i = 0; i < goUntil; i++) {
                int start = i * batchSize;
                int end = Math.min((i + 1) * batchSize, partNames.size());
                LOG.debug("Adding partitions " + start + " to " + end + " for " + table.getDbName()
                    + "." + table.getTableName());
                partQueue.put(new PartQueueEntry(table.getDbName(), table.getTableName(),
                    partNames.subList(start, end)));
              }
            }
          }
        } catch (InterruptedException | MetaException e) {
          throw new RuntimeException(e);
        }
      }
    }
  }

  private class PartitionWriter extends Thread {
    @Override
    public void run() {
      // This keeps us from throwing exceptions in our raw store calls
      Deadline.registerIfNot(1000000);
      while (!readersFinished || partQueue.size() > 0) {
        try {
          PartQueueEntry entry = partQueue.poll(1, TimeUnit.SECONDS);
          if (entry != null) {
            LOG.info("Writing partitions " + entry.dbName + "." + entry.tableName + "." +
                StringUtils.join(entry.partNames, ':'));
            // Fetch these partitions and write them to HBase
            Deadline.startTimer("hbaseimport");
            List<Partition> parts =
                rdbmsStore.get().getPartitionsByNames(entry.dbName, entry.tableName,
                    entry.partNames);
            hbaseStore.get().addPartitions(entry.dbName, entry.tableName, parts);
            Deadline.stopTimer();
          }
        } catch (InterruptedException | MetaException | InvalidObjectException |
            NoSuchObjectException e) {
          throw new RuntimeException(e);
        }
      }
    }
  }

  private void copyFunctions() throws MetaException, NoSuchObjectException, InvalidObjectException {
    screen("Copying functions");
    // Copy any functions from databases we copied.
    for (Database db : dbs) {
      screen("Copying functions in database " + db.getName());
      for (String funcName : rdbmsStore.get().getFunctions(db.getName(), "*")) {
        copyOneFunction(db.getName(), funcName);
      }
    }
    // Now do any specifically requested functions
    if (functionsToImport != null) {
      for (String compoundFuncName : functionsToImport) {
        String[] fn = compoundFuncName.split("\\.");
        if (fn.length != 2) {
          error(compoundFuncName + " not in proper form.  Must be in form dbname.funcname.  " +
              "Ignoring this function and continuing.");
        } else {
          copyOneFunction(fn[0], fn[1]);
        }
      }
    }
  }

  private void copyOneFunction(String dbName, String funcName) throws MetaException,
      InvalidObjectException {
    Function func = rdbmsStore.get().getFunction(dbName, funcName);
    screen("Copying function " + dbName + "." + funcName);
    hbaseStore.get().createFunction(func);
  }

  private void copyKerberos() throws MetaException {
    screen("Copying kerberos related items");
    for (String tokenId : rdbmsStore.get().getAllTokenIdentifiers()) {
      String token = rdbmsStore.get().getToken(tokenId);
      hbaseStore.get().addToken(tokenId, token);
    }
    for (String masterKey : rdbmsStore.get().getMasterKeys()) {
      hbaseStore.get().addMasterKey(masterKey);
    }
  }

  private void screen(String msg) {
    LOG.info(msg);
    System.out.println(msg);
  }

  private void error(String msg) {
    LOG.error(msg);
    System.err.println("ERROR:  " + msg);
  }

  @VisibleForTesting
  void setConnections(RawStore rdbms, RawStore hbase) {
    rdbmsStore.set(rdbms);
    hbaseStore.set(hbase);
    rdbmsConf = rdbms.getConf();
    hbaseConf = hbase.getConf();
  }

  private static class PartQueueEntry {
    final String dbName;
    final String tableName;
    final List<String> partNames;

    PartQueueEntry(String d, String t, List<String> p) {
      dbName = d;
      tableName = t;
      partNames = p;
    }
  }

}