/*
 *  Copyright (c) 2017 Uber Technologies, Inc. ([email protected])
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *           http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *
 */
package com.uber.hoodie.hive;

import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.uber.hoodie.common.model.HoodieCommitMetadata;
import com.uber.hoodie.common.model.HoodieLogFile;
import com.uber.hoodie.common.model.HoodieTableType;
import com.uber.hoodie.common.table.HoodieTableMetaClient;
import com.uber.hoodie.common.table.HoodieTimeline;
import com.uber.hoodie.common.table.log.HoodieLogFormat;
import com.uber.hoodie.common.table.log.HoodieLogFormat.Reader;
import com.uber.hoodie.common.table.log.block.HoodieAvroDataBlock;
import com.uber.hoodie.common.table.log.block.HoodieLogBlock;
import com.uber.hoodie.common.table.timeline.HoodieInstant;
import com.uber.hoodie.common.util.FSUtils;
import com.uber.hoodie.exception.HoodieIOException;
import com.uber.hoodie.exception.InvalidDatasetException;
import com.uber.hoodie.hive.util.SchemaUtil;
import org.apache.commons.dbcp.BasicDataSource;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.metastore.HiveMetaStoreClient;
import org.apache.hadoop.hive.metastore.api.MetaException;
import org.apache.hadoop.hive.metastore.api.Partition;
import org.apache.hadoop.hive.metastore.api.Table;
import org.apache.hive.jdbc.HiveDriver;
import org.apache.thrift.TException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import parquet.format.converter.ParquetMetadataConverter;
import parquet.hadoop.ParquetFileReader;
import parquet.hadoop.metadata.ParquetMetadata;
import parquet.schema.MessageType;

import java.io.IOException;
import java.sql.Connection;
import java.sql.DatabaseMetaData;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.stream.Collectors;

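/**
 * Helper around the Hive metastore client and a Hive JDBC connection, used to keep a Hive table
 * in sync with a Hoodie dataset: create or alter the table, add or relocate partitions, read the
 * dataset schema, and track the last commit time synced via table properties.
 *
 * <p>A rough usage sketch (the {@code cfg}, {@code hiveConf}, {@code fs} and format/serde class
 * names below are placeholders, not defined in this file):
 * <pre>{@code
 *   HoodieHiveClient hiveClient = new HoodieHiveClient(cfg, hiveConf, fs);
 *   if (!hiveClient.doesTableExist()) {
 *     hiveClient.createTable(hiveClient.getDataSchema(), inputFormatClass, outputFormatClass,
 *         serdeClass);
 *   }
 *   hiveClient.updateLastCommitTimeSynced();
 *   hiveClient.close();
 * }</pre>
 */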
@SuppressWarnings("ConstantConditions")
public class HoodieHiveClient {

  private static final String HOODIE_LAST_COMMIT_TIME_SYNC = "last_commit_time_sync";
  // Make sure we have the hive JDBC driver in classpath
  private static String driverName = HiveDriver.class.getName();

  static {
    try {
      Class.forName(driverName);
    } catch (ClassNotFoundException e) {
      throw new IllegalStateException("Could not find " + driverName + " in classpath. ", e);
    }
  }

  private static Logger LOG = LoggerFactory.getLogger(HoodieHiveClient.class);
  private final HoodieTableMetaClient metaClient;
  private final HoodieTableType tableType;
  private final PartitionValueExtractor partitionValueExtractor;
  private HiveMetaStoreClient client;
  private HiveSyncConfig syncConfig;
  private FileSystem fs;
  private Connection connection;
  private HoodieTimeline activeTimeline;

  HoodieHiveClient(HiveSyncConfig cfg, HiveConf configuration, FileSystem fs) {
    this.syncConfig = cfg;
    this.fs = fs;
    this.metaClient = new HoodieTableMetaClient(fs.getConf(), cfg.basePath, true);
    this.tableType = metaClient.getTableType();

    LOG.info("Creating hive connection " + cfg.jdbcUrl);
    createHiveConnection();
    try {
      this.client = new HiveMetaStoreClient(configuration);
    } catch (MetaException e) {
      throw new HoodieHiveSyncException("Failed to create HiveMetaStoreClient", e);
    }

    try {
      this.partitionValueExtractor = (PartitionValueExtractor) Class
          .forName(cfg.partitionValueExtractorClass).newInstance();
    } catch (Exception e) {
      throw new HoodieHiveSyncException(
          "Failed to initialize PartitionValueExtractor class " + cfg.partitionValueExtractorClass,
          e);
    }

    activeTimeline = metaClient.getActiveTimeline().getCommitsTimeline()
        .filterCompletedInstants();
  }

  public HoodieTimeline getActiveTimeline() {
    return activeTimeline;
  }

  /**
   * Add the (NEW) partitions to the table
   */
  void addPartitionsToTable(List<String> partitionsToAdd) {
    if (partitionsToAdd.isEmpty()) {
      LOG.info("No partitions to add for " + syncConfig.tableName);
      return;
    }
    LOG.info("Adding partitions " + partitionsToAdd.size() + " to table " + syncConfig.tableName);
    String sql = constructAddPartitions(partitionsToAdd);
    updateHiveSQL(sql);
  }

  /**
   * Partition path has changed - update the path for the following partitions
   */
  void updatePartitionsToTable(List<String> changedPartitions) {
    if (changedPartitions.isEmpty()) {
      LOG.info("No partitions to change for " + syncConfig.tableName);
      return;
    }
    LOG.info("Changing partitions " + changedPartitions.size() + " on " + syncConfig.tableName);
    List<String> sqls = constructChangePartitions(changedPartitions);
    for (String sql : sqls) {
      updateHiveSQL(sql);
    }
  }

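  /**
   * Construct a single ALTER TABLE ... ADD IF NOT EXISTS statement covering all the given
   * partitions, pointing each partition at its storage location under the base path
   */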
  private String constructAddPartitions(List<String> partitions) {
    StringBuilder alterSQL = new StringBuilder("ALTER TABLE ");
    alterSQL.append(syncConfig.databaseName).append(".").append(syncConfig.tableName)
        .append(" ADD IF NOT EXISTS ");
    for (String partition : partitions) {

      StringBuilder partBuilder = new StringBuilder();
      List<String> partitionValues = partitionValueExtractor
          .extractPartitionValuesInPath(partition);
      Preconditions.checkArgument(syncConfig.partitionFields.size() == partitionValues.size(),
          "Partition key parts " + syncConfig.partitionFields
              + " does not match with partition values " + partitionValues
              + ". Check partition strategy. ");
      for (int i = 0; i < syncConfig.partitionFields.size(); i++) {
        if (i > 0) {
          // separate multiple partition key=value pairs so the generated HiveQL stays valid
          partBuilder.append(",");
        }
        partBuilder.append(syncConfig.partitionFields.get(i)).append("=").append("'")
            .append(partitionValues.get(i)).append("'");
      }

      String fullPartitionPath = new Path(syncConfig.basePath, partition).toString();
      alterSQL.append("  PARTITION (").append(partBuilder.toString()).append(") LOCATION '")
          .append(fullPartitionPath).append("' ");
    }
    return alterSQL.toString();
  }

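  /**
   * Construct one ALTER TABLE ... PARTITION ... SET LOCATION statement per changed partition
   */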
  private List<String> constructChangePartitions(List<String> partitions) {
    List<String> changePartitions = Lists.newArrayList();
    String alterTable = "ALTER TABLE " + syncConfig.databaseName + "." + syncConfig.tableName;
    for (String partition : partitions) {
      StringBuilder partBuilder = new StringBuilder();
      List<String> partitionValues = partitionValueExtractor
          .extractPartitionValuesInPath(partition);
      Preconditions.checkArgument(syncConfig.partitionFields.size() == partitionValues.size(),
          "Partition key parts " + syncConfig.partitionFields
              + " does not match with partition values " + partitionValues
              + ". Check partition strategy. ");
      for (int i = 0; i < syncConfig.partitionFields.size(); i++) {
        if (i > 0) {
          // separate multiple partition key=value pairs so the generated HiveQL stays valid
          partBuilder.append(",");
        }
        partBuilder.append(syncConfig.partitionFields.get(i)).append("=").append("'")
            .append(partitionValues.get(i)).append("'");
      }

      String fullPartitionPath = new Path(syncConfig.basePath, partition).toString();
      String changePartition =
          alterTable + " PARTITION (" + partBuilder.toString() + ") SET LOCATION '"
              + "hdfs://nameservice1" + fullPartitionPath + "'";
      changePartitions.add(changePartition);
    }
    return changePartitions;
  }

  /**
   * Iterate over the storage partitions and find if there are any new partitions that need to be
   * added or updated. Generate a list of PartitionEvent based on the changes required.
   */
  List<PartitionEvent> getPartitionEvents(List<Partition> tablePartitions,
      List<String> partitionStoragePartitions) {
    Map<String, String> paths = Maps.newHashMap();
    for (Partition tablePartition : tablePartitions) {
      List<String> hivePartitionValues = tablePartition.getValues();
      Collections.sort(hivePartitionValues);
      String fullTablePartitionPath = Path
          .getPathWithoutSchemeAndAuthority(new Path(tablePartition.getSd().getLocation())).toUri()
          .getPath();
      paths.put(String.join(", ", hivePartitionValues), fullTablePartitionPath);
    }

    List<PartitionEvent> events = Lists.newArrayList();
    for (String storagePartition : partitionStoragePartitions) {
      String fullStoragePartitionPath = new Path(syncConfig.basePath, storagePartition).toString();
      // Check if the partition values match and if the hdfs path is the same
      List<String> storagePartitionValues = partitionValueExtractor
          .extractPartitionValuesInPath(storagePartition);
      Collections.sort(storagePartitionValues);
      String storageValue = String.join(", ", storagePartitionValues);
      if (!paths.containsKey(storageValue)) {
        events.add(PartitionEvent.newPartitionAddEvent(storagePartition));
      } else if (!paths.get(storageValue).equals(fullStoragePartitionPath)) {
        events.add(PartitionEvent.newPartitionUpdateEvent(storagePartition));
      }
    }
    return events;
  }


  /**
   * Scan table partitions
   */
  List<Partition> scanTablePartitions() throws TException {
    return client
        .listPartitions(syncConfig.databaseName, syncConfig.tableName, (short) -1);
  }

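  /**
   * Update the table definition in Hive to match the new storage schema (REPLACE COLUMNS,
   * cascaded to partitions when the table is partitioned)
   */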
  void updateTableDefinition(MessageType newSchema) {
    try {
      String newSchemaStr = SchemaUtil.generateSchemaString(newSchema);
      // Cascade clause should not be present for non-partitioned tables
      String cascadeClause = syncConfig.partitionFields.size() > 0 ? " cascade" : "";
      StringBuilder sqlBuilder = new StringBuilder("ALTER TABLE ").append("`")
          .append(syncConfig.databaseName).append(".").append(syncConfig.tableName).append("`")
          .append(" REPLACE COLUMNS(")
          .append(newSchemaStr).append(" )").append(cascadeClause);
      LOG.info("Updating table definition with " + sqlBuilder);
      updateHiveSQL(sqlBuilder.toString());
    } catch (IOException e) {
      throw new HoodieHiveSyncException("Failed to update table for " + syncConfig.tableName, e);
    }
  }

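  /**
   * Create the Hive table from the storage schema, using the given input format, output format
   * and serde classes
   */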
  void createTable(MessageType storageSchema,
      String inputFormatClass, String outputFormatClass, String serdeClass) {
    try {
      String createSQLQuery = SchemaUtil
          .generateCreateDDL(storageSchema, syncConfig, inputFormatClass,
              outputFormatClass, serdeClass);
      LOG.info("Creating table with " + createSQLQuery);
      updateHiveSQL(createSQLQuery);
    } catch (IOException e) {
      throw new HoodieHiveSyncException("Failed to create table " + syncConfig.tableName, e);
    }
  }

  /**
   * Get the table schema
   */
  Map<String, String> getTableSchema() {
    if (!doesTableExist()) {
      throw new IllegalArgumentException(
          "Failed to get schema for table " + syncConfig.tableName + ": table does not exist");
    }
    Map<String, String> schema = Maps.newHashMap();
    ResultSet result = null;
    try {
      DatabaseMetaData databaseMetaData = connection.getMetaData();
      result = databaseMetaData
          .getColumns(null, syncConfig.databaseName, syncConfig.tableName, null);
      while (result.next()) {
        String columnName = result.getString(4);
        String columnType = result.getString(6);
        schema.put(columnName, columnType);
      }
      return schema;
    } catch (SQLException e) {
      throw new HoodieHiveSyncException(
          "Failed to get table schema for " + syncConfig.tableName, e);
    } finally {
      closeQuietly(result, null);
    }
  }

  /**
   * Gets the schema for a hoodie dataset. Depending on the type of table, read from any file
   * written in the latest commit. We will assume that the schema has not changed within a single
   * atomic write.
   *
   * @return Parquet schema for this dataset
   */
  @SuppressWarnings("WeakerAccess")
  public MessageType getDataSchema() {
    try {
      switch (tableType) {
        case COPY_ON_WRITE:
          // If this is COW, get the last commit and read the schema from a file written in the last commit
          HoodieInstant lastCommit = activeTimeline.lastInstant()
              .orElseThrow(() -> new InvalidDatasetException(syncConfig.basePath));
          HoodieCommitMetadata commitMetadata = HoodieCommitMetadata
              .fromBytes(activeTimeline.getInstantDetails(lastCommit).get());
          String filePath = commitMetadata.getFileIdAndFullPaths(metaClient.getBasePath()).values()
              .stream().findAny()
              .orElseThrow(() -> new IllegalArgumentException(
                  "Could not find any data file written for commit " + lastCommit
                      + ", could not get schema for dataset " + metaClient.getBasePath()));
          return readSchemaFromDataFile(new Path(filePath));
        case MERGE_ON_READ:
          // If this is MOR, depending on whether the latest commit is a delta commit or compaction commit
          // Get a datafile written and get the schema from that file
          Optional<HoodieInstant> lastCompactionCommit = metaClient.getActiveTimeline()
              .getCommitTimeline().filterCompletedInstants().lastInstant();
          LOG.info("Found the last compaction commit as " + lastCompactionCommit);

          Optional<HoodieInstant> lastDeltaCommit;
          if (lastCompactionCommit.isPresent()) {
            lastDeltaCommit = metaClient.getActiveTimeline()
                .getDeltaCommitTimeline()
                .filterCompletedInstants()
                .findInstantsAfter(lastCompactionCommit.get().getTimestamp(), Integer.MAX_VALUE)
                .lastInstant();
          } else {
            lastDeltaCommit = metaClient.getActiveTimeline()
                    .getDeltaCommitTimeline()
                    .filterCompletedInstants()
                    .lastInstant();
          }
          LOG.info("Found the last delta commit "
              + lastDeltaCommit);

          if (lastDeltaCommit.isPresent()) {
            HoodieInstant lastDeltaInstant = lastDeltaCommit.get();
            // read from a log file written by the last delta commit
            commitMetadata = HoodieCommitMetadata
                .fromBytes(activeTimeline.getInstantDetails(lastDeltaInstant).get());
            filePath = commitMetadata.getFileIdAndFullPaths(metaClient.getBasePath()).values()
                .stream().filter(s -> s.contains(
                    HoodieLogFile.DELTA_EXTENSION)).findAny()
                .orElseThrow(() -> new IllegalArgumentException(
                    "Could not find any data file written for commit " + lastDeltaInstant
                        + ", could not get schema for dataset " + metaClient.getBasePath()));
            return readSchemaFromLogFile(lastCompactionCommit, new Path(filePath));
          } else {
            return readSchemaFromLastCompaction(lastCompactionCommit);
          }
        default:
          LOG.error("Unknown table type " + tableType);
          throw new InvalidDatasetException(syncConfig.basePath);
      }
    } catch (IOException e) {
      throw new HoodieHiveSyncException(
          "Failed to get dataset schema for " + syncConfig.tableName, e);
    }
  }

  /**
   * Read schema from a data file from the last compaction commit done.
   */
  @SuppressWarnings("OptionalUsedAsFieldOrParameterType")
  private MessageType readSchemaFromLastCompaction(Optional<HoodieInstant> lastCompactionCommitOpt)
      throws IOException {
    HoodieInstant lastCompactionCommit = lastCompactionCommitOpt.orElseThrow(
        () -> new HoodieHiveSyncException(
            "Could not read schema from last compaction, no compaction commits found on path "
                + syncConfig.basePath));

    // Read from a compacted file written by the last compaction commit
    HoodieCommitMetadata compactionMetadata = HoodieCommitMetadata
        .fromBytes(activeTimeline.getInstantDetails(lastCompactionCommit).get());
    String filePath = compactionMetadata.getFileIdAndFullPaths(metaClient.getBasePath()).values()
        .stream().findAny()
        .orElseThrow(() -> new IllegalArgumentException(
            "Could not find any data file written for compaction " + lastCompactionCommit
                + ", could not get schema for dataset " + metaClient.getBasePath()));
    return readSchemaFromDataFile(new Path(filePath));
  }

  /**
   * Read the schema from the log file on path
   */
  @SuppressWarnings("OptionalUsedAsFieldOrParameterType")
  private MessageType readSchemaFromLogFile(Optional<HoodieInstant> lastCompactionCommitOpt,
      Path path) throws IOException {
    Reader reader = HoodieLogFormat.newReader(fs, new HoodieLogFile(path), null);
    HoodieAvroDataBlock lastBlock = null;
    while (reader.hasNext()) {
      HoodieLogBlock block = reader.next();
      if (block instanceof HoodieAvroDataBlock) {
        lastBlock = (HoodieAvroDataBlock) block;
      }
    }
    if (lastBlock != null) {
      lastBlock.getRecords();
      return new parquet.avro.AvroSchemaConverter().convert(lastBlock.getSchema());
    }
    // Fall back to read the schema from last compaction
    LOG.info("Falling back to read the schema from last compaction " + lastCompactionCommitOpt);
    return readSchemaFromLastCompaction(lastCompactionCommitOpt);
  }

  /**
   * Read the parquet schema from a parquet File
   */
  private MessageType readSchemaFromDataFile(Path parquetFilePath) throws IOException {
    LOG.info("Reading schema from " + parquetFilePath);
    if (!fs.exists(parquetFilePath)) {
      throw new IllegalArgumentException(
          "Failed to read schema from data file " + parquetFilePath
              + ". File does not exist.");
    }
    ParquetMetadata fileFooter =
        ParquetFileReader
            .readFooter(fs.getConf(), parquetFilePath, ParquetMetadataConverter.NO_FILTER);
    return fileFooter.getFileMetaData().getSchema();
  }

  /**
   * @return true if the configured table exists
   */
  boolean doesTableExist() {
    try {
      return client.tableExists(syncConfig.databaseName, syncConfig.tableName);
    } catch (TException e) {
      throw new HoodieHiveSyncException(
          "Failed to check if table exists " + syncConfig.tableName, e);
    }
  }

  /**
   * Execute an update in the hive metastore with this SQL
   *
   * @param s SQL to execute
   */
  void updateHiveSQL(String s) {
    Statement stmt = null;
    try {
      stmt = connection.createStatement();
      LOG.info("Executing SQL " + s);
      stmt.execute(s);
    } catch (SQLException e) {
      throw new HoodieHiveSyncException("Failed in executing SQL " + s, e);
    } finally {
      closeQuietly(null, stmt);
    }
  }


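  /**
   * Open a JDBC connection to Hive (if not already open) using the configured url, user and
   * password
   */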
  private void createHiveConnection() {
    if (connection == null) {
      BasicDataSource ds = new BasicDataSource();
      ds.setDriverClassName(driverName);
      ds.setUrl(getHiveJdbcUrlWithDefaultDBName());
      ds.setUsername(syncConfig.hiveUser);
      ds.setPassword(syncConfig.hivePass);
      LOG.info("Getting Hive Connection from Datasource " + ds);
      try {
        this.connection = ds.getConnection();
      } catch (SQLException e) {
        throw new HoodieHiveSyncException(
            "Cannot create hive connection " + getHiveJdbcUrlWithDefaultDBName(), e);
      }
    }
  }

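  /**
   * Build the JDBC url pointing at the configured database, preserving any extra properties
   * (e.g. ;transportMode=http;httpPath=hs2) appended to the base url
   */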
  private String getHiveJdbcUrlWithDefaultDBName() {
    String hiveJdbcUrl = syncConfig.jdbcUrl;
    String urlAppend = null;
    // If the hive url contains additional properties like ;transportMode=http;httpPath=hs2
    if (hiveJdbcUrl.contains(";")) {
      urlAppend = hiveJdbcUrl.substring(hiveJdbcUrl.indexOf(";"));
      hiveJdbcUrl = hiveJdbcUrl.substring(0, hiveJdbcUrl.indexOf(";"));
    }
    if (!hiveJdbcUrl.endsWith("/")) {
      hiveJdbcUrl = hiveJdbcUrl + "/";
    }
    return hiveJdbcUrl + syncConfig.databaseName + (urlAppend == null ? "" : urlAppend);
  }

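  /**
   * Close the given Statement and ResultSet, logging (instead of rethrowing) any SQLException
   */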
  private static void closeQuietly(ResultSet resultSet, Statement stmt) {
    try {
      if (stmt != null) {
        stmt.close();
      }
      if (resultSet != null) {
        resultSet.close();
      }
    } catch (SQLException e) {
      LOG.error("Could not close the resultset opened ", e);
    }
  }

  public String getBasePath() {
    return metaClient.getBasePath();
  }

  HoodieTableType getTableType() {
    return tableType;
  }

  public FileSystem getFs() {
    return fs;
  }

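  /**
   * Read the last commit time synced from the table properties, if present
   */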
  Optional<String> getLastCommitTimeSynced() {
    // Get the last commit time from the TBLproperties
    try {
      Table table = client.getTable(syncConfig.databaseName, syncConfig.tableName);
      return Optional
          .ofNullable(table.getParameters().getOrDefault(HOODIE_LAST_COMMIT_TIME_SYNC, null));
    } catch (Exception e) {
      throw new HoodieHiveSyncException(
          "Failed to get the last commit time synced from the database", e);
    }
  }

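  /**
   * Close the JDBC connection and the metastore client
   */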
  void close() {
    try {
      if (connection != null) {
        connection.close();
      }
      if (client != null) {
        client.close();
      }
    } catch (SQLException e) {
      LOG.error("Could not close connection ", e);
    }
  }

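  /**
   * List the partitions written to since the given commit time. If no commit time is known, all
   * partitions under the base path are listed
   */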
  @SuppressWarnings("OptionalUsedAsFieldOrParameterType")
  List<String> getPartitionsWrittenToSince(Optional<String> lastCommitTimeSynced) {
    if (!lastCommitTimeSynced.isPresent()) {
      LOG.info("Last commit time synced is not known, listing all partitions");
      try {
        return FSUtils
            .getAllPartitionPaths(fs, syncConfig.basePath, syncConfig.assumeDatePartitioning);
      } catch (IOException e) {
        throw new HoodieIOException("Failed to list all partitions in " + syncConfig.basePath, e);
      }
    } else {
      LOG.info("Last commit time synced is " + lastCommitTimeSynced.get()
          + ", getting commits since then");

      HoodieTimeline timelineToSync = activeTimeline
          .findInstantsAfter(lastCommitTimeSynced.get(), Integer.MAX_VALUE);
      return timelineToSync.getInstants().map(s -> {
        try {
          return HoodieCommitMetadata.fromBytes(activeTimeline.getInstantDetails(s).get());
        } catch (IOException e) {
          throw new HoodieIOException(
              "Failed to get partitions written since " + lastCommitTimeSynced, e);
        }
      }).flatMap(s -> s.getPartitionToWriteStats().keySet().stream()).distinct()
          .collect(Collectors.toList());
    }
  }

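  /**
   * Store the timestamp of the latest completed commit in the table properties, so the next sync
   * can pick up from there
   */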
  void updateLastCommitTimeSynced() {
    // Set the last commit time in the TBLproperties
    String lastCommitSynced = activeTimeline.lastInstant().get().getTimestamp();
    try {
      Table table = client.getTable(syncConfig.databaseName, syncConfig.tableName);
      table.putToParameters(HOODIE_LAST_COMMIT_TIME_SYNC, lastCommitSynced);
      client.alter_table(syncConfig.databaseName, syncConfig.tableName, table, true);
    } catch (Exception e) {
      throw new HoodieHiveSyncException(
          "Failed to update last commit time synced to " + lastCommitSynced, e);
    }

  }

  /**
   * Partition Event captures any partition that needs to be added or updated
   */
  static class PartitionEvent {

    public enum PartitionEventType {ADD, UPDATE}

    PartitionEventType eventType;
    String storagePartition;

    PartitionEvent(
        PartitionEventType eventType, String storagePartition) {
      this.eventType = eventType;
      this.storagePartition = storagePartition;
    }

    static PartitionEvent newPartitionAddEvent(String storagePartition) {
      return new PartitionEvent(PartitionEventType.ADD, storagePartition);
    }

    static PartitionEvent newPartitionUpdateEvent(String storagePartition) {
      return new PartitionEvent(PartitionEventType.UPDATE, storagePartition);
    }
  }
}