/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hudi.index.hbase;

import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.client.common.HoodieSparkEngineContext;
import org.apache.hudi.client.utils.SparkMemoryUtils;
import org.apache.hudi.common.data.HoodieData;
import org.apache.hudi.common.engine.HoodieEngineContext;
import org.apache.hudi.common.model.EmptyHoodieRecordPayload;
import org.apache.hudi.common.model.HoodieAvroRecord;
import org.apache.hudi.common.model.HoodieKey;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.model.HoodieRecordDelegate;
import org.apache.hudi.common.model.HoodieRecordLocation;
import org.apache.hudi.common.model.HoodieRecordPayload;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.timeline.HoodieTimeline;
import org.apache.hudi.common.table.timeline.TimelineUtils;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.RateLimiter;
import org.apache.hudi.common.util.ReflectionUtils;
import org.apache.hudi.config.HoodieHBaseIndexConfig;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.data.HoodieJavaRDD;
import org.apache.hudi.exception.HoodieDependentSystemUnavailableException;
import org.apache.hudi.exception.HoodieIndexException;
import org.apache.hudi.index.HoodieIndex;
import org.apache.hudi.index.HoodieIndexUtils;
import org.apache.hudi.table.HoodieTable;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HRegionLocation;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.BufferedMutator;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Delete;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Mutation;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.RegionLocator;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.ResultScanner;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.spark.Partitioner;
import org.apache.spark.SparkConf;
import org.apache.spark.SparkFiles;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function2;
import org.joda.time.DateTime;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.io.Serializable;
import java.security.PrivilegedExceptionAction;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.concurrent.TimeUnit;

import scala.Tuple2;

import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.HADOOP_SECURITY_AUTHENTICATION;
import static org.apache.hadoop.hbase.HConstants.ZOOKEEPER_CLIENT_PORT;
import static org.apache.hadoop.hbase.HConstants.ZOOKEEPER_QUORUM;
import static org.apache.hadoop.hbase.HConstants.ZOOKEEPER_ZNODE_PARENT;
import static org.apache.hadoop.hbase.security.SecurityConstants.MASTER_KRB_PRINCIPAL;
import static org.apache.hadoop.hbase.security.SecurityConstants.REGIONSERVER_KRB_PRINCIPAL;
import static org.apache.hadoop.hbase.security.User.HBASE_SECURITY_AUTHORIZATION_CONF_KEY;
import static org.apache.hadoop.hbase.security.User.HBASE_SECURITY_CONF_KEY;
import static org.apache.hudi.common.util.StringUtils.fromUTF8Bytes;
import static org.apache.hudi.common.util.StringUtils.getUTF8Bytes;

/**
 * Hoodie Index implementation backed by HBase.
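 * <p>
 * A minimal, illustrative way to enable this index is through write-config properties. This is a sketch, not a
 * complete write config: the values below are placeholders, and the exact set of required
 * {@code hoodie.index.hbase.*} keys may vary across Hudi versions.
 * <pre>{@code
 *   Properties props = new Properties();
 *   props.setProperty("hoodie.index.type", "HBASE");
 *   props.setProperty("hoodie.index.hbase.zkquorum", "zk1,zk2,zk3");
 *   props.setProperty("hoodie.index.hbase.zkport", "2181");
 *   props.setProperty("hoodie.index.hbase.table", "hudi_record_index");
 *   HoodieWriteConfig writeConfig = HoodieWriteConfig.newBuilder()
 *       .withPath("/tmp/hoodie_table")
 *       .withProps(props)
 *       .build();
 * }</pre>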
 */
public class SparkHoodieHBaseIndex extends HoodieIndex<Object, Object> {

  public static final String DEFAULT_SPARK_EXECUTOR_INSTANCES_CONFIG_NAME = "spark.executor.instances";
  public static final String DEFAULT_SPARK_DYNAMIC_ALLOCATION_ENABLED_CONFIG_NAME = "spark.dynamicAllocation.enabled";
  public static final String DEFAULT_SPARK_DYNAMIC_ALLOCATION_MAX_EXECUTORS_CONFIG_NAME =
      "spark.dynamicAllocation.maxExecutors";

  private static final byte[] SYSTEM_COLUMN_FAMILY = getUTF8Bytes("_s");
  private static final byte[] COMMIT_TS_COLUMN = getUTF8Bytes("commit_ts");
  private static final byte[] FILE_NAME_COLUMN = getUTF8Bytes("file_name");
  private static final byte[] PARTITION_PATH_COLUMN = getUTF8Bytes("partition_path");
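
  // Each index entry is a single HBase row: the row key is derived from the Hudi record key (see getHBaseKey), and
  // the record's latest known commit time, file name and partition path are stored as columns under the "_s"
  // system column family above.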

  private static final Logger LOG = LoggerFactory.getLogger(SparkHoodieHBaseIndex.class);
  private static Connection hbaseConnection = null;
  private HBaseIndexQPSResourceAllocator hBaseIndexQPSResourceAllocator = null;
  private int maxQpsPerRegionServer;
  private long totalNumInserts;
  private int numWriteStatusWithInserts;
  private static transient Thread shutdownThread;

  /**
   * multiPutBatchSize will be computed and re-set in updateLocation if
   * {@link HoodieHBaseIndexConfig#PUT_BATCH_SIZE_AUTO_COMPUTE} is set to true.
   */
  private Integer multiPutBatchSize;
  private Integer numRegionServersForTable;
  private final String tableName;
  private HBasePutBatchSizeCalculator putBatchSizeCalculator;

  public SparkHoodieHBaseIndex(HoodieWriteConfig config) {
    super(config);
    this.tableName = config.getHbaseTableName();
    addShutDownHook();
    init(config);
  }

  private void init(HoodieWriteConfig config) {
    this.multiPutBatchSize = config.getHbaseIndexPutBatchSize();
    this.maxQpsPerRegionServer = config.getHbaseIndexMaxQPSPerRegionServer();
    this.putBatchSizeCalculator = new HBasePutBatchSizeCalculator();
    this.hBaseIndexQPSResourceAllocator = createQPSResourceAllocator(this.config);
  }

  public HBaseIndexQPSResourceAllocator createQPSResourceAllocator(HoodieWriteConfig config) {
    try {
      LOG.info("createQPSResourceAllocator :" + config.getHBaseQPSResourceAllocatorClass());
      return (HBaseIndexQPSResourceAllocator) ReflectionUtils
              .loadClass(config.getHBaseQPSResourceAllocatorClass(), config);
    } catch (Exception e) {
      LOG.warn("error while instantiating HBaseIndexQPSResourceAllocator", e);
    }
    return new DefaultHBaseQPSResourceAllocator(config);
  }

  private Connection getHBaseConnection() {
    Configuration hbaseConfig = HBaseConfiguration.create();
    String quorum = config.getHbaseZkQuorum();
    hbaseConfig.set(ZOOKEEPER_QUORUM, quorum);
    String zkZnodeParent = config.getHBaseZkZnodeParent();
    if (zkZnodeParent != null) {
      hbaseConfig.set(ZOOKEEPER_ZNODE_PARENT, zkZnodeParent);
    }
    String port = String.valueOf(config.getHbaseZkPort());
    hbaseConfig.set(ZOOKEEPER_CLIENT_PORT, port);

    try {
      String authentication = config.getHBaseIndexSecurityAuthentication();
      if (authentication.equals("kerberos")) {
        hbaseConfig.set(HBASE_SECURITY_CONF_KEY, "kerberos");
        hbaseConfig.set(HADOOP_SECURITY_AUTHENTICATION, "kerberos");
        hbaseConfig.set(HBASE_SECURITY_AUTHORIZATION_CONF_KEY, "true");
        hbaseConfig.set(REGIONSERVER_KRB_PRINCIPAL, config.getHBaseIndexRegionserverPrincipal());
        hbaseConfig.set(MASTER_KRB_PRINCIPAL, config.getHBaseIndexMasterPrincipal());

        String principal = config.getHBaseIndexKerberosUserPrincipal();
        String keytab = SparkFiles.get(config.getHBaseIndexKerberosUserKeytab());

        UserGroupInformation.setConfiguration(hbaseConfig);
        UserGroupInformation ugi = UserGroupInformation.loginUserFromKeytabAndReturnUGI(principal, keytab);
        return ugi.doAs((PrivilegedExceptionAction<Connection>) () ->
          (Connection) ConnectionFactory.createConnection(hbaseConfig)
        );
      } else {
        return ConnectionFactory.createConnection(hbaseConfig);
      }
    } catch (IOException | InterruptedException e) {
      throw new HoodieDependentSystemUnavailableException(HoodieDependentSystemUnavailableException.HBASE,
          quorum + ":" + port, e);
    }
  }

  /**
   * Since we are sharing the HBaseConnection across tasks in a JVM, make sure the HBaseConnection is closed when JVM
   * exits.
   */
  private void addShutDownHook() {
    if (null == shutdownThread) {
      shutdownThread = new Thread(() -> {
        try {
          hbaseConnection.close();
        } catch (Exception e) {
          // fail silently for any sort of exception
        }
      });
      Runtime.getRuntime().addShutdownHook(shutdownThread);
    }
  }

  /**
   * Ensure that any resources used for indexing are released here.
   */
  @Override
  public void close() {
    LOG.info("No resources to release from Hbase index");
  }

  private Get generateStatement(String key) throws IOException {
    return new Get(getUTF8Bytes(getHBaseKey(key))).readVersions(1).addColumn(SYSTEM_COLUMN_FAMILY, COMMIT_TS_COLUMN)
        .addColumn(SYSTEM_COLUMN_FAMILY, FILE_NAME_COLUMN).addColumn(SYSTEM_COLUMN_FAMILY, PARTITION_PATH_COLUMN);
  }

  private Get generateStatement(String key, long startTime, long endTime) throws IOException {
    return generateStatement(key).setTimeRange(startTime, endTime);
  }

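  /**
   * Returns the HBase row key to use for a given record key. The base implementation uses the record key as-is;
   * subclasses may override this to apply a custom transformation (for example, hashing or salting), which is then
   * used consistently for gets, puts and deletes.
   */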
  protected String getHBaseKey(String key) {
    return key;
  }

  /**
   * Function that tags each HoodieRecord with an existing location, if known.
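   * <p>
   * When partition-path update handling is enabled in the HBase index config, a record whose partition path has
   * changed is emitted twice: a delete (empty payload) for the old partition and an insert into the new partition.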
   */
  private <R> Function2<Integer, Iterator<HoodieRecord<R>>, Iterator<HoodieRecord<R>>> locationTagFunction(
      HoodieTableMetaClient metaClient) {

    // `multiGetBatchSize` is intended to be a batch per 100ms. To create a rate limiter that measures
    // operations per second, we need to multiply `multiGetBatchSize` by 10.
    Integer multiGetBatchSize = config.getHbaseIndexGetBatchSize();
    return (partitionNum, hoodieRecordIterator) -> {
      boolean updatePartitionPath = config.getHbaseIndexUpdatePartitionPath();
      RateLimiter limiter = RateLimiter.create(multiGetBatchSize * 10, TimeUnit.SECONDS);
      // Grab the global HBase connection
      synchronized (SparkHoodieHBaseIndex.class) {
        if (hbaseConnection == null || hbaseConnection.isClosed()) {
          hbaseConnection = getHBaseConnection();
        }
      }
      List<HoodieRecord<R>> taggedRecords = new ArrayList<>();
      try (HTable hTable = (HTable) hbaseConnection.getTable(TableName.valueOf(tableName))) {
        List<Get> statements = new ArrayList<>();
        List<HoodieRecord> currentBatchOfRecords = new LinkedList<>();
        // Do the tagging.
        HoodieTimeline completedCommitsTimeline = metaClient.getCommitsTimeline().filterCompletedInstants();
        while (hoodieRecordIterator.hasNext()) {
          HoodieRecord rec = hoodieRecordIterator.next();
          statements.add(generateStatement(rec.getRecordKey()));
          currentBatchOfRecords.add(rec);
          // iterate until we reach the batch size
          if (hoodieRecordIterator.hasNext() && statements.size() < multiGetBatchSize) {
            continue;
          }
          // get results for batch from Hbase
          Result[] results = doGet(hTable, statements, limiter);
          // clear statements to be GC'd
          statements.clear();
          for (Result result : results) {
            // first, attempt to grab location from HBase
            HoodieRecord currentRecord = currentBatchOfRecords.remove(0);
            if (result.getRow() == null) {
              taggedRecords.add(currentRecord);
              continue;
            }
            String keyFromResult = fromUTF8Bytes(result.getRow());
            String commitTs = fromUTF8Bytes(result.getValue(SYSTEM_COLUMN_FAMILY, COMMIT_TS_COLUMN));
            String fileId = fromUTF8Bytes(result.getValue(SYSTEM_COLUMN_FAMILY, FILE_NAME_COLUMN));
            String partitionPath = fromUTF8Bytes(result.getValue(SYSTEM_COLUMN_FAMILY, PARTITION_PATH_COLUMN));
            if (!HoodieIndexUtils.checkIfValidCommit(completedCommitsTimeline, commitTs)) {
              // if commit is invalid, treat this as a new taggedRecord
              taggedRecords.add(currentRecord);
              continue;
            }
            // check whether to do partition change processing
            if (updatePartitionPath && !partitionPath.equals(currentRecord.getPartitionPath())) {
              // delete partition old data record
              HoodieRecord emptyRecord = new HoodieAvroRecord(new HoodieKey(currentRecord.getRecordKey(), partitionPath),
                  new EmptyHoodieRecordPayload());
              emptyRecord.unseal();
              emptyRecord.setCurrentLocation(new HoodieRecordLocation(commitTs, fileId));
              emptyRecord.setIgnoreIndexUpdate(true);
              emptyRecord.seal();
              // insert partition new data record
              currentRecord = new HoodieAvroRecord(new HoodieKey(currentRecord.getRecordKey(), currentRecord.getPartitionPath()),
                  (HoodieRecordPayload) currentRecord.getData());
              taggedRecords.add(emptyRecord);
              taggedRecords.add(currentRecord);
            } else {
              currentRecord = new HoodieAvroRecord(new HoodieKey(currentRecord.getRecordKey(), partitionPath),
                  (HoodieRecordPayload) currentRecord.getData());
              currentRecord.unseal();
              currentRecord.setCurrentLocation(new HoodieRecordLocation(commitTs, fileId));
              currentRecord.seal();
              taggedRecords.add(currentRecord);
              // the key from the Result and the key being processed should be the same
              assert (currentRecord.getRecordKey().contentEquals(keyFromResult));
            }
          }
        }
      } catch (IOException e) {
        throw new HoodieIndexException("Failed to Tag indexed locations because of exception with HBase Client", e);
      } finally {
        limiter.stop();
      }
      return taggedRecords.iterator();
    };
  }

  private Result[] doGet(HTable hTable, List<Get> keys, RateLimiter limiter) throws IOException {
    if (keys.size() > 0) {
      limiter.tryAcquire(keys.size());
      return hTable.get(keys);
    }
    return new Result[keys.size()];
  }

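  /**
   * Tags each incoming record with its existing location, if any, by issuing batched HBase gets per Spark partition,
   * throttled by a rate limiter sized for ten get-batches per second.
   */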
  @Override
  public <R> HoodieData<HoodieRecord<R>> tagLocation(
      HoodieData<HoodieRecord<R>> records, HoodieEngineContext context,
      HoodieTable hoodieTable) {
    return HoodieJavaRDD.of(HoodieJavaRDD.getJavaRDD(records)
        .mapPartitionsWithIndex(locationTagFunction(hoodieTable.getMetaClient()), true));
  }

  private Function2<Integer, Iterator<WriteStatus>, Iterator<WriteStatus>> updateLocationFunction() {

    return (partition, statusIterator) -> {

      List<WriteStatus> writeStatusList = new ArrayList<>();
      // Grab the global HBase connection
      synchronized (SparkHoodieHBaseIndex.class) {
        if (hbaseConnection == null || hbaseConnection.isClosed()) {
          hbaseConnection = getHBaseConnection();
        }
      }
      final long startTimeForPutsTask = DateTime.now().getMillis();
      LOG.info("startTimeForPutsTask for this task: " + startTimeForPutsTask);

      final RateLimiter limiter = RateLimiter.create(multiPutBatchSize, TimeUnit.SECONDS);
      try (BufferedMutator mutator = hbaseConnection.getBufferedMutator(TableName.valueOf(tableName))) {
        while (statusIterator.hasNext()) {
          WriteStatus writeStatus = statusIterator.next();
          List<Mutation> mutations = new ArrayList<>();
          try {
            long numOfInserts = writeStatus.getStat().getNumInserts();
            LOG.info("Num of inserts in this WriteStatus: " + numOfInserts);
            LOG.info("Total inserts in this job: " + this.totalNumInserts);
            LOG.info("multiPutBatchSize for this job: " + this.multiPutBatchSize);
            // Create a rate limiter that allows `multiPutBatchSize` operations per second
            // Any calls beyond `multiPutBatchSize` within a second will be rate limited
            for (HoodieRecordDelegate recordDelegate : writeStatus.getWrittenRecordDelegates()) {
              if (!writeStatus.isErrored(recordDelegate.getHoodieKey())) {
                if (recordDelegate.getIgnoreIndexUpdate()) {
                  continue;
                }
                Option<HoodieRecordLocation> loc = recordDelegate.getNewLocation();
                if (loc.isPresent()) {
                  if (recordDelegate.getCurrentLocation().isPresent()) {
                    // This is an update, no need to update index
                    continue;
                  }
                  Put put = new Put(getUTF8Bytes(getHBaseKey(recordDelegate.getRecordKey())));
                  put.addColumn(SYSTEM_COLUMN_FAMILY, COMMIT_TS_COLUMN, getUTF8Bytes(loc.get().getInstantTime()));
                  put.addColumn(SYSTEM_COLUMN_FAMILY, FILE_NAME_COLUMN, getUTF8Bytes(loc.get().getFileId()));
                  put.addColumn(SYSTEM_COLUMN_FAMILY, PARTITION_PATH_COLUMN, getUTF8Bytes(recordDelegate.getPartitionPath()));
                  mutations.add(put);
                } else {
                  // Delete existing index for a deleted record
                  Delete delete = new Delete(getUTF8Bytes(getHBaseKey(recordDelegate.getRecordKey())));
                  mutations.add(delete);
                }
              }
              if (mutations.size() < multiPutBatchSize) {
                continue;
              }
              doMutations(mutator, mutations, limiter);
            }
            // process remaining puts and deletes, if any
            doMutations(mutator, mutations, limiter);
          } catch (Exception e) {
            Exception we = new Exception("Error updating index for " + writeStatus, e);
            LOG.error(we.getMessage(), e);
            writeStatus.setGlobalError(we);
          }
          writeStatusList.add(writeStatus);
        }
        final long endPutsTime = DateTime.now().getMillis();
        LOG.info("hbase puts task time for this task: " + (endPutsTime - startTimeForPutsTask));
      } catch (IOException e) {
        throw new HoodieIndexException("Failed to Update Index locations because of exception with HBase Client", e);
      } finally {
        limiter.stop();
      }
      return writeStatusList.iterator();
    };
  }

  /**
   * Helper method to facilitate performing mutations (including puts and deletes) in Hbase.
   */
  private void doMutations(BufferedMutator mutator, List<Mutation> mutations, RateLimiter limiter) throws IOException {
    if (mutations.isEmpty()) {
      return;
    }
    // Report the number of operations to the rate limiter. If limiter.getRate() operations have already been
    // acquired within the current second, the rate limiter will throttle the remaining calls until that second
    // has elapsed.
    limiter.tryAcquire(mutations.size());
    mutator.mutate(mutations);
    mutator.flush();
    mutations.clear();
  }

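  /**
   * Maps each fileId that received inserts to a unique partition id, which {@link WriteStatusPartitioner} uses to
   * place every insert-carrying WriteStatus into its own Spark partition before the HBase puts are issued.
   */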
  Map<String, Integer> mapFileWithInsertsToUniquePartition(JavaRDD<WriteStatus> writeStatusRDD) {
    final Map<String, Integer> fileIdPartitionMap = new HashMap<>();
    int partitionIndex = 0;
    // Map each fileId that has inserts to a unique partition Id. This will be used while
    // repartitioning RDD
    final List<String> fileIds = writeStatusRDD.filter(w -> w.getStat().getNumInserts() > 0)
                                   .map(WriteStatus::getFileId).collect();
    for (final String fileId : fileIds) {
      fileIdPartitionMap.put(fileId, partitionIndex++);
    }
    return fileIdPartitionMap;
  }

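  /**
   * Persists the new record locations to HBase. When inserts are present, WriteStatuses are repartitioned so that
   * each insert-carrying WriteStatus maps to its own Spark partition (see {@link WriteStatusPartitioner}), the put
   * batch size and QPS budget are recomputed if auto-compute is enabled, and each partition then issues rate-limited
   * puts/deletes.
   */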
  @Override
  public HoodieData<WriteStatus> updateLocation(
      HoodieData<WriteStatus> writeStatus, HoodieEngineContext context,
      HoodieTable hoodieTable) {
    JavaRDD<WriteStatus> writeStatusRDD = HoodieJavaRDD.getJavaRDD(writeStatus);
    final Option<Float> desiredQPSFraction = calculateQPSFraction(writeStatusRDD);
    final Map fileIdPartitionMap = mapFileWithInsertsToUniquePartition(writeStatusRDD);
    JavaRDD<WriteStatus> partitionedRDD = this.numWriteStatusWithInserts == 0 ? writeStatusRDD :
        writeStatusRDD.mapToPair(w -> new Tuple2<>(w.getFileId(), w))
            .partitionBy(new WriteStatusPartitioner(fileIdPartitionMap,
                this.numWriteStatusWithInserts))
            .map(Tuple2::_2);
    JavaSparkContext jsc = HoodieSparkEngineContext.getSparkContext(context);
    acquireQPSResourcesAndSetBatchSize(desiredQPSFraction, jsc);
    JavaRDD<WriteStatus> writeStatusJavaRDD = partitionedRDD.mapPartitionsWithIndex(updateLocationFunction(),
        true);
    // caching the index updated status RDD
    writeStatusJavaRDD = writeStatusJavaRDD.persist(SparkMemoryUtils.getWriteStatusStorageLevel(config.getProps()));
    // force trigger update location (hbase puts)
    writeStatusJavaRDD.count();
    this.hBaseIndexQPSResourceAllocator.releaseQPSResources();
    return HoodieJavaRDD.of(writeStatusJavaRDD);
  }

  private Option<Float> calculateQPSFraction(JavaRDD<WriteStatus> writeStatusRDD) {
    if (config.getHbaseIndexPutBatchSizeAutoCompute()) {
      /*
        Each writeStatus represents status information from a write done in one of the IOHandles.
        If a writeStatus has any insert, it implies that the corresponding task contacts HBase for
        doing puts, since we only do puts for inserts from HBaseIndex.
       */
      final Tuple2<Long, Integer> numPutsParallelismTuple = getHBasePutAccessParallelism(writeStatusRDD);
      this.totalNumInserts = numPutsParallelismTuple._1;
      this.numWriteStatusWithInserts = numPutsParallelismTuple._2;
      this.numRegionServersForTable = getNumRegionServersAliveForTable();
      final float desiredQPSFraction = this.hBaseIndexQPSResourceAllocator.calculateQPSFractionForPutsTime(
          this.totalNumInserts, this.numRegionServersForTable);
      LOG.info("Desired QPSFraction :" + desiredQPSFraction);
      LOG.info("Number HBase puts :" + this.totalNumInserts);
      LOG.info("Number of WriteStatus with inserts :" + numWriteStatusWithInserts);
      return Option.of(desiredQPSFraction);
    }
    return Option.empty();
  }

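  /**
   * Acquires the QPS fraction granted by the {@link HBaseIndexQPSResourceAllocator} and, when
   * {@link HoodieHBaseIndexConfig#PUT_BATCH_SIZE_AUTO_COMPUTE} is enabled, recomputes {@link #multiPutBatchSize}
   * from the number of region servers, the per-region-server QPS cap and the effective put parallelism.
   */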
  private void acquireQPSResourcesAndSetBatchSize(final Option<Float> desiredQPSFraction,
                                                  final JavaSparkContext jsc) {
    if (config.getHbaseIndexPutBatchSizeAutoCompute()) {
      SparkConf conf = jsc.getConf();
      int maxExecutors = conf.getInt(DEFAULT_SPARK_EXECUTOR_INSTANCES_CONFIG_NAME, 1);
      if (conf.getBoolean(DEFAULT_SPARK_DYNAMIC_ALLOCATION_ENABLED_CONFIG_NAME, false)) {
        maxExecutors = Math.max(maxExecutors, conf.getInt(
          DEFAULT_SPARK_DYNAMIC_ALLOCATION_MAX_EXECUTORS_CONFIG_NAME, 1));
      }
      final float availableQpsFraction = this.hBaseIndexQPSResourceAllocator
                                           .acquireQPSResources(desiredQPSFraction.get(), this.totalNumInserts);
      LOG.info("Allocated QPS Fraction :" + availableQpsFraction);
      multiPutBatchSize = putBatchSizeCalculator
                            .getBatchSize(
                              numRegionServersForTable,
                              maxQpsPerRegionServer,
                              numWriteStatusWithInserts,
                              maxExecutors,
                              availableQpsFraction);
      LOG.info("multiPutBatchSize :" + multiPutBatchSize);
    }
  }

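  /**
   * Returns a tuple of (total number of inserts across all WriteStatuses, number of WriteStatuses with at least one
   * insert). Only WriteStatuses with inserts result in HBase puts, so these two values drive the QPS budgeting and
   * the put-batch-size computation above.
   */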
  Tuple2<Long, Integer> getHBasePutAccessParallelism(final JavaRDD<WriteStatus> writeStatusRDD) {
    final JavaPairRDD<Long, Integer> insertOnlyWriteStatusRDD = writeStatusRDD
        .filter(w -> w.getStat().getNumInserts() > 0).mapToPair(w -> new Tuple2<>(w.getStat().getNumInserts(), 1));
    return insertOnlyWriteStatusRDD.fold(new Tuple2<>(0L, 0), (w, c) -> new Tuple2<>(w._1 + c._1, w._2 + c._2));
  }

  public static class HBasePutBatchSizeCalculator implements Serializable {

    private static final Logger LOG = LoggerFactory.getLogger(HBasePutBatchSizeCalculator.class);

    /**
     * Calculate putBatch size so that sum of requests across multiple jobs in a second does not exceed
     * maxQpsPerRegionServer for each Region Server. Multiplying qpsFraction to reduce the aggregate load on common RS
     * across topics. Assumption here is that all tables have regions across all RS, which is not necessarily true for
     * smaller tables. So, they end up getting a smaller share of QPS than they deserve, but it might be ok.
     * <p>
     * Example: {@code int putBatchSize = batchSizeCalculator.getBatchSize(10, 16667, 1200, 200, 0.1f)}
     * <p>
     * Expected batchSize is 84 because, in that case, the total number of requests sent out in one second is
     * 84 (batchSize) * 200 (maxParallelPutsTask) => 16800. We assume requests get distributed to Region Servers
     * uniformly, so each of the 10 Region Servers gets ~1680 requests, which is roughly 10% (qpsFraction) of 16667
     * (maxQPSPerRegionServer), as expected.
     * <p>
     * Assumptions made here:
     * <li>In a batch, writes get evenly distributed to each RS for that table. Since we do writes only in the case of
     * inserts and not updates, for this assumption to fail, inserts would have to be skewed towards a few RS, the
     * likelihood of which is low if the HBase table is pre-split and rowKeys are UUIDs (random strings). If this
     * assumption fails, then it is possible for some RS to receive more than maxQpsPerRegionServer QPS, but for
     * simplicity we are going ahead with this model, since this is meant to be a lightweight distributed throttling
     * mechanism without maintaining a global context. So if this assumption breaks, we are hoping the HBase Master
     * relocates hot-spot regions to new Region Servers.</li>
     * <li>For Region Server stability, throttling at a second-level granularity is fine. Although, within a second,
     * the sum of queries might be within maxQpsPerRegionServer, there could be peaks at some sub-second intervals.
     * So the assumption is that these peaks are tolerated by the Region Server (which at max can be
     * maxQpsPerRegionServer).</li>
     */
    public int getBatchSize(int numRegionServersForTable, int maxQpsPerRegionServer, int numTasksDuringPut,
                            int maxExecutors, float qpsFraction) {
      int numRSAlive = numRegionServersForTable;
      int maxReqPerSec = getMaxReqPerSec(numRSAlive, maxQpsPerRegionServer, qpsFraction);
      int numTasks = numTasksDuringPut;
      int maxParallelPutsTask = Math.max(1, Math.min(numTasks, maxExecutors));
      int multiPutBatchSizePerSecPerTask = Math.max(1, (int) Math.ceil((double) maxReqPerSec / maxParallelPutsTask));
      LOG.info("HbaseIndexThrottling: qpsFraction :" + qpsFraction);
      LOG.info("HbaseIndexThrottling: numRSAlive :" + numRSAlive);
      LOG.info("HbaseIndexThrottling: maxReqPerSec :" + maxReqPerSec);
      LOG.info("HbaseIndexThrottling: numTasks :" + numTasks);
      LOG.info("HbaseIndexThrottling: maxExecutors :" + maxExecutors);
      LOG.info("HbaseIndexThrottling: maxParallelPuts :" + maxParallelPutsTask);
      LOG.info("HbaseIndexThrottling: numRegionServersForTable :" + numRegionServersForTable);
      LOG.info("HbaseIndexThrottling: multiPutBatchSizePerSecPerTask :" + multiPutBatchSizePerSecPerTask);
      return multiPutBatchSizePerSecPerTask;
    }

    public int getMaxReqPerSec(int numRegionServersForTable, int maxQpsPerRegionServer, float qpsFraction) {
      return (int) (qpsFraction * numRegionServersForTable * maxQpsPerRegionServer);
    }
  }

  private Integer getNumRegionServersAliveForTable() {
    // This is being called in the driver, so there is only one connection
    // from the driver, so ok to use a local connection variable.
    if (numRegionServersForTable == null) {
      try (Connection conn = getHBaseConnection()) {
        RegionLocator regionLocator = conn.getRegionLocator(TableName.valueOf(tableName));
        numRegionServersForTable = Math.toIntExact(regionLocator.getAllRegionLocations().stream()
            .map(HRegionLocation::getServerName).distinct().count());
        return numRegionServersForTable;
      } catch (IOException e) {
        LOG.error("Get region locator error", e);
        throw new RuntimeException(e);
      }
    }
    return numRegionServersForTable;
  }

  @Override
  public boolean rollbackCommit(String instantTime) {
    int multiGetBatchSize = config.getHbaseIndexGetBatchSize();
    boolean rollbackSync = config.getHBaseIndexRollbackSync();
    if (!config.getHBaseIndexRollbackSync()) {
      // Default Rollback in HbaseIndex is managed via method {@link #checkIfValidCommit()}
      return true;
    }
    synchronized (SparkHoodieHBaseIndex.class) {
      if (hbaseConnection == null || hbaseConnection.isClosed()) {
        hbaseConnection = getHBaseConnection();
      }
    }
    final RateLimiter limiter = RateLimiter.create(multiPutBatchSize, TimeUnit.SECONDS);
    try (HTable hTable = (HTable) hbaseConnection.getTable(TableName.valueOf(tableName));
         BufferedMutator mutator = hbaseConnection.getBufferedMutator(TableName.valueOf(tableName))) {
      Long rollbackTime = TimelineUtils.parseDateFromInstantTime(instantTime).getTime();
      Long currentTime = new Date().getTime();
      Scan scan = new Scan();
      scan.addFamily(SYSTEM_COLUMN_FAMILY);
      scan.setTimeRange(rollbackTime, currentTime);
      ResultScanner scanner = hTable.getScanner(scan);
      Iterator<Result> scannerIterator = scanner.iterator();

      List<Get> statements = new ArrayList<>();
      List<Result> currentVersionResults = new ArrayList<>();
      List<Mutation> mutations = new ArrayList<>();
      while (scannerIterator.hasNext()) {
        Result result = scannerIterator.next();
        currentVersionResults.add(result);
        statements.add(generateStatement(fromUTF8Bytes(result.getRow()), 0L, rollbackTime - 1));

        if (scannerIterator.hasNext() && statements.size() < multiGetBatchSize) {
          continue;
        }
        Result[] lastVersionResults = hTable.get(statements);
        for (int i = 0; i < lastVersionResults.length; i++) {
          Result lastVersionResult = lastVersionResults[i];
          if (null == lastVersionResult.getRow() && rollbackSync) {
            Result currentVersionResult = currentVersionResults.get(i);
            Delete delete = new Delete(currentVersionResult.getRow());
            mutations.add(delete);
          }

          if (null != lastVersionResult.getRow()) {
            String oldPath = new String(lastVersionResult.getValue(SYSTEM_COLUMN_FAMILY, PARTITION_PATH_COLUMN));
            String nowPath = new String(currentVersionResults.get(i).getValue(SYSTEM_COLUMN_FAMILY, PARTITION_PATH_COLUMN));
            if (!oldPath.equals(nowPath) || rollbackSync) {
              Put put = new Put(lastVersionResult.getRow());
              put.addColumn(SYSTEM_COLUMN_FAMILY, COMMIT_TS_COLUMN, lastVersionResult.getValue(SYSTEM_COLUMN_FAMILY, COMMIT_TS_COLUMN));
              put.addColumn(SYSTEM_COLUMN_FAMILY, FILE_NAME_COLUMN, lastVersionResult.getValue(SYSTEM_COLUMN_FAMILY, FILE_NAME_COLUMN));
              put.addColumn(SYSTEM_COLUMN_FAMILY, PARTITION_PATH_COLUMN, lastVersionResult.getValue(SYSTEM_COLUMN_FAMILY, PARTITION_PATH_COLUMN));
              mutations.add(put);
            }
          }
        }
        doMutations(mutator, mutations, limiter);
        currentVersionResults.clear();
        statements.clear();
        mutations.clear();
      }
    } catch (Exception e) {
      LOG.error("hbase index roll back failed", e);
      return false;
    } finally {
      limiter.stop();
    }
    return true;
  }

  /**
   * Only looks up by recordKey.
   */
  @Override
  public boolean isGlobal() {
    return true;
  }

  /**
   * Mapping is available in HBase already.
   */
  @Override
  public boolean canIndexLogFiles() {
    return true;
  }

  /**
   * Index needs to be explicitly updated after storage write.
   */
  @Override
  public boolean isImplicitWithStorage() {
    return false;
  }

  public void setHbaseConnection(Connection hbaseConnection) {
    SparkHoodieHBaseIndex.hbaseConnection = hbaseConnection;
  }

  /**
   * Partitions each WriteStatus with inserts into a unique single partition. WriteStatus without inserts will be
   * assigned to random partitions. This partitioner will be useful to utilize max parallelism with spark operations
   * that are based on inserts in each WriteStatus.
   */
  public static class WriteStatusPartitioner extends Partitioner {
    private int totalPartitions;
    final Map<String, Integer> fileIdPartitionMap;

    public WriteStatusPartitioner(final Map<String, Integer> fileIdPartitionMap, final int totalPartitions) {
      this.totalPartitions = totalPartitions;
      this.fileIdPartitionMap = fileIdPartitionMap;
    }

    @Override
    public int numPartitions() {
      return this.totalPartitions;
    }

    @Override
    public int getPartition(Object key) {
      final String fileId = (String) key;
      if (!fileIdPartitionMap.containsKey(fileId)) {
        LOG.info("This writestatus(fileId: " + fileId + ") is not mapped because it doesn't have any inserts. "
            + "In this case, we can assign a random partition to this WriteStatus.");
        // Assign a random spark partition for the `WriteStatus` that has no inserts. For a spark operation that
        // depends on number of inserts, there won't be any performance penalty in packing these WriteStatus'es
        // together.
        return Math.abs(fileId.hashCode()) % totalPartitions;
      }
      return fileIdPartitionMap.get(fileId);
    }
  }
}



