/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.index.hbase;
import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.client.common.HoodieSparkEngineContext;
import org.apache.hudi.client.utils.SparkMemoryUtils;
import org.apache.hudi.common.data.HoodieData;
import org.apache.hudi.common.engine.HoodieEngineContext;
import org.apache.hudi.common.model.EmptyHoodieRecordPayload;
import org.apache.hudi.common.model.HoodieAvroRecord;
import org.apache.hudi.common.model.HoodieKey;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.model.HoodieRecordDelegate;
import org.apache.hudi.common.model.HoodieRecordLocation;
import org.apache.hudi.common.model.HoodieRecordPayload;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.timeline.HoodieTimeline;
import org.apache.hudi.common.table.timeline.TimelineUtils;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.RateLimiter;
import org.apache.hudi.common.util.ReflectionUtils;
import org.apache.hudi.config.HoodieHBaseIndexConfig;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.data.HoodieJavaRDD;
import org.apache.hudi.exception.HoodieDependentSystemUnavailableException;
import org.apache.hudi.exception.HoodieIndexException;
import org.apache.hudi.index.HoodieIndex;
import org.apache.hudi.index.HoodieIndexUtils;
import org.apache.hudi.table.HoodieTable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HRegionLocation;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.BufferedMutator;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Delete;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Mutation;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.RegionLocator;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.ResultScanner;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.spark.Partitioner;
import org.apache.spark.SparkConf;
import org.apache.spark.SparkFiles;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function2;
import org.joda.time.DateTime;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.io.Serializable;
import java.security.PrivilegedExceptionAction;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.concurrent.TimeUnit;
import scala.Tuple2;
import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.HADOOP_SECURITY_AUTHENTICATION;
import static org.apache.hadoop.hbase.HConstants.ZOOKEEPER_CLIENT_PORT;
import static org.apache.hadoop.hbase.HConstants.ZOOKEEPER_QUORUM;
import static org.apache.hadoop.hbase.HConstants.ZOOKEEPER_ZNODE_PARENT;
import static org.apache.hadoop.hbase.security.SecurityConstants.MASTER_KRB_PRINCIPAL;
import static org.apache.hadoop.hbase.security.SecurityConstants.REGIONSERVER_KRB_PRINCIPAL;
import static org.apache.hadoop.hbase.security.User.HBASE_SECURITY_AUTHORIZATION_CONF_KEY;
import static org.apache.hadoop.hbase.security.User.HBASE_SECURITY_CONF_KEY;
import static org.apache.hudi.common.util.StringUtils.fromUTF8Bytes;
import static org.apache.hudi.common.util.StringUtils.getUTF8Bytes;
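/*
 * Usage sketch (illustrative only): this index is normally enabled through Hudi write configuration rather than
 * instantiated directly. The property keys below follow HoodieIndexConfig / HoodieHBaseIndexConfig; the ZooKeeper
 * hosts and table name are placeholder values.
 *
 *   TypedProperties props = new TypedProperties();
 *   props.setProperty("hoodie.index.type", "HBASE");
 *   props.setProperty("hoodie.index.hbase.zkquorum", "zk1,zk2,zk3"); // placeholder quorum
 *   props.setProperty("hoodie.index.hbase.zkport", "2181");
 *   props.setProperty("hoodie.index.hbase.table", "hudi_record_index"); // placeholder index table
 *   HoodieWriteConfig writeConfig = HoodieWriteConfig.newBuilder()
 *       .withPath(basePath)
 *       .withProps(props)
 *       .build();
 */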
/**
* Hoodie Index implementation backed by HBase.
*/
public class SparkHoodieHBaseIndex extends HoodieIndex<Object, Object> {
public static final String DEFAULT_SPARK_EXECUTOR_INSTANCES_CONFIG_NAME = "spark.executor.instances";
public static final String DEFAULT_SPARK_DYNAMIC_ALLOCATION_ENABLED_CONFIG_NAME = "spark.dynamicAllocation.enabled";
public static final String DEFAULT_SPARK_DYNAMIC_ALLOCATION_MAX_EXECUTORS_CONFIG_NAME =
"spark.dynamicAllocation.maxExecutors";
private static final byte[] SYSTEM_COLUMN_FAMILY = getUTF8Bytes("_s");
private static final byte[] COMMIT_TS_COLUMN = getUTF8Bytes("commit_ts");
private static final byte[] FILE_NAME_COLUMN = getUTF8Bytes("file_name");
private static final byte[] PARTITION_PATH_COLUMN = getUTF8Bytes("partition_path");
private static final Logger LOG = LoggerFactory.getLogger(SparkHoodieHBaseIndex.class);
private static Connection hbaseConnection = null;
private HBaseIndexQPSResourceAllocator hBaseIndexQPSResourceAllocator = null;
private int maxQpsPerRegionServer;
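// Aggregate insert statistics for the current write, populated in calculateQPSFraction() and used when
// auto-computing the put batch size.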
private long totalNumInserts;
private int numWriteStatusWithInserts;
private static transient Thread shutdownThread;
/**
* multiPutBatchSize will be computed and re-set in updateLocation if
* {@link HoodieHBaseIndexConfig#PUT_BATCH_SIZE_AUTO_COMPUTE} is set to true.
*/
private Integer multiPutBatchSize;
private Integer numRegionServersForTable;
private final String tableName;
private HBasePutBatchSizeCalculator putBatchSizeCalculator;
public SparkHoodieHBaseIndex(HoodieWriteConfig config) {
super(config);
this.tableName = config.getHbaseTableName();
addShutDownHook();
init(config);
}
private void init(HoodieWriteConfig config) {
this.multiPutBatchSize = config.getHbaseIndexPutBatchSize();
this.maxQpsPerRegionServer = config.getHbaseIndexMaxQPSPerRegionServer();
this.putBatchSizeCalculator = new HBasePutBatchSizeCalculator();
this.hBaseIndexQPSResourceAllocator = createQPSResourceAllocator(this.config);
}
public HBaseIndexQPSResourceAllocator createQPSResourceAllocator(HoodieWriteConfig config) {
try {
LOG.info("createQPSResourceAllocator :" + config.getHBaseQPSResourceAllocatorClass());
return (HBaseIndexQPSResourceAllocator) ReflectionUtils
.loadClass(config.getHBaseQPSResourceAllocatorClass(), config);
} catch (Exception e) {
LOG.warn("error while instantiating HBaseIndexQPSResourceAllocator", e);
}
return new DefaultHBaseQPSResourceAllocator(config);
}
private Connection getHBaseConnection() {
Configuration hbaseConfig = HBaseConfiguration.create();
String quorum = config.getHbaseZkQuorum();
hbaseConfig.set(ZOOKEEPER_QUORUM, quorum);
String zkZnodeParent = config.getHBaseZkZnodeParent();
if (zkZnodeParent != null) {
hbaseConfig.set(ZOOKEEPER_ZNODE_PARENT, zkZnodeParent);
}
String port = String.valueOf(config.getHbaseZkPort());
hbaseConfig.set(ZOOKEEPER_CLIENT_PORT, port);
try {
String authentication = config.getHBaseIndexSecurityAuthentication();
if (authentication.equals("kerberos")) {
hbaseConfig.set(HBASE_SECURITY_CONF_KEY, "kerberos");
hbaseConfig.set(HADOOP_SECURITY_AUTHENTICATION, "kerberos");
hbaseConfig.set(HBASE_SECURITY_AUTHORIZATION_CONF_KEY, "true");
hbaseConfig.set(REGIONSERVER_KRB_PRINCIPAL, config.getHBaseIndexRegionserverPrincipal());
hbaseConfig.set(MASTER_KRB_PRINCIPAL, config.getHBaseIndexMasterPrincipal());
String principal = config.getHBaseIndexKerberosUserPrincipal();
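// The keytab is resolved through SparkFiles, so the configured value is just a file name that is expected to
// have been shipped to the executors (e.g. via spark-submit --files).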
String keytab = SparkFiles.get(config.getHBaseIndexKerberosUserKeytab());
UserGroupInformation.setConfiguration(hbaseConfig);
UserGroupInformation ugi = UserGroupInformation.loginUserFromKeytabAndReturnUGI(principal, keytab);
return ugi.doAs((PrivilegedExceptionAction<Connection>) () ->
(Connection) ConnectionFactory.createConnection(hbaseConfig)
);
} else {
return ConnectionFactory.createConnection(hbaseConfig);
}
} catch (IOException | InterruptedException e) {
throw new HoodieDependentSystemUnavailableException(HoodieDependentSystemUnavailableException.HBASE,
quorum + ":" + port, e);
}
}
/**
* Since we are sharing the HBaseConnection across tasks in a JVM, make sure the HBaseConnection is closed when JVM
* exits.
*/
private void addShutDownHook() {
if (null == shutdownThread) {
shutdownThread = new Thread(() -> {
try {
hbaseConnection.close();
} catch (Exception e) {
// fail silently for any sort of exception
}
});
Runtime.getRuntime().addShutdownHook(shutdownThread);
}
}
/**
* Ensure that any resources used for indexing are released here.
*/
@Override
public void close() {
LOG.info("No resources to release from Hbase index");
}
private Get generateStatement(String key) throws IOException {
return new Get(getUTF8Bytes(getHBaseKey(key))).readVersions(1).addColumn(SYSTEM_COLUMN_FAMILY, COMMIT_TS_COLUMN)
.addColumn(SYSTEM_COLUMN_FAMILY, FILE_NAME_COLUMN).addColumn(SYSTEM_COLUMN_FAMILY, PARTITION_PATH_COLUMN);
}
private Get generateStatement(String key, long startTime, long endTime) throws IOException {
return generateStatement(key).setTimeRange(startTime, endTime);
}
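/**
 * Returns the HBase row key used for the given record key. The base implementation uses the record key as-is;
 * subclasses may override this to apply a custom key transformation.
 */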
protected String getHBaseKey(String key) {
return key;
}
/**
* Function that tags each HoodieRecord with an existing location, if known.
*/
private <R> Function2<Integer, Iterator<HoodieRecord<R>>, Iterator<HoodieRecord<R>>> locationTagFunction(
HoodieTableMetaClient metaClient) {
// `multiGetBatchSize` is intended to be a batch per 100ms. To create a rate limiter that measures
// operations per second, we need to multiply `multiGetBatchSize` by 10.
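// For example, with a get batch size of 100, the limiter below allows roughly 100 * 10 = 1000 GET operations
// per second for this task.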
Integer multiGetBatchSize = config.getHbaseIndexGetBatchSize();
return (partitionNum, hoodieRecordIterator) -> {
boolean updatePartitionPath = config.getHbaseIndexUpdatePartitionPath();
RateLimiter limiter = RateLimiter.create(multiGetBatchSize * 10, TimeUnit.SECONDS);
// Grab the global HBase connection
synchronized (SparkHoodieHBaseIndex.class) {
if (hbaseConnection == null || hbaseConnection.isClosed()) {
hbaseConnection = getHBaseConnection();
}
}
List<HoodieRecord<R>> taggedRecords = new ArrayList<>();
try (HTable hTable = (HTable) hbaseConnection.getTable(TableName.valueOf(tableName))) {
List<Get> statements = new ArrayList<>();
List<HoodieRecord> currentBatchOfRecords = new LinkedList<>();
// Do the tagging.
HoodieTimeline completedCommitsTimeline = metaClient.getCommitsTimeline().filterCompletedInstants();
while (hoodieRecordIterator.hasNext()) {
HoodieRecord rec = hoodieRecordIterator.next();
statements.add(generateStatement(rec.getRecordKey()));
currentBatchOfRecords.add(rec);
// iterate till we reach batch size
if (hoodieRecordIterator.hasNext() && statements.size() < multiGetBatchSize) {
continue;
}
// get results for batch from Hbase
Result[] results = doGet(hTable, statements, limiter);
// clear statements to be GC'd
statements.clear();
for (Result result : results) {
// first, attempt to grab location from HBase
HoodieRecord currentRecord = currentBatchOfRecords.remove(0);
if (result.getRow() == null) {
taggedRecords.add(currentRecord);
continue;
}
String keyFromResult = fromUTF8Bytes(result.getRow());
String commitTs = fromUTF8Bytes(result.getValue(SYSTEM_COLUMN_FAMILY, COMMIT_TS_COLUMN));
String fileId = fromUTF8Bytes(result.getValue(SYSTEM_COLUMN_FAMILY, FILE_NAME_COLUMN));
String partitionPath = fromUTF8Bytes(result.getValue(SYSTEM_COLUMN_FAMILY, PARTITION_PATH_COLUMN));
if (!HoodieIndexUtils.checkIfValidCommit(completedCommitsTimeline, commitTs)) {
// if commit is invalid, treat this as a new taggedRecord
taggedRecords.add(currentRecord);
continue;
}
// check whether to do partition change processing
if (updatePartitionPath && !partitionPath.equals(currentRecord.getPartitionPath())) {
// delete partition old data record
HoodieRecord emptyRecord = new HoodieAvroRecord(new HoodieKey(currentRecord.getRecordKey(), partitionPath),
new EmptyHoodieRecordPayload());
emptyRecord.unseal();
emptyRecord.setCurrentLocation(new HoodieRecordLocation(commitTs, fileId));
emptyRecord.setIgnoreIndexUpdate(true);
emptyRecord.seal();
// insert partition new data record
currentRecord = new HoodieAvroRecord(new HoodieKey(currentRecord.getRecordKey(), currentRecord.getPartitionPath()),
(HoodieRecordPayload) currentRecord.getData());
taggedRecords.add(emptyRecord);
taggedRecords.add(currentRecord);
} else {
currentRecord = new HoodieAvroRecord(new HoodieKey(currentRecord.getRecordKey(), partitionPath),
(HoodieRecordPayload) currentRecord.getData());
currentRecord.unseal();
currentRecord.setCurrentLocation(new HoodieRecordLocation(commitTs, fileId));
currentRecord.seal();
taggedRecords.add(currentRecord);
// the key from Result and the key being processed should be the same
assert (currentRecord.getRecordKey().contentEquals(keyFromResult));
}
}
}
} catch (IOException e) {
throw new HoodieIndexException("Failed to Tag indexed locations because of exception with HBase Client", e);
} finally {
limiter.stop();
}
return taggedRecords.iterator();
};
}
private Result[] doGet(HTable hTable, List<Get> keys, RateLimiter limiter) throws IOException {
if (keys.size() > 0) {
limiter.tryAcquire(keys.size());
return hTable.get(keys);
}
return new Result[keys.size()];
}
@Override
public <R> HoodieData<HoodieRecord<R>> tagLocation(
HoodieData<HoodieRecord<R>> records, HoodieEngineContext context,
HoodieTable hoodieTable) {
return HoodieJavaRDD.of(HoodieJavaRDD.getJavaRDD(records)
.mapPartitionsWithIndex(locationTagFunction(hoodieTable.getMetaClient()), true));
}
private Function2<Integer, Iterator<WriteStatus>, Iterator<WriteStatus>> updateLocationFunction() {
return (partition, statusIterator) -> {
List<WriteStatus> writeStatusList = new ArrayList<>();
// Grab the global HBase connection
synchronized (SparkHoodieHBaseIndex.class) {
if (hbaseConnection == null || hbaseConnection.isClosed()) {
hbaseConnection = getHBaseConnection();
}
}
final long startTimeForPutsTask = DateTime.now().getMillis();
LOG.info("startTimeForPutsTask for this task: " + startTimeForPutsTask);
final RateLimiter limiter = RateLimiter.create(multiPutBatchSize, TimeUnit.SECONDS);
try (BufferedMutator mutator = hbaseConnection.getBufferedMutator(TableName.valueOf(tableName))) {
while (statusIterator.hasNext()) {
WriteStatus writeStatus = statusIterator.next();
List<Mutation> mutations = new ArrayList<>();
try {
long numOfInserts = writeStatus.getStat().getNumInserts();
LOG.info("Num of inserts in this WriteStatus: " + numOfInserts);
LOG.info("Total inserts in this job: " + this.totalNumInserts);
LOG.info("multiPutBatchSize for this job: " + this.multiPutBatchSize);
// Create a rate limiter that allows `multiPutBatchSize` operations per second
// Any calls beyond `multiPutBatchSize` within a second will be rate limited
for (HoodieRecordDelegate recordDelegate : writeStatus.getWrittenRecordDelegates()) {
if (!writeStatus.isErrored(recordDelegate.getHoodieKey())) {
if (recordDelegate.getIgnoreIndexUpdate()) {
continue;
}
Option<HoodieRecordLocation> loc = recordDelegate.getNewLocation();
if (loc.isPresent()) {
if (recordDelegate.getCurrentLocation().isPresent()) {
// This is an update, no need to update index
continue;
}
Put put = new Put(getUTF8Bytes(getHBaseKey(recordDelegate.getRecordKey())));
put.addColumn(SYSTEM_COLUMN_FAMILY, COMMIT_TS_COLUMN, getUTF8Bytes(loc.get().getInstantTime()));
put.addColumn(SYSTEM_COLUMN_FAMILY, FILE_NAME_COLUMN, getUTF8Bytes(loc.get().getFileId()));
put.addColumn(SYSTEM_COLUMN_FAMILY, PARTITION_PATH_COLUMN, getUTF8Bytes(recordDelegate.getPartitionPath()));
mutations.add(put);
} else {
// Delete existing index for a deleted record
Delete delete = new Delete(getUTF8Bytes(getHBaseKey(recordDelegate.getRecordKey())));
mutations.add(delete);
}
}
if (mutations.size() < multiPutBatchSize) {
continue;
}
doMutations(mutator, mutations, limiter);
}
// process remaining puts and deletes, if any
doMutations(mutator, mutations, limiter);
} catch (Exception e) {
Exception we = new Exception("Error updating index for " + writeStatus, e);
LOG.error(we.getMessage(), e);
writeStatus.setGlobalError(we);
}
writeStatusList.add(writeStatus);
}
final long endPutsTime = DateTime.now().getMillis();
LOG.info("hbase puts task time for this task: " + (endPutsTime - startTimeForPutsTask));
} catch (IOException e) {
throw new HoodieIndexException("Failed to Update Index locations because of exception with HBase Client", e);
} finally {
limiter.stop();
}
return writeStatusList.iterator();
};
}
/**
* Helper method to facilitate performing mutations (including puts and deletes) in Hbase.
*/
private void doMutations(BufferedMutator mutator, List<Mutation> mutations, RateLimiter limiter) throws IOException {
if (mutations.isEmpty()) {
return;
}
// Report the number of operations to account against the rate limiter.
// If #limiter.getRate() operations have been acquired within one second, the rate limiter throttles
// the remaining calls for the rest of that second.
limiter.tryAcquire(mutations.size());
mutator.mutate(mutations);
mutator.flush();
mutations.clear();
}
Map<String, Integer> mapFileWithInsertsToUniquePartition(JavaRDD<WriteStatus> writeStatusRDD) {
final Map<String, Integer> fileIdPartitionMap = new HashMap<>();
int partitionIndex = 0;
// Map each fileId that has inserts to a unique partition Id. This will be used while
// repartitioning RDD
final List<String> fileIds = writeStatusRDD.filter(w -> w.getStat().getNumInserts() > 0)
.map(WriteStatus::getFileId).collect();
for (final String fileId : fileIds) {
fileIdPartitionMap.put(fileId, partitionIndex++);
}
return fileIdPartitionMap;
}
@Override
public HoodieData<WriteStatus> updateLocation(
HoodieData<WriteStatus> writeStatus, HoodieEngineContext context,
HoodieTable hoodieTable) {
JavaRDD<WriteStatus> writeStatusRDD = HoodieJavaRDD.getJavaRDD(writeStatus);
final Option<Float> desiredQPSFraction = calculateQPSFraction(writeStatusRDD);
final Map<String, Integer> fileIdPartitionMap = mapFileWithInsertsToUniquePartition(writeStatusRDD);
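// Repartition so that each WriteStatus containing inserts gets its own Spark partition (see WriteStatusPartitioner),
// maximizing parallelism for the HBase puts below.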
JavaRDD<WriteStatus> partitionedRDD = this.numWriteStatusWithInserts == 0 ? writeStatusRDD :
writeStatusRDD.mapToPair(w -> new Tuple2<>(w.getFileId(), w))
.partitionBy(new WriteStatusPartitioner(fileIdPartitionMap,
this.numWriteStatusWithInserts))
.map(Tuple2::_2);
JavaSparkContext jsc = HoodieSparkEngineContext.getSparkContext(context);
acquireQPSResourcesAndSetBatchSize(desiredQPSFraction, jsc);
JavaRDD<WriteStatus> writeStatusJavaRDD = partitionedRDD.mapPartitionsWithIndex(updateLocationFunction(),
true);
// caching the index updated status RDD
writeStatusJavaRDD = writeStatusJavaRDD.persist(SparkMemoryUtils.getWriteStatusStorageLevel(config.getProps()));
// force trigger update location(hbase puts)
writeStatusJavaRDD.count();
this.hBaseIndexQPSResourceAllocator.releaseQPSResources();
return HoodieJavaRDD.of(writeStatusJavaRDD);
}
private Option<Float> calculateQPSFraction(JavaRDD<WriteStatus> writeStatusRDD) {
if (config.getHbaseIndexPutBatchSizeAutoCompute()) {
/*
Each writeStatus represents status information from a write done in one of the IOHandles.
If a writeStatus has any insert, it implies that the corresponding task contacts HBase for
doing puts, since we only do puts for inserts from HBaseIndex.
*/
final Tuple2<Long, Integer> numPutsParallelismTuple = getHBasePutAccessParallelism(writeStatusRDD);
this.totalNumInserts = numPutsParallelismTuple._1;
this.numWriteStatusWithInserts = numPutsParallelismTuple._2;
this.numRegionServersForTable = getNumRegionServersAliveForTable();
final float desiredQPSFraction = this.hBaseIndexQPSResourceAllocator.calculateQPSFractionForPutsTime(
this.totalNumInserts, this.numRegionServersForTable);
LOG.info("Desired QPSFraction :" + desiredQPSFraction);
LOG.info("Number HBase puts :" + this.totalNumInserts);
LOG.info("Number of WriteStatus with inserts :" + numWriteStatusWithInserts);
return Option.of(desiredQPSFraction);
}
return Option.empty();
}
private void acquireQPSResourcesAndSetBatchSize(final Option<Float> desiredQPSFraction,
final JavaSparkContext jsc) {
if (config.getHbaseIndexPutBatchSizeAutoCompute()) {
SparkConf conf = jsc.getConf();
int maxExecutors = conf.getInt(DEFAULT_SPARK_EXECUTOR_INSTANCES_CONFIG_NAME, 1);
if (conf.getBoolean(DEFAULT_SPARK_DYNAMIC_ALLOCATION_ENABLED_CONFIG_NAME, false)) {
maxExecutors = Math.max(maxExecutors, conf.getInt(
DEFAULT_SPARK_DYNAMIC_ALLOCATION_MAX_EXECUTORS_CONFIG_NAME, 1));
}
final float availableQpsFraction = this.hBaseIndexQPSResourceAllocator
.acquireQPSResources(desiredQPSFraction.get(), this.totalNumInserts);
LOG.info("Allocated QPS Fraction :" + availableQpsFraction);
multiPutBatchSize = putBatchSizeCalculator
.getBatchSize(
numRegionServersForTable,
maxQpsPerRegionServer,
numWriteStatusWithInserts,
maxExecutors,
availableQpsFraction);
LOG.info("multiPutBatchSize :" + multiPutBatchSize);
}
}
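/**
 * Aggregates, over all WriteStatuses that contain inserts, the total number of inserts and the number of such
 * WriteStatuses, returned as a (totalNumInserts, numWriteStatusWithInserts) tuple.
 */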
Tuple2<Long, Integer> getHBasePutAccessParallelism(final JavaRDD<WriteStatus> writeStatusRDD) {
final JavaPairRDD<Long, Integer> insertOnlyWriteStatusRDD = writeStatusRDD
.filter(w -> w.getStat().getNumInserts() > 0).mapToPair(w -> new Tuple2<>(w.getStat().getNumInserts(), 1));
return insertOnlyWriteStatusRDD.fold(new Tuple2<>(0L, 0), (w, c) -> new Tuple2<>(w._1 + c._1, w._2 + c._2));
}
public static class HBasePutBatchSizeCalculator implements Serializable {
private static final Logger LOG = LoggerFactory.getLogger(HBasePutBatchSizeCalculator.class);
/**
* Calculate putBatch size so that the sum of requests across multiple jobs in a second does not exceed
* maxQpsPerRegionServer for each Region Server. Multiplying by qpsFraction reduces the aggregate load on Region
* Servers shared across jobs. The assumption here is that all tables have regions across all Region Servers, which
* is not necessarily true for smaller tables; those end up getting a smaller share of QPS than they deserve, but
* that is acceptable.
* <p>
* Example: int putBatchSize = batchSizeCalculator.getBatchSize(10, 16667, 1200, 200, 0.1f)
* <p>
* The expected batchSize is 84: the allowed aggregate rate is 16667 (maxQpsPerRegionServer) * 10 (numRegionServers)
* * 0.1 (qpsFraction) = 16667 requests per second, spread over min(1200 put tasks, 200 executors) = 200 parallel
* put tasks, i.e. ceil(16667 / 200) = 84 puts per task per second. Assuming requests are distributed to Region
* Servers uniformly, each Region Server receives about 84 * 200 / 10 = 1680 requests per second, which is roughly
* 10% of maxQpsPerRegionServer, as expected.
* <p>
* Assumptions made here:
* <p>
* In a batch, writes get evenly distributed to each Region Server for that table. Since we do writes only for
* inserts and not updates, for this assumption to fail, inserts would have to be skewed towards a few Region
* Servers, which is unlikely if the HBase table is pre-split and rowKeys are UUIDs (random strings). If this
* assumption fails, some Region Servers may receive more than maxQpsPerRegionServer QPS, but for simplicity we go
* ahead with this model, since it is meant to be a lightweight distributed throttling mechanism without maintaining
* a global context. So if this assumption breaks, we are hoping the HBase Master relocates hot-spot regions to
* other Region Servers.
* <p>
* For Region Server stability, throttling at one-second granularity is fine. Although the sum of queries within a
* second stays under maxQpsPerRegionServer, there can be peaks in some sub-second intervals; the assumption is that
* such peaks are tolerated by the Region Server (and are at most maxQpsPerRegionServer).
*/
public int getBatchSize(int numRegionServersForTable, int maxQpsPerRegionServer,
int numTasksDuringPut, int maxExecutors, float qpsFraction) {
int numRSAlive = numRegionServersForTable;
int maxReqPerSec = getMaxReqPerSec(numRSAlive, maxQpsPerRegionServer, qpsFraction);
int numTasks = numTasksDuringPut;
int maxParallelPutsTask = Math.max(1, Math.min(numTasks, maxExecutors));
int multiPutBatchSizePerSecPerTask = Math.max(1, (int) Math.ceil((double) maxReqPerSec / maxParallelPutsTask));
LOG.info("HbaseIndexThrottling: qpsFraction :" + qpsFraction);
LOG.info("HbaseIndexThrottling: numRSAlive :" + numRSAlive);
LOG.info("HbaseIndexThrottling: maxReqPerSec :" + maxReqPerSec);
LOG.info("HbaseIndexThrottling: numTasks :" + numTasks);
LOG.info("HbaseIndexThrottling: maxExecutors :" + maxExecutors);
LOG.info("HbaseIndexThrottling: maxParallelPuts :" + maxParallelPutsTask);
LOG.info("HbaseIndexThrottling: numRegionServersForTable :" + numRegionServersForTable);
LOG.info("HbaseIndexThrottling: multiPutBatchSizePerSecPerTask :" + multiPutBatchSizePerSecPerTask);
return multiPutBatchSizePerSecPerTask;
}
public int getMaxReqPerSec(int numRegionServersForTable, int maxQpsPerRegionServer, float qpsFraction) {
return (int) (qpsFraction * numRegionServersForTable * maxQpsPerRegionServer);
}
}
private Integer getNumRegionServersAliveForTable() {
// This is being called in the driver, so there is only one connection
// from the driver, so ok to use a local connection variable.
if (numRegionServersForTable == null) {
try (Connection conn = getHBaseConnection()) {
RegionLocator regionLocator = conn.getRegionLocator(TableName.valueOf(tableName));
numRegionServersForTable = Math
.toIntExact(regionLocator.getAllRegionLocations().stream().map(HRegionLocation::getServerName).distinct().count());
return numRegionServersForTable;
} catch (IOException e) {
LOG.error("Get region locator error", e);
throw new RuntimeException(e);
}
}
return numRegionServersForTable;
}
@Override
public boolean rollbackCommit(String instantTime) {
int multiGetBatchSize = config.getHbaseIndexGetBatchSize();
boolean rollbackSync = config.getHBaseIndexRollbackSync();
if (!rollbackSync) {
// Default Rollback in HbaseIndex is managed via method {@link #checkIfValidCommit()}
return true;
}
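// Roll back index entries written at or after the rollback instant: scan rows whose HBase cell timestamp falls in
// [rollbackTime, now); for each such row, restore the previous version of the index entry when one exists (and the
// partition path changed or rollbackSync is enabled), or delete the row when no previous version exists and
// rollbackSync is enabled.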
synchronized (SparkHoodieHBaseIndex.class) {
if (hbaseConnection == null || hbaseConnection.isClosed()) {
hbaseConnection = getHBaseConnection();
}
}
final RateLimiter limiter = RateLimiter.create(multiPutBatchSize, TimeUnit.SECONDS);
try (HTable hTable = (HTable) hbaseConnection.getTable(TableName.valueOf(tableName));
BufferedMutator mutator = hbaseConnection.getBufferedMutator(TableName.valueOf(tableName))) {
Long rollbackTime = TimelineUtils.parseDateFromInstantTime(instantTime).getTime();
Long currentTime = new Date().getTime();
Scan scan = new Scan();
scan.addFamily(SYSTEM_COLUMN_FAMILY);
scan.setTimeRange(rollbackTime, currentTime);
ResultScanner scanner = hTable.getScanner(scan);
Iterator<Result> scannerIterator = scanner.iterator();
List<Get> statements = new ArrayList<>();
List<Result> currentVersionResults = new ArrayList<>();
List<Mutation> mutations = new ArrayList<>();
while (scannerIterator.hasNext()) {
Result result = scannerIterator.next();
currentVersionResults.add(result);
statements.add(generateStatement(fromUTF8Bytes(result.getRow()), 0L, rollbackTime - 1));
if (scannerIterator.hasNext() && statements.size() < multiGetBatchSize) {
continue;
}
Result[] lastVersionResults = hTable.get(statements);
for (int i = 0; i < lastVersionResults.length; i++) {
Result lastVersionResult = lastVersionResults[i];
if (null == lastVersionResult.getRow() && rollbackSync) {
Result currentVersionResult = currentVersionResults.get(i);
Delete delete = new Delete(currentVersionResult.getRow());
mutations.add(delete);
}
if (null != lastVersionResult.getRow()) {
String oldPath = new String(lastVersionResult.getValue(SYSTEM_COLUMN_FAMILY, PARTITION_PATH_COLUMN));
String nowPath = new String(currentVersionResults.get(i).getValue(SYSTEM_COLUMN_FAMILY, PARTITION_PATH_COLUMN));
if (!oldPath.equals(nowPath) || rollbackSync) {
Put put = new Put(lastVersionResult.getRow());
put.addColumn(SYSTEM_COLUMN_FAMILY, COMMIT_TS_COLUMN, lastVersionResult.getValue(SYSTEM_COLUMN_FAMILY, COMMIT_TS_COLUMN));
put.addColumn(SYSTEM_COLUMN_FAMILY, FILE_NAME_COLUMN, lastVersionResult.getValue(SYSTEM_COLUMN_FAMILY, FILE_NAME_COLUMN));
put.addColumn(SYSTEM_COLUMN_FAMILY, PARTITION_PATH_COLUMN, lastVersionResult.getValue(SYSTEM_COLUMN_FAMILY, PARTITION_PATH_COLUMN));
mutations.add(put);
}
}
}
doMutations(mutator, mutations, limiter);
currentVersionResults.clear();
statements.clear();
mutations.clear();
}
} catch (Exception e) {
LOG.error("hbase index roll back failed", e);
return false;
} finally {
limiter.stop();
}
return true;
}
/**
* Only looks up by recordKey.
*/
@Override
public boolean isGlobal() {
return true;
}
/**
* Mapping is available in HBase already.
*/
@Override
public boolean canIndexLogFiles() {
return true;
}
/**
* Index needs to be explicitly updated after storage write.
*/
@Override
public boolean isImplicitWithStorage() {
return false;
}
public void setHbaseConnection(Connection hbaseConnection) {
SparkHoodieHBaseIndex.hbaseConnection = hbaseConnection;
}
/**
* Partitions each WriteStatus that has inserts into its own single partition; WriteStatuses without inserts are
* assigned to random partitions. This partitioner helps utilize maximum parallelism for Spark operations that are
* driven by the inserts in each WriteStatus.
*/
public static class WriteStatusPartitioner extends Partitioner {
private int totalPartitions;
final Map<String, Integer> fileIdPartitionMap;
public WriteStatusPartitioner(final Map<String, Integer> fileIdPartitionMap, final int totalPartitions) {
this.totalPartitions = totalPartitions;
this.fileIdPartitionMap = fileIdPartitionMap;
}
@Override
public int numPartitions() {
return this.totalPartitions;
}
@Override
public int getPartition(Object key) {
final String fileId = (String) key;
if (!fileIdPartitionMap.containsKey(fileId)) {
LOG.info("This writestatus(fileId: " + fileId + ") is not mapped because it doesn't have any inserts. "
+ "In this case, we can assign a random partition to this WriteStatus.");
// Assign random spark partition for the `WriteStatus` that has no inserts. For a spark operation that depends
// on number of inserts, there won't be any performance penalty in packing these WriteStatus'es together.
return Math.abs(fileId.hashCode()) % totalPartitions;
}
return fileIdPartitionMap.get(fileId);
}
}
}