org.apache.phoenix.hbase.index.Indexer Maven / Gradle / Ivy
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.phoenix.hbase.index;
import static org.apache.phoenix.hbase.index.util.IndexManagementUtil.rethrowIndexingException;
import static org.apache.phoenix.hbase.index.write.IndexWriterUtils.DEFAULT_INDEX_WRITER_RPC_PAUSE;
import static org.apache.phoenix.hbase.index.write.IndexWriterUtils.DEFAULT_INDEX_WRITER_RPC_RETRIES_NUMBER;
import static org.apache.phoenix.hbase.index.write.IndexWriterUtils.INDEX_WRITER_RPC_PAUSE;
import static org.apache.phoenix.hbase.index.write.IndexWriterUtils.INDEX_WRITER_RPC_RETRIES_NUMBER;
import java.io.IOException;
import java.security.PrivilegedExceptionAction;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.CoprocessorEnvironment;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.HConstants.OperationStatusCode;
import org.apache.hadoop.hbase.HRegionInfo;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.KeyValueUtil;
import org.apache.hadoop.hbase.client.Delete;
import org.apache.hadoop.hbase.client.Durability;
import org.apache.hadoop.hbase.client.Increment;
import org.apache.hadoop.hbase.client.Mutation;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.coprocessor.BaseRegionObserver;
import org.apache.hadoop.hbase.coprocessor.ObserverContext;
import org.apache.hadoop.hbase.coprocessor.RegionCoprocessorEnvironment;
import org.apache.hadoop.hbase.ipc.RpcControllerFactory;
import org.apache.hadoop.hbase.ipc.controller.InterRegionServerIndexRpcControllerFactory;
import org.apache.hadoop.hbase.regionserver.InternalScanner;
import org.apache.hadoop.hbase.regionserver.KeyValueScanner;
import org.apache.hadoop.hbase.regionserver.MiniBatchOperationInProgress;
import org.apache.hadoop.hbase.regionserver.OperationStatus;
import org.apache.hadoop.hbase.regionserver.Region;
import org.apache.hadoop.hbase.regionserver.ScanType;
import org.apache.hadoop.hbase.regionserver.Store;
import org.apache.hadoop.hbase.regionserver.wal.HLogKey;
import org.apache.hadoop.hbase.regionserver.wal.WALEdit;
import org.apache.hadoop.hbase.security.User;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.Pair;
import org.apache.htrace.Span;
import org.apache.htrace.Trace;
import org.apache.htrace.TraceScope;
import org.apache.phoenix.coprocessor.BaseScannerRegionObserver.ReplayWrite;
import org.apache.phoenix.coprocessor.DelegateRegionCoprocessorEnvironment;
import org.apache.phoenix.hbase.index.LockManager.RowLock;
import org.apache.phoenix.hbase.index.builder.IndexBuildManager;
import org.apache.phoenix.hbase.index.builder.IndexBuilder;
import org.apache.phoenix.hbase.index.metrics.MetricsIndexerSource;
import org.apache.phoenix.hbase.index.metrics.MetricsIndexerSourceFactory;
import org.apache.phoenix.hbase.index.table.HTableInterfaceReference;
import org.apache.phoenix.hbase.index.util.ImmutableBytesPtr;
import org.apache.phoenix.hbase.index.util.IndexManagementUtil;
import org.apache.phoenix.hbase.index.util.VersionUtil;
import org.apache.phoenix.hbase.index.wal.IndexedKeyValue;
import org.apache.phoenix.hbase.index.write.IndexFailurePolicy;
import org.apache.phoenix.hbase.index.write.IndexWriter;
import org.apache.phoenix.hbase.index.write.RecoveryIndexWriter;
import org.apache.phoenix.hbase.index.write.recovery.PerRegionIndexWriteCache;
import org.apache.phoenix.hbase.index.write.recovery.StoreFailuresInCachePolicy;
import org.apache.phoenix.query.QueryServicesOptions;
import org.apache.phoenix.trace.TracingUtils;
import org.apache.phoenix.trace.util.NullSpan;
import org.apache.phoenix.util.EnvironmentEdgeManager;
import org.apache.phoenix.util.PropertiesUtil;
import org.apache.phoenix.util.ScanUtil;
import org.apache.phoenix.util.ServerUtil;
import org.apache.phoenix.util.ServerUtil.ConnectionType;
import com.google.common.collect.Lists;
import com.google.common.collect.Multimap;
/**
* Do all the work of managing index updates from a single coprocessor. All Puts/Deletes are passed
* to an {@link IndexBuilder} to determine the actual updates to make.
*
* If the WAL is enabled, these updates are then added to the WALEdit and attempted to be written to
* the WAL after the WALEdit has been saved. If any of the index updates fail, this server is
* immediately terminated and we rely on WAL replay to attempt the index updates again (see
* {@link #preWALRestore(ObserverContext, HRegionInfo, HLogKey, WALEdit)}).
*
* If the WAL is disabled, the updates are attempted immediately. No consistency guarantees are made
* if the WAL is disabled - some or none of the index updates may be successful. All updates in a
* single batch must have the same durability level - either everything gets written to the WAL or
* nothing does. Currently, we do not support mixed-durability updates within a single batch. If you
* want to have different durability levels, you only need to split the updates into two different
* batches.
*
* We don't need to implement {@link #postPut(ObserverContext, Put, WALEdit, Durability)} and
* {@link #postDelete(ObserverContext, Delete, WALEdit, Durability)} hooks because
* Phoenix always does batch mutations.
*
*/
public class Indexer extends BaseRegionObserver {
/** Logger shared by all instances of this coprocessor. */
private static final Log LOG = LogFactory.getLog(Indexer.class);
// IGNORE and NOWRITE both carry a SUCCESS code but are deliberately distinct instances:
// the batch pre-processing compares operation statuses against IGNORE by reference
// ("!= IGNORE"), so the two markers must never be merged into one object.
// IGNORE is set on atomic operations, which are skipped when the index updates are built.
private static final OperationStatus IGNORE = new OperationStatus(OperationStatusCode.SUCCESS);
// NOWRITE is set on operations of a batch whose replay mode is ReplayWrite.INDEX_ONLY.
private static final OperationStatus NOWRITE = new OperationStatus(OperationStatusCode.SUCCESS);
/** Writer used for regular index updates; created in start() and stopped in stop(). */
protected IndexWriter writer;
/** Builds the index updates for incoming mutations; created in start() and stopped in stop(). */
protected IndexBuildManager builder;
/** Provides the row locks taken while the index updates of a batch are computed. */
private LockManager lockManager;
// Hack to get around not being able to save any state between
// coprocessor calls. TODO: remove after HBASE-18127 when available
private static class BatchMutateContext {
public final int clientVersion;
public Collection> indexUpdates = Collections.emptyList();
public List rowLocks = Lists.newArrayListWithExpectedSize(QueryServicesOptions.DEFAULT_MUTATE_BATCH_SIZE);
public BatchMutateContext(int clientVersion) {
this.clientVersion = clientVersion;
}
}
// Context of the batch currently being processed by the calling thread. Typed so that
// get() yields a BatchMutateContext without a cast.
private ThreadLocal<BatchMutateContext> batchMutateContext =
        new ThreadLocal<BatchMutateContext>();
/** Configuration key naming the {@link IndexBuilder} class to use. */
public static final String INDEX_BUILDER_CONF_KEY = "index.builder";
/**
 * Configuration key controlling whether the indexer validates the running HBase version on
 * start-up (defaults to true where it is read). Generally, you only want to disable this check
 * for testing or for custom versions of HBase.
 */
public static final String CHECK_VERSION_CONF_KEY = "com.saleforce.hbase.index.checkversion";
// Key of the failure policy class used by the recovery writer; read in start().
private static final String INDEX_RECOVERY_FAILURE_POLICY_KEY = "org.apache.hadoop.hbase.index.recovery.failurepolicy";
// Slow-call thresholds: each key below has a default of 3000 that applies when the key is
// unset. The values are compared against durations measured in milliseconds.
// Threshold key for the post-batch-mutate (index write) phase.
private static final String INDEXER_INDEX_WRITE_SLOW_THRESHOLD_KEY = "phoenix.indexer.slow.post.batch.mutate.threshold";
private static final long INDEXER_INDEX_WRITE_SLOW_THRESHOLD_DEFAULT = 3_000;
// Threshold key for the pre-batch-mutate (index prepare) phase.
private static final String INDEXER_INDEX_PREPARE_SLOW_THRESHOLD_KEY = "phoenix.indexer.slow.pre.batch.mutate.threshold";
// NOTE(review): the identifier below is misspelled ("THREHSOLD"); renaming it requires
// updating its use in setSlowThresholds at the same time.
private static final long INDEXER_INDEX_PREPARE_SLOW_THREHSOLD_DEFAULT = 3_000;
// Threshold key for preWALRestore.
private static final String INDEXER_PRE_WAL_RESTORE_SLOW_THRESHOLD_KEY = "phoenix.indexer.slow.pre.wal.restore.threshold";
private static final long INDEXER_PRE_WAL_RESTORE_SLOW_THRESHOLD_DEFAULT = 3_000;
// Threshold key for postOpen.
private static final String INDEXER_POST_OPEN_SLOW_THRESHOLD_KEY = "phoenix.indexer.slow.open.threshold";
private static final long INDEXER_POST_OPEN_SLOW_THRESHOLD_DEFAULT = 3_000;
// Threshold key for preIncrementAfterRowLock.
private static final String INDEXER_PRE_INCREMENT_SLOW_THRESHOLD_KEY = "phoenix.indexer.slow.pre.increment";
private static final long INDEXER_PRE_INCREMENT_SLOW_THRESHOLD_DEFAULT = 3_000;
/**
 * Cache of the failed updates to the various regions. Used for making the WAL recovery
 * mechanisms more robust in the face of recovering index regions that were on the same server
 * as the primary table region. It is handed to the recovery failure policy in start() and
 * drained in postOpen().
 */
private PerRegionIndexWriteCache failedIndexEdits = new PerRegionIndexWriteCache();
/**
 * IndexWriter for writing the recovered index edits. Separate from the main indexer since we need
 * different write/failure policies
 */
private IndexWriter recoveryWriter;
/** Sink for the timing and slow-call metrics reported by the hooks of this coprocessor. */
private MetricsIndexerSource metricSource;
// Set once stop() has run so that repeated stop() calls are no-ops.
private boolean stopped;
// Set in start() when a NoSuchMethodError is caught; most hooks then delegate to the superclass.
private boolean disabled;
// Slow-call thresholds, populated by setSlowThresholds().
private long slowIndexWriteThreshold;
private long slowIndexPrepareThreshold;
private long slowPreWALRestoreThreshold;
private long slowPostOpenThreshold;
private long slowPreIncrementThreshold;
// Wait passed to LockManager.lockRow(); read from "hbase.rowlock.wait.duration" in start().
private int rowLockWaitDuration;
// Public alias of the private recovery failure policy key; per its name, intended for tests.
public static final String RecoveryFailurePolicyKeyForTesting = INDEX_RECOVERY_FAILURE_POLICY_KEY;
// Versions encoded above this value are accepted by validateVersion() without further checks.
public static final int INDEXING_SUPPORTED_MAJOR_VERSION = VersionUtil
.encodeMaxPatchVersion(0, 94);
// Versions encoded below this value are rejected by validateVersion().
public static final int INDEXING_SUPPORTED__MIN_MAJOR_VERSION = VersionUtil
.encodeVersion("0.94.0");
// Below this version validateVersion() rejects configurations with WAL compression enabled.
private static final int INDEX_WAL_COMPRESSION_MINIMUM_SUPPORTED_VERSION = VersionUtil
.encodeVersion("0.94.9");
// Fallback for "hbase.rowlock.wait.duration" (presumably milliseconds - TODO confirm).
private static final int DEFAULT_ROWLOCK_WAIT_DURATION = 30000;
@Override
public void start(CoprocessorEnvironment e) throws IOException {
try {
final RegionCoprocessorEnvironment env = (RegionCoprocessorEnvironment) e;
String serverName = env.getRegionServerServices().getServerName().getServerName();
if (env.getConfiguration().getBoolean(CHECK_VERSION_CONF_KEY, true)) {
// make sure the right version <-> combinations are allowed.
String errormsg = Indexer.validateVersion(env.getHBaseVersion(), env.getConfiguration());
if (errormsg != null) {
IOException ioe = new IOException(errormsg);
env.getRegionServerServices().abort(errormsg, ioe);
throw ioe;
}
}
this.builder = new IndexBuildManager(env);
// Clone the config since it is shared
DelegateRegionCoprocessorEnvironment indexWriterEnv = new DelegateRegionCoprocessorEnvironment(env, ConnectionType.INDEX_WRITER_CONNECTION);
// setup the actual index writer
this.writer = new IndexWriter(indexWriterEnv, serverName + "-index-writer");
this.rowLockWaitDuration = env.getConfiguration().getInt("hbase.rowlock.wait.duration",
DEFAULT_ROWLOCK_WAIT_DURATION);
this.lockManager = new LockManager();
// Metrics impl for the Indexer -- avoiding unnecessary indirection for hadoop-1/2 compat
this.metricSource = MetricsIndexerSourceFactory.getInstance().create();
setSlowThresholds(e.getConfiguration());
try {
// get the specified failure policy. We only ever override it in tests, but we need to do it
// here
Class extends IndexFailurePolicy> policyClass =
env.getConfiguration().getClass(INDEX_RECOVERY_FAILURE_POLICY_KEY,
StoreFailuresInCachePolicy.class, IndexFailurePolicy.class);
IndexFailurePolicy policy =
policyClass.getConstructor(PerRegionIndexWriteCache.class).newInstance(failedIndexEdits);
LOG.debug("Setting up recovery writter with failure policy: " + policy.getClass());
recoveryWriter =
new RecoveryIndexWriter(policy, indexWriterEnv, serverName + "-recovery-writer");
} catch (Exception ex) {
throw new IOException("Could not instantiate recovery failure policy!", ex);
}
} catch (NoSuchMethodError ex) {
disabled = true;
super.start(e);
LOG.error("Must be too early a version of HBase. Disabled coprocessor ", ex);
}
}
/**
 * Extracts the slow call threshold values from the configuration, falling back to the
 * per-threshold default when a key is not set.
 *
 * The pre-batch-mutate key configures the index prepare threshold and the post-batch-mutate
 * key configures the index write threshold, matching the hooks in which each threshold is
 * checked.
 *
 * @param c configuration to read the thresholds from
 */
private void setSlowThresholds(Configuration c) {
    // prepare == work done in preBatchMutate
    slowIndexPrepareThreshold = c.getLong(INDEXER_INDEX_PREPARE_SLOW_THRESHOLD_KEY,
            INDEXER_INDEX_PREPARE_SLOW_THREHSOLD_DEFAULT);
    // write == work done in postBatchMutateIndispensably
    slowIndexWriteThreshold = c.getLong(INDEXER_INDEX_WRITE_SLOW_THRESHOLD_KEY,
            INDEXER_INDEX_WRITE_SLOW_THRESHOLD_DEFAULT);
    slowPreWALRestoreThreshold = c.getLong(INDEXER_PRE_WAL_RESTORE_SLOW_THRESHOLD_KEY,
            INDEXER_PRE_WAL_RESTORE_SLOW_THRESHOLD_DEFAULT);
    slowPostOpenThreshold = c.getLong(INDEXER_POST_OPEN_SLOW_THRESHOLD_KEY,
            INDEXER_POST_OPEN_SLOW_THRESHOLD_DEFAULT);
    slowPreIncrementThreshold = c.getLong(INDEXER_PRE_INCREMENT_SLOW_THRESHOLD_KEY,
            INDEXER_PRE_INCREMENT_SLOW_THRESHOLD_DEFAULT);
}
/**
 * Formats the log line emitted when a hook exceeds its slow-call threshold.
 *
 * @param callName name of the call that was slow
 * @param duration measured duration, printed with an "ms" suffix
 * @param threshold threshold that was reached, printed with an "ms" suffix
 * @return the formatted message
 */
private String getCallTooSlowMessage(String callName, long duration, long threshold) {
    return "(callTooSlow) " + callName
            + " duration=" + duration
            + "ms, threshold=" + threshold + "ms";
}
/**
 * Shuts the coprocessor down. A second call is a no-op; a disabled coprocessor only runs the
 * superclass stop. Otherwise the builder, the index writer and the recovery writer are
 * stopped, in that order.
 *
 * @param e environment of the hosting region
 * @throws IOException declared by the overridden method
 */
@Override
public void stop(CoprocessorEnvironment e) throws IOException {
    if (stopped) {
        // already shut down
        return;
    }
    if (disabled) {
        // nothing of ours was started, so only the superclass needs to stop
        super.stop(e);
        return;
    }

    stopped = true;
    final String reason = "Indexer is being stopped";
    builder.stop(reason);
    writer.stop(reason);
    recoveryWriter.stop(reason);
}
/**
 * We use an Increment to serialize the ON DUPLICATE KEY clause so that the HBase plumbing
 * sets up the necessary locks and mvcc to allow an atomic update. The Increment is not a
 * real increment, though, it's really more of a Put. We translate the Increment into a
 * list of mutations, at most a single Put and Delete that are the changes upon executing
 * the list of ON DUPLICATE KEY clauses for this row.
 *
 * @param e observer context; bypassed and completed when the increment is handled here
 * @param inc the increment carrying the atomic operation
 * @return null when the builder produces no mutation list for the increment (normal
 *         processing continues), otherwise {@link Result#EMPTY_RESULT}
 * @throws IOException wrapping any failure raised while processing the operation
 */
@Override
public Result preIncrementAfterRowLock(final ObserverContext<RegionCoprocessorEnvironment> e,
        final Increment inc) throws IOException {
    long start = EnvironmentEdgeManager.currentTimeMillis();
    try {
        List<Mutation> mutations = this.builder.executeAtomicOp(inc);
        if (mutations == null) {
            return null;
        }

        // Causes the Increment to be ignored as we're committing the mutations
        // ourselves below.
        e.bypass();
        e.complete();
        // ON DUPLICATE KEY IGNORE will return empty list if row already exists
        // as no action is required in that case.
        if (!mutations.isEmpty()) {
            Region region = e.getEnvironment().getRegion();
            // Otherwise, submit the mutations directly here
            region.batchMutate(mutations.toArray(new Mutation[0]), HConstants.NO_NONCE,
                    HConstants.NO_NONCE);
        }
        return Result.EMPTY_RESULT;
    } catch (Throwable t) {
        throw ServerUtil.createIOException(
                "Unable to process ON DUPLICATE IGNORE for " +
                e.getEnvironment().getRegion().getRegionInfo().getTable().getNameAsString() +
                "(" + Bytes.toStringBinary(inc.getRow()) + ")", t);
    } finally {
        long duration = EnvironmentEdgeManager.currentTimeMillis() - start;
        // compare against the pre-increment threshold, the same value that is reported below
        if (duration >= slowPreIncrementThreshold) {
            if (LOG.isDebugEnabled()) {
                LOG.debug(getCallTooSlowMessage("preIncrementAfterRowLock", duration, slowPreIncrementThreshold));
            }
            metricSource.incrementSlowDuplicateKeyCheckCalls();
        }
        metricSource.updateDuplicateKeyCheckTime(duration);
    }
}
/**
 * Hook run before a mini-batch is applied. Delegates to
 * {@link #preBatchMutateWithExceptions} and hands any failure to
 * {@code rethrowIndexingException}; the time spent is always reported to the metrics source.
 * A disabled coprocessor only runs the superclass hook.
 *
 * @param c observer context of the region
 * @param miniBatchOp the batch about to be applied
 * @throws IOException as raised by {@code rethrowIndexingException}
 */
@Override
public void preBatchMutate(ObserverContext c,
        MiniBatchOperationInProgress miniBatchOp) throws IOException {
    if (disabled) {
        super.preBatchMutate(c, miniBatchOp);
        return;
    }

    final long startedAt = EnvironmentEdgeManager.currentTimeMillis();
    try {
        preBatchMutateWithExceptions(c, miniBatchOp);
    } catch (Throwable failure) {
        rethrowIndexingException(failure);
        // only reached if the helper above returned instead of throwing
        throw new RuntimeException(
                "Somehow didn't return an index update but also didn't propagate the failure to the client!");
    } finally {
        final long elapsed = EnvironmentEdgeManager.currentTimeMillis() - startedAt;
        final boolean tooSlow = elapsed >= slowIndexPrepareThreshold;
        if (tooSlow) {
            if (LOG.isDebugEnabled()) {
                LOG.debug(getCallTooSlowMessage("preBatchMutate", elapsed, slowIndexPrepareThreshold));
            }
            metricSource.incrementNumSlowIndexPrepareCalls();
        }
        metricSource.updateIndexPrepareTime(elapsed);
    }
}
/**
 * Overwrites the timestamp of a KeyValue in place by copying {@code Bytes.SIZEOF_LONG} bytes
 * into its backing buffer at the timestamp offset.
 *
 * @param kv KeyValue whose buffer is modified
 * @param tsBytes source bytes of the new timestamp, read from index 0
 */
private static void setTimeStamp(KeyValue kv, byte[] tsBytes) {
    final int offset = kv.getTimestampOffset();
    final byte[] backing = kv.getBuffer();
    System.arraycopy(tsBytes, 0, backing, offset, Bytes.SIZEOF_LONG);
}
public void preBatchMutateWithExceptions(ObserverContext c,
MiniBatchOperationInProgress miniBatchOp) throws Throwable {
// first group all the updates for a single row into a single update to be processed
Map mutationsMap =
new HashMap();
Durability defaultDurability = Durability.SYNC_WAL;
if(c.getEnvironment().getRegion() != null) {
defaultDurability = c.getEnvironment().getRegion().getTableDesc().getDurability();
defaultDurability = (defaultDurability == Durability.USE_DEFAULT) ?
Durability.SYNC_WAL : defaultDurability;
}
/*
* Exclusively lock all rows so we get a consistent read
* while determining the index updates
*/
BatchMutateContext context = new BatchMutateContext(this.builder.getIndexMetaData(miniBatchOp).getClientVersion());
setBatchMutateContext(c, context);
Durability durability = Durability.SKIP_WAL;
boolean copyMutations = false;
for (int i = 0; i < miniBatchOp.size(); i++) {
Mutation m = miniBatchOp.getOperation(i);
if (this.builder.isAtomicOp(m)) {
miniBatchOp.setOperationStatus(i, IGNORE);
continue;
}
if (this.builder.isEnabled(m)) {
context.rowLocks.add(lockManager.lockRow(m.getRow(), rowLockWaitDuration));
Durability effectiveDurablity = (m.getDurability() == Durability.USE_DEFAULT) ?
defaultDurability : m.getDurability();
if (effectiveDurablity.ordinal() > durability.ordinal()) {
durability = effectiveDurablity;
}
// Track whether or not we need to
ImmutableBytesPtr row = new ImmutableBytesPtr(m.getRow());
if (mutationsMap.containsKey(row)) {
copyMutations = true;
} else {
mutationsMap.put(row, null);
}
}
}
// early exit if it turns out we don't have any edits
if (mutationsMap.isEmpty()) {
return;
}
// If we're copying the mutations
Collection originalMutations;
Collection extends Mutation> mutations;
if (copyMutations) {
originalMutations = null;
mutations = mutationsMap.values();
} else {
originalMutations = Lists.newArrayListWithExpectedSize(mutationsMap.size());
mutations = originalMutations;
}
Mutation firstMutation = miniBatchOp.getOperation(0);
ReplayWrite replayWrite = this.builder.getReplayWrite(firstMutation);
boolean resetTimeStamp = replayWrite == null;
long now = EnvironmentEdgeManager.currentTimeMillis();
byte[] byteNow = Bytes.toBytes(now);
for (int i = 0; i < miniBatchOp.size(); i++) {
Mutation m = miniBatchOp.getOperation(i);
// skip this mutation if we aren't enabling indexing
// unfortunately, we really should ask if the raw mutation (rather than the combined mutation)
// should be indexed, which means we need to expose another method on the builder. Such is the
// way optimization go though.
if (miniBatchOp.getOperationStatus(i) != IGNORE && this.builder.isEnabled(m)) {
if (resetTimeStamp) {
// Unless we're replaying edits to rebuild the index, we update the time stamp
// of the data table to prevent overlapping time stamps (which prevents index
// inconsistencies as this case isn't handled correctly currently).
for (List family : m.getFamilyCellMap().values()) {
List familyKVs = KeyValueUtil.ensureKeyValues(family);
for (KeyValue kv : familyKVs) {
setTimeStamp(kv, byteNow);
}
}
}
// No need to write the table mutations when we're rebuilding
// the index as they're already written and just being replayed.
if (replayWrite == ReplayWrite.INDEX_ONLY) {
miniBatchOp.setOperationStatus(i, NOWRITE);
}
// Only copy mutations if we found duplicate rows
// which only occurs when we're partially rebuilding
// the index (since we'll potentially have both a
// Put and a Delete mutation for the same row).
if (copyMutations) {
// Add the mutation to the batch set
ImmutableBytesPtr row = new ImmutableBytesPtr(m.getRow());
MultiMutation stored = mutationsMap.get(row);
// we haven't seen this row before, so add it
if (stored == null) {
stored = new MultiMutation(row);
mutationsMap.put(row, stored);
}
stored.addAll(m);
} else {
originalMutations.add(m);
}
}
}
// dump all the index updates into a single WAL. They will get combined in the end anyways, so
// don't worry which one we get
WALEdit edit = miniBatchOp.getWalEdit(0);
if (edit == null) {
edit = new WALEdit();
miniBatchOp.setWalEdit(0, edit);
}
if (copyMutations || replayWrite != null) {
mutations = IndexManagementUtil.flattenMutationsByTimestamp(mutations);
}
// get the current span, or just use a null-span to avoid a bunch of if statements
try (TraceScope scope = Trace.startSpan("Starting to build index updates")) {
Span current = scope.getSpan();
if (current == null) {
current = NullSpan.INSTANCE;
}
long start = EnvironmentEdgeManager.currentTimeMillis();
// get the index updates for all elements in this batch
Collection> indexUpdates =
this.builder.getIndexUpdate(miniBatchOp, mutations);
long duration = EnvironmentEdgeManager.currentTimeMillis() - start;
if (duration >= slowIndexPrepareThreshold) {
if (LOG.isDebugEnabled()) {
LOG.debug(getCallTooSlowMessage("indexPrepare", duration, slowIndexPrepareThreshold));
}
metricSource.incrementNumSlowIndexPrepareCalls();
}
metricSource.updateIndexPrepareTime(duration);
current.addTimelineAnnotation("Built index updates, doing preStep");
TracingUtils.addAnnotation(current, "index update count", indexUpdates.size());
byte[] tableName = c.getEnvironment().getRegion().getTableDesc().getTableName().getName();
Iterator> indexUpdatesItr = indexUpdates.iterator();
List localUpdates = new ArrayList(indexUpdates.size());
while(indexUpdatesItr.hasNext()) {
Pair next = indexUpdatesItr.next();
if (Bytes.compareTo(next.getSecond(), tableName) == 0) {
localUpdates.add(next.getFirst());
indexUpdatesItr.remove();
}
}
if (!localUpdates.isEmpty()) {
miniBatchOp.addOperationsFromCP(0,
localUpdates.toArray(new Mutation[localUpdates.size()]));
}
if (!indexUpdates.isEmpty()) {
context.indexUpdates = indexUpdates;
// write index updates to WAL
if (durability != Durability.SKIP_WAL) {
// we have all the WAL durability, so we just update the WAL entry and move on
for (Pair entry : indexUpdates) {
edit.add(new IndexedKeyValue(entry.getSecond(), entry.getFirst()));
}
}
}
}
}
/**
 * Stores the context of the current batch for the calling thread.
 *
 * @param c observer context; not used by this implementation
 * @param context context to store
 */
private void setBatchMutateContext(ObserverContext c, BatchMutateContext context) {
    batchMutateContext.set(context);
}
/**
 * Looks up the batch context stored for the calling thread.
 *
 * @param c observer context; not used by this implementation
 * @return the value currently held by the thread-local
 */
private BatchMutateContext getBatchMutateContext(ObserverContext c) {
    return batchMutateContext.get();
}
/**
 * Clears the batch context stored for the calling thread.
 *
 * @param c observer context; not used by this implementation
 */
private void removeBatchMutateContext(ObserverContext c) {
    batchMutateContext.remove();
}
/**
 * Hook run after a mini-batch has been processed. Releases the row locks recorded for the
 * batch, notifies the builder that the batch completed and, if the batch succeeded, writes the
 * pending index updates. The thread's batch context is always removed and the elapsed time is
 * always reported. Nothing happens when no batch context is stored for the calling thread; a
 * disabled coprocessor only runs the superclass hook.
 *
 * @param c observer context of the region
 * @param miniBatchOp the batch that was processed
 * @param success whether the batch was written successfully
 * @throws IOException if writing the index updates fails
 */
@Override
public void postBatchMutateIndispensably(ObserverContext c,
        MiniBatchOperationInProgress miniBatchOp, final boolean success) throws IOException {
    if (disabled) {
        super.postBatchMutateIndispensably(c, miniBatchOp, success);
        return;
    }

    final long startedAt = EnvironmentEdgeManager.currentTimeMillis();
    final BatchMutateContext batchContext = getBatchMutateContext(c);
    if (batchContext == null) {
        return;
    }

    try {
        for (RowLock held : batchContext.rowLocks) {
            held.release();
        }
        builder.batchCompleted(miniBatchOp);

        // if miniBatchOp was successfully written, write index updates
        if (success) {
            doPost(c, batchContext);
        }
    } finally {
        removeBatchMutateContext(c);

        final long elapsed = EnvironmentEdgeManager.currentTimeMillis() - startedAt;
        final boolean tooSlow = elapsed >= slowIndexWriteThreshold;
        if (tooSlow) {
            if (LOG.isDebugEnabled()) {
                LOG.debug(getCallTooSlowMessage("postBatchMutateIndispensably", elapsed, slowIndexWriteThreshold));
            }
            metricSource.incrementNumSlowIndexWriteCalls();
        }
        metricSource.updateIndexWriteTime(elapsed);
    }
}
/**
 * Writes the pending index updates of a batch, handing any failure to
 * {@code rethrowIndexingException}.
 *
 * @param c observer context of the region
 * @param context context holding the pending index updates
 * @throws IOException as raised by {@code rethrowIndexingException}
 */
private void doPost(ObserverContext c, BatchMutateContext context) throws IOException {
    try {
        doPostWithExceptions(c, context);
    } catch (Throwable failure) {
        rethrowIndexingException(failure);
        // only reached if the helper above returned instead of throwing
        throw new RuntimeException(
                "Somehow didn't complete the index update, but didn't return succesfully either!");
    }
}
/**
 * Writes the index updates stored in the context through the index writer, inside a trace
 * span, and reports the time taken to the metrics source. Returns immediately when there is
 * no context or no pending update.
 *
 * @param c observer context of the region; not used by this implementation
 * @param context context holding the pending index updates and the client version
 * @throws IOException if the index writer fails
 */
private void doPostWithExceptions(ObserverContext c, BatchMutateContext context)
        throws IOException {
    // short circuit, if we don't need to do any work
    final boolean nothingToWrite = context == null || context.indexUpdates.isEmpty();
    if (nothingToWrite) {
        return;
    }

    try (TraceScope scope = Trace.startSpan("Completing index writes")) {
        // fall back to the null-span so the code below needs no null checks
        final Span traced = scope.getSpan();
        final Span span = (traced == null) ? NullSpan.INSTANCE : traced;

        final long startedAt = EnvironmentEdgeManager.currentTimeMillis();
        span.addTimelineAnnotation("Actually doing index update for first time");
        writer.writeAndKillYourselfOnFailure(context.indexUpdates, false, context.clientVersion);
        final long elapsed = EnvironmentEdgeManager.currentTimeMillis() - startedAt;

        if (elapsed >= slowIndexWriteThreshold) {
            if (LOG.isDebugEnabled()) {
                LOG.debug(getCallTooSlowMessage("indexWrite", elapsed, slowIndexWriteThreshold));
            }
            metricSource.incrementNumSlowIndexWriteCalls();
        }
        metricSource.updateIndexWriteTime(elapsed);
    }
}
/**
 * Scans the cells of a {@link WALEdit}, in order, for an {@link IndexedKeyValue}.
 *
 * @param edit {@link WALEdit} to scan
 * @return the first cell that is an {@link IndexedKeyValue}, or null if the edit has none
 */
private IndexedKeyValue getFirstIndexedKeyValue(WALEdit edit) {
    for (Cell cell : edit.getCells()) {
        if (!(cell instanceof IndexedKeyValue)) {
            continue;
        }
        return (IndexedKeyValue) cell;
    }
    return null;
}
/**
* Extract the index updates from the WAL Edit
* @param edit to search for index updates
* @return the mutations to apply to the index tables
*/
private Collection> extractIndexUpdate(WALEdit edit) {
// Avoid multiple internal array resizings. Initial size of 64, unless we have fewer cells in the edit
int initialSize = Math.min(edit.size(), 64);
Collection> indexUpdates = new ArrayList>(initialSize);
for (Cell kv : edit.getCells()) {
if (kv instanceof IndexedKeyValue) {
IndexedKeyValue ikv = (IndexedKeyValue) kv;
indexUpdates.add(new Pair(ikv.getMutation(), ikv.getIndexTable()));
}
}
return indexUpdates;
}
/**
 * Hook run after the region has been opened. Fetches the index edits cached as failed for
 * this region and, if there are any, writes them through the index writer. An
 * {@link IOException} from that write is logged and not propagated. The time spent is always
 * reported. A disabled coprocessor only runs the superclass hook.
 *
 * @param c observer context of the opened region
 */
@Override
public void postOpen(final ObserverContext c) {
    // the cached edits are fetched before the disabled check
    final Multimap pending = failedIndexEdits.getEdits(c.getEnvironment().getRegion());

    if (disabled) {
        super.postOpen(c);
        return;
    }

    final long startedAt = EnvironmentEdgeManager.currentTimeMillis();
    try {
        // if we have no pending edits to complete, then we are done
        final boolean hasPending = pending != null && !pending.isEmpty();
        if (hasPending) {
            LOG.info("Found some outstanding index updates that didn't succeed during"
                    + " WAL replay - attempting to replay now.");
            // do the usual writer stuff, killing the server again, if we can't manage to make
            // the index writes succeed again
            try {
                writer.writeAndKillYourselfOnFailure(pending, true, ScanUtil.UNKNOWN_CLIENT_VERSION);
            } catch (IOException ioe) {
                LOG.error("During WAL replay of outstanding index updates, "
                        + "Exception is thrown instead of killing server during index writing", ioe);
            }
        }
    } finally {
        final long elapsed = EnvironmentEdgeManager.currentTimeMillis() - startedAt;
        if (elapsed >= slowPostOpenThreshold) {
            if (LOG.isDebugEnabled()) {
                LOG.debug(getCallTooSlowMessage("postOpen", elapsed, slowPostOpenThreshold));
            }
            metricSource.incrementNumSlowPostOpenCalls();
        }
        metricSource.updatePostOpenTime(elapsed);
    }
}
/**
 * Hook run for every WAL edit that is replayed. The index updates stored in the edit are
 * extracted and written through the recovery writer; the time spent is always reported.
 * A disabled coprocessor only runs the superclass hook.
 *
 * @param env observer context of the region being recovered
 * @param info region the edit belongs to
 * @param logKey key of the WAL entry
 * @param logEdit WAL edit being replayed
 * @throws IOException if the recovery writer fails
 */
@Override
public void preWALRestore(ObserverContext<RegionCoprocessorEnvironment> env, HRegionInfo info,
        HLogKey logKey, WALEdit logEdit) throws IOException {
    if (this.disabled) {
        super.preWALRestore(env, info, logKey, logEdit);
        return;
    }

    // TODO check the regions in transition. If the server on which the region lives is this one,
    // then we should retry that write later in postOpen.
    // we might be able to get even smarter here and pre-split the edits that are server-local
    // into their own recovered.edits file. This then lets us do a straightforward recovery of each
    // region (and more efficiently as we aren't writing quite as hectically from this one place).

    long start = EnvironmentEdgeManager.currentTimeMillis();
    try {
        /*
         * Basically, we let the index regions recover for a little while longer before retrying
         * in the hopes they come up before the primary table finishes.
         */
        Collection<Pair<Mutation, byte[]>> indexUpdates = extractIndexUpdate(logEdit);
        recoveryWriter.writeAndKillYourselfOnFailure(indexUpdates, true, ScanUtil.UNKNOWN_CLIENT_VERSION);
    } finally {
        long duration = EnvironmentEdgeManager.currentTimeMillis() - start;
        if (duration >= slowPreWALRestoreThreshold) {
            if (LOG.isDebugEnabled()) {
                LOG.debug(getCallTooSlowMessage("preWALRestore", duration, slowPreWALRestoreThreshold));
            }
            metricSource.incrementNumSlowPreWALRestoreCalls();
        }
        metricSource.updatePreWALRestoreTime(duration);
    }
}
/**
* Create a custom {@link InternalScanner} for a compaction that tracks the versions of rows that
* are removed so we can clean then up from the the index table(s).
*
* This is not yet implemented - its not clear if we should even mess around with the Index table
* for these rows as those points still existed. TODO: v2 of indexing
*/
@Override
public InternalScanner preCompactScannerOpen(final ObserverContext c,
final Store store, final List extends KeyValueScanner> scanners, final ScanType scanType,
final long earliestPutTs, final InternalScanner s) throws IOException {
// Compaction and split upcalls run with the effective user context of the requesting user.
// This will lead to failure of cross cluster RPC if the effective user is not
// the login user. Switch to the login user context to ensure we have the expected
// security context.
// NOTE: Not necessary here at this time but leave in place to document this critical detail.
return User.runAsLoginUser(new PrivilegedExceptionAction() {
@Override
public InternalScanner run() throws Exception {
return Indexer.super.preCompactScannerOpen(c, store, scanners, scanType, earliestPutTs, s);
}
});
}
/**
 * Test hook: exposes the index builder held by the build manager.
 *
 * @return the currently instantiated index builder
 */
public IndexBuilder getBuilderForTesting() {
    return builder.getBuilderForTesting();
}
/**
 * Checks whether indexing can run on the given HBase version with the given configuration.
 *
 * Versions above 0.94 are always accepted and versions below 0.94.0 are always rejected.
 * In between, versions older than 0.94.9 are rejected only when WAL compression is enabled
 * in the configuration.
 *
 * @param hbaseVersion current version of HBase on which this coprocessor is installed
 * @param conf configuration to check for allowed parameters (e.g. WAL Compression only if >=
 *        0.94.9)
 * @return null if the version is supported, the error message to display otherwise
 */
public static String validateVersion(String hbaseVersion, Configuration conf) {
    final int encoded = VersionUtil.encodeVersion(hbaseVersion);

    // above 0.94 everything should be supported
    if (encoded > INDEXING_SUPPORTED_MAJOR_VERSION) {
        return null;
    }
    // anything older than 0.94 is rejected outright
    if (encoded < INDEXING_SUPPORTED__MIN_MAJOR_VERSION) {
        return "Indexing not supported for versions older than 0.94.X";
    }

    // the configuration is only consulted for versions older than 0.94.9
    final boolean compressionTooEarly = encoded < INDEX_WAL_COMPRESSION_MINIMUM_SUPPORTED_VERSION
            && conf.getBoolean(HConstants.ENABLE_WAL_COMPRESSION, false);
    if (compressionTooEarly) {
        return "Indexing not supported with WAL Compression for versions of HBase older than 0.94.9 - found version:"
                + hbaseVersion;
    }
    return null;
}
/**
* Enable indexing on the given table
* @param desc {@link HTableDescriptor} for the table on which indexing should be enabled
* @param builder class to use when building the index for this table
* @param properties map of custom configuration options to make available to your
* {@link IndexBuilder} on the server-side
* @param priority TODO
* @throws IOException the Indexer coprocessor cannot be added
*/
public static void enableIndexing(HTableDescriptor desc, Class extends IndexBuilder> builder,
Map properties, int priority) throws IOException {
if (properties == null) {
properties = new HashMap();
}
properties.put(Indexer.INDEX_BUILDER_CONF_KEY, builder.getName());
desc.addCoprocessor(Indexer.class.getName(), null, priority, properties);
}
}
|