All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.phoenix.hbase.index.IndexRegionObserver Maven / Gradle / Ivy

The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.phoenix.hbase.index;


import static org.apache.hadoop.hbase.HConstants.OperationStatusCode.SUCCESS;
import static org.apache.phoenix.coprocessorclient.BaseScannerRegionObserverConstants.UPSERT_CF;
import static org.apache.phoenix.coprocessorclient.BaseScannerRegionObserverConstants.UPSERT_STATUS_CQ;
import static org.apache.phoenix.hbase.index.util.IndexManagementUtil.rethrowIndexingException;

import java.io.ByteArrayInputStream;
import java.io.DataInputStream;
import java.io.EOFException;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.TreeSet;
import java.util.concurrent.ConcurrentHashMap;

import org.apache.hadoop.hbase.client.TableDescriptor;
import org.apache.hadoop.hbase.regionserver.BloomType;
import org.apache.phoenix.execute.MutationState;
import org.apache.phoenix.expression.CaseExpression;
import org.apache.phoenix.index.PhoenixIndexBuilderHelper;
import org.apache.phoenix.schema.types.PInteger;
import org.apache.phoenix.thirdparty.com.google.common.base.Preconditions;
import org.apache.phoenix.thirdparty.com.google.common.collect.ArrayListMultimap;
import org.apache.phoenix.thirdparty.com.google.common.collect.ListMultimap;
import org.apache.phoenix.thirdparty.com.google.common.collect.Lists;
import org.apache.phoenix.thirdparty.com.google.common.collect.Maps;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.CellComparator;
import org.apache.hadoop.hbase.CellUtil;
import org.apache.hadoop.hbase.CoprocessorEnvironment;
import org.apache.hadoop.hbase.DoNotRetryIOException;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Delete;
import org.apache.hadoop.hbase.client.Durability;
import org.apache.hadoop.hbase.client.Increment;
import org.apache.hadoop.hbase.client.Mutation;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.coprocessor.ObserverContext;
import org.apache.hadoop.hbase.coprocessor.RegionCoprocessor;
import org.apache.hadoop.hbase.coprocessor.RegionCoprocessorEnvironment;
import org.apache.hadoop.hbase.coprocessor.RegionObserver;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.regionserver.MiniBatchOperationInProgress;
import org.apache.hadoop.hbase.regionserver.OperationStatus;
import org.apache.hadoop.hbase.regionserver.Region;
import org.apache.hadoop.hbase.regionserver.RegionScanner;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.Pair;
import org.apache.hadoop.hbase.wal.WALEdit;
import org.apache.hadoop.hbase.wal.WALKey;
import org.apache.hadoop.io.WritableUtils;
import org.apache.htrace.Span;
import org.apache.htrace.Trace;
import org.apache.htrace.TraceScope;
import org.apache.phoenix.compile.ScanRanges;
import org.apache.phoenix.coprocessor.DelegateRegionCoprocessorEnvironment;
import org.apache.phoenix.coprocessor.generated.PTableProtos;
import org.apache.phoenix.exception.DataExceedsCapacityException;
import org.apache.phoenix.expression.Expression;
import org.apache.phoenix.expression.ExpressionType;
import org.apache.phoenix.expression.KeyValueColumnExpression;
import org.apache.phoenix.expression.visitor.ExpressionVisitor;
import org.apache.phoenix.expression.visitor.StatelessTraverseAllExpressionVisitor;
import org.apache.phoenix.filter.SkipScanFilter;
import org.apache.phoenix.hbase.index.LockManager.RowLock;
import org.apache.phoenix.hbase.index.builder.FatalIndexBuildingFailureException;
import org.apache.phoenix.hbase.index.builder.IndexBuildManager;
import org.apache.phoenix.hbase.index.builder.IndexBuilder;
import org.apache.phoenix.hbase.index.covered.IndexMetaData;
import org.apache.phoenix.hbase.index.covered.update.ColumnReference;
import org.apache.phoenix.hbase.index.metrics.MetricsIndexerSource;
import org.apache.phoenix.hbase.index.metrics.MetricsIndexerSourceFactory;
import org.apache.phoenix.hbase.index.table.HTableInterfaceReference;
import org.apache.phoenix.hbase.index.util.GenericKeyValueBuilder;
import org.apache.phoenix.hbase.index.util.ImmutableBytesPtr;
import org.apache.phoenix.hbase.index.write.IndexWriter;
import org.apache.phoenix.hbase.index.write.LazyParallelWriterIndexCommitter;
import org.apache.phoenix.index.IndexMaintainer;
import org.apache.phoenix.index.PhoenixIndexMetaData;
import org.apache.phoenix.query.KeyRange;
import org.apache.phoenix.query.QueryConstants;
import org.apache.phoenix.query.QueryServicesOptions;
import org.apache.phoenix.schema.PColumn;
import org.apache.phoenix.schema.PRow;
import org.apache.phoenix.schema.PTable;
import org.apache.phoenix.schema.PTableImpl;
import org.apache.phoenix.schema.PTableType;
import org.apache.phoenix.schema.SortOrder;
import org.apache.phoenix.schema.tuple.MultiKeyValueTuple;
import org.apache.phoenix.schema.transform.TransformMaintainer;
import org.apache.phoenix.schema.types.PVarbinary;
import org.apache.phoenix.trace.TracingUtils;
import org.apache.phoenix.trace.util.NullSpan;
import org.apache.phoenix.util.ByteUtil;
import org.apache.phoenix.util.ClientUtil;
import org.apache.phoenix.util.EncodedColumnsUtil;
import org.apache.phoenix.util.EnvironmentEdgeManager;
import org.apache.phoenix.util.IndexUtil;
import org.apache.phoenix.util.PhoenixKeyValueUtil;
import org.apache.phoenix.util.SchemaUtil;
import org.apache.phoenix.util.ServerIndexUtil;
import org.apache.phoenix.util.ServerUtil.ConnectionType;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.Set;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.TimeUnit;

import static org.apache.phoenix.coprocessor.IndexRebuildRegionScanner.applyNew;
import static org.apache.phoenix.coprocessor.IndexRebuildRegionScanner.removeColumn;
import static org.apache.phoenix.index.PhoenixIndexBuilderHelper.ATOMIC_OP_ATTRIB;
import static org.apache.phoenix.util.ByteUtil.EMPTY_BYTE_ARRAY;

/**
 * Do all the work of managing index updates from a single coprocessor. All Puts/Delets are passed
 * to an {@link IndexBuilder} to determine the actual updates to make.
 * We don't need to implement {@link #postPut(ObserverContext, Put, WALEdit, Durability)} and
 * {@link #postDelete(ObserverContext, Delete, WALEdit, Durability)} hooks because
 * Phoenix always does batch mutations.
 * 

*/ public class IndexRegionObserver implements RegionCoprocessor, RegionObserver { private static final Logger LOG = LoggerFactory.getLogger(IndexRegionObserver.class); private static final OperationStatus IGNORE = new OperationStatus(SUCCESS); private static final OperationStatus NOWRITE = new OperationStatus(SUCCESS); public static final String PHOENIX_APPEND_METADATA_TO_WAL = "phoenix.append.metadata.to.wal"; public static final boolean DEFAULT_PHOENIX_APPEND_METADATA_TO_WAL = false; /** * Class to represent pending data table rows * */ private class PendingRow { private int count; private boolean usable; private ImmutableBytesPtr rowKey; private BatchMutateContext lastContext; PendingRow(ImmutableBytesPtr rowKey, BatchMutateContext context) { count = 1; usable = true; lastContext = context; this.rowKey = rowKey; } public boolean add(BatchMutateContext context) { synchronized (this) { if (usable) { count++; lastContext = context; return true; } } return false; } public void remove() { synchronized (this) { count--; if (count == 0) { pendingRows.remove(rowKey); usable = false; } } } public int getCount() { return count; } public BatchMutateContext getLastContext() { return lastContext; } } private static boolean ignoreIndexRebuildForTesting = false; private static boolean failPreIndexUpdatesForTesting = false; private static boolean failPostIndexUpdatesForTesting = false; private static boolean failDataTableUpdatesForTesting = false; public static void setIgnoreIndexRebuildForTesting(boolean ignore) { ignoreIndexRebuildForTesting = ignore; } public static void setFailPreIndexUpdatesForTesting(boolean fail) { failPreIndexUpdatesForTesting = fail; } public static void setFailPostIndexUpdatesForTesting(boolean fail) { failPostIndexUpdatesForTesting = fail; } public static void setFailDataTableUpdatesForTesting(boolean fail) { failDataTableUpdatesForTesting = fail; } public enum BatchMutatePhase { PRE, POST, FAILED } // Hack to get around not being able to save any state between // coprocessor calls. TODO: remove after HBASE-18127 when available /** * The concurrent batch of mutations is a set such that every pair of batches in this set has at least one common row. * Since a BatchMutateContext object of a batch is modified only after the row locks for all the rows that are mutated * by this batch are acquired, there can be only one thread can acquire the locks for its batch and safely access * all the batch contexts in the set of concurrent batches. Because of this, we do not read atomic variables or * additional locks to serialize the access to the BatchMutateContext objects. */ public static class BatchMutateContext { private volatile BatchMutatePhase currentPhase = BatchMutatePhase.PRE; // The max of reference counts on the pending rows of this batch at the time this batch arrives private int maxPendingRowCount = 0; private final int clientVersion; // The collection of index mutations that will be applied before the data table mutations. The empty column (i.e., // the verified column) will have the value false ("unverified") on these mutations private ListMultimap preIndexUpdates; // The collection of index mutations that will be applied after the data table mutations. The empty column (i.e., // the verified column) will have the value true ("verified") on the put mutations private ListMultimap postIndexUpdates; // The collection of candidate index mutations that will be applied after the data table mutations private ListMultimap> indexUpdates; private List rowLocks = Lists.newArrayListWithExpectedSize(QueryServicesOptions.DEFAULT_MUTATE_BATCH_SIZE); // TreeSet to improve locking efficiency and avoid deadlock (PHOENIX-6871 and HBASE-17924) private Set rowsToLock = new TreeSet<>(); // The current and next states of the data rows corresponding to the pending mutations private HashMap> dataRowStates; // The previous concurrent batch contexts private HashMap lastConcurrentBatchContext = null; // The latches of the threads waiting for this batch to complete private List waitList = null; private Map multiMutationMap; //list containing the original mutations from the MiniBatchOperationInProgress. Contains // any annotations we were sent by the client, and can be used in hooks that don't get // passed MiniBatchOperationInProgress, like preWALAppend() private List originalMutations; private boolean hasAtomic; private boolean hasDelete; private boolean hasUncoveredIndex; private boolean hasGlobalIndex; private boolean hasLocalIndex; private boolean hasTransform; public BatchMutateContext() { this.clientVersion = 0; } public BatchMutateContext(int clientVersion) { this.clientVersion = clientVersion; } public void populateOriginalMutations(MiniBatchOperationInProgress miniBatchOp) { originalMutations = new ArrayList(miniBatchOp.size()); for (int k = 0; k < miniBatchOp.size(); k++) { originalMutations.add(miniBatchOp.getOperation(k)); } } public List getOriginalMutations() { return originalMutations; } public BatchMutatePhase getCurrentPhase() { return currentPhase; } public Put getNextDataRowState(ImmutableBytesPtr rowKeyPtr) { Pair rowState = dataRowStates.get(rowKeyPtr); if (rowState != null) { return rowState.getSecond(); } return null; } public CountDownLatch getCountDownLatch() { synchronized (this) { if (currentPhase != BatchMutatePhase.PRE) { return null; } if (waitList == null) { waitList = new ArrayList<>(); } CountDownLatch countDownLatch = new CountDownLatch(1); waitList.add(countDownLatch); return countDownLatch; } } public void countDownAllLatches() { synchronized (this) { if (waitList != null) { for (CountDownLatch countDownLatch : waitList) { countDownLatch.countDown(); } } } } public int getMaxPendingRowCount() { return maxPendingRowCount; } } private ThreadLocal batchMutateContext = new ThreadLocal(); /** * Configuration key for if the indexer should check the version of HBase is running. Generally, * you only want to ignore this for testing or for custom versions of HBase. */ public static final String CHECK_VERSION_CONF_KEY = "com.saleforce.hbase.index.checkversion"; public static final String INDEX_LAZY_POST_BATCH_WRITE = "org.apache.hadoop.hbase.index.lazy.post_batch.write"; private static final boolean INDEX_LAZY_POST_BATCH_WRITE_DEFAULT = false; private static final String INDEXER_INDEX_WRITE_SLOW_THRESHOLD_KEY = "phoenix.indexer.slow.post.batch.mutate.threshold"; private static final long INDEXER_INDEX_WRITE_SLOW_THRESHOLD_DEFAULT = 3_000; private static final String INDEXER_PRE_INCREMENT_SLOW_THRESHOLD_KEY = "phoenix.indexer.slow.pre.increment"; private static final long INDEXER_PRE_INCREMENT_SLOW_THRESHOLD_DEFAULT = 3_000; // Index writers get invoked before and after data table updates protected IndexWriter preWriter; protected IndexWriter postWriter; protected IndexBuildManager builder; private LockManager lockManager; // The collection of pending data table rows private Map pendingRows = new ConcurrentHashMap<>(); private MetricsIndexerSource metricSource; private boolean stopped; private boolean disabled; private long slowIndexPrepareThreshold; private long slowPreIncrementThreshold; private int rowLockWaitDuration; private int concurrentMutationWaitDuration; private String dataTableName; private boolean shouldWALAppend = DEFAULT_PHOENIX_APPEND_METADATA_TO_WAL; private boolean isNamespaceEnabled = false; private boolean useBloomFilter = false; private long lastTimestamp = 0; private List> batchesWithLastTimestamp = new ArrayList<>(); private static final int DEFAULT_ROWLOCK_WAIT_DURATION = 30000; private static final int DEFAULT_CONCURRENT_MUTATION_WAIT_DURATION_IN_MS = 100; @Override public Optional getRegionObserver() { return Optional.of(this); } @Override public void start(CoprocessorEnvironment e) throws IOException { try { final RegionCoprocessorEnvironment env = (RegionCoprocessorEnvironment) e; String serverName = env.getServerName().getServerName(); if (env.getConfiguration().getBoolean(CHECK_VERSION_CONF_KEY, true)) { // make sure the right version <-> combinations are allowed. String errormsg = Indexer.validateVersion(env.getHBaseVersion(), env.getConfiguration()); if (errormsg != null) { throw new FatalIndexBuildingFailureException(errormsg); } } this.builder = new IndexBuildManager(env); // Clone the config since it is shared DelegateRegionCoprocessorEnvironment indexWriterEnv = new DelegateRegionCoprocessorEnvironment(env, ConnectionType.INDEX_WRITER_CONNECTION); // setup the actual index preWriter this.preWriter = new IndexWriter(indexWriterEnv, serverName + "-index-preWriter", false); if (env.getConfiguration().getBoolean(INDEX_LAZY_POST_BATCH_WRITE, INDEX_LAZY_POST_BATCH_WRITE_DEFAULT)) { this.postWriter = new IndexWriter(indexWriterEnv, new LazyParallelWriterIndexCommitter(), serverName + "-index-postWriter", false); } else { this.postWriter = this.preWriter; } this.rowLockWaitDuration = env.getConfiguration().getInt("hbase.rowlock.wait.duration", DEFAULT_ROWLOCK_WAIT_DURATION); this.lockManager = new LockManager(); this.concurrentMutationWaitDuration = env.getConfiguration().getInt("phoenix.index.concurrent.wait.duration.ms", DEFAULT_CONCURRENT_MUTATION_WAIT_DURATION_IN_MS); // Metrics impl for the Indexer -- avoiding unnecessary indirection for hadoop-1/2 compat this.metricSource = MetricsIndexerSourceFactory.getInstance().getIndexerSource(); setSlowThresholds(e.getConfiguration()); this.dataTableName = env.getRegionInfo().getTable().getNameAsString(); this.shouldWALAppend = env.getConfiguration().getBoolean(PHOENIX_APPEND_METADATA_TO_WAL, DEFAULT_PHOENIX_APPEND_METADATA_TO_WAL); this.isNamespaceEnabled = SchemaUtil.isNamespaceMappingEnabled(PTableType.INDEX, env.getConfiguration()); TableDescriptor tableDescriptor = env.getRegion().getTableDescriptor(); BloomType bloomFilterType = tableDescriptor.getColumnFamilies()[0].getBloomFilterType(); // when the table descriptor changes, the coproc is reloaded this.useBloomFilter = bloomFilterType == BloomType.ROW; } catch (NoSuchMethodError ex) { disabled = true; LOG.error("Must be too early a version of HBase. Disabled coprocessor ", ex); } } /** * Extracts the slow call threshold values from the configuration. */ private void setSlowThresholds(Configuration c) { slowIndexPrepareThreshold = c.getLong(INDEXER_INDEX_WRITE_SLOW_THRESHOLD_KEY, INDEXER_INDEX_WRITE_SLOW_THRESHOLD_DEFAULT); slowPreIncrementThreshold = c.getLong(INDEXER_PRE_INCREMENT_SLOW_THRESHOLD_KEY, INDEXER_PRE_INCREMENT_SLOW_THRESHOLD_DEFAULT); } private String getCallTooSlowMessage(String callName, long duration, long threshold) { StringBuilder sb = new StringBuilder(64); sb.append("(callTooSlow) ").append(callName).append(" duration=").append(duration); sb.append("ms, threshold=").append(threshold).append("ms"); return sb.toString(); } @Override public void stop(CoprocessorEnvironment e) throws IOException { if (this.stopped) { return; } if (this.disabled) { return; } this.stopped = true; String msg = "Indexer is being stopped"; this.builder.stop(msg); this.preWriter.stop(msg); this.postWriter.stop(msg); } /** * We use an Increment to serialize the ON DUPLICATE KEY clause so that the HBase plumbing * sets up the necessary locks and mvcc to allow an atomic update. The Increment is not a * real increment, though, it's really more of a Put. We translate the Increment into a * list of mutations, at most a single Put and Delete that are the changes upon executing * the list of ON DUPLICATE KEY clauses for this row. */ @Override public Result preIncrementAfterRowLock(final ObserverContext e, final Increment inc) throws IOException { long start = EnvironmentEdgeManager.currentTimeMillis(); try { List mutations = this.builder.executeAtomicOp(inc); if (mutations == null) { return null; } // Causes the Increment to be ignored as we're committing the mutations // ourselves below. e.bypass(); // ON DUPLICATE KEY IGNORE will return empty list if row already exists // as no action is required in that case. if (!mutations.isEmpty()) { Region region = e.getEnvironment().getRegion(); // Otherwise, submit the mutations directly here region.batchMutate(mutations.toArray(new Mutation[0])); } return Result.EMPTY_RESULT; } catch (Throwable t) { throw ClientUtil.createIOException( "Unable to process ON DUPLICATE IGNORE for " + e.getEnvironment().getRegion().getRegionInfo().getTable().getNameAsString() + "(" + Bytes.toStringBinary(inc.getRow()) + ")", t); } finally { long duration = EnvironmentEdgeManager.currentTimeMillis() - start; if (duration >= slowIndexPrepareThreshold) { if (LOG.isDebugEnabled()) { LOG.debug(getCallTooSlowMessage("preIncrementAfterRowLock", duration, slowPreIncrementThreshold)); } metricSource.incrementSlowDuplicateKeyCheckCalls(dataTableName); } metricSource.updateDuplicateKeyCheckTime(dataTableName, duration); } } @Override public void preBatchMutate(ObserverContext c, MiniBatchOperationInProgress miniBatchOp) throws IOException { if (this.disabled) { return; } try { preBatchMutateWithExceptions(c, miniBatchOp); return; } catch (Throwable t) { rethrowIndexingException(t); } throw new RuntimeException( "Somehow didn't return an index update but also didn't propagate the failure to the client!"); } private void ignoreAtomicOperations (MiniBatchOperationInProgress miniBatchOp) { for (int i = 0; i < miniBatchOp.size(); i++) { Mutation m = miniBatchOp.getOperation(i); if (this.builder.isAtomicOp(m)) { miniBatchOp.setOperationStatus(i, IGNORE); } } } private void populateRowsToLock(MiniBatchOperationInProgress miniBatchOp, BatchMutateContext context) { for (int i = 0; i < miniBatchOp.size(); i++) { Mutation m = miniBatchOp.getOperation(i); if (this.builder.isAtomicOp(m) || this.builder.isEnabled(m)) { ImmutableBytesPtr row = new ImmutableBytesPtr(m.getRow()); context.rowsToLock.add(row); } } } /** * Add the mutations generated by the ON DUPLICATE KEY UPDATE to the current batch. * MiniBatchOperationInProgress#addOperationsFromCP() allows coprocessors to attach additional mutations * to the incoming mutation. These additional mutations are only executed if the status of the original * mutation is set to NOT_RUN. For atomic mutations, we want HBase to ignore the incoming mutation and * instead execute the mutations generated by the server for that atomic mutation. But we can’t achieve * this behavior just by setting the status of the original mutation to IGNORE because that will also * ignore the additional mutations added by the coprocessors. To get around this, we need to do a fixup * of the original mutation in the batch. Since we always generate one Put mutation from the incoming atomic * Put mutation, we can transfer the cells from the generated Put mutation to the original atomic Put mutation in the batch. * The additional mutations (Delete) can then be added to the operationsFromCoprocessors array. */ private void addOnDupMutationsToBatch(MiniBatchOperationInProgress miniBatchOp, int index, List mutations) { List deleteMutations = Lists.newArrayListWithExpectedSize(mutations.size()); for (Mutation m : mutations) { if (m instanceof Put) { // fix the incoming atomic mutation Mutation original = miniBatchOp.getOperation(index); original.getFamilyCellMap().putAll(m.getFamilyCellMap()); } else if (m instanceof Delete) { deleteMutations.add((Delete)m); } } if (!deleteMutations.isEmpty()) { miniBatchOp.addOperationsFromCP(index, deleteMutations.toArray(new Mutation[deleteMutations.size()])); } } private void addOnDupMutationsToBatch(MiniBatchOperationInProgress miniBatchOp, BatchMutateContext context) throws IOException { for (int i = 0; i < miniBatchOp.size(); i++) { Mutation m = miniBatchOp.getOperation(i); if (this.builder.isAtomicOp(m) && m instanceof Put) { List mutations = generateOnDupMutations(context, (Put)m); if (!mutations.isEmpty()) { addOnDupMutationsToBatch(miniBatchOp, i, mutations); } else { // empty list of generated mutations implies // 1) ON DUPLICATE KEY IGNORE if row already exists, OR // 2) ON DUPLICATE KEY UPDATE if CASE expression is specified and in each of // them the new value is the same as the old value in the ELSE-clause (empty // cell timestamp will NOT be updated) byte[] retVal = PInteger.INSTANCE.toBytes(0); Cell cell = PhoenixKeyValueUtil.newKeyValue(m.getRow(), Bytes.toBytes(UPSERT_CF), Bytes.toBytes(UPSERT_STATUS_CQ), 0, retVal, 0, retVal.length); // put Result in OperationStatus for returning update status from conditional // upserts, where 0 represents the row is not updated Result result = Result.create(new ArrayList<>(Arrays.asList(cell))); miniBatchOp.setOperationStatus(i, new OperationStatus(SUCCESS, result)); } } } } private void lockRows(BatchMutateContext context) throws IOException { for (ImmutableBytesPtr rowKey : context.rowsToLock) { context.rowLocks.add(lockManager.lockRow(rowKey, rowLockWaitDuration)); } } private void unlockRows(BatchMutateContext context) throws IOException { for (RowLock rowLock : context.rowLocks) { rowLock.release(); } context.rowLocks.clear(); } private Collection groupMutations(MiniBatchOperationInProgress miniBatchOp, BatchMutateContext context) throws IOException { context.multiMutationMap = new HashMap<>(); for (int i = 0; i < miniBatchOp.size(); i++) { Mutation m = miniBatchOp.getOperation(i); // skip this mutation if we aren't enabling indexing // unfortunately, we really should ask if the raw mutation (rather than the combined mutation) // should be indexed, which means we need to expose another method on the builder. Such is the // way optimization go though. if (!isAtomicOperationComplete(miniBatchOp.getOperationStatus(i)) && this.builder.isEnabled(m)) { ImmutableBytesPtr row = new ImmutableBytesPtr(m.getRow()); MultiMutation stored = context.multiMutationMap.get(row); if (stored == null) { // we haven't seen this row before, so add it stored = new MultiMutation(row); context.multiMutationMap.put(row, stored); } stored.addAll(m); Mutation[] mutationsAddedByCP = miniBatchOp.getOperationsFromCoprocessors(i); if (mutationsAddedByCP != null) { for (Mutation addedMutation : mutationsAddedByCP) { stored.addAll(addedMutation); } } } } return context.multiMutationMap.values(); } public static void setTimestamps(MiniBatchOperationInProgress miniBatchOp, IndexBuildManager builder, long ts) throws IOException { for (Integer i = 0; i < miniBatchOp.size(); i++) { if (isAtomicOperationComplete(miniBatchOp.getOperationStatus(i))) { continue; } Mutation m = miniBatchOp.getOperation(i); // skip this mutation if we aren't enabling indexing or not an atomic op // or if it is an atomic op and its timestamp is already set(not LATEST) if (!builder.isEnabled(m) && !(builder.isAtomicOp(m) && IndexUtil.getMaxTimestamp(m) == HConstants.LATEST_TIMESTAMP)) { continue; } setTimestampOnMutation(m, ts); // set the timestamps on any additional mutations added Mutation[] mutationsAddedByCP = miniBatchOp.getOperationsFromCoprocessors(i); if (mutationsAddedByCP != null) { for (Mutation addedMutation : mutationsAddedByCP) { setTimestampOnMutation(addedMutation, ts); } } } } private static void setTimestampOnMutation(Mutation m, long ts) throws IOException { for (List cells : m.getFamilyCellMap().values()) { for (Cell cell : cells) { CellUtil.setTimestamp(cell, ts); } } } /** * This method applies pending delete mutations on the next row states */ private void applyPendingDeleteMutations(MiniBatchOperationInProgress miniBatchOp, BatchMutateContext context) throws IOException { for (int i = 0; i < miniBatchOp.size(); i++) { if (miniBatchOp.getOperationStatus(i) == IGNORE) { continue; } Mutation m = miniBatchOp.getOperation(i); if (!this.builder.isEnabled(m)) { continue; } if (!(m instanceof Delete)) { continue; } if (!applyOnePendingDeleteMutation(context, (Delete) m)) { miniBatchOp.setOperationStatus(i, NOWRITE); } } } /** * This method returns true if the pending delete mutation needs to be applied * and false f the delete mutation can be ignored for example in the case of * delete on non-existing row. */ private boolean applyOnePendingDeleteMutation(BatchMutateContext context, Delete delete) { ImmutableBytesPtr rowKeyPtr = new ImmutableBytesPtr(delete.getRow()); Pair dataRowState = context.dataRowStates.get(rowKeyPtr); if (dataRowState == null) { dataRowState = new Pair(null, null); context.dataRowStates.put(rowKeyPtr, dataRowState); } Put nextDataRowState = dataRowState.getSecond(); if (nextDataRowState == null) { if (dataRowState.getFirst() == null) { // This is a delete row mutation on a non-existing row. There is no need to apply this mutation // on the data table return false; } } for (List cells : delete.getFamilyCellMap().values()) { for (Cell cell : cells) { switch (cell.getType()) { case DeleteFamily: case DeleteFamilyVersion: nextDataRowState.getFamilyCellMap().remove(CellUtil.cloneFamily(cell)); break; case DeleteColumn: case Delete: removeColumn(nextDataRowState, cell); } } } if (nextDataRowState != null && nextDataRowState.getFamilyCellMap().size() == 0) { dataRowState.setSecond(null); } return true; } /** * This method applies the pending put mutations on the the next row states. * Before this method is called, the next row states is set to current row states. */ private void applyPendingPutMutations(MiniBatchOperationInProgress miniBatchOp, BatchMutateContext context, long now) throws IOException { for (Integer i = 0; i < miniBatchOp.size(); i++) { if (isAtomicOperationComplete(miniBatchOp.getOperationStatus(i))) { continue; } Mutation m = miniBatchOp.getOperation(i); // skip this mutation if we aren't enabling indexing if (!this.builder.isEnabled(m)) { continue; } if (!(m instanceof Put)) { continue; } ImmutableBytesPtr rowKeyPtr = new ImmutableBytesPtr(m.getRow()); Pair dataRowState = context.dataRowStates.get(rowKeyPtr); if (dataRowState == null) { dataRowState = new Pair(null, null); context.dataRowStates.put(rowKeyPtr, dataRowState); } Put nextDataRowState = dataRowState.getSecond(); dataRowState.setSecond((nextDataRowState != null) ? applyNew((Put) m, nextDataRowState) : new Put((Put) m)); Mutation[] mutationsAddedByCP = miniBatchOp.getOperationsFromCoprocessors(i); if (mutationsAddedByCP != null) { // all added mutations are of type delete corresponding to set nulls for (Mutation addedMutation : mutationsAddedByCP) { applyOnePendingDeleteMutation(context, (Delete)addedMutation); } } } } /** * * Prepares next data row state */ private void prepareDataRowStates(ObserverContext c, MiniBatchOperationInProgress miniBatchOp, BatchMutateContext context, long now) throws IOException { if (context.rowsToLock.size() == 0) { return; } applyPendingPutMutations(miniBatchOp, context, now); applyPendingDeleteMutations(miniBatchOp, context); } /** * The index update generation for local indexes uses the existing index update generation code (i.e., * the {@link IndexBuilder} implementation). */ private void handleLocalIndexUpdates(TableName table, MiniBatchOperationInProgress miniBatchOp, Collection pendingMutations, PhoenixIndexMetaData indexMetaData) throws Throwable { ListMultimap> indexUpdates = ArrayListMultimap.>create(); this.builder.getIndexUpdates(indexUpdates, miniBatchOp, pendingMutations, indexMetaData); byte[] tableName = table.getName(); HTableInterfaceReference hTableInterfaceReference = new HTableInterfaceReference(new ImmutableBytesPtr(tableName)); List> localIndexUpdates = indexUpdates.removeAll(hTableInterfaceReference); if (localIndexUpdates == null || localIndexUpdates.isEmpty()) { return; } List localUpdates = new ArrayList(); Iterator> indexUpdatesItr = localIndexUpdates.iterator(); while (indexUpdatesItr.hasNext()) { Pair next = indexUpdatesItr.next(); localUpdates.add(next.getFirst()); } if (!localUpdates.isEmpty()) { Mutation[] mutationsAddedByCP = miniBatchOp.getOperationsFromCoprocessors(0); if (mutationsAddedByCP != null) { localUpdates.addAll(Arrays.asList(mutationsAddedByCP)); } miniBatchOp.addOperationsFromCP(0, localUpdates.toArray(new Mutation[localUpdates.size()])); } } /** * Determines if any of the data table mutations in the given batch does not include all * the indexed columns or the where clause columns for partial uncovered indexes. */ private boolean isPartialUncoveredIndexMutation(PhoenixIndexMetaData indexMetaData, MiniBatchOperationInProgress miniBatchOp) { int indexedColumnCount = 0; for (IndexMaintainer indexMaintainer : indexMetaData.getIndexMaintainers()) { indexedColumnCount += indexMaintainer.getIndexedColumns().size(); if (indexMaintainer.getIndexWhereColumns() != null) { indexedColumnCount += indexMaintainer.getIndexWhereColumns().size(); } } Set columns = new HashSet(indexedColumnCount); for (IndexMaintainer indexMaintainer : indexMetaData.getIndexMaintainers()) { columns.addAll(indexMaintainer.getIndexedColumns()); if (indexMaintainer.getIndexWhereColumns() != null) { columns.addAll(indexMaintainer.getIndexWhereColumns()); } } for (int i = 0; i < miniBatchOp.size(); i++) { if (isAtomicOperationComplete(miniBatchOp.getOperationStatus(i))) { continue; } Mutation m = miniBatchOp.getOperation(i); if (!this.builder.isEnabled(m)) { continue; } for (ColumnReference column : columns) { if (m.get(column.getFamily(), column.getQualifier()).isEmpty()) { // The returned list is empty, which means the indexed column is not // included. This mutation would result in partial index update (and thus // index column values should be retrieved from the existing data table row) return true; } } } return false; } /** * Retrieve the data row state either from memory or disk. The rows are locked by the caller. */ private void getCurrentRowStates(ObserverContext c, BatchMutateContext context) throws IOException { Set keys = new HashSet(context.rowsToLock.size()); for (ImmutableBytesPtr rowKeyPtr : context.rowsToLock) { PendingRow pendingRow = new PendingRow(rowKeyPtr, context); // Add the data table rows in the mini batch to the per region collection of pending // rows. This will be used to detect concurrent updates PendingRow existingPendingRow = pendingRows.putIfAbsent(rowKeyPtr, pendingRow); if (existingPendingRow == null) { // There was no pending row for this row key. We need to retrieve this row from disk keys.add(PVarbinary.INSTANCE.getKeyRange(rowKeyPtr.get(), SortOrder.ASC)); } else { // There is a pending row for this row key. We need to retrieve the row from memory BatchMutateContext lastContext = existingPendingRow.getLastContext(); if (existingPendingRow.add(context)) { BatchMutatePhase phase = lastContext.getCurrentPhase(); Preconditions.checkArgument(phase != BatchMutatePhase.POST, "the phase of the last batch cannot be POST"); if (phase == BatchMutatePhase.PRE) { if (context.lastConcurrentBatchContext == null) { context.lastConcurrentBatchContext = new HashMap<>(); } context.lastConcurrentBatchContext.put(rowKeyPtr, lastContext); if (context.maxPendingRowCount < existingPendingRow.getCount()) { context.maxPendingRowCount = existingPendingRow.getCount(); } Put put = lastContext.getNextDataRowState(rowKeyPtr); if (put != null) { context.dataRowStates.put(rowKeyPtr, new Pair<>(put, new Put(put))); } } else { // The last batch for this row key failed. We cannot use the memory state. // So we need to retrieve this row from disk keys.add(PVarbinary.INSTANCE.getKeyRange(rowKeyPtr.get(), SortOrder.ASC)); } } else { // The existing pending row is removed from the map. That means there is no // pending row for this row key anymore. We need to add the new one to the map pendingRows.put(rowKeyPtr, pendingRow); keys.add(PVarbinary.INSTANCE.getKeyRange(rowKeyPtr.get(), SortOrder.ASC)); } } } if (keys.isEmpty()) { return; } if (this.useBloomFilter) { for (KeyRange key : keys) { // Scan.java usage alters scan instances, safer to create scan instance per usage Scan scan = new Scan(); // create a scan with same start/stop row key scan#isGetScan() // for bloom filters scan should be a get scan.withStartRow(key.getLowerRange(), true); scan.withStopRow(key.getLowerRange(), true); readDataTableRows(c, context, scan); } } else { Scan scan = new Scan(); ScanRanges scanRanges = ScanRanges.createPointLookup(new ArrayList(keys)); scanRanges.initializeScan(scan); SkipScanFilter skipScanFilter = scanRanges.getSkipScanFilter(); scan.setFilter(skipScanFilter); readDataTableRows(c, context, scan); } } private void readDataTableRows(ObserverContext c, BatchMutateContext context, Scan scan) throws IOException { try (RegionScanner scanner = c.getEnvironment().getRegion().getScanner(scan)) { boolean more = true; while (more) { List cells = new ArrayList(); more = scanner.next(cells); if (cells.isEmpty()) { continue; } byte[] rowKey = CellUtil.cloneRow(cells.get(0)); Put put = new Put(rowKey); for (Cell cell : cells) { put.add(cell); } context.dataRowStates.put(new ImmutableBytesPtr(rowKey), new Pair(put, new Put(put))); } } } /** * Generate the index update for a data row from the mutation that are obtained by merging the previous data row * state with the pending row mutation. */ private void prepareIndexMutations(BatchMutateContext context, List maintainers, long ts) throws IOException { List> indexTables = new ArrayList<>(maintainers.size()); for (IndexMaintainer indexMaintainer : maintainers) { if (indexMaintainer.isLocalIndex()) { continue; } HTableInterfaceReference hTableInterfaceReference = new HTableInterfaceReference(new ImmutableBytesPtr(indexMaintainer.getIndexTableName())); indexTables.add(new Pair<>(indexMaintainer, hTableInterfaceReference)); } for (Map.Entry> entry : context.dataRowStates.entrySet()) { ImmutableBytesPtr rowKeyPtr = entry.getKey(); Pair dataRowState = entry.getValue(); Put currentDataRowState = dataRowState.getFirst(); Put nextDataRowState = dataRowState.getSecond(); if (currentDataRowState == null && nextDataRowState == null) { continue; } for (Pair pair : indexTables) { IndexMaintainer indexMaintainer = pair.getFirst(); HTableInterfaceReference hTableInterfaceReference = pair.getSecond(); if (nextDataRowState != null && indexMaintainer.shouldPrepareIndexMutations(nextDataRowState)) { ValueGetter nextDataRowVG = new IndexUtil.SimpleValueGetter(nextDataRowState); Put indexPut = indexMaintainer.buildUpdateMutation(GenericKeyValueBuilder.INSTANCE, nextDataRowVG, rowKeyPtr, ts, null, null, false); if (indexPut == null) { // No covered column. Just prepare an index row with the empty column byte[] indexRowKey = indexMaintainer.buildRowKey(nextDataRowVG, rowKeyPtr, null, null, ts); indexPut = new Put(indexRowKey); } else { IndexUtil.removeEmptyColumn(indexPut, indexMaintainer.getEmptyKeyValueFamily().copyBytesIfNecessary(), indexMaintainer.getEmptyKeyValueQualifier()); } indexPut.addColumn( indexMaintainer.getEmptyKeyValueFamily().copyBytesIfNecessary(), indexMaintainer.getEmptyKeyValueQualifier(), ts, QueryConstants.UNVERIFIED_BYTES); context.indexUpdates.put(hTableInterfaceReference, new Pair(indexPut, rowKeyPtr.get())); // Delete the current index row if the new index key is different than the current one if (currentDataRowState != null) { ValueGetter currentDataRowVG = new IndexUtil.SimpleValueGetter(currentDataRowState); byte[] indexRowKeyForCurrentDataRow = indexMaintainer.buildRowKey(currentDataRowVG, rowKeyPtr, null, null, ts); if (Bytes.compareTo(indexPut.getRow(), indexRowKeyForCurrentDataRow) != 0) { Mutation del = indexMaintainer.buildRowDeleteMutation(indexRowKeyForCurrentDataRow, IndexMaintainer.DeleteType.ALL_VERSIONS, ts); context.indexUpdates.put(hTableInterfaceReference, new Pair(del, rowKeyPtr.get())); } } } else if (currentDataRowState != null && indexMaintainer.shouldPrepareIndexMutations(currentDataRowState)) { ValueGetter currentDataRowVG = new IndexUtil.SimpleValueGetter(currentDataRowState); byte[] indexRowKeyForCurrentDataRow = indexMaintainer.buildRowKey(currentDataRowVG, rowKeyPtr, null, null, ts); Mutation del = indexMaintainer.buildRowDeleteMutation(indexRowKeyForCurrentDataRow, IndexMaintainer.DeleteType.ALL_VERSIONS, ts); context.indexUpdates.put(hTableInterfaceReference, new Pair(del, rowKeyPtr.get())); } } } } /** * This method prepares unverified index mutations which are applied to index tables before the data table is * updated. In the three-phase update approach, in phase 1, the status of existing index rows is set to "unverified" * (these rows will be deleted from the index table in phase 3), and/or new put mutations are added with the * unverified status. In phase 2, data table mutations are applied. In phase 3, the status for an index table row is * either set to "verified" or the row is deleted. */ private void preparePreIndexMutations(BatchMutateContext context, long batchTimestamp, PhoenixIndexMetaData indexMetaData) throws Throwable { List maintainers = indexMetaData.getIndexMaintainers(); // get the current span, or just use a null-span to avoid a bunch of if statements try (TraceScope scope = Trace.startSpan("Starting to build index updates")) { Span current = scope.getSpan(); if (current == null) { current = NullSpan.INSTANCE; } current.addTimelineAnnotation("Built index updates, doing preStep"); // The rest of this method is for handling global index updates context.indexUpdates = ArrayListMultimap.>create(); prepareIndexMutations(context, maintainers, batchTimestamp); context.preIndexUpdates = ArrayListMultimap.create(); int updateCount = 0; for (IndexMaintainer indexMaintainer : maintainers) { updateCount++; byte[] emptyCF = indexMaintainer.getEmptyKeyValueFamily().copyBytesIfNecessary(); byte[] emptyCQ = indexMaintainer.getEmptyKeyValueQualifier(); HTableInterfaceReference hTableInterfaceReference = new HTableInterfaceReference(new ImmutableBytesPtr(indexMaintainer.getIndexTableName())); List > updates = context.indexUpdates.get(hTableInterfaceReference); for (Pair update : updates) { Mutation m = update.getFirst(); if (m instanceof Put) { // This will be done before the data table row is updated (i.e., in the first write phase) context.preIndexUpdates.put(hTableInterfaceReference, m); } else { // Set the status of the index row to "unverified" Put unverifiedPut = new Put(m.getRow()); unverifiedPut.addColumn( emptyCF, emptyCQ, batchTimestamp, QueryConstants.UNVERIFIED_BYTES); // This will be done before the data table row is updated (i.e., in the first write phase) context.preIndexUpdates.put(hTableInterfaceReference, unverifiedPut); } } } TracingUtils.addAnnotation(current, "index update count", updateCount); } } protected PhoenixIndexMetaData getPhoenixIndexMetaData(ObserverContext observerContext, MiniBatchOperationInProgress miniBatchOp) throws IOException { IndexMetaData indexMetaData = this.builder.getIndexMetaData(miniBatchOp); if (!(indexMetaData instanceof PhoenixIndexMetaData)) { throw new DoNotRetryIOException( "preBatchMutateWithExceptions: indexMetaData is not an instance of "+PhoenixIndexMetaData.class.getName() + ", current table is:" + observerContext.getEnvironment().getRegion().getRegionInfo().getTable().getNameAsString()); } return (PhoenixIndexMetaData)indexMetaData; } private void preparePostIndexMutations(BatchMutateContext context, long batchTimestamp, PhoenixIndexMetaData indexMetaData) { context.postIndexUpdates = ArrayListMultimap.create(); List maintainers = indexMetaData.getIndexMaintainers(); for (IndexMaintainer indexMaintainer : maintainers) { byte[] emptyCF = indexMaintainer.getEmptyKeyValueFamily().copyBytesIfNecessary(); byte[] emptyCQ = indexMaintainer.getEmptyKeyValueQualifier(); HTableInterfaceReference hTableInterfaceReference = new HTableInterfaceReference(new ImmutableBytesPtr(indexMaintainer.getIndexTableName())); List> updates = context.indexUpdates.get(hTableInterfaceReference); for (Pair update : updates) { Mutation m = update.getFirst(); if (m instanceof Put) { if (!indexMaintainer.isUncovered()) { Put verifiedPut = new Put(m.getRow()); // Set the status of the index row to "verified" verifiedPut.addColumn(emptyCF, emptyCQ, batchTimestamp, QueryConstants.VERIFIED_BYTES); context.postIndexUpdates.put(hTableInterfaceReference, verifiedPut); } } else { context.postIndexUpdates.put(hTableInterfaceReference, m); } } } // all cleanup will be done in postBatchMutateIndispensably() } private static void identifyIndexMaintainerTypes(PhoenixIndexMetaData indexMetaData, BatchMutateContext context) { for (IndexMaintainer indexMaintainer : indexMetaData.getIndexMaintainers()) { if (indexMaintainer instanceof TransformMaintainer) { context.hasTransform = true; } else if (indexMaintainer.isLocalIndex()) { context.hasLocalIndex = true; } else if (indexMaintainer.isUncovered()) { context.hasUncoveredIndex = true; } else { context.hasGlobalIndex = true; } } } private void identifyMutationTypes(MiniBatchOperationInProgress miniBatchOp, BatchMutateContext context) { for (int i = 0; i < miniBatchOp.size(); i++) { Mutation m = miniBatchOp.getOperation(i); if (this.builder.isAtomicOp(m)) { context.hasAtomic = true; if (context.hasDelete) { return; } } else if (m instanceof Delete) context.hasDelete = true; if (context.hasAtomic) { return; } } } /** * Wait for the previous batches to complete. If any of the previous batch fails then this * batch will fail too and needs to be retried. The rows are locked by the caller. * @param table * @param context * @throws Throwable */ private void waitForPreviousConcurrentBatch(TableName table, BatchMutateContext context) throws Throwable { for (BatchMutateContext lastContext : context.lastConcurrentBatchContext.values()) { BatchMutatePhase phase = lastContext.getCurrentPhase(); if (phase == BatchMutatePhase.FAILED) { context.currentPhase = BatchMutatePhase.FAILED; break; } else if (phase == BatchMutatePhase.PRE) { CountDownLatch countDownLatch = lastContext.getCountDownLatch(); if (countDownLatch == null) { // phase changed from PRE to either FAILED or POST if (phase == BatchMutatePhase.FAILED) { context.currentPhase = BatchMutatePhase.FAILED; break; } continue; } // Release the locks so that the previous concurrent mutation can go into the post phase unlockRows(context); // Wait for at most one concurrentMutationWaitDuration for each level in the dependency tree of batches. // lastContext.getMaxPendingRowCount() is the depth of the subtree rooted at the batch pointed by lastContext if (!countDownLatch.await((lastContext.getMaxPendingRowCount() + 1) * concurrentMutationWaitDuration, TimeUnit.MILLISECONDS)) { context.currentPhase = BatchMutatePhase.FAILED; LOG.debug(String.format("latch timeout context %s last %s", context, lastContext)); break; } if (lastContext.getCurrentPhase() == BatchMutatePhase.FAILED) { context.currentPhase = BatchMutatePhase.FAILED; break; } // Acquire the locks again before letting the region proceed with data table updates lockRows(context); LOG.debug(String.format("context %s last %s exit phase %s", context, lastContext, lastContext.getCurrentPhase())); } } if (context.currentPhase == BatchMutatePhase.FAILED) { // This batch needs to be retried since one of the previous concurrent batches has not completed yet. // Throwing an IOException will result in retries of this batch. Removal of reference counts and // locks for the rows of this batch will be done in postBatchMutateIndispensably() throw new IOException("One of the previous concurrent mutations has not completed. " + "The batch needs to be retried " + table.getNameAsString()); } } private boolean shouldSleep(BatchMutateContext context) { for (ImmutableBytesPtr ptr : context.rowsToLock) { for (Set set : batchesWithLastTimestamp) { if (set.contains(ptr)) { return true; } } } return false; } private long getBatchTimestamp(BatchMutateContext context, TableName table) throws InterruptedException { synchronized (this) { long ts = EnvironmentEdgeManager.currentTimeMillis(); if (ts != lastTimestamp) { // The timestamp for this batch will be different from the last batch processed. lastTimestamp = ts; batchesWithLastTimestamp.clear(); batchesWithLastTimestamp.add(context.rowsToLock); return ts; } else { if (!shouldSleep(context)) { // There is no need to sleep as the last batches with the same timestamp // do not have a common row this batch batchesWithLastTimestamp.add(context.rowsToLock); return ts; } } } // Sleep for one millisecond. The sleep is necessary to get different timestamps // for concurrent batches that share common rows. Thread.sleep(1); LOG.debug("slept 1ms for " + table.getNameAsString()); synchronized (this) { long ts = EnvironmentEdgeManager.currentTimeMillis(); if (ts != lastTimestamp) { // The timestamp for this batch will be different from the last batch processed. lastTimestamp = ts; batchesWithLastTimestamp.clear(); } // We do not have to check again if we need to sleep again since we got the next // timestamp while holding the row locks. This mean there cannot be a new // mutation with the same row attempting get the same timestamp batchesWithLastTimestamp.add(context.rowsToLock); return ts; } } public void preBatchMutateWithExceptions(ObserverContext c, MiniBatchOperationInProgress miniBatchOp) throws Throwable { PhoenixIndexMetaData indexMetaData = getPhoenixIndexMetaData(c, miniBatchOp); BatchMutateContext context = new BatchMutateContext(indexMetaData.getClientVersion()); setBatchMutateContext(c, context); identifyIndexMaintainerTypes(indexMetaData, context); identifyMutationTypes(miniBatchOp, context); context.populateOriginalMutations(miniBatchOp); if (context.hasDelete) { // Need to add cell tags to Delete Marker before we do any index processing // since we add tags to tables which doesn't have indexes also. ServerIndexUtil.setDeleteAttributes(miniBatchOp); } // Exclusively lock all rows to do consistent writes over multiple tables // (i.e., the data and its index tables) populateRowsToLock(miniBatchOp, context); // early exit if it turns out we don't have any update for indexes if (context.rowsToLock.isEmpty()) { return; } lockRows(context); long onDupCheckTime = 0; if (context.hasAtomic || context.hasGlobalIndex || context.hasUncoveredIndex || context.hasTransform) { // Retrieve the current row states from the data table while holding the lock. // This is needed for both atomic mutations and global indexes long start = EnvironmentEdgeManager.currentTimeMillis(); context.dataRowStates = new HashMap>(context.rowsToLock.size()); if (context.hasGlobalIndex || context.hasTransform || context.hasAtomic || context.hasDelete || (context.hasUncoveredIndex && isPartialUncoveredIndexMutation(indexMetaData, miniBatchOp))) { getCurrentRowStates(c, context); } onDupCheckTime += (EnvironmentEdgeManager.currentTimeMillis() - start); } if (context.hasAtomic) { long start = EnvironmentEdgeManager.currentTimeMillis(); // add the mutations for conditional updates to the mini batch addOnDupMutationsToBatch(miniBatchOp, context); // release locks for ON DUPLICATE KEY IGNORE since we won't be changing those rows // this is needed so that we can exit early releaseLocksForOnDupIgnoreMutations(miniBatchOp, context); onDupCheckTime += (EnvironmentEdgeManager.currentTimeMillis() - start); metricSource.updateDuplicateKeyCheckTime(dataTableName, onDupCheckTime); // early exit if we are not changing any rows if (context.rowsToLock.isEmpty()) { return; } } TableName table = c.getEnvironment().getRegion().getRegionInfo().getTable(); long batchTimestamp = getBatchTimestamp(context, table); // Update the timestamps of the data table mutations to prevent overlapping timestamps // (which prevents index inconsistencies as this case is not handled). setTimestamps(miniBatchOp, builder, batchTimestamp); if (context.hasGlobalIndex || context.hasUncoveredIndex || context.hasTransform) { // Prepare next data rows states for pending mutations (for global indexes) prepareDataRowStates(c, miniBatchOp, context, batchTimestamp); // early exit if it turns out we don't have any edits long start = EnvironmentEdgeManager.currentTimeMillis(); preparePreIndexMutations(context, batchTimestamp, indexMetaData); metricSource.updateIndexPrepareTime(dataTableName, EnvironmentEdgeManager.currentTimeMillis() - start); // Release the locks before making RPC calls for index updates unlockRows(context); // Do the first phase index updates doPre(context); // Acquire the locks again before letting the region proceed with data table updates lockRows(context); if (context.lastConcurrentBatchContext != null) { waitForPreviousConcurrentBatch(table, context); } preparePostIndexMutations(context, batchTimestamp, indexMetaData); } if (context.hasLocalIndex) { // Group all the updates for a single row into a single update to be processed (for local indexes) Collection mutations = groupMutations(miniBatchOp, context); handleLocalIndexUpdates(table, miniBatchOp, mutations, indexMetaData); } if (failDataTableUpdatesForTesting) { throw new DoNotRetryIOException("Simulating the data table write failure"); } } /** * In case of ON DUPLICATE KEY IGNORE, if the row already exists no mutations will be * generated so release the row lock. */ private void releaseLocksForOnDupIgnoreMutations(MiniBatchOperationInProgress miniBatchOp, BatchMutateContext context) { for (int i = 0; i < miniBatchOp.size(); i++) { if (!isAtomicOperationComplete(miniBatchOp.getOperationStatus(i))) { continue; } Mutation m = miniBatchOp.getOperation(i); if (!this.builder.isAtomicOp(m)) { continue; } ImmutableBytesPtr row = new ImmutableBytesPtr(m.getRow()); Iterator rowLockIterator = context.rowLocks.iterator(); while(rowLockIterator.hasNext()){ RowLock rowLock = rowLockIterator.next(); ImmutableBytesPtr rowKey = rowLock.getRowKey(); if (row.equals(rowKey)) { PendingRow pendingRow = pendingRows.get(rowKey); if (pendingRow != null) { pendingRow.remove(); } rowLock.release(); rowLockIterator.remove(); context.rowsToLock.remove(row); break; } } } } private void setBatchMutateContext(ObserverContext c, BatchMutateContext context) { this.batchMutateContext.set(context); } private BatchMutateContext getBatchMutateContext(ObserverContext c) { return this.batchMutateContext.get(); } private void removeBatchMutateContext(ObserverContext c) { this.batchMutateContext.remove(); } @Override public void preWALAppend(ObserverContext c, WALKey key, WALEdit edit) { if (shouldWALAppend) { BatchMutateContext context = getBatchMutateContext(c); appendMutationAttributesToWALKey(key, context); } } public void appendMutationAttributesToWALKey(WALKey key, IndexRegionObserver.BatchMutateContext context) { if (context != null && context.getOriginalMutations().size() > 0) { Mutation firstMutation = context.getOriginalMutations().get(0); Map attrMap = firstMutation.getAttributesMap(); for (MutationState.MutationMetadataType metadataType : MutationState.MutationMetadataType.values()) { String metadataTypeKey = metadataType.toString(); if (attrMap.containsKey(metadataTypeKey)) { IndexRegionObserver.appendToWALKey(key, metadataTypeKey, attrMap.get(metadataTypeKey)); } } } } /** * When this hook is called, all the rows in the batch context are locked if the batch of * mutations is successful. Because the rows are locked, we can safely make updates to * pending row states in memory and perform the necessary cleanup in that case. * * However, when the batch fails, then some of the rows may not be locked. In that case, * we remove the pending row states from the concurrent hash map without updating them since * pending rows states become invalid when a batch fails. */ @Override public void postBatchMutateIndispensably(ObserverContext c, MiniBatchOperationInProgress miniBatchOp, final boolean success) throws IOException { if (this.disabled) { return; } BatchMutateContext context = getBatchMutateContext(c); if (context == null) { return; } try { if (success) { context.currentPhase = BatchMutatePhase.POST; if(context.hasAtomic && miniBatchOp.size() == 1) { if (!isAtomicOperationComplete(miniBatchOp.getOperationStatus(0))) { byte[] retVal = PInteger.INSTANCE.toBytes(1); Cell cell = PhoenixKeyValueUtil.newKeyValue( miniBatchOp.getOperation(0).getRow(), Bytes.toBytes(UPSERT_CF), Bytes.toBytes(UPSERT_STATUS_CQ), 0, retVal, 0, retVal.length); Result result = Result.create(new ArrayList<>(Arrays.asList(cell))); miniBatchOp.setOperationStatus(0, new OperationStatus(SUCCESS, result)); } } } else { context.currentPhase = BatchMutatePhase.FAILED; } context.countDownAllLatches(); removePendingRows(context); if (context.indexUpdates != null) { context.indexUpdates.clear(); } unlockRows(context); this.builder.batchCompleted(miniBatchOp); if (success) { // The pre-index and data table updates are successful, and now, do post index updates doPost(c, context); } } finally { removeBatchMutateContext(c); } } private void doPost(ObserverContext c, BatchMutateContext context) throws IOException { long start = EnvironmentEdgeManager.currentTimeMillis(); try { if (failPostIndexUpdatesForTesting) { throw new DoNotRetryIOException("Simulating the last (i.e., post) index table write failure"); } doIndexWritesWithExceptions(context, true); metricSource.updatePostIndexUpdateTime(dataTableName, EnvironmentEdgeManager.currentTimeMillis() - start); } catch (Throwable e) { metricSource.updatePostIndexUpdateFailureTime(dataTableName, EnvironmentEdgeManager.currentTimeMillis() - start); metricSource.incrementPostIndexUpdateFailures(dataTableName); // Ignore the failures in the third write phase } } private void doIndexWritesWithExceptions(BatchMutateContext context, boolean post) throws IOException { ListMultimap indexUpdates = post ? context.postIndexUpdates : context.preIndexUpdates; //short circuit, if we don't need to do any work if (context == null || indexUpdates == null || indexUpdates.isEmpty()) { return; } // get the current span, or just use a null-span to avoid a bunch of if statements try (TraceScope scope = Trace.startSpan("Completing " + (post ? "post" : "pre") + " index writes")) { Span current = scope.getSpan(); if (current == null) { current = NullSpan.INSTANCE; } current.addTimelineAnnotation("Actually doing " + (post ? "post" : "pre") + " index update for first time"); if (post) { postWriter.write(indexUpdates, false, context.clientVersion); } else { preWriter.write(indexUpdates, false, context.clientVersion); } } } private void removePendingRows(BatchMutateContext context) { for (ImmutableBytesPtr rowKey : context.rowsToLock) { PendingRow pendingRow = pendingRows.get(rowKey); if (pendingRow != null) { pendingRow.remove(); } } } private void doPre(BatchMutateContext context) throws IOException { long start = 0; try { start = EnvironmentEdgeManager.currentTimeMillis(); if (failPreIndexUpdatesForTesting) { throw new DoNotRetryIOException("Simulating the first (i.e., pre) index table write failure"); } doIndexWritesWithExceptions(context, false); metricSource.updatePreIndexUpdateTime(dataTableName, EnvironmentEdgeManager.currentTimeMillis() - start); } catch (Throwable e) { metricSource.updatePreIndexUpdateFailureTime(dataTableName, EnvironmentEdgeManager.currentTimeMillis() - start); metricSource.incrementPreIndexUpdateFailures(dataTableName); // Re-acquire all locks since we released them before making index updates // Removal of reference counts and locks for the rows of this batch will be // done in postBatchMutateIndispensably() lockRows(context); rethrowIndexingException(e); } } private void extractExpressionsAndColumns(DataInputStream input, List>> operations, final Set colsReadInExpr) throws IOException { while (true) { ExpressionVisitor visitor = new StatelessTraverseAllExpressionVisitor() { @Override public Void visit(KeyValueColumnExpression expression) { colsReadInExpr.add(new ColumnReference(expression.getColumnFamily(), expression.getColumnQualifier())); return null; } }; try { int nExpressions = WritableUtils.readVInt(input); List expressions = Lists.newArrayListWithExpectedSize(nExpressions); for (int i = 0; i < nExpressions; i++) { Expression expression = ExpressionType.values()[WritableUtils.readVInt(input)].newInstance(); expression.readFields(input); expressions.add(expression); expression.accept(visitor); } PTableProtos.PTable tableProto = PTableProtos.PTable.parseDelimitedFrom(input); PTable table = PTableImpl.createFromProto(tableProto); operations.add(new Pair<>(table, expressions)); } catch (EOFException e) { break; } } } /** * This function has been adapted from PhoenixIndexBuilder#executeAtomicOp(). * The critical difference being that the code in PhoenixIndexBuilder#executeAtomicOp() * generates the mutations by reading the latest data table row from HBase but in order * to correctly support concurrent index mutations we need to always read the latest * data table row from memory. * It takes in an atomic Put mutation and generates a list of Put and Delete mutations. * The mutation list will be empty in two cases: * 1) ON DUPLICATE KEY IGNORE and the row already exists; * 2) ON DUPLICATE KEY UPDATE if CASE expression is specified and in each of them the new * value is the same as the old value in the ELSE-clause. * Otherwise, we will generate one Put mutation and optionally one Delete mutation (with * DeleteColumn type cells for all columns set to null). */ private List generateOnDupMutations(BatchMutateContext context, Put atomicPut) throws IOException { List mutations = Lists.newArrayListWithExpectedSize(2); byte[] opBytes = atomicPut.getAttribute(ATOMIC_OP_ATTRIB); if (opBytes == null) { // Unexpected return null; } Put put = null; Delete delete = null; // mutations returned by this function will have the LATEST timestamp // later these timestamps will be updated by the IndexRegionObserver#setTimestamps() function long ts = HConstants.LATEST_TIMESTAMP; byte[] rowKey = atomicPut.getRow(); ImmutableBytesPtr rowKeyPtr = new ImmutableBytesPtr(rowKey); // Get the latest data row state Pair dataRowState = context.dataRowStates.get(rowKeyPtr); Put currentDataRowState = dataRowState != null ? dataRowState.getFirst() : null; if (PhoenixIndexBuilderHelper.isDupKeyIgnore(opBytes)) { if (currentDataRowState == null) { // new row mutations.add(atomicPut); } return mutations; } ByteArrayInputStream stream = new ByteArrayInputStream(opBytes); DataInputStream input = new DataInputStream(stream); boolean skipFirstOp = input.readBoolean(); short repeat = input.readShort(); List>> operations = Lists.newArrayListWithExpectedSize(3); final Set colsReadInExpr = new HashSet<>(); // deserialize the conditional update expressions and // extract the columns that are read in the conditional expressions extractExpressionsAndColumns(input, operations, colsReadInExpr); int estimatedSize = colsReadInExpr.size(); // initialized to either the incoming new row or the current row // stores the intermediate values as we apply conditional update expressions List flattenedCells; // read the column values requested in the get from the current data row List cells = IndexUtil.readColumnsFromRow(currentDataRowState, colsReadInExpr); // store current cells into a map where the key is ColumnReference of the column family and // column qualifier, and value is a pair of cell and a boolean. The value of the boolean // will be true if the expression is CaseExpression and Else-clause is evaluated to be // true, will be null if there is no expression on this column, otherwise false Map> currColumnCellExprMap = new HashMap<>(); if (currentDataRowState == null) { // row doesn't exist if (skipFirstOp) { if (operations.size() <= 1 && repeat <= 1) { // early exit since there is only one ON DUPLICATE KEY UPDATE // clause which is ignored because the row doesn't exist so // simply use the values in UPSERT VALUES mutations.add(atomicPut); return mutations; } // If there are multiple ON DUPLICATE KEY UPDATE on a new row, // the first one is skipped repeat--; } // Base current state off of new row flattenedCells = flattenCells(atomicPut); } else { // Base current state off of existing row flattenedCells = cells; // store all current cells from currentDataRowState for (Map.Entry> entry : currentDataRowState.getFamilyCellMap().entrySet()) { for (Cell cell : new ArrayList<>(entry.getValue())) { byte[] family = CellUtil.cloneFamily(cell); byte[] qualifier = CellUtil.cloneQualifier(cell); ColumnReference colRef = new ColumnReference(family, qualifier); currColumnCellExprMap.put(colRef, new Pair<>(cell, null)); } } } MultiKeyValueTuple tuple = new MultiKeyValueTuple(flattenedCells); ImmutableBytesWritable ptr = new ImmutableBytesWritable(); // for each conditional upsert in the batch for (int opIndex = 0; opIndex < operations.size(); opIndex++) { Pair> operation = operations.get(opIndex); PTable table = operation.getFirst(); List expressions = operation.getSecond(); for (int j = 0; j < repeat; j++) { // repeater loop ptr.set(rowKey); // Sort the list of cells (if they've been flattened in which case they're // not necessarily ordered correctly). if (flattenedCells != null) { Collections.sort(flattenedCells, CellComparator.getInstance()); } PRow row = table.newRow(GenericKeyValueBuilder.INSTANCE, ts, ptr, false); int adjust = table.getBucketNum() == null ? 1 : 2; for (int i = 0; i < expressions.size(); i++) { Expression expression = expressions.get(i); ptr.set(EMPTY_BYTE_ARRAY); expression.evaluate(tuple, ptr); PColumn column = table.getColumns().get(i + adjust); Object value = expression.getDataType().toObject(ptr, column.getSortOrder()); // We are guaranteed that the two column will have the same type if (!column.getDataType().isSizeCompatible(ptr, value, column.getDataType(), expression.getSortOrder(), expression.getMaxLength(), expression.getScale(), column.getMaxLength(), column.getScale())) { throw new DataExceedsCapacityException(column.getDataType(), column.getMaxLength(), column.getScale(), column.getName().getString()); } column.getDataType().coerceBytes(ptr, value, expression.getDataType(), expression.getMaxLength(), expression.getScale(), expression.getSortOrder(), column.getMaxLength(), column.getScale(), column.getSortOrder(), table.rowKeyOrderOptimizable()); byte[] bytes = ByteUtil.copyKeyBytesIfNecessary(ptr); row.setValue(column, bytes); // If the column exist in currColumnCellExprMap, set the boolean value in the // map to be true if the expression is CaseExpression and the Else-clause is // evaluated to be true ColumnReference colRef = new ColumnReference(column.getFamilyName().getBytes(), column.getColumnQualifierBytes()); if (currColumnCellExprMap.containsKey(colRef)) { Pair valuePair = currColumnCellExprMap.get(colRef); if (expression instanceof CaseExpression && ((CaseExpression) expression).evaluateIndexOf(tuple, ptr) == expression.getChildren().size() - 1) { valuePair.setSecond(true); } else { valuePair.setSecond(false); } } } List updatedCells = Lists.newArrayListWithExpectedSize(estimatedSize); List newMutations = row.toRowMutations(); for (Mutation source : newMutations) { flattenCells(source, updatedCells); } // update the cells to the latest values calculated above flattenedCells = mergeCells(flattenedCells, updatedCells); // we need to retrieve empty cell later on which relies on binary search flattenedCells.sort(CellComparator.getInstance()); tuple.setKeyValues(flattenedCells); } // Repeat only applies to first statement repeat = 1; } put = new Put(rowKey); delete = new Delete(rowKey); transferAttributes(atomicPut, put); transferAttributes(atomicPut, delete); for (int i = 0; i < tuple.size(); i++) { Cell cell = tuple.getValue(i); if (cell.getType() == Cell.Type.Put) { if (checkCellNeedUpdate(cell, currColumnCellExprMap)) { put.add(cell); } } else { delete.add(cell); } } if (!put.isEmpty() || !delete.isEmpty()) { PTable table = operations.get(0).getFirst(); addEmptyKVCellToPut(put, tuple, table); } if (!put.isEmpty()) { mutations.add(put); } if (!delete.isEmpty()) { mutations.add(delete); } return mutations; } private void addEmptyKVCellToPut(Put put, MultiKeyValueTuple tuple, PTable table) throws IOException { byte[] emptyCF = SchemaUtil.getEmptyColumnFamily(table); byte[] emptyCQ = EncodedColumnsUtil.getEmptyKeyValueInfo(table).getFirst(); Cell emptyKVCell = tuple.getValue(emptyCF, emptyCQ); if (emptyKVCell != null) { put.add(emptyKVCell); } } private static List flattenCells(Mutation m) { List flattenedCells = new ArrayList<>(); flattenCells(m, flattenedCells); return flattenedCells; } private static void flattenCells(Mutation m, List flattenedCells) { for (List cells : m.getFamilyCellMap().values()) { flattenedCells.addAll(cells); } } /** * This function is to check if a cell need to be updated, based on the current cells' values. * The cell will not be updated only if the column exist in the expression in which CASE is * specified and the new value is the same as the old value in the ELSE-clause, otherwise it * should be updated. * * @param cell the cell with new value to be checked * @param colCellExprMap the column reference map with cell current value * @return true if the cell need update, false otherwise */ private boolean checkCellNeedUpdate(Cell cell, Map> colCellExprMap) { byte[] family = CellUtil.cloneFamily(cell); byte[] qualifier = CellUtil.cloneQualifier(cell); ColumnReference colRef = new ColumnReference(family, qualifier); // if cell not exist in the map, meaning that they are new and need update if (colCellExprMap.isEmpty() || !colCellExprMap.containsKey(colRef)) { return true; } Pair valuePair = colCellExprMap.get(colRef); Boolean isInCaseExpressionElseClause = valuePair.getSecond(); if (isInCaseExpressionElseClause == null) { return false; } if (!isInCaseExpressionElseClause) { return true; } Cell oldCell = valuePair.getFirst(); ImmutableBytesPtr newValuePtr = new ImmutableBytesPtr(cell.getValueArray(), cell.getValueOffset(), cell.getValueLength()); ImmutableBytesPtr oldValuePtr = new ImmutableBytesPtr(oldCell.getValueArray(), oldCell.getValueOffset(), oldCell.getValueLength()); return !Bytes.equals(oldValuePtr.get(), oldValuePtr.getOffset(), oldValuePtr.getLength(), newValuePtr.get(), newValuePtr.getOffset(), newValuePtr.getLength()); } /** * ensure that the generated mutations have all the attributes like schema */ private static void transferAttributes(Mutation source, Mutation target) { for (Map.Entry entry : source.getAttributesMap().entrySet()) { target.setAttribute(entry.getKey(), entry.getValue()); } } /** * First take all the cells that are present in the latest. Then look at current * and any cell not present in latest is taken. */ private static List mergeCells(List current, List latest) { Map latestColVals = Maps.newHashMapWithExpectedSize(latest.size() + current.size()); // first take everything present in latest for (Cell cell : latest) { byte[] family = CellUtil.cloneFamily(cell); byte[] qualifier = CellUtil.cloneQualifier(cell); ColumnReference colInfo = new ColumnReference(family, qualifier); latestColVals.put(colInfo, cell); } // check for any leftovers in current for (Cell cell : current) { byte[] family = CellUtil.cloneFamily(cell); byte[] qualifier = CellUtil.cloneQualifier(cell); ColumnReference colInfo = new ColumnReference(family, qualifier); if (!latestColVals.containsKey(colInfo)) { latestColVals.put(colInfo, cell); } } return Lists.newArrayList(latestColVals.values()); } public static void appendToWALKey(WALKey key, String attrKey, byte[] attrValue) { key.addExtendedAttribute(attrKey, attrValue); } public static byte[] getAttributeValueFromWALKey(WALKey key, String attrKey) { return key.getExtendedAttribute(attrKey); } public static Map getAttributeValuesFromWALKey(WALKey key) { return new HashMap(key.getExtendedAttributes()); } /** * Determines whether the atomic operation is complete based on the operation status. * HBase returns null Result by default for successful Put and Delete mutations, only for * Increment and Append mutations, non-null Result is returned by default. * @param status the operation status. * @return true if the atomic operation is completed, false otherwise. */ public static boolean isAtomicOperationComplete(OperationStatus status) { return status.getOperationStatusCode() == SUCCESS && status.getResult() != null; } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy