org.apache.phoenix.index.PhoenixTransactionalIndexer Maven / Gradle / Ivy
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.phoenix.index;
import static org.apache.phoenix.hbase.index.write.IndexWriterUtils.DEFAULT_INDEX_WRITER_RPC_PAUSE;
import static org.apache.phoenix.hbase.index.write.IndexWriterUtils.DEFAULT_INDEX_WRITER_RPC_RETRIES_NUMBER;
import static org.apache.phoenix.hbase.index.write.IndexWriterUtils.INDEX_WRITER_RPC_PAUSE;
import static org.apache.phoenix.hbase.index.write.IndexWriterUtils.INDEX_WRITER_RPC_RETRIES_NUMBER;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.ListIterator;
import java.util.Map;
import java.util.Set;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.CellScanner;
import org.apache.hadoop.hbase.CoprocessorEnvironment;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.KeyValueUtil;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.HTableInterface;
import org.apache.hadoop.hbase.client.Mutation;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.ResultScanner;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.coprocessor.BaseRegionObserver;
import org.apache.hadoop.hbase.coprocessor.ObserverContext;
import org.apache.hadoop.hbase.coprocessor.RegionCoprocessorEnvironment;
import org.apache.hadoop.hbase.ipc.RpcControllerFactory;
import org.apache.hadoop.hbase.ipc.controller.InterRegionServerIndexRpcControllerFactory;
import org.apache.hadoop.hbase.regionserver.MiniBatchOperationInProgress;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.Pair;
import org.apache.htrace.Span;
import org.apache.htrace.Trace;
import org.apache.htrace.TraceScope;
import org.apache.phoenix.compile.ScanRanges;
import org.apache.phoenix.coprocessor.DelegateRegionCoprocessorEnvironment;
import org.apache.phoenix.filter.SkipScanFilter;
import org.apache.phoenix.hbase.index.MultiMutation;
import org.apache.phoenix.hbase.index.ValueGetter;
import org.apache.phoenix.hbase.index.covered.IndexMetaData;
import org.apache.phoenix.hbase.index.covered.IndexUpdate;
import org.apache.phoenix.hbase.index.covered.TableState;
import org.apache.phoenix.hbase.index.covered.update.ColumnReference;
import org.apache.phoenix.hbase.index.covered.update.ColumnTracker;
import org.apache.phoenix.hbase.index.covered.update.IndexedColumnGroup;
import org.apache.phoenix.hbase.index.util.ImmutableBytesPtr;
import org.apache.phoenix.hbase.index.write.IndexWriter;
import org.apache.phoenix.hbase.index.write.LeaveIndexActiveFailurePolicy;
import org.apache.phoenix.hbase.index.write.ParallelWriterIndexCommitter;
import org.apache.phoenix.query.KeyRange;
import org.apache.phoenix.schema.types.PVarbinary;
import org.apache.phoenix.trace.TracingUtils;
import org.apache.phoenix.trace.util.NullSpan;
import org.apache.phoenix.transaction.PhoenixTransactionContext;
import org.apache.phoenix.transaction.PhoenixTransactionContext.PhoenixVisibilityLevel;
import org.apache.phoenix.transaction.PhoenixTransactionalTable;
import org.apache.phoenix.transaction.TransactionFactory;
import org.apache.phoenix.util.PropertiesUtil;
import org.apache.phoenix.util.ScanUtil;
import org.apache.phoenix.util.SchemaUtil;
import org.apache.phoenix.util.ServerUtil;
import org.apache.phoenix.util.TransactionUtil;
/**
 * Do all the work of managing index updates for a transactional table from a single coprocessor. Since the transaction
 * manager essentially time orders writes through conflict detection, the logic to maintain a secondary index is quite a
 * bit simpler than the non transactional case. For example, there's no need to muck with the WAL, as failure scenarios
 * are handled by aborting the transaction.
 */
public class PhoenixTransactionalIndexer extends BaseRegionObserver {
private static final Log LOG = LogFactory.getLog(PhoenixTransactionalIndexer.class);
// Hack to get around not being able to save any state between
// coprocessor calls. TODO: remove after HBASE-18127 when available
// Per-batch holder that carries the index updates built in preBatchMutate over to
// postBatchMutateIndispensably, where they are written.
// NOTE(review): the generic type arguments look stripped in this listing ("Collection>") - confirm against the real source.
private static class BatchMutateContext {
// Starts as an empty list; preBatchMutate replaces it with the result of getIndexUpdates.
public Collection> indexUpdates = Collections.emptyList();
// Thread-local slot read by getBatchMutateContext().
private ThreadLocal batchMutateContext =
new ThreadLocal();
// Created in start(); decides whether a mutation is indexed and produces the index deletes/upserts.
private PhoenixIndexCodec codec;
// Created in start(); writes the collected index updates in postBatchMutateIndispensably.
private IndexWriter writer;
// Set by stop() so that a second stop() call returns immediately.
private boolean stopped;
// Coprocessor start hook: creates the index codec and the IndexWriter used by this observer.
// The writer is built on a DelegateRegionCoprocessorEnvironment that wraps a cloned Configuration.
// NOTE(review): several statements in this method are truncated in this listing (the call that takes
// the two *RpcControllerFactory classes and the setInt calls) - confirm against the real source.
public void start(CoprocessorEnvironment e) throws IOException {
final RegionCoprocessorEnvironment env = (RegionCoprocessorEnvironment)e;
// The region server name is used below as the prefix of the index writer's name.
String serverName = env.getRegionServerServices().getServerName().getServerName();
codec = new PhoenixIndexCodec();
// Clone the config since it is shared
Configuration clonedConfig = PropertiesUtil.cloneConfig(e.getConfiguration());
* Set the rpc controller factory so that the HTables used by IndexWriter would
* set the correct priorities on the remote RPC calls.
InterRegionServerIndexRpcControllerFactory.class, RpcControllerFactory.class);
// lower the number of rpc retries. We inherit config from HConnectionManager#setServerSideHConnectionRetries,
// which by default uses a multiplier of 10. That is too many retries for our synchronous index writes
clonedConfig.setInt(HConstants.HBASE_CLIENT_PAUSE, env.getConfiguration()
// Wrap the original environment so that the writer sees the cloned configuration.
DelegateRegionCoprocessorEnvironment indexWriterEnv = new DelegateRegionCoprocessorEnvironment(clonedConfig, env);
// setup the actual index writer
// For transactional tables, we keep the index active upon a write failure
// since we have the all versus none behavior for transactions. Also, we
// fail on any write exception since this will end up failing the transaction.
this.writer = new IndexWriter(IndexWriter.getCommitter(indexWriterEnv, ParallelWriterIndexCommitter.class),
new LeaveIndexActiveFailurePolicy(), indexWriterEnv, serverName + "-tx-index-writer");
// Coprocessor stop hook. Idempotent: the stopped flag makes every call after the first a no-op.
// NOTE(review): msg is not used in the visible lines; presumably it is handed to the index writer's
// stop call, which is missing from this listing - confirm against the real source.
public void stop(CoprocessorEnvironment e) throws IOException {
if (this.stopped) { return; }
this.stopped = true;
String msg = "TxIndexer is being stopped";
private static Iterator getMutationIterator(final MiniBatchOperationInProgress miniBatchOp) {
return new Iterator() {
private int i = 0;
public boolean hasNext() {
return i < miniBatchOp.size();
public Mutation next() {
return miniBatchOp.getOperation(i++);
public void remove() {
throw new UnsupportedOperationException();
public void preBatchMutate(ObserverContext c,
MiniBatchOperationInProgress miniBatchOp) throws IOException {
Mutation m = miniBatchOp.getOperation(0);
if (!codec.isEnabled(m)) {
super.preBatchMutate(c, miniBatchOp);
BatchMutateContext context = new BatchMutateContext();
setBatchMutateContext(c, context);
Map updateAttributes = m.getAttributesMap();
PhoenixIndexMetaData indexMetaData = new PhoenixIndexMetaData(c.getEnvironment(),updateAttributes);
byte[] txRollbackAttribute = m.getAttribute(PhoenixTransactionContext.TX_ROLLBACK_ATTRIBUTE_KEY);
Collection> indexUpdates = null;
// get the current span, or just use a null-span to avoid a bunch of if statements
try (TraceScope scope = Trace.startSpan("Starting to build index updates")) {
Span current = scope.getSpan();
if (current == null) {
current = NullSpan.INSTANCE;
// get the index updates for all elements in this batch
context.indexUpdates = getIndexUpdates(c.getEnvironment(), indexMetaData, getMutationIterator(miniBatchOp), txRollbackAttribute);
current.addTimelineAnnotation("Built index updates, doing preStep");
TracingUtils.addAnnotation(current, "index update count", context.indexUpdates.size());
} catch (Throwable t) {
String msg = "Failed to update index with entries:" + indexUpdates;
LOG.error(msg, t);
ServerUtil.throwIOException(msg, t);
// Writes the index updates that preBatchMutate stored in the BatchMutateContext.
// Updates are only written when the data table batch succeeded and the collection is non-empty.
// Any Throwable is logged and passed to ServerUtil.throwIOException.
// NOTE(review): the body of the early-exit branch and of the finally block are missing from this
// listing; presumably they return and clear the batch context respectively - confirm.
public void postBatchMutateIndispensably(ObserverContext c,
MiniBatchOperationInProgress miniBatchOp, final boolean success) throws IOException {
BatchMutateContext context = getBatchMutateContext(c);
// No context (or no updates) means there is nothing to write for this batch.
if (context == null || context.indexUpdates == null) {
// get the current span, or just use a null-span to avoid a bunch of if statements
try (TraceScope scope = Trace.startSpan("Starting to write index updates")) {
Span current = scope.getSpan();
if (current == null) {
current = NullSpan.INSTANCE;
if (success) { // if miniBatchOp was successfully written, write index updates
if (!context.indexUpdates.isEmpty()) {
this.writer.write(context.indexUpdates, true);
current.addTimelineAnnotation("Wrote index updates");
} catch (Throwable t) {
String msg = "Failed to write index updates:" + context.indexUpdates;
LOG.error(msg, t);
ServerUtil.throwIOException(msg, t);
} finally {
// Associates the given context with the batch currently being processed.
// NOTE(review): the body is missing from this listing; presumably it stores the context in the
// batchMutateContext thread-local that getBatchMutateContext reads - confirm.
private void setBatchMutateContext(ObserverContext c, BatchMutateContext context) {
// Returns the context held in the batchMutateContext thread-local for the calling thread.
// The ObserverContext argument is not used by the visible code.
private BatchMutateContext getBatchMutateContext(ObserverContext c) {
return this.batchMutateContext.get();
// Discards the context associated with the current batch.
// NOTE(review): the body is missing from this listing; presumably it clears the batchMutateContext
// thread-local - confirm.
private void removeBatchMutateContext(ObserverContext c) {
// Looks up the MultiMutation stored for the given row, creating and registering a new one when the
// row has not been seen yet.
// NOTE(review): nothing in the visible lines uses the parameter m; presumably a trailing statement
// that merges m into the stored MultiMutation is missing from this listing - confirm.
private static void addMutation(Map mutations, ImmutableBytesPtr row, Mutation m) {
MultiMutation stored = mutations.get(row);
// we haven't seen this row before, so add it
if (stored == null) {
stored = new MultiMutation(row);
mutations.put(row, stored);
// Computes the index updates for every mutation supplied by mutationIterator.
// Outline of the visible code:
//   1. group the batch's mutations by row key;
//   2. if any rows need their prior state, scan the data table through a transactional table;
//   3. hand the scanner to processRollback (rollback attribute present) or processMutation.
// Throws NullPointerException when the index metadata carries no transaction context.
// NOTE(review): a number of statements are missing or garbled in this listing (e.g. the iterator
// call at "Mutation m =;", the bodies of several loops, generic type arguments) - confirm details
// against the real source before relying on these notes.
private Collection> getIndexUpdates(RegionCoprocessorEnvironment env, PhoenixIndexMetaData indexMetaData, Iterator mutationIterator, byte[] txRollbackAttribute) throws IOException {
PhoenixTransactionContext txnContext = indexMetaData.getTransactionContext();
if (txnContext == null) {
throw new NullPointerException("Expected to find transaction in metadata for " + env.getRegionInfo().getTable().getNameAsString());
// A non-null rollback attribute marks this batch as a transaction rollback.
boolean isRollback = txRollbackAttribute!=null;
boolean isImmutable = indexMetaData.isImmutableRows();
ResultScanner currentScanner = null;
PhoenixTransactionalTable txTable = null;
// Collect up all mutations in batch
Map mutations =
new HashMap();
// Rows whose prior state must be looked up. For immutable rows outside a rollback this is a
// separate map that only receives delete mutations (see the loop below); otherwise it is the
// same map instance as "mutations".
Map findPriorValueMutations;
if (isImmutable && !isRollback) {
findPriorValueMutations = new HashMap();
} else {
findPriorValueMutations = mutations;
while(mutationIterator.hasNext()) {
Mutation m =;
// add the mutation to the batch set
ImmutableBytesPtr row = new ImmutableBytesPtr(m.getRow());
// Only when the two maps are distinct do deletes need to be recorded separately.
if (mutations != findPriorValueMutations && isDeleteMutation(m)) {
addMutation(findPriorValueMutations, row, m);
addMutation(mutations, row, m);
// Collect the set of mutable ColumnReferences so that we can first
// run a scan to get the current state. We'll need this to delete
// the existing index rows.
List indexMaintainers = indexMetaData.getIndexMaintainers();
int estimatedSize = indexMaintainers.size() * 10;
Set mutableColumns = Sets.newHashSetWithExpectedSize(estimatedSize);
for (IndexMaintainer indexMaintainer : indexMaintainers) {
// For transactional tables, we use an index maintainer
// to aid in rollback if there's a KeyValue column in the index. The alternative would be
// to hold on to all uncommitted index row keys (even ones already sent to HBase) on the
// client side.
Set allColumns = indexMaintainer.getAllColumns();
Collection> indexUpdates = new ArrayList>(mutations.size() * 2 * indexMaintainers.size());
try {
// Track if we have row keys with Delete mutations (or Puts that are
// Tephra's Delete marker). If there are none, we don't need to do the scan for
// prior versions, if there are, we do. Since rollbacks always have delete mutations,
// this logic will work there too.
if (!findPriorValueMutations.isEmpty()) {
List keys = Lists.newArrayListWithExpectedSize(mutations.size());
for (ImmutableBytesPtr ptr : findPriorValueMutations.keySet()) {
Scan scan = new Scan();
// Project all mutable columns
for (ColumnReference ref : mutableColumns) {
scan.addColumn(ref.getFamily(), ref.getQualifier());
* Indexes inherit the storage scheme of the data table which means all the indexes have the same
* storage scheme and empty key value qualifier. Note that this assumption would be broken if we start
* supporting new indexes over existing data tables to have a different storage scheme than the data
* table.
byte[] emptyKeyValueQualifier = indexMaintainers.get(0).getEmptyKeyValueQualifier();
// Project empty key value column
scan.addColumn(indexMaintainers.get(0).getDataEmptyKeyValueCF(), emptyKeyValueQualifier);
ScanRanges scanRanges = ScanRanges.create(SchemaUtil.VAR_BINARY_SCHEMA, Collections.singletonList(keys), ScanUtil.SINGLE_COLUMN_SLOT_SPAN, KeyRange.EVERYTHING_RANGE, null, true, -1);
// Scan this region's own table through a transactional wrapper bound to txnContext.
TableName tableName = env.getRegion().getRegionInfo().getTable();
HTableInterface htable = env.getTable(tableName);
txTable = TransactionFactory.getTransactionFactory().getTransactionalTable(txnContext, htable);
// For rollback, we need to see all versions, including
// the last committed version as there may be multiple
// checkpointed versions.
SkipScanFilter filter = scanRanges.getSkipScanFilter();
if (isRollback) {
filter = new SkipScanFilter(filter,true);
currentScanner = txTable.getScanner(scan);
// currentScanner stays null when no prior-value lookup was needed; both helpers accept that.
if (isRollback) {
processRollback(env, indexMetaData, txRollbackAttribute, currentScanner, txnContext, mutableColumns, indexUpdates, mutations);
} else {
processMutation(env, indexMetaData, txRollbackAttribute, currentScanner, txnContext, mutableColumns, indexUpdates, mutations, findPriorValueMutations);
} finally {
// NOTE(review): only txTable is closed in the visible code; currentScanner is never closed
// explicitly - confirm that closing the table releases the scanner.
if (txTable != null) txTable.close();
return indexUpdates;
private static boolean isDeleteMutation(Mutation m) {
for (Map.Entry> cellMap : m.getFamilyCellMap().entrySet()) {
for (Cell cell : cellMap.getValue()) {
if (cell.getTypeByte() != KeyValue.Type.Put.getCode() || TransactionUtil.isDelete(cell)) {
return true;
return false;
// Generates index updates for a regular (non-rollback) batch.
// For each existing data row returned by the scanner, the matching mutation is removed from
// mutationsToFindPreviousValue and used, together with the scanned row state, to generate index
// deletes followed by index puts. Afterwards index puts are generated for every entry left in
// "mutations", with no prior row state.
// A null scanner skips the first phase.
// NOTE(review): the loop condition "while ((result = != null)" is garbled in this listing;
// presumably it reads the next Result from the scanner - confirm.
private void processMutation(RegionCoprocessorEnvironment env,
PhoenixIndexMetaData indexMetaData, byte[] txRollbackAttribute,
ResultScanner scanner,
PhoenixTransactionContext txnContext,
Set upsertColumns,
Collection> indexUpdates,
Map mutations,
Map mutationsToFindPreviousValue) throws IOException {
if (scanner != null) {
Result result;
// The empty key value column reference is taken from the first index maintainer.
ColumnReference emptyColRef = new ColumnReference(indexMetaData.getIndexMaintainers().get(0)
.getDataEmptyKeyValueCF(), indexMetaData.getIndexMaintainers().get(0).getEmptyKeyValueQualifier());
// Process existing data table rows by removing the old index row and adding the new index row
while ((result = != null) {
Mutation m = mutationsToFindPreviousValue.remove(new ImmutableBytesPtr(result.getRow()));
TxTableState state = new TxTableState(env, upsertColumns, indexMetaData.getAttributes(), txnContext.getWritePointer(), m, emptyColRef, result);
generateDeletes(indexMetaData, indexUpdates, txRollbackAttribute, state);
generatePuts(indexMetaData, indexUpdates, state);
// Process new data table by adding new index rows
for (Mutation m : mutations.values()) {
TxTableState state = new TxTableState(env, upsertColumns, indexMetaData.getAttributes(), txnContext.getWritePointer(), m);
generatePuts(indexMetaData, indexUpdates, state);
private void processRollback(RegionCoprocessorEnvironment env,
PhoenixIndexMetaData indexMetaData, byte[] txRollbackAttribute,
ResultScanner scanner,
PhoenixTransactionContext tx, Set mutableColumns,
Collection> indexUpdates,
Map mutations) throws IOException {
if (scanner != null) {
Result result;
// Loop through last committed row state plus all new rows associated with current transaction
// to generate point delete markers for all index rows that were added. We don't have Tephra
// manage index rows in change sets because we don't want to be hit with the additional
// memory hit and do not need to do conflict detection on index rows.
ColumnReference emptyColRef = new ColumnReference(indexMetaData.getIndexMaintainers().get(0).getDataEmptyKeyValueCF(), indexMetaData.getIndexMaintainers().get(0).getEmptyKeyValueQualifier());
while ((result = != null) {
Mutation m = mutations.remove(new ImmutableBytesPtr(result.getRow()));
// Sort by timestamp, type, cf, cq so we can process in time batches from oldest to newest
// (as if we're "replaying" them in time order).
List cells = result.listCells();
Collections.sort(cells, new Comparator() {
public int compare(Cell o1, Cell o2) {
int c =, o2.getTimestamp());
if (c != 0) return c;
c = o1.getTypeByte() - o2.getTypeByte();
if (c != 0) return c;
c = Bytes.compareTo(o1.getFamilyArray(), o1.getFamilyOffset(), o1.getFamilyLength(), o1.getFamilyArray(), o1.getFamilyOffset(), o1.getFamilyLength());
if (c != 0) return c;
return Bytes.compareTo(o1.getQualifierArray(), o1.getQualifierOffset(), o1.getQualifierLength(), o1.getQualifierArray(), o1.getQualifierOffset(), o1.getQualifierLength());
int i = 0;
int nCells = cells.size();
Result oldResult = null, newResult;
long readPtr = tx.getReadPointer();
do {
boolean hasPuts = false;
LinkedList singleTimeCells = Lists.newLinkedList();
long writePtr;
Cell cell = cells.get(i);
do {
hasPuts |= cell.getTypeByte() == KeyValue.Type.Put.getCode();
writePtr = cell.getTimestamp();
ListIterator it = singleTimeCells.listIterator();
do {
// Add at the beginning of the list to match the expected HBase
// newest to oldest sort order (which TxTableState relies on
// with the Result.getLatestColumnValue() calls). However, we
// still want to add Cells in the expected order for each time
// bound as otherwise we won't find it in our old state.
} while (++i < nCells && (cell=cells.get(i)).getTimestamp() == writePtr);
} while (i < nCells && cell.getTimestamp() <= readPtr);
// Generate point delete markers for the prior row deletion of the old index value.
// The write timestamp is the next timestamp, not the current timestamp,
// as the earliest cells are the current values for the row (and we don't
// want to delete the current row).
if (oldResult != null) {
TxTableState state = new TxTableState(env, mutableColumns, indexMetaData.getAttributes(), writePtr, m, emptyColRef, oldResult);
generateDeletes(indexMetaData, indexUpdates, txRollbackAttribute, state);
// Generate point delete markers for the new index value.
// If our time batch doesn't have Puts (i.e. we have only Deletes), then do not
// generate deletes. We would have generated the delete above based on the state
// of the previous row. The delete markers do not give us the state we need to
// delete.
if (hasPuts) {
newResult = Result.create(singleTimeCells);
// First row may represent the current state which we don't want to delete
if (writePtr > readPtr) {
TxTableState state = new TxTableState(env, mutableColumns, indexMetaData.getAttributes(), writePtr, m, emptyColRef, newResult);
generateDeletes(indexMetaData, indexUpdates, txRollbackAttribute, state);
oldResult = newResult;
} else {
oldResult = null;
} while (i < nCells);
private void generateDeletes(PhoenixIndexMetaData indexMetaData,
Collection> indexUpdates,
byte[] attribValue, TxTableState state) throws IOException {
Iterable deletes = codec.getIndexDeletes(state, indexMetaData);
for (IndexUpdate delete : deletes) {
if (delete.isValid()) {
delete.getUpdate().setAttribute(PhoenixTransactionContext.TX_ROLLBACK_ATTRIBUTE_KEY, attribValue);
indexUpdates.add(new Pair(delete.getUpdate(),delete.getTableName()));
private boolean generatePuts(
PhoenixIndexMetaData indexMetaData,
Collection> indexUpdates,
TxTableState state)
throws IOException {
Iterable puts = codec.getIndexUpserts(state, indexMetaData);
boolean validPut = false;
for (IndexUpdate put : puts) {
if (put.isValid()) {
indexUpdates.add(new Pair(put.getUpdate(),put.getTableName()));
validPut = true;
return validPut;
// TableState implementation handed to the index codec for transactional tables.
// It exposes one row's mutation together with a map of column values (valueMap) that the
// ValueGetter returned by getIndexUpdateState reads from.
// NOTE(review): several statements are missing from this listing (loop/branch bodies, generic
// type arguments) - confirm details against the real source.
private static class TxTableState implements TableState {
// The mutation for the row; also the source of the row key.
private final Mutation mutation;
private final long currentTimestamp;
private final RegionCoprocessorEnvironment env;
private final Map attributes;
// Cells iterated by applyMutation(); returned as-is by getPendingUpdate().
private final List pendingUpdates;
private final Set indexedColumns;
// Column reference -> value pointer; filled from the scanned Result and from Put cells.
private final Map valueMap;
// Builds a state with no prior row values: valueMap and pendingUpdates start empty, sized from
// the number of indexed columns, and the mutation's cells are then walked with a CellScanner.
// NOTE(review): the statement that consumes each scanned cell is missing from this listing;
// presumably it adds the cell to pendingUpdates - confirm.
private TxTableState(RegionCoprocessorEnvironment env, Set indexedColumns, Map attributes, long currentTimestamp, Mutation mutation) {
this.env = env;
this.currentTimestamp = currentTimestamp;
this.indexedColumns = indexedColumns;
this.attributes = attributes;
this.mutation = mutation;
int estimatedSize = indexedColumns.size();
this.valueMap = Maps.newHashMapWithExpectedSize(estimatedSize);
this.pendingUpdates = Lists.newArrayListWithExpectedSize(estimatedSize);
try {
CellScanner scanner = mutation.cellScanner();
while (scanner.advance()) {
Cell cell = scanner.current();
} catch (IOException e) {
throw new RuntimeException(e); // Impossible
// Builds a state seeded with prior row values: after delegating to the constructor above, the
// latest cell of every indexed column found in r is copied into valueMap.
// The emptyColRef parameter is not used by the visible code.
public TxTableState(RegionCoprocessorEnvironment env, Set indexedColumns, Map attributes, long currentTimestamp, Mutation m, ColumnReference emptyColRef, Result r) {
this(env, indexedColumns, attributes, currentTimestamp, m);
for (ColumnReference ref : indexedColumns) {
Cell cell = r.getColumnLatestCell(ref.getFamily(), ref.getQualifier());
if (cell != null) {
ImmutableBytesWritable ptr = new ImmutableBytesWritable();
ptr.set(cell.getValueArray(), cell.getValueOffset(), cell.getValueLength());
valueMap.put(ref, ptr);
public RegionCoprocessorEnvironment getEnvironment() {
return env;
public long getCurrentTimestamp() {
return currentTimestamp;
public Map getUpdateAttributes() {
return attributes;
// The current row key is the row of the wrapped mutation.
public byte[] getCurrentRowKey() {
return mutation.getRow();
// No column hints are provided; always returns an empty list.
public List extends IndexedColumnGroup> getIndexColumnHints() {
return Collections.emptyList();
// Folds the pending cells into valueMap, branching on cell type:
//   Delete / DeleteColumn            -> builds the column reference for the cell;
//   DeleteFamily / DeleteFamilyVersion -> visits every indexed column in the cell's family;
//   Put                              -> stores the cell value if the column is indexed.
// Any other cell type raises IllegalStateException.
// NOTE(review): the statements inside the two delete branches are missing from this listing;
// presumably they remove the affected entries from valueMap - confirm.
private void applyMutation() {
for (Cell cell : pendingUpdates) {
if (cell.getTypeByte() == KeyValue.Type.Delete.getCode() || cell.getTypeByte() == KeyValue.Type.DeleteColumn.getCode()) {
ColumnReference ref = new ColumnReference(cell.getFamilyArray(), cell.getFamilyOffset(), cell.getFamilyLength(), cell.getQualifierArray(), cell.getQualifierOffset(), cell.getQualifierLength());
} else if (cell.getTypeByte() == KeyValue.Type.DeleteFamily.getCode() || cell.getTypeByte() == KeyValue.Type.DeleteFamilyVersion.getCode()) {
for (ColumnReference ref : indexedColumns) {
if (ref.matchesFamily(cell.getFamilyArray(), cell.getFamilyOffset(), cell.getFamilyLength())) {
} else if (cell.getTypeByte() == KeyValue.Type.Put.getCode()){
ColumnReference ref = new ColumnReference(cell.getFamilyArray(), cell.getFamilyOffset(), cell.getFamilyLength(), cell.getQualifierArray(), cell.getQualifierOffset(), cell.getQualifierLength());
if (indexedColumns.contains(ref)) {
ImmutableBytesWritable ptr = new ImmutableBytesWritable();
ptr.set(cell.getValueArray(), cell.getValueOffset(), cell.getValueLength());
valueMap.put(ref, ptr);
} else {
throw new IllegalStateException("Unexpected mutation type for " + cell);
public Collection getPendingUpdate() {
return pendingUpdates;
// Returns a ValueGetter backed by valueMap, paired with a fresh IndexUpdate that tracks the
// requested columns. The ts argument of getLatestValue, and the ignoreNewerMutations,
// returnNullScannerIfRowNotFound and indexMetaData parameters, are not used by the visible code.
public Pair getIndexUpdateState(Collection extends ColumnReference> indexedColumns, boolean ignoreNewerMutations, boolean returnNullScannerIfRowNotFound, IndexMetaData indexMetaData)
throws IOException {
// TODO: creating these objects over and over again is wasteful
ColumnTracker tracker = new ColumnTracker(indexedColumns);
ValueGetter getter = new ValueGetter() {
public ImmutableBytesWritable getLatestValue(ColumnReference ref, long ts) throws IOException {
return valueMap.get(ref);
public byte[] getRowKey() {
return mutation.getRow();
Pair pair = new Pair(getter, new IndexUpdate(tracker));
return pair;
| | | |
© 2015 - 2025 Weber Informatics LLC | Privacy Policy