
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.phoenix.coprocessor;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.CellComparator;
import org.apache.hadoop.hbase.CellUtil;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.KeepDeletedCells;
import org.apache.hadoop.hbase.client.ColumnFamilyDescriptor;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.coprocessor.RegionCoprocessorEnvironment;
import org.apache.hadoop.hbase.regionserver.InternalScanner;
import org.apache.hadoop.hbase.regionserver.Region;
import org.apache.hadoop.hbase.regionserver.RegionScanner;
import org.apache.hadoop.hbase.regionserver.ScannerContext;
import org.apache.hadoop.hbase.regionserver.Store;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.phoenix.util.EnvironmentEdgeManager;
import org.apache.phoenix.util.ScanUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.phoenix.thirdparty.com.google.common.annotations.VisibleForTesting;
import static org.apache.phoenix.query.QueryConstants.LOCAL_INDEX_COLUMN_FAMILY_PREFIX;
/**
* The store scanner that implements compaction for Phoenix. The Phoenix coprocessor overrides the
* scan options so that the HBase store scanner retains all cells during compaction and flushes.
* Then this store scanner decides which cells to retain. This is required to ensure rows do not
* expire partially and to preserve all cells within the Phoenix max lookback window.
*
* The compaction process is optimized for Phoenix. This optimization assumes that
* . A given delete family or delete family version marker is inserted into all column families
* . A given delete family version marker always deletes a full version of a row. Please note
* that delete family version markers are used only on index tables where mutations are always
* full row mutations.
*
* During major compaction, minor compaction and memstore flush, all cells (and delete markers)
* that are visible through the max lookback window are retained. Outside the max lookback window,
* (1) extra put cell versions, (2) delete markers and deleted cells that are not supposed to be
* kept (per the KeepDeletedCells option), and (3) expired cells are removed during major compaction.
* During flushes and minor compaction, expired cells and delete markers are not removed; however,
* deleted cells that are not supposed to be kept (per the KeepDeletedCells option) and extra put
* cell versions are removed.
*
*/
public class CompactionScanner implements InternalScanner {
private static final Logger LOGGER = LoggerFactory.getLogger(CompactionScanner.class);
public static final String SEPARATOR = ":";
private final InternalScanner storeScanner;
private final Region region;
private final Store store;
private final Configuration config;
private final RegionCoprocessorEnvironment env;
private long maxLookbackWindowStart;
private long ttlWindowStart;
private long ttlInMillis;
private final long maxLookbackInMillis;
private int minVersion;
private int maxVersion;
private final boolean emptyCFStore;
private final boolean localIndex;
private final int familyCount;
private KeepDeletedCells keepDeletedCells;
private long compactionTime;
private final byte[] emptyCF;
private final byte[] emptyCQ;
private final byte[] storeColumnFamily;
private final String tableName;
private final String columnFamilyName;
private static Map<String, Long> maxLookbackMap = new ConcurrentHashMap<>();
private PhoenixLevelRowCompactor phoenixLevelRowCompactor;
private HBaseLevelRowCompactor hBaseLevelRowCompactor;
private boolean major;
private long inputCellCount = 0;
private long outputCellCount = 0;
private boolean phoenixLevelOnly = false;
// Only for forcing minor compaction while testing
private static boolean forceMinorCompaction = false;
public CompactionScanner(RegionCoprocessorEnvironment env,
Store store,
InternalScanner storeScanner,
long maxLookbackInMillis,
byte[] emptyCF,
byte[] emptyCQ,
boolean major,
boolean keepDeleted) {
this.storeScanner = storeScanner;
this.region = env.getRegion();
this.store = store;
this.env = env;
// The empty column family and qualifier are always needed to compute which empty cells to retain,
// even during minor compactions. If the required empty cells are not retained during
// minor compactions, we run the risk of partial row expiry on the next major compaction.
this.emptyCF = emptyCF;
this.emptyCQ = emptyCQ;
this.config = env.getConfiguration();
compactionTime = EnvironmentEdgeManager.currentTimeMillis();
columnFamilyName = store.getColumnFamilyName();
storeColumnFamily = columnFamilyName.getBytes();
tableName = region.getRegionInfo().getTable().getNameAsString();
Long overriddenMaxLookback = maxLookbackMap.get(tableName + SEPARATOR + columnFamilyName);
this.maxLookbackInMillis = overriddenMaxLookback == null ?
maxLookbackInMillis : Math.max(maxLookbackInMillis, overriddenMaxLookback);
// The oldest scn is current time - maxLookbackInMillis. Phoenix sets the scan time range
// for scn queries to [0, scn). This means that the max lookback window size should be
// maxLookbackInMillis + 1 so that the oldest scn does not return an empty row
this.maxLookbackWindowStart = this.maxLookbackInMillis == 0 ?
compactionTime : compactionTime - (this.maxLookbackInMillis + 1);
ColumnFamilyDescriptor cfd = store.getColumnFamilyDescriptor();
this.major = major && ! forceMinorCompaction;
int ttl = this.major ? cfd.getTimeToLive() : HConstants.FOREVER;
ttlInMillis = ((long) ttl) * 1000;
this.ttlWindowStart = ttl == HConstants.FOREVER ? 1 : compactionTime - ttlInMillis;
this.maxLookbackWindowStart = Math.max(ttlWindowStart, maxLookbackWindowStart);
this.minVersion = cfd.getMinVersions();
this.maxVersion = cfd.getMaxVersions();
this.keepDeletedCells = keepDeleted ? KeepDeletedCells.TTL : cfd.getKeepDeletedCells();
familyCount = region.getTableDescriptor().getColumnFamilies().length;
localIndex = columnFamilyName.startsWith(LOCAL_INDEX_COLUMN_FAMILY_PREFIX);
emptyCFStore = familyCount == 1 || columnFamilyName.equals(Bytes.toString(emptyCF))
|| localIndex;
phoenixLevelRowCompactor = new PhoenixLevelRowCompactor();
hBaseLevelRowCompactor = new HBaseLevelRowCompactor();
LOGGER.info("Starting CompactionScanner for table " + tableName + " store "
+ columnFamilyName + (this.major ? " major " : " not major ") + "compaction ttl "
+ ttlInMillis + "ms " + "max lookback " + this.maxLookbackInMillis + "ms");
}
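// Illustrative example (hypothetical numbers, not from the source): with compactionTime = 2_000_000,
// maxLookbackInMillis = 300_000 and a column family TTL of 900 seconds, the constructor computes
// maxLookbackWindowStart = 2_000_000 - (300_000 + 1) = 1_699_999 and
// ttlWindowStart = 2_000_000 - 900_000 = 1_100_000. Cells newer than maxLookbackWindowStart are
// always retained; cells between the two window starts are subject to the retention rules below;
// during major compaction, cells older than ttlWindowStart become candidates for removal.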
@VisibleForTesting
public static void setForceMinorCompaction(boolean doMinorCompaction) {
forceMinorCompaction = doMinorCompaction;
}
@VisibleForTesting
public static boolean getForceMinorCompaction() {
return forceMinorCompaction;
}
/**
* Any coprocessors within a JVM can extend the max lookback window for a column family
* by calling this static method.
*/
public static void overrideMaxLookback(String tableName, String columnFamilyName,
long maxLookbackInMillis) {
if (tableName == null || columnFamilyName == null) {
return;
}
Long old = maxLookbackMap.putIfAbsent(tableName + SEPARATOR + columnFamilyName,
maxLookbackInMillis);
if (old != null) {
maxLookbackMap.put(tableName + SEPARATOR + columnFamilyName, maxLookbackInMillis);
}
}
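// Illustrative usage (hypothetical table and family names): another coprocessor running in the
// same JVM could call
//   CompactionScanner.overrideMaxLookback("MY_SCHEMA.MY_TABLE", "0", 24L * 60 * 60 * 1000);
// and the constructor above would then use the larger of the configured max lookback and this
// override for that table and column family.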
public static long getMaxLookbackInMillis(String tableName, String columnFamilyName,
long maxLookbackInMillis) {
if (tableName == null || columnFamilyName == null) {
return maxLookbackInMillis;
}
Long value = maxLookbackMap.get(tableName + CompactionScanner.SEPARATOR + columnFamilyName);
return value == null
? maxLookbackInMillis
: maxLookbackMap.get(tableName + CompactionScanner.SEPARATOR + columnFamilyName);
}
static class CellTimeComparator implements Comparator<Cell> {
public static final CellTimeComparator COMPARATOR = new CellTimeComparator();
@Override public int compare(Cell o1, Cell o2) {
long ts1 = o1.getTimestamp();
long ts2 = o2.getTimestamp();
if (ts1 == ts2) return 0;
if (ts1 > ts2) return -1;
return 1;
}
@Override public boolean equals(Object obj) {
return false;
}
}
private void printRow(List<Cell> result, String title, boolean sort) {
List<Cell> row;
if (sort) {
row = new ArrayList<>(result);
Collections.sort(row, CellTimeComparator.COMPARATOR);
} else {
row = result;
}
System.out.println("---- " + title + " ----");
System.out.println((major ? "Major " : "Not major ")
+ "compaction time: " + compactionTime);
System.out.println("Max lookback window start time: " + maxLookbackWindowStart);
System.out.println("Max lookback in ms: " + maxLookbackInMillis);
System.out.println("TTL in ms: " + ttlInMillis);
boolean maxLookbackLine = false;
boolean ttlLine = false;
for (Cell cell : row) {
if (!maxLookbackLine && cell.getTimestamp() < maxLookbackWindowStart) {
System.out.println("-----> Max lookback window start time: " + maxLookbackWindowStart);
maxLookbackLine = true;
} else if (!ttlLine && cell.getTimestamp() < ttlWindowStart) {
System.out.println("-----> TTL window start time: " + ttlWindowStart);
ttlLine = true;
}
System.out.println(cell);
}
}
@Override
public boolean next(List<Cell> result) throws IOException {
boolean hasMore = storeScanner.next(result);
inputCellCount += result.size();
if (!result.isEmpty()) {
// printRow(result, "Input for " + tableName + " " + columnFamilyName, true); // This is for debugging
phoenixLevelRowCompactor.compact(result, false);
outputCellCount += result.size();
// printRow(result, "Output for " + tableName + " " + columnFamilyName, true); // This is for debugging
}
return hasMore;
}
@Override
public boolean next(List<Cell> result, ScannerContext scannerContext) throws IOException {
return next(result);
}
@Override
public void close() throws IOException {
LOGGER.info("Closing CompactionScanner for table " + tableName + " store "
+ columnFamilyName + (major ? " major " : " not major ") + "compaction retained "
+ outputCellCount + " of " + inputCellCount + " cells"
+ (phoenixLevelOnly ? " phoenix level only" : ""));
if (forceMinorCompaction) {
forceMinorCompaction = false;
}
storeScanner.close();
}
/**
* The context for a given row during compaction. A row may have multiple compaction row
* versions. CompactionScanner uses the same row context for these versions.
*/
static class RowContext {
Cell familyDeleteMarker = null;
Cell familyVersionDeleteMarker = null;
List<Cell> columnDeleteMarkers = new ArrayList<>();
int version = 0;
long maxTimestamp;
long minTimestamp;
private void init() {
familyDeleteMarker = null;
familyVersionDeleteMarker = null;
columnDeleteMarkers.clear();
version = 0;
}
private void addColumnDeleteMarker(Cell deleteMarker) {
if (columnDeleteMarkers.isEmpty()) {
columnDeleteMarkers.add(deleteMarker);
return;
}
int i = 0;
// Replace the existing delete marker for the same column
for (Cell cell : columnDeleteMarkers) {
if (cell.getType() == deleteMarker.getType() &&
CellUtil.matchingColumn(cell, deleteMarker)) {
columnDeleteMarkers.remove(i);
break;
}
i++;
}
columnDeleteMarkers.add(deleteMarker);
}
private void retainFamilyDeleteMarker(List<Cell> retainedCells) {
if (familyVersionDeleteMarker != null) {
retainedCells.add(familyVersionDeleteMarker);
// Set it to null so it will be used only once
familyVersionDeleteMarker = null;
} else {
// The same delete family marker may be retained multiple times. Duplicates will be
// removed later
retainedCells.add(familyDeleteMarker);
}
}
/**
* Based on the column delete markers decide if the cells should be retained. If a
* deleted cell is retained, the delete marker is also retained.
*/
private void retainCell(Cell cell, List<Cell> retainedCells,
KeepDeletedCells keepDeletedCells, long ttlWindowStart) {
int i = 0;
for (Cell dm : columnDeleteMarkers) {
if (cell.getTimestamp() > dm.getTimestamp()) {
continue;
}
if ((CellUtil.matchingFamily(cell, dm)) &&
CellUtil.matchingQualifier(cell, dm)) {
if (dm.getType() == Cell.Type.Delete) {
if (cell.getTimestamp() == dm.getTimestamp()) {
// Delete is for deleting a specific cell version. Thus, it can be used
// to delete only one cell.
columnDeleteMarkers.remove(i);
} else {
continue;
}
}
if (maxTimestamp >= ttlWindowStart) {
// Inside the TTL window
if (keepDeletedCells != KeepDeletedCells.FALSE ) {
retainedCells.add(cell);
retainedCells.add(dm);
}
} else if (keepDeletedCells == KeepDeletedCells.TTL &&
dm.getTimestamp() >= ttlWindowStart) {
retainedCells.add(cell);
retainedCells.add(dm);
}
return;
}
i++;
}
// No delete marker for this cell
retainedCells.add(cell);
}
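// Illustrative note on the marker types handled above (hypothetical timestamps): a Delete marker
// at ts = 50 masks only the put at exactly ts = 50 and is removed from columnDeleteMarkers after
// one use, whereas a DeleteColumn marker at ts = 50 masks every put in that column with ts <= 50.
// Whether a masked put and its marker are retained depends on KeepDeletedCells and on whether the
// row version falls inside the TTL window.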
/**
* This method finds the maximum and minimum timestamps of the cells of the next row
* version. Cells are organized into columns based on the pair of family name and column
* qualifier. This means that the delete family markers for a column family will have their
* own column. However, the delete column markers will be packed with the put cells. The cells
* within a column are ordered by descending timestamp.
*/
private void getNextRowVersionTimestamps(LinkedList<LinkedList<Cell>> columns,
byte[] columnFamily) {
maxTimestamp = 0;
minTimestamp = Long.MAX_VALUE;
Cell firstCell;
LinkedList<Cell> deleteColumn = null;
long ts;
// The next row version is formed by the first cell of each column. Similarly, the min
// and max timestamps of the cells of a row version are determined by looking at just the
// first cell of each column
for (LinkedList<Cell> column : columns) {
firstCell = column.getFirst();
ts = firstCell.getTimestamp();
if ((firstCell.getType() == Cell.Type.DeleteFamily ||
firstCell.getType() == Cell.Type.DeleteFamilyVersion) &&
CellUtil.matchingFamily(firstCell, columnFamily)) {
deleteColumn = column;
}
if (maxTimestamp < ts) {
maxTimestamp = ts;
}
if (minTimestamp > ts) {
minTimestamp = ts;
}
}
if (deleteColumn != null) {
// A row version cannot cross a family delete marker by definition. This means the
// min timestamp cannot be lower than the delete marker's timestamp
for (Cell cell : deleteColumn) {
ts = cell.getTimestamp();
if (ts < maxTimestamp) {
minTimestamp = ts + 1;
break;
}
}
}
}
/**
* This is used for Phoenix level compaction
*/
private void getNextRowVersionTimestamps(List<Cell> row, byte[] columnFamily) {
maxTimestamp = 0;
minTimestamp = Long.MAX_VALUE;
Cell deleteFamily = null;
long ts;
// The next row version is formed by the first cell of each column. Similarly, the min
// and max timestamps of the cells of a row version are determined by looking at just the
// first cell of each column
for (Cell cell : row) {
ts = cell.getTimestamp();
if ((cell.getType() == Cell.Type.DeleteFamily ||
cell.getType() == Cell.Type.DeleteFamilyVersion) &&
CellUtil.matchingFamily(cell, columnFamily)) {
deleteFamily = cell;
}
if (maxTimestamp < ts) {
maxTimestamp = ts;
}
if (minTimestamp > ts) {
minTimestamp = ts;
}
}
if (deleteFamily != null) {
// A row version cannot cross a family delete marker by definition. This means the
// min timestamp cannot be lower than the delete marker's timestamp
ts = deleteFamily.getTimestamp();
if (ts < maxTimestamp) {
minTimestamp = ts + 1;
}
}
}
}
/**
* HBaseLevelRowCompactor ensures that the cells of a given row are retained according to the
* HBase data retention rules.
*
*/
class HBaseLevelRowCompactor {
private RowContext rowContext = new RowContext();
private CompactionRowVersion rowVersion = new CompactionRowVersion();
/**
* A compaction row version includes the latest put cell versions from each column such that
* the cell versions do not cross delete family markers. In other words, the compaction row
* versions are built from cell versions that are all either before or after the next delete
* family or delete family version maker if family delete markers exist. Also, when the cell
* timestamps are ordered for a given row version, the difference between two subsequent
* timestamps has to be less than the ttl value. This is taken care before calling
* HBaseLevelRowCompactor#compact().
*
* Compaction row versions are disjoint sets. A compaction row version does not share a cell
* version with the next compaction row version. A compaction row version includes at most
* one cell version from a column.
*
* After creating the first compaction row version, we form the next compaction row version
* from the remaining cell versions.
*
* Compaction row versions are used for compaction purposes to efficiently determine which
* cell versions to retain based on the HBase data retention parameters.
*/
class CompactionRowVersion {
// Cells included in the row version
List<Cell> cells = new ArrayList<>();
// The timestamp of the row version
long ts = 0;
// The version of a row version. It is the minimum of the versions of the cells included
// in the row version
int version = 0;
private void init() {
cells.clear();
}
@Override
public String toString() {
StringBuilder output = new StringBuilder();
output.append("Cell count: " + cells.size() + "\n");
for (Cell cell : cells) {
output.append(cell + "\n");
}
output.append("ts:" + ts + " v:" + version);
return output.toString();
}
}
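// Illustrative sketch (hypothetical timestamps): given the columns
//   colA: put@100, put@80    colB: put@95    family column: DeleteFamily@60
// the first compaction row version is {colA@100, colB@95} with ts = 100 and version index 0, and
// the second is {colA@80} with version index 1. Row versions are formed only from cells newer
// than the delete family marker; the marker itself is then recorded in the RowContext.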
/**
* Decide if compaction row versions inside the TTL window should be retained. The
* versions are retained if one of the following conditions holds
* 1. The compaction row version is alive and its version is less than VERSIONS
* 2. The compaction row version is deleted and KeepDeletedCells is not FALSE
*
*/
private void retainInsideTTLWindow(CompactionRowVersion rowVersion, RowContext rowContext,
List<Cell> retainedCells) {
if (rowContext.familyDeleteMarker == null
&& rowContext.familyVersionDeleteMarker == null) {
// The compaction row version is alive
if (rowVersion.version < maxVersion) {
// Rule 1
retainCells(rowVersion, rowContext, retainedCells);
}
} else {
// Deleted
if (rowVersion.version < maxVersion && keepDeletedCells != KeepDeletedCells.FALSE) {
// Retain based on rule 2
retainCells(rowVersion, rowContext, retainedCells);
rowContext.retainFamilyDeleteMarker(retainedCells);
}
}
}
/**
* Decide if compaction row versions outside the TTL window should be retained. The
* versions are retained if one of the following conditions holds
*
* 1. Live row versions less than MIN_VERSIONS are retained
* 2. Delete row versions whose delete markers are inside the TTL window and
* KeepDeletedCells is TTL are retained
*/
private void retainOutsideTTLWindow(CompactionRowVersion rowVersion, RowContext rowContext,
List<Cell> retainedCells) {
if (rowContext.familyDeleteMarker == null
&& rowContext.familyVersionDeleteMarker == null) {
// Live compaction row version
if (rowVersion.version < minVersion) {
// Rule 1
retainCells(rowVersion, rowContext, retainedCells);
}
} else {
// Deleted compaction row version
if (keepDeletedCells == KeepDeletedCells.TTL
&& rowContext.familyDeleteMarker != null
&& rowContext.familyDeleteMarker.getTimestamp() > ttlWindowStart) {
// Rule 2
retainCells(rowVersion, rowContext, retainedCells);
rowContext.retainFamilyDeleteMarker(retainedCells);
}
}
}
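// Illustrative sketch of the two rules above (hypothetical schema settings): with VERSIONS = 3,
// MIN_VERSIONS = 1 and KeepDeletedCells = FALSE, live row versions with version index 0, 1 or 2
// are retained inside the TTL window, while outside the window only the version with index 0
// survives and deleted row versions are dropped.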
private void retainCells(CompactionRowVersion rowVersion, RowContext rowContext,
List<Cell> retainedCells) {
if (rowContext.columnDeleteMarkers == null) {
retainedCells.addAll(rowVersion.cells);
return;
}
for (Cell cell : rowVersion.cells) {
rowContext.retainCell(cell, retainedCells, keepDeletedCells, ttlWindowStart);
}
}
/**
* Form the next compaction row version by picking (removing) the first cell from each
* column. Put cells are used to form the next compaction row version. Delete markers
* are added to the row context, which is processed to decide which row versions
* or cell versions to delete.
*/
private void formNextCompactionRowVersion(LinkedList<LinkedList<Cell>> columns,
RowContext rowContext, List<Cell> retainedCells) {
rowVersion.init();
rowContext.getNextRowVersionTimestamps(columns, storeColumnFamily);
rowVersion.ts = rowContext.maxTimestamp;
for (LinkedList<Cell> column : columns) {
Cell cell = column.getFirst();
if (column.getFirst().getTimestamp() < rowContext.minTimestamp) {
continue;
}
if (cell.getType() == Cell.Type.DeleteFamily) {
if (cell.getTimestamp() >= rowContext.maxTimestamp) {
rowContext.familyDeleteMarker = cell;
column.removeFirst();
break;
}
continue;
}
else if (cell.getType() == Cell.Type.DeleteFamilyVersion) {
if (cell.getTimestamp() == rowVersion.ts) {
rowContext.familyVersionDeleteMarker = cell;
column.removeFirst();
break;
}
continue;
}
column.removeFirst();
if (cell.getType() == Cell.Type.DeleteColumn ||
cell.getType() == Cell.Type.Delete) {
rowContext.addColumnDeleteMarker(cell);
continue;
}
rowVersion.cells.add(cell);
}
if (rowVersion.cells.isEmpty()) {
return;
}
rowVersion.version = rowContext.version++;
if (rowVersion.ts >= ttlWindowStart) {
retainInsideTTLWindow(rowVersion, rowContext, retainedCells);
} else {
retainOutsideTTLWindow(rowVersion, rowContext, retainedCells);
}
}
private void formCompactionRowVersions(LinkedList<LinkedList<Cell>> columns,
List<Cell> result) {
rowContext.init();
while (!columns.isEmpty()) {
formNextCompactionRowVersion(columns, rowContext, result);
// Remove the columns that are empty
Iterator<LinkedList<Cell>> iterator = columns.iterator();
while (iterator.hasNext()) {
LinkedList<Cell> column = iterator.next();
if (column.isEmpty()) {
iterator.remove();
}
}
}
}
/**
* Group the cells, which are ordered lexicographically, into columns based on
* the pair of family name and column qualifier. Delete family markers, which have an
* empty qualifier, end up in their own column.
*/
private void formColumns(List<Cell> result, LinkedList<LinkedList<Cell>> columns) {
Cell currentColumnCell = null;
LinkedList<Cell> currentColumn = null;
for (Cell cell : result) {
if (currentColumnCell == null) {
currentColumn = new LinkedList<>();
currentColumnCell = cell;
currentColumn.add(cell);
} else if (!CellUtil.matchingColumn(cell, currentColumnCell)) {
columns.add(currentColumn);
currentColumn = new LinkedList<>();
currentColumnCell = cell;
currentColumn.add(cell);
} else {
currentColumn.add(cell);
}
}
if (currentColumn != null) {
columns.add(currentColumn);
}
}
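// Illustrative sketch (hypothetical cells): an input row ordered as
//   [DeleteFamily cf@95, cf:q1@100, cf:q1@90, cf:q2@100]
// is grouped into three columns: the delete family markers (empty qualifier) form their own
// column, followed by one column for q1 and one for q2, each keeping its descending-timestamp
// order.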
/**
* Compacts a single row at the HBase level. The result parameter is the input row and
* modified to be the output of the compaction.
*/
private void compact(List<Cell> result) {
if (result.isEmpty()) {
return;
}
LinkedList<LinkedList<Cell>> columns = new LinkedList<>();
formColumns(result, columns);
result.clear();
formCompactionRowVersions(columns, result);
}
}
/**
* PhoenixLevelRowCompactor ensures that the cells of the latest row version and the
* row versions that are visible through the max lookback window are retained including delete
* markers placed after these cells. This is the complete set of cells that Phoenix
* needs for its queries. Beyond these cells, HBase retention rules may require more
* cells to be retained. These cells are identified by the HBase level compaction implemented
* by HBaseLevelRowCompactor.
*
*/
class PhoenixLevelRowCompactor {
private RowContext rowContext = new RowContext();
List<Cell> lastRowVersion = new ArrayList<>();
List<Cell> emptyColumn = new ArrayList<>();
List<Cell> phoenixResult = new ArrayList<>();
List<Cell> trimmedRow = new ArrayList<>();
List<Cell> trimmedEmptyColumn = new ArrayList<>();
/**
* The cells of the row (i.e., result) read from the HBase store are lexicographically ordered
* using the key part of the cells, which includes row, family, qualifier,
* timestamp and type. The cells belonging to a column are ordered from the latest to
* the oldest. The method leverages this ordering and groups the cells into their columns
* based on the pair of family name and column qualifier.
*
* The cells within the max lookback window, except the ones at the lower edge of the
* max lookback window (the last row of the max lookback window), are retained immediately.
*
* This method also returns the remaining cells (outside the max lookback window) of
* the empty column.
*/
private void getLastRowVersionInMaxLookbackWindow(List<Cell> result,
List<Cell> lastRowVersion, List<Cell> retainedCells, List<Cell> emptyColumn) {
Cell currentColumnCell = null;
boolean isEmptyColumn = false;
for (Cell cell : result) {
if (cell.getTimestamp() > maxLookbackWindowStart) {
retainedCells.add(cell);
continue;
}
if (!major && cell.getType() != Cell.Type.Put) {
retainedCells.add(cell);
}
if (currentColumnCell == null ||
!CellUtil.matchingColumn(cell, currentColumnCell)) {
currentColumnCell = cell;
isEmptyColumn = ScanUtil.isEmptyColumn(cell, emptyCF, emptyCQ);
if ((cell.getType() != Cell.Type.Delete
&& cell.getType() != Cell.Type.DeleteColumn)
|| cell.getTimestamp() == maxLookbackWindowStart) {
// Include only delete family markers and put cells or the
// cells at start edge of max lookback window
lastRowVersion.add(cell);
}
} else if (isEmptyColumn) {
// We only need to keep one cell for every column for the last row version.
// So here we just form the empty column beyond the last row version.
// Empty column needs to be collected during minor compactions also
// else we will see partial row expiry.
emptyColumn.add(cell);
}
}
}
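// Illustrative sketch (hypothetical timestamps): with maxLookbackWindowStart = 100, cells newer
// than 100 are retained as-is; for cells at ts <= 100, only the newest cell of each column is
// collected into lastRowVersion, while older cells of the empty column are collected separately
// for the gap analysis performed later.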
/**
* Close the gap between the two timestamps, max and min, with the minimum number of cells
* from the input list such that the timestamp difference between two consecutive cells is
* not more than ttl. The cells that are used to close the gap are added to the output
* list. The input list is a list of empty cells in decreasing order of timestamp.
*/
private void closeGap(long max, long min, List<Cell> input, List<Cell> output) {
int previous = -1;
long ts;
for (Cell cell : input) {
ts = cell.getTimestamp();
if (ts >= max) {
previous++;
continue;
}
if (previous == -1 && max - ts > ttlInMillis) {
// This means even the first empty cell in the input list, which is closest to the
// max timestamp, can't close the gap. So the gap can't be closed by empty cells at all.
break;
}
if (max - ts > ttlInMillis) {
max = input.get(previous).getTimestamp();
output.add(input.remove(previous));
if (max - min > ttlInMillis) {
closeGap(max, min, input, output);
}
return;
}
previous++;
}
if (previous > -1 && max - min > ttlInMillis) {
// This covers the case where we need to retain the last empty cell in the input list. The
// close gap algorithm is such that if we need to retain the i-th empty cell in the input
// list, we only find that out while iterating over the (i+1)-th empty cell. So, to retain
// the last empty cell in the input list we need to check the min timestamp.
output.add(input.remove(previous));
}
}
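// Illustrative sketch (hypothetical numbers): with ttl = 10, max = 100, min = 85 and empty cells
// at timestamps [97, 92, 88], only the cell at 92 is moved to the output: the gaps 100 -> 92 and
// 92 -> 85 are both within ttl, so the cells at 97 and 88 are not needed to close the gap.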
/**
* Retains the minimum number of empty cells needed during minor compaction to avoid data loss or
* partial row expiry on the next major compaction.
* @param emptyColumn Empty column cells in decreasing order of timestamp.
* @param retainedCells Cells to be retained.
*/
private void retainEmptyCellsInMinorCompaction(List<Cell> emptyColumn, List<Cell> retainedCells) {
if (emptyColumn.isEmpty()) {
return;
}
else if (familyCount == 1 || localIndex) {
// We are compacting the empty column family store and it is the only column family, so
// we just need to retain empty cells down to the min timestamp of the last row version.
// We can't minimize the retained empty cells further as we don't know the actual TTL
// during minor compactions.
long minRowTimestamp = rowContext.minTimestamp;
for (Cell emptyCell: emptyColumn) {
if (emptyCell.getTimestamp() > minRowTimestamp) {
retainedCells.add(emptyCell);
}
}
return;
}
// For multiple column families, without doing a region level scan we can't put a bound on the
// timestamp down to which we should retain the empty cells. The empty cells may be needed to
// close the gap between an empty column family cell and a non-empty column family cell.
retainedCells.addAll(emptyColumn);
}
/**
* Retain the last row version visible through the max lookback window
*/
private void retainCellsOfLastRowVersion(List<Cell> lastRow,
List<Cell> emptyColumn, List<Cell> retainedCells) {
if (lastRow.isEmpty()) {
return;
}
rowContext.init();
rowContext.getNextRowVersionTimestamps(lastRow, storeColumnFamily);
Cell firstCell = lastRow.get(0);
if (firstCell.getType() == Cell.Type.DeleteFamily ||
firstCell.getType() == Cell.Type.DeleteFamilyVersion) {
if (firstCell.getTimestamp() >= rowContext.maxTimestamp) {
// This means that the row version outside the max lookback window is
// deleted and thus should not be visible to the scn queries
return;
}
}
if (major && compactionTime - rowContext.maxTimestamp > maxLookbackInMillis + ttlInMillis) {
// Only do this check for major compaction as for minor compactions we don't expire cells.
// The row version should not be visible via the max lookback window. Nothing to do
return;
}
retainedCells.addAll(lastRow);
// If the gap between two back to back mutations is more than ttl then the older
// mutation will be considered expired and masked. If the length of the time range of
// a row version is not more than ttl, then we know the cells covered by the row
// version are not apart from each other more than ttl and will not be masked.
if (major && rowContext.maxTimestamp - rowContext.minTimestamp <= ttlInMillis) {
// Skip this check for minor compactions as we don't compute actual TTL for
// minor compactions and don't expire cells.
return;
}
// The quick time range check did not pass. We need to get at least one empty cell to cover
// the gap so that the row version will not be masked by PhoenixTTLRegionScanner.
if (emptyColumn.isEmpty()) {
return;
}
else if (! major) {
retainEmptyCellsInMinorCompaction(emptyColumn, retainedCells);
return;
}
int size = lastRow.size();
long tsArray[] = new long[size];
int i = 0;
for (Cell cell : lastRow) {
tsArray[i++] = cell.getTimestamp();
}
Arrays.sort(tsArray);
for (i = size - 1; i > 0; i--) {
if (tsArray[i] - tsArray[i - 1] > ttlInMillis) {
closeGap(tsArray[i], tsArray[i - 1], emptyColumn, retainedCells);
}
}
}
/**
* The retained cells include the cells that are visible through the max lookback
* window and the additional empty column cells that are needed to close large time
* gaps between the cells of the last row version.
*/
private boolean retainCellsForMaxLookback(List<Cell> result, boolean regionLevel,
List<Cell> retainedCells) {
lastRowVersion.clear();
emptyColumn.clear();
getLastRowVersionInMaxLookbackWindow(result, lastRowVersion, retainedCells,
emptyColumn);
if (lastRowVersion.isEmpty()) {
return true;
}
if (!major) {
// We do not expire cells for minor compaction and memstore flushes
retainCellsOfLastRowVersion(lastRowVersion, emptyColumn, retainedCells);
return true;
}
long maxTimestamp = 0;
long minTimestamp = Long.MAX_VALUE;
long ts;
for (Cell cell : lastRowVersion) {
ts = cell.getTimestamp();
if (ts > maxTimestamp) {
maxTimestamp = ts;
}
ts = cell.getTimestamp();
if (ts < minTimestamp) {
minTimestamp = ts;
}
}
if (compactionTime - maxTimestamp > maxLookbackInMillis + ttlInMillis) {
if (!emptyCFStore && !regionLevel) {
// The row version is more than maxLookbackInMillis + ttl old. We cannot decide
// if we should retain it with the store level compaction when the current
// store is not the empty column family store.
return false;
}
return true;
}
// If the time gap between two back to back mutations is more than ttl then we know
// that the row is expired within the time gap.
if (maxTimestamp - minTimestamp > ttlInMillis) {
if ((familyCount > 1 && !regionLevel && !localIndex)) {
// When there is more than one column family for a given table and a row
// version constructed at the store level covers a time span larger than ttl,
// we need region level compaction to see if the other stores have more cells
// for any of these large time gaps. A store level compaction may incorrectly
// remove some cells due to a large time gap which may not be there at the
// region level.
return false;
}
// We either have one column family or are doing region level compaction. In both
// cases, we can safely trim the cells beyond the first time gap larger than ttl.
// Here we are interested in the gaps between the cells of the last row version
// and thus we need to examine the gaps between these cells and the empty column.
// Please note that the empty column is always updated for every mutation, so we
// just need the empty column cells for the gap analysis.
int size = lastRowVersion.size();
size += emptyColumn.size();
long tsArray[] = new long[size];
int i = 0;
for (Cell cell : lastRowVersion) {
tsArray[i++] = cell.getTimestamp();
}
for (Cell cell : emptyColumn) {
tsArray[i++] = cell.getTimestamp();
}
Arrays.sort(tsArray);
boolean gapFound = false;
// Since timestamps are sorted in ascending order, traverse them in reverse order
for (i = size - 1; i > 0; i--) {
if (tsArray[i] - tsArray[i - 1] > ttlInMillis) {
minTimestamp = tsArray[i];
gapFound = true;
break;
}
}
if (gapFound) {
trimmedRow.clear();
for (Cell cell : lastRowVersion) {
if (cell.getTimestamp() >= minTimestamp) {
trimmedRow.add(cell);
}
}
lastRowVersion = trimmedRow;
trimmedEmptyColumn.clear();
for (Cell cell : emptyColumn) {
if (cell.getTimestamp() >= minTimestamp) {
trimmedEmptyColumn.add(cell);
}
}
emptyColumn = trimmedEmptyColumn;
}
}
retainCellsOfLastRowVersion(lastRowVersion, emptyColumn, retainedCells);
return true;
}
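// Illustrative note on the false return above (hypothetical scenario): with two column families,
// when this store is not the empty column family store and the newest cell of the last row
// version is older than maxLookbackInMillis + ttl, a store level scan alone cannot tell whether
// the row is still live in the other store, so this method returns false and compact() falls
// back to compactRegionLevel().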
private void removeDuplicates(List<Cell> input, List<Cell> output) {
Cell previousCell = null;
for (Cell cell : input) {
if (previousCell == null ||
cell.getTimestamp() != previousCell.getTimestamp() ||
cell.getType() != previousCell.getType() ||
!CellUtil.matchingColumn(cell, previousCell)) {
output.add(cell);
}
previousCell = cell;
}
}
/**
* Compacts a single row at the Phoenix level. The result parameter is the input row and
* modified to be the output of the compaction process.
*/
private void compact(List<Cell> result, boolean regionLevel) throws IOException {
if (result.isEmpty()) {
return;
}
phoenixResult.clear();
// For the multi-CF case, always do a region level scan for the empty CF store during major
// compaction; otherwise we could end up removing some empty cells that are needed to close the gap
// between an empty CF cell and a non-empty CF cell, which prevents partial row expiry. This can
// happen when the last row version of a non-empty CF outside the max lookback window is older
// than the last row version of the empty CF.
if (major && familyCount > 1 && ! localIndex && emptyCFStore && ! regionLevel) {
compactRegionLevel(result, phoenixResult);
}
else if (!retainCellsForMaxLookback(result, regionLevel, phoenixResult)) {
if (familyCount == 1 || regionLevel) {
throw new RuntimeException("UNEXPECTED");
}
phoenixResult.clear();
compactRegionLevel(result, phoenixResult);
}
if (maxVersion == 1
&& (!major
|| (minVersion == 0 && keepDeletedCells == KeepDeletedCells.FALSE))) {
// We need Phoenix level compaction only
Collections.sort(phoenixResult, CellComparator.getInstance());
result.clear();
removeDuplicates(phoenixResult, result);
phoenixLevelOnly = true;
return;
}
// We may need to retain more cells, and so we need to run HBase level compaction
// too. The result of two compactions will be merged and duplicate cells are removed.
int phoenixResultSize = phoenixResult.size();
List<Cell> hbaseResult = new ArrayList<>(result);
hBaseLevelRowCompactor.compact(hbaseResult);
phoenixResult.addAll(hbaseResult);
Collections.sort(phoenixResult, CellComparator.getInstance());
result.clear();
removeDuplicates(phoenixResult, result);
if (result.size() > phoenixResultSize) {
LOGGER.debug("HBase level compaction retained " +
(result.size() - phoenixResultSize) + " more cells");
}
}
private int compareTypes(Cell a, Cell b) {
Cell.Type aType = a.getType();
Cell.Type bType = b.getType();
if (aType == bType) {
return 0;
}
if (aType == Cell.Type.DeleteFamily) {
return -1;
}
if (bType == Cell.Type.DeleteFamily) {
return 1;
}
if (aType == Cell.Type.DeleteFamilyVersion) {
return -1;
}
if (bType == Cell.Type.DeleteFamilyVersion) {
return 1;
}
if (aType == Cell.Type.DeleteColumn) {
return -1;
}
return 1;
}
private int compare(Cell a, Cell b) {
int result;
result = Bytes.compareTo(a.getFamilyArray(), a.getFamilyOffset(),
a.getFamilyLength(),
b.getFamilyArray(), b.getFamilyOffset(), b.getFamilyLength());
if (result != 0) {
return result;
}
result = Bytes.compareTo(a.getQualifierArray(), a.getQualifierOffset(),
a.getQualifierLength(),
b.getQualifierArray(), b.getQualifierOffset(), b.getQualifierLength());
if (result != 0) {
return result;
}
if (a.getTimestamp() > b.getTimestamp()) {
return -1;
}
if (a.getTimestamp() < b.getTimestamp()) {
return 1;
}
return compareTypes(a, b);
}
/**
* This generates the intersection of regionResult and input. The result parameter holds the
* resulting intersection.
*/
private void trimRegionResult(List<Cell> regionResult, List<Cell> input,
List<Cell> result) {
if (regionResult.isEmpty()) {
return;
}
int index = 0;
int size = regionResult.size();
int compare;
for (Cell originalCell : input) {
Cell regionCell = regionResult.get(index);
compare = compare(originalCell, regionCell);
while (compare > 0) {
index++;
if (index == size) {
break;
}
regionCell = regionResult.get(index);
compare = compare(originalCell, regionCell);
}
if (compare == 0) {
result.add(originalCell);
index++;
}
if (index == size) {
break;
}
}
}
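// Note: both lists are sorted by (family, qualifier, descending timestamp, type), so the loop
// above is a single merge-style pass; a cell from input is kept only when a cell with the same
// coordinates is found in regionResult.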
/**
* This is used only when the Phoenix level compaction cannot be done at the store level.
*/
private void compactRegionLevel(List<Cell> input, List<Cell> result) throws IOException {
byte[] rowKey = CellUtil.cloneRow(input.get(0));
Scan scan = new Scan();
scan.setRaw(true);
scan.readAllVersions();
// compactionTime + 1 because the upper limit of the time range is not inclusive
scan.setTimeRange(0, compactionTime + 1);
scan.withStartRow(rowKey, true);
scan.withStopRow(rowKey, true);
RegionScanner scanner = region.getScanner(scan);
List<Cell> regionResult = new ArrayList<>(result.size());
scanner.next(regionResult);
scanner.close();
Collections.sort(regionResult, CellComparator.getInstance());
compact(regionResult, true);
result.clear();
trimRegionResult(regionResult, input, result);
}
}
}