
/*-
* Copyright (C) 2011, 2018 Oracle and/or its affiliates. All rights reserved.
*
* This file was distributed by Oracle as part of a version of Oracle NoSQL
* Database made available at:
*
* http://www.oracle.com/technetwork/database/database-technologies/nosqldb/downloads/index.html
*
* Please see the LICENSE file included in the top-level directory of the
* appropriate version of Oracle NoSQL Database for a copy of the license and
* additional information.
*/
package oracle.kv.impl.rep.table;
import static oracle.kv.impl.api.ops.OperationHandler.CURSOR_DEFAULT;
import static oracle.kv.impl.api.ops.OperationHandler.CURSOR_READ_COMMITTED;
import static oracle.kv.impl.api.table.TableLimits.NO_LIMIT;
import static oracle.kv.impl.rep.PartitionManager.DB_OPEN_RETRY_MS;
import static oracle.kv.impl.rep.RNTaskCoordinator.KV_INDEX_CREATION_TASK;
import static oracle.kv.impl.rep.table.SecondaryInfoMap.CLEANER_CONFIG;
import static oracle.kv.impl.rep.table.SecondaryInfoMap.SECONDARY_INFO_CONFIG;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.concurrent.TimeUnit;
import java.util.logging.Level;
import java.util.logging.Logger;
import oracle.kv.Consistency;
import oracle.kv.ThroughputLimitException;
import oracle.kv.impl.admin.IllegalCommandException;
import oracle.kv.impl.admin.param.RepNodeParams;
import oracle.kv.impl.api.ops.MultiDeleteTable;
import oracle.kv.impl.api.ops.MultiDeleteTableHandler;
import oracle.kv.impl.api.ops.OperationHandler;
import oracle.kv.impl.api.ops.Result;
import oracle.kv.impl.api.ops.Scanner;
import oracle.kv.impl.api.ops.TableIterate;
import oracle.kv.impl.api.ops.TableIterateHandler;
import oracle.kv.impl.api.table.IndexImpl;
import oracle.kv.impl.api.table.NameUtils;
import oracle.kv.impl.api.table.TableImpl;
import oracle.kv.impl.api.table.TableKey;
import oracle.kv.impl.api.table.TableLimits;
import oracle.kv.impl.api.table.TableMetadata;
import oracle.kv.impl.api.table.TargetTables;
import oracle.kv.impl.fault.RNUnavailableException;
import oracle.kv.impl.metadata.Metadata;
import oracle.kv.impl.rep.IncorrectRoutingException;
import oracle.kv.impl.rep.RepNode;
import oracle.kv.impl.rep.migration.MigrationStreamHandle;
import oracle.kv.impl.rep.table.SecondaryInfoMap.DeletedTableInfo;
import oracle.kv.impl.rep.table.SecondaryInfoMap.SecondaryInfo;
import oracle.kv.impl.test.TestHookExecute;
import oracle.kv.impl.topo.PartitionId;
import oracle.kv.impl.topo.Topology;
import oracle.kv.impl.util.DatabaseUtils;
import oracle.kv.impl.util.RateLimitingLogger;
import oracle.kv.impl.util.TxnUtil;
import oracle.kv.impl.util.server.LoggerUtils;
import oracle.kv.table.PrimaryKey;
import oracle.kv.table.Table;
import com.sleepycat.je.Cursor;
import com.sleepycat.je.CursorConfig;
import com.sleepycat.je.Database;
import com.sleepycat.je.DatabaseEntry;
import com.sleepycat.je.DatabaseException;
import com.sleepycat.je.DatabaseNotFoundException;
import com.sleepycat.je.DiskLimitException;
import com.sleepycat.je.LockConflictException;
import com.sleepycat.je.LockMode;
import com.sleepycat.je.SecondaryDatabase;
import com.sleepycat.je.SecondaryIntegrityException;
import com.sleepycat.je.Transaction;
import com.sleepycat.je.rep.DatabasePreemptedException;
import com.sleepycat.je.rep.InsufficientAcksException;
import com.sleepycat.je.rep.InsufficientReplicasException;
import com.sleepycat.je.rep.ReplicaWriteException;
import com.sleepycat.je.rep.ReplicatedEnvironment;
import com.sleepycat.je.rep.UnknownMasterException;
import com.sleepycat.je.utilint.StoppableThread;
import com.sleepycat.je.utilint.TaskCoordinator;
import com.sleepycat.je.utilint.TaskCoordinator.Permit;
/**
* Thread to manage table maintenance. When run, the table
* metadata is scanned looking for indexes. Each index will have a
* corresponding secondary DB. If the index is new (and this is the master),
* a new secondary DB is created.
*
* The table MD is also checked for changes, such as indexes dropped and
* tables that need their data deleted.
*
* Once the scan is complete, any new or pending maintenance is done.
* Maintenance only runs on the master.
*
* Note that secondary cleaning activity and partition migration cannot occur
* concurrently, see [#24245].
*/
public class MaintenanceThread extends StoppableThread {
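/* Time, in ms, to wait for the thread to exit during soft shutdown */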
private static final int THREAD_SOFT_SHUTDOWN_MS = 10000;
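/* Number of times a failed DB operation is retried before rethrowing */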
private static final int NUM_DB_OP_RETRIES = 100;
private static final long ONE_SECOND_MS = 1000L;
/* DB operation delays, in ms */
private static final long SHORT_RETRY_WAIT_MS = 500L;
private static final long LONG_RETRY_WAIT_MS = 1000L;
private static final long VERY_LONG_RETRY_WAIT_MS = 30000L;
/* Number of records read from the primary during each populate call. */
public static final int POPULATE_BATCH_SIZE = 500;
/*
* Number of records read from a secondary DB partition in a transaction
* when cleaning after a partition migration.
*/
private static final int CLEAN_BATCH_SIZE = 100;
/*
* Number of records deleted from the partition DB partition in a
* transaction during cleaning due to a removed table.
*/
private static final int DELETE_BATCH_SIZE = 100;
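/* The table manager that owns this thread and the secondary DB handles */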
final TableManager tableManager;
private final RepNode repNode;
/* Used to coordinate table maint tasks with other housekeeping tasks */
private final TaskCoordinator taskCoordinator;
private final Logger logger;
/*
* Encapsulates the above logger to limit the rate of log messages
* associated with a specific fault.
*/
private final RateLimitingLogger<String> rateLimitingLogger;
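/* The replicated environment handle, refreshed at the top of the run() loop */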
private ReplicatedEnvironment repEnv = null;
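/* Sequence number of the table metadata from the last successful update */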
private int lastSeqNum = Metadata.UNKNOWN_SEQ_NUM;
/**
* True if the thread has been shutdown. Synchronize when setting this
* field.
*/
private volatile boolean isShutdown = false;
/**
* True if there has been a request to abandon the current maintenance
* operation, and perform a new update and associated maintenance.
* Synchronize when setting this field.
*/
private volatile boolean updateRequested = false;
/**
* The previous maintenance thread, if there was one. This is so that the
* new thread can wait for the old thread to exit. This avoids requiring
* callers who start the thread to wait.
*/
private MaintenanceThread oldThread;
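/**
 * Creates the maintenance thread. If oldThread is non-null the new thread
 * will wait for the old thread to exit before starting work.
 */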
MaintenanceThread(MaintenanceThread oldThread,
TableManager tableManager,
RepNode repNode,
Logger logger) {
super(null, repNode.getExceptionHandler(), "KV table maintenance");
this.oldThread = oldThread;
this.tableManager = tableManager;
this.repNode = repNode;
this.taskCoordinator = repNode.getTaskCoordinator();
this.logger = logger;
this.rateLimitingLogger =
new RateLimitingLogger<>(60 * 1000 /* logSamplePeriodMs */,
20 /* maxObjects */,
logger);
}
@Override
public void run() {
if (isStopped()) {
return;
}
/* Make sure the old thread is stopped */
if (oldThread != null) {
assert oldThread.isStopped();
try {
/*
* Call initiateSoftShutdown() to make sure the thread is
* stopped and any waiters are released.
*/
oldThread.join(oldThread.initiateSoftShutdown());
} catch (InterruptedException ex) {
/* ignore */
}
oldThread = null;
}
logger.log(Level.INFO, "Starting {0}", this);
int retryCount = NUM_DB_OP_RETRIES;
while (!isStopped()) {
try {
repEnv = repNode.getEnv(0);
if (repEnv == null) {
retryWait(LONG_RETRY_WAIT_MS);
continue;
}
final TableMetadata tableMd = tableManager.getTableMetadata();
if (tableMd == null) {
retryWait(LONG_RETRY_WAIT_MS);
continue;
}
if (update(tableMd) && checkForDone()) {
return;
}
} catch (DatabaseNotFoundException |
ReplicaWriteException |
UnknownMasterException |
InsufficientAcksException |
InsufficientReplicasException |
DiskLimitException de) {
/*
* DatabaseNotFoundException and ReplicaWriteException
* are likely due to the replica trying to update before
* the master has done an initial update.
*/
if (!repNode.hasAvailableLogSize()) {
/*
* Use a longer retry period when the disk limit has been
* reached (e.g., when InsufficientReplicasException or
* DiskLimitException is thrown).
*/
retryWait(retryCount--, VERY_LONG_RETRY_WAIT_MS, de);
} else {
retryWait(retryCount--, LONG_RETRY_WAIT_MS, de);
}
} catch (LockConflictException lce) {
retryWait(retryCount--, SHORT_RETRY_WAIT_MS, lce);
} catch (InterruptedException ie) {
/* IE can happen during shutdown */
if (isStopped()) {
logger.log(Level.INFO, "{0} exiting after {1}",
new Object[]{this, ie});
return;
}
throw new IllegalStateException(ie);
} catch (RuntimeException re) {
/*
* If shutdown or env is bad just exit.
*/
if (isStopped()) {
logger.log(Level.INFO, "{0} exiting after {1}",
new Object[]{this, re});
return;
}
throw re;
}
}
}
/**
* Waits for a period of time. Returns if isStopped() is true or the
* thread is woken up.
*/
public void retryWait(long waitMS) {
retryWait(1, waitMS, null);
}
/**
* Waits for a period of time after a DatabaseException. If the count
* is <= 0, the exception is thrown. Returns if isStopped() is true
* or the thread is woken up.
*/
private void retryWait(int count, long waitMS, DatabaseException de) {
if (isStopped()) {
return;
}
/* If the retry count has expired, re-throw the last exception */
if (count <= 0) {
throw de;
}
if (de != null) {
logger.log(Level.INFO,
"DB op caused {0}, will retry in {1}ms," +
" attempts left: {2}",
new Object[]{de, waitMS, count});
}
try {
synchronized (this) {
wait(waitMS);
}
} catch (InterruptedException ie) {
/* IE may happen during shutdown */
if (!isStopped()) {
getLogger().log(Level.SEVERE,
"Unexpected exception in {0}: {1}",
new Object[]{this, ie});
throw new IllegalStateException(ie);
}
}
}
/**
* If updateRequested is true, there is more work to be done, so clears the
* flag and returns false unless isShutdown has already been set to true.
* Otherwise, sets isShutdown to true and returns true so the thread will
* exit.
*/
private synchronized boolean checkForDone() {
if (updateRequested) {
updateRequested = false;
} else {
isShutdown = true;
}
return isShutdown;
}
/**
* Returns false if the thread is shutdown. Otherwise, sets
* updateRequested to true, which will cause the current maintenance
* operation to be abandoned and a new update requested, and returns true.
*/
boolean requestUpdate() {
return requestUpdate(false);
}
private synchronized boolean requestUpdate(boolean force) {
if (isShutdown) {
return false;
}
updateRequested = true;
if (force) {
/*
* Setting lastSeqNum to UNKNOWN_SEQ_NUM will force a call to
* updateInternal during the update.
*/
lastSeqNum = Metadata.UNKNOWN_SEQ_NUM;
}
notifyAll();
return true;
}
/* -- From StoppableThread -- */
@Override
protected synchronized int initiateSoftShutdown() {
isShutdown = true;
notifyAll();
return THREAD_SOFT_SHUTDOWN_MS;
}
@Override
protected Logger getLogger() {
return logger;
}
/**
* Updates the SecondaryInfoMap with dropped tables and added indexes
* that require action via maintenance threads to populate indexes or
* clean out deleted tables.
*
* @return true if the update was successful and maintenance complete
*/
private boolean update(TableMetadata tableMd)
throws InterruptedException {
if (isStopped()) {
return false;
}
logger.log(Level.INFO,
"Establishing secondary database handles, " +
"table seq#: {0}",
tableMd.getSequenceNumber());
try {
final Database infoDb = tableManager.getInfoDb(repEnv);
/*
* Update maps if the local state needs updating, if any
* secondaries have been reset, or this is new table metadata.
*/
boolean update = tableManager.checkLocalState(repEnv, true);
if (resetSecondaries(infoDb)) {
update = true;
}
if (update || tableMd.getSequenceNumber() > lastSeqNum) {
/* Return false if update failed */
if (!updateInternal(tableMd, infoDb)) {
return false;
}
lastSeqNum = tableMd.getSequenceNumber();
}
doMaintenance(infoDb);
return true;
} finally {
assert !tableManager.isBusyMaintenance();
}
}
/**
* Resets any secondaries which have been invalidated due to corruption.
* Returns true if any secondary was successfully reset.
*/
private boolean resetSecondaries(Database infoDb) {
final Map<String, SecondaryIntegrityException> invalidatedSecondaries =
tableManager.getInvalidatedSecondaries();
if (invalidatedSecondaries == null) {
return false;
}
boolean needsUpdate = false;
for (SecondaryIntegrityException sie : invalidatedSecondaries.values()){
if (resetSecondary(infoDb, sie, null /*already logged*/)) {
needsUpdate = true;
}
}
/*
* At this point the invalidated DBs will be removed, preventing
* further operations to the secondary, so we can clear out the map.
* Since the map is not synchronized, it is possible that a new entry
* was added after the above iteration, but that is OK because there
* will very likely be another operation which will invalidate the
* secondary.
*/
invalidatedSecondaries.clear();
return needsUpdate;
}
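/**
 * Updates the secondary info map from the table metadata and opens a
 * secondary DB handle for each index. Returns false if the thread was
 * stopped or if any handle could not be opened (in which case the update
 * is retried).
 */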
private boolean updateInternal(TableMetadata tableMd, Database infoDb) {
/*
* Map of secondary DB name -> indexes that are defined in the table
* metadata. This is used to determine what databases to open and if
* an index has been dropped.
*/
final Map<String, IndexImpl> indexes = new HashMap<>();
/*
* Set of tables which are being removed but first need their data
* deleted.
*/
final Set<TableImpl> deletedTables = new HashSet<>();
for (Table table : tableMd.getTables().values()) {
TableManager.scanTable((TableImpl)table, indexes, deletedTables);
}
if (logger.isLoggable(Level.FINE)) {
/* Enumerate the indexes and tables for logging purposes */
final StringBuilder indexesString = new StringBuilder();
final StringBuilder tablesString = new StringBuilder();
for (String indexName : indexes.keySet()) {
if (indexesString.length() > 0) {
indexesString.append(" ");
}
indexesString.append(indexName);
}
for (TableImpl table : deletedTables) {
if (tablesString.length() > 0) {
tablesString.append(" ");
}
tablesString.append(table.getFullNamespaceName());
}
logger.log(Level.FINE,
"Found {0} indexes({1}) and {2} tables" +
" marked for deletion({3}) in {4})",
new Object[]{indexes.size(),
indexesString.toString(),
deletedTables.size(),
tablesString.toString(),
tableMd});
} else if (!deletedTables.isEmpty()) {
logger.log(Level.INFO, "Found {0} tables" +
" marked for deletion in {1})",
new Object[]{deletedTables.size(), tableMd});
}
/*
* Update the secondary map with any indexes that have been dropped
* and removed tables. Changes to the secondary map can only be
* made by the master. This call will also remove the database.
* This call will not affect secondaryDbMap which is dealt with
* below.
*/
if (repEnv.getState().isMaster()) {
SecondaryInfoMap.update(tableMd, indexes, deletedTables,
this, infoDb, repEnv, logger);
}
final Set<String> secondaryDbs = tableManager.getSecondaryDbs();
/*
* Remove entries from secondaryDbMap for dropped indexes.
* The call to SecondaryInfoMap.update above closes the DB when this
* node is the master, so this loop closes the DB on replicas.
*/
final Iterator<String> itr = secondaryDbs.iterator();
while (itr.hasNext()) {
if (isStopped()) {
return false;
}
final String dbName = itr.next();
if (!indexes.containsKey(dbName)) {
if (tableManager.closeSecondary(dbName)) {
itr.remove();
}
}
}
int errors = 0;
/*
* For each index open its secondary DB
*/
for (Entry<String, IndexImpl> entry : indexes.entrySet()) {
if (isStopped()) {
logger.log(Level.INFO,
"Update terminated, established {0} " +
"secondary database handles",
secondaryDbs.size());
return false;
}
final String dbName = entry.getKey();
SecondaryDatabase db = null;
try {
final IndexImpl index = entry.getValue();
db = tableManager.getSecondaryDB(dbName, index, infoDb, repEnv);
} catch (SecondaryIntegrityException sie) {
resetSecondary(infoDb, sie, "update");
errors++;
} catch (RuntimeException re) {
/*
* If a DB was opened, and there was some other error (like
* not getting the info record), close and remove the DB
* from the map.
*/
if (db != null) {
tableManager.removeSecondary(dbName);
tableManager.closeSecondaryDb(db);
}
/*
* Index in metadata but the database was removed by the
* master. Likely the secondary was corrupted and will be
* rebuilt. So just retry.
*/
if (re instanceof DatabasePreemptedException) {
logger.log(Level.FINE, "Failed to open database " +
"for {0}. {1}",
new Object[] {dbName, re.getMessage()});
errors++;
continue;
}
if (re instanceof IllegalCommandException) {
logger.log(Level.WARNING, "Failed to open database " +
"for {0}. {1}",
new Object[] {dbName, re.getMessage()});
continue;
}
if (DatabaseUtils.handleException(re, logger, dbName)) {
errors++;
}
}
}
/*
* If there have been errors return false which will cause the
* update to be retried after a delay.
*/
if (errors > 0) {
logger.log(Level.INFO,
"Established {0} secondary database handles, " +
"will retry in {1}ms",
new Object[] {secondaryDbs.size(),
DB_OPEN_RETRY_MS});
retryWait(DB_OPEN_RETRY_MS);
return false;
}
logger.log(Level.INFO, "Established {0} secondary database handles",
secondaryDbs.size());
return true; // Success
}
/**
* Closes the specified secondary database. Returns true if the close
* succeeded, or the database was not open.
*
* @param dbName the name of the secondary DB to close
* @return true on success
*/
boolean closeSecondary(String dbName) {
return tableManager.closeSecondary(dbName);
}
/**
* Performs maintenance operations as needed. Does not return until all
* maintenance is complete, the thread is shutdown, or there has been
* a request to update.
*/
private void doMaintenance(Database infoDb) throws InterruptedException {
if (!repEnv.getState().isMaster()) {
return;
}
while (!exitMaintenance()) {
final SecondaryInfoMap secondaryInfoMap =
SecondaryInfoMap.fetch(infoDb);
if (secondaryInfoMap == null) {
/* No map, no work */
return;
}
if (secondaryInfoMap.secondaryNeedsPopulate()) {
populateSecondary(infoDb);
} else if (secondaryInfoMap.tableNeedCleaning()) {
cleanPrimary(infoDb);
} else if (secondaryInfoMap.secondaryNeedsCleaning()) {
cleanSecondary(infoDb);
} else {
/* Nothing to do */
return;
}
}
/*
* Exiting because exitMaintenance returned true. This means the
* thread is exiting, or we need to return to update.
*/
}
/**
* Returns true if maintenance operations should stop, or the thread
* is shutdown.
*/
public boolean exitMaintenance() {
return updateRequested || isStopped();
}
/**
* Returns true if the thread is shutdown.
*/
boolean isStopped() {
return isShutdown || ((repEnv != null) && !repEnv.isValid());
}
/**
* Populates the secondary databases. Secondary DBs (indexes) are populated
* by scanning through each partition DB. During the scan the secondary
* DB has incremental population set to true to prevent
* SecondaryIntegrityExceptions from being thrown.
*
* @throws InterruptedException if the wait for a permit is interrupted
*/
private void populateSecondary(Database infoDb)
throws InterruptedException {
logger.info("Running secondary population");
final OperationHandler oh =
new OperationHandler(repNode, tableManager.getParams());
final TableIterateHandlerInternal opHandler =
new TableIterateHandlerInternal(oh);
final RepNodeParams repNodeParams = repNode.getRepNodeParams();
final long permitTimeoutMs =
repNodeParams.getPermitTimeoutMs(KV_INDEX_CREATION_TASK);
final long permitLeaseMs =
repNodeParams.getPermitLeaseMs(KV_INDEX_CREATION_TASK);
final Topology topo = repNode.getTopology();
final int numShards = (topo == null) ? 1 : topo.getRepGroupMap().size();
/*
* Since we are not using the timeout mechanism in the task coordinator
* we keep track of the timeout for getting a permit externally.
*/
final long startMs = System.currentTimeMillis();
/*
* We populate secondaries by scanning through one partition at a time.
* This can be done without interlocking with partition migration
* until it is time to determine if scanning is done. To know whether
* we are done, migration target processing needs to be idle so that the
* list of partitions is stable. The waitForTargetIdle flag indicates
* if migration needs to be idle and is set when it appears that we
* are done.
*/
boolean waitForTargetIdle = false;
/*
* Map from dbName -> ThroughputInfo. This will only contain entries
* for indexes whose tables have throughput limits.
*/
final Map<String, ThroughputInfo> throughputInfoMap = new HashMap<>();
while (!exitMaintenance()) {
Transaction txn = null;
/*
* Attempt to get a work permit without a timeout. If it fails
* (isDeficit == true) we wait. By waiting we will exit if there
* is an update.
*/
try (final Permit permit = taskCoordinator.
acquirePermit(KV_INDEX_CREATION_TASK,
0 /* timeout */,
permitLeaseMs,
TimeUnit.MILLISECONDS)) {
final long timeStamp = System.currentTimeMillis();
/*
* If the acquire failed wait a short bit to retry unless we
* are past the permit timeout, in which case proceed with the
* population.
*/
if (permit.isDeficit() &&
(timeStamp - startMs) < permitTimeoutMs) {
permit.releasePermit();
retryWait(SHORT_RETRY_WAIT_MS);
continue; /* loop back to while (!exitMaintenance()) */
}
/*
* For all but the last pass through the loop waitForTargetIdle
* will be false, letting population proceed without waiting
* for migration to idle.
*/
if (waitForTargetIdle) {
tableManager.setBusyMaintenance();
repNode.getMigrationManager().awaitTargetIdle(this);
if (exitMaintenance()) {
return;
}
waitForTargetIdle = false;
}
txn = repEnv.beginTransaction(null, SECONDARY_INFO_CONFIG);
final SecondaryInfoMap infoMap =
SecondaryInfoMap.fetch(infoDb, txn, LockMode.RMW);
/*
* The iteration order of the set returned by
* getSecondariesToPopulate will be from least recently
* populated to most recent.
*/
final Set<Entry<String, SecondaryInfo>> entries =
infoMap.getSecondariesToPopulate();
/* If no more, we are finally done */
if (entries.isEmpty()) {
logger.info("Completed populating secondary " +
"database(s)");
return;
}
String dbName = null;
SecondaryInfo info = null;
/* Will be null if there are no throughput limits */
ThroughputInfo throughputInfo = null;
/*
* Make a pass through the indexes that need to be populated
* looking for one which can be populated right now. The loop
* will exit with dbName, info, and possibly throughputInfo
* set to the index to be populated, or dbName == null if no
* indexes are ready to populate.
*/
for (Entry<String, SecondaryInfo> entry : entries) {
if (exitMaintenance()) {
return;
}
dbName = entry.getKey();
info = entry.getValue();
assert info.needsPopulating();
throughputInfo = throughputInfoMap.get(dbName);
if (throughputInfo == null) {
/* No limits (or first time through) - populate */
break;
}
/*
* If it has been a second or more since the last pass,
* reset the throughput counts and populate.
*/
if (info.msSinceLastPass(timeStamp) >= ONE_SECOND_MS) {
throughputInfo.reset();
break;
}
if (!throughputInfo.isThrottled()) {
break;
}
/* If here, the secondary could not be populated. */
dbName = null;
}
/*
* If dbName is null, there are no secondaries ready to populate
* (all are being throttled). Briefly wait and retry.
* Note that the info == null check is unnecessary, except for
* keeping the code checker happy.
*/
if ((dbName == null) || (info == null)) {
TxnUtil.abort(txn);
txn = null;
permit.releasePermit();
retryWait(SHORT_RETRY_WAIT_MS);
continue;
}
assert dbName != null;
assert info != null;
logger.log(Level.FINE, "Started populating {0}", dbName);
/*
* If there is no current partition to process, set a new
* one. setCurrentPartition() will return false if we should
* retry with migration idle. In that case set waitForTargetIdle
* and make another pass.
*/
if ((info.getCurrentPartition() == null) &&
!setCurrentPartition(info)) {
waitForTargetIdle = true;
continue;
}
final SecondaryDatabase db =
tableManager.getSecondaryDb(dbName);
if (info.getCurrentPartition() == null) {
logger.log(Level.FINE, "Finished populating {0} {1}",
new Object[]{dbName, info});
throughputInfoMap.remove(dbName);
assert db != null;
tableManager.endPopulation(dbName);
info.donePopulation();
infoMap.persist(infoDb, txn);
txn.commit();
txn = null;
continue;
}
/*
* db can be null if the update thread has not
* yet opened all of the secondary databases.
*/
if (db == null) {
/*
* By updating the last pass time this table will be
* pushed to the back of the list. If there is only
* one index to populate, wait a short time to allow
* update to finish.
*/
info.completePass(timeStamp);
infoMap.persist(infoDb, txn);
txn.commit();
txn = null;
if (entries.size() == 1) {
permit.releasePermit();
retryWait(SHORT_RETRY_WAIT_MS);
}
continue;
}
logger.log(Level.FINE, "Populating {0} {1}",
new Object[]{dbName, info});
assert TestHookExecute.doHookIfSet(tableManager.populateHook,
db);
final String tableName =
TableManager.getTableName(db.getDatabaseName());
final String namespace =
TableManager.getNamespace(db.getDatabaseName());
final TableImpl table = tableManager.getTable(namespace,
tableName);
if (table == null) {
final String nsName =
NameUtils.makeQualifiedName(namespace, tableName);
logger.log(Level.WARNING,
"Failed to populate {0}, missing table {1}",
new Object[]{info, nsName});
return;
}
/*
* If there are throughput limits for the table, create a
* ThroughputInfo object if needed to track populate throughput
* and to control throttling.
*/
if (throughputInfo == null) {
final TableLimits tl = table.getTableLimits();
if ((tl != null) && tl.hasThroughputLimits() &&
(tl.getIndexPopulateLimit() != NO_LIMIT)) {
throughputInfo = new ThroughputInfo(tl, numShards);
throughputInfoMap.put(dbName, throughputInfo);
}
}
try {
final boolean more =
populate(table,
opHandler,
txn,
info,
throughputInfo);
info.completePass(timeStamp);
if (!more) {
logger.log(Level.FINE, "Finished partition for {0}",
info);
info.completeCurrentPartition();
}
infoMap.persist(infoDb, txn);
} catch (SecondaryIntegrityException sie) {
/*
* If there is an SIE the secondary DB is corrupt. We will
* restart population using a fresh DB. resetSecondary
* will remove the current DB and reset the population info.
* The txn is no longer valid.
*/
TxnUtil.abort(txn);
txn = null;
if (resetSecondary(infoDb, sie, "secondary population")) {
/*
* Calling requestUpdate(true) will cause the
* populateSecondary loop to exit and will force a call
* to update, which will then open a new DB and restart
* population.
*/
requestUpdate(true);
}
return;
} catch (RNUnavailableException rue) {
/*
* This can happen if the metadata is updated during
* a population. In that case a new DB name can be added
* to secondaryLookupMap before the DB is opened and
* inserted into secondaryDbMap. Even though this is
* populating secondary A, the populate() call will
* result in a callback to getSecondaries().
* getSecondaries() requires all DBs named in
* secondaryLookupMap be open and present in
* secondaryDbMap.
*/
logger.log(Level.INFO, "Index population failed " +
"on {0}: {1} populate will be retried",
new Object[]{dbName, rue.getMessage()});
return;
} catch (IncorrectRoutingException ire) {
/*
* An IncorrectRoutingException is thrown if the
* partition has moved and its DB removed. In this
* case we can mark it complete and move on. The txn
* is still valid.
*/
handleIncorrectRoutingException(infoMap, info, infoDb, txn);
} catch (IllegalStateException ise) {
/*
* In addition to IncorrectRoutingException, an
* IllegalStateException can be thrown if the partition is
* moved during population (and its DB closed). The call to
* getPartitionDB() will throw IncorrectRoutingException
* if that is the case. Since the ise was likely thrown
* from JE, the txn is probably not valid.
*/
TxnUtil.abort(txn);
txn = null;
try {
repNode.getPartitionDB(info.getCurrentPartition());
} catch (IncorrectRoutingException ire) {
/*
* Mark this partition as done, commit that fact
* and continue (don't exit).
*/
handleIncorrectRoutingException(dbName, infoDb);
continue;
}
persistInfoErrorString(dbName, infoDb, ise);
/* return normally to move on */
return;
} catch (RuntimeException re) {
TxnUtil.abort(txn);
txn = null;
persistInfoErrorString(dbName, infoDb, re);
/* return normally to move on */
return;
}
txn.commit();
txn = null;
} finally {
TxnUtil.abort(txn);
/* Busy may have been set */
tableManager.clearBusyMaintenance();
}
}
}
/**
* Sets the current partition field in the specified info object. The
* current partition is the partition that requires maintenance. If
* no partition is found, the field is not modified. Returns true on
* success. If false is returned, the call should be retried with
* migration idle.
*
* @return true if the current partition was set
*/
private boolean setCurrentPartition(SecondaryInfo info) {
assert info.needsPopulating();
assert info.getCurrentPartition() == null;
/* Loop to find a partition that needs processing. */
for (PartitionId partition : repNode.getPartitions()) {
if (!info.isCompleted(partition)) {
info.setCurrentPartition(partition);
return true;
}
}
/*
* No partition was found, so it appears that we are done. However,
* if isBusyMaintenance() == false the list of partitions on this RN
* may have changed due to migration. In that case we return false to
* indicate we should check again while migration target processing
* is idle.
*/
return tableManager.isBusyMaintenance();
}
/**
* Resets the specified secondary database due to corruption. The
* secondary DB is removed and the secondary info is reset for
* population. Returns true if the reset was successful
*/
private boolean resetSecondary(Database infoDb,
SecondaryIntegrityException sie,
String operation) {
final String dbName = sie.getSecondaryDatabaseName();
if (operation != null) {
rateLimitingLogger.log(dbName, Level.SEVERE,
() -> "Integrity problem " +
"with index during " +
operation + ": " + sie.getMessage());
}
Transaction txn = null;
try {
txn = repEnv.beginTransaction(null, SECONDARY_INFO_CONFIG);
/* Close and remove the secondary DB */
tableManager.closeSecondary(dbName);
try {
repEnv.removeDatabase(txn, dbName);
} catch (DatabaseNotFoundException dnfe) {
/* already gone */
tableManager.removeSecondary(dbName);
return true;
}
final SecondaryInfoMap infoMap =
SecondaryInfoMap.fetch(infoDb, txn, LockMode.RMW);
/*
* Reset secondary info. It should be present, but if not it will
* be created during update.
*/
final SecondaryInfo info = infoMap.getSecondaryInfo(dbName);
if (info != null) {
info.resetForPopulation(sie);
infoMap.persist(infoDb, txn);
}
txn.commit();
txn = null;
/* Safe to remove from map */
tableManager.removeSecondary(dbName);
if ((info != null) && (info.getErrorString() != null)) {
/*
* This is a permanent condition. The index will need to be
* explicitly removed and then re-added.
*/
logger.log(Level.SEVERE,
() -> "Secondary reset for " + dbName +
" failed: too many retries, the index must" +
" be explicitly removed and re-added");
} else {
logger.log(Level.INFO, () -> "Secondary reset for " + dbName +
" " + info);
}
return true;
} catch (Exception e) {
/*
* If there is an exception, log it and return. The population will
* eventually continue and fail with an SIE, which will call
* back here, essentially a retry.
*/
logger.log(Level.WARNING,
() -> "Secondary reset for " + dbName + " failed: " +
LoggerUtils.getStackTrace(e));
} finally {
TxnUtil.abort(txn);
}
return false;
}
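/**
 * Variant used when the population transaction has been aborted. Marks
 * the current partition of the named secondary as complete in its own
 * transaction.
 */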
private void handleIncorrectRoutingException(String dbName,
Database infoDb) {
Transaction txn = null;
try {
txn = repEnv.beginTransaction(null, SECONDARY_INFO_CONFIG);
final SecondaryInfoMap infoMap =
SecondaryInfoMap.fetch(infoDb, txn, LockMode.RMW);
/* Reset secondary info */
final SecondaryInfo info = infoMap.getSecondaryInfo(dbName);
handleIncorrectRoutingException(infoMap, info, infoDb, txn);
txn.commit();
txn = null;
} catch (Exception e) {
logger.log(Level.WARNING,
"Index population failed on {0} due to partition" +
" being moved, unable to persist info due to: {1}",
new Object[]{dbName, e.getMessage()});
} finally {
TxnUtil.abort(txn);
}
}
/**
* Handle an IncorrectRoutingException caught during secondary population.
* An IncorrectRoutingException is thrown if the partition has moved and
* its DB removed. In this case we can mark it complete.
*/
private void handleIncorrectRoutingException(SecondaryInfoMap infoMap,
SecondaryInfo info,
Database infoDb,
Transaction txn) {
logger.log(Level.FINE, "Finished partition for {0}, " +
"partition no longer in this shard", info);
info.completeCurrentPartition();
infoMap.persist(infoDb, txn);
}
/**
* Persists the error string for the specified database. This will stop
* population to that DB.
*/
private void persistInfoErrorString(String dbName,
Database infoDb,
Exception exception) {
Transaction txn = null;
try {
txn = repEnv.beginTransaction(null, SECONDARY_INFO_CONFIG);
final SecondaryInfoMap infoMap =
SecondaryInfoMap.fetch(infoDb, txn, LockMode.RMW);
final SecondaryInfo info = infoMap.getSecondaryInfo(dbName);
if (info != null) {
info.setErrorString(exception);
infoMap.persist(infoDb, txn);
txn.commit();
txn = null;
logger.log(Level.WARNING,
"Index population failed on {0}: {1}",
new Object[]{dbName, info});
} else {
logger.log(Level.WARNING,
"Index population failed on {0}: due to {1}" +
", unable to persist error string due info" +
" object missing",
new Object[]{dbName, exception.getMessage()});
}
} catch (Exception e) {
logger.log(Level.WARNING,
"Index population failed, on {0} due to: {1}" +
", unable to persist error string due to: {2}",
new Object[]{dbName, exception.getMessage(),
e.getMessage()});
} finally {
TxnUtil.abort(txn);
}
}
/**
* Processes records for the specified table from the current partition.
* Returns true if there are more records to read from the
* partition. If true is returned the bytes in info.lastKey are set to the
* key of the last record read.
*
* @param table the source table
* @param opHandler the internal operation handler
* @param txn current transaction
* @return true if there are more records to process
* @throws InterruptedException if the wait for a Permit is interrupted
*/
private boolean populate(TableImpl table,
TableIterateHandlerInternal opHandler,
Transaction txn,
SecondaryInfo info,
ThroughputInfo throughputInfo) {
/*
* If the bytes in lastKey are not null then use them to start the
* iteration. Set to the last key read on exit.
*/
final byte[] resumeKey = info.getLastKey().getData();
final PrimaryKey pkey = table.createPrimaryKey();
final TableKey tkey = TableKey.createKey(table, pkey, true);
/* A single target table */
final TargetTables targetTables = new TargetTables(table, null, null);
/* Create and execute the iteration */
final TableIterate op =
new TableIterate(tkey.getKeyBytes(),
targetTables,
tkey.getMajorKeyComplete(),
POPULATE_BATCH_SIZE,
resumeKey);
/*
* Set the throughput collector if there is one for this table unless
* the secondary is being rebuilt. In that case ignore throughput.
*/
final ThroughputCollector tc = info.isRebuild() ?
null :
tableManager.getThroughputCollector(table.getId());
if (tc != null) {
/*
* Check to make sure we can write and that the throughput limit
* has not been exceeded. Note that this may result in starvation
* but it is up to the client to control their application when
* creating indexes.
*/
try {
tc.checkForLimitExceeded(op);
} catch (ThroughputLimitException tle) {
/*
* The table this index belongs to is over its throughput
* limit. In this case we will skip this batch. By setting
* setThrottled the next pass will either let another
* secondary be processed or wait. Note that by returning
* true we will persist the infoMap object which will update
* the last populate time which will push this index to the
* back of the line (see getSecondariesToPopulate).
*/
throughputInfo.setThrottled();
return true;
}
op.setThroughputTracker(tc, Consistency.NONE_REQUIRED);
}
return opHandler.populate(op, txn,
info, throughputInfo);
}
/**
* Object to track throughput consumed by population and to control
* throttling.
*/
private static class ThroughputInfo {
/* Need a minimum to make some progress */
private final int MIN_LIMIT_KBS = 10;
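/* Read and write limits applied to population, in KB */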
final int populateReadLimit;
final int populateWriteLimit;
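/* Throughput consumed since the last call to reset() */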
int totalReadKB;
int totalWriteKB;
boolean throttled = false;
ThroughputInfo(TableLimits tl, int numShards) {
assert tl.hasThroughputLimits();
assert tl.getIndexPopulateLimit() != NO_LIMIT;
/*
* The read and write limits for population are a percentage of the
* table read and write limit divided by the number of shards.
*/
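/*
 * For example, with an index populate limit of 10 (percent), two
 * shards, and a table read limit of 1000KB: factor = 10 / 2 = 5, so
 * the populate read limit is (1000 * 5) / 100 = 50KB.
 */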
int factor = tl.getIndexPopulateLimit() / numShards;
if (factor == 0) {
factor = 1;
}
int rl = tl.getReadLimit();
if (rl != NO_LIMIT) {
rl = (rl * factor) / 100;
if (rl < MIN_LIMIT_KBS) {
rl = MIN_LIMIT_KBS;
}
}
int wl = tl.getWriteLimit();
if (wl != NO_LIMIT) {
wl = (wl * factor) / 100;
if (wl < MIN_LIMIT_KBS) {
wl = MIN_LIMIT_KBS;
}
}
assert rl > 0;
assert wl > 0;
populateReadLimit = rl;
populateWriteLimit = wl;
}
/*
* Accumulates the specified read and write throughput. Returns
* true if isThrottled() would return true. If the total
* throughput since the last call to reset() is over the
* populate limits, then throttling will be set.
*/
private boolean accumulateThroughput(int readKB, int writeKB) {
totalReadKB += readKB;
totalWriteKB += writeKB;
if ((totalReadKB >= populateReadLimit) ||
(totalWriteKB >= populateWriteLimit)) {
throttled = true;
}
return throttled;
}
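/* Marks population of this index as throttled until reset() is called */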
private void setThrottled() {
throttled = true;
}
private boolean isThrottled() {
return throttled;
}
private void reset() {
totalReadKB = 0;
totalWriteKB = 0;
throttled = false;
}
@Override
public String toString() {
return "ThroughputInfo[" +
totalReadKB + ", " + populateReadLimit + ", " +
totalWriteKB + ", " + populateWriteLimit + ", " +
throttled + "]";
}
}
/*
* Special internal handler which bypasses security checks.
*/
private static class TableIterateHandlerInternal
extends TableIterateHandler {
TableIterateHandlerInternal(OperationHandler handler) {
super(handler);
}
/*
* Use CURSOR_DEFAULT so that primary records accessed by the iteration
* remain locked during the transaction.
*/
@Override
protected CursorConfig getCursorConfig() {
return CURSOR_DEFAULT;
}
/*
* Overriding verifyTableAccess() will bypass keyspace and
* security checks.
*/
@Override
public void verifyTableAccess(TableIterate op) { }
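/*
 * Reads up to POPULATE_BATCH_SIZE records of the target table from the
 * current partition and inserts the corresponding secondary records.
 * Returns true if there are more records to read. May stop early if the
 * populate throughput budget has been used up.
 */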
private boolean populate(TableIterate op,
Transaction txn,
SecondaryInfo info,
ThroughputInfo throughputInfo) {
/*
* Use READ_UNCOMMITTED_ALL and keyOnly to make it inexpensive to
* skip records that don't match the table. Once they match, lock
* them, and use them.
*
* TODO: can this loop be combined with the loop in TableIterate
* to share code?
*/
final OperationTableInfo tableInfo = new OperationTableInfo();
final Scanner scanner = getScanner(op,
tableInfo,
txn,
info.getCurrentPartition(),
CURSOR_READ_COMMITTED,
LockMode.READ_UNCOMMITTED_ALL,
true); // key-only
boolean moreElements = false;
try {
final DatabaseEntry lastKey = info.getLastKey();
final DatabaseEntry keyEntry = scanner.getKey();
final DatabaseEntry dataEntry = scanner.getData();
/* this is used to do a full fetch of data when needed */
final DatabaseEntry dentry = new DatabaseEntry();
final Cursor cursor = scanner.getCursor();
int numElements = 0;
while ((moreElements = scanner.next()) == true) {
final int match = keyInTargetTable(op,
tableInfo,
keyEntry,
dataEntry,
cursor,
true /*chargeReadCost*/);
if (match > 0) {
/*
* The iteration was done using READ_UNCOMMITTED_ALL
* and with the cursor set to getPartial(). It is
* necessary to call getLockedData() here to both lock
* the record and fetch the data. populateSecondaries()
* must be called with a locked record.
*/
if (scanner.getLockedData(dentry)) {
if (!TableImpl.isTableData(dentry.getData(), null)) {
continue;
}
final byte[] keyBytes = keyEntry.getData();
/* Sets the lastKey return */
lastKey.setData(keyBytes);
scanner.getDatabase().
populateSecondaries(txn, lastKey, dentry,
scanner.getExpirationTime(),
null);
/* Charge for the secondary DB write */
op.addWriteBytes(0, 1);
++numElements;
}
} else if (match < 0) {
moreElements = false;
break;
}
if (numElements >= POPULATE_BATCH_SIZE) {
break;
}
info.accumulateThroughput(op.getReadKB(), op.getWriteKB());
/*
* If we are charging for throughput, track it and exit
* if over the local limit.
*/
if ((throughputInfo != null) &&
throughputInfo.accumulateThroughput(op.getReadKB(),
op.getWriteKB())) {
break;
}
}
} finally {
scanner.close();
}
return moreElements;
}
}
/**
* Cleans secondary databases. A secondary needs to be "cleaned"
* when a partition has moved from this node. In this case, secondary
* records that are from primary records in the moved partition need to be
* removed from the secondary. Cleaning is done by reading each record in a
* secondary DB and checking whether the primary key is from a missing
* partition. If so, remove the secondary record. Cleaning needs to happen
* on every secondary whenever a partition has migrated.
*
* Secondary cleaning cannot take place on the source when there is a
* completed transfer and the migration is waiting to complete. This is
* because the migration may still fail and the partition re-instated on
* the source. If the secondaries were cleaned, operations to the
* re-instated partition would encounter SecondaryIntegrityExceptions.
*
* The cleaner is also run on the target if a partition migration has
* failed. This is to remove any secondary records that were generated on
* the target during the failed migration. Secondary cleaning activity
* resulting from a failed migration must be completed before the failed
* migration target can re-start, otherwise the migration may encounter
* SecondaryIntegrityExceptions when writing a record which already has
* a (leftover) secondary record for it. Once a SecondaryInfo cleaning
* record is added to the SecondaryInfoMap no new migration target will
* start (see busySecondaryCleaning()).
*
* Note that the check in busySecondaryCleaning() does not distinguish
* between secondary cleaning on the target due to a failure or cleaning
* on the source due to a success. Since it is unlikely for a migration
* source to also be a target it's not a worth being able to tell the
* difference. Also, busySecondaryCleaning() only needs to check if the
* migration target is a restart (i.e. the partition is the same as a
* failed one) but we do not keep this information.
*
* Partition migration target cannot run if there is ongoing or pending
* secondary cleaning.
*/
private void cleanSecondary(Database infoDb)
throws InterruptedException {
logger.info("Running secondary cleaning");
/*
* The working secondary. Working on one secondary at a time is
* an optimization in space and time. It reduces the calls to
* getNextSecondaryToClean() which iterates over the indexes, and
* makes it so that only one SecondaryInfo is keeping track of
* the cleaned partitions.
*/
String currentSecondary = null;
while (!exitMaintenance()) {
Transaction txn = null;
try {
/*
* Wait for migration source activity to stop in two steps.
* First wait without setting the busy maintenance flag. This
* will allow source activity to continue (which is likely
* if source threads > 1). This also means cleaning will be
* delayed until all migration is done, reducing the number
* of passes over the secondary databases. Note that we only
* wait on source idle, target synchronization is handled
* via SecondaryInfo data (see comment above).
*/
repNode.getMigrationManager().awaitSourceIdle(this);
if (exitMaintenance()) {
return;
}
/*
* Source activity is done, or at least paused. Set busy and
* check for source or target activity.
*/
tableManager.setBusyMaintenance();
repNode.getMigrationManager().awaitIdle(this);
if (exitMaintenance()) {
return;
}
/*
* Note that we do not call setBusyMaintenance() because
* we are only concerned about migration target restart and
* that is handled via SecondaryInfo data (see comment above).
*/
txn = repEnv.beginTransaction(null, CLEANER_CONFIG);
final SecondaryInfoMap infoMap =
SecondaryInfoMap.fetch(infoDb, txn, LockMode.RMW);
if (currentSecondary == null) {
currentSecondary = infoMap.getNextSecondaryToClean();
if (currentSecondary != null) {
logger.log(Level.FINE, "Cleaning {0}",
currentSecondary);
}
}
/* If no more, we are finally done */
if (currentSecondary == null) {
logger.info("Completed cleaning secondary database(s)");
return;
}
final SecondaryInfo info =
infoMap.getSecondaryInfo(currentSecondary);
assert info != null;
assert info.needsCleaning();
final SecondaryDatabase db =
tableManager.getSecondaryDb(currentSecondary);
/*
* We have seen missing DB in a stress test. By throwing
* DNFE the thread will retry after a delay. If the DB
* was not yet initialized the delay may allow the DB to
* be opened/created. If that is not the problem, the retry
* loop will eventually throw the DNFE out causing the RN to
* restart. That should fix things since the problem here
* is between the secondary info and the table metadata.
* On restart the secondary info will be updated from the
* metadata before the maintenance thread is started.
*/
if (db == null) {
throw new DatabaseNotFoundException(
"Failed to clean " + currentSecondary +
", the secondary database was not" +
" found");
}
assert TestHookExecute.doHookIfSet(tableManager.cleaningHook,
db);
if (!db.deleteObsoletePrimaryKeys(info.getLastKey(),
info.getLastData(),
CLEAN_BATCH_SIZE)) {
logger.log(Level.FINE, "Completed cleaning {0}",
currentSecondary);
info.doneCleaning();
currentSecondary = null;
}
infoMap.persist(infoDb, txn);
txn.commit();
txn = null;
} catch (SecondaryIntegrityException sie) {
TxnUtil.abort(txn);
txn = null;
if (resetSecondary(infoDb, sie, "secondary cleaning")) {
requestUpdate(true);
}
} finally {
tableManager.clearBusyMaintenance();
TxnUtil.abort(txn);
}
}
}
/**
* Cleans primary records. This will remove primary records associated
* with a table which has been marked for deletion.
*/
private void cleanPrimary(Database infoDb) throws InterruptedException {
/*
* There is a race condition which can lead to records from a dropped
* table to get left in the store. What can happen is:
* 1) a write operation checks whether the table exist by calling
* getTable() which returns true
* 2) a metadata update comes in indicating that the table is dropped
* 3) cleaning starts
* 4) the operation completes
*
* If the record written by the operation is behind the cleaning
* scan (lower key), it will not be cleaned and remain in the store
* forever. The fix is to keep track of the number of active
* operations per table metadata sequence #, and then wait for any
* operation using an earlier metadata to complete.
*/
assert lastSeqNum != 0;
final int opsWaitedOn = repNode.awaitTableOps(lastSeqNum);
logger.log(Level.INFO,
"Running primary cleaning, waited on {0} ops to complete",
opsWaitedOn);
/*
* We can only use record extinction if all nodes in the group have
* been upgraded.
*
* If primary cleaning is interrupted before finishing and then resumes
* after a shard is upgraded, we will switch approaches from deleting
* records to using record extinction. That is OK.
*/
if (repEnv.isRecordExtinctionAvailable()) {
cleanPrimaryRecordExtinction(infoDb);
} else {
cleanPrimaryDeleteRecords(infoDb);
}
}
/**
* Cleans primary records by using record extinction. Record deletions
* are not logged, so this operation can be performed in a short
* transaction. JE asynchronously discards the extinct records.
*
* Note that this operation does not require interlocking with partition
* migration.
*/
private void cleanPrimaryRecordExtinction(Database infoDb) {
int retryCount = 10;
while (!exitMaintenance()) {
Transaction txn = null;
try {
if (exitMaintenance()) {
return;
}
txn = repEnv.beginTransaction(null, CLEANER_CONFIG);
final SecondaryInfoMap infoMap =
SecondaryInfoMap.fetch(infoDb, txn, LockMode.RMW);
final DeletedTableInfo info = infoMap.getNextTableToClean();
if (info == null) {
logger.info("Completed cleaning all tables");
return;
}
assert !info.isDone();
final Set<String> dbNames = getPartitionDbNames(info);
if (!dbNames.isEmpty()) {
final TableImpl table =
tableManager.getTableInternal(info.getTargetTableId());
assert table.isDeleting();
logger.log(Level.INFO,
"Discarding table record(s) for {0} {1}",
new Object[] {info.getTargetTableId(),
table.getFullName()});
Scanner.discardTableRecords(repEnv, txn, dbNames, table);
}
info.setDone();
infoMap.persist(infoDb, txn);
txn.commit();
txn = null;
retryCount = 10;
} catch (DatabaseNotFoundException dnfe) {
TxnUtil.abort(txn);
txn = null;
/*
* This can happen due to async partition migration creating
* and removing target partition DBs. In this case we simply
* retry.
*/
logger.log(Level.INFO,
"Partition database not found during " +
"cleaning, will retry in {0} ms: {0}",
new Object[] {LONG_RETRY_WAIT_MS,
dnfe.getMessage()});
retryWait(retryCount--, LONG_RETRY_WAIT_MS, dnfe);
} finally {
TxnUtil.abort(txn);
}
}
}
/**
* Returns the set of partition DB names that need to be cleaned.
*/
private Set<String> getPartitionDbNames(DeletedTableInfo info) {
final Set names = new HashSet<>();
/*
* Start by getting the names of any partitions being migrated to this
* node. We then get the partitions known to the RN. This order is
* important to make sure we get all of the DBs. If the order was
* reversed, a target migration could end after getting the RN's
* names and before we query the migration targets and that DB would
* be missed.
* It is OK for a migration target to start after this point because
* the records from any dropped table are being filtered out (see
* MigrationTarget).
*/
repNode.getMigrationManager().getTargetPartitionDbNames(names);
for (PartitionId partition : repNode.getPartitions()) {
if (info.isCompleted(partition)) {
/* Should only happen if we mix approaches. */
continue;
}
names.add(partition.getPartitionName());
}
return names;
}
/**
* Cleans primary records by deleting individual records from the partition
* databases.
*
* Primary cleaning must be interlocked with partition migration on
* both the source and target. On the target, we need to make sure
* we clean the incoming partition since it is not possible to prevent
* "dirty" records from being migrated. If cleaning is in progress
* and a migration occurs the moved partition needs to be cleaned once
* movement is complete. In order to do this, the table being deleted
* needs to remain in the metadata. To ensure this we lock on the
* source as well. As long as cleaning is blocked on the source the
* metadata will not be updated (cleaning will not be complete
* on the source).
*/
private void cleanPrimaryDeleteRecords(Database infoDb)
throws InterruptedException {
final OperationHandler oh =
new OperationHandler(repNode, tableManager.getParams());
final MultiDeleteTableHandlerInternal opHandler =
new MultiDeleteTableHandlerInternal(oh);
MigrationStreamHandle streamHandle = null;
while (!exitMaintenance()) {
Transaction txn = null;
try {
/*
* We must make sure that there are no migrations in-progress
* to this node.
*/
tableManager.setBusyMaintenance();
repNode.getMigrationManager().awaitIdle(this);
if (exitMaintenance()) {
return;
}
txn = repEnv.beginTransaction(null, CLEANER_CONFIG);
final SecondaryInfoMap infoMap =
SecondaryInfoMap.fetch(infoDb, txn, LockMode.RMW);
final DeletedTableInfo info = infoMap.getNextTableToClean();
/* If no more, we are finally done */
if (info == null) {
logger.info("Completed cleaning partition database(s)" +
" for all tables");
return;
}
assert !info.isDone();
if (info.getCurrentPartition() == null) {
for (PartitionId partition : repNode.getPartitions()) {
if (!info.isCompleted(partition)) {
info.setCurrentPartition(partition);
break;
}
}
}
final PartitionId currentPartition =
info.getCurrentPartition();
if (currentPartition == null) {
logger.log(Level.FINE,
"Completed cleaning partition database(s) " +
"for {0}", info.getTargetTableId());
info.setDone();
} else {
streamHandle =
MigrationStreamHandle.initialize(repNode,
currentPartition,
txn);
// delete some...
if (deleteABlock(info, opHandler, txn)) {
logger.log(Level.FINE,
"Completed cleaning {0} for {1}",
new Object[] {currentPartition,
info.getTargetTableId()});
// Done with this partition
info.completeCurrentPartition();
}
info.completePass();
}
infoMap.persist(infoDb, txn);
if (streamHandle != null) {
streamHandle.prepare();
}
txn.commit();
txn = null;
} finally {
tableManager.clearBusyMaintenance();
TxnUtil.abort(txn);
if (streamHandle != null) {
streamHandle.done();
}
}
}
}
/*
* Deletes up to DELETE_BATCH_SIZE number of records from some
* primary (partition) database.
*/
private boolean deleteABlock(DeletedTableInfo info,
MultiDeleteTableHandlerInternal opHandler,
Transaction txn) {
final MultiDeleteTable op =
new MultiDeleteTable(info.getParentKeyBytes(),
info.getTargetTableId(),
info.getMajorPathComplete(),
DELETE_BATCH_SIZE,
info.getCurrentKeyBytes());
final PartitionId currentPartition = info.getCurrentPartition();
final Result result;
try {
result = opHandler.execute(op, txn, currentPartition);
} catch (IllegalStateException ise) {
/*
* This can occur if a partition has moved while being cleaned.
* We can tell by checking whether the partition is in the
* local topology. We need to check the local topology because
* a migration may be in the transfer of ownership protocol
* (see MigrationSource) and the partition is thought to still
* be here by the "official" topo.
*/
final Topology topo = repNode.getLocalTopology();
if ((topo != null) &&
(topo.getPartitionMap().get(currentPartition) == null)) {
/* partition moved, so done */
logger.log(Level.INFO,
"Cleaning of partition {0} ended, partition " +
"has moved", currentPartition);
info.setCurrentKeyBytes(null);
return true;
}
throw ise;
}
final int num = result.getNDeletions();
logger.log(Level.FINE, "Deleted {0} records in partition {1}{2}",
new Object[]{num, currentPartition,
(num < DELETE_BATCH_SIZE ?
", partition is complete" : "")});
if (num < DELETE_BATCH_SIZE) {
/* done with this partition */
info.setCurrentKeyBytes(null);
return true;
}
info.setCurrentKeyBytes(op.getLastDeleted());
return false;
}
/*
* Special internal OP which bypasses security checks and the
* deleting-table check.
*/
private class MultiDeleteTableHandlerInternal
extends MultiDeleteTableHandler {
MultiDeleteTableHandlerInternal(OperationHandler handler) {
super(handler);
}
/*
* Overriding verifyTableAccess() will bypass keyspace and
* security checks.
*/
@Override
public void verifyTableAccess(MultiDeleteTable op) { }
/*
* Override getAndCheckTable() to allow return of deleting tables.
*/
@Override
protected TableImpl getAndCheckTable(final long tableId) {
final TableImpl table = tableManager.getTableInternal(tableId);
/* TODO - if table is null there is some internal error ??? */
assert table != null;
return table;
}
}
}