// bitronix.tm.recovery.Recoverer Maven / Gradle / Ivy
/*
* Copyright (C) 2006-2013 Bitronix Software (http://www.bitronix.be)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package bitronix.tm.recovery;
import bitronix.tm.BitronixXid;
import bitronix.tm.TransactionManagerServices;
import bitronix.tm.internal.LogDebugCheck;
import bitronix.tm.internal.XAResourceHolderState;
import bitronix.tm.journal.JournalRecord;
import bitronix.tm.journal.TransactionLogRecord;
import bitronix.tm.resource.ResourceLoader;
import bitronix.tm.resource.ResourceRegistrar;
import bitronix.tm.resource.common.XAResourceProducer;
import bitronix.tm.utils.Decoder;
import bitronix.tm.utils.ManagementRegistrar;
import bitronix.tm.utils.Service;
import bitronix.tm.utils.Uid;
import jakarta.transaction.Status;
import javax.transaction.xa.XAException;
import javax.transaction.xa.XAResource;
import javax.transaction.xa.Xid;
import java.io.IOException;
import java.util.*;
import java.util.Map.Entry;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.logging.Level;
/**
* Recovery process implementation. Here is Mike Spille's description of XA recovery:
*
* Straight Line Recovery:
*
* - 1. Find transactions that the TM considers dangling and unresolved
* - 2. Find and reconstitute any {@link XAResource}s which were being used when chunk blowing occured.
* - 3. Call the
recover()
method on each of these {@link XAResource}s.
* - 4. Throw out any {@link Xid}'s in the {@link XAResource}' recover lists which are not owned by this TM.
* - 5. Correlate {@link Xid}'s that the TM knows about with remaining {@link Xid}'s that the {@link XAResource}s
* reported.
* - 6. For {@link XAResource} {@link Xid}'s that match the global transaction ID which the TM found dangling with
* a "Committing..." record, call
commit()
on those {@link XAResource}s for those {@link Xid}s.
* - 7. For {@link XAResource} {@link Xid}'s that do not match any dangling "Committing..." records, call
*
rollback()
on those {@link XAResource}s for those {@link Xid}s.
*
* Exceptional conditions:
*
* - 1. For any
rollback()
calls from step 6 which reported a Heuristic Commit, you are in danger or
* doubt, so run in circles, scream and shout.
* - 2. For any
commit()
calls from step 7 which reported a Heuristic Rollback, you are in danger or
* doubt, so run in circles, scream and shout.
* - 3. For any resource you can't reconstitute in in step #2, or who fails on recover in step #3, or who reports
* anything like an XAER_RMFAILURE in step 6 or step 7, keep trying to contact them in some implementation
* defined manner.
* - 4. For any heuristic outcome you see reported from an XAResource, call
forget()
for that
* {@link XAResource}/{@link Xid} pair so that the resource can stop holding onto a reference to that transaction
*
* To achieve this, {@link Recoverer} must have access to all previously used resources, even if the journal contains
* no trace of some of them. There are two ways of achieving this: either you use the {@link ResourceLoader} to configure
* all your resources and everything will be working automatically or by making sure resources are re-created and re-registered.
* Those are the three steps of the Bitronix implementation:
*
* - call
recover()
on all known resources (Mike's steps 1 to 5)
* - commit dangling COMMITTING transactions (Mike's step 6)
* - rollback any remaining recovered transaction (Mike's step 7)
*
*
* @author Ludovic Orban
*/
public class Recoverer
implements Runnable, Service, RecovererMBean
{
private static final java.util.logging.Logger log = java.util.logging.Logger.getLogger(Recoverer.class.toString());
private final Map registeredResources = new HashMap<>();
private final Map> recoveredXidSets = new HashMap<>();
private final AtomicBoolean isRunning = new AtomicBoolean(false);
private final String jmxName;
private volatile Exception completionException;
private volatile int committedCount;
private volatile int rolledbackCount;
private volatile int executionsCount;
/**
* Constructor Recoverer creates a new Recoverer instance.
*/
public Recoverer()
{
String serverId = TransactionManagerServices.getConfiguration()
.getServerId();
if (serverId == null)
{
serverId = "";
}
this.jmxName = "bitronix.tm:type=Recoverer,ServerId=" + ManagementRegistrar.makeValidName(serverId);
ManagementRegistrar.register(jmxName, this);
}
/**
* Run the recovery process. This method is automatically called by the transaction manager, you should never
* call it manually.
*/
@Override
public void run()
{
if (!isRunning.compareAndSet(false, true))
{
log.info("recoverer is already running, abandoning this recovery request");
return;
}
try
{
committedCount = 0;
rolledbackCount = 0;
long oldestTransactionTimestamp = Long.MAX_VALUE;
// Collect dangling records from journal, must run before oldestTransactionTimestamp is calculated
Map danglingRecords = TransactionManagerServices.getJournal()
.collectDanglingRecords();
// Query resources from ResourceRegistrar
synchronized (ResourceRegistrar.class)
{
for (String name : ResourceRegistrar.getResourcesUniqueNames())
{
registeredResources.put(name, ResourceRegistrar.get(name));
}
if (TransactionManagerServices.isTransactionManagerRunning())
{
oldestTransactionTimestamp = TransactionManagerServices.getTransactionManager()
.getOldestInFlightTransactionTimestamp();
}
}
// 1. call recover on all known resources
recoverAllResources();
// 2. commit dangling COMMITTING transactions
Set committedGtrids = commitDanglingTransactions(oldestTransactionTimestamp, danglingRecords);
committedCount = committedGtrids.size();
// 3. rollback any remaining recovered transaction
rolledbackCount = rollbackAbortedTransactions(oldestTransactionTimestamp, committedGtrids);
if (executionsCount == 0 || committedCount > 0 || rolledbackCount > 0)
{
log.info("recovery committed " + committedCount + " dangling transaction(s) and rolled back " + rolledbackCount +
" aborted transaction(s) on " + registeredResources.size() + " resource(s) [" + getRegisteredResourcesUniqueNames() + "]" +
((TransactionManagerServices.getConfiguration()
.isCurrentNodeOnlyRecovery()) ? " (restricted to serverId '" + TransactionManagerServices.getConfiguration()
.getServerId() + "')" : ""));
}
else if (LogDebugCheck.isDebugEnabled())
{
log.finer("recovery committed " + committedCount + " dangling transaction(s) and rolled back " + rolledbackCount +
" aborted transaction(s) on " + registeredResources.size() + " resource(s) [" + getRegisteredResourcesUniqueNames() + "]" +
((TransactionManagerServices.getConfiguration()
.isCurrentNodeOnlyRecovery()) ? " (restricted to serverId '" + TransactionManagerServices.getConfiguration()
.getServerId() + "')" : ""));
}
this.completionException = null;
}
catch (Exception ex)
{
this.completionException = ex;
log.log(Level.WARNING, "recovery failed, registered resource(s): " + getRegisteredResourcesUniqueNames(), ex);
}
finally
{
recoveredXidSets.clear();
registeredResources.clear();
executionsCount++;
isRunning.set(false);
}
}
/**
* Recover all configured resources and fill the recoveredXidSets
with all recovered XIDs.
* Step 1.
*/
private void recoverAllResources()
{
// a cloned registeredResources Map must be iterated as the original one can be modified in the loop
for (Map.Entry entry : new HashMap<>(registeredResources).entrySet())
{
String uniqueName = entry.getKey();
XAResourceProducer producer = entry.getValue();
try
{
if (LogDebugCheck.isDebugEnabled())
{
log.finer("performing recovery on " + uniqueName);
}
Set xids = recover(producer);
if (LogDebugCheck.isDebugEnabled())
{
log.finer("recovered " + xids.size() + " XID(s) from resource " + uniqueName);
}
recoveredXidSets.put(uniqueName, xids);
producer.setFailed(false);
}
catch (XAException ex)
{
producer.setFailed(true);
registeredResources.remove(uniqueName);
String extraErrorDetails = TransactionManagerServices.getExceptionAnalyzer()
.extractExtraXAExceptionDetails(ex);
log.log(Level.WARNING, "error running recovery on resource '" + uniqueName + "', resource marked as failed (background recoverer will retry recovery)" +
" (error=" + Decoder.decodeXAExceptionErrorCode(ex) + ")" + (extraErrorDetails == null ? "" : ", extra error=" + extraErrorDetails), ex);
}
catch (Exception ex)
{
if (producer != null)
{
producer.setFailed(true);
}
registeredResources.remove(uniqueName);
log.log(Level.WARNING, "error running recovery on resource '" + uniqueName + "', resource marked as failed (background recoverer will retry recovery)", ex);
}
}
}
/**
* Commit transactions that have a dangling COMMITTING record in the journal.
* Transactions younger than oldestTransactionTimestamp are ignored.
* Step 2.
*
* @param oldestTransactionTimestamp
* the timestamp of the oldest transaction still in-flight.
* @param danglingRecords
* a Map using Uid objects GTRID as key and {@link TransactionLogRecord} as value.
*
* @return a Set of all committed GTRIDs encoded as strings.
*
* @throws java.io.IOException
* if there is an I/O error reading the journal.
* @throws RecoveryException
* if an error preventing recovery happened.
*/
private Set commitDanglingTransactions(long oldestTransactionTimestamp, Map danglingRecords) throws IOException, RecoveryException
{
Set committedGtrids = new HashSet<>();
if (LogDebugCheck.isDebugEnabled())
{
log.finer("found " + danglingRecords.size() + " dangling record(s) in journal");
}
Iterator> it = danglingRecords.entrySet()
.iterator();
while (it.hasNext())
{
Entry entry = it.next();
Uid gtrid = entry.getKey();
JournalRecord tlog = entry.getValue();
Set uniqueNames = tlog.getUniqueNames();
Set danglingTransactions = getDanglingTransactionsInRecoveredXids(uniqueNames, tlog.getGtrid());
long txTimestamp = gtrid.extractTimestamp();
if (LogDebugCheck.isDebugEnabled())
{
log.finer("recovered XID timestamp: " + txTimestamp + " - oldest in-flight TX timestamp: " + oldestTransactionTimestamp);
}
if (txTimestamp < oldestTransactionTimestamp)
{
if (LogDebugCheck.isDebugEnabled())
{
log.finer("committing dangling transaction with GTRID " + gtrid);
}
commit(danglingTransactions);
if (LogDebugCheck.isDebugEnabled())
{
log.finer("committed dangling transaction with GTRID " + gtrid);
}
committedGtrids.add(gtrid);
Set participatingUniqueNames = filterParticipatingUniqueNamesInRecoveredXids(uniqueNames);
if (!participatingUniqueNames.isEmpty())
{
if (LogDebugCheck.isDebugEnabled())
{
log.finer(
"updating journal's transaction with GTRID " + gtrid + " status to COMMITTED for names [" + buildUniqueNamesString(participatingUniqueNames) + "]");
}
TransactionManagerServices.getJournal()
.log(Status.STATUS_COMMITTED, tlog.getGtrid(), participatingUniqueNames);
}
else
{
if (LogDebugCheck.isDebugEnabled())
{
log.finer("not updating journal's transaction with GTRID " + gtrid +
" status to COMMITTED as no resource could be found (incremental recovery will need to clean this)");
}
committedGtrids.remove(gtrid);
}
}
else
{
if (LogDebugCheck.isDebugEnabled())
{
log.finer("skipping in-flight transaction with GTRID " + gtrid);
}
}
}
if (LogDebugCheck.isDebugEnabled())
{
log.finer("committed " + committedGtrids.size() + " dangling transaction(s)");
}
return committedGtrids;
}
/**
* Rollback branches whose {@link Xid} has been recovered on the resource but hasn't been committed.
* Those are the 'aborted' transactions of the Presumed Abort protocol.
* Step 3.
*
* @param oldestTransactionTimestamp
* the timestamp of the oldest transaction still in-flight.
* @param committedGtrids
* a set of {@link Uid}s already committed on this resource.
*
* @return the rolled back branches count.
*
* @throws RecoveryException
* if an error preventing recovery happened.
*/
private int rollbackAbortedTransactions(long oldestTransactionTimestamp, Set committedGtrids) throws RecoveryException
{
if (LogDebugCheck.isDebugEnabled())
{
log.finer("rolling back aborted branch(es)");
}
int rollbackCount = 0;
for (Map.Entry> entry : recoveredXidSets.entrySet())
{
String uniqueName = entry.getKey();
Set recoveredXids = entry.getValue();
if (LogDebugCheck.isDebugEnabled())
{
log.finer("checking " + recoveredXids.size() + " branch(es) on " + uniqueName + " for rollback");
}
int count = rollbackAbortedBranchesOfResource(oldestTransactionTimestamp, uniqueName, recoveredXids, committedGtrids);
if (LogDebugCheck.isDebugEnabled())
{
log.finer("checked " + recoveredXids.size() + " branch(es) on " + uniqueName + " for rollback");
}
rollbackCount += count;
}
if (LogDebugCheck.isDebugEnabled())
{
log.finer("rolled back " + rollbackCount + " aborted branch(es)");
}
return rollbackCount;
}
/**
* Build a string with comma-separated resources unique names.
*
* @return the string.
*/
private String getRegisteredResourcesUniqueNames()
{
return buildUniqueNamesString(registeredResources.keySet());
}
/**
* Run the recovery process on the target resource.
* Step 1.
*
* @param producer
* the {@link XAResourceProducer} to recover.
*
* @return a Set of BitronixXids.
*
* @throws javax.transaction.xa.XAException
* if {@link XAResource#recover(int)} call fails.
* @throws RecoveryException
* if an error preventing recovery happened.
*/
private Set recover(XAResourceProducer producer) throws XAException, RecoveryException
{
if (producer == null)
{
throw new IllegalArgumentException("recoverable resource cannot be null");
}
try
{
if (LogDebugCheck.isDebugEnabled())
{
log.finer("running recovery on " + producer);
}
XAResourceHolderState xaResourceHolderState = producer.startRecovery();
return RecoveryHelper.recover(xaResourceHolderState);
}
finally
{
producer.endRecovery();
}
}
/**
* Return {@link DanglingTransaction}s with {@link Xid}s corresponding to the GTRID parameter found in resources
* specified by their uniqueName
s.
* recoverAllResources
must have been called before or else the returned list will always be empty.
* Step 2.
*
* @param uniqueNames
* a set of uniqueName
s.
* @param gtrid
* the GTRID to look for.
*
* @return a set of {@link DanglingTransaction}s.
*/
private Set getDanglingTransactionsInRecoveredXids(Set uniqueNames, Uid gtrid)
{
Set danglingTransactions = new HashSet<>();
for (String uniqueName : uniqueNames)
{
if (LogDebugCheck.isDebugEnabled())
{
log.finer("finding dangling transaction(s) in recovered XID(s) of resource " + uniqueName);
}
Set recoveredXids = recoveredXidSets.get(uniqueName);
if (recoveredXids == null)
{
if (LogDebugCheck.isDebugEnabled())
{
log.finer("resource " + uniqueName + " did not recover, skipping commit");
}
continue;
}
for (BitronixXid recoveredXid : recoveredXids)
{
if (gtrid.equals(recoveredXid.getGlobalTransactionIdUid()))
{
if (LogDebugCheck.isDebugEnabled())
{
log.finer("found a recovered XID matching dangling log's GTRID " + gtrid + " in resource " + uniqueName);
}
danglingTransactions.add(new DanglingTransaction(uniqueName, recoveredXid));
}
}
}
return danglingTransactions;
}
/**
* Commit all branches of a dangling transaction.
* Step 2.
*
* @param danglingTransactions
* a set of {@link DanglingTransaction}s to commit.
*
* @throws RecoveryException
* if an error preventing recovery happened.
*/
private void commit(Set danglingTransactions) throws RecoveryException
{
if (LogDebugCheck.isDebugEnabled())
{
log.finer(danglingTransactions.size() + " branch(es) to commit");
}
for (DanglingTransaction danglingTransaction : danglingTransactions)
{
Xid xid = danglingTransaction.getXid();
String uniqueName = danglingTransaction.getUniqueName();
if (LogDebugCheck.isDebugEnabled())
{
log.finer("committing branch with XID " + xid + " on " + uniqueName);
}
commit(uniqueName, xid);
}
}
/**
* Method filterParticipatingUniqueNamesInRecoveredXids ...
*
* @param uniqueNames
* of type Set
*
* @return Set
*/
private Set filterParticipatingUniqueNamesInRecoveredXids(Set uniqueNames)
{
Set recoveredUniqueNames = new HashSet<>();
for (String uniqueName : uniqueNames)
{
if (LogDebugCheck.isDebugEnabled())
{
log.finer("finding dangling transaction(s) in recovered XID(s) of resource " + uniqueName);
}
Set recoveredXids = recoveredXidSets.get(uniqueName);
if (recoveredXids == null)
{
if (LogDebugCheck.isDebugEnabled())
{
log.finer("cannot find resource '" + uniqueName + "' present in the journal, leaving it for incremental recovery");
}
}
else
{
recoveredUniqueNames.add(uniqueName);
}
}
return recoveredUniqueNames;
}
/**
* Method buildUniqueNamesString ...
*
* @param uniqueNames
* of type Set
*
* @return String
*/
private static String buildUniqueNamesString(Set uniqueNames)
{
StringBuilder resourcesUniqueNames = new StringBuilder();
Iterator it = uniqueNames.iterator();
while (it.hasNext())
{
String uniqueName = it.next();
resourcesUniqueNames.append(uniqueName);
if (it.hasNext())
{
resourcesUniqueNames.append(", ");
}
}
return resourcesUniqueNames.toString();
}
/**
* Rollback aborted branches of the resource specified by uniqueName.
* Step 3.
*
* @param oldestTransactionTimestamp
* the timestamp of the oldest transaction still in-flight.
* @param uniqueName
* the unique name of the resource on which to rollback branches.
* @param recoveredXids
* a set of {@link BitronixXid} recovered on the reource.
* @param committedGtrids
* a set of {@link Uid}s already committed on the resource.
*
* @return the rolled back branches count.
*
* @throws RecoveryException
* if an error preventing recovery happened.
*/
private int rollbackAbortedBranchesOfResource(long oldestTransactionTimestamp, String uniqueName, Set recoveredXids, Set committedGtrids) throws RecoveryException
{
int abortedCount = 0;
for (BitronixXid recoveredXid : recoveredXids)
{
if (committedGtrids.contains(recoveredXid.getGlobalTransactionIdUid()))
{
if (LogDebugCheck.isDebugEnabled())
{
log.finer("XID has been committed, skipping rollback: " + recoveredXid + " on " + uniqueName);
}
continue;
}
long txTimestamp = recoveredXid.getGlobalTransactionIdUid()
.extractTimestamp();
if (LogDebugCheck.isDebugEnabled())
{
log.finer("recovered XID timestamp: " + txTimestamp + " - oldest in-flight TX timestamp: " + oldestTransactionTimestamp);
}
if (txTimestamp >= oldestTransactionTimestamp)
{
if (LogDebugCheck.isDebugEnabled())
{
log.finer("skipping XID of in-flight transaction: " + recoveredXid);
}
continue;
}
if (LogDebugCheck.isDebugEnabled())
{
log.finer("rolling back in-doubt branch with XID " + recoveredXid + " on " + uniqueName);
}
boolean success = rollback(uniqueName, recoveredXid);
if (success)
{
abortedCount++;
}
}
return abortedCount;
}
/**
* Commit the specified branch of a dangling transaction.
* Step 2.
*
* @param uniqueName
* the unique name of the resource on which the commit should be done.
* @param xid
* the {@link Xid} to commit.
*
* @return true when commit was successful.
*
* @throws RecoveryException
* if an error preventing recovery happened.
*/
private boolean commit(String uniqueName, Xid xid) throws RecoveryException
{
XAResourceProducer producer = registeredResources.get(uniqueName);
try
{
XAResourceHolderState xaResourceHolderState = producer.startRecovery();
return RecoveryHelper.commit(xaResourceHolderState, xid);
}
finally
{
producer.endRecovery();
}
}
/**
* Rollback the specified branch of a dangling transaction.
* Step 3.
*
* @param uniqueName
* the unique name of the resource on which to rollback branches.
* @param xid
* the {@link Xid} to rollback.
*
* @return true when rollback was successful.
*
* @throws RecoveryException
* if an error preventing recovery happened.
*/
private boolean rollback(String uniqueName, Xid xid) throws RecoveryException
{
XAResourceProducer producer = registeredResources.get(uniqueName);
if (producer == null)
{
if (LogDebugCheck.isDebugEnabled())
{
log.finer("resource " + uniqueName + " has not recovered, skipping rollback");
}
return false;
}
try
{
XAResourceHolderState xaResourceHolderState = producer.startRecovery();
return RecoveryHelper.rollback(xaResourceHolderState, xid);
}
finally
{
producer.endRecovery();
}
}
/**
* Shutdown the service and free all held resources.
*/
@Override
public void shutdown()
{
ManagementRegistrar.unregister(jmxName);
}
/**
* Get the amount of transactions committed during the last recovery run.
*
* @return the amount of committed transactions.
*/
@Override
public int getCommittedCount()
{
return committedCount;
}
/**
* Get the amount of transactions rolled back during the last recovery run.
*
* @return the amount of rolled back transactions.
*/
@Override
public int getRolledbackCount()
{
return rolledbackCount;
}
/**
* Get the exception reported when recovery failed.
*
* @return the exception that made recovery fail or null if last recovery execution was successful.
*/
@Override
public Exception getCompletionException()
{
return completionException;
}
/**
* Get how many times the recoverer has run since the transaction manager started.
*
* @return how many times the recoverer has run since the transaction manager started.
*/
@Override
public int getExecutionsCount()
{
return executionsCount;
}
/**
* Check if the recoverer currently is running.
*
* @return true if the recoverer currently is running, false otherwise.
*/
@Override
public boolean isRunning()
{
return isRunning.get();
}
}