org.apache.cassandra.repair.RepairRunnable Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of cassandra-all Show documentation
Show all versions of cassandra-all Show documentation
The Apache Cassandra Project develops a highly scalable second-generation distributed database, bringing together Dynamo's fully distributed design and Bigtable's ColumnFamily-based data model.
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.cassandra.repair;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.UUID;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicReference;
import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import com.google.common.util.concurrent.AsyncFunction;
import com.google.common.util.concurrent.FutureCallback;
import com.google.common.util.concurrent.Futures;
import com.google.common.util.concurrent.ListenableFuture;
import com.google.common.util.concurrent.ListeningExecutorService;
import com.google.common.util.concurrent.MoreExecutors;
import org.apache.commons.lang3.time.DurationFormatUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.codahale.metrics.Timer;
import org.apache.cassandra.concurrent.JMXEnabledThreadPoolExecutor;
import org.apache.cassandra.concurrent.NamedThreadFactory;
import org.apache.cassandra.config.DatabaseDescriptor;
import org.apache.cassandra.metrics.RepairMetrics;
import org.apache.cassandra.gms.FailureDetector;
import org.apache.cassandra.repair.consistent.SyncStatSummary;
import org.apache.cassandra.db.Keyspace;
import org.apache.cassandra.cql3.QueryOptions;
import org.apache.cassandra.cql3.QueryProcessor;
import org.apache.cassandra.cql3.UntypedResultSet;
import org.apache.cassandra.cql3.statements.SelectStatement;
import org.apache.cassandra.db.ColumnFamilyStore;
import org.apache.cassandra.db.ConsistencyLevel;
import org.apache.cassandra.dht.Range;
import org.apache.cassandra.dht.Token;
import org.apache.cassandra.locator.EndpointsForRange;
import org.apache.cassandra.locator.InetAddressAndPort;
import org.apache.cassandra.locator.Replica;
import org.apache.cassandra.metrics.StorageMetrics;
import org.apache.cassandra.repair.consistent.CoordinatorSession;
import org.apache.cassandra.repair.messages.RepairOption;
import org.apache.cassandra.schema.SchemaConstants;
import org.apache.cassandra.service.ActiveRepairService;
import org.apache.cassandra.service.ActiveRepairService.ParentRepairStatus;
import org.apache.cassandra.service.ClientState;
import org.apache.cassandra.service.QueryState;
import org.apache.cassandra.service.StorageService;
import org.apache.cassandra.streaming.PreviewKind;
import org.apache.cassandra.tracing.TraceKeyspace;
import org.apache.cassandra.tracing.TraceState;
import org.apache.cassandra.tracing.Tracing;
import org.apache.cassandra.transport.messages.ResultMessage;
import org.apache.cassandra.utils.ByteBufferUtil;
import org.apache.cassandra.utils.DiagnosticSnapshotService;
import org.apache.cassandra.utils.FBUtilities;
import org.apache.cassandra.utils.UUIDGen;
import org.apache.cassandra.utils.WrappedRunnable;
import org.apache.cassandra.utils.progress.ProgressEvent;
import org.apache.cassandra.utils.progress.ProgressEventNotifier;
import org.apache.cassandra.utils.progress.ProgressEventType;
import org.apache.cassandra.utils.progress.ProgressListener;
public class RepairRunnable implements Runnable, ProgressEventNotifier
{
private static final Logger logger = LoggerFactory.getLogger(RepairRunnable.class);

// Collaborators and identity of this repair command; all assigned once in the constructor.
private final StorageService storageService;
private final int cmd;
private final RepairOption options;
private final String keyspace;
// Identifier passed to every ProgressListener together with each event ("repair:" + cmd).
private final String tag;

// Number of steps completed so far; reported in every ProgressEvent next to totalProgress.
private final AtomicInteger progressCounter = new AtomicInteger();
private final int totalProgress;
private final long creationTimeMillis = System.currentTimeMillis();
private final UUID parentSession = UUIDGen.getTimeUUID();

// Listeners notified by fireProgressEvent. Parameterized so typed iteration compiles.
private final List<ProgressListener> listeners = new ArrayList<>();
private static final AtomicInteger threadCounter = new AtomicInteger(1);
// First error handed to notifyError (set at most once via compareAndSet); fail() uses it as a fallback reason.
private final AtomicReference<Throwable> firstError = new AtomicReference<>(null);
private final Scheduler validationScheduler;
// Assigned in runMayThrow (null when the repair is not traced) and read again in complete().
private TraceState traceState;
/**
 * Builds the runnable for one repair command.
 *
 * @param storageService service used to resolve column families and local replicas
 * @param cmd            numeric id of the repair command; also used to build the progress tag
 * @param options        options of the repair to run
 * @param keyspace       name of the keyspace to repair
 */
public RepairRunnable(StorageService storageService, int cmd, RepairOption options, String keyspace)
{
    this.keyspace = keyspace;
    this.options = options;
    this.cmd = cmd;
    this.storageService = storageService;
    this.tag = "repair:" + cmd;
    this.validationScheduler = Scheduler.build(DatabaseDescriptor.getConcurrentMerkleTreeRequests());
    // Four fixed steps (get valid column families, calculate neighbors, validation, prepare for repair)
    // plus one step per range to repair.
    this.totalProgress = options.getRanges().size() + 4;
}
/**
 * Registers a listener that will receive the progress events fired by this repair.
 *
 * @param listener listener to add
 */
@Override
public void addProgressListener(ProgressListener listener)
{
    this.listeners.add(listener);
}
/**
 * Unregisters a listener so that it no longer receives progress events from this repair.
 *
 * @param listener listener to remove
 */
@Override
public void removeProgressListener(ProgressListener listener)
{
    this.listeners.remove(listener);
}
/**
 * Delivers {@code event} to every registered listener, tagged with this repair's tag.
 *
 * @param event event to broadcast
 */
protected void fireProgressEvent(ProgressEvent event)
{
    for (ProgressListener registered : listeners)
        registered.progress(tag, event);
}
/**
 * Logs {@code msg} at info level and publishes it to the listeners as a NOTIFICATION event
 * carrying the current progress count.
 *
 * @param msg text to log and broadcast
 */
public void notification(String msg)
{
    logger.info(msg);
    ProgressEvent notice = new ProgressEvent(ProgressEventType.NOTIFICATION, progressCounter.get(), totalProgress, msg);
    fireProgressEvent(notice);
}
/**
 * Reports that this repair was skipped and then finishes it through the success path.
 *
 * @param msg reason the repair was skipped
 */
private void skip(String msg)
{
    String skipNotice = "Repair " + parentSession + " skipped: " + msg;
    notification(skipNotice);
    success(msg);
}
/**
 * Fires a SUCCESS event, records the command as COMPLETED with {@code msg} as its only
 * status message, and runs the common completion steps.
 *
 * @param msg success message for listeners and the recorded status
 */
private void success(String msg)
{
    ProgressEvent successEvent = new ProgressEvent(ProgressEventType.SUCCESS, progressCounter.get(), totalProgress, msg);
    fireProgressEvent(successEvent);
    ActiveRepairService.instance.recordRepairStatus(cmd, ParentRepairStatus.COMPLETED, ImmutableList.of(msg));
    // null lets complete() build the default "finished in ..." message
    complete(null);
}
/**
 * Reports a repair error: logs it, bumps the repair exception metric, fires an ERROR event,
 * remembers it if it is the first error seen, and finally stores the failure for
 * non-preview repairs.
 *
 * @param error the failure to report; a {@code SomeRepairFailedException} is ignored
 */
public void notifyError(Throwable error)
{
    // exception should be ignored
    if (!(error instanceof SomeRepairFailedException))
    {
        logger.error("Repair {} failed:", parentSession, error);
        StorageMetrics.repairExceptions.inc();

        String description = String.format("Repair command #%d failed with error %s", cmd, error.getMessage());
        ProgressEvent errorEvent = new ProgressEvent(ProgressEventType.ERROR, progressCounter.get(), totalProgress, description);
        fireProgressEvent(errorEvent);
        firstError.compareAndSet(null, error);

        // since this can fail, update table only after updating in-memory and notification state
        maybeStoreParentRepairFailure(error);
    }
}
/**
 * Records this repair command as FAILED and runs the common completion steps.
 *
 * @param reason cause of the failure; when {@code null} the first error passed to
 *               {@link #notifyError(Throwable)} is used, or a generic text if there was none
 */
private void fail(String reason)
{
    if (reason == null)
    {
        Throwable error = firstError.get();
        if (error == null)
        {
            reason = "Some repair failed";
        }
        else
        {
            // A throwable may carry no message. ImmutableList.of rejects null elements, so a null
            // reason would throw here and complete() would never run; use toString() instead.
            String errorMessage = error.getMessage();
            reason = errorMessage != null ? errorMessage : error.toString();
        }
    }
    String completionMessage = String.format("Repair command #%d finished with error", cmd);
    // Note we rely on the first message being the reason for the failure
    // when inspecting this state from RepairRunner.queryForCompletedRepair
    ActiveRepairService.instance.recordRepairStatus(cmd, ParentRepairStatus.FAILED,
                                                    ImmutableList.of(reason, completionMessage));
    complete(completionMessage);
}
/**
 * Common end-of-repair steps: fires the COMPLETE event, logs the outcome, removes the parent
 * repair session, closes the trace session when the repair is traced, and updates the
 * keyspace repair-time metric.
 *
 * @param msg completion message; when {@code null} a "finished in <duration>" message is built
 */
private void complete(String msg)
{
    final long elapsedMillis = System.currentTimeMillis() - creationTimeMillis;
    final String completionMessage;
    if (msg != null)
    {
        completionMessage = msg;
    }
    else
    {
        String elapsedWords = DurationFormatUtils.formatDurationWords(elapsedMillis, true, true);
        completionMessage = String.format("Repair command #%d finished in %s", cmd, elapsedWords);
    }

    fireProgressEvent(new ProgressEvent(ProgressEventType.COMPLETE, progressCounter.get(), totalProgress, completionMessage));
    logger.info(options.getPreviewKind().logPrefix(parentSession) + completionMessage);
    ActiveRepairService.instance.removeParentRepairSession(parentSession);

    // read the field once so the null check and the uses below see the same value
    TraceState tracing = traceState;
    if (options.isTraced() && tracing != null)
    {
        for (ProgressListener registered : listeners)
            tracing.removeProgressListener(registered);

        // Because DebuggableThreadPoolExecutor#afterExecute and this callback
        // run in a nondeterministic order (within the same thread), the
        // TraceState may have been nulled out at this point. The TraceState
        // should be traceState, so just set it without bothering to check if it
        // actually was nulled out.
        Tracing.instance.set(tracing);
        Tracing.traceRepair(completionMessage);
        Tracing.instance.stopSession();
    }

    Keyspace.open(keyspace).metric.repairTime.update(elapsedMillis, TimeUnit.MILLISECONDS);
}
/**
 * Entry point of the repair command. Runs the repair and turns every outcome into progress
 * events: a {@code SkipRepairException} ends the command through the skip/success path, any
 * other exception or error is reported and ends it through the failure path.
 */
@Override
public void run()
{
    try
    {
        runMayThrow();
    }
    catch (SkipRepairException e)
    {
        skip(e.getMessage());
    }
    catch (Exception | Error e)
    {
        // report first so the error is recorded before the command is marked as failed
        notifyError(e);
        fail(e.getMessage());
    }
}
/**
 * Drives the repair: records it as IN_PROGRESS, resolves the tables to repair, sets up
 * tracing, announces the start, computes neighbors and ranges, stores the parent repair
 * start, prepares the participants and finally launches the repair itself.
 *
 * @throws Exception if any step fails; handled by {@link #run()}
 */
private void runMayThrow() throws Exception
{
    ActiveRepairService.instance.recordRepairStatus(cmd, ParentRepairStatus.IN_PROGRESS, ImmutableList.of());

    // Typed list: the lambda below reads ColumnFamilyStore.name, which a raw List cannot provide.
    List<ColumnFamilyStore> columnFamilies = getColumnFamilies();
    String[] cfnames = columnFamilies.stream().map(cfs -> cfs.name).toArray(String[]::new);

    this.traceState = maybeCreateTraceState(columnFamilies);
    notifyStarting();

    NeighborsAndRanges neighborsAndRanges = getNeighborsAndRanges();
    maybeStoreParentRepairStart(cfnames);
    prepare(columnFamilies, neighborsAndRanges.participants, neighborsAndRanges.shouldExcludeDeadParticipants);
    repair(cfnames, neighborsAndRanges);
}
/**
 * Resolves the column families named in the repair options to the valid stores of the
 * keyspace and counts this as one completed progress step.
 *
 * @return the valid column family stores, never empty
 * @throws IOException         if resolving the column families fails
 * @throws SkipRepairException if no valid column family exists, so the repair is skipped
 */
private List<ColumnFamilyStore> getColumnFamilies() throws IOException
{
    String[] columnFamilies = options.getColumnFamilies().toArray(new String[0]);
    Iterable<ColumnFamilyStore> validColumnFamilies = storageService.getValidColumnFamilies(false, false, keyspace, columnFamilies);
    progressCounter.incrementAndGet();

    if (Iterables.isEmpty(validColumnFamilies))
        throw new SkipRepairException(String.format("%s Empty keyspace, skipping repair: %s", parentSession, keyspace));
    return Lists.newArrayList(validColumnFamilies);
}
/**
 * Starts a repair trace session when tracing is requested in the options.
 *
 * The new trace state gets activity notification enabled for this repair's tag and every
 * registered progress listener attached, and a "RepairTracePolling" thread is started for
 * the session.
 *
 * @param columnFamilyStores tables being repaired, listed in the trace parameters;
 *                           must not be empty when tracing is enabled
 * @return the new trace state, or {@code null} when the repair is not traced
 */
private TraceState maybeCreateTraceState(Iterable<ColumnFamilyStore> columnFamilyStores)
{
    if (!options.isTraced())
        return null;

    StringBuilder cfsb = new StringBuilder();
    for (ColumnFamilyStore cfs : columnFamilyStores)
        cfsb.append(", ").append(cfs.keyspace.getName()).append(".").append(cfs.name);

    UUID sessionId = Tracing.instance.newSession(Tracing.TraceType.REPAIR);
    // substring(2) drops the leading ", " separator
    TraceState state = Tracing.instance.begin("repair", ImmutableMap.of("keyspace", keyspace, "columnFamilies",
                                                                        cfsb.substring(2)));
    state.enableActivityNotification(tag);
    for (ProgressListener listener : listeners)
        state.addProgressListener(listener);

    Thread queryThread = createQueryThread(cmd, sessionId);
    queryThread.setName("RepairTracePolling");
    queryThread.start();
    return state;
}
/**
 * Announces the start of the repair command in the log, in the repair trace and to the
 * progress listeners (as a START event).
 */
private void notifyStarting()
{
    final String startMessage = String.format("Starting repair command #%d (%s), repairing keyspace %s with %s",
                                              cmd, parentSession, keyspace, options);
    logger.info(startMessage);
    Tracing.traceRepair(startMessage);

    ProgressEvent startEvent = new ProgressEvent(ProgressEventType.START, 0, 100, startMessage);
    fireProgressEvent(startEvent);
}
/**
 * Computes, for every requested range, the neighbors taking part in the repair and collects
 * them together with the ranges they have in common. Counts as one completed progress step.
 *
 * For a forced repair, neighbors that the failure detector reports as not alive are removed
 * from the participants.
 *
 * @return the participants, the common ranges, and whether dead participants were excluded
 * @throws RuntimeException    if a range has no neighbors and unreplicated keyspaces are not ignored
 * @throws SkipRepairException if unreplicated keyspaces are ignored and no range had any neighbor
 */
private NeighborsAndRanges getNeighborsAndRanges()
{
    Set<InetAddressAndPort> allNeighbors = new HashSet<>();
    List<CommonRange> commonRanges = new ArrayList<>();

    //pre-calculate output of getLocalReplicas and pass it to getNeighbors to increase performance and prevent
    //calculation multiple times
    Iterable<Range<Token>> keyspaceLocalRanges = storageService.getLocalReplicas(keyspace).ranges();

    for (Range<Token> range : options.getRanges())
    {
        EndpointsForRange neighbors = ActiveRepairService.getNeighbors(keyspace, keyspaceLocalRanges, range,
                                                                       options.getDataCenters(),
                                                                       options.getHosts());
        if (neighbors.isEmpty())
        {
            if (options.ignoreUnreplicatedKeyspaces())
            {
                logger.info("{} Found no neighbors for range {} for {} - ignoring since repairing with --ignore-unreplicated-keyspaces", parentSession, range, keyspace);
                continue;
            }
            else
            {
                throw new RuntimeException(String.format("Nothing to repair for %s in %s - aborting", range, keyspace));
            }
        }
        addRangeToNeighbors(commonRanges, range, neighbors);
        allNeighbors.addAll(neighbors.endpoints());
    }

    if (options.ignoreUnreplicatedKeyspaces() && allNeighbors.isEmpty())
    {
        throw new SkipRepairException(String.format("Nothing to repair for %s in %s - unreplicated keyspace is ignored since repair was called with --ignore-unreplicated-keyspaces",
                                                    options.getRanges(),
                                                    keyspace));
    }

    progressCounter.incrementAndGet();

    boolean shouldExcludeDeadParticipants = options.isForcedRepair();
    if (shouldExcludeDeadParticipants)
    {
        Set<InetAddressAndPort> actualNeighbors = Sets.newHashSet(Iterables.filter(allNeighbors, FailureDetector.instance::isAlive));
        // only report exclusion when at least one neighbor was actually dropped
        shouldExcludeDeadParticipants = !allNeighbors.equals(actualNeighbors);
        allNeighbors = actualNeighbors;
    }
    return new NeighborsAndRanges(shouldExcludeDeadParticipants, allNeighbors, commonRanges);
}
/**
 * Stores the start of the parent repair session, unless this is a preview repair.
 *
 * @param cfnames names of the tables being repaired
 */
private void maybeStoreParentRepairStart(String[] cfnames)
{
    if (options.isPreview())
        return;

    SystemDistributedKeyspace.startParentRepair(parentSession, keyspace, cfnames, options);
}
private void maybeStoreParentRepairSuccess(Collection> successfulRanges)
{
if (!options.isPreview())
{
SystemDistributedKeyspace.successfulParentRepair(parentSession, successfulRanges);
}
}
/**
 * Stores the failure of the parent repair session, unless this is a preview repair.
 *
 * @param error the error that made the repair fail
 */
private void maybeStoreParentRepairFailure(Throwable error)
{
    if (options.isPreview())
        return;

    SystemDistributedKeyspace.failParentRepair(parentSession, error);
}
/**
 * Prepares the parent repair session with the local broadcast address as coordinator, timing
 * the call with the keyspace's repair-prepare timer. Counts as one completed progress step
 * when preparation succeeds.
 *
 * @param columnFamilies tables taking part in the repair
 * @param allNeighbors   endpoints taking part in the repair
 * @param force          forwarded to {@code prepareForRepair}; the caller passes whether dead
 *                       participants were excluded
 */
private void prepare(List<ColumnFamilyStore> columnFamilies, Set<InetAddressAndPort> allNeighbors, boolean force)
{
    try (Timer.Context ignore = Keyspace.open(keyspace).metric.repairPrepareTime.time())
    {
        ActiveRepairService.instance.prepareForRepair(parentSession, FBUtilities.getBroadcastAddressAndPort(), allNeighbors, options, force, columnFamilies);
        progressCounter.incrementAndGet();
    }
}
/**
 * Dispatches to the repair flavour selected by the options: preview first, then incremental,
 * otherwise a normal repair.
 *
 * @param cfnames            names of the tables to repair
 * @param neighborsAndRanges participants and common ranges computed for this repair
 */
private void repair(String[] cfnames, NeighborsAndRanges neighborsAndRanges)
{
    if (options.isPreview())
    {
        previewRepair(parentSession,
                      creationTimeMillis,
                      neighborsAndRanges.filterCommonRanges(keyspace, cfnames),
                      neighborsAndRanges.participants,
                      cfnames);
        return;
    }

    if (options.isIncremental())
    {
        // incremental repair filters the common ranges itself
        incrementalRepair(parentSession,
                          creationTimeMillis,
                          traceState,
                          neighborsAndRanges,
                          neighborsAndRanges.participants,
                          cfnames);
        return;
    }

    normalRepair(parentSession,
                 creationTimeMillis,
                 traceState,
                 neighborsAndRanges.filterCommonRanges(keyspace, cfnames),
                 neighborsAndRanges.participants,
                 cfnames);
}
private void normalRepair(UUID parentSession,
long startTime,
TraceState traceState,
List commonRanges,
Set preparedEndpoints,
String... cfnames)
{
// Set up RepairJob executor for this repair command.
ListeningExecutorService executor = createExecutor();
// Setting the repairedAt time to UNREPAIRED_SSTABLE causes the repairedAt times to be preserved across streamed sstables
final ListenableFuture> allSessions = submitRepairSessions(parentSession, false, executor, validationScheduler, commonRanges, cfnames);
// After all repair sessions completes(successful or not),
// run anticompaction if necessary and send finish notice back to client
final Collection> successfulRanges = new ArrayList<>();
final AtomicBoolean hasFailure = new AtomicBoolean();
ListenableFuture repairResult = Futures.transformAsync(allSessions, new AsyncFunction, Object>()
{
@SuppressWarnings("unchecked")
public ListenableFuture apply(List results)
{
logger.debug("Repair result: {}", results);
// filter out null(=failed) results and get successful ranges
for (RepairSessionResult sessionResult : results)
{
if (sessionResult != null)
{
// don't record successful repair if we had to skip ranges
if (!sessionResult.skippedReplicas)
{
successfulRanges.addAll(sessionResult.ranges);
}
}
else
{
hasFailure.compareAndSet(false, true);
}
}
return Futures.immediateFuture(null);
}
}, MoreExecutors.directExecutor());
Futures.addCallback(repairResult,
new RepairCompleteCallback(parentSession,
successfulRanges,
preparedEndpoints,
startTime,
traceState,
hasFailure,
executor),
MoreExecutors.directExecutor());
}
private void incrementalRepair(UUID parentSession,
long startTime,
TraceState traceState,
NeighborsAndRanges neighborsAndRanges,
Set preparedEndpoints,
String... cfnames)
{
// the local node also needs to be included in the set of participants, since coordinator sessions aren't persisted
Set allParticipants = ImmutableSet.builder()
.addAll(neighborsAndRanges.participants)
.add(FBUtilities.getBroadcastAddressAndPort())
.build();
// Not necessary to include self for filtering. The common ranges only contains neighbhor node endpoints.
List allRanges = neighborsAndRanges.filterCommonRanges(keyspace, cfnames);
CoordinatorSession coordinatorSession = ActiveRepairService.instance.consistent.coordinated.registerSession(parentSession, allParticipants, neighborsAndRanges.shouldExcludeDeadParticipants);
ListeningExecutorService executor = createExecutor();
AtomicBoolean hasFailure = new AtomicBoolean(false);
ListenableFuture repairResult = coordinatorSession.execute(() -> submitRepairSessions(parentSession, true, executor, validationScheduler, allRanges, cfnames),
hasFailure);
Collection> ranges = new HashSet<>();
for (Collection> range : Iterables.transform(allRanges, cr -> cr.ranges))
{
ranges.addAll(range);
}
Futures.addCallback(repairResult,
new RepairCompleteCallback(parentSession, ranges, preparedEndpoints, startTime, traceState, hasFailure, executor),
MoreExecutors.directExecutor());
}
private void previewRepair(UUID parentSession,
long startTime,
List commonRanges,
Set preparedEndpoints,
String... cfnames)
{
logger.debug("Starting preview repair for {}", parentSession);
// Set up RepairJob executor for this repair command.
ListeningExecutorService executor = createExecutor();
final ListenableFuture> allSessions = submitRepairSessions(parentSession, false, executor, validationScheduler, commonRanges, cfnames);
Futures.addCallback(allSessions, new FutureCallback>()
{
public void onSuccess(List results)
{
try
{
if (results == null || results.stream().anyMatch(s -> s == null))
{
// something failed
fail(null);
return;
}
PreviewKind previewKind = options.getPreviewKind();
Preconditions.checkState(previewKind != PreviewKind.NONE, "Preview is NONE");
SyncStatSummary summary = new SyncStatSummary(true);
summary.consumeSessionResults(results);
final String message;
if (summary.isEmpty())
{
message = previewKind == PreviewKind.REPAIRED ? "Repaired data is in sync" : "Previewed data was in sync";
}
else
{
message = (previewKind == PreviewKind.REPAIRED ? "Repaired data is inconsistent\n" : "Preview complete\n") + summary.toString();
RepairMetrics.previewFailures.inc();
if (previewKind == PreviewKind.REPAIRED)
maybeSnapshotReplicas(parentSession, keyspace, results);
}
notification(message);
success("Repair preview completed successfully");
ActiveRepairService.instance.cleanUp(parentSession, preparedEndpoints);
}
catch (Throwable t)
{
logger.error("Error completing preview repair", t);
onFailure(t);
}
finally
{
executor.shutdown();
}
}
public void onFailure(Throwable t)
{
notifyError(t);
fail("Error completing preview repair: " + t.getMessage());
executor.shutdown();
}
}, MoreExecutors.directExecutor());
}
/**
 * Snapshots the replicas of every table for which the preview found differences, when
 * snapshotting on repaired-data mismatch is enabled. A table that already has the snapshot
 * locally is skipped. Failures are logged and never propagated.
 *
 * @param parentSession id of the parent repair session, used in log messages
 * @param keyspace      keyspace the repaired tables belong to
 * @param results       results of the preview repair sessions
 */
private void maybeSnapshotReplicas(UUID parentSession, String keyspace, List<RepairSessionResult> results)
{
    if (!DatabaseDescriptor.snapshotOnRepairedDataMismatch())
        return;

    try
    {
        Set<String> mismatchingTables = new HashSet<>();
        Set<InetAddressAndPort> nodes = new HashSet<>();
        for (RepairSessionResult sessionResult : results)
        {
            for (RepairResult repairResult : emptyIfNull(sessionResult.repairJobResults))
            {
                for (SyncStat stat : emptyIfNull(repairResult.stats))
                {
                    if (stat.numberOfDifferences > 0)
                        mismatchingTables.add(repairResult.desc.columnFamily);
                    // snapshot all replicas, even if they don't have any differences
                    nodes.add(stat.nodes.coordinator);
                    nodes.add(stat.nodes.peer);
                }
            }
        }

        String snapshotName = DiagnosticSnapshotService.getSnapshotName(DiagnosticSnapshotService.REPAIRED_DATA_MISMATCH_SNAPSHOT_PREFIX);
        for (String table : mismatchingTables)
        {
            // look the store up once; it is needed for both the existence check and the snapshot request
            ColumnFamilyStore cfs = Keyspace.open(keyspace).getColumnFamilyStore(table);
            // we can just check snapshot existence locally since the repair coordinator is always a replica (unlike in the read case)
            if (!cfs.snapshotExists(snapshotName))
            {
                logger.info("{} Snapshotting {}.{} for preview repair mismatch with tag {} on instances {}",
                            options.getPreviewKind().logPrefix(parentSession),
                            keyspace, table, snapshotName, nodes);
                DiagnosticSnapshotService.repairedDataMismatch(cfs.metadata(), nodes);
            }
            else
            {
                logger.info("{} Not snapshotting {}.{} - snapshot {} exists",
                            options.getPreviewKind().logPrefix(parentSession),
                            keyspace, table, snapshotName);
            }
        }
    }
    catch (Exception e)
    {
        // best effort: a failed snapshot must not fail the preview repair
        logger.error("{} Failed snapshotting replicas", options.getPreviewKind().logPrefix(parentSession), e);
    }
}
/**
 * Makes a possibly-null iterable safe to use in a for-each loop.
 *
 * @param iter iterable to guard, may be {@code null}
 * @param <T>  element type
 * @return {@code iter} itself, or an empty list when {@code iter} is {@code null}
 */
private static <T> Iterable<T> emptyIfNull(Iterable<T> iter)
{
    if (iter == null)
        return Collections.emptyList();
    return iter;
}
private ListenableFuture> submitRepairSessions(UUID parentSession,
boolean isIncremental,
ListeningExecutorService executor,
Scheduler validationScheduler,
List commonRanges,
String... cfnames)
{
List> futures = new ArrayList<>(options.getRanges().size());
for (CommonRange commonRange : commonRanges)
{
logger.info("Starting RepairSession for {}", commonRange);
RepairSession session = ActiveRepairService.instance.submitRepairSession(parentSession,
commonRange,
keyspace,
options.getParallelism(),
isIncremental,
options.isPullRepair(),
options.getPreviewKind(),
options.optimiseStreams(),
executor,
validationScheduler,
cfnames);
if (session == null)
continue;
Futures.addCallback(session, new RepairSessionCallback(session), MoreExecutors.directExecutor());
futures.add(session);
}
return Futures.successfulAsList(futures);
}
/**
 * Creates the executor used for the repair sessions of this command. It is built from
 * {@code options.getJobThreads()} and a thread factory named after the command
 * ("Repair#" + cmd).
 *
 * @return a listening decorator around the new thread pool
 */
private ListeningExecutorService createExecutor()
{
    JMXEnabledThreadPoolExecutor repairPool = new JMXEnabledThreadPoolExecutor(options.getJobThreads(),
                                                                               Integer.MAX_VALUE,
                                                                               TimeUnit.SECONDS,
                                                                               new LinkedBlockingQueue<>(),
                                                                               new NamedThreadFactory("Repair#" + cmd),
                                                                               "internal");
    return MoreExecutors.listeningDecorator(repairPool);
}
private class RepairSessionCallback implements FutureCallback
{
private final RepairSession session;
public RepairSessionCallback(RepairSession session)
{
this.session = session;
}
public void onSuccess(RepairSessionResult result)
{
String message = String.format("Repair session %s for range %s finished", session.getId(),
session.ranges().toString());
logger.info(message);
fireProgressEvent(new ProgressEvent(ProgressEventType.PROGRESS,
progressCounter.incrementAndGet(),
totalProgress,
message));
}
public void onFailure(Throwable t)
{
String message = String.format("Repair session %s for range %s failed with error %s",
session.getId(), session.ranges().toString(), t.getMessage());
notifyError(new RuntimeException(message, t));
}
}
private class RepairCompleteCallback implements FutureCallback
© 2015 - 2024 Weber Informatics LLC | Privacy Policy