/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.cassandra.index;
import java.io.UncheckedIOException;
import java.lang.reflect.Constructor;
import java.util.*;
import java.util.concurrent.Callable;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.function.Function;
import java.util.function.Predicate;
import java.util.function.Supplier;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import javax.annotation.Nullable;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Joiner;
import com.google.common.base.Strings;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
import com.google.common.util.concurrent.FutureCallback;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.cassandra.concurrent.ExecutorPlus;
import org.apache.cassandra.concurrent.FutureTask;
import org.apache.cassandra.concurrent.ImmediateExecutor;
import org.apache.cassandra.config.DatabaseDescriptor;
import org.apache.cassandra.db.*;
import org.apache.cassandra.db.compaction.CompactionManager;
import org.apache.cassandra.db.filter.ClusteringIndexSliceFilter;
import org.apache.cassandra.db.filter.ColumnFilter;
import org.apache.cassandra.db.filter.DataLimits;
import org.apache.cassandra.db.filter.RowFilter;
import org.apache.cassandra.db.lifecycle.SSTableSet;
import org.apache.cassandra.db.lifecycle.View;
import org.apache.cassandra.db.memtable.Memtable;
import org.apache.cassandra.db.partitions.PartitionUpdate;
import org.apache.cassandra.db.partitions.UnfilteredPartitionIterator;
import org.apache.cassandra.db.rows.*;
import org.apache.cassandra.exceptions.InvalidRequestException;
import org.apache.cassandra.index.Index.IndexBuildingSupport;
import org.apache.cassandra.index.internal.CassandraIndex;
import org.apache.cassandra.index.transactions.CleanupTransaction;
import org.apache.cassandra.index.transactions.CompactionTransaction;
import org.apache.cassandra.index.transactions.IndexTransaction;
import org.apache.cassandra.index.transactions.UpdateTransaction;
import org.apache.cassandra.io.sstable.format.SSTableReader;
import org.apache.cassandra.notifications.INotification;
import org.apache.cassandra.notifications.INotificationConsumer;
import org.apache.cassandra.notifications.SSTableAddedNotification;
import org.apache.cassandra.schema.ColumnMetadata;
import org.apache.cassandra.schema.IndexMetadata;
import org.apache.cassandra.schema.Indexes;
import org.apache.cassandra.service.pager.SinglePartitionPager;
import org.apache.cassandra.tracing.Tracing;
import org.apache.cassandra.transport.ProtocolVersion;
import org.apache.cassandra.utils.FBUtilities;
import org.apache.cassandra.utils.JVMStabilityInspector;
import org.apache.cassandra.utils.concurrent.*;
import static org.apache.cassandra.concurrent.ExecutorFactory.Global.executorFactory;
import static org.apache.cassandra.config.CassandraRelevantProperties.FORCE_DEFAULT_INDEXING_PAGE_SIZE;
import static org.apache.cassandra.utils.ExecutorUtils.awaitTermination;
import static org.apache.cassandra.utils.ExecutorUtils.shutdown;
/**
* Handles the core maintenance functionality associated with indexes: adding/removing them to or from
* a table, (re)building during bootstrap or other streaming operations, flushing, reloading metadata
* and so on.
*
* The Index interface defines a number of methods which return {@code Callable}. These are primarily the
* management tasks for an index implementation. Most of them are currently executed in a blocking
* fashion via submission to SIM's blockingExecutor. This provides the desired behaviour in pretty
* much all cases, as tasks like flushing an index needs to be executed synchronously to avoid potentially
* deadlocking on the FlushWriter or PostFlusher. Several of these {@code Callable} returning methods on Index could
* then be defined with as void and called directly from SIM (rather than being run via the executor service).
* Separating the task definition from execution gives us greater flexibility though, so that in future, for example,
* if the flush process allows it we leave open the possibility of executing more of these tasks asynchronously.
*
* The primary exception to the above is the Callable returned from Index#addIndexedColumn. This may
* involve a significant effort, building a new index over any existing data. We perform this task asynchronously;
* as it is called as part of a schema update, which we do not want to block for a long period. Building non-custom
* indexes is performed on the CompactionManager.
*
* This class also provides instances of processors which listen to updates to the base table and forward to
* registered Indexes the info required to keep those indexes up to date.
* There are two variants of these processors, each with a factory method provided by SIM:
* IndexTransaction: deals with updates generated on the regular write path.
* CleanupTransaction: used when partitions are modified during compaction or cleanup operations.
* Further details on their usage and lifecycles can be found in the interface definitions below.
*
* The bestIndexFor method is used at query time to identify the most selective index of those able
* to satisfy any search predicates defined by a ReadCommand's RowFilter. It returns a thin IndexAccessor object
* which enables the ReadCommand to access the appropriate functions of the Index at various stages in its lifecycle.
* e.g. the getEstimatedResultRows is required when StorageProxy calculates the initial concurrency factor for
* distributing requests to replicas, whereas a Searcher instance is needed when the ReadCommand is executed locally on
* a target replica.
*
* Finally, this class provides a clear and safe lifecycle to manage index builds, either full rebuilds via
* {@link this#rebuildIndexesBlocking(Set)} or builds of new sstables
* added via {@link org.apache.cassandra.notifications.SSTableAddedNotification}s, guaranteeing
* the following:
* <ul>
*     <li>The initialization task and any subsequent successful (re)build mark the index as built.</li>
*     <li>If any (re)build operation fails, the index is not marked as built, and only another full rebuild can mark the
*     index as built.</li>
*     <li>Full rebuilds cannot be run concurrently with other full or sstable (re)builds.</li>
*     <li>SSTable builds can always be run concurrently with any other builds.</li>
* </ul>
*
*/
public class SecondaryIndexManager implements IndexRegistry, INotificationConsumer
{
private static final Logger logger = LoggerFactory.getLogger(SecondaryIndexManager.class);

// default page size (in rows) when rebuilding the index for a whole partition
public static final int DEFAULT_PAGE_SIZE = 10000;

/**
 * All registered indexes, keyed by index name.
 */
private final Map<String, Index> indexes = Maps.newConcurrentMap();

/**
 * The names of the indexes that had a build failure.
 */
private final Set<String> needsFullRebuild = Sets.newConcurrentHashSet();

/**
 * The names of the indexes that are available for querying.
 */
private final Set<String> queryableIndexes = Sets.newConcurrentHashSet();

/**
 * The indexes that are available for writing, keyed by index name.
 */
private final Map<String, Index> writableIndexes = Maps.newConcurrentMap();

/**
 * The groups of all the registered indexes.
 * NOTE(review): only {@code values()} is used in this chunk, so the key type could not be
 * confirmed from the visible code — verify against the rest of the file before tightening
 * {@code Object} to the concrete key type.
 */
private final Map<Object, Index.Group> indexGroups = Maps.newConcurrentMap();

/**
 * The count of pending index builds for each index, keyed by index name.
 */
private final Map<String, AtomicInteger> inProgressBuilds = Maps.newConcurrentMap();

// executes tasks returned by Indexer#addIndexColumn which may require index(es) to be (re)built
private static final ExecutorPlus asyncExecutor = executorFactory()
                                                  .withJmxInternal()
                                                  .sequential("SecondaryIndexManagement");

// executes all blocking tasks produced by Indexers e.g. getFlushTask, getMetadataReloadTask etc
private static final ExecutorPlus blockingExecutor = ImmediateExecutor.INSTANCE;

/**
 * The underlying column family containing the source data for these indexes
 */
public final ColumnFamilyStore baseCfs;

private final Keyspace keyspace;
/**
 * Creates a manager for the indexes of the given base table and subscribes to the
 * table's data tracker, so sstable lifecycle notifications (see the class javadoc on
 * {@link org.apache.cassandra.notifications.SSTableAddedNotification} handling) reach this manager.
 *
 * @param baseCfs the column family store holding the data these indexes are built over
 */
public SecondaryIndexManager(ColumnFamilyStore baseCfs)
{
this.baseCfs = baseCfs;
this.keyspace = baseCfs.keyspace;
// subscribe only after the fields above are assigned
baseCfs.getTracker().subscribe(this);
}
/**
 * Drops any indexes no longer present in the table's schema and adds any new ones.
 */
public void reload()
{
    Indexes tableIndexes = baseCfs.metadata().indexes;

    // Drop every registered index whose definition has disappeared from the schema.
    // Iterate over a snapshot of the names since removeIndex mutates the map.
    for (String indexName : new ArrayList<>(indexes.keySet()))
    {
        if (!tableIndexes.has(indexName))
            removeIndex(indexName);
    }

    // Add every index definition in the schema: some may not have been created
    // here yet, only added to schema.
    for (IndexMetadata tableIndex : tableIndexes)
        addIndex(tableIndex, false);
}
/**
 * Reloads the metadata of an already-registered index by running the reload task it
 * supplies, if any.
 *
 * @param indexDef the updated IndexMetadata for the registered index
 * @return a future completing when the reload task finishes, or an already-succeeded
 *         future if the index exposes no reload task
 */
private Future<?> reloadIndex(IndexMetadata indexDef)
{
    Index index = indexes.get(indexDef.name);
    Callable<?> reloadTask = index.getMetadataReloadTask(indexDef);
    return reloadTask == null
           ? ImmediateFuture.success(null)
           : blockingExecutor.submit(reloadTask);
}
/**
 * Instantiates, registers and (asynchronously) builds a brand-new index.
 *
 * @param indexDef the IndexMetadata describing the index
 * @param isNewCF true if the index is created as part of loading a new table at startup
 * @return a future completing when the initial build finishes
 */
private synchronized Future<?> createIndex(IndexMetadata indexDef, boolean isNewCF)
{
    final Index index = createInstance(indexDef);
    index.register(this);

    // Mark the index writable immediately so incoming writes are indexed while the initial build runs.
    if (writableIndexes.put(index.getIndexMetadata().name, index) == null)
        logger.info("Index [{}] registered and writable.", index.getIndexMetadata().name);

    markIndexesBuilding(ImmutableSet.of(index), true, isNewCF);

    return buildIndex(index);
}
/**
 * Runs the index's initialization task (if it provides one) asynchronously, marking the
 * index built on success or failed on error.
 *
 * @param index the index to initialize/build
 * @return a future completing when the initialization finishes (already-succeeded if the
 *         index needs no initialization)
 */
@VisibleForTesting
public Future<?> buildIndex(final Index index)
{
    FutureTask<?> initialBuildTask = null;
    // if the index didn't register itself, we can probably assume that no initialization needs to happen
    if (indexes.containsKey(index.getIndexMetadata().name))
    {
        try
        {
            Callable<?> call = index.getInitializationTask();
            if (call != null)
                initialBuildTask = new FutureTask<>(call);
        }
        catch (Throwable t)
        {
            // obtaining the task itself failed: record the failure before propagating
            logAndMarkIndexesFailed(Collections.singleton(index), t, true);
            throw t;
        }
    }

    // if there's no initialization, just mark as built and return:
    if (initialBuildTask == null)
    {
        markIndexBuilt(index, true);
        return ImmediateFuture.success(null);
    }

    // otherwise run the initialization task asynchronously with a callback to mark it built or failed
    final Promise<Object> initialization = new AsyncPromise<>();
    // we want to ensure we invoke this task asynchronously, so we want to add our callback before submission
    // to ensure the work is not completed before we register the callback and so it gets performed by us.
    // This is because Keyspace.open("system") can transitively attempt to open Keyspace.open("system")
    initialBuildTask.addCallback(
        success -> {
            markIndexBuilt(index, true);
            initialization.trySuccess(null);
        },
        failure -> {
            logAndMarkIndexesFailed(Collections.singleton(index), failure, true);
            initialization.tryFailure(failure);
        }
    );
    asyncExecutor.execute(initialBuildTask);

    return initialization;
}
/**
 * Adds and builds an index, or reloads it if it is already registered.
 *
 * @param indexDef the IndexMetadata describing the index
 * @param isNewCF true if the index is added as part of a new table/columnfamily (i.e. loading a CF at startup),
 * false for all other cases (i.e. newly added index)
 * @return a future completing when the index has been reloaded (if it already existed)
 *         or built (if newly created)
 */
public synchronized Future<?> addIndex(IndexMetadata indexDef, boolean isNewCF)
{
    return indexes.containsKey(indexDef.name)
           ? reloadIndex(indexDef)
           : createIndex(indexDef, isNewCF);
}
/**
 * Checks if the specified index is queryable.
 *
 * @param index the index
 * @return true if the specified index is queryable, false otherwise
 */
public boolean isIndexQueryable(Index index)
{
    String indexName = index.getIndexMetadata().name;
    return queryableIndexes.contains(indexName);
}
/**
 * Throws an {@link IndexNotAvailableException} if any of the indexes in the specified
 * {@link Index.QueryPlan} is not queryable, as defined by {@link #isIndexQueryable(Index)}.
 *
 * @param queryPlan a query plan
 * @throws IndexNotAvailableException if the query plan has any index that is not queryable
 */
public void checkQueryability(Index.QueryPlan queryPlan)
{
    queryPlan.getIndexes().forEach(index ->
    {
        if (!isIndexQueryable(index))
            throw new IndexNotAvailableException(index);
    });
}
/**
 * Checks if the specified index is writable.
 *
 * @param index the index
 * @return true if the specified index is writable, false otherwise
 */
public boolean isIndexWritable(Index index)
{
    String indexName = index.getIndexMetadata().name;
    return writableIndexes.containsKey(indexName);
}
/**
 * Checks if the specified index has any running build task.
 *
 * @param indexName the index name
 * @return {@code true} if the index is building, {@code false} otherwise
 */
@VisibleForTesting
public synchronized boolean isIndexBuilding(String indexName)
{
    AtomicInteger pending = inProgressBuilds.get(indexName);
    if (pending == null)
        return false;
    return pending.get() > 0;
}
/**
 * Unregisters and invalidates the named index, if it is currently registered.
 *
 * @param indexName the name of the index to remove
 */
public synchronized void removeIndex(String indexName)
{
    Index removed = indexes.remove(indexName);
    if (removed == null)
        return;

    removed.unregister(this);
    markIndexRemoved(indexName);
    // release the index's resources synchronously
    executeBlocking(removed.getInvalidateTask(), null);
}
/**
 * Returns the metadata of every registered index that depends on the given column.
 *
 * @param column the base-table column
 * @return the (possibly empty) set of metadata for indexes depending on {@code column}
 */
public Set<IndexMetadata> getDependentIndexes(ColumnMetadata column)
{
    if (indexes.isEmpty())
        return Collections.emptySet();

    Set<IndexMetadata> dependentIndexes = new HashSet<>();
    for (Index index : indexes.values())
        if (index.dependsOn(column))
            dependentIndexes.add(index.getIndexMetadata());
    return dependentIndexes;
}
/**
 * Called when dropping a Table: marks every built index as removed.
 */
public void markAllIndexesRemoved()
{
    for (String builtIndexName : getBuiltIndexNames())
        markIndexRemoved(builtIndexName);
}
/**
 * Does a blocking full rebuild/recovery of the specified indexes from all the sstables in the base table.
 * Note also that this method of (re)building/recovering indexes:
 * a) takes a set of index *names* rather than Indexers
 * b) marks existing indexes removed prior to rebuilding
 * c) fails if such marking operation conflicts with any ongoing index builds, as full rebuilds cannot be run
 * concurrently
 *
 * @param indexNames the names of the indexes to be rebuilt
 */
public void rebuildIndexesBlocking(Set<String> indexNames)
{
    // Get the set of indexes that require blocking build
    Set<Index> toRebuild = indexes.values()
                                  .stream()
                                  .filter(index -> indexNames.contains(index.getIndexMetadata().name))
                                  .filter(Index::shouldBuildBlocking)
                                  .collect(Collectors.toSet());

    if (toRebuild.isEmpty())
    {
        logger.info("No defined indexes with the supplied names: {}", Joiner.on(',').join(indexNames));
        return;
    }

    // Optimistically mark the indexes as writable, so we don't miss incoming writes
    boolean needsFlush = false;
    for (Index index : toRebuild)
    {
        String name = index.getIndexMetadata().name;
        if (writableIndexes.put(name, index) == null)
        {
            logger.info("Index [{}] became writable starting recovery.", name);
            needsFlush = true;
        }
    }

    // Once we are tracking new writes, flush any memtable contents to not miss them from the sstable-based rebuild
    if (needsFlush)
        baseCfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.INDEX_BUILD_STARTED);

    // Now that we are tracking new writes and we haven't left untracked contents on the memtables, we are ready to
    // index the sstables
    try (ColumnFamilyStore.RefViewFragment viewFragment = baseCfs.selectAndReference(View.selectFunction(SSTableSet.CANONICAL));
         Refs<SSTableReader> allSSTables = viewFragment.refs)
    {
        buildIndexesBlocking(allSSTables, toRebuild, true);
    }
}
/**
 * Checks if the specified {@link ColumnFamilyStore} is a secondary index.
 *
 * @param cfs the ColumnFamilyStore to check
 * @return true if the specified ColumnFamilyStore is a secondary index, false otherwise
 */
public static boolean isIndexColumnFamilyStore(ColumnFamilyStore cfs)
{
    String tableName = cfs.name;
    return isIndexColumnFamily(tableName);
}
/**
 * Checks if the specified ColumnFamilyStore name denotes a secondary index.
 *
 * @param cfName the name of the ColumnFamilyStore to check
 * @return true if the name contains the secondary-index separator, false otherwise
 */
public static boolean isIndexColumnFamily(String cfName)
{
    return cfName.indexOf(Directories.SECONDARY_INDEX_NAME_SEPARATOR) != -1;
}
/**
 * Returns the parent (base table) of the specified index {@link ColumnFamilyStore}.
 *
 * @param cfs the index ColumnFamilyStore
 * @return the parent of the specified ColumnFamilyStore
 */
public static ColumnFamilyStore getParentCfs(ColumnFamilyStore cfs)
{
    return cfs.keyspace.getColumnFamilyStore(getParentCfsName(cfs.name));
}
/**
 * Returns the parent (base table) name of the specified index ColumnFamilyStore name.
 *
 * @param cfName the index ColumnFamilyStore name
 * @return the portion of the name before the secondary-index separator
 */
public static String getParentCfsName(String cfName)
{
    assert isIndexColumnFamily(cfName);
    String separator = Directories.SECONDARY_INDEX_NAME_SEPARATOR;
    return StringUtils.substringBefore(cfName, separator);
}
/**
 * Returns the index name encoded in the given index ColumnFamilyStore.
 *
 * @param cfs the index ColumnFamilyStore
 * @return the index name
 */
public static String getIndexName(ColumnFamilyStore cfs)
{
    String tableName = cfs.name;
    return getIndexName(tableName);
}
/**
 * Returns the index name encoded in the given index ColumnFamilyStore name.
 *
 * @param cfName the index ColumnFamilyStore name
 * @return the portion of the name after the secondary-index separator
 */
public static String getIndexName(String cfName)
{
    assert isIndexColumnFamily(cfName);
    String separator = Directories.SECONDARY_INDEX_NAME_SEPARATOR;
    return StringUtils.substringAfter(cfName, separator);
}
/**
 * Validates all index groups against the specified SSTables.
 *
 * @param sstables SSTables for which indexes in the group should be built
 * @param throwOnIncomplete whether to throw an error if any index in the group is incomplete
 *
 * @return true if all indexes in all groups are complete and valid
 * false if an index in any group is incomplete and {@code throwOnIncomplete} is false
 *
 * @throws IllegalStateException if {@code throwOnIncomplete} is true and an index in any group is incomplete
 * @throws UncheckedIOException if there is a problem validating any on-disk component in any group
 */
public boolean validateSSTableAttachedIndexes(Collection<SSTableReader> sstables, boolean throwOnIncomplete)
{
    boolean complete = true;

    for (Index.Group group : indexGroups.values())
    {
        // only groups containing at least one sstable-attached index have per-sstable components to validate
        if (group.getIndexes().stream().anyMatch(Index::isSSTableAttached))
            complete &= group.validateSSTableAttachedIndexes(sstables, throwOnIncomplete);
    }

    return complete;
}
/**
* Incrementally builds indexes for the specified SSTables in a blocking fashion.
*
* This is similar to {@link #buildIndexesBlocking}, but it is designed to be used in cases where failure will
* cascade through to failing the containing operation that actuates the build. (ex. streaming and SSTable import)
*
* It does not update index build status or queryablility on failure or success and does not call
* {@link #flushIndexesBlocking(Set, FutureCallback)}, as this is an artifact of the legacy non-SSTable-attached
* index implementation.
*
* @param sstables the SSTables for which indexes must be built
*/
public void buildSSTableAttachedIndexesBlocking(Collection sstables)
{
Set toBuild = indexes.values().stream().filter(Index::isSSTableAttached).collect(Collectors.toSet());
if (toBuild.isEmpty())
return;
logger.info("Submitting incremental index build of {} for data in {}...",
commaSeparated(toBuild),
sstables.stream().map(SSTableReader::toString).collect(Collectors.joining(",")));
// Group all building tasks
Map> byType = new HashMap<>();
for (Index index : toBuild)
{
Set stored = byType.computeIfAbsent(index.getBuildTaskSupport(), i -> new HashSet<>());
stored.add(index);
}
// Schedule all index building tasks with callbacks to handle success and failure
List> futures = new ArrayList<>(byType.size());
byType.forEach((buildingSupport, groupedIndexes) ->
{
SecondaryIndexBuilder builder = buildingSupport.getIndexBuildTask(baseCfs, groupedIndexes, sstables, false);
AsyncPromise