// Listing header (scrape artifact, kept as a comment so the file stays valid Java):
// org.apache.solr.cloud.api.collections.DistributedCollectionConfigSetCommandRunner (solr-core)
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.cloud.api.collections;
import static org.apache.solr.cloud.api.collections.CollectionHandlingUtils.addExceptionToNamedList;
import static org.apache.solr.cloud.api.collections.CollectionHandlingUtils.logFailedOperation;
import static org.apache.solr.common.cloud.ZkStateReader.COLLECTION_PROP;
import static org.apache.solr.common.cloud.ZkStateReader.REPLICA_PROP;
import static org.apache.solr.common.cloud.ZkStateReader.SHARD_ID_PROP;
import static org.apache.solr.common.params.CommonAdminParams.ASYNC;
import static org.apache.solr.common.params.CommonParams.NAME;
import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.util.Map;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Future;
import java.util.concurrent.RejectedExecutionException;
import java.util.concurrent.SynchronousQueue;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import org.apache.solr.client.solrj.response.RequestStatusState;
import org.apache.solr.cloud.ConfigSetApiLockFactory;
import org.apache.solr.cloud.ConfigSetCmds;
import org.apache.solr.cloud.DistributedApiAsyncTracker;
import org.apache.solr.cloud.DistributedMultiLock;
import org.apache.solr.cloud.OverseerSolrResponse;
import org.apache.solr.cloud.ZkDistributedCollectionLockFactory;
import org.apache.solr.cloud.ZkDistributedConfigSetLockFactory;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.cloud.ZkNodeProps;
import org.apache.solr.common.params.CollectionParams;
import org.apache.solr.common.params.ConfigSetParams;
import org.apache.solr.common.params.CoreAdminParams;
import org.apache.solr.common.util.ExecutorUtil;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.util.Pair;
import org.apache.solr.common.util.SolrNamedThreadFactory;
import org.apache.solr.core.CoreContainer;
import org.apache.solr.logging.MDCLoggingContext;
import org.apache.solr.response.SolrQueryResponse;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Class for execution Collection API and Config Set API commands in a distributed way, without
 * going through Overseer and {@link OverseerCollectionMessageHandler} or {@link
 * org.apache.solr.cloud.OverseerConfigSetMessageHandler}.
 *
 * <p>This class is only called when Collection and Config Set API calls are configured to be
 * distributed, which implies cluster state updates are distributed as well.
 */
public class DistributedCollectionConfigSetCommandRunner {
private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
// Separator used when composing the Zookeeper paths below.
public static final String ZK_PATH_SEPARATOR = "/";
// Root Zookeeper node under which all distributed-API bookkeeping (locks, async ids) lives.
private static final String ZK_DISTRIBUTED_API_ROOT = "/distributedapi";
/** Zookeeper node below which the locking hierarchy is anchored */
private static final String ZK_COLLECTION_LOCKS = ZK_DISTRIBUTED_API_ROOT + "/collectionlocks";
private static final String ZK_CONFIG_SET_LOCKS = ZK_DISTRIBUTED_API_ROOT + "/configsetlocks";
/** Zookeeper node below which the async id (requestId) tracking is done (for async requests) */
private static final String ZK_ASYNC_ROOT = ZK_DISTRIBUTED_API_ROOT + "/async";
// Pool handed to command implementations (through the command context) for their parallel work.
private final ExecutorService distributedCollectionApiExecutorService;
/**
 * All Collection API commands are executed as if they are asynchronous to stick to the same
 * behavior as the Overseer based Collection API execution. The difference between sync and async
 * is twofold: 1. Non async execution does wait for a while to see if the command has completed
 * and if so, returns success (and if not, returns failure) 2. There is no way to query the status
 * of a non async command that is still running, or to kill it (short of shutting down the node on
 * which it's running). This is not the best design, but at this stage that's what happens with
 * Overseer based Collection API, so implementing the same.
 *
 * <p>The common aspects between sync and async is that actual command execution is given an
 * infinite time to eventually complete (or fail). Again this is not great, but that's how it is
 * for now. Likely some cleanup required later (once Overseer is actually removed? Like Solr 10 or
 * 11?).
 */
private final ExecutorService commandsExecutor;
private final CoreContainer coreContainer;
// Maps each CollectionAction to the command implementation that executes it.
private final CollApiCmds.CommandMap commandMapper;
private final CollectionCommandContext ccc;
// Tracks async request ids in Zookeeper, under ZK_ASYNC_ROOT.
private final DistributedApiAsyncTracker asyncTaskTracker;
// Once set, new Collection/Config Set API submissions are rejected (already submitted tasks are
// allowed to finish). Volatile: written on shutdown, read from request threads.
private volatile boolean shuttingDown = false;
public DistributedCollectionConfigSetCommandRunner(CoreContainer coreContainer) {
  this.coreContainer = coreContainer;
  if (log.isInfoEnabled()) {
    // Note is it hard to print a log when Collection API is handled by Overseer because Overseer
    // is started regardless of how Collection API is handled, so it doesn't really know...
    log.info(
        "Creating DistributedCollectionConfigSetCommandRunner. Collection and ConfigSet APIs are running distributed (not Overseer based)");
  }
  // TODO we should look at how everything is getting closed when the node is shutdown. But it
  // seems that CollectionsHandler (that creates instances of this class) is not really closed, so
  // maybe it doesn't matter?
  // With distributed Collection API execution, each node will have such an executor but given how
  // thread pools work, threads will only be created if needed (including the corePoolSize
  // threads).
  distributedCollectionApiExecutorService =
      newBoundedPool(10, "DistributedCollectionApiExecutorService");
  commandsExecutor = newBoundedPool(20, "DistributedCollectionApiCommandExecutor");
  ccc =
      new DistributedCollectionCommandContext(
          this.coreContainer, this.distributedCollectionApiExecutorService);
  commandMapper = new CollApiCmds.CommandMap(ccc);
  asyncTaskTracker =
      new DistributedApiAsyncTracker(ccc.getZkStateReader().getZkClient(), ZK_ASYNC_ROOT);
}

/**
 * Builds an MDC-aware pool with 5 core threads, the given maximum, no idle keep-alive, and a
 * direct hand-off queue (SynchronousQueue), so submissions are rejected once all threads are
 * busy rather than queued.
 */
private static ExecutorService newBoundedPool(int maxThreads, String threadNamePrefix) {
  return new ExecutorUtil.MDCAwareThreadPoolExecutor(
      5,
      maxThreads,
      0L,
      TimeUnit.MILLISECONDS,
      new SynchronousQueue<>(),
      new SolrNamedThreadFactory(threadNamePrefix));
}
/**
 * Returns the status of an async request and, if it completed, its response.
 *
 * <p>See {@link DistributedApiAsyncTracker#getAsyncTaskRequestStatus(String)}
 */
public Pair<RequestStatusState, OverseerSolrResponse> getAsyncTaskRequestStatus(String asyncId)
    throws Exception {
  return asyncTaskTracker.getAsyncTaskRequestStatus(asyncId);
}
/**
 * Deletes the tracking data of a single async request id; returns {@code true} on success.
 *
 * <p>See {@link DistributedApiAsyncTracker#deleteSingleAsyncId(String)}
 */
public boolean deleteSingleAsyncId(String asyncId) throws Exception {
return asyncTaskTracker.deleteSingleAsyncId(asyncId);
}
/**
 * Deletes the tracking data of all async request ids.
 *
 * <p>See {@link DistributedApiAsyncTracker#deleteAllAsyncIds()}
 */
public void deleteAllAsyncIds() throws Exception {
asyncTaskTracker.deleteAllAsyncIds();
}
/**
 * When {@link org.apache.solr.handler.admin.CollectionsHandler#invokeAction} does not enqueue to
 * overseer queue and instead calls this method, this method is expected to do the equivalent of
 * what Overseer does in {@link
 * org.apache.solr.cloud.OverseerConfigSetMessageHandler#processMessage}.
 *
 * <p>The steps leading to that call in the Overseer execution path are (and the equivalent is
 * done here):
 *
 * <ul>
 *   <li>{@link org.apache.solr.cloud.OverseerTaskProcessor#run()} gets the message from the ZK
 *       queue, grabs the corresponding locks (write lock on the config set target of the API
 *       command and a read lock on the base config set if any - the case for config set creation)
 *       then executes the command using an executor service (it also checks the asyncId if any is
 *       specified but async calls are not supported for Config Set API calls).
 *   <li>In {@link org.apache.solr.cloud.OverseerTaskProcessor}.{@code Runner.run()} (run on an
 *       executor thread) a call is made to {@link
 *       org.apache.solr.cloud.OverseerConfigSetMessageHandler#processMessage} which does a few
 *       checks and calls the appropriate Config Set method.
 * </ul>
 *
 * @param rsp the response to fill in (unused by the visible execution path here)
 * @param action the Config Set API action to run
 * @param result the parsed request parameters; must contain {@code NAME}
 * @param timeoutMs how long to wait for the command to complete before giving up (the command
 *     itself keeps running past the timeout)
 * @throws SolrException if Solr is shutting down, the executor is saturated, or the wait times
 *     out or is interrupted
 */
public void runConfigSetCommand(
    SolrQueryResponse rsp,
    ConfigSetParams.ConfigSetAction action,
    Map<String, Object> result,
    long timeoutMs)
    throws Exception {
  // We refuse new tasks, but will wait for already submitted ones (i.e. those that made it
  // through this method earlier). See stopAndWaitForPendingTasksToComplete() below
  if (shuttingDown) {
    throw new SolrException(
        SolrException.ErrorCode.CONFLICT,
        "Solr is shutting down, no more Config Set API tasks may be executed");
  }
  // never null
  String configSetName = (String) result.get(NAME);
  // baseConfigSetName will be null if we're not creating a new config set
  String baseConfigSetName =
      ConfigSetCmds.getBaseConfigSetName(
          action, (String) result.get(ConfigSetCmds.BASE_CONFIGSET));
  if (log.isInfoEnabled()) {
    log.info("Running Config Set API locally for {} {}", action, configSetName);
  }
  ConfigSetCommandRunner commandRunner =
      new ConfigSetCommandRunner(
          new ZkNodeProps(result), action, configSetName, baseConfigSetName);
  final Future<?> taskFuture;
  try {
    taskFuture = commandsExecutor.submit(commandRunner);
  } catch (RejectedExecutionException ree) {
    // SynchronousQueue hand-off: rejection means all command threads are busy
    throw new SolrException(
        SolrException.ErrorCode.SERVICE_UNAVAILABLE, "Too many executing commands", ree);
  }
  // Wait for a while... Just like Overseer based Config Set API (wait can timeout but actual
  // command execution does not)
  try {
    taskFuture.get(timeoutMs, TimeUnit.MILLISECONDS);
  } catch (TimeoutException te) {
    throw new SolrException(
        SolrException.ErrorCode.SERVER_ERROR,
        action + " " + configSetName + " timed out after " + timeoutMs + "ms");
  } catch (InterruptedException e) {
    // Restore the interrupt flag before surfacing the failure
    Thread.currentThread().interrupt();
    throw new SolrException(
        SolrException.ErrorCode.SERVER_ERROR, action + " " + configSetName + " interrupted", e);
  }
}
/**
* When {@link org.apache.solr.handler.admin.CollectionsHandler#invokeAction} does not enqueue to
* overseer queue and instead calls this method, this method is expected to do the equivalent of
* what Overseer does in {@link
* org.apache.solr.cloud.api.collections.OverseerCollectionMessageHandler#processMessage}.
*
* The steps leading to that call in the Overseer execution path are (and the equivalent is
* done here):
*
*
* - {@link org.apache.solr.cloud.OverseerTaskProcessor#run()} gets the message from the ZK
* queue, grabs the corresponding lock (Collection API calls do locking to prevent non
* compatible concurrent modifications of a collection), marks the async id of the task as
* running then executes the command using an executor service
*
- In {@link org.apache.solr.cloud.OverseerTaskProcessor}.{@code Runner.run()} (run on an
* executor thread) a call is made to {@link
* org.apache.solr.cloud.api.collections.OverseerCollectionMessageHandler#processMessage}
* which sets the logging context, calls {@link CollApiCmds.CollectionApiCommand#call}
*
*/
public OverseerSolrResponse runCollectionCommand(
ZkNodeProps message, CollectionParams.CollectionAction action, long timeoutMs) {
// We refuse new tasks, but will wait for already submitted ones (i.e. those that made it
// through this method earlier). See stopAndWaitForPendingTasksToComplete() below
if (shuttingDown) {
throw new SolrException(
SolrException.ErrorCode.CONFLICT,
"Solr is shutting down, no more Collection API tasks may be executed");
}
final String asyncId = message.getStr(ASYNC);
if (log.isInfoEnabled()) {
log.info(
"Running Collection API locally for " + action.name() + " asyncId=" + asyncId); // nowarn
}
// Following the call below returning true, we must eventually cancel or complete the task.
// Happens either in the CollectionCommandRunner below or in the catch when the runner would not
// execute.
if (!asyncTaskTracker.createNewAsyncJobTracker(asyncId)) {
throw new SolrException(
SolrException.ErrorCode.BAD_REQUEST,
"Task with the same requestid already exists. (" + asyncId + ")");
}
CollectionCommandRunner commandRunner = new CollectionCommandRunner(message, action, asyncId);
final Future taskFuture;
try {
taskFuture = commandsExecutor.submit(commandRunner);
} catch (RejectedExecutionException ree) {
// The command will not run, need to cancel the async ID so it can be reused on a subsequent
// attempt by the client
asyncTaskTracker.cancelAsyncId(asyncId);
throw new SolrException(
SolrException.ErrorCode.SERVICE_UNAVAILABLE, "Too many executing commands", ree);
}
if (asyncId == null) {
// Non async calls wait for a while in case the command completes. If they time out, there's
// no way to track the job progress (improvement suggestion: decorrelate having a task ID from
// the fact of waiting for the job to complete)
try {
return taskFuture.get(timeoutMs, TimeUnit.MILLISECONDS);
} catch (TimeoutException te) {
throw new SolrException(
SolrException.ErrorCode.SERVER_ERROR, action + " timed out after " + timeoutMs + "ms");
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, action + " interrupted", e);
} catch (Exception e) {
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, action + " failed", e);
}
} else {
// Async calls do not wait for the command to finish but get instead back the async id (that
// they just sent...)
NamedList