/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.solr.cloud.api.collections;

import static org.apache.solr.cloud.api.collections.CollectionHandlingUtils.addExceptionToNamedList;
import static org.apache.solr.cloud.api.collections.CollectionHandlingUtils.logFailedOperation;
import static org.apache.solr.common.cloud.ZkStateReader.COLLECTION_PROP;
import static org.apache.solr.common.cloud.ZkStateReader.REPLICA_PROP;
import static org.apache.solr.common.cloud.ZkStateReader.SHARD_ID_PROP;
import static org.apache.solr.common.params.CommonAdminParams.ASYNC;
import static org.apache.solr.common.params.CommonParams.NAME;

import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.util.Map;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Future;
import java.util.concurrent.RejectedExecutionException;
import java.util.concurrent.SynchronousQueue;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import org.apache.solr.client.solrj.response.RequestStatusState;
import org.apache.solr.cloud.ConfigSetApiLockFactory;
import org.apache.solr.cloud.ConfigSetCmds;
import org.apache.solr.cloud.DistributedApiAsyncTracker;
import org.apache.solr.cloud.DistributedMultiLock;
import org.apache.solr.cloud.OverseerSolrResponse;
import org.apache.solr.cloud.ZkDistributedCollectionLockFactory;
import org.apache.solr.cloud.ZkDistributedConfigSetLockFactory;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.cloud.ZkNodeProps;
import org.apache.solr.common.params.CollectionParams;
import org.apache.solr.common.params.ConfigSetParams;
import org.apache.solr.common.params.CoreAdminParams;
import org.apache.solr.common.util.ExecutorUtil;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.util.Pair;
import org.apache.solr.common.util.SolrNamedThreadFactory;
import org.apache.solr.core.CoreContainer;
import org.apache.solr.logging.MDCLoggingContext;
import org.apache.solr.response.SolrQueryResponse;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Class for executing Collection API and Config Set API commands in a distributed way, without
 * going through Overseer and {@link OverseerCollectionMessageHandler} or {@link
 * org.apache.solr.cloud.OverseerConfigSetMessageHandler}.
 *
 * <p>This class is only called when Collection and Config Set API calls are configured to be
 * distributed, which implies cluster state updates are distributed as well.
 */
public class DistributedCollectionConfigSetCommandRunner {
  private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());

  public static final String ZK_PATH_SEPARATOR = "/";

  private static final String ZK_DISTRIBUTED_API_ROOT = "/distributedapi";

  /** Zookeeper node below which the locking hierarchy is anchored */
  private static final String ZK_COLLECTION_LOCKS = ZK_DISTRIBUTED_API_ROOT + "/collectionlocks";

  private static final String ZK_CONFIG_SET_LOCKS = ZK_DISTRIBUTED_API_ROOT + "/configsetlocks";

  /** Zookeeper node below which the async id (requestId) tracking is done (for async requests) */
  private static final String ZK_ASYNC_ROOT = ZK_DISTRIBUTED_API_ROOT + "/async";

  private final ExecutorService distributedCollectionApiExecutorService;

  /**
   * All Collection API commands are executed as if they were asynchronous, to stick to the same
   * behavior as the Overseer based Collection API execution. The difference between sync and async
   * is twofold: 1. Non async execution waits for a while to see if the command has completed, and
   * if so returns success (and if not, returns failure). 2. There is no way to query the status of
   * a non async command that is still running, or to kill it (short of shutting down the node on
   * which it's running). This is not the best design, but at this stage that's what happens with
   * the Overseer based Collection API, so the same behavior is implemented here.
   *
   * <p>The common aspect between sync and async is that the actual command execution is given
   * infinite time to eventually complete (or fail). Again this is not great, but that's how it is
   * for now. Some cleanup will likely be required later (once Overseer is actually removed? Solr
   * 10 or 11?).
   */
  private final ExecutorService commandsExecutor;

  private final CoreContainer coreContainer;
  private final CollApiCmds.CommandMap commandMapper;
  private final CollectionCommandContext ccc;
  private final DistributedApiAsyncTracker asyncTaskTracker;

  private volatile boolean shuttingDown = false;

  public DistributedCollectionConfigSetCommandRunner(CoreContainer coreContainer) {
    this.coreContainer = coreContainer;

    if (log.isInfoEnabled()) {
      // Note it is hard to print a corresponding log when the Collection API is handled by
      // Overseer, because Overseer is started regardless of how the Collection API is handled,
      // so it doesn't really know...
      log.info(
          "Creating DistributedCollectionConfigSetCommandRunner. Collection and ConfigSet APIs are running distributed (not Overseer based)");
    }

    // TODO: we should look at how everything is getting closed when the node is shut down. But it
    // seems that CollectionsHandler (that creates instances of this class) is not really closed,
    // so maybe it doesn't matter?

    // With distributed Collection API execution, each node will have such an executor, but given
    // how thread pools work, threads will only be created if needed (including the corePoolSize
    // threads).
    distributedCollectionApiExecutorService =
        new ExecutorUtil.MDCAwareThreadPoolExecutor(
            5,
            10,
            0L,
            TimeUnit.MILLISECONDS,
            new SynchronousQueue<>(),
            new SolrNamedThreadFactory("DistributedCollectionApiExecutorService"));

    commandsExecutor =
        new ExecutorUtil.MDCAwareThreadPoolExecutor(
            5,
            20,
            0L,
            TimeUnit.MILLISECONDS,
            new SynchronousQueue<>(),
            new SolrNamedThreadFactory("DistributedCollectionApiCommandExecutor"));

    ccc =
        new DistributedCollectionCommandContext(
            this.coreContainer, this.distributedCollectionApiExecutorService);
    commandMapper = new CollApiCmds.CommandMap(ccc);

    asyncTaskTracker =
        new DistributedApiAsyncTracker(ccc.getZkStateReader().getZkClient(), ZK_ASYNC_ROOT);
  }

  /** See {@link DistributedApiAsyncTracker#getAsyncTaskRequestStatus(String)} */
  public Pair<RequestStatusState, OverseerSolrResponse> getAsyncTaskRequestStatus(String asyncId)
      throws Exception {
    return asyncTaskTracker.getAsyncTaskRequestStatus(asyncId);
  }

  /** See {@link DistributedApiAsyncTracker#deleteSingleAsyncId(String)} */
  public boolean deleteSingleAsyncId(String asyncId) throws Exception {
    return asyncTaskTracker.deleteSingleAsyncId(asyncId);
  }

  /** See {@link DistributedApiAsyncTracker#deleteAllAsyncIds()} */
  public void deleteAllAsyncIds() throws Exception {
    asyncTaskTracker.deleteAllAsyncIds();
  }

  /**
   * When {@link org.apache.solr.handler.admin.CollectionsHandler#invokeAction} does not enqueue to
   * the overseer queue and instead calls this method, this method is expected to do the equivalent
   * of what Overseer does in {@link
   * org.apache.solr.cloud.OverseerConfigSetMessageHandler#processMessage}.
   *
   * <p>The steps leading to that call in the Overseer execution path are (and the equivalent is
   * done here):
   *
   * <ul>
   *   <li>{@link org.apache.solr.cloud.OverseerTaskProcessor#run()} gets the message from the ZK
   *       queue, grabs the corresponding locks (a write lock on the config set targeted by the API
   *       command and a read lock on the base config set if any - the case for config set
   *       creation), then executes the command using an executor service (it also checks the
   *       asyncId if any is specified, but async calls are not supported for Config Set API
   *       calls).
   *   <li>In {@link org.apache.solr.cloud.OverseerTaskProcessor}.{@code Runner.run()} (run on an
   *       executor thread) a call is made to {@link
   *       org.apache.solr.cloud.OverseerConfigSetMessageHandler#processMessage}, which does a few
   *       checks and calls the appropriate Config Set method.
   * </ul>
   */
  public void runConfigSetCommand(
      SolrQueryResponse rsp,
      ConfigSetParams.ConfigSetAction action,
      Map<String, Object> result,
      long timeoutMs)
      throws Exception {
    // We refuse new tasks, but will wait for already submitted ones (i.e. those that made it
    // through this method earlier). See stopAndWaitForPendingTasksToComplete() below.
    if (shuttingDown) {
      throw new SolrException(
          SolrException.ErrorCode.CONFLICT,
          "Solr is shutting down, no more Config Set API tasks may be executed");
    }

    // never null
    String configSetName = (String) result.get(NAME);
    // baseConfigSetName will be null if we're not creating a new config set
    String baseConfigSetName =
        ConfigSetCmds.getBaseConfigSetName(
            action, (String) result.get(ConfigSetCmds.BASE_CONFIGSET));

    if (log.isInfoEnabled()) {
      log.info("Running Config Set API locally for " + action + " " + configSetName); // nowarn
    }

    ConfigSetCommandRunner commandRunner =
        new ConfigSetCommandRunner(
            new ZkNodeProps(result), action, configSetName, baseConfigSetName);
    final Future<Void> taskFuture;
    try {
      taskFuture = commandsExecutor.submit(commandRunner);
    } catch (RejectedExecutionException ree) {
      throw new SolrException(
          SolrException.ErrorCode.SERVICE_UNAVAILABLE, "Too many executing commands", ree);
    }

    // Wait for a while... Just like the Overseer based Config Set API (the wait can time out but
    // the actual command execution does not).
    try {
      taskFuture.get(timeoutMs, TimeUnit.MILLISECONDS);
    } catch (TimeoutException te) {
      throw new SolrException(
          SolrException.ErrorCode.SERVER_ERROR,
          action + " " + configSetName + " timed out after " + timeoutMs + "ms");
    } catch (InterruptedException e) {
      Thread.currentThread().interrupt();
      throw new SolrException(
          SolrException.ErrorCode.SERVER_ERROR, action + " " + configSetName + " interrupted", e);
    }
  }

  /**
   * When {@link org.apache.solr.handler.admin.CollectionsHandler#invokeAction} does not enqueue to
   * the overseer queue and instead calls this method, this method is expected to do the equivalent
   * of what Overseer does in {@link
   * org.apache.solr.cloud.api.collections.OverseerCollectionMessageHandler#processMessage}.
   *
   * <p>The steps leading to that call in the Overseer execution path are (and the equivalent is
   * done here):
   *
   * <ul>
   *   <li>{@link org.apache.solr.cloud.OverseerTaskProcessor#run()} gets the message from the ZK
   *       queue, grabs the corresponding lock (Collection API calls do locking to prevent
   *       incompatible concurrent modifications of a collection), marks the async id of the task
   *       as running, then executes the command using an executor service.
   *   <li>In {@link org.apache.solr.cloud.OverseerTaskProcessor}.{@code Runner.run()} (run on an
   *       executor thread) a call is made to {@link
   *       org.apache.solr.cloud.api.collections.OverseerCollectionMessageHandler#processMessage},
   *       which sets the logging context and calls {@link CollApiCmds.CollectionApiCommand#call}.
   * </ul>
   */
  public OverseerSolrResponse runCollectionCommand(
      ZkNodeProps message, CollectionParams.CollectionAction action, long timeoutMs) {
    // We refuse new tasks, but will wait for already submitted ones (i.e. those that made it
    // through this method earlier). See stopAndWaitForPendingTasksToComplete() below.
    if (shuttingDown) {
      throw new SolrException(
          SolrException.ErrorCode.CONFLICT,
          "Solr is shutting down, no more Collection API tasks may be executed");
    }

    final String asyncId = message.getStr(ASYNC);

    if (log.isInfoEnabled()) {
      log.info(
          "Running Collection API locally for " + action.name() + " asyncId=" + asyncId); // nowarn
    }

    // Once the call below returns true, we must eventually cancel or complete the task. This
    // happens either in the CollectionCommandRunner below or in the catch clause when the runner
    // would not execute.
    if (!asyncTaskTracker.createNewAsyncJobTracker(asyncId)) {
      throw new SolrException(
          SolrException.ErrorCode.BAD_REQUEST,
          "Task with the same requestid already exists. (" + asyncId + ")");
    }

    CollectionCommandRunner commandRunner = new CollectionCommandRunner(message, action, asyncId);
    final Future<OverseerSolrResponse> taskFuture;
    try {
      taskFuture = commandsExecutor.submit(commandRunner);
    } catch (RejectedExecutionException ree) {
      // The command will not run, need to cancel the async ID so it can be reused on a subsequent
      // attempt by the client.
      asyncTaskTracker.cancelAsyncId(asyncId);
      throw new SolrException(
          SolrException.ErrorCode.SERVICE_UNAVAILABLE, "Too many executing commands", ree);
    }

    if (asyncId == null) {
      // Non async calls wait for a while in case the command completes. If they time out, there's
      // no way to track the job progress (improvement suggestion: decorrelate having a task ID
      // from the fact of waiting for the job to complete).
      try {
        return taskFuture.get(timeoutMs, TimeUnit.MILLISECONDS);
      } catch (TimeoutException te) {
        throw new SolrException(
            SolrException.ErrorCode.SERVER_ERROR, action + " timed out after " + timeoutMs + "ms");
      } catch (InterruptedException e) {
        Thread.currentThread().interrupt();
        throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, action + " interrupted", e);
      } catch (Exception e) {
        throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, action + " failed", e);
      }
    } else {
      // Async calls do not wait for the command to finish but instead get back the async id (that
      // they just sent...).
      NamedList<Object> resp = new NamedList<>();
      resp.add(CoreAdminParams.REQUESTID, asyncId);
      return new OverseerSolrResponse(resp);
    }
  }

  /**
   * Best effort wait for termination of all tasks for a (short) while, then return. This method is
   * called when the JVM shuts down, so tasks that did not complete are expected to be stopped mid
   * processing.
   */
  public void stopAndWaitForPendingTasksToComplete() {
    shuttingDown = true;

    commandsExecutor.shutdown();
    distributedCollectionApiExecutorService.shutdown();

    // Duration we are willing to wait for threads to terminate, total.
    final long TOTAL_WAIT_NS = 10L * 1000 * 1000 * 1000;
    long start = System.nanoTime();
    try {
      commandsExecutor.awaitTermination(TOTAL_WAIT_NS, TimeUnit.NANOSECONDS);
      long remaining = TOTAL_WAIT_NS - (System.nanoTime() - start);
      if (remaining > 0L) {
        distributedCollectionApiExecutorService.awaitTermination(remaining, TimeUnit.NANOSECONDS);
      }
    } catch (InterruptedException ie) {
      Thread.currentThread().interrupt();
    }
  }

  /**
   * Collection name can be found in either of two message parameters (why??). Return it from where
   * it's defined.
   *
   * <p>(see also parameter {@code collectionNameParamName} of {@link
   * org.apache.solr.cloud.DistributedClusterStateUpdater.MutatingCommand#MutatingCommand(CollectionParams.CollectionAction,
   * String)})
   */
  public static String getCollectionName(ZkNodeProps message) {
    return message.containsKey(COLLECTION_PROP)
        ? message.getStr(COLLECTION_PROP)
        : message.getStr(NAME);
  }

  /**
   * All Collection API commands are executed in separate threads so that the command can run to
   * completion even if the client request times out, or, for async tasks, so that the client
   * request returns while the command keeps executing (which really is very similar). This
   * provides a behavior very similar to the one provided by Overseer based Collection API
   * execution.
   */
  private class CollectionCommandRunner implements Callable<OverseerSolrResponse> {
    private final ZkNodeProps message;
    private final CollectionParams.CollectionAction action;
    private final String asyncId;

    private CollectionCommandRunner(
        ZkNodeProps message, CollectionParams.CollectionAction action, String asyncId) {
      this.message = message;
      this.action = action;
      this.asyncId = asyncId;
    }

    /**
     * This method always frees the Collection API lock it grabs and updates the tracking of the
     * {@code asyncId} (initiated before this method got called) on all execution paths out of this
     * method!
     */
    @Override
    public OverseerSolrResponse call() {
      final String collName = getCollectionName(message);
      final String shardId = message.getStr(SHARD_ID_PROP);
      final String replicaName = message.getStr(REPLICA_PROP);

      MDCLoggingContext.setCollection(collName);
      MDCLoggingContext.setShard(shardId);
      MDCLoggingContext.setReplica(replicaName);

      NamedList<Object> results = new NamedList<>();
      try {
        // Create the API lock for executing the command. This call is non blocking (it is not
        // blocked on waiting for a lock to be acquired anyway, it might be blocked on access to
        // ZK etc). We create a new CollectionApiLockFactory using a new
        // ZkDistributedCollectionLockFactory because earlier (in the constructor of this class)
        // the ZkStateReader was not yet available, due to how CoreContainer is built in part in
        // its constructor and in part in its load() method. And this class is built from the
        // CoreContainer constructor... The cost of these creations is low, and these classes do
        // not hold state but only serve as an interface to Zookeeper. Note that after this call,
        // we MUST execute the lock.release() in the finally below.
        DistributedMultiLock lock =
            new CollectionApiLockFactory(
                    new ZkDistributedCollectionLockFactory(
                        ccc.getZkStateReader().getZkClient(), ZK_COLLECTION_LOCKS))
                .createCollectionApiLock(action.lockLevel, collName, shardId, replicaName);
        try {
          log.debug(
              "CollectionCommandRunner about to acquire lock for action {} lock level {}. {}/{}/{}",
              action,
              action.lockLevel,
              collName,
              shardId,
              replicaName);

          // Block this thread until all required locks are acquired.
          lock.waitUntilAcquired();

          // Got the lock, so moving from submitted to running if we run for an async task (if
          // asyncId is null the asyncTaskTracker calls do nothing).
          asyncTaskTracker.setTaskRunning(asyncId);

          log.debug(
              "DistributedCollectionConfigSetCommandRunner.runCollectionCommand. Lock acquired. Calling: {}, {}",
              action,
              message);

          CollApiCmds.CollectionApiCommand command = commandMapper.getActionCommand(action);
          if (command != null) {
            command.call(ccc.getSolrCloudManager().getClusterState(), message, results);
          } else {
            asyncTaskTracker.cancelAsyncId(asyncId);
            // Seeing this is a bug, not bad user data
            String message = "Bug: Unknown operation " + action;
            log.error(message);
            throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, message);
          }
        } finally {
          try {
            // TODO: If the Collection API command failed because the collection does not exist,
            // we've just created some lock directory structure for a non existent collection...
            // Maybe try to remove it here? No big deal for now as leftover nodes in the lock
            // hierarchy do no harm, and there shouldn't be too many of those.
            lock.release();
          } catch (SolrException se) {
            log.error(
                "Error when releasing collection locks for operation " + action, se); // nowarn
          }
        }
      } catch (Exception e) {
        if (e instanceof InterruptedException) {
          Thread.currentThread().interrupt();
        }
        logFailedOperation(action, e, collName);
        addExceptionToNamedList(action, e, results);
      }

      OverseerSolrResponse res = new OverseerSolrResponse(results);
      // The following call marks success or failure depending on the contents of res.
      asyncTaskTracker.setTaskCompleted(asyncId, res);
      return res;
    }
  }

  /**
   * All Config Set API commands are executed in separate threads so that the command can run to
   * completion even if the client request times out. This provides a behavior very similar to the
   * one provided by Overseer based Config Set API execution.
   *
   * <p>The logic of this class is similar to (and simpler than) that of {@link
   * CollectionCommandRunner}; see the more detailed comments there.
   */
  private class ConfigSetCommandRunner implements Callable<Void> {
    private final ZkNodeProps message;
    private final ConfigSetParams.ConfigSetAction action;
    private final String configSetName;
    private final String baseConfigSetName;

    private ConfigSetCommandRunner(
        ZkNodeProps message,
        ConfigSetParams.ConfigSetAction action,
        String configSetName,
        String baseConfigSetName) {
      this.message = message;
      this.action = action;
      this.configSetName = configSetName;
      this.baseConfigSetName = baseConfigSetName;
    }

    /** This method always frees the Config Set API lock it grabs. */
    @Override
    public Void call() throws IOException {
      // After this call, we MUST execute the lock.release() in the finally below.
      DistributedMultiLock lock =
          new ConfigSetApiLockFactory(
                  new ZkDistributedConfigSetLockFactory(
                      ccc.getZkStateReader().getZkClient(), ZK_CONFIG_SET_LOCKS))
              .createConfigSetApiLock(configSetName, baseConfigSetName);
      try {
        log.debug(
            "ConfigSetCommandRunner about to acquire lock for action {} config set {} base config set {}",
            action,
            configSetName,
            baseConfigSetName);

        // Block this thread until all required locks are acquired.
        lock.waitUntilAcquired();

        log.debug("ConfigSetCommandRunner. Lock acquired. Calling: {}, {}", action, message);

        switch (action) {
          case CREATE:
            ConfigSetCmds.createConfigSet(message, coreContainer);
            break;
          case DELETE:
            ConfigSetCmds.deleteConfigSet(message, coreContainer);
            break;
          default:
            throw new SolrException(
                SolrException.ErrorCode.BAD_REQUEST, "Bug! Unknown Config Set action: " + action);
        }
      } finally {
        lock.release();
      }
      return null;
    }
  }
}
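
// Illustrative sketch (kept as a comment so the file stays compilable) of the sync vs async
// behavior described in the Javadoc above, assuming a CoreContainer configured for distributed
// Collection API execution. The collection name "myCollection", the async id "someAsyncId", the
// RELOAD action and the 60 second timeout are arbitrary example values, not anything mandated by
// this class.
//
//   DistributedCollectionConfigSetCommandRunner runner =
//       new DistributedCollectionConfigSetCommandRunner(coreContainer);
//
//   // Synchronous style: no ASYNC param, so runCollectionCommand waits up to the timeout for the
//   // command to complete and returns its OverseerSolrResponse (or throws on timeout).
//   OverseerSolrResponse syncRsp =
//       runner.runCollectionCommand(
//           new ZkNodeProps(COLLECTION_PROP, "myCollection"),
//           CollectionParams.CollectionAction.RELOAD,
//           60 * 1000L);
//
//   // Asynchronous style: the ASYNC param is set, so the call returns immediately with the
//   // requestid; progress can then be polled via getAsyncTaskRequestStatus("someAsyncId").
//   OverseerSolrResponse asyncRsp =
//       runner.runCollectionCommand(
//           new ZkNodeProps(COLLECTION_PROP, "myCollection", ASYNC, "someAsyncId"),
//           CollectionParams.CollectionAction.RELOAD,
//           60 * 1000L);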