org.apache.solr.update.PeerSync Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of solr-core Show documentation
Show all versions of solr-core Show documentation
Apache Solr (module: core)
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.update;
import static org.apache.solr.common.params.CommonParams.DISTRIB;
import static org.apache.solr.common.params.CommonParams.ID;
import static org.apache.solr.update.processor.DistributedUpdateProcessor.DistribPhase.FROMLEADER;
import static org.apache.solr.update.processor.DistributingUpdateProcessorFactory.DISTRIB_UPDATE_PARAM;
import com.codahale.metrics.Counter;
import com.codahale.metrics.Timer;
import com.google.common.annotations.VisibleForTesting;
import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.net.SocketException;
import java.net.SocketTimeoutException;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import java.util.Optional;
import java.util.stream.Collectors;
import org.apache.http.NoHttpResponseException;
import org.apache.http.conn.ConnectTimeoutException;
import org.apache.lucene.util.BytesRef;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.cloud.ZkController;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.params.ModifiableSolrParams;
import org.apache.solr.common.util.IOUtils;
import org.apache.solr.core.SolrCore;
import org.apache.solr.core.SolrInfoBean;
import org.apache.solr.handler.component.ShardHandler;
import org.apache.solr.handler.component.ShardHandlerFactory;
import org.apache.solr.handler.component.ShardRequest;
import org.apache.solr.handler.component.ShardResponse;
import org.apache.solr.metrics.SolrMetricProducer;
import org.apache.solr.metrics.SolrMetricsContext;
import org.apache.solr.request.LocalSolrQueryRequest;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.response.SolrQueryResponse;
import org.apache.solr.update.processor.UpdateRequestProcessor;
import org.apache.solr.update.processor.UpdateRequestProcessorChain;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* This class performs peer-to-peer synchronization of recently indexed update commands during
* the recovery process.
*
* @lucene.experimental
*/
public class PeerSync implements SolrMetricProducer {
private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
private static final boolean debug = log.isDebugEnabled();

/** Replica addresses to sync against; each entry is passed as a String to the shard handler. */
private List<String> replicas;

/** Number of recent versions requested from the local update log and from every replica. */
private int nUpdates;

private UpdateHandler uhandler;

/** Update log obtained from the update handler; sync() fails immediately when this is null. */
private UpdateLog ulog;

private ShardHandlerFactory shardHandlerFactory;
private ShardHandler shardHandler;

/** Every request created by this instance; walked at the end of sync() for deferred checks. */
private List<SyncShardRequest> requests = new ArrayList<>();

@VisibleForTesting static final int SHARD_REQUEST_PURPOSE_GET_UPDATES = 0;
@VisibleForTesting static final int SHARD_REQUEST_PURPOSE_GET_VERSIONS = 1;

private final boolean cantReachIsSuccess;
private final boolean doFingerprint;
private final boolean onlyIfActive;
private SolrCore core;
private Updater updater;
private MissedUpdatesFinder missedUpdatesFinder;

// metrics
private Timer syncTime;
private Counter syncErrors;
private Counter syncSkipped;
private SolrMetricsContext solrMetricsContext;

/** Comparator that sorts versions by absolute value, putting the highest first. */
public static Comparator<Long> absComparator =
    (l1, l2) -> Long.compare(Math.abs(l2), Math.abs(l1));

/** Shard request that additionally carries the per-replica state PeerSync tracks. */
private static class SyncShardRequest extends ShardRequest {
  /** Fingerprint reported by the replica, when its response contained one. */
  IndexFingerprint fingerprint;

  /** Set when the fingerprint comparison has to wait until all responses were handled. */
  boolean doFingerprintComparison;

  // NOTE(review): not referenced in the visible part of this file - confirm where it is set.
  Exception updateException;

  /** Number of updates asked from the replica in the follow-up "getUpdates" request. */
  long totalRequestedUpdates;
}
/**
 * Convenience constructor that delegates to the six-argument constructor with {@code
 * onlyIfActive=false} and {@code doFingerprint=true}.
 *
 * @param core the local core to bring up to date
 * @param replicas addresses of the replicas to sync against
 * @param nUpdates number of recent versions to request
 * @param cantReachIsSuccess whether certain connection failures while fetching versions are
 *     counted as success
 */
public PeerSync(SolrCore core, List<String> replicas, int nUpdates, boolean cantReachIsSuccess) {
  this(core, replicas, nUpdates, cantReachIsSuccess, false, true);
}
/**
 * Creates a PeerSync bound to the given core and registers it as a metric producer under the
 * REPLICATION category.
 *
 * @param core the local core to bring up to date
 * @param replicas addresses of the replicas to sync against
 * @param nUpdates number of recent versions to request
 * @param cantReachIsSuccess whether certain connection failures while fetching versions are
 *     counted as success
 * @param onlyIfActive forwarded as the "onlyIfActive" parameter of "getUpdates" requests
 * @param doFingerprint whether fingerprints are requested and compared; forced to false when
 *     the system property {@code solr.disableFingerprint} equals "true"
 */
public PeerSync(
    SolrCore core,
    List<String> replicas,
    int nUpdates,
    boolean cantReachIsSuccess,
    boolean onlyIfActive,
    boolean doFingerprint) {
  this.core = core;
  this.replicas = replicas;
  this.nUpdates = nUpdates;
  this.cantReachIsSuccess = cantReachIsSuccess;
  this.doFingerprint =
      doFingerprint && !("true".equals(System.getProperty("solr.disableFingerprint")));
  this.onlyIfActive = onlyIfActive;
  // uhandler must be assigned before msg() is called below, since msg() reads it.
  this.uhandler = core.getUpdateHandler();
  this.ulog = uhandler.getUpdateLog();
  // TODO: close
  this.shardHandlerFactory = core.getCoreContainer().getShardHandlerFactory();
  this.shardHandler = shardHandlerFactory.getShardHandler();
  this.updater = new Updater(msg(), core);
  core.getCoreMetricManager()
      .registerMetricProducer(SolrInfoBean.Category.REPLICATION.toString(), this);
}
/** Name component passed to every timer and counter created in initializeMetrics. */
public static final String METRIC_SCOPE = "peerSync";
/** Returns the metrics context that was assigned in initializeMetrics. */
@Override
public SolrMetricsContext getSolrMetricsContext() {
  return this.solrMetricsContext;
}
/**
 * Derives a child metrics context from {@code parentContext} and creates the sync timer plus the
 * error and skipped counters in it.
 *
 * @param parentContext context from which the child context for this instance is obtained
 * @param scope scope passed through to every metric, followed by {@code METRIC_SCOPE}
 */
@Override
public void initializeMetrics(SolrMetricsContext parentContext, String scope) {
  final SolrMetricsContext childContext = parentContext.getChildContext(this);
  this.solrMetricsContext = childContext;
  this.syncTime = childContext.timer("time", scope, METRIC_SCOPE);
  this.syncErrors = childContext.counter("errors", scope, METRIC_SCOPE);
  this.syncSkipped = childContext.counter("skipped", scope, METRIC_SCOPE);
}
/**
 * Returns the absolute value of the element found at position {@code size * frac} of the list.
 *
 * <p>The computed position is clamped to the last index, so {@code frac == 1.0f} selects the
 * final element instead of reading one past the end.
 *
 * @param arr the versions to pick from; callers in this class sort it with absComparator first
 * @param frac fraction of the list size that selects the element, e.g. {@code 0.2f}
 * @return the absolute value of the selected element
 * @throws IndexOutOfBoundsException if {@code arr} is empty or {@code frac} is negative enough
 *     to yield a negative index
 */
public static long percentile(List<Long> arr, float frac) {
  int lastIndex = arr.size() - 1;
  int elem = Math.min((int) (arr.size() * frac), lastIndex);
  return Math.abs(arr.get(elem));
}
// Prefix for peersync related log messages. Includes the core name (and the base URL when a
// ZkController is available) so that messages can be correlated.
private String msg() {
  final ZkController zk = uhandler.core.getCoreContainer().getZkController();
  final String baseUrl = (zk == null) ? "" : zk.getBaseUrl();
  // TODO: core name turns up blank in many tests - find URL if cloud enabled?
  return "PeerSync: core=" + uhandler.core.getName() + " url=" + baseUrl + " ";
}
/**
 * Runs a peer sync against the configured replicas.
 *
 * <p>A successful result means that this core may be considered to have the latest updates. It
 * does not mean that the remote replica is in sync with us.
 *
 * <p>The sync timer only measures runs that get past the "already in sync" shortcut; the error
 * counter is incremented on every failure path and the skipped counter when the shortcut hits.
 *
 * @return {@code PeerSyncResult.success()} when the sync succeeded or was not needed, otherwise
 *     one of the {@code PeerSyncResult.failure(..)} results
 */
public PeerSyncResult sync() {
  if (ulog == null) {
    syncErrors.inc();
    return PeerSyncResult.failure();
  }
  Timer.Context timerContext = null;
  try {
    if (log.isInfoEnabled()) {
      log.info("{} START replicas={} nUpdates={}", msg(), replicas, nUpdates);
    }
    // check if we are already in sync to begin with
    if (doFingerprint && alreadyInSync()) {
      syncSkipped.inc();
      return PeerSyncResult.success();
    }
    // measure only when actual sync is performed
    timerContext = syncTime.time();
    // Fire off the requests before getting our own recent updates (for better concurrency)
    // This also allows us to avoid getting updates we don't need... if we got our updates and
    // then got their updates, they would have newer stuff that we also had (assuming updates are
    // going on and are being forwarded).
    for (String replica : replicas) {
      requestVersions(replica);
    }
    List<Long> ourUpdates;
    try (UpdateLog.RecentUpdates recentUpdates = ulog.getRecentUpdates()) {
      ourUpdates = recentUpdates.getVersions(nUpdates);
    }
    ourUpdates.sort(absComparator);
    if (ourUpdates.isEmpty()) {
      // we have no versions and hence no frame of reference to tell if we can use a peers
      // updates to bring us into sync
      return failWithoutLocalVersions();
    }
    long ourLowThreshold = percentile(ourUpdates, 0.8f);
    long ourHighThreshold = percentile(ourUpdates, 0.2f);
    this.missedUpdatesFinder =
        new MissedUpdatesFinder(ourUpdates, msg(), nUpdates, ourLowThreshold, ourHighThreshold);
    for (; ; ) {
      ShardResponse srsp = shardHandler.takeCompletedOrError();
      if (srsp == null) break;
      if (!handleResponse(srsp)) {
        if (log.isInfoEnabled()) {
          log.info("{} DONE. sync failed", msg());
        }
        shardHandler.cancelAll();
        syncErrors.inc();
        return PeerSyncResult.failure();
      }
    }
    // finish up any comparisons with other shards that we deferred
    boolean success = compareDeferredFingerprints();
    if (log.isInfoEnabled()) {
      log.info("{} DONE. sync {}", msg(), (success ? "succeeded" : "failed"));
    }
    if (!success) {
      syncErrors.inc();
    }
    return success ? PeerSyncResult.success() : PeerSyncResult.failure();
  } finally {
    if (timerContext != null) {
      timerContext.close();
    }
  }
}

/**
 * Failure path of sync() for the case that the local update log returned no versions: drains the
 * outstanding version responses and reports {@code failure(true)} as soon as one replica
 * returned a non-empty "versions" list, otherwise {@code failure(false)}.
 */
private PeerSyncResult failWithoutLocalVersions() {
  if (log.isInfoEnabled()) {
    log.info("{} DONE. We have no versions. sync failed.", msg());
  }
  for (; ; ) {
    ShardResponse srsp = shardHandler.takeCompletedOrError();
    if (srsp == null) break;
    if (srsp.getException() == null) {
      @SuppressWarnings({"unchecked"})
      List<Long> otherVersions =
          (List<Long>) srsp.getSolrResponse().getResponse().get("versions");
      if (otherVersions != null && !otherVersions.isEmpty()) {
        // NOTE(review): unlike the failure path in sync(), outstanding requests are not
        // cancelled before returning here - confirm that this is intentional.
        syncErrors.inc();
        return PeerSyncResult.failure(true);
      }
    }
  }
  syncErrors.inc();
  return PeerSyncResult.failure(false);
}

/**
 * Runs the fingerprint comparisons that handleVersions deferred; stops at the first mismatch.
 * Returns true when no comparison was deferred or all of them matched.
 */
private boolean compareDeferredFingerprints() {
  for (SyncShardRequest sreq : requests) {
    if (sreq.doFingerprintComparison && !compareFingerprint(sreq)) {
      return false;
    }
  }
  return true;
}
/**
 * Checks whether we are already in sync by comparing our index fingerprint with the fingerprint
 * of every replica.
 *
 * @return true as soon as one replica reports a fingerprint equal to ours; false when none
 *     does, or when our own fingerprint could not be computed
 */
private boolean alreadyInSync() {
  for (String replica : replicas) {
    requestFingerprint(replica);
  }
  // We only compute fingerprint during leader election. Therefore after heavy indexing,
  // the call to compute fingerprint takes awhile and slows the leader election.
  // So we do it in parallel with fetching the fingerprint from the other replicas
  IndexFingerprint ourFingerprint;
  try {
    ourFingerprint = IndexFingerprint.getFingerprint(core, Long.MAX_VALUE);
  } catch (IOException e) {
    // Pass the exception along so the reason for falling back to a full PeerSync is visible.
    // NOTE(review): the fingerprint requests submitted above are still outstanding when we
    // return here - confirm that the caller copes with their responses.
    log.warn("Could not confirm if we are already in sync. Continue with PeerSync", e);
    return false;
  }
  for (; ; ) {
    ShardResponse srsp = shardHandler.takeCompletedOrError();
    if (srsp == null) break;
    Object replicaFingerprint = null;
    if (srsp.getSolrResponse() != null && srsp.getSolrResponse().getResponse() != null) {
      replicaFingerprint = srsp.getSolrResponse().getResponse().get("fingerprint");
    }
    if (replicaFingerprint == null) {
      log.warn(
          "Replica did not return a fingerprint - possibly an older Solr version or exception");
      continue;
    }
    IndexFingerprint otherFingerprint = IndexFingerprint.fromObject(replicaFingerprint);
    if (IndexFingerprint.compare(otherFingerprint, ourFingerprint) == 0) {
      log.info("We are already in sync. No need to do a PeerSync ");
      return true;
    }
  }
  return false;
}
/**
 * Submits a non-distributed "/get" request asking the given replica for its index fingerprint
 * up to {@code Long.MAX_VALUE}. The request is recorded in {@code requests}; no purpose is
 * assigned to it.
 *
 * @param replica address of the replica to query
 */
private void requestFingerprint(String replica) {
  SyncShardRequest sreq = new SyncShardRequest();
  requests.add(sreq);
  sreq.shards = new String[] {replica};
  sreq.actualShards = sreq.shards;
  // The params object used to be allocated twice in a row; one allocation is enough.
  sreq.params = new ModifiableSolrParams();
  sreq.params.set("qt", "/get");
  sreq.params.set(DISTRIB, false);
  sreq.params.set("getFingerprint", String.valueOf(Long.MAX_VALUE));
  shardHandler.submit(sreq, replica, sreq.params);
}
/**
 * Submits a non-distributed "/get" request asking the given replica for its {@code nUpdates}
 * most recent versions, tagged with the GET_VERSIONS purpose and recorded in {@code requests}.
 *
 * @param replica address of the replica to query
 */
private void requestVersions(String replica) {
  final ModifiableSolrParams versionParams = new ModifiableSolrParams();
  versionParams.set("qt", "/get");
  versionParams.set(DISTRIB, false);
  versionParams.set("getVersions", nUpdates);
  versionParams.set("fingerprint", doFingerprint);

  final String[] target = new String[] {replica};
  final SyncShardRequest request = new SyncShardRequest();
  request.purpose = SHARD_REQUEST_PURPOSE_GET_VERSIONS;
  request.shards = target;
  request.actualShards = target;
  request.params = versionParams;

  requests.add(request);
  shardHandler.submit(request, replica, versionParams);
}
/**
 * Processes one shard response: successful responses are dispatched to handleVersions or
 * handleUpdates depending on the request purpose; failed responses are either tolerated or
 * reported as a failure.
 *
 * @param srsp the response taken from the shard handler
 * @return true when the response was handled (or its failure tolerated), false when the sync
 *     must be considered failed
 */
private boolean handleResponse(ShardResponse srsp) {
  final ShardRequest sreq = srsp.getShardRequest();
  final boolean versionsRequest = sreq.purpose == SHARD_REQUEST_PURPOSE_GET_VERSIONS;
  final Throwable error = srsp.getException();

  if (error == null) {
    return versionsRequest ? handleVersions(srsp) : handleUpdates(srsp);
  }

  // TODO: look at this more thoroughly - we don't want
  // to fail on connection exceptions, but it may make sense
  // to determine this based on the number of fails
  //
  // Failures are only tolerated for version requests: if the replica went down between asking
  // for versions and asking for specific updates, that shouldn't be treated as success since we
  // counted on getting those updates back (and avoided redundantly asking other replicas).
  if (cantReachIsSuccess && versionsRequest) {
    if (error instanceof SolrServerException) {
      final Throwable rootCause = ((SolrServerException) error).getRootCause();
      final boolean timeoutInChain = connectTimeoutExceptionInChain(error);
      final boolean unreachable =
          timeoutInChain
              || rootCause instanceof ConnectTimeoutException
              || rootCause instanceof SocketTimeoutException
              || rootCause instanceof NoHttpResponseException
              || rootCause instanceof SocketException;
      if (unreachable) {
        log.warn(
            "{} couldn't connect to {}, counting as success ",
            msg(),
            srsp.getShardAddress(),
            error);
        return true;
      }
    }
    if (error instanceof SolrException) {
      final int code = ((SolrException) error).code();
      if (code == 503) {
        log.warn(
            "{} got a 503 from {}, counting as success ", msg(), srsp.getShardAddress(), error);
        return true;
      }
      if (code == 404) {
        log.warn(
            "{} got a 404 from {}, counting as success. {} Perhaps /get is not registered?",
            msg(),
            srsp.getShardAddress(),
            error);
        return true;
      }
    }
  }

  // TODO: we should return the above information so that when we can request a recovery through
  // zookeeper, we do that for these nodes
  log.warn("{} exception talking to {}, failed", msg(), srsp.getShardAddress(), error);
  return false;
}
// Walks the cause chain starting at the given exception and reports whether any link is a
// ConnectTimeoutException. Needed because the root cause is sometimes a SocketTimeoutException
// while the ConnectTimeoutException sits further up the chain.
private boolean connectTimeoutExceptionInChain(Throwable exception) {
  Throwable current = exception;
  do {
    if (current instanceof ConnectTimeoutException) {
      return true;
    }
    current = current.getCause();
  } while (current != null);
  return false;
}
/**
 * Handles the response to a "getVersions" request: records the replica's fingerprint when
 * present, asks the missed-updates finder what is missing locally and, if needed, submits the
 * follow-up request for those updates.
 *
 * <p>A response without a "versions" entry is treated like an empty version list instead of
 * failing with a NullPointerException.
 *
 * @param srsp successful response of a GET_VERSIONS request
 * @return false when the finder reports that syncing is not possible; otherwise true, or the
 *     result of requestUpdates when updates had to be requested
 */
private boolean handleVersions(ShardResponse srsp) {
  // we retrieved the last N updates from the replica
  @SuppressWarnings({"unchecked"})
  List<Long> otherVersions = (List<Long>) srsp.getSolrResponse().getResponse().get("versions");
  // TODO: how to handle short lists?
  SyncShardRequest sreq = (SyncShardRequest) srsp.getShardRequest();
  Object fingerprint = srsp.getSolrResponse().getResponse().get("fingerprint");
  int receivedCount = (otherVersions == null) ? 0 : otherVersions.size();
  if (log.isInfoEnabled()) {
    log.info(
        "{} Received {} versions from {} fingerprint:{}",
        msg(),
        receivedCount,
        sreq.shards[0],
        fingerprint);
  }
  if (fingerprint != null) {
    sreq.fingerprint = IndexFingerprint.fromObject(fingerprint);
  }
  if (receivedCount == 0) {
    // when syncing with other replicas, they may not contain any updates
    return true;
  }
  MissedUpdatesRequest updatesRequest = missedUpdatesFinder.find(otherVersions, sreq.shards[0]);
  if (updatesRequest == MissedUpdatesRequest.ALREADY_IN_SYNC) {
    return true;
  }
  if (updatesRequest == MissedUpdatesRequest.UNABLE_TO_SYNC) {
    return false;
  }
  if (updatesRequest == MissedUpdatesRequest.EMPTY) {
    // If we requested updates from another replica, we can't compare fingerprints yet with this
    // replica, we need to defer
    if (doFingerprint) {
      sreq.doFingerprintComparison = true;
    }
    return true;
  }
  sreq.totalRequestedUpdates = updatesRequest.totalRequestedUpdates;
  return requestUpdates(
      srsp, updatesRequest.versionsAndRanges, updatesRequest.totalRequestedUpdates);
}
/**
 * Compares the fingerprint stored on the request with a freshly computed local fingerprint.
 *
 * @param sreq request whose {@code fingerprint} field holds the replica's fingerprint
 * @return true when the request carries no fingerprint or both fingerprints compare equal;
 *     false on mismatch or when the local fingerprint cannot be computed
 */
private boolean compareFingerprint(SyncShardRequest sreq) {
  final IndexFingerprint theirs = sreq.fingerprint;
  if (theirs == null) {
    return true;
  }
  try {
    // Compute our fingerprint only up to the max version of the other fingerprint. Otherwise,
    // for missed updates (see the missed update test in PeerSyncTest), the two would not match.
    final IndexFingerprint ours =
        IndexFingerprint.getFingerprint(core, theirs.getMaxVersionSpecified());
    final int cmp = IndexFingerprint.compare(theirs, ours);
    log.info("Fingerprint comparison: {}", cmp);
    // currently, we only check for equality...
    final boolean identical = (cmp == 0);
    if (!identical) {
      log.info("Other fingerprint: {}, Our fingerprint: {}", theirs, ours);
    }
    return identical;
  } catch (IOException e) {
    log.error("{} Error getting index fingerprint", msg(), e);
    return false;
  }
}
/**
 * Re-submits the request behind {@code srsp} as a "getUpdates" request for the given versions.
 *
 * @param srsp response whose request object is reused for the follow-up call
 * @param versionsAndRanges value sent as the "getUpdates" parameter
 * @param totalUpdates number of updates being requested; only used for logging here
 * @return always true
 */
private boolean requestUpdates(ShardResponse srsp, String versionsAndRanges, long totalUpdates) {
  // reuse our original request object
  final ShardRequest original = srsp.getShardRequest();
  final String replica = original.shards[0];
  if (log.isInfoEnabled()) {
    log.info(
        "{} Requesting updates from {} n={} versions={}",
        msg(),
        replica,
        totalUpdates,
        versionsAndRanges);
  }

  final ModifiableSolrParams updateParams = new ModifiableSolrParams();
  updateParams.set("qt", "/get");
  updateParams.set(DISTRIB, false);
  updateParams.set("getUpdates", versionsAndRanges);
  updateParams.set("onlyIfActive", onlyIfActive);
  updateParams.set("fingerprint", doFingerprint);

  original.purpose = SHARD_REQUEST_PURPOSE_GET_UPDATES;
  original.params = updateParams;
  original.responses.clear(); // needs to be zeroed for correct correlation to occur
  shardHandler.submit(original, replica, updateParams);
  return true;
}
private boolean handleUpdates(ShardResponse srsp) {
// we retrieved the last N updates from the replica
@SuppressWarnings({"unchecked"})
List