// org.apache.solr.cloud.ZkShardTerms
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.cloud;
import java.lang.invoke.MethodHandles;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.concurrent.TimeoutException;
import java.util.concurrent.atomic.AtomicBoolean;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.cloud.SolrZkClient;
import org.apache.solr.common.cloud.ZkStateReader;
import org.apache.solr.common.util.ObjectReleaseTracker;
import org.apache.solr.common.util.Utils;
import org.apache.solr.core.CoreDescriptor;
import org.apache.zookeeper.CreateMode;
import org.apache.zookeeper.KeeperException;
import org.apache.zookeeper.Watcher;
import org.apache.zookeeper.data.Stat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Class used for interacting with a ZK term node.
 * Each ZK term node relates to a shard of a collection and has this format (in JSON):
 * <p>
 * <pre>
 * {
 *   "replicaNodeName1" : 1,
 *   "replicaNodeName2" : 2,
 *   ..
 * }
 * </pre>
 * <p>
 * The values corresponding to replicas are called terms.
 * Only replicas with the highest term value are considered up to date and are able to become leader and serve queries.
 * <p>
 * Terms can only be updated in two strict ways:
 * <ul>
 * <li>A replica sets its term equal to the leader's term
 * <li>The leader increases its term and the terms of some other replicas by 1
 * </ul>
 * This class should not be reused after a {@link org.apache.zookeeper.Watcher.Event.KeeperState#Expired} event.
 */
public class ZkShardTerms implements AutoCloseable {

  private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());

  // Suffix appended to a replica's coreNodeName to build the key that marks the replica as recovering
  private static final String RECOVERING_TERM_SUFFIX = "_recovering";

  // Guards the compare-and-replace of {@link #terms} in setNewTerms()
  private final Object writingLock = new Object();
  private final String collection;
  private final String shard;
  private final String znodePath;
  private final SolrZkClient zkClient;
  // Guarded by synchronized (listeners)
  private final Set<CoreTermWatcher> listeners = new HashSet<>();
  private final AtomicBoolean isClosed = new AtomicBoolean(false);

  // Replaced under writingLock (also from the ZK watcher callback) but read without a lock
  // by most accessors, so it must be volatile for those reads to observe the latest snapshot.
  private volatile Terms terms;

  // Listener of a core for shard's term change events
  interface CoreTermWatcher {
    // return true if the listener wants to be triggered the next time terms change
    boolean onTermChanged(Terms terms);
  }

  /**
   * Create the helper for the term node of {@code shard} in {@code collection}.
   * The term node is created if missing, the current terms are loaded and a watcher is registered.
   * @param collection name of the collection
   * @param shard name of the shard
   * @param zkClient client used for all ZK operations
   */
  public ZkShardTerms(String collection, String shard, SolrZkClient zkClient) {
    this.znodePath = ZkStateReader.COLLECTIONS_ZKNODE + "/" + collection + "/terms/" + shard;
    this.collection = collection;
    this.shard = shard;
    this.zkClient = zkClient;
    ensureTermNodeExist();
    refreshTerms();
    retryRegisterWatcher();
    ObjectReleaseTracker.track(this);
  }

  /**
   * Ensure that leader's term is higher than some replica's terms
   * @param leader coreNodeName of leader
   * @param replicasNeedingRecovery set of replicas in which their terms should be lower than leader's term
   */
  public void ensureTermsIsHigher(String leader, Set<String> replicasNeedingRecovery) {
    if (replicasNeedingRecovery.isEmpty()) return;

    Terms newTerms;
    // Each failed save refreshes the local terms, so the update is recomputed from the latest snapshot
    while ((newTerms = terms.increaseTerms(leader, replicasNeedingRecovery)) != null) {
      if (forceSaveTerms(newTerms)) return;
    }
  }

  /**
   * Can this replica become leader?
   * @param coreNodeName of the replica
   * @return true if this replica can become leader, false if otherwise
   */
  public boolean canBecomeLeader(String coreNodeName) {
    return terms.canBecomeLeader(coreNodeName);
  }

  /**
   * Should leader skip sending updates to this replica?
   * @param coreNodeName of the replica
   * @return true if this replica does not have the highest term value, false if it does
   */
  public boolean skipSendingUpdatesTo(String coreNodeName) {
    return !terms.haveHighestTermValue(coreNodeName);
  }

  /**
   * Did this replica register its term?
   * @param coreNodeName of the replica
   * @return true if a term is stored for this replica, false if otherwise
   */
  public boolean registered(String coreNodeName) {
    return terms.getTerm(coreNodeName) != null;
  }

  /**
   * Mark this object as closed (no further watcher will be registered) and drop all listeners.
   */
  @Override
  public void close() {
    // no watcher will be registered
    isClosed.set(true);
    synchronized (listeners) {
      listeners.clear();
    }
    ObjectReleaseTracker.release(this);
  }

  // package private for testing, only used by tests
  // returns a copy, so callers can not modify the current terms
  Map<String, Long> getTerms() {
    synchronized (writingLock) {
      return new HashMap<>(terms.values);
    }
  }

  /**
   * Add a listener so the next time the shard's term get updated, listeners will be called
   */
  void addListener(CoreTermWatcher listener) {
    synchronized (listeners) {
      listeners.add(listener);
    }
  }

  /**
   * Remove the coreNodeName from terms map and also remove any expired listeners.
   * Every listener is invoked with the current terms; those returning false are dropped.
   * @return Return true if this object should not be reused
   */
  boolean removeTerm(CoreDescriptor cd) {
    int numListeners;
    synchronized (listeners) {
      // solrcore already closed
      listeners.removeIf(coreTermWatcher -> !coreTermWatcher.onTermChanged(terms));
      numListeners = listeners.size();
    }
    return removeTerm(cd.getCloudDescriptor().getCoreNodeName()) || numListeners == 0;
  }

  // package private for testing, only used by tests
  // return true if this object should not be reused:
  // either there was no term left to remove, or the term node does not exist anymore
  boolean removeTerm(String coreNodeName) {
    Terms newTerms;
    while ((newTerms = terms.removeTerm(coreNodeName)) != null) {
      try {
        if (saveTerms(newTerms)) return false;
      } catch (KeeperException.NoNodeException e) {
        return true;
      }
    }
    return true;
  }

  /**
   * Register a replica's term (term value will be 0).
   * If a term is already associate with this replica do nothing
   * @param coreNodeName of the replica
   */
  void registerTerm(String coreNodeName) {
    Terms newTerms;
    while ((newTerms = terms.registerTerm(coreNodeName)) != null) {
      if (forceSaveTerms(newTerms)) break;
    }
  }

  /**
   * Set a replica's term equals to leader's term, and remove recovering flag of a replica.
   * This call should only be used by {@link org.apache.solr.common.params.CollectionParams.CollectionAction#FORCELEADER}
   * @param coreNodeName of the replica
   */
  public void setTermEqualsToLeader(String coreNodeName) {
    Terms newTerms;
    while ((newTerms = terms.setTermEqualsToLeader(coreNodeName)) != null) {
      if (forceSaveTerms(newTerms)) break;
    }
  }

  /**
   * Set a replica's term to 0, registering the replica if it has no term yet.
   * @param coreNodeName of the replica
   */
  public void setTermToZero(String coreNodeName) {
    Terms newTerms;
    while ((newTerms = terms.setTermToZero(coreNodeName)) != null) {
      if (forceSaveTerms(newTerms)) break;
    }
  }

  /**
   * Mark {@code coreNodeName} as recovering
   */
  public void startRecovering(String coreNodeName) {
    Terms newTerms;
    while ((newTerms = terms.startRecovering(coreNodeName)) != null) {
      if (forceSaveTerms(newTerms)) break;
    }
  }

  /**
   * Mark {@code coreNodeName} as finished recovering
   */
  public void doneRecovering(String coreNodeName) {
    Terms newTerms;
    while ((newTerms = terms.doneRecovering(coreNodeName)) != null) {
      if (forceSaveTerms(newTerms)) break;
    }
  }

  /**
   * Is the replica {@code name} currently marked as recovering?
   * @param name coreNodeName of the replica
   * @return true if the recovering marker of the replica is present in the current terms
   */
  public boolean isRecovering(String name) {
    return terms.values.containsKey(Terms.recoveringTerm(name));
  }

  /**
   * When first updates come in, all replicas have some data now,
   * so we must switch from term 0 (registered) to 1 (have some data)
   */
  public void ensureHighestTermsAreNotZero() {
    Terms newTerms;
    while ((newTerms = terms.ensureHighestTermsAreNotZero()) != null) {
      if (forceSaveTerms(newTerms)) break;
    }
  }

  /**
   * @return the highest value stored in the current terms, 0 if there are none
   */
  public long getHighestTerm() {
    return terms.getMaxTerm();
  }

  /**
   * @param coreNodeName of the replica
   * @return term of the replica, or -1 if the replica has no term
   */
  public long getTerm(String coreNodeName) {
    Long term = terms.getTerm(coreNodeName);
    return term == null ? -1 : term;
  }

  // package private for testing, only used by tests
  int getNumListeners() {
    synchronized (listeners) {
      return listeners.size();
    }
  }

  /**
   * Set new terms to ZK.
   * In case of correspond ZK term node is not created, create it
   * @param newTerms to be set
   * @return true if terms is saved successfully to ZK, false if otherwise
   */
  private boolean forceSaveTerms(Terms newTerms) {
    try {
      return saveTerms(newTerms);
    } catch (KeeperException.NoNodeException e) {
      ensureTermNodeExist();
      return false;
    }
  }

  /**
   * Set new terms to ZK, the version of new terms must match the current ZK term node
   * @param newTerms to be set
   * @return true if terms is saved successfully to ZK, false if otherwise
   * @throws KeeperException.NoNodeException correspond ZK term node is not created
   */
  private boolean saveTerms(Terms newTerms) throws KeeperException.NoNodeException {
    byte[] znodeData = Utils.toJSON(newTerms.values);
    try {
      Stat stat = zkClient.setData(znodePath, znodeData, newTerms.version, true);
      setNewTerms(new Terms(newTerms.values, stat.getVersion()));
      log.info("Successful update of terms at {} to {}", znodePath, newTerms);
      return true;
    } catch (KeeperException.BadVersionException e) {
      // somebody else updated the node first: reload so the caller can recompute and retry
      log.info("Failed to save terms, version is not a match, retrying");
      refreshTerms();
    } catch (KeeperException.NoNodeException e) {
      throw e;
    } catch (InterruptedException e) {
      // restore the interrupt status before reporting the failure
      Thread.currentThread().interrupt();
      throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Error while saving shard term for collection: " + collection, e);
    } catch (Exception e) {
      throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Error while saving shard term for collection: " + collection, e);
    }
    return false;
  }

  /**
   * Create correspond ZK term node, initialized with an empty terms map.
   * Does nothing if the node already exists.
   */
  private void ensureTermNodeExist() {
    try {
      // use the same path every other operation of this class reads and writes
      Map<String, Long> initialTerms = new HashMap<>();
      zkClient.makePath(znodePath, Utils.toJSON(initialTerms), CreateMode.PERSISTENT, true);
    } catch (KeeperException.NodeExistsException ignored) {
      // it's okay if another beats us creating the node
    } catch (InterruptedException e) {
      Thread.currentThread().interrupt();
      throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
          "Error creating shard term node in Zookeeper for collection: " + collection, e);
    } catch (KeeperException e) {
      throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
          "Error creating shard term node in Zookeeper for collection: " + collection, e);
    }
  }

  /**
   * Fetch latest terms from ZK
   */
  public void refreshTerms() {
    Terms newTerms;
    try {
      Stat stat = new Stat();
      byte[] data = zkClient.getData(znodePath, null, stat, true);
      // the term node holds a JSON object of coreNodeName -> term
      @SuppressWarnings("unchecked")
      Map<String, Long> values = (Map<String, Long>) Utils.fromJSON(data);
      newTerms = new Terms(values, stat.getVersion());
    } catch (KeeperException e) {
      throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Error updating shard term for collection: " + collection, e);
    } catch (InterruptedException e) {
      Thread.currentThread().interrupt();
      throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Error updating shard term for collection: " + collection, e);
    }

    setNewTerms(newTerms);
  }

  /**
   * Retry register a watcher to the correspond ZK term node.
   * Gives up (and marks this object as closed) on session expiration or authentication failure.
   */
  private void retryRegisterWatcher() {
    while (!isClosed.get()) {
      try {
        registerWatcher();
        return;
      } catch (KeeperException.SessionExpiredException | KeeperException.AuthFailedException e) {
        isClosed.set(true);
        log.error("Failed watching shard term for collection: {} due to unrecoverable exception", collection, e);
        return;
      } catch (KeeperException e) {
        log.warn("Failed watching shard term for collection: {}, retrying!", collection, e);
        try {
          zkClient.getConnectionManager().waitForConnected(zkClient.getZkClientTimeout());
        } catch (TimeoutException te) {
          // stop retrying when interrupted; isInterrupted() leaves the interrupt status set for callers
          if (Thread.currentThread().isInterrupted()) {
            throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Error watching shard term for collection: " + collection, te);
          }
        }
      }
    }
  }

  /**
   * Register a watcher to the correspond ZK term node
   */
  private void registerWatcher() throws KeeperException {
    Watcher watcher = event -> {
      // session events are not change events, and do not remove the watcher
      if (Watcher.Event.EventType.None == event.getType()) {
        return;
      }
      retryRegisterWatcher();
      // Some events may be missed during register a watcher, so it is safer to refresh terms after registering watcher
      refreshTerms();
    };
    try {
      // exists operation is faster than getData operation
      zkClient.exists(znodePath, watcher, true);
    } catch (InterruptedException e) {
      Thread.currentThread().interrupt();
      throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Error watching shard term for collection: " + collection, e);
    }
  }

  /**
   * Atomically update {@link ZkShardTerms#terms} and call listeners.
   * The update is only applied when {@code newTerms} has a newer version than the current terms.
   * @param newTerms to be set
   */
  private void setNewTerms(Terms newTerms) {
    boolean isChanged = false;
    synchronized (writingLock) {
      if (terms == null || newTerms.version > terms.version) {
        terms = newTerms;
        isChanged = true;
      }
    }
    if (isChanged) onTermUpdates(newTerms);
  }

  // Notify every listener about the new terms, dropping the ones which return false
  private void onTermUpdates(Terms newTerms) {
    synchronized (listeners) {
      listeners.removeIf(coreTermWatcher -> !coreTermWatcher.onTermChanged(newTerms));
    }
  }

  /**
   * Hold values of terms, this class is immutable.
   * All update methods return a new instance (carrying the same ZK version) or null when nothing has to change.
   */
  static class Terms {
    private final Map<String, Long> values;
    // highest value in {@link #values}, 0 if empty
    private final long maxTerm;
    // ZK node version
    private final int version;

    public Terms() {
      this(new HashMap<>(), 0);
    }

    public Terms(Map<String, Long> values, int version) {
      this.values = values;
      this.version = version;
      if (values.isEmpty()) this.maxTerm = 0;
      else this.maxTerm = Collections.max(values.values());
    }

    // Key under which the recovering marker of a replica is stored
    private static String recoveringTerm(String coreNodeName) {
      return coreNodeName + RECOVERING_TERM_SUFFIX;
    }

    /**
     * Can {@code coreNodeName} become leader?
     * @param coreNodeName of the replica
     * @return true if {@code coreNodeName} can become leader, false if otherwise
     */
    boolean canBecomeLeader(String coreNodeName) {
      return haveHighestTermValue(coreNodeName) && !values.containsKey(recoveringTerm(coreNodeName));
    }

    /**
     * Is {@code coreNodeName}'s term highest?
     * A replica without a term is treated as having term 0.
     * @param coreNodeName of the replica
     * @return true if term of {@code coreNodeName} is highest
     */
    boolean haveHighestTermValue(String coreNodeName) {
      // maxTerm is 0 for empty values, so this is also true when no term is stored at all
      return values.getOrDefault(coreNodeName, 0L) == maxTerm;
    }

    /**
     * @param coreNodeName of the replica
     * @return term of the replica, null if the replica has no term
     */
    Long getTerm(String coreNodeName) {
      return values.get(coreNodeName);
    }

    /**
     * Return a new {@link Terms} in which term of {@code leader} is higher than {@code replicasNeedingRecovery}
     * @param leader coreNodeName of leader
     * @param replicasNeedingRecovery set of replicas in which their terms should be lower than leader's term
     * @return null if term of {@code leader} is already higher than {@code replicasNeedingRecovery}
     * @throws SolrException if {@code leader} has no term
     */
    Terms increaseTerms(String leader, Set<String> replicasNeedingRecovery) {
      if (!values.containsKey(leader)) {
        throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Can not find leader's term " + leader);
      }

      // true once an entry which must stay behind currently shares the leader's term
      boolean changed = false;
      // true once any of replicasNeedingRecovery is present in the local terms
      boolean foundReplicasInLowerTerms = false;

      Map<String, Long> newValues = new HashMap<>(values);
      long leaderTerm = newValues.get(leader);
      for (Map.Entry<String, Long> entry : values.entrySet()) {
        String key = entry.getKey();
        if (replicasNeedingRecovery.contains(key)) foundReplicasInLowerTerms = true;
        if (Objects.equals(entry.getValue(), leaderTerm)) {
          if (skipIncreaseTermOf(key, replicasNeedingRecovery)) {
            changed = true;
          } else {
            // everybody at the leader's term (including the leader) moves one term ahead
            newValues.put(key, leaderTerm + 1);
          }
        }
      }

      // We should skip the optimization if there are no replicasNeedingRecovery present in local terms,
      // this may indicate that the current value is stale
      if (!changed && foundReplicasInLowerTerms) return null;
      return new Terms(newValues, version);
    }

    // true if key is one of replicasNeedingRecovery or the recovering marker of one of them
    private boolean skipIncreaseTermOf(String key, Set<String> replicasNeedingRecovery) {
      if (key.endsWith(RECOVERING_TERM_SUFFIX)) {
        key = key.substring(0, key.length() - RECOVERING_TERM_SUFFIX.length());
      }
      return replicasNeedingRecovery.contains(key);
    }

    /**
     * Return a new {@link Terms} in which highest terms are not zero
     * @return null if highest terms are already larger than zero
     */
    Terms ensureHighestTermsAreNotZero() {
      if (maxTerm > 0) return null;

      Map<String, Long> newValues = new HashMap<>(values);
      for (String replica : values.keySet()) {
        newValues.put(replica, 1L);
      }
      return new Terms(newValues, version);
    }

    /**
     * Return a new {@link Terms} in which term of {@code coreNodeName} is removed
     * @param coreNodeName of the replica
     * @return null if term of {@code coreNodeName} is already not exist
     */
    Terms removeTerm(String coreNodeName) {
      if (!values.containsKey(coreNodeName)) return null;

      Map<String, Long> newValues = new HashMap<>(values);
      newValues.remove(coreNodeName);
      return new Terms(newValues, version);
    }

    /**
     * Return a new {@link Terms} in which the associate term of {@code coreNodeName} is not null
     * @param coreNodeName of the replica
     * @return null if term of {@code coreNodeName} is already exist
     */
    Terms registerTerm(String coreNodeName) {
      if (values.containsKey(coreNodeName)) return null;

      Map<String, Long> newValues = new HashMap<>(values);
      newValues.put(coreNodeName, 0L);
      return new Terms(newValues, version);
    }

    /**
     * Return a new {@link Terms} in which the term of {@code coreNodeName} is 0
     * @param coreNodeName of the replica
     * @return null if term of {@code coreNodeName} is already 0
     */
    Terms setTermToZero(String coreNodeName) {
      if (values.getOrDefault(coreNodeName, -1L) == 0) {
        return null;
      }
      Map<String, Long> newValues = new HashMap<>(values);
      newValues.put(coreNodeName, 0L);
      return new Terms(newValues, version);
    }

    /**
     * Return a new {@link Terms} in which the term of {@code coreNodeName} is max
     * and its recovering marker is removed.
     * A replica without a term is given the max term.
     * @param coreNodeName of the replica
     * @return null if term of {@code coreNodeName} is already maximum
     */
    Terms setTermEqualsToLeader(String coreNodeName) {
      // Objects.equals avoids unboxing a null term when the replica is not registered
      if (Objects.equals(values.get(coreNodeName), maxTerm)) return null;

      Map<String, Long> newValues = new HashMap<>(values);
      newValues.put(coreNodeName, maxTerm);
      newValues.remove(recoveringTerm(coreNodeName));
      return new Terms(newValues, version);
    }

    long getMaxTerm() {
      return maxTerm;
    }

    /**
     * Mark {@code coreNodeName} as recovering: its term becomes the max term and,
     * unless a marker already exists, its previous term (0 if it had none) is kept under the recovering marker.
     * @param coreNodeName of the replica
     * @return null if term of {@code coreNodeName} is already the max term
     */
    Terms startRecovering(String coreNodeName) {
      // Objects.equals avoids unboxing a null term when the replica is not registered
      if (Objects.equals(values.get(coreNodeName), maxTerm)) return null;

      Map<String, Long> newValues = new HashMap<>(values);
      String recoveringKey = recoveringTerm(coreNodeName);
      if (!newValues.containsKey(recoveringKey)) {
        long currentTerm = newValues.getOrDefault(coreNodeName, 0L);
        // by keeping old term, we will have more information in leader election
        newValues.put(recoveringKey, currentTerm);
      }
      newValues.put(coreNodeName, maxTerm);
      return new Terms(newValues, version);
    }

    /**
     * Mark {@code coreNodeName} as finished recovering
     * @param coreNodeName of the replica
     * @return null if {@code coreNodeName} is not marked as recovering
     */
    Terms doneRecovering(String coreNodeName) {
      String recoveringKey = recoveringTerm(coreNodeName);
      if (!values.containsKey(recoveringKey)) {
        return null;
      }

      Map<String, Long> newValues = new HashMap<>(values);
      newValues.remove(recoveringKey);
      return new Terms(newValues, version);
    }

    @Override
    public String toString() {
      return "Terms{" +
          "values=" + values +
          ", version=" + version +
          '}';
    }
  }
}