All downloads are free. Search and download functionality uses the official Maven repository.

io.datakernel.remotefs.RemoteFsClusterClient Maven / Gradle / Ivy

Go to download

This package provides tools for building efficient, scalable remote file servers. It uses CSP for fast and reliable file transfer.

The newest version!
/*
 * Copyright (C) 2015-2019 SoftIndex LLC.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package io.datakernel.remotefs;

import io.datakernel.async.process.Cancellable;
import io.datakernel.async.service.EventloopService;
import io.datakernel.bytebuf.ByteBuf;
import io.datakernel.common.Initializable;
import io.datakernel.common.collection.Try;
import io.datakernel.common.exception.StacklessException;
import io.datakernel.common.tuple.Tuple2;
import io.datakernel.csp.ChannelConsumer;
import io.datakernel.csp.ChannelSupplier;
import io.datakernel.csp.process.ChannelSplitter;
import io.datakernel.eventloop.Eventloop;
import io.datakernel.eventloop.jmx.EventloopJmxMBeanEx;
import io.datakernel.jmx.api.JmxAttribute;
import io.datakernel.promise.Promise;
import io.datakernel.promise.Promises;
import io.datakernel.promise.jmx.PromiseStats;
import org.jetbrains.annotations.NotNull;
import org.jetbrains.annotations.Nullable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.time.Duration;
import java.util.*;
import java.util.function.BiFunction;

import static io.datakernel.async.util.LogUtils.toLogger;
import static io.datakernel.common.Preconditions.checkArgument;
import static io.datakernel.common.Preconditions.checkState;
import static io.datakernel.csp.ChannelConsumer.getAcknowledgement;
import static io.datakernel.remotefs.ServerSelector.RENDEZVOUS_HASH_SHARDER;
import static java.util.Collections.emptyList;
import static java.util.stream.Collectors.joining;
import static java.util.stream.Collectors.toList;

/**
 * An implementation of {@link FsClient} which operates on a map of other clients as a cluster.
 * Contains some redundancy and fail-safety capabilities.
 */
public final class RemoteFsClusterClient implements FsClient, Initializable, EventloopService, EventloopJmxMBeanEx {
	private static final Logger logger = LoggerFactory.getLogger(RemoteFsClusterClient.class);

	private final Eventloop eventloop;
	private final Map clients;
	private final Map aliveClients = new HashMap<>();
	private final Map deadClients = new HashMap<>();

	private int replicationCount = 1;
	private ServerSelector serverSelector = RENDEZVOUS_HASH_SHARDER;

	// region JMX
	private final PromiseStats connectPromise = PromiseStats.create(Duration.ofMinutes(5));
	private final PromiseStats uploadStartPromise = PromiseStats.create(Duration.ofMinutes(5));
	private final PromiseStats uploadFinishPromise = PromiseStats.create(Duration.ofMinutes(5));
	private final PromiseStats downloadStartPromise = PromiseStats.create(Duration.ofMinutes(5));
	private final PromiseStats downloadFinishPromise = PromiseStats.create(Duration.ofMinutes(5));
	private final PromiseStats movePromise = PromiseStats.create(Duration.ofMinutes(5));
	private final PromiseStats copyPromise = PromiseStats.create(Duration.ofMinutes(5));
	private final PromiseStats listPromise = PromiseStats.create(Duration.ofMinutes(5));
	private final PromiseStats deletePromise = PromiseStats.create(Duration.ofMinutes(5));
	// endregion

	// region creators
	private RemoteFsClusterClient(Eventloop eventloop, Map clients) {
		this.eventloop = eventloop;
		this.clients = clients;
		aliveClients.putAll(clients);
	}

	public static RemoteFsClusterClient create(Eventloop eventloop) {
		return new RemoteFsClusterClient(eventloop, new HashMap<>());
	}

	public static RemoteFsClusterClient create(Eventloop eventloop, Map clients) {
		return new RemoteFsClusterClient(eventloop, clients);
	}

	/**
	 * Adds given client with given partition id to this cluster
	 */
	public RemoteFsClusterClient withPartition(Object id, FsClient client) {
		clients.put(id, client);
		aliveClients.put(id, client);
		return this;
	}

	/**
	 * Sets the replication count that determines how many copies of the file should persist over the cluster.
	 */
	public RemoteFsClusterClient withReplicationCount(int replicationCount) {
		checkArgument(1 <= replicationCount && replicationCount <= clients.size(), "Replication count cannot be less than one or more than number of clients");
		this.replicationCount = replicationCount;
		return this;
	}

	/**
	 * Sets the server selection strategy based on file name, alive partitions, and replication count.
	 */
	public RemoteFsClusterClient withServerSelector(@NotNull ServerSelector serverSelector) {
		this.serverSelector = serverSelector;
		return this;
	}
	// endregion

	// region getters
	@NotNull
	@Override
	public Eventloop getEventloop() {
		return eventloop;
	}

	public Map getClients() {
		return Collections.unmodifiableMap(clients);
	}

	public Map getAliveClients() {
		return Collections.unmodifiableMap(aliveClients);
	}

	public Map getDeadClients() {
		return Collections.unmodifiableMap(deadClients);
	}

	public ServerSelector getServerSelector() {
		return serverSelector;
	}
	// endregion

	/**
	 * Starts a check process, which pings all partitions and marks them as dead or alive accordingly
	 *
	 * @return promise of the check
	 */
	public Promise checkAllPartitions() {
		return Promises.all(
				clients.entrySet().stream()
						.map(entry -> {
							Object id = entry.getKey();
							return entry.getValue()
									.ping()
									.mapEx(($, e) -> {
										if (e == null) {
											markAlive(id);
										} else {
											markDead(id, e);
										}
										return null;
									});
						}))
				.whenComplete(toLogger(logger, "checkAllPartitions"));
	}

	/**
	 * Starts a check process, which pings all dead partitions to possibly mark them as alive.
	 * This is the preferred method as it does nothing when no clients are marked as dead,
	 * and RemoteFS operations themselves do mark nodes as dead on connection failures.
	 *
	 * @return promise of the check
	 */
	public Promise checkDeadPartitions() {
		return Promises.all(
				deadClients.entrySet().stream()
						.map(entry -> entry.getValue()
								.ping()
								.mapEx(($, e) -> {
									if (e == null) {
										markAlive(entry.getKey());
									}
									return null;
								})))
				.whenComplete(toLogger(logger, "checkDeadPartitions"));
	}

	private void markAlive(Object partitionId) {
		FsClient client = deadClients.remove(partitionId);
		if (client != null) {
			logger.info("Partition " + partitionId + " is alive again!");
			aliveClients.put(partitionId, client);
		}
	}

	/**
	 * Mark partition as dead. It means that no operations will use it and it would not be given to the server selector.
	 * Next call of {@link #checkDeadPartitions()} or {@link #checkAllPartitions()} will ping this partition and possibly
	 * mark it as alive again.
	 *
	 * @param partitionId id of the partition to be marked
	 * @param e           optional exception for logging
	 * @return true if partition was alive and false otherwise
	 */
	public boolean markDead(Object partitionId, @Nullable Throwable e) {
		FsClient client = aliveClients.remove(partitionId);
		if (client != null) {
			logger.warn("marking " + partitionId + " as dead (" + e + ')');
			deadClients.put(partitionId, client);
			return true;
		}
		return false;
	}

	private void markIfDead(Object partitionId, Throwable e) {
		// marking as dead only on lower level connection and other I/O exceptions,
		// remote fs exceptions are the ones actually received with an ServerError response (so the node is obviously not dead)
		if (e.getClass() != StacklessException.class) {
			markDead(partitionId, e);
		}
	}

	private  BiFunction> wrapDeath(Object partitionId) {
		return (res, e) -> {
			if (e == null) {
				return Promise.of(res);
			}
			markIfDead(partitionId, e);
			return Promise.ofException(new StacklessException(RemoteFsClusterClient.class, "Node failed with exception", e));
		};
	}

	// shortcut for creating single Exception from list of possibly failed tries
	private static  Promise ofFailure(String message, List> failed) {
		StacklessException exception = new StacklessException(RemoteFsClusterClient.class, message);
		failed.stream()
				.map(Try::getExceptionOrNull)
				.filter(Objects::nonNull)
				.forEach(exception::addSuppressed);
		return Promise.ofException(exception);
	}

	private Promise> upload(@NotNull String filename, long offset, @Nullable Long revision) {
		List selected = serverSelector.selectFrom(filename, aliveClients.keySet(), replicationCount);

		checkState(!selected.isEmpty(), "Selected no servers to upload file " + filename);
		checkState(aliveClients.keySet().containsAll(selected), "Selected an id that is not one of client ids");

		class ConsumerWithId {
			final Object id;
			final ChannelConsumer consumer;

			ConsumerWithId(Object id, ChannelConsumer consumer) {
				this.id = id;
				this.consumer = consumer;
			}
		}

		return Promises.toList(selected.stream()
				.map(id -> {
					FsClient client = aliveClients.get(id);
					return (revision == null ? client.upload(filename, offset) : client.upload(filename, offset, revision))
							.thenEx(wrapDeath(id))
							.map(consumer -> new ConsumerWithId(id,
									consumer.withAcknowledgement(ack ->
											ack.whenException(e -> markIfDead(id, e)))))
							.toTry();
				}))
				.then(tries -> {
					List successes = tries.stream()
							.filter(Try::isSuccess)
							.map(Try::get)
							.collect(toList());

					if (successes.isEmpty()) {
						return ofFailure("Couldn't connect to any partition to upload file " + filename, tries);
					}

					ChannelSplitter splitter = ChannelSplitter.create().lenient();

					Promise>> uploadResults = Promises.toList(successes.stream()
							.map(s1 -> getAcknowledgement(fn ->
									splitter.addOutput()
											.set(s1.consumer.withAcknowledgement(fn)))
									.toTry()));

					if (logger.isTraceEnabled()) {
						logger.trace("uploading file {} to {}, {}", filename, successes.stream().map(s -> s.id.toString()).collect(joining(", ", "[", "]")), this);
					}

					ChannelConsumer consumer = splitter.getInput().getConsumer();

					// check number of uploads only here, so even if there were less connections
					// than replicationCount, they will still upload
					return Promise.of(consumer.withAcknowledgement(ack -> ack
							.then($ -> uploadResults)
							.then(ackTries -> {
								long successCount = ackTries.stream().filter(Try::isSuccess).count();
								// check number of uploads only here, so even if there were less connections
								// than replicationCount, they will still upload
								if (ackTries.size() < replicationCount) {
									return ofFailure("Didn't connect to enough partitions uploading " +
											filename + ", only " + successCount + " finished uploads", ackTries);
								}
								if (successCount < replicationCount) {
									return ofFailure("Couldn't finish uploadind file " +
											filename + ", only " + successCount + " acknowlegdes received", ackTries);
								}
								return Promise.complete();
							})
							.whenComplete(uploadFinishPromise.recordStats())));
				})
				.whenComplete(uploadStartPromise.recordStats());
	}

	@Override
	public Promise> upload(@NotNull String name, long offset) {
		return upload(name, offset, null);
	}

	@Override
	public Promise> upload(@NotNull String name, long offset, long revision) {
		return upload(name, offset, (Long) revision);
	}

	@Override
	public Promise> download(@NotNull String name, long offset, long length) {
		if (deadClients.size() >= replicationCount) {
			return ofFailure("There are more dead partitions than replication count(" +
					deadClients.size() + " dead, replication count is " + replicationCount + "), aborting", emptyList());
		}

		return Promises.toList(
				aliveClients.entrySet().stream()
						.map(entry -> {
							Object partitionId = entry.getKey();
							return entry.getValue().getMetadata(name) //   ↓ use null's as file non-existence indicators
									.map(res -> res != null ? new Tuple2<>(partitionId, res) : null)
									.thenEx(wrapDeath(partitionId))
									.toTry();
						}))
				.then(tries -> {
					List> successes = tries.stream() // filter successful connections
							.filter(Try::isSuccess)
							.map(Try::get)
							.collect(toList());

					// recheck if our download request marked any partitions as dead
					if (deadClients.size() >= replicationCount) {
						return ofFailure("There are more dead partitions than replication count(" +
								deadClients.size() + " dead, replication count is " + replicationCount + "), aborting", tries);
					}

					// filter partitions where file was found
					List> found = successes.stream().filter(Objects::nonNull).collect(toList());

					// find any partition with the biggest file size
					Optional> maybeBest = found.stream()
							.max(Comparator.comparing(Tuple2::getValue2, FileMetadata.COMPARATOR));

					if (!maybeBest.isPresent()) {
						return ofFailure("File not found: " + name, tries);
					}
					Tuple2 best = maybeBest.get();

					return Promises.any(found.stream()
							.filter(piwfs -> piwfs.getValue2().getRevision() == best.getValue2().getRevision())
							.map(piwfs -> {
								FsClient client = aliveClients.get(piwfs.getValue1());
								if (client == null) { // marked as dead already by somebody
									return Promise.ofException(new StacklessException(RemoteFsClusterClient.class, "Client " + piwfs.getValue1() + " is not alive"));
								}
								logger.trace("downloading file {} from {}", name, piwfs.getValue1());
								return client.download(name, offset, length)
										.whenException(e -> logger.warn("Failed to connect to server with key " + piwfs.getValue1() + " to download file " + name, e))
										.thenEx(wrapDeath(piwfs.getValue1()))
										.map(supplier -> supplier
												.withEndOfStream(eos -> eos
														.whenException(e -> markIfDead(piwfs.getValue1(), e))
														.whenComplete(downloadFinishPromise.recordStats())));
							}), Cancellable::cancel);
				})
				.whenComplete(downloadStartPromise.recordStats());
	}

	@Override
	public Promise move(@NotNull String name, @NotNull String target, long targetRevision, long tombstoneRevision) {
		if (deadClients.size() >= replicationCount) {
			return ofFailure("There are more dead partitions than replication count(" +
					deadClients.size() + " dead, replication count is " + replicationCount + "), aborting", emptyList());
		}

		return Promises.all(aliveClients.entrySet().stream().map(e -> e.getValue().move(name, target, targetRevision, tombstoneRevision).thenEx(wrapDeath(e.getKey()))))
				.whenComplete(movePromise.recordStats());
	}

	@Override
	public Promise copy(@NotNull String name, @NotNull String target, long targetRevision) {
		if (deadClients.size() >= replicationCount) {
			return ofFailure("There are more dead partitions than replication count(" +
					deadClients.size() + " dead, replication count is " + replicationCount + "), aborting", emptyList());
		}

		return Promises.all(aliveClients.entrySet().stream().map(e -> e.getValue().copy(name, target, targetRevision).thenEx(wrapDeath(e.getKey()))))
				.whenComplete(copyPromise.recordStats());
	}

	@Override
	public Promise delete(@NotNull String name, long revision) {
		return Promises.toList(
				aliveClients.entrySet().stream()
						.map(entry -> entry.getValue().delete(name)
								.thenEx(wrapDeath(entry.getKey()))
								.toTry()))
				.then(tries -> {
					if (tries.stream().anyMatch(Try::isSuccess)) { // connected at least to somebody
						return Promise.complete();
					}
					return ofFailure("Couldn't delete on any partition", tries);
				})
				.whenComplete(deletePromise.recordStats());
	}

	private Promise> doList(@NotNull String glob, BiFunction>> list) {
		if (deadClients.size() >= replicationCount) {
			return ofFailure("There are more dead partitions than replication count(" +
					deadClients.size() + " dead, replication count is " + replicationCount + "), aborting", emptyList());
		}

		// this all is the same as delete, but with list of lists of results, flattened and unified
		return Promises.toList(
				aliveClients.entrySet().stream()
						.map(entry -> list.apply(entry.getValue(), glob)
								.thenEx(wrapDeath(entry.getKey()))
								.toTry()))
				.then(tries -> {
					// recheck if our list request marked any partitions as dead
					if (deadClients.size() >= replicationCount) {
						return ofFailure("There are more dead partitions than replication count(" +
								deadClients.size() + " dead, replication count is " + replicationCount + "), aborting", tries);
					}
					return Promise.of(FileMetadata.flatten(tries.stream().filter(Try::isSuccess).map(Try::get)));
				})
				.whenComplete(listPromise.recordStats());
	}

	@Override
	public Promise> listEntities(@NotNull String glob) {
		return doList(glob, FsClient::listEntities);
	}

	@Override
	public Promise> list(@NotNull String glob) {
		return doList(glob, FsClient::list);
	}

	@Override
	public Promise ping() {
		return checkAllPartitions();
	}

	@NotNull
	@Override
	public Promise start() {
		return Promise.complete();
	}

	@NotNull
	@Override
	public Promise stop() {
		return Promise.complete();
	}

	@Override
	public String toString() {
		return "RemoteFsClusterClient{clients=" + clients + ", dead=" + deadClients.keySet() + '}';
	}

	// region JMX
	@JmxAttribute
	public int getReplicationCount() {
		return replicationCount;
	}

	@JmxAttribute
	public void setReplicationCount(int replicationCount) {
		withReplicationCount(replicationCount);
	}

	@JmxAttribute
	public int getAlivePartitionCount() {
		return aliveClients.size();
	}

	@JmxAttribute
	public int getDeadPartitionCount() {
		return deadClients.size();
	}

	@JmxAttribute
	public String[] getAlivePartitions() {
		return aliveClients.keySet().stream()
				.map(Object::toString)
				.toArray(String[]::new);
	}

	@JmxAttribute
	public String[] getDeadPartitions() {
		return deadClients.keySet().stream()
				.map(Object::toString)
				.toArray(String[]::new);
	}

	@JmxAttribute
	public PromiseStats getConnectPromise() {
		return connectPromise;
	}

	@JmxAttribute
	public PromiseStats getUploadStartPromise() {
		return uploadStartPromise;
	}

	@JmxAttribute
	public PromiseStats getUploadFinishPromise() {
		return uploadFinishPromise;
	}

	@JmxAttribute
	public PromiseStats getDownloadStartPromise() {
		return downloadStartPromise;
	}

	@JmxAttribute
	public PromiseStats getDownloadFinishPromise() {
		return downloadFinishPromise;
	}

	@JmxAttribute
	public PromiseStats getMovePromise() {
		return movePromise;
	}

	@JmxAttribute
	public PromiseStats getCopyPromise() {
		return copyPromise;
	}

	@JmxAttribute
	public PromiseStats getListPromise() {
		return listPromise;
	}

	@JmxAttribute
	public PromiseStats getDeletePromise() {
		return deletePromise;
	}
	// endregion
}