All downloads are free. Search and download functionality uses the official Maven repository.

io.activej.fs.cluster.ClusterRepartitionController Maven / Gradle / Ivy

Go to download

Provides tools for building efficient, scalable local, remote or clustered file servers. It utilizes ActiveJ CSP for fast and reliable file transfer.

There is a newer version: 6.0-rc2
Show newest version
/*
 * Copyright (C) 2020 ActiveJ LLC.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package io.activej.fs.cluster;

import io.activej.async.function.AsyncRunnable;
import io.activej.async.service.EventloopService;
import io.activej.common.Checks;
import io.activej.common.collection.Try;
import io.activej.common.initializer.WithInitializer;
import io.activej.common.ref.RefInt;
import io.activej.csp.ChannelConsumer;
import io.activej.csp.ChannelSupplier;
import io.activej.csp.process.ChannelByteRanger;
import io.activej.eventloop.Eventloop;
import io.activej.eventloop.jmx.EventloopJmxBeanWithStats;
import io.activej.fs.ActiveFs;
import io.activej.fs.FileMetadata;
import io.activej.fs.exception.FsIOException;
import io.activej.fs.exception.PathContainsFileException;
import io.activej.jmx.api.attribute.JmxAttribute;
import io.activej.jmx.api.attribute.JmxOperation;
import io.activej.promise.Promise;
import io.activej.promise.Promises;
import io.activej.promise.SettablePromise;
import io.activej.promise.jmx.PromiseStats;
import org.jetbrains.annotations.NotNull;
import org.jetbrains.annotations.Nullable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.nio.file.FileSystems;
import java.nio.file.PathMatcher;
import java.nio.file.Paths;
import java.time.Duration;
import java.util.*;
import java.util.function.Function;
import java.util.function.Predicate;

import static io.activej.async.function.AsyncRunnables.reuse;
import static io.activej.async.util.LogUtils.Level.TRACE;
import static io.activej.async.util.LogUtils.toLogger;
import static io.activej.common.Checks.*;
import static io.activej.common.Utils.first;
import static io.activej.fs.util.RemoteFsUtils.isWildcard;
import static java.util.Collections.emptySet;
import static java.util.stream.Collectors.toMap;

public final class ClusterRepartitionController implements WithInitializer, EventloopJmxBeanWithStats, EventloopService {
	private static final Logger logger = LoggerFactory.getLogger(ClusterRepartitionController.class);
	// when enabled, doRepartition() verifies that it is running in the eventloop thread
	private static final boolean CHECK = Checks.isEnabled(ClusterRepartitionController.class);

	private static final Duration DEFAULT_PLAN_RECALCULATION_INTERVAL = Duration.ofMinutes(1);

	// NOTE(review): several declarations below appear as raw types (Predicate, Iterator, Set, SettablePromise);
	// their type arguments look lost in extraction - confirm against the upstream source

	private final Object localPartitionId;
	private final FsPartitions partitions;
	// every public entry point goes through this wrapper around doRepartition()
	// (presumably it coalesces overlapping calls - confirm against AsyncRunnables.reuse)
	private final AsyncRunnable repartition = reuse(this::doRepartition);

	// names of files already handled during the current run; only add/contains/clear are used on it,
	// so a hash set keeps the per-file lookup in recalculatePlan() constant-time instead of linear
	private final Set<String> processedFiles = new HashSet<>();

	// resolved in start() from the partition registry
	private ActiveFs localFs;
	// glob passed to localFs.list() when building a plan
	private String glob = "**";
	// returns false for file names that must be left out of the plan
	private Predicate negativeGlobPredicate = $ -> true;
	private int replicationCount = 1;
	// milliseconds
	private long planRecalculationInterval = DEFAULT_PLAN_RECALCULATION_INTERVAL.toMillis();
	// names of files still to be handled, in processing order
	private Iterator repartitionPlan;

	// counters exposed through JMX
	private int allFiles = 0;
	private int ensuredFiles = 0;
	private int failedFiles = 0;
	private boolean isRepartitioning;

	// snapshot used to detect changes in the set of alive partitions
	private Set lastAlivePartitionIds = emptySet();
	// eventloop time (millis) of the last successful plan calculation
	private long lastPlanRecalculation;

	// set by stop() while a run is in progress; completed when that run finishes
	private @Nullable SettablePromise closeCallback;

	private final PromiseStats repartitionPromiseStats = PromiseStats.create(Duration.ofMinutes(5));
	private final PromiseStats singleFileRepartitionPromiseStats = PromiseStats.create(Duration.ofMinutes(5));

	/**
	 * Binds the controller to a local partition id and a partition registry.
	 * Instances are obtained through {@link #create(Object, FsPartitions)}.
	 */
	private ClusterRepartitionController(Object localPartitionId, FsPartitions partitions) {
		this.partitions = partitions;
		this.localPartitionId = localPartitionId;
	}

	/**
	 * Static factory for the controller.
	 *
	 * @param localPartitionId id under which the local file system is registered in {@code partitions}
	 * @param partitions       registry of cluster partitions
	 * @return a new controller with default settings
	 */
	public static ClusterRepartitionController create(Object localPartitionId, FsPartitions partitions) {
		ClusterRepartitionController controller = new ClusterRepartitionController(localPartitionId, partitions);
		return controller;
	}

	/**
	 * Sets the glob that is passed to the local file system's {@code list} call
	 * when the repartition plan is calculated. Defaults to {@code "**"}.
	 *
	 * @return this controller, for chaining
	 */
	public ClusterRepartitionController withGlob(@NotNull String glob) {
		this.glob = glob;
		return this;
	}

	/**
	 * Excludes files from repartitioning. An empty string leaves the current
	 * exclusion untouched. A value that {@code isWildcard} recognises is compiled
	 * into a glob matcher of the default file system; anything else is treated as
	 * an exact file name.
	 *
	 * @return this controller, for chaining
	 */
	public ClusterRepartitionController withNegativeGlob(@NotNull String negativeGlob) {
		if (!negativeGlob.isEmpty()) {
			if (isWildcard(negativeGlob)) {
				PathMatcher excluded = FileSystems.getDefault().getPathMatcher("glob:" + negativeGlob);
				this.negativeGlobPredicate = name -> !excluded.matches(Paths.get(name));
			} else {
				this.negativeGlobPredicate = file -> !file.equals(negativeGlob);
			}
		}
		return this;
	}

	/**
	 * Sets how many partitions each file should be stored on.
	 * <p>
	 * The value is validated the same way as in {@link #setReplicationCount(int)}:
	 * it must be positive. A count of zero would make
	 * {@code InfoResults.shouldBeDeleted()} hold for every file (no remote metadata
	 * is expected and none is missing), and a negative count would break the
	 * {@code subList(0, replicationCount)} calls.
	 *
	 * @param replicationCount number of replicas, must be greater than zero
	 * @return this controller, for chaining
	 */
	public ClusterRepartitionController withReplicationCount(int replicationCount) {
		this.replicationCount = checkArgument(replicationCount, count -> count > 0);
		return this;
	}

	/**
	 * Sets the maximum age of a repartition plan; the interval is stored in milliseconds
	 * and compared against the eventloop clock in {@code recalculatePlanIfNeeded()}.
	 *
	 * @return this controller, for chaining
	 */
	public ClusterRepartitionController withPlanRecalculationInterval(Duration planRecalculationInterval) {
		long intervalMillis = planRecalculationInterval.toMillis();
		this.planRecalculationInterval = intervalMillis;
		return this;
	}

	/**
	 * @return the eventloop of the partition registry this controller works with
	 */
	@Override
	public @NotNull Eventloop getEventloop() {
		Eventloop eventloop = partitions.getEventloop();
		return eventloop;
	}

	/**
	 * @return the id of the local partition given at creation time
	 */
	public Object getLocalPartitionId() {
		return this.localPartitionId;
	}

	/**
	 * @return the local file system; it is assigned in {@link #start()} and is {@code null} before that
	 */
	public ActiveFs getLocalFs() {
		return this.localFs;
	}

	/**
	 * Runs a repartition through the {@code reuse(...)} wrapper around {@code doRepartition()}.
	 *
	 * @return the promise returned by that wrapper
	 */
	public @NotNull Promise repartition() {
		return this.repartition.run();
	}

	/**
	 * Performs one repartition run.
	 * <p>
	 * The run is skipped when the replication count is 1 and the local partition is the
	 * only known one. Otherwise a plan is calculated and files are handled one by one until
	 * the plan is exhausted; the plan is refreshed between files when
	 * {@code recalculatePlanIfNeeded()} decides so.
	 * <p>
	 * Per-run counters ({@code allFiles}, {@code ensuredFiles}, {@code failedFiles}) are reset
	 * at the beginning of the run. Previously they kept growing across runs, which made the
	 * logged totals cumulative and could turn the "untouched" figure negative.
	 * <p>
	 * A file that has disappeared locally, or one that {@code repartitionFile} reports as
	 * {@code false}, is counted as failed. An exception from the loop ends the run early;
	 * it is logged and passed to a pending {@code stop()} callback, but the returned promise
	 * still completes normally.
	 *
	 * @return a promise that completes when the run has finished
	 */
	private @NotNull Promise doRepartition() {
		if (CHECK)
			checkState(partitions.getEventloop().inEventloopThread(), "Should be called from eventloop thread");

		if (replicationCount == 1) {
			Set<?> knownIds = this.partitions.getPartitions().keySet();
			if (knownIds.size() == 1 && first(knownIds).equals(localPartitionId)) {
				logger.info("Only local partition is known, nowhere to repartition");
				return Promise.complete();
			}
		}

		isRepartitioning = true;
		processedFiles.clear();
		// counters describe the current run only
		allFiles = 0;
		ensuredFiles = 0;
		failedFiles = 0;
		return recalculatePlan()
				.then(() -> Promises.repeat(
						() -> recalculatePlanIfNeeded()
								.then(() -> {
									// an exhausted plan ends the loop
									if (!repartitionPlan.hasNext()) return Promise.of(false);
									String name = repartitionPlan.next();
									return localFs.info(name)
											.thenIfElse(Objects::isNull,
													$ -> {
														logger.warn("File '{}' that should be repartitioned has been deleted", name);
														return Promise.of(false);
													},
													meta -> repartitionFile(name, meta))
											.whenComplete(singleFileRepartitionPromiseStats.recordStats())
											.then(b -> {
												// remember the file so that a recalculated plan does not include it again
												processedFiles.add(name);
												if (b) {
													ensuredFiles++;
												} else {
													failedFiles++;
												}
												return Promise.complete();
											})
											// keep looping regardless of the outcome for this file
											.map($ -> true);
								})))
				.whenComplete(() -> isRepartitioning = false)
				.whenComplete(repartitionPromiseStats.recordStats())
				.then(($, e) -> {
					if (e != null) {
						logger.warn("forced repartition finish, {} files ensured, {} errored, {} untouched", ensuredFiles, failedFiles, allFiles - ensuredFiles - failedFiles, e);
					} else {
						logger.info("repartition finished, {} files ensured, {} errored", ensuredFiles, failedFiles);
					}
					// release a stop() call that has been waiting for this run
					if (closeCallback != null) {
						closeCallback.accept($, e);
					}
					return Promise.complete();
				});
	}

	/**
	 * Rebuilds the repartition plan when the set of alive partitions has changed
	 * or when the current plan is older than {@code planRecalculationInterval}.
	 *
	 * @return the promise of {@code recalculatePlan()}, or a completed promise when the plan is kept
	 */
	private Promise recalculatePlanIfNeeded() {
		// the alive-set check runs first; it also refreshes the stored snapshot
		boolean aliveSetChanged = updateLastAlivePartitionIds();
		if (!aliveSetChanged) {
			long sinceLastPlan = getEventloop().currentTimeMillis() - lastPlanRecalculation;
			if (sinceLastPlan <= planRecalculationInterval) {
				return Promise.complete();
			}
		}
		return recalculatePlan();
	}

	/**
	 * Builds a fresh repartition plan.
	 * <p>
	 * Local files matching {@code glob} are listed, files rejected by the negative glob or
	 * already processed in this run are dropped, and for every remaining file the target
	 * partitions (the first {@code replicationCount} selected ones, minus the local one) are
	 * asked for the file's metadata. The answers are collected into {@code InfoResults},
	 * sorted by their natural order, and the names of those that still need an upload or a
	 * local deletion become the new {@code repartitionPlan}.
	 * <p>
	 * A failure while querying remote partitions is logged and the whole calculation is
	 * retried after one second. A failure of the local listing, or too few alive partitions,
	 * is not retried here and fails the returned promise.
	 *
	 * @return a promise that completes once the plan has been replaced
	 */
	private Promise recalculatePlan() {
		return localFs.list(glob)
				.then(map -> {
					checkEnoughAlivePartitions();

					// counted before any filtering
					allFiles = map.size();

					Map<String, FileMetadata> filteredMap = map.entrySet().stream()
							.filter(entry -> negativeGlobPredicate.test(entry.getKey()))
							.filter(entry -> !processedFiles.contains(entry.getKey()))
							.collect(toMap(Map.Entry::getKey, Map.Entry::getValue));

					// remote partition id -> names of the files that partition should hold
					Map<Object, Set<String>> groupedById = new HashMap<>();
					for (String name : filteredMap.keySet()) {
						// copy first: subList() is only a view, removing from it would alter the list returned by select()
						List<Object> selected = new ArrayList<>(partitions.select(name).subList(0, replicationCount));
						selected.remove(localPartitionId); // skip local partition if present
						for (Object id : selected) {
							groupedById.computeIfAbsent(id, $ -> new HashSet<>()).add(name);
						}
					}

					// NOTE(review): every partition's answer adds one entry (possibly null) to every file's
					// remoteMetadata, including partitions not selected for that file - verify this is intended
					//noinspection ConstantConditions - get() after select()
					return Promises.reduce(
									filteredMap.entrySet().stream()
											.map(e -> new InfoResults(e.getKey(), e.getValue()))
											.collect(toMap(InfoResults::getName, Function.identity())),
									(result, metas) -> filteredMap.keySet().forEach(name -> result.get(name).remoteMetadata.add(metas.get(name))),
									Map::values,
									groupedById.size(),
									groupedById.entrySet().stream()
											.map(entry -> partitions.get(entry.getKey()).infoAll(entry.getValue())
													.whenException(e -> partitions.markIfDead(entry.getKey(), e)))
											.iterator())
							.whenResult(results -> {
								repartitionPlan = results.stream()
										.sorted()
										.filter(InfoResults::shouldBeProcessed)
										.map(InfoResults::getName)
										.iterator();

								lastPlanRecalculation = getEventloop().currentTimeMillis();
								updateLastAlivePartitionIds();
							})
							.toVoid()
							.then(Promise::of,
									e -> {
										logger.warn("Failed to recalculate repartition plan, retrying in 1 second", e);
										return Promises.delay(Duration.ofSeconds(1))
												.then(this::recalculatePlan);
									});
				});
	}

	/**
	 * Brings a single local file onto the partitions it belongs to.
	 * <p>
	 * The first {@code replicationCount} partitions selected for {@code name} are the targets;
	 * the local partition is taken out of that list and remembered in {@code belongsToLocal}.
	 * The remaining targets are asked for the file's metadata, and then one of the following happens:
	 * <ul>
	 *     <li>metadata could not be obtained from every target - the file is skipped ({@code false});</li>
	 *     <li>{@code shouldBeDeleted()} holds - the local copy is deleted ({@code true});</li>
	 *     <li>{@code shouldBeUploaded()} does not hold - nothing is done ({@code true});</li>
	 *     <li>otherwise the file is streamed to every target whose copy is missing or shorter,
	 *     and afterwards the local copy is deleted unless the file also belongs to the local partition.</li>
	 * </ul>
	 *
	 * @param name name of the file
	 * @param meta metadata of the local copy
	 * @return a promise of {@code true} when the file was handled, {@code false} when it was skipped
	 * @throws FsIOException when fewer partitions are alive than {@code replicationCount}
	 */
	private Promise repartitionFile(String name, FileMetadata meta) throws FsIOException {
		partitions.markAlive(localPartitionId); // ensure local partition could also be selected
		checkEnoughAlivePartitions();
		List selected = partitions.select(name).subList(0, replicationCount);
		// work on a copy so that 'selected' keeps the full target list for logging
		List ids = new ArrayList<>(selected);
		boolean belongsToLocal = ids.remove(localPartitionId);
		return getInfoResults(name, meta, ids)
				.thenIfElse(Objects::isNull,
						// null means that at least one info call failed
						$ -> Promise.of(false),
						infoResults -> {
							if (infoResults.shouldBeDeleted()) { // everybody had the file
								logger.trace("deleting file {} locally", meta);
								return localFs.delete(name) // so we delete the copy which does not belong to local partition
										.map($ -> {
											logger.info("handled file {} : {} (ensured on {})", name, meta, ids);
											return true;
										});
							}
							if (!infoResults.shouldBeUploaded()) {                             // everybody had the file AND
								logger.trace("handled file {} : {} (ensured on {})", name, meta, ids);     // we don't delete the local copy
								return Promise.of(true);
							}

							// else we need to upload to at least one non-local partition

							logger.trace("uploading file {} to partitions {}...", meta, infoResults);

							// the shortest remote copy (a missing one counts as 0) decides where the local download starts
							//noinspection OptionalGetWithoutIsPresent
							long offset = infoResults.remoteMetadata.stream()
									.mapToLong(metadata -> metadata == null ? 0 : metadata.getSize())
									.min()
									.getAsLong();

							// a single local download is fanned out to all targets that need data
							ChannelByteSplitter splitter = ChannelByteSplitter.create(1)
									.withInput(ChannelSupplier.ofPromise(localFs.download(name, offset, meta.getSize())));

							// remoteMetadata is paired with 'ids' by position, so both lists are assumed to be in the same order
							RefInt idx = new RefInt(0);
							return Promises.toList(infoResults.remoteMetadata.stream() // upload file to target partitions
											.map(remoteMeta -> {
												Object partitionId = ids.get(idx.value++);
												// this partition already has a copy at least as long as the local one
												if (remoteMeta != null && remoteMeta.getSize() >= meta.getSize()) {
													return Promise.of(Try.of(null));
												}
												// upload file to this partition
												ActiveFs fs = partitions.get(partitionId);
												if (fs == null) {
													return Promise.ofException(new FsIOException("File system '" + partitionId + "' is not alive"));
												}
												return Promise.ofCallback(cb ->
														splitter.addOutput()
																.set(ChannelConsumer.ofPromise(Promise.complete()
																				// missing file: full upload; shorter file: append at its current size,
																				// dropping the bytes between 'offset' and that size which it already has
																				.then(() -> remoteMeta == null ?
																						fs.upload(name, meta.getSize()) :
																						fs.append(name, remoteMeta.getSize())
																								.map(consumer -> consumer.transformWith(ChannelByteRanger.drop(remoteMeta.getSize() - offset))))
																				.whenException(PathContainsFileException.class, e -> logger.error("Cluster contains files with clashing paths", e)))
																		.withAcknowledgement(ack -> ack
																				.whenResult(() -> logger.trace("file {} uploaded to '{}'", meta, partitionId))
																				.whenException(e -> {
																					logger.warn("failed uploading to partition {}", partitionId, e);
																					partitions.markIfDead(partitionId, e);
																				})
																				.whenComplete(cb::accept))));
											})
											.map(Promise::toTry))
									// a single failed upload makes the whole file count as skipped
									.thenIfElse(tries -> !tries.stream().allMatch(Try::isSuccess),
											$ -> {
												logger.warn("failed uploading file {}, skipping", meta);
												return Promise.of(false);
											},
											tries -> {
												if (belongsToLocal) { // don't delete local if it was marked
													logger.info("handled file {} : {} (ensured on {}, uploaded to {})", name, meta, selected, infoResults);
													return Promise.of(true);
												}

												logger.trace("deleting file {} on {}", meta, localPartitionId);
												return localFs.delete(name)
														.map($ -> {
															logger.info("handled file {} : {} (ensured on {}, uploaded to {})", name, meta, selected, infoResults);
															return true;
														});
											});
						})
				.whenComplete(toLogger(logger, TRACE, "repartitionFile", meta));
	}

	/**
	 * Asks every partition in {@code selected} for the metadata of {@code name}.
	 * <p>
	 * The resulting {@code remoteMetadata} list has exactly one entry per partition, in the
	 * same order as {@code selected}, no matter in which order the partitions answer.
	 * {@code repartitionFile} pairs the two lists by position, so entries appended in
	 * completion order (as was done before) could be attributed to the wrong partition.
	 * <p>
	 * A failed call is logged and the partition is reported through {@code markIfDead}.
	 *
	 * @param name         name of the file
	 * @param fileToUpload metadata of the local copy
	 * @param selected     ids of the remote partitions to query
	 * @return a promise of the collected results, or of {@code null} when any call failed
	 */
	private Promise getInfoResults(String name, FileMetadata fileToUpload, List selected) {
		InfoResults infoResults = new InfoResults(name, fileToUpload);
		int count = selected.size();
		// one slot per partition, filled by position as the answers arrive
		List<@Nullable FileMetadata> ordered = new ArrayList<>(Collections.<FileMetadata>nCopies(count, null));
		List<Promise<Try<FileMetadata>>> infoCalls = new ArrayList<>(count);
		for (int i = 0; i < count; i++) {
			int slot = i;
			Object partitionId = selected.get(i);
			//noinspection ConstantConditions - get() right after select()
			infoCalls.add(partitions.get(partitionId)
					.info(name) // checking file existence and size on particular partition
					.whenResult(remoteMeta -> ordered.set(slot, remoteMeta))
					.whenException(e -> {
						logger.warn("failed connecting to partition {}", partitionId, e);
						partitions.markIfDead(partitionId, e);
					})
					.toTry());
		}
		return Promises.toList(infoCalls.stream())
				.map(tries -> {
					if (!tries.stream().allMatch(Try::isSuccess)) { // any of info calls failed
						logger.warn("failed figuring out partitions for file {}, skipping", fileToUpload);
						return null; // using null to mark failure without exceptions
					}

					infoResults.remoteMetadata.addAll(ordered);
					return infoResults;
				});
	}

	/**
	 * Compares the current set of alive partition ids with the stored snapshot
	 * and replaces the snapshot when they differ.
	 *
	 * @return {@code true} if ids were updated, {@code false} otherwise
	 */
	private boolean updateLastAlivePartitionIds() {
		Set currentAliveIds = new HashSet<>(partitions.getAlivePartitions().keySet());
		boolean changed = !lastAlivePartitionIds.equals(currentAliveIds);
		if (changed) {
			lastAlivePartitionIds = currentAliveIds;
		}
		return changed;
	}

	/**
	 * Verifies that at least {@code replicationCount} partitions are alive.
	 *
	 * @throws FsIOException when fewer partitions are alive
	 */
	private void checkEnoughAlivePartitions() throws FsIOException {
		int aliveCount = partitions.getAlivePartitions().size();
		if (replicationCount > aliveCount) {
			throw new FsIOException("Not enough alive partitions");
		}
	}

	/**
	 * Looks up the local file system in the partition registry by the local partition id.
	 * {@code checkNotNull} rejects a registry that has no entry for that id.
	 *
	 * @return a completed promise
	 */
	@Override
	public @NotNull Promise start() {
		ActiveFs local = partitions.getPartitions().get(localPartitionId);
		this.localFs = checkNotNull(local, "Partitions do not contain local partition ID");
		return Promise.complete();
	}

	/**
	 * Stops the controller. While a repartition run is in progress the returned promise is
	 * parked in {@code closeCallback} and completed by {@code doRepartition()} when the run
	 * ends; otherwise a completed promise is returned.
	 */
	@Override
	public @NotNull Promise stop() {
		if (!isRepartitioning()) {
			return Promise.complete();
		}
		return Promise.ofCallback(cb -> this.closeCallback = cb);
	}

	// region JMX
	/**
	 * JMX operation that triggers {@link #repartition()}; the returned promise is not observed.
	 */
	@JmxOperation(description = "start repartitioning")
	public void startRepartition() {
		this.repartition();
	}

	/**
	 * @return {@code true} while a repartition run started by {@code doRepartition()} has not finished
	 */
	@JmxAttribute
	public boolean isRepartitioning() {
		return this.isRepartitioning;
	}

	/**
	 * @return stats recorded for whole repartition runs
	 */
	@JmxAttribute
	public PromiseStats getRepartitionPromiseStats() {
		return this.repartitionPromiseStats;
	}

	/**
	 * @return stats recorded for the handling of individual files
	 */
	@JmxAttribute
	public PromiseStats getSingleFileRepartitionPromiseStats() {
		return this.singleFileRepartitionPromiseStats;
	}

	/**
	 * @return the number of local files returned by the last listing in {@code recalculatePlan()},
	 * counted before negative-glob and already-processed filtering
	 */
	@JmxAttribute
	public int getLastFilesToRepartition() {
		return this.allFiles;
	}

	/**
	 * @return the value of the counter that {@code doRepartition()} increments for every file handled successfully
	 */
	@JmxAttribute
	public int getLastEnsuredFiles() {
		return this.ensuredFiles;
	}

	/**
	 * @return the value of the counter that {@code doRepartition()} increments for every file that was skipped or missing
	 */
	@JmxAttribute
	public int getLastFailedFiles() {
		return this.failedFiles;
	}

	/**
	 * @return the partition registry given at creation time; exposed to JMX under an empty attribute name
	 */
	@JmxAttribute(name = "")
	public FsPartitions getPartitions() {
		return this.partitions;
	}

	/**
	 * @return the number of partitions each file should be stored on
	 */
	@JmxAttribute
	public int getReplicationCount() {
		return this.replicationCount;
	}

	/**
	 * Changes the replication count at runtime.
	 *
	 * @param replicationCount new count; {@code checkArgument} rejects values that are not greater than zero
	 */
	@JmxAttribute
	public void setReplicationCount(int replicationCount) {
		int validated = checkArgument(replicationCount, count -> count > 0);
		this.replicationCount = validated;
	}
	// endregion

	/**
	 * Natural order of {@code InfoResults}, ascending by:
	 * <ol>
	 *     <li>the number of {@code null} entries in {@code remoteMetadata},
	 *     plus one when the local copy is at least as large as every remote copy;</li>
	 *     <li>the size of the first non-null remote copy, or of the local copy when there is none.</li>
	 * </ol>
	 * The element type is spelled out so that the key extractors can access {@code InfoResults}
	 * members, and {@code findFirst} is used instead of {@code findAny} so that the same
	 * remote copy is always picked when several are present.
	 */
	private static final Comparator<InfoResults> INFO_RESULTS_COMPARATOR =
			Comparator.<InfoResults>comparingLong(infoResults -> infoResults.remoteMetadata.stream()
							.filter(Objects::isNull)
							.count() +
							(infoResults.isLocalMetaTheBest() ? 1 : 0))
					.thenComparingLong(infoResults -> infoResults.remoteMetadata.stream()
							.filter(Objects::nonNull)
							.findFirst().orElse(infoResults.localMetadata)
							.getSize());

	/**
	 * What is known about one file: its local metadata and the metadata reported by remote
	 * partitions ({@code null} entries stand for partitions that do not have the file).
	 * <p>
	 * This is an inner class because {@link #shouldBeDeleted()} reads the enclosing
	 * controller's {@code replicationCount}. Instances are ordered by
	 * {@code INFO_RESULTS_COMPARATOR}; {@code Comparable} is parameterized so that
	 * {@link #compareTo(InfoResults)} is a valid override.
	 */
	private final class InfoResults implements Comparable<InfoResults> {
		final String name;
		final FileMetadata localMetadata;
		final List<@Nullable FileMetadata> remoteMetadata = new ArrayList<>();

		private InfoResults(@NotNull String name, @NotNull FileMetadata localMetadata) {
			this.name = name;
			this.localMetadata = localMetadata;
		}

		public String getName() {
			return name;
		}

		/**
		 * @return {@code true} when the file needs either an upload or a local deletion
		 */
		boolean shouldBeProcessed() {
			return shouldBeUploaded() || shouldBeDeleted();
		}

		// file should be uploaded if local file is the most complete file
		// and there are remote partitions that do not have this file or have not a full version
		boolean shouldBeUploaded() {
			return isLocalMetaTheBest() &&
					remoteMetadata.stream().anyMatch(metadata -> metadata == null || metadata.getSize() < localMetadata.getSize());
		}

		// (local) file should be deleted in case metadata was collected from exactly
		// 'replicationCount' remote partitions and every one of them has a copy that is
		// at least as large as the local one
		boolean shouldBeDeleted() {
			return remoteMetadata.size() == replicationCount &&
					remoteMetadata.stream().noneMatch(metadata -> metadata == null || metadata.getSize() < localMetadata.getSize());
		}

		/**
		 * @return {@code true} when no remote copy is larger than the local one
		 * (also {@code true} when there are no remote copies at all)
		 */
		boolean isLocalMetaTheBest() {
			long maxSize = remoteMetadata.stream()
					.filter(Objects::nonNull)
					.mapToLong(FileMetadata::getSize)
					.max().orElse(0);

			return localMetadata.getSize() >= maxSize;
		}

		@Override
		public int compareTo(@NotNull ClusterRepartitionController.InfoResults o) {
			return INFO_RESULTS_COMPARATOR.compare(this, o);
		}

		// instances are passed as arguments to log statements, so give them a readable form
		@Override
		public String toString() {
			return "InfoResults{name='" + name + "', localMetadata=" + localMetadata + ", remoteMetadata=" + remoteMetadata + '}';
		}
	}
}