/*
 * Copyright (C) 2015-2018 SoftIndex LLC.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package io.datakernel.aggregation;

import io.datakernel.aggregation.QueryPlan.Sequence;
import io.datakernel.aggregation.fieldtype.FieldType;
import io.datakernel.aggregation.ot.AggregationDiff;
import io.datakernel.aggregation.ot.AggregationStructure;
import io.datakernel.async.Promise;
import io.datakernel.codegen.ClassBuilder;
import io.datakernel.codegen.DefiningClassLoader;
import io.datakernel.eventloop.Eventloop;
import io.datakernel.jmx.EventloopJmxMBeanEx;
import io.datakernel.jmx.JmxAttribute;
import io.datakernel.serializer.BinarySerializer;
import io.datakernel.stream.StreamConsumer;
import io.datakernel.stream.StreamSupplier;
import io.datakernel.stream.processor.*;
import io.datakernel.stream.processor.StreamReducers.Reducer;
import io.datakernel.stream.stats.StreamStats;
import io.datakernel.util.Initializable;
import org.jetbrains.annotations.NotNull;
import org.jetbrains.annotations.Nullable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.io.UncheckedIOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.time.Duration;
import java.util.*;
import java.util.concurrent.Executor;
import java.util.function.Function;
import java.util.function.Predicate;
import java.util.stream.Collectors;

import static io.datakernel.aggregation.AggregationUtils.*;
import static io.datakernel.codegen.Expressions.arg;
import static io.datakernel.codegen.Expressions.cast;
import static io.datakernel.stream.StreamSupplierTransformer.identity;
import static io.datakernel.util.CollectionUtils.*;
import static io.datakernel.util.Preconditions.checkArgument;
import static java.lang.Math.min;
import static java.util.Collections.singletonList;
import static java.util.Comparator.comparing;
import static java.util.function.Predicate.isEqual;
import static java.util.stream.Collectors.toList;

/**
 * Represents an aggregation, which aggregates data using a custom reducer and preaggregator.
 * Provides methods for loading and querying data.
 */
@SuppressWarnings({"unchecked", "rawtypes"})
public class Aggregation implements IAggregation, Initializable<Aggregation>, EventloopJmxMBeanEx {
	private final Logger logger = LoggerFactory.getLogger(getClass());

	public static final int DEFAULT_CHUNK_SIZE = 1_000_000;
	public static final int DEFAULT_REDUCER_BUFFER_SIZE = StreamReducer.DEFAULT_BUFFER_SIZE;
	public static final int DEFAULT_SORTER_ITEMS_IN_MEMORY = 1_000_000;
	public static final Duration DEFAULT_MAX_INCREMENTAL_RELOAD_PERIOD = Duration.ofMinutes(10);
	public static final int DEFAULT_MAX_CHUNKS_TO_CONSOLIDATE = 1000;

	private final Eventloop eventloop;
	private final Executor executor;
	private final DefiningClassLoader classLoader;
	private final AggregationChunkStorage aggregationChunkStorage;
	private Path temporarySortDir;

	private final AggregationStructure structure;
	private AggregationState state;

	// settings
	private int chunkSize = DEFAULT_CHUNK_SIZE;
	private int reducerBufferSize = DEFAULT_REDUCER_BUFFER_SIZE;
	private int sorterItemsInMemory = DEFAULT_SORTER_ITEMS_IN_MEMORY;
	private Duration maxIncrementalReloadPeriod = DEFAULT_MAX_INCREMENTAL_RELOAD_PERIOD;
	private boolean ignoreChunkReadingExceptions = false;
	private int maxChunksToConsolidate = DEFAULT_MAX_CHUNKS_TO_CONSOLIDATE;

	// jmx

	private AggregationStats stats = new AggregationStats();
	private long consolidationStarted;
	private long consolidationLastTimeMillis;
	private int consolidations;
	private Throwable consolidationLastError;

	private Aggregation(Eventloop eventloop, Executor executor, DefiningClassLoader classLoader,
			AggregationChunkStorage aggregationChunkStorage, AggregationStructure structure,
			AggregationState state) {
		this.eventloop = eventloop;
		this.executor = executor;
		this.classLoader = classLoader;
		this.aggregationChunkStorage = aggregationChunkStorage;
		this.structure = structure;
		this.state = state;
	}

	/**
	 * Instantiates an aggregation with the specified structure that runs in the given event loop,
	 * uses the specified class loader for creating dynamic classes, and saves data and metadata to the given storage.
	 * By default, a chunk holds at most 1,000,000 records,
	 * no more than 1,000,000 records stay in memory while sorting,
	 * and consolidated chunks become available for removal 10 minutes after consolidation.
	 *
	 * @param eventloop               event loop in which the aggregation is to run
	 * @param executor                executor used for asynchronous work with files
	 * @param classLoader             class loader for defining dynamic classes
	 * @param aggregationChunkStorage storage for data chunks
	 */
	public static Aggregation create(Eventloop eventloop, Executor executor, DefiningClassLoader classLoader,
			AggregationChunkStorage aggregationChunkStorage, AggregationStructure structure) {
		checkArgument(structure != null, "Cannot create Aggregation with AggregationStructure that is null");
		return new Aggregation(eventloop, executor, classLoader, aggregationChunkStorage, structure, new AggregationState(structure));
	}

	public Aggregation withChunkSize(int chunkSize) {
		this.chunkSize = chunkSize;
		return this;
	}

	public Aggregation withReducerBufferSize(int reducerBufferSize) {
		this.reducerBufferSize = reducerBufferSize;
		return this;
	}

	public Aggregation withSorterItemsInMemory(int sorterItemsInMemory) {
		this.sorterItemsInMemory = sorterItemsInMemory;
		return this;
	}

	public Aggregation withMaxIncrementalReloadPeriod(Duration maxIncrementalReloadPeriod) {
		this.maxIncrementalReloadPeriod = maxIncrementalReloadPeriod;
		return this;
	}

	public Aggregation withIgnoreChunkReadingExceptions(boolean ignoreChunkReadingExceptions) {
		this.ignoreChunkReadingExceptions = ignoreChunkReadingExceptions;
		return this;
	}

	public Aggregation withMaxChunksToConsolidate(int maxChunksToConsolidate) {
		this.maxChunksToConsolidate = maxChunksToConsolidate;
		return this;
	}

	public Aggregation withTemporarySortDir(Path temporarySortDir) {
		this.temporarySortDir = temporarySortDir;
		return this;
	}

	public Aggregation withStats(AggregationStats stats) {
		this.stats = stats;
		return this;
	}
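	/*
	 * A minimal configuration sketch (hypothetical setup: the event loop, executor,
	 * class loader, chunk storage and structure are assumed to exist already, and the
	 * sort directory path is illustrative):
	 *
	 *   Aggregation aggregation = Aggregation
	 *           .create(eventloop, executor, classLoader, chunkStorage, structure)
	 *           .withChunkSize(500_000)
	 *           .withSorterItemsInMemory(500_000)
	 *           .withTemporarySortDir(Paths.get("/tmp/aggregation-sort"))
	 *           .withMaxChunksToConsolidate(100);
	 */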

	public AggregationStructure getStructure() {
		return structure;
	}

	public AggregationState getState() {
		return state;
	}

	public void setState(AggregationState state) {
		this.state = state;
	}

	public AggregationState detachState() {
		AggregationState state = this.state;
		//noinspection AssignmentToNull - the field is not nullable during the normal lifecycle of the component
		this.state = null;
		return state;
	}

	public List<String> getKeys() {
		return structure.getKeys();
	}

	public List<String> getMeasures() {
		return structure.getMeasures();
	}

	public Map<String, FieldType> getKeyTypes() {
		return structure.getKeyTypes();
	}

	public Map<String, FieldType> getMeasureTypes() {
		return structure.getMeasureTypes();
	}

	public List<String> getPartitioningKey() {
		return structure.getPartitioningKey();
	}

	public <T> Reducer aggregationReducer(Class<T> inputClass, Class<T> outputClass,
			List<String> keys, List<String> measures,
			DefiningClassLoader classLoader) {
		return AggregationUtils.aggregationReducer(structure, inputClass, outputClass,
				keys, measures, classLoader);
	}

	/**
	 * Consumes a stream of data into this aggregation, writing it out as new chunks.
	 *
	 * @param supplier   supplier of input records
	 * @param inputClass class of input records
	 * @param <T>        data records type
	 * @return promise of an {@link AggregationDiff} describing the newly created chunks
	 */
	@SuppressWarnings("unchecked")
	public <T> Promise<AggregationDiff> consume(StreamSupplier<T> supplier,
			Class<T> inputClass, Map<String, String> keyFields, Map<String, String> measureFields) {
		checkArgument(new HashSet<>(getKeys()).equals(keyFields.keySet()), "Expected keys: %s, actual keyFields: %s", getKeys(), keyFields);
		checkArgument(getMeasureTypes().keySet().containsAll(measureFields.keySet()), "Unknown measures: %s", difference(measureFields.keySet(),
				getMeasureTypes().keySet()));

		logger.info("Started consuming data in aggregation {}. Keys: {} Measures: {}", this, keyFields.keySet(), measureFields.keySet());

		Class keyClass = createKeyClass(
				keysToMap(getKeys().stream(), structure.getKeyTypes()::get),
				classLoader);
		Set<String> measureFieldKeys = measureFields.keySet();
		List<String> measures = getMeasureTypes().keySet().stream().filter(measureFieldKeys::contains).collect(toList());

		Class recordClass = createRecordClass(structure, getKeys(), measures, classLoader);

		Aggregate aggregate = createPreaggregator(structure, inputClass, recordClass,
				keyFields, measureFields,
				classLoader);

		Function keyFunction = createKeyFunction(inputClass, keyClass, getKeys(), classLoader);
		AggregationGroupReducer groupReducer = new AggregationGroupReducer<>(aggregationChunkStorage,
				structure, measures,
				recordClass,
				createPartitionPredicate(recordClass, getPartitioningKey(), classLoader),
				keyFunction,
				aggregate, chunkSize, classLoader);

		return supplier.streamTo(groupReducer)
				.then($ -> groupReducer.getResult())
				.map(chunks -> AggregationDiff.of(new HashSet<>(chunks)));
	}
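	/*
	 * A usage sketch for consume() (hypothetical: MyRecord is a flat record class whose
	 * field names mirror the aggregation's keys and measures, `records` is any iterable of
	 * such objects, and the diff accessor name is an assumption):
	 *
	 *   Map<String, String> keyFields = new HashMap<>();
	 *   keyFields.put("date", "date");
	 *   keyFields.put("siteId", "siteId");
	 *   Map<String, String> measureFields = new HashMap<>();
	 *   measureFields.put("impressions", "impressions");
	 *   aggregation.consume(StreamSupplier.ofIterable(records), MyRecord.class, keyFields, measureFields)
	 *           .whenResult(diff -> logger.info("Created chunks: {}", getChunkIds(diff.getAddedChunks())));
	 */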

	public <T> Promise<AggregationDiff> consume(StreamSupplier<T> supplier, Class<T> inputClass) {
		return consume(supplier, inputClass, scanKeyFields(inputClass), scanMeasureFields(inputClass));
	}

	public double estimateCost(AggregationQuery query) {
		List<String> measures = getMeasures();
		List<String> aggregationFields = query.getMeasures().stream().filter(measures::contains).collect(toList());
		return state.findChunks(query.getPredicate(), aggregationFields).size();
	}

	public <T> StreamSupplier<T> query(AggregationQuery query, Class<T> outputClass) {
		return query(query, outputClass, classLoader);
	}

	/**
	 * Returns a {@link StreamSupplier} of the records retrieved from aggregation for the specified query.
	 *
	 * @param <T>         type of output objects
	 * @param query       query
	 * @param outputClass class of output records
	 * @return supplier that streams query results
	 */
	@Override
	public <T> StreamSupplier<T> query(AggregationQuery query, Class<T> outputClass, DefiningClassLoader queryClassLoader) {
		checkArgument(iterate(queryClassLoader, Objects::nonNull, ClassLoader::getParent).anyMatch(isEqual(classLoader)),
				"Unrelated queryClassLoader");
		List<String> fields = getMeasures().stream().filter(query.getMeasures()::contains).collect(toList());
		List<AggregationChunk> allChunks = state.findChunks(query.getPredicate(), fields);
		return consolidatedSupplier(query.getKeys(),
				fields, outputClass, query.getPredicate(), allChunks, queryClassLoader);
	}
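	/*
	 * A usage sketch for query() (hypothetical: QueryResult is a flat class with fields for
	 * the requested key and measure, and the AggregationQuery builder method names are
	 * assumptions):
	 *
	 *   AggregationQuery query = AggregationQuery.create()
	 *           .withKeys("date")
	 *           .withMeasures("impressions");
	 *   aggregation.query(query, QueryResult.class)
	 *           .toList()
	 *           .whenResult(records -> records.forEach(r -> System.out.println(r)));
	 */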

	private <T> StreamSupplier<T> sortStream(StreamSupplier<T> unsortedStream, Class<T> resultClass,
			List<String> allKeys, List<String> measures, DefiningClassLoader classLoader) {
		Comparator keyComparator = createKeyComparator(resultClass, allKeys, classLoader);
		BinarySerializer binarySerializer = createBinarySerializer(structure, resultClass,
				getKeys(), measures, classLoader);
		Path sortDir = (temporarySortDir != null) ? temporarySortDir : createSortDir();
		StreamSupplier stream = unsortedStream
				.transformWith(StreamSorter.create(
						StreamSorterStorageImpl.create(executor, binarySerializer, sortDir),
						Function.identity(), keyComparator, false, sorterItemsInMemory));

		stream.getEndOfStream()
				.whenComplete(($, e) -> {
					if (temporarySortDir == null) {
						deleteSortDirSilent(sortDir);
					}
				});
		return stream;
	}

	private Promise<List<AggregationChunk>> doConsolidation(List<AggregationChunk> chunksToConsolidate) {
		Set<String> aggregationFields = new HashSet<>(getMeasures());
		Set<String> chunkFields = new HashSet<>();
		for (AggregationChunk chunk : chunksToConsolidate) {
			for (String measure : chunk.getMeasures()) {
				if (aggregationFields.contains(measure))
					chunkFields.add(measure);
			}
		}

		List<String> measures = getMeasures().stream().filter(chunkFields::contains).collect(toList());
		Class resultClass = createRecordClass(structure, getKeys(), measures, classLoader);

		StreamSupplier consolidatedSupplier = consolidatedSupplier(getKeys(), measures, resultClass, AggregationPredicates.alwaysTrue(),
				chunksToConsolidate, classLoader);
		AggregationChunker chunker = AggregationChunker.create(
				structure, measures, resultClass,
				createPartitionPredicate(resultClass, getPartitioningKey(), classLoader),
				aggregationChunkStorage, classLoader, chunkSize);
		return consolidatedSupplier.streamTo(chunker)
				.then($ -> chunker.getResult());
	}

	private static void addChunkToPlan(Map<List<String>, TreeMap<PrimaryKey, List<Sequence>>> planIndex,
			AggregationChunk chunk, List<String> queryFields) {
		queryFields = new ArrayList<>(queryFields);
		queryFields.retainAll(chunk.getMeasures());
		checkArgument(!queryFields.isEmpty(), "None of the query fields are found among the chunk's measures");
		TreeMap<PrimaryKey, List<Sequence>> map = planIndex.computeIfAbsent(queryFields, k -> new TreeMap<>());

		Map.Entry<PrimaryKey, List<Sequence>> entry = map.lowerEntry(chunk.getMinPrimaryKey());
		Sequence sequence;
		if (entry == null) {
			sequence = new Sequence(queryFields);
		} else {
			List<Sequence> list = entry.getValue();
			sequence = list.remove(list.size() - 1);
			if (list.isEmpty()) {
				map.remove(entry.getKey());
			}
		}
		sequence.add(chunk);
		List<Sequence> list = map.computeIfAbsent(chunk.getMaxPrimaryKey(), k -> new ArrayList<>());
		list.add(sequence);
	}

	private static QueryPlan createPlan(List<AggregationChunk> chunks, List<String> queryFields) {
		Map<List<String>, TreeMap<PrimaryKey, List<Sequence>>> index = new HashMap<>();
		chunks = new ArrayList<>(chunks);
		chunks.sort(comparing(AggregationChunk::getMinPrimaryKey));
		for (AggregationChunk chunk : chunks) {
			addChunkToPlan(index, chunk, queryFields);
		}
		List<Sequence> sequences = new ArrayList<>();
		for (TreeMap<PrimaryKey, List<Sequence>> map : index.values()) {
			for (List<Sequence> list : map.values()) {
				sequences.addAll(list);
			}
		}
		return new QueryPlan(sequences);
	}
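	/*
	 * Illustration of plan construction: chunks are sorted by minimum primary key and greedily
	 * chained into sequences whose key ranges do not overlap, so every sequence can be read in
	 * key order. For example, chunks with primary key ranges A[1..3], B[2..5], C[4..7], D[6..9]
	 * yield two sequences: A is placed first; B overlaps A and starts a new sequence; C's min
	 * key 4 is above A's max key 3, so it extends [A] to [A, C]; D's min key 6 is above B's max
	 * key 5, so it extends [B] to [B, D].
	 */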

	private <T> StreamSupplier<T> consolidatedSupplier(List<String> queryKeys,
			List<String> measures, Class<T> resultClass,
			AggregationPredicate where,
			List<AggregationChunk> individualChunks,
			DefiningClassLoader queryClassLoader) {
		QueryPlan plan = createPlan(individualChunks, measures);

		logger.info("Query plan for {} in aggregation {}: {}", queryKeys, this, plan);

		boolean alreadySorted = getKeys().subList(0, min(getKeys().size(), queryKeys.size())).equals(queryKeys);

		List<SequenceStream<?>> sequenceStreams = new ArrayList<>();

		for (Sequence sequence : plan.getSequences()) {
			Class sequenceClass = createRecordClass(structure,
					getKeys(),
					sequence.getChunksFields(),
					classLoader);

			StreamSupplier stream = sequenceStream(where, sequence.getChunks(), sequenceClass, queryClassLoader);
			if (!alreadySorted) {
				stream = sortStream(stream, sequenceClass, queryKeys, sequence.getQueryFields(), classLoader);
			}

			sequenceStreams.add(new SequenceStream(stream, sequence.getQueryFields(), sequenceClass));
		}

		return mergeSequences(queryKeys, measures, resultClass, sequenceStreams, queryClassLoader);
	}

	static final class SequenceStream<T> {
		final StreamSupplier<T> stream;
		final List<String> fields;
		final Class<T> type;

		private SequenceStream(StreamSupplier<T> stream, List<String> fields, Class<T> type) {
			this.stream = stream;
			this.fields = fields;
			this.type = type;
		}
	}

	private <T> StreamSupplier<T> mergeSequences(List<String> queryKeys, List<String> measures,
			Class<T> resultClass, List<SequenceStream<?>> sequences,
			DefiningClassLoader classLoader) {
		if (sequences.size() == 1 && new HashSet<>(queryKeys).equals(new HashSet<>(getKeys()))) {
			/*
			If there is only one sequential supplier and all aggregation keys are requested, there is no need to use
			a StreamReducer, because all records have unique keys and all we need to do is copy the requested measures
			from the record class to the result class.
			 */
			SequenceStream sequence = sequences.get(0);
			Function mapper = createMapper(sequence.type, resultClass,
					queryKeys, measures.stream().filter(sequence.fields::contains).collect(toList()),
					classLoader);
			return sequence.stream
					.transformWith(StreamMapper.create(mapper))
					.transformWith((StreamStats) stats.mergeMapOutput);
		}

		StreamReducer streamReducer = StreamReducer.create(Comparable::compareTo);
		if (reducerBufferSize != 0 && reducerBufferSize != DEFAULT_REDUCER_BUFFER_SIZE) {
			streamReducer = streamReducer.withBufferSize(reducerBufferSize);
		}

		Class keyClass = createKeyClass(
				keysToMap(queryKeys.stream(), structure.getKeyTypes()::get),
				this.classLoader);

		for (SequenceStream sequence : sequences) {
			Function extractKeyFunction = createKeyFunction(sequence.type, keyClass, queryKeys, this.classLoader);

			Reducer reducer = AggregationUtils.aggregationReducer(structure,
					sequence.type, resultClass,
					queryKeys, measures.stream().filter(sequence.fields::contains).collect(toList()),
					classLoader);

			sequence.stream.streamTo(
					streamReducer.newInput(extractKeyFunction, reducer)
							.transformWith((StreamStats) stats.mergeReducerInput));
		}

		return streamReducer.getOutput()
				.transformWith((StreamStats) stats.mergeReducerOutput);
	}

	private <T> StreamSupplier<T> sequenceStream(AggregationPredicate where,
			List<AggregationChunk> individualChunks, Class<T> sequenceClass,
			DefiningClassLoader queryClassLoader) {
		Iterator<AggregationChunk> chunkIterator = individualChunks.iterator();
		return StreamSupplier.concat(new Iterator<StreamSupplier<T>>() {
			@Override
			public boolean hasNext() {
				return chunkIterator.hasNext();
			}

			@Override
			public StreamSupplier<T> next() {
				AggregationChunk chunk = chunkIterator.next();
				return chunkReaderWithFilter(where, chunk, sequenceClass, queryClassLoader);
			}
		});
	}

	private <T> StreamSupplier<T> chunkReaderWithFilter(AggregationPredicate where, AggregationChunk chunk,
			Class<T> chunkRecordClass, DefiningClassLoader queryClassLoader) {
		return StreamSupplier.ofPromise(
				aggregationChunkStorage.read(structure, chunk.getMeasures(), chunkRecordClass, chunk.getChunkId(), classLoader))
				.transformWith(where != AggregationPredicates.alwaysTrue() ?
						StreamFilter.create(
								createPredicate(chunkRecordClass, where, queryClassLoader)) :
						identity());
	}

	private <T> Predicate<T> createPredicate(Class<T> chunkRecordClass,
			AggregationPredicate where, DefiningClassLoader classLoader) {
		return ClassBuilder.create(classLoader, Predicate.class)
				.withMethod("test", boolean.class, singletonList(Object.class),
						where.createPredicateDef(cast(arg(0), chunkRecordClass), getKeyTypes()))
				.buildClassAndCreateNewInstance();
	}

	@JmxAttribute
	public int getNumberOfOverlappingChunks() {
		return state.findOverlappingChunks().size();
	}

	public Promise<AggregationDiff> consolidateMinKey() {
		return doConsolidate(false);
	}

	public Promise<AggregationDiff> consolidateHotSegment() {
		return doConsolidate(true);
	}
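	/*
	 * A usage sketch for consolidation (the AggregationDiff accessor names below are
	 * assumptions): the resulting diff carries both the newly written chunks and the
	 * chunks that the consolidation made obsolete.
	 *
	 *   aggregation.consolidateHotSegment()
	 *           .whenResult(diff -> logger.info("Consolidated {} chunks into {}",
	 *                   diff.getRemovedChunks().size(), diff.getAddedChunks().size()));
	 */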

	private Promise<AggregationDiff> doConsolidate(boolean hotSegment) {
		List<AggregationChunk> chunks = hotSegment ?
				state.findChunksForConsolidationHotSegment(maxChunksToConsolidate) :
				state.findChunksForConsolidationMinKey(maxChunksToConsolidate, chunkSize);

		if (chunks.isEmpty()) {
			logger.info("Nothing to consolidate in aggregation '{}", this);
			return Promise.of(AggregationDiff.empty());
		}

		logger.info("Starting consolidation of aggregation '{}'", this);
		consolidationStarted = eventloop.currentTimeMillis();

		return doConsolidation(chunks)
				.whenComplete(($, e) -> {
					if (e == null) {
						consolidationLastTimeMillis = eventloop.currentTimeMillis() - consolidationStarted;
						consolidations++;
					} else {
						consolidationStarted = 0;
						consolidationLastError = e;
					}
				})
				.map(addedChunks -> AggregationDiff.of(new LinkedHashSet<>(addedChunks), new LinkedHashSet<>(chunks)));
	}

	private Path createSortDir() {
		try {
			return Files.createTempDirectory("aggregation_sort_dir");
		} catch (IOException e) {
			throw new UncheckedIOException(e);
		}
	}

	private void deleteSortDirSilent(Path sortDir) {
		try {
			Files.delete(sortDir);
		} catch (IOException e) {
			logger.warn("Could not delete temporal directory {} : {}", temporarySortDir, e.toString());
		}
	}

	public static String getChunkIds(Iterable<AggregationChunk> chunks) {
		List<Object> ids = new ArrayList<>();
		for (AggregationChunk chunk : chunks) {
			ids.add(chunk.getChunkId());
		}
		return ids.stream().map(Object::toString).collect(Collectors.joining(", "));
	}

	// jmx

	@JmxAttribute
	public Duration getMaxIncrementalReloadPeriod() {
		return maxIncrementalReloadPeriod;
	}

	@JmxAttribute
	public void setMaxIncrementalReloadPeriod(Duration maxIncrementalReloadPeriod) {
		this.maxIncrementalReloadPeriod = maxIncrementalReloadPeriod;
	}

	@JmxAttribute
	public int getChunkSize() {
		return chunkSize;
	}

	@JmxAttribute
	public void setChunkSize(int chunkSize) {
		this.chunkSize = chunkSize;
	}

	@JmxAttribute
	public int getSorterItemsInMemory() {
		return sorterItemsInMemory;
	}

	@JmxAttribute
	public void setSorterItemsInMemory(int sorterItemsInMemory) {
		this.sorterItemsInMemory = sorterItemsInMemory;
	}

	@JmxAttribute
	public boolean isIgnoreChunkReadingExceptions() {
		return ignoreChunkReadingExceptions;
	}

	@JmxAttribute
	public void setIgnoreChunkReadingExceptions(boolean ignoreChunkReadingExceptions) {
		this.ignoreChunkReadingExceptions = ignoreChunkReadingExceptions;
	}

	@JmxAttribute
	public int getMaxChunksToConsolidate() {
		return maxChunksToConsolidate;
	}

	@JmxAttribute
	public void setMaxChunksToConsolidate(int maxChunksToConsolidate) {
		this.maxChunksToConsolidate = maxChunksToConsolidate;
	}

	@Nullable
	@JmxAttribute
	public Integer getConsolidationSeconds() {
		return consolidationStarted == 0 ? null : (int) ((eventloop.currentTimeMillis() - consolidationStarted) / 1000);
	}

	@Nullable
	@JmxAttribute
	public Integer getConsolidationLastTimeSeconds() {
		return consolidationLastTimeMillis == 0 ? null : (int) (consolidationLastTimeMillis / 1000);
	}

	@JmxAttribute
	public int getConsolidations() {
		return consolidations;
	}

	@JmxAttribute
	public Throwable getConsolidationLastError() {
		return consolidationLastError;
	}

	@JmxAttribute
	public int getChunks() {
		return state.getChunks().size();
	}

	@JmxAttribute
	public AggregationStats getStats() {
		return stats;
	}

	@NotNull
	@Override
	public Eventloop getEventloop() {
		return eventloop;
	}

	@Override
	public String toString() {
		return "{" + getKeyTypes().keySet() + " " + getMeasureTypes().keySet() + '}';
	}
}