/*
* Copyright (C) 2015-2018 SoftIndex LLC.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.datakernel.aggregation;
import io.datakernel.aggregation.QueryPlan.Sequence;
import io.datakernel.aggregation.fieldtype.FieldType;
import io.datakernel.aggregation.ot.AggregationDiff;
import io.datakernel.aggregation.ot.AggregationStructure;
import io.datakernel.async.Promise;
import io.datakernel.codegen.ClassBuilder;
import io.datakernel.codegen.DefiningClassLoader;
import io.datakernel.eventloop.Eventloop;
import io.datakernel.jmx.EventloopJmxMBeanEx;
import io.datakernel.jmx.JmxAttribute;
import io.datakernel.serializer.BinarySerializer;
import io.datakernel.stream.StreamConsumer;
import io.datakernel.stream.StreamSupplier;
import io.datakernel.stream.processor.*;
import io.datakernel.stream.processor.StreamReducers.Reducer;
import io.datakernel.stream.stats.StreamStats;
import io.datakernel.util.Initializable;
import org.jetbrains.annotations.NotNull;
import org.jetbrains.annotations.Nullable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.io.UncheckedIOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.time.Duration;
import java.util.*;
import java.util.concurrent.Executor;
import java.util.function.Function;
import java.util.function.Predicate;
import java.util.stream.Collectors;
import static io.datakernel.aggregation.AggregationUtils.*;
import static io.datakernel.codegen.Expressions.arg;
import static io.datakernel.codegen.Expressions.cast;
import static io.datakernel.stream.StreamSupplierTransformer.identity;
import static io.datakernel.util.CollectionUtils.*;
import static io.datakernel.util.Preconditions.checkArgument;
import static java.lang.Math.min;
import static java.util.Collections.singletonList;
import static java.util.Comparator.comparing;
import static java.util.function.Predicate.isEqual;
import static java.util.stream.Collectors.toList;
/**
* Represents an aggregation, which aggregates data using a custom reducer and preaggregator.
* Provides methods for loading and querying data.
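*
* <p>A lifecycle sketch ({@code LogRecord} and {@code ResultRecord} are illustrative record classes;
* the event loop, executor, storage, structure and query are assumed to be configured elsewhere):
* <pre>{@code
* Aggregation aggregation = Aggregation.create(eventloop, executor, classLoader, chunkStorage, structure);
* // stream records in
* aggregation.consume(StreamSupplier.ofIterable(records), LogRecord.class);
* // stream query results out
* StreamSupplier<ResultRecord> results = aggregation.query(query, ResultRecord.class);
* }</pre>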
*/
@SuppressWarnings({"unchecked", "rawtypes"})
public class Aggregation implements IAggregation, Initializable<Aggregation>, EventloopJmxMBeanEx {
private final Logger logger = LoggerFactory.getLogger(getClass());
public static final int DEFAULT_CHUNK_SIZE = 1_000_000;
public static final int DEFAULT_REDUCER_BUFFER_SIZE = StreamReducer.DEFAULT_BUFFER_SIZE;
public static final int DEFAULT_SORTER_ITEMS_IN_MEMORY = 1_000_000;
public static final Duration DEFAULT_MAX_INCREMENTAL_RELOAD_PERIOD = Duration.ofMinutes(10);
public static final int DEFAULT_MAX_CHUNKS_TO_CONSOLIDATE = 1000;
private final Eventloop eventloop;
private final Executor executor;
private final DefiningClassLoader classLoader;
private final AggregationChunkStorage aggregationChunkStorage;
private Path temporarySortDir;
private final AggregationStructure structure;
private AggregationState state;
// settings
private int chunkSize = DEFAULT_CHUNK_SIZE;
private int reducerBufferSize = DEFAULT_REDUCER_BUFFER_SIZE;
private int sorterItemsInMemory = DEFAULT_SORTER_ITEMS_IN_MEMORY;
private Duration maxIncrementalReloadPeriod = DEFAULT_MAX_INCREMENTAL_RELOAD_PERIOD;
private boolean ignoreChunkReadingExceptions = false;
private int maxChunksToConsolidate = DEFAULT_MAX_CHUNKS_TO_CONSOLIDATE;
// jmx
private AggregationStats stats = new AggregationStats();
private long consolidationStarted;
private long consolidationLastTimeMillis;
private int consolidations;
private Throwable consolidationLastError;
private Aggregation(Eventloop eventloop, Executor executor, DefiningClassLoader classLoader,
AggregationChunkStorage aggregationChunkStorage, AggregationStructure structure,
AggregationState state) {
this.eventloop = eventloop;
this.executor = executor;
this.classLoader = classLoader;
this.aggregationChunkStorage = aggregationChunkStorage;
this.structure = structure;
this.state = state;
}
/**
* Instantiates an aggregation with the specified structure that runs in the given event loop,
* uses the specified class loader for creating dynamic classes, and saves data chunks to the given storage.
* By default, a chunk holds at most 1,000,000 records, no more than 1,000,000 records
* stay in memory while sorting, the incremental reload period is 10 minutes,
* and at most 1000 chunks are consolidated at once.
*
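* <p>A minimal setup sketch (the event loop, executor, chunk storage and structure are assumed
* to be created elsewhere); the defaults above can be overridden with the {@code with*} methods:
* <pre>{@code
* Aggregation aggregation = Aggregation.create(eventloop, executor, classLoader, chunkStorage, structure)
*         .withChunkSize(500_000)
*         .withMaxChunksToConsolidate(100);
* }</pre>
*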
* @param eventloop event loop, in which the aggregation is to run
* @param executor executor used for asynchronous work with files
* @param classLoader class loader for defining dynamic classes
* @param aggregationChunkStorage storage for data chunks
* @param structure structure of the aggregation
*/
public static Aggregation create(Eventloop eventloop, Executor executor, DefiningClassLoader classLoader,
AggregationChunkStorage aggregationChunkStorage, AggregationStructure structure) {
checkArgument(structure != null, "Cannot create Aggregation with AggregationStructure that is null");
return new Aggregation(eventloop, executor, classLoader, aggregationChunkStorage, structure, new AggregationState(structure));
}
public Aggregation withChunkSize(int chunkSize) {
this.chunkSize = chunkSize;
return this;
}
public Aggregation withReducerBufferSize(int reducerBufferSize) {
this.reducerBufferSize = reducerBufferSize;
return this;
}
public Aggregation withSorterItemsInMemory(int sorterItemsInMemory) {
this.sorterItemsInMemory = sorterItemsInMemory;
return this;
}
public Aggregation withMaxIncrementalReloadPeriod(Duration maxIncrementalReloadPeriod) {
this.maxIncrementalReloadPeriod = maxIncrementalReloadPeriod;
return this;
}
public Aggregation withIgnoreChunkReadingExceptions(boolean ignoreChunkReadingExceptions) {
this.ignoreChunkReadingExceptions = ignoreChunkReadingExceptions;
return this;
}
public Aggregation withMaxChunksToConsolidate(int maxChunksToConsolidate) {
this.maxChunksToConsolidate = maxChunksToConsolidate;
return this;
}
public Aggregation withTemporarySortDir(Path temporarySortDir) {
this.temporarySortDir = temporarySortDir;
return this;
}
public Aggregation withStats(AggregationStats stats) {
this.stats = stats;
return this;
}
public AggregationStructure getStructure() {
return structure;
}
public AggregationState getState() {
return state;
}
public void setState(AggregationState state) {
this.state = state;
}
public AggregationState detachState() {
AggregationState state = this.state;
//noinspection AssignmentToNull - within the normal lifecycle of the component this field is not nullable
this.state = null;
return state;
}
public List<String> getKeys() {
return structure.getKeys();
}
public List<String> getMeasures() {
return structure.getMeasures();
}
public Map<String, FieldType> getKeyTypes() {
return structure.getKeyTypes();
}
public Map<String, FieldType> getMeasureTypes() {
return structure.getMeasureTypes();
}
public List<String> getPartitioningKey() {
return structure.getPartitioningKey();
}
public Reducer aggregationReducer(Class inputClass, Class outputClass,
List<String> keys, List<String> measures,
DefiningClassLoader classLoader) {
return AggregationUtils.aggregationReducer(structure, inputClass, outputClass,
keys, measures, classLoader);
}
/**
* Consumes records from the given {@link StreamSupplier} into this aggregation.
*
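* <p>A usage sketch ({@code LogRecord} and {@code records} are illustrative):
* <pre>{@code
* aggregation.consume(StreamSupplier.ofIterable(records), LogRecord.class)
*         .whenResult(diff -> logger.info("Consumed into chunks: {}", diff));
* }</pre>
*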
* @param supplier supplier of records to be consumed
* @param inputClass class of input records
* @param <T> data records type
* @return promise of an {@link AggregationDiff} describing the created chunks
*/
@SuppressWarnings("unchecked")
public <T> Promise<AggregationDiff> consume(StreamSupplier<T> supplier,
Class<T> inputClass, Map<String, String> keyFields, Map<String, String> measureFields) {
checkArgument(new HashSet<>(getKeys()).equals(keyFields.keySet()), "Expected keys: %s, actual keyFields: %s", getKeys(), keyFields);
checkArgument(getMeasureTypes().keySet().containsAll(measureFields.keySet()), "Unknown measures: %s", difference(measureFields.keySet(),
getMeasureTypes().keySet()));
logger.info("Started consuming data in aggregation {}. Keys: {} Measures: {}", this, keyFields.keySet(), measureFields.keySet());
Class keyClass = createKeyClass(
keysToMap(getKeys().stream(), structure.getKeyTypes()::get),
classLoader);
Set<String> measureFieldKeys = measureFields.keySet();
List<String> measures = getMeasureTypes().keySet().stream().filter(measureFieldKeys::contains).collect(toList());
Class recordClass = createRecordClass(structure, getKeys(), measures, classLoader);
Aggregate aggregate = createPreaggregator(structure, inputClass, recordClass,
keyFields, measureFields,
classLoader);
Function keyFunction = createKeyFunction(inputClass, keyClass, getKeys(), classLoader);
AggregationGroupReducer groupReducer = new AggregationGroupReducer<>(aggregationChunkStorage,
structure, measures,
recordClass,
createPartitionPredicate(recordClass, getPartitioningKey(), classLoader),
keyFunction,
aggregate, chunkSize, classLoader);
return supplier.streamTo(groupReducer)
.then($ -> groupReducer.getResult())
.map(chunks -> AggregationDiff.of(new HashSet<>(chunks)));
}
public <T> Promise<AggregationDiff> consume(StreamSupplier<T> supplier, Class<T> inputClass) {
return consume(supplier, inputClass, scanKeyFields(inputClass), scanMeasureFields(inputClass));
}
public double estimateCost(AggregationQuery query) {
List<String> measures = getMeasures();
List<String> aggregationFields = query.getMeasures().stream().filter(measures::contains).collect(toList());
return state.findChunks(query.getPredicate(), aggregationFields).size();
}
public <T> StreamSupplier<T> query(AggregationQuery query, Class<T> outputClass) {
return query(query, outputClass, classLoader);
}
/**
* Returns a {@link StreamSupplier} of the records retrieved from aggregation for the specified query.
*
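* <p>A usage sketch, assuming {@code query} was built elsewhere and {@code ResultRecord}
* is an illustrative output class:
* <pre>{@code
* aggregation.query(query, ResultRecord.class)
*         .toList()
*         .whenResult(list -> logger.info("Fetched {} records", list.size()));
* }</pre>
*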
* @param <T> type of output objects
* @param query query
* @param outputClass class of output records
* @return supplier that streams query results
*/
@Override
public <T> StreamSupplier<T> query(AggregationQuery query, Class<T> outputClass, DefiningClassLoader queryClassLoader) {
checkArgument(iterate(queryClassLoader, Objects::nonNull, ClassLoader::getParent).anyMatch(isEqual(classLoader)),
"Unrelated queryClassLoader");
List<String> fields = getMeasures().stream().filter(query.getMeasures()::contains).collect(toList());
List<AggregationChunk> allChunks = state.findChunks(query.getPredicate(), fields);
return consolidatedSupplier(query.getKeys(),
fields, outputClass, query.getPredicate(), allChunks, queryClassLoader);
}
private <T> StreamSupplier<T> sortStream(StreamSupplier<T> unsortedStream, Class<T> resultClass,
List<String> allKeys, List<String> measures, DefiningClassLoader classLoader) {
Comparator<T> keyComparator = createKeyComparator(resultClass, allKeys, classLoader);
BinarySerializer<T> binarySerializer = createBinarySerializer(structure, resultClass,
getKeys(), measures, classLoader);
Path sortDir = (temporarySortDir != null) ? temporarySortDir : createSortDir();
StreamSupplier<T> stream = unsortedStream
.transformWith(StreamSorter.create(
StreamSorterStorageImpl.create(executor, binarySerializer, sortDir),
Function.identity(), keyComparator, false, sorterItemsInMemory));
stream.getEndOfStream()
.whenComplete(($, e) -> {
if (temporarySortDir == null) {
deleteSortDirSilent(sortDir);
}
});
return stream;
}
private Promise<List<AggregationChunk>> doConsolidation(List<AggregationChunk> chunksToConsolidate) {
Set<String> aggregationFields = new HashSet<>(getMeasures());
Set<String> chunkFields = new HashSet<>();
for (AggregationChunk chunk : chunksToConsolidate) {
for (String measure : chunk.getMeasures()) {
if (aggregationFields.contains(measure))
chunkFields.add(measure);
}
}
List<String> measures = getMeasures().stream().filter(chunkFields::contains).collect(toList());
Class resultClass = createRecordClass(structure, getKeys(), measures, classLoader);
StreamSupplier consolidatedSupplier = consolidatedSupplier(getKeys(), measures, resultClass, AggregationPredicates.alwaysTrue(),
chunksToConsolidate, classLoader);
AggregationChunker chunker = AggregationChunker.create(
structure, measures, resultClass,
createPartitionPredicate(resultClass, getPartitioningKey(), classLoader),
aggregationChunkStorage, classLoader, chunkSize);
return consolidatedSupplier.streamTo(chunker)
.then($ -> chunker.getResult());
}
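/*
 * Plan building (a descriptive note): chunks are visited in order of minimum primary key,
 * and each chunk is appended to an existing sequence (built for the same set of query fields)
 * whose current maximum primary key lies strictly below the chunk's minimum key;
 * if no such sequence exists, a new sequence is started.
 */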
private static void addChunkToPlan(Map<List<String>, TreeMap<PrimaryKey, List<Sequence>>> planIndex,
AggregationChunk chunk, List<String> queryFields) {
queryFields = new ArrayList<>(queryFields);
queryFields.retainAll(chunk.getMeasures());
checkArgument(!queryFields.isEmpty(), "None of the query fields are contained in the chunk's measures");
TreeMap<PrimaryKey, List<Sequence>> map = planIndex.computeIfAbsent(queryFields, k -> new TreeMap<>());
Map.Entry<PrimaryKey, List<Sequence>> entry = map.lowerEntry(chunk.getMinPrimaryKey());
Sequence sequence;
if (entry == null) {
sequence = new Sequence(queryFields);
} else {
List<Sequence> list = entry.getValue();
sequence = list.remove(list.size() - 1);
if (list.isEmpty()) {
map.remove(entry.getKey());
}
}
sequence.add(chunk);
List<Sequence> list = map.computeIfAbsent(chunk.getMaxPrimaryKey(), k -> new ArrayList<>());
list.add(sequence);
}
private static QueryPlan createPlan(List<AggregationChunk> chunks, List<String> queryFields) {
Map<List<String>, TreeMap<PrimaryKey, List<Sequence>>> index = new HashMap<>();
chunks = new ArrayList<>(chunks);
chunks.sort(comparing(AggregationChunk::getMinPrimaryKey));
for (AggregationChunk chunk : chunks) {
addChunkToPlan(index, chunk, queryFields);
}
List<Sequence> sequences = new ArrayList<>();
for (TreeMap<PrimaryKey, List<Sequence>> map : index.values()) {
for (List<Sequence> list : map.values()) {
sequences.addAll(list);
}
}
return new QueryPlan(sequences);
}
private <T> StreamSupplier<T> consolidatedSupplier(List<String> queryKeys,
List<String> measures, Class<T> resultClass,
AggregationPredicate where,
List<AggregationChunk> individualChunks,
DefiningClassLoader queryClassLoader) {
QueryPlan plan = createPlan(individualChunks, measures);
logger.info("Query plan for {} in aggregation {}: {}", queryKeys, this, plan);
boolean alreadySorted = getKeys().subList(0, min(getKeys().size(), queryKeys.size())).equals(queryKeys);
List<SequenceStream<Object>> sequenceStreams = new ArrayList<>();
for (Sequence sequence : plan.getSequences()) {
Class<Object> sequenceClass = createRecordClass(structure,
getKeys(),
sequence.getChunksFields(),
classLoader);
StreamSupplier<Object> stream = sequenceStream(where, sequence.getChunks(), sequenceClass, queryClassLoader);
if (!alreadySorted) {
stream = sortStream(stream, sequenceClass, queryKeys, sequence.getQueryFields(), classLoader);
}
sequenceStreams.add(new SequenceStream<>(stream, sequence.getQueryFields(), sequenceClass));
}
return mergeSequences(queryKeys, measures, resultClass, sequenceStreams, queryClassLoader);
}
static final class SequenceStream<T> {
final StreamSupplier<T> stream;
final List<String> fields;
final Class<T> type;
private SequenceStream(StreamSupplier<T> stream, List<String> fields, Class<T> type) {
this.stream = stream;
this.fields = fields;
this.type = type;
}
}
private <T> StreamSupplier<T> mergeSequences(List<String> queryKeys, List<String> measures,
Class<T> resultClass, List<SequenceStream<Object>> sequences,
DefiningClassLoader classLoader) {
if (sequences.size() == 1 && new HashSet<>(queryKeys).equals(new HashSet<>(getKeys()))) {
/*
If there is only one sequential supplier and all aggregation keys are requested, then there is no need for
using StreamReducer, because all records have unique keys and all we need to do is copy requested measures
from record class to result class.
*/
SequenceStream<Object> sequence = sequences.get(0);
Function<Object, T> mapper = createMapper(sequence.type, resultClass,
queryKeys, measures.stream().filter(sequence.fields::contains).collect(toList()),
classLoader);
return sequence.stream
.transformWith(StreamMapper.create(mapper))
.transformWith((StreamStats) stats.mergeMapOutput);
}
StreamReducer streamReducer = StreamReducer.create(Comparable::compareTo);
if (reducerBufferSize != 0 && reducerBufferSize != DEFAULT_REDUCER_BUFFER_SIZE) {
streamReducer = streamReducer.withBufferSize(reducerBufferSize);
}
Class keyClass = createKeyClass(
keysToMap(queryKeys.stream(), structure.getKeyTypes()::get),
this.classLoader);
for (SequenceStream<Object> sequence : sequences) {
Function extractKeyFunction = createKeyFunction(sequence.type, keyClass, queryKeys, this.classLoader);
Reducer reducer = AggregationUtils.aggregationReducer(structure,
sequence.type, resultClass,
queryKeys, measures.stream().filter(sequence.fields::contains).collect(toList()),
classLoader);
sequence.stream.streamTo(
streamReducer.newInput(extractKeyFunction, reducer)
.transformWith((StreamStats) stats.mergeReducerInput));
}
return streamReducer.getOutput()
.transformWith((StreamStats) stats.mergeReducerOutput);
}
private <T> StreamSupplier<T> sequenceStream(AggregationPredicate where,
List<AggregationChunk> individualChunks, Class<T> sequenceClass,
DefiningClassLoader queryClassLoader) {
Iterator<AggregationChunk> chunkIterator = individualChunks.iterator();
return StreamSupplier.concat(new Iterator<StreamSupplier<T>>() {
@Override
public boolean hasNext() {
return chunkIterator.hasNext();
}
@Override
public StreamSupplier<T> next() {
AggregationChunk chunk = chunkIterator.next();
return chunkReaderWithFilter(where, chunk, sequenceClass, queryClassLoader);
}
});
}
private <T> StreamSupplier<T> chunkReaderWithFilter(AggregationPredicate where, AggregationChunk chunk,
Class<T> chunkRecordClass, DefiningClassLoader queryClassLoader) {
return StreamSupplier.ofPromise(
aggregationChunkStorage.read(structure, chunk.getMeasures(), chunkRecordClass, chunk.getChunkId(), classLoader))
.transformWith(where != AggregationPredicates.alwaysTrue() ?
StreamFilter.create(
createPredicate(chunkRecordClass, where, queryClassLoader)) :
identity());
}
private Predicate createPredicate(Class chunkRecordClass,
AggregationPredicate where, DefiningClassLoader classLoader) {
return ClassBuilder.create(classLoader, Predicate.class)
.withMethod("test", boolean.class, singletonList(Object.class),
where.createPredicateDef(cast(arg(0), chunkRecordClass), getKeyTypes()))
.buildClassAndCreateNewInstance();
}
@JmxAttribute
public int getNumberOfOverlappingChunks() {
return state.findOverlappingChunks().size();
}
public Promise<AggregationDiff> consolidateMinKey() {
return doConsolidate(false);
}
public Promise<AggregationDiff> consolidateHotSegment() {
return doConsolidate(true);
}
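// A descriptive note: both strategies delegate chunk selection to AggregationState — "hot segment"
// picks chunks from the most overlapping key range, "min key" picks chunks starting from the lowest
// primary keys (see findChunksForConsolidationHotSegment/findChunksForConsolidationMinKey).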
private Promise<AggregationDiff> doConsolidate(boolean hotSegment) {
List<AggregationChunk> chunks = hotSegment ?
state.findChunksForConsolidationHotSegment(maxChunksToConsolidate) :
state.findChunksForConsolidationMinKey(maxChunksToConsolidate, chunkSize);
if (chunks.isEmpty()) {
logger.info("Nothing to consolidate in aggregation '{}", this);
return Promise.of(AggregationDiff.empty());
}
logger.info("Starting consolidation of aggregation '{}'", this);
consolidationStarted = eventloop.currentTimeMillis();
return doConsolidation(chunks)
.whenComplete(($, e) -> {
if (e == null) {
consolidationLastTimeMillis = eventloop.currentTimeMillis() - consolidationStarted;
consolidations++;
} else {
consolidationStarted = 0;
consolidationLastError = e;
}
})
.map(newChunks -> AggregationDiff.of(new LinkedHashSet<>(newChunks), new LinkedHashSet<>(chunks)));
}
private Path createSortDir() {
try {
return Files.createTempDirectory("aggregation_sort_dir");
} catch (IOException e) {
throw new UncheckedIOException(e);
}
}
private void deleteSortDirSilent(Path sortDir) {
try {
Files.delete(sortDir);
} catch (IOException e) {
logger.warn("Could not delete temporal directory {} : {}", temporarySortDir, e.toString());
}
}
public static String getChunkIds(Iterable<AggregationChunk> chunks) {
List<Object> ids = new ArrayList<>();
for (AggregationChunk chunk : chunks) {
ids.add(chunk.getChunkId());
}
return ids.stream().map(Object::toString).collect(Collectors.joining(", "));
}
// jmx
@JmxAttribute
public Duration getMaxIncrementalReloadPeriod() {
return maxIncrementalReloadPeriod;
}
@JmxAttribute
public void setMaxIncrementalReloadPeriod(Duration maxIncrementalReloadPeriod) {
this.maxIncrementalReloadPeriod = maxIncrementalReloadPeriod;
}
@JmxAttribute
public int getChunkSize() {
return chunkSize;
}
@JmxAttribute
public void setChunkSize(int chunkSize) {
this.chunkSize = chunkSize;
}
@JmxAttribute
public int getSorterItemsInMemory() {
return sorterItemsInMemory;
}
@JmxAttribute
public void setSorterItemsInMemory(int sorterItemsInMemory) {
this.sorterItemsInMemory = sorterItemsInMemory;
}
@JmxAttribute
public boolean isIgnoreChunkReadingExceptions() {
return ignoreChunkReadingExceptions;
}
@JmxAttribute
public void setIgnoreChunkReadingExceptions(boolean ignoreChunkReadingExceptions) {
this.ignoreChunkReadingExceptions = ignoreChunkReadingExceptions;
}
@JmxAttribute
public int getMaxChunksToConsolidate() {
return maxChunksToConsolidate;
}
@JmxAttribute
public void setMaxChunksToConsolidate(int maxChunksToConsolidate) {
this.maxChunksToConsolidate = maxChunksToConsolidate;
}
@Nullable
@JmxAttribute
public Integer getConsolidationSeconds() {
return consolidationStarted == 0 ? null : (int) ((eventloop.currentTimeMillis() - consolidationStarted) / 1000);
}
@Nullable
@JmxAttribute
public Integer getConsolidationLastTimeSeconds() {
return consolidationLastTimeMillis == 0 ? null : (int) (consolidationLastTimeMillis / 1000);
}
@JmxAttribute
public int getConsolidations() {
return consolidations;
}
@JmxAttribute
public Throwable getConsolidationLastError() {
return consolidationLastError;
}
@JmxAttribute
public int getChunks() {
return state.getChunks().size();
}
@JmxAttribute
public AggregationStats getStats() {
return stats;
}
@NotNull
@Override
public Eventloop getEventloop() {
return eventloop;
}
@Override
public String toString() {
return "{" + getKeyTypes().keySet() + " " + getMeasureTypes().keySet() + '}';
}
}