
/*
* Copyright (c) 2008-2024, Hazelcast, Inc. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.hazelcast.jet.aggregate;
import com.hazelcast.aggregation.Aggregator;
import com.hazelcast.function.BiConsumerEx;
import com.hazelcast.function.BiFunctionEx;
import com.hazelcast.function.BinaryOperatorEx;
import com.hazelcast.function.ComparatorEx;
import com.hazelcast.function.FunctionEx;
import com.hazelcast.function.PredicateEx;
import com.hazelcast.function.SupplierEx;
import com.hazelcast.function.ToDoubleFunctionEx;
import com.hazelcast.function.ToLongFunctionEx;
import com.hazelcast.jet.Traverser;
import com.hazelcast.jet.accumulator.DoubleAccumulator;
import com.hazelcast.jet.accumulator.LinTrendAccumulator;
import com.hazelcast.jet.accumulator.LongAccumulator;
import com.hazelcast.jet.accumulator.LongDoubleAccumulator;
import com.hazelcast.jet.accumulator.LongLongAccumulator;
import com.hazelcast.jet.accumulator.MutableReference;
import com.hazelcast.jet.accumulator.PickAnyAccumulator;
import com.hazelcast.jet.core.Processor;
import com.hazelcast.jet.datamodel.ItemsByTag;
import com.hazelcast.jet.datamodel.Tuple2;
import com.hazelcast.jet.datamodel.Tuple3;
import com.hazelcast.jet.function.TriFunction;
import com.hazelcast.jet.impl.aggregate.AggregateOpAggregator;
import com.hazelcast.jet.pipeline.BatchStage;
import com.hazelcast.jet.pipeline.BatchStageWithKey;
import com.hazelcast.jet.pipeline.GeneralStage;
import com.hazelcast.jet.pipeline.StageWithWindow;
import com.hazelcast.map.IMap;
import javax.annotation.Nonnull;
import javax.annotation.Nullable;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.PriorityQueue;
import java.util.Set;
import java.util.stream.Collector;
import java.util.stream.Collectors;
import static com.hazelcast.function.FunctionEx.identity;
import static com.hazelcast.internal.serialization.impl.SerializationUtil.checkSerializable;
import static com.hazelcast.jet.datamodel.Tuple2.tuple2;
import static com.hazelcast.jet.datamodel.Tuple3.tuple3;
/**
* Utility class with factory methods for several useful aggregate
* operations. See the Javadoc on {@link AggregateOperation}. You can
* also create your own aggregate operation using the {@link
* AggregateOperation#withCreate builder object}.
*
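* For illustration, here is a sketch of a hand-rolled equivalent of
* {@link #counting()}, built directly with the builder (normally you
* would just use the ready-made factory method):
* <pre>{@code
* AggregateOperation1<Object, LongAccumulator, Long> myCounting =
*         AggregateOperation
*                 .withCreate(LongAccumulator::new)
*                 .andAccumulate((LongAccumulator acc, Object item) -> acc.add(1))
*                 .andCombine(LongAccumulator::add)
*                 .andExportFinish(LongAccumulator::get);
* }</pre>
*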
* @since Jet 3.0
*/
public final class AggregateOperations {
private AggregateOperations() {
}
/**
* Returns an aggregate operation that counts the items it observes. The
* result is of type {@code long}.
*
* This sample takes a stream of words and finds the number of occurrences
* of each word in it:
*
* <pre>{@code
* BatchStage<String> words = pipeline.readFrom(wordSource);
* BatchStage<Entry<String, Long>> wordFrequencies =
*         words.groupingKey(wholeItem()).aggregate(counting());
* }</pre>
*/
@Nonnull
public static <T> AggregateOperation1<T, LongAccumulator, Long> counting() {
return AggregateOperation
.withCreate(LongAccumulator::new)
.andAccumulate((LongAccumulator a, T item) -> a.add(1))
.andCombine(LongAccumulator::add)
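// deduct lets Jet retire an expired sliding-window frame by
// subtracting its count instead of re-aggregating the whole window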
.andDeduct(LongAccumulator::subtractAllowingOverflow)
.andExportFinish(LongAccumulator::get);
}
/**
* Returns an aggregate operation that computes the sum of the {@code long}
* values it obtains by applying {@code getLongValueFn} to each item.
*
* This sample takes a stream of lines of text and outputs a single {@code
* long} number telling how many words there were in the stream:
*
* <pre>{@code
* BatchStage<String> linesOfText = pipeline.readFrom(textSource);
* BatchStage<Long> numberOfWordsInText =
*         linesOfText
*                 .map(line -> line.split("\\W+"))
*                 .aggregate(summingLong(wordsInLine -> wordsInLine.length));
* }</pre>
*
* Note: if the sum exceeds {@code Long.MAX_VALUE}, the job
* will fail with an {@code ArithmeticException}.
*
* @param getLongValueFn function that extracts the {@code long} values you
* want to sum. It must be stateless and {@linkplain
* Processor#isCooperative() cooperative}.
* @param <T> type of the input item
*/
@Nonnull
public static <T> AggregateOperation1<T, LongAccumulator, Long> summingLong(
@Nonnull ToLongFunctionEx<? super T> getLongValueFn
) {
checkSerializable(getLongValueFn, "getLongValueFn");
return AggregateOperation
.withCreate(LongAccumulator::new)
.andAccumulate((LongAccumulator a, T item) -> a.add(getLongValueFn.applyAsLong(item)))
.andCombine(LongAccumulator::add)
.andDeduct(LongAccumulator::subtract)
.andExportFinish(LongAccumulator::get);
}
/**
* Returns an aggregate operation that computes the sum of the {@code
* double} values it obtains by applying {@code getDoubleValueFn} to each
* item.
*
* This sample takes a stream of purchase events and outputs a single
* {@code double} value that tells the total sum of money spent in
* them:
*
* <pre>{@code
* BatchStage<Purchase> purchases = pipeline.readFrom(purchaseSource);
* BatchStage<Double> purchaseVolume =
*         purchases.aggregate(summingDouble(Purchase::amount));
* }</pre>
*
* @param getDoubleValueFn function that extracts the {@code double} values
* you want to sum. It must be stateless and {@linkplain
* Processor#isCooperative() cooperative}.
* @param <T> type of the input item
*/
@Nonnull
public static <T> AggregateOperation1<T, DoubleAccumulator, Double> summingDouble(
@Nonnull ToDoubleFunctionEx<? super T> getDoubleValueFn
) {
checkSerializable(getDoubleValueFn, "getDoubleValueFn");
return AggregateOperation
.withCreate(DoubleAccumulator::new)
.andAccumulate((DoubleAccumulator a, T item) -> a.accumulate(getDoubleValueFn.applyAsDouble(item)))
.andCombine(DoubleAccumulator::combine)
.andDeduct(DoubleAccumulator::deduct)
.andExportFinish(DoubleAccumulator::export);
}
/**
* Returns an aggregate operation that computes the least item according to
* the given {@code comparator}.
*
* This sample takes a stream of people and finds the youngest person in it:
*
* <pre>{@code
* BatchStage<Person> people = pipeline.readFrom(peopleSource);
* BatchStage<Person> youngestPerson =
*         people.aggregate(minBy(ComparatorEx.comparing(Person::age)));
* }</pre>
* NOTE: if this aggregate operation doesn't observe any
* items, its result will be {@code null}. Since the non-keyed {@link
* BatchStage#aggregate} emits just the naked aggregation result, and since
* a {@code null} cannot travel through a Jet pipeline, you will not get
* any output in that case.
*
* If several items tie for the least one, this aggregate operation will
* choose any one to return and may choose a different one each time.
*
* Implementation note: this aggregate operation does not
* implement the {@link AggregateOperation1#deductFn() deduct} primitive.
* This has performance implications for sliding
* window aggregation.
*
* @param comparator comparator to compare the items. It must be stateless
* and {@linkplain Processor#isCooperative() cooperative}.
* @param <T> type of the input item
*/
@Nonnull
public static <T> AggregateOperation1<T, MutableReference<T>, T> minBy(
@Nonnull ComparatorEx<? super T> comparator
) {
checkSerializable(comparator, "comparator");
return maxBy(comparator.reversed());
}
/**
* Returns an aggregate operation that computes the greatest item according
* to the given {@code comparator}.
*
* This sample takes a stream of people and finds the oldest person in it:
*
* <pre>{@code
* BatchStage<Person> people = pipeline.readFrom(peopleSource);
* BatchStage<Person> oldestPerson =
*         people.aggregate(maxBy(ComparatorEx.comparing(Person::age)));
* }</pre>
* NOTE: if this aggregate operation doesn't observe any
* items, its result will be {@code null}. Since the non-keyed {@link
* BatchStage#aggregate} emits just the naked aggregation result, and since
* a {@code null} cannot travel through a Jet pipeline, you will not get
* any output in that case.
*
* If several items tie for the greatest one, this aggregate operation will
* choose any one to return and may choose a different one each time.
*
* Implementation note: this aggregate operation does not
* implement the {@link AggregateOperation1#deductFn() deduct} primitive.
* This has performance implications for sliding
* window aggregation.
*
* @param comparator comparator to compare the items. It must be stateless
* and {@linkplain Processor#isCooperative() cooperative}.
* @param <T> type of the input item
*/
@Nonnull
public static <T> AggregateOperation1<T, MutableReference<T>, T> maxBy(
@Nonnull ComparatorEx<? super T> comparator
) {
checkSerializable(comparator, "comparator");
return AggregateOperation
.withCreate(MutableReference::new)
.andAccumulate((MutableReference<T> a, T i) -> {
if (a.isNull() || comparator.compare(i, a.get()) > 0) {
a.set(i);
}
})
.andCombine((a1, a2) -> {
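// adopt the other accumulator's item if we have none,
// or if its item is greater than ours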
if (a1.isNull() || (!a2.isNull() && comparator.compare(a1.get(), a2.get()) < 0)) {
a1.set(a2.get());
}
})
.andExportFinish(MutableReference::get);
}
/**
* Returns an aggregate operation that finds the top {@code n} items
* according to the given {@link ComparatorEx comparator}. It outputs a
* sorted list with the top item in the first position.
*
* This sample takes a stream of people and finds ten oldest persons in it:
*
* <pre>{@code
* BatchStage<Person> people = pipeline.readFrom(peopleSource);
* BatchStage<List<Person>> oldestDudes =
*         people.aggregate(topN(10, ComparatorEx.comparing(Person::age)));
* }</pre>
* Implementation note: this aggregate operation does not
* implement the {@link AggregateOperation1#deductFn() deduct} primitive.
* This has performance implications for sliding
* window aggregation.
*
* @param n number of top items to find
* @param comparator compares the items. It must be stateless and
* {@linkplain Processor#isCooperative() cooperative}.
* @param <T> type of the input item
*/
@Nonnull
public static <T> AggregateOperation1<T, PriorityQueue<T>, List<T>> topN(
int n, @Nonnull ComparatorEx<? super T> comparator
) {
checkSerializable(comparator, "comparator");
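// the queue, ordered by the comparator, holds the best n items seen
// so far; its head is the weakest candidate and is evicted first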
ComparatorEx<? super T> comparatorReversed = comparator.reversed();
BiConsumerEx<PriorityQueue<T>, T> accumulateFn = (queue, item) -> {
if (queue.size() == n) {
if (comparator.compare(item, queue.peek()) <= 0) {
// the new item is smaller or equal to the smallest in queue
return;
}
queue.poll();
}
queue.offer(item);
};
return AggregateOperation
.withCreate(() -> new PriorityQueue<T>(n, comparator))
.andAccumulate(accumulateFn)
.andCombine((left, right) -> {
for (T item : right) {
accumulateFn.accept(left, item);
}
})
.andExportFinish(queue -> {
ArrayList<T> res = new ArrayList<>(queue);
res.sort(comparatorReversed);
return res;
});
}
/**
* Returns an aggregate operation that finds the bottom {@code n} items
* according to the given {@link ComparatorEx comparator}. It outputs a
* sorted list with the bottom item in the first position.
*
* This sample takes a stream of people and finds ten youngest persons in
* it:
*
* <pre>{@code
* BatchStage<Person> people = pipeline.readFrom(peopleSource);
* BatchStage<List<Person>> youngestDudes =
*         people.aggregate(bottomN(10, ComparatorEx.comparing(Person::age)));
* }</pre>
* Implementation note: this aggregate operation does not
* implement the {@link AggregateOperation1#deductFn() deduct} primitive.
* This has performance implications for sliding
* window aggregation.
*
* @param n number of bottom items to find
* @param comparator compares the items. It must be stateless and
* {@linkplain Processor#isCooperative() cooperative}.
* @param <T> type of the input item
*/
@Nonnull
public static <T> AggregateOperation1<T, PriorityQueue<T>, List<T>> bottomN(
int n, @Nonnull ComparatorEx<? super T> comparator
) {
return topN(n, comparator.reversed());
}
/**
* Returns an aggregate operation that finds the arithmetic mean (aka.
* average) of the {@code long} values it obtains by applying {@code
* getLongValueFn} to each item. It outputs the result as a {@code double}.
*
* This sample takes a stream of people and finds their mean age:
*
* <pre>{@code
* BatchStage<Person> people = pipeline.readFrom(peopleSource);
* BatchStage<Double> meanAge = people.aggregate(averagingLong(Person::age));
* }</pre>
*
* If the aggregate operation does not observe any input, its result is
* {@link Double#NaN NaN}.
*
* NOTE: this operation accumulates the sum and the
* count as separate {@code long} variables and combines them at the end
* into the mean value. If either of these variables exceeds {@code
* Long.MAX_VALUE}, the job will fail with an {@link ArithmeticException}.
*
* @param getLongValueFn function that extracts the {@code long} value from
* the item. It must be stateless and {@linkplain Processor#isCooperative()
* cooperative}.
* @param <T> type of the input item
*/
@Nonnull
public static <T> AggregateOperation1<T, LongLongAccumulator, Double> averagingLong(
@Nonnull ToLongFunctionEx<? super T> getLongValueFn
) {
checkSerializable(getLongValueFn, "getLongValueFn");
// count == accumulator.value1
// sum == accumulator.value2
return AggregateOperation
.withCreate(LongLongAccumulator::new)
.andAccumulate((LongLongAccumulator a, T i) -> {
// a bit faster check than in addExact, specialized for increment
if (a.get1() == Long.MAX_VALUE) {
throw new ArithmeticException("Counter overflow");
}
a.set1(a.get1() + 1);
a.set2(Math.addExact(a.get2(), getLongValueFn.applyAsLong(i)));
})
.andCombine((a1, a2) -> {
a1.set1(Math.addExact(a1.get1(), a2.get1()));
a1.set2(Math.addExact(a1.get2(), a2.get2()));
})
.andDeduct((a1, a2) -> {
a1.set1(Math.subtractExact(a1.get1(), a2.get1()));
a1.set2(Math.subtractExact(a1.get2(), a2.get2()));
})
.andExportFinish(a -> (double) a.get2() / a.get1());
}
/**
* Returns an aggregate operation that finds the arithmetic mean (aka.
* average) of the {@code double} values it obtains by applying {@code
* getDoubleValueFn} to each item. It outputs the result as a {@code double}.
*
* This sample takes a stream of people and finds their mean age:
*
* <pre>{@code
* BatchStage<Person> people = pipeline.readFrom(peopleSource);
* BatchStage<Double> meanAge = people.aggregate(averagingDouble(Person::age));
* }</pre>
*
* If the aggregate operation does not observe any input, its result is
* {@link Double#NaN NaN}.
*
* @param getDoubleValueFn function that extracts the {@code double} value
* from the item. It must be stateless and {@linkplain
* Processor#isCooperative() cooperative}.
* @param <T> type of the input item
*/
@Nonnull
public static <T> AggregateOperation1<T, LongDoubleAccumulator, Double> averagingDouble(
@Nonnull ToDoubleFunctionEx<? super T> getDoubleValueFn
) {
checkSerializable(getDoubleValueFn, "getDoubleValueFn");
// count == accumulator.value1
// sum == accumulator.value2
return AggregateOperation
.withCreate(LongDoubleAccumulator::new)
.andAccumulate((LongDoubleAccumulator a, T item) -> {
// a bit faster check than in addExact, specialized for increment
if (a.getLong() == Long.MAX_VALUE) {
throw new ArithmeticException("Counter overflow");
}
a.setLong(a.getLong() + 1);
a.setDouble(a.getDouble() + getDoubleValueFn.applyAsDouble(item));
})
.andCombine((a1, a2) -> {
a1.setLong(Math.addExact(a1.getLong(), a2.getLong()));
a1.setDouble(a1.getDouble() + a2.getDouble());
})
.andDeduct((a1, a2) -> {
a1.setLong(Math.subtractExact(a1.getLong(), a2.getLong()));
a1.setDouble(a1.getDouble() - a2.getDouble());
})
.andExportFinish(a -> a.getDouble() / a.getLong());
}
/**
* Returns an aggregate operation that computes a linear trend over the
* items. It will produce a {@code double}-valued coefficient that
* approximates the rate of change of {@code y} as a function of {@code x},
* where {@code x} and {@code y} are {@code long} quantities obtained
* by applying the two provided functions to each item.
*
* This sample takes an infinite stream of trade events and outputs the
* current rate of price change using a sliding window:
*
* <pre>{@code
* StreamStage<Trade> trades = pipeline
*         .readFrom(tradeSource)
*         .withTimestamps(Trade::getTimestamp, SECONDS.toMillis(1));
* StreamStage<WindowResult<Double>> priceTrend = trades
*         .window(WindowDefinition.sliding(MINUTES.toMillis(5), SECONDS.toMillis(1)))
*         .aggregate(linearTrend(Trade::getTimestamp, Trade::getPrice));
* }</pre>
* With the trade price given in cents and the timestamp in milliseconds,
* the output will be in cents per millisecond. Make sure you apply a
* scaling factor if you want another, more natural unit of measure.
*
* If this aggregate operation does not observe any input, its result is
* {@link Double#NaN NaN}.
*
* @param getXFn a function to extract x from the input.
* It must be stateless and {@linkplain Processor#isCooperative()
* cooperative}.
* @param getYFn a function to extract y from the input.
* It must be stateless and {@linkplain Processor#isCooperative()
* cooperative}.
* @param <T> type of the input item
*/
@Nonnull
public static <T> AggregateOperation1<T, LinTrendAccumulator, Double> linearTrend(
@Nonnull ToLongFunctionEx<T> getXFn,
@Nonnull ToLongFunctionEx<T> getYFn
) {
checkSerializable(getXFn, "getXFn");
checkSerializable(getYFn, "getYFn");
return AggregateOperation
.withCreate(LinTrendAccumulator::new)
.andAccumulate((LinTrendAccumulator a, T item) ->
a.accumulate(getXFn.applyAsLong(item), getYFn.applyAsLong(item)))
.andCombine(LinTrendAccumulator::combine)
.andDeduct(LinTrendAccumulator::deduct)
.andExportFinish(LinTrendAccumulator::export);
}
/**
* Returns an aggregate operation that takes string items and concatenates
* them into a single string.
*
* This sample outputs a string that you get by reading down the first
* column of the input text:
*
* <pre>{@code
* BatchStage<String> linesOfText = pipeline.readFrom(textSource);
* BatchStage<String> lineStarters = linesOfText
*         .map(line -> line.charAt(0))
*         .map(Object::toString)
*         .aggregate(concatenating());
* }</pre>
*/
public static AggregateOperation1<CharSequence, StringBuilder, String> concatenating() {
return AggregateOperation
.withCreate(StringBuilder::new)
.andAccumulate(StringBuilder::append)
.andCombine(StringBuilder::append)
.andExportFinish(StringBuilder::toString);
}
/**
* Returns an aggregate operation that takes string items and concatenates
* them, separated by the given {@code delimiter}, into a single string.
*
* This sample outputs a single line of text that contains all the
* upper-cased and title-cased words of the input text:
*
* <pre>{@code
* BatchStage<String> linesOfText = pipeline.readFrom(textSource);
* BatchStage<String> upcaseWords = linesOfText
*         .map(line -> line.split("\\W+"))
*         .flatMap(Traversers::traverseArray)
*         .filter(word -> word.matches("\\p{Lu}.*"))
*         .aggregate(concatenating(" "));
* }</pre>
*/
public static AggregateOperation1<CharSequence, StringBuilder, String> concatenating(
CharSequence delimiter
) {
return concatenating(delimiter, "", "");
}
/**
* Returns an aggregate operation that takes string items and concatenates
* them, separated by the given {@code delimiter}, into a single string.
* The resulting string will start with the given {@code prefix} and end
* with the given {@code suffix}.
*
* This sample outputs a single item, a JSON array of all the upper-cased
* and title-cased words of the input text:
*
* <pre>{@code
* BatchStage<String> linesOfText = pipeline.readFrom(textSource);
* BatchStage<String> upcaseWords = linesOfText
*         .map(line -> line.split("\\W+"))
*         .flatMap(Traversers::traverseArray)
*         .filter(word -> word.matches("\\p{Lu}.*"))
*         .aggregate(concatenating("['", "', '", "']"));
* }</pre>
*/
public static AggregateOperation1<CharSequence, StringBuilder, String> concatenating(
CharSequence delimiter, CharSequence prefix, CharSequence suffix
) {
int prefixLen = prefix.length();
return AggregateOperation
.withCreate(() -> new StringBuilder().append(prefix))
.andAccumulate((builder, val) -> {
if (builder.length() != prefixLen && val.length() > 0) {
builder.append(delimiter);
}
builder.append(val);
})
.andCombine((l, r) -> {
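// the right-hand accumulator also starts with the prefix,
// so copy only the part after it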
if (l.length() != prefixLen && r.length() != prefixLen) {
l.append(delimiter);
}
l.append(r, prefixLen, r.length());
})
.andExportFinish(r -> {
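// export must not corrupt the accumulator: append the suffix
// for the exported snapshot, then trim it off again in finally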
try {
return r.append(suffix).toString();
} finally {
r.setLength(r.length() - suffix.length());
}
});
}
/**
* Adapts an aggregate operation that takes items of type {@code U} to one
* that takes items of type {@code T}, by applying the given mapping
* function to each item. Normally you should just apply the mapping in a
* stage before the aggregation, but this adapter is useful when
* simultaneously performing several aggregate operations using {@link
* #allOf}.
*
* In addition to mapping, you can apply filtering as well by returning
* {@code null} for an item you want filtered out.
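*
* For example, {@code mapping(p -> p.getAge() >= 18 ? p.getName() : null,
* toList())} would collect only the names of adults (a sketch assuming
* hypothetical {@code getAge()} and {@code getName()} accessors).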
*
* This sample takes a stream of people and builds two sorted lists from
* it, one with all the names and one with all the surnames:
*
* <pre>{@code
* BatchStage<Person> people = pipeline.readFrom(peopleSource);
* BatchStage<Tuple2<List<String>, List<String>>> sortedNames =
*         people.aggregate(allOf(
*                 mapping(Person::getFirstName, sorting(ComparatorEx.naturalOrder())),
*                 mapping(Person::getLastName, sorting(ComparatorEx.naturalOrder()))));
* }</pre>
*
* @see #filtering
* @see #flatMapping
*
* @param mapFn the function to apply to the input items. It must be
* stateless and {@linkplain Processor#isCooperative() cooperative}.
* @param downstream the downstream aggregate operation
* @param <T> type of the input item
* @param <U> input type of the downstream aggregate operation
* @param <A> downstream operation's accumulator type
* @param <R> downstream operation's result type
*/
public static <T, U, A, R> AggregateOperation1<T, A, R> mapping(
@Nonnull FunctionEx<? super T, ? extends U> mapFn,
@Nonnull AggregateOperation1<? super U, A, ? extends R> downstream
) {
checkSerializable(mapFn, "mapFn");
BiConsumerEx<? super A, ? super U> downstreamAccumulateFn = downstream.accumulateFn();
return AggregateOperation
.withCreate(downstream.createFn())
.andAccumulate((A a, T t) -> {
U mapped = mapFn.apply(t);
if (mapped != null) {
downstreamAccumulateFn.accept(a, mapped);
}
})
.andCombine(downstream.combineFn())
.andDeduct(downstream.deductFn())
.andExport(downstream.exportFn())
.andFinish(downstream.finishFn());
}
/**
* Adapts an aggregate operation so that it accumulates only the items
* passing the {@code filterFn} and ignores others. Normally you should
* just apply the filter in a stage before the aggregation, but this
* adapter is useful when simultaneously performing several aggregate
* operations using {@link #allOf}.
*
* This sample takes a stream of people and outputs two numbers, the
* average height of kids and grown-ups:
*
* <pre>{@code
* BatchStage<Person> people = pipeline.readFrom(peopleSource);
* BatchStage<Tuple2<Double, Double>> avgHeightByAge = people.aggregate(allOf(
*         filtering((Person p) -> p.getAge() < 18, averagingLong(Person::getHeight)),
*         filtering((Person p) -> p.getAge() >= 18, averagingLong(Person::getHeight))
* ));
* }</pre>
* @see #mapping
* @see #flatMapping
*
* @param filterFn the filtering function. It must be stateless and
* {@linkplain Processor#isCooperative() cooperative}.
* @param downstream the downstream aggregate operation
* @param <T> type of the input item
* @param <A> downstream operation's accumulator type
* @param <R> downstream operation's result type
*
* @since Jet 3.1
*/
public static <T, A, R> AggregateOperation1<T, A, R> filtering(
@Nonnull PredicateEx<? super T> filterFn,
@Nonnull AggregateOperation1<? super T, A, ? extends R> downstream
) {
checkSerializable(filterFn, "filterFn");
BiConsumerEx<? super A, ? super T> downstreamAccumulateFn = downstream.accumulateFn();
return AggregateOperation
.withCreate(downstream.createFn())
.andAccumulate((A a, T t) -> {
if (filterFn.test(t)) {
downstreamAccumulateFn.accept(a, t);
}
})
.andCombine(downstream.combineFn())
.andDeduct(downstream.deductFn())
.andExport(downstream.exportFn())
.andFinish(downstream.finishFn());
}
/**
* Adapts an aggregate operation that takes items of type {@code U} to one
* that takes items of type {@code T}, by exploding each {@code T} into a
* sequence of {@code U}s and then accumulating all of them. Normally you
* should just apply the flat-mapping in a stage before the aggregation,
* but this adapter is useful when simultaneously performing several
* aggregate operations using {@link #allOf}.
*
* The traverser your function returns must be non-null and
* null-terminated.
*
* This sample takes a stream of people and outputs two numbers, the mean
* age of all the people and the mean age of people listed as someone's
* kid:
*
* <pre>{@code
* BatchStage<Person> people = pipeline.readFrom(peopleSource);
* people.aggregate(allOf(
*         averagingLong(Person::getAge),
*         flatMapping((Person p) -> traverseIterable(p.getChildren()),
*                 averagingLong(Person::getAge))
* ));
* }</pre>
* @see #mapping
* @see #filtering
*
* @param flatMapFn the flat-mapping function to apply. It must be
* stateless and {@linkplain Processor#isCooperative() cooperative}.
* @param downstream the downstream aggregate operation
* @param <T> type of the input item
* @param <U> input type of the downstream aggregate operation
* @param <A> downstream operation's accumulator type
* @param <R> downstream operation's result type
*
* @since Jet 3.1
*/
public static <T, U, A, R> AggregateOperation1<T, A, R> flatMapping(
@Nonnull FunctionEx<? super T, ? extends Traverser<? extends U>> flatMapFn,
@Nonnull AggregateOperation1<? super U, A, ? extends R> downstream
) {
checkSerializable(flatMapFn, "flatMapFn");
BiConsumerEx<? super A, ? super U> downstreamAccumulateFn = downstream.accumulateFn();
return AggregateOperation
.withCreate(downstream.createFn())
.andAccumulate((A a, T t) -> {
Traverser<? extends U> trav = flatMapFn.apply(t);
for (U u; (u = trav.next()) != null; ) {
downstreamAccumulateFn.accept(a, u);
}
})
.andCombine(downstream.combineFn())
.andDeduct(downstream.deductFn())
.andExport(downstream.exportFn())
.andFinish(downstream.finishFn());
}
/**
* Returns an aggregate operation that accumulates the items into a {@code
* Collection}. It creates empty, mutable collections as needed by calling
* the provided {@code createCollectionFn}.
*
* This sample takes a stream of words and outputs a single sorted set of
* all the long words (above 5 letters):
*
* <pre>{@code
* BatchStage<String> words = pipeline.readFrom(wordSource);
* BatchStage<TreeSet<String>> sortedLongWords = words
*         .filter(w -> w.length() > 5)
*         .aggregate(toCollection(TreeSet::new));
* }</pre>
* Note: if you use a collection that preserves the
* insertion order, keep in mind that Jet doesn't aggregate the items in
* any specified order.
*
* @param createCollectionFn a {@code Supplier} of empty, mutable {@code
* Collection}s. It must be stateless and {@linkplain
* Processor#isCooperative() cooperative}.
* @param <T> type of the input item
* @param <C> the type of the collection
*/
public static <T, C extends Collection<T>> AggregateOperation1<T, C, C> toCollection(
@Nonnull SupplierEx<C> createCollectionFn
) {
checkSerializable(createCollectionFn, "createCollectionFn");
return AggregateOperation
.withCreate(createCollectionFn)
.andAccumulate(Collection::add)
.andCombine(Collection::addAll)
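// export returns an independent copy so the result stays valid
// even if the live accumulator keeps changing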
.andExport(acc -> {
C result = createCollectionFn.get();
result.addAll(acc);
return result;
})
.andFinish(identity());
}
/**
* Returns an aggregate operation that accumulates the items into an {@code
* ArrayList}.
*
* This sample takes a stream of words and outputs a single list of all the
* long words (above 5 letters):
*
* <pre>{@code
* BatchStage<String> words = pipeline.readFrom(wordSource);
* BatchStage<List<String>> longWords = words
*         .filter(w -> w.length() > 5)
*         .aggregate(toList());
* }</pre>
* Note: accumulating all the data into an in-memory list
* shouldn't be your first choice in designing a pipeline. Consider
* draining the result stream to a sink.
*
* @param <T> type of the input item
*/
public static <T> AggregateOperation1<T, List<T>, List<T>> toList() {
return toCollection(ArrayList::new);
}
/**
* Returns an aggregate operation that accumulates the items into a {@code
* HashSet}.
*
* This sample takes a stream of people and outputs a single set of all the
* distinct cities they live in:
*
* <pre>{@code
* pipeline.readFrom(personSource)
*         .map(Person::getCity)
*         .aggregate(toSet());
* }</pre>
* Note: accumulating all the data into an in-memory set
* shouldn't be your first choice in designing a pipeline. Consider
* draining the result stream to a sink.
* @param <T> type of the input item
*/
public static <T> AggregateOperation1<T, Set<T>, Set<T>> toSet() {
return toCollection(HashSet::new);
}
/**
* Returns an aggregate operation that accumulates the items into a {@code
* HashMap} whose keys and values are the result of applying the provided
* mapping functions.
*
* This aggregate operation does not tolerate duplicate keys and will throw
* an {@code IllegalStateException} if it detects them. If your data
* contains duplicates, use {@link #toMap(FunctionEx, FunctionEx,
* BinaryOperatorEx) toMap(keyFn, valueFn, mergeFn)}.
*
* The following sample takes a stream of sensor readings and outputs a
* single map {sensor ID -> reading}:
*
* <pre>{@code
* BatchStage<Entry<Integer, Double>> readings = pipeline.readFrom(readingsSource);
* BatchStage<Map<Integer, Double>> sensorToReading =
*         readings.aggregate(toMap(Entry::getKey, Entry::getValue));
* }</pre>
* Note: accumulating all the data into an in-memory map
* shouldn't be your first choice in designing a pipeline. Consider
* draining the stream to a sink.
*
* @param keyFn a function to extract the key from the input item. It must
* be stateless and {@linkplain Processor#isCooperative() cooperative}.
* @param valueFn a function to extract the value from the input item. It
* must be stateless and {@linkplain Processor#isCooperative()
* cooperative}.
* @param <T> type of the input item
* @param <K> type of the key
* @param <U> type of the value
*
* @see #toMap(FunctionEx, FunctionEx, BinaryOperatorEx)
* @see #toMap(FunctionEx, FunctionEx, BinaryOperatorEx, SupplierEx)
* @see #groupingBy(FunctionEx)
*/
public static <T, K, U> AggregateOperation1<T, Map<K, U>, Map<K, U>> toMap(
FunctionEx<? super T, ? extends K> keyFn,
FunctionEx<? super T, ? extends U> valueFn
) {
checkSerializable(keyFn, "keyFn");
checkSerializable(valueFn, "valueFn");
return toMap(keyFn, valueFn,
// the merge function receives the two conflicting values, not the key
(v1, v2) -> {
throw new IllegalStateException("Duplicate key (conflicting values: " + v1 + ", " + v2 + ")");
},
HashMap::new);
}
/**
* Returns an aggregate operation that accumulates the items into a
* {@code HashMap} whose keys and values are the result of applying
* the provided mapping functions.
*
* This aggregate operation resolves duplicate keys by applying {@code
* mergeFn} to the conflicting values. {@code mergeFn} will act upon the
* values after {@code valueFn} has already been applied.
*
* The following sample takes a stream of sensor readings and outputs a
* single map {sensor ID -> reading}. Multiple readings from the same
* sensor get summed up:
*
* <pre>{@code
* BatchStage<Entry<Integer, Double>> readings = pipeline.readFrom(readingsSource);
* BatchStage<Map<Integer, Double>> sensorToTotalReading =
*         readings.aggregate(toMap(Entry::getKey, Entry::getValue, Double::sum));
* }</pre>
* Note: accumulating all the data into an in-memory map
* shouldn't be your first choice in designing a pipeline. Consider
* draining the stream to a sink.
*
* The given functions must be stateless and {@linkplain
* Processor#isCooperative() cooperative}.
*
* @param keyFn a function to extract the key from input item
* @param valueFn a function to extract value from input item
* @param mergeFn the function used to resolve collisions between values associated
* with the same key, will be passed to {@link Map#merge(Object, Object,
* java.util.function.BiFunction)}
* @param <T> type of the input item
* @param <K> the type of key
* @param <U> the output type of the value mapping function
*
* @see #toMap(FunctionEx, FunctionEx)
* @see #toMap(FunctionEx, FunctionEx, BinaryOperatorEx, SupplierEx)
*/
public static <T, K, U> AggregateOperation1<T, Map<K, U>, Map<K, U>> toMap(
FunctionEx<? super T, ? extends K> keyFn,
FunctionEx<? super T, ? extends U> valueFn,
BinaryOperatorEx<U> mergeFn
) {
checkSerializable(keyFn, "keyFn");
checkSerializable(valueFn, "valueFn");
return toMap(keyFn, valueFn, mergeFn, HashMap::new);
}
/**
* Returns an aggregate operation that accumulates elements into a
* user-supplied {@code Map} instance. The keys and values are the result
* of applying the provided mapping functions to the input elements.
*
* This aggregate operation resolves duplicate keys by applying {@code
* mergeFn} to the conflicting values. {@code mergeFn} will act upon the
* values after {@code valueFn} has already been applied.
*
* The following sample takes a stream of sensor readings and outputs a
* single {@code ObjectToLongHashMap} of {sensor ID -> reading}. Multiple
* readings from the same sensor get summed up:
*
* <pre>{@code
* BatchStage<Entry<String, Long>> readings = pipeline.readFrom(readingsSource);
* BatchStage<ObjectToLongHashMap<String>> sensorToTotalReading =
*         readings.aggregate(toMap(
*                 Entry::getKey, Entry::getValue, Long::sum, ObjectToLongHashMap::new));
* }</pre>
*
* The given functions must be stateless and {@linkplain
* Processor#isCooperative() cooperative}.
*
* @param keyFn a function to extract the key from input item
* @param valueFn a function to extract value from input item
* @param mergeFn a merge function, used to resolve collisions between
* values associated with the same key, as supplied
* to {@link Map#merge(Object, Object,
* java.util.function.BiFunction)}
* @param createMapFn a function which returns a new, empty {@code Map} into
* which the results will be inserted
* @param <T> type of the input item
* @param <K> the output type of the key mapping function
* @param <U> the output type of the value mapping function
* @param <M> the type of the resulting {@code Map}
*
* @see #toMap(FunctionEx, FunctionEx)
* @see #toMap(FunctionEx, FunctionEx, BinaryOperatorEx)
*/
public static <T, K, U, M extends Map<K, U>> AggregateOperation1<T, M, M> toMap(
FunctionEx<? super T, ? extends K> keyFn,
FunctionEx<? super T, ? extends U> valueFn,
BinaryOperatorEx<U> mergeFn,
SupplierEx<M> createMapFn
) {
checkSerializable(keyFn, "keyFn");
checkSerializable(valueFn, "valueFn");
checkSerializable(mergeFn, "mergeFn");
checkSerializable(createMapFn, "createMapFn");
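// Map.merge puts the value directly for a new key and calls
// mergeFn only when the key is already present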
BiConsumerEx<M, T> accumulateFn =
(map, element) -> map.merge(keyFn.apply(element), valueFn.apply(element), mergeFn);
return AggregateOperation
.withCreate(createMapFn)
.andAccumulate(accumulateFn)
.andCombine((l, r) -> r.forEach((key, value) -> l.merge(key, value, mergeFn)))
.andExport(acc -> {
M result = createMapFn.get();
result.putAll(acc);
return result;
})
.andFinish(identity());
}
/**
* Returns an aggregate operation that accumulates the items into a
* {@code HashMap} where the key is the result of applying {@code keyFn}
* and the value is a list of the items with that key.
*
* This operation is primarily useful when you need a cascaded group-by
* where you further classify the members of each group by a secondary key.
*
* This sample takes a stream of persons and classifies them first by
* country and then by gender. It outputs a stream of map entries where the
* key is the country and the value is a map from gender to the list of
* people of that gender from that country:
*
* <pre>{@code
* BatchStage<Person> people = pipeline.readFrom(personSource);
* BatchStage<Entry<String, Map<Gender, List<Person>>>> byCountryAndGender =
*         people.groupingKey(Person::getCountry)
*               .aggregate(groupingBy(Person::getGender));
* }</pre>
*
* This aggregate operation has a similar effect to the dedicated {@link
* GeneralStage#groupingKey(FunctionEx) groupingKey()} pipeline transform,
* so you may wonder why not use it in all cases, not just cascaded
* grouping. To see the difference, check out these two snippets:
* <pre>{@code
* BatchStage<Person> people = pipeline.readFrom(personSource);
*
* // Snippet 1
* BatchStage<Entry<String, List<Person>>> byCountry1 =
*         people.groupingKey(Person::getCountry)
*               .aggregate(toList());
*
* // Snippet 2
* BatchStage<Map<String, List<Person>>> byCountry2 =
*         people.aggregate(groupingBy(Person::getCountry));
* }</pre>
*
* Notice that snippet 1 outputs a stream of map entries whereas
* snippet 2 outputs a single map. To produce the single map,
* Jet must do all the work on a single thread and hold all the data on a
* single cluster member, so you lose the advantage of distributed
* computation. By contrast, snippet 1 allows Jet to partition the input by
* the grouping key and split the work across the cluster. This is why you
* should prefer a {@code groupingKey} stage if you have just one level of
* grouping.
*
* @param keyFn a function to extract the key from input item. It must be
* stateless and {@linkplain Processor#isCooperative() cooperative}.
* @param <T> type of the input item
* @param <K> the output type of the key mapping function
*
* @see #groupingBy(FunctionEx, AggregateOperation1)
* @see #groupingBy(FunctionEx, SupplierEx, AggregateOperation1)
* @see #toMap(FunctionEx, FunctionEx)
*/
public static <T, K> AggregateOperation1<T, Map<K, List<T>>, Map<K, List<T>>> groupingBy(
FunctionEx<? super T, ? extends K> keyFn
) {
checkSerializable(keyFn, "keyFn");
return groupingBy(keyFn, toList());
}
/**
* Returns an aggregate operation that accumulates the items into a
* {@code HashMap} where the key is the result of applying {@code keyFn}
* and the value is the result of applying the downstream aggregate
* operation to the items with that key.
*
* This operation is primarily useful when you need a cascaded group-by
* where you further classify the members of each group by a secondary key.
* For the difference between this operation and the {@link
* GeneralStage#groupingKey(FunctionEx) groupingKey()} pipeline transform,
* see the documentation on {@link #groupingBy(FunctionEx) groupingBy(keyFn)}.
*
* This sample takes a stream of people, classifies them by country and
* gender, and reports the number of people in each category:
*
* <pre>{@code
* BatchStage<Person> people = pipeline.readFrom(personSource);
* BatchStage<Entry<String, Map<Gender, Long>>> countByCountryAndGender =
*         people.groupingKey(Person::getCountry)
*               .aggregate(groupingBy(Person::getGender, counting()));
* }</pre>
*
*
* @param keyFn a function to extract the key from input item. It must be
* stateless and {@linkplain Processor#isCooperative() cooperative}.
* @param downstream the downstream aggregate operation
* @param <T> type of the input item
* @param <K> the output type of the key mapping function
* @param <R> the type of the downstream aggregation result
* @param <A> downstream aggregation's accumulator type
*
* @see #groupingBy(FunctionEx)
* @see #groupingBy(FunctionEx, SupplierEx, AggregateOperation1)
* @see #toMap(FunctionEx, FunctionEx)
*/
public static <T, K, R, A> AggregateOperation1<T, Map<K, A>, Map<K, R>> groupingBy(
FunctionEx<? super T, ? extends K> keyFn,
AggregateOperation1<? super T, A, R> downstream
) {
checkSerializable(keyFn, "keyFn");
return groupingBy(keyFn, HashMap::new, downstream);
}
/**
* Returns an {@code AggregateOperation1} that accumulates the items into a
* {@code Map} (as obtained from {@code createMapFn}) where the key is the
* result of applying {@code keyFn} and the value is the result of
* applying the downstream aggregate operation to the items with that key.
*
* This operation is primarily useful when you need a cascaded group-by
* where you further classify the members of each group by a secondary key.
* For the difference between this operation and the {@link
* GeneralStage#groupingKey(FunctionEx) groupingKey()} pipeline transform,
* see the documentation on {@link #groupingBy(FunctionEx) groupingBy(keyFn)}.
*
* The following sample takes a stream of people, classifies them by country
* and gender, and reports the number of people in each category. It uses
* an {@code EnumMap} to optimize memory usage:
*
* <pre>{@code
* BatchStage<Person> people = pipeline.readFrom(personSource);
* BatchStage<Entry<String, EnumMap<Gender, Long>>> countByCountryAndGender =
*         people.groupingKey(Person::getCountry)
*               .aggregate(groupingBy(
*                       Person::getGender,
*                       () -> new EnumMap<>(Gender.class),
*                       counting()));
* }</pre>
*
* @param keyFn a function to extract the key from input item. It must be
* stateless and {@linkplain Processor#isCooperative() cooperative}.
* @param createMapFn a function which returns a new, empty {@code Map} into
* which the results will be inserted. It must be stateless and {@linkplain
* Processor#isCooperative() cooperative}.
* @param downstream the downstream aggregate operation
* @param <T> type of the input item
* @param <K> the output type of the key mapping function
* @param <R> the type of the downstream aggregation result
* @param <A> downstream aggregation's accumulator type
* @param <M> output type of the resulting {@code Map}
*
* @see #groupingBy(FunctionEx)
* @see #groupingBy(FunctionEx, AggregateOperation1)
* @see #toMap(FunctionEx, FunctionEx)
*/
@SuppressWarnings({ "rawtypes", "unchecked" })
public static <T, K, R, A, M extends Map<K, R>> AggregateOperation1<T, Map<K, A>, M> groupingBy(
FunctionEx<? super T, ? extends K> keyFn,
SupplierEx<M> createMapFn,
AggregateOperation1<? super T, A, R> downstream
) {
checkSerializable(keyFn, "keyFn");
checkSerializable(createMapFn, "createMapFn");
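// lazily create one downstream accumulator per distinct key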
BiConsumerEx<? super Map<K, A>, T> accumulateFn = (m, t) -> {
A acc = m.computeIfAbsent(keyFn.apply(t), k -> downstream.createFn().get());
downstream.accumulateFn().accept(acc, t);
};
BiConsumerEx