// Many resources are needed to download a project. Please understand that we have to compensate our server costs. Thank you in advance. Project price only 1 $
// You can buy this project and download/modify it as often as you want.
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.common.data;
import org.apache.hudi.common.function.SerializableBiFunction;
import org.apache.hudi.common.function.SerializableFunction;
import org.apache.hudi.common.function.SerializablePairFunction;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.ValidationUtils;
import org.apache.hudi.common.util.collection.MappingIterator;
import org.apache.hudi.common.util.collection.Pair;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Spliterator;
import java.util.Spliterators;
import java.util.function.Function;
import java.util.stream.Collector;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import java.util.stream.StreamSupport;
import static org.apache.hudi.common.function.FunctionWrapper.throwingMapToPairWrapper;
import static org.apache.hudi.common.function.FunctionWrapper.throwingMapWrapper;
/**
* In-memory implementation of {@link HoodiePairData} holding internally a {@link Stream} of {@link Pair}s.
*
* {@link HoodieListData} can have either of the 2 execution semantics:
*
*
*
Eager: with every operation being executed right away
*
Lazy: with every operation being "stacked up", with it execution postponed until
* "terminal" operation is invoked
*
*
*
* NOTE: This is an in-memory counterpart for {@code HoodieJavaPairRDD}, and it strives to provide
* similar semantic as RDD container -- all intermediate (non-terminal, not de-referencing
* the stream like "collect", "groupBy", etc) operations are executed *lazily*.
* This allows to make sure that compute/memory churn is minimal since only necessary
* computations will ultimately be performed.
*
* Please note, however, that while RDD container allows the same collection to be
* de-referenced more than once (ie terminal operation invoked more than once),
* {@link HoodieListData} allows that only when instantiated w/ an eager execution semantic.
*
* @param type of the key in the pair
* @param type of the value in the pair
*/
public class HoodieListPairData extends HoodieBaseListData> implements HoodiePairData {
private HoodieListPairData(List> data, boolean lazy) {
super(data, lazy);
}
HoodieListPairData(Stream> dataStream, boolean lazy) {
super(dataStream, lazy);
}
@Override
public List> get() {
return collectAsList();
}
@Override
public void persist(String cacheConfig) {
// no-op
}
@Override
public void unpersist() {
// no-op
}
@Override
public HoodieData keys() {
return new HoodieListData<>(asStream().map(Pair::getKey), lazy);
}
@Override
public HoodieData values() {
return new HoodieListData<>(asStream().map(Pair::getValue), lazy);
}
@Override
public Map countByKey() {
return asStream().collect(Collectors.groupingBy(Pair::getKey, Collectors.counting()));
}
@Override
public HoodiePairData> groupByKey() {
Collector, ?, List> mappingCollector = Collectors.mapping(Pair::getValue, Collectors.toList());
Collector, ?, Map>> groupingCollector =
Collectors.groupingBy(Pair::getKey, mappingCollector);
Map> groupedByKey = asStream().collect(groupingCollector);
return new HoodieListPairData<>(
groupedByKey.entrySet().stream().map(e -> Pair.of(e.getKey(), e.getValue())),
lazy
);
}
@Override
public HoodiePairData reduceByKey(SerializableBiFunction combiner, int parallelism) {
Map> reducedMap = asStream().collect(
Collectors.groupingBy(
Pair::getKey,
HashMap::new,
Collectors.mapping(Pair::getValue, Collectors.reducing(combiner::apply))));
return new HoodieListPairData<>(
reducedMap.entrySet()
.stream()
.map(e -> Pair.of(e.getKey(), e.getValue().orElse(null))),
lazy
);
}
@Override
public HoodieData map(SerializableFunction, O> func) {
Function, O> uncheckedMapper = throwingMapWrapper(func);
return new HoodieListData<>(asStream().map(uncheckedMapper), lazy);
}
@Override
public HoodiePairData mapValues(SerializableFunction func) {
Function uncheckedMapper = throwingMapWrapper(func);
return new HoodieListPairData<>(asStream().map(p -> Pair.of(p.getKey(), uncheckedMapper.apply(p.getValue()))), lazy);
}
public HoodiePairData flatMapValues(SerializableFunction> func) {
Function> uncheckedMapper = throwingMapWrapper(func);
return new HoodieListPairData<>(asStream().flatMap(p -> {
Iterator mappedValuesIterator = uncheckedMapper.apply(p.getValue());
Iterator> mappedPairsIterator =
new MappingIterator<>(mappedValuesIterator, w -> Pair.of(p.getKey(), w));
return StreamSupport.stream(
Spliterators.spliteratorUnknownSize(mappedPairsIterator, Spliterator.ORDERED), true);
}), lazy);
}
@Override
public HoodiePairData mapToPair(SerializablePairFunction, L, W> mapToPairFunc) {
return new HoodieListPairData<>(asStream().map(p -> throwingMapToPairWrapper(mapToPairFunc).apply(p)), lazy);
}
@Override
public HoodiePairData>> leftOuterJoin(HoodiePairData other) {
ValidationUtils.checkArgument(other instanceof HoodieListPairData);
// Transform right-side container to a multi-map of [[K]] to [[List]] values
HashMap> rightStreamMap = ((HoodieListPairData) other).asStream().collect(
Collectors.groupingBy(
Pair::getKey,
HashMap::new,
Collectors.mapping(Pair::getValue, Collectors.toList())));
Stream>>> leftOuterJoined = asStream().flatMap(pair -> {
K key = pair.getKey();
V leftValue = pair.getValue();
List rightValues = rightStreamMap.get(key);
if (rightValues == null) {
return Stream.of(Pair.of(key, Pair.of(leftValue, Option.empty())));
} else {
return rightValues.stream().map(rightValue ->
Pair.of(key, Pair.of(leftValue, Option.of(rightValue))));
}
});
return new HoodieListPairData<>(leftOuterJoined, lazy);
}
@Override
public long count() {
return super.count();
}
@Override
public List> collectAsList() {
return super.collectAsList();
}
public static HoodieListPairData lazy(List> data) {
return new HoodieListPairData<>(data, true);
}
public static HoodieListPairData eager(List> data) {
return new HoodieListPairData<>(data, false);
}
public static HoodieListPairData lazy(Map> data) {
return new HoodieListPairData<>(explode(data), true);
}
public static HoodieListPairData eager(Map> data) {
return new HoodieListPairData<>(explode(data), false);
}
private static Stream> explode(Map> data) {
return data.entrySet().stream()
.flatMap(e -> e.getValue().stream().map(v -> Pair.of(e.getKey(), v)));
}
}