// Many resources are needed to download a project. Please understand that we have to cover our server costs. Thank you in advance. Project price: only $1.
// You can buy this project and download/modify it as often as you want.
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.hudi.common.data;
import org.apache.hudi.common.function.SerializableFunction;
import org.apache.hudi.common.function.SerializablePairFunction;
import org.apache.hudi.common.util.ValidationUtils;
import org.apache.hudi.common.util.collection.Pair;
import java.util.Iterator;
import java.util.List;
import java.util.Spliterator;
import java.util.Spliterators;
import java.util.function.Function;
import java.util.stream.Stream;
import java.util.stream.StreamSupport;
import static org.apache.hudi.common.function.FunctionWrapper.throwingMapToPairWrapper;
import static org.apache.hudi.common.function.FunctionWrapper.throwingMapWrapper;
/**
* In-memory implementation of {@link HoodieData} holding internally a {@link Stream} of objects.
*
* {@link HoodieListData} can have either of the 2 execution semantics:
*
*
*
Eager: with every operation being executed right away
*
Lazy: with every operation being "stacked up", with it execution postponed until
* "terminal" operation is invoked
*
*
* NOTE: This is an in-memory counterpart for {@code HoodieJavaRDD}, and it strives to provide
* similar semantic as RDD container -- all intermediate (non-terminal, not de-referencing
* the stream like "collect", "groupBy", etc) operations are executed *lazily*.
* This allows to make sure that compute/memory churn is minimal since only necessary
* computations will ultimately be performed.
*
* Please note, however, that while RDD container allows the same collection to be
* de-referenced more than once (ie terminal operation invoked more than once),
* {@link HoodieListData} allows that only when instantiated w/ an eager execution semantic.
*
* @param type of object.
*/
public class HoodieListData extends HoodieBaseListData implements HoodieData {
private HoodieListData(List data, boolean lazy) {
super(data, lazy);
}
HoodieListData(Stream dataStream, boolean lazy) {
super(dataStream, lazy);
}
/**
* Creates instance of {@link HoodieListData} bearing *eager* execution semantic
*
* @param listData a {@link List} of objects in type T
* @param type of object
* @return a new instance containing the {@link List} reference
*/
public static HoodieListData eager(List listData) {
return new HoodieListData<>(listData, false);
}
/**
* Creates instance of {@link HoodieListData} bearing *lazy* execution semantic
*
* @param listData a {@link List} of objects in type T
* @param type of object
* @return a new instance containing the {@link List} reference
*/
public static HoodieListData lazy(List listData) {
return new HoodieListData<>(listData, true);
}
@Override
public void persist(String level) {
// No OP
}
@Override
public void unpersist() {
// No OP
}
@Override
public HoodieData map(SerializableFunction func) {
return new HoodieListData<>(asStream().map(throwingMapWrapper(func)), lazy);
}
@Override
public HoodieData mapPartitions(SerializableFunction, Iterator> func, boolean preservesPartitioning) {
Function, Iterator> mapper = throwingMapWrapper(func);
return new HoodieListData<>(
StreamSupport.stream(
Spliterators.spliteratorUnknownSize(
mapper.apply(asStream().iterator()), Spliterator.ORDERED), true),
lazy
);
}
@Override
public HoodieData flatMap(SerializableFunction> func) {
Function> mapper = throwingMapWrapper(func);
Stream mappedStream = asStream().flatMap(e ->
StreamSupport.stream(
Spliterators.spliteratorUnknownSize(mapper.apply(e), Spliterator.ORDERED), true));
return new HoodieListData<>(mappedStream, lazy);
}
@Override
public HoodiePairData flatMapToPair(SerializableFunction>> func) {
Function>> mapper = throwingMapWrapper(func);
Stream> mappedStream = asStream().flatMap(e ->
StreamSupport.stream(
Spliterators.spliteratorUnknownSize(mapper.apply(e), Spliterator.ORDERED), true));
return new HoodieListPairData<>(mappedStream, lazy);
}
@Override
public HoodiePairData mapToPair(SerializablePairFunction func) {
Function> throwableMapToPairFunc = throwingMapToPairWrapper(func);
return new HoodieListPairData<>(asStream().map(throwableMapToPairFunc), lazy);
}
@Override
public HoodieData distinct() {
return new HoodieListData<>(asStream().distinct(), lazy);
}
@Override
public HoodieData distinct(int parallelism) {
return distinct();
}
@Override
public HoodieData distinctWithKey(SerializableFunction keyGetter, int parallelism) {
return mapToPair(i -> Pair.of(keyGetter.apply(i), i))
.reduceByKey((value1, value2) -> value1, parallelism)
.values();
}
@Override
public HoodieData filter(SerializableFunction filterFunc) {
return new HoodieListData<>(asStream().filter(r -> throwingMapWrapper(filterFunc).apply(r)), lazy);
}
@Override
public HoodieData union(HoodieData other) {
ValidationUtils.checkArgument(other instanceof HoodieListData);
return new HoodieListData<>(Stream.concat(asStream(), ((HoodieListData)other).asStream()), lazy);
}
@Override
public HoodieData repartition(int parallelism) {
// no op
return this;
}
@Override
public boolean isEmpty() {
return super.isEmpty();
}
@Override
public long count() {
return super.count();
}
@Override
public int getNumPartitions() {
return 1;
}
@Override
public List collectAsList() {
return super.collectAsList();
}
}