All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.hudi.common.data.HoodieListPairData Maven / Gradle / Ivy

There is a newer version: 1.0.0-beta1
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hudi.common.data;

import org.apache.hudi.common.function.SerializableBiFunction;
import org.apache.hudi.common.function.SerializableFunction;
import org.apache.hudi.common.function.SerializablePairFunction;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.ValidationUtils;
import org.apache.hudi.common.util.collection.MappingIterator;
import org.apache.hudi.common.util.collection.Pair;

import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Spliterator;
import java.util.Spliterators;
import java.util.function.Function;
import java.util.stream.Collector;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import java.util.stream.StreamSupport;

import static org.apache.hudi.common.function.FunctionWrapper.throwingMapToPairWrapper;
import static org.apache.hudi.common.function.FunctionWrapper.throwingMapWrapper;

/**
 * In-memory implementation of {@link HoodiePairData} holding internally a {@link Stream} of {@link Pair}s.
 *
 * {@link HoodieListData} can have either of the 2 execution semantics:
 *
 * 
    *
  1. Eager: with every operation being executed right away
  2. *
  3. Lazy: with every operation being "stacked up", with it execution postponed until * "terminal" operation is invoked
  4. *
* * * NOTE: This is an in-memory counterpart for {@code HoodieJavaPairRDD}, and it strives to provide * similar semantic as RDD container -- all intermediate (non-terminal, not de-referencing * the stream like "collect", "groupBy", etc) operations are executed *lazily*. * This allows to make sure that compute/memory churn is minimal since only necessary * computations will ultimately be performed. * * Please note, however, that while RDD container allows the same collection to be * de-referenced more than once (ie terminal operation invoked more than once), * {@link HoodieListData} allows that only when instantiated w/ an eager execution semantic. * * @param type of the key in the pair * @param type of the value in the pair */ public class HoodieListPairData extends HoodieBaseListData> implements HoodiePairData { private HoodieListPairData(List> data, boolean lazy) { super(data, lazy); } HoodieListPairData(Stream> dataStream, boolean lazy) { super(dataStream, lazy); } @Override public List> get() { return collectAsList(); } @Override public void persist(String cacheConfig) { // no-op } @Override public void unpersist() { // no-op } @Override public HoodieData keys() { return new HoodieListData<>(asStream().map(Pair::getKey), lazy); } @Override public HoodieData values() { return new HoodieListData<>(asStream().map(Pair::getValue), lazy); } @Override public Map countByKey() { return asStream().collect(Collectors.groupingBy(Pair::getKey, Collectors.counting())); } @Override public HoodiePairData> groupByKey() { Collector, ?, List> mappingCollector = Collectors.mapping(Pair::getValue, Collectors.toList()); Collector, ?, Map>> groupingCollector = Collectors.groupingBy(Pair::getKey, mappingCollector); Map> groupedByKey = asStream().collect(groupingCollector); return new HoodieListPairData<>( groupedByKey.entrySet().stream().map(e -> Pair.of(e.getKey(), e.getValue())), lazy ); } @Override public HoodiePairData reduceByKey(SerializableBiFunction combiner, int parallelism) { Map> reducedMap = asStream().collect( Collectors.groupingBy( Pair::getKey, HashMap::new, Collectors.mapping(Pair::getValue, Collectors.reducing(combiner::apply)))); return new HoodieListPairData<>( reducedMap.entrySet() .stream() .map(e -> Pair.of(e.getKey(), e.getValue().orElse(null))), lazy ); } @Override public HoodieData map(SerializableFunction, O> func) { Function, O> uncheckedMapper = throwingMapWrapper(func); return new HoodieListData<>(asStream().map(uncheckedMapper), lazy); } @Override public HoodiePairData mapValues(SerializableFunction func) { Function uncheckedMapper = throwingMapWrapper(func); return new HoodieListPairData<>(asStream().map(p -> Pair.of(p.getKey(), uncheckedMapper.apply(p.getValue()))), lazy); } public HoodiePairData flatMapValues(SerializableFunction> func) { Function> uncheckedMapper = throwingMapWrapper(func); return new HoodieListPairData<>(asStream().flatMap(p -> { Iterator mappedValuesIterator = uncheckedMapper.apply(p.getValue()); Iterator> mappedPairsIterator = new MappingIterator<>(mappedValuesIterator, w -> Pair.of(p.getKey(), w)); return StreamSupport.stream( Spliterators.spliteratorUnknownSize(mappedPairsIterator, Spliterator.ORDERED), true); }), lazy); } @Override public HoodiePairData mapToPair(SerializablePairFunction, L, W> mapToPairFunc) { return new HoodieListPairData<>(asStream().map(p -> throwingMapToPairWrapper(mapToPairFunc).apply(p)), lazy); } @Override public HoodiePairData>> leftOuterJoin(HoodiePairData other) { ValidationUtils.checkArgument(other instanceof HoodieListPairData); // Transform right-side container to a multi-map of [[K]] to [[List]] values HashMap> rightStreamMap = ((HoodieListPairData) other).asStream().collect( Collectors.groupingBy( Pair::getKey, HashMap::new, Collectors.mapping(Pair::getValue, Collectors.toList()))); Stream>>> leftOuterJoined = asStream().flatMap(pair -> { K key = pair.getKey(); V leftValue = pair.getValue(); List rightValues = rightStreamMap.get(key); if (rightValues == null) { return Stream.of(Pair.of(key, Pair.of(leftValue, Option.empty()))); } else { return rightValues.stream().map(rightValue -> Pair.of(key, Pair.of(leftValue, Option.of(rightValue)))); } }); return new HoodieListPairData<>(leftOuterJoined, lazy); } @Override public long count() { return super.count(); } @Override public List> collectAsList() { return super.collectAsList(); } public static HoodieListPairData lazy(List> data) { return new HoodieListPairData<>(data, true); } public static HoodieListPairData eager(List> data) { return new HoodieListPairData<>(data, false); } public static HoodieListPairData lazy(Map> data) { return new HoodieListPairData<>(explode(data), true); } public static HoodieListPairData eager(Map> data) { return new HoodieListPairData<>(explode(data), false); } private static Stream> explode(Map> data) { return data.entrySet().stream() .flatMap(e -> e.getValue().stream().map(v -> Pair.of(e.getKey(), v))); } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy