// org.apache.hudi.common.data.HoodiePairData (Maven / Gradle / Ivy)
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.hudi.common.data;
import org.apache.hudi.common.function.SerializableBiFunction;
import org.apache.hudi.common.function.SerializableFunction;
import org.apache.hudi.common.function.SerializablePairFunction;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.collection.Pair;
import java.io.Serializable;
import java.util.List;
import java.util.Map;
/**
* An abstraction for pairs of key in type K and value in type V to store the reference
* and do transformation.
*
* @param type of key.
* @param type of value.
*/
public interface HoodiePairData extends Serializable {
/**
* @return the collection of pairs.
*/
Object get();
/**
* Persists the data (if applicable)
*
* @param cacheConfig config value for caching.
*/
void persist(String cacheConfig);
/**
* Un-persists the data (if applicable)
*/
void unpersist();
/**
* Returns a {@link HoodieData} holding the key from every corresponding pair
*/
HoodieData keys();
/**
* Returns a {@link HoodieData} holding the value from every corresponding pair
*/
HoodieData values();
/**
* Returns number of held pairs
*/
long count();
/**
* Counts the number of pairs grouping them by key
*/
Map countByKey();
/**
* Groups the values for each key in the dataset into a single sequence
*/
HoodiePairData> groupByKey();
/**
* Reduces original sequence by de-duplicating the pairs w/ the same key, using provided
* binary operator {@code combiner}. Returns an instance of {@link HoodiePairData} holding
* the "de-duplicated" pairs, ie only pairs with unique keys.
*
* @param combiner method to combine values of the pairs with the same key
* @param parallelism target parallelism (if applicable)
*/
HoodiePairData reduceByKey(SerializableBiFunction combiner, int parallelism);
/**
* Maps key-value pairs of this {@link HoodiePairData} container leveraging provided mapper
*
* NOTE: That this returns {@link HoodieData} and not {@link HoodiePairData}
*/
HoodieData map(SerializableFunction, O> func);
/**
* Maps values of this {@link HoodiePairData} container leveraging provided mapper
*/
HoodiePairData mapValues(SerializableFunction func);
/**
* @param mapToPairFunc serializable map function to generate another pair.
* @param new key type.
* @param new value type.
* @return containing the result. Actual execution may be deferred.
*/
HoodiePairData mapToPair(
SerializablePairFunction, L, W> mapToPairFunc);
/**
* Performs a left outer join of this dataset against {@code other}.
*
* For each element (k, v) in this, the resulting {@link HoodiePairData} will either contain all
* pairs {@code (k, (v, Some(w)))} for every {@code w} in the {@code other}, or the pair {@code (k, (v, None))}
* if no elements in {@code other} have the pair w/ a key {@code k}
*
* @param other the other {@link HoodiePairData}
* @param value type of the other {@link HoodiePairData}
* @return containing the result of the left outer join
*/
HoodiePairData>> leftOuterJoin(HoodiePairData other);
/**
* Collects results of the underlying collection into a {@link List>}
*
* This is a terminal operation
*/
List> collectAsList();
}