All downloads are free. Search and download functionality uses the official Maven repository.

org.apache.hudi.common.data.HoodiePairData Maven / Gradle / Ivy

The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.hudi.common.data;

import org.apache.hudi.common.function.SerializableBiFunction;
import org.apache.hudi.common.function.SerializableFunction;
import org.apache.hudi.common.function.SerializablePairFunction;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.collection.Pair;

import java.io.Serializable;
import java.util.List;
import java.util.Map;

/**
 * An abstraction for pairs of key in type K and value in type V to store the reference
 * and do transformation.
 *
 * @param  type of key.
 * @param  type of value.
 */
public interface HoodiePairData extends Serializable {
  /**
   * @return the collection of pairs.
   */
  Object get();

  /**
   * Persists the data (if applicable)
   *
   * @param cacheConfig config value for caching.
   */
  void persist(String cacheConfig);

  /**
   * Un-persists the data (if applicable)
   */
  void unpersist();

  /**
   * Returns a {@link HoodieData} holding the key from every corresponding pair
   */
  HoodieData keys();

  /**
   * Returns a {@link HoodieData} holding the value from every corresponding pair
   */
  HoodieData values();

  /**
   * Returns number of held pairs
   */
  long count();

  /**
   * Counts the number of pairs grouping them by key
   */
  Map countByKey();

  /**
   * Groups the values for each key in the dataset into a single sequence
   */
  HoodiePairData> groupByKey();

  /**
   * Reduces original sequence by de-duplicating the pairs w/ the same key, using provided
   * binary operator {@code combiner}. Returns an instance of {@link HoodiePairData} holding
   * the "de-duplicated" pairs, ie only pairs with unique keys.
   *
   * @param combiner method to combine values of the pairs with the same key
   * @param parallelism target parallelism (if applicable)
   */
  HoodiePairData reduceByKey(SerializableBiFunction combiner, int parallelism);

  /**
   * Maps key-value pairs of this {@link HoodiePairData} container leveraging provided mapper
   *
   * NOTE: That this returns {@link HoodieData} and not {@link HoodiePairData}
   */
   HoodieData map(SerializableFunction, O> func);

  /**
   * Maps values of this {@link HoodiePairData} container leveraging provided mapper
   */
   HoodiePairData mapValues(SerializableFunction func);

  /**
   * @param mapToPairFunc serializable map function to generate another pair.
   * @param            new key type.
   * @param            new value type.
   * @return containing the result. Actual execution may be deferred.
   */
   HoodiePairData mapToPair(
      SerializablePairFunction, L, W> mapToPairFunc);

  /**
   * Performs a left outer join of this dataset against {@code other}.
   *
   * For each element (k, v) in this, the resulting {@link HoodiePairData} will either contain all
   * pairs {@code (k, (v, Some(w)))} for every {@code w} in the {@code other}, or the pair {@code (k, (v, None))}
   * if no elements in {@code other} have the pair w/ a key {@code k}
   *
   * @param other the other {@link HoodiePairData}
   * @param    value type of the other {@link HoodiePairData}
   * @return containing the result of the left outer join
   */
   HoodiePairData>> leftOuterJoin(HoodiePairData other);

  /**
   * Collects results of the underlying collection into a {@link List>}
   *
   * This is a terminal operation
   */
  List> collectAsList();

  /**
   * @return the deduce number of shuffle partitions
   */
  int deduceNumPartitions();
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy