All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.hudi.common.data.HoodieData Maven / Gradle / Ivy

There is a newer version: 1.0.0-beta2
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.hudi.common.data;

import org.apache.hudi.common.function.SerializableFunction;
import org.apache.hudi.common.function.SerializablePairFunction;
import org.apache.hudi.common.util.collection.Pair;

import java.io.Serializable;
import java.util.Iterator;
import java.util.List;

/**
 * An interface abstracting a container holding a collection of objects of type {@code T}
 * allowing to perform common transformation on it.
 *
 * This abstraction provides common API implemented by
 * 
    *
  1. In-memory implementation ({@code HoodieListData}, {@code HoodieListPairData}), where all objects * are held in-memory by the executing process
  2. *
  3. RDD-based implementation ({@code HoodieJavaRDD}, etc)
  4. , where underlying collection is held * by an RDD allowing to execute transformations using Spark engine on the cluster *
* * All implementations provide for consistent semantic, where *
    *
  • All non-terminal* operations are executed lazily (for ex, {@code map}, {@code filter}, etc)
  • *
  • All terminal operations are executed eagerly, executing all previously accumulated transformations. * Note that, collection could not be re-used after invoking terminal operation on it.
  • *
* * @param type of object */ public interface HoodieData extends Serializable { /** * Persists the data w/ provided {@code level} (if applicable) */ void persist(String level); /** * Un-persists the data (if previously persisted) */ void unpersist(); /** * Returns whether the collection is empty. */ boolean isEmpty(); /** * Returns number of objects held in the collection *

* NOTE: This is a terminal operation */ long count(); /** * @return the number of data partitions in the engine-specific representation. */ int getNumPartitions(); /** * Maps every element in the collection using provided mapping {@code func}. *

* This is an intermediate operation * * @param func serializable map function * @param output object type * @return {@link HoodieData} holding mapped elements */ HoodieData map(SerializableFunction func); /** * Maps every element in the collection's partition (if applicable) by applying provided * mapping {@code func} to every collection's partition * * This is an intermediate operation * * @param func serializable map function accepting {@link Iterator} of a single * partition's elements and returning a new {@link Iterator} mapping * every element of the partition into a new one * @param preservesPartitioning whether to preserve partitioning in the resulting collection * @param output object type * @return {@link HoodieData} holding mapped elements */ HoodieData mapPartitions(SerializableFunction, Iterator> func, boolean preservesPartitioning); /** * Maps every element in the collection into a collection of the new elements (provided by * {@link Iterator}) using provided mapping {@code func}, subsequently flattening the result * (by concatenating) into a single collection * * This is an intermediate operation * * @param func serializable function mapping every element {@link T} into {@code Iterator} * @param output object type * @return {@link HoodieData} holding mapped elements */ HoodieData flatMap(SerializableFunction> func); /** * Maps every element in the collection using provided mapping {@code func} into a {@link Pair} * of elements {@code K} and {@code V} *

* This is an intermediate operation * * @param func serializable map function * @param key type of the pair * @param value type of the pair * @return {@link HoodiePairData} holding mapped elements */ HoodiePairData mapToPair(SerializablePairFunction func); /** * Returns new {@link HoodieData} collection holding only distinct objects of the original one * * This is a stateful intermediate operation */ HoodieData distinct(); /** * Returns new {@link HoodieData} collection holding only distinct objects of the original one * * This is a stateful intermediate operation */ HoodieData distinct(int parallelism); /** * Returns new instance of {@link HoodieData} collection only containing elements matching provided * {@code filterFunc} (ie ones it returns true on) * * @param filterFunc filtering func either accepting or rejecting the elements * @return {@link HoodieData} holding filtered elements */ HoodieData filter(SerializableFunction filterFunc); /** * Unions {@link HoodieData} with another instance of {@link HoodieData}. * Note that, it's only able to union same underlying collection implementations. * * This is a stateful intermediate operation * * @param other {@link HoodieData} collection * @return {@link HoodieData} holding superset of elements of this and {@code other} collections */ HoodieData union(HoodieData other); /** * Collects results of the underlying collection into a {@link List} * * This is a terminal operation */ List collectAsList(); /** * Re-partitions underlying collection (if applicable) making sure new {@link HoodieData} has * exactly {@code parallelism} partitions * * @param parallelism target number of partitions in the underlying collection * @return {@link HoodieData} holding re-partitioned collection */ HoodieData repartition(int parallelism); default HoodieData distinctWithKey(SerializableFunction keyGetter, int parallelism) { return mapToPair(i -> Pair.of(keyGetter.apply(i), i)) .reduceByKey((value1, value2) -> value1, parallelism) .values(); } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy