All downloads are free. The search and download functionality uses the official Maven repository.

org.apache.hudi.common.data.HoodieData Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.hudi.common.data;

import org.apache.hudi.common.engine.HoodieEngineContext;
import org.apache.hudi.common.function.SerializableFunction;
import org.apache.hudi.common.function.SerializablePairFunction;
import org.apache.hudi.common.util.collection.Pair;

import java.io.Serializable;
import java.util.Iterator;
import java.util.List;
import java.util.Objects;

/**
 * An interface abstracting a container holding a collection of objects of type {@code T}
 * allowing to perform common transformation on it.
 *
 * This abstraction provides common API implemented by
 * 
    *
  1. In-memory implementation ({@code HoodieListData}, {@code HoodieListPairData}), where all objects * are held in-memory by the executing process
  2. *
  3. RDD-based implementation ({@code HoodieJavaRDD}, etc)
  4. , where underlying collection is held * by an RDD allowing to execute transformations using Spark engine on the cluster *
* * All implementations provide for consistent semantic, where *
    *
  • All non-terminal* operations are executed lazily (for ex, {@code map}, {@code filter}, etc)
  • *
  • All terminal operations are executed eagerly, executing all previously accumulated transformations. * Note that, collection could not be re-used after invoking terminal operation on it.
  • *
* * @param type of object */ public interface HoodieData extends Serializable { /** * Get the {@link HoodieData}'s unique non-negative identifier. -1 indicates invalid id. */ int getId(); /** * Persists the data w/ provided {@code level} (if applicable). * * Use this method only when you call {@link #unpersist()} at some later point for the same {@link HoodieData}. * Otherwise, use {@link #persist(String, HoodieEngineContext, HoodieDataCacheKey)} instead for auto-unpersist * at the end of a client write operation. */ void persist(String level); /** * Persists the data w/ provided {@code level} (if applicable), and cache the data's ids within the {@code engineContext}. */ void persist(String level, HoodieEngineContext engineContext, HoodieDataCacheKey cacheKey); /** * Un-persists the data (if previously persisted) */ void unpersist(); /** * Returns whether the collection is empty. */ boolean isEmpty(); /** * Returns number of objects held in the collection *

* NOTE: This is a terminal operation */ long count(); /** * @return the number of data partitions in the engine-specific representation. */ int getNumPartitions(); /** * @return the deduce number of shuffle partitions */ int deduceNumPartitions(); /** * Maps every element in the collection using provided mapping {@code func}. *

* This is an intermediate operation * * @param func serializable map function * @param output object type * @return {@link HoodieData} holding mapped elements */ HoodieData map(SerializableFunction func); /** * Maps every element in the collection's partition (if applicable) by applying provided * mapping {@code func} to every collection's partition * * This is an intermediate operation * * @param func serializable map function accepting {@link Iterator} of a single * partition's elements and returning a new {@link Iterator} mapping * every element of the partition into a new one * @param preservesPartitioning whether to preserve partitioning in the resulting collection * @param output object type * @return {@link HoodieData} holding mapped elements */ HoodieData mapPartitions(SerializableFunction, Iterator> func, boolean preservesPartitioning); /** * Maps every element in the collection into a collection of the new elements using provided * mapping {@code func}, subsequently flattening the result (by concatenating) into a single * collection * * This is an intermediate operation * * @param func serializable function mapping every element {@link T} into {@code Iterator} * @param output object type * @return {@link HoodieData} holding mapped elements */ HoodieData flatMap(SerializableFunction> func); /** * Maps every element in the collection into a collection of the {@link Pair}s of new elements * using provided mapping {@code func}, subsequently flattening the result (by concatenating) into * a single collection * * NOTE: That this operation will convert container from {@link HoodieData} to {@link HoodiePairData} * * This is an intermediate operation */ HoodiePairData flatMapToPair(SerializableFunction>> func); /** * Maps every element in the collection using provided mapping {@code func} into a {@link Pair} * of elements {@code K} and {@code V} *

* This is an intermediate operation * * @param func serializable map function * @param key type of the pair * @param value type of the pair * @return {@link HoodiePairData} holding mapped elements */ HoodiePairData mapToPair(SerializablePairFunction func); /** * Returns new {@link HoodieData} collection holding only distinct objects of the original one * * This is a stateful intermediate operation */ HoodieData distinct(); /** * Returns new {@link HoodieData} collection holding only distinct objects of the original one * * This is a stateful intermediate operation */ HoodieData distinct(int parallelism); /** * Returns new instance of {@link HoodieData} collection only containing elements matching provided * {@code filterFunc} (ie ones it returns true on) * * @param filterFunc filtering func either accepting or rejecting the elements * @return {@link HoodieData} holding filtered elements */ HoodieData filter(SerializableFunction filterFunc); /** * Unions {@link HoodieData} with another instance of {@link HoodieData}. * Note that, it's only able to union same underlying collection implementations. 
* * This is a stateful intermediate operation * * @param other {@link HoodieData} collection * @return {@link HoodieData} holding superset of elements of this and {@code other} collections */ HoodieData union(HoodieData other); /** * Collects results of the underlying collection into a {@link List} * * This is a terminal operation */ List collectAsList(); /** * Re-partitions underlying collection (if applicable) making sure new {@link HoodieData} has * exactly {@code parallelism} partitions * * @param parallelism target number of partitions in the underlying collection * @return {@link HoodieData} holding re-partitioned collection */ HoodieData repartition(int parallelism); default HoodieData distinctWithKey(SerializableFunction keyGetter, int parallelism) { return mapToPair(i -> Pair.of(keyGetter.apply(i), i)) .reduceByKey((value1, value2) -> value1, parallelism) .values(); } /** * The key used in a caching map to identify a {@link HoodieData}. * * At the end of a write operation, we manually unpersist the {@link HoodieData} associated with that writer. * Therefore, in multi-writer scenario, we need to use both {@code basePath} and {@code instantTime} to identify {@link HoodieData}s. */ class HoodieDataCacheKey implements Serializable { public static HoodieDataCacheKey of(String basePath, String instantTime) { return new HoodieDataCacheKey(basePath, instantTime); } private final String basePath; private final String instantTime; private HoodieDataCacheKey(String basePath, String instantTime) { this.basePath = basePath; this.instantTime = instantTime; } @Override public boolean equals(Object o) { if (this == o) { return true; } if (o == null || getClass() != o.getClass()) { return false; } HoodieDataCacheKey that = (HoodieDataCacheKey) o; return basePath.equals(that.basePath) && instantTime.equals(that.instantTime); } @Override public int hashCode() { return Objects.hash(basePath, instantTime); } } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy