All downloads are free. Search and download functionality uses the official Maven repository.

org.apache.crunch.PGroupedTable Maven / Gradle / Ivy

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.crunch;

import org.apache.crunch.Aggregator;
import org.apache.crunch.types.PGroupedTableType;
import org.apache.crunch.types.PType;

/**
 * The Crunch representation of a grouped {@link PTable}, which corresponds to the output of
 * the shuffle phase of a MapReduce job.
 */
public interface PGroupedTable extends PCollection>> {

  /**
   * Combines the values of this grouping using the given {@code CombineFn}.
   * 
   * @param combineFn
   *          The combiner function
   * @return A {@code PTable} where each key has a single value
   */
  PTable combineValues(CombineFn combineFn);
  
  /**
   * Combines and reduces the values of this grouping using the given {@code CombineFn} instances.
   * 
   * @param combineFn
   *          The combiner function during the combine phase
   * @param reduceFn
   *          The combiner function during the reduce phase
   * @return A {@code PTable} where each key has a single value
   */
  PTable combineValues(CombineFn combineFn, CombineFn reduceFn);

  /**
   * Combine the values in each group using the given {@link Aggregator}.
   *
   * @param aggregator The function to use
   * @return A {@link PTable} where each group key maps to an aggregated
   *         value. Group keys may be repeated if an aggregator returns
   *         more than one value.
   */
  PTable combineValues(Aggregator aggregator);

  /**
   * Combine and reduces the values in each group using the given {@link Aggregator} instances.
   *
   * @param combineAggregator The aggregator to use during the combine phase
   * @param reduceAggregator The aggregator to use during the reduce phase
   * @return A {@link PTable} where each group key maps to an aggregated
   *         value. Group keys may be repeated if an aggregator returns
   *         more than one value.
   */
  PTable combineValues(Aggregator combineAggregator, Aggregator reduceAggregator);

  /**
   * Maps the {@code Iterable} elements of each record to a new type. Just like
   * any {@code parallelDo} operation on a {@code PGroupedTable}, this may only be
   * called once.
   * 
   * @param mapFn The mapping function
   * @param ptype The serialization information for the returned data
   * @return A new {@code PTable} instance
   */
   PTable mapValues(MapFn, U> mapFn, PType ptype);

  /**
   * Maps the {@code Iterable} elements of each record to a new type. Just like
   * any {@code parallelDo} operation on a {@code PGroupedTable}, this may only be
   * called once.
   * 
   * @param name A name for this operation
   * @param mapFn The mapping function
   * @param ptype The serialization information for the returned data
   * @return A new {@code PTable} instance
   */
   PTable mapValues(String name, MapFn, U> mapFn, PType ptype);

  /**
   * Convert this grouping back into a multimap.
   * 
   * @return an ungrouped version of the data in this {@code PGroupedTable}.
   */
  PTable ungroup();
  
  /**
   * Return the {@code PGroupedTableType} containing serialization information for
   * this {@code PGroupedTable}.
   */
  PGroupedTableType getGroupedTableType();
}