// All downloads are free. The search and download functionality uses the official Maven repository.

// org.apache.crunch.PTable (Maven / Gradle / Ivy)

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.crunch;

import java.util.Collection;
import java.util.Map;

import org.apache.crunch.types.PTableType;
import org.apache.crunch.types.PType;

/**
 * A sub-interface of {@code PCollection} that represents an immutable,
 * distributed multi-map of keys and values.
 *
 */
public interface PTable extends PCollection> {

  /**
   Returns a {@code PTable} instance that acts as the union of this
   * {@code PTable} and the other {@code PTable}s.
   */
  PTable union(PTable other);
  
  /**
   * Returns a {@code PTable} instance that acts as the union of this
   * {@code PTable} and the input {@code PTable}s.
   */
  PTable union(PTable... others);

  /**
   * Performs a grouping operation on the keys of this table.
   *
   * @return a {@code PGroupedTable} instance that represents the grouping
   */
  PGroupedTable groupByKey();

  /**
   * Performs a grouping operation on the keys of this table, using the given
   * number of partitions.
   *
   * @param numPartitions
   *          The number of partitions for the data.
   * @return a {@code PGroupedTable} instance that represents this grouping
   */
  PGroupedTable groupByKey(int numPartitions);

  /**
   * Performs a grouping operation on the keys of this table, using the
   * additional {@code GroupingOptions} to control how the grouping is executed.
   *
   * @param options
   *          The grouping options to use
   * @return a {@code PGroupedTable} instance that represents the grouping
   */
  PGroupedTable groupByKey(GroupingOptions options);

  /**
   * Writes this {@code PTable} to the given {@code Target}.
   */
  PTable write(Target target);

  /**
   * Writes this {@code PTable} to the given {@code Target}, using the
   * given {@code Target.WriteMode} to handle existing targets.
   */
  PTable write(Target target, Target.WriteMode writeMode);

  PTable cache();

  PTable cache(CachingOptions options);

  /**
   * Returns the {@code PTableType} of this {@code PTable}.
   */
  PTableType getPTableType();

  /**
   * Returns the {@code PType} of the key.
   */
  PType getKeyType();

  /**
   * Returns the {@code PType} of the value.
   */
  PType getValueType();

  /**
   * Returns a {@code PTable} that has the same keys as this instance, but
   * uses the given function to map the values.
   */
   PTable mapValues(MapFn mapFn, PType ptype);

  /**
   * Returns a {@code PTable} that has the same keys as this instance, but
   * uses the given function to map the values.
   */
   PTable mapValues(String name, MapFn mapFn, PType ptype);

  /**
   * Returns a {@code PTable} that has the same values as this instance, but
   * uses the given function to map the keys.
   */
   PTable mapKeys(MapFn mapFn, PType ptype);

  /**
   * Returns a {@code PTable} that has the same values as this instance, but
   * uses the given function to map the keys.
   */
   PTable mapKeys(String name, MapFn mapFn, PType ptype);
  
  /**
   * Aggregate all of the values with the same key into a single key-value pair
   * in the returned PTable.
   */
  PTable> collectValues();

  /**
   * Apply the given filter function to this instance and return the resulting
   * {@code PTable}.
   */
  PTable filter(FilterFn> filterFn);

  /**
   * Apply the given filter function to this instance and return the resulting
   * {@code PTable}.
   *
   * @param name
   *          An identifier for this processing step
   * @param filterFn
   *          The {@code FilterFn} to apply
   */
  PTable filter(String name, FilterFn> filterFn);

  /**
   * Returns a PTable made up of the pairs in this PTable with the largest value
   * field.
   *
   * @param count
   *          The number of pairs to return
   */
  PTable top(int count);

  /**
   * Returns a PTable made up of the pairs in this PTable with the smallest
   * value field.
   *
   * @param count
   *          The number of pairs to return
   */
  PTable bottom(int count);

  /**
   * Perform an inner join on this table and the one passed in as an argument on
   * their common keys.
   */
   PTable> join(PTable other);

  /**
   * Co-group operation with the given table.
   * 

* Note: If the given table contains keys that are not present in this PTable, an empty * PCollection is set for the relationship. */ PTable, Collection>> cogroup(PTable other); /** * Returns a {@link PCollection} made up of the keys in this PTable. */ PCollection keys(); /** * Returns a {@link PCollection} made up of the values in this PTable. */ PCollection values(); /** * Returns a Map made up of the keys and values in this PTable. *

* Note: The contents of the returned map may not be exactly the same * as this PTable, as a PTable is a multi-map (i.e. can contain multiple * values for a single key). */ Map materializeToMap(); /** * Returns a {@link PObject} encapsulating a {@link Map} made up of the keys and values in this * {@code PTable}. *

Note:The contents of the returned map may not be exactly the same as this PTable, * as a PTable is a multi-map (i.e. can contain multiple values for a single key). *

* * @return The {@code PObject} encapsulating a {@code Map} made up of the keys and values in * this {@code PTable}. */ PObject> asMap(); }