All downloads are free. Search and download functionality uses the official Maven repository.

org.apache.crunch.PCollection Maven / Gradle / Ivy

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.crunch;

import java.util.Collection;

import org.apache.crunch.types.PTableType;
import org.apache.crunch.types.PType;
import org.apache.crunch.types.PTypeFamily;

/**
 * A representation of an immutable, distributed collection of elements that is
 * the fundamental target of computations in Crunch.
 *
 */
public interface PCollection {
  /**
   * Returns the {@code Pipeline} associated with this PCollection.
   */
  Pipeline getPipeline();

  /**
   * Returns a {@code PCollection} instance that acts as the union of this
   * {@code PCollection} and the given {@code PCollection}.
   */
  PCollection union(PCollection other);
  
  /**
   * Returns a {@code PCollection} instance that acts as the union of this
   * {@code PCollection} and the input {@code PCollection}s.
   */
  PCollection union(PCollection... collections);

  /**
   * Applies the given doFn to the elements of this {@code PCollection} and
   * returns a new {@code PCollection} that is the output of this processing.
   *
   * @param doFn
   *          The {@code DoFn} to apply
   * @param type
   *          The {@link PType} of the resulting {@code PCollection}
   * @return a new {@code PCollection}
   */
   PCollection parallelDo(DoFn doFn, PType type);

  /**
   * Applies the given doFn to the elements of this {@code PCollection} and
   * returns a new {@code PCollection} that is the output of this processing.
   *
   * @param name
   *          An identifier for this processing step, useful for debugging
   * @param doFn
   *          The {@code DoFn} to apply
   * @param type
   *          The {@link PType} of the resulting {@code PCollection}
   * @return a new {@code PCollection}
   */
   PCollection parallelDo(String name, DoFn doFn, PType type);
  
  /**
   * Applies the given doFn to the elements of this {@code PCollection} and
   * returns a new {@code PCollection} that is the output of this processing.
   *
   * @param name
   *          An identifier for this processing step, useful for debugging
   * @param doFn
   *          The {@code DoFn} to apply
   * @param type
   *          The {@link PType} of the resulting {@code PCollection}
   * @param options
   *          Optional information that is needed for certain pipeline operations
   * @return a new {@code PCollection}
   */
   PCollection parallelDo(String name, DoFn doFn, PType type,
      ParallelDoOptions options);

  /**
   * Similar to the other {@code parallelDo} instance, but returns a
   * {@code PTable} instance instead of a {@code PCollection}.
   *
   * @param doFn
   *          The {@code DoFn} to apply
   * @param type
   *          The {@link PTableType} of the resulting {@code PTable}
   * @return a new {@code PTable}
   */
   PTable parallelDo(DoFn> doFn, PTableType type);

  /**
   * Similar to the other {@code parallelDo} instance, but returns a
   * {@code PTable} instance instead of a {@code PCollection}.
   *
   * @param name
   *          An identifier for this processing step
   * @param doFn
   *          The {@code DoFn} to apply
   * @param type
   *          The {@link PTableType} of the resulting {@code PTable}
   * @return a new {@code PTable}
   */
   PTable parallelDo(String name, DoFn> doFn, PTableType type);
  
  /**
   * Similar to the other {@code parallelDo} instance, but returns a
   * {@code PTable} instance instead of a {@code PCollection}.
   *
   * @param name
   *          An identifier for this processing step
   * @param doFn
   *          The {@code DoFn} to apply
   * @param type
   *          The {@link PTableType} of the resulting {@code PTable}
   * @param options
   *          Optional information that is needed for certain pipeline operations
   * @return a new {@code PTable}
   */
   PTable parallelDo(String name, DoFn> doFn, PTableType type,
      ParallelDoOptions options);


  /**
   * Write the contents of this {@code PCollection} to the given {@code Target},
   * using the storage format specified by the target.
   *
   * @param target
   *          The target to write to
   */
  PCollection write(Target target);

  /**
   * Write the contents of this {@code PCollection} to the given {@code Target},
   * using the given {@code Target.WriteMode} to handle existing
   * targets.
   * 
   * @param target
   *          The target
   * @param writeMode
   *          The rule for handling existing outputs at the target location
   */
  PCollection write(Target target, Target.WriteMode writeMode);
  
  /**
   * Returns a reference to the data set represented by this PCollection that
   * may be used by the client to read the data locally.
   */
  Iterable materialize();

  /**
   * Marks this data as cached using the default {@link CachingOptions}. Cached {@code PCollection}s will only
   * be processed once, and then their contents will be saved so that downstream code can process them many times.
   *
   * @return this {@code PCollection} instance
   */
  PCollection cache();

  /**
   * Marks this data as cached using the given {@code CachingOptions}. Cached {@code PCollection}s will only
   * be processed once and then their contents will be saved so that downstream code can process them many times.
   *
   * @param options the options that control the cache settings for the data
   * @return this {@code PCollection} instance
   */
  PCollection cache(CachingOptions options);

  /**
   * @return A {@code PObject} encapsulating an in-memory {@link Collection} containing the values
   * of this {@code PCollection}.
   */
  PObject> asCollection();

  /**
   * @return The first element of this {@code PCollection}.
   */
  PObject first();

  /**
   * Adds the materialized data in this {@code PCollection} as a dependency to the given
   * {@code PipelineCallable} and registers it with the {@code Pipeline} associated with this
   * instance.
   *
   * @param label the label to use inside of the PipelineCallable for referencing this PCollection
   * @param pipelineCallable the function itself
   *
   * @return The value of the {@code getOutput} function on the given argument.
   */
   Output sequentialDo(String label, PipelineCallable pipelineCallable);

  /**
   * @return A reference to the data in this instance that can be read from a job running
   * on a cluster.
   *
   * @param materialize If true, materialize this data before returning a reference to it
   */
  ReadableData asReadable(boolean materialize);

  /**
   * Returns the {@code PType} of this {@code PCollection}.
   */
  PType getPType();

  /**
   * Returns the {@code PTypeFamily} of this {@code PCollection}.
   */
  PTypeFamily getTypeFamily();

  /**
   * Returns the size of the data represented by this {@code PCollection} in
   * bytes.
   */
  long getSize();

  /**
   * Returns the number of elements represented by this {@code PCollection}.
   *
   * @return An {@code PObject} containing the number of elements in this {@code PCollection}.
   */
  PObject length();

  /**
   * Returns a shorthand name for this PCollection.
   */
  String getName();

  /**
   * Apply the given filter function to this instance and return the resulting
   * {@code PCollection}.
   */
  PCollection filter(FilterFn filterFn);

  /**
   * Apply the given filter function to this instance and return the resulting
   * {@code PCollection}.
   *
   * @param name
   *          An identifier for this processing step
   * @param filterFn
   *          The {@code FilterFn} to apply
   */
  PCollection filter(String name, FilterFn filterFn);

  /**
   * Apply the given map function to each element of this instance in order to
   * create a {@code PTable}.
   */
   PTable by(MapFn extractKeyFn, PType keyType);

  /**
   * Apply the given map function to each element of this instance in order to
   * create a {@code PTable}.
   *
   * @param name
   *          An identifier for this processing step
   * @param extractKeyFn
   *          The {@code MapFn} to apply
   */
   PTable by(String name, MapFn extractKeyFn, PType keyType);

  /**
   * Returns a {@code PTable} instance that contains the counts of each unique
   * element of this PCollection.
   */
  PTable count();

  /**
   * Returns a {@code PObject} of the maximum element of this instance.
   */
  PObject max();

  /**
   * Returns a {@code PObject} of the minimum element of this instance.
   */
  PObject min();
  
  /**
   * Returns a {@code PCollection} that contains the result of aggregating all values in this instance.
   */
  PCollection aggregate(Aggregator aggregator);
  
}