All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.crunch.Pipeline Maven / Gradle / Ivy

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.crunch;

import org.apache.crunch.types.PTableType;
import org.apache.crunch.types.PType;
import org.apache.hadoop.conf.Configuration;

import java.util.List;

/**
 * Manages the state of a pipeline execution.
 * 
 */
public interface Pipeline {

  /**
   * Set the {@code Configuration} to use with this pipeline.
   */
  void setConfiguration(Configuration conf);

  /**
   * Returns the name of this pipeline.
   * 
   * @return Name of the pipeline
   */
  String getName();

  /**
   * Returns the {@code Configuration} instance associated with this pipeline.
   */
  Configuration getConfiguration();

  /**
   * Converts the given {@code Source} into a {@code PCollection} that is
   * available to jobs run using this {@code Pipeline} instance.
   * 
   * @param source
   *          The source of data
   * @return A PCollection that references the given source
   */
   PCollection read(Source source);

  /**
   * Converts the given {@code Source} into a {@code PCollection} that is
   * available to jobs run using this {@code Pipeline} instance.
   *
   * @param source The source of data
   * @param named A name for the returned PCollection
   * @return A PCollection that references the given source
   */
   PCollection read(Source source, String named);

  /**
   * A version of the read method for {@code TableSource} instances that map to
   * {@code PTable}s.
   * 
   * @param tableSource
   *          The source of the data
   * @return A PTable that references the given source
   */
   PTable read(TableSource tableSource);

 /**
   * A version of the read method for {@code TableSource} instances that map to
   * {@code PTable}s.
   *
   * @param tableSource The source of the data
   * @param named A name for the returned PTable
   * @return A PTable that references the given source
   */
   PTable read(TableSource tableSource, String named);

  /**
   * Write the given collection to the given target on the next pipeline run. The
   * system will check to see if the target's location already exists using the
   * {@code WriteMode.DEFAULT} rule for the given {@code Target}.
   * 
   * @param collection
   *          The collection
   * @param target
   *          The output target
   */
  void write(PCollection collection, Target target);

  /**
  * Write the contents of the {@code PCollection} to the given {@code Target},
  * using the storage format specified by the target and the given
  * {@code WriteMode} for cases where the referenced {@code Target}
  * already exists.
  *
  * @param collection
  *          The collection
  * @param target
  *          The target to write to
  * @param writeMode
  *          The strategy to use for handling existing outputs
  */
 void write(PCollection collection, Target target,
     Target.WriteMode writeMode);

 /**
   * Create the given PCollection and read the data it contains into the
   * returned Collection instance for client use.
   * 
   * @param pcollection
   *          The PCollection to materialize
   * @return the data from the PCollection as a read-only Collection
   */
   Iterable materialize(PCollection pcollection);

  /**
   * Caches the given PCollection so that it will be processed at most once
   * during pipeline execution.
   *
   * @param pcollection The PCollection to cache
   * @param options The options for how the cached data is stored
   */
   void cache(PCollection pcollection, CachingOptions options);

  /**
   * Creates an empty {@code PCollection} of the given {@code PType}.
   *
   * @param ptype The PType of the empty PCollection
   * @return A valid PCollection with no contents
   */
   PCollection emptyPCollection(PType ptype);

  /**
   * Creates an empty {@code PTable} of the given {@code PTable Type}.
   *
   * @param ptype The PTableType of the empty PTable
   * @return A valid PTable with no contents
   */
   PTable emptyPTable(PTableType ptype);

  /**
   * Creates a {@code PCollection} containing the values found in the given {@code Iterable}
   * using an implementation-specific distribution mechanism.
   *
   * @param contents The values the new PCollection will contain
   * @param ptype The PType of the PCollection
   * @return A PCollection that contains the given values
   */
   PCollection create(Iterable contents, PType ptype);

  /**
   * Creates a {@code PCollection} containing the values found in the given {@code Iterable}
   * using an implementation-specific distribution mechanism.
   *
   * @param contents The values the new PCollection will contain
   * @param ptype The PType of the PCollection
   * @param options Additional options, such as the name or desired parallelism of the PCollection
   * @return A PCollection that contains the given values
   */
   PCollection create(Iterable contents, PType ptype, CreateOptions options);

  /**
   * Creates a {@code PTable} containing the values found in the given {@code Iterable}
   * using an implementation-specific distribution mechanism.
   *
   * @param contents The values the new PTable will contain
   * @param ptype The PTableType of the PTable
   * @return A PTable that contains the given values
   */
   PTable create(Iterable> contents, PTableType ptype);

  /**
   * Creates a {@code PTable} containing the values found in the given {@code Iterable}
   * using an implementation-specific distribution mechanism.
   *
   * @param contents The values the new PTable will contain
   * @param ptype The PTableType of the PTable
   * @param options Additional options, such as the name or desired parallelism of the PTable
   * @return A PTable that contains the given values
   */
   PTable create(Iterable> contents, PTableType ptype, CreateOptions options);

   PCollection union(List> collections);

   PTable unionTables(List> tables);

  /**
   * Executes the given {@code PipelineCallable} on the client after the {@code Targets}
   * that the PipelineCallable depends on (if any) have been created by other pipeline
   * processing steps.
   *
   * @param pipelineCallable The sequential logic to execute
   * @param  The return type of the PipelineCallable
   * @return The result of executing the PipelineCallable
   */
   Output sequentialDo(PipelineCallable pipelineCallable);

  /**
   * Constructs and executes a series of MapReduce jobs in order to write data
   * to the output targets.
   */
  PipelineResult run();

  /**
   * Constructs and starts a series of MapReduce jobs in order ot write data to
   * the output targets, but returns a {@code ListenableFuture} to allow clients to control
   * job execution.
   * @return
   */
  PipelineExecution runAsync();
  
  /**
   * Run any remaining jobs required to generate outputs and then clean up any
   * intermediate data files that were created in this run or previous calls to
   * {@code run}.
   */
  PipelineResult done();

  /**
  * Cleans up any artifacts created as a result of {@link #run() running} the pipeline.
  * @param force forces the cleanup even if all targets of the pipeline have not been completed.
  */
  void cleanup(boolean force);

  /**
   * A convenience method for reading a text file.
   */
  PCollection readTextFile(String pathName);

  /**
   * A convenience method for writing a text file.
   */
   void writeTextFile(PCollection collection, String pathName);

  /**
   * Turn on debug logging for jobs that are run from this pipeline.
   */
  void enableDebug();
}