// org.apache.crunch.Pipeline Maven / Gradle / Ivy
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.crunch;
import org.apache.crunch.types.PTableType;
import org.apache.crunch.types.PType;
import org.apache.hadoop.conf.Configuration;
import java.util.List;
/**
* Manages the state of a pipeline execution.
*
*/
public interface Pipeline {
/**
 * Set the {@code Configuration} to use with this pipeline.
 *
 * @param conf the {@code Configuration} to use with this pipeline
 */
void setConfiguration(Configuration conf);
/**
 * Returns the name of this pipeline.
 *
 * @return the name of this pipeline
 */
String getName();
/**
 * Returns the {@code Configuration} instance associated with this pipeline.
 *
 * @return the {@code Configuration} instance associated with this pipeline
 */
Configuration getConfiguration();
/**
 * Converts the given {@code Source} into a {@code PCollection} that is
 * available to jobs run using this {@code Pipeline} instance.
 *
 * @param <T>
 *          The element type shared by the source and the returned PCollection
 * @param source
 *          The source of data
 * @return A PCollection that references the given source
 */
<T> PCollection<T> read(Source<T> source);
/**
 * Converts the given {@code Source} into a {@code PCollection} that is
 * available to jobs run using this {@code Pipeline} instance.
 *
 * @param <T> The element type shared by the source and the returned PCollection
 * @param source The source of data
 * @param named A name for the returned PCollection
 * @return A PCollection that references the given source
 */
<T> PCollection<T> read(Source<T> source, String named);
/**
 * A version of the read method for {@code TableSource} instances that map to
 * {@code PTable}s.
 *
 * @param <K>
 *          The key type shared by the table source and the returned PTable
 * @param <V>
 *          The value type shared by the table source and the returned PTable
 * @param tableSource
 *          The source of the data
 * @return A PTable that references the given source
 */
<K, V> PTable<K, V> read(TableSource<K, V> tableSource);
/**
 * A version of the read method for {@code TableSource} instances that map to
 * {@code PTable}s.
 *
 * @param <K> The key type shared by the table source and the returned PTable
 * @param <V> The value type shared by the table source and the returned PTable
 * @param tableSource The source of the data
 * @param named A name for the returned PTable
 * @return A PTable that references the given source
 */
<K, V> PTable<K, V> read(TableSource<K, V> tableSource, String named);
/**
* Write the given collection to the given target on the next pipeline run. The
* system will check to see if the target's location already exists using the
* {@code WriteMode.DEFAULT} rule for the given {@code Target}.
*
* @param collection
* The collection
* @param target
* The output target
*/
void write(PCollection> collection, Target target);
/**
* Write the contents of the {@code PCollection} to the given {@code Target},
* using the storage format specified by the target and the given
* {@code WriteMode} for cases where the referenced {@code Target}
* already exists.
*
* @param collection
* The collection
* @param target
* The target to write to
* @param writeMode
* The strategy to use for handling existing outputs
*/
void write(PCollection> collection, Target target,
Target.WriteMode writeMode);
/**
 * Create the given PCollection and read the data it contains into the
 * returned {@code Iterable} instance for client use.
 *
 * @param <T>
 *          The element type of the PCollection and of the returned Iterable
 * @param pcollection
 *          The PCollection to materialize
 * @return the data from the PCollection as a read-only {@code Iterable}
 */
<T> Iterable<T> materialize(PCollection<T> pcollection);
/**
 * Caches the given PCollection so that it will be processed at most once
 * during pipeline execution.
 *
 * @param <T> The element type of the PCollection
 * @param pcollection The PCollection to cache
 * @param options The options for how the cached data is stored
 */
<T> void cache(PCollection<T> pcollection, CachingOptions options);
/**
 * Creates an empty {@code PCollection} of the given {@code PType}.
 *
 * @param <T> The element type described by the PType
 * @param ptype The PType of the empty PCollection
 * @return A valid PCollection with no contents
 */
<T> PCollection<T> emptyPCollection(PType<T> ptype);
/**
 * Creates an empty {@code PTable} of the given {@code PTableType}.
 *
 * @param <K> The key type described by the PTableType
 * @param <V> The value type described by the PTableType
 * @param ptype The PTableType of the empty PTable
 * @return A valid PTable with no contents
 */
<K, V> PTable<K, V> emptyPTable(PTableType<K, V> ptype);
/**
 * Creates a {@code PCollection} containing the values found in the given {@code Iterable}
 * using an implementation-specific distribution mechanism.
 *
 * @param <T> The element type of the values and of the returned PCollection
 * @param contents The values the new PCollection will contain
 * @param ptype The PType of the PCollection
 * @return A PCollection that contains the given values
 */
<T> PCollection<T> create(Iterable<T> contents, PType<T> ptype);
/**
 * Creates a {@code PCollection} containing the values found in the given {@code Iterable}
 * using an implementation-specific distribution mechanism.
 *
 * @param <T> The element type of the values and of the returned PCollection
 * @param contents The values the new PCollection will contain
 * @param ptype The PType of the PCollection
 * @param options Additional options, such as the name or desired parallelism of the PCollection
 * @return A PCollection that contains the given values
 */
<T> PCollection<T> create(Iterable<T> contents, PType<T> ptype, CreateOptions options);
/**
* Creates a {@code PTable} containing the values found in the given {@code Iterable}
* using an implementation-specific distribution mechanism.
*
* @param contents The values the new PTable will contain
* @param ptype The PTableType of the PTable
* @return A PTable that contains the given values
*/
PTable create(Iterable> contents, PTableType ptype);
/**
* Creates a {@code PTable} containing the values found in the given {@code Iterable}
* using an implementation-specific distribution mechanism.
*
* @param contents The values the new PTable will contain
* @param ptype The PTableType of the PTable
* @param options Additional options, such as the name or desired parallelism of the PTable
* @return A PTable that contains the given values
*/
PTable create(Iterable> contents, PTableType ptype, CreateOptions options);
PCollection union(List> collections);
PTable unionTables(List> tables);
/**
* Executes the given {@code PipelineCallable} on the client after the {@code Targets}
* that the PipelineCallable depends on (if any) have been created by other pipeline
* processing steps.
*
* @param pipelineCallable The sequential logic to execute
* @param