// Many resources are needed to download a project. Please understand that we have to cover our server costs. Thank you in advance. Project price: only $1.
// You can buy this project and download/modify it as often as you want.
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.crunch;
import java.util.Collection;
import org.apache.crunch.types.PTableType;
import org.apache.crunch.types.PType;
import org.apache.crunch.types.PTypeFamily;
/**
* A representation of an immutable, distributed collection of elements that is
* the fundamental target of computations in Crunch.
*
*/
public interface PCollection {
/**
 * Gets the {@code Pipeline} that this {@code PCollection} is associated with.
 *
 * @return the associated {@code Pipeline}
 */
Pipeline getPipeline();
/**
 * Combines this {@code PCollection} with a single other {@code PCollection}.
 *
 * @param other
 *          the {@code PCollection} to union with this one
 * @return a {@code PCollection} instance that acts as the union of this
 *         {@code PCollection} and {@code other}
 */
PCollection union(PCollection other);
/**
 * Combines this {@code PCollection} with any number of input
 * {@code PCollection}s.
 *
 * @param collections
 *          the input {@code PCollection}s to union with this one
 * @return a {@code PCollection} instance that acts as the union of this
 *         {@code PCollection} and the input {@code PCollection}s
 */
PCollection union(PCollection... collections);
/**
 * Runs the supplied {@code DoFn} over the elements of this {@code PCollection}
 * and yields a new {@code PCollection} that is the output of that processing.
 *
 * @param doFn
 *          the {@code DoFn} to apply
 * @param type
 *          the {@link PType} of the resulting {@code PCollection}
 * @return a new {@code PCollection}
 */
PCollection parallelDo(DoFn doFn, PType type);
/**
 * Runs the supplied {@code DoFn} over the elements of this {@code PCollection}
 * and yields a new {@code PCollection} that is the output of that processing.
 * This variant also takes a name for the processing step.
 *
 * @param name
 *          an identifier for this processing step, useful for debugging
 * @param doFn
 *          the {@code DoFn} to apply
 * @param type
 *          the {@link PType} of the resulting {@code PCollection}
 * @return a new {@code PCollection}
 */
PCollection parallelDo(String name, DoFn doFn, PType type);
/**
 * Runs the supplied {@code DoFn} over the elements of this {@code PCollection}
 * and yields a new {@code PCollection} that is the output of that processing.
 * This variant takes both a name for the processing step and additional
 * {@code ParallelDoOptions}.
 *
 * @param name
 *          an identifier for this processing step, useful for debugging
 * @param doFn
 *          the {@code DoFn} to apply
 * @param type
 *          the {@link PType} of the resulting {@code PCollection}
 * @param options
 *          optional information that is needed for certain pipeline operations
 * @return a new {@code PCollection}
 */
PCollection parallelDo(String name, DoFn doFn, PType type, ParallelDoOptions options);
/**
* Similar to the other {@code parallelDo} instance, but returns a
* {@code PTable} instance instead of a {@code PCollection}.
*
* @param doFn
* The {@code DoFn} to apply
* @param type
* The {@link PTableType} of the resulting {@code PTable}
* @return a new {@code PTable}
*/
PTable parallelDo(DoFn> doFn, PTableType type);
/**
* Similar to the other {@code parallelDo} instance, but returns a
* {@code PTable} instance instead of a {@code PCollection}.
*
* @param name
* An identifier for this processing step
* @param doFn
* The {@code DoFn} to apply
* @param type
* The {@link PTableType} of the resulting {@code PTable}
* @return a new {@code PTable}
*/
PTable parallelDo(String name, DoFn> doFn, PTableType type);
/**
* Similar to the other {@code parallelDo} instance, but returns a
* {@code PTable} instance instead of a {@code PCollection}.
*
* @param name
* An identifier for this processing step
* @param doFn
* The {@code DoFn} to apply
* @param type
* The {@link PTableType} of the resulting {@code PTable}
* @param options
* Optional information that is needed for certain pipeline operations
* @return a new {@code PTable}
*/
PTable parallelDo(String name, DoFn> doFn, PTableType type,
ParallelDoOptions options);
/**
 * Writes the contents of this {@code PCollection} to the supplied
 * {@code Target}, in the storage format that the target specifies.
 *
 * @param target
 *          the target to write to
 */
PCollection write(Target target);
/**
 * Writes the contents of this {@code PCollection} to the supplied
 * {@code Target}, handling existing targets according to the supplied
 * {@code Target.WriteMode}.
 *
 * @param target
 *          the target
 * @param writeMode
 *          the rule for handling existing outputs at the target location
 */
PCollection write(Target target, Target.WriteMode writeMode);
/**
 * Provides a reference to the data set that this {@code PCollection}
 * represents, which the client may use to read the data locally.
 *
 * @return a reference to the data set represented by this {@code PCollection}
 */
Iterable materialize();
/**
 * Flags this data as cached, applying the default {@link CachingOptions}.
 * A cached {@code PCollection} is processed only once; its contents are then
 * saved so that downstream code can process them many times.
 *
 * @return this {@code PCollection} instance
 */
PCollection cache();
/**
 * Flags this data as cached, applying the supplied {@code CachingOptions}.
 * A cached {@code PCollection} is processed only once; its contents are then
 * saved so that downstream code can process them many times.
 *
 * @param options
 *          the options that control the cache settings for the data
 * @return this {@code PCollection} instance
 */
PCollection cache(CachingOptions options);
/**
* @return A {@code PObject} encapsulating an in-memory {@link Collection} containing the values
* of this {@code PCollection}.
*/
PObject> asCollection();
/**
 * Retrieves the first element of this {@code PCollection}.
 *
 * @return the first element of this {@code PCollection}, as a {@code PObject}
 */
PObject first();
/**
* Adds the materialized data in this {@code PCollection} as a dependency to the given
* {@code PipelineCallable} and registers it with the {@code Pipeline} associated with this
* instance.
*
* @param label the label to use inside of the PipelineCallable for referencing this PCollection
* @param pipelineCallable the function itself
*
* @return The value of the {@code getOutput} function on the given argument.
*/