/***********************************************************************************************************************
*
* Copyright (C) 2010-2013 by the Stratosphere project (http://stratosphere.eu)
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
* an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
* specific language governing permissions and limitations under the License.
*
**********************************************************************************************************************/
package eu.stratosphere.api.java;
import org.apache.commons.lang3.Validate;
import eu.stratosphere.api.common.io.FileOutputFormat;
import eu.stratosphere.api.common.io.OutputFormat;
import eu.stratosphere.api.java.aggregation.Aggregations;
import eu.stratosphere.api.java.functions.CoGroupFunction;
import eu.stratosphere.api.java.functions.FilterFunction;
import eu.stratosphere.api.java.functions.FlatMapFunction;
import eu.stratosphere.api.java.functions.GroupReduceFunction;
import eu.stratosphere.api.java.functions.KeySelector;
import eu.stratosphere.api.java.functions.MapFunction;
import eu.stratosphere.api.java.functions.ReduceFunction;
import eu.stratosphere.api.java.io.CsvOutputFormat;
import eu.stratosphere.api.java.io.PrintingOutputFormat;
import eu.stratosphere.api.java.io.TextOutputFormat;
import eu.stratosphere.api.java.operators.AggregateOperator;
import eu.stratosphere.api.java.operators.CoGroupOperator;
import eu.stratosphere.api.java.operators.CoGroupOperator.CoGroupOperatorSets;
import eu.stratosphere.api.java.operators.CrossOperator;
import eu.stratosphere.api.java.operators.CrossOperator.DefaultCross;
import eu.stratosphere.api.java.operators.CustomUnaryOperation;
import eu.stratosphere.api.java.operators.DataSink;
import eu.stratosphere.api.java.operators.FilterOperator;
import eu.stratosphere.api.java.operators.FlatMapOperator;
import eu.stratosphere.api.java.operators.Grouping;
import eu.stratosphere.api.java.operators.JoinOperator;
import eu.stratosphere.api.java.operators.JoinOperator.JoinHint;
import eu.stratosphere.api.java.operators.JoinOperator.JoinOperatorSets;
import eu.stratosphere.api.java.operators.Keys;
import eu.stratosphere.api.java.operators.MapOperator;
import eu.stratosphere.api.java.operators.ProjectOperator;
import eu.stratosphere.api.java.operators.ProjectOperator.Projection;
import eu.stratosphere.api.java.operators.ReduceGroupOperator;
import eu.stratosphere.api.java.operators.ReduceOperator;
import eu.stratosphere.api.java.operators.SortedGrouping;
import eu.stratosphere.api.java.operators.UnionOperator;
import eu.stratosphere.api.java.operators.UnsortedGrouping;
import eu.stratosphere.api.java.functions.CrossFunction;
import eu.stratosphere.api.java.tuple.Tuple;
import eu.stratosphere.api.java.tuple.Tuple2;
import eu.stratosphere.api.java.typeutils.InputTypeConfigurable;
import eu.stratosphere.types.TypeInformation;
import eu.stratosphere.core.fs.FileSystem.WriteMode;
import eu.stratosphere.core.fs.Path;
/**
* A DataSet represents a collection of elements of the same type.
* A DataSet can be transformed into another DataSet by applying a transformation, for example
* <ul>
* <li>{@link DataSet#map(MapFunction)},</li>
* <li>{@link DataSet#reduce(ReduceFunction)},</li>
* <li>{@link DataSet#join(DataSet)}, or</li>
* <li>{@link DataSet#coGroup(DataSet)}.</li>
* </ul>
*
* @param <T> The type of the DataSet, i.e., the type of the elements of the DataSet.
*/
public abstract class DataSet<T> {
private final ExecutionEnvironment context;
private final TypeInformation<T> type;
protected DataSet(ExecutionEnvironment context, TypeInformation<T> type) {
if (context == null) {
throw new NullPointerException("context is null");
}
if (type == null) {
throw new NullPointerException("type is null");
}
this.context = context;
this.type = type;
}
/**
* Returns the {@link ExecutionEnvironment} in which this DataSet is registered.
*
* @return The ExecutionEnvironment in which this DataSet is registered.
*
* @see ExecutionEnvironment
*/
public ExecutionEnvironment getExecutionEnvironment() {
return this.context;
}
/**
* Returns the {@link TypeInformation} for the type of this DataSet.
*
* @return The TypeInformation for the type of this DataSet.
*
* @see TypeInformation
*/
public TypeInformation<T> getType() {
return this.type;
}
// --------------------------------------------------------------------------------------------
// Filter & Transformations
// --------------------------------------------------------------------------------------------
/**
* Applies a Map transformation on a {@link DataSet}.
* The transformation calls a {@link MapFunction} for each element of the DataSet.
* Each MapFunction call returns exactly one element.
*
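* For example, to compute the length of each string in a DataSet of strings
* (a minimal sketch; the input set and the anonymous mapper are illustrative):
* <pre>
* {@code
* DataSet<String> text = ...;
* DataSet<Integer> lengths = text.map(new MapFunction<String, Integer>() {
*     public Integer map(String value) {
*         return value.length();
*     }
* });
* }
* </pre>
*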
* @param mapper The MapFunction that is called for each element of the DataSet.
* @return A MapOperator that represents the transformed DataSet.
*
* @see MapFunction
* @see MapOperator
* @see DataSet
*/
public <R> MapOperator<T, R> map(MapFunction<T, R> mapper) {
return new MapOperator<T, R>(this, mapper);
}
/**
* Applies a FlatMap transformation on a {@link DataSet}.
* The transformation calls a {@link FlatMapFunction} for each element of the DataSet.
* Each FlatMapFunction call can return any number of elements including none.
*
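* For example, to split lines of text into words (a minimal sketch; the input set is
* illustrative and {@code Collector} is {@code eu.stratosphere.util.Collector}):
* <pre>
* {@code
* DataSet<String> lines = ...;
* DataSet<String> words = lines.flatMap(new FlatMapFunction<String, String>() {
*     public void flatMap(String line, Collector<String> out) {
*         for (String word : line.split(" ")) {
*             out.collect(word);
*         }
*     }
* });
* }
* </pre>
*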
* @param flatMapper The FlatMapFunction that is called for each element of the DataSet.
* @return A FlatMapOperator that represents the transformed DataSet.
*
* @see FlatMapFunction
* @see FlatMapOperator
* @see DataSet
*/
public <R> FlatMapOperator<T, R> flatMap(FlatMapFunction<T, R> flatMapper) {
return new FlatMapOperator<T, R>(this, flatMapper);
}
/**
* Applies a Filter transformation on a {@link DataSet}.
* The transformation calls a {@link FilterFunction} for each element of the DataSet
* and retains only those elements for which the function returns true. Elements for
* which the function returns false are filtered out.
*
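* For example, to keep only the positive numbers (a minimal sketch; the input set is illustrative):
* <pre>
* {@code
* DataSet<Integer> numbers = ...;
* DataSet<Integer> positive = numbers.filter(new FilterFunction<Integer>() {
*     public boolean filter(Integer value) {
*         return value > 0;
*     }
* });
* }
* </pre>
*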
* @param filter The FilterFunction that is called for each element of the DataSet.
* @return A FilterOperator that represents the filtered DataSet.
*
* @see FilterFunction
* @see FilterOperator
* @see DataSet
*/
public FilterOperator<T> filter(FilterFunction<T> filter) {
return new FilterOperator<T>(this, filter);
}
// --------------------------------------------------------------------------------------------
// Projections
// --------------------------------------------------------------------------------------------
/**
* Initiates a Project transformation on a {@link Tuple} {@link DataSet}.
* Note: Only Tuple DataSets can be projected.
* The transformation projects each Tuple of the DataSet onto a (sub)set of fields.
* This method returns a {@link Projection} on which {@link Projection#types()} needs to
* be called to complete the transformation.
*
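* For example, to retain the third and first field of a {@code Tuple3}, in that order
* (a minimal sketch; the input set is illustrative):
* <pre>
* {@code
* DataSet<Tuple3<Integer, String, Double>> input = ...;
* DataSet<Tuple2<Double, Integer>> projected = input.project(2, 0).types(Double.class, Integer.class);
* }
* </pre>
*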
* @param fieldIndexes The field indexes of the input tuples that are retained.
* The order of fields in the output tuple corresponds to the order of field indexes.
* @return A Projection that needs to be converted into a {@link ProjectOperator} to complete the
* Project transformation by calling {@link Projection#types()}.
*
* @see Tuple
* @see DataSet
* @see Projection
* @see ProjectOperator
*/
public Projection<T> project(int... fieldIndexes) {
return new Projection<T>(this, fieldIndexes);
}
// --------------------------------------------------------------------------------------------
// Non-grouped aggregations
// --------------------------------------------------------------------------------------------
/**
* Applies an Aggregate transformation on a non-grouped {@link Tuple} {@link DataSet}.
* Note: Only Tuple DataSets can be aggregated.
* The transformation applies a built-in {@link Aggregations Aggregation} on a specified field
* of a Tuple DataSet. Additional aggregation functions can be added to the resulting
* {@link AggregateOperator} by calling {@link AggregateOperator#and(Aggregations, int)}.
*
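* For example, to sum the first field and take the maximum of the second field of a
* {@code Tuple2} DataSet (a minimal sketch; the input set is illustrative):
* <pre>
* {@code
* DataSet<Tuple2<Integer, Double>> input = ...;
* DataSet<Tuple2<Integer, Double>> result = input.aggregate(Aggregations.SUM, 0).and(Aggregations.MAX, 1);
* }
* </pre>
*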
* @param agg The built-in aggregation function that is computed.
* @param field The index of the Tuple field on which the aggregation function is applied.
* @return An AggregateOperator that represents the aggregated DataSet.
*
* @see Tuple
* @see Aggregations
* @see AggregateOperator
* @see DataSet
*/
public AggregateOperator<T> aggregate(Aggregations agg, int field) {
return new AggregateOperator<T>(this, agg, field);
}
/**
* Applies a Reduce transformation on a non-grouped {@link DataSet}.
* The transformation consecutively calls a {@link ReduceFunction}
* until only a single element remains which is the result of the transformation.
* A ReduceFunction combines two elements into one new element of the same type.
*
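* For example, to sum all elements of an Integer DataSet (a minimal sketch; the input
* set is illustrative):
* <pre>
* {@code
* DataSet<Integer> numbers = ...;
* DataSet<Integer> sum = numbers.reduce(new ReduceFunction<Integer>() {
*     public Integer reduce(Integer a, Integer b) {
*         return a + b;
*     }
* });
* }
* </pre>
*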
* @param reducer The ReduceFunction that is applied on the DataSet.
* @return A ReduceOperator that represents the reduced DataSet.
*
* @see ReduceFunction
* @see ReduceOperator
* @see DataSet
*/
public ReduceOperator<T> reduce(ReduceFunction<T> reducer) {
return new ReduceOperator<T>(this, reducer);
}
/**
* Applies a GroupReduce transformation on a non-grouped {@link DataSet}.
* The transformation calls a {@link GroupReduceFunction} once with the full DataSet.
* The GroupReduceFunction can iterate over all elements of the DataSet and emit any
* number of output elements including none.
*
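* For example, to count all elements of a DataSet (a minimal sketch; the input set is
* illustrative and {@code Collector} is {@code eu.stratosphere.util.Collector}):
* <pre>
* {@code
* DataSet<String> input = ...;
* DataSet<Long> count = input.reduceGroup(new GroupReduceFunction<String, Long>() {
*     public void reduce(Iterator<String> values, Collector<Long> out) {
*         long n = 0;
*         while (values.hasNext()) {
*             values.next();
*             n++;
*         }
*         out.collect(n);
*     }
* });
* }
* </pre>
*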
* @param reducer The GroupReduceFunction that is applied on the DataSet.
* @return A ReduceGroupOperator that represents the reduced DataSet.
*
* @see GroupReduceFunction
* @see ReduceGroupOperator
* @see DataSet
*/
public <R> ReduceGroupOperator<T, R> reduceGroup(GroupReduceFunction<T, R> reducer) {
return new ReduceGroupOperator<T, R>(this, reducer);
}
// --------------------------------------------------------------------------------------------
// distinct
// --------------------------------------------------------------------------------------------
// public <K extends Comparable<K>> DistinctOperator<T> distinct(KeySelector<T, K> keyExtractor) {
// return new DistinctOperator<T>(this, new Keys.SelectorFunctionKeys<T, K>(keyExtractor, getType()));
// }
// public DistinctOperator<T> distinct(int... fields) {
// return new DistinctOperator<T>(this, new Keys.FieldPositionKeys<T>(fields, getType(), true));
// }
// --------------------------------------------------------------------------------------------
// Grouping
// --------------------------------------------------------------------------------------------
/**
* Groups a {@link DataSet} using a {@link KeySelector} function.
* The KeySelector function is called for each element of the DataSet and extracts a single
* key value on which the DataSet is grouped.
* This method returns an {@link UnsortedGrouping} on which one of the following grouping transformations
* can be applied:
* <ul>
* <li>{@link UnsortedGrouping#sortGroup(int, eu.stratosphere.api.common.operators.Order)} to get a {@link SortedGrouping}.</li>
* <li>{@link Grouping#aggregate(Aggregations, int)} to apply an Aggregate transformation.</li>
* <li>{@link Grouping#reduce(ReduceFunction)} to apply a Reduce transformation.</li>
* <li>{@link Grouping#reduceGroup(GroupReduceFunction)} to apply a GroupReduce transformation.</li>
* </ul>
*
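* For example, to group a DataSet of words by their first character (a minimal sketch;
* the input set is illustrative):
* <pre>
* {@code
* DataSet<String> words = ...;
* UnsortedGrouping<String> byFirstChar = words.groupBy(new KeySelector<String, Character>() {
*     public Character getKey(String word) {
*         return word.charAt(0);
*     }
* });
* }
* </pre>
*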
* @param keyExtractor The KeySelector function which extracts the key values from the DataSet on which it is grouped.
* @return An UnsortedGrouping on which a transformation needs to be applied to obtain a transformed DataSet.
*
* @see KeySelector
* @see Grouping
* @see UnsortedGrouping
* @see SortedGrouping
* @see AggregateOperator
* @see ReduceOperator
* @see ReduceGroupOperator
* @see DataSet
*/
public <K extends Comparable<K>> UnsortedGrouping<T> groupBy(KeySelector<T, K> keyExtractor) {
return new UnsortedGrouping<T>(this, new Keys.SelectorFunctionKeys<T, K>(keyExtractor, getType()));
}
/**
* Groups a {@link Tuple} {@link DataSet} using field position keys.
* Note: Field position keys can only be specified for Tuple DataSets.
* The field position keys specify the fields of Tuples on which the DataSet is grouped.
* This method returns an {@link UnsortedGrouping} on which one of the following grouping transformations
* can be applied:
* <ul>
* <li>{@link UnsortedGrouping#sortGroup(int, eu.stratosphere.api.common.operators.Order)} to get a {@link SortedGrouping}.</li>
* <li>{@link Grouping#aggregate(Aggregations, int)} to apply an Aggregate transformation.</li>
* <li>{@link Grouping#reduce(ReduceFunction)} to apply a Reduce transformation.</li>
* <li>{@link Grouping#reduceGroup(GroupReduceFunction)} to apply a GroupReduce transformation.</li>
* </ul>
*
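* For example, to group a {@code Tuple2} DataSet on its first field and sum up the second
* field per group (a minimal sketch; the input set is illustrative):
* <pre>
* {@code
* DataSet<Tuple2<String, Integer>> input = ...;
* DataSet<Tuple2<String, Integer>> sums = input.groupBy(0).aggregate(Aggregations.SUM, 1);
* }
* </pre>
*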
* @param fields One or more field positions on which the DataSet will be grouped.
* @return A Grouping on which a transformation needs to be applied to obtain a transformed DataSet.
*
* @see Tuple
* @see Grouping
* @see UnsortedGrouping
* @see SortedGrouping
* @see AggregateOperator
* @see ReduceOperator
* @see ReduceGroupOperator
* @see DataSet
*/
public UnsortedGrouping<T> groupBy(int... fields) {
return new UnsortedGrouping<T>(this, new Keys.FieldPositionKeys<T>(fields, getType(), false));
}
// --------------------------------------------------------------------------------------------
// Joining
// --------------------------------------------------------------------------------------------
/**
* Initiates a Join transformation.
* A Join transformation joins the elements of two
* {@link DataSet DataSets} on key equality and provides multiple ways to combine
* joining elements into one DataSet.
*
* This method returns a {@link JoinOperatorSets} on which
* {@link JoinOperatorSets#where(int...)} needs to be called to define the join key of the first
* joining (i.e., this) DataSet.
*
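* For example, to join two {@code Tuple2} DataSets on their first fields (a minimal sketch;
* the input sets are illustrative; the default join pairs the joining elements as {@code Tuple2}s):
* <pre>
* {@code
* DataSet<Tuple2<Integer, String>> left = ...;
* DataSet<Tuple2<Integer, Double>> right = ...;
* DataSet<Tuple2<Tuple2<Integer, String>, Tuple2<Integer, Double>>> joined =
*     left.join(right).where(0).equalTo(0);
* }
* </pre>
*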
* @param other The other DataSet with which this DataSet is joined.
* @return A JoinOperatorSets to continue the definition of the Join transformation.
*
* @see JoinOperatorSets
* @see JoinOperator
* @see DataSet
*/
public <R> JoinOperatorSets<T, R> join(DataSet<R> other) {
return new JoinOperatorSets<T, R>(this, other);
}
/**
* Initiates a Join transformation.
* A Join transformation joins the elements of two
* {@link DataSet DataSets} on key equality and provides multiple ways to combine
* joining elements into one DataSet.
* This method also gives the hint to the optimizer that the second DataSet to join is much
* smaller than the first one.
* This method returns a {@link JoinOperatorSets} on which
* {@link JoinOperatorSets#where(int...)} needs to be called to define the join key of the first
* joining (i.e., this) DataSet.
*
* @param other The other DataSet with which this DataSet is joined.
* @return A JoinOperatorSets to continue the definition of the Join transformation.
*
* @see JoinOperatorSets
* @see JoinOperator
* @see DataSet
*/
public <R> JoinOperatorSets<T, R> joinWithTiny(DataSet<R> other) {
return new JoinOperatorSets<T, R>(this, other, JoinHint.BROADCAST_HASH_SECOND);
}
/**
* Initiates a Join transformation.
* A Join transformation joins the elements of two
* {@link DataSet DataSets} on key equality and provides multiple ways to combine
* joining elements into one DataSet.
* This method also gives the hint to the optimizer that the second DataSet to join is much
* larger than the first one.
* This method returns a {@link JoinOperatorSets} on which
* {@link JoinOperatorSets#where(int...)} needs to be called to define the join key of the first
* joining (i.e., this) DataSet.
*
* @param other The other DataSet with which this DataSet is joined.
* @return A JoinOperatorSets to continue the definition of the Join transformation.
*
* @see JoinOperatorSets
* @see JoinOperator
* @see DataSet
*/
public <R> JoinOperatorSets<T, R> joinWithHuge(DataSet<R> other) {
return new JoinOperatorSets<T, R>(this, other, JoinHint.BROADCAST_HASH_FIRST);
}
// --------------------------------------------------------------------------------------------
// Co-Grouping
// --------------------------------------------------------------------------------------------
/**
* Initiates a CoGroup transformation.
* A CoGroup transformation combines the elements of
* two {@link DataSet DataSets} into one DataSet. It groups each DataSet individually on a key and
* hands groups of both DataSets with equal keys together to a {@link CoGroupFunction}.
* If a DataSet has a group with no matching key in the other DataSet, the CoGroupFunction
* is called with an empty group for the non-existing group.
* The CoGroupFunction can iterate over the elements of both groups and return any number
* of elements including none.
* This method returns a {@link CoGroupOperatorSets} on which
* {@link CoGroupOperatorSets#where(int...)} needs to be called to define the grouping key of the first
* (i.e., this) DataSet.
*
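* For example, to co-group two {@code Tuple2} DataSets on their first fields (a minimal
* sketch; the input sets and the function body are illustrative; {@code Collector} is
* {@code eu.stratosphere.util.Collector}):
* <pre>
* {@code
* DataSet<Tuple2<Integer, String>> first = ...;
* DataSet<Tuple2<Integer, Double>> second = ...;
* DataSet<String> result = first.coGroup(second).where(0).equalTo(0)
*     .with(new CoGroupFunction<Tuple2<Integer, String>, Tuple2<Integer, Double>, String>() {
*         public void coGroup(Iterator<Tuple2<Integer, String>> firstGroup,
*                 Iterator<Tuple2<Integer, Double>> secondGroup, Collector<String> out) {
*             // emit one marker element per key group; the logic is illustrative
*             out.collect("group");
*         }
*     });
* }
* </pre>
*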
* @param other The other DataSet of the CoGroup transformation.
* @return A CoGroupOperatorSets to continue the definition of the CoGroup transformation.
*
* @see CoGroupOperatorSets
* @see CoGroupOperator
* @see DataSet
*/
public <R> CoGroupOperator.CoGroupOperatorSets<T, R> coGroup(DataSet<R> other) {
return new CoGroupOperator.CoGroupOperatorSets<T, R>(this, other);
}
// --------------------------------------------------------------------------------------------
// Cross
// --------------------------------------------------------------------------------------------
/**
* Initiates a Cross transformation.
* A Cross transformation combines the elements of two
* {@link DataSet DataSets} into one DataSet. It builds all pair combinations of elements of
* both DataSets, i.e., it builds a Cartesian product.
*
*
* The resulting {@link DefaultCross} wraps each pair of crossed elements into a {@link Tuple2}, with
* the element of the first input being the first field of the tuple and the element of the
* second input being the second field of the tuple.
*
*
* Call {@link DefaultCross#with(CrossFunction)} to define a {@link CrossFunction} which is called for
* each pair of crossed elements. The CrossFunction returns exactly one element for each pair of input elements.
*
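* For example (a minimal sketch; the input sets are illustrative):
* <pre>
* {@code
* DataSet<Integer> numbers = ...;
* DataSet<String> letters = ...;
* DataSet<Tuple2<Integer, String>> pairs = numbers.cross(letters);
* }
* </pre>
*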
* @param other The other DataSet with which this DataSet is crossed.
* @return A DefaultCross that returns a Tuple2 for each pair of crossed elements.
*
* @see DefaultCross
* @see CrossFunction
* @see DataSet
* @see Tuple2
*/
public <R> CrossOperator.DefaultCross<T, R> cross(DataSet<R> other) {
return new CrossOperator.DefaultCross<T, R>(this, other);
}
/**
* Initiates a Cross transformation.
* A Cross transformation combines the elements of two
* {@link DataSet DataSets} into one DataSet. It builds all pair combinations of elements of
* both DataSets, i.e., it builds a Cartesian product.
* This method also gives the hint to the optimizer that the second DataSet to cross is much
* smaller than the first one.
*
*
* The resulting {@link DefaultCross} wraps each pair of crossed elements into a {@link Tuple2}, with
* the element of the first input being the first field of the tuple and the element of the
* second input being the second field of the tuple.
*
*
* Call {@link DefaultCross#with(CrossFunction)} to define a {@link CrossFunction} which is called for
* each pair of crossed elements. The CrossFunction returns exactly one element for each pair of input elements.
*
* @param other The other DataSet with which this DataSet is crossed.
* @return A DefaultCross that returns a Tuple2 for each pair of crossed elements.
*
* @see DefaultCross
* @see CrossFunction
* @see DataSet
* @see Tuple2
*/
public <R> CrossOperator.DefaultCross<T, R> crossWithTiny(DataSet<R> other) {
return new CrossOperator.DefaultCross<T, R>(this, other);
}
/**
* Initiates a Cross transformation.
* A Cross transformation combines the elements of two
* {@link DataSet DataSets} into one DataSet. It builds all pair combinations of elements of
* both DataSets, i.e., it builds a Cartesian product.
* This method also gives the hint to the optimizer that the second DataSet to cross is much
* larger than the first one.
*
*
* The resulting {@link DefaultCross} wraps each pair of crossed elements into a {@link Tuple2}, with
* the element of the first input being the first field of the tuple and the element of the
* second input being the second field of the tuple.
*
*
* Call {@link DefaultCross#with(CrossFunction)} to define a {@link CrossFunction} which is called for
* each pair of crossed elements. The CrossFunction returns exactly one element for each pair of input elements.
*
* @param other The other DataSet with which this DataSet is crossed.
* @return A DefaultCross that returns a Tuple2 for each pair of crossed elements.
*
* @see DefaultCross
* @see CrossFunction
* @see DataSet
* @see Tuple2
*/
public <R> CrossOperator.DefaultCross<T, R> crossWithHuge(DataSet<R> other) {
return new CrossOperator.DefaultCross<T, R>(this, other);
}
// --------------------------------------------------------------------------------------------
// Iterations
// --------------------------------------------------------------------------------------------
/**
* Initiates an iterative part of the program that executes multiple times and feeds back data sets.
* The iterative part needs to be closed by calling {@link IterativeDataSet#closeWith(DataSet)}. The data set
* given to the {@code closeWith(DataSet)} method is the data set that will be fed back and used as the input
* to the next iteration. The return value of the {@code closeWith(DataSet)} method is the resulting
* data set after the iteration has terminated.
*
* An example of an iterative computation is as follows:
*
* <pre>
* {@code
* DataSet<X> input = ...;
*
* IterativeDataSet<X> startOfIteration = input.iterate(10);
* DataSet<X> toBeFedBack = startOfIteration
*     .map(new MyMapper())
*     .groupBy(...).reduceGroup(new MyReducer());
* DataSet<X> result = startOfIteration.closeWith(toBeFedBack);
* }
* </pre>
*
* The iteration has a maximum number of times that it executes. A dynamic termination can be realized by using a
* termination criterion (see {@link IterativeDataSet#closeWith(DataSet, DataSet)}).
*
* @param maxIterations The maximum number of times that the iteration is executed.
* @return An IterativeDataSet that marks the start of the iterative part and needs to be closed by
* {@link IterativeDataSet#closeWith(DataSet)}.
*
* @see eu.stratosphere.api.java.IterativeDataSet
*/
public IterativeDataSet<T> iterate(int maxIterations) {
return new IterativeDataSet<T>(getExecutionEnvironment(), getType(), this, maxIterations);
}
/**
* Initiates a delta iteration. A delta iteration is similar to a regular iteration (as started by {@link #iterate(int)}),
* but maintains state across the individual iteration steps. The solution set, which represents the current state
* at the beginning of each iteration, can be obtained via {@link DeltaIteration#getSolutionSet()}.
* It can be accessed by joining (or CoGrouping) with it. The DataSet that represents the workset of an iteration
* can be obtained via {@link DeltaIteration#getWorkset()}.
* The solution set is updated by producing a delta for it, which is merged into the solution set at the end of each
* iteration step.
*
* The delta iteration must be closed by calling {@link DeltaIteration#closeWith(DataSet, DataSet)}. The two
* parameters are the delta for the solution set and the new workset (the data set that will be fed back).
* The return value of the {@code closeWith(DataSet, DataSet)} method is the resulting
* data set after the iteration has terminated. Delta iterations terminate when the feed back data set
* (the workset) is empty. In addition, a maximum number of steps is given as a fall back termination guard.
*
* Elements in the solution set are uniquely identified by a key. When merging the solution set delta, contained elements
* with the same key are replaced.
*
* NOTE: Delta iterations currently support only tuple valued data types. This restriction
* will be removed in the future. The key is specified by the tuple position.
*
* A code example for a delta iteration is as follows:
*
* <pre>
* {@code
* DeltaIteration<Tuple2<Long, Double>, Tuple2<Long, Double>> iteration =
*     initialState.iterateDelta(initialFeedbackSet, 100, 0);
*
* DataSet<Tuple2<Long, Double>> delta = iteration.getWorkset().groupBy(0).aggregate(Aggregations.AVG, 1)
*     .join(iteration.getSolutionSet()).where(0).equalTo(0)
*     .flatMap(new ProjectAndFilter());
*
* DataSet<Tuple2<Long, Double>> feedBack = delta.join(someOtherSet).where(...).equalTo(...).with(...);
*
* // close the delta iteration (delta and new workset are identical)
* DataSet<Tuple2<Long, Double>> result = iteration.closeWith(delta, feedBack);
* }
* </pre>
*
*
* @param workset The initial version of the data set that is fed back to the next iteration step (the workset).
* @param maxIterations The maximum number of iteration steps, as a fall back safeguard.
* @param keyPositions The positions of the tuple fields that are used as the keys of the solution set.
*
* @return The DeltaIteration that marks the start of a delta iteration.
*
* @see DeltaIteration
*/
public <R> DeltaIteration<T, R> iterateDelta(DataSet<R> workset, int maxIterations, int... keyPositions) {
Keys.FieldPositionKeys<T> keys = new Keys.FieldPositionKeys<T>(keyPositions, getType(), false);
return new DeltaIteration<T, R>(getExecutionEnvironment(), getType(), this, workset, keys, maxIterations);
}
// --------------------------------------------------------------------------------------------
// Custom Operators
// -------------------------------------------------------------------------------------------
/**
* Runs a {@link CustomUnaryOperation} on the data set. Custom operations are typically complex
* operators that are composed of multiple steps.
*
* @param operation The operation to run.
* @return The data set produced by the operation.
*/
public <X> DataSet<X> runOperation(CustomUnaryOperation<T, X> operation) {
Validate.notNull(operation, "The custom operator must not be null.");
operation.setInput(this);
return operation.createOperator();
}
// --------------------------------------------------------------------------------------------
// Union
// --------------------------------------------------------------------------------------------
/**
* Creates a union of this DataSet with another DataSet. The other DataSet must be of the same data type.
*
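* For example (a minimal sketch; the input sets are illustrative):
* <pre>
* {@code
* DataSet<String> first = ...;
* DataSet<String> second = ...;
* DataSet<String> all = first.union(second);
* }
* </pre>
*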
* @param other The other DataSet which is unioned with the current DataSet.
* @return The resulting DataSet.
*/
public UnionOperator<T> union(DataSet<T> other) {
return new UnionOperator<T>(this, other);
}
// --------------------------------------------------------------------------------------------
// Top-K
// --------------------------------------------------------------------------------------------
// --------------------------------------------------------------------------------------------
// Result writing
// --------------------------------------------------------------------------------------------
/**
* Writes a DataSet as a text file to the specified location.
* For each element of the DataSet the result of {@link Object#toString()} is written.
*
* @param filePath The path pointing to the location the text file is written to.
* @return The DataSink that writes the DataSet.
*
* @see TextOutputFormat
*/
public DataSink<T> writeAsText(String filePath) {
return output(new TextOutputFormat<T>(new Path(filePath)));
}
/**
* Writes a DataSet as a text file to the specified location.
* For each element of the DataSet the result of {@link Object#toString()} is written.
*
* @param filePath The path pointing to the location the text file is written to.
* @param writeMode Control the behavior for existing files. Options are NO_OVERWRITE and OVERWRITE.
* @return The DataSink that writes the DataSet.
*
* @see TextOutputFormat
*/
public DataSink<T> writeAsText(String filePath, WriteMode writeMode) {
TextOutputFormat<T> tof = new TextOutputFormat<T>(new Path(filePath));
tof.setWriteMode(writeMode);
return output(tof);
}
/**
* Writes a {@link Tuple} DataSet as a CSV file to the specified location.
* Note: Only a Tuple DataSet can be written as a CSV file.
* For each Tuple field the result of {@link Object#toString()} is written.
* Tuple fields are separated by the default field delimiter {@link CsvOutputFormat#DEFAULT_FIELD_DELIMITER}.
* Tuples are separated by the default line delimiter {@link CsvOutputFormat#DEFAULT_LINE_DELIMITER}.
*
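* For example (a minimal sketch; the input set and the output path are illustrative):
* <pre>
* {@code
* DataSet<Tuple2<String, Integer>> counts = ...;
* counts.writeAsCsv("file:///tmp/counts.csv");
* }
* </pre>
*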
* @param filePath The path pointing to the location the CSV file is written to.
* @return The DataSink that writes the DataSet.
*
* @see Tuple
* @see CsvOutputFormat
*/
public DataSink<T> writeAsCsv(String filePath) {
return writeAsCsv(filePath, CsvOutputFormat.DEFAULT_LINE_DELIMITER, CsvOutputFormat.DEFAULT_FIELD_DELIMITER);
}
/**
* Writes a {@link Tuple} DataSet as a CSV file to the specified location with the specified field and line delimiters.
* Note: Only a Tuple DataSet can be written as a CSV file.
* For each Tuple field the result of {@link Object#toString()} is written.
*
* @param filePath The path pointing to the location the CSV file is written to.
* @param rowDelimiter The row delimiter to separate Tuples.
* @param fieldDelimiter The field delimiter to separate Tuple fields.
* @return The DataSink that writes the DataSet.
*
* @see Tuple
* @see CsvOutputFormat
*/
public DataSink<T> writeAsCsv(String filePath, String rowDelimiter, String fieldDelimiter) {
return internalWriteAsCsv(new Path(filePath), rowDelimiter, fieldDelimiter, null);
}
/**
* Writes a {@link Tuple} DataSet as a CSV file to the specified location with the specified field and line delimiters.
* Note: Only a Tuple DataSet can be written as a CSV file.
* For each Tuple field the result of {@link Object#toString()} is written.
*
* @param filePath The path pointing to the location the CSV file is written to.
* @param rowDelimiter The row delimiter to separate Tuples.
* @param fieldDelimiter The field delimiter to separate Tuple fields.
* @param writeMode Control the behavior for existing files. Options are NO_OVERWRITE and OVERWRITE.
* @return The DataSink that writes the DataSet.
*
* @see Tuple
* @see CsvOutputFormat
*/
public DataSink<T> writeAsCsv(String filePath, String rowDelimiter, String fieldDelimiter, WriteMode writeMode) {
return internalWriteAsCsv(new Path(filePath), rowDelimiter, fieldDelimiter, writeMode);
}
@SuppressWarnings("unchecked")
private <X extends Tuple> DataSink<T> internalWriteAsCsv(Path filePath, String rowDelimiter, String fieldDelimiter, WriteMode wm) {
Validate.isTrue(this.type.isTupleType(), "The writeAsCsv() method can only be used on data sets of tuples.");
CsvOutputFormat<X> of = new CsvOutputFormat<X>(filePath, rowDelimiter, fieldDelimiter);
if (wm != null) {
of.setWriteMode(wm);
}
return output((OutputFormat<T>) of);
}
/**
* Writes a DataSet to the standard output stream (stdout).
* For each element of the DataSet the result of {@link Object#toString()} is written.
*
* @return The DataSink that writes the DataSet.
*/
public DataSink<T> print() {
return output(new PrintingOutputFormat<T>(false));
}
/**
* Writes a DataSet to the standard error stream (stderr).
* For each element of the DataSet the result of {@link Object#toString()} is written.
*
* @return The DataSink that writes the DataSet.
*/
public DataSink<T> printToErr() {
return output(new PrintingOutputFormat<T>(true));
}
/**
* Writes a DataSet using a {@link FileOutputFormat} to a specified location.
* This method adds a data sink to the program.
*
* @param outputFormat The FileOutputFormat to write the DataSet.
* @param filePath The path to the location where the DataSet is written.
* @return The DataSink that writes the DataSet.
*
* @see FileOutputFormat
*/
public DataSink<T> write(FileOutputFormat<T> outputFormat, String filePath) {
Validate.notNull(filePath, "File path must not be null.");
Validate.notNull(outputFormat, "Output format must not be null.");
outputFormat.setOutputFilePath(new Path(filePath));
return output(outputFormat);
}
/**
* Writes a DataSet using a {@link FileOutputFormat} to a specified location.
* This method adds a data sink to the program.
*
* @param outputFormat The FileOutputFormat to write the DataSet.
* @param filePath The path to the location where the DataSet is written.
* @param writeMode The mode of writing, indicating whether to overwrite existing files.
* @return The DataSink that writes the DataSet.
*
* @see FileOutputFormat
*/
public DataSink<T> write(FileOutputFormat<T> outputFormat, String filePath, WriteMode writeMode) {
Validate.notNull(filePath, "File path must not be null.");
Validate.notNull(writeMode, "Write mode must not be null.");
Validate.notNull(outputFormat, "Output format must not be null.");
outputFormat.setOutputFilePath(new Path(filePath));
outputFormat.setWriteMode(writeMode);
return output(outputFormat);
}
/**
* Emits a DataSet using an {@link OutputFormat}. This method adds a data sink to the program.
* Programs may have multiple data sinks. A DataSet may also have multiple consumers (data sinks
* or transformations) at the same time.
*
* @param outputFormat The OutputFormat to process the DataSet.
* @return The DataSink that processes the DataSet.
*
* @see OutputFormat
* @see DataSink
*/
public DataSink<T> output(OutputFormat<T> outputFormat) {
Validate.notNull(outputFormat);
// configure the type if needed
if (outputFormat instanceof InputTypeConfigurable) {
((InputTypeConfigurable) outputFormat).setInputType(this.type);
}
DataSink<T> sink = new DataSink<T>(this, outputFormat, this.type);
this.context.registerDataSink(sink);
return sink;
}
// --------------------------------------------------------------------------------------------
// Utilities
// --------------------------------------------------------------------------------------------
protected static void checkSameExecutionContext(DataSet<?> set1, DataSet<?> set2) {
if (set1.context != set2.context) {
throw new IllegalArgumentException("The two inputs have different execution contexts.");
}
}
}