// org.apache.crunch.PTable Maven / Gradle / Ivy
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.crunch;
import java.util.Collection;
import java.util.Map;
import org.apache.crunch.types.PTableType;
import org.apache.crunch.types.PType;
/**
* A sub-interface of {@code PCollection} that represents an immutable,
* distributed multi-map of keys and values.
*
*/
public interface PTable extends PCollection> {
/**
* Returns a {@code PTable} instance that acts as the union of this
* {@code PTable} and the other {@code PTable}.
*
* @param other
* The {@code PTable} to combine with this one
* @return a {@code PTable} that acts as the union of the two tables
*/
PTable union(PTable other);
/**
* Returns a {@code PTable} instance that acts as the union of this
* {@code PTable} and the input {@code PTable}s.
*
* @param others
* The {@code PTable}s to combine with this one
* @return a {@code PTable} that acts as the union of all of the tables
*/
PTable union(PTable... others);
/**
* Performs a grouping operation on the keys of this table.
*
* This overload takes neither a partition count nor {@code GroupingOptions};
* use one of the other {@code groupByKey} overloads to supply them.
*
* @return a {@code PGroupedTable} instance that represents the grouping
*/
PGroupedTable groupByKey();
/**
* Performs a grouping operation on the keys of this table, using the given
* number of partitions.
*
* @param numPartitions
* The number of partitions for the data
* @return a {@code PGroupedTable} instance that represents the grouping
*/
PGroupedTable groupByKey(int numPartitions);
/**
* Performs a grouping operation on the keys of this table, using the
* additional {@code GroupingOptions} to control how the grouping is executed.
*
* @param options
* The {@code GroupingOptions} that control how the grouping is executed
* @return a {@code PGroupedTable} instance that represents the grouping
*/
PGroupedTable groupByKey(GroupingOptions options);
/**
* Writes this {@code PTable} to the given {@code Target}.
*
* @param target
* The {@code Target} to write this table to
* @return a {@code PTable}; presumably this table, so that calls can be
* chained (not stated in this file - confirm)
*/
PTable write(Target target);
/**
* Writes this {@code PTable} to the given {@code Target}, using the
* given {@code Target.WriteMode} to handle existing targets.
*
* @param target
* The {@code Target} to write this table to
* @param writeMode
* The {@code Target.WriteMode} that decides how an existing target is handled
* @return a {@code PTable}; presumably this table, so that calls can be
* chained (not stated in this file - confirm)
*/
PTable write(Target target, Target.WriteMode writeMode);
/**
* Requests caching of this {@code PTable} without supplying any
* {@code CachingOptions}.
*
* NOTE(review): this method is undocumented in this file; the description is
* inferred from its name and should be confirmed against {@code PCollection}.
*
* @return a {@code PTable}
*/
PTable cache();
/**
* Requests caching of this {@code PTable} using the given
* {@code CachingOptions}.
*
* NOTE(review): this method is undocumented in this file; the description is
* inferred from its name and should be confirmed against {@code PCollection}.
*
* @param options
* The {@code CachingOptions} to apply
* @return a {@code PTable}
*/
PTable cache(CachingOptions options);
/**
* Returns the {@code PTableType} of this {@code PTable}.
*
* @return the {@code PTableType} of this table
*/
PTableType getPTableType();
/**
* Returns the {@code PType} of the key.
*
* @return the {@code PType} of this table's keys
*/
PType getKeyType();
/**
* Returns the {@code PType} of the value.
*
* @return the {@code PType} of this table's values
*/
PType getValueType();
/**
* Returns a {@code PTable} that has the same keys as this instance, but
* uses the given function to map the values.
*
* @param mapFn
* The {@code MapFn} used to map the values
* @param ptype
* A {@code PType}; presumably the type of the mapped values (confirm)
* @return a {@code PTable} with this table's keys and the mapped values
*/
PTable mapValues(MapFn mapFn, PType ptype);
/**
* Returns a {@code PTable} that has the same keys as this instance, but
* uses the given function to map the values.
*
* @param name
* A name; presumably an identifier for this processing step, as
* documented for {@code filter(String, FilterFn)} (confirm)
* @param mapFn
* The {@code MapFn} used to map the values
* @param ptype
* A {@code PType}; presumably the type of the mapped values (confirm)
* @return a {@code PTable} with this table's keys and the mapped values
*/
PTable mapValues(String name, MapFn mapFn, PType ptype);
/**
* Returns a {@code PTable} that has the same values as this instance, but
* uses the given function to map the keys.
*
* @param mapFn
* The {@code MapFn} used to map the keys
* @param ptype
* A {@code PType}; presumably the type of the mapped keys (confirm)
* @return a {@code PTable} with the mapped keys and this table's values
*/
PTable mapKeys(MapFn mapFn, PType ptype);
/**
* Returns a {@code PTable} that has the same values as this instance, but
* uses the given function to map the keys.
*
* @param name
* A name; presumably an identifier for this processing step, as
* documented for {@code filter(String, FilterFn)} (confirm)
* @param mapFn
* The {@code MapFn} used to map the keys
* @param ptype
* A {@code PType}; presumably the type of the mapped keys (confirm)
* @return a {@code PTable} with the mapped keys and this table's values
*/
PTable mapKeys(String name, MapFn mapFn, PType ptype);
/**
* Aggregate all of the values with the same key into a single key-value pair
* in the returned PTable.
*/
PTable> collectValues();
/**
* Apply the given filter function to this instance and return the resulting
* {@code PTable}.
*/
PTable filter(FilterFn> filterFn);
/**
* Apply the given filter function to this instance and return the resulting
* {@code PTable}.
*
* @param name
* An identifier for this processing step
* @param filterFn
* The {@code FilterFn} to apply
*/
PTable filter(String name, FilterFn> filterFn);
/**
* Returns a PTable made up of the pairs in this PTable with the largest value
* field.
*
* @param count
* The number of pairs to return
* @return a {@code PTable} made up of the selected pairs
*/
PTable top(int count);
/**
* Returns a PTable made up of the pairs in this PTable with the smallest
* value field.
*
* @param count
* The number of pairs to return
* @return a {@code PTable} made up of the selected pairs
*/
PTable bottom(int count);
/**
* Perform an inner join on this table and the one passed in as an argument on
* their common keys.
*/
PTable> join(PTable other);
/**
* Co-group operation with the given table.
*
* Note: If the given table contains keys that are not present in this PTable, an empty
* PCollection is set for the relationship.
*/
PTable, Collection>> cogroup(PTable other);
/**
* Returns a {@link PCollection} made up of the keys in this PTable.
*
* @return a {@code PCollection} of this table's keys
*/
PCollection keys();
/**
* Returns a {@link PCollection} made up of the values in this PTable.
*
* @return a {@code PCollection} of this table's values
*/
PCollection values();
/**
* Returns a Map made up of the keys and values in this PTable.
*
* Note: The contents of the returned map may not be exactly the same
* as this PTable, as a PTable is a multi-map (i.e. can contain multiple
* values for a single key).
*
* @return a {@code Map} made up of the keys and values in this table
*/
Map materializeToMap();
/**
* Returns a {@link PObject} encapsulating a {@link Map} made up of the keys and values in this
* {@code PTable}.
* Note: The contents of the returned map may not be exactly the same as this PTable,
* as a PTable is a multi-map (i.e. can contain multiple values for a single key).
*
*
* @return The {@code PObject} encapsulating a {@code Map} made up of the keys and values in
* this {@code PTable}.
*/
PObject