
// com.google.cloud.dataflow.sdk.transforms.join.CoGroupByKey Maven / Gradle / Ivy
/*
* Copyright (C) 2015 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package com.google.cloud.dataflow.sdk.transforms.join;
import com.google.cloud.dataflow.sdk.coders.Coder;
import com.google.cloud.dataflow.sdk.coders.KvCoder;
import com.google.cloud.dataflow.sdk.transforms.DoFn;
import com.google.cloud.dataflow.sdk.transforms.Flatten;
import com.google.cloud.dataflow.sdk.transforms.GroupByKey;
import com.google.cloud.dataflow.sdk.transforms.PTransform;
import com.google.cloud.dataflow.sdk.transforms.ParDo;
import com.google.cloud.dataflow.sdk.transforms.join.CoGbkResult.CoGbkResultCoder;
import com.google.cloud.dataflow.sdk.transforms.join.KeyedPCollectionTuple.TaggedKeyedPCollection;
import com.google.cloud.dataflow.sdk.values.KV;
import com.google.cloud.dataflow.sdk.values.PCollection;
import com.google.cloud.dataflow.sdk.values.PCollectionList;
import java.util.ArrayList;
import java.util.List;
/**
* A {@link PTransform} that performs a {@link CoGroupByKey} on a tuple
* of tables. A {@link CoGroupByKey} groups results from all
* tables by like keys into {@link CoGbkResult}s,
* from which the results for any specific table can be accessed by the
* {@link com.google.cloud.dataflow.sdk.values.TupleTag}
* supplied with the initial table.
*
* Example of performing a {@link CoGroupByKey} followed by a
* {@link ParDo} that consumes
* the results:
*
* {@literal PCollection>} pt1 = ...;
* {@literal PCollection>} pt2 = ...;
*
* final {@literal TupleTag} t1 = new {@literal TupleTag<>()};
* final {@literal TupleTag} t2 = new {@literal TupleTag<>()};
* {@literal PCollection>} coGbkResultCollection =
* KeyedPCollectionTuple.of(t1, pt1)
* .and(t2, pt2)
* .apply({@literal CoGroupByKey.create()});
*
* {@literal PCollection} finalResultCollection =
* coGbkResultCollection.apply(ParDo.of(
* new {@literal DoFn, T>()} {
* {@literal @}Override
* public void processElement(ProcessContext c) {
* {@literal KV} e = c.element();
* {@literal Iterable} pt1Vals = e.getValue().getAll(t1);
* V2 pt2Val = e.getValue().getOnly(t2);
* ... Do Something ....
* c.output(...some T...);
* }
* }));
*
*
* @param the type of the keys in the input and output
* {@code PCollection}s
*/
@SuppressWarnings("serial")
public class CoGroupByKey extends
PTransform,
PCollection>> {
/**
* Returns a {@code CoGroupByKey} {@code PTransform}.
*
* @param the type of the keys in the input and output
* {@code PCollection}s
*/
public static CoGroupByKey create() {
return new CoGroupByKey<>();
}
private CoGroupByKey() { }
@Override
public PCollection> apply(
KeyedPCollectionTuple input) {
if (input.isEmpty()) {
throw new IllegalArgumentException(
"must have at least one input to a KeyedPCollections");
}
// First build the union coder.
// TODO: Look at better integration of union types with the
// schema specified in the input.
List> codersList = new ArrayList<>();
for (TaggedKeyedPCollection entry : input.getKeyedCollections()) {
codersList.add(getValueCoder(entry.pCollection));
}
UnionCoder unionCoder = UnionCoder.of(codersList);
Coder keyCoder = input.getKeyCoder();
KvCoder kVCoder =
KvCoder.of(keyCoder, unionCoder);
PCollectionList> unionTables =
PCollectionList.empty(input.getPipeline());
// TODO: Use the schema to order the indices rather than depending
// on the fact that the schema ordering is identical to the ordering from
// input.getJoinCollections().
int index = -1;
for (TaggedKeyedPCollection entry : input.getKeyedCollections()) {
index++;
PCollection> unionTable =
makeUnionTable(index, entry.pCollection, kVCoder);
unionTables = unionTables.and(unionTable);
}
PCollection> flattenedTable =
unionTables.apply(Flatten.>pCollections());
PCollection>> groupedTable =
flattenedTable.apply(GroupByKey.create());
CoGbkResultSchema tupleTags = input.getCoGbkResultSchema();
PCollection> result = groupedTable.apply(
ParDo.of(new ConstructCoGbkResultFn(tupleTags))
.named("ConstructCoGbkResultFn"));
result.setCoder(KvCoder.of(keyCoder,
CoGbkResultCoder.of(tupleTags, unionCoder)));
return result;
}
//////////////////////////////////////////////////////////////////////////////
/**
* Returns the value coder for the given PCollection. Assumes that the value
* coder is an instance of {@code KvCoder}.
*/
private Coder getValueCoder(PCollection> pCollection) {
// Assumes that the PCollection uses a KvCoder.
Coder> entryCoder = pCollection.getCoder();
if (!(entryCoder instanceof KvCoder, ?>)) {
throw new IllegalArgumentException("PCollection does not use a KvCoder");
}
@SuppressWarnings("unchecked")
KvCoder coder = (KvCoder) entryCoder;
return coder.getValueCoder();
}
/**
* Returns a UnionTable for the given input PCollection, using the given
* union index and the given unionTableEncoder.
*/
private PCollection> makeUnionTable(
final int index,
PCollection> pCollection,
KvCoder unionTableEncoder) {
return pCollection.apply(ParDo.of(
new ConstructUnionTableFn(index)).named("MakeUnionTable" + index))
.setCoder(unionTableEncoder);
}
/**
* A DoFn to construct a UnionTable (i.e., a
* {@code PCollection>} from a
* {@code PCollection>}.
*/
private static class ConstructUnionTableFn extends
DoFn, KV> {
private final int index;
public ConstructUnionTableFn(int index) {
this.index = index;
}
@Override
public void processElement(ProcessContext c) {
KV e = c.element();
c.output(KV.of(e.getKey(), new RawUnionValue(index, e.getValue())));
}
}
/**
* A DoFn to construct a CoGbkResult from an input grouped union
* table.
*/
private static class ConstructCoGbkResultFn
extends DoFn>,
KV> {
private final CoGbkResultSchema schema;
public ConstructCoGbkResultFn(CoGbkResultSchema schema) {
this.schema = schema;
}
@Override
public void processElement(ProcessContext c) {
KV> e = c.element();
c.output(KV.of(e.getKey(), new CoGbkResult(schema, e.getValue())));
}
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy