All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.google.cloud.dataflow.sdk.transforms.join.CoGroupByKey Maven / Gradle / Ivy

/*
 * Copyright (C) 2015 Google Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package com.google.cloud.dataflow.sdk.transforms.join;

import com.google.cloud.dataflow.sdk.coders.Coder;
import com.google.cloud.dataflow.sdk.coders.KvCoder;
import com.google.cloud.dataflow.sdk.transforms.DoFn;
import com.google.cloud.dataflow.sdk.transforms.Flatten;
import com.google.cloud.dataflow.sdk.transforms.GroupByKey;
import com.google.cloud.dataflow.sdk.transforms.PTransform;
import com.google.cloud.dataflow.sdk.transforms.ParDo;
import com.google.cloud.dataflow.sdk.transforms.join.CoGbkResult.CoGbkResultCoder;
import com.google.cloud.dataflow.sdk.transforms.join.KeyedPCollectionTuple.TaggedKeyedPCollection;
import com.google.cloud.dataflow.sdk.values.KV;
import com.google.cloud.dataflow.sdk.values.PCollection;
import com.google.cloud.dataflow.sdk.values.PCollectionList;

import java.util.ArrayList;
import java.util.List;

/**
 * A {@link PTransform} that performs a {@link CoGroupByKey} on a tuple
 * of tables.  A {@link CoGroupByKey} groups results from all
 * tables by like keys into {@link CoGbkResult}s,
 * from which the results for any specific table can be accessed by the
 * {@link com.google.cloud.dataflow.sdk.values.TupleTag}
 * supplied with the initial table.
 *
 * 

Example of performing a {@link CoGroupByKey} followed by a * {@link ParDo} that consumes * the results: *

 
 * {@literal PCollection>} pt1 = ...;
 * {@literal PCollection>} pt2 = ...;
 *
 * final {@literal TupleTag} t1 = new {@literal TupleTag<>()};
 * final {@literal TupleTag} t2 = new {@literal TupleTag<>()};
 * {@literal PCollection>} coGbkResultCollection =
 *   KeyedPCollectionTuple.of(t1, pt1)
 *                        .and(t2, pt2)
 *                        .apply({@literal CoGroupByKey.create()});
 *
 * {@literal PCollection} finalResultCollection =
 *   coGbkResultCollection.apply(ParDo.of(
 *     new {@literal DoFn, T>()} {
 *       {@literal @}Override
 *       public void processElement(ProcessContext c) {
 *         {@literal KV} e = c.element();
 *         {@literal Iterable} pt1Vals = e.getValue().getAll(t1);
 *         V2 pt2Val = e.getValue().getOnly(t2);
 *          ... Do Something ....
 *         c.output(...some T...);
 *       }
 *     }));
 *  
* * @param the type of the keys in the input and output * {@code PCollection}s */ @SuppressWarnings("serial") public class CoGroupByKey extends PTransform, PCollection>> { /** * Returns a {@code CoGroupByKey} {@code PTransform}. * * @param the type of the keys in the input and output * {@code PCollection}s */ public static CoGroupByKey create() { return new CoGroupByKey<>(); } private CoGroupByKey() { } @Override public PCollection> apply( KeyedPCollectionTuple input) { if (input.isEmpty()) { throw new IllegalArgumentException( "must have at least one input to a KeyedPCollections"); } // First build the union coder. // TODO: Look at better integration of union types with the // schema specified in the input. List> codersList = new ArrayList<>(); for (TaggedKeyedPCollection entry : input.getKeyedCollections()) { codersList.add(getValueCoder(entry.pCollection)); } UnionCoder unionCoder = UnionCoder.of(codersList); Coder keyCoder = input.getKeyCoder(); KvCoder kVCoder = KvCoder.of(keyCoder, unionCoder); PCollectionList> unionTables = PCollectionList.empty(input.getPipeline()); // TODO: Use the schema to order the indices rather than depending // on the fact that the schema ordering is identical to the ordering from // input.getJoinCollections(). int index = -1; for (TaggedKeyedPCollection entry : input.getKeyedCollections()) { index++; PCollection> unionTable = makeUnionTable(index, entry.pCollection, kVCoder); unionTables = unionTables.and(unionTable); } PCollection> flattenedTable = unionTables.apply(Flatten.>pCollections()); PCollection>> groupedTable = flattenedTable.apply(GroupByKey.create()); CoGbkResultSchema tupleTags = input.getCoGbkResultSchema(); PCollection> result = groupedTable.apply( ParDo.of(new ConstructCoGbkResultFn(tupleTags)) .named("ConstructCoGbkResultFn")); result.setCoder(KvCoder.of(keyCoder, CoGbkResultCoder.of(tupleTags, unionCoder))); return result; } ////////////////////////////////////////////////////////////////////////////// /** * Returns the value coder for the given PCollection. Assumes that the value * coder is an instance of {@code KvCoder}. */ private Coder getValueCoder(PCollection> pCollection) { // Assumes that the PCollection uses a KvCoder. Coder entryCoder = pCollection.getCoder(); if (!(entryCoder instanceof KvCoder)) { throw new IllegalArgumentException("PCollection does not use a KvCoder"); } @SuppressWarnings("unchecked") KvCoder coder = (KvCoder) entryCoder; return coder.getValueCoder(); } /** * Returns a UnionTable for the given input PCollection, using the given * union index and the given unionTableEncoder. */ private PCollection> makeUnionTable( final int index, PCollection> pCollection, KvCoder unionTableEncoder) { return pCollection.apply(ParDo.of( new ConstructUnionTableFn(index)).named("MakeUnionTable" + index)) .setCoder(unionTableEncoder); } /** * A DoFn to construct a UnionTable (i.e., a * {@code PCollection>} from a * {@code PCollection>}. */ private static class ConstructUnionTableFn extends DoFn, KV> { private final int index; public ConstructUnionTableFn(int index) { this.index = index; } @Override public void processElement(ProcessContext c) { KV e = c.element(); c.output(KV.of(e.getKey(), new RawUnionValue(index, e.getValue()))); } } /** * A DoFn to construct a CoGbkResult from an input grouped union * table. */ private static class ConstructCoGbkResultFn extends DoFn>, KV> { private final CoGbkResultSchema schema; public ConstructCoGbkResultFn(CoGbkResultSchema schema) { this.schema = schema; } @Override public void processElement(ProcessContext c) { KV> e = c.element(); c.output(KV.of(e.getKey(), new CoGbkResult(schema, e.getValue()))); } } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy