
// com.google.cloud.dataflow.sdk.transforms.join.KeyedPCollectionTuple (Maven / Gradle / Ivy)
/*
* Copyright (C) 2015 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package com.google.cloud.dataflow.sdk.transforms.join;
import com.google.cloud.dataflow.sdk.Pipeline;
import com.google.cloud.dataflow.sdk.coders.Coder;
import com.google.cloud.dataflow.sdk.coders.KvCoder;
import com.google.cloud.dataflow.sdk.transforms.PTransform;
import com.google.cloud.dataflow.sdk.values.KV;
import com.google.cloud.dataflow.sdk.values.PCollection;
import com.google.cloud.dataflow.sdk.values.PInput;
import com.google.cloud.dataflow.sdk.values.POutput;
import com.google.cloud.dataflow.sdk.values.PValue;
import com.google.cloud.dataflow.sdk.values.TupleTag;
import com.google.cloud.dataflow.sdk.values.TupleTagList;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
/**
* An immutable tuple of keyed {@link PCollection PCollections}
* with key type K.
* ({@link PCollection PCollections} containing values of type
* {@code KV<K, ?>})
*
* @param <K> the type of key shared by all constituent PCollections
*/
public class KeyedPCollectionTuple implements PInput {
/**
* Returns an empty {@code KeyedPCollectionTuple} on the given pipeline.
*/
public static KeyedPCollectionTuple empty(Pipeline pipeline) {
return new KeyedPCollectionTuple<>(pipeline);
}
/**
* Returns a new {@code KeyedPCollectionTuple} with the given tag and initial
* PCollection.
*/
public static KeyedPCollectionTuple of(
TupleTag tag,
PCollection> pc) {
return new KeyedPCollectionTuple(pc.getPipeline()).and(tag, pc);
}
/**
* Returns a new {@code KeyedPCollectionTuple} that is the same as this,
* appended with the given PCollection.
*/
public KeyedPCollectionTuple and(
TupleTag< V> tag,
PCollection> pc) {
if (pc.getPipeline() != getPipeline()) {
throw new IllegalArgumentException(
"PCollections come from different Pipelines");
}
TaggedKeyedPCollection wrapper =
new TaggedKeyedPCollection<>(tag, pc);
Coder myKeyCoder = keyCoder == null ? getKeyCoder(pc) : keyCoder;
List>
newKeyedCollections =
copyAddLast(
keyedCollections,
wrapper);
return new KeyedPCollectionTuple<>(
getPipeline(),
newKeyedCollections,
schema.getTupleTagList().and(tag),
myKeyCoder);
}
public boolean isEmpty() {
return keyedCollections.isEmpty();
}
/**
* Returns a list of {@link TaggedKeyedPCollection TaggedKeyedPCollections} for the
* {@link PCollection PCollections} contained in this {@link KeyedPCollectionTuple}.
*/
public List> getKeyedCollections() {
return keyedCollections;
}
/**
* Like {@link #apply(String, PTransform)} but defaulting to the name
* provided by the {@link PTransform}.
*/
public OutputT apply(
PTransform, OutputT> transform) {
return Pipeline.applyTransform(this, transform);
}
/**
* Applies the given {@link PTransform} to this input {@code KeyedPCollectionTuple} and returns
* its {@code OutputT}. This uses {@code name} to identify the specific application of
* the transform. This name is used in various places, including the monitoring UI,
* logging, and to stably identify this application node in the job graph.
*/
public OutputT apply(
String name, PTransform, OutputT> transform) {
return Pipeline.applyTransform(name, this, transform);
}
/**
* Expands the component {@link PCollection PCollections}, stripping off
* any tag-specific information.
*/
@Override
public Collection extends PValue> expand() {
List> retval = new ArrayList<>();
for (TaggedKeyedPCollection taggedPCollection : keyedCollections) {
retval.add(taggedPCollection.pCollection);
}
return retval;
}
/**
* Returns the key {@link Coder} for all {@link PCollection PCollections}
* in this {@link KeyedPCollectionTuple}.
*/
public Coder getKeyCoder() {
if (keyCoder == null) {
throw new IllegalStateException("cannot return null keyCoder");
}
return keyCoder;
}
/**
* Returns the {@link CoGbkResultSchema} associated with this
* {@link KeyedPCollectionTuple}.
*/
public CoGbkResultSchema getCoGbkResultSchema() {
return schema;
}
@Override
public Pipeline getPipeline() {
return pipeline;
}
@Override
public void finishSpecifying() {
for (TaggedKeyedPCollection taggedPCollection : keyedCollections) {
taggedPCollection.pCollection.finishSpecifying();
}
}
/////////////////////////////////////////////////////////////////////////////
/**
* A utility class to help ensure coherence of tag and input PCollection
* types.
*/
public static class TaggedKeyedPCollection {
final TupleTag tupleTag;
final PCollection> pCollection;
public TaggedKeyedPCollection(
TupleTag tupleTag,
PCollection> pCollection) {
this.tupleTag = tupleTag;
this.pCollection = pCollection;
}
/**
* Returns the underlying PCollection of this TaggedKeyedPCollection.
*/
public PCollection> getCollection() {
return pCollection;
}
/**
* Returns the TupleTag of this TaggedKeyedPCollection.
*/
public TupleTag getTupleTag() {
return tupleTag;
}
}
/**
* We use a List to properly track the order in which collections are added.
*/
private final List> keyedCollections;
private final Coder keyCoder;
private final CoGbkResultSchema schema;
private final Pipeline pipeline;
KeyedPCollectionTuple(Pipeline pipeline) {
this(pipeline,
new ArrayList>(),
TupleTagList.empty(),
null);
}
KeyedPCollectionTuple(
Pipeline pipeline,
List> keyedCollections,
TupleTagList tupleTagList,
Coder keyCoder) {
this.pipeline = pipeline;
this.keyedCollections = keyedCollections;
this.schema = new CoGbkResultSchema(tupleTagList);
this.keyCoder = keyCoder;
}
private static Coder getKeyCoder(PCollection> pc) {
// Need to run coder inference on this PCollection before inspecting it.
pc.finishSpecifying();
// Assumes that the PCollection uses a KvCoder.
Coder> entryCoder = pc.getCoder();
if (!(entryCoder instanceof KvCoder, ?>)) {
throw new IllegalArgumentException("PCollection does not use a KvCoder");
}
@SuppressWarnings("unchecked")
KvCoder coder = (KvCoder) entryCoder;
return coder.getKeyCoder();
}
private static List> copyAddLast(
List> keyedCollections,
TaggedKeyedPCollection taggedCollection) {
List> retval =
new ArrayList<>(keyedCollections);
retval.add(taggedCollection);
return retval;
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy