All downloads are free. Search and download functionality uses the official Maven repository.

com.google.cloud.dataflow.sdk.transforms.join.KeyedPCollectionTuple Maven / Gradle / Ivy

/*
 * Copyright (C) 2015 Google Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package com.google.cloud.dataflow.sdk.transforms.join;

import com.google.cloud.dataflow.sdk.Pipeline;
import com.google.cloud.dataflow.sdk.coders.Coder;
import com.google.cloud.dataflow.sdk.coders.KvCoder;
import com.google.cloud.dataflow.sdk.transforms.PTransform;
import com.google.cloud.dataflow.sdk.values.KV;
import com.google.cloud.dataflow.sdk.values.PCollection;
import com.google.cloud.dataflow.sdk.values.PInput;
import com.google.cloud.dataflow.sdk.values.POutput;
import com.google.cloud.dataflow.sdk.values.PValue;
import com.google.cloud.dataflow.sdk.values.TupleTag;
import com.google.cloud.dataflow.sdk.values.TupleTagList;

import java.util.ArrayList;
import java.util.Collection;
import java.util.List;

/**
 * An immutable tuple of keyed {@link PCollection PCollections}
 * with key type K.
 * ({@link PCollection PCollections} containing values of type
 * {@code KV})
 *
 * @param  the type of key shared by all constituent PCollections
 */
public class KeyedPCollectionTuple implements PInput {
  /**
   * Returns an empty {@code KeyedPCollectionTuple} on the given pipeline.
   */
  public static  KeyedPCollectionTuple empty(Pipeline pipeline) {
    return new KeyedPCollectionTuple<>(pipeline);
  }

  /**
   * Returns a new {@code KeyedPCollectionTuple} with the given tag and initial
   * PCollection.
   */
  public static  KeyedPCollectionTuple of(
      TupleTag tag,
      PCollection> pc) {
    return new KeyedPCollectionTuple(pc.getPipeline()).and(tag, pc);
  }

  /**
   * Returns a new {@code KeyedPCollectionTuple} that is the same as this,
   * appended with the given PCollection.
   */
  public  KeyedPCollectionTuple and(
      TupleTag< V> tag,
      PCollection> pc) {
    if (pc.getPipeline() != getPipeline()) {
      throw new IllegalArgumentException(
          "PCollections come from different Pipelines");
    }
    TaggedKeyedPCollection wrapper =
        new TaggedKeyedPCollection<>(tag, pc);
    Coder myKeyCoder = keyCoder == null ? getKeyCoder(pc) : keyCoder;
    List>
      newKeyedCollections =
        copyAddLast(
            keyedCollections,
            wrapper);
    return new KeyedPCollectionTuple<>(
        getPipeline(),
        newKeyedCollections,
        schema.getTupleTagList().and(tag),
        myKeyCoder);
  }

  public boolean isEmpty() {
    return keyedCollections.isEmpty();
  }

  /**
   * Returns a list of {@link TaggedKeyedPCollection TaggedKeyedPCollections} for the
   * {@link PCollection PCollections} contained in this {@link KeyedPCollectionTuple}.
   */
  public List> getKeyedCollections() {
    return keyedCollections;
  }

  /**
   * Like {@link #apply(String, PTransform)} but defaulting to the name
   * provided by the {@link PTransform}.
   */
  public  OutputT apply(
      PTransform, OutputT> transform) {
    return Pipeline.applyTransform(this, transform);
  }

  /**
   * Applies the given {@link PTransform} to this input {@code KeyedPCollectionTuple} and returns
   * its {@code OutputT}. This uses {@code name} to identify the specific application of
   * the transform. This name is used in various places, including the monitoring UI,
   * logging, and to stably identify this application node in the job graph.
   */
  public  OutputT apply(
      String name, PTransform, OutputT> transform) {
    return Pipeline.applyTransform(name, this, transform);
  }

  /**
   * Expands the component {@link PCollection PCollections}, stripping off
   * any tag-specific information.
   */
  @Override
  public Collection expand() {
    List> retval = new ArrayList<>();
    for (TaggedKeyedPCollection taggedPCollection : keyedCollections) {
      retval.add(taggedPCollection.pCollection);
    }
    return retval;
  }

  /**
   * Returns the key {@link Coder} for all {@link PCollection PCollections}
   * in this {@link KeyedPCollectionTuple}.
   */
  public Coder getKeyCoder() {
    if (keyCoder == null) {
      throw new IllegalStateException("cannot return null keyCoder");
    }
    return keyCoder;
  }

  /**
   * Returns the {@link CoGbkResultSchema} associated with this
   * {@link KeyedPCollectionTuple}.
   */
  public CoGbkResultSchema getCoGbkResultSchema() {
    return schema;
  }

  @Override
  public Pipeline getPipeline() {
    return pipeline;
  }

  @Override
  public void finishSpecifying() {
    for (TaggedKeyedPCollection taggedPCollection : keyedCollections) {
      taggedPCollection.pCollection.finishSpecifying();
    }
  }

  /////////////////////////////////////////////////////////////////////////////

  /**
   * A utility class to help ensure coherence of tag and input PCollection
   * types.
   */
  public static class TaggedKeyedPCollection {

    final TupleTag tupleTag;
    final PCollection> pCollection;

    public TaggedKeyedPCollection(
        TupleTag tupleTag,
        PCollection> pCollection) {
      this.tupleTag = tupleTag;
      this.pCollection = pCollection;
    }

    /**
     * Returns the underlying PCollection of this TaggedKeyedPCollection.
     */
    public PCollection> getCollection() {
      return pCollection;
    }

    /**
     * Returns the TupleTag of this TaggedKeyedPCollection.
     */
    public TupleTag getTupleTag() {
      return tupleTag;
    }
  }

  /**
   * We use a List to properly track the order in which collections are added.
   */
  private final List> keyedCollections;

  private final Coder keyCoder;

  private final CoGbkResultSchema schema;

  private final Pipeline pipeline;

  KeyedPCollectionTuple(Pipeline pipeline) {
    this(pipeline,
         new ArrayList>(),
         TupleTagList.empty(),
         null);
  }

  KeyedPCollectionTuple(
      Pipeline pipeline,
      List> keyedCollections,
      TupleTagList tupleTagList,
      Coder keyCoder) {
    this.pipeline = pipeline;
    this.keyedCollections = keyedCollections;
    this.schema = new CoGbkResultSchema(tupleTagList);
    this.keyCoder = keyCoder;
  }

  private static  Coder getKeyCoder(PCollection> pc) {
    // Need to run coder inference on this PCollection before inspecting it.
    pc.finishSpecifying();

    // Assumes that the PCollection uses a KvCoder.
    Coder entryCoder = pc.getCoder();
    if (!(entryCoder instanceof KvCoder)) {
      throw new IllegalArgumentException("PCollection does not use a KvCoder");
    }
    @SuppressWarnings("unchecked")
    KvCoder coder = (KvCoder) entryCoder;
    return coder.getKeyCoder();
  }

  private static  List> copyAddLast(
        List> keyedCollections,
        TaggedKeyedPCollection taggedCollection) {
    List> retval =
        new ArrayList<>(keyedCollections);
    retval.add(taggedCollection);
    return retval;
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy