com.google.cloud.dataflow.sdk.transforms.join.CoGbkResult Maven / Gradle / Ivy

Go to download

Show more of this group Show more artifacts with this name
Show all versions of google-cloud-dataflow-java-sdk-all Show documentation

Google Cloud Dataflow Java SDK provides a simple, Java-based interface for processing virtually any size data using Google cloud resources. This artifact includes entire Dataflow Java SDK.

There is a newer version: 2.5.0

Show newest version

/*
 * Copyright (C) 2015 Google Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package com.google.cloud.dataflow.sdk.transforms.join;

import static com.google.cloud.dataflow.sdk.util.Structs.addObject;

import com.google.cloud.dataflow.sdk.coders.Coder;
import com.google.cloud.dataflow.sdk.coders.CoderException;
import com.google.cloud.dataflow.sdk.coders.IterableCoder;
import com.google.cloud.dataflow.sdk.coders.StandardCoder;
import com.google.cloud.dataflow.sdk.util.CloudObject;
import com.google.cloud.dataflow.sdk.util.PropertyNames;
import com.google.cloud.dataflow.sdk.util.common.Reiterator;
import com.google.cloud.dataflow.sdk.values.TupleTag;
import com.google.cloud.dataflow.sdk.values.TupleTagList;
import com.google.common.base.Preconditions;
import com.google.common.collect.Iterators;
import com.google.common.collect.PeekingIterator;

import com.fasterxml.jackson.annotation.JsonCreator;
import com.fasterxml.jackson.annotation.JsonProperty;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.Objects;

/**
 * A row result of a {@link CoGroupByKey}.  This is a tuple of {@link Iterable}s produced for
 * a given key, and these can be accessed in different ways.
 */
public class CoGbkResult {
  /**
   * A map of integer union tags to a list of union objects.
   * Note: the key and the embedded union tag are the same, so it is redundant
   * to store it multiple times, but for now it makes encoding easier.
   */
  private final List> valueMap;

  private final CoGbkResultSchema schema;

  private static final int DEFAULT_IN_MEMORY_ELEMENT_COUNT = 10_000;

  private static final Logger LOG = LoggerFactory.getLogger(CoGbkResult.class);

  /**
   * A row in the {@link PCollection} resulting from a {@link CoGroupByKey} transform.
   * Currently, this row must fit into memory.
   *
   * @param schema the set of tuple tags used to refer to input tables and
   *               result values
   * @param taggedValues the raw results from a group-by-key
   */
  public CoGbkResult(
      CoGbkResultSchema schema,
      Iterable taggedValues) {
    this(schema, taggedValues, DEFAULT_IN_MEMORY_ELEMENT_COUNT);
  }

  @SuppressWarnings("unchecked")
  public CoGbkResult(
      CoGbkResultSchema schema,
      Iterable taggedValues,
      int inMemoryElementCount) {
    this.schema = schema;
    valueMap = new ArrayList<>();
    for (int unionTag = 0; unionTag < schema.size(); unionTag++) {
      valueMap.add(new ArrayList<>());
    }

    // Demultiplex the first imMemoryElementCount tagged union values
    // according to their tag.
    final Iterator taggedIter = taggedValues.iterator();
    int elementCount = 0;
    while (taggedIter.hasNext()) {
      if (elementCount++ >= inMemoryElementCount && taggedIter instanceof Reiterator) {
        // Let the tails be lazy.
        break;
      }
      RawUnionValue value = taggedIter.next();
      // Make sure the given union tag has a corresponding tuple tag in the
      // schema.
      int unionTag = value.getUnionTag();
      if (schema.size() <= unionTag) {
        throw new IllegalStateException("union tag " + unionTag +
            " has no corresponding tuple tag in the result schema");
      }
      List