All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.google.cloud.dataflow.sdk.runners.inprocess.KeyedPValueTrackingVisitor Maven / Gradle / Ivy

Go to download

The Google Cloud Dataflow Java SDK provides a simple, Java-based interface for processing data of virtually any size using Google Cloud resources. This artifact includes the entire Dataflow Java SDK.

There is a newer version: 2.5.0
Show newest version
/*
 * Copyright (C) 2016 Google Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */
package com.google.cloud.dataflow.sdk.runners.inprocess;

import static com.google.common.base.Preconditions.checkState;

import com.google.cloud.dataflow.sdk.Pipeline.PipelineVisitor;
import com.google.cloud.dataflow.sdk.runners.TransformTreeNode;
import com.google.cloud.dataflow.sdk.transforms.GroupByKey;
import com.google.cloud.dataflow.sdk.transforms.PTransform;
import com.google.cloud.dataflow.sdk.values.PValue;

import java.util.HashSet;
import java.util.Set;

/**
 * A pipeline visitor that tracks all keyed {@link PValue PValues}. A {@link PValue} is keyed if it
 * is the result of a {@link PTransform} that produces keyed outputs. A {@link PTransform} that
 * produces keyed outputs is assumed to colocate output elements that share a key.
 *
 * 

All {@link GroupByKey} transforms, or their runner-specific implementation primitive, produce * keyed output. */ // TODO: Handle Key-preserving transforms when appropriate and more aggressively make PTransforms // unkeyed class KeyedPValueTrackingVisitor implements PipelineVisitor { @SuppressWarnings("rawtypes") private final Set> producesKeyedOutputs; private final Set keyedValues; private boolean finalized; public static KeyedPValueTrackingVisitor create( @SuppressWarnings("rawtypes") Set> producesKeyedOutputs) { return new KeyedPValueTrackingVisitor(producesKeyedOutputs); } private KeyedPValueTrackingVisitor( @SuppressWarnings("rawtypes") Set> producesKeyedOutputs) { this.producesKeyedOutputs = producesKeyedOutputs; this.keyedValues = new HashSet<>(); } @Override public void enterCompositeTransform(TransformTreeNode node) { checkState( !finalized, "Attempted to use a %s that has already been finalized on a pipeline (visiting node %s)", KeyedPValueTrackingVisitor.class.getSimpleName(), node); } @Override public void leaveCompositeTransform(TransformTreeNode node) { checkState( !finalized, "Attempted to use a %s that has already been finalized on a pipeline (visiting node %s)", KeyedPValueTrackingVisitor.class.getSimpleName(), node); if (node.isRootNode()) { finalized = true; } else if (producesKeyedOutputs.contains(node.getTransform().getClass())) { keyedValues.addAll(node.getExpandedOutputs()); } } @Override public void visitTransform(TransformTreeNode node) {} @Override public void visitValue(PValue value, TransformTreeNode producer) { if (producesKeyedOutputs.contains(producer.getTransform().getClass())) { keyedValues.addAll(value.expand()); } } public Set getKeyedPValues() { checkState( finalized, "can't call getKeyedPValues before a Pipeline has been completely traversed"); return keyedValues; } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy