
com.google.cloud.dataflow.sdk.runners.worker.DataflowSideInputReader Maven / Gradle / Ivy
/*
* Copyright (C) 2015 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package com.google.cloud.dataflow.sdk.runners.worker;
import com.google.api.services.dataflow.model.SideInputInfo;
import com.google.cloud.dataflow.sdk.options.PipelineOptions;
import com.google.cloud.dataflow.sdk.transforms.windowing.BoundedWindow;
import com.google.cloud.dataflow.sdk.util.DirectSideInputReader;
import com.google.cloud.dataflow.sdk.util.ExecutionContext;
import com.google.cloud.dataflow.sdk.util.PTuple;
import com.google.cloud.dataflow.sdk.util.SideInputReader;
import com.google.cloud.dataflow.sdk.util.Sized;
import com.google.cloud.dataflow.sdk.util.SizedSideInputReader;
import com.google.cloud.dataflow.sdk.values.PCollectionView;
import com.google.cloud.dataflow.sdk.values.TupleTag;
import com.google.common.base.Preconditions;
import com.google.common.collect.Maps;
import java.util.Map;
import java.util.Observable;
import java.util.Observer;
/**
* A simple side input reader that re-reads a side input from its iterable each time it is
* requested.
*
* Sizes are accurate only for {@link PCollectionView} implementations that read the same
* amount of data for each access.
*/
public class DataflowSideInputReader
extends SizedSideInputReader.Defaults
implements SizedSideInputReader {
/** An observer for each side input to count its size as it is being read. */
private final Map, ByteSizeObserver> observers;
/** A byte count saved as overhead per side input, not cleared when the observer is reset. */
private final Map, Long> overheads;
/** The underlying reader, which does not keep track of sizes. */
private final SideInputReader subReader;
private DataflowSideInputReader(
Iterable extends SideInputInfo> sideInputInfos,
PipelineOptions options,
ExecutionContext executionContext) throws Exception {
// Initializing the values may or may not actually read through the
// source. The full size is the amount read here plus the amount
// read when view.fromIterableInternal() is called.
this.observers = Maps.newHashMap();
this.overheads = Maps.newHashMap();
PTuple sideInputValues = PTuple.empty();
for (SideInputInfo sideInputInfo : sideInputInfos) {
TupleTag