All downloads are free. Search and download functionality uses the official Maven repository.

com.google.cloud.dataflow.sdk.io.BoundedReadFromUnboundedSource Maven / Gradle / Ivy

/*
 * Copyright (C) 2015 Google Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package com.google.cloud.dataflow.sdk.io;

import static com.google.cloud.dataflow.sdk.util.StringUtils.approximateSimpleName;

import com.google.api.client.util.BackOff;
import com.google.cloud.dataflow.sdk.Pipeline;
import com.google.cloud.dataflow.sdk.coders.Coder;
import com.google.cloud.dataflow.sdk.options.PipelineOptions;
import com.google.cloud.dataflow.sdk.transforms.PTransform;
import com.google.cloud.dataflow.sdk.transforms.RemoveDuplicates;
import com.google.cloud.dataflow.sdk.transforms.SerializableFunction;
import com.google.cloud.dataflow.sdk.util.IntervalBoundedExponentialBackOff;
import com.google.cloud.dataflow.sdk.util.ValueWithRecordId;
import com.google.cloud.dataflow.sdk.values.PCollection;
import com.google.cloud.dataflow.sdk.values.PInput;

import org.joda.time.Duration;
import org.joda.time.Instant;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.NoSuchElementException;


/**
 * {@link PTransform} that reads a bounded amount of data from an {@link UnboundedSource},
 * specified as one or both of a maximum number of elements or a maximum period of time to read.
 *
 * 

Created by {@link Read}. */ class BoundedReadFromUnboundedSource extends PTransform> { private static final long serialVersionUID = 0L; private final UnboundedSource source; private final long maxNumRecords; private final Duration maxReadTime; /** * Returns a new {@link BoundedReadFromUnboundedSource} that reads a bounded amount * of data from the given {@link UnboundedSource}. The bound is specified as a number * of records to read. * *

This may take a long time to execute if the splits of this source are slow to read * records. */ public BoundedReadFromUnboundedSource withMaxNumRecords(long maxNumRecords) { return new BoundedReadFromUnboundedSource(source, maxNumRecords, maxReadTime); } /** * Returns a new {@link BoundedReadFromUnboundedSource} that reads a bounded amount * of data from the given {@link UnboundedSource}. The bound is specified as an amount * of time to read for. Each split of the source will read for this much time. */ public BoundedReadFromUnboundedSource withMaxReadTime(Duration maxReadTime) { return new BoundedReadFromUnboundedSource(source, maxNumRecords, maxReadTime); } BoundedReadFromUnboundedSource( UnboundedSource source, long maxNumRecords, Duration maxReadTime) { this.source = source; this.maxNumRecords = maxNumRecords; this.maxReadTime = maxReadTime; } @Override public PCollection apply(PInput input) { PCollection> read = Pipeline.applyTransform(input, Read.from(new UnboundedToBoundedSourceAdapter<>(source, maxNumRecords, maxReadTime))); if (source.requiresDeduping()) { read = read.apply(RemoveDuplicates.withRepresentativeValueFn( new SerializableFunction, byte[]>() { private static final long serialVersionUID = 0L; @Override public byte[] apply(ValueWithRecordId input) { return input.getId(); } })); } return read.apply(ValueWithRecordId.stripIds()); } @Override protected Coder getDefaultOutputCoder() { return source.getDefaultOutputCoder(); } @Override public String getKindString() { return "Read(" + approximateSimpleName(source.getClass()) + ")"; } private static class UnboundedToBoundedSourceAdapter extends BoundedSource> { private static final long serialVersionUID = 0L; private final UnboundedSource source; private final long maxNumRecords; private final Duration maxReadTime; private UnboundedToBoundedSourceAdapter( UnboundedSource source, long maxNumRecords, Duration maxReadTime) { this.source = source; this.maxNumRecords = maxNumRecords; this.maxReadTime = 
maxReadTime; } /** * Divide the given number of records into {@code numSplits} approximately * equal parts that sum to {@code numRecords}. */ private static long[] splitNumRecords(long numRecords, int numSplits) { long[] splitNumRecords = new long[numSplits]; for (int i = 0; i < numSplits; i++) { splitNumRecords[i] = numRecords / numSplits; } for (int i = 0; i < numRecords % numSplits; i++) { splitNumRecords[i] = splitNumRecords[i] + 1; } return splitNumRecords; } /** * Pick a number of initial splits based on the number of records expected to be processed. */ private static int numInitialSplits(long numRecords) { final int maxSplits = 100; final long recordsPerSplit = 10000; return (int) Math.min(maxSplits, numRecords / recordsPerSplit + 1); } @Override public List>> splitIntoBundles( long desiredBundleSizeBytes, PipelineOptions options) throws Exception { List> result = new ArrayList<>(); int numInitialSplits = numInitialSplits(maxNumRecords); List> splits = source.generateInitialSplits(numInitialSplits, options); int numSplits = splits.size(); long[] numRecords = splitNumRecords(maxNumRecords, numSplits); for (int i = 0; i < numSplits; i++) { result.add( new UnboundedToBoundedSourceAdapter(splits.get(i), numRecords[i], maxReadTime)); } return result; } @Override public long getEstimatedSizeBytes(PipelineOptions options) { // No way to estimate bytes, so returning 0. 
return 0L; } @Override public boolean producesSortedKeys(PipelineOptions options) { return false; } @Override public Coder> getDefaultOutputCoder() { return ValueWithRecordId.ValueWithRecordIdCoder.of(source.getDefaultOutputCoder()); } @Override public void validate() { source.validate(); } @Override public BoundedReader> createReader(PipelineOptions options) { return new Reader(source.createReader(options, null)); } private class Reader extends BoundedReader> { private long recordsRead = 0L; private Instant endTime = Instant.now().plus(maxReadTime); private UnboundedSource.UnboundedReader reader; private Reader(UnboundedSource.UnboundedReader reader) { this.recordsRead = 0L; if (maxReadTime != null) { this.endTime = Instant.now().plus(maxReadTime); } else { this.endTime = null; } this.reader = reader; } @Override public boolean start() throws IOException { if (maxNumRecords <= 0 || (maxReadTime != null && maxReadTime.getMillis() == 0)) { return false; } recordsRead++; if (reader.start()) { return true; } else { return advanceWithBackoff(); } } @Override public boolean advance() throws IOException { if (recordsRead >= maxNumRecords) { finalizeCheckpoint(); return false; } recordsRead++; return advanceWithBackoff(); } private boolean advanceWithBackoff() throws IOException { // Try reading from the source with exponential backoff BackOff backoff = new IntervalBoundedExponentialBackOff(10000, 10); long nextSleep = backoff.nextBackOffMillis(); while (nextSleep != BackOff.STOP) { if (endTime != null && Instant.now().isAfter(endTime)) { finalizeCheckpoint(); return false; } if (reader.advance()) { return true; } try { Thread.sleep(nextSleep); } catch (InterruptedException e) {} nextSleep = backoff.nextBackOffMillis(); } finalizeCheckpoint(); return false; } private void finalizeCheckpoint() throws IOException { reader.getCheckpointMark().finalizeCheckpoint(); } @Override public ValueWithRecordId getCurrent() throws NoSuchElementException { return new 
ValueWithRecordId<>(reader.getCurrent(), reader.getCurrentRecordId()); } @Override public Instant getCurrentTimestamp() throws NoSuchElementException { return reader.getCurrentTimestamp(); } @Override public void close() {} @Override public BoundedSource> getCurrentSource() { return UnboundedToBoundedSourceAdapter.this; } } } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy