
org.apache.beam.sdk.io.BoundedReadFromUnboundedSource

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.beam.sdk.io;

import com.google.api.client.util.BackOff;
import com.google.auto.value.AutoValue;
import com.google.common.util.concurrent.Uninterruptibles;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.NoSuchElementException;
import java.util.concurrent.TimeUnit;
import javax.annotation.Nullable;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.annotations.Experimental;
import org.apache.beam.sdk.coders.Coder;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.transforms.Distinct;
import org.apache.beam.sdk.transforms.PTransform;
import org.apache.beam.sdk.transforms.ParDo;
import org.apache.beam.sdk.transforms.SerializableFunction;
import org.apache.beam.sdk.transforms.display.DisplayData;
import org.apache.beam.sdk.util.FluentBackoff;
import org.apache.beam.sdk.util.NameUtils;
import org.apache.beam.sdk.util.ValueWithRecordId;
import org.apache.beam.sdk.values.PBegin;
import org.apache.beam.sdk.values.PCollection;
import org.joda.time.Duration;
import org.joda.time.Instant;


/**
 * {@link PTransform} that reads a bounded amount of data from an {@link UnboundedSource},
 * specified as one or both of a maximum number of elements or a maximum period of time to read.
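 *
 * <p>For example, the following sketch caps an unbounded read at 1000 records or five minutes
 * of reading, whichever is reached first ({@code mySource} here stands in for some concrete
 * {@link UnboundedSource} of strings):
 *
 * <pre>{@code
 * PCollection<String> bounded = pipeline.apply(
 *     Read.from(mySource)
 *         .withMaxNumRecords(1000)
 *         .withMaxReadTime(Duration.standardMinutes(5)));
 * }</pre>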
 */
public class BoundedReadFromUnboundedSource<T> extends PTransform<PBegin, PCollection<T>> {
  private final UnboundedSource<T, ?> source;
  private final long maxNumRecords;
  private final Duration maxReadTime;
  private final BoundedSource<ValueWithRecordId<T>> adaptedSource;
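
  // Backoff policy applied when the wrapped source has no data available: wait 10ms
  // initially and back off exponentially up to 10s between polls (see
  // Reader#advanceWithBackoff below).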
  private static final FluentBackoff BACKOFF_FACTORY =
      FluentBackoff.DEFAULT
          .withInitialBackoff(Duration.millis(10))
          .withMaxBackoff(Duration.standardSeconds(10));

  /**
   * Returns a new {@link BoundedReadFromUnboundedSource} that reads a bounded amount
   * of data from the given {@link UnboundedSource}.  The bound is specified as a number
   * of records to read.
   *
   * <p>This may take a long time to execute if the splits of this source are slow to read
   * records.
   */
  public BoundedReadFromUnboundedSource<T> withMaxNumRecords(long maxNumRecords) {
    return new BoundedReadFromUnboundedSource<T>(source, maxNumRecords, maxReadTime);
  }

  /**
   * Returns a new {@link BoundedReadFromUnboundedSource} that reads a bounded amount
   * of data from the given {@link UnboundedSource}.  The bound is specified as an amount
   * of time to read for.  Each split of the source will read for this much time.
   */
  public BoundedReadFromUnboundedSource<T> withMaxReadTime(Duration maxReadTime) {
    return new BoundedReadFromUnboundedSource<T>(source, maxNumRecords, maxReadTime);
  }

  BoundedReadFromUnboundedSource(
      UnboundedSource<T, ?> source, long maxNumRecords, Duration maxReadTime) {
    this.source = source;
    this.maxNumRecords = maxNumRecords;
    this.maxReadTime = maxReadTime;
    this.adaptedSource =
        new AutoValue_BoundedReadFromUnboundedSource_UnboundedToBoundedSourceAdapter.Builder<T>()
            .setSource(source)
            .setMaxNumRecords(maxNumRecords)
            .setMaxReadTime(maxReadTime)
            .build();
  }

  /**
   * Returns an adapted {@link BoundedSource} wrapping the underlying {@link UnboundedSource},
   * with the specified bounds on number of records and read time.
   */
  @Experimental
  public BoundedSource<ValueWithRecordId<T>> getAdaptedSource() {
    return adaptedSource;
  }

  @Override
  public PCollection<T> expand(PBegin input) {
    PCollection<ValueWithRecordId<T>> read =
        Pipeline.applyTransform(input, Read.from(getAdaptedSource()));
    if (source.requiresDeduping()) {
      // Deduplicate by record id: elements carrying the same id are the same logical record.
      read = read.apply(Distinct.withRepresentativeValueFn(
          new SerializableFunction<ValueWithRecordId<T>, byte[]>() {
            @Override
            public byte[] apply(ValueWithRecordId<T> input) {
              return input.getId();
            }
          }));
    }
    return read.apply("StripIds", ParDo.of(new ValueWithRecordId.StripIdsDoFn<T>()));
  }

  @Override
  protected Coder<T> getDefaultOutputCoder() {
    return source.getDefaultOutputCoder();
  }

  @Override
  public String getKindString() {
    return String.format("Read(%s)", NameUtils.approximateSimpleName(source));
  }

  @Override
  public void populateDisplayData(DisplayData.Builder builder) {
    // We explicitly do not register base-class data, instead we use the delegate inner source.
    builder
        .add(DisplayData.item("source", source.getClass())
          .withLabel("Read Source"))
        .addIfNotDefault(DisplayData.item("maxRecords", maxNumRecords)
          .withLabel("Maximum Read Records"), Long.MAX_VALUE)
        .addIfNotNull(DisplayData.item("maxReadTime", maxReadTime)
          .withLabel("Maximum Read Time"))
        .include("source", source);
  }

  /**
   * Adapter that wraps the underlying {@link UnboundedSource} with the specified bounds on
   * number of records and read time into a {@link BoundedSource}.
   */
  @AutoValue
  abstract static class UnboundedToBoundedSourceAdapter<T>
      extends BoundedSource<ValueWithRecordId<T>> {

    @Nullable abstract UnboundedSource<T, ?> getSource();
    abstract long getMaxNumRecords();
    @Nullable abstract Duration getMaxReadTime();

    abstract Builder<T> toBuilder();

    @AutoValue.Builder
    abstract static class Builder<T> {
      abstract Builder<T> setSource(UnboundedSource<T, ?> source);
      abstract Builder<T> setMaxNumRecords(long maxNumRecords);
      abstract Builder<T> setMaxReadTime(Duration maxReadTime);
      abstract UnboundedToBoundedSourceAdapter<T> build();
    }

    /**
     * Divide the given number of records into {@code numSplits} approximately
     * equal parts that sum to {@code numRecords}.
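     *
     * <p>For example, {@code splitNumRecords(11, 3)} returns {@code [4, 4, 3]}: the base
     * share is {@code 11 / 3 = 3}, and the remainder {@code 11 % 3 = 2} is distributed, one
     * extra record each, to the first two splits.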
     */
    private static long[] splitNumRecords(long numRecords, int numSplits) {
      long[] splitNumRecords = new long[numSplits];
      for (int i = 0; i < numSplits; i++) {
        splitNumRecords[i] = numRecords / numSplits;
      }
      for (int i = 0; i < numRecords % numSplits; i++) {
        splitNumRecords[i] = splitNumRecords[i] + 1;
      }
      return splitNumRecords;
    }

    /**
     * Pick a number of initial splits based on the number of records expected to be processed.
     */
    private static int numInitialSplits(long numRecords) {
      final int maxSplits = 100;
      final long recordsPerSplit = 10000;
      return (int) Math.min(maxSplits, numRecords / recordsPerSplit + 1);
    }

    @Override
    public List<? extends BoundedSource<ValueWithRecordId<T>>> splitIntoBundles(
        long desiredBundleSizeBytes, PipelineOptions options) throws Exception {
      List<UnboundedToBoundedSourceAdapter<T>> result = new ArrayList<>();
      int numInitialSplits = numInitialSplits(getMaxNumRecords());
      List<? extends UnboundedSource<T, ?>> splits =
          getSource().generateInitialSplits(numInitialSplits, options);
      int numSplits = splits.size();
      long[] numRecords = splitNumRecords(getMaxNumRecords(), numSplits);
      for (int i = 0; i < numSplits; i++) {
        result.add(toBuilder()
            .setSource(splits.get(i))
            .setMaxNumRecords(numRecords[i])
            .setMaxReadTime(getMaxReadTime())
            .build());
      }
      return result;
    }

    @Override
    public long getEstimatedSizeBytes(PipelineOptions options) {
      // No way to estimate bytes, so returning 0.
      return 0L;
    }

    @Override
    public Coder<ValueWithRecordId<T>> getDefaultOutputCoder() {
      return ValueWithRecordId.ValueWithRecordIdCoder.of(getSource().getDefaultOutputCoder());
    }

    @Override
    public void validate() {
      getSource().validate();
    }

    @Override
    public BoundedReader<ValueWithRecordId<T>> createReader(PipelineOptions options)
        throws IOException {
      return new Reader(getSource().createReader(options, null));
    }

    @Override
    public void populateDisplayData(DisplayData.Builder builder) {
      builder.delegate(getSource());
    }

    private class Reader extends BoundedReader<ValueWithRecordId<T>> {
      private long recordsRead = 0L;
      @Nullable private Instant endTime;
      private UnboundedSource.UnboundedReader<T> reader;

      private Reader(UnboundedSource.UnboundedReader<T> reader) {
        this.recordsRead = 0L;
        if (getMaxReadTime() != null) {
          this.endTime = Instant.now().plus(getMaxReadTime());
        } else {
          this.endTime = null;
        }
        this.reader = reader;
      }

      @Override
      public boolean start() throws IOException {
        if (getMaxNumRecords() <= 0
            || (getMaxReadTime() != null && getMaxReadTime().getMillis() == 0)) {
          return false;
        }

        recordsRead++;
        if (reader.start()) {
          return true;
        } else {
          return advanceWithBackoff();
        }
      }

      @Override
      public boolean advance() throws IOException {
        if (recordsRead >= getMaxNumRecords()) {
          finalizeCheckpoint();
          return false;
        }
        recordsRead++;
        return advanceWithBackoff();
      }

      private boolean advanceWithBackoff() throws IOException {
        // Try reading from the source with exponential backoff.
        BackOff backoff = BACKOFF_FACTORY.backoff();
        long nextSleep = backoff.nextBackOffMillis();
        while (nextSleep != BackOff.STOP) {
          if (endTime != null && Instant.now().isAfter(endTime)) {
            finalizeCheckpoint();
            return false;
          }
          if (reader.advance()) {
            return true;
          }
          Uninterruptibles.sleepUninterruptibly(nextSleep, TimeUnit.MILLISECONDS);
          nextSleep = backoff.nextBackOffMillis();
        }
        finalizeCheckpoint();
        return false;
      }
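
      // Finalizing the checkpoint signals the unbounded source that everything read so far
      // has been durably committed by this bounded read, so the source may, for example,
      // acknowledge or discard those records upstream.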
      private void finalizeCheckpoint() throws IOException {
        reader.getCheckpointMark().finalizeCheckpoint();
      }

      @Override
      public ValueWithRecordId<T> getCurrent() throws NoSuchElementException {
        return new ValueWithRecordId<>(reader.getCurrent(), reader.getCurrentRecordId());
      }

      @Override
      public Instant getCurrentTimestamp() throws NoSuchElementException {
        return reader.getCurrentTimestamp();
      }

      @Override
      public void close() throws IOException {
        reader.close();
      }

      @Override
      public BoundedSource<ValueWithRecordId<T>> getCurrentSource() {
        return UnboundedToBoundedSourceAdapter.this;
      }
    }
  }
}