
// Source listing: com.google.cloud.dataflow.sdk.io.Read
// (artifact: google-cloud-dataflow-java-sdk-all)
/*
* Copyright (C) 2015 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package com.google.cloud.dataflow.sdk.io;
import static com.google.cloud.dataflow.sdk.util.StringUtils.approximateSimpleName;
import com.google.cloud.dataflow.sdk.coders.Coder;
import com.google.cloud.dataflow.sdk.runners.DirectPipelineRunner;
import com.google.cloud.dataflow.sdk.transforms.PTransform;
import com.google.cloud.dataflow.sdk.util.SerializableUtils;
import com.google.cloud.dataflow.sdk.util.WindowedValue;
import com.google.cloud.dataflow.sdk.util.WindowingStrategy;
import com.google.cloud.dataflow.sdk.values.PCollection;
import com.google.cloud.dataflow.sdk.values.PCollection.IsBounded;
import com.google.cloud.dataflow.sdk.values.PInput;
import org.joda.time.Duration;
import java.util.ArrayList;
import java.util.List;
import javax.annotation.Nullable;
/**
* A {@link PTransform} for reading from a {@link Source}.
*
* <p>Usage example:
*
* <pre>{@code
* Pipeline p = Pipeline.create();
* p.apply(Read.from(new MySource().withFoo("foo").withBar("bar"))
*     .named("foobar"));
* }</pre>
*/
public class Read {
/**
* Returns a new {@code Read} {@code PTransform} builder with the given name.
*/
public static Builder named(String name) {
return new Builder(name);
}
/**
* Returns a new {@code Read.Bounded} {@code PTransform} reading from the given
* {@code BoundedSource}.
*/
public static Bounded from(BoundedSource source) {
return new Bounded<>(null, source);
}
/**
* Returns a new {@code Read.Unbounded} {@code PTransform} reading from the given
* {@code UnboundedSource}.
*/
public static Unbounded from(UnboundedSource source) {
return new Unbounded<>(null, source);
}
/**
* Helper class for building {@code Read} transforms.
*/
public static class Builder {
private final String name;
private Builder(String name) {
this.name = name;
}
/**
* Returns a new {@code Read.Bounded} {@code PTransform} reading from the given
* {@code BoundedSource}.
*/
public Bounded from(BoundedSource source) {
return new Bounded<>(name, source);
}
/**
* Returns a new {@code Read.Unbounded} {@code PTransform} reading from the given
* {@code UnboundedSource}.
*/
public Unbounded from(UnboundedSource source) {
return new Unbounded<>(name, source);
}
}
/**
* {@link PTransform} that reads from a {@link BoundedSource}.
*/
public static class Bounded extends PTransform> {
private final BoundedSource source;
private Bounded(@Nullable String name, BoundedSource source) {
super(name);
this.source = SerializableUtils.ensureSerializable(source);
}
/**
* Returns a new {@code Bounded} {@code PTransform} that's like this one but
* has the given name.
*
* Does not modify this object.
*/
public Bounded named(String name) {
return new Bounded(name, source);
}
@Override
protected Coder getDefaultOutputCoder() {
return source.getDefaultOutputCoder();
}
@Override
public final PCollection apply(PInput input) {
source.validate();
return PCollection.createPrimitiveOutputInternal(input.getPipeline(),
WindowingStrategy.globalDefault(), IsBounded.BOUNDED)
.setCoder(getDefaultOutputCoder());
}
/**
* Returns the {@code BoundedSource} used to create this {@code Read} {@code PTransform}.
*/
public BoundedSource getSource() {
return source;
}
@Override
public String getKindString() {
return "Read(" + approximateSimpleName(source.getClass()) + ")";
}
static {
registerDefaultTransformEvaluator();
}
@SuppressWarnings({"rawtypes", "unchecked"})
private static void registerDefaultTransformEvaluator() {
DirectPipelineRunner.registerDefaultTransformEvaluator(
Bounded.class,
new DirectPipelineRunner.TransformEvaluator() {
@Override
public void evaluate(
Bounded transform, DirectPipelineRunner.EvaluationContext context) {
evaluateReadHelper(transform, context);
}
private void evaluateReadHelper(
Read.Bounded transform, DirectPipelineRunner.EvaluationContext context) {
try {
List> output = new ArrayList<>();
BoundedSource source = transform.getSource();
try (BoundedSource.BoundedReader reader =
source.createReader(context.getPipelineOptions())) {
for (boolean available = reader.start();
available;
available = reader.advance()) {
output.add(
DirectPipelineRunner.ValueWithMetadata.of(
WindowedValue.timestampedValueInGlobalWindow(
reader.getCurrent(), reader.getCurrentTimestamp())));
}
}
context.setPCollectionValuesWithMetadata(context.getOutput(transform), output);
} catch (Exception e) {
throw new RuntimeException(e);
}
}
});
}
}
/**
* {@link PTransform} that reads from a {@link UnboundedSource}.
*/
public static class Unbounded extends PTransform> {
private final UnboundedSource source;
private Unbounded(@Nullable String name, UnboundedSource source) {
super(name);
this.source = SerializableUtils.ensureSerializable(source);
}
/**
* Returns a new {@code Unbounded} {@code PTransform} that's like this one but
* has the given name.
*
* Does not modify this object.
*/
public Unbounded named(String name) {
return new Unbounded(name, source);
}
/**
* Returns a new {@link BoundedReadFromUnboundedSource} that reads a bounded amount
* of data from the given {@link UnboundedSource}. The bound is specified as a number
* of records to read.
*
* This may take a long time to execute if the splits of this source are slow to read
* records.
*/
public BoundedReadFromUnboundedSource withMaxNumRecords(long maxNumRecords) {
return new BoundedReadFromUnboundedSource(source, maxNumRecords, null);
}
/**
* Returns a new {@link BoundedReadFromUnboundedSource} that reads a bounded amount
* of data from the given {@link UnboundedSource}. The bound is specified as an amount
* of time to read for. Each split of the source will read for this much time.
*/
public BoundedReadFromUnboundedSource withMaxReadTime(Duration maxReadTime) {
return new BoundedReadFromUnboundedSource(source, Long.MAX_VALUE, maxReadTime);
}
@Override
protected Coder getDefaultOutputCoder() {
return source.getDefaultOutputCoder();
}
@Override
public final PCollection apply(PInput input) {
source.validate();
return PCollection.createPrimitiveOutputInternal(
input.getPipeline(), WindowingStrategy.globalDefault(), IsBounded.UNBOUNDED);
}
/**
* Returns the {@code UnboundedSource} used to create this {@code Read} {@code PTransform}.
*/
public UnboundedSource getSource() {
return source;
}
@Override
public String getKindString() {
return "Read(" + approximateSimpleName(source.getClass()) + ")";
}
}
}