org.apache.beam.sdk.io.Write Maven / Gradle / Ivy
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.beam.sdk.io;
import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkNotNull;
import com.google.common.collect.Lists;
import java.util.List;
import java.util.UUID;
import java.util.concurrent.ThreadLocalRandom;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.annotations.Experimental;
import org.apache.beam.sdk.coders.Coder;
import org.apache.beam.sdk.coders.SerializableCoder;
import org.apache.beam.sdk.io.Sink.WriteOperation;
import org.apache.beam.sdk.io.Sink.Writer;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.transforms.GroupByKey;
import org.apache.beam.sdk.transforms.PTransform;
import org.apache.beam.sdk.transforms.ParDo;
import org.apache.beam.sdk.transforms.SerializableFunction;
import org.apache.beam.sdk.transforms.View;
import org.apache.beam.sdk.transforms.WithKeys;
import org.apache.beam.sdk.transforms.display.DisplayData;
import org.apache.beam.sdk.transforms.windowing.DefaultTrigger;
import org.apache.beam.sdk.transforms.windowing.GlobalWindows;
import org.apache.beam.sdk.transforms.windowing.Window;
import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.PCollection.IsBounded;
import org.apache.beam.sdk.values.PCollectionView;
import org.apache.beam.sdk.values.PDone;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* A {@link PTransform} that writes to a {@link Sink}. A write begins with a sequential global
* initialization of a sink, followed by a parallel write, and ends with a sequential finalization
* of the write. The output of a write is {@link PDone}.
*
* By default, every bundle in the input {@link PCollection} will be processed by a
* {@link WriteOperation}, so the number of outputs will vary based on runner behavior, though at
* least 1 output will always be produced. The exact parallelism of the write stage can be
* controlled using {@link Write.Bound#withNumShards}, typically used to control how many files are
* produced or to globally limit the number of workers connecting to an external service. However,
* this option can often hurt performance: it adds an additional {@link GroupByKey} to the pipeline.
*
*
{@code Write} re-windows the data into the global window, so it is typically not well suited
* to use in streaming pipelines.
*
*
Example usage with runner-controlled sharding:
*
*
{@code p.apply(Write.to(new MySink(...)));}
*
* Example usage with a fixed number of shards:
*
*
{@code p.apply(Write.to(new MySink(...)).withNumShards(3));}
*/
@Experimental(Experimental.Kind.SOURCE_SINK)
public class Write {
private static final Logger LOG = LoggerFactory.getLogger(Write.class);
/**
* Creates a {@link Write} transform that writes to the given {@link Sink}, letting the runner
* control how many different shards are produced.
*/
public static Bound to(Sink sink) {
checkNotNull(sink, "sink");
return new Bound<>(sink, 0 /* runner-controlled sharding */);
}
/**
* A {@link PTransform} that writes to a {@link Sink}. See the class-level Javadoc for more
* information.
*
* @see Write
* @see Sink
*/
public static class Bound extends PTransform, PDone> {
private final Sink sink;
private int numShards;
private Bound(Sink sink, int numShards) {
this.sink = sink;
this.numShards = numShards;
}
@Override
public PDone expand(PCollection input) {
checkArgument(IsBounded.BOUNDED == input.isBounded(),
"%s can only be applied to a Bounded PCollection",
Write.class.getSimpleName());
PipelineOptions options = input.getPipeline().getOptions();
sink.validate(options);
return createWrite(input, sink.createWriteOperation(options));
}
@Override
public void populateDisplayData(DisplayData.Builder builder) {
super.populateDisplayData(builder);
builder
.add(DisplayData.item("sink", sink.getClass()).withLabel("Write Sink"))
.include("sink", sink)
.addIfNotDefault(
DisplayData.item("numShards", getNumShards()).withLabel("Fixed Number of Shards"),
0);
}
/**
* Returns the number of shards that will be produced in the output.
*
* @see Write for more information
*/
public int getNumShards() {
return numShards;
}
/**
* Returns the {@link Sink} associated with this PTransform.
*/
public Sink getSink() {
return sink;
}
/**
* Returns a new {@link Write.Bound} that will write to the current {@link Sink} using the
* specified number of shards.
*
* This option should be used sparingly as it can hurt performance. See {@link Write} for
* more information.
*
*
A value less than or equal to 0 will be equivalent to the default behavior of
* runner-controlled sharding.
*/
public Bound withNumShards(int numShards) {
return new Bound<>(sink, Math.max(numShards, 0));
}
/**
* Writes all the elements in a bundle using a {@link Writer} produced by the
* {@link WriteOperation} associated with the {@link Sink}.
*/
private class WriteBundles extends DoFn {
// Writer that will write the records in this bundle. Lazily
// initialized in processElement.
private Writer writer = null;
private final PCollectionView> writeOperationView;
WriteBundles(PCollectionView> writeOperationView) {
this.writeOperationView = writeOperationView;
}
@ProcessElement
public void processElement(ProcessContext c) throws Exception {
// Lazily initialize the Writer
if (writer == null) {
WriteOperation writeOperation = c.sideInput(writeOperationView);
LOG.info("Opening writer for write operation {}", writeOperation);
writer = writeOperation.createWriter(c.getPipelineOptions());
writer.open(UUID.randomUUID().toString());
LOG.debug("Done opening writer {} for operation {}", writer, writeOperationView);
}
try {
writer.write(c.element());
} catch (Exception e) {
// Discard write result and close the write.
try {
writer.close();
// The writer does not need to be reset, as this DoFn cannot be reused.
} catch (Exception closeException) {
if (closeException instanceof InterruptedException) {
// Do not silently ignore interrupted state.
Thread.currentThread().interrupt();
}
// Do not mask the exception that caused the write to fail.
e.addSuppressed(closeException);
}
throw e;
}
}
@FinishBundle
public void finishBundle(Context c) throws Exception {
if (writer != null) {
WriteT result = writer.close();
c.output(result);
// Reset state in case of reuse.
writer = null;
}
}
@Override
public void populateDisplayData(DisplayData.Builder builder) {
builder.delegate(Write.Bound.this);
}
}
/**
* Like {@link WriteBundles}, but where the elements for each shard have been collected into
* a single iterable.
*
* @see WriteBundles
*/
private class WriteShardedBundles extends DoFn>, WriteT> {
private final PCollectionView> writeOperationView;
WriteShardedBundles(PCollectionView> writeOperationView) {
this.writeOperationView = writeOperationView;
}
@ProcessElement
public void processElement(ProcessContext c) throws Exception {
// In a sharded write, single input element represents one shard. We can open and close
// the writer in each call to processElement.
WriteOperation writeOperation = c.sideInput(writeOperationView);
LOG.info("Opening writer for write operation {}", writeOperation);
Writer writer = writeOperation.createWriter(c.getPipelineOptions());
writer.open(UUID.randomUUID().toString());
LOG.debug("Done opening writer {} for operation {}", writer, writeOperationView);
try {
for (T t : c.element().getValue()) {
writer.write(t);
}
} catch (Exception e) {
try {
writer.close();
} catch (Exception closeException) {
if (closeException instanceof InterruptedException) {
// Do not silently ignore interrupted state.
Thread.currentThread().interrupt();
}
// Do not mask the exception that caused the write to fail.
e.addSuppressed(closeException);
}
throw e;
}
// Close the writer; if this throws let the error propagate.
WriteT result = writer.close();
c.output(result);
}
@Override
public void populateDisplayData(DisplayData.Builder builder) {
builder.delegate(Write.Bound.this);
}
}
private static class ApplyShardingKey implements SerializableFunction {
private final int numShards;
private int shardNumber;
ApplyShardingKey(int numShards) {
this.numShards = numShards;
shardNumber = -1;
}
@Override
public Integer apply(T input) {
if (shardNumber == -1) {
// We want to desynchronize the first record sharding key for each instance of
// ApplyShardingKey, so records in a small PCollection will be statistically balanced.
shardNumber = ThreadLocalRandom.current().nextInt(numShards);
} else {
shardNumber = (shardNumber + 1) % numShards;
}
return shardNumber;
}
}
/**
* A write is performed as sequence of three {@link ParDo}'s.
*
* In the first, a do-once ParDo is applied to a singleton PCollection containing the Sink's
* {@link WriteOperation}. In this initialization ParDo, {@link WriteOperation#initialize} is
* called. The output of this ParDo is a singleton PCollection
* containing the WriteOperation.
*
*
This singleton collection containing the WriteOperation is then used as a side input to a
* ParDo over the PCollection of elements to write. In this bundle-writing phase,
* {@link WriteOperation#createWriter} is called to obtain a {@link Writer}.
* {@link Writer#open} and {@link Writer#close} are called in {@link DoFn#startBundle} and
* {@link DoFn#finishBundle}, respectively, and {@link Writer#write} method is called for
* every element in the bundle. The output of this ParDo is a PCollection of
* writer result objects (see {@link Sink} for a description of writer results)-one for
* each bundle.
*
*
The final do-once ParDo uses the singleton collection of the WriteOperation as input and
* the collection of writer results as a side-input. In this ParDo,
* {@link WriteOperation#finalize} is called to finalize the write.
*
*
If the write of any element in the PCollection fails, {@link Writer#close} will be called
* before the exception that caused the write to fail is propagated and the write result will be
* discarded.
*
*
Since the {@link WriteOperation} is serialized after the initialization ParDo and
* deserialized in the bundle-writing and finalization phases, any state change to the
* WriteOperation object that occurs during initialization is visible in the latter phases.
* However, the WriteOperation is not serialized after the bundle-writing phase. This is why
* implementations should guarantee that {@link WriteOperation#createWriter} does not mutate
* WriteOperation).
*/
private PDone createWrite(
PCollection input, WriteOperation writeOperation) {
Pipeline p = input.getPipeline();
// A coder to use for the WriteOperation.
@SuppressWarnings("unchecked")
Coder> operationCoder =
(Coder>) SerializableCoder.of(writeOperation.getClass());
// A singleton collection of the WriteOperation, to be used as input to a ParDo to initialize
// the sink.
PCollection> operationCollection =
p.apply(Create.of(writeOperation).withCoder(operationCoder));
// Initialize the resource in a do-once ParDo on the WriteOperation.
operationCollection = operationCollection
.apply("Initialize", ParDo.of(
new DoFn, WriteOperation>() {
@ProcessElement
public void processElement(ProcessContext c) throws Exception {
WriteOperation writeOperation = c.element();
LOG.info("Initializing write operation {}", writeOperation);
writeOperation.initialize(c.getPipelineOptions());
LOG.debug("Done initializing write operation {}", writeOperation);
// The WriteOperation is also the output of this ParDo, so it can have mutable
// state.
c.output(writeOperation);
}
}))
.setCoder(operationCoder);
// Create a view of the WriteOperation to be used as a sideInput to the parallel write phase.
final PCollectionView> writeOperationView =
operationCollection.apply(View.>asSingleton());
// Re-window the data into the global window and remove any existing triggers.
PCollection inputInGlobalWindow =
input.apply(
Window.into(new GlobalWindows())
.triggering(DefaultTrigger.of())
.discardingFiredPanes());
// Perform the per-bundle writes as a ParDo on the input PCollection (with the WriteOperation
// as a side input) and collect the results of the writes in a PCollection.
// There is a dependency between this ParDo and the first (the WriteOperation PCollection
// as a side input), so this will happen after the initial ParDo.
PCollection results;
if (getNumShards() <= 0) {
results = inputInGlobalWindow
.apply("WriteBundles",
ParDo.of(new WriteBundles<>(writeOperationView))
.withSideInputs(writeOperationView));
} else {
results = inputInGlobalWindow
.apply("ApplyShardLabel", WithKeys.of(new ApplyShardingKey(getNumShards())))
.apply("GroupIntoShards", GroupByKey.create())
.apply("WriteShardedBundles",
ParDo.of(new WriteShardedBundles<>(writeOperationView))
.withSideInputs(writeOperationView));
}
results.setCoder(writeOperation.getWriterResultCoder());
final PCollectionView> resultsView =
results.apply(View.asIterable());
// Finalize the write in another do-once ParDo on the singleton collection containing the
// Writer. The results from the per-bundle writes are given as an Iterable side input.
// The WriteOperation's state is the same as after its initialization in the first do-once
// ParDo. There is a dependency between this ParDo and the parallel write (the writer results
// collection as a side input), so it will happen after the parallel write.
operationCollection
.apply("Finalize", ParDo.of(new DoFn, Integer>() {
@ProcessElement
public void processElement(ProcessContext c) throws Exception {
WriteOperation writeOperation = c.element();
LOG.info("Finalizing write operation {}.", writeOperation);
List results = Lists.newArrayList(c.sideInput(resultsView));
LOG.debug("Side input initialized to finalize write operation {}.", writeOperation);
// We must always output at least 1 shard, and honor user-specified numShards if set.
int minShardsNeeded = Math.max(1, getNumShards());
int extraShardsNeeded = minShardsNeeded - results.size();
if (extraShardsNeeded > 0) {
LOG.info(
"Creating {} empty output shards in addition to {} written for a total of {}.",
extraShardsNeeded, results.size(), minShardsNeeded);
for (int i = 0; i < extraShardsNeeded; ++i) {
Writer writer = writeOperation.createWriter(c.getPipelineOptions());
writer.open(UUID.randomUUID().toString());
WriteT emptyWrite = writer.close();
results.add(emptyWrite);
}
LOG.debug("Done creating extra shards.");
}
writeOperation.finalize(results, c.getPipelineOptions());
LOG.debug("Done finalizing write operation {}", writeOperation);
}
}).withSideInputs(resultsView));
return PDone.in(input.getPipeline());
}
}
}