/*
* Copyright (C) 2015 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package com.google.cloud.dataflow.sdk.io;
import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkState;
import com.google.cloud.dataflow.sdk.coders.AvroCoder;
import com.google.cloud.dataflow.sdk.coders.Coder;
import com.google.cloud.dataflow.sdk.coders.VoidCoder;
import com.google.cloud.dataflow.sdk.io.Read.Bounded;
import com.google.cloud.dataflow.sdk.options.PipelineOptions;
import com.google.cloud.dataflow.sdk.runners.PipelineRunner;
import com.google.cloud.dataflow.sdk.transforms.PTransform;
import com.google.cloud.dataflow.sdk.transforms.display.DisplayData;
import com.google.cloud.dataflow.sdk.util.IOChannelUtils;
import com.google.cloud.dataflow.sdk.util.MimeTypes;
import com.google.cloud.dataflow.sdk.values.PCollection;
import com.google.cloud.dataflow.sdk.values.PDone;
import com.google.cloud.dataflow.sdk.values.PInput;
import com.google.common.annotations.VisibleForTesting;
import org.apache.avro.Schema;
import org.apache.avro.file.DataFileWriter;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.reflect.ReflectData;
import java.io.IOException;
import java.nio.channels.Channels;
import java.nio.channels.WritableByteChannel;
import java.util.regex.Pattern;
import javax.annotation.Nullable;
/**
* {@link PTransform}s for reading and writing Avro files.
*
* <p>To read a {@link PCollection} from one or more Avro files, use
* {@link AvroIO.Read}, specifying {@link AvroIO.Read#from} to specify
* the path of the file(s) to read from (e.g., a local filename or
* filename pattern if running locally, or a Google Cloud Storage
* filename or filename pattern of the form
* {@code "gs://<bucket>/<filepath>"}), and optionally
* {@link AvroIO.Read#named} to specify the name of the pipeline step.
*
* <p>It is required to specify {@link AvroIO.Read#withSchema}. To
* read specific records, such as Avro-generated classes, provide an
* Avro-generated class type. To read {@link GenericRecord GenericRecords}, provide either
* a {@link Schema} object or an Avro schema in a JSON-encoded string form.
* An exception will be thrown if a record doesn't match the specified
* schema.
*
*
* <p>For example:
* <pre>{@code
* Pipeline p = ...;
*
* // A simple Read of a local file (only runs locally):
* PCollection<AvroAutoGenClass> records =
* p.apply(AvroIO.Read.from("/path/to/file.avro")
* .withSchema(AvroAutoGenClass.class));
*
* // A Read from a GCS file (runs locally and via the Google Cloud
* // Dataflow service):
* Schema schema = new Schema.Parser().parse(new File("schema.avsc"));
* PCollection<GenericRecord> records =
* p.apply(AvroIO.Read.named("ReadFromAvro")
* .from("gs://my_bucket/path/to/records-*.avro")
* .withSchema(schema));
* } </pre>
*
* <p>To write a {@link PCollection} to one or more Avro files, use
* {@link AvroIO.Write}, specifying {@link AvroIO.Write#to} to specify
* the path of the file to write to (e.g., a local filename or sharded
* filename pattern if running locally, or a Google Cloud Storage
* filename or sharded filename pattern of the form
* {@code "gs://<bucket>/<filepath>"}), and optionally
* {@link AvroIO.Write#named} to specify the name of the pipeline step.
*
* <p>It is required to specify {@link AvroIO.Write#withSchema}. To
* write specific records, such as Avro-generated classes, provide an
* Avro-generated class type. To write {@link GenericRecord GenericRecords}, provide either
* a {@link Schema} object or a schema in a JSON-encoded string form.
* An exception will be thrown if a record doesn't match the specified
* schema.
*
*
* <p>For example:
* <pre>{@code
* // A simple Write to a local file (only runs locally):
* PCollection<AvroAutoGenClass> records = ...;
* records.apply(AvroIO.Write.to("/path/to/file.avro")
* .withSchema(AvroAutoGenClass.class));
*
* // A Write to a sharded GCS file (runs locally and via the Google Cloud
* // Dataflow service):
* Schema schema = new Schema.Parser().parse(new File("schema.avsc"));
* PCollection<GenericRecord> records = ...;
* records.apply(AvroIO.Write.named("WriteToAvro")
* .to("gs://my_bucket/path/to/numbers")
* .withSchema(schema)
* .withSuffix(".avro"));
* } </pre>
*
* <h3>Permissions</h3>
* <p>Permission requirements depend on the {@link PipelineRunner} that is used to execute the
* Dataflow job. Please refer to the documentation of corresponding {@link PipelineRunner}s for
* more details.
*/
public class AvroIO {
/**
* A root {@link PTransform} that reads from an Avro file (or multiple Avro
* files matching a pattern) and returns a {@link PCollection} containing
* the decoding of each record.
*/
public static class Read {
/**
* Returns a {@link PTransform} with the given step name.
*/
public static Bound<GenericRecord> named(String name) {
return new Bound<>(GenericRecord.class).named(name);
}
/**
* Returns a {@link PTransform} that reads from the file(s)
* with the given name or pattern. This can be a local filename
* or filename pattern (if running locally), or a Google Cloud
* Storage filename or filename pattern of the form
* {@code "gs://<bucket>/<filepath>"} (if running locally or via
* the Google Cloud Dataflow service). Standard Java Filesystem glob
* patterns ("*", "?", "[..]") are supported.
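*
* <p>For example (with an illustrative bucket and schema):
* <pre>{@code
* PCollection<GenericRecord> records =
*     p.apply(AvroIO.Read
*         .from("gs://my_bucket/logs/part-*.avro")
*         .withSchema(schema));
* } </pre>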
*/
public static Bound<GenericRecord> from(String filepattern) {
return new Bound<>(GenericRecord.class).from(filepattern);
}
/**
* Returns a {@link PTransform} that reads Avro file(s)
* containing records whose type is the specified Avro-generated class.
*
* @param <T> the type of the decoded elements, and the elements
* of the resulting {@link PCollection}
*/
public static <T> Bound<T> withSchema(Class<T> type) {
return new Bound<>(type).withSchema(type);
}
/**
* Returns a {@link PTransform} that reads Avro file(s)
* containing records of the specified schema.
*/
public static Bound<GenericRecord> withSchema(Schema schema) {
return new Bound<>(GenericRecord.class).withSchema(schema);
}
/**
* Returns a {@link PTransform} that reads Avro file(s)
* containing records of the specified schema in a JSON-encoded
* string form.
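*
* <p>For example (a minimal, illustrative schema string):
* <pre>{@code
* AvroIO.Read.Bound<GenericRecord> read = AvroIO.Read.withSchema(
*     "{\"type\": \"record\", \"name\": \"User\", \"fields\": ["
*     + "{\"name\": \"name\", \"type\": \"string\"}]}");
* } </pre>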
*/
public static Bound<GenericRecord> withSchema(String schema) {
return withSchema((new Schema.Parser()).parse(schema));
}
/**
* Returns a {@link PTransform} that reads Avro file(s)
* that has GCS path validation on pipeline creation disabled.
*
* <p>This can be useful in the case where the GCS input location does
* not exist at the pipeline creation time, but is expected to be available
* at execution time.
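*
* <p>For example (illustrative pattern):
* <pre>{@code
* p.apply(AvroIO.Read
*     .from("gs://my_bucket/pending/*.avro")
*     .withSchema(schema)
*     .withoutValidation());
* } </pre>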
*/
public static Bound<GenericRecord> withoutValidation() {
return new Bound<>(GenericRecord.class).withoutValidation();
}
/**
* A {@link PTransform} that reads from an Avro file (or multiple Avro
* files matching a pattern) and returns a bounded {@link PCollection} containing
* the decoding of each record.
*
* @param <T> the type of each of the elements of the resulting
* PCollection
*/
public static class Bound<T> extends PTransform<PInput, PCollection<T>> {
/** The filepattern to read from. */
@Nullable
final String filepattern;
/** The class type of the records. */
final Class<T> type;
/** The schema of the input file. */
@Nullable
final Schema schema;
/** An option to indicate if input validation is desired. Default is true. */
final boolean validate;
Bound(Class<T> type) {
this(null, null, type, null, true);
}
Bound(String name, String filepattern, Class<T> type, Schema schema, boolean validate) {
super(name);
this.filepattern = filepattern;
this.type = type;
this.schema = schema;
this.validate = validate;
}
/**
* Returns a new {@link PTransform} that's like this one but
* with the given step name.
*
* <p>Does not modify this object.
*/
public Bound<T> named(String name) {
return new Bound<>(name, filepattern, type, schema, validate);
}
/**
* Returns a new {@link PTransform} that's like this one but
* that reads from the file(s) with the given name or pattern.
* (See {@link AvroIO.Read#from} for a description of
* filepatterns.)
*
* <p>Does not modify this object.
*/
public Bound<T> from(String filepattern) {
return new Bound<>(name, filepattern, type, schema, validate);
}
/**
* Returns a new {@link PTransform} that's like this one but
* that reads Avro file(s) containing records whose type is the
* specified Avro-generated class.
*
* <p>Does not modify this object.
*
* @param <X> the type of the decoded elements and the elements of
* the resulting PCollection
*/
public <X> Bound<X> withSchema(Class<X> type) {
return new Bound<>(name, filepattern, type, ReflectData.get().getSchema(type), validate);
}
/**
* Returns a new {@link PTransform} that's like this one but
* that reads Avro file(s) containing records of the specified schema.
*
* <p>Does not modify this object.
*/
public Bound<GenericRecord> withSchema(Schema schema) {
return new Bound<>(name, filepattern, GenericRecord.class, schema, validate);
}
/**
* Returns a new {@link PTransform} that's like this one but
* that reads Avro file(s) containing records of the specified schema
* in a JSON-encoded string form.
*
* <p>Does not modify this object.
*/
public Bound<GenericRecord> withSchema(String schema) {
return withSchema((new Schema.Parser()).parse(schema));
}
/**
* Returns a new {@link PTransform} that's like this one but
* that has GCS input path validation on pipeline creation disabled.
*
* <p>Does not modify this object.
*
* <p>This can be useful in the case where the GCS input location does
* not exist at the pipeline creation time, but is expected to be
* available at execution time.
*/
public Bound<T> withoutValidation() {
return new Bound<>(name, filepattern, type, schema, false);
}
@Override
public PCollection<T> apply(PInput input) {
if (filepattern == null) {
throw new IllegalStateException(
"need to set the filepattern of an AvroIO.Read transform");
}
if (schema == null) {
throw new IllegalStateException("need to set the schema of an AvroIO.Read transform");
}
if (validate) {
try {
checkState(
!IOChannelUtils.getFactory(filepattern).match(filepattern).isEmpty(),
"Unable to find any files matching %s",
filepattern);
} catch (IOException e) {
throw new IllegalStateException(
String.format("Failed to validate %s", filepattern), e);
}
}
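// GenericRecord reads use the supplied schema directly; reads of a
// specific Avro-generated class derive the schema from that class.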
@SuppressWarnings("unchecked")
Bounded<T> read =
type == GenericRecord.class
? (Bounded<T>) com.google.cloud.dataflow.sdk.io.Read.from(
AvroSource.from(filepattern).withSchema(schema))
: com.google.cloud.dataflow.sdk.io.Read.from(
AvroSource.from(filepattern).withSchema(type));
PCollection<T> pcol = input.getPipeline().apply("Read", read);
// Honor the default output coder that would have been used by this PTransform.
pcol.setCoder(getDefaultOutputCoder());
return pcol;
}
@Override
public void populateDisplayData(DisplayData.Builder builder) {
super.populateDisplayData(builder);
builder
.addIfNotNull(DisplayData.item("filePattern", filepattern)
.withLabel("Input File Pattern"))
.addIfNotDefault(DisplayData.item("validation", validate)
.withLabel("Validation Enabled"), true);
}
@Override
protected Coder<T> getDefaultOutputCoder() {
return AvroCoder.of(type, schema);
}
public String getFilepattern() {
return filepattern;
}
public Schema getSchema() {
return schema;
}
public boolean needsValidation() {
return validate;
}
}
/** Disallow construction of utility class. */
private Read() {}
}
/////////////////////////////////////////////////////////////////////////////
/**
* A root {@link PTransform} that writes a {@link PCollection} to an Avro file (or
* multiple Avro files matching a sharding pattern).
*/
public static class Write {
/**
* Returns a {@link PTransform} with the given step name.
*/
public static Bound<GenericRecord> named(String name) {
return new Bound<>(GenericRecord.class).named(name);
}
/**
* Returns a {@link PTransform} that writes to the file(s)
* with the given prefix. This can be a local filename
* (if running locally), or a Google Cloud Storage filename of
* the form {@code "gs://<bucket>/<filepath>"}
* (if running locally or via the Google Cloud Dataflow service).
*
* <p>The files written will begin with this prefix, followed by
* a shard identifier (see {@link Bound#withNumShards}), and end
* in a common extension, if given by {@link Bound#withSuffix}.
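*
* <p>For example, with the default shard name template, a write such as
* the following (illustrative bucket) would produce the files
* {@code "gs://my_bucket/numbers-00000-of-00002.avro"} and
* {@code "gs://my_bucket/numbers-00001-of-00002.avro"}:
* <pre>{@code
* records.apply(AvroIO.Write
*     .to("gs://my_bucket/numbers")
*     .withSchema(schema)
*     .withNumShards(2)
*     .withSuffix(".avro"));
* } </pre>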
*/
public static Bound<GenericRecord> to(String prefix) {
return new Bound<>(GenericRecord.class).to(prefix);
}
/**
* Returns a {@link PTransform} that writes to the file(s) with the
* given filename suffix.
*/
public static Bound<GenericRecord> withSuffix(String filenameSuffix) {
return new Bound<>(GenericRecord.class).withSuffix(filenameSuffix);
}
/**
* Returns a {@link PTransform} that uses the provided shard count.
*
* <p>Constraining the number of shards is likely to reduce
* the performance of a pipeline. Setting this value is not recommended
* unless you require a specific number of output files.
*
* @param numShards the number of shards to use, or 0 to let the system
* decide.
*/
public static Bound<GenericRecord> withNumShards(int numShards) {
return new Bound<>(GenericRecord.class).withNumShards(numShards);
}
/**
* Returns a {@link PTransform} that uses the given shard name
* template.
*
* <p>See {@link ShardNameTemplate} for a description of shard templates.
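*
* <p>For example (an illustrative template), {@code "-SSS-of-NNN"} with
* three shards would name the second shard
* {@code "<prefix>-001-of-003<suffix>"}.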
*/
public static Bound<GenericRecord> withShardNameTemplate(String shardTemplate) {
return new Bound<>(GenericRecord.class).withShardNameTemplate(shardTemplate);
}
/**
* Returns a {@link PTransform} that forces a single file as
* output.
*
* <p>Constraining the number of shards is likely to reduce
* the performance of a pipeline. Setting this value is not recommended
* unless you require a specific number of output files.
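*
* <p>For example (illustrative path), the following should produce a
* single file named {@code "/path/to/numbers.avro"}:
* <pre>{@code
* records.apply(AvroIO.Write
*     .to("/path/to/numbers")
*     .withSchema(schema)
*     .withoutSharding()
*     .withSuffix(".avro"));
* } </pre>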
*/
public static Bound<GenericRecord> withoutSharding() {
return new Bound<>(GenericRecord.class).withoutSharding();
}
/**
* Returns a {@link PTransform} that writes Avro file(s)
* containing records whose type is the specified Avro-generated class.
*
* @param <T> the type of the elements of the input PCollection
*/
public static <T> Bound<T> withSchema(Class<T> type) {
return new Bound<>(type).withSchema(type);
}
/**
* Returns a {@link PTransform} that writes Avro file(s)
* containing records of the specified schema.
*/
public static Bound<GenericRecord> withSchema(Schema schema) {
return new Bound<>(GenericRecord.class).withSchema(schema);
}
/**
* Returns a {@link PTransform} that writes Avro file(s)
* containing records of the specified schema in a JSON-encoded
* string form.
*/
public static Bound<GenericRecord> withSchema(String schema) {
return withSchema((new Schema.Parser()).parse(schema));
}
/**
* Returns a {@link PTransform} that writes Avro file(s) that has GCS path validation on
* pipeline creation disabled.
*
* <p>This can be useful in the case where the GCS output location does
* not exist at the pipeline creation time, but is expected to be available
* at execution time.
*/
public static Bound<GenericRecord> withoutValidation() {
return new Bound<>(GenericRecord.class).withoutValidation();
}
/**
* A {@link PTransform} that writes a bounded {@link PCollection} to an Avro file (or
* multiple Avro files matching a sharding pattern).
*
* @param <T> the type of each of the elements of the input PCollection
*/
public static class Bound<T> extends PTransform<PCollection<T>, PDone> {
private static final String DEFAULT_SHARD_TEMPLATE = ShardNameTemplate.INDEX_OF_MAX;
/** The filename to write to. */
@Nullable
final String filenamePrefix;
/** Suffix to use for each filename. */
final String filenameSuffix;
/** Requested number of shards. 0 for automatic. */
final int numShards;
/** Shard template string. */
final String shardTemplate;
/** The class type of the records. */
final Class<T> type;
/** The schema of the output file. */
@Nullable
final Schema schema;
/** An option to indicate if output validation is desired. Default is true. */
final boolean validate;
Bound(Class<T> type) {
this(null, null, "", 0, DEFAULT_SHARD_TEMPLATE, type, null, true);
}
Bound(
String name,
String filenamePrefix,
String filenameSuffix,
int numShards,
String shardTemplate,
Class<T> type,
Schema schema,
boolean validate) {
super(name);
this.filenamePrefix = filenamePrefix;
this.filenameSuffix = filenameSuffix;
this.numShards = numShards;
this.shardTemplate = shardTemplate;
this.type = type;
this.schema = schema;
this.validate = validate;
}
/**
* Returns a new {@link PTransform} that's like this one but
* with the given step name.
*
* <p>Does not modify this object.
*/
public Bound<T> named(String name) {
return new Bound<>(
name, filenamePrefix, filenameSuffix, numShards, shardTemplate, type, schema, validate);
}
/**
* Returns a new {@link PTransform} that's like this one but
* that writes to the file(s) with the given filename prefix.
*
* <p>See {@link AvroIO.Write#to(String)} for more information
* about filenames.
*
* <p>Does not modify this object.
*/
public Bound<T> to(String filenamePrefix) {
validateOutputComponent(filenamePrefix);
return new Bound<>(
name, filenamePrefix, filenameSuffix, numShards, shardTemplate, type, schema, validate);
}
/**
* Returns a new {@link PTransform} that's like this one but
* that writes to the file(s) with the given filename suffix.
*
* <p>See {@link ShardNameTemplate} for a description of shard templates.
*
* <p>Does not modify this object.
*/
public Bound<T> withSuffix(String filenameSuffix) {
validateOutputComponent(filenameSuffix);
return new Bound<>(
name, filenamePrefix, filenameSuffix, numShards, shardTemplate, type, schema, validate);
}
/**
* Returns a new {@link PTransform} that's like this one but
* that uses the provided shard count.
*
* <p>Constraining the number of shards is likely to reduce
* the performance of a pipeline. Setting this value is not recommended
* unless you require a specific number of output files.
*
* <p>Does not modify this object.
*
* @param numShards the number of shards to use, or 0 to let the system
* decide.
* @see ShardNameTemplate
*/
public Bound<T> withNumShards(int numShards) {
checkArgument(numShards >= 0);
return new Bound<>(
name, filenamePrefix, filenameSuffix, numShards, shardTemplate, type, schema, validate);
}
/**
* Returns a new {@link PTransform} that's like this one but
* that uses the given shard name template.
*
* <p>Does not modify this object.
*
* @see ShardNameTemplate
*/
public Bound<T> withShardNameTemplate(String shardTemplate) {
return new Bound<>(
name, filenamePrefix, filenameSuffix, numShards, shardTemplate, type, schema, validate);
}
/**
* Returns a new {@link PTransform} that's like this one but
* that forces a single file as output.
*
* <p>This is a shortcut for
* {@code .withNumShards(1).withShardNameTemplate("")}
*
* <p>Does not modify this object.
*/
public Bound<T> withoutSharding() {
return new Bound<>(name, filenamePrefix, filenameSuffix, 1, "", type, schema, validate);
}
/**
* Returns a new {@link PTransform} that's like this one but
* that writes to Avro file(s) containing records whose type is the
* specified Avro-generated class.
*
* <p>Does not modify this object.
*
* @param <X> the type of the elements of the input PCollection
*/
public <X> Bound<X> withSchema(Class<X> type) {
return new Bound<>(
name,
filenamePrefix,
filenameSuffix,
numShards,
shardTemplate,
type,
ReflectData.get().getSchema(type),
validate);
}
/**
* Returns a new {@link PTransform} that's like this one but
* that writes to Avro file(s) containing records of the specified
* schema.
*
* <p>Does not modify this object.
*/
public Bound<GenericRecord> withSchema(Schema schema) {
return new Bound<>(
name,
filenamePrefix,
filenameSuffix,
numShards,
shardTemplate,
GenericRecord.class,
schema,
validate);
}
/**
* Returns a new {@link PTransform} that's like this one but
* that writes to Avro file(s) containing records of the specified
* schema in a JSON-encoded string form.
*
* <p>Does not modify this object.
*/
public Bound<GenericRecord> withSchema(String schema) {
return withSchema((new Schema.Parser()).parse(schema));
}
/**
* Returns a new {@link PTransform} that's like this one but
* that has GCS output path validation on pipeline creation disabled.
*
* <p>Does not modify this object.
*
* <p>This can be useful in the case where the GCS output location does
* not exist at the pipeline creation time, but is expected to be
* available at execution time.
*/
public Bound<T> withoutValidation() {
return new Bound<>(
name, filenamePrefix, filenameSuffix, numShards, shardTemplate, type, schema, false);
}
@Override
public PDone apply(PCollection<T> input) {
if (filenamePrefix == null) {
throw new IllegalStateException(
"need to set the filename prefix of an AvroIO.Write transform");
}
if (schema == null) {
throw new IllegalStateException("need to set the schema of an AvroIO.Write transform");
}
com.google.cloud.dataflow.sdk.io.Write.Bound<T> write =
com.google.cloud.dataflow.sdk.io.Write.to(
new AvroSink<>(
filenamePrefix, filenameSuffix, shardTemplate, AvroCoder.of(type, schema)));
if (getNumShards() > 0) {
write = write.withNumShards(getNumShards());
}
return input.apply("Write", write);
}
@Override
public void populateDisplayData(DisplayData.Builder builder) {
super.populateDisplayData(builder);
builder
.add(DisplayData.item("schema", type)
.withLabel("Record Schema"))
.addIfNotNull(DisplayData.item("filePrefix", filenamePrefix)
.withLabel("Output File Prefix"))
.addIfNotDefault(DisplayData.item("shardNameTemplate", shardTemplate)
.withLabel("Output Shard Name Template"),
DEFAULT_SHARD_TEMPLATE)
.addIfNotDefault(DisplayData.item("fileSuffix", filenameSuffix)
.withLabel("Output File Suffix"),
"")
.addIfNotDefault(DisplayData.item("numShards", numShards)
.withLabel("Maximum Output Shards"),
0)
.addIfNotDefault(DisplayData.item("validation", validate)
.withLabel("Validation Enabled"),
true);
}
/**
* Returns the current shard name template string.
*/
public String getShardNameTemplate() {
return shardTemplate;
}
@Override
protected Coder<Void> getDefaultOutputCoder() {
return VoidCoder.of();
}
public String getFilenamePrefix() {
return filenamePrefix;
}
public String getShardTemplate() {
return shardTemplate;
}
public int getNumShards() {
return numShards;
}
public String getFilenameSuffix() {
return filenameSuffix;
}
public Class<T> getType() {
return type;
}
public Schema getSchema() {
return schema;
}
public boolean needsValidation() {
return validate;
}
}
/** Disallow construction of utility class. */
private Write() {}
}
// Pattern which matches old-style shard output patterns, which are now
// disallowed.
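// For example, prefixes or suffixes containing "records@5" or "records@*"
// are rejected; use withNumShards to control sharding instead.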
private static final Pattern SHARD_OUTPUT_PATTERN = Pattern.compile("@([0-9]+|\\*)");
private static void validateOutputComponent(String partialFilePattern) {
checkArgument(
!SHARD_OUTPUT_PATTERN.matcher(partialFilePattern).find(),
"Output name components are not allowed to contain @* or @N patterns: "
+ partialFilePattern);
}
/////////////////////////////////////////////////////////////////////////////
/** Disallow construction of utility class. */
private AvroIO() {}
/**
* A {@link FileBasedSink} for Avro files.
*/
@VisibleForTesting
static class AvroSink<T> extends FileBasedSink<T> {
private final AvroCoder<T> coder;
@VisibleForTesting
AvroSink(
String baseOutputFilename, String extension, String fileNameTemplate, AvroCoder<T> coder) {
super(baseOutputFilename, extension, fileNameTemplate);
this.coder = coder;
}
@Override
public FileBasedSink.FileBasedWriteOperation<T> createWriteOperation(PipelineOptions options) {
return new AvroWriteOperation<>(this, coder);
}
/**
* A {@link com.google.cloud.dataflow.sdk.io.FileBasedSink.FileBasedWriteOperation
* FileBasedWriteOperation} for Avro files.
*/
private static class AvroWriteOperation<T> extends FileBasedWriteOperation<T> {
private final AvroCoder<T> coder;
private AvroWriteOperation(AvroSink<T> sink, AvroCoder<T> coder) {
super(sink);
this.coder = coder;
}
@Override
public FileBasedWriter<T> createWriter(PipelineOptions options) throws Exception {
return new AvroWriter<>(this, coder);
}
}
/**
* A {@link com.google.cloud.dataflow.sdk.io.FileBasedSink.FileBasedWriter FileBasedWriter}
* for Avro files.
*/
private static class AvroWriter<T> extends FileBasedWriter<T> {
private final AvroCoder<T> coder;
private DataFileWriter<T> dataFileWriter;
public AvroWriter(FileBasedWriteOperation<T> writeOperation, AvroCoder<T> coder) {
super(writeOperation);
this.mimeType = MimeTypes.BINARY;
this.coder = coder;
}
@SuppressWarnings("deprecation") // uses internal test functionality.
@Override
protected void prepareWrite(WritableByteChannel channel) throws Exception {
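// The DataFileWriter writes the Avro object container format: create()
// emits the file header (schema and sync marker) before any records are
// appended.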
dataFileWriter = new DataFileWriter<>(coder.createDatumWriter());
dataFileWriter.create(coder.getSchema(), Channels.newOutputStream(channel));
}
@Override
public void write(T value) throws Exception {
dataFileWriter.append(value);
}
@Override
protected void writeFooter() throws Exception {
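// Flushing appends any buffered records to the file as a final data
// block; the Avro container format needs no explicit trailer.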
dataFileWriter.flush();
}
}
}
}