/*
 * Copyright (C) 2015 Google Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package com.google.cloud.dataflow.sdk.io;

import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkState;

import com.google.cloud.dataflow.sdk.coders.AvroCoder;
import com.google.cloud.dataflow.sdk.coders.Coder;
import com.google.cloud.dataflow.sdk.coders.VoidCoder;
import com.google.cloud.dataflow.sdk.io.Read.Bounded;
import com.google.cloud.dataflow.sdk.options.PipelineOptions;
import com.google.cloud.dataflow.sdk.runners.PipelineRunner;
import com.google.cloud.dataflow.sdk.transforms.PTransform;
import com.google.cloud.dataflow.sdk.transforms.display.DisplayData;
import com.google.cloud.dataflow.sdk.util.IOChannelUtils;
import com.google.cloud.dataflow.sdk.util.MimeTypes;
import com.google.cloud.dataflow.sdk.values.PCollection;
import com.google.cloud.dataflow.sdk.values.PDone;
import com.google.cloud.dataflow.sdk.values.PInput;
import com.google.common.annotations.VisibleForTesting;

import org.apache.avro.Schema;
import org.apache.avro.file.DataFileWriter;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.reflect.ReflectData;

import java.io.IOException;
import java.nio.channels.Channels;
import java.nio.channels.WritableByteChannel;
import java.util.regex.Pattern;

import javax.annotation.Nullable;

/**
 * {@link PTransform}s for reading and writing Avro files.
 *
 * <p>To read a {@link PCollection} from one or more Avro files, use
 * {@link AvroIO.Read}, specifying {@link AvroIO.Read#from} to specify
 * the path of the file(s) to read from (e.g., a local filename or
 * filename pattern if running locally, or a Google Cloud Storage
 * filename or filename pattern of the form
 * {@code "gs://<bucket>/<filepath>"}), and optionally
 * {@link AvroIO.Read#named} to specify the name of the pipeline step.
 *
 * <p>It is required to specify {@link AvroIO.Read#withSchema}. To
 * read specific records, such as Avro-generated classes, provide an
 * Avro-generated class type. To read {@link GenericRecord GenericRecords}, provide either
 * a {@link Schema} object or an Avro schema in a JSON-encoded string form.
 * An exception will be thrown if a record doesn't match the specified
 * schema.
 *
 * <p>For example:
 * <pre> {@code
 * Pipeline p = ...;
 *
 * // A simple Read of a local file (only runs locally):
 * PCollection<AvroAutoGenClass> records =
 *     p.apply(AvroIO.Read.from("/path/to/file.avro")
 *                        .withSchema(AvroAutoGenClass.class));
 *
 * // A Read from a GCS file (runs locally and via the Google Cloud
 * // Dataflow service):
 * Schema schema = new Schema.Parser().parse(new File("schema.avsc"));
 * PCollection<GenericRecord> records =
 *     p.apply(AvroIO.Read.named("ReadFromAvro")
 *                        .from("gs://my_bucket/path/to/records-*.avro")
 *                        .withSchema(schema));
 * } </pre>
 *
 * <p>To write a {@link PCollection} to one or more Avro files, use
 * {@link AvroIO.Write}, specifying {@link AvroIO.Write#to} to specify
 * the path of the file to write to (e.g., a local filename or sharded
 * filename pattern if running locally, or a Google Cloud Storage
 * filename or sharded filename pattern of the form
 * {@code "gs://<bucket>/<filepath>"}), and optionally
 * {@link AvroIO.Write#named} to specify the name of the pipeline step.
 *
 * <p>It is required to specify {@link AvroIO.Write#withSchema}. To
 * write specific records, such as Avro-generated classes, provide an
 * Avro-generated class type. To write {@link GenericRecord GenericRecords}, provide either
 * a {@link Schema} object or a schema in a JSON-encoded string form.
 * An exception will be thrown if a record doesn't match the specified
 * schema.
 *
 * <p>For example:
 * <pre> {@code
 * // A simple Write to a local file (only runs locally):
 * PCollection<AvroAutoGenClass> records = ...;
 * records.apply(AvroIO.Write.to("/path/to/file.avro")
 *                           .withSchema(AvroAutoGenClass.class));
 *
 * // A Write to a sharded GCS file (runs locally and via the Google Cloud
 * // Dataflow service):
 * Schema schema = new Schema.Parser().parse(new File("schema.avsc"));
 * PCollection<GenericRecord> records = ...;
 * records.apply(AvroIO.Write.named("WriteToAvro")
 *                           .to("gs://my_bucket/path/to/numbers")
 *                           .withSchema(schema)
 *                           .withSuffix(".avro"));
 * } </pre>
 *
 * <h3>Permissions</h3>
 *
 * <p>Permission requirements depend on the {@link PipelineRunner} that is used to execute the
 * Dataflow job. Please refer to the documentation of corresponding {@link PipelineRunner}s for
 * more details.
 */
public class AvroIO {
  /**
   * A root {@link PTransform} that reads from an Avro file (or multiple Avro
   * files matching a pattern) and returns a {@link PCollection} containing
   * the decoding of each record.
   */
  public static class Read {
    /**
     * Returns a {@link PTransform} with the given step name.
     */
    public static Bound<GenericRecord> named(String name) {
      return new Bound<>(GenericRecord.class).named(name);
    }

    /**
     * Returns a {@link PTransform} that reads from the file(s)
     * with the given name or pattern. This can be a local filename
     * or filename pattern (if running locally), or a Google Cloud
     * Storage filename or filename pattern of the form
     * {@code "gs://<bucket>/<filepath>"} (if running locally or via
     * the Google Cloud Dataflow service). Standard Java Filesystem
     * glob patterns ("*", "?", "[..]") are supported.
     */
    public static Bound<GenericRecord> from(String filepattern) {
      return new Bound<>(GenericRecord.class).from(filepattern);
    }

    /**
     * Returns a {@link PTransform} that reads Avro file(s)
     * containing records whose type is the specified Avro-generated class.
     *
     * @param <T> the type of the decoded elements, and the elements
     * of the resulting {@link PCollection}
     */
    public static <T> Bound<T> withSchema(Class<T> type) {
      return new Bound<>(type).withSchema(type);
    }

    /**
     * Returns a {@link PTransform} that reads Avro file(s)
     * containing records of the specified schema.
     */
    public static Bound<GenericRecord> withSchema(Schema schema) {
      return new Bound<>(GenericRecord.class).withSchema(schema);
    }

    /**
     * Returns a {@link PTransform} that reads Avro file(s)
     * containing records of the specified schema in a JSON-encoded
     * string form.
     */
    public static Bound<GenericRecord> withSchema(String schema) {
      return withSchema((new Schema.Parser()).parse(schema));
    }

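    // Illustrative only, not part of the original source: the three schema
    // forms accepted by withSchema. The User class and "user.avsc" file are
    // hypothetical.
    //
    //   // Avro-generated class: elements are typed as User.
    //   PCollection<User> a = p.apply(AvroIO.Read.from(path).withSchema(User.class));
    //   // Schema object or its JSON string form: elements are GenericRecords.
    //   Schema s = new Schema.Parser().parse(new File("user.avsc"));
    //   PCollection<GenericRecord> b = p.apply(AvroIO.Read.from(path).withSchema(s));
    //   PCollection<GenericRecord> c = p.apply(AvroIO.Read.from(path).withSchema(s.toString()));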
    /**
     * Returns a {@link PTransform} that reads Avro file(s)
     * that has GCS path validation on pipeline creation disabled.
     *
     * <p>This can be useful in the case where the GCS input location does
     * not exist at the pipeline creation time, but is expected to be available
     * at execution time.
     */
    public static Bound<GenericRecord> withoutValidation() {
      return new Bound<>(GenericRecord.class).withoutValidation();
    }

    /**
     * A {@link PTransform} that reads from an Avro file (or multiple Avro
     * files matching a pattern) and returns a bounded {@link PCollection} containing
     * the decoding of each record.
     *
     * @param <T> the type of each of the elements of the resulting
     * PCollection
     */
    public static class Bound<T> extends PTransform<PInput, PCollection<T>> {
      /** The filepattern to read from. */
      @Nullable
      final String filepattern;
      /** The class type of the records. */
      final Class<T> type;
      /** The schema of the input file. */
      @Nullable
      final Schema schema;
      /** An option to indicate if input validation is desired. Default is true. */
      final boolean validate;

      Bound(Class<T> type) {
        this(null, null, type, null, true);
      }

      Bound(String name, String filepattern, Class<T> type, Schema schema, boolean validate) {
        super(name);
        this.filepattern = filepattern;
        this.type = type;
        this.schema = schema;
        this.validate = validate;
      }

      /**
       * Returns a new {@link PTransform} that's like this one but
       * with the given step name.
       *
       * <p>Does not modify this object.
       */
      public Bound<T> named(String name) {
        return new Bound<>(name, filepattern, type, schema, validate);
      }

      /**
       * Returns a new {@link PTransform} that's like this one but
       * that reads from the file(s) with the given name or pattern.
       * (See {@link AvroIO.Read#from} for a description of
       * filepatterns.)
       *
       * <p>Does not modify this object.
       */
      public Bound<T> from(String filepattern) {
        return new Bound<>(name, filepattern, type, schema, validate);
      }

      /**
       * Returns a new {@link PTransform} that's like this one but
       * that reads Avro file(s) containing records whose type is the
       * specified Avro-generated class.
       *
       * <p>Does not modify this object.
       *
       * @param <X> the type of the decoded elements and the elements of
       * the resulting PCollection
       */
      public <X> Bound<X> withSchema(Class<X> type) {
        return new Bound<>(name, filepattern, type, ReflectData.get().getSchema(type), validate);
      }

      /**
       * Returns a new {@link PTransform} that's like this one but
       * that reads Avro file(s) containing records of the specified schema.
       *
       * <p>Does not modify this object.
       */
      public Bound<GenericRecord> withSchema(Schema schema) {
        return new Bound<>(name, filepattern, GenericRecord.class, schema, validate);
      }

      /**
       * Returns a new {@link PTransform} that's like this one but
       * that reads Avro file(s) containing records of the specified schema
       * in a JSON-encoded string form.
       *
       * <p>Does not modify this object.
       */
      public Bound<GenericRecord> withSchema(String schema) {
        return withSchema((new Schema.Parser()).parse(schema));
      }

      /**
       * Returns a new {@link PTransform} that's like this one but
       * that has GCS input path validation on pipeline creation disabled.
       *
       * <p>Does not modify this object.
       *
       * <p>This can be useful in the case where the GCS input location does
       * not exist at the pipeline creation time, but is expected to be
       * available at execution time.
       */
      public Bound<T> withoutValidation() {
        return new Bound<>(name, filepattern, type, schema, false);
      }

      @Override
      public PCollection<T> apply(PInput input) {
        if (filepattern == null) {
          throw new IllegalStateException(
              "need to set the filepattern of an AvroIO.Read transform");
        }
        if (schema == null) {
          throw new IllegalStateException("need to set the schema of an AvroIO.Read transform");
        }
        if (validate) {
          try {
            checkState(
                !IOChannelUtils.getFactory(filepattern).match(filepattern).isEmpty(),
                "Unable to find any files matching %s",
                filepattern);
          } catch (IOException e) {
            throw new IllegalStateException(
                String.format("Failed to validate %s", filepattern), e);
          }
        }

        @SuppressWarnings("unchecked")
        Bounded<T> read =
            type == GenericRecord.class
                ? (Bounded<T>) com.google.cloud.dataflow.sdk.io.Read.from(
                    AvroSource.from(filepattern).withSchema(schema))
                : com.google.cloud.dataflow.sdk.io.Read.from(
                    AvroSource.from(filepattern).withSchema(type));

        PCollection<T> pcol = input.getPipeline().apply("Read", read);
        // Honor the default output coder that would have been used by this PTransform.
        pcol.setCoder(getDefaultOutputCoder());
        return pcol;
      }

      @Override
      public void populateDisplayData(DisplayData.Builder builder) {
        super.populateDisplayData(builder);
        builder
            .addIfNotNull(DisplayData.item("filePattern", filepattern)
                .withLabel("Input File Pattern"))
            .addIfNotDefault(DisplayData.item("validation", validate)
                .withLabel("Validation Enabled"), true);
      }

      @Override
      protected Coder<T> getDefaultOutputCoder() {
        return AvroCoder.of(type, schema);
      }

      public String getFilepattern() {
        return filepattern;
      }

      public Schema getSchema() {
        return schema;
      }

      public boolean needsValidation() {
        return validate;
      }
    }

    /** Disallow construction of utility class. */
    private Read() {}
  }

  /////////////////////////////////////////////////////////////////////////////

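  // A usage sketch, illustrative only and not part of the original source:
  // reading a filepattern that may not exist until execution time, so GCS
  // path validation at pipeline creation is disabled. The path below is
  // hypothetical.
  //
  //   PCollection<GenericRecord> records =
  //       p.apply(AvroIO.Read.named("ReadWhenAvailable")
  //                          .from("gs://my_bucket/output/records-*.avro")
  //                          .withSchema(schema)
  //                          .withoutValidation());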
  /**
   * A root {@link PTransform} that writes a {@link PCollection} to an Avro file (or
   * multiple Avro files matching a sharding pattern).
   */
  public static class Write {
    /**
     * Returns a {@link PTransform} with the given step name.
     */
    public static Bound<GenericRecord> named(String name) {
      return new Bound<>(GenericRecord.class).named(name);
    }

    /**
     * Returns a {@link PTransform} that writes to the file(s)
     * with the given prefix. This can be a local filename
     * (if running locally), or a Google Cloud Storage filename of
     * the form {@code "gs://<bucket>/<filepath>"}
     * (if running locally or via the Google Cloud Dataflow service).
     *
     * <p>The files written will begin with this prefix, followed by
     * a shard identifier (see {@link Bound#withNumShards}), and end
     * in a common extension, if given by {@link Bound#withSuffix}.
     */
    public static Bound<GenericRecord> to(String prefix) {
      return new Bound<>(GenericRecord.class).to(prefix);
    }

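    // Illustrative only, not part of the original source: with the default
    // shard name template (ShardNameTemplate.INDEX_OF_MAX), a write such as
    //
    //   records.apply(AvroIO.Write.to("gs://my_bucket/path/to/numbers")
    //                             .withSchema(schema)
    //                             .withSuffix(".avro"));
    //
    // is expected to produce files named like
    // "gs://my_bucket/path/to/numbers-00000-of-00005.avro":
    // prefix, then shard identifier, then suffix.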
    /**
     * Returns a {@link PTransform} that writes to the file(s) with the
     * given filename suffix.
     */
    public static Bound<GenericRecord> withSuffix(String filenameSuffix) {
      return new Bound<>(GenericRecord.class).withSuffix(filenameSuffix);
    }

    /**
     * Returns a {@link PTransform} that uses the provided shard count.
     *
     * <p>Constraining the number of shards is likely to reduce
     * the performance of a pipeline. Setting this value is not recommended
     * unless you require a specific number of output files.
     *
     * @param numShards the number of shards to use, or 0 to let the system
     * decide.
     */
    public static Bound<GenericRecord> withNumShards(int numShards) {
      return new Bound<>(GenericRecord.class).withNumShards(numShards);
    }

    /**
     * Returns a {@link PTransform} that uses the given shard name
     * template.
     *
     * <p>See {@link ShardNameTemplate} for a description of shard templates.
     */
    public static Bound<GenericRecord> withShardNameTemplate(String shardTemplate) {
      return new Bound<>(GenericRecord.class).withShardNameTemplate(shardTemplate);
    }

    /**
     * Returns a {@link PTransform} that forces a single file as
     * output.
     *
     * <p>Constraining the number of shards is likely to reduce
     * the performance of a pipeline. Setting this value is not recommended
     * unless you require a specific number of output files.
     */
    public static Bound<GenericRecord> withoutSharding() {
      return new Bound<>(GenericRecord.class).withoutSharding();
    }

    /**
     * Returns a {@link PTransform} that writes Avro file(s)
     * containing records whose type is the specified Avro-generated class.
     *
     * @param <T> the type of the elements of the input PCollection
     */
    public static <T> Bound<T> withSchema(Class<T> type) {
      return new Bound<>(type).withSchema(type);
    }

    /**
     * Returns a {@link PTransform} that writes Avro file(s)
     * containing records of the specified schema.
     */
    public static Bound<GenericRecord> withSchema(Schema schema) {
      return new Bound<>(GenericRecord.class).withSchema(schema);
    }

    /**
     * Returns a {@link PTransform} that writes Avro file(s)
     * containing records of the specified schema in a JSON-encoded
     * string form.
     */
    public static Bound<GenericRecord> withSchema(String schema) {
      return withSchema((new Schema.Parser()).parse(schema));
    }

    /**
     * Returns a {@link PTransform} that writes Avro file(s) that has GCS path validation on
     * pipeline creation disabled.
     *
     * <p>This can be useful in the case where the GCS output location does
     * not exist at the pipeline creation time, but is expected to be available
     * at execution time.
     */
    public static Bound<GenericRecord> withoutValidation() {
      return new Bound<>(GenericRecord.class).withoutValidation();
    }

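    // A hedged sketch, not part of the original source: forcing a single
    // output file, which is convenient for small results but constrains
    // parallelism (see the performance note on withNumShards above). The
    // prefix below is hypothetical.
    //
    //   records.apply(AvroIO.Write.to("/tmp/results")
    //                             .withSchema(schema)
    //                             .withSuffix(".avro")
    //                             .withoutSharding());  // same as withNumShards(1).withShardNameTemplate("")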
    /**
     * A {@link PTransform} that writes a bounded {@link PCollection} to an Avro file (or
     * multiple Avro files matching a sharding pattern).
     *
     * @param <T> the type of each of the elements of the input PCollection
     */
    public static class Bound<T> extends PTransform<PCollection<T>, PDone> {
      private static final String DEFAULT_SHARD_TEMPLATE = ShardNameTemplate.INDEX_OF_MAX;

      /** The filename to write to. */
      @Nullable
      final String filenamePrefix;
      /** Suffix to use for each filename. */
      final String filenameSuffix;
      /** Requested number of shards. 0 for automatic. */
      final int numShards;
      /** Shard template string. */
      final String shardTemplate;
      /** The class type of the records. */
      final Class<T> type;
      /** The schema of the output file. */
      @Nullable
      final Schema schema;
      /** An option to indicate if output validation is desired. Default is true. */
      final boolean validate;

      Bound(Class<T> type) {
        this(null, null, "", 0, DEFAULT_SHARD_TEMPLATE, type, null, true);
      }

      Bound(
          String name,
          String filenamePrefix,
          String filenameSuffix,
          int numShards,
          String shardTemplate,
          Class<T> type,
          Schema schema,
          boolean validate) {
        super(name);
        this.filenamePrefix = filenamePrefix;
        this.filenameSuffix = filenameSuffix;
        this.numShards = numShards;
        this.shardTemplate = shardTemplate;
        this.type = type;
        this.schema = schema;
        this.validate = validate;
      }

      /**
       * Returns a new {@link PTransform} that's like this one but
       * with the given step name.
       *
       * <p>Does not modify this object.
       */
      public Bound<T> named(String name) {
        return new Bound<>(
            name, filenamePrefix, filenameSuffix, numShards, shardTemplate, type, schema,
            validate);
      }

      /**
       * Returns a new {@link PTransform} that's like this one but
       * that writes to the file(s) with the given filename prefix.
       *
       * <p>See {@link AvroIO.Write#to(String)} for more information
       * about filenames.
       *
       * <p>Does not modify this object.
       */
      public Bound<T> to(String filenamePrefix) {
        validateOutputComponent(filenamePrefix);
        return new Bound<>(
            name, filenamePrefix, filenameSuffix, numShards, shardTemplate, type, schema,
            validate);
      }

      /**
       * Returns a new {@link PTransform} that's like this one but
       * that writes to the file(s) with the given filename suffix.
       *
       * <p>See {@link ShardNameTemplate} for a description of shard templates.
       *
       * <p>Does not modify this object.
       */
      public Bound<T> withSuffix(String filenameSuffix) {
        validateOutputComponent(filenameSuffix);
        return new Bound<>(
            name, filenamePrefix, filenameSuffix, numShards, shardTemplate, type, schema,
            validate);
      }

      /**
       * Returns a new {@link PTransform} that's like this one but
       * that uses the provided shard count.
       *
       * <p>Constraining the number of shards is likely to reduce
       * the performance of a pipeline. Setting this value is not recommended
       * unless you require a specific number of output files.
       *
       * <p>Does not modify this object.
       *
       * @param numShards the number of shards to use, or 0 to let the system
       * decide.
       * @see ShardNameTemplate
       */
      public Bound<T> withNumShards(int numShards) {
        checkArgument(numShards >= 0);
        return new Bound<>(
            name, filenamePrefix, filenameSuffix, numShards, shardTemplate, type, schema,
            validate);
      }

      /**
       * Returns a new {@link PTransform} that's like this one but
       * that uses the given shard name template.
       *
       * <p>Does not modify this object.
       *
       * @see ShardNameTemplate
       */
      public Bound<T> withShardNameTemplate(String shardTemplate) {
        return new Bound<>(
            name, filenamePrefix, filenameSuffix, numShards, shardTemplate, type, schema,
            validate);
      }

      /**
       * Returns a new {@link PTransform} that's like this one but
       * that forces a single file as output.
       *
       * <p>This is a shortcut for
       * {@code .withNumShards(1).withShardNameTemplate("")}
       *
       * <p>Does not modify this object.
       */
      public Bound<T> withoutSharding() {
        return new Bound<>(name, filenamePrefix, filenameSuffix, 1, "", type, schema, validate);
      }

      /**
       * Returns a new {@link PTransform} that's like this one but
       * that writes to Avro file(s) containing records whose type is the
       * specified Avro-generated class.
       *
       * <p>Does not modify this object.
       *
       * @param <X> the type of the elements of the input PCollection
       */
      public <X> Bound<X> withSchema(Class<X> type) {
        return new Bound<>(
            name, filenamePrefix, filenameSuffix, numShards, shardTemplate, type,
            ReflectData.get().getSchema(type), validate);
      }

      /**
       * Returns a new {@link PTransform} that's like this one but
       * that writes to Avro file(s) containing records of the specified
       * schema.
       *
       * <p>Does not modify this object.
       */
      public Bound<GenericRecord> withSchema(Schema schema) {
        return new Bound<>(
            name, filenamePrefix, filenameSuffix, numShards, shardTemplate, GenericRecord.class,
            schema, validate);
      }

      /**
       * Returns a new {@link PTransform} that's like this one but
       * that writes to Avro file(s) containing records of the specified
       * schema in a JSON-encoded string form.
       *
       * <p>Does not modify this object.
       */
      public Bound<GenericRecord> withSchema(String schema) {
        return withSchema((new Schema.Parser()).parse(schema));
      }

      /**
       * Returns a new {@link PTransform} that's like this one but
       * that has GCS output path validation on pipeline creation disabled.
       *
       * <p>Does not modify this object.
       *
       * <p>This can be useful in the case where the GCS output location does
       * not exist at the pipeline creation time, but is expected to be
       * available at execution time.
       */
      public Bound<T> withoutValidation() {
        return new Bound<>(
            name, filenamePrefix, filenameSuffix, numShards, shardTemplate, type, schema, false);
      }

      @Override
      public PDone apply(PCollection<T> input) {
        if (filenamePrefix == null) {
          throw new IllegalStateException(
              "need to set the filename prefix of an AvroIO.Write transform");
        }
        if (schema == null) {
          throw new IllegalStateException("need to set the schema of an AvroIO.Write transform");
        }

        com.google.cloud.dataflow.sdk.io.Write.Bound<T> write =
            com.google.cloud.dataflow.sdk.io.Write.to(
                new AvroSink<>(
                    filenamePrefix, filenameSuffix, shardTemplate, AvroCoder.of(type, schema)));
        if (getNumShards() > 0) {
          write = write.withNumShards(getNumShards());
        }
        return input.apply("Write", write);
      }

      @Override
      public void populateDisplayData(DisplayData.Builder builder) {
        super.populateDisplayData(builder);
        builder
            .add(DisplayData.item("schema", type)
                .withLabel("Record Schema"))
            .addIfNotNull(DisplayData.item("filePrefix", filenamePrefix)
                .withLabel("Output File Prefix"))
            .addIfNotDefault(DisplayData.item("shardNameTemplate", shardTemplate)
                .withLabel("Output Shard Name Template"),
                DEFAULT_SHARD_TEMPLATE)
            .addIfNotDefault(DisplayData.item("fileSuffix", filenameSuffix)
                .withLabel("Output File Suffix"), "")
            .addIfNotDefault(DisplayData.item("numShards", numShards)
                .withLabel("Maximum Output Shards"), 0)
            .addIfNotDefault(DisplayData.item("validation", validate)
                .withLabel("Validation Enabled"), true);
      }

      /**
       * Returns the current shard name template string.
       */
      public String getShardNameTemplate() {
        return shardTemplate;
      }

      @Override
      protected Coder<Void> getDefaultOutputCoder() {
        return VoidCoder.of();
      }

      public String getFilenamePrefix() {
        return filenamePrefix;
      }

      public String getShardTemplate() {
        return shardTemplate;
      }

      public int getNumShards() {
        return numShards;
      }

      public String getFilenameSuffix() {
        return filenameSuffix;
      }

      public Class<T> getType() {
        return type;
      }

      public Schema getSchema() {
        return schema;
      }

      public boolean needsValidation() {
        return validate;
      }
    }

    /** Disallow construction of utility class. */
    private Write() {}
  }

  // Pattern which matches old-style shard output patterns, which are now
  // disallowed.
  private static final Pattern SHARD_OUTPUT_PATTERN = Pattern.compile("@([0-9]+|\\*)");

  private static void validateOutputComponent(String partialFilePattern) {
    checkArgument(
        !SHARD_OUTPUT_PATTERN.matcher(partialFilePattern).find(),
        "Output name components are not allowed to contain @* or @N patterns: "
            + partialFilePattern);
  }

  /////////////////////////////////////////////////////////////////////////////

  /** Disallow construction of utility class. */
  private AvroIO() {}

  /**
   * A {@link FileBasedSink} for Avro files.
   */
  @VisibleForTesting
  static class AvroSink<T> extends FileBasedSink<T> {
    private final AvroCoder<T> coder;

    @VisibleForTesting
    AvroSink(
        String baseOutputFilename, String extension, String fileNameTemplate, AvroCoder<T> coder) {
      super(baseOutputFilename, extension, fileNameTemplate);
      this.coder = coder;
    }

    @Override
    public FileBasedSink.FileBasedWriteOperation<T> createWriteOperation(PipelineOptions options) {
      return new AvroWriteOperation<>(this, coder);
    }

    /**
     * A {@link com.google.cloud.dataflow.sdk.io.FileBasedSink.FileBasedWriteOperation
     * FileBasedWriteOperation} for Avro files.
     */
    private static class AvroWriteOperation<T> extends FileBasedWriteOperation<T> {
      private final AvroCoder<T> coder;

      private AvroWriteOperation(AvroSink<T> sink, AvroCoder<T> coder) {
        super(sink);
        this.coder = coder;
      }

      @Override
      public FileBasedWriter<T> createWriter(PipelineOptions options) throws Exception {
        return new AvroWriter<>(this, coder);
      }
    }

    /**
     * A {@link com.google.cloud.dataflow.sdk.io.FileBasedSink.FileBasedWriter FileBasedWriter}
     * for Avro files.
     */
    private static class AvroWriter<T> extends FileBasedWriter<T> {
      private final AvroCoder<T> coder;
      private DataFileWriter<T> dataFileWriter;

      public AvroWriter(FileBasedWriteOperation<T> writeOperation, AvroCoder<T> coder) {
        super(writeOperation);
        this.mimeType = MimeTypes.BINARY;
        this.coder = coder;
      }

      @SuppressWarnings("deprecation") // uses internal test functionality.
      @Override
      protected void prepareWrite(WritableByteChannel channel) throws Exception {
        dataFileWriter = new DataFileWriter<>(coder.createDatumWriter());
        dataFileWriter.create(coder.getSchema(), Channels.newOutputStream(channel));
      }

      @Override
      public void write(T value) throws Exception {
        dataFileWriter.append(value);
      }

      @Override
      protected void writeFooter() throws Exception {
        dataFileWriter.flush();
      }
    }
  }
}
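// For reference, an illustrative sketch (not part of the original source) of
// the raw Avro flow that AvroWriter wraps: DataFileWriter writes the schema
// header on create(), appends records, and finalizes the file when closed.
// The record class, output path, and record variable below are hypothetical.
//
//   Schema schema = ReflectData.get().getSchema(AvroAutoGenClass.class);
//   DatumWriter<AvroAutoGenClass> datumWriter = new ReflectDatumWriter<>(schema);
//   try (DataFileWriter<AvroAutoGenClass> fileWriter = new DataFileWriter<>(datumWriter)) {
//     fileWriter.create(schema, new FileOutputStream("/tmp/records.avro"));
//     fileWriter.append(record);
//   } // close() flushes and completes the file, much as AvroWriter#writeFooter flushes.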