// org.apache.avro.hadoop.io.AvroSequenceFile (Maven / Gradle / Ivy)
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied. See the License for the specific language governing
* permissions and limitations under the License.
*/
package org.apache.avro.hadoop.io;
import java.io.IOException;
import org.apache.avro.Schema;
import org.apache.avro.mapred.AvroKey;
import org.apache.avro.mapred.AvroValue;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.SequenceFile.CompressionType;
import org.apache.hadoop.io.SequenceFile.Metadata;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.util.Progressable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* A wrapper around a Hadoop {@link org.apache.hadoop.io.SequenceFile} that
* also supports reading and writing Avro data.
*
* <p>The vanilla Hadoop <code>SequenceFile</code> contains a <i>header</i>
* followed by a sequence of <i>records</i>. A <i>record</i> consists of a
* <i>key</i> and a <i>value</i>. The <i>key</i> and <i>value</i> must
* either:</p>
*
*
* <ul>
* <li>implement the <code>Writable</code> interface, or</li>
* <li>be accepted by a <code>Serialization</code> registered with the
* <code>SerializationFactory</code>.</li>
* </ul>
*
*
* <p>Since Avro data are Plain Old Java Objects (e.g., <code>Integer</code>
* for data with schema "int"), they do not implement <code>Writable</code>.
* Furthermore, a {@link org.apache.hadoop.io.serializer.Serialization} implementation
* cannot determine whether an object instance of type
* <code>CharSequence</code> that also implements <code>Writable</code> should
* be serialized using Avro or <code>WritableSerialization</code>.</p>
*
* <p>The solution implemented in <code>AvroSequenceFile</code> is to:</p>
*
*
* <ul>
* <li>wrap Avro key data in an <code>AvroKey</code> object,</li>
* <li>wrap Avro value data in an <code>AvroValue</code> object,</li>
* <li>configure and register <code>AvroSerialization</code> with the
* <code>SerializationFactory</code>, which will accept only objects that are instances
* of either <code>AvroKey</code> or <code>AvroValue</code>, and</li>
* <li>store the Avro key and value schemas in the SequenceFile <i>header</i>.</li>
* </ul>
*
*/
public class AvroSequenceFile {
/** Logger for this class; used for debug output when writer schemas are read from a file. */
private static final Logger LOG = LoggerFactory.getLogger(AvroSequenceFile.class);
/** The SequenceFile.Metadata field for the Avro key writer schema. */
public static final Text METADATA_FIELD_KEY_SCHEMA = new Text("avro.key.schema");
/** The SequenceFile.Metadata field for the Avro value writer schema. */
public static final Text METADATA_FIELD_VALUE_SCHEMA = new Text("avro.value.schema");
/** Constructor disabled: this class only holds static members and nested classes. */
private AvroSequenceFile() {}
/**
 * Creates a writer from a set of options.
 *
 * <p>Since there are different implementations of <code>Writer</code> depending on the
 * compression type, this method delegates to <code>SequenceFile.createWriter</code>,
 * passing along the compression type and codec given in the <code>options</code>.
 * The configuration handed to the delegate has Avro serialization registered, and the
 * metadata carries any Avro writer schemas that were set on the options.</p>
 *
 * @param options The options for the writer.
 * @return A new writer instance.
 * @throws IOException If the writer cannot be created.
 */
public static SequenceFile.Writer createWriter(Writer.Options options) throws IOException {
  // Resolve the leading arguments up front, in the same order the delegate receives
  // them, so a missing required option is reported exactly as before.
  final FileSystem targetFileSystem = options.getFileSystem();
  final Configuration avroAwareConf = options.getConfigurationWithAvroSerialization();
  final Path targetPath = options.getOutputPath();

  return SequenceFile.createWriter(
      targetFileSystem,
      avroAwareConf,
      targetPath,
      options.getKeyClass(),
      options.getValueClass(),
      options.getBufferSizeBytes(),
      options.getReplicationFactor(),
      options.getBlockSizeBytes(),
      options.getCompressionType(),
      options.getCompressionCodec(),
      options.getProgressable(),
      options.getMetadataWithAvroSchemas());
}
/**
* A writer for an uncompressed SequenceFile that supports Avro data.
*/
public static class Writer extends SequenceFile.Writer {
/**
* A helper class to encapsulate the options that can be used to construct a Writer.
*/
public static class Options {
/** The default write buffer size in bytes. */
public static final int DEFAULT_BUFFER_SIZE_BYTES = 4096;
/**
* A magic value representing the default for buffer size, block size, and
* replication factor.
*/
private static final short DEFAULT = -1;
private FileSystem mFileSystem;
private Configuration mConf;
private Path mOutputPath;
private Class> mKeyClass;
private Schema mKeyWriterSchema;
private Class> mValueClass;
private Schema mValueWriterSchema;
private int mBufferSizeBytes;
private short mReplicationFactor;
private long mBlockSizeBytes;
private Progressable mProgressable;
private CompressionType mCompressionType;
private CompressionCodec mCompressionCodec;
private Metadata mMetadata;
/**
* Creates a new Options
instance with default values.
*/
public Options() {
mBufferSizeBytes = DEFAULT;
mReplicationFactor = DEFAULT;
mBlockSizeBytes = DEFAULT;
mCompressionType = CompressionType.NONE;
mMetadata = new Metadata();
}
/**
* Sets the filesystem the SequenceFile should be written to.
*
* @param fileSystem The filesystem.
* @return This options instance.
*/
public Options withFileSystem(FileSystem fileSystem) {
if (null == fileSystem) {
throw new IllegalArgumentException("Filesystem may not be null");
}
mFileSystem = fileSystem;
return this;
}
/**
* Sets the Hadoop configuration.
*
* @param conf The configuration.
* @return This options instance.
*/
public Options withConfiguration(Configuration conf) {
if (null == conf) {
throw new IllegalArgumentException("Configuration may not be null");
}
mConf = conf;
return this;
}
/**
* Sets the output path for the SequenceFile.
*
* @param outputPath The output path.
* @return This options instance.
*/
public Options withOutputPath(Path outputPath) {
if (null == outputPath) {
throw new IllegalArgumentException("Output path may not be null");
}
mOutputPath = outputPath;
return this;
}
/**
* Sets the class of the key records to be written.
*
* If the keys will be Avro data, use {@link
* #withKeySchema(org.apache.avro.Schema)} to specify the writer schema. The key
* class will be automatically set to {@link org.apache.avro.mapred.AvroKey}.
*
* @param keyClass The key class.
* @return This options instance.
*/
public Options withKeyClass(Class> keyClass) {
if (null == keyClass) {
throw new IllegalArgumentException("Key class may not be null");
}
mKeyClass = keyClass;
return this;
}
/**
* Sets the writer schema of the key records when using Avro data.
*
* The key class will automatically be set to {@link
* org.apache.avro.mapred.AvroKey}, so there is no need to call {@link
* #withKeyClass(Class)} when using this method.
*
* @param keyWriterSchema The writer schema for the keys.
* @return This options instance.
*/
public Options withKeySchema(Schema keyWriterSchema) {
if (null == keyWriterSchema) {
throw new IllegalArgumentException("Key schema may not be null");
}
withKeyClass(AvroKey.class);
mKeyWriterSchema = keyWriterSchema;
return this;
}
/**
* Sets the class of the value records to be written.
*
* If the values will be Avro data, use {@link
* #withValueSchema(org.apache.avro.Schema)} to specify the writer schema. The value
* class will be automatically set to {@link org.apache.avro.mapred.AvroValue}.
*
* @param valueClass The value class.
* @return This options instance.
*/
public Options withValueClass(Class> valueClass) {
if (null == valueClass) {
throw new IllegalArgumentException("Value class may not be null");
}
mValueClass = valueClass;
return this;
}
/**
* Sets the writer schema of the value records when using Avro data.
*
* The value class will automatically be set to {@link
* org.apache.avro.mapred.AvroValue}, so there is no need to call {@link
* #withValueClass(Class)} when using this method.
*
* @param valueWriterSchema The writer schema for the values.
* @return This options instance.
*/
public Options withValueSchema(Schema valueWriterSchema) {
if (null == valueWriterSchema) {
throw new IllegalArgumentException("Value schema may not be null");
}
withValueClass(AvroValue.class);
mValueWriterSchema = valueWriterSchema;
return this;
}
/**
* Sets the write buffer size in bytes.
*
* @param bytes The desired buffer size.
* @return This options instance.
*/
public Options withBufferSizeBytes(int bytes) {
if (bytes < 0) {
throw new IllegalArgumentException("Buffer size may not be negative");
}
mBufferSizeBytes = bytes;
return this;
}
/**
* Sets the desired replication factor for the file.
*
* @param replicationFactor The replication factor.
* @return This options instance.
*/
public Options withReplicationFactor(short replicationFactor) {
if (replicationFactor <= 0) {
throw new IllegalArgumentException("Replication factor must be positive");
}
mReplicationFactor = replicationFactor;
return this;
}
/**
* Sets the desired size of the file blocks.
*
* @param bytes The desired block size in bytes.
* @return This options instance.
*/
public Options withBlockSizeBytes(long bytes) {
if (bytes <= 0) {
throw new IllegalArgumentException("Block size must be positive");
}
mBlockSizeBytes = bytes;
return this;
}
/**
* Sets an object to report progress to.
*
* @param progressable A progressable object to track progress.
* @return This options instance.
*/
public Options withProgressable(Progressable progressable) {
mProgressable = progressable;
return this;
}
/**
* Sets the type of compression.
*
* @param compressionType The type of compression for the output file.
* @return This options instance.
*/
public Options withCompressionType(CompressionType compressionType) {
mCompressionType = compressionType;
return this;
}
/**
* Sets the compression codec to use if it is enabled.
*
* @param compressionCodec The compression codec.
* @return This options instance.
*/
public Options withCompressionCodec(CompressionCodec compressionCodec) {
mCompressionCodec = compressionCodec;
return this;
}
/**
* Sets the metadata that should be stored in the file header.
*
* @param metadata The file metadata.
* @return This options instance.
*/
public Options withMetadata(Metadata metadata) {
if (null == metadata) {
throw new IllegalArgumentException("Metadata may not be null");
}
mMetadata = metadata;
return this;
}
/**
* Gets the filesystem the SequenceFile should be written to.
*
* @return The file system to write to.
*/
public FileSystem getFileSystem() {
if (null == mFileSystem) {
throw new RuntimeException("Must call Options.withFileSystem()");
}
return mFileSystem;
}
/**
* Gets the Hadoop configuration.
*
* @return The Hadoop configuration.
*/
public Configuration getConfiguration() {
return mConf;
}
/**
* Gets the Hadoop configuration with Avro serialization registered.
*
* @return The Hadoop configuration.
*/
public Configuration getConfigurationWithAvroSerialization() {
Configuration conf = getConfiguration();
if (null == conf) {
throw new RuntimeException("Must call Options.withConfiguration()");
}
Configuration confWithAvro = new Configuration(conf);
if (null != mKeyWriterSchema) {
AvroSerialization.setKeyWriterSchema(confWithAvro, mKeyWriterSchema);
}
if (null != mValueWriterSchema) {
AvroSerialization.setValueWriterSchema(confWithAvro, mValueWriterSchema);
}
AvroSerialization.addToConfiguration(confWithAvro);
return confWithAvro;
}
/**
* Gets the output path for the sequence file.
*
* @return The output path.
*/
public Path getOutputPath() {
if (null == mOutputPath) {
throw new RuntimeException("Must call Options.withOutputPath()");
}
return mOutputPath;
}
/**
* Gets the class of the key records.
*
* @return The key class.
*/
public Class> getKeyClass() {
if (null == mKeyClass) {
throw new RuntimeException(
"Must call Options.withKeyClass() or Options.withKeySchema()");
}
return mKeyClass;
}
/**
* Gets the class of the value records.
*
* @return The value class.
*/
public Class> getValueClass() {
if (null == mValueClass) {
throw new RuntimeException(
"Must call Options.withValueClass() or Options.withValueSchema()");
}
return mValueClass;
}
/**
* Gets the desired size of the buffer used when flushing records to disk.
*
* @return The buffer size in bytes.
*/
public int getBufferSizeBytes() {
if (DEFAULT == mBufferSizeBytes) {
return getConfiguration().getInt("io.file.buffer.size", DEFAULT_BUFFER_SIZE_BYTES);
}
return mBufferSizeBytes;
}
/**
* Gets the desired number of replicas to store for each block of the file.
*
* @return The replciation factor for the blocks of the file.
*/
public short getReplicationFactor() {
if (DEFAULT == mReplicationFactor) {
return getFileSystem().getDefaultReplication();
}
return mReplicationFactor;
}
/**
* Gets the desired size of the file blocks.
*
* @return The size of a file block in bytes.
*/
public long getBlockSizeBytes() {
if (DEFAULT == mBlockSizeBytes) {
return getFileSystem().getDefaultBlockSize();
}
return mBlockSizeBytes;
}
/**
* Gets the object to report progress to.
*
* @return A progressable object to track progress.
*/
public Progressable getProgressable() {
return mProgressable;
}
/**
* Gets the type of compression.
*
* @return The compression type.
*/
public CompressionType getCompressionType() {
return mCompressionType;
}
/**
* Gets the compression codec.
*
* @return The compression codec.
*/
public CompressionCodec getCompressionCodec() {
return mCompressionCodec;
}
/**
* Gets the SequenceFile metadata to store in the header.
*
* @return The metadata header.
*/
public Metadata getMetadata() {
return mMetadata;
}
/**
* Gets the metadata to store in the file header, which includes
* any necessary Avro writer schemas.
*
* @return The metadata header with Avro writer schemas if Avro data is being written.
*/
private Metadata getMetadataWithAvroSchemas() {
// mMetadata was intialized in the constructor, and cannot be set to null.
assert null != mMetadata;
if (null != mKeyWriterSchema) {
mMetadata.set(METADATA_FIELD_KEY_SCHEMA, new Text(mKeyWriterSchema.toString()));
}
if (null != mValueWriterSchema) {
mMetadata.set(METADATA_FIELD_VALUE_SCHEMA, new Text(mValueWriterSchema.toString()));
}
return mMetadata;
}
}
/**
* Creates a new Writer
to a SequenceFile that supports Avro data.
*
* @param options The writer options.
* @throws IOException If the writer cannot be initialized.
*/
public Writer(Options options) throws IOException {
super(options.getFileSystem(), options.getConfigurationWithAvroSerialization(),
options.getOutputPath(), options.getKeyClass(), options.getValueClass(),
options.getBufferSizeBytes(), options.getReplicationFactor(),
options.getBlockSizeBytes(), options.getProgressable(),
options.getMetadataWithAvroSchemas());
}
}
/**
* A reader for SequenceFiles that may contain Avro data.
*/
public static class Reader extends SequenceFile.Reader {
/**
* A helper class to encapsulate the options that can be used to construct a Reader.
*/
public static class Options {
private FileSystem mFileSystem;
private Path mInputPath;
private Configuration mConf;
private Schema mKeyReaderSchema;
private Schema mValueReaderSchema;
/**
* Sets the filesystem the SequenceFile should be read from.
*
* @param fileSystem The filesystem.
* @return This options instance.
*/
public Options withFileSystem(FileSystem fileSystem) {
if (null == fileSystem) {
throw new IllegalArgumentException("Filesystem may not be null");
}
mFileSystem = fileSystem;
return this;
}
/**
* Sets the input path for the SequenceFile.
*
* @param inputPath The input path.
* @return This options instance.
*/
public Options withInputPath(Path inputPath) {
if (null == inputPath) {
throw new IllegalArgumentException("Input path may not be null");
}
mInputPath = inputPath;
return this;
}
/**
* Sets the Hadoop configuration.
*
* @param conf The configuration.
* @return This options instance.
*/
public Options withConfiguration(Configuration conf) {
if (null == conf) {
throw new IllegalArgumentException("Configuration may not be null");
}
mConf = conf;
return this;
}
/**
* Sets the reader schema of the key records when using Avro data.
*
* If not set, the writer schema will be used as the reader schema.
*
* @param keyReaderSchema The reader schema for the keys.
* @return This options instance.
*/
public Options withKeySchema(Schema keyReaderSchema) {
mKeyReaderSchema = keyReaderSchema;
return this;
}
/**
* Sets the reader schema of the value records when using Avro data.
*
* If not set, the writer schema will be used as the reader schema.
*
* @param valueReaderSchema The reader schema for the values.
* @return This options instance.
*/
public Options withValueSchema(Schema valueReaderSchema) {
mValueReaderSchema = valueReaderSchema;
return this;
}
/**
* Gets the filesystem the SequenceFile should be read rom.
*
* @return The file system to read from.
*/
public FileSystem getFileSystem() {
if (null == mFileSystem) {
throw new RuntimeException("Must call Options.withFileSystem()");
}
return mFileSystem;
}
/**
* Gets the input path for the sequence file.
*
* @return The input path.
*/
public Path getInputPath() {
if (null == mInputPath) {
throw new RuntimeException("Must call Options.withInputPath()");
}
return mInputPath;
}
/**
* Gets the Hadoop configuration.
*
* @return The Hadoop configuration.
*/
public Configuration getConfiguration() {
return mConf;
}
/**
* Gets the Hadoop configuration with Avro serialization registered.
*
* @return The Hadoop configuration.
* @throws IOException If there is an error configuring Avro serialization.
*/
public Configuration getConfigurationWithAvroSerialization() throws IOException {
Configuration conf = getConfiguration();
if (null == conf) {
throw new RuntimeException("Must call Options.withConfiguration()");
}
// Configure schemas and add Avro serialization to the configuration.
Configuration confWithAvro = new Configuration(conf);
AvroSerialization.addToConfiguration(confWithAvro);
// Read the metadata header from the SequenceFile to get the writer schemas.
Metadata metadata = AvroSequenceFile.getMetadata(
getFileSystem(), getInputPath(), confWithAvro);
// Set the key schema if present in the metadata.
Text keySchemaText = metadata.get(METADATA_FIELD_KEY_SCHEMA);
if (null != keySchemaText) {
LOG.debug("Using key writer schema from SequenceFile metadata: "
+ keySchemaText.toString());
AvroSerialization.setKeyWriterSchema(
confWithAvro, Schema.parse(keySchemaText.toString()));
if (null != mKeyReaderSchema) {
AvroSerialization.setKeyReaderSchema(confWithAvro, mKeyReaderSchema);
}
}
// Set the value schema if present in the metadata.
Text valueSchemaText = metadata.get(METADATA_FIELD_VALUE_SCHEMA);
if (null != valueSchemaText) {
LOG.debug("Using value writer schema from SequenceFile metadata: "
+ valueSchemaText.toString());
AvroSerialization.setValueWriterSchema(
confWithAvro, Schema.parse(valueSchemaText.toString()));
if (null != mValueReaderSchema) {
AvroSerialization.setValueReaderSchema(confWithAvro, mValueReaderSchema);
}
}
return confWithAvro;
}
}
/**
* Creates a new Reader
from a SequenceFile that supports Avro data.
*
* @param options The reader options.
* @throws IOException If the reader cannot be initialized.
*/
public Reader(Options options) throws IOException {
super(options.getFileSystem(), options.getInputPath(),
options.getConfigurationWithAvroSerialization());
}
}
/**
 * Opens a SequenceFile just long enough to read its metadata header.
 *
 * <p>The temporary reader is always closed before this method returns, whether
 * the metadata was obtained or an exception is propagating.</p>
 *
 * @param fs The FileSystem the SequenceFile is on.
 * @param path The path to the file.
 * @param conf The Hadoop configuration.
 * @return The metadata header.
 * @throws IOException If the metadata cannot be read from the file.
 */
private static Metadata getMetadata(FileSystem fs, Path path, Configuration conf)
    throws IOException {
  // If opening fails there is nothing to close, so the reader is created before the try.
  final SequenceFile.Reader headerReader = new SequenceFile.Reader(fs, path, conf);
  try {
    return headerReader.getMetadata();
  } finally {
    headerReader.close();
  }
}
}