// org.apache.avro.mapreduce.AvroOutputFormatBase
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied. See the License for the specific language governing
* permissions and limitations under the License.
*/
package org.apache.avro.mapreduce;
import java.io.IOException;
import java.io.OutputStream;
import org.apache.hadoop.mapreduce.OutputCommitter;
import org.apache.avro.AvroRuntimeException;
import org.apache.avro.file.CodecFactory;
import org.apache.avro.file.DataFileConstants;
import org.apache.avro.hadoop.file.HadoopCodecFactory;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import static org.apache.avro.file.CodecFactory.DEFAULT_ZSTANDARD_BUFFERPOOL;
import static org.apache.avro.file.CodecFactory.DEFAULT_ZSTANDARD_LEVEL;
/**
 * Abstract base class for output formats that write Avro container files.
 *
 * @param <K> The type of key to write.
 * @param <V> The type of value to write.
 */
public abstract class AvroOutputFormatBase<K, V> extends FileOutputFormat<K, V> {

  /**
   * Gets the configured compression codec from the task context.
   *
   * @param context The task attempt context.
   * @return The compression codec to use for the output Avro container file.
   */
  protected static CodecFactory getCompressionCodec(TaskAttemptContext context) {
    if (FileOutputFormat.getCompressOutput(context)) {
      // Read the per-codec tuning knobs up front; each falls back to the Avro default.
      int deflateLevel = context.getConfiguration().getInt(org.apache.avro.mapred.AvroOutputFormat.DEFLATE_LEVEL_KEY,
          CodecFactory.DEFAULT_DEFLATE_LEVEL);
      int xzLevel = context.getConfiguration().getInt(org.apache.avro.mapred.AvroOutputFormat.XZ_LEVEL_KEY,
          CodecFactory.DEFAULT_XZ_LEVEL);
      int zstdLevel = context.getConfiguration().getInt(org.apache.avro.mapred.AvroOutputFormat.ZSTD_LEVEL_KEY,
          DEFAULT_ZSTANDARD_LEVEL);
      boolean zstdBufferPool = context.getConfiguration()
          .getBoolean(org.apache.avro.mapred.AvroOutputFormat.ZSTD_BUFFERPOOL_KEY, DEFAULT_ZSTANDARD_BUFFERPOOL);

      String outputCodec = context.getConfiguration().get(AvroJob.CONF_OUTPUT_CODEC);

      if (outputCodec == null) {
        // No Avro codec configured: try translating the Hadoop compression codec
        // to its Avro equivalent; if there is none, default to deflate.
        String compressionCodec = context.getConfiguration().get("mapred.output.compression.codec");
        String avroCodecName = HadoopCodecFactory.getAvroCodecName(compressionCodec);
        if (avroCodecName != null) {
          // Record the translated codec so later readers of the configuration
          // see a consistent CONF_OUTPUT_CODEC setting.
          context.getConfiguration().set(AvroJob.CONF_OUTPUT_CODEC, avroCodecName);
          return HadoopCodecFactory.fromHadoopString(compressionCodec);
        } else {
          return CodecFactory.deflateCodec(deflateLevel);
        }
      } else if (DataFileConstants.DEFLATE_CODEC.equals(outputCodec)) {
        return CodecFactory.deflateCodec(deflateLevel);
      } else if (DataFileConstants.XZ_CODEC.equals(outputCodec)) {
        return CodecFactory.xzCodec(xzLevel);
      } else if (DataFileConstants.ZSTANDARD_CODEC.equals(outputCodec)) {
        return CodecFactory.zstandardCodec(zstdLevel, false, zstdBufferPool);
      } else {
        // Any other configured name is resolved directly by Avro.
        return CodecFactory.fromString(outputCodec);
      }
    }
    // Compression disabled for this job.
    return CodecFactory.nullCodec();
  }

  /**
   * Obtains the committer's work path via reflection.
   *
   * @param context The task attempt context.
   * @return The committer's working directory for this task attempt.
   * @throws IOException If the output committer cannot be obtained.
   */
  private Path getWorkPathFromCommitter(TaskAttemptContext context) throws IOException {
    // Not every OutputCommitter declares getWorkPath(), so it is located
    // reflectively. When Hadoop 2 support is dropped, this method can be
    // reduced to a simple cast.
    // See https://github.com/apache/avro/pull/1431/
    OutputCommitter committer = getOutputCommitter(context);
    try {
      return (Path) committer.getClass().getMethod("getWorkPath").invoke(committer);
    } catch (ReflectiveOperationException e) {
      // Preserve the cause so the missing-method failure is diagnosable.
      throw new AvroRuntimeException(
          "Committer: " + committer.getClass().getName() + " does not have method getWorkPath", e);
    }
  }

  /**
   * Gets the target output stream where the Avro container file should be
   * written.
   *
   * @param context The task attempt context.
   * @return The target output stream.
   * @throws IOException If the output file cannot be created.
   */
  protected OutputStream getAvroFileOutputStream(TaskAttemptContext context) throws IOException {
    // "avro.mo.config.namedOutput" lets multiple named outputs pick their own
    // file prefix; plain jobs fall back to the conventional "part" prefix.
    Path path = new Path(getWorkPathFromCommitter(context),
        getUniqueFile(context, context.getConfiguration().get("avro.mo.config.namedOutput", "part"),
            org.apache.avro.mapred.AvroOutputFormat.EXT));
    return path.getFileSystem(context.getConfiguration()).create(path);
  }

  /**
   * Gets the configured sync interval from the task context.
   *
   * @param context The task attempt context.
   * @return The sync interval to use for the output Avro container file.
   */
  protected static int getSyncInterval(TaskAttemptContext context) {
    return context.getConfiguration().getInt(org.apache.avro.mapred.AvroOutputFormat.SYNC_INTERVAL_KEY,
        DataFileConstants.DEFAULT_SYNC_INTERVAL);
  }
}