kafka.bridge.hadoop.KafkaOutputFormat
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package kafka.bridge.hadoop;
import java.io.IOException;
import java.net.URI;
import java.util.*;
import kafka.common.KafkaException;
import kafka.javaapi.producer.Producer;
import kafka.producer.ProducerConfig;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.OutputCommitter;
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter;
import org.apache.log4j.Logger;
public class KafkaOutputFormat<K, V> extends OutputFormat<K, V>
{
  private Logger log = Logger.getLogger(KafkaOutputFormat.class);

  public static final String KAFKA_URL = "kafka.output.url";

  /** Bytes to buffer before the OutputFormat does a send (i.e., the amortization window):
   *  We set the default to a million bytes so that the server will not reject the batch of messages
   *  with a MessageSizeTooLargeException. The actual size will be smaller after compression.
   */
  public static final int KAFKA_QUEUE_BYTES = 1000000;
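
  // The window can be tuned per job without recompiling; a hedged sketch, assuming the
  // "kafka.output.queue.bytes" key that getRecordWriter() reads below:
  //
  //   job.getConfiguration().setInt("kafka.output.queue.bytes", 500000); // ~500 KB before each send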
  public static final String KAFKA_CONFIG_PREFIX = "kafka.output";
  private static final Map<String, String> kafkaConfigMap;
  static {
    Map<String, String> cMap = new HashMap<String, String>();

    // default Hadoop producer configs
    cMap.put("producer.type", "sync");
    cMap.put("compression.codec", Integer.toString(1));
    cMap.put("request.required.acks", Integer.toString(1));

    kafkaConfigMap = Collections.unmodifiableMap(cMap);
  }
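
  // Any of these defaults can be overridden through the jobconf by prefixing the Kafka
  // producer property with "kafka.output" (see the override loop in getRecordWriter()).
  // For example, to send asynchronously and turn compression off (values are illustrative):
  //
  //   Configuration conf = job.getConfiguration();
  //   conf.set("kafka.output.producer.type", "async");
  //   conf.set("kafka.output.compression.codec", "0");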
  public KafkaOutputFormat()
  {
    super();
  }

  public static void setOutputPath(Job job, Path outputUrl)
  {
    job.getConfiguration().set(KafkaOutputFormat.KAFKA_URL, outputUrl.toString());

    // speculative execution would send duplicate messages to Kafka, so disable it
    job.getConfiguration().setBoolean("mapred.map.tasks.speculative.execution", false);
    job.getConfiguration().setBoolean("mapred.reduce.tasks.speculative.execution", false);
  }
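
  // Minimal driver sketch; the broker list and topic name are placeholders:
  //
  //   Job job = new Job(new Configuration(), "kafka-bridge-example");
  //   job.setOutputFormatClass(KafkaOutputFormat.class);
  //   KafkaOutputFormat.setOutputPath(job, new Path("kafka://broker1:9092,broker2:9092/example-topic"));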
  public static Path getOutputPath(JobContext job)
  {
    String name = job.getConfiguration().get(KafkaOutputFormat.KAFKA_URL);
    return name == null ? null : new Path(name);
  }

  @Override
  public void checkOutputSpecs(JobContext jobContext) throws IOException, InterruptedException
  {
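    // nothing to validate up front; the Kafka URL is checked lazily in getRecordWriter()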
  }
  @Override
  public OutputCommitter getOutputCommitter(TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException
  {
    // Is there a programmatic way to get the temp dir? I see it hardcoded everywhere in Hadoop, Hive, and Pig.
    return new FileOutputCommitter(new Path("/tmp/" + taskAttemptContext.getTaskAttemptID().getJobID().toString()), taskAttemptContext);
  }
  @Override
  public RecordWriter<K, V> getRecordWriter(TaskAttemptContext context) throws IOException, InterruptedException
  {
    Path outputPath = getOutputPath(context);
    if (outputPath == null)
      throw new KafkaException("no kafka output url specified");
    URI uri = URI.create(outputPath.toString());
    Configuration job = context.getConfiguration();

    Properties props = new Properties();
    String topic;

    props.putAll(kafkaConfigMap);                     // inject default configuration
    for (Map.Entry<String, String> m : job) {         // handle any overrides
      if (!m.getKey().startsWith(KAFKA_CONFIG_PREFIX))
        continue;
      if (m.getKey().equals(KAFKA_URL))
        continue;

      String kafkaKeyName = m.getKey().substring(KAFKA_CONFIG_PREFIX.length() + 1);
      props.setProperty(kafkaKeyName, m.getValue());  // set Kafka producer property
    }
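    // e.g. the jobconf entry "kafka.output.request.required.acks" = "0" becomes the
    // producer property "request.required.acks" = "0" (the value is illustrative)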

    // inject Kafka producer props back into jobconf for easier debugging
    for (Map.Entry<Object, Object> m : props.entrySet())
      job.set(KAFKA_CONFIG_PREFIX + "." + m.getKey(), m.getValue().toString());

    // KafkaOutputFormat-specific parameter: how many bytes to buffer before each send
    final int queueBytes = job.getInt(KAFKA_CONFIG_PREFIX + ".queue.bytes", KAFKA_QUEUE_BYTES);

    if (uri.getScheme().equals("kafka")) {
      // direct broker list in the authority, topic in the path:
      // kafka://<broker host:port>[,<broker host:port>...]/<topic>
      // e.g. kafka://kafka-server:9000,kafka-server2:9000/foobar
      String brokerList = uri.getAuthority();
      props.setProperty("metadata.broker.list", brokerList);
      job.set(KAFKA_CONFIG_PREFIX + ".metadata.broker.list", brokerList);

      if (uri.getPath() == null || uri.getPath().length() <= 1)
        throw new KafkaException("no topic specified in kafka uri");

      topic = uri.getPath().substring(1);             // ignore the initial '/' in the path
      job.set(KAFKA_CONFIG_PREFIX + ".topic", topic);
      log.info(String.format("using kafka broker %s (topic %s)", brokerList, topic));
    } else
      throw new KafkaException("missing scheme from kafka uri (must be kafka://)");

    // hand off to the companion KafkaRecordWriter in this package
    Producer<Object, byte[]> producer = new Producer<Object, byte[]>(new ProducerConfig(props));
    return new KafkaRecordWriter<K, V>(producer, topic, queueBytes);
  }
}