
co.cask.hydrator.plugin.HDFSSink Maven / Gradle / Ivy
/*
* Copyright © 2015 Cask Data, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package co.cask.hydrator.plugin;
import co.cask.cdap.api.annotation.Description;
import co.cask.cdap.api.annotation.Name;
import co.cask.cdap.api.annotation.Plugin;
import co.cask.cdap.api.data.batch.Output;
import co.cask.cdap.api.data.batch.OutputFormatProvider;
import co.cask.cdap.api.data.format.StructuredRecord;
import co.cask.cdap.api.data.schema.Schema;
import co.cask.cdap.api.dataset.lib.KeyValue;
import co.cask.cdap.etl.api.Emitter;
import co.cask.cdap.etl.api.PipelineConfigurer;
import co.cask.cdap.etl.api.batch.BatchSinkContext;
import co.cask.hydrator.common.ReferenceBatchSink;
import co.cask.hydrator.common.ReferencePluginConfig;
import com.google.common.base.Joiner;
import com.google.common.base.Strings;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import javax.annotation.Nullable;
/**
* HDFS Sink
*/
@Plugin(type = "batchsink")
@Name("HDFS")
@Description("Batch HDFS Sink")
public class HDFSSink extends ReferenceBatchSink {
private HDFSSinkConfig config;
public HDFSSink(HDFSSinkConfig config) {
super(config);
this.config = config;
}
@Override
public void configurePipeline(PipelineConfigurer pipelineConfigurer) {
super.configurePipeline(pipelineConfigurer);
// Verify if the timeSuffix format is valid.
if (!Strings.isNullOrEmpty(config.timeSufix)) {
new SimpleDateFormat(config.timeSufix);
}
}
@Override
public void prepareRun(BatchSinkContext context) throws Exception {
context.addOutput(Output.of(config.referenceName, new SinkOutputFormatProvider(config, context))
.alias(config.path));
}
@Override
public void transform(StructuredRecord input, Emitter> emitter) throws Exception {
List dataArray = new ArrayList<>();
for (Schema.Field field : input.getSchema().getFields()) {
dataArray.add(input.get(field.getName()).toString());
}
emitter.emit(new KeyValue<>(new Text(Joiner.on(",").join(dataArray)), NullWritable.get()));
}
/**
* HDFS Sink Output Provider.
*/
public static class SinkOutputFormatProvider implements OutputFormatProvider {
private final Map conf;
private final HDFSSinkConfig config;
public SinkOutputFormatProvider(HDFSSinkConfig config, BatchSinkContext context) {
this.conf = new HashMap<>();
this.config = config;
String timeSuffix = !Strings.isNullOrEmpty(config.timeSufix) ?
new SimpleDateFormat(config.timeSufix).format(context.getLogicalStartTime()) : "";
conf.put(FileOutputFormat.OUTDIR, String.format("%s/%s", config.path, timeSuffix));
}
@Override
public String getOutputFormatClassName() {
return outputFormatClassName("TEXT");
}
@Override
public Map getOutputFormatConfiguration() {
return conf;
}
}
private static String outputFormatClassName(String option) {
// Use option to extend it to more output formats
return TextOutputFormat.class.getName();
}
/**
* Config for HDFSSinkConfig.
*/
public static class HDFSSinkConfig extends ReferencePluginConfig {
@Name("path")
@Description("HDFS Destination Path Prefix. For example, 'hdfs://mycluster.net:8020/output")
private String path;
@Name("suffix")
@Description("Time Suffix used for destination directory for each run. For example, 'YYYY-MM-dd-HH-mm'. " +
"By default, no time suffix is used.")
@Nullable
private String timeSufix;
public HDFSSinkConfig(String referenceName, String path, String suffix, String outputFormat) {
super(referenceName);
this.path = path;
this.timeSufix = suffix;
}
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy