/*
* Licensed to Metamarkets Group Inc. (Metamarkets) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. Metamarkets licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package io.druid.indexer.updater;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.google.common.base.Function;
import com.google.common.base.Preconditions;
import com.google.common.base.Throwables;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import com.google.common.io.Files;
import io.druid.indexer.JobHelper;
import io.druid.indexer.hadoop.DatasourceInputSplit;
import io.druid.indexer.hadoop.WindowedDataSegment;
import io.druid.java.util.common.IAE;
import io.druid.java.util.common.ISE;
import io.druid.java.util.common.StringUtils;
import io.druid.java.util.common.logger.Logger;
import io.druid.timeline.DataSegment;
import org.apache.commons.io.FileUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.JobPriority;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.JobID;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.OutputCommitter;
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.TaskReport;
import org.apache.hadoop.mapreduce.TaskType;
import org.apache.hadoop.util.Progressable;
import javax.annotation.Nullable;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.Set;
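/**
 * Map-only Hadoop job that converts existing Druid segments to a new {@code IndexSpec}
 * (for example, to change compression or bitmap encoding). Each map task downloads one
 * segment, converts it locally, pushes the result to the configured segment output path,
 * and reports the new {@link DataSegment} back through the task configuration and a small
 * "result" file in the distributed success cache.
 */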
public class HadoopConverterJob
{
private static final Logger log = new Logger(HadoopConverterJob.class);
private static final String COUNTER_GROUP = "Hadoop Druid Converter";
private static final String COUNTER_LOADED = "Loaded Bytes";
private static final String COUNTER_WRITTEN = "Written Bytes";
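/**
 * Derives a human-readable job name: for a single segment the name includes the datasource,
 * interval, and version; for multiple segments it lists the distinct datasources and versions.
 */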
private static void setJobName(JobConf jobConf, List<DataSegment> segments)
{
if (segments.size() == 1) {
final DataSegment segment = segments.get(0);
jobConf.setJobName(
StringUtils.format(
"druid-convert-%s-%s-%s",
segment.getDataSource(),
segment.getInterval(),
segment.getVersion()
)
);
} else {
final Set<String> dataSources = Sets.newHashSet(
Iterables.transform(
segments,
new Function<DataSegment, String>()
{
@Override
public String apply(DataSegment input)
{
return input.getDataSource();
}
}
)
);
final Set<String> versions = Sets.newHashSet(
Iterables.transform(
segments,
new Function<DataSegment, String>()
{
@Override
public String apply(DataSegment input)
{
return input.getVersion();
}
}
)
);
jobConf.setJobName(
StringUtils.format(
"druid-convert-%s-%s",
Arrays.toString(dataSources.toArray()),
Arrays.toString(versions.toArray())
)
);
}
}
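/**
 * Scratch paths for the job and each task attempt are derived from the working directory
 * (the distributed success cache) and the job / task attempt IDs.
 */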
public static Path getJobPath(JobID jobID, Path workingDirectory)
{
return new Path(workingDirectory, jobID.toString());
}
public static Path getTaskPath(JobID jobID, TaskAttemptID taskAttemptID, Path workingDirectory)
{
return new Path(getJobPath(jobID, workingDirectory), taskAttemptID.toString());
}
public static Path getJobClassPathDir(String jobName, Path workingDirectory) throws IOException
{
return new Path(workingDirectory, jobName.replace(":", ""));
}
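/**
 * Removes the per-job scratch directory and the job's classpath directory, collecting any
 * failures so that both deletes are attempted before rethrowing.
 */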
public static void cleanup(Job job) throws IOException
{
final Path jobDir = getJobPath(job.getJobID(), job.getWorkingDirectory());
final FileSystem fs = jobDir.getFileSystem(job.getConfiguration());
RuntimeException e = null;
try {
JobHelper.deleteWithRetry(fs, jobDir, true);
}
catch (RuntimeException ex) {
e = ex;
}
try {
JobHelper.deleteWithRetry(fs, getJobClassPathDir(job.getJobName(), job.getWorkingDirectory()), true);
}
catch (RuntimeException ex) {
if (e == null) {
e = ex;
} else {
e.addSuppressed(ex);
}
}
if (e != null) {
throw e;
}
}
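/**
 * Reads the serialized {@link HadoopDruidConverterConfig} back out of the Hadoop
 * {@link Configuration}; fails fast if the property is missing.
 */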
public static HadoopDruidConverterConfig converterConfigFromConfiguration(Configuration configuration)
throws IOException
{
final String property = Preconditions.checkNotNull(
configuration.get(HadoopDruidConverterConfig.CONFIG_PROPERTY),
HadoopDruidConverterConfig.CONFIG_PROPERTY
);
return HadoopDruidConverterConfig.fromString(property);
}
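/**
 * Serializes a copy of the config (with the resolved segment list) into the Hadoop
 * {@link Configuration} so that mappers can rebuild it on the cluster side.
 */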
public static void converterConfigIntoConfiguration(
HadoopDruidConverterConfig priorConfig,
List<DataSegment> segments,
Configuration configuration
)
{
final HadoopDruidConverterConfig config = new HadoopDruidConverterConfig(
priorConfig.getDataSource(),
priorConfig.getInterval(),
priorConfig.getIndexSpec(),
segments,
priorConfig.isValidate(),
priorConfig.getDistributedSuccessCache(),
priorConfig.getHadoopProperties(),
priorConfig.getJobPriority(),
priorConfig.getSegmentOutputPath()
);
try {
configuration.set(
HadoopDruidConverterConfig.CONFIG_PROPERTY,
HadoopDruidConverterConfig.jsonMapper.writeValueAsString(config)
);
}
catch (JsonProcessingException e) {
throw Throwables.propagate(e);
}
}
private final HadoopDruidConverterConfig converterConfig;
private long loadedBytes = 0L;
private long writtenBytes = 0L;
public HadoopConverterJob(
HadoopDruidConverterConfig converterConfig
)
{
this.converterConfig = converterConfig;
}
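/**
 * Submits the conversion job and blocks until it finishes. Returns the list of converted
 * segments on success, or null if the job failed or produced no output.
 */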
public List<DataSegment> run() throws IOException
{
final JobConf jobConf = new JobConf();
jobConf.setKeepFailedTaskFiles(false);
for (Map.Entry<String, String> entry : converterConfig.getHadoopProperties().entrySet()) {
jobConf.set(entry.getKey(), entry.getValue(), "converterConfig.getHadoopProperties()");
}
final List<DataSegment> segments = converterConfig.getSegments();
if (segments.isEmpty()) {
throw new IAE(
"No segments found for datasource [%s]",
converterConfig.getDataSource()
);
}
converterConfigIntoConfiguration(converterConfig, segments, jobConf);
jobConf.setNumReduceTasks(0); // Map only. Number of map tasks determined by input format
jobConf.setWorkingDirectory(new Path(converterConfig.getDistributedSuccessCache()));
setJobName(jobConf, segments);
if (converterConfig.getJobPriority() != null) {
jobConf.setJobPriority(JobPriority.valueOf(converterConfig.getJobPriority()));
}
final Job job = Job.getInstance(jobConf);
job.setInputFormatClass(ConfigInputFormat.class);
job.setMapperClass(ConvertingMapper.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
job.setMapSpeculativeExecution(false);
job.setOutputFormatClass(ConvertingOutputFormat.class);
JobHelper.setupClasspath(
JobHelper.distributedClassPath(jobConf.getWorkingDirectory()),
JobHelper.distributedClassPath(getJobClassPathDir(job.getJobName(), jobConf.getWorkingDirectory())),
job
);
Throwable throwable = null;
try {
job.submit();
log.info("Job %s submitted, status available at %s", job.getJobName(), job.getTrackingURL());
final boolean success = job.waitForCompletion(true);
if (!success) {
final TaskReport[] reports = job.getTaskReports(TaskType.MAP);
if (reports != null) {
for (final TaskReport report : reports) {
log.error("Error in task [%s] : %s", report.getTaskId(), Arrays.toString(report.getDiagnostics()));
}
}
return null;
}
try {
loadedBytes = job.getCounters().findCounter(COUNTER_GROUP, COUNTER_LOADED).getValue();
writtenBytes = job.getCounters().findCounter(COUNTER_GROUP, COUNTER_WRITTEN).getValue();
}
catch (IOException ex) {
log.error(ex, "Could not fetch counters");
}
final JobID jobID = job.getJobID();
final Path jobDir = getJobPath(jobID, job.getWorkingDirectory());
final FileSystem fs = jobDir.getFileSystem(job.getConfiguration());
final RemoteIterator<LocatedFileStatus> it = fs.listFiles(jobDir, true);
final List<Path> goodPaths = new ArrayList<>();
while (it.hasNext()) {
final LocatedFileStatus locatedFileStatus = it.next();
if (locatedFileStatus.isFile()) {
final Path myPath = locatedFileStatus.getPath();
if (ConvertingOutputFormat.DATA_SUCCESS_KEY.equals(myPath.getName())) {
goodPaths.add(new Path(myPath.getParent(), ConvertingOutputFormat.DATA_FILE_KEY));
}
}
}
if (goodPaths.isEmpty()) {
log.warn("No good data found at [%s]", jobDir);
return null;
}
final List<DataSegment> returnList = ImmutableList.copyOf(
Lists.transform(
goodPaths, new Function<Path, DataSegment>()
{
@Nullable
@Override
public DataSegment apply(final Path input)
{
try {
if (!fs.exists(input)) {
throw new ISE(
"Somehow [%s] was found but [%s] is missing at [%s]",
ConvertingOutputFormat.DATA_SUCCESS_KEY,
ConvertingOutputFormat.DATA_FILE_KEY,
jobDir
);
}
}
catch (final IOException e) {
throw Throwables.propagate(e);
}
try (final InputStream stream = fs.open(input)) {
return HadoopDruidConverterConfig.jsonMapper.readValue(stream, DataSegment.class);
}
catch (final IOException e) {
throw Throwables.propagate(e);
}
}
}
)
);
if (returnList.size() == segments.size()) {
return returnList;
} else {
throw new ISE(
"Tasks reported success but result length did not match! Expected %d found %d at path [%s]",
segments.size(),
returnList.size(),
jobDir
);
}
}
catch (InterruptedException | ClassNotFoundException e) {
RuntimeException exception = Throwables.propagate(e);
throwable = exception;
throw exception;
}
catch (Throwable t) {
throwable = t;
throw t;
}
finally {
try {
cleanup(job);
}
catch (IOException e) {
if (throwable != null) {
throwable.addSuppressed(e);
} else {
log.error(e, "Could not clean up job [%s]", job.getJobID());
}
}
}
}
public long getLoadedBytes()
{
return loadedBytes;
}
public long getWrittenBytes()
{
return writtenBytes;
}
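/**
 * Output format that writes nothing through the record writer; instead, the task committer
 * publishes the converted segment (taken from the task configuration) as a JSON "result"
 * file plus a "_SUCCESS" marker in the task attempt directory.
 */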
public static class ConvertingOutputFormat extends OutputFormat<Text, Text>
{
protected static final String DATA_FILE_KEY = "result";
protected static final String DATA_SUCCESS_KEY = "_SUCCESS";
protected static final String PUBLISHED_SEGMENT_KEY = "io.druid.indexer.updater.converter.publishedSegment";
private static final Logger log = new Logger(ConvertingOutputFormat.class);
@Override
public RecordWriter<Text, Text> getRecordWriter(TaskAttemptContext context) throws IOException, InterruptedException
{
return new RecordWriter<Text, Text>()
{
@Override
public void write(Text key, Text value) throws IOException, InterruptedException
{
// NOOP
}
@Override
public void close(TaskAttemptContext context) throws IOException, InterruptedException
{
// NOOP
}
};
}
@Override
public void checkOutputSpecs(JobContext context) throws IOException, InterruptedException
{
// NOOP
}
@Override
public OutputCommitter getOutputCommitter(final TaskAttemptContext context)
throws IOException, InterruptedException
{
return new OutputCommitter()
{
@Override
public void setupJob(JobContext jobContext) throws IOException
{
// NOOP
}
@Override
public void setupTask(TaskAttemptContext taskContext) throws IOException
{
// NOOP
}
@Override
public boolean needsTaskCommit(TaskAttemptContext taskContext) throws IOException
{
return taskContext.getConfiguration().get(PUBLISHED_SEGMENT_KEY) != null;
}
@Override
public void commitTask(final TaskAttemptContext taskContext) throws IOException
{
final Progressable commitProgressable = new Progressable()
{
@Override
public void progress()
{
taskContext.progress();
}
};
final String finalSegmentString = taskContext.getConfiguration().get(PUBLISHED_SEGMENT_KEY);
if (finalSegmentString == null) {
throw new IOException("Could not read final segment");
}
final DataSegment newSegment = HadoopDruidConverterConfig.jsonMapper.readValue(
finalSegmentString,
DataSegment.class
);
log.info("Committing new segment [%s]", newSegment);
taskContext.progress();
final FileSystem fs = taskContext.getWorkingDirectory().getFileSystem(taskContext.getConfiguration());
final Path taskAttemptDir = getTaskPath(
context.getJobID(),
context.getTaskAttemptID(),
taskContext.getWorkingDirectory()
);
final Path taskAttemptFile = new Path(taskAttemptDir, DATA_FILE_KEY);
final Path taskAttemptSuccess = new Path(taskAttemptDir, DATA_SUCCESS_KEY);
try (final OutputStream outputStream = fs.create(taskAttemptFile, false, 1 << 10, commitProgressable)) {
outputStream.write(HadoopDruidConverterConfig.jsonMapper.writeValueAsBytes(newSegment));
}
fs.create(taskAttemptSuccess, false).close();
taskContext.progress();
taskContext.setStatus("Committed");
}
@Override
public void abortTask(TaskAttemptContext taskContext) throws IOException
{
log.warn("Aborting task. Nothing to clean up.");
}
};
}
}
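/**
 * Does the actual work for one segment: downloads and unzips it into a local temporary
 * directory, runs the index converter, optionally validates the old and new segments
 * against each other, pushes the converted segment (with a "_converted" suffix on its
 * version), and records it for the output committer.
 */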
public static class ConvertingMapper extends Mapper<String, String, Text, Text>
{
private static final Logger log = new Logger(ConvertingMapper.class);
private static final String TMP_FILE_LOC_KEY = "io.druid.indexer.updater.converter.reducer.tmpDir";
@Override
protected void map(
String key, String value,
final Context context
) throws IOException, InterruptedException
{
final InputSplit split = context.getInputSplit();
if (!(split instanceof DatasourceInputSplit)) {
throw new IAE(
"Unexpected split type. Expected [%s] was [%s]",
DatasourceInputSplit.class.getCanonicalName(),
split.getClass().getCanonicalName()
);
}
final String tmpDirLoc = context.getConfiguration().get(TMP_FILE_LOC_KEY);
final File tmpDir = Paths.get(tmpDirLoc).toFile();
final DataSegment segment = Iterables.getOnlyElement(((DatasourceInputSplit) split).getSegments()).getSegment();
final HadoopDruidConverterConfig config = converterConfigFromConfiguration(context.getConfiguration());
context.setStatus("DOWNLOADING");
context.progress();
final Path inPath = new Path(JobHelper.getURIFromSegment(segment));
final File inDir = new File(tmpDir, "in");
if (inDir.exists() && !inDir.delete()) {
log.warn("Could not delete [%s]", inDir);
}
if (!inDir.mkdir() && (!inDir.exists() || inDir.isDirectory())) {
log.warn("Unable to make directory");
}
final long inSize = JobHelper.unzipNoGuava(inPath, context.getConfiguration(), inDir, context);
log.debug("Loaded %d bytes into [%s] for converting", inSize, inDir.getAbsolutePath());
context.getCounter(COUNTER_GROUP, COUNTER_LOADED).increment(inSize);
context.setStatus("CONVERTING");
context.progress();
final File outDir = new File(tmpDir, "out");
FileUtils.forceMkdir(outDir);
try {
HadoopDruidConverterConfig.INDEX_MERGER.convert(
inDir,
outDir,
config.getIndexSpec(),
JobHelper.progressIndicatorForContext(context),
null
);
}
catch (Exception e) {
log.error(e, "Conversion failed.");
throw e;
}
if (config.isValidate()) {
context.setStatus("Validating");
HadoopDruidConverterConfig.INDEX_IO.validateTwoSegments(inDir, outDir);
}
context.progress();
context.setStatus("Starting PUSH");
final Path baseOutputPath = new Path(config.getSegmentOutputPath());
final FileSystem outputFS = baseOutputPath.getFileSystem(context.getConfiguration());
final DataSegment finalSegmentTemplate = segment.withVersion(
segment.getVersion()
+ "_converted"
);
final DataSegment finalSegment = JobHelper.serializeOutIndex(
finalSegmentTemplate,
context.getConfiguration(),
context,
outDir,
JobHelper.makeFileNamePath(
baseOutputPath,
outputFS,
finalSegmentTemplate,
JobHelper.INDEX_ZIP,
config.DATA_SEGMENT_PUSHER
),
JobHelper.makeFileNamePath(
baseOutputPath,
outputFS,
finalSegmentTemplate,
JobHelper.DESCRIPTOR_JSON,
config.DATA_SEGMENT_PUSHER
),
JobHelper.makeTmpPath(
baseOutputPath,
outputFS,
finalSegmentTemplate,
context.getTaskAttemptID(),
config.DATA_SEGMENT_PUSHER
),
config.DATA_SEGMENT_PUSHER
);
context.progress();
context.setStatus("Finished PUSH");
final String finalSegmentString = HadoopDruidConverterConfig.jsonMapper.writeValueAsString(finalSegment);
context.getConfiguration().set(ConvertingOutputFormat.PUBLISHED_SEGMENT_KEY, finalSegmentString);
context.write(new Text("dataSegment"), new Text(finalSegmentString));
context.getCounter(COUNTER_GROUP, COUNTER_WRITTEN).increment(finalSegment.getSize());
context.progress();
context.setStatus("Ready To Commit");
}
@Override
protected void setup(Context context) throws IOException, InterruptedException
{
final File tmpFile = Files.createTempDir();
context.getConfiguration().set(TMP_FILE_LOC_KEY, tmpFile.getAbsolutePath());
}
@Override
protected void cleanup(
Context context
) throws IOException, InterruptedException
{
final String tmpDirLoc = context.getConfiguration().get(TMP_FILE_LOC_KEY);
final File tmpDir = Paths.get(tmpDirLoc).toFile();
FileUtils.deleteDirectory(tmpDir);
context.progress();
context.setStatus("Clean");
}
}
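/**
 * Input format that creates one split per segment from the serialized config and feeds each
 * mapper a single placeholder key/value pair; the real input is the segment carried by the split.
 */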
public static class ConfigInputFormat extends InputFormat<String, String>
{
@Override
public List<InputSplit> getSplits(final JobContext jobContext) throws IOException, InterruptedException
{
final HadoopDruidConverterConfig config = converterConfigFromConfiguration(jobContext.getConfiguration());
final List<DataSegment> segments = config.getSegments();
if (segments == null) {
throw new IOException("Bad config, missing segments");
}
return Lists.transform(
segments, new Function<DataSegment, InputSplit>()
{
@Nullable
@Override
public InputSplit apply(DataSegment input)
{
return new DatasourceInputSplit(ImmutableList.of(WindowedDataSegment.of(input)), null);
}
}
);
}
@Override
public RecordReader<String, String> createRecordReader(
final InputSplit inputSplit, final TaskAttemptContext taskAttemptContext
) throws IOException, InterruptedException
{
return new RecordReader<String, String>()
{
boolean readAnything = false;
@Override
public void initialize(InputSplit inputSplit, TaskAttemptContext taskAttemptContext)
throws IOException, InterruptedException
{
// NOOP
}
@Override
public boolean nextKeyValue() throws IOException, InterruptedException
{
return !readAnything;
}
@Override
public String getCurrentKey() throws IOException, InterruptedException
{
return "key";
}
@Override
public String getCurrentValue() throws IOException, InterruptedException
{
readAnything = true;
return "fakeValue";
}
@Override
public float getProgress() throws IOException, InterruptedException
{
return readAnything ? 0.0F : 1.0F;
}
@Override
public void close() throws IOException
{
// NOOP
}
};
}
}
}