
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.druid.inputsource.hdfs;
import com.fasterxml.jackson.annotation.JacksonInject;
import com.fasterxml.jackson.annotation.JsonCreator;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableList;
import org.apache.druid.data.input.AbstractInputSource;
import org.apache.druid.data.input.InputFormat;
import org.apache.druid.data.input.InputRowSchema;
import org.apache.druid.data.input.InputSourceReader;
import org.apache.druid.data.input.InputSplit;
import org.apache.druid.data.input.SplitHintSpec;
import org.apache.druid.data.input.impl.InputEntityIteratingReader;
import org.apache.druid.data.input.impl.SplittableInputSource;
import org.apache.druid.guice.Hdfs;
import org.apache.druid.java.util.common.IAE;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import javax.annotation.Nullable;
import java.io.File;
import java.io.IOException;
import java.io.UncheckedIOException;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.Stream;
public class HdfsInputSource extends AbstractInputSource implements SplittableInputSource<Path>
{
  private static final String PROP_PATHS = "paths";

  private final List<String> inputPaths;
  private final Configuration configuration;

  // Although the javadocs for SplittableInputSource say to avoid caching splits to reduce memory, HdfsInputSource
  // *does* cache the splits for the following reasons:
  //
  // 1) It will improve compatibility with the index_hadoop task, allowing people to easily migrate from Hadoop.
  //    For example, input paths with globs will be supported (lazily expanding the wildcard glob is tricky).
  //
  // 2) The index_hadoop task allocates splits eagerly, so the memory usage should not be a problem for anyone
  //    migrating from Hadoop.
  private List<Path> cachedPaths;

  @JsonCreator
  public HdfsInputSource(
      @JsonProperty(PROP_PATHS) Object inputPaths,
      @JacksonInject @Hdfs Configuration configuration
  )
  {
    this.inputPaths = coerceInputPathsToList(inputPaths, PROP_PATHS);
    this.configuration = configuration;
    this.cachedPaths = null;
  }
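
  // Minimal usage sketch (hypothetical values). Jackson normally invokes this constructor while
  // deserializing an ingestion spec whose "paths" property is a string or a list of strings, but the
  // input source can also be built directly:
  //
  //   Configuration conf = new Configuration();
  //   HdfsInputSource source = new HdfsInputSource("hdfs://namenode:8020/druid/data/*", conf);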

  public static List<String> coerceInputPathsToList(Object inputPaths, String propertyName)
  {
    final List<String> paths;

    if (inputPaths instanceof String) {
      paths = Collections.singletonList((String) inputPaths);
    } else if (inputPaths instanceof List && ((List<?>) inputPaths).stream().allMatch(x -> x instanceof String)) {
      paths = ((List<?>) inputPaths).stream().map(x -> (String) x).collect(Collectors.toList());
    } else {
      throw new IAE("'%s' must be a string or an array of strings", propertyName);
    }

    return paths;
  }
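
  // Sketch of the two accepted shapes for the "paths" property (hypothetical values):
  //
  //   coerceInputPathsToList("hdfs://namenode:8020/foo/bar", "paths");             // -> ["hdfs://namenode:8020/foo/bar"]
  //   coerceInputPathsToList(ImmutableList.of("/foo/bar", "/foo/baz-*"), "paths"); // -> ["/foo/bar", "/foo/baz-*"]
  //
  // Any other shape (e.g. a number, or a list containing non-strings) fails with an IAE.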

  public static Collection<Path> getPaths(List<String> inputPaths, Configuration configuration) throws IOException
  {
    if (inputPaths.isEmpty()) {
      return Collections.emptySet();
    }

    // Use FileInputFormat to read splits. To do this, we need to make a fake Job.
    Job job = Job.getInstance(configuration);

    // Add paths to the fake JobContext.
    for (String inputPath : inputPaths) {
      FileInputFormat.addInputPaths(job, inputPath);
    }

    return new HdfsFileInputFormat().getSplits(job)
                                    .stream()
                                    .map(split -> ((FileSplit) split).getPath())
                                    .collect(Collectors.toSet());
  }
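
  // For example (hypothetical layout), getPaths(ImmutableList.of("/logs/2024-01-*/part-*"), configuration)
  // asks FileInputFormat to list and expand the glob against the target filesystem and returns one concrete
  // Path per matched file, or an empty set when no input paths were configured.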

  /**
   * Helper for leveraging hadoop code to interpret HDFS paths with globs
   */
  private static class HdfsFileInputFormat extends FileInputFormat<Object, Object>