/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.druid.inputsource.hdfs;
import com.fasterxml.jackson.annotation.JacksonInject;
import com.fasterxml.jackson.annotation.JsonCreator;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Iterators;
import org.apache.druid.data.input.AbstractInputSource;
import org.apache.druid.data.input.InputEntity;
import org.apache.druid.data.input.InputFileAttribute;
import org.apache.druid.data.input.InputFormat;
import org.apache.druid.data.input.InputRowSchema;
import org.apache.druid.data.input.InputSourceReader;
import org.apache.druid.data.input.InputSplit;
import org.apache.druid.data.input.SplitHintSpec;
import org.apache.druid.data.input.impl.InputEntityIteratingReader;
import org.apache.druid.data.input.impl.SplittableInputSource;
import org.apache.druid.data.input.impl.systemfield.SystemField;
import org.apache.druid.data.input.impl.systemfield.SystemFieldDecoratorFactory;
import org.apache.druid.data.input.impl.systemfield.SystemFieldInputSource;
import org.apache.druid.data.input.impl.systemfield.SystemFields;
import org.apache.druid.guice.Hdfs;
import org.apache.druid.java.util.common.CloseableIterators;
import org.apache.druid.java.util.common.IAE;
import org.apache.druid.java.util.common.StringUtils;
import org.apache.druid.storage.hdfs.HdfsStorageDruidModule;
import org.apache.druid.utils.Streams;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import javax.annotation.Nonnull;
import javax.annotation.Nullable;
import java.io.File;
import java.io.IOException;
import java.io.UncheckedIOException;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.EnumSet;
import java.util.Iterator;
import java.util.List;
import java.util.Objects;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.Stream;
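
/**
 * Input source that reads files from HDFS (or any Hadoop-compatible filesystem whose protocol is allowed by
 * {@link HdfsInputSourceConfig}). Paths may reference individual files or contain globs.
 *
 * An illustrative JSON spec (host, port, and path are placeholders) might look like:
 *
 * <pre>
 *   "inputSource": {
 *     "type": "hdfs",
 *     "paths": "hdfs://namenode:8020/foo/bar/*.json"
 *   }
 * </pre>
 */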
public class HdfsInputSource
    extends AbstractInputSource
    implements SplittableInputSource<List<Path>>, SystemFieldInputSource
{
  static final String TYPE_KEY = HdfsStorageDruidModule.SCHEME;
  private static final String PROP_PATHS = "paths";

  private final List<String> inputPaths;
  private final SystemFields systemFields;
  private final Configuration configuration;
  private final HdfsInputSourceConfig inputSourceConfig;
  // Although the javadocs for SplittableInputSource say to avoid caching splits to reduce memory, HdfsInputSource
  // *does* cache the splits for the following reasons:
  //
  // 1) It will improve compatibility with the index_hadoop task, allowing people to easily migrate from Hadoop.
  //    For example, input paths with globs will be supported (lazily expanding the wildcard glob is tricky).
  //
  // 2) The index_hadoop task allocates splits eagerly, so the memory usage should not be a problem for anyone
  //    migrating from Hadoop.
  @Nullable
  private List<Path> cachedPaths = null;
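
  /**
   * Creates an HDFS input source. {@code inputPaths} may be a single path string or a list of path strings; each
   * path's filesystem scheme is checked against the protocols allowed by {@code inputSourceConfig}.
   */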
  @JsonCreator
  public HdfsInputSource(
      @JsonProperty(PROP_PATHS) Object inputPaths,
      @JsonProperty(SYSTEM_FIELDS_PROPERTY) SystemFields systemFields,
      @JacksonInject @Hdfs Configuration configuration,
      @JacksonInject HdfsInputSourceConfig inputSourceConfig
  )
  {
    this.inputPaths = coerceInputPathsToList(inputPaths, PROP_PATHS);
    this.systemFields = systemFields == null ? SystemFields.none() : systemFields;
    this.configuration = configuration;
    this.inputSourceConfig = inputSourceConfig;
    this.inputPaths.forEach(p -> verifyProtocol(configuration, inputSourceConfig, p));
  }
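
  /**
   * Returns the set of input source type names handled by this source, i.e. the {@code hdfs} type key.
   */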
  @JsonIgnore
  @Nonnull
  @Override
  public Set<String> getTypes()
  {
    return Collections.singleton(TYPE_KEY);
  }
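
  /**
   * Coerces the JSON value of a paths property into a list of strings. The value must be either a single string or
   * a list of strings; anything else causes an {@link IAE}.
   */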
  public static List<String> coerceInputPathsToList(Object inputPaths, String propertyName)
  {
    if (inputPaths instanceof String) {
      return Collections.singletonList((String) inputPaths);
    } else if (inputPaths instanceof List && ((List<?>) inputPaths).stream().allMatch(x -> x instanceof String)) {
      return ((List<?>) inputPaths).stream().map(x -> (String) x).collect(Collectors.toList());
    } else {
      throw new IAE("'%s' must be a string or an array of strings", propertyName);
    }
  }
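
  /**
   * Verifies that the filesystem scheme of {@code pathString} (resolved via {@code conf}) is one of the protocols
   * allowed by {@code config}. Throws {@link IAE} if it is not.
   */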
  public static void verifyProtocol(Configuration conf, HdfsInputSourceConfig config, String pathString)
  {
    Path path = new Path(pathString);
    try {
      throwIfInvalidProtocol(config, path.getFileSystem(conf).getScheme());
    }
    catch (IOException e) {
      throw new RuntimeException(e);
    }
  }
  private static void throwIfInvalidProtocol(HdfsInputSourceConfig config, String scheme)
  {
    if (!config.getAllowedProtocols().contains(StringUtils.toLowerCase(scheme))) {
      throw new IAE("Only %s protocols are allowed", config.getAllowedProtocols());
    }
  }
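
  /**
   * Expands {@code inputPaths} (which may contain globs) into the concrete paths of the non-empty files they
   * reference, by running Hadoop's {@link FileInputFormat} split computation against a throwaway {@link Job}.
   */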
  public static Collection<Path> getPaths(List<String> inputPaths, Configuration configuration) throws IOException
  {
    if (inputPaths.isEmpty()) {
      return Collections.emptySet();
    }

    // Use FileInputFormat to read splits. To do this, we need to make a fake Job.
    Job job = Job.getInstance(configuration);

    // Add paths to the fake JobContext.
    for (String inputPath : inputPaths) {
      FileInputFormat.addInputPaths(job, inputPath);
    }

    return new HdfsFileInputFormat().getSplits(job)
                                    .stream()
                                    .filter(split -> ((FileSplit) split).getLength() > 0)
                                    .map(split -> ((FileSplit) split).getPath())
                                    .collect(Collectors.toSet());
  }
  /**
   * Helper for leveraging hadoop code to interpret HDFS paths with globs
   */
  private static class HdfsFileInputFormat extends FileInputFormat<Object, Object>