/*
* Licensed to Metamarkets Group Inc. (Metamarkets) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. Metamarkets licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package io.druid.indexer.hadoop;
import com.fasterxml.jackson.core.type.TypeReference;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import com.google.common.base.Supplier;
import com.google.common.collect.Lists;
import io.druid.data.input.InputRow;
import io.druid.indexer.HadoopDruidIndexerConfig;
import io.druid.indexer.JobHelper;
import io.druid.java.util.common.ISE;
import io.druid.java.util.common.logger.Logger;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import java.util.stream.Stream;
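/**
 * Hadoop {@link InputFormat} that reads existing Druid segments (described as
 * {@code WindowedDataSegment}s in the job configuration) and exposes them as
 * {@code InputRow}s via {@link DatasourceRecordReader}. Segments are grouped into
 * splits of at most {@code CONF_MAX_SPLIT_SIZE} bytes, and each split is tagged with
 * the most frequent block locations of its segment files.
 *
 * A minimal sketch of how a driver job might wire this format up; the segment list,
 * split size, and job setup shown here are illustrative assumptions, not part of this class:
 *
 * <pre>{@code
 * Configuration conf = new Configuration();
 * conf.set(
 *     DatasourceInputFormat.CONF_INPUT_SEGMENTS,
 *     HadoopDruidIndexerConfig.JSON_MAPPER.writeValueAsString(segments) // List<WindowedDataSegment>
 * );
 * conf.setLong(DatasourceInputFormat.CONF_MAX_SPLIT_SIZE, 512 * 1024 * 1024); // ~512 MB per split
 * Job job = Job.getInstance(conf, "read-druid-segments");
 * job.setInputFormatClass(DatasourceInputFormat.class);
 * }</pre>
 */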
public class DatasourceInputFormat extends InputFormat<NullWritable, InputRow>
{
private static final Logger logger = new Logger(DatasourceInputFormat.class);
public static final String CONF_INPUT_SEGMENTS = "druid.segments";
public static final String CONF_DRUID_SCHEMA = "druid.datasource.schema";
public static final String CONF_TRANSFORM_SPEC = "druid.datasource.transformSpec";
public static final String CONF_MAX_SPLIT_SIZE = "druid.datasource.split.max.size";
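/**
 * Builds one {@link DatasourceInputSplit} per group of segments. Segments are read from
 * {@code CONF_INPUT_SEGMENTS}, optionally sorted by size, and packed greedily so that no
 * split exceeds the configured (or derived) maximum split size.
 */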
@Override
public List<InputSplit> getSplits(JobContext context) throws IOException, InterruptedException
{
JobConf conf = new JobConf(context.getConfiguration());
String segmentsStr = Preconditions.checkNotNull(
conf.get(CONF_INPUT_SEGMENTS),
"No segments found to read"
);
List<WindowedDataSegment> segments = HadoopDruidIndexerConfig.JSON_MAPPER.readValue(
segmentsStr,
new TypeReference<List<WindowedDataSegment>>()
{
}
);
if (segments == null || segments.size() == 0) {
throw new ISE("No segments found to read");
}
logger.info("segments to read [%s]", segmentsStr);
long maxSize = conf.getLong(CONF_MAX_SPLIT_SIZE, 0);
if (maxSize < 0) {
long totalSize = 0;
for (WindowedDataSegment segment : segments) {
totalSize += segment.getSegment().getSize();
}
int mapTask = conf.getNumMapTasks();
if (mapTask > 0) {
maxSize = totalSize / mapTask;
}
}
if (maxSize > 0) {
//combining will happen, so sort the segments list by size so that they
//are grouped appropriately
Collections.sort(
segments,
new Comparator<WindowedDataSegment>()
{
@Override
public int compare(WindowedDataSegment s1, WindowedDataSegment s2)
{
return Long.compare(s1.getSegment().getSize(), s2.getSegment().getSize());
}
}
);
}
List<InputSplit> splits = Lists.newArrayList();
List<WindowedDataSegment> list = new ArrayList<>();
long size = 0;
org.apache.hadoop.mapred.InputFormat fio = supplier.get();
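// Greedily pack segments into the current split; start a new split once adding the next
// segment would push the accumulated size past maxSize.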
for (WindowedDataSegment segment : segments) {
if (size + segment.getSegment().getSize() > maxSize && size > 0) {
splits.add(toDataSourceSplit(list, fio, conf));
list = Lists.newArrayList();
size = 0;
}
list.add(segment);
size += segment.getSegment().getSize();
}
if (list.size() > 0) {
splits.add(toDataSourceSplit(list, fio, conf));
}
logger.info("Number of splits [%d]", splits.size());
return splits;
}
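/**
 * Returns a {@link DatasourceRecordReader}, which produces {@code InputRow}s for the
 * segments referenced by the given split.
 */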
@Override
public RecordReader<NullWritable, InputRow> createRecordReader(
InputSplit split,
TaskAttemptContext context
) throws IOException, InterruptedException
{
return new DatasourceRecordReader();
}
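// Supplies the low-level mapred InputFormat used only to discover the block locations of
// segment files; the anonymous TextInputFormat below disables splitting and globbing for
// that purpose.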
private Supplier<org.apache.hadoop.mapred.InputFormat> supplier = new Supplier<org.apache.hadoop.mapred.InputFormat>()
{
@Override
public org.apache.hadoop.mapred.InputFormat get()
{
return new TextInputFormat()
{
//Always treat segment files as non-splittable: we only want the block locations for each
//segment, not actual splitting.
//Also, without this override isSplitable(..) fails with an NPE because compressionCodecs is not properly set up.
@Override
protected boolean isSplitable(FileSystem fs, Path file)
{
return false;
}
@Override
protected FileStatus[] listStatus(JobConf job) throws IOException
{
// avoid globbing, which requires the input path to be hadoop-compatible (e.g. ':' is not acceptable in a path)
List<FileStatus> statusList = Lists.newArrayList();
for (Path path : FileInputFormat.getInputPaths(job)) {
// the load spec in the segment points to the zip file itself
statusList.add(path.getFileSystem(job).getFileStatus(path));
}
return statusList.toArray(new FileStatus[statusList.size()]);
}
};
}
};
@VisibleForTesting
DatasourceInputFormat setSupplier(Supplier<org.apache.hadoop.mapred.InputFormat> supplier)
{
this.supplier = supplier;
return this;
}
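// Wraps a batch of segments into a DatasourceInputSplit, tagged with the block locations
// that occur most frequently across the segments' underlying files.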
private DatasourceInputSplit toDataSourceSplit(
List<WindowedDataSegment> segments,
org.apache.hadoop.mapred.InputFormat fio,
JobConf conf
)
{
String[] locations = getFrequentLocations(getLocations(segments, fio, conf));
return new DatasourceInputSplit(segments, locations);
}
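/**
 * Streams the block locations of every segment file in {@code segments}, as reported by the
 * supplied mapred {@link org.apache.hadoop.mapred.InputFormat}. Failures for individual
 * segments or splits are logged and skipped rather than failing the whole job.
 */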
@VisibleForTesting
static Stream<String> getLocations(
final List<WindowedDataSegment> segments,
final org.apache.hadoop.mapred.InputFormat fio,
final JobConf conf
)
{
return segments.stream().sequential().flatMap(
(final WindowedDataSegment segment) -> {
FileInputFormat.setInputPaths(
conf,
new Path(JobHelper.getURIFromSegment(segment.getSegment()))
);
try {
return Arrays.stream(fio.getSplits(conf, 1)).flatMap(
(final org.apache.hadoop.mapred.InputSplit split) -> {
try {
return Arrays.stream(split.getLocations());
}
catch (final Exception e) {
logger.error(e, "Exception getting locations");
return Stream.empty();
}
}
);
}
catch (final Exception e) {
logger.error(e, "Exception getting splits");
return Stream.empty();
}
}
);
}
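/**
 * Counts how often each location appears in the stream and returns up to the three most
 * frequent ones (ties broken by lexicographic order of the location name).
 */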
@VisibleForTesting
static String[] getFrequentLocations(final Stream<String> locations)
{
final Map<String, Long> locationCountMap = locations.collect(
Collectors.groupingBy(location -> location, Collectors.counting())
);
final Comparator<Map.Entry<String, Long>> valueComparator =
Map.Entry.comparingByValue(Comparator.reverseOrder());
final Comparator<Map.Entry<String, Long>> keyComparator =
Map.Entry.comparingByKey();
return locationCountMap
.entrySet().stream()
.sorted(valueComparator.thenComparing(keyComparator))
.limit(3)
.map(Map.Entry::getKey)
.toArray(String[]::new);
}
}