/*
* Licensed to Metamarkets Group Inc. (Metamarkets) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. Metamarkets licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package io.druid.segment;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.base.Function;
import com.google.common.base.Preconditions;
import com.google.common.base.Splitter;
import com.google.common.base.Strings;
import com.google.common.collect.FluentIterable;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Iterables;
import com.google.common.collect.Iterators;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Ordering;
import com.google.common.collect.PeekingIterator;
import com.google.common.collect.Sets;
import com.google.common.io.ByteStreams;
import com.google.common.io.Files;
import com.google.common.io.OutputSupplier;
import com.google.common.primitives.Ints;
import com.google.inject.Inject;
import com.metamx.collections.bitmap.BitmapFactory;
import com.metamx.collections.bitmap.ImmutableBitmap;
import com.metamx.collections.bitmap.MutableBitmap;
import com.metamx.collections.spatial.ImmutableRTree;
import com.metamx.collections.spatial.RTree;
import com.metamx.collections.spatial.split.LinearGutmanSplitStrategy;
import com.metamx.common.IAE;
import com.metamx.common.ISE;
import com.metamx.common.Pair;
import com.metamx.common.guava.FunctionalIterable;
import com.metamx.common.guava.MergeIterable;
import com.metamx.common.guava.nary.BinaryFn;
import com.metamx.common.io.smoosh.Smoosh;
import com.metamx.common.logger.Logger;
import io.druid.collections.CombiningIterable;
import io.druid.common.guava.FileOutputSupplier;
import io.druid.common.guava.GuavaUtils;
import io.druid.common.utils.JodaUtils;
import io.druid.common.utils.SerializerUtils;
import io.druid.query.aggregation.AggregatorFactory;
import io.druid.segment.column.ColumnCapabilities;
import io.druid.segment.column.ColumnCapabilitiesImpl;
import io.druid.segment.column.ValueType;
import io.druid.segment.data.BitmapSerdeFactory;
import io.druid.segment.data.ByteBufferWriter;
import io.druid.segment.data.CompressedLongsSupplierSerializer;
import io.druid.segment.data.CompressedObjectStrategy;
import io.druid.segment.data.GenericIndexed;
import io.druid.segment.data.GenericIndexedWriter;
import io.druid.segment.data.IOPeon;
import io.druid.segment.data.Indexed;
import io.druid.segment.data.IndexedInts;
import io.druid.segment.data.IndexedIterable;
import io.druid.segment.data.IndexedRTree;
import io.druid.segment.data.ListIndexed;
import io.druid.segment.data.TmpFileIOPeon;
import io.druid.segment.data.VSizeIndexedWriter;
import io.druid.segment.incremental.IncrementalIndex;
import io.druid.segment.incremental.IncrementalIndexAdapter;
import io.druid.segment.serde.ComplexMetricColumnSerializer;
import io.druid.segment.serde.ComplexMetricSerde;
import io.druid.segment.serde.ComplexMetrics;
import org.apache.commons.io.FileUtils;
import org.joda.time.DateTime;
import org.joda.time.Interval;
import javax.annotation.Nullable;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.IntBuffer;
import java.nio.MappedByteBuffer;
import java.nio.channels.FileChannel;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.PriorityQueue;
import java.util.Set;
import java.util.TreeSet;
/**
*/
public class IndexMerger
{
private static final Logger log = new Logger(IndexMerger.class);
protected static final ListIndexed<String> EMPTY_STR_DIM_VAL = new ListIndexed<>(Arrays.asList(""), String.class);
protected static final SerializerUtils serializerUtils = new SerializerUtils();
protected static final int INVALID_ROW = -1;
protected static final Splitter SPLITTER = Splitter.on(",");
protected final ObjectMapper mapper;
protected final IndexIO indexIO;
@Inject
public IndexMerger(
ObjectMapper mapper,
IndexIO indexIO
)
{
this.mapper = Preconditions.checkNotNull(mapper, "null ObjectMapper");
this.indexIO = Preconditions.checkNotNull(indexIO, "null IndexIO");
}
public File persist(
final IncrementalIndex index,
File outDir,
IndexSpec indexSpec
) throws IOException
{
return persist(index, index.getInterval(), outDir, indexSpec);
}
/**
* This is *not* thread-safe and havoc will ensue if this is called while writes are still occurring
* on the IncrementalIndex object.
*
* @param index the IncrementalIndex to persist
* @param dataInterval the Interval that the data represents
* @param outDir the directory to persist the data to
* @param indexSpec the IndexSpec that controls how the segment is serialized
*
* @return the index output directory
*
* @throws java.io.IOException if an IO error occurs persisting the index
*/
public File persist(
final IncrementalIndex index,
final Interval dataInterval,
File outDir,
IndexSpec indexSpec
) throws IOException
{
return persist(index, dataInterval, outDir, indexSpec, new BaseProgressIndicator());
}
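/*
* Usage sketch (illustrative, not part of the original class): assumes a populated
* IncrementalIndex named "index" and already-constructed ObjectMapper/IndexIO instances.
*
*   IndexMerger merger = new IndexMerger(objectMapper, indexIO);
*   File segmentDir = merger.persist(index, new File("/tmp/druid/segment"), new IndexSpec());
*
* The overload that omits the Interval defaults to index.getInterval().
*/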
public File persist(
final IncrementalIndex index,
final Interval dataInterval,
File outDir,
IndexSpec indexSpec,
ProgressIndicator progress
) throws IOException
{
if (index.isEmpty()) {
throw new IAE("Trying to persist an empty index!");
}
final long firstTimestamp = index.getMinTime().getMillis();
final long lastTimestamp = index.getMaxTime().getMillis();
if (!(dataInterval.contains(firstTimestamp) && dataInterval.contains(lastTimestamp))) {
throw new IAE(
"interval[%s] does not encapsulate the full range of timestamps[%s, %s]",
dataInterval,
new DateTime(firstTimestamp),
new DateTime(lastTimestamp)
);
}
if (!outDir.exists()) {
outDir.mkdirs();
}
if (!outDir.isDirectory()) {
throw new ISE("Can only persist to directories, [%s] wasn't a directory", outDir);
}
log.info("Starting persist for interval[%s], rows[%,d]", dataInterval, index.size());
return merge(
Arrays.asList(
new IncrementalIndexAdapter(
dataInterval,
index,
indexSpec.getBitmapSerdeFactory().getBitmapFactory()
)
),
index.getMetricAggs(),
outDir,
indexSpec,
progress
);
}
public File mergeQueryableIndex(
List<QueryableIndex> indexes,
final AggregatorFactory[] metricAggs,
File outDir,
IndexSpec indexSpec
) throws IOException
{
return mergeQueryableIndex(indexes, metricAggs, outDir, indexSpec, new BaseProgressIndicator());
}
public File mergeQueryableIndex(
List<QueryableIndex> indexes,
final AggregatorFactory[] metricAggs,
File outDir,
IndexSpec indexSpec,
ProgressIndicator progress
) throws IOException
{
// We are materializing the list for performance reasons. Lists.transform
// only creates a "view" of the original list, meaning the function gets
// applied every time you access an element.
List<IndexableAdapter> indexAdapters = Lists.newArrayList(
Iterables.transform(
indexes,
new Function<QueryableIndex, IndexableAdapter>()
{
@Override
public IndexableAdapter apply(final QueryableIndex input)
{
return new QueryableIndexIndexableAdapter(input);
}
}
)
);
return merge(
indexAdapters,
metricAggs,
outDir,
indexSpec,
progress
);
}
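/*
* Usage sketch (illustrative; "indexA" and "indexB" are assumed QueryableIndex instances,
* e.g. loaded via indexIO.loadIndex(someSegmentDir)):
*
*   File merged = merger.mergeQueryableIndex(
*       Arrays.asList(indexA, indexB),
*       new AggregatorFactory[]{new LongSumAggregatorFactory("count", "count")},
*       new File("/tmp/druid/merged"),
*       new IndexSpec()
*   );
*
* Rows that compare as equal on time and dimensions are rolled up with the supplied
* aggregators inside merge() below.
*/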
public File merge(
List<IndexableAdapter> indexes,
final AggregatorFactory[] metricAggs,
File outDir,
IndexSpec indexSpec
) throws IOException
{
return merge(indexes, metricAggs, outDir, indexSpec, new BaseProgressIndicator());
}
private static List<String> getLexicographicMergedDimensions(List<IndexableAdapter> indexes)
{
return mergeIndexed(
Lists.transform(
indexes,
new Function<IndexableAdapter, Iterable<String>>()
{
@Override
public Iterable<String> apply(@Nullable IndexableAdapter input)
{
return input.getDimensionNames();
}
}
)
);
}
private static List<String> getLongestSharedDimOrder(List<IndexableAdapter> indexes)
{
// Pick the longest dimension list among the indexes as the ordering candidate.
int maxSize = 0;
Iterable<String> orderingCandidate = null;
for (IndexableAdapter index : indexes) {
int iterSize = index.getDimensionNames().size();
if (iterSize > maxSize) {
maxSize = iterSize;
orderingCandidate = index.getDimensionNames();
}
}
if (orderingCandidate == null) {
return null;
}
// The candidate only works if every index's dimension list is a subsequence of it;
// a single mismatch means the indexes disagree on ordering, so return null.
for (IndexableAdapter index : indexes) {
Iterator<String> candidateIter = orderingCandidate.iterator();
for (String matchDim : index.getDimensionNames()) {
boolean matched = false;
while (candidateIter.hasNext()) {
String nextDim = candidateIter.next();
if (matchDim.equals(nextDim)) {
matched = true;
break;
}
}
if (!matched) {
return null;
}
}
}
return ImmutableList.copyOf(orderingCandidate);
}
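/*
* Worked example (illustrative): given index A with dimensions [d1, d2, d3] and index B
* with [d1, d3], the candidate is [d1, d2, d3] and B's list is a subsequence of it, so
* [d1, d2, d3] is returned. If B instead had [d3, d1], no subsequence match exists, the
* method returns null, and getMergedDimensions() falls back to lexicographic merging.
*/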
public static List<String> getMergedDimensions(List<IndexableAdapter> indexes)
{
if (indexes.size() == 0) {
return ImmutableList.of();
}
List<String> commonDimOrder = getLongestSharedDimOrder(indexes);
if (commonDimOrder == null) {
log.warn("Indexes have incompatible dimension orders, using lexicographic order.");
return getLexicographicMergedDimensions(indexes);
} else {
return commonDimOrder;
}
}
public File merge(
List<IndexableAdapter> indexes,
final AggregatorFactory[] metricAggs,
File outDir,
IndexSpec indexSpec,
ProgressIndicator progress
) throws IOException
{
FileUtils.deleteDirectory(outDir);
if (!outDir.mkdirs()) {
throw new ISE("Couldn't make outdir[%s].", outDir);
}
final List<String> mergedDimensions = getMergedDimensions(indexes);
final List<String> mergedMetrics = Lists.transform(
mergeIndexed(
Lists.newArrayList(
FunctionalIterable
.create(indexes)
.transform(
new Function<IndexableAdapter, Iterable<String>>()
{
@Override
public Iterable<String> apply(@Nullable IndexableAdapter input)
{
return input.getMetricNames();
}
}
)
)
),
new Function<String, String>()
{
@Override
public String apply(@Nullable String input)
{
return input;
}
}
);
final AggregatorFactory[] sortedMetricAggs = new AggregatorFactory[mergedMetrics.size()];
for (int i = 0; i < metricAggs.length; i++) {
AggregatorFactory metricAgg = metricAggs[i];
int metricIndex = mergedMetrics.indexOf(metricAgg.getName());
/*
If metricIndex is negative, one of the metricAggs was not present in the union of metrics from the indices
we are merging
*/
if (metricIndex > -1) {
sortedMetricAggs[metricIndex] = metricAgg;
}
}
/*
If there is nothing at sortedMetricAggs[i], then we did not have a metricAgg whose name matched the name
of the ith element of mergedMetrics. I.e. There was a metric in the indices to merge that we did not ask for.
*/
for (int i = 0; i < sortedMetricAggs.length; i++) {
if (sortedMetricAggs[i] == null) {
throw new IAE("Indices to merge contained metric[%s], but requested metrics did not", mergedMetrics.get(i));
}
}
for (int i = 0; i < mergedMetrics.size(); i++) {
if (!sortedMetricAggs[i].getName().equals(mergedMetrics.get(i))) {
throw new IAE(
"Metric mismatch, index[%d] [%s] != [%s]",
i,
sortedMetricAggs[i].getName(),
mergedMetrics.get(i)
);
}
}
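/*
* Worked example (illustrative): if mergedMetrics is ["added", "deleted"] and metricAggs
* arrives as [deleted, added], the loop above places each factory at the index of its
* name, so sortedMetricAggs lines up element-for-element with mergedMetrics. The two
* checks just performed guarantee that the alignment is complete and exact.
*/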
Function<ArrayList<Iterable<Rowboat>>, Iterable<Rowboat>> rowMergerFn = new Function<ArrayList<Iterable<Rowboat>>, Iterable<Rowboat>>()
{
@Override
public Iterable<Rowboat> apply(
@Nullable ArrayList<Iterable<Rowboat>> boats
)
{
return CombiningIterable.create(
new MergeIterable<Rowboat>(
Ordering.<Rowboat>natural().nullsFirst(),
boats
),
Ordering.<Rowboat>natural().nullsFirst(),
new RowboatMergeFunction(sortedMetricAggs)
);
}
};
return makeIndexFiles(
indexes,
sortedMetricAggs,
outDir,
progress,
mergedDimensions,
mergedMetrics,
rowMergerFn,
indexSpec
);
}
// Faster than IndexMaker
public File convert(final File inDir, final File outDir, final IndexSpec indexSpec) throws IOException
{
return convert(inDir, outDir, indexSpec, new BaseProgressIndicator());
}
public File convert(
final File inDir, final File outDir, final IndexSpec indexSpec, final ProgressIndicator progress
) throws IOException
{
try (QueryableIndex index = indexIO.loadIndex(inDir)) {
final IndexableAdapter adapter = new QueryableIndexIndexableAdapter(index);
return makeIndexFiles(
ImmutableList.of(adapter),
null,
outDir,
progress,
Lists.newArrayList(adapter.getDimensionNames()),
Lists.newArrayList(adapter.getMetricNames()),
new Function<ArrayList<Iterable<Rowboat>>, Iterable<Rowboat>>()
{
@Nullable
@Override
public Iterable<Rowboat> apply(ArrayList<Iterable<Rowboat>> input)
{
return input.get(0);
}
},
indexSpec
);
}
}
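/*
* Usage sketch (illustrative): rewrite an existing segment directory under a different
* IndexSpec, e.g. to switch bitmap or compression settings:
*
*   merger.convert(new File("/segments/old"), new File("/segments/new"), new IndexSpec());
*
* No merging happens here: there is a single adapter, and the row-merger function simply
* returns input.get(0).
*/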
public File append(
List<IndexableAdapter> indexes, AggregatorFactory[] aggregators, File outDir, IndexSpec indexSpec
) throws IOException
{
return append(indexes, aggregators, outDir, indexSpec, new BaseProgressIndicator());
}
public File append(
List<IndexableAdapter> indexes,
AggregatorFactory[] aggregators,
File outDir,
IndexSpec indexSpec,
ProgressIndicator progress
) throws IOException
{
FileUtils.deleteDirectory(outDir);
if (!outDir.mkdirs()) {
throw new ISE("Couldn't make outdir[%s].", outDir);
}
final List<String> mergedDimensions = getMergedDimensions(indexes);
final List<String> mergedMetrics = mergeIndexed(
Lists.transform(
indexes,
new Function<IndexableAdapter, Iterable<String>>()
{
@Override
public Iterable<String> apply(@Nullable IndexableAdapter input)
{
return Iterables.transform(
input.getMetricNames(),
new Function<String, String>()
{
@Override
public String apply(@Nullable String input)
{
return input;
}
}
);
}
}
)
);
Function<ArrayList<Iterable<Rowboat>>, Iterable<Rowboat>> rowMergerFn = new Function<ArrayList<Iterable<Rowboat>>, Iterable<Rowboat>>()
{
@Override
public Iterable<Rowboat> apply(
@Nullable final ArrayList<Iterable<Rowboat>> boats
)
{
return new MergeIterable<Rowboat>(
Ordering.<Rowboat>natural().nullsFirst(),
boats
);
}
};
return makeIndexFiles(
indexes,
aggregators,
outDir,
progress,
mergedDimensions,
mergedMetrics,
rowMergerFn,
indexSpec
);
}
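/*
* Note the contrast with merge(): append() wraps the per-index iterables in a plain
* MergeIterable, so rows are interleaved in sorted order but never combined, whereas
* merge() additionally applies CombiningIterable with a RowboatMergeFunction to roll up
* rows that compare as equal.
*/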
protected File makeIndexFiles(
final List<IndexableAdapter> indexes,
final AggregatorFactory[] metricAggs,
final File outDir,
final ProgressIndicator progress,
final List mergedDimensions,
final List mergedMetrics,
final Function<ArrayList<Iterable<Rowboat>>, Iterable<Rowboat>> rowMergerFn,
final IndexSpec indexSpec
) throws IOException
{
List<Metadata> metadataList = Lists.transform(
indexes,
new Function<IndexableAdapter, Metadata>()
{
@Nullable
@Override
public Metadata apply(IndexableAdapter input)
{
return input.getMetadata();
}
}
);
Metadata segmentMetadata = null;
if (metricAggs != null) {
AggregatorFactory[] combiningMetricAggs = new AggregatorFactory[metricAggs.length];
for (int i = 0; i < metricAggs.length; i++) {
combiningMetricAggs[i] = metricAggs[i].getCombiningFactory();
}
segmentMetadata = Metadata.merge(
metadataList,
combiningMetricAggs
);
} else {
segmentMetadata = Metadata.merge(
metadataList,
null
);
}
final Map<String, ValueType> valueTypes = Maps.newTreeMap(Ordering.<String>natural().nullsFirst());
final Map<String, String> metricTypeNames = Maps.newTreeMap(Ordering.<String>natural().nullsFirst());
final Map<String, ColumnCapabilitiesImpl> columnCapabilities = Maps.newHashMap();
for (IndexableAdapter adapter : indexes) {
for (String dimension : adapter.getDimensionNames()) {
ColumnCapabilitiesImpl mergedCapabilities = columnCapabilities.get(dimension);
ColumnCapabilities capabilities = adapter.getCapabilities(dimension);
if (mergedCapabilities == null) {
mergedCapabilities = new ColumnCapabilitiesImpl();
mergedCapabilities.setType(ValueType.STRING);
}
columnCapabilities.put(dimension, mergedCapabilities.merge(capabilities));
}
for (String metric : adapter.getMetricNames()) {
ColumnCapabilitiesImpl mergedCapabilities = columnCapabilities.get(metric);
ColumnCapabilities capabilities = adapter.getCapabilities(metric);
if (mergedCapabilities == null) {
mergedCapabilities = new ColumnCapabilitiesImpl();
}
columnCapabilities.put(metric, mergedCapabilities.merge(capabilities));
valueTypes.put(metric, capabilities.getType());
metricTypeNames.put(metric, adapter.getMetricType(metric));
}
}
final Interval dataInterval;
File v8OutDir = new File(outDir, "v8-tmp");
v8OutDir.mkdirs();
/************* Main index.drd file **************/
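/*
* Layout written below (v8 index.drd), in order: one version byte (IndexIO.V8_VERSION),
* a GenericIndexed of dimension names, a GenericIndexed of metric names, the data
* interval as a "start/end" string, and the JSON-serialized BitmapSerdeFactory from the
* IndexSpec.
*/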
progress.progress();
long startTime = System.currentTimeMillis();
File indexFile = new File(v8OutDir, "index.drd");
try (FileOutputStream fileOutputStream = new FileOutputStream(indexFile);
FileChannel channel = fileOutputStream.getChannel()) {
channel.write(ByteBuffer.wrap(new byte[]{IndexIO.V8_VERSION}));
GenericIndexed.fromIterable(mergedDimensions, GenericIndexed.STRING_STRATEGY).writeToChannel(channel);
GenericIndexed.fromIterable(mergedMetrics, GenericIndexed.STRING_STRATEGY).writeToChannel(channel);
DateTime minTime = new DateTime(JodaUtils.MAX_INSTANT);
DateTime maxTime = new DateTime(JodaUtils.MIN_INSTANT);
for (IndexableAdapter index : indexes) {
minTime = JodaUtils.minDateTime(minTime, index.getDataInterval().getStart());
maxTime = JodaUtils.maxDateTime(maxTime, index.getDataInterval().getEnd());
}
dataInterval = new Interval(minTime, maxTime);
serializerUtils.writeString(channel, String.format("%s/%s", minTime, maxTime));
serializerUtils.writeString(channel, mapper.writeValueAsString(indexSpec.getBitmapSerdeFactory()));
}
IndexIO.checkFileSize(indexFile);
log.info("outDir[%s] completed index.drd in %,d millis.", v8OutDir, System.currentTimeMillis() - startTime);
/************* Setup Dim Conversions **************/
progress.progress();
startTime = System.currentTimeMillis();
IOPeon ioPeon = new TmpFileIOPeon();
ArrayList<FileOutputSupplier> dimOuts = Lists.newArrayListWithCapacity(mergedDimensions.size());
Map<String, Integer> dimensionCardinalities = Maps.newHashMap();
ArrayList