/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.druid.server.compaction;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
import org.apache.druid.java.util.common.DateTimes;
import org.apache.druid.java.util.common.Intervals;
import org.apache.druid.java.util.common.JodaUtils;
import org.apache.druid.java.util.common.granularity.Granularity;
import org.apache.druid.java.util.common.guava.Comparators;
import org.apache.druid.java.util.common.logger.Logger;
import org.apache.druid.server.coordinator.DataSourceCompactionConfig;
import org.apache.druid.timeline.DataSegment;
import org.apache.druid.timeline.Partitions;
import org.apache.druid.timeline.SegmentTimeline;
import org.apache.druid.timeline.TimelineObjectHolder;
import org.apache.druid.timeline.partition.NumberedPartitionChunk;
import org.apache.druid.timeline.partition.NumberedShardSpec;
import org.apache.druid.timeline.partition.PartitionChunk;
import org.apache.druid.utils.CollectionUtils;
import org.apache.druid.utils.Streams;
import org.joda.time.DateTime;
import org.joda.time.Interval;
import org.joda.time.Period;
import javax.annotation.Nullable;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.PriorityQueue;
import java.util.Set;
import java.util.stream.Collectors;
/**
* Iterator over compactible segments of a datasource in order of specified priority.
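*
* A minimal usage sketch (hypothetical wiring; in practice the coordinator
* supplies the config, timeline, search policy, and status tracker):
* <pre>{@code
* DataSourceCompactibleSegmentIterator iterator = new DataSourceCompactibleSegmentIterator(
*     compactionConfig,         // DataSourceCompactionConfig of one datasource
*     segmentTimeline,          // SegmentTimeline of that datasource's segments
*     Collections.emptyList(),  // intervals to skip
*     searchPolicy,             // orders candidates in the priority queue
*     statusTracker
* );
* while (iterator.hasNext()) {
*   CompactionCandidate candidate = iterator.next();
*   // submit a compaction task for the segments in this candidate
* }
* }</pre>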
*/
public class DataSourceCompactibleSegmentIterator implements CompactionSegmentIterator
{
private static final Logger log = new Logger(DataSourceCompactibleSegmentIterator.class);
private final String dataSource;
private final DataSourceCompactionConfig config;
private final CompactionStatusTracker statusTracker;
private final CompactionCandidateSearchPolicy searchPolicy;
private final List<CompactionCandidate> compactedSegments = new ArrayList<>();
private final List<CompactionCandidate> skippedSegments = new ArrayList<>();
// This is needed for datasources that have segmentGranularity configured.
// If the segmentGranularity configured in the config is finer than the current
// segmentGranularity, the same set of segments can belong to multiple intervals
// in the timeline (e.g. a MONTH segment re-bucketed by DAY granularity appears
// under every day of that month). We keep track of the intervals queued between
// each run of the compaction job and skip any interval that was already queued.
private final Set<Interval> queuedIntervals = new HashSet<>();
private final PriorityQueue<CompactionCandidate> queue;
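/**
* Creates an iterator for a single datasource and eagerly populates the
* priority queue with compaction candidates found in the given timeline.
*/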
public DataSourceCompactibleSegmentIterator(
DataSourceCompactionConfig config,
SegmentTimeline timeline,
List<Interval> skipIntervals,
CompactionCandidateSearchPolicy searchPolicy,
CompactionStatusTracker statusTracker
)
{
this.statusTracker = statusTracker;
this.config = config;
this.dataSource = config.getDataSource();
this.searchPolicy = searchPolicy;
this.queue = new PriorityQueue<>(searchPolicy);
populateQueue(timeline, skipIntervals);
}
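/**
* Finds all compactible candidates in the timeline and enqueues them in
* search-policy order. If a segmentGranularity is configured, the timeline is
* first re-bucketed to that granularity so that candidates line up with the
* target time chunks.
*/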
private void populateQueue(SegmentTimeline timeline, List<Interval> skipIntervals)
{
if (timeline != null) {
if (!timeline.isEmpty()) {
SegmentTimeline originalTimeline = null;
if (config.getSegmentGranularity() != null) {
final Set<DataSegment> segments = timeline.findNonOvershadowedObjectsInInterval(
Intervals.ETERNITY,
Partitions.ONLY_COMPLETE
);
// Skip compaction if any segment has partial-eternity interval
// See https://github.com/apache/druid/issues/13208
final List<DataSegment> partialEternitySegments = new ArrayList<>();
for (DataSegment segment : segments) {
if (Intervals.ETERNITY.getStart().equals(segment.getInterval().getStart())
|| Intervals.ETERNITY.getEnd().equals(segment.getInterval().getEnd())) {
partialEternitySegments.add(segment);
}
}
if (!partialEternitySegments.isEmpty()) {
CompactionCandidate candidatesWithStatus = CompactionCandidate.from(partialEternitySegments).withCurrentStatus(
CompactionStatus.skipped("Segments have partial-eternity intervals")
);
skippedSegments.add(candidatesWithStatus);
statusTracker.onCompactionStatusComputed(candidatesWithStatus, config);
return;
}
// Convert original segmentGranularity to new granularities bucket by configuredSegmentGranularity
// For example, if the original is interval of 2020-01-28/2020-02-03 with WEEK granularity
// and the configuredSegmentGranularity is MONTH, the segment will be split to two segments
// of 2020-01/2020-02 and 2020-02/2020-03.
final SegmentTimeline timelineWithConfiguredSegmentGranularity = new SegmentTimeline();
final Map<Interval, Set<DataSegment>> intervalToPartitionMap = new HashMap<>();
for (DataSegment segment : segments) {
for (Interval interval : config.getSegmentGranularity().getIterable(segment.getInterval())) {
intervalToPartitionMap.computeIfAbsent(interval, k -> new HashSet<>())
.add(segment);
}
}
final String temporaryVersion = DateTimes.nowUtc().toString();
for (Map.Entry<Interval, Set<DataSegment>> partitionsPerInterval : intervalToPartitionMap.entrySet()) {
Interval interval = partitionsPerInterval.getKey();
int partitionNum = 0;
Set<DataSegment> segmentSet = partitionsPerInterval.getValue();
int partitions = segmentSet.size();
for (DataSegment segment : segmentSet) {
DataSegment segmentsForCompact = segment.withShardSpec(new NumberedShardSpec(partitionNum, partitions));
timelineWithConfiguredSegmentGranularity.add(
interval,
temporaryVersion,
NumberedPartitionChunk.make(partitionNum, partitions, segmentsForCompact)
);
partitionNum += 1;
}
}
// A PartitionHolder can only hold chunks of one partition space.
// However, a partition in the new timeline (timelineWithConfiguredSegmentGranularity) can hold multiple
// partitions of the original timeline (when the new segmentGranularity is larger than the original
// segmentGranularity). Hence, we group all the segments of the original timeline into interval buckets
// based on the new configuredSegmentGranularity. We then convert each segment into a new partition space
// so that there is no duplicate partitionNum across all segments of each new interval.
// Similarly, segment versions may be mixed in the same time chunk based on the new segment granularity.
// Hence we create the new timeline with a temporary version, setting the same fake version
// for all segments in the same new time bucket.
// We store the originalTimeline so that we can use it to restore the original ShardSpec and
// original version when converting each segment back before returning it from this iterator.
originalTimeline = timeline;
timeline = timelineWithConfiguredSegmentGranularity;
}
final List<Interval> searchIntervals = findInitialSearchInterval(timeline, skipIntervals);
if (!searchIntervals.isEmpty()) {
findAndEnqueueSegmentsToCompact(
new CompactibleSegmentIterator(timeline, searchIntervals, originalTimeline)
);
} else {
log.warn("Skipping compaction for datasource[%s] as it has no compactible segments.", dataSource);
}
}
}
}
@Override
public List<CompactionCandidate> getCompactedSegments()
{
return compactedSegments;
}
@Override
public List<CompactionCandidate> getSkippedSegments()
{
return skippedSegments;
}
@Override
public boolean hasNext()
{
return !queue.isEmpty();
}
@Override
public CompactionCandidate next()
{
if (hasNext()) {
return queue.poll();
} else {
throw new NoSuchElementException();
}
}
/**
* Iterates compactible segments in a {@link SegmentTimeline}.
*/
private static class CompactibleSegmentIterator implements Iterator<List<DataSegment>>
{
private final List<TimelineObjectHolder<String, DataSegment>> holders;
@Nullable
private final SegmentTimeline originalTimeline;
CompactibleSegmentIterator(
SegmentTimeline timeline,
List<Interval> totalIntervalsToSearch,
// originalTimeline can be null if timeline was not modified
@Nullable SegmentTimeline originalTimeline
)
{
this.holders = totalIntervalsToSearch.stream().flatMap(
interval -> timeline
.lookup(interval)
.stream()
.filter(holder -> isCompactibleHolder(interval, holder))
).collect(Collectors.toList());
this.originalTimeline = originalTimeline;
}
/**
* Checks if the {@link TimelineObjectHolder} satisfies the following:
*
* - It has at least one segment.
* - The interval of the segments is contained in the searchInterval.
* - The total bytes across all the segments is positive.
*
*/
private boolean isCompactibleHolder(Interval searchInterval, TimelineObjectHolder<String, DataSegment> holder)
{
final Iterator<PartitionChunk<DataSegment>> chunks = holder.getObject().iterator();
if (!chunks.hasNext()) {
return false;
}
PartitionChunk<DataSegment> firstChunk = chunks.next();
if (!searchInterval.contains(firstChunk.getObject().getInterval())) {
return false;
}
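// Sum chunk sizes until a positive total is found; a holder whose segments
// are all zero bytes (e.g. only tombstones) is not compactible.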
long partitionBytes = firstChunk.getObject().getSize();
while (partitionBytes == 0 && chunks.hasNext()) {
partitionBytes += chunks.next().getObject().getSize();
}
return partitionBytes > 0;
}
@Override
public boolean hasNext()
{
return !holders.isEmpty();
}
/**
* Returns the next list of compactible segments in the datasource timeline.
* The returned list satisfies the following conditions:
*
* - The list is non-null and non-empty.
* - The segments are present in the search interval.
* - Total bytes of segments in the list is greater than zero.
*
*/
@Override
public List<DataSegment> next()
{
if (!hasNext()) {
throw new NoSuchElementException();
}
TimelineObjectHolder<String, DataSegment> timelineObjectHolder = holders.remove(holders.size() - 1);
List<DataSegment> candidates = Streams.sequentialStreamFrom(timelineObjectHolder.getObject())
.map(PartitionChunk::getObject)
.collect(Collectors.toList());
if (originalTimeline == null) {
return candidates;
} else {
Interval umbrellaInterval = JodaUtils.umbrellaInterval(
candidates.stream().map(DataSegment::getInterval).collect(Collectors.toList())
);
return Lists.newArrayList(
originalTimeline.findNonOvershadowedObjectsInInterval(umbrellaInterval, Partitions.ONLY_COMPLETE)
);
}
}
}
/**
* Finds segments to compact together for the given datasource and adds them to
* the priority queue.
*/
private void findAndEnqueueSegmentsToCompact(CompactibleSegmentIterator compactibleSegmentIterator)
{
while (compactibleSegmentIterator.hasNext()) {
List<DataSegment> segments = compactibleSegmentIterator.next();
if (CollectionUtils.isNullOrEmpty(segments)) {
continue;
}
// Do not compact an interval which contains a single tombstone
// If there are multiple tombstones in the interval, we may still want to compact them
if (segments.size() == 1 && segments.get(0).isTombstone()) {
continue;
}
final CompactionCandidate candidates = CompactionCandidate.from(segments);
final CompactionStatus compactionStatus
= statusTracker.computeCompactionStatus(candidates, config, searchPolicy);
final CompactionCandidate candidatesWithStatus = candidates.withCurrentStatus(compactionStatus);
statusTracker.onCompactionStatusComputed(candidatesWithStatus, config);
if (compactionStatus.isComplete()) {
compactedSegments.add(candidatesWithStatus);
} else if (compactionStatus.isSkipped()) {
skippedSegments.add(candidatesWithStatus);
} else if (!queuedIntervals.contains(candidates.getUmbrellaInterval())) {
queue.add(candidatesWithStatus);
queuedIntervals.add(candidates.getUmbrellaInterval());
}
}
}
/**
* Returns the initial searchInterval which is {@code (timeline.first().start, timeline.last().end - skipOffset)}.
*/
private List<Interval> findInitialSearchInterval(
SegmentTimeline timeline,
@Nullable List<Interval> skipIntervals
)
{
final Period skipOffset = config.getSkipOffsetFromLatest();
Preconditions.checkArgument(timeline != null && !timeline.isEmpty(), "timeline should not be null or empty");
Preconditions.checkNotNull(skipOffset, "skipOffset");
final TimelineObjectHolder<String, DataSegment> first = Preconditions.checkNotNull(timeline.first(), "first");
final TimelineObjectHolder<String, DataSegment> last = Preconditions.checkNotNull(timeline.last(), "last");
final Interval latestSkipInterval = computeLatestSkipInterval(
config.getSegmentGranularity(),
last.getInterval().getEnd(),
skipOffset
);
final List<Interval> allSkipIntervals
= sortAndAddSkipIntervalFromLatest(latestSkipInterval, skipIntervals);
// Collect stats for all skipped segments
for (Interval skipInterval : allSkipIntervals) {
final List<DataSegment> segments = new ArrayList<>(
timeline.findNonOvershadowedObjectsInInterval(skipInterval, Partitions.ONLY_COMPLETE)
);
if (!CollectionUtils.isNullOrEmpty(segments)) {
final CompactionCandidate candidates = CompactionCandidate.from(segments);
final CompactionStatus reason;
if (candidates.getUmbrellaInterval().overlaps(latestSkipInterval)) {
reason = CompactionStatus.skipped("skip offset from latest[%s]", skipOffset);
} else {
reason = CompactionStatus.skipped("interval locked by another task");
}
final CompactionCandidate candidatesWithStatus = candidates.withCurrentStatus(reason);
skippedSegments.add(candidatesWithStatus);
statusTracker.onCompactionStatusComputed(candidatesWithStatus, config);
}
}
final Interval totalInterval = new Interval(first.getInterval().getStart(), last.getInterval().getEnd());
final List<Interval> filteredInterval = filterSkipIntervals(totalInterval, allSkipIntervals);
final List<Interval> searchIntervals = new ArrayList<>();
for (Interval lookupInterval : filteredInterval) {
if (Intervals.ETERNITY.getStart().equals(lookupInterval.getStart())
|| Intervals.ETERNITY.getEnd().equals(lookupInterval.getEnd())) {
log.warn(
"Cannot compact datasource[%s] since interval[%s] coincides with ETERNITY.",
dataSource, lookupInterval
);
return Collections.emptyList();
}
final List<DataSegment> segments = timeline
.findNonOvershadowedObjectsInInterval(lookupInterval, Partitions.ONLY_COMPLETE)
.stream()
// findNonOvershadowedObjectsInInterval() may return segments merely intersecting with lookupInterval, while
// we are interested only in segments fully lying within lookupInterval here.
.filter(segment -> lookupInterval.contains(segment.getInterval()))
.collect(Collectors.toList());
if (segments.isEmpty()) {
continue;
}
DateTime searchStart = segments
.stream()
.map(segment -> segment.getId().getIntervalStart())
.min(Comparator.naturalOrder())
.orElseThrow(AssertionError::new);
DateTime searchEnd = segments
.stream()
.map(segment -> segment.getId().getIntervalEnd())
.max(Comparator.naturalOrder())
.orElseThrow(AssertionError::new);
searchIntervals.add(new Interval(searchStart, searchEnd));
}
return searchIntervals;
}
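/**
* Computes the skip interval that ends at the latest data timestamp and spans
* {@code skipOffsetFromLatest} back from it. If a segmentGranularity is
* configured, the start is widened to the start of its containing time bucket.
* For example, with latest end 2020-02-03T06:00Z, offset P1D and granularity
* DAY, the result is 2020-02-02T00:00Z/2020-02-03T06:00Z.
*/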
static Interval computeLatestSkipInterval(
@Nullable Granularity configuredSegmentGranularity,
DateTime latestDataTimestamp,
Period skipOffsetFromLatest
)
{
if (configuredSegmentGranularity == null) {
return new Interval(skipOffsetFromLatest, latestDataTimestamp);
} else {
DateTime skipFromLatest = new DateTime(latestDataTimestamp, latestDataTimestamp.getZone()).minus(skipOffsetFromLatest);
DateTime skipOffsetBucketToSegmentGranularity = configuredSegmentGranularity.bucketStart(skipFromLatest);
return new Interval(skipOffsetBucketToSegmentGranularity, latestDataTimestamp);
}
}
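/**
* Merges {@code skipFromLatest} into the given skip intervals: any interval
* overlapping it is collapsed with it into a single umbrella interval. For
* example, skipFromLatest 2020-01-20/2020-02-01 with skipIntervals
* [2020-01-05/2020-01-10, 2020-01-25/2020-01-28] yields
* [2020-01-05/2020-01-10, 2020-01-20/2020-02-01].
*/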
@VisibleForTesting
static List<Interval> sortAndAddSkipIntervalFromLatest(
Interval skipFromLatest,
@Nullable List<Interval> skipIntervals
)
{
final List<Interval> nonNullSkipIntervals = skipIntervals == null
? new ArrayList<>(1)
: new ArrayList<>(skipIntervals.size());
if (skipIntervals != null) {
final List<Interval> sortedSkipIntervals = new ArrayList<>(skipIntervals);
sortedSkipIntervals.sort(Comparators.intervalsByStartThenEnd());
final List<Interval> overlapIntervals = new ArrayList<>();
for (Interval interval : sortedSkipIntervals) {
if (interval.overlaps(skipFromLatest)) {
overlapIntervals.add(interval);
} else {
nonNullSkipIntervals.add(interval);
}
}
if (!overlapIntervals.isEmpty()) {
overlapIntervals.add(skipFromLatest);
nonNullSkipIntervals.add(JodaUtils.umbrellaInterval(overlapIntervals));
} else {
nonNullSkipIntervals.add(skipFromLatest);
}
} else {
nonNullSkipIntervals.add(skipFromLatest);
}
return nonNullSkipIntervals;
}
/**
* Returns a list of intervals which are contained by totalInterval but don't overlap with skipIntervals.
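* For example, totalInterval 2020-01-01/2020-03-01 with skipIntervals
* [2020-01-10/2020-01-15] yields [2020-01-01/2020-01-10, 2020-01-15/2020-03-01].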
*
* @param totalInterval total interval
* @param skipIntervals intervals to skip. This should be sorted by {@link Comparators#intervalsByStartThenEnd()}.
*/
@VisibleForTesting
static List<Interval> filterSkipIntervals(Interval totalInterval, List<Interval> skipIntervals)
{
final List<Interval> filteredIntervals = new ArrayList<>(skipIntervals.size() + 1);
DateTime remainingStart = totalInterval.getStart();
DateTime remainingEnd = totalInterval.getEnd();
for (Interval skipInterval : skipIntervals) {
if (skipInterval.getStart().isBefore(remainingStart) && skipInterval.getEnd().isAfter(remainingStart)) {
remainingStart = skipInterval.getEnd();
} else if (skipInterval.getStart().isBefore(remainingEnd) && skipInterval.getEnd().isAfter(remainingEnd)) {
remainingEnd = skipInterval.getStart();
} else if (!remainingStart.isAfter(skipInterval.getStart()) && !remainingEnd.isBefore(skipInterval.getEnd())) {
filteredIntervals.add(new Interval(remainingStart, skipInterval.getStart()));
remainingStart = skipInterval.getEnd();
} else {
// Ignore this skipInterval
log.warn(
"skipInterval[%s] is not contained in remainingInterval[%s]",
skipInterval,
new Interval(remainingStart, remainingEnd)
);
}
}
if (!remainingStart.equals(remainingEnd)) {
filteredIntervals.add(new Interval(remainingStart, remainingEnd));
}
return filteredIntervals;
}
}