
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.index;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
/**
* Merges segments of approximately equal size, subject to an allowed number of segments per tier.
* This is similar to {@link LogByteSizeMergePolicy}, except this merge policy is able to merge
 * non-adjacent segments, and separates how many segments are merged at once ({@link
* #setMaxMergeAtOnce}) from how many segments are allowed per tier ({@link #setSegmentsPerTier}).
* This merge policy also does not over-merge (i.e. cascade merges).
*
 * <p>For normal merging, this policy first computes a "budget" of how many segments are allowed to
* be in the index. If the index is over-budget, then the policy sorts segments by decreasing size
* (pro-rating by percent deletes), and then finds the least-cost merge. Merge cost is measured by a
* combination of the "skew" of the merge (size of largest segment divided by smallest segment),
* total merge size and percent deletes reclaimed, so that merges with lower skew, smaller size and
* those reclaiming more deletes, are favored.
 *
 * <p>If a merge will produce a segment that's larger than {@link #setMaxMergedSegmentMB}, then the
* policy will merge fewer segments (down to 1 at once, if that one has deletions) to keep the
* segment size under budget.
 *
 * <p><b>NOTE</b>: this policy freely merges non-adjacent segments; if this is a problem, use {@link
* LogMergePolicy}.
 *
 * <p><b>NOTE</b>: This policy always merges by byte size of the segments, always pro-rating by
 * percent deletes.
 *
 * <p><b>NOTE</b>: Starting with Lucene 7.5, if you call {@link IndexWriter#forceMerge(int)} with
* this (default) merge policy, if {@link #setMaxMergedSegmentMB} is in conflict with {@code
* maxNumSegments} passed to {@link IndexWriter#forceMerge} then {@code maxNumSegments} wins. For
 * example, if your index has fifty 1 GB segments, and you have {@link #setMaxMergedSegmentMB} at 1024
* (1 GB), and you call {@code forceMerge(10)}, the two settings are clearly in conflict. {@code
* TieredMergePolicy} will choose to break the {@link #setMaxMergedSegmentMB} constraint and try to
* merge down to at most ten segments, each up to 5 * 1.25 GB in size (since an extra 25% buffer
 * increase in the expected segment size is targeted).
 *
 * <p>findForcedDeletesMerges should never produce segments greater than maxSegmentSize.
 *
 * <p><b>NOTE</b>: This policy returns natural merges whose size is below the {@link
* #setFloorSegmentMB(double) floor segment size} for {@link #findFullFlushMerges full-flush
* merges}.
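 *
 * <p>A minimal configuration sketch (illustrative values only, not tuning advice; assumes an
 * existing {@code Analyzer} instance named {@code analyzer}):
 *
 * <pre>{@code
 * TieredMergePolicy tmp = new TieredMergePolicy()
 *     .setMaxMergedSegmentMB(5 * 1024)   // cap merged segments at ~5 GB (the default)
 *     .setSegmentsPerTier(10.0)          // allow ~10 similarly sized segments per tier
 *     .setDeletesPctAllowed(20.0);       // tolerate up to 20% deleted docs index-wide
 * IndexWriterConfig iwc = new IndexWriterConfig(analyzer).setMergePolicy(tmp);
 * }</pre>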
*
* @lucene.experimental
*/
// TODO
// - we could try to take into account whether a large
// merge is already running (under CMS) and then bias
// ourselves towards picking smaller merges if so (or,
// maybe CMS should do so)
public class TieredMergePolicy extends MergePolicy {
/**
* Default noCFSRatio. If a merge's size is {@code >= 10%} of the index, then we disable compound
* file for it.
*
* @see MergePolicy#setNoCFSRatio
*/
public static final double DEFAULT_NO_CFS_RATIO = 0.1;
// User-specified maxMergeAtOnce. In practice we always take the min of its
// value and segsPerTier for segments above the floor size to avoid suboptimal merging.
private int maxMergeAtOnce = 10;
private long maxMergedSegmentBytes = 5 * 1024 * 1024 * 1024L;
private long floorSegmentBytes = 16 * 1024 * 1024L;
private double segsPerTier = 10.0;
private double forceMergeDeletesPctAllowed = 10.0;
private double deletesPctAllowed = 20.0;
private int targetSearchConcurrency = 1;
/** Sole constructor, setting all settings to their defaults. */
public TieredMergePolicy() {
super(DEFAULT_NO_CFS_RATIO, MergePolicy.DEFAULT_MAX_CFS_SEGMENT_SIZE);
}
/**
* Maximum number of segments to be merged at a time during "normal" merging. Default is 10.
 *
 * <p><b>NOTE</b>: Merges above the {@link #setFloorSegmentMB(double) floor segment size} also
* bound the number of merged segments by {@link #setSegmentsPerTier(double) the number of
* segments per tier}.
*/
public TieredMergePolicy setMaxMergeAtOnce(int v) {
if (v < 2) {
throw new IllegalArgumentException("maxMergeAtOnce must be > 1 (got " + v + ")");
}
maxMergeAtOnce = v;
return this;
}
private enum MERGE_TYPE {
NATURAL,
FORCE_MERGE,
FORCE_MERGE_DELETES
}
/**
* Returns the current maxMergeAtOnce setting.
*
* @see #setMaxMergeAtOnce
*/
public int getMaxMergeAtOnce() {
return maxMergeAtOnce;
}
// TODO: should addIndexes do explicit merging, too? And,
// if user calls IW.maybeMerge "explicitly"
/**
* Maximum sized segment to produce during normal merging. This setting is approximate: the
* estimate of the merged segment size is made by summing sizes of to-be-merged segments
* (compensating for percent deleted docs). Default is 5 GB.
*/
public TieredMergePolicy setMaxMergedSegmentMB(double v) {
if (v < 0.0) {
throw new IllegalArgumentException("maxMergedSegmentMB must be >=0 (got " + v + ")");
}
v *= 1024 * 1024;
maxMergedSegmentBytes = v > Long.MAX_VALUE ? Long.MAX_VALUE : (long) v;
return this;
}
/**
* Returns the current maxMergedSegmentMB setting.
*
* @see #setMaxMergedSegmentMB
*/
public double getMaxMergedSegmentMB() {
return maxMergedSegmentBytes / 1024.0 / 1024.0;
}
/**
* Controls the maximum percentage of deleted documents that is tolerated in the index. Lower
* values make the index more space efficient at the expense of increased CPU and I/O activity.
* Values must be between 5 and 50. Default value is 20.
 *
 * <p>When the maximum delete percentage is lowered, the indexing thread will call for merges more
* often, meaning that write amplification factor will be increased. Write amplification factor
* measures the number of times each document in the index is written. A higher write
* amplification factor will lead to higher CPU and I/O activity as indicated above.
*/
public TieredMergePolicy setDeletesPctAllowed(double v) {
if (v < 5 || v > 50) {
throw new IllegalArgumentException(
"indexPctDeletedTarget must be >= 5.0 and <= 50 (got " + v + ")");
}
deletesPctAllowed = v;
return this;
}
/**
* Returns the current deletesPctAllowed setting.
*
* @see #setDeletesPctAllowed
*/
public double getDeletesPctAllowed() {
return deletesPctAllowed;
}
/**
* Segments smaller than this size are merged more aggressively:
 *
 * <ul>
 *   <li>They are candidates for full-flush merges, in order to reduce the number of segments in
 *       the index prior to opening a new point-in-time view of the index.
 *   <li>For background merges, smaller segments are "rounded up" to this size.
 * </ul>
 *
 * In both cases, this helps prevent frequent flushing of tiny segments from creating a long tail of
* small segments in the index. Default is 16MB.
*/
public TieredMergePolicy setFloorSegmentMB(double v) {
if (v <= 0.0) {
throw new IllegalArgumentException("floorSegmentMB must be > 0.0 (got " + v + ")");
}
v *= 1024 * 1024;
floorSegmentBytes = v > Long.MAX_VALUE ? Long.MAX_VALUE : (long) v;
return this;
}
/**
* Returns the current floorSegmentMB.
*
* @see #setFloorSegmentMB
*/
public double getFloorSegmentMB() {
return floorSegmentBytes / (1024 * 1024.);
}
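/**
 * The floor segment size also serves as the upper bound on segments considered for full-flush
 * merges (see the class javadoc note about {@link #findFullFlushMerges full-flush merges}).
 */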
@Override
protected long maxFullFlushMergeSize() {
return floorSegmentBytes;
}
/**
* When forceMergeDeletes is called, we only merge away a segment if its delete percentage is over
* this threshold. Default is 10%.
*/
public TieredMergePolicy setForceMergeDeletesPctAllowed(double v) {
if (v < 0.0 || v > 100.0) {
throw new IllegalArgumentException(
"forceMergeDeletesPctAllowed must be between 0.0 and 100.0 inclusive (got " + v + ")");
}
forceMergeDeletesPctAllowed = v;
return this;
}
/**
* Returns the current forceMergeDeletesPctAllowed setting.
*
* @see #setForceMergeDeletesPctAllowed
*/
public double getForceMergeDeletesPctAllowed() {
return forceMergeDeletesPctAllowed;
}
/**
* Sets the allowed number of segments per tier. Smaller values mean more merging but fewer
* segments.
*
 * <p>Default is 10.0.
*/
public TieredMergePolicy setSegmentsPerTier(double v) {
if (v < 2.0) {
throw new IllegalArgumentException("segmentsPerTier must be >= 2.0 (got " + v + ")");
}
segsPerTier = v;
return this;
}
/**
* Returns the current segmentsPerTier setting.
*
* @see #setSegmentsPerTier
*/
public double getSegmentsPerTier() {
return segsPerTier;
}
/**
* Sets the target search concurrency. This prevents creating segments that are bigger than
* maxDoc/targetSearchConcurrency, which in turn makes the work parallelizable into
* targetSearchConcurrency slices of similar doc counts. It also makes merging less aggressive, as
 * higher values result in indices that do less merging and have more segments.
*/
public TieredMergePolicy setTargetSearchConcurrency(int targetSearchConcurrency) {
if (targetSearchConcurrency < 1) {
throw new IllegalArgumentException(
"targetSearchConcurrency must be >= 1 (got " + targetSearchConcurrency + ")");
}
this.targetSearchConcurrency = targetSearchConcurrency;
return this;
}
/** Returns the target search concurrency. */
public int getTargetSearchConcurrency() {
return targetSearchConcurrency;
}
private static class SegmentSizeAndDocs {
private final SegmentCommitInfo segInfo;
/** Size of the segment in bytes, pro-rated by the number of live documents. */
private final long sizeInBytes;
private final int delCount;
private final int maxDoc;
private final String name;
SegmentSizeAndDocs(SegmentCommitInfo info, final long sizeInBytes, final int segDelCount)
throws IOException {
segInfo = info;
this.name = info.info.name;
this.sizeInBytes = sizeInBytes;
this.delCount = segDelCount;
this.maxDoc = info.info.maxDoc();
}
}
/** Holds score and explanation for a single candidate merge. */
protected abstract static class MergeScore {
/** Sole constructor. (For invocation by subclass constructors, typically implicit.) */
protected MergeScore() {}
/** Returns the score for this merge candidate; lower scores are better. */
abstract double getScore();
/** Human readable explanation of how the merge got this score. */
abstract String getExplanation();
}
// The size can change concurrently while we are running here, because deletes
// are now applied concurrently, and a changing sort key can violate TimSort's contract. So we
// call size() once per segment and sort by that cached value:
private List<SegmentSizeAndDocs> getSortedBySegmentSize(
final SegmentInfos infos, final MergeContext mergeContext) throws IOException {
List<SegmentSizeAndDocs> sortedBySize = new ArrayList<>();
for (SegmentCommitInfo info : infos) {
sortedBySize.add(
new SegmentSizeAndDocs(
info, size(info, mergeContext), mergeContext.numDeletesToMerge(info)));
}
sortedBySize.sort(
(o1, o2) -> {
// Sort by largest size:
int cmp = Long.compare(o2.sizeInBytes, o1.sizeInBytes);
if (cmp == 0) {
cmp = o1.name.compareTo(o2.name);
}
return cmp;
});
return sortedBySize;
}
@Override
public MergeSpecification findMerges(
MergeTrigger mergeTrigger, SegmentInfos infos, MergeContext mergeContext) throws IOException {
final Set<SegmentCommitInfo> merging = mergeContext.getMergingSegments();
// Compute total index bytes & print details about the index
long totIndexBytes = 0;
long minSegmentBytes = Long.MAX_VALUE;
int totalDelDocs = 0;
int totalMaxDoc = 0;
long mergingBytes = 0;
List<SegmentSizeAndDocs> sortedInfos = getSortedBySegmentSize(infos, mergeContext);
Iterator<SegmentSizeAndDocs> iter = sortedInfos.iterator();
while (iter.hasNext()) {
SegmentSizeAndDocs segSizeDocs = iter.next();
final long segBytes = segSizeDocs.sizeInBytes;
if (verbose(mergeContext)) {
String extra = merging.contains(segSizeDocs.segInfo) ? " [merging]" : "";
if (segBytes >= maxMergedSegmentBytes) {
extra += " [skip: too large]";
} else if (segBytes < floorSegmentBytes) {
extra += " [floored]";
}
message(
" seg="
+ segString(mergeContext, Collections.singleton(segSizeDocs.segInfo))
+ " size="
+ String.format(Locale.ROOT, "%.3f", segBytes / 1024. / 1024.)
+ " MB"
+ extra,
mergeContext);
}
if (merging.contains(segSizeDocs.segInfo)) {
mergingBytes += segSizeDocs.sizeInBytes;
iter.remove();
// if this segment is merging, then its deletes are being reclaimed already.
// only count live docs in the total max doc
totalMaxDoc += segSizeDocs.maxDoc - segSizeDocs.delCount;
} else {
totalDelDocs += segSizeDocs.delCount;
totalMaxDoc += segSizeDocs.maxDoc;
}
minSegmentBytes = Math.min(segBytes, minSegmentBytes);
totIndexBytes += segBytes;
}
assert totalMaxDoc >= 0;
assert totalDelDocs >= 0;
final double totalDelPct = 100 * (double) totalDelDocs / totalMaxDoc;
int allowedDelCount = (int) (deletesPctAllowed * totalMaxDoc / 100);
// If we have too-large segments, grace them out of the maximum segment count
// If we're above certain thresholds of deleted docs, we can merge very large segments.
int tooBigCount = 0;
// We relax merging for the bigger segments for concurrency reasons, as we want to have several
// segments on the highest tier without over-merging on the lower tiers.
int concurrencyCount = 0;
iter = sortedInfos.iterator();
double allowedSegCount = 0;
// remove large segments from consideration under two conditions.
// 1> Overall percent deleted docs relatively small and this segment is larger than 50%
// maxSegSize
// 2> overall percent deleted docs large and this segment is large and has few deleted docs
while (iter.hasNext()) {
SegmentSizeAndDocs segSizeDocs = iter.next();
double segDelPct = 100 * (double) segSizeDocs.delCount / (double) segSizeDocs.maxDoc;
if (segSizeDocs.sizeInBytes > maxMergedSegmentBytes / 2
&& (totalDelPct <= deletesPctAllowed || segDelPct <= deletesPctAllowed)) {
iter.remove();
tooBigCount++;
totIndexBytes -= segSizeDocs.sizeInBytes;
allowedDelCount -= segSizeDocs.delCount;
} else if (concurrencyCount + tooBigCount < targetSearchConcurrency - 1) {
// Make sure we count a whole segment for the first targetSearchConcurrency-1 segments to
// avoid over merging on the lower levels.
concurrencyCount++;
allowedSegCount++;
totIndexBytes -= segSizeDocs.sizeInBytes;
}
}
allowedDelCount = Math.max(0, allowedDelCount);
final int mergeFactor = (int) Math.min(maxMergeAtOnce, segsPerTier);
// Compute max allowed segments for the remainder of the index
long levelSize = Math.max(minSegmentBytes, floorSegmentBytes);
long bytesLeft = totIndexBytes;
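// Illustrative walk-through (assuming the 16 MB floor and mergeFactor = 10): the budget allows
// up to segsPerTier segments of ~16 MB, then up to segsPerTier of ~160 MB, then ~1.6 GB, and so
// on, until the remaining bytes fit within a single tier or the tier size reaches
// maxMergedSegmentBytes, at which point the remainder is counted at that level and we stop.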
while (true) {
final double segCountLevel = bytesLeft / (double) levelSize;
if (segCountLevel < segsPerTier || levelSize == maxMergedSegmentBytes) {
allowedSegCount += Math.ceil(segCountLevel);
break;
}
allowedSegCount += segsPerTier;
bytesLeft -= segsPerTier * levelSize;
levelSize = Math.min(maxMergedSegmentBytes, levelSize * mergeFactor);
}
// allowedSegCount may occasionally be less than segsPerTier
// if segment sizes are below the floor size
allowedSegCount = Math.max(allowedSegCount, segsPerTier);
// No need to merge if the total number of segments (including too big segments) is less than or
// equal to the target search concurrency.
allowedSegCount = Math.max(allowedSegCount, targetSearchConcurrency - tooBigCount);
int allowedDocCount = getMaxAllowedDocs(totalMaxDoc, totalDelDocs);
if (verbose(mergeContext) && tooBigCount > 0) {
message(
" allowedSegmentCount="
+ allowedSegCount
+ " vs count="
+ infos.size()
+ " (eligible count="
+ sortedInfos.size()
+ ") tooBigCount= "
+ tooBigCount
+ " allowedDocCount="
+ allowedDocCount
+ " vs doc count="
+ infos.totalMaxDoc(),
mergeContext);
}
return doFindMerges(
sortedInfos,
maxMergedSegmentBytes,
mergeFactor,
(int) allowedSegCount,
allowedDelCount,
allowedDocCount,
MERGE_TYPE.NATURAL,
mergeContext,
mergingBytes >= maxMergedSegmentBytes);
}
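/**
 * Shared merge-selection loop used by natural merging and forced-deletes merging: it repeatedly
 * scores candidate windows over the size-sorted eligible segments and records the best-scoring
 * merge per iteration; for natural merges it stops once the segment and delete counts are back
 * under budget, otherwise when no eligible candidates remain.
 */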
private MergeSpecification doFindMerges(
List<SegmentSizeAndDocs> sortedEligibleInfos,
final long maxMergedSegmentBytes,
final int mergeFactor,
final int allowedSegCount,
final int allowedDelCount,
final int allowedDocCount,
final MERGE_TYPE mergeType,
MergeContext mergeContext,
boolean maxMergeIsRunning)
throws IOException {
List<SegmentSizeAndDocs> sortedEligible = new ArrayList<>(sortedEligibleInfos);
Map<SegmentCommitInfo, SegmentSizeAndDocs> segInfosSizes = new HashMap<>();
for (SegmentSizeAndDocs segSizeDocs : sortedEligible) {
segInfosSizes.put(segSizeDocs.segInfo, segSizeDocs);
}
int originalSortedSize = sortedEligible.size();
if (verbose(mergeContext)) {
message("findMerges: " + originalSortedSize + " segments", mergeContext);
}
if (originalSortedSize == 0) {
return null;
}
final Set<SegmentCommitInfo> toBeMerged = new HashSet<>();
MergeSpecification spec = null;
// Cycle to possibly select more than one merge:
// The trigger point for total deleted documents in the index leads to a bunch of large segment
// merges at the same time. So only put one large merge in the list of merges per cycle; we'll
// pick up another merge next time around.
boolean haveOneLargeMerge = false;
while (true) {
// Gather eligible segments for merging, ie segments
// not already being merged and not already picked (by
// prior iteration of this loop) for merging:
// Remove ineligible segments. These are either already being merged or already picked by
// prior iterations
Iterator<SegmentSizeAndDocs> iter = sortedEligible.iterator();
while (iter.hasNext()) {
SegmentSizeAndDocs segSizeDocs = iter.next();
if (toBeMerged.contains(segSizeDocs.segInfo)) {
iter.remove();
}
}
if (verbose(mergeContext)) {
message(
" allowedSegmentCount="
+ allowedSegCount
+ " vs count="
+ originalSortedSize
+ " (eligible count="
+ sortedEligible.size()
+ ")",
mergeContext);
}
if (sortedEligible.size() == 0) {
return spec;
}
final int remainingDelCount = sortedEligible.stream().mapToInt(c -> c.delCount).sum();
if (mergeType == MERGE_TYPE.NATURAL
&& sortedEligible.size() <= allowedSegCount
&& remainingDelCount <= allowedDelCount) {
return spec;
}
// OK we are over budget -- find best merge!
MergeScore bestScore = null;
List<SegmentCommitInfo> best = null;
boolean bestTooLarge = false;
long bestMergeBytes = 0;
for (int startIdx = 0; startIdx < sortedEligible.size(); startIdx++) {
final List<SegmentCommitInfo> candidate = new ArrayList<>();
boolean hitTooLarge = false;
long bytesThisMerge = 0;
long docCountThisMerge = 0;
for (int idx = startIdx;
idx < sortedEligible.size()
&& candidate.size() < maxMergeAtOnce
// We allow merging more than mergeFactor segments together if the merged segment
// would be less than the floor segment size. This is important because segments
// below the floor segment size are more aggressively merged by this policy, so we
// need to grow them as quickly as possible.
&& (candidate.size() < mergeFactor || bytesThisMerge < floorSegmentBytes)
&& bytesThisMerge < maxMergedSegmentBytes
&& (bytesThisMerge < floorSegmentBytes || docCountThisMerge <= allowedDocCount);
idx++) {
final SegmentSizeAndDocs segSizeDocs = sortedEligible.get(idx);
final long segBytes = segSizeDocs.sizeInBytes;
int segDocCount = segSizeDocs.maxDoc - segSizeDocs.delCount;
if (bytesThisMerge + segBytes > maxMergedSegmentBytes
|| (bytesThisMerge > floorSegmentBytes
&& docCountThisMerge + segDocCount > allowedDocCount)) {
// Only set hitTooLarge when reaching the maximum byte size, as this will create
// segments of the maximum size which will no longer be eligible for merging for a long
// time (until they accumulate enough deletes).
hitTooLarge |= bytesThisMerge + segBytes > maxMergedSegmentBytes;
// We should never have something coming in that _cannot_ be merged, so handle
// singleton merges
if (candidate.size() > 0) {
// NOTE: we continue, so that we can try
// "packing" smaller segments into this merge
// to see if we can get closer to the max
// size; this in general is not perfect since
// this is really "bin packing" and we'd have
// to try different permutations.
continue;
}
}
candidate.add(segSizeDocs.segInfo);
bytesThisMerge += segBytes;
docCountThisMerge += segDocCount;
}
// We should never see an empty candidate: we iterated over maxMergeAtOnce
// segments, and already pre-excluded the too-large segments:
assert candidate.size() > 0;
SegmentSizeAndDocs maxCandidateSegmentSize = segInfosSizes.get(candidate.get(0));
if (hitTooLarge == false
&& mergeType == MERGE_TYPE.NATURAL
&& bytesThisMerge < maxCandidateSegmentSize.sizeInBytes * 1.5
&& maxCandidateSegmentSize.delCount
< maxCandidateSegmentSize.maxDoc * deletesPctAllowed / 100) {
// Ignore any merge where the resulting segment is not at least 50% larger than the
// biggest input segment.
// Otherwise we could run into pathological O(N^2) merging where merges keep rewriting
// again and again the biggest input segment into a segment that is barely bigger.
// The only exception we make is when the merge would reclaim lots of deletes in the
// biggest segment. This is important for cases when lots of documents get deleted at once
// without introducing new segments of a similar size for instance.
continue;
}
// A singleton merge with no deletes makes no sense. We can get here when forceMerge is
// looping around...
if (candidate.size() == 1 && maxCandidateSegmentSize.delCount == 0) {
continue;
}
// If we didn't find a too-large merge and have a list of candidates
// whose length is less than the merge factor, it means we are reaching
// the tail of the list of segments and will only find smaller merges.
// Stop here.
if (bestScore != null && hitTooLarge == false && candidate.size() < mergeFactor) {
break;
}
final MergeScore score = score(candidate, hitTooLarge, segInfosSizes);
if (verbose(mergeContext)) {
message(
" maybe="
+ segString(mergeContext, candidate)
+ " score="
+ score.getScore()
+ " "
+ score.getExplanation()
+ " tooLarge="
+ hitTooLarge
+ " size="
+ String.format(Locale.ROOT, "%.3f MB", bytesThisMerge / 1024. / 1024.),
mergeContext);
}
if ((bestScore == null || score.getScore() < bestScore.getScore())
&& (!hitTooLarge || !maxMergeIsRunning)) {
best = candidate;
bestScore = score;
bestTooLarge = hitTooLarge;
bestMergeBytes = bytesThisMerge;
}
}
if (best == null) {
return spec;
}
// The mergeType == FORCE_MERGE_DELETES case behaves as the code does currently and can create a
// large number of concurrent big merges. If we make findForcedDeletesMerges behave as
// findForcedMerges and cycle through, we should remove this.
if (haveOneLargeMerge == false
|| bestTooLarge == false
|| mergeType == MERGE_TYPE.FORCE_MERGE_DELETES) {
haveOneLargeMerge |= bestTooLarge;
if (spec == null) {
spec = new MergeSpecification();
}
final OneMerge merge = new OneMerge(best);
spec.add(merge);
if (verbose(mergeContext)) {
message(
" add merge="
+ segString(mergeContext, merge.segments)
+ " size="
+ String.format(Locale.ROOT, "%.3f MB", bestMergeBytes / 1024. / 1024.)
+ " score="
+ String.format(Locale.ROOT, "%.3f", bestScore.getScore())
+ " "
+ bestScore.getExplanation()
+ (bestTooLarge ? " [max merge]" : ""),
mergeContext);
}
}
// Whether we're going to return this list in the spec or not, we need to remove it from
// consideration on the next loop iteration.
toBeMerged.addAll(best);
}
}
/**
 * Expert: scores one merge; subclasses can override. Lower scores are better. The default score
 * combines the merge's "skew" (floored size of the largest input divided by the floored total
 * size), a mild penalty on total merged size, and a strong bonus for reclaiming deletes:
 * roughly {@code skew * totAfterMergeBytes^0.05 * nonDelRatio^2}.
 */
protected MergeScore score(
List<SegmentCommitInfo> candidate,
boolean hitTooLarge,
Map<SegmentCommitInfo, SegmentSizeAndDocs> segmentsSizes)
throws IOException {
long totBeforeMergeBytes = 0;
long totAfterMergeBytes = 0;
long totAfterMergeBytesFloored = 0;
for (SegmentCommitInfo info : candidate) {
final long segBytes = segmentsSizes.get(info).sizeInBytes;
totAfterMergeBytes += segBytes;
totAfterMergeBytesFloored += floorSize(segBytes);
totBeforeMergeBytes += info.sizeInBytes();
}
// Roughly measure "skew" of the merge, i.e. how
// "balanced" the merge is (whether the segments are
// about the same size), which can range from
// 1.0/numSegsBeingMerged (good) to 1.0 (poor). Heavily
// lopsided merges (skew near 1.0) are no good; they mean
// O(N^2) merge cost over time:
final double skew;
if (hitTooLarge) {
// Pretend the merge has perfect skew; skew doesn't
// matter in this case because this merge will not
// "cascade" and so it cannot lead to N^2 merge cost
// over time:
final int mergeFactor = (int) Math.min(maxMergeAtOnce, segsPerTier);
skew = 1.0 / mergeFactor;
} else {
skew =
((double) floorSize(segmentsSizes.get(candidate.get(0)).sizeInBytes))
/ totAfterMergeBytesFloored;
}
// Strongly favor merges with less skew (smaller
// mergeScore is better):
double mergeScore = skew;
// Gently favor smaller merges over bigger ones. We
// don't want to make this exponent too large else we
// can end up doing poor merges of small segments in
// order to avoid the large merges:
mergeScore *= Math.pow((double) totAfterMergeBytes, 0.05);
// Strongly favor merges that reclaim deletes:
final double nonDelRatio = ((double) totAfterMergeBytes) / totBeforeMergeBytes;
mergeScore *= Math.pow(nonDelRatio, 2);
final double finalMergeScore = mergeScore;
return new MergeScore() {
@Override
public double getScore() {
return finalMergeScore;
}
@Override
public String getExplanation() {
return "skew="
+ String.format(Locale.ROOT, "%.3f", skew)
+ " nonDelRatio="
+ String.format(Locale.ROOT, "%.3f", nonDelRatio);
}
};
}
@Override
public MergeSpecification findForcedMerges(
SegmentInfos infos,
int maxSegmentCount,
Map<SegmentCommitInfo, Boolean> segmentsToMerge,
MergeContext mergeContext)
throws IOException {
if (verbose(mergeContext)) {
message(
"findForcedMerges maxSegmentCount="
+ maxSegmentCount
+ " infos="
+ segString(mergeContext, infos)
+ " segmentsToMerge="
+ segmentsToMerge,
mergeContext);
}
List<SegmentSizeAndDocs> sortedSizeAndDocs = getSortedBySegmentSize(infos, mergeContext);
long totalMergeBytes = 0;
final Set<SegmentCommitInfo> merging = mergeContext.getMergingSegments();
// Trim the list down, remove if we're respecting max segment size and it's not original.
// Presumably it's been merged before and is close enough to the max segment size that we
// shouldn't add it in again.
Iterator<SegmentSizeAndDocs> iter = sortedSizeAndDocs.iterator();
boolean forceMergeRunning = false;
while (iter.hasNext()) {
SegmentSizeAndDocs segSizeDocs = iter.next();
final Boolean isOriginal = segmentsToMerge.get(segSizeDocs.segInfo);
if (isOriginal == null) {
iter.remove();
} else {
if (merging.contains(segSizeDocs.segInfo)) {
forceMergeRunning = true;
iter.remove();
} else {
totalMergeBytes += segSizeDocs.sizeInBytes;
}
}
}
long maxMergeBytes = maxMergedSegmentBytes;
// Set the maximum segment size based on how many segments have been specified.
if (maxSegmentCount == 1) {
maxMergeBytes = Long.MAX_VALUE;
} else if (maxSegmentCount != Integer.MAX_VALUE) {
maxMergeBytes =
Math.max(
(long) (((double) totalMergeBytes / (double) maxSegmentCount)),
maxMergedSegmentBytes);
// Fudge this up a bit so we have a better chance of not having to do a second pass of merging
// to get down to the requested target segment count. If we use the exact size, it's almost
// guaranteed that the segments selected below won't fit perfectly and we'll be left with more
// segments than we want and have to re-merge in the code at the bottom of this method.
maxMergeBytes = (long) ((double) maxMergeBytes * 1.25);
}
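// Illustrative example (matching the class javadoc): forceMerge(10) over ~50 GB of eligible
// segments yields maxMergeBytes = max(50 GB / 10, maxMergedSegmentBytes) = 5 GB with the default
// cap, then bumped by 25% to ~6.25 GB per merged segment.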
iter = sortedSizeAndDocs.iterator();
boolean foundDeletes = false;
while (iter.hasNext()) {
SegmentSizeAndDocs segSizeDocs = iter.next();
Boolean isOriginal = segmentsToMerge.get(segSizeDocs.segInfo);
if (segSizeDocs.delCount != 0) {
// This is forceMerge; all segments with deleted docs should be merged.
if (isOriginal != null && isOriginal) {
foundDeletes = true;
}
continue;
}
// Let the scoring handle whether to merge large segments.
if (maxSegmentCount == Integer.MAX_VALUE && isOriginal != null && isOriginal == false) {
iter.remove();
}
// Don't try to merge a segment with no deleted docs that's over the max size.
if (maxSegmentCount != Integer.MAX_VALUE && segSizeDocs.sizeInBytes >= maxMergeBytes) {
iter.remove();
}
}
// Nothing to merge this round.
if (sortedSizeAndDocs.size() == 0) {
return null;
}
// We only bail if there are no deletions
if (foundDeletes == false) {
SegmentCommitInfo infoZero = sortedSizeAndDocs.get(0).segInfo;
if ((maxSegmentCount != Integer.MAX_VALUE
&& maxSegmentCount > 1
&& sortedSizeAndDocs.size() <= maxSegmentCount)
|| (maxSegmentCount == 1
&& sortedSizeAndDocs.size() == 1
&& (segmentsToMerge.get(infoZero) != null
|| isMerged(infos, infoZero, mergeContext)))) {
if (verbose(mergeContext)) {
message("already merged", mergeContext);
}
return null;
}
}
if (verbose(mergeContext)) {
message("eligible=" + sortedSizeAndDocs, mergeContext);
}
final int startingSegmentCount = sortedSizeAndDocs.size();
if (forceMergeRunning) {
// hmm this is a little dangerous -- if a user kicks off a forceMerge, it is taking forever, lots
// of new indexing/segments happened since, and they want to kick off another to ensure those
// newly indexed segments partake in the force merge, they (silently) won't due to this?
return null;
}
// This is the special case of merging down to one segment
if (maxSegmentCount == 1 && totalMergeBytes < maxMergeBytes) {
MergeSpecification spec = new MergeSpecification();
List<SegmentCommitInfo> allOfThem = new ArrayList<>();
for (SegmentSizeAndDocs segSizeDocs : sortedSizeAndDocs) {
allOfThem.add(segSizeDocs.segInfo);
}
spec.add(new OneMerge(allOfThem));
return spec;
}
MergeSpecification spec = null;
int index = startingSegmentCount - 1;
int resultingSegments = startingSegmentCount;
while (true) {
List<SegmentCommitInfo> candidate = new ArrayList<>();
long currentCandidateBytes = 0L;
while (index >= 0 && resultingSegments > maxSegmentCount) {
final SegmentCommitInfo current = sortedSizeAndDocs.get(index).segInfo;
final int initialCandidateSize = candidate.size();
final long currentSegmentSize = current.sizeInBytes();
// We either add to the bin because there's space or because it is the smallest possible bin,
// since decrementing the index will move us to even larger segments.
if (currentCandidateBytes + currentSegmentSize <= maxMergeBytes
|| initialCandidateSize < 2) {
candidate.add(current);
--index;
currentCandidateBytes += currentSegmentSize;
if (initialCandidateSize > 0) {
// Any merge that handles two or more segments reduces the resulting number of segments
// by the number of segments handled - 1
--resultingSegments;
}
} else {
break;
}
}
final int candidateSize = candidate.size();
// While a force merge is running, only merges that cover the maximum allowed number of segments
// or that create a segment close to the maximum allowed segment size are permitted.
if (candidateSize > 1
&& (forceMergeRunning == false || currentCandidateBytes > 0.7 * maxMergeBytes)) {
final OneMerge merge = new OneMerge(candidate);
if (verbose(mergeContext)) {
message("add merge=" + segString(mergeContext, merge.segments), mergeContext);
}
if (spec == null) {
spec = new MergeSpecification();
}
spec.add(merge);
} else {
return spec;
}
}
}
@Override
public MergeSpecification findForcedDeletesMerges(SegmentInfos infos, MergeContext mergeContext)
throws IOException {
if (verbose(mergeContext)) {
message(
"findForcedDeletesMerges infos="
+ segString(mergeContext, infos)
+ " forceMergeDeletesPctAllowed="
+ forceMergeDeletesPctAllowed,
mergeContext);
}
// First do a quick check that there's any work to do.
// NOTE: this makes BaseMergePolicyTestCase.testFindForcedDeletesMerges work
final Set<SegmentCommitInfo> merging = mergeContext.getMergingSegments();
boolean haveWork = false;
int totalDelCount = 0;
for (SegmentCommitInfo info : infos) {
int delCount = mergeContext.numDeletesToMerge(info);
assert assertDelCount(delCount, info);
totalDelCount += delCount;
double pctDeletes = 100. * ((double) delCount) / info.info.maxDoc();
haveWork = haveWork || (pctDeletes > forceMergeDeletesPctAllowed && !merging.contains(info));
}
if (haveWork == false) {
return null;
}
List<SegmentSizeAndDocs> sortedInfos = getSortedBySegmentSize(infos, mergeContext);
Iterator<SegmentSizeAndDocs> iter = sortedInfos.iterator();
while (iter.hasNext()) {
SegmentSizeAndDocs segSizeDocs = iter.next();
double pctDeletes = 100. * ((double) segSizeDocs.delCount / (double) segSizeDocs.maxDoc);
if (merging.contains(segSizeDocs.segInfo) || pctDeletes <= forceMergeDeletesPctAllowed) {
iter.remove();
}
}
if (verbose(mergeContext)) {
message("eligible=" + sortedInfos, mergeContext);
}
return doFindMerges(
sortedInfos,
maxMergedSegmentBytes,
Integer.MAX_VALUE,
Integer.MAX_VALUE,
0,
getMaxAllowedDocs(infos.totalMaxDoc(), totalDelCount),
MERGE_TYPE.FORCE_MERGE_DELETES,
mergeContext,
false);
}
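/**
 * Returns the maximum number of live documents a merged segment may contain so that the index can
 * still be split into {@code targetSearchConcurrency} slices of roughly equal doc count, i.e.
 * {@code ceil((totalMaxDoc - totalDelDocs) / targetSearchConcurrency)}. Illustrative example: 95
 * docs, 5 of which are deleted, with a target concurrency of 4 gives ceil(90 / 4) = 23.
 */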
int getMaxAllowedDocs(int totalMaxDoc, int totalDelDocs) {
return Math.ceilDiv(totalMaxDoc - totalDelDocs, targetSearchConcurrency);
}
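/** Rounds sizes below {@code floorSegmentBytes} up to the floor, so tiny segments are treated as floor-sized. */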
private long floorSize(long bytes) {
return Math.max(floorSegmentBytes, bytes);
}
@Override
public String toString() {
StringBuilder sb = new StringBuilder("[" + getClass().getSimpleName() + ": ");
sb.append("maxMergeAtOnce=").append(maxMergeAtOnce).append(", ");
sb.append("maxMergedSegmentMB=").append(maxMergedSegmentBytes / 1024. / 1024.).append(", ");
sb.append("floorSegmentMB=").append(floorSegmentBytes / 1024. / 1024.).append(", ");
sb.append("forceMergeDeletesPctAllowed=").append(forceMergeDeletesPctAllowed).append(", ");
sb.append("segmentsPerTier=").append(segsPerTier).append(", ");
sb.append("maxCFSSegmentSizeMB=").append(getMaxCFSSegmentSizeMB()).append(", ");
sb.append("noCFSRatio=").append(noCFSRatio).append(", ");
sb.append("deletesPctAllowed=").append(deletesPctAllowed).append(", ");
sb.append("targetSearchConcurrency=").append(targetSearchConcurrency);
return sb.toString();
}
}