org.apache.lucene.util.MSBRadixSorter Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of lucene-core Show documentation
Apache Lucene Java Core
There is a newer version: 9.11.1
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.util;

import java.util.Arrays;

/** Radix sorter for variable-length strings. This class sorts based on the most
 *  significant byte first and falls back to {@link IntroSorter} when the size
 *  of the buckets to sort becomes small. It is NOT stable.
 *  Worst-case memory usage is about {@code 2.3 KB}.
 *  @lucene.internal */
public abstract class MSBRadixSorter extends Sorter {

  // after that many levels of recursion we fall back to introsort anyway
  // this is used as a protection against the fact that radix sort performs
  // worse when there are long common prefixes (probably because of cache
  // locality)
  private static final int LEVEL_THRESHOLD = 8;
  // size of histograms: 256 + 1 to indicate that the string is finished
  private static final int HISTOGRAM_SIZE = 257;
  // buckets below this size will be sorted with introsort
  private static final int LENGTH_THRESHOLD = 100;

  // we store one histogram per recursion level
  private final int[][] histograms = new int[LEVEL_THRESHOLD][];
  private final int[] endOffsets = new int[HISTOGRAM_SIZE];
  private final int[] commonPrefix;

  private final int maxLength;

  /**
   * Sole constructor.
   * @param maxLength the maximum length of keys, pass {@link Integer#MAX_VALUE} if unknown.
   */
  protected MSBRadixSorter(int maxLength) {
    this.maxLength = maxLength;
    this.commonPrefix = new int[Math.min(24, maxLength)];
  }

  /** Return the k-th byte of the entry at index {@code i}, or {@code -1} if
   * its length is less than or equal to {@code k}. This may only be called
   * with a value of {@code i} between {@code 0} included and
   * {@code maxLength} excluded. */
  protected abstract int byteAt(int i, int k);

  /** Get a fall-back sorter which may assume that the first k bytes of all compared strings are equal. */
  protected Sorter getFallbackSorter(int k) {
    return new IntroSorter() {
      @Override
      protected void swap(int i, int j) {
        MSBRadixSorter.this.swap(i, j);
      }

      @Override
      protected int compare(int i, int j) {
        for (int o = k; o < maxLength; ++o) {
          final int b1 = byteAt(i, o);
          final int b2 = byteAt(j, o);
          if (b1 != b2) {
            return b1 - b2;
          } else if (b1 == -1) {
            break;
          }
        }
        return 0;
      }

      @Override
      protected void setPivot(int i) {
        pivot.setLength(0);
        for (int o = k; o < maxLength; ++o) {
          final int b = byteAt(i, o);
          if (b == -1) {
            break;
          }
          pivot.append((byte) b);
        }
      }

      @Override
      protected int comparePivot(int j) {
        for (int o = 0; o < pivot.length(); ++o) {
          final int b1 = pivot.byteAt(o) & 0xff;
          final int b2 = byteAt(j, k + o);
          if (b1 != b2) {
            return b1 - b2;
          }
        }
        if (k + pivot.length() == maxLength) {
          return 0;
        }
        return -1 - byteAt(j, k + pivot.length());
      }

      private final BytesRefBuilder pivot = new BytesRefBuilder();
    };
  }

  @Override
  protected final int compare(int i, int j) {
    throw new UnsupportedOperationException("unused: not a comparison-based sort");
  }

  @Override
  public void sort(int from, int to) {
    checkRange(from, to);
    sort(from, to, 0, 0);
  }

  private void sort(int from, int to, int k, int l) {
    if (to - from <= LENGTH_THRESHOLD || l >= LEVEL_THRESHOLD) {
      introSort(from, to, k);
    } else {
      radixSort(from, to, k, l);
    }
  }

  private void introSort(int from, int to, int k) {
    getFallbackSorter(k).sort(from, to);
  }

  /**
   * @param k the character number to compare
   * @param l the level of recursion
   */
  private void radixSort(int from, int to, int k, int l) {
    int[] histogram = histograms[l];
    if (histogram == null) {
      histogram = histograms[l] = new int[HISTOGRAM_SIZE];
    } else {
      Arrays.fill(histogram, 0);
    }

    final int commonPrefixLength = computeCommonPrefixLengthAndBuildHistogram(from, to, k, histogram);
    if (commonPrefixLength > 0) {
      // if there are no more chars to compare or if all entries fell into the
      // first bucket (which means strings are shorter than k) then we are done
      // otherwise recurse
      if (k + commonPrefixLength < maxLength
          && histogram[0] < to - from) {
        radixSort(from, to, k + commonPrefixLength, l);
      }
      return;
    }
    assert assertHistogram(commonPrefixLength, histogram);

    int[] startOffsets = histogram;
    int[] endOffsets = this.endOffsets;
    sumHistogram(histogram, endOffsets);
    reorder(from, to, startOffsets, endOffsets, k);
    endOffsets = startOffsets;

    if (k + 1 < maxLength) {
      // recurse on all but the first bucket since all keys are equals in this
      // bucket (we already compared all bytes)
      for (int prev = endOffsets[0], i = 1; i < HISTOGRAM_SIZE; ++i) {
        int h = endOffsets[i];
        final int bucketLen = h - prev;
        if (bucketLen > 1) {
          sort(from + prev, from + h, k + 1, l + 1);
        }
        prev = h;
      }
    }
  }

  // only used from assert
  private boolean assertHistogram(int commonPrefixLength, int[] histogram) {
    int numberOfUniqueBytes = 0;
    for (int freq : histogram) {
      if (freq > 0) {
        numberOfUniqueBytes++;
      }
    }
    if (numberOfUniqueBytes == 1) {
      assert commonPrefixLength >= 1;
    } else {
      assert commonPrefixLength == 0 : commonPrefixLength;
    }
    return true;
  }

  /** Return a number for the k-th character between 0 and {@link #HISTOGRAM_SIZE}. */
  private int getBucket(int i, int k) {
    return byteAt(i, k) + 1;
  }

  /** Build a histogram of the number of values per {@link #getBucket(int, int) bucket}
   *  and return a common prefix length for all visited values.
   *  @see #buildHistogram */
  private int computeCommonPrefixLengthAndBuildHistogram(int from, int to, int k, int[] histogram) {
    final int[] commonPrefix = this.commonPrefix;
    int commonPrefixLength = Math.min(commonPrefix.length, maxLength - k);
    for (int j = 0; j < commonPrefixLength; ++j) {
      final int b = byteAt(from, k + j);
      commonPrefix[j] = b;
      if (b == -1) {
        commonPrefixLength = j + 1;
        break;
      }
    }

    int i;
    outer: for (i = from + 1; i < to; ++i) {
      for (int j = 0; j < commonPrefixLength; ++j) {
        final int b = byteAt(i, k + j);
        if (b != commonPrefix[j]) {
          commonPrefixLength = j;
          if (commonPrefixLength == 0) { // we have no common prefix
            histogram[commonPrefix[0] + 1] = i - from;
            histogram[b + 1] = 1;
            break outer;
          }
          break;
        }
      }
    }

    if (i < to) {
      // the loop got broken because there is no common prefix
      assert commonPrefixLength == 0;
      buildHistogram(i + 1, to, k, histogram);
    } else {
      assert commonPrefixLength > 0;
      histogram[commonPrefix[0] + 1] = to - from;
    }

    return commonPrefixLength;
  }

  /** Build an histogram of the k-th characters of values occurring between
   *  offsets {@code from} and {@code to}, using {@link #getBucket}. */
  private void buildHistogram(int from, int to, int k, int[] histogram) {
    for (int i = from; i < to; ++i) {
      histogram[getBucket(i, k)]++;
    }
  }

  /** Accumulate values of the histogram so that it does not store counts but
   *  start offsets. {@code endOffsets} will store the end offsets. */
  private static void sumHistogram(int[] histogram, int[] endOffsets) {
    int accum = 0;
    for (int i = 0; i < HISTOGRAM_SIZE; ++i) {
      final int count = histogram[i];
      histogram[i] = accum;
      accum += count;
      endOffsets[i] = accum;
    }
  }

  /**
   * Reorder based on start/end offsets for each bucket. When this method
   * returns, startOffsets and endOffsets are equal.
   * @param startOffsets start offsets per bucket
   * @param endOffsets end offsets per bucket
   */
  private void reorder(int from, int to, int[] startOffsets, int[] endOffsets, int k) {
    // reorder in place, like the dutch flag problem
    for (int i = 0; i < HISTOGRAM_SIZE; ++i) {
      final int limit = endOffsets[i];
      for (int h1 = startOffsets[i]; h1 < limit; h1 = startOffsets[i]) {
        final int b = getBucket(from + h1, k);
        final int h2 = startOffsets[b]++;
        swap(from + h1, from + h2);
      }
    }
  }
}