org.apache.solr.uninverting.DocTermOrds Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of solr-core Show documentation
Apache Solr (module: core)
There is a newer version: 9.7.0
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.uninverting;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.concurrent.TimeUnit;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.index.BaseTermsEnum;
import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.DocValuesType;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.ImpactsEnum;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.PagedBytes;
import org.apache.lucene.util.StringHelper;

/**
 * This class enables fast access to multiple term ords for a specified field across all docIDs.
 *
 * Like FieldCache, it uninverts the index and holds a packed data structure in RAM to enable
 * fast access. Unlike FieldCache, it can handle multi-valued fields, and, it does not hold the term
 * bytes in RAM. Rather, you must obtain a TermsEnum from the {@link #getOrdTermsEnum} method, and
 * then seek-by-ord to get the term's bytes.
 *
 * <p>While normally term ords are type long, in this API they are int as the internal
 * representation here cannot address more than MAX_INT unique terms. Also, typically this class is
 * used on fields with relatively few unique terms vs the number of documents. A previous internal
 * limit (16 MB) on how many bytes each chunk of documents may consume has been increased to 2 GB.
 *
 * <p>Deleted documents are skipped during uninversion, and if you look them up you'll get 0 ords.
 *
 * <p>The returned per-document ords do not retain their original order in the document. Instead
 * they are returned in sorted (by ord, ie term's BytesRef comparator) order. They are also de-dup'd
 * (ie if doc has same term more than once in this field, you'll only get that ord back once).
 *
 * <p>This class will create its own term index internally, allowing to create a wrapped TermsEnum
 * that can handle ord. The {@link #getOrdTermsEnum} method then provides this wrapped enum.
 *
 * <p>The RAM consumption of this class can be high!
 *
 * @lucene.experimental
 */

/*
 * The un-inverted field:
 *   Each document points to a list of term numbers that are contained in that document.
 *
 *   Term numbers are in sorted order, and are encoded as variable-length deltas from the
 *   previous term number.  Real term numbers start at 2 since 0 and 1 are reserved.  A
 *   term number of 0 signals the end of the termNumber list.
 *
 *   There is a single int[maxDoc()] which either contains a pointer into a byte[] for
 *   the termNumber lists, or directly contains the termNumber list if it fits as a vInt-list
 *   in the 4 bytes of an integer. As bit 7 within each byte is used in the vInt encoding to
 *   signal overflow into the next byte, bit 7 of the highest byte (bit 31 in the full integer)
 *   will never be 1. If bit 31 in the integer is set, this signals a pointer and bit 0-30
 *   is then the value of the pointer into a byte[] where the termNumber list starts.
 *
 *   A single entry is thus either 0b0xxxxxxx_xxxxxxxx_xxxxxxxx_xxxxxxxx holding 0-4 vInts
 *   (low byte first) or 0b1xxxxxxx_xxxxxxxx_xxxxxxxx_xxxxxxxx holding a 31-bit pointer.
 *
 *   There are 256 byte arrays, as the previous version of DocTermOrds had a pointer limit
 *   of 24 bits / 3 bytes. The correct byte array for a document is a function of its id.
 *
 *   To save space and speed up faceting, any term that matches enough documents will
 *   not be un-inverted... it will be skipped while building the un-inverted field structure,
 *   and will use a set intersection method during faceting.
 *
 *   To further save memory, the terms (the actual string values) are not all stored in
 *   memory, but a TermIndex is used to convert term numbers to term values only
 *   for the terms needed after faceting has completed.  Only every 128th term value
 *   is stored, along with its corresponding term number, and this is used as an
 *   index to find the closest term and iterate until the desired number is hit (very
 *   much like Lucene's own internal term index).
 */

public class DocTermOrds implements Accountable {

  // Term ords are shifted by this, internally, to reserve
  // values 0 (end term) and 1 (index is a pointer into byte array)
  private static final int TNUM_OFFSET = 2;

  /** Every 128th term is indexed, by default. */
  public static final int DEFAULT_INDEX_INTERVAL_BITS =
      7; // decrease to a low number like 2 for testing

  private int indexIntervalBits;
  private int indexIntervalMask;
  private int indexInterval;

  /** Don't uninvert terms that exceed this count. */
  protected final int maxTermDocFreq;

  /** Field we are uninverting. */
  protected final String field;

  /** Number of terms in the field. */
  protected int numTermsInField;

  /** Total number of references to term numbers. */
  protected long termInstances;

  private long memsz;

  /** Total time to uninvert the field. */
  protected int total_time;

  /** Time for phase1 of the uninvert process. */
  protected int phase1_time;

  /** Holds the per-document ords or a pointer to the ords. */
  protected int[] index;

  /** Holds term ords for documents. */
  protected byte[][] tnums = new byte[256][];

  /** Total bytes (sum of term lengths) for all indexed terms. */
  protected long sizeOfIndexedStrings;

  /** Holds the indexed (by default every 128th) terms. */
  // TODO: This seems like an obvious candidate for using BytesRefArray extended with binarySearch
  // This would save heap space as well as avoid a lot of small Objects (BytesRefs).
  // This would also increase data locality for binarySearch lookups, potentially making it faster.
  protected BytesRef[] indexedTermsArray = new BytesRef[0];

  /** If non-null, only terms matching this prefix were indexed. */
  protected BytesRef prefix;

  /**
   * Ordinal of the first term in the field, or 0 if the {@link PostingsFormat} does not implement
   * {@link TermsEnum#ord}.
   */
  protected int ordBase;

  /** Used while uninverting. */
  protected PostingsEnum postingsEnum;

  /**
   * If true, check and throw an exception if the field has docValues enabled. Normally, docValues
   * should be used in preference to DocTermOrds.
   */
  protected boolean checkForDocValues = true;

  /**
   * Returns an estimate of the bytes held by this instance.
   *
   * <p>The estimate is made of a fixed allowance for the instance fields, the per-document {@code
   * index}, every allocated {@code tnums} chunk and the indexed terms (array slots plus term
   * bytes). It is computed on the first call and cached, since it is not expected to change.
   */
  @Override
  public long ramBytesUsed() {
    final long cached = memsz;
    if (cached != 0) {
      return cached;
    }

    long total = 8 * 8 + 32; // fixed allowance for the instance fields
    if (index != null) {
      total += index.length * 4L;
    }
    if (tnums != null) {
      for (int i = 0; i < tnums.length; i++) {
        final byte[] chunk = tnums[i];
        if (chunk != null) {
          total += chunk.length;
        }
      }
    }
    if (indexedTermsArray != null) {
      // 32 bytes of overhead, plus one assumed 8-byte reference per indexed term, plus term bytes
      final long referenceBytes = (long) indexedTermsArray.length << 3;
      total += 8 + 8 + 8 + 8 + referenceBytes + sizeOfIndexedStrings;
    }

    memsz = total;
    return total;
  }

  /**
   * Inverts all terms of {@code field}: no prefix restriction, and {@code maxTermDocFreq} is
   * {@link Integer#MAX_VALUE}.
   */
  public DocTermOrds(LeafReader reader, Bits liveDocs, String field) throws IOException {
    this(reader, liveDocs, field, null, Integer.MAX_VALUE);
  }

  // TODO: instead of all these ctors and options, take termsenum!

  /**
   * Inverts only terms starting w/ prefix; {@code maxTermDocFreq} is {@link Integer#MAX_VALUE}.
   */
  public DocTermOrds(LeafReader reader, Bits liveDocs, String field, BytesRef termPrefix)
      throws IOException {
    this(reader, liveDocs, field, termPrefix, Integer.MAX_VALUE);
  }

  /**
   * Inverts only terms starting w/ prefix, and only terms whose docFreq (not taking deletions into
   * account) is {@code <=} maxTermDocFreq. Uses {@link #DEFAULT_INDEX_INTERVAL_BITS} for the
   * indexing interval.
   */
  public DocTermOrds(
      LeafReader reader, Bits liveDocs, String field, BytesRef termPrefix, int maxTermDocFreq)
      throws IOException {
    this(reader, liveDocs, field, termPrefix, maxTermDocFreq, DEFAULT_INDEX_INTERVAL_BITS);
  }

  /**
   * Inverts only terms starting w/ prefix, and only terms whose docFreq (not taking deletions into
   * account) is {@code <=} maxTermDocFreq, with a custom indexing interval (default is every 128th
   * term).
   *
   * @param indexIntervalBits every {@code 2^indexIntervalBits}-th term is kept in the private
   *     terms index
   */
  public DocTermOrds(
      LeafReader reader,
      Bits liveDocs,
      String field,
      BytesRef termPrefix,
      int maxTermDocFreq,
      int indexIntervalBits)
      throws IOException {
    // Set up the configuration first, then build the un-inverted structures right away.
    this(field, maxTermDocFreq, indexIntervalBits);
    uninvert(reader, liveDocs, termPrefix);
  }

  /**
   * Stores the configuration only; nothing is un-inverted. Subclass inits w/ this, but be sure you
   * then call {@link #uninvert(LeafReader, Bits, BytesRef)}, only once.
   *
   * @param field field to un-invert
   * @param maxTermDocFreq terms whose docFreq exceeds this are not un-inverted
   * @param indexIntervalBits every {@code 2^indexIntervalBits}-th term is kept in the private
   *     terms index
   */
  protected DocTermOrds(String field, int maxTermDocFreq, int indexIntervalBits) {
    this.field = field;
    this.maxTermDocFreq = maxTermDocFreq;
    this.indexIntervalBits = indexIntervalBits;
    indexInterval = 1 << indexIntervalBits;
    // Mask with the low indexIntervalBits bits set. It is derived from the interval instead of
    // 0xffffffff >>> (32 - indexIntervalBits): int shift distances are taken modulo 32, so for
    // indexIntervalBits == 0 that shift is a no-op and would yield an all-ones mask, indexing
    // only term 0 instead of every term.
    indexIntervalMask = indexInterval - 1;
  }

  /**
   * Returns a TermsEnum that implements ord, or null if no terms in field.
   *
   * <p>We build a "private" terms index internally (WARNING: consumes RAM) and use that index to
   * implement ord. This also enables ord on top of a composite reader. The returned TermsEnum is
   * unpositioned. This returns null if there are no terms.
   *
   * <p><b>NOTE</b>: you must pass the same reader that was used when creating this class
   */
  public TermsEnum getOrdTermsEnum(LeafReader reader) throws IOException {
    // NOTE: see LUCENE-6529 before attempting to optimize this method to
    // return a TermsEnum directly from the reader if it already supports ord().

    assert indexedTermsArray != null;

    // No indexed terms means there is nothing to wrap.
    return indexedTermsArray.length == 0 ? null : new OrdWrappedTermsEnum(reader);
  }

  /**
   * Returns the number of terms in this field, i.e. the number of terms visited by {@link
   * #uninvert} (only terms matching the prefix, if one was given).
   */
  public int numTerms() {
    return numTermsInField;
  }

  /**
   * Returns {@code true} if no terms were indexed. The per-document {@code index} is only
   * assigned by {@link #uninvert} when at least one term instance was recorded.
   */
  public boolean isEmpty() {
    return index == null;
  }

  /**
   * Subclass can override this. Invoked by {@link #uninvert} once for every term it visits, before
   * the term's docFreq is compared against {@code maxTermDocFreq}.
   *
   * @param te the enum being walked, positioned on the visited term
   * @param termNum zero-based number of the visited term
   */
  protected void visitTerm(TermsEnum te, int termNum) throws IOException {}

  /**
   * Invoked during {@link #uninvert(org.apache.lucene.index.LeafReader,Bits,BytesRef)} to record
   * the document frequency for each uninverted term. It is not invoked for terms skipped because
   * their docFreq exceeds {@code maxTermDocFreq}.
   *
   * @param termNum zero-based number of the term
   * @param df number of documents enumerated from the term's postings
   */
  protected void setActualDocFreq(int termNum, int df) throws IOException {}

  /**
   * Builds the un-inverted representation of {@link #field} from {@code reader}. Call this only
   * once (if you subclass!).
   *
   * <p>Phase one walks every term (restricted to {@code termPrefix} when given), keeps every
   * {@code indexInterval}-th term for the private terms index, and appends each term number as a
   * delta-encoded vInt to the list of every document found in the term's postings. Terms whose
   * docFreq exceeds {@link #maxTermDocFreq} are counted but not un-inverted. Phase two packs the
   * per-document lists into the shared {@link #tnums} arrays and rewrites the per-document index
   * so that it points at the start of each list.
   *
   * <p>If the field has no terms, or no term at or after the prefix, the method returns early and
   * {@link #index} stays {@code null}.
   *
   * @param reader reader to un-invert; the same reader must later be passed to {@link
   *     #getOrdTermsEnum}
   * @param liveDocs not consulted by this implementation: every document returned by the postings
   *     is recorded
   * @param termPrefix if non-null, only terms starting with this prefix are visited
   * @throws IllegalStateException if {@link #checkForDocValues} is set and the field has doc
   *     values, or if a packed term-number array would exceed the supported array size
   * @throws IOException if reading terms or postings fails
   */
  protected void uninvert(final LeafReader reader, Bits liveDocs, final BytesRef termPrefix)
      throws IOException {
    final FieldInfo info = reader.getFieldInfos().fieldInfo(field);
    if (checkForDocValues && info != null && info.getDocValuesType() != DocValuesType.NONE) {
      throw new IllegalStateException(
          "Type mismatch: " + field + " was indexed as " + info.getDocValuesType());
    }
    final long startTime = System.nanoTime();
    prefix = termPrefix == null ? null : BytesRef.deepCopyOf(termPrefix);

    final Terms terms = reader.terms(field);
    if (terms == null) {
      // No terms
      return;
    }

    final TermsEnum te = terms.iterator();
    final BytesRef seekStart = termPrefix != null ? termPrefix : new BytesRef();
    if (te.seekCeil(seekStart) == TermsEnum.SeekStatus.END) {
      // No terms match
      return;
    }

    // The maxDoc-sized working arrays are only allocated once we know there is at least one
    // term to look at, so that empty fields do not pay for them.
    final int maxDoc = reader.maxDoc();
    // immediate term numbers, or the index into the byte[] representing the last number
    final int[] index = new int[maxDoc];
    // last term we saw for this document
    final int[] lastTerm = new int[maxDoc];
    // list of term numbers for the doc (delta encoded vInts)
    final byte[][] bytes = new byte[maxDoc][];

    // For our "term index wrapper"
    final List<BytesRef> indexedTerms = new ArrayList<>();
    final PagedBytes indexedTermsBytes = new PagedBytes(15);

    // we need a minimum of 9 bytes, but round up to 12 since the space would
    // be wasted with most allocators anyway.
    byte[] tempArr = new byte[12];

    //
    // enumerate all terms, and build an intermediate form of the un-inverted field.
    //
    // During this intermediate form, every document has a (potential) byte[]
    // and the int[maxDoc()] array either contains the termNumber list directly
    // or the *end* offset of the termNumber list in its byte array (for faster
    // appending and faster creation of the final form).
    //
    // idea... if things are too large while building, we could do a range of docs
    // at a time (but it would be a fair amount slower to build)
    // could also do ranges in parallel to take advantage of multiple CPUs

    // OPTIONAL: remap the largest df terms to the lowest 128 (single byte)
    // values.  This requires going over the field first to find the most
    // frequent terms ahead of time.

    int termNum = 0;
    postingsEnum = null;

    // Loop begins with te positioned to first term (we call
    // seek above):
    for (; ; ) {
      final BytesRef t = te.term();
      if (t == null || (termPrefix != null && !StringHelper.startsWith(t, termPrefix))) {
        break;
      }

      visitTerm(te, termNum);

      if ((termNum & indexIntervalMask) == 0) {
        // Index this term
        sizeOfIndexedStrings += t.length;
        BytesRef indexedTerm = new BytesRef();
        indexedTermsBytes.copy(t, indexedTerm);
        // TODO: really should 1) strip off useless suffix,
        // and 2) use FST not array/PagedBytes
        indexedTerms.add(indexedTerm);
      }

      final int df = te.docFreq();
      if (df <= maxTermDocFreq) {

        postingsEnum = te.postings(postingsEnum, PostingsEnum.NONE);

        // number of documents actually enumerated from the postings of this term
        int actualDF = 0;

        for (; ; ) {
          int doc = postingsEnum.nextDoc();
          if (doc == DocIdSetIterator.NO_MORE_DOCS) {
            break;
          }

          actualDF++;
          termInstances++;

          // add TNUM_OFFSET to the term number to make room for special reserved values:
          // 0 (end term) and 1 (index into byte array follows)
          int delta = termNum - lastTerm[doc] + TNUM_OFFSET;
          lastTerm[doc] = termNum;
          int val = index[doc];

          if ((val & 0x80000000) != 0) {
            // index into byte array (actually the end of the doc-specific byte[] when building)
            int pos = val & 0x7fffffff;
            int ilen = vIntSize(delta);
            byte[] arr = bytes[doc];
            int newend = pos + ilen;
            if (newend > arr.length) {
              // We avoid a doubling strategy to lower memory usage.
              // this faceting method isn't for docs with many terms.
              // In hotspot, objects have 2 words of overhead, then fields, rounded up to a 64-bit
              // boundary.
              // TODO: figure out what array lengths we can round up to w/o actually using more
              // memory
              // (how much space does a byte[] take up?  Is data preceded by a 32 bit length only?
              // It should be safe to round up to the nearest 32 bits in any case.
              int newLen = (newend + 3) & 0xfffffffc; // 4 byte alignment
              byte[] newarr = new byte[newLen];
              System.arraycopy(arr, 0, newarr, 0, pos);
              arr = newarr;
              bytes[doc] = newarr;
            }
            pos = writeInt(delta, arr, pos);
            index[doc] = pos | 0x80000000; // update pointer to end index in byte[]
          } else {
            // OK, this int has data in it... find the end (a zero starting byte - not
            // part of another number, hence not following a byte with the high bit set).
            int ipos;
            if (val == 0) {
              ipos = 0;
            } else if ((val & 0x0000ff80) == 0) {
              ipos = 1;
            } else if ((val & 0x00ff8000) == 0) {
              ipos = 2;
            } else if ((val & 0xff800000) == 0) {
              ipos = 3;
            } else {
              ipos = 4;
            }

            int endPos = writeInt(delta, tempArr, ipos);
            if (endPos <= 4) {
              // value will fit in the integer... move bytes back
              for (int j = ipos; j < endPos; j++) {
                val |= (tempArr[j] & 0xff) << (j << 3);
              }
              index[doc] = val;
            } else {
              // value won't fit... move integer into byte[]
              for (int j = 0; j < ipos; j++) {
                tempArr[j] = (byte) val;
                val >>>= 8;
              }
              // point at the end index in the byte[]
              index[doc] = endPos | 0x80000000;
              bytes[doc] = tempArr;
              tempArr = new byte[12];
            }
          }
        }
        setActualDocFreq(termNum, actualDF);
      }

      termNum++;
      if (te.next() == null) {
        break;
      }
    }

    numTermsInField = termNum;

    long midPoint = System.nanoTime();

    if (termInstances == 0) {
      // we didn't invert anything
      // lower memory consumption.
      tnums = null;
    } else {

      this.index = index;

      //
      // transform intermediate form into the final form, building a single byte[]
      // at a time, and releasing the intermediate byte[]s as we go to avoid
      // increasing the memory footprint.
      //

      for (int pass = 0; pass < 256; pass++) {
        byte[] target = tnums[pass];
        int pos = 0; // end in target;
        if (target != null) {
          pos = target.length;
        } else {
          target = new byte[4096];
        }

        // loop over documents, 0x00ppxxxx, 0x01ppxxxx, 0x02ppxxxx
        // where pp is the pass (which array we are building), and xx is all values.
        // each pass shares the same byte[] for termNumber lists.
        for (int docbase = pass << 16; docbase < maxDoc; docbase += (1 << 24)) {
          int lim = Math.min(docbase + (1 << 16), maxDoc);
          for (int doc = docbase; doc < lim; doc++) {
            int val = index[doc];
            if ((val & 0x80000000) != 0) {
              int len = val & 0x7fffffff;
              index[doc] = pos | 0x80000000; // change index to point to start of array
              byte[] arr = bytes[doc];
              bytes[doc] = null; // IMPORTANT: allow GC to avoid OOM
              // Offset of the 0 terminator that follows this list. Computed as a long so that a
              // nearly full target cannot overflow int and slip past the size check below.
              final long terminatorPos = (long) pos + len;
              if (target.length <= terminatorPos) {
                int newlen = target.length;
                while (newlen <= terminatorPos) {
                  if ((newlen <<= 1) < 0) { // Double until overflow
                    newlen =
                        Integer.MAX_VALUE
                            - 16; // ArrayList.MAX_ARRAY_SIZE says 8. We double that to be sure
                    if (newlen <= terminatorPos) {
                      throw new IllegalStateException(
                          "Too many terms (> Integer.MAX_VALUE-16) to uninvert field '"
                              + field
                              + "'");
                    }
                  }
                }
                byte[] newtarget = new byte[newlen];
                System.arraycopy(target, 0, newtarget, 0, pos);
                target = newtarget;
              }
              System.arraycopy(arr, 0, target, pos, len);
              pos += len + 1; // skip single byte at end and leave it 0 for terminator
            }
          }
        }

        // shrink array
        if (pos < target.length) {
          byte[] newtarget = new byte[pos];
          System.arraycopy(target, 0, newtarget, 0, pos);
          target = newtarget;
        }

        tnums[pass] = target;

        if ((pass << 16) > maxDoc) break;
      }
    }
    indexedTermsArray = indexedTerms.toArray(new BytesRef[0]);

    long endTime = System.nanoTime();

    total_time = (int) TimeUnit.MILLISECONDS.convert(endTime - startTime, TimeUnit.NANOSECONDS);
    phase1_time = (int) TimeUnit.MILLISECONDS.convert(midPoint - startTime, TimeUnit.NANOSECONDS);
  }

  /**
   * Lookup table for {@link #vIntSize(int)}: indexed by the number of leading zero bits of a value
   * (0..32), it holds the number of 7-bit groups needed to encode that value.
   */
  private static final byte[] BLOCK7 = {
    5, 5, 5, 5, // 0..3 leading zeros
    4, 4, 4, 4, 4, 4, 4, // 4..10 leading zeros
    3, 3, 3, 3, 3, 3, 3, // 11..17 leading zeros
    2, 2, 2, 2, 2, 2, 2, // 18..24 leading zeros
    1, 1, 1, 1, 1, 1, 1, 1 // 25..32 leading zeros (32 is the value 0)
  };

  /** Number of bytes to represent an unsigned int as a vint. */
  private static int vIntSize(int x) {
    // A table lookup keyed on numberOfLeadingZeros (an intrinsic on modern CPUs) was measured to
    // be faster than a chain of conditionals, both before and after JIT compilation.
    final int leadingZeros = Integer.numberOfLeadingZeros(x);
    return BLOCK7[leadingZeros];
  }

  // todo: if we know the size of the vInt already, we could do
  // a single switch on the size

  /**
   * Writes {@code x} as a vInt into {@code arr} starting at {@code pos}, most significant 7-bit
   * group first, and returns the position just past the last byte written. Every byte except the
   * last has its high bit set. The caller must make sure {@code arr} is large enough; no length
   * check is done here.
   *
   * @param x the value to write as vInt.
   * @param arr the array holding vInt-values.
   * @param pos the position in arr where the vInt representation of x should be written.
   * @return the new end position after writing x at pos.
   */
  private static int writeInt(int x, byte[] arr, int pos) {
    int cursor = pos;
    // Emit a continuation byte for every 7-bit group that still has significant bits at or above
    // it, from the highest group (bits 28..31) down to the second lowest (bits 7..13).
    for (int shift = 7 * 4; shift > 0; shift -= 7) {
      final int upper = x >>> shift;
      if (upper != 0) {
        arr[cursor++] = (byte) (upper | 0x80);
      }
    }
    // The lowest group is always written and carries no continuation bit.
    arr[cursor++] = (byte) (x & 0x7f);
    return cursor;
  }

  /**
   * "wrap" our own terms index around the original IndexReader. Only valid if there are terms for
   * this field from the original reader
   */
  private final class OrdWrappedTermsEnum extends BaseTermsEnum {
    private final TermsEnum termsEnum;
    private BytesRef term;
    private long ord = -indexInterval - 1L; // force "real" seek

    public OrdWrappedTermsEnum(LeafReader reader) throws IOException {
      assert indexedTermsArray != null;
      assert 0 != indexedTermsArray.length;
      termsEnum = reader.terms(field).iterator();
    }

    /** Delegates to the wrapped terms enum. */
    @Override
    public PostingsEnum postings(PostingsEnum reuse, int flags) throws IOException {
      return termsEnum.postings(reuse, flags);
    }

    /** Delegates to the wrapped terms enum. */
    @Override
    public ImpactsEnum impacts(int flags) throws IOException {
      return termsEnum.impacts(flags);
    }

    /**
     * Returns the term last stored by {@code setTerm()} or cleared by {@code next()}; {@code null}
     * when the enum is exhausted or has moved outside the configured prefix.
     */
    @Override
    public BytesRef term() {
      return term;
    }

    /**
     * Advances the wrapped enum by one term and bumps the tracked ord accordingly.
     *
     * @return the new current term, or {@code null} if the wrapped enum is exhausted or the new
     *     term no longer matches the prefix
     */
    @Override
    public BytesRef next() throws IOException {
      // The ord starts out as a negative sentinel; the first step from there must land on 0.
      ord = Math.max(ord + 1, 0L);
      if (termsEnum.next() != null) {
        return setTerm(); // this is extra work if we know we are in bounds...
      }
      term = null;
      return null;
    }

    /** Delegates to the wrapped terms enum. */
    @Override
    public int docFreq() throws IOException {
      return termsEnum.docFreq();
    }

    /** Delegates to the wrapped terms enum. */
    @Override
    public long totalTermFreq() throws IOException {
      return termsEnum.totalTermFreq();
    }

    /** Returns the internally tracked ord shifted by {@code ordBase}. */
    @Override
    public long ord() {
      return ordBase + ord;
    }

    /**
     * Seeks to {@code target} or, if it does not exist, to the next term after it, keeping the
     * tracked ord in sync.
     *
     * <p>The private terms index is binary-searched to find the block that may contain the
     * target; the wrapped enum is then positioned on the first term of that block (unless it is
     * already inside the block at or before the target) and stepped forward term by term.
     *
     * @return {@code FOUND} on an exact hit, {@code NOT_FOUND} when positioned on a later term,
     *     {@code END} when there is no current term after seeking
     */
    @Override
    public SeekStatus seekCeil(BytesRef target) throws IOException {

      // already here
      if (term != null && term.equals(target)) {
        return SeekStatus.FOUND;
      }

      // Either the index of an indexed term equal to target, or -(insertionPoint) - 1.
      int startIdx = Arrays.binarySearch(indexedTermsArray, target);

      if (startIdx >= 0) {
        // we hit the term exactly... lucky us!
        TermsEnum.SeekStatus seekStatus = termsEnum.seekCeil(target);
        assert seekStatus == TermsEnum.SeekStatus.FOUND;
        // the n-th indexed term has ord n * indexInterval
        ord = (long) startIdx << indexIntervalBits;
        setTerm();
        assert term != null;
        return SeekStatus.FOUND;
      }

      // we didn't hit the term exactly
      // convert the negative binarySearch result into the insertion point
      startIdx = -startIdx - 1;

      if (startIdx == 0) {
        // our target occurs *before* the first term
        TermsEnum.SeekStatus seekStatus = termsEnum.seekCeil(target);
        assert seekStatus == TermsEnum.SeekStatus.NOT_FOUND;
        ord = 0;
        setTerm();
        assert term != null;
        return SeekStatus.NOT_FOUND;
      }

      // back up to the start of the block
      startIdx--;

      if ((ord >> indexIntervalBits) == startIdx && term != null && term.compareTo(target) <= 0) {
        // we are already in the right block and the current term is before the term we want,
        // so we don't need to seek.
      } else {
        // seek to the right block
        TermsEnum.SeekStatus seekStatus = termsEnum.seekCeil(indexedTermsArray[startIdx]);
        assert seekStatus == TermsEnum.SeekStatus.FOUND;
        ord = (long) startIdx << indexIntervalBits;
        setTerm();
        assert term != null; // should be non-null since it's in the index
      }

      // scan forward inside the block; next() keeps ord and term up to date
      while (term != null && term.compareTo(target) < 0) {
        next();
      }

      if (term == null) {
        return SeekStatus.END;
      } else if (term.compareTo(target) == 0) {
        return SeekStatus.FOUND;
      } else {
        return SeekStatus.NOT_FOUND;
      }
    }

    /**
     * Returns whether {@code text} exists, implemented on top of {@link #seekCeil(BytesRef)}; on a
     * miss the enum is left wherever {@code seekCeil} positioned it.
     */
    @Override
    public boolean seekExact(BytesRef text) throws IOException {
      return seekCeil(text) == SeekStatus.FOUND;
    }

    /**
     * Positions the enum on the term with the given ord.
     *
     * <p>If the target lies at most {@code indexInterval} terms ahead of the current position the
     * wrapped enum is simply stepped forward; otherwise it is first re-positioned on the indexed
     * term that starts the target's block.
     *
     * @param targetOrd ord as reported by {@link #ord()}, i.e. including {@code ordBase}
     */
    @Override
    public void seekExact(long targetOrd) throws IOException {
      // ord() reports ordBase + ord, so translate the external ord back into the internal,
      // zero-based numbering before using it for block lookup and stepping.
      final long localOrd = targetOrd - ordBase;
      int delta = (int) (localOrd - ord);
      if (delta < 0 || delta > indexInterval) {
        // target is behind us or too far ahead: jump to the start of its block
        final int idx = (int) (localOrd >>> indexIntervalBits);
        final BytesRef base = indexedTermsArray[idx];
        ord = (long) idx << indexIntervalBits;
        delta = (int) (localOrd - ord);
        final TermsEnum.SeekStatus seekStatus = termsEnum.seekCeil(base);
        assert seekStatus == TermsEnum.SeekStatus.FOUND;
      }

      // step forward term by term until the target ord is reached
      while (--delta >= 0) {
        BytesRef br = termsEnum.next();
        if (br == null) {
          assert false;
          return;
        }
        ord++;
      }

      setTerm();
      assert term != null;
    }

    /**
     * Refreshes {@code term} from the wrapped enum, clearing it when a prefix is configured and
     * the current term does not start with it.
     *
     * @return the value now held in {@code term}
     */
    private BytesRef setTerm() throws IOException {
      term = termsEnum.term();
      final boolean outsidePrefix = prefix != null && !StringHelper.startsWith(term, prefix);
      if (outsidePrefix) {
        term = null;
      }
      return term;
    }
  }

  /**
   * Returns the term ({@link BytesRef}) corresponding to the provided ordinal, by seeking {@code
   * termsEnum} to that ordinal and returning its current term.
   */
  public BytesRef lookupTerm(TermsEnum termsEnum, int ord) throws IOException {
    final long targetOrd = ord; // seek-by-ord takes a long
    termsEnum.seekExact(targetOrd);
    return termsEnum.term();
  }

  /**
   * Returns a SortedSetDocValues view of this instance, or the shared empty instance when nothing
   * was un-inverted.
   */
  public SortedSetDocValues iterator(LeafReader reader) throws IOException {
    return isEmpty() ? DocValues.emptySortedSet() : new Iterator(reader);
  }

  private class Iterator extends SortedSetDocValues {
    final LeafReader reader;
    final TermsEnum te; // used internally for lookupOrd() and lookupTerm()
    final int maxDoc;
    // currently we read 5 at a time (using the logic of the old iterator)
    final int buffer[] = new int[5];
    int bufferUpto;
    int bufferLength;

    private int doc = -1;
    private int tnum;
    private int upto;
    private byte[] arr;

    /** Creates an unpositioned view over {@code reader}. */
    Iterator(LeafReader reader) throws IOException {
      this.reader = reader;
      this.maxDoc = reader.maxDoc();
      // termsEnum() reads this.reader, so it must run after the assignment above
      this.te = termsEnum();
    }

    /**
     * Returns the next ord of the current document, refilling the buffer from the encoded list
     * when needed, or {@code NO_MORE_ORDS} once the list is exhausted.
     */
    @Override
    public long nextOrd() {
      for (; ; ) {
        if (bufferUpto != bufferLength) {
          return buffer[bufferUpto++];
        }
        // Buffer drained: a short last read means the document's list has ended.
        if (bufferLength < buffer.length) {
          return NO_MORE_ORDS;
        }
        bufferLength = read(buffer);
        bufferUpto = 0;
      }
    }

    /**
     * Returns the number of ords of the current document.
     *
     * <p>For an inlined list the whole list was decoded into the buffer when the document was set,
     * so the buffer length is the answer. For a list stored in a byte array the list is re-scanned
     * from its start and the decoded vInts are counted; counting bytes instead would over-count
     * whenever a delta needs more than one byte.
     */
    @Override
    public int docValueCount() {
      if (arr == null) {
        // This value was inlined, and then read into a single buffer
        return bufferLength;
      }

      // scan logic taken from read(), starting over at the beginning of the document's list
      int cursor = index[doc] & 0x7fffffff;
      int count = 0;
      for (; ; ) {
        int delta = 0;
        for (; ; ) {
          byte b = arr[cursor++];
          delta = (delta << 7) | (b & 0x7f);
          if ((b & 0x80) == 0) break;
        }
        if (delta == 0) {
          // a 0 delta is the list terminator
          return count;
        }
        count++;
      }
    }

    /**
     * Decodes the next ords of the current document into {@code buffer}, continuing from the
     * position and running term number left by the previous call.
     *
     * <p>Buffer must be at least 5 ints long. Returns number of term ords placed into buffer; if
     * this count is less than buffer.length then that is the end.
     */
    int read(int[] buffer) {
      int bufferUpto = 0;
      if (arr == null) {
        // code is inlined into upto
        int code = upto;
        int delta = 0;
        for (; ; ) {
          // consume the int one byte at a time, lowest byte first
          delta = (delta << 7) | (code & 0x7f);
          if ((code & 0x80) == 0) {
            // last byte of a vInt; a 0 value marks the end of the list
            if (delta == 0) break;
            tnum += delta - TNUM_OFFSET;
            buffer[bufferUpto++] = ordBase + tnum;
            delta = 0;
          }
          code >>>= 8;
        }
      } else {
        // upto is a pointer into the array
        for (; ; ) {
          int delta = 0;
          for (; ; ) {
            byte b = arr[upto++];
            delta = (delta << 7) | (b & 0x7f);
            // a clear high bit marks the last byte of this vInt
            if ((b & 0x80) == 0) break;
          }
          // a 0 value marks the end of the list
          if (delta == 0) break;
          tnum += delta - TNUM_OFFSET;
          buffer[bufferUpto++] = ordBase + tnum;
          if (bufferUpto == buffer.length) {
            // buffer full; the next call resumes at upto
            break;
          }
        }
      }

      return bufferUpto;
    }

    /**
     * Positions this view on {@code docID}: resolves the document's index entry to either an
     * inlined list or a pointer into one of the shared byte arrays, then decodes the first batch
     * of ords into the buffer.
     */
    private void setDocument(int docID) {
      doc = docID;
      tnum = 0;

      final int entry = index[docID];
      final boolean isPointer = entry < 0; // bit 31 set
      if (isPointer) {
        // the byte array holding a document's list is selected by bits 16..23 of its id
        arr = tnums[(docID >>> 16) & 0xff];
        upto = entry & 0x7fffffff;
      } else {
        // the entry itself holds the encoded list
        arr = null;
        upto = entry;
      }

      bufferUpto = 0;
      bufferLength = read(buffer);
    }

    /** Positions on {@code target} and reports whether at least one ord was decoded for it. */
    @Override
    public boolean advanceExact(int target) throws IOException {
      setDocument(target);
      return bufferLength > 0;
    }

    /** Returns the document last positioned on; -1 before the first positioning. */
    @Override
    public int docID() {
      return doc;
    }

    /** Moves to the first document after the current one that has at least one ord. */
    @Override
    public int nextDoc() throws IOException {
      return advance(docID() + 1);
    }

    /**
     * Moves to the first document at or after {@code target} that has at least one ord, probing
     * the documents one by one.
     *
     * @return that document, or {@code NO_MORE_DOCS} if there is none below {@code maxDoc}
     */
    @Override
    public int advance(int target) throws IOException {
      int candidate = target;
      while (candidate < maxDoc) {
        if (advanceExact(candidate)) {
          return candidate;
        }
        candidate++;
      }
      doc = NO_MORE_DOCS;
      return NO_MORE_DOCS;
    }

    /** Returns {@code maxDoc} of the reader as the cost estimate. */
    @Override
    public long cost() {
      return maxDoc;
    }

    /**
     * Resolves {@code ord} to its term through this view's terms enum. An {@link IOException} is
     * rethrown wrapped in a {@link RuntimeException}.
     */
    @Override
    public BytesRef lookupOrd(long ord) {
      try {
        return DocTermOrds.this.lookupTerm(te, (int) ord);
      } catch (IOException e) {
        throw new RuntimeException(e);
      }
    }

    /** Returns the number of terms recorded by the enclosing instance. */
    @Override
    public long getValueCount() {
      return numTerms();
    }

    /**
     * Looks up the ord of {@code key} by seeking this view's terms enum.
     *
     * @return the ord on an exact hit; otherwise {@code -ord - 1} of the term the seek landed on,
     *     or {@code -numTerms() - 1} when the seek ran off the end. An {@link IOException} is
     *     rethrown wrapped in a {@link RuntimeException}.
     */
    @Override
    public long lookupTerm(BytesRef key) {
      try {
        switch (te.seekCeil(key)) {
          case FOUND:
            assert te.ord() >= 0;
            return te.ord();
          case NOT_FOUND:
            assert te.ord() >= 0;
            return -te.ord() - 1;
          default: /* END */
            return -numTerms() - 1L;
        }
      } catch (IOException e) {
        throw new RuntimeException(e);
      }
    }

    /**
     * Returns a new ord-capable terms enum from the enclosing instance for this view's reader. An
     * {@link IOException} is rethrown wrapped in a {@link RuntimeException}.
     */
    @Override
    public TermsEnum termsEnum() {
      try {
        return getOrdTermsEnum(reader);
      } catch (IOException e) {
        throw new RuntimeException(e);
      }
    }
  }
}