org.apache.solr.uninverting.DocTermOrds
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.uninverting;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.concurrent.TimeUnit;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.index.BaseTermsEnum;
import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.DocValuesType;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.ImpactsEnum;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.PagedBytes;
import org.apache.lucene.util.StringHelper;

/**
 * This class enables fast access to multiple term ords for a specified field across all docIDs.
 *
 * <p>Like FieldCache, it uninverts the index and holds a packed data structure in RAM to enable
 * fast access. Unlike FieldCache, it can handle multi-valued fields, and, it does not hold the term
 * bytes in RAM. Rather, you must obtain a TermsEnum from the {@link #getOrdTermsEnum} method, and
 * then seek-by-ord to get the term's bytes.
 *
 * <p>While normally term ords are type long, in this API they are int as the internal
 * representation here cannot address more than MAX_INT unique terms. Also, typically this class is
 * used on fields with relatively few unique terms vs the number of documents. A previous internal
 * limit (16 MB) on how many bytes each chunk of documents may consume has been increased to 2 GB.
 *
 * <p>Deleted documents are skipped during uninversion, and if you look them up you'll get 0 ords.
 *
 * <p>The returned per-document ords do not retain their original order in the document. Instead
 * they are returned in sorted (by ord, ie term's BytesRef comparator) order. They are also de-dup'd
 * (ie if doc has same term more than once in this field, you'll only get that ord back once).
 *
 * <p>This class will create its own term index internally, allowing it to create a wrapped
 * TermsEnum that can handle ord. The {@link #getOrdTermsEnum} method then provides this wrapped
 * enum.
 *
 * <p>The RAM consumption of this class can be high!
 *
 * @lucene.experimental
 */

/*
 * The un-inverted field:
 *   Each document points to a list of term numbers that are contained in that document.
 *
 *   Term numbers are in sorted order, and are encoded as variable-length deltas from the
 *   previous term number.  Real term numbers start at 2 since 0 and 1 are reserved.  A
 *   term number of 0 signals the end of the termNumber list.
 *
 *   There is a single int[maxDoc()] which either contains a pointer into a byte[] for
 *   the termNumber lists, or directly contains the termNumber list if it fits as a vInt-list
 *   in the 4 bytes of an integer.  As bit 7 within each byte is used in the vInt encoding to
 *   signal overflow into the next byte, bit 7 of the highest byte (bit 31 in the full integer)
 *   will never be 1.  If bit 31 in the integer is set, this signals a pointer and bit 0-30
 *   is then the value of the pointer into a byte[] where the termNumber list starts.
 *
 *   A single entry is thus either 0b0xxxxxxxx_xxxxxxxx_xxxxxxxx_xxxxxxxx holding 0-4 vInts
 *   (low byte first) or 0b1xxxxxxxx_xxxxxxxx_xxxxxxxx_xxxxxxxx holding a 31-bit pointer.
 *
 *   There are 256 byte arrays, as the previous version of DocTermOrds had a pointer limit
 *   of 24 bits / 3 bytes. The correct byte array for a document is a function of its id.
 *
 *   To save space and speed up faceting, any term that matches enough documents will
 *   not be un-inverted... it will be skipped while building the un-inverted field structure,
 *   and will use a set intersection method during faceting.
 *
 *   To further save memory, the terms (the actual string values) are not all stored in
 *   memory, but a TermIndex is used to convert term numbers to term values only
 *   for the terms needed after faceting has completed.  Only every 128th term value
 *   is stored, along with its corresponding term number, and this is used as an
 *   index to find the closest term and iterate until the desired number is hit (very
 *   much like Lucene's own internal term index).
 */
public class DocTermOrds implements Accountable {

  // Term ords are shifted by this, internally, to reserve
  // values 0 (end term) and 1 (index is a pointer into byte array)
  private static final int TNUM_OFFSET = 2;

  /** Every 128th term is indexed, by default. */
  public static final int DEFAULT_INDEX_INTERVAL_BITS = 7; // decrease to a low number like 2 for testing

  private int indexIntervalBits;
  private int indexIntervalMask;
  private int indexInterval;

  /** Don't uninvert terms that exceed this count. */
  protected final int maxTermDocFreq;

  /** Field we are uninverting. */
  protected final String field;

  /** Number of terms in the field. */
  protected int numTermsInField;

  /** Total number of references to term numbers. */
  protected long termInstances;

  private long memsz;

  /** Total time to uninvert the field. */
  protected int total_time;

  /** Time for phase1 of the uninvert process. */
  protected int phase1_time;

  /** Holds the per-document ords or a pointer to the ords. */
  protected int[] index;

  /** Holds term ords for documents. */
  protected byte[][] tnums = new byte[256][];

  /** Total bytes (sum of term lengths) for all indexed terms. */
  protected long sizeOfIndexedStrings;
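  // Illustrative note (editor's addition, not in the original source): the byte array
  // holding a given document's term-number list is selected from tnums by bits 16-23 of
  // the docID, i.e. tnums[(docID >>> 16) & 0xff]; see setDocument() below. For example,
  // a hypothetical docID of 0x012345 would have its list stored in tnums[0x01].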
  /** Holds the indexed (by default every 128th) terms. */
  // TODO: This seems like an obvious candidate for using BytesRefArray extended with binarySearch
  // This would save heap space as well as avoid a lot of small Objects (BytesRefs).
  // This would also increase data locality for binarySearch lookups, potentially making it faster.
  protected BytesRef[] indexedTermsArray = new BytesRef[0];

  /** If non-null, only terms matching this prefix were indexed. */
  protected BytesRef prefix;

  /**
   * Ordinal of the first term in the field, or 0 if the {@link PostingsFormat} does not implement
   * {@link TermsEnum#ord}.
   */
  protected int ordBase;

  /** Used while uninverting. */
  protected PostingsEnum postingsEnum;

  /**
   * If true, check and throw an exception if the field has docValues enabled. Normally, docValues
   * should be used in preference to DocTermOrds.
   */
  protected boolean checkForDocValues = true;

  // TODO: Why is indexedTermsArray not part of this?
  /** Returns total bytes used. */
  @Override
  public long ramBytesUsed() {
    // can cache the mem size since it shouldn't change
    if (memsz != 0) return memsz;
    long sz = 8 * 8 + 32; // local fields
    if (index != null) sz += index.length * 4L;
    if (tnums != null) {
      for (byte[] arr : tnums) if (arr != null) sz += arr.length;
    }
    if (indexedTermsArray != null) {
      // assume 8 byte references?
      sz += 8 + 8 + 8 + 8 + ((long) indexedTermsArray.length << 3) + sizeOfIndexedStrings;
    }
    memsz = sz;
    return sz;
  }

  /** Inverts all terms. */
  public DocTermOrds(LeafReader reader, Bits liveDocs, String field) throws IOException {
    this(reader, liveDocs, field, null, Integer.MAX_VALUE);
  }

  // TODO: instead of all these ctors and options, take termsenum!

  /** Inverts only terms starting w/ prefix */
  public DocTermOrds(LeafReader reader, Bits liveDocs, String field, BytesRef termPrefix)
      throws IOException {
    this(reader, liveDocs, field, termPrefix, Integer.MAX_VALUE);
  }

  /**
   * Inverts only terms starting w/ prefix, and only terms whose docFreq (not taking deletions into
   * account) is <= maxTermDocFreq
   */
  public DocTermOrds(
      LeafReader reader, Bits liveDocs, String field, BytesRef termPrefix, int maxTermDocFreq)
      throws IOException {
    this(reader, liveDocs, field, termPrefix, maxTermDocFreq, DEFAULT_INDEX_INTERVAL_BITS);
  }

  /**
   * Inverts only terms starting w/ prefix, and only terms whose docFreq (not taking deletions into
   * account) is <= maxTermDocFreq, with a custom indexing interval (default is every 128th
   * term).
   */
  public DocTermOrds(
      LeafReader reader,
      Bits liveDocs,
      String field,
      BytesRef termPrefix,
      int maxTermDocFreq,
      int indexIntervalBits)
      throws IOException {
    this(field, maxTermDocFreq, indexIntervalBits);
    uninvert(reader, liveDocs, termPrefix);
  }

  /** Subclass inits w/ this, but be sure you then call uninvert, only once */
  protected DocTermOrds(String field, int maxTermDocFreq, int indexIntervalBits) {
    // System.out.println("DTO init field=" + field + " maxTDFreq=" + maxTermDocFreq);
    this.field = field;
    this.maxTermDocFreq = maxTermDocFreq;
    this.indexIntervalBits = indexIntervalBits;
    indexIntervalMask = 0xffffffff >>> (32 - indexIntervalBits);
    indexInterval = 1 << indexIntervalBits;
  }

  /**
   * Returns a TermsEnum that implements ord, or null if no terms in field.
   *
   * <p>we build a "private" terms index internally (WARNING: consumes RAM) and use that index to
   * implement ord. This also enables ord on top of a composite reader. The returned TermsEnum is
   * unpositioned. This returns null if there are no terms.
   *
   * <p>NOTE: you must pass the same reader that was used when creating this class
   */
  public TermsEnum getOrdTermsEnum(LeafReader reader) throws IOException {
    // NOTE: see LUCENE-6529 before attempting to optimize this method to
    // return a TermsEnum directly from the reader if it already supports ord().
    assert null != indexedTermsArray;
    if (0 == indexedTermsArray.length) {
      return null;
    } else {
      return new OrdWrappedTermsEnum(reader);
    }
  }

  /** Returns the number of terms in this field */
  public int numTerms() {
    return numTermsInField;
  }

  /** Returns {@code true} if no terms were indexed. */
  public boolean isEmpty() {
    return index == null;
  }

  /** Subclass can override this */
  protected void visitTerm(TermsEnum te, int termNum) throws IOException {}

  /**
   * Invoked during {@link #uninvert(org.apache.lucene.index.LeafReader,Bits,BytesRef)} to record
   * the document frequency for each uninverted term.
   */
  protected void setActualDocFreq(int termNum, int df) throws IOException {}

  /** Call this only once (if you subclass!) */
  protected void uninvert(final LeafReader reader, Bits liveDocs, final BytesRef termPrefix)
      throws IOException {
    final FieldInfo info = reader.getFieldInfos().fieldInfo(field);
    if (checkForDocValues && info != null && info.getDocValuesType() != DocValuesType.NONE) {
      throw new IllegalStateException(
          "Type mismatch: " + field + " was indexed as " + info.getDocValuesType());
    }
    // System.out.println("DTO uninvert field=" + field + " prefix=" + termPrefix);
    final long startTime = System.nanoTime();
    prefix = termPrefix == null ? null : BytesRef.deepCopyOf(termPrefix);

    final int maxDoc = reader.maxDoc();
    // immediate term numbers, or the index into the byte[] representing the last number
    final int[] index = new int[maxDoc];
    // last term we saw for this document
    final int[] lastTerm = new int[maxDoc];
    // list of term numbers for the doc (delta encoded vInts)
    final byte[][] bytes = new byte[maxDoc][];

    final Terms terms = reader.terms(field);
    if (terms == null) {
      // No terms
      return;
    }

    final TermsEnum te = terms.iterator();
    final BytesRef seekStart = termPrefix != null ? termPrefix : new BytesRef();
    // System.out.println("seekStart=" + seekStart.utf8ToString());
    if (te.seekCeil(seekStart) == TermsEnum.SeekStatus.END) {
      // No terms match
      return;
    }

    // For our "term index wrapper"
    final List<BytesRef> indexedTerms = new ArrayList<>();
    final PagedBytes indexedTermsBytes = new PagedBytes(15);

    // we need a minimum of 9 bytes, but round up to 12 since the space would
    // be wasted with most allocators anyway.
    byte[] tempArr = new byte[12];

    //
    // enumerate all terms, and build an intermediate form of the un-inverted field.
    //
    // During this intermediate form, every document has a (potential) byte[]
    // and the int[maxDoc()] array either contains the termNumber list directly
    // or the *end* offset of the termNumber list in its byte array (for faster
    // appending and faster creation of the final form).
    //
    // idea... if things are too large while building, we could do a range of docs
    // at a time (but it would be a fair amount slower to build)
    // could also do ranges in parallel to take advantage of multiple CPUs
    //
    // OPTIONAL: remap the largest df terms to the lowest 128 (single byte)
    // values.  This requires going over the field first to find the most
    // frequent terms ahead of time.
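    // Illustrative example (editor's addition, not in the original source): suppose a
    // document's sorted term numbers are 5 and 8. The deltas plus TNUM_OFFSET are
    // (5 - 0 + 2) = 7 and (8 - 5 + 2) = 5; both fit in a single vInt byte each, so the
    // list is packed inline, low byte first, as index[doc] = 0x00000507 with bit 31
    // clear. Once the list outgrows 4 bytes, bit 31 is set and bits 0-30 point into
    // bytes[doc] instead.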
    int termNum = 0;
    postingsEnum = null;

    // Loop begins with te positioned to first term (we call
    // seek above):
    for (; ; ) {
      final BytesRef t = te.term();
      if (t == null || (termPrefix != null && !StringHelper.startsWith(t, termPrefix))) {
        break;
      }
      // System.out.println("visit term=" + t.utf8ToString() + " " + t + " termNum=" + termNum);
      visitTerm(te, termNum);

      if ((termNum & indexIntervalMask) == 0) {
        // Index this term
        sizeOfIndexedStrings += t.length;
        BytesRef indexedTerm = new BytesRef();
        indexedTermsBytes.copy(t, indexedTerm);
        // TODO: really should 1) strip off useless suffix,
        // and 2) use FST not array/PagedBytes
        indexedTerms.add(indexedTerm);
      }

      final int df = te.docFreq();
      if (df <= maxTermDocFreq) {

        postingsEnum = te.postings(postingsEnum, PostingsEnum.NONE);

        // dF, but takes deletions into account
        int actualDF = 0;

        for (; ; ) {
          int doc = postingsEnum.nextDoc();
          if (doc == DocIdSetIterator.NO_MORE_DOCS) {
            break;
          }
          // System.out.println("  chunk=" + chunk + " docs");
          actualDF++;
          termInstances++;

          // System.out.println("    docID=" + doc);
          // add TNUM_OFFSET to the term number to make room for special reserved values:
          // 0 (end term) and 1 (index into byte array follows)
          int delta = termNum - lastTerm[doc] + TNUM_OFFSET;
          lastTerm[doc] = termNum;
          int val = index[doc];

          if ((val & 0x80000000) != 0) {
            // index into byte array (actually the end of the doc-specific byte[] when building)
            int pos = val & 0x7fffffff;
            int ilen = vIntSize(delta);
            byte[] arr = bytes[doc];
            int newend = pos + ilen;
            if (newend > arr.length) {
              // We avoid a doubling strategy to lower memory usage.
              // this faceting method isn't for docs with many terms.
              // In hotspot, objects have 2 words of overhead, then fields, rounded up to a 64-bit
              // boundary.
              // TODO: figure out what array lengths we can round up to w/o actually using more
              // memory
              // (how much space does a byte[] take up?  Is data preceded by a 32 bit length only?
              // It should be safe to round up to the nearest 32 bits in any case.
              int newLen = (newend + 3) & 0xfffffffc; // 4 byte alignment
              byte[] newarr = new byte[newLen];
              System.arraycopy(arr, 0, newarr, 0, pos);
              arr = newarr;
              bytes[doc] = newarr;
            }
            pos = writeInt(delta, arr, pos);
            index[doc] = pos | 0x80000000; // update pointer to end index in byte[]
          } else {
            // OK, this int has data in it... find the end (a zero starting byte - not
            // part of another number, hence not following a byte with the high bit set).
            int ipos;
            if (val == 0) {
              ipos = 0;
            } else if ((val & 0x0000ff80) == 0) {
              ipos = 1;
            } else if ((val & 0x00ff8000) == 0) {
              ipos = 2;
            } else if ((val & 0xff800000) == 0) {
              ipos = 3;
            } else {
              ipos = 4;
            }

            // System.out.println("      ipos=" + ipos);

            int endPos = writeInt(delta, tempArr, ipos);
            // System.out.println("      endpos=" + endPos);
            if (endPos <= 4) {
              // System.out.println("      fits!");
              // value will fit in the integer... move bytes back
              for (int j = ipos; j < endPos; j++) {
                val |= (tempArr[j] & 0xff) << (j << 3);
              }
              index[doc] = val;
            } else {
              // value won't fit... move integer into byte[]
              for (int j = 0; j < ipos; j++) {
                tempArr[j] = (byte) val;
                val >>>= 8;
              }
              // point at the end index in the byte[]
              index[doc] = endPos | 0x80000000;
              bytes[doc] = tempArr;
              tempArr = new byte[12];
            }
          }
        }
        setActualDocFreq(termNum, actualDF);
      }

      termNum++;
      if (te.next() == null) {
        break;
      }
    }

    numTermsInField = termNum;

    long midPoint = System.nanoTime();

    if (termInstances == 0) {
      // we didn't invert anything
      // lower memory consumption.
      tnums = null;
    } else {

      this.index = index;

      //
      // transform intermediate form into the final form, building a single byte[]
      // at a time, and releasing the intermediate byte[]s as we go to avoid
      // increasing the memory footprint.
      //
      for (int pass = 0; pass < 256; pass++) {
        byte[] target = tnums[pass];
        int pos = 0; // end in target;
        if (target != null) {
          pos = target.length;
        } else {
          target = new byte[4096];
        }

        // loop over documents, 0x00ppxxxx, 0x01ppxxxx, 0x02ppxxxx
        // where pp is the pass (which array we are building), and xx is all values.
        // each pass shares the same byte[] for termNumber lists.
        for (int docbase = pass << 16; docbase < maxDoc; docbase += (1 << 24)) {
          int lim = Math.min(docbase + (1 << 16), maxDoc);
          for (int doc = docbase; doc < lim; doc++) {
            // System.out.println("  pass=" + pass + " process docID=" + doc);
            int val = index[doc];
            if ((val & 0x80000000) != 0) {
              int len = val & 0x7fffffff;
              // System.out.println("    ptr pos=" + pos);
              // index[doc] = (pos<<8)|1; // change index to point to start of array
              index[doc] = pos | 0x80000000; // change index to point to start of array
              byte[] arr = bytes[doc];
              /*
              for(byte b : arr) {
                //System.out.println("      b=" + Integer.toHexString((int) b));
              }
              */
              bytes[doc] = null; // IMPORTANT: allow GC to avoid OOM
              if (target.length <= pos + len) {
                int newlen = target.length;
                while (newlen <= pos + len) {
                  if ((newlen <<= 1) < 0) { // Double until overflow
                    newlen = Integer.MAX_VALUE - 16; // ArrayList.MAX_ARRAY_SIZE says 8. We double that to be sure
                    if (newlen <= pos + len) {
                      throw new IllegalStateException(
                          "Too many terms (> Integer.MAX_VALUE-16) to uninvert field '"
                              + field
                              + "'");
                    }
                  }
                }
                byte[] newtarget = new byte[newlen];
                System.arraycopy(target, 0, newtarget, 0, pos);
                target = newtarget;
              }
              System.arraycopy(arr, 0, target, pos, len);
              pos += len + 1; // skip single byte at end and leave it 0 for terminator
            }
          }
        }

        // shrink array
        if (pos < target.length) {
          byte[] newtarget = new byte[pos];
          System.arraycopy(target, 0, newtarget, 0, pos);
          target = newtarget;
        }

        tnums[pass] = target;

        if ((pass << 16) > maxDoc) break;
      }
    }
    indexedTermsArray = indexedTerms.toArray(new BytesRef[0]);

    long endTime = System.nanoTime();

    total_time = (int) TimeUnit.MILLISECONDS.convert(endTime - startTime, TimeUnit.NANOSECONDS);
    phase1_time = (int) TimeUnit.MILLISECONDS.convert(midPoint - startTime, TimeUnit.NANOSECONDS);
  }

  /** Number of bytes to represent an unsigned int as a vint. */
  private static int vIntSize(int x) {
    // Tests outside of this code base shows that the previous conditional-based vIntSize is fairly
    // slow until JITted and still about 1/3 slower after JIT than the numberOfLeadingZeros version
    // below.
    return BLOCK7[Integer.numberOfLeadingZeros(x)]; // Intrinsic on modern CPUs
  }

  private static final byte[] BLOCK7 =
      new byte[] {
        5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1,
        1, 1, 1
      };

  // todo: if we know the size of the vInt already, we could do
  // a single switch on the size
  /**
   * Write the x value as vInt at pos in arr, returning the new endPos. This requires arr to be
   * capable of holding the bytes needed to represent x. Array length checking should be performed
   * beforehand.
   *
   * @param x the value to write as vInt.
   * @param arr the array holding vInt-values.
   * @param pos the position in arr where the vInt representation of x should be written.
   * @return the new end position after writing x at pos.
   */
  private static int writeInt(int x, byte[] arr, int pos) {
    int a;
    a = (x >>> (7 * 4));
    if (a != 0) {
      arr[pos++] = (byte) (a | 0x80);
    }
    a = (x >>> (7 * 3));
    if (a != 0) {
      arr[pos++] = (byte) (a | 0x80);
    }
    a = (x >>> (7 * 2));
    if (a != 0) {
      arr[pos++] = (byte) (a | 0x80);
    }
    a = (x >>> (7 * 1));
    if (a != 0) {
      arr[pos++] = (byte) (a | 0x80);
    }
    arr[pos++] = (byte) (x & 0x7f);
    return pos;
  }

  /**
   * "wrap" our own terms index around the original IndexReader. Only valid if there are terms for
   * this field from the original reader
   */
  private final class OrdWrappedTermsEnum extends BaseTermsEnum {
    private final TermsEnum termsEnum;
    private BytesRef term;
    private long ord = -indexInterval - 1L; // force "real" seek

    public OrdWrappedTermsEnum(LeafReader reader) throws IOException {
      assert indexedTermsArray != null;
      assert 0 != indexedTermsArray.length;
      termsEnum = reader.terms(field).iterator();
    }

    @Override
    public PostingsEnum postings(PostingsEnum reuse, int flags) throws IOException {
      return termsEnum.postings(reuse, flags);
    }

    @Override
    public ImpactsEnum impacts(int flags) throws IOException {
      return termsEnum.impacts(flags);
    }

    @Override
    public BytesRef term() {
      return term;
    }

    @Override
    public BytesRef next() throws IOException {
      if (++ord < 0) {
        ord = 0;
      }
      if (termsEnum.next() == null) {
        term = null;
        return null;
      }
      return setTerm(); // this is extra work if we know we are in bounds...
    }

    @Override
    public int docFreq() throws IOException {
      return termsEnum.docFreq();
    }

    @Override
    public long totalTermFreq() throws IOException {
      return termsEnum.totalTermFreq();
    }

    @Override
    public long ord() {
      return ordBase + ord;
    }
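    // Illustrative note (editor's addition, not in the original source): the seek
    // methods below exploit the sparse term index. With the default indexInterval of
    // 128, seeking to ord 300 jumps to indexedTermsArray[300 >>> 7] (the indexed term
    // at ord 256) and then calls next() 44 times; a seek never scans more than
    // indexInterval - 1 terms past an indexed term.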
    @Override
    public SeekStatus seekCeil(BytesRef target) throws IOException {

      // already here
      if (term != null && term.equals(target)) {
        return SeekStatus.FOUND;
      }

      int startIdx = Arrays.binarySearch(indexedTermsArray, target);

      if (startIdx >= 0) {
        // we hit the term exactly... lucky us!
        TermsEnum.SeekStatus seekStatus = termsEnum.seekCeil(target);
        assert seekStatus == TermsEnum.SeekStatus.FOUND;
        ord = (long) startIdx << indexIntervalBits;
        setTerm();
        assert term != null;
        return SeekStatus.FOUND;
      }

      // we didn't hit the term exactly
      startIdx = -startIdx - 1;

      if (startIdx == 0) {
        // our target occurs *before* the first term
        TermsEnum.SeekStatus seekStatus = termsEnum.seekCeil(target);
        assert seekStatus == TermsEnum.SeekStatus.NOT_FOUND;
        ord = 0;
        setTerm();
        assert term != null;
        return SeekStatus.NOT_FOUND;
      }

      // back up to the start of the block
      startIdx--;

      if ((ord >> indexIntervalBits) == startIdx && term != null && term.compareTo(target) <= 0) {
        // we are already in the right block and the current term is before the term we want,
        // so we don't need to seek.
      } else {
        // seek to the right block
        TermsEnum.SeekStatus seekStatus = termsEnum.seekCeil(indexedTermsArray[startIdx]);
        assert seekStatus == TermsEnum.SeekStatus.FOUND;
        ord = (long) startIdx << indexIntervalBits;
        setTerm();
        assert term != null; // should be non-null since it's in the index
      }

      while (term != null && term.compareTo(target) < 0) {
        next();
      }

      if (term == null) {
        return SeekStatus.END;
      } else if (term.compareTo(target) == 0) {
        return SeekStatus.FOUND;
      } else {
        return SeekStatus.NOT_FOUND;
      }
    }

    @Override
    public boolean seekExact(BytesRef text) throws IOException {
      return seekCeil(text) == SeekStatus.FOUND;
    }

    @Override
    public void seekExact(long targetOrd) throws IOException {
      int delta = (int) (targetOrd - ordBase - ord);
      // System.out.println("  seek(ord) targetOrd=" + targetOrd + " delta=" + delta + " ord=" + ord
      // + " ii=" + indexInterval);
      if (delta < 0 || delta > indexInterval) {
        final int idx = (int) (targetOrd >>> indexIntervalBits);
        final BytesRef base = indexedTermsArray[idx];
        // System.out.println("  do seek term=" + base.utf8ToString());
        ord = (long) idx << indexIntervalBits;
        delta = (int) (targetOrd - ord);
        final TermsEnum.SeekStatus seekStatus = termsEnum.seekCeil(base);
        assert seekStatus == TermsEnum.SeekStatus.FOUND;
      } else {
        // System.out.println("seek w/in block");
      }

      while (--delta >= 0) {
        BytesRef br = termsEnum.next();
        if (br == null) {
          assert false;
          return;
        }
        ord++;
      }

      setTerm();
      assert term != null;
    }

    private BytesRef setTerm() throws IOException {
      term = termsEnum.term();
      // System.out.println("  setTerm() term=" + term.utf8ToString() + " vs prefix=" + (prefix ==
      // null ? "null" : prefix.utf8ToString()));
      if (prefix != null && !StringHelper.startsWith(term, prefix)) {
        term = null;
      }
      return term;
    }
  }

  /** Returns the term ({@link BytesRef}) corresponding to the provided ordinal. */
  public BytesRef lookupTerm(TermsEnum termsEnum, int ord) throws IOException {
    termsEnum.seekExact(ord);
    return termsEnum.term();
  }

  /** Returns a SortedSetDocValues view of this instance */
  public SortedSetDocValues iterator(LeafReader reader) throws IOException {
    if (isEmpty()) {
      return DocValues.emptySortedSet();
    } else {
      return new Iterator(reader);
    }
  }

  private class Iterator extends SortedSetDocValues {
    final LeafReader reader;
    final TermsEnum te; // used internally for lookupOrd() and lookupTerm()
    final int maxDoc;

    // currently we read 5 at a time (using the logic of the old iterator)
    final int[] buffer = new int[5];
    int bufferUpto;
    int bufferLength;

    private int doc = -1;
    private int tnum;
    private int upto;
    private byte[] arr;

    Iterator(LeafReader reader) throws IOException {
      this.reader = reader;
      this.maxDoc = reader.maxDoc();
      this.te = termsEnum();
    }

    @Override
    public long nextOrd() {
      while (bufferUpto == bufferLength) {
        if (bufferLength < buffer.length) {
          return NO_MORE_ORDS;
        } else {
          bufferLength = read(buffer);
          bufferUpto = 0;
        }
      }
      return buffer[bufferUpto++];
    }

    @Override
    public int docValueCount() {
      if (arr == null) {
        // This value was inlined, and then read into a single buffer
        return bufferLength;
      } else {
        // scan logic taken from read(): count the decoded deltas rather than the bytes
        // consumed, since a single delta may span multiple vInt bytes
        int count = 0;
        int cursor = index[doc] & 0x7fffffff;
        for (; ; ) {
          int delta = 0;
          for (; ; ) {
            byte b = arr[cursor++];
            delta = (delta << 7) | (b & 0x7f);
            if ((b & 0x80) == 0) break;
          }
          if (delta == 0) break;
          count++;
        }
        return count;
      }
    }

    /**
     * Buffer must be at least 5 ints long. Returns number of term ords placed into buffer; if this
     * count is less than buffer.length then that is the end.
     */
    int read(int[] buffer) {
      int bufferUpto = 0;
      if (arr == null) {
        // code is inlined into upto
        // System.out.println("inlined");
        int code = upto;
        int delta = 0;
        for (; ; ) {
          delta = (delta << 7) | (code & 0x7f);
          if ((code & 0x80) == 0) {
            if (delta == 0) break;
            tnum += delta - TNUM_OFFSET;
            buffer[bufferUpto++] = ordBase + tnum;
            // System.out.println("  tnum=" + tnum);
            delta = 0;
          }
          code >>>= 8;
        }
      } else {
        // upto is a pointer into the array
        for (; ; ) {
          int delta = 0;
          for (; ; ) {
            byte b = arr[upto++];
            delta = (delta << 7) | (b & 0x7f);
            // System.out.println("    cycle: upto=" + upto + " delta=" + delta + " b=" + b);
            if ((b & 0x80) == 0) break;
          }
          // System.out.println("  delta=" + delta);
          if (delta == 0) break;
          tnum += delta - TNUM_OFFSET;
          // System.out.println("  tnum=" + tnum);
          buffer[bufferUpto++] = ordBase + tnum;
          if (bufferUpto == buffer.length) {
            break;
          }
        }
      }
      return bufferUpto;
    }

    private void setDocument(int docID) {
      this.doc = docID;
      tnum = 0;
      final int code = index[docID];
      if ((code & 0x80000000) != 0) {
        // a pointer
        upto = code & 0x7fffffff;
        // System.out.println("    pointer!  upto=" + upto);
        int whichArray = (docID >>> 16) & 0xff;
        arr = tnums[whichArray];
      } else {
        // System.out.println("    inline!");
        arr = null;
        upto = code;
      }
      bufferUpto = 0;
      bufferLength = read(buffer);
    }

    @Override
    public boolean advanceExact(int target) throws IOException {
      setDocument(target);
      return bufferLength > 0;
    }

    @Override
    public int docID() {
      return doc;
    }

    @Override
    public int nextDoc() throws IOException {
      return advance(docID() + 1);
    }

    @Override
    public int advance(int target) throws IOException {
      for (int d = target; d < maxDoc; ++d) {
        if (advanceExact(d)) {
          return d;
        }
      }
      return doc = NO_MORE_DOCS;
    }

    @Override
    public long cost() {
      return maxDoc;
    }

    @Override
    public BytesRef lookupOrd(long ord) {
      try {
        return DocTermOrds.this.lookupTerm(te, (int) ord);
      } catch (IOException e) {
        throw new RuntimeException(e);
      }
    }

    @Override
    public long getValueCount() {
      return numTerms();
    }

    @Override
    public long lookupTerm(BytesRef key) {
      try {
        switch (te.seekCeil(key)) {
          case FOUND:
            assert te.ord() >= 0;
            return te.ord();
          case NOT_FOUND:
            assert te.ord() >= 0;
            return -te.ord() - 1;
          default:
            /* END */
            return -numTerms() - 1L;
        }
      } catch (IOException e) {
        throw new RuntimeException(e);
      }
    }

    @Override
    public TermsEnum termsEnum() {
      try {
        return getOrdTermsEnum(reader);
      } catch (IOException e) {
        throw new RuntimeException(e);
      }
    }
  }
}
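Example usage (editor's addition, not part of the Solr source): a minimal sketch of how DocTermOrds is typically driven, assuming Lucene 9.x APIs; the class name DocTermOrdsDemo and the multi-valued "tags" field are hypothetical. SortedSetDocValues.NO_MORE_ORDS is used because this class's own iterator still returns it.

import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.ByteBuffersDirectory;
import org.apache.solr.uninverting.DocTermOrds;

public class DocTermOrdsDemo {
  public static void main(String[] args) throws Exception {
    ByteBuffersDirectory dir = new ByteBuffersDirectory();
    try (IndexWriter w = new IndexWriter(dir, new IndexWriterConfig(new WhitespaceAnalyzer()))) {
      Document doc = new Document();
      // a multi-valued, indexed field without docValues: the case DocTermOrds targets
      doc.add(new StringField("tags", "red", Field.Store.NO));
      doc.add(new StringField("tags", "blue", Field.Store.NO));
      w.addDocument(doc);
      w.commit();
    }
    try (DirectoryReader reader = DirectoryReader.open(dir)) {
      LeafReader leaf = reader.leaves().get(0).reader();
      DocTermOrds dto = new DocTermOrds(leaf, leaf.getLiveDocs(), "tags");

      // per-document ords, returned sorted and de-duplicated
      SortedSetDocValues ords = dto.iterator(leaf);
      if (ords.advanceExact(0)) {
        for (long ord = ords.nextOrd();
            ord != SortedSetDocValues.NO_MORE_ORDS;
            ord = ords.nextOrd()) {
          System.out.println("ord " + ord + " -> " + ords.lookupOrd(ord).utf8ToString());
        }
      }

      // ord -> term bytes via the wrapped TermsEnum (seek-by-ord)
      TermsEnum te = dto.getOrdTermsEnum(leaf);
      te.seekExact(0L);
      System.out.println("first term: " + te.term().utf8ToString());
    }
  }
}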
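The delta-plus-offset vInt scheme described in the header comment can also be demonstrated in isolation. The sketch below (editor's addition; the class and method names are hypothetical) mirrors writeInt() and the array branch of Iterator.read(): term numbers are encoded as (delta + TNUM_OFFSET) vInts with high-order 7-bit groups first and a zero byte as terminator, then decoded back.

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

public class TermNumberListDemo {
  private static final int TNUM_OFFSET = 2; // reserves 0 (end of list) and 1 (pointer flag)

  // Mirrors DocTermOrds.writeInt: high-order 7-bit groups first, bit 7 = continuation.
  static int writeVInt(int x, byte[] arr, int pos) {
    for (int shift = 28; shift >= 7; shift -= 7) {
      int a = x >>> shift;
      if (a != 0) arr[pos++] = (byte) (a | 0x80);
    }
    arr[pos++] = (byte) (x & 0x7f);
    return pos;
  }

  // Encodes sorted, de-duplicated term numbers as (delta + TNUM_OFFSET) vInts.
  static byte[] encode(int[] termNums) {
    byte[] arr = new byte[termNums.length * 5 + 1];
    int pos = 0, last = 0;
    for (int t : termNums) {
      pos = writeVInt(t - last + TNUM_OFFSET, arr, pos);
      last = t;
    }
    return Arrays.copyOf(arr, pos + 1); // trailing 0 byte is the end-of-list marker
  }

  // Mirrors the array branch of Iterator.read(): decode deltas until a zero delta.
  static List<Integer> decode(byte[] arr) {
    List<Integer> out = new ArrayList<>();
    int pos = 0, tnum = 0;
    for (; ; ) {
      int delta = 0;
      for (; ; ) {
        byte b = arr[pos++];
        delta = (delta << 7) | (b & 0x7f);
        if ((b & 0x80) == 0) break;
      }
      if (delta == 0) break; // terminator
      tnum += delta - TNUM_OFFSET;
      out.add(tnum);
    }
    return out;
  }

  public static void main(String[] args) {
    int[] termNums = {3, 7, 42, 500};
    System.out.println(decode(encode(termNums))); // prints [3, 7, 42, 500]
  }
}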