org.apache.solr.search.facet.UnInvertedField
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.search.facet;

import java.io.Closeable;
import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.atomic.AtomicLong;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRefBuilder;
import org.apache.lucene.util.FixedBitSet;
import org.apache.solr.common.SolrException;
import org.apache.solr.index.SlowCompositeReaderWrapper;
import org.apache.solr.schema.FieldType;
import org.apache.solr.schema.TrieField;
import org.apache.solr.search.BitDocSet;
import org.apache.solr.search.DocSet;
import org.apache.solr.search.SolrCache;
import org.apache.solr.search.SolrIndexSearcher;
import org.apache.solr.search.facet.SlotAcc.CountSlotAcc;
import org.apache.solr.search.facet.SlotAcc.SlotContext;
import org.apache.solr.search.facet.SlotAcc.SweepCountAccStruct;
import org.apache.solr.search.facet.SlotAcc.SweepingCountSlotAcc;
import org.apache.solr.search.facet.SweepCountAware.SegCountGlobal;
import org.apache.solr.search.facet.SweepDocIterator.SweepIteratorAndCounts;
import org.apache.solr.uninverting.DocTermOrds;
import org.apache.solr.util.TestInjection;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Final form of the un-inverted field: Each document points to a list of term numbers that are
 * contained in that document.
 *
 * <p>Term numbers are in sorted order, and are encoded as variable-length deltas from the previous
 * term number. Real term numbers start at 2 since 0 and 1 are reserved. A term number of 0 signals
 * the end of the termNumber list.
 *
 * <p>There is a single int[maxDoc()] which either contains a pointer into a byte[] for the
 * termNumber lists, or directly contains the termNumber list if it fits in the 4 bytes of an
 * integer. If the first byte in the integer is 1, the next 3 bytes are a pointer into a byte[]
 * where the termNumber list starts.
 *
 * <p>There are actually 256 byte arrays, to compensate for the fact that the pointers into the
 * byte arrays are only 3 bytes long. The correct byte array for a document is a function of its
 * id.
 *
 * <p>To save space and speed up faceting, any term that matches enough documents will not be
 * un-inverted... it will be skipped while building the un-inverted field structure, and will use a
 * set intersection method during faceting.
 *
 * <p>To further save memory, the terms (the actual string values) are not all stored in memory,
 * but a TermIndex is used to convert term numbers to term values only for the terms needed after
 * faceting has completed. Only every 128th term value is stored, along with its corresponding
 * term number, and this is used as an index to find the closest term and iterate until the
 * desired number is hit (very much like Lucene's own internal term index).
 */
public class UnInvertedField extends DocTermOrds {
  private static final int TNUM_OFFSET = 2;

  private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());

  static class TopTerm {
    Query termQuery;
    BytesRef term;
    int termNum;

    long memSize() {
      return 8L + // obj header
          8 + 8 + term.length + // term
          4; // int
    }
  }

  long memsz;
  final AtomicLong use = new AtomicLong(); // number of uses

  /* The number of documents holding the term {@code maxDocs = maxTermCounts[termNum]}. */
  int[] maxTermCounts = new int[1024];

  /* termNum -> docIDs for big terms. */
  final Map<Integer, TopTerm> bigTerms = new LinkedHashMap<>();

  private SolrIndexSearcher.DocsEnumState deState;
  private final SolrIndexSearcher searcher;

  private static final UnInvertedField uifPlaceholder = new UnInvertedField();

  private UnInvertedField() { // Dummy for synchronization.
    super("fake", 0, 0); // cheapest initialization I can find.
    searcher = null;
  }
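  /*
   * Illustration (not part of the original source): a worked example of the
   * termNumber encoding described in the class javadoc. With TNUM_OFFSET = 2,
   * a document containing term numbers {5, 10, 20} is stored as the delta
   * sequence (5-0)+2, (10-5)+2, (20-10)+2 = 7, 7, 12, followed by a 0
   * terminator. Each delta is written as a variable-length quantity, 7 bits
   * per byte with the high bit as a continuation flag, so deltas under 128
   * take a single byte. Here the three one-byte deltas (plus the terminator)
   * fit in the 4 bytes of an integer, the high byte stays zero, and the list
   * is stored inline in index[doc]; longer lists spill into one of the 256
   * tnums byte arrays, selected by (doc >>> 16) & 0xff.
   */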
  /**
   * Called for each term in the field being uninverted. Collects {@link #maxTermCounts} for all
   * bigTerms as well as storing them in {@link #bigTerms}.
   *
   * @param te positioned at the current term.
   * @param termNum the ID/pointer/ordinal of the current term. Monotonically increasing between
   *     calls.
   */
  @Override
  protected void visitTerm(TermsEnum te, int termNum) throws IOException {

    if (termNum >= maxTermCounts.length) {
      // resize by doubling - for very large number of unique terms, expanding
      // by 4K and resultant GC will dominate uninvert times.  Resize at end if material
      int[] newMaxTermCounts = new int[Math.min(Integer.MAX_VALUE - 16, maxTermCounts.length * 2)];
      System.arraycopy(maxTermCounts, 0, newMaxTermCounts, 0, termNum);
      maxTermCounts = newMaxTermCounts;
    }

    final BytesRef term = te.term();

    if (te.docFreq() > maxTermDocFreq) {
      Term t = new Term(field, term); // this makes a deep copy of the term bytes
      TopTerm topTerm = new TopTerm();
      topTerm.term = t.bytes();
      topTerm.termNum = termNum;
      topTerm.termQuery = new TermQuery(t);

      bigTerms.put(topTerm.termNum, topTerm);

      if (deState == null) {
        deState = new SolrIndexSearcher.DocsEnumState();
        deState.fieldName = field;
        deState.liveDocs = searcher.getLiveDocsBits();
        // TODO: check for MultiTermsEnum in SolrIndexSearcher could now fail?
        deState.termsEnum = te;
        deState.postingsEnum = postingsEnum;
        deState.minSetSizeCached = maxTermDocFreq;
      }

      postingsEnum = deState.postingsEnum;
      DocSet set = searcher.getDocSet(deState);
      maxTermCounts[termNum] = set.size();
    }
  }

  @Override
  protected void setActualDocFreq(int termNum, int docFreq) {
    maxTermCounts[termNum] = docFreq;
  }

  public long memSize() {
    // can cache the mem size since it shouldn't change
    if (memsz != 0) return memsz;
    long sz = super.ramBytesUsed();
    sz += 8 * 8 + 32; // local fields
    sz += bigTerms.size() * 64L;
    for (TopTerm tt : bigTerms.values()) {
      sz += tt.memSize();
    }
    if (maxTermCounts != null) sz += maxTermCounts.length * 4L;
    memsz = sz;
    return sz;
  }

  public UnInvertedField(String field, SolrIndexSearcher searcher) throws IOException {
    super(
        field,
        // threshold, over which we use set intersections instead of counting
        // to (1) save memory, and (2) speed up faceting.
        // Add 2 for testing purposes so that there will always be some terms under
        // the threshold even when the index is very small.
        searcher.maxDoc() / 20 + 2,
        DEFAULT_INDEX_INTERVAL_BITS);

    assert TestInjection.injectUIFOutOfMemoryError();

    final String prefix = TrieField.getMainValuePrefix(searcher.getSchema().getFieldType(field));
    this.searcher = searcher;
    try {
      // TODO: it's wasteful to create one of these each time
      // but DocTermOrds will throw an exception if it thinks the field has doc values (which is
      // faked by UnInvertingReader)
      LeafReader r = SlowCompositeReaderWrapper.wrap(searcher.getRawReader());
      uninvert(r, r.getLiveDocs(), prefix == null ? null : new BytesRef(prefix));
    } catch (IllegalStateException ise) {
      throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, ise);
    }

    if (tnums != null) {
      for (byte[] target : tnums) {
        if (target != null && target.length > (1 << 24) * .9) {
          log.warn(
              "Approaching too many values for UnInvertedField faceting on field '{}' : bucket size={}",
              field,
              target.length);
        }
      }
    }

    // free space if outrageously wasteful (tradeoff memory/cpu)
    if ((maxTermCounts.length - numTermsInField) > 1024) { // too much waste!
      int[] newMaxTermCounts = new int[numTermsInField];
      System.arraycopy(maxTermCounts, 0, newMaxTermCounts, 0, numTermsInField);
      maxTermCounts = newMaxTermCounts;
    }

    log.info("UnInverted multi-valued field {}", this);
    // System.out.println("CREATED: " + toString() + " ti.index=" + ti.index);
  }

  public int getNumTerms() {
    return numTermsInField;
  }
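  /*
   * Illustration (not part of the original source): the maxTermDocFreq
   * threshold passed to super() above is maxDoc/20 + 2, i.e. roughly 5% of the
   * index. For a 1,000,000-document index the threshold is 50,002, so any term
   * matching more documents than that is treated as a "big term": it is left
   * out of the un-inverted structure and handled with a cached DocSet
   * intersection instead (see visitTerm and getCounts).
   */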
  public class DocToTerm implements Closeable {
    private final DocSet[] bigTermSets;
    private final int[] bigTermNums;
    private TermsEnum te;

    public DocToTerm() throws IOException {
      bigTermSets = new DocSet[bigTerms.size()];
      bigTermNums = new int[bigTerms.size()];
      int i = 0;
      for (TopTerm tt : bigTerms.values()) {
        bigTermSets[i] = searcher.getDocSet(tt.termQuery);
        bigTermNums[i] = tt.termNum;
        i++;
      }
    }

    public BytesRef lookupOrd(int ord) throws IOException {
      return getTermValue(getTermsEnum(), ord);
    }

    public TermsEnum getTermsEnum() throws IOException {
      if (te == null) {
        te = getOrdTermsEnum(searcher.getSlowAtomicReader());
      }
      return te;
    }

    public void getBigTerms(int doc, Callback target) throws IOException {
      if (bigTermSets != null) {
        for (int i = 0; i < bigTermSets.length; i++) {
          if (bigTermSets[i].exists(doc)) {
            target.call(bigTermNums[i]);
          }
        }
      }
    }

    public void getSmallTerms(int doc, Callback target) {
      if (termInstances > 0) {
        int code = index[doc];

        if ((code & 0x80000000) != 0) {
          // top bit set: code points into one of the tnums byte arrays
          int pos = code & 0x7fffffff;
          int whichArray = (doc >>> 16) & 0xff;
          byte[] arr = tnums[whichArray];
          int tnum = 0;
          for (; ; ) {
            int delta = 0;
            for (; ; ) {
              byte b = arr[pos++];
              delta = (delta << 7) | (b & 0x7f);
              if ((b & 0x80) == 0) break;
            }
            if (delta == 0) break;
            tnum += delta - TNUM_OFFSET;
            target.call(tnum);
          }
        } else {
          // the termNumber list is packed directly into the int
          int tnum = 0;
          int delta = 0;
          for (; ; ) {
            delta = (delta << 7) | (code & 0x7f);
            if ((code & 0x80) == 0) {
              if (delta == 0) break;
              tnum += delta - TNUM_OFFSET;
              target.call(tnum);
              delta = 0;
            }
            code >>>= 8;
          }
        }
      }
    }

    @Override
    public void close() throws IOException {
      for (DocSet set : bigTermSets) {
        // set.decref(); // OFF-HEAP
      }
    }
  }

  public interface Callback {
    public void call(int termNum);
  }
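  /*
   * Usage sketch (illustration, not part of the original source): resolving a
   * single document's term numbers back to readable values via DocToTerm.
   * Assumes "uif" is an UnInvertedField instance, "docId" is a top-level doc
   * id, and java.util.ArrayList is imported; the method references compile
   * because Callback is a functional interface.
   *
   *   try (UnInvertedField.DocToTerm d2t = uif.new DocToTerm()) {
   *     List<Integer> termNums = new ArrayList<>();
   *     d2t.getBigTerms(docId, termNums::add);   // terms counted via set intersections
   *     d2t.getSmallTerms(docId, termNums::add); // terms from the un-inverted structure
   *     for (int tnum : termNums) {
   *       BytesRef term = d2t.lookupOrd(tnum);   // may return a reused BytesRef
   *     }
   *   }
   */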
  private void getCounts(FacetFieldProcessorByArrayUIF processor) throws IOException {
    DocSet docs = processor.fcontext.base;
    int baseSize = docs.size();
    int maxDoc = searcher.maxDoc();

    // what about allBuckets?
    if (baseSize < processor.effectiveMincount) {
      return;
    }

    SweepCountAccStruct baseCountAccStruct = SweepingCountSlotAcc.baseStructOf(processor);
    final List<SweepCountAccStruct> others = SweepingCountSlotAcc.otherStructsOf(processor);

    final int[] index = this.index;

    boolean doNegative =
        baseSize > maxDoc >> 1
            && termInstances > 0
            && docs instanceof BitDocSet
            && baseCountAccStruct != null;

    if (doNegative) {
      FixedBitSet bs = ((BitDocSet) docs).getBits().clone();
      bs.flip(0, maxDoc);
      // TODO: when iterator across negative elements is available, use that
      // instead of creating a new bitset and inverting.
      docs = new BitDocSet(bs, maxDoc - baseSize);
      // simply negating will mean that we have deleted docs in the set.
      // that should be OK, as their entries in our table should be empty.
      baseCountAccStruct = new SweepCountAccStruct(baseCountAccStruct, docs);
    }

    // For the biggest terms, do straight set intersections
    for (TopTerm tt : bigTerms.values()) {
      // TODO: counts could be deferred if sorting by index order
      final int termOrd = tt.termNum;
      Iterator<SweepCountAccStruct> othersIter = others.iterator();
      SweepCountAccStruct entry =
          baseCountAccStruct != null ? baseCountAccStruct : othersIter.next();
      for (; ; ) {
        entry.countAcc.incrementCount(termOrd, searcher.numDocs(tt.termQuery, entry.docSet));
        if (!othersIter.hasNext()) {
          break;
        }
        entry = othersIter.next();
      }
    }

    // TODO: we could short-circuit counting altogether for sorted faceting
    // where we already have enough terms from the bigTerms

    if (termInstances > 0) {
      final SweepIteratorAndCounts iterAndCounts =
          SweepDocIterator.newInstance(baseCountAccStruct, others);
      final SweepDocIterator iter = iterAndCounts.iter;
      final SegCountGlobal counts = new SegCountGlobal(iterAndCounts.countAccs);
      while (iter.hasNext()) {
        int doc = iter.nextDoc();
        int maxIdx = iter.registerCounts(counts);
        int code = index[doc];

        if ((code & 0x80000000) != 0) {
          int pos = code & 0x7fffffff;
          int whichArray = (doc >>> 16) & 0xff;
          byte[] arr = tnums[whichArray];
          int tnum = 0;
          for (; ; ) {
            int delta = 0;
            for (; ; ) {
              byte b = arr[pos++];
              delta = (delta << 7) | (b & 0x7f);
              if ((b & 0x80) == 0) break;
            }
            if (delta == 0) break;
            tnum += delta - TNUM_OFFSET;
            counts.incrementCount(tnum, 1, maxIdx);
          }
        } else {
          int tnum = 0;
          int delta = 0;
          for (; ; ) {
            delta = (delta << 7) | (code & 0x7f);
            if ((code & 0x80) == 0) {
              if (delta == 0) break;
              tnum += delta - TNUM_OFFSET;
              counts.incrementCount(tnum, 1, maxIdx);
              delta = 0;
            }
            code >>>= 8;
          }
        }
      }
    }

    if (doNegative) {
      final CountSlotAcc baseCounts = processor.countAcc;
      for (int i = 0; i < numTermsInField; i++) {
        // baseCounts currently holds the count over the complement of the base set;
        // adding maxTermCounts[i] - 2*complement leaves maxTermCounts[i] - complement,
        // i.e. counts[i] = maxTermCounts[i] - counts[i];
        baseCounts.incrementCount(i, maxTermCounts[i] - (int) baseCounts.getCount(i) * 2);
      }
    }

    /* TODO - future optimization to handle allBuckets
    if (processor.allBucketsSlot >= 0) {
      int all = 0; // overflow potential
      for (int i = 0; i < numTermsInField; i++) {
        all += maxTermCounts[i];
      }
      counts[allBucketsSlot] = all;
    }
    */
  }

  public void collectDocs(FacetFieldProcessorByArrayUIF processor) throws IOException {
    if (processor.collectAcc == null
        && processor.startTermIndex == 0
        && processor.endTermIndex >= numTermsInField) {
      getCounts(processor);
      return;
    }

    collectDocsGeneric(processor);
  }
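  /*
   * Worked example of the "doNegative" complement counting in getCounts above
   * (illustration, not part of the original source): with maxDoc = 100, a base
   * set of 80 docs, and a term occurring in maxTermCounts[t] = 50 docs, 10 of
   * which lie outside the base set, the sweep over the 20-doc complement
   * accumulates 10. The final pass then adds 50 - 2*10 = 30, leaving
   * 10 + 30 = 40 = 50 - 10, the true count within the base set, at the cost of
   * iterating 20 docs instead of 80.
   */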
  // called from FieldFacetProcessor
  // TODO: do a callback version that can be specialized!
  public void collectDocsGeneric(FacetFieldProcessorByArrayUIF processor) throws IOException {
    use.incrementAndGet();

    int startTermIndex = processor.startTermIndex;
    int endTermIndex = processor.endTermIndex;
    int nTerms = processor.nTerms;
    DocSet docs = processor.fcontext.base;

    int uniqueTerms = 0;
    final CountSlotAcc countAcc = processor.countAcc;
    final SweepCountAccStruct baseCountAccStruct = SweepingCountSlotAcc.baseStructOf(processor);
    final List<SweepCountAccStruct> others = SweepingCountSlotAcc.otherStructsOf(processor);

    for (TopTerm tt : bigTerms.values()) {
      if (tt.termNum >= startTermIndex && tt.termNum < endTermIndex) {
        // handle the biggest terms
        DocSet termSet = searcher.getDocSet(tt.termQuery);
        DocSet intersection = termSet.intersection(docs);
        int collected =
            processor.collectFirstPhase(
                intersection,
                tt.termNum - startTermIndex,
                slotNum -> {
                  return new SlotContext(tt.termQuery);
                });
        final int termOrd = tt.termNum - startTermIndex;
        countAcc.incrementCount(termOrd, collected);
        for (SweepCountAccStruct entry : others) {
          entry.countAcc.incrementCount(termOrd, termSet.intersectionSize(entry.docSet));
        }
        if (collected > 0) {
          uniqueTerms++;
        }
      }
    }

    if (termInstances > 0) {

      final List<LeafReaderContext> leaves = searcher.getIndexReader().leaves();
      final Iterator<LeafReaderContext> ctxIt = leaves.iterator();
      LeafReaderContext ctx = null;
      int segBase = 0;
      int segMax;
      int adjustedMax = 0;

      // TODO: handle facet.prefix here!!!
      SweepIteratorAndCounts sweepIterAndCounts =
          SweepDocIterator.newInstance(baseCountAccStruct, others);
      final SweepDocIterator iter = sweepIterAndCounts.iter;
      final CountSlotAcc[] countAccs = sweepIterAndCounts.countAccs;
      final SegCountGlobal counts = new SegCountGlobal(countAccs);

      while (iter.hasNext()) {
        int doc = iter.nextDoc();
        int maxIdx = iter.registerCounts(counts);
        boolean collectBase = iter.collectBase();

        if (doc >= adjustedMax) {
          do {
            ctx = ctxIt.next();
            if (ctx == null) {
              // should be impossible
              throw new RuntimeException("INTERNAL FACET ERROR");
            }
            segBase = ctx.docBase;
            segMax = ctx.reader().maxDoc();
            adjustedMax = segBase + segMax;
          } while (doc >= adjustedMax);
          assert doc >= ctx.docBase;
          processor.setNextReaderFirstPhase(ctx);
        }
        int segDoc = doc - segBase;

        int code = index[doc];

        if ((code & 0x80000000) != 0) {
          int pos = code & 0x7fffffff;
          int whichArray = (doc >>> 16) & 0xff;
          byte[] arr = tnums[whichArray];
          int tnum = 0;
          for (; ; ) {
            int delta = 0;
            for (; ; ) {
              byte b = arr[pos++];
              delta = (delta << 7) | (b & 0x7f);
              if ((b & 0x80) == 0) break;
            }
            if (delta == 0) break;
            tnum += delta - TNUM_OFFSET;
            int arrIdx = tnum - startTermIndex;
            if (arrIdx < 0) continue;
            if (arrIdx >= nTerms) break;
            counts.incrementCount(arrIdx, 1, maxIdx);
            if (collectBase) {
              processor.collectFirstPhase(segDoc, arrIdx, processor.slotContext);
            }
          }
        } else {
          int tnum = 0;
          int delta = 0;
          for (; ; ) {
            delta = (delta << 7) | (code & 0x7f);
            if ((code & 0x80) == 0) {
              if (delta == 0) break;
              tnum += delta - TNUM_OFFSET;
              int arrIdx = tnum - startTermIndex;
              if (arrIdx >= 0) {
                if (arrIdx >= nTerms) break;
                counts.incrementCount(arrIdx, 1, maxIdx);
                if (collectBase) {
                  processor.collectFirstPhase(segDoc, arrIdx, processor.slotContext);
                }
              }
              delta = 0;
            }
            code >>>= 8;
          }
        }
      }
    }
  }

  String getReadableValue(BytesRef termval, FieldType ft, CharsRefBuilder charsRef) {
    return ft.indexedToReadable(termval, charsRef).toString();
  }

  /** may return a reused BytesRef */
  BytesRef getTermValue(TermsEnum te, int termNum) throws IOException {
    // System.out.println("getTermValue termNum=" + termNum + " this=" + this + " numTerms=" +
    // numTermsInField);
    if (bigTerms.size() > 0) {
      // see if the term is one of our big terms.
      TopTerm tt = bigTerms.get(termNum);
      if (tt != null) {
        // System.out.println("  return big " + tt.term);
        return tt.term;
      }
    }

    return lookupTerm(te, termNum);
  }
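  /*
   * Note (illustration, not part of the original source): lookupTerm, inherited
   * from DocTermOrds, relies on the sparse term index described in the class
   * javadoc. Only every 128th term value is kept in indexedTermsArray, so
   * resolving a term number seeks to the nearest indexed term at or below it
   * and scans forward until the desired ordinal is reached. Big terms
   * short-circuit this lookup via the bigTerms map in getTermValue above.
   */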
  @Override
  public String toString() {
    final long indexSize =
        indexedTermsArray == null
            ? 0
            : (8 + 8 + 8 + 8 + (indexedTermsArray.length << 3)
                + sizeOfIndexedStrings); // assume 8 byte references?
    return "{field="
        + field
        + ",memSize="
        + memSize()
        + ",tindexSize="
        + indexSize
        + ",time="
        + total_time
        + ",phase1="
        + phase1_time
        + ",nTerms="
        + numTermsInField
        + ",bigTerms="
        + bigTerms.size()
        + ",termInstances="
        + termInstances
        + ",uses="
        + use.get()
        + "}";
  }

  //////////////////////////////////////////////////////////////////
  //////////////////////////// caching /////////////////////////////
  //////////////////////////////////////////////////////////////////

  public static UnInvertedField getUnInvertedField(String field, SolrIndexSearcher searcher)
      throws IOException {
    SolrCache<String, UnInvertedField> cache = searcher.getFieldValueCache();
    if (cache == null) {
      return new UnInvertedField(field, searcher);
    }
    return cache.computeIfAbsent(field, f -> new UnInvertedField(f, searcher));
  }

  // Returns null if not already populated
  public static UnInvertedField checkUnInvertedField(String field, SolrIndexSearcher searcher)
      throws IOException {
    SolrCache<String, UnInvertedField> cache = searcher.getFieldValueCache();
    if (cache == null) {
      return null;
    }
    Object uif = cache.get(field); // cache is already synchronized, so no extra sync needed
    // placeholder is an implementation detail, keep it hidden and return null if that is what we
    // got
    return uif == uifPlaceholder || !(uif instanceof UnInvertedField)
        ? null
        : (UnInvertedField) uif;
    // TODO: SolrCache is not used safely in other places, but this might be simplified to:
    // return uif == uifPlaceholder ? null : uif;
  }
}
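/*
 * Usage sketch (illustration, not part of the original source): callers such as
 * FacetFieldProcessorByArrayUIF obtain an instance through the field value
 * cache rather than constructing one per request; "category" is a hypothetical
 * multi-valued field name.
 *
 *   UnInvertedField uif = UnInvertedField.getUnInvertedField("category", searcher);
 *   int nTerms = uif.getNumTerms(); // number of distinct terms un-inverted for the field
 */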