All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.elasticsearch.index.fielddata.ordinals.OrdinalsBuilder Maven / Gradle / Ivy

There is a newer version: 8.14.1
Show newest version
package org.elasticsearch.index.fielddata.ordinals;
/*
 * Licensed to ElasticSearch and Shay Banon under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. ElasticSearch licenses this
 * file to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.FilteredTermsEnum;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.*;
import org.apache.lucene.util.IntBlockPool.Allocator;
import org.apache.lucene.util.IntBlockPool.DirectAllocator;
import org.apache.lucene.util.packed.GrowableWriter;
import org.apache.lucene.util.packed.PackedInts;
import org.elasticsearch.ElasticSearchIllegalArgumentException;
import org.elasticsearch.common.settings.Settings;

import java.io.Closeable;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Comparator;

/**
 * Simple class to build document ID <-> ordinal mapping. Note: Ordinals are
 * 1 based monotonically increasing positive integers. 0
 * denotes the missing value in this context.
 * <p>
 * The builder starts out in a single-valued mode backed by a packed
 * {@link GrowableWriter}. As soon as a document receives a second ordinal it
 * switches to a multi-valued mode backed by an {@code int[]} plus slices in
 * an {@link IntBlockPool}.
 */
public final class OrdinalsBuilder implements Closeable {

    private final int maxDoc;

    /**
     * Per-document state in multi-valued mode ({@code null} while single-valued).
     * <ul>
     * <li>{@code 0}: the document has no ordinal</li>
     * <li>{@code > 0}: the document's only ordinal</li>
     * <li>{@code < 0}: {@code -(sliceStart) - 1}, where {@code sliceStart} is the
     * start offset of the document's slice in {@link #pool}</li>
     * </ul>
     */
    private int[] mvOrds;

    /** Per-document ordinal in single-valued mode; set to {@code null} once the builder turns multi-valued. */
    private GrowableWriter svOrds;

    /** Current end offset of each document's slice in {@link #pool}; allocated lazily on the first multi-valued doc. */
    private int[] offsets;
    private final IntBlockPool pool;
    private final IntBlockPool.SliceWriter writer;
    /** Shared, reused result holder for {@link #docOrds(int)}. */
    private final IntsRef intsRef = new IntsRef(1);
    private final IntBlockPool.SliceReader reader;
    private int currentOrd = 0;
    private int numDocsWithValue = 0;
    private int numMultiValuedDocs = 0;
    private int totalNumOrds = 0;

    /**
     * Creates a new builder.
     *
     * @param terms                 the terms whose {@link Terms#size()} is used to pre-size the packed
     *                              single-valued storage; only accessed if {@code preDefineBitsRequired} is true
     * @param preDefineBitsRequired if true the initial number of bits per value is derived from the number of terms
     * @param maxDoc                the number of documents this builder can hold ordinals for
     * @param allocator             the allocator backing the internal {@link IntBlockPool}
     */
    public OrdinalsBuilder(Terms terms, boolean preDefineBitsRequired, int maxDoc, Allocator allocator) throws IOException {
        this.maxDoc = maxDoc;
        // TODO: Make configurable...
        float acceptableOverheadRatio = PackedInts.FAST;
        int startBitsPerValue = 1;
        if (preDefineBitsRequired) {
            // Keep the term count as a long: narrowing to int before the check could wrap a
            // large count into a negative value other than -1.
            final long numTerms = terms.size();
            if (numTerms >= 0) {
                startBitsPerValue = PackedInts.bitsRequired(numTerms);
            }
            // a negative size means the number of terms is unknown - start with 1 bit and let the writer grow
        }
        svOrds = new GrowableWriter(startBitsPerValue, maxDoc, acceptableOverheadRatio);
        pool = new IntBlockPool(allocator);
        reader = new IntBlockPool.SliceReader(pool);
        writer = new IntBlockPool.SliceWriter(pool);
    }

    /**
     * Creates a new builder without pre-sizing and with a {@link DirectAllocator}.
     */
    public OrdinalsBuilder(int maxDoc) throws IOException {
        this(null, false, maxDoc);
    }

    /**
     * Creates a new builder with a {@link DirectAllocator}.
     */
    public OrdinalsBuilder(Terms terms, boolean preDefineBitsRequired, int maxDoc) throws IOException {
        this(terms, preDefineBitsRequired, maxDoc, new DirectAllocator());
    }

    /**
     * Creates a new builder that pre-sizes its storage from the given terms and uses a {@link DirectAllocator}.
     */
    public OrdinalsBuilder(Terms terms, int maxDoc) throws IOException {
        this(terms, true, maxDoc, new DirectAllocator());
    }

    /**
     * Advances the {@link OrdinalsBuilder} to the next ordinal and
     * return the current ordinal.
     */
    public int nextOrdinal() {
        return ++currentOrd;
    }

    /**
     * Returns the current ordinal or 0 if this build has not been advanced via
     * {@link #nextOrdinal()}.
     */
    public int currentOrdinal() {
        return currentOrd;
    }

    /**
     * Associates the given document id with the current ordinal.
     *
     * @return this builder, to allow chaining
     */
    public OrdinalsBuilder addDoc(int doc) {
        totalNumOrds++;
        if (svOrds != null) {
            int docsOrd = (int) svOrds.get(doc);
            if (docsOrd == 0) {
                svOrds.set(doc, currentOrd);
                numDocsWithValue++;
            } else {
                // Rebuilding ords that supports mv based on sv ords.
                mvOrds = new int[maxDoc];
                for (int docId = 0; docId < maxDoc; docId++) {
                    mvOrds[docId] = (int) svOrds.get(docId);
                }
                svOrds = null;
                // fall through: the multi-valued branch below records the second ordinal for this doc
            }
        }

        if (mvOrds != null) {
            int docsOrd = mvOrds[doc];
            if (docsOrd == 0) {
                // first ordinal for this doc - store it inline
                mvOrds[doc] = currentOrd;
                numDocsWithValue++;
            } else if (docsOrd > 0) {
                // second ordinal for this doc - move the inline ordinal into a new slice
                numMultiValuedDocs++;
                int offset = writer.startNewSlice();
                writer.writeInt(docsOrd);
                writer.writeInt(currentOrd);
                if (offsets == null) {
                    offsets = new int[mvOrds.length];
                }
                offsets[doc] = writer.getCurrentOffset();
                // encode the slice start as a negative number so it can be told apart from an inline ordinal
                mvOrds[doc] = (-1 * offset) - 1;
            } else {
                // third or later ordinal - append to the existing slice
                assert offsets != null;
                writer.reset(offsets[doc]);
                writer.writeInt(currentOrd);
                offsets[doc] = writer.getCurrentOffset();
            }
        }
        return this;
    }

    /**
     * Returns true iff this builder contains a document ID that is associated with more than one ordinal. Otherwise false;
     */
    public boolean isMultiValued() {
        return offsets != null;
    }

    /**
     * Returns the number of distinct document IDs with one or more values.
     */
    public int getNumDocsWithValue() {
        return numDocsWithValue;
    }

    /**
     * Returns the number of distinct document IDs associated with exactly one value.
     */
    public int getNumSingleValuedDocs() {
        return numDocsWithValue - numMultiValuedDocs;
    }

    /**
     * Returns the number of distinct document IDs associated with two or more values.
     */
    public int getNumMultiValuesDocs() {
        return numMultiValuedDocs;
    }

    /**
     * Returns the number of document ID to ordinal pairs in this builder.
     */
    public int getTotalNumOrds() {
        return totalNumOrds;
    }

    /**
     * Returns the number of distinct ordinals in this builder.
     */
    public int getNumOrds() {
        return currentOrd;
    }

    /**
     * Builds a {@link FixedBitSet} in which the bit of every document that has one or more ordinals
     * associated with it is set. If every document has an ordinal associated with it this method
     * returns <code>null</code>.
     */
    public FixedBitSet buildDocsWithValuesSet() {
        if (numDocsWithValue == maxDoc) {
            return null;
        }
        final FixedBitSet bitSet = new FixedBitSet(maxDoc);
        if (svOrds != null) {
            for (int docId = 0; docId < maxDoc; docId++) {
                int ord = (int) svOrds.get(docId);
                if (ord != 0) {
                    bitSet.set(docId);
                }
            }
        } else {
            for (int docId = 0; docId < maxDoc; docId++) {
                // both inline ordinals (> 0) and slice pointers (< 0) mean the doc has a value
                if (mvOrds[docId] != 0) {
                    bitSet.set(docId);
                }
            }
        }
        return bitSet;
    }

    /**
     * Builds an {@link Ordinals} instance from the builders current state.
     * <p>
     * If no document has more than one ordinal a {@link SinglePackedOrdinals} is returned. Otherwise
     * the <code>multi_ordinals</code> setting (default <code>sparse</code>) selects between
     * <code>flat</code> and <code>sparse</code> multi-valued implementations.
     *
     * @throws ElasticSearchIllegalArgumentException if <code>multi_ordinals</code> has an unsupported value
     */
    public Ordinals build(Settings settings) {
        if (numMultiValuedDocs == 0) {
            return new SinglePackedOrdinals(svOrds.getMutable(), getNumOrds());
        }
        final String multiOrdinals = settings.get("multi_ordinals", "sparse");
        if ("flat".equals(multiOrdinals)) {
            // one int[maxDoc] per "level": level k holds the (k+1)-th ordinal of every document
            final ArrayList<int[]> ordinalBuffer = new ArrayList<int[]>();
            for (int i = 0; i < mvOrds.length; i++) {
                final IntsRef docOrds = docOrds(i);
                while (ordinalBuffer.size() < docOrds.length) {
                    ordinalBuffer.add(new int[mvOrds.length]);
                }

                // index the level relative to the ref's offset so this stays correct for non-zero offsets
                for (int level = 0; level < docOrds.length; level++) {
                    ordinalBuffer.get(level)[i] = docOrds.ints[docOrds.offset + level];
                }
            }
            int[][] nativeOrdinals = new int[ordinalBuffer.size()][];
            for (int i = 0; i < nativeOrdinals.length; i++) {
                nativeOrdinals[i] = ordinalBuffer.get(i);
            }
            return new MultiFlatArrayOrdinals(nativeOrdinals, getNumOrds());
        } else if ("sparse".equals(multiOrdinals)) {
            int multiOrdinalsMaxDocs = settings.getAsInt("multi_ordinals_max_docs", 16777216 /* Equal to 64MB per storage array */);
            return new SparseMultiArrayOrdinals(this, multiOrdinalsMaxDocs);
        } else {
            throw new ElasticSearchIllegalArgumentException("no applicable fielddata multi_ordinals value, got [" + multiOrdinals + "]");
        }
    }

    /**
     * Returns a shared {@link IntsRef} instance for the given doc ID holding all ordinals associated with it.
     * The returned instance is reused and overwritten by the next call to this method.
     */
    public IntsRef docOrds(int doc) {
        if (svOrds != null) {
            int docsOrd = (int) svOrds.get(doc);
            intsRef.offset = 0;
            if (docsOrd == 0) {
                intsRef.length = 0;
            } else if (docsOrd > 0) {
                intsRef.ints[0] = docsOrd;
                intsRef.length = 1;
            }
        } else {
            int docsOrd = mvOrds[doc];
            intsRef.offset = 0;
            if (docsOrd == 0) {
                intsRef.length = 0;
            } else if (docsOrd > 0) {
                intsRef.ints[0] = mvOrds[doc];
                intsRef.length = 1;
            } else {
                assert offsets != null;
                // decode the slice start from the negative marker and read up to the recorded end offset
                reader.reset(-1 * (mvOrds[doc] + 1), offsets[doc]);
                int pos = 0;
                while (!reader.endOfSlice()) {
                    if (intsRef.ints.length <= pos) {
                        intsRef.ints = ArrayUtil.grow(intsRef.ints, pos + 1);
                    }
                    intsRef.ints[pos++] = reader.readInt();
                }
                intsRef.length = pos;
            }
        }
        return intsRef;
    }

    /**
     * Returns the maximum document ID this builder can associate with an ordinal
     */
    public int maxDoc() {
        return maxDoc;
    }

    /**
     * A {@link TermsEnum} that iterates only full precision prefix coded 64 bit values.
     * @see #buildFromTerms(TermsEnum, Bits)
     */
    public TermsEnum wrapNumeric64Bit(TermsEnum termsEnum) {
        return new FilteredTermsEnum(termsEnum, false) {
            @Override
            protected AcceptStatus accept(BytesRef term) throws IOException {
                // we stop accepting terms once we moved across the prefix codec terms - redundant values!
                return NumericUtils.getPrefixCodedLongShift(term) == 0 ? AcceptStatus.YES : AcceptStatus.END;
            }
        };
    }

    /**
     * A {@link TermsEnum} that iterates only full precision prefix coded 32 bit values.
     * @see #buildFromTerms(TermsEnum, Bits)
     */
    public TermsEnum wrapNumeric32Bit(TermsEnum termsEnum) {
        return new FilteredTermsEnum(termsEnum, false) {

            @Override
            protected AcceptStatus accept(BytesRef term) throws IOException {
                // we stop accepting terms once we moved across the prefix codec terms - redundant values!
                return NumericUtils.getPrefixCodedIntShift(term) == 0 ? AcceptStatus.YES : AcceptStatus.END;
            }
        };
    }

    /**
     * This method iterates all terms in the given {@link TermsEnum} and
     * associates each terms ordinal with the terms documents. The caller must
     * exhaust the returned {@link BytesRefIterator} which returns all values
     * where the first returned value is associated with the ordinal <tt>1</tt>
     * etc.
     * <p>
     * If the {@link TermsEnum} contains prefix coded numerical values the terms
     * enum should be wrapped with either {@link #wrapNumeric32Bit(TermsEnum)}
     * or {@link #wrapNumeric64Bit(TermsEnum)} depending on its precision. If
     * the {@link TermsEnum} is not wrapped the returned
     * {@link BytesRefIterator} will contain partial precision terms rather than
     * only full-precision terms.
     * </p>
     */
    public BytesRefIterator buildFromTerms(final TermsEnum termsEnum, final Bits liveDocs) throws IOException {
        return new BytesRefIterator() {
            // reused across terms
            private DocsEnum docsEnum = null;

            @Override
            public BytesRef next() throws IOException {
                BytesRef ref;
                if ((ref = termsEnum.next()) != null) {
                    docsEnum = termsEnum.docs(liveDocs, docsEnum, DocsEnum.FLAG_NONE);
                    // every term gets its own ordinal, all of its documents are associated with it
                    nextOrdinal();
                    int docId;
                    while ((docId = docsEnum.nextDoc()) != DocsEnum.NO_MORE_DOCS) {
                        addDoc(docId);
                    }
                }
                return ref;
            }

            @Override
            public Comparator<BytesRef> getComparator() {
                return termsEnum.getComparator();
            }
        };
    }

    /**
     * Closes this builder and release all resources.
     */
    @Override
    public void close() throws IOException {
        pool.reset(true, false);
        offsets = null;
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy