// org.elasticsearch.index.fielddata.plain.SortedSetDVAtomicFieldData Maven / Gradle / Ivy

/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.elasticsearch.index.fielddata.plain;

import org.apache.lucene.index.AtomicReader;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LongsRef;
import org.elasticsearch.ElasticsearchIllegalStateException;
import org.elasticsearch.common.util.BigArrays;
import org.elasticsearch.common.util.IntArray;
import org.elasticsearch.index.fielddata.AtomicFieldData;
import org.elasticsearch.index.fielddata.BytesValues;
import org.elasticsearch.index.fielddata.ordinals.Ordinals;

import java.io.IOException;

/**
 * {@link AtomicFieldData} impl based on Lucene's {@link SortedSetDocValues}.
 * Implementation note: Lucene uses -1 as the ordinal for unset values whereas Elasticsearch uses 0, which is why
 * ordinals are shifted by +1 throughout this class when translating from Lucene's ordinals to Elasticsearch's.
 */
abstract class SortedSetDVAtomicFieldData {

    /** Reader that the sorted-set doc values are loaded from. */
    private final AtomicReader reader;
    /** Name of the field whose doc values back this instance. */
    private final String field;
    /** Table of per-ordinal hash codes; stays {@code null} until {@code getHashedBytesValues()} builds it. */
    private volatile IntArray hashes;

    /**
     * Remembers the reader and the field name; nothing is loaded at construction time.
     *
     * @param reader reader to pull the doc values from
     * @param field  name of the field to expose
     */
    SortedSetDVAtomicFieldData(AtomicReader reader, String field) {
        this.field = field;
        this.reader = reader;
    }

    /**
     * Tells whether a document may hold more than one value for this field.
     *
     * @return always {@code true}
     */
    public boolean isMultiValued() {
        // The exact answer could be computed on first load and cached, but that would defeat the purpose of doc values,
        // which is to make loading faster -- so multiple values are always assumed.
        return true;
    }

    /**
     * @return the {@code maxDoc()} of the underlying reader
     */
    public int getNumDocs() {
        final int maxDoc = reader.maxDoc();
        return maxDoc;
    }

    /**
     * @return the value count reported by the field's {@link SortedSetDocValues}
     */
    public long getNumberUniqueValues() {
        return getValuesNoException(reader, field).getValueCount();
    }

    /**
     * @return always {@code -1}, because the memory usage cannot be determined
     */
    public long getMemorySizeInBytes() {
        // No API exposes memory usage per field, and RamUsageEstimator is of no help either: a per-field instance often
        // holds references to all the other instances handled by the same format.
        final long unknownSize = -1L;
        return unknownSize;
    }

    /**
     * Does nothing.
     */
    public void close() {
        // intentionally empty
    }

    /**
     * Creates a new ordinal-aware view over the field's values.
     *
     * @param needsHashes not consulted by this implementation; the same kind of view is returned either way
     * @return a fresh {@link SortedSetValues} over the field's doc values
     */
    public org.elasticsearch.index.fielddata.BytesValues.WithOrdinals getBytesValues(boolean needsHashes) {
        return new SortedSetValues(reader, field, getValuesNoException(reader, field));
    }

    /**
     * Creates a view over the field's values that answers {@code currentValueHash()} from a table of precomputed hash
     * codes. The table is built on the first call and reused afterwards.
     * <p>
     * Slot 0 of the table holds the hash code of a freshly created {@link BytesRef} into which no value has been loaded;
     * slot {@code 1 + ord} holds the hash code of the value looked up for Lucene ordinal {@code ord}.
     *
     * @return a fresh {@link SortedSetHashedValues} sharing the hash table
     */
    public org.elasticsearch.index.fielddata.BytesValues.WithOrdinals getHashedBytesValues() {
        final SortedSetDocValues values = getValuesNoException(reader, field);
        IntArray valueHashes = hashes;
        if (valueHashes == null) {
            synchronized (this) {
                // re-read under the lock: another thread may have published the table in the meantime
                valueHashes = hashes;
                if (valueHashes == null) {
                    final long valueCount = values.getValueCount();
                    valueHashes = BigArrays.newIntArray(1L + valueCount);
                    final BytesRef scratch = new BytesRef(16);
                    valueHashes.set(0, scratch.hashCode());
                    long ord = 0;
                    while (ord < valueCount) {
                        values.lookupOrd(ord, scratch);
                        valueHashes.set(1L + ord, scratch.hashCode());
                        ++ord;
                    }
                    // publish only once the table is completely filled
                    hashes = valueHashes;
                }
            }
        }
        return new SortedSetHashedValues(reader, field, values, valueHashes);
    }

    /**
     * Loads the sorted-set doc values of {@code field} from {@code reader} without exposing a checked exception.
     *
     * @return the reader's doc values, or {@link SortedSetDocValues#EMPTY} when the reader returns {@code null}
     * @throws ElasticsearchIllegalStateException wrapping any {@link IOException} raised while loading
     */
    private static SortedSetDocValues getValuesNoException(AtomicReader reader, String field) {
        try {
            final SortedSetDocValues loaded = reader.getSortedSetDocValues(field);
            if (loaded != null) {
                return loaded;
            }
            // Nothing was returned: this field has not been populated
            assert reader.getFieldInfos().fieldInfo(field) == null;
            return SortedSetDocValues.EMPTY;
        } catch (IOException e) {
            throw new ElasticsearchIllegalStateException("Couldn't load doc values", e);
        }
    }

    /**
     * {@link BytesValues.WithOrdinals} view that resolves Elasticsearch ordinals to values through a
     * {@link SortedSetDocValues} instance. Elasticsearch ordinals are one greater than the Lucene ordinals passed to
     * {@code lookupOrd}.
     */
    static class SortedSetValues extends BytesValues.WithOrdinals {

        protected final SortedSetDocValues values;

        SortedSetValues(AtomicReader reader, String field, SortedSetDocValues values) {
            super(newDocs(reader, field, values));
            this.values = values;
        }

        /** Builds the per-document ordinal iterator handed to the super constructor. */
        private static SortedSetDocs newDocs(AtomicReader reader, String field, SortedSetDocValues values) {
            final SortedSetOrdinals ords = new SortedSetOrdinals(reader, field, values.getValueCount());
            return new SortedSetDocs(ords, values);
        }

        /**
         * Looks up the value for the given Elasticsearch ordinal into the scratch and returns the scratch.
         *
         * @param ord an ordinal other than {@link Ordinals#MISSING_ORDINAL}
         */
        @Override
        public BytesRef getValueByOrd(long ord) {
            assert ord != Ordinals.MISSING_ORDINAL;
            final long luceneOrd = ord - 1;
            values.lookupOrd(luceneOrd, scratch);
            return scratch;
        }

        /**
         * Advances the ordinal iterator and looks up the corresponding value into the scratch.
         */
        @Override
        public BytesRef nextValue() {
            final long luceneOrd = ordinals.nextOrd() - 1;
            values.lookupOrd(luceneOrd, scratch);
            return scratch;
        }
    }

    /**
     * {@link SortedSetValues} variant that serves value hashes from a precomputed table indexed by Elasticsearch ordinal.
     */
    static final class SortedSetHashedValues extends SortedSetValues {

        /** Hash code table, indexed by Elasticsearch ordinal. */
        private final IntArray hashes;

        SortedSetHashedValues(AtomicReader reader, String field, SortedSetDocValues values, IntArray hashes) {
            super(reader, field, values);
            this.hashes = hashes;
        }

        /**
         * @return the table entry stored for the ordinal iterator's current ordinal
         */
        @Override
        public int currentValueHash() {
            final long currentOrd = ordinals.currentOrd();
            assert currentOrd >= 0;
            return hashes.get(currentOrd);
        }
    }

    /**
     * {@link Ordinals} implementation that keeps only the reader, the field name and the number of ordinals, and loads
     * the doc values again each time {@link #ordinals()} is called.
     */
    static final class SortedSetOrdinals implements Ordinals {

        // Ordinals must be thread-safe, which is why no SortedSetDocValues instance is kept as a member
        private final AtomicReader reader;
        private final String field;
        private final long numOrds;

        public SortedSetOrdinals(AtomicReader reader, String field, long numOrds) {
            this.numOrds = numOrds;
            this.field = field;
            this.reader = reader;
        }

        /**
         * @return always {@code -1}
         */
        @Override
        public long getMemorySizeInBytes() {
            // The ordinals cannot be told apart from the atomic field data instance
            return -1;
        }

        /**
         * @return always {@code true}
         */
        @Override
        public boolean isMultiValued() {
            return true;
        }

        /**
         * @return the {@code maxDoc()} of the underlying reader
         */
        @Override
        public int getNumDocs() {
            final int maxDoc = reader.maxDoc();
            return maxDoc;
        }

        /**
         * @return the number of ordinals given at construction time
         */
        @Override
        public long getNumOrds() {
            return numOrds;
        }

        /**
         * @return the number of ordinals plus one
         */
        @Override
        public long getMaxOrd() {
            return numOrds + 1;
        }

        /**
         * Loads the field's doc values and wraps them into a new per-document ordinal iterator.
         */
        @Override
        public Docs ordinals() {
            final SortedSetDocValues docValues = getValuesNoException(reader, field);
            assert docValues.getValueCount() == numOrds;
            return new SortedSetDocs(this, docValues);
        }

    }

    /**
     * {@link Ordinals.Docs} implementation on top of {@link SortedSetDocValues}. {@link #setDocument(int)} copies all
     * ordinals of the document, shifted by one, into a scratch buffer; {@link #nextOrd()} then hands them out one at a
     * time.
     */
    static class SortedSetDocs implements Ordinals.Docs {

        private final SortedSetOrdinals ordinals;
        private final SortedSetDocValues values;
        /** Buffer holding the shifted ordinals of the document most recently passed to {@code getOrds}. */
        private final LongsRef longScratch;
        /** Read position inside the buffer; starts out of range until {@code setDocument} resets it. */
        private int ordIndex = Integer.MAX_VALUE;
        /** Last ordinal returned by {@code getOrd} or {@code nextOrd}. */
        private long currentOrdinal = -1;

        SortedSetDocs(SortedSetOrdinals ordinals, SortedSetDocValues values) {
            this.longScratch = new LongsRef(8);
            this.values = values;
            this.ordinals = ordinals;
        }

        @Override
        public Ordinals ordinals() {
            return ordinals;
        }

        @Override
        public int getNumDocs() {
            final int numDocs = ordinals.getNumDocs();
            return numDocs;
        }

        @Override
        public long getNumOrds() {
            final long numOrds = ordinals.getNumOrds();
            return numOrds;
        }

        @Override
        public long getMaxOrd() {
            final long maxOrd = ordinals.getMaxOrd();
            return maxOrd;
        }

        @Override
        public boolean isMultiValued() {
            final boolean multiValued = ordinals.isMultiValued();
            return multiValued;
        }

        /**
         * Positions the doc values on {@code docId} and returns one plus the first ordinal they report, also recording
         * it as the current ordinal.
         */
        @Override
        public long getOrd(int docId) {
            values.setDocument(docId);
            currentOrdinal = 1 + values.nextOrd();
            return currentOrdinal;
        }

        /**
         * Collects every ordinal of {@code docId}, each shifted by one, into the scratch buffer.
         *
         * @return the scratch buffer, which is overwritten by the next call
         */
        @Override
        public LongsRef getOrds(int docId) {
            values.setDocument(docId);
            longScratch.length = 0;
            longScratch.offset = 0;
            long luceneOrd;
            while ((luceneOrd = values.nextOrd()) != SortedSetDocValues.NO_MORE_ORDS) {
                // make room for one more entry before appending
                longScratch.longs = ArrayUtil.grow(longScratch.longs, longScratch.length + 1);
                longScratch.longs[longScratch.length] = 1 + luceneOrd;
                longScratch.length++;
            }
            return longScratch;
        }

        /**
         * Returns the next buffered ordinal and records it as the current one.
         */
        @Override
        public long nextOrd() {
            assert ordIndex < longScratch.length;
            final long next = longScratch.longs[ordIndex++];
            currentOrdinal = next;
            return next;
        }

        /**
         * Buffers the ordinals of {@code docId} and rewinds the read position.
         *
         * @return the number of ordinals buffered for the document
         */
        @Override
        public int setDocument(int docId) {
            // All ords are consumed up front and handed to the iterator rather than being streamed: Lucene's SORTED_SET
            // doc values are cached per thread, so a fully independent instance cannot be obtained for now
            final LongsRef buffered = getOrds(docId);
            ordIndex = 0;
            return buffered.length;
        }

        @Override
        public long currentOrd() {
            return currentOrdinal;
        }
    }
}