org.apache.lucene.index.FieldUpdatesBuffer Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of org.apache.servicemix.bundles.lucene
This OSGi bundle wraps ${pkgArtifactId} ${pkgVersion} jar file.
There is a newer version: 6.4.2_1
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.lucene.index;

import java.io.IOException;
import java.util.Arrays;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefArray;
import org.apache.lucene.util.BytesRefComparator;
import org.apache.lucene.util.BytesRefIterator;
import org.apache.lucene.util.Counter;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.RamUsageEstimator;

/**
 * This class efficiently buffers numeric and binary field updates and stores terms, values and
 * metadata in a memory efficient way without creating large amounts of objects. Update terms are
 * stored without de-duplicating the update term. In general we try to optimize for several
 * use-cases. For instance we try to use constant space for update terms field since the common case
 * always updates on the same field. Also for docUpTo we try to optimize for the case when updates
 * should be applied to all docs ie. docUpTo=Integer.MAX_VALUE. In other cases each update will
 * likely have a different docUpTo. Along the same lines this impl optimizes the case when all
 * updates have a value. Lastly, if all updates share the same value for a numeric field we only
 * store the value once.
 */
final class FieldUpdatesBuffer {
  private static final long SELF_SHALLOW_SIZE =
      RamUsageEstimator.shallowSizeOfInstance(FieldUpdatesBuffer.class);
  private static final long STRING_SHALLOW_SIZE =
      RamUsageEstimator.shallowSizeOfInstance(String.class);
  private final Counter bytesUsed;
  private int numUpdates = 1;
  // we use a very simple approach and store the update term values without de-duplication
  // which is also not a common case to keep updating the same value more than once...
  // we might pay a higher price in terms of memory in certain cases but will gain
  // on CPU for those. We also use a stable sort to sort in order to apply the terms in order
  // since by definition we store them in order.
  private final BytesRefArray termValues;
  private BytesRefArray.SortState termSortState;
  private final BytesRefArray byteValues; // this will be null if we are buffering numerics
  private int[] docsUpTo;
  private long[] numericValues; // this will be null if we are buffering binaries
  private FixedBitSet hasValues;
  private long maxNumeric = Long.MIN_VALUE;
  private long minNumeric = Long.MAX_VALUE;
  private String[] fields;
  private final boolean isNumeric;
  private boolean finished = false;

  private FieldUpdatesBuffer(
      Counter bytesUsed, DocValuesUpdate initialValue, int docUpTo, boolean isNumeric) {
    this.bytesUsed = bytesUsed;
    this.bytesUsed.addAndGet(SELF_SHALLOW_SIZE);
    termValues = new BytesRefArray(bytesUsed);
    termValues.append(initialValue.term.bytes);
    fields = new String[] {initialValue.term.field};
    bytesUsed.addAndGet(sizeOfString(initialValue.term.field));
    docsUpTo = new int[] {docUpTo};
    if (initialValue.hasValue == false) {
      hasValues = new FixedBitSet(1);
      bytesUsed.addAndGet(hasValues.ramBytesUsed());
    }
    this.isNumeric = isNumeric;
    byteValues = isNumeric ? null : new BytesRefArray(bytesUsed);
  }

  private static long sizeOfString(String string) {
    return STRING_SHALLOW_SIZE + (string.length() * (long) Character.BYTES);
  }

  FieldUpdatesBuffer(
      Counter bytesUsed, DocValuesUpdate.NumericDocValuesUpdate initialValue, int docUpTo) {
    this(bytesUsed, initialValue, docUpTo, true);
    if (initialValue.hasValue()) {
      numericValues = new long[] {initialValue.getValue()};
      maxNumeric = minNumeric = initialValue.getValue();
    } else {
      numericValues = new long[] {0};
    }
    bytesUsed.addAndGet(Long.BYTES);
  }

  FieldUpdatesBuffer(
      Counter bytesUsed, DocValuesUpdate.BinaryDocValuesUpdate initialValue, int docUpTo) {
    this(bytesUsed, initialValue, docUpTo, false);
    if (initialValue.hasValue()) {
      byteValues.append(initialValue.getValue());
    }
  }

  long getMaxNumeric() {
    assert isNumeric;
    if (minNumeric == Long.MAX_VALUE && maxNumeric == Long.MIN_VALUE) {
      return 0; // we don't have any value;
    }
    return maxNumeric;
  }

  long getMinNumeric() {
    assert isNumeric;
    if (minNumeric == Long.MAX_VALUE && maxNumeric == Long.MIN_VALUE) {
      return 0; // we don't have any value
    }
    return minNumeric;
  }

  void add(String field, int docUpTo, int ord, boolean hasValue) {
    assert finished == false : "buffer was finished already";
    if (fields[0].equals(field) == false || fields.length != 1) {
      if (fields.length <= ord) {
        String[] array = ArrayUtil.grow(fields, ord + 1);
        if (fields.length == 1) {
          Arrays.fill(array, 1, ord, fields[0]);
        }
        bytesUsed.addAndGet(
            (array.length - fields.length) * (long) RamUsageEstimator.NUM_BYTES_OBJECT_REF);
        fields = array;
      }
      if (field != fields[0]) { // that's an easy win of not accounting if there is an outlier
        bytesUsed.addAndGet(sizeOfString(field));
      }
      fields[ord] = field;
    }

    if (docsUpTo[0] != docUpTo || docsUpTo.length != 1) {
      if (docsUpTo.length <= ord) {
        int[] array = ArrayUtil.grow(docsUpTo, ord + 1);
        if (docsUpTo.length == 1) {
          Arrays.fill(array, 1, ord, docsUpTo[0]);
        }
        bytesUsed.addAndGet((array.length - docsUpTo.length) * (long) Integer.BYTES);
        docsUpTo = array;
      }
      docsUpTo[ord] = docUpTo;
    }

    if (hasValue == false || hasValues != null) {
      if (hasValues == null) {
        hasValues = new FixedBitSet(ord + 1);
        hasValues.set(0, ord);
        bytesUsed.addAndGet(hasValues.ramBytesUsed());
      } else if (hasValues.length() <= ord) {
        FixedBitSet fixedBitSet =
            FixedBitSet.ensureCapacity(hasValues, ArrayUtil.oversize(ord + 1, 1));
        bytesUsed.addAndGet(fixedBitSet.ramBytesUsed() - hasValues.ramBytesUsed());
        hasValues = fixedBitSet;
      }
      if (hasValue) {
        hasValues.set(ord);
      }
    }
  }

  void addUpdate(Term term, long value, int docUpTo) {
    assert isNumeric;
    final int ord = append(term);
    String field = term.field;
    add(field, docUpTo, ord, true);
    minNumeric = Math.min(minNumeric, value);
    maxNumeric = Math.max(maxNumeric, value);
    if (numericValues[0] != value || numericValues.length != 1) {
      if (numericValues.length <= ord) {
        long[] array = ArrayUtil.grow(numericValues, ord + 1);
        if (numericValues.length == 1) {
          Arrays.fill(array, 1, ord, numericValues[0]);
        }
        bytesUsed.addAndGet((array.length - numericValues.length) * (long) Long.BYTES);
        numericValues = array;
      }
      numericValues[ord] = value;
    }
  }

  void addNoValue(Term term, int docUpTo) {
    final int ord = append(term);
    add(term.field, docUpTo, ord, false);
  }

  void addUpdate(Term term, BytesRef value, int docUpTo) {
    assert isNumeric == false;
    final int ord = append(term);
    byteValues.append(value);
    add(term.field, docUpTo, ord, true);
  }

  private int append(Term term) {
    termValues.append(term.bytes);
    return numUpdates++;
  }

  void finish() {
    if (finished) {
      throw new IllegalStateException("buffer was finished already");
    }
    finished = true;
    final boolean sortedTerms = hasSingleValue() && hasValues == null && fields.length == 1;
    if (sortedTerms) {
      termSortState = termValues.sort(BytesRefComparator.NATURAL, true);
      assert assertTermAndDocInOrder();
      bytesUsed.addAndGet(termSortState.ramBytesUsed());
    }
  }

  private boolean assertTermAndDocInOrder() {
    try {
      BytesRefArray.IndexedBytesRefIterator iterator = termValues.iterator(termSortState);
      BytesRef last = null;
      int lastOrd = -1;
      BytesRef current;
      while ((current = iterator.next()) != null) {
        if (last != null) {
          int cmp = current.compareTo(last);
          assert cmp >= 0 : "term in reverse order";
          assert cmp != 0
                  || docsUpTo[getArrayIndex(docsUpTo.length, lastOrd)]
                      <= docsUpTo[getArrayIndex(docsUpTo.length, iterator.ord())]
              : "doc id in reverse order";
        }
        last = BytesRef.deepCopyOf(current);
        lastOrd = iterator.ord();
      }
    } catch (IOException e) {
      assert false : e.getMessage();
    }
    return true;
  }

  BufferedUpdateIterator iterator() {
    if (finished == false) {
      throw new IllegalStateException("buffer is not finished yet");
    }
    return new BufferedUpdateIterator();
  }

  boolean isNumeric() {
    assert isNumeric || byteValues != null;
    return isNumeric;
  }

  boolean hasSingleValue() {
    // we only do this optimization for numerics so far.
    return isNumeric && numericValues.length == 1;
  }

  long getNumericValue(int idx) {
    if (hasValues != null && hasValues.get(idx) == false) {
      return 0;
    }
    return numericValues[getArrayIndex(numericValues.length, idx)];
  }

  /** Struct like class that is used to iterate over all updates in this buffer */
  static class BufferedUpdate {

    private BufferedUpdate() {}

    /** the max document ID this update should be applied to */
    int docUpTo;

    /** a numeric value or 0 if this buffer holds binary updates */
    long numericValue;

    /** a binary value or null if this buffer holds numeric updates */
    BytesRef binaryValue;

    /** true if this update has a value */
    boolean hasValue;

    /** The update terms field. This will never be null. */
    String termField;

    /** The update terms value. This will never be null. */
    BytesRef termValue;

    @Override
    public int hashCode() {
      throw new UnsupportedOperationException(
          "this struct should not be use in map or other data-structures that use hashCode / equals");
    }

    @Override
    public boolean equals(Object obj) {
      throw new UnsupportedOperationException(
          "this struct should not be use in map or other data-structures that use hashCode / equals");
    }
  }

  /** An iterator that iterates over all updates in insertion order */
  class BufferedUpdateIterator {
    private final BytesRefArray.IndexedBytesRefIterator termValuesIterator;
    private final BytesRefArray.IndexedBytesRefIterator lookAheadTermIterator;
    private final BytesRefIterator byteValuesIterator;
    private final BufferedUpdate bufferedUpdate = new BufferedUpdate();
    private final Bits updatesWithValue;

    BufferedUpdateIterator() {
      this.termValuesIterator = termValues.iterator(termSortState);
      this.lookAheadTermIterator =
          termSortState != null ? termValues.iterator(termSortState) : null;
      this.byteValuesIterator = isNumeric ? null : byteValues.iterator();
      updatesWithValue = hasValues == null ? new Bits.MatchAllBits(numUpdates) : hasValues;
    }

    /**
     * If all updates update a single field to the same value, then we can apply these updates in
     * the term order instead of the request order as both will yield the same result. This
     * optimization allows us to iterate the term dictionary faster and de-duplicate updates.
     */
    boolean isSortedTerms() {
      return termSortState != null;
    }

    /**
     * Moves to the next BufferedUpdate or return null if all updates are consumed. The returned
     * instance is a shared instance and must be fully consumed before the next call to this method.
     */
    BufferedUpdate next() throws IOException {
      BytesRef next = nextTerm();
      if (next != null) {
        final int idx = termValuesIterator.ord();
        bufferedUpdate.termValue = next;
        bufferedUpdate.hasValue = updatesWithValue.get(idx);
        bufferedUpdate.termField = fields[getArrayIndex(fields.length, idx)];
        bufferedUpdate.docUpTo = docsUpTo[getArrayIndex(docsUpTo.length, idx)];
        if (bufferedUpdate.hasValue) {
          if (isNumeric) {
            bufferedUpdate.numericValue = numericValues[getArrayIndex(numericValues.length, idx)];
            bufferedUpdate.binaryValue = null;
          } else {
            bufferedUpdate.binaryValue = byteValuesIterator.next();
          }
        } else {
          bufferedUpdate.binaryValue = null;
          bufferedUpdate.numericValue = 0;
        }
        return bufferedUpdate;
      } else {
        return null;
      }
    }

    private BytesRef nextTerm() throws IOException {
      if (lookAheadTermIterator != null) {
        if (bufferedUpdate.termValue == null) {
          lookAheadTermIterator.next();
        }
        BytesRef lastTerm, aheadTerm;
        do {
          aheadTerm = lookAheadTermIterator.next();
          lastTerm = termValuesIterator.next();
        } while (aheadTerm != null
            // Shortcut to avoid equals, we did a stable sort before, so aheadTerm can only equal
            // lastTerm when aheadTerm has a lager ord.
            && lookAheadTermIterator.ord() > termValuesIterator.ord()
            && aheadTerm.equals(lastTerm));
        return lastTerm;
      } else {
        return termValuesIterator.next();
      }
    }
  }

  private static int getArrayIndex(int arrayLength, int index) {
    assert arrayLength == 1 || arrayLength > index
        : "illegal array index length: " + arrayLength + " index: " + index;
    return Math.min(arrayLength - 1, index);
  }
}