All downloads are free. Search and download functionality uses the official Maven repository.

solutions.siren.join.action.terms.collector.IntegerTermsSet Maven / Gradle / Ivy

The newest version!
/**
 * Copyright (c) 2016, SIREn Solutions. All Rights Reserved.
 *
 * This file is part of the SIREn project.
 *
 * SIREn is a free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as
 * published by the Free Software Foundation, either version 3 of
 * the License, or (at your option) any later version.
 *
 * SIREn is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public
 * License along with this program. If not, see <http://www.gnu.org/licenses/>.
 */
package solutions.siren.join.action.terms.collector;

import com.carrotsearch.hppc.BufferAllocationException;
import com.carrotsearch.hppc.IntHashSet;
import com.carrotsearch.hppc.IntScatterSet;
import com.carrotsearch.hppc.cursors.IntCursor;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.common.breaker.CircuitBreaker;
import org.elasticsearch.common.io.stream.StreamInput;
import org.elasticsearch.common.io.stream.StreamOutput;
import org.elasticsearch.common.logging.ESLogger;
import org.elasticsearch.common.logging.Loggers;
import solutions.siren.join.action.terms.TermsByQueryRequest;
import solutions.siren.join.common.Bytes;
import solutions.siren.join.common.Math;

import java.io.IOException;
import java.util.Iterator;

public class IntegerTermsSet extends NumericTermsSet {

  private transient IntHashSet set;

  /**
   * The size of the header: four bytes for the terms encoding ordinal,
   * 1 byte for the {@link #isPruned} flag, and four bytes for the size.
   */
  private static final int HEADER_SIZE = 9;

  private static final ESLogger logger = Loggers.getLogger(IntegerTermsSet.class);

  public IntegerTermsSet(final CircuitBreaker breaker) {
    super(breaker);
  }

  public IntegerTermsSet(final long expectedElements, final CircuitBreaker breaker) {
    super(breaker);
    this.set = new CircuitBreakerIntHashSet(Math.toIntExact(expectedElements));
  }

  /**
   * Constructor based on a byte array containing the encoded set of terms.
   * Used in {@link solutions.siren.join.index.query.FieldDataTermsQuery}.
   */
  public IntegerTermsSet(BytesRef bytes) {
    super(null);
    this.readFromBytes(bytes);
  }

  @Override
  public void add(long term) {
    this.set.add((int) term);
  }

  @Override
  public boolean contains(long term) {
    return this.set.contains((int) term);
  }

  @Override
  protected void addAll(TermsSet terms) {
    if (!(terms instanceof IntegerTermsSet)) {
      throw new UnsupportedOperationException("Invalid type: IntegerTermsSet expected.");
    }
    this.set.addAll(((IntegerTermsSet) terms).set);
  }

  @Override
  public int size() {
    return this.set.size();
  }

  @Override
  public void readFrom(StreamInput in) throws IOException {
    this.setIsPruned(in.readBoolean());
    int size = in.readInt();
    set = new CircuitBreakerIntHashSet(size);
    for (long i = 0; i < size; i++) {
      set.add(in.readVInt());
    }
  }

  /**
   * Serialize the list of terms to the {@link StreamOutput}.
   * 
* Given the low performance of {@link org.elasticsearch.common.io.stream.BytesStreamOutput} when writing a large number * of longs (5 to 10 times slower than writing directly to a byte[]), we use a small buffer of 8kb * to optimise the throughput. 8kb seems to be the optimal buffer size, larger buffer size did not improve * the throughput. * * @param out the output */ @Override public void writeTo(StreamOutput out) throws IOException { // Encode flag out.writeBoolean(this.isPruned()); // Encode size of list out.writeInt(set.size()); // Encode ints BytesRef buffer = new BytesRef(new byte[1024 * 8]); Iterator it = set.iterator(); while (it.hasNext()) { Bytes.writeVInt(buffer, it.next().value); if (buffer.offset > buffer.bytes.length - 5) { out.write(buffer.bytes, 0, buffer.offset); buffer.offset = 0; } } // flush the remaining bytes from the buffer out.write(buffer.bytes, 0, buffer.offset); } @Override public BytesRef writeToBytes() { long start = System.nanoTime(); int size = set.size(); BytesRef bytesRef = new BytesRef(new byte[HEADER_SIZE + size * 5]); // Encode encoding type Bytes.writeInt(bytesRef, this.getEncoding().ordinal()); // Encode flag bytesRef.bytes[bytesRef.offset++] = (byte) (this.isPruned() ? 1 : 0); // Encode size of list Bytes.writeInt(bytesRef, size); // Encode ints for (IntCursor i : set) { Bytes.writeVInt(bytesRef, i.value); } logger.debug("Serialized {} terms - took {} ms", this.size(), (System.nanoTime() - start) / 1000000); bytesRef.length = bytesRef.offset; bytesRef.offset = 0; return bytesRef; } private void readFromBytes(BytesRef bytesRef) { // Read pruned flag this.setIsPruned(bytesRef.bytes[bytesRef.offset++] == 1 ? 
true : false); // Read size fo the set int size = Bytes.readInt(bytesRef); // Read terms // Scatter set is slightly more efficient than the hash set, but should be used only for lookups, // not for merging set = new IntScatterSet(size); for (int i = 0; i < size; i++) { set.add(Bytes.readVInt(bytesRef)); } } @Override public TermsByQueryRequest.TermsEncoding getEncoding() { return TermsByQueryRequest.TermsEncoding.INTEGER; } @Override public void release() { if (set != null) { set.release(); } } /** * A {@link IntHashSet} integrated with the {@link CircuitBreaker}. It will adjust the circuit breaker * for every new call to {@link #allocateBuffers(int)}. *

* This set must not be reused after a call to {@link #release()}. */ private final class CircuitBreakerIntHashSet extends IntHashSet { public CircuitBreakerIntHashSet(int expectedElements) { super(expectedElements); } @Override protected void allocateBuffers(int arraySize) { long newMemSize = (arraySize + 1) * 4l; // array size + emtpyElementSlot long oldMemSize = keys == null ? 0 : keys.length * 4l; // Adjust the breaker with the new memory size breaker.addEstimateBytesAndMaybeBreak(newMemSize, ""); try { // Allocate the new buffer super.allocateBuffers(arraySize); // Adjust the breaker by removing old memory size breaker.addWithoutBreaking(-oldMemSize); } catch (BufferAllocationException e) { // If the allocation failed, remove breaker.addWithoutBreaking(-newMemSize); } } @Override public void release() { long memSize = keys == null ? 0 : keys.length * 4l; // Release - do not allocate a new minimal buffer assigned = 0; hasEmptyKey = false; keys = null; // Adjust breaker breaker.addWithoutBreaking(-memSize); } } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy