All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.arrow.vector.BaseVariableWidthVector Maven / Gradle / Ivy

The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.arrow.vector;

import static org.apache.arrow.memory.util.LargeMemoryUtil.capAtMaxInt;

import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

import org.apache.arrow.memory.BaseAllocator;
import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.memory.OutOfMemoryException;
import org.apache.arrow.memory.util.ArrowBufPointer;
import org.apache.arrow.memory.util.ByteFunctionHelpers;
import org.apache.arrow.memory.util.hash.ArrowBufHasher;
import org.apache.arrow.util.Preconditions;
import org.apache.arrow.vector.compare.VectorVisitor;
import org.apache.arrow.vector.ipc.message.ArrowFieldNode;
import org.apache.arrow.vector.types.pojo.Field;
import org.apache.arrow.vector.util.CallBack;
import org.apache.arrow.vector.util.OversizedAllocationException;
import org.apache.arrow.vector.util.TransferPair;

import io.netty.buffer.ArrowBuf;

/**
 * BaseVariableWidthVector is a base class providing functionality for strings/bytes types.
 */
public abstract class BaseVariableWidthVector extends BaseValueVector
        implements VariableWidthVector, FieldVector, VectorDefinitionSetter {
  /* default estimate of data bytes per element, used to size the data buffer from a value count */
  private static final int DEFAULT_RECORD_BYTE_COUNT = 8;
  /* data buffer size (in bytes) assumed until an initial capacity is set or memory is allocated */
  private static final int INITIAL_BYTE_COUNT = INITIAL_VALUE_ALLOCATION * DEFAULT_RECORD_BYTE_COUNT;
  /* value capacity requested by the no-arg allocateNew(); refreshed after each (re)allocation */
  private int lastValueCapacity;
  /* data buffer size in bytes requested by allocateNew(); refreshed after each (re)allocation */
  private long lastValueAllocationSizeInBytes;

  /* members visible outside this class */
  public static final int OFFSET_WIDTH = 4; /* width in bytes of one entry in the offset buffer */
  /* zero-length value, used by fillEmpties() as the length of an empty element */
  protected static final byte[] emptyByteArray = new byte[]{};
  /* one bit per element: 1 = non-null, 0 = null */
  protected ArrowBuf validityBuffer;
  /* the bytes of all elements */
  protected ArrowBuf valueBuffer;
  /* OFFSET_WIDTH-byte entries; entry i is read as the start of element i, entry i + 1 as its end */
  protected ArrowBuf offsetBuffer;
  /* number of elements in the vector as last set through setValueCount()/loadFieldBuffers() */
  protected int valueCount;
  /* index of the most recently written element, -1 when nothing has been written */
  protected int lastSet;
  /* the field materialized by this vector */
  protected final Field field;

  /**
   * Creates an empty vector for the given field. No memory is allocated here: all three
   * buffers start out as the allocator's empty buffer.
   *
   * @param field the field materialized by this vector
   * @param allocator the allocator to use for creating/resizing buffers
   */
  public BaseVariableWidthVector(Field field, final BufferAllocator allocator) {
    super(allocator);
    this.field = field;

    // Sizes that the first allocateNew() will request.
    this.lastValueAllocationSizeInBytes = INITIAL_BYTE_COUNT;
    // One slot of the default allocation is reserved for the extra entry of the offset array.
    this.lastValueCapacity = INITIAL_VALUE_ALLOCATION - 1;

    // Nothing has been written yet.
    this.valueCount = 0;
    this.lastSet = -1;

    this.offsetBuffer = allocator.getEmpty();
    this.validityBuffer = allocator.getEmpty();
    this.valueBuffer = allocator.getEmpty();
  }

  /**
   * Get the name of this vector, taken from the field it materializes.
   *
   * @return the name of the underlying field
   */
  @Override
  public String getName() {
    return this.field.getName();
  }

  /* TODO:
   * see if getNullCount() can be made faster -- O(1)
   */

  /* TODO:
   * Once the entire hierarchy has been refactored, move common functions
   * like getNullCount(), splitAndTransferValidityBuffer to top level
   * base class BaseValueVector.
   *
   * Along with this, some class members (validityBuffer) can also be
   * abstracted out to top level base class.
   *
   * Right now BaseValueVector is the top level base class for other
   * vector types in ValueVector hierarchy (non-nullable) and those
   * vectors have not yet been refactored/removed so moving things to
   * the top class as of now is not a good idea.
   */

  /**
   * Get the buffer holding the validity bits of this vector, i.e. the internal
   * bit vector that records whether each element is NULL or NON-NULL.
   *
   * @return the validity buffer
   */
  @Override
  public ArrowBuf getValidityBuffer() {
    return this.validityBuffer;
  }

  /**
   * Get the buffer holding the bytes of the elements in this vector.
   *
   * @return the data buffer
   */
  @Override
  public ArrowBuf getDataBuffer() {
    return this.valueBuffer;
  }

  /**
   * Get the buffer holding the offsets of the elements in this vector.
   * This operation is not supported for fixed-width vectors.
   *
   * @return the offset buffer
   */
  @Override
  public ArrowBuf getOffsetBuffer() {
    return this.offsetBuffer;
  }

  /**
   * Get the memory address of the offset buffer of this vector.
   *
   * @return starting address of the offset buffer
   */
  @Override
  public long getOffsetBufferAddress() {
    return this.offsetBuffer.memoryAddress();
  }

  /**
   * Get the memory address of the validity buffer, which tracks the
   * NULL or NON-NULL nature of the elements in this vector.
   *
   * @return starting address of the validity buffer
   */
  @Override
  public long getValidityBufferAddress() {
    return this.validityBuffer.memoryAddress();
  }

  /**
   * Get the memory address of the data buffer of this vector.
   *
   * @return starting address of the data buffer
   */
  @Override
  public long getDataBufferAddress() {
    return this.valueBuffer.memoryAddress();
  }

  /**
   * Sets the desired value capacity for the vector. This function doesn't
   * allocate any memory for the vector; it only records the sizes that a later
   * {@link #allocateNew()} requests. The data buffer size is estimated as
   * {@code valueCount * DEFAULT_RECORD_BYTE_COUNT} bytes.
   *
   * @param valueCount desired number of elements in the vector
   * @throws OversizedAllocationException if the estimated data buffer size or the
   *         offset/validity buffer size exceeds the maximum allowed allocation
   */
  @Override
  public void setInitialCapacity(int valueCount) {
    final long size = (long) valueCount * DEFAULT_RECORD_BYTE_COUNT;
    checkDataBufferSize(size);
    computeAndCheckOffsetsBufferSize(valueCount);
    // The field is a long: store the validated size as-is. Narrowing it to int first would
    // silently truncate any accepted size above Integer.MAX_VALUE.
    lastValueAllocationSizeInBytes = size;
    lastValueCapacity = valueCount;
  }

  /**
   * Sets the desired value capacity for the vector. This function doesn't
   * allocate any memory for the vector; it only records the sizes that a later
   * {@link #allocateNew()} requests. The data buffer size is estimated as
   * {@code valueCount * density} bytes, with a minimum of one byte.
   *
   * @param valueCount desired number of elements in the vector
   * @param density average number of bytes per variable width element
   * @throws OversizedAllocationException if the estimated data buffer size or the
   *         offset/validity buffer size exceeds the maximum allowed allocation
   */
  @Override
  public void setInitialCapacity(int valueCount, double density) {
    final long size = Math.max((long) (valueCount * density), 1L);
    checkDataBufferSize(size);
    computeAndCheckOffsetsBufferSize(valueCount);
    // The field is a long: store the validated size as-is. Narrowing it to int first would
    // silently truncate any accepted size above Integer.MAX_VALUE.
    lastValueAllocationSizeInBytes = size;
    lastValueCapacity = valueCount;
  }

  /**
   * Get the density of this vector, i.e. the average number of data bytes per element
   * over the first {@code valueCount} elements.
   *
   * @return density, or 0 when the vector holds no values
   */
  public double getDensity() {
    if (valueCount == 0) {
      return 0.0D;
    }
    final int startOffset = offsetBuffer.getInt(0);
    // Widen before multiplying: valueCount * OFFSET_WIDTH overflows int once valueCount >= 2^29.
    final int endOffset = offsetBuffer.getInt((long) valueCount * OFFSET_WIDTH);
    final double totalDataSize = endOffset - startOffset;
    return totalDataSize / valueCount;
  }

  /**
   * Get the current value capacity for the vector, i.e. the largest element count that
   * both the offset buffer and the validity buffer can accommodate.
   *
   * @return number of elements that vector can hold.
   */
  @Override
  public int getValueCapacity() {
    // The offset buffer needs one slot more than the number of values it describes.
    final int capacityFromOffsets = Math.max(getOffsetBufferValueCapacity() - 1, 0);
    final int capacityFromValidity = getValidityBufferValueCapacity();
    return Math.min(capacityFromOffsets, capacityFromValidity);
  }

  /* number of validity bits the validity buffer can hold (8 per byte), capped at Integer.MAX_VALUE */
  private int getValidityBufferValueCapacity() {
    return capAtMaxInt(8 * validityBuffer.capacity());
  }

  /* number of OFFSET_WIDTH-byte entries the offset buffer can hold, capped at Integer.MAX_VALUE */
  private int getOffsetBufferValueCapacity() {
    return capAtMaxInt(this.offsetBuffer.capacity() / OFFSET_WIDTH);
  }

  /**
   * Zero out the full capacity of the validity, offset and data buffers.
   * The buffers themselves are kept.
   */
  public void zeroVector() {
    // validity and offsets first, then the data bytes
    initValidityBuffer();
    initOffsetBuffer();
    this.valueBuffer.setZero(0, this.valueBuffer.capacity());
  }

  /* fill the whole capacity of the validity buffer with zeros */
  private void initValidityBuffer() {
    this.validityBuffer.setZero(0, this.validityBuffer.capacity());
  }

  /* fill the whole capacity of the offset buffer with zeros */
  private void initOffsetBuffer() {
    this.offsetBuffer.setZero(0, this.offsetBuffer.capacity());
  }

  /**
   * Reset the vector to its initial state: zero the buffers as {@link #zeroVector()} does,
   * then forget the value count and the last written index.
   * Note that this method doesn't release any memory.
   */
  public void reset() {
    zeroVector();
    this.lastSet = -1;
    this.valueCount = 0;
  }

  /**
   * Close the vector and release the associated buffers. Delegates to {@link #clear()}.
   */
  @Override
  public void close() {
    this.clear();
  }

  /**
   * Release the validity, data and offset buffers and reset the value count and the
   * last written index. Same as {@link #close()}.
   */
  @Override
  public void clear() {
    // releaseBuffer hands back the buffer each field should hold from now on
    this.validityBuffer = releaseBuffer(this.validityBuffer);
    this.valueBuffer = releaseBuffer(this.valueBuffer);
    this.offsetBuffer = releaseBuffer(this.offsetBuffer);

    this.lastSet = -1;
    this.valueCount = 0;
  }

  /**
   * Not supported: this vector has no inner vectors.
   *
   * @return never returns normally
   * @throws UnsupportedOperationException always
   * @deprecated use {@link #getFieldBuffers()} instead
   */
  @Override
  @Deprecated
  public List getFieldInnerVectors() {
    final String reason = "There are no inner vectors. Use getFieldBuffers";
    throw new UnsupportedOperationException(reason);
  }

  /**
   * Initialize the children in schema for this Field. Scalar types have no children,
   * so an empty list is accepted as a NO-OP and anything else is rejected.
   *
   * @param children the schema
   * @throws IllegalArgumentException if children is a non-empty list for scalar types.
   */
  @Override
  public void initializeChildrenFromFields(List children) {
    if (children.isEmpty()) {
      return;
    }
    throw new IllegalArgumentException("primitive type vector can not have children");
  }

  /**
   * Get the inner child vectors. This vector type has none.
   *
   * @return list of child vectors for complex types, empty list for scalar vector types
   */
  @Override
  public List getChildrenFromFields() {
    final List noChildren = Collections.emptyList();
    return noChildren;
  }


  /**
   * Load the buffers of this vector with provided source buffers.
   * The caller manages the source buffers and populates them before invoking
   * this method. The buffers currently held by this vector are released and
   * replaced by the validity buffer built from {@code ownBuffers.get(0)} and by
   * retained references to the offset ({@code get(1)}) and data ({@code get(2)})
   * buffers.
   *
   * @param fieldNode  the fieldNode indicating the value count
   * @param ownBuffers the buffers for this Field (own buffers only, children not included),
   *                   in the order validity, offset, data
   */
  @Override
  public void loadFieldBuffers(ArrowFieldNode fieldNode, List<ArrowBuf> ownBuffers) {
    // The element type must be declared: with a raw List, get() yields Object and the
    // assignments below do not compile.
    ArrowBuf bitBuffer = ownBuffers.get(0);
    ArrowBuf offBuffer = ownBuffers.get(1);
    ArrowBuf dataBuffer = ownBuffers.get(2);

    validityBuffer.getReferenceManager().release();
    validityBuffer = BitVectorHelper.loadValidityBuffer(fieldNode, bitBuffer, allocator);
    offsetBuffer.getReferenceManager().release();
    offsetBuffer = offBuffer.getReferenceManager().retain(offBuffer, allocator);
    valueBuffer.getReferenceManager().release();
    valueBuffer = dataBuffer.getReferenceManager().retain(dataBuffer, allocator);

    // every loaded element counts as written
    lastSet = fieldNode.getLength() - 1;
    valueCount = fieldNode.getLength();
  }

  /**
   * Get the buffers belonging to this vector, in the order validity, offset, data.
   *
   * @return the inner buffers.
   */
  public List getFieldBuffers() {
    // Bring the vector to a consistent state before flight/IPC: the offsets of some
    // trailing values may not have been written yet, which could otherwise lose data
    // in the data buffer. For details, see TestValueVector#testUnloadVariableWidthVector.
    fillHoles(valueCount);
    setReaderAndWriterIndex();

    List buffers = new ArrayList<>(3);
    buffers.add(validityBuffer);
    buffers.add(offsetBuffer);
    buffers.add(valueBuffer);
    return buffers;
  }

  /**
   * Set the reader and writer indexes for the inner buffers: readers go to 0 and writers
   * go to the end of the bytes used by the first {@code valueCount} elements.
   */
  private void setReaderAndWriterIndex() {
    validityBuffer.readerIndex(0);
    offsetBuffer.readerIndex(0);
    valueBuffer.readerIndex(0);

    if (valueCount != 0) {
      final int endOfData = getStartOffset(valueCount);
      validityBuffer.writerIndex(getValidityBufferSizeFromCount(valueCount));
      // one more offset entry than there are values
      offsetBuffer.writerIndex((valueCount + 1) * OFFSET_WIDTH);
      valueBuffer.writerIndex(endOfData);
      return;
    }

    // empty vector: nothing is readable in any buffer
    validityBuffer.writerIndex(0);
    offsetBuffer.writerIndex(0);
    valueBuffer.writerIndex(0);
  }

  /**
   * Allocate memory for the vector using the most recently recorded data buffer size and
   * value capacity. Unlike {@link #allocateNewSafe()}, failures are propagated to the caller.
   */
  @Override
  public void allocateNew() {
    final long dataBytes = lastValueAllocationSizeInBytes;
    final int values = lastValueCapacity;
    allocateNew(dataBytes, values);
  }

  /**
   * Allocate memory for the vector using the most recently recorded data buffer size and
   * value capacity, reporting failure through the return value instead of an exception.
   * See {@link #allocateNew(long, int)} for allocating memory for specific
   * number of elements in the vector.
   *
   * @return false if memory allocation fails, true otherwise.
   */
  @Override
  public boolean allocateNewSafe() {
    boolean allocated;
    try {
      allocateNew(lastValueAllocationSizeInBytes, lastValueCapacity);
      allocated = true;
    } catch (Exception ignored) {
      // by contract a failed allocation is signalled with 'false'
      allocated = false;
    }
    return allocated;
  }

  /**
   * Allocate memory for the vector to support storing at least the provided number of
   * elements in the vector. This method must be called prior to using the ValueVector.
   * Buffers currently held are released first; if the allocation fails the vector is
   * left cleared.
   *
   * @param totalBytes desired total memory capacity
   * @param valueCount the desired number of elements in the vector
   * @throws org.apache.arrow.memory.OutOfMemoryException if memory allocation fails
   */
  @Override
  public void allocateNew(long totalBytes, int valueCount) {
    assert totalBytes >= 0;

    // validate both requests before touching the current buffers
    checkDataBufferSize(totalBytes);
    computeAndCheckOffsetsBufferSize(valueCount);

    // a new allocation replaces whatever is currently held
    clear();

    try {
      allocateBytes(totalBytes, valueCount);
    } catch (Exception allocationFailure) {
      // do not keep a partially allocated vector around
      clear();
      throw allocationFailure;
    }
  }

  /**
   * Allocate memory for the given number of elements, using the most recently recorded
   * data buffer size for the data buffer.
   *
   * @param valueCount the desired number of elements in the vector
   */
  @Override
  public void allocateNew(int valueCount) {
    final long dataBytes = lastValueAllocationSizeInBytes;
    allocateNew(dataBytes, valueCount);
  }

  /* Reject a data buffer size that is negative or larger than MAX_ALLOCATION_SIZE. */
  private void checkDataBufferSize(long size) {
    final boolean withinBounds = size >= 0 && size <= MAX_ALLOCATION_SIZE;
    if (withinBounds) {
      return;
    }
    throw new OversizedAllocationException("Memory required for vector " +
        " is (" + size + "), which is more than max allowed (" + MAX_ALLOCATION_SIZE + ")");
  }

  /*
   * Compute the combined buffer size required for the offsets and validity of 'valueCount'
   * values, and reject it if it is larger than MAX_ALLOCATION_SIZE.
   */
  private long computeAndCheckOffsetsBufferSize(int valueCount) {
    // One additional offset slot tracks the end offset of the last data element.
    final long combinedSize = computeCombinedBufferSize(valueCount + 1, OFFSET_WIDTH);
    if (combinedSize <= MAX_ALLOCATION_SIZE) {
      return combinedSize;
    }
    throw new OversizedAllocationException("Memory required for vector capacity " +
        valueCount +
        " is (" + combinedSize + "), which is more than max allowed (" + MAX_ALLOCATION_SIZE + ")");
  }

  /* Allocate the data, offset and validity buffers and record the resulting capacities. */
  private void allocateBytes(final long valueBufferSize, final int valueCount) {
    // data buffer
    valueBuffer = allocator.buffer(valueBufferSize);
    valueBuffer.readerIndex(0);

    // offset and validity buffers are allocated together; offsets need one extra slot
    final DataAndValidityBuffers offsetsAndValidity =
        allocFixedDataAndValidityBufs(valueCount + 1, OFFSET_WIDTH);
    offsetBuffer = offsetsAndValidity.getDataBuf();
    validityBuffer = offsetsAndValidity.getValidityBuf();
    initOffsetBuffer();
    initValidityBuffer();

    // remember what was actually obtained so that later allocations request the same
    lastValueCapacity = getValueCapacity();
    lastValueAllocationSizeInBytes = capAtMaxInt(valueBuffer.capacity());
  }

  /* Allocate an offset buffer of 'size' bytes and zero it out. */
  private void allocateOffsetBuffer(final long size) {
    // Pass the long size straight to the allocator: narrowing it to int first would
    // silently truncate (or turn negative) any request above Integer.MAX_VALUE.
    offsetBuffer = allocator.buffer(size);
    offsetBuffer.readerIndex(0);
    initOffsetBuffer();
  }

  /* Allocate a validity buffer of 'size' bytes and zero it out. */
  private void allocateValidityBuffer(final long size) {
    // Pass the long size straight to the allocator: narrowing it to int first would
    // silently truncate (or turn negative) any request above Integer.MAX_VALUE.
    validityBuffer = allocator.buffer(size);
    validityBuffer.readerIndex(0);
    initValidityBuffer();
  }

  /**
   * Resize the vector to increase the capacity: first the data buffer, then the
   * validity and offset buffers. Each step doubles the buffers it handles.
   */
  public void reAlloc() {
    this.reallocDataBuffer();
    this.reallocValidityAndOffsetBuffers();
  }

  /**
   * Reallocate the data buffer. Data Buffer stores the actual data for
   * VARCHAR or VARBINARY elements in the vector. The buffer is doubled (rounded up to a
   * power of two) and the existing bytes are copied over. An empty buffer grows to the
   * last recorded allocation size, or to twice the initial byte count if none is recorded.
   *
   * @throws OversizedAllocationException if the desired new size is more than
   *                                      max allowed
   * @throws OutOfMemoryException if the internal memory allocation fails
   */
  public void reallocDataBuffer() {
    final long oldCapacity = valueBuffer.capacity();

    long targetSize = oldCapacity * 2;
    if (targetSize == 0) {
      targetSize = lastValueAllocationSizeInBytes > 0
          ? lastValueAllocationSizeInBytes
          : INITIAL_BYTE_COUNT * 2;
    }
    targetSize = BaseAllocator.nextPowerOfTwo(targetSize);
    assert targetSize >= 1;

    checkDataBufferSize(targetSize);

    // copy the old contents into the bigger buffer, then swap it in
    final ArrowBuf grown = allocator.buffer(targetSize);
    grown.setBytes(0, valueBuffer, 0, oldCapacity);
    valueBuffer.getReferenceManager().release();
    valueBuffer = grown;
    lastValueAllocationSizeInBytes = valueBuffer.capacity();
  }

  /**
   * Reallocate the validity and offset buffers for this vector. Validity
   * buffer is used to track the NULL or NON-NULL nature of elements in
   * the vector and offset buffer is used to store the lengths of variable
   * width elements in the vector.
   *
   * <p>Note that data buffer for variable length vectors moves independent
   * of the companion validity and offset buffers. This is in
   * contrast to what we have for fixed width vectors.
   *
   * <p>So even though we may have setup an initial capacity of 1024
   * elements in the vector, it is quite possible
   * that we need to reAlloc() the data buffer when we are setting
   * the 5th element in the vector simply because previous
   * variable length elements have exhausted the buffer capacity.
   * However, we really don't need to reAlloc() validity and
   * offset buffers until we try to set the 1025th element
   * This is why we do a separate check for safe methods to
   * determine which buffer needs reallocation.
   *
   * @throws OversizedAllocationException if the desired new size is more than
   *                                      max allowed
   * @throws OutOfMemoryException if the internal memory allocation fails
   */
  public void reallocValidityAndOffsetBuffers() {
    // Double the number of offset entries; an empty buffer falls back to the last
    // recorded capacity (plus the extra slot) or to twice the default allocation.
    int targetOffsetCount = capAtMaxInt((offsetBuffer.capacity() / OFFSET_WIDTH) * 2);
    if (targetOffsetCount == 0) {
      targetOffsetCount = lastValueCapacity > 0
          ? lastValueCapacity + 1
          : 2 * (INITIAL_VALUE_ALLOCATION + 1);
    }
    computeAndCheckOffsetsBufferSize(targetOffsetCount);

    final DataAndValidityBuffers grown = allocFixedDataAndValidityBufs(targetOffsetCount, OFFSET_WIDTH);

    // offsets: copy the old entries, zero the newly gained tail, then swap the buffer in
    final ArrowBuf newOffsetBuffer = grown.getDataBuf();
    newOffsetBuffer.setBytes(0, offsetBuffer, 0, offsetBuffer.capacity());
    newOffsetBuffer.setZero(offsetBuffer.capacity(), newOffsetBuffer.capacity() - offsetBuffer.capacity());
    offsetBuffer.getReferenceManager().release();
    offsetBuffer = newOffsetBuffer;

    // validity: same procedure
    final ArrowBuf newValidityBuffer = grown.getValidityBuf();
    newValidityBuffer.setBytes(0, validityBuffer, 0, validityBuffer.capacity());
    newValidityBuffer.setZero(validityBuffer.capacity(), newValidityBuffer.capacity() - validityBuffer.capacity());
    validityBuffer.getReferenceManager().release();
    validityBuffer = newValidityBuffer;

    lastValueCapacity = getValueCapacity();
  }

  /**
   * Get the size (number of bytes) of underlying data buffer.
* @return number of bytes in the data buffer */ @Override public int getByteCapacity() { return capAtMaxInt(valueBuffer.capacity()); } @Override public int sizeOfValueBuffer() { if (valueCount == 0) { return 0; } return offsetBuffer.getInt(valueCount * OFFSET_WIDTH); } /** * Get the size (number of bytes) of underlying buffers used by this * vector. * @return size of underlying buffers. */ @Override public int getBufferSize() { return getBufferSizeFor(this.valueCount); } /** * Get the potential buffer size for a particular number of records. * @param valueCount desired number of elements in the vector * @return estimated size of underlying buffers if the vector holds * a given number of elements */ @Override public int getBufferSizeFor(final int valueCount) { if (valueCount == 0) { return 0; } final int validityBufferSize = getValidityBufferSizeFromCount(valueCount); final int offsetBufferSize = (valueCount + 1) * OFFSET_WIDTH; /* get the end offset for this valueCount */ final int dataBufferSize = offsetBuffer.getInt(valueCount * OFFSET_WIDTH); return validityBufferSize + offsetBufferSize + dataBufferSize; } /** * Get information about how this field is materialized. * @return the field corresponding to this vector */ @Override public Field getField() { return field; } /** * Return the underlying buffers associated with this vector. Note that this doesn't * impact the reference counts for this buffer so it only should be used for in-context * access. Also note that this buffer changes regularly thus * external classes shouldn't hold a reference to it (unless they change it). * * @param clear Whether to clear vector before returning; the buffers will still be refcounted * but the returned array will be the only reference to them * @return The underlying {@link io.netty.buffer.ArrowBuf buffers} that is used by this * vector instance. 
*/
  @Override
  public ArrowBuf[] getBuffers(boolean clear) {
    setReaderAndWriterIndex();

    // validity, offset, data -- or nothing at all when the vector occupies no bytes
    final ArrowBuf[] result = getBufferSize() == 0
        ? new ArrowBuf[0]
        : new ArrowBuf[] {validityBuffer, offsetBuffer, valueBuffer};

    if (clear) {
      // take an extra reference on each buffer so that it outlives the clear() below
      for (final ArrowBuf held : result) {
        held.getReferenceManager().retain();
      }
      clear();
    }
    return result;
  }

  /**
   * Construct a transfer pair of this vector and another vector of same type.
   *
   * @param ref name of the target vector
   * @param allocator allocator for the target vector
   * @param callBack not used
   * @return TransferPair
   */
  @Override
  public TransferPair getTransferPair(String ref, BufferAllocator allocator, CallBack callBack) {
    return this.getTransferPair(ref, allocator);
  }

  /**
   * Construct a transfer pair of this vector and another vector of same type.
   * The target vector takes the name of this vector.
   *
   * @param allocator allocator for the target vector
   * @return TransferPair
   */
  @Override
  public TransferPair getTransferPair(BufferAllocator allocator) {
    final String ownName = getName();
    return getTransferPair(ownName, allocator);
  }

  /**
   * Construct a transfer pair of this vector and another vector of same type.
   *
   * @param ref name of the target vector
   * @param allocator allocator for the target vector
   * @return TransferPair
   */
  public abstract TransferPair getTransferPair(String ref, BufferAllocator allocator);

  /**
   * Transfer this vector's data to another vector. The memory associated
   * with this vector is transferred to the allocator of target vector
   * for accounting and management purposes.
* @param target destination vector for transfer */ public void transferTo(BaseVariableWidthVector target) { compareTypes(target, "transferTo"); target.clear(); target.validityBuffer = transferBuffer(validityBuffer, target.allocator); target.valueBuffer = transferBuffer(valueBuffer, target.allocator); target.offsetBuffer = transferBuffer(offsetBuffer, target.allocator); target.setLastSet(this.lastSet); if (this.valueCount > 0) { target.setValueCount(this.valueCount); } clear(); } /** * Slice this vector at desired index and length and transfer the * corresponding data to the target vector. * @param startIndex start position of the split in source vector. * @param length length of the split. * @param target destination vector */ public void splitAndTransferTo(int startIndex, int length, BaseVariableWidthVector target) { Preconditions.checkArgument(startIndex >= 0 && length >= 0 && startIndex + length <= valueCount, "Invalid parameters startIndex: %s, length: %s for valueCount: %s", startIndex, length, valueCount); compareTypes(target, "splitAndTransferTo"); target.clear(); splitAndTransferValidityBuffer(startIndex, length, target); splitAndTransferOffsetBuffer(startIndex, length, target); target.setLastSet(length - 1); if (length > 0) { target.setValueCount(length); } } /** * Transfer the offsets along with data. Unlike the data buffer, we cannot simply * slice the offset buffer for split and transfer. The reason is that offsets * in the target vector have to be adjusted and made relative to the staring * offset in source vector from the start index of split. This is why, we * need to explicitly allocate the offset buffer and set the adjusted offsets * in the target vector. 
*/ private void splitAndTransferOffsetBuffer(int startIndex, int length, BaseVariableWidthVector target) { final int start = offsetBuffer.getInt(startIndex * OFFSET_WIDTH); final int end = offsetBuffer.getInt((startIndex + length) * OFFSET_WIDTH); final int dataLength = end - start; target.allocateOffsetBuffer((length + 1) * OFFSET_WIDTH); for (int i = 0; i < length + 1; i++) { final int relativeSourceOffset = offsetBuffer.getInt((startIndex + i) * OFFSET_WIDTH) - start; target.offsetBuffer.setInt(i * OFFSET_WIDTH, relativeSourceOffset); } final ArrowBuf slicedBuffer = valueBuffer.slice(start, dataLength); target.valueBuffer = transferBuffer(slicedBuffer, target.allocator); } /* * Transfer the validity. */ private void splitAndTransferValidityBuffer(int startIndex, int length, BaseVariableWidthVector target) { int firstByteSource = BitVectorHelper.byteIndex(startIndex); int lastByteSource = BitVectorHelper.byteIndex(valueCount - 1); int byteSizeTarget = getValidityBufferSizeFromCount(length); int offset = startIndex % 8; if (length > 0) { if (offset == 0) { // slice if (target.validityBuffer != null) { target.validityBuffer.getReferenceManager().release(); } target.validityBuffer = validityBuffer.slice(firstByteSource, byteSizeTarget); target.validityBuffer.getReferenceManager().retain(); } else { /* Copy data * When the first bit starts from the middle of a byte (offset != 0), * copy data from src BitVector. * Each byte in the target is composed by a part in i-th byte, * another part in (i+1)-th byte. 
*/ target.allocateValidityBuffer(byteSizeTarget); for (int i = 0; i < byteSizeTarget - 1; i++) { byte b1 = BitVectorHelper.getBitsFromCurrentByte(this.validityBuffer, firstByteSource + i, offset); byte b2 = BitVectorHelper.getBitsFromNextByte(this.validityBuffer, firstByteSource + i + 1, offset); target.validityBuffer.setByte(i, (b1 + b2)); } /* Copying the last piece is done in the following manner: * if the source vector has 1 or more bytes remaining, we copy * the last piece as a byte formed by shifting data * from the current byte and the next byte. * * if the source vector has no more bytes remaining * (we are at the last byte), we copy the last piece as a byte * by shifting data from the current byte. */ if ((firstByteSource + byteSizeTarget - 1) < lastByteSource) { byte b1 = BitVectorHelper.getBitsFromCurrentByte(this.validityBuffer, firstByteSource + byteSizeTarget - 1, offset); byte b2 = BitVectorHelper.getBitsFromNextByte(this.validityBuffer, firstByteSource + byteSizeTarget, offset); target.validityBuffer.setByte(byteSizeTarget - 1, b1 + b2); } else { byte b1 = BitVectorHelper.getBitsFromCurrentByte(this.validityBuffer, firstByteSource + byteSizeTarget - 1, offset); target.validityBuffer.setByte(byteSizeTarget - 1, b1); } } } } /*----------------------------------------------------------------* | | | common getters and setters | | | *----------------------------------------------------------------*/ /** * Get the number of elements that are null in the vector. * * @return the number of null elements. */ public int getNullCount() { return BitVectorHelper.getNullCount(validityBuffer, valueCount); } /** * Check if the given index is within the current value capacity * of the vector. * * @param index position to check * @return true if index is within the current value capacity */ public boolean isSafe(int index) { return index < getValueCapacity(); } /** * Check if element at given index is null. 
* * @param index position of element * @return true if element at given index is null */ public boolean isNull(int index) { return (isSet(index) == 0); } /** * Same as {@link #isNull(int)}. * * @param index position of element * @return 1 if element at given index is not null, 0 otherwise */ public int isSet(int index) { final int byteIndex = index >> 3; final byte b = validityBuffer.getByte(byteIndex); final int bitIndex = index & 7; return (b >> bitIndex) & 0x01; } /** * Get the value count of vector. This will always be zero unless * setValueCount(int) has been called prior to calling this. * * @return valueCount for the vector */ public int getValueCount() { return valueCount; } /** * Sets the value count for the vector. * * @param valueCount value count */ public void setValueCount(int valueCount) { assert valueCount >= 0; this.valueCount = valueCount; while (valueCount > getValueCapacity()) { reallocValidityAndOffsetBuffers(); } fillHoles(valueCount); lastSet = valueCount - 1; setReaderAndWriterIndex(); } /** * Create holes in the vector upto the given index (exclusive). * Holes will be created from the current last set position in * the vector. * * @param index target index */ public void fillEmpties(int index) { handleSafe(index, emptyByteArray.length); fillHoles(index); lastSet = index - 1; } /** * Set the index of last non-null element in the vector. * It is important to call this method with appropriate value * before calling {@link #setValueCount(int)}. * * @param value desired index of last non-null element. */ public void setLastSet(int value) { lastSet = value; } /** * Get the index of last non-null element in the vector. * * @return index of the last non-null element */ public int getLastSet() { return lastSet; } /** * Get the starting position (offset) in the data stream for a given * element in the vector. 
* * @param index position of the element in the vector * @return starting offset for the element */ public long getStartEnd(int index) { return offsetBuffer.getLong(index * OFFSET_WIDTH); } /** * Mark the particular position in the vector as non-null. * * @param index position of the element. */ @Override public void setIndexDefined(int index) { while (index >= getValidityBufferValueCapacity()) { reallocValidityAndOffsetBuffers(); } BitVectorHelper.setBit(validityBuffer, index); } /** * Sets the value length for an element. * * @param index position of the element to set * @param length length of the element */ public void setValueLengthSafe(int index, int length) { assert index >= 0; handleSafe(index, length); fillHoles(index); final int startOffset = getStartOffset(index); offsetBuffer.setInt((index + 1) * OFFSET_WIDTH, startOffset + length); lastSet = index; } /** * Get the variable length element at specified index as Text. * * @param index position of element to get * @return greater than 0 length for non-null element, 0 otherwise */ public int getValueLength(int index) { assert index >= 0; if (isSet(index) == 0) { return 0; } final int startOffset = getStartOffset(index); final int dataLength = offsetBuffer.getInt((index + 1) * OFFSET_WIDTH) - startOffset; return dataLength; } /** * Set the variable length element at the specified index to the supplied * byte array. This is same as using {@link #set(int, byte[], int, int)} * with start as 0 and length as value.length * * @param index position of the element to set * @param value array of bytes to write */ public void set(int index, byte[] value) { assert index >= 0; fillHoles(index); BitVectorHelper.setBit(validityBuffer, index); setBytes(index, value, 0, value.length); lastSet = index; } /** * Same as {@link #set(int, byte[])} except that it handles the * case where index and length of new element are beyond the existing * capacity of the vector. 
* * @param index position of the element to set * @param value array of bytes to write */ public void setSafe(int index, byte[] value) { assert index >= 0; handleSafe(index, value.length); fillHoles(index); BitVectorHelper.setBit(validityBuffer, index); setBytes(index, value, 0, value.length); lastSet = index; } /** * Set the variable length element at the specified index to the supplied * byte array. * * @param index position of the element to set * @param value array of bytes to write * @param start start index in array of bytes * @param length length of data in array of bytes */ public void set(int index, byte[] value, int start, int length) { assert index >= 0; fillHoles(index); BitVectorHelper.setBit(validityBuffer, index); setBytes(index, value, start, length); lastSet = index; } /** * Same as {@link #set(int, byte[], int, int)} except that it handles the * case where index and length of new element are beyond the existing * capacity of the vector. * * @param index position of the element to set * @param value array of bytes to write * @param start start index in array of bytes * @param length length of data in array of bytes */ public void setSafe(int index, byte[] value, int start, int length) { assert index >= 0; handleSafe(index, length); fillHoles(index); BitVectorHelper.setBit(validityBuffer, index); setBytes(index, value, start, length); lastSet = index; } /** * Set the variable length element at the specified index to the * content in supplied ByteBuffer. 
* * @param index position of the element to set * @param value ByteBuffer with data * @param start start index in ByteBuffer * @param length length of data in ByteBuffer */ public void set(int index, ByteBuffer value, int start, int length) { assert index >= 0; fillHoles(index); BitVectorHelper.setBit(validityBuffer, index); final int startOffset = getStartOffset(index); offsetBuffer.setInt((index + 1) * OFFSET_WIDTH, startOffset + length); valueBuffer.setBytes(startOffset, value, start, length); lastSet = index; } /** * Same as {@link #set(int, ByteBuffer, int, int)} except that it handles the * case where index and length of new element are beyond the existing * capacity of the vector. * * @param index position of the element to set * @param value ByteBuffer with data * @param start start index in ByteBuffer * @param length length of data in ByteBuffer */ public void setSafe(int index, ByteBuffer value, int start, int length) { assert index >= 0; handleSafe(index, length); fillHoles(index); BitVectorHelper.setBit(validityBuffer, index); final int startOffset = getStartOffset(index); offsetBuffer.setInt((index + 1) * OFFSET_WIDTH, startOffset + length); valueBuffer.setBytes(startOffset, value, start, length); lastSet = index; } /** * Set the element at the given index to null. * * @param index position of element */ public void setNull(int index) { while (index >= getValidityBufferValueCapacity()) { reallocValidityAndOffsetBuffers(); } BitVectorHelper.unsetBit(validityBuffer, index); } /** * Store the given value at a particular position in the vector. isSet indicates * whether the value is NULL or not. 
* @param index position of the new value * @param isSet 0 for NULL value, 1 otherwise * @param start start position of data in buffer * @param end end position of data in buffer * @param buffer data buffer containing the variable width element to be stored * in the vector */ public void set(int index, int isSet, int start, int end, ArrowBuf buffer) { assert index >= 0; final int dataLength = end - start; fillHoles(index); BitVectorHelper.setValidityBit(validityBuffer, index, isSet); final int startOffset = offsetBuffer.getInt(index * OFFSET_WIDTH); offsetBuffer.setInt((index + 1) * OFFSET_WIDTH, startOffset + dataLength); valueBuffer.setBytes(startOffset, buffer, start, dataLength); lastSet = index; } /** * Same as {@link #set(int, int, int, int, ArrowBuf)} except that it handles the case * when index is greater than or equal to current value capacity of the * vector. * @param index position of the new value * @param isSet 0 for NULL value, 1 otherwise * @param start start position of data in buffer * @param end end position of data in buffer * @param buffer data buffer containing the variable width element to be stored * in the vector */ public void setSafe(int index, int isSet, int start, int end, ArrowBuf buffer) { assert index >= 0; final int dataLength = end - start; handleSafe(index, dataLength); fillHoles(index); BitVectorHelper.setValidityBit(validityBuffer, index, isSet); final int startOffset = offsetBuffer.getInt(index * OFFSET_WIDTH); offsetBuffer.setInt((index + 1) * OFFSET_WIDTH, startOffset + dataLength); valueBuffer.setBytes(startOffset, buffer, start, dataLength); lastSet = index; } /** * Store the given value at a particular position in the vector. isSet indicates * whether the value is NULL or not. 
* @param index position of the new value * @param start start position of data in buffer * @param length length of data in buffer * @param buffer data buffer containing the variable width element to be stored * in the vector */ public void set(int index, int start, int length, ArrowBuf buffer) { assert index >= 0; fillHoles(index); BitVectorHelper.setBit(validityBuffer, index); final int startOffset = offsetBuffer.getInt(index * OFFSET_WIDTH); offsetBuffer.setInt((index + 1) * OFFSET_WIDTH, startOffset + length); final ArrowBuf bb = buffer.slice(start, length); valueBuffer.setBytes(startOffset, bb); lastSet = index; } /** * Same as {@link #set(int, int, int, int, ArrowBuf)} except that it handles the case * when index is greater than or equal to current value capacity of the * vector. * @param index position of the new value * @param start start position of data in buffer * @param length length of data in buffer * @param buffer data buffer containing the variable width element to be stored * in the vector */ public void setSafe(int index, int start, int length, ArrowBuf buffer) { assert index >= 0; handleSafe(index, length); fillHoles(index); BitVectorHelper.setBit(validityBuffer, index); final int startOffset = offsetBuffer.getInt(index * OFFSET_WIDTH); offsetBuffer.setInt((index + 1) * OFFSET_WIDTH, startOffset + length); final ArrowBuf bb = buffer.slice(start, length); valueBuffer.setBytes(startOffset, bb); lastSet = index; } /*----------------------------------------------------------------* | | | helper methods for setters | | | *----------------------------------------------------------------*/ protected final void fillHoles(int index) { for (int i = lastSet + 1; i < index; i++) { setBytes(i, emptyByteArray, 0, emptyByteArray.length); } lastSet = index - 1; } protected final void setBytes(int index, byte[] value, int start, int length) { /* end offset of current last element in the vector. this will * be the start offset of new element we are trying to store. 
*/ final int startOffset = getStartOffset(index); /* set new end offset */ offsetBuffer.setInt((index + 1) * OFFSET_WIDTH, startOffset + length); /* store the var length data in value buffer */ valueBuffer.setBytes(startOffset, value, start, length); } /** * Gets the starting offset of a record, given its index. * This method is deprecated. Please use {@link BaseVariableWidthVector#getStartOffset(int)} instead. * @param index index of the record. * @return the starting offset of the record. */ @Deprecated protected final int getstartOffset(int index) { return getStartOffset(index); } public final int getStartOffset(int index) { return offsetBuffer.getInt(index * OFFSET_WIDTH); } protected final void handleSafe(int index, int dataLength) { /* * IMPORTANT: * value buffer for variable length vectors moves independent * of the companion validity and offset buffers. This is in * contrast to what we have for fixed width vectors. * * Here there is no concept of getValueCapacity() in the * data stream. getValueCapacity() is applicable only to validity * and offset buffers. * * So even though we may have setup an initial capacity of 1024 * elements in the vector, it is quite possible * that we need to reAlloc() the data buffer when we are setting * the 5th element in the vector simply because previous * variable length elements have exhausted the buffer capacity. * However, we really don't need to reAlloc() validity and * offset buffers until we try to set the 1025th element * This is why we do a separate check for safe methods to * determine which buffer needs reallocation. */ while (index >= getValueCapacity()) { reallocValidityAndOffsetBuffers(); } final int startOffset = lastSet < 0 ? 0 : getStartOffset(lastSet + 1); while (valueBuffer.capacity() < (startOffset + dataLength)) { reallocDataBuffer(); } } /** * Method used by Json Writer to read a variable width element from * the variable width vector and write to Json. * *

This method should not be used externally. * * @param data buffer storing the variable width vector elements * @param offset buffer storing the offsets of variable width vector elements * @param index position of the element in the vector * @return array of bytes */ public static byte[] get(final ArrowBuf data, final ArrowBuf offset, int index) { final int currentStartOffset = offset.getInt(index * OFFSET_WIDTH); final int dataLength = offset.getInt((index + 1) * OFFSET_WIDTH) - currentStartOffset; final byte[] result = new byte[dataLength]; data.getBytes(currentStartOffset, result, 0, dataLength); return result; } /** * Method used by Json Reader to explicitly set the offsets of the variable * width vector data. The method takes care of allocating the memory for * offsets if the caller hasn't done so. * *

This method should not be used externally. * * @param buffer ArrowBuf to store offsets for variable width elements * @param allocator memory allocator * @param valueCount number of elements * @param index position of the element * @param value offset of the element * @return buffer holding the offsets */ public static ArrowBuf set(ArrowBuf buffer, BufferAllocator allocator, int valueCount, int index, int value) { if (buffer == null) { buffer = allocator.buffer(valueCount * OFFSET_WIDTH); } buffer.setInt(index * OFFSET_WIDTH, value); if (index == (valueCount - 1)) { buffer.writerIndex(valueCount * OFFSET_WIDTH); } return buffer; } /** * Copy a cell value from a particular index in source vector to a particular * position in this vector. * * @param fromIndex position to copy from in source vector * @param thisIndex position to copy to in this vector * @param from source vector */ @Override public void copyFrom(int fromIndex, int thisIndex, ValueVector from) { Preconditions.checkArgument(this.getMinorType() == from.getMinorType()); if (from.isNull(fromIndex)) { fillHoles(thisIndex); BitVectorHelper.unsetBit(this.validityBuffer, thisIndex); final int copyStart = offsetBuffer.getInt(thisIndex * OFFSET_WIDTH); offsetBuffer.setInt((thisIndex + 1) * OFFSET_WIDTH, copyStart); } else { final int start = from.getOffsetBuffer().getInt(fromIndex * OFFSET_WIDTH); final int end = from.getOffsetBuffer().getInt((fromIndex + 1) * OFFSET_WIDTH); final int length = end - start; fillHoles(thisIndex); BitVectorHelper.setBit(this.validityBuffer, thisIndex); final int copyStart = offsetBuffer.getInt(thisIndex * OFFSET_WIDTH); from.getDataBuffer().getBytes(start, this.valueBuffer, copyStart, length); offsetBuffer.setInt((thisIndex + 1) * OFFSET_WIDTH, copyStart + length); } lastSet = thisIndex; } /** * Same as {@link #copyFrom(int, int, ValueVector)} except that * it handles the case when the capacity of the vector needs to be expanded * before copy. 
* * @param fromIndex position to copy from in source vector * @param thisIndex position to copy to in this vector * @param from source vector */ @Override public void copyFromSafe(int fromIndex, int thisIndex, ValueVector from) { Preconditions.checkArgument(this.getMinorType() == from.getMinorType()); if (from.isNull(fromIndex)) { handleSafe(thisIndex, 0); fillHoles(thisIndex); BitVectorHelper.unsetBit(this.validityBuffer, thisIndex); final int copyStart = offsetBuffer.getInt(thisIndex * OFFSET_WIDTH); offsetBuffer.setInt((thisIndex + 1) * OFFSET_WIDTH, copyStart); } else { final int start = from.getOffsetBuffer().getInt(fromIndex * OFFSET_WIDTH); final int end = from.getOffsetBuffer().getInt((fromIndex + 1) * OFFSET_WIDTH); final int length = end - start; handleSafe(thisIndex, length); fillHoles(thisIndex); BitVectorHelper.setBit(this.validityBuffer, thisIndex); final int copyStart = offsetBuffer.getInt(thisIndex * OFFSET_WIDTH); from.getDataBuffer().getBytes(start, this.valueBuffer, copyStart, length); offsetBuffer.setInt((thisIndex + 1) * OFFSET_WIDTH, copyStart + length); } lastSet = thisIndex; } @Override public ArrowBufPointer getDataPointer(int index) { return getDataPointer(index, new ArrowBufPointer()); } @Override public ArrowBufPointer getDataPointer(int index, ArrowBufPointer reuse) { if (isNull(index)) { reuse.set(null, 0, 0); } else { int offset = offsetBuffer.getInt(index * OFFSET_WIDTH); int length = offsetBuffer.getInt((index + 1) * OFFSET_WIDTH) - offset; reuse.set(valueBuffer, offset, length); } return reuse; } @Override public int hashCode(int index) { return hashCode(index, null); } @Override public int hashCode(int index, ArrowBufHasher hasher) { if (isNull(index)) { return ArrowBufPointer.NULL_HASH_CODE; } final int start = getStartOffset(index); final int end = getStartOffset(index + 1); return ByteFunctionHelpers.hash(hasher, this.getDataBuffer(), start, end); } @Override public OUT accept(VectorVisitor visitor, IN value) { return 
visitor.visit(this, value); } /** * Gets the ending offset of a record, given its index. */ public final int getEndOffset(int index) { return offsetBuffer.getInt((index + 1) * OFFSET_WIDTH); } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy