All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.arrow.vector.dictionary.DictionaryHashTable Maven / Gradle / Ivy

The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.arrow.vector.dictionary;

import org.apache.arrow.memory.util.hash.ArrowBufHasher;
import org.apache.arrow.memory.util.hash.SimpleHasher;
import org.apache.arrow.vector.ValueVector;
import org.apache.arrow.vector.compare.Range;
import org.apache.arrow.vector.compare.RangeEqualsVisitor;

/**
 * HashTable used for Dictionary encoding. It holds two vectors (the vector to encode and dictionary vector)
 * It stores the index in dictionary vector and for a given index in encode vector,
 * it could return dictionary index.
 */
public class DictionaryHashTable {

  /**
   * Represents a null value in map.
   */
  static final int NULL_VALUE = -1;

  /**
   * The default initial capacity - MUST be a power of two.
   */
  static final int DEFAULT_INITIAL_CAPACITY = 1 << 4;

  /**
   * The maximum capacity, used if a higher value is implicitly specified
   * by either of the constructors with arguments.
   */
  static final int MAXIMUM_CAPACITY = 1 << 30;

  /**
   * The load factor used when none specified in constructor.
   */
  static final float DEFAULT_LOAD_FACTOR = 0.75f;

  static final DictionaryHashTable.Entry[] EMPTY_TABLE = {};

  /**
   * The table, initialized on first use, and resized as
   * necessary. When allocated, length is always a power of two.
   */
  transient DictionaryHashTable.Entry[] table = EMPTY_TABLE;

  /**
   * The number of key-value mappings contained in this map.
   */
  transient int size;

  /**
   * The next size value at which to resize (capacity * load factor).
   */
  int threshold;

  /**
   * The load factor for the hash table.
   */
  final float loadFactor;

  private final ValueVector dictionary;

  private final ArrowBufHasher hasher;

  /**
   * Constructs an empty map with the specified initial capacity and load factor.
   */
  public DictionaryHashTable(int initialCapacity, ValueVector dictionary, ArrowBufHasher hasher) {
    if (initialCapacity < 0) {
      throw new IllegalArgumentException("Illegal initial capacity: " +
          initialCapacity);
    }
    if (initialCapacity > MAXIMUM_CAPACITY) {
      initialCapacity = MAXIMUM_CAPACITY;
    }
    this.loadFactor = DEFAULT_LOAD_FACTOR;
    this.threshold = initialCapacity;

    this.dictionary = dictionary;

    this.hasher = hasher;

    // build hash table
    for (int i = 0; i < this.dictionary.getValueCount(); i++) {
      put(i);
    }
  }

  public DictionaryHashTable(ValueVector dictionary, ArrowBufHasher hasher) {
    this(DEFAULT_INITIAL_CAPACITY, dictionary, hasher);
  }

  public DictionaryHashTable(ValueVector dictionary) {
    this(dictionary, SimpleHasher.INSTANCE);
  }

  /**
   * Compute the capacity with given threshold and create init table.
   */
  private void inflateTable(int threshold) {
    int capacity = roundUpToPowerOf2(threshold);
    this.threshold = (int) Math.min(capacity * loadFactor, MAXIMUM_CAPACITY + 1);
    table = new DictionaryHashTable.Entry[capacity];
  }

  /**
   * Computes the storage location in an array for the given hashCode.
   */
  static int indexFor(int h, int length) {
    return h & (length - 1);
  }

  /**
   * Returns a power of two size for the given size.
   */
  static final int roundUpToPowerOf2(int size) {
    int n = size - 1;
    n |= n >>> 1;
    n |= n >>> 2;
    n |= n >>> 4;
    n |= n >>> 8;
    n |= n >>> 16;
    return (n < 0) ? 1 : (n >= MAXIMUM_CAPACITY) ? MAXIMUM_CAPACITY : n + 1;
  }

  /**
   * get the corresponding dictionary index with the given index in vector which to encode.
   * @param indexInArray index in vector.
   * @return dictionary vector index or -1 if no value equals.
   */
  public int getIndex(int indexInArray, ValueVector toEncode) {
    int hash = toEncode.hashCode(indexInArray, this.hasher);
    int index = indexFor(hash, table.length);

    RangeEqualsVisitor equalVisitor = new RangeEqualsVisitor(dictionary, toEncode, null);
    Range range = new Range(0, 0, 1);

    for (DictionaryHashTable.Entry e = table[index]; e != null ; e = e.next) {
      if (e.hash == hash) {
        int dictIndex = e.index;

        range = range.setRightStart(indexInArray)
            .setLeftStart(dictIndex);
        if (equalVisitor.rangeEquals(range)) {
          return dictIndex;
        }
      }
    }
    return NULL_VALUE;
  }

  /**
   * put the index of dictionary vector to build hash table.
   */
  private void put(int indexInDictionary) {
    if (table == EMPTY_TABLE) {
      inflateTable(threshold);
    }

    int hash = dictionary.hashCode(indexInDictionary, this.hasher);
    int i = indexFor(hash, table.length);
    for (DictionaryHashTable.Entry e = table[i]; e != null; e = e.next) {
      if (e.hash == hash && e.index == indexInDictionary) {
        //already has this index, return
        return;
      }
    }

    addEntry(hash, indexInDictionary, i);
  }

  /**
   * Create a new Entry at the specific position of table.
   */
  void createEntry(int hash, int index, int bucketIndex) {
    DictionaryHashTable.Entry e = table[bucketIndex];
    table[bucketIndex] = new DictionaryHashTable.Entry(hash, index, e);
    size++;
  }

  /**
   * Add Entry at the specified location of the table.
   */
  void addEntry(int hash, int index, int bucketIndex) {
    if ((size >= threshold) && (null != table[bucketIndex])) {
      resize(2 * table.length);
      bucketIndex = indexFor(hash, table.length);
    }

    createEntry(hash, index, bucketIndex);
  }

  /**
   * Resize table with given new capacity.
   */
  void resize(int newCapacity) {
    DictionaryHashTable.Entry[] oldTable = table;
    int oldCapacity = oldTable.length;
    if (oldCapacity == MAXIMUM_CAPACITY) {
      threshold = Integer.MAX_VALUE;
      return;
    }

    DictionaryHashTable.Entry[] newTable = new DictionaryHashTable.Entry[newCapacity];
    transfer(newTable);
    table = newTable;
    threshold = (int) Math.min(newCapacity * loadFactor, MAXIMUM_CAPACITY + 1);
  }

  /**
   * Transfer entries into new table from old table.
   * @param newTable new table
   */
  void transfer(DictionaryHashTable.Entry[] newTable) {
    int newCapacity = newTable.length;
    for (DictionaryHashTable.Entry e : table) {
      while (null != e) {
        DictionaryHashTable.Entry next = e.next;
        int i = indexFor(e.hash, newCapacity);
        e.next = newTable[i];
        newTable[i] = e;
        e = next;
      }
    }
  }

  /**
   * Returns the number of mappings in this Map.
   */
  public int size() {
    return size;
  }

  /**
   * Removes all elements from this map, leaving it empty.
   */
  public void clear() {
    size = 0;
    for (int i = 0; i < table.length; i++) {
      table[i] = null;
    }
  }

  /**
   * Class to keep dictionary index data within hash table.
   */
  static class Entry {
    //dictionary index
    int index;
    DictionaryHashTable.Entry next;
    int hash;

    Entry(int hash, int index, DictionaryHashTable.Entry next) {
      this.index = index;
      this.hash = hash;
      this.next = next;
    }

    public final int getIndex() {
      return this.index;
    }

    @Override
    public int hashCode() {
      return hash;
    }

    public final boolean equals(Object o) {
      if (!(o instanceof DictionaryHashTable.Entry)) {
        return false;
      }
      DictionaryHashTable.Entry e = (DictionaryHashTable.Entry) o;
      if (index == e.getIndex()) {
        return true;
      }
      return false;
    }
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy