com.facebook.hive.orc.StringDictionaryEncoder Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of hive-dwrf Show documentation
DWRF file format for Hive
There is a newer version: 0.18.9
//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.facebook.hive.orc;

import com.google.common.primitives.Ints;
import io.airlift.slice.Slice;
import io.airlift.slice.Slices;
import it.unimi.dsi.fastutil.ints.IntArrays;
import it.unimi.dsi.fastutil.ints.IntComparator;
import org.apache.hadoop.io.Text;

import java.io.IOException;
import java.io.OutputStream;

/**
 * A fast, memory-efficient implementation of dictionary encoding for strings. The strings are stored as
 * UTF-8 bytes, together with an offset/length for each entry.
 */
class StringDictionaryEncoder extends DictionaryEncoder {
  // Holds the bytes of every dictionary entry; entries are located through the offsets array
  private final DynamicByteArray byteArray = new DynamicByteArray();

  // The following int arrays represent entries in the dictionary
  // int[]'s were used instead of DynamicIntArrays because they are
  // accessed a lot, and DynamicIntArrays perform poorly
  //
  // All five arrays are parallel and indexed by (dictionary position + 1); slot 0 is never used
  // (see TextCompressedHashSet).
  //   offsets[i]      - value returned by byteArray.add for entry i, used as the entry's start
  //   hashcodes[i]    - Text.hashCode() of entry i
  //   nexts[i]        - index of the next entry in the same hash chain, 0 terminates the chain
  //   counts[i]       - number of times entry i has been added
  //   indexStrides[i] - the index stride passed in when entry i was first added
  private int[] offsets = new int[DynamicIntArray.DEFAULT_SIZE];
  private int[] hashcodes = new int[DynamicIntArray.DEFAULT_SIZE];
  private int[] nexts = new int[DynamicIntArray.DEFAULT_SIZE];
  private int[] counts = new int[DynamicIntArray.DEFAULT_SIZE];
  private int[] indexStrides = new int[DynamicIntArray.DEFAULT_SIZE];

  private final TextCompressedHashSet htDictionary = new TextCompressedHashSet();

  // The number of elements in the dictionary
  private int numElements = 0;

  // When true, the sort comparator groups entries seen exactly once by their index stride
  private final boolean sortByStride;

  // A custom implementation of a hash set, based on fastutil's ObjectOpenCustomHashSet
  // Takes in indices into the int arrays of the surrounding class
  // NOTE: For the sake of speed and simplicity, assumes index 0 is never used (means we don't
  // have to initialize all the values in the set to -1 or some other impossible value)
  //
  // Collisions are resolved by chaining: key[bucket] holds the index at the head of the chain and
  // the rest of the chain is threaded through the enclosing class's nexts array.
  private class TextCompressedHashSet {
    // Essentially an alias for length - 1
    private int mask;
    // An array of indices into the int[]'s in StringDictionaryEncoder
    private int[] key;
    // The number of values in the dictionary
    private int numValues;
    // The point beyond which the dictionary needs to grow
    private int maxFill;
    // The current size of the dictionary
    private int length;

    // The fraction of the dictionary size beyond which it should grow
    private static final float LOAD_FACTOR = 0.75f;
    // The number of entries the set should be initialized to expect
    private static final int MIN_EXPECTED = 128;

    public TextCompressedHashSet() {
      this.length = it.unimi.dsi.fastutil.HashCommon.arraySize(MIN_EXPECTED, LOAD_FACTOR);
      this.mask = length - 1;
      this.maxFill = it.unimi.dsi.fastutil.HashCommon.maxFill(length, LOAD_FACTOR);
      this.key = new int[length];
    }

    /**
     * Adds the entry at index k to the set unless an equal entry is already present.
     * hashcodes[k] must already be populated by the caller.
     *
     * @param newKey the bytes of the value being added, compared against existing entries
     * @param k the index the value will occupy in the enclosing class's arrays if it is new
     * @return the index of the already-present equal entry (its count is incremented), or 0 if
     *         the value was not present and index k has been inserted
     */
    public int add(Slice newKey, int k) {
      final int hash = hashcodes[k];
      // Compute the bucket the value at k is in
      final int pos = it.unimi.dsi.fastutil.HashCommon.murmurHash3(hash) & mask;
      final int head = key[pos];
      int other = head;
      int prev = 0;
      // Iterate over the chain in that bucket
      while (other != 0) {
        // Compare the hashcodes as a quick way to rule out some results
        if (hash == hashcodes[other]
            && equalsValue(newKey, offsets[other], getEnd(other) - offsets[other])) {
          // If the value is found to already exist, and it's not already at the head of the chain
          // move it there to speed up adding this value in the future.
          if (other != head) {
            nexts[prev] = nexts[other];
            nexts[other] = head;
            key[pos] = other;
          }
          counts[other]++;
          return other;
        }
        prev = other;
        other = nexts[other];
      }

      // If it's not already in the bucket add it at the front
      nexts[k] = head;
      key[pos] = k;
      counts[k] = 1;
      // Check if it's necessary to rehash
      if (++numValues >= maxFill) {
        rehash(it.unimi.dsi.fastutil.HashCommon.arraySize(numValues + 1, LOAD_FACTOR));
      }
      return 0;
    }

    /**
     * Redistributes every index currently in the set over a new bucket array of size newN.
     *
     * @param newN the new number of buckets; newN - 1 is used as the bucket mask
     */
    private void rehash(final int newN) {
      final int[] oldKey = this.key;
      final int newMask = newN - 1;
      final int[] newKey = new int[newN];
      int i = 0;
      // Iterate over all values in the set and rehash them
      for (int remaining = numValues; remaining != 0;) {
        int k;
        // Skip empty buckets
        while ((k = oldKey[i]) == 0) {
          i++;
        }
        do {
          // Compute the new hash
          final int pos = it.unimi.dsi.fastutil.HashCommon.murmurHash3(hashcodes[k]) & newMask;
          // Store the next value in the current bucket of the old set; it must be read before
          // nexts[k] is overwritten below
          final int next = nexts[k];
          // Add the value to the beginning of the new bucket
          nexts[k] = newKey[pos];
          newKey[pos] = k;
          remaining--;
          // Repeat for each value in the old bucket
          k = next;
        } while (k != 0);
        i++;
      }
      length = newN;
      mask = newMask;
      maxFill = it.unimi.dsi.fastutil.HashCommon.maxFill(length, LOAD_FACTOR);
      this.key = newKey;
    }

    /**
     * Empties the set by zeroing every bucket. The bucket array keeps its current size.
     */
    public void clear() {
      if (numValues == 0) {
        return;
      }

      numValues = 0;
      // Set all values to 0
      for (int i = 0; i < length; i++) {
        key[i] = 0;
      }
    }

    /**
     * @return the number of values currently in the set
     */
    public int size() {
      return numValues;
    }
  }

  /**
   * Orders dictionary entries, identified by their index into the enclosing class's arrays.
   * Entries are compared with byteArray.compare, except that when sortByStride is set, entries
   * with a count of exactly 1 are ordered before all other entries and, among themselves, by
   * index stride first.
   */
  public class TextPositionComparator implements IntComparator {
    @Override
    public int compare(Integer k1, Integer k2) {
      return this.compare(k1.intValue(), k2.intValue());
    }

    @Override
    public int compare(int k1, int k2) {
      if (sortByStride) {
        final boolean k1Single = counts[k1] == 1;
        final boolean k2Single = counts[k2] == 1;

        // Exactly one of the two has a count of 1: the lower count (the 1) sorts first
        if (k1Single != k2Single) {
          return Ints.compare(counts[k1], counts[k2]);
        }

        // Both have a count of 1: order by index stride when the strides differ
        if (k1Single && k2Single && indexStrides[k1] != indexStrides[k2]) {
          return Ints.compare(indexStrides[k1], indexStrides[k2]);
        }
      }

      final int k1Length = getEnd(k1) - offsets[k1];
      final int k2Length = getEnd(k2) - offsets[k2];
      return byteArray.compare(offsets[k1], k1Length, offsets[k2], k2Length);
    }
  }

  public StringDictionaryEncoder() {
    super();
    this.sortByStride = false;
  }

  public StringDictionaryEncoder(boolean sortKeys, boolean sortByStride) {
    super(sortKeys);
    this.sortByStride = sortByStride;
  }

  /**
   * Adds a value to the dictionary, or increments its count if it is already present.
   *
   * @param value the value to add
   * @param indexStride the index stride recorded for the value if it is new to the dictionary
   * @return the zero-based position of the value in the dictionary, in order of first insertion
   */
  public int add(Text value, int indexStride) {
    final int len = value.getLength();
    // See the comment on TextCompressedHashSet
    // This intentionally skips index 0
    final int newKeyIndex = numElements + 1;
    // The hash set reads hashcodes[newKeyIndex], so it has to be stored before the lookup
    hashcodes[newKeyIndex] = value.hashCode();
    final Slice newKeySlice = Slices.wrappedBuffer(value.getBytes(), 0, len);
    final int existing = htDictionary.add(newKeySlice, newKeyIndex);
    if (existing != 0) {
      // Already present: translate the one-based array index to a zero-based position
      return existing - 1;
    }

    // update count of hashset keys
    final int valRow = numElements;
    numElements += 1;
    // If we've outgrown the arrays, resize them. Growing while there is still one free slot
    // guarantees the next call can write hashcodes[numElements + 1] before any resize check.
    if (newKeyIndex + 1 >= offsets.length) {
      offsets = getDoubleSizeArray(offsets);
      hashcodes = getDoubleSizeArray(hashcodes);
      nexts = getDoubleSizeArray(nexts);
      counts = getDoubleSizeArray(counts);
      indexStrides = getDoubleSizeArray(indexStrides);
    }
    // set current key offset and length
    offsets[newKeyIndex] = byteArray.add(value.getBytes(), 0, len);
    indexStrides[newKeyIndex] = indexStride;
    return valRow;
  }

  /**
   * @return a new array twice as long as the given one, with the given array's contents copied
   *         to the front
   */
  private int[] getDoubleSizeArray(int[] array) {
    int[] newArray = new int[array.length * 2];
    System.arraycopy(array, 0, newArray, 0, array.length);
    return newArray;
  }

  /**
   * Returns the end position of the entry at index pos: the offset of the following entry, or
   * byteArray.size() when no entry follows it.
   */
  private int getEnd(int pos) {
    if (pos + 1 > numElements) {
      return byteArray.size();
    }

    return offsets[pos + 1];
  }

  /**
   * Compares the whole of the given key against the region of byteArray described by offset and
   * length.
   */
  protected boolean equalsValue(Slice key, int offset, int length) {
    return byteArray.equals(key, 0, key.length(), offset, length);
  }

  // A reusable view of a single dictionary entry handed to visitors. It is repositioned with
  // setOriginalPosition before each visit, and getKey() returns the same Text instance each time.
  private class VisitorContextImpl implements VisitorContext {
    private int originalPosition;
    private int start;
    private int length;
    private int count;
    private int indexStride;
    private final Text text = new Text();

    /**
     * Points this context at an entry.
     * @param pos the one-based index of the entry in the enclosing class's arrays
     */
    public void setOriginalPosition(int pos) {
      // Expose the zero-based position, matching the values returned by add
      originalPosition = pos - 1;
      start = offsets[pos];
      length = getEnd(pos) - offsets[pos];
      count = counts[pos];
      indexStride = indexStrides[pos];
    }

    public int getOriginalPosition() {
      return originalPosition;
    }

    public Text getKey() {
      byteArray.setText(text, start, length);
      return text;
    }

    public void writeBytes(OutputStream out) throws IOException {
      byteArray.write(out, start, length);
    }

    public int getLength() {
      return length;
    }

    public int getCount() {
      return count;
    }

    public int getIndexStride() {
      return indexStride;
    }
  }

  /**
   * Applies the visitor to every entry, reusing the given context for each one. Entries are
   * visited in TextPositionComparator order when sortKeys is set, otherwise in insertion order.
   */
  private void visitDictionary(Visitor visitor, VisitorContextImpl context) throws IOException {
    int[] keysArray = null;
    if (sortKeys) {
      // Sort the one-based entry indices rather than the entries themselves
      keysArray = new int[numElements];
      for (int idx = 0; idx < numElements; idx++) {
        keysArray[idx] = idx + 1;
      }
      IntArrays.quickSort(keysArray, new TextPositionComparator());
    }

    for (int pos = 0; pos < numElements; pos++) {
      context.setOriginalPosition(keysArray == null ? pos + 1 : keysArray[pos]);
      visitor.visit(context);
    }
  }

  /**
   * Visit all of the entries in the dictionary, in sorted order if sortKeys is set and in
   * insertion order otherwise.
   * @param visitor the action to be applied to each entry
   * @throws IOException
   */
  public void visit(Visitor visitor) throws IOException {
    visitDictionary(visitor, new VisitorContextImpl());
  }

  /**
   * Copies the bytes of a dictionary entry into the given Text.
   * @param result the Text to populate
   * @param originalPosition the zero-based position of the entry, as returned by add
   */
  public void getText(Text result, int originalPosition) {
    final int index = originalPosition + 1;
    byteArray.setText(result, offsets[index], getEnd(index) - offsets[index]);
  }

  /**
   * Reset the table to empty.
   */
  @Override
  public void clear() {
    byteArray.clear();
    htDictionary.clear();
    offsets = new int[DynamicIntArray.DEFAULT_SIZE];
    hashcodes = new int[DynamicIntArray.DEFAULT_SIZE];
    nexts = new int[DynamicIntArray.DEFAULT_SIZE];
    counts = new int[DynamicIntArray.DEFAULT_SIZE];
    indexStrides = new int[DynamicIntArray.DEFAULT_SIZE];
    numElements = 0;
  }

  /**
   * Get the size of the character data in the table.
   * @return the bytes used by the table
   */
  public long getCharacterSize() {
    return byteArray.getSizeInBytes();
  }

  public int getUncompressedLength() {
    // The amount of memory used by entries in the dictionary
    return byteArray.size();
  }

  /**
   * Calculate the approximate size in memory.
   * @return the number of bytes used in storing the dictionary.
   */
  public long getSizeInBytes() {
    // All arithmetic is done in long: with five parallel arrays the int sum would overflow

    // one int reference per value in the hash set
    final long refSizes = htDictionary.size() * 4L;

    // five parallel int arrays describing the entries
    final long textCompressedSizes = 4L * ((long) offsets.length + hashcodes.length
        + nexts.length + counts.length + indexStrides.length);

    // bytes in the characters
    return getCharacterSize() + refSizes + textCompressedSizes;
  }

  /**
   * Get the number of elements in the set.
   */
  @Override
  public int size() {
    return numElements;
  }

}