All downloads are free. Search and download functionality uses the official Maven repository.

com.facebook.hive.orc.IntDictionaryEncoder Maven / Gradle / Ivy

There is a newer version: 0.18.9
Show newest version
//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.facebook.hive.orc;

import it.unimi.dsi.fastutil.ints.IntArrays;
import it.unimi.dsi.fastutil.ints.IntComparator;
import it.unimi.dsi.fastutil.longs.Long2IntOpenHashMap;

import java.io.IOException;
import java.io.OutputStream;

class IntDictionaryEncoder extends DictionaryEncoder {

  private long newKey;
  private int numElements = 0;
  private final int numBytes;
  private final boolean useVInts;

  protected final DynamicLongArray keys = new DynamicLongArray();
  protected final DynamicIntArray counts = new DynamicIntArray();
  protected Long2IntOpenHashMapWithByteSize dictionary = new Long2IntOpenHashMapWithByteSize();

  public IntDictionaryEncoder(int numBytes, boolean useVInts) {
    super();
    this.numBytes = numBytes;
    this.useVInts = useVInts;
  }

  public IntDictionaryEncoder(boolean sortKeys, int numBytes, boolean useVInts) {
    super(sortKeys);
    this.numBytes = numBytes;
    this.useVInts = useVInts;
  }

  public long getValue(int position) {
    return keys.get(position);
  }

  private class Long2IntOpenHashMapWithByteSize extends Long2IntOpenHashMap {
    private static final long serialVersionUID = 0L;

    public Long2IntOpenHashMapWithByteSize() {
      super();
    }

    public int getByteSize() {
      int size = key.length * 8 + value.length * 4 + used.length;

      // If we're close to the point where the dictionary is going to be rehashed, be pessimistic
      // and adjust the size assuming we will
      if (size + 5000 >= maxFill) {
        return (int) (size / f);
      }
      return size;
    }
  }

  /**
   *
   */
  public class LongPositionComparator implements IntComparator {
    @Override
    public int compare(Integer pos, Integer cmpPos) {
      return this.compare(pos.intValue(), cmpPos.intValue());
    }

    @Override
    public int compare(int pos, int cmpPos) {
      return compareValue(keys.get(pos), keys.get(cmpPos));
    }
  }

  public void visitDictionary(Visitor visitor, IntDictionaryEncoderVisitorContext context) throws IOException {
      int[] keysArray = null;
      if (sortKeys) {
        keysArray = new int[numElements];
        for (int idx = 0; idx < numElements; idx++) {
          keysArray[idx] = idx;
        }
        IntArrays.quickSort(keysArray, new LongPositionComparator());
      }
      for (int pos = 0; pos < numElements; pos++) {
        context.setOriginalPosition(keysArray == null? pos : keysArray[pos]);
        visitor.visit(context);
      }
      keysArray = null;
  }

  public void visit(Visitor visitor) throws IOException {
    visitDictionary(visitor, new IntDictionaryEncoderVisitorContext());
  }

  @Override
  public void clear() {
    keys.clear();
    counts.clear();
    dictionary = new Long2IntOpenHashMapWithByteSize();
    numElements = 0;
  }

  private int compareValue (long k, long cmpKey) {
    if (k > cmpKey) {
      return 1;
    } else if (k < cmpKey) {
      return -1;
    }
    return 0;
  }

  public int add (long value) {
    newKey = value;
    if (dictionary.containsKey(value)) {
      int index = dictionary.get(value);
      counts.increment(index, 1);
      return dictionary.get(value);
    } else {
      int valRow = numElements;
      numElements++;
      dictionary.put(value, valRow);
      keys.add(newKey);
      counts.add(1);
      return valRow;
    }
  }

  public class IntDictionaryEncoderVisitorContext implements VisitorContext {

    int originalPosition;
    public void setOriginalPosition(int pos) {
      originalPosition = pos;
    }
    public int getOriginalPosition() {
      return originalPosition;
    }

    public Long getKey() {
      return keys.get(originalPosition);
    }

    public void writeBytes(OutputStream outputStream) throws IOException {
      long cur = keys.get(originalPosition);
      SerializationUtils.writeIntegerType(outputStream, cur, numBytes, true, useVInts);
    }

    // TODO: this should be different
    public int getLength() {
      return 8;
    }

    public int getCount() {
      return counts.get(originalPosition);
    }
    @Override
    public int getIndexStride() {
      throw new UnsupportedOperationException("IntDictionaryEncoder does not currently track the" +
      		" index stride");
    }
  }

  public long getByteSize() {

    // Long2IntOpenHashMap stores per element:
    // key in long[] (8 bytes)
    // value in int[] (4 bytes)
    // whether each bucket was used or not in boolean [] (1 byte
    long posSizes = dictionary.getByteSize();

    return keys.getSizeInBytes() + counts.getSizeInBytes() +
        posSizes;
  }

  public int getUncompressedLength() {
    // The amount of memory used by entries in the dictionary
    return numElements * 8;
  }

  /**
   * Get the number of elements in the set.
   */
  @Override
  public int size() {
    return numElements;
  }

}





© 2015 - 2025 Weber Informatics LLC | Privacy Policy