All Downloads are FREE. Search and download functionalities are using the official Maven repository.

parquet.column.values.boundedint.BoundedIntValuesWriter Maven / Gradle / Ivy

/* 
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 * 
 *   http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package parquet.column.values.boundedint;

import static parquet.bytes.BytesInput.concat;
import static parquet.column.Encoding.RLE;
import parquet.Log;
import parquet.bytes.BytesInput;
import parquet.column.Encoding;
import parquet.column.values.ValuesWriter;
import parquet.column.values.bitpacking.BitPackingValuesWriter;
import parquet.io.ParquetEncodingException;

/**
 * This is a special ColumnWriter for the case when you need to write
 * integers in a known range. This is intended primarily for use with
 * repetition and definition levels, since the maximum value that will
 * be written is known a priori based on the schema. Assumption is that
 * the values written are between 0 and the bound, inclusive.
 *
 * This differs from {@link BitPackingValuesWriter} in that this also performs
 * run-length encoding of the data, so is useful when long runs of repeated
 * values are expected.
 */
class BoundedIntValuesWriter extends ValuesWriter {
  private static final Log LOG = Log.getLog(BoundedIntValuesWriter.class);

  private int currentValue = -1;
  private int currentValueCt = -1;
  private boolean currentValueIsRepeated = false;
  private boolean thereIsABufferedValue = false;
  private int shouldRepeatThreshold = 0;
  private int bitsPerValue;
  private BitWriter bitWriter;
  private boolean isFirst = true;

  private static final int[] byteToTrueMask = new int[8];
  static {
    int currentMask = 1;
    for (int i = 0; i < byteToTrueMask.length; i++) {
      byteToTrueMask[i] = currentMask;
      currentMask <<= 1;
    }
  }

  public BoundedIntValuesWriter(int bound, int initialCapacity, int pageSize) {
    if (bound == 0) {
      throw new ParquetEncodingException("Value bound cannot be 0. Use DevNullColumnWriter instead.");
    }
    this.bitWriter = new BitWriter(initialCapacity, pageSize);
    bitsPerValue = (int)Math.ceil(Math.log(bound + 1)/Math.log(2));
    shouldRepeatThreshold = (bitsPerValue + 9)/(1 + bitsPerValue);
    if (Log.DEBUG) LOG.debug("init column with bit width of " + bitsPerValue + " and repeat threshold of " + shouldRepeatThreshold);
  }

  @Override
  public long getBufferedSize() {
    // currentValue + currentValueCt = 8 bytes
    // shouldRepeatThreshold + bitsPerValue = 8 bytes
    // bitWriter = 8 bytes
    // currentValueIsRepeated + isFirst = 2 bytes (rounded to 8 b/c of word boundaries)
    return 32 + (bitWriter == null ? 0 : bitWriter.getMemSize());
  }

  // This assumes that the full state must be serialized, since there is no close method
  @Override
  public BytesInput getBytes() {
    serializeCurrentValue();
    BytesInput buf = bitWriter.finish();
    if (Log.DEBUG) LOG.debug("writing a buffer of size " + buf.size() + " + 4 bytes");
    // We serialize the length so that on deserialization we can
    // deserialize as we go, instead of having to load everything
    // into memory
    return concat(BytesInput.fromInt((int)buf.size()), buf);
  }

  @Override
  public void reset() {
    currentValue = -1;
    currentValueCt = -1;
    currentValueIsRepeated = false;
    thereIsABufferedValue = false;
    isFirst = true;
    bitWriter.reset();
  }

  @Override
  public void writeInteger(int val) {
    if (currentValue == val) {
      currentValueCt++;
      if (!currentValueIsRepeated && currentValueCt >= shouldRepeatThreshold) {
        currentValueIsRepeated = true;
      }
    } else {
      if (!isFirst) {
        serializeCurrentValue();
      } else {
        isFirst = false;
      }

      newCurrentValue(val);
    }
  }

  private void serializeCurrentValue() {
    if (thereIsABufferedValue) {
      if (currentValueIsRepeated) {
        bitWriter.writeBit(true);
        bitWriter.writeNBitInteger(currentValue, bitsPerValue);
        bitWriter.writeUnsignedVarint(currentValueCt);
      } else {
        for (int i = 0; i < currentValueCt; i++) {
          bitWriter.writeBit(false);
          bitWriter.writeNBitInteger(currentValue, bitsPerValue);
        }
      }
    }
    thereIsABufferedValue = false;
  }

  private void newCurrentValue(int val) {
    currentValue = val;
    currentValueCt = 1;
    currentValueIsRepeated = false;
    thereIsABufferedValue = true;
  }

  @Override
  public long getAllocatedSize() {
    return bitWriter.getCapacity();
  }

  @Override
  public Encoding getEncoding() {
    return RLE;
  }

  @Override
  public String memUsageString(String prefix) {
    return bitWriter.memUsageString(prefix);
  }

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy