All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.datasketches.tuple.CompactSketch Maven / Gradle / Ivy

Go to download

Core sketch algorithms used alone and by other Java repositories in the DataSketches library.

There is a newer version: 7.0.0
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.datasketches.tuple;

import java.lang.reflect.Array;
import java.nio.ByteOrder;

import org.apache.datasketches.ByteArrayUtil;
import org.apache.datasketches.Family;
import org.apache.datasketches.SketchesArgumentException;
import org.apache.datasketches.memory.Memory;

/**
 * CompactSketches are never created directly. They are created as a result of
 * the compact() method of an UpdatableSketch or as a result of the getResult()
 * method of a set operation like Union, Intersection or AnotB. CompactSketch
 * consists of a compact list (i.e. no intervening spaces) of hash values,
 * corresponding list of Summaries, and a value for theta. The lists may or may
 * not be ordered. CompactSketch is read-only.
 *
 * @param  type of Summary
 */
public class CompactSketch extends Sketch {
  private static final byte serialVersionWithSummaryClassNameUID = 1;
  private static final byte serialVersionUID = 2;

  private enum Flags { IS_BIG_ENDIAN, IS_EMPTY, HAS_ENTRIES, IS_THETA_INCLUDED }

  CompactSketch(final long[] keys, final S[] summaries, final long theta, final boolean isEmpty) {
    keys_ = keys;
    summaries_ = summaries;
    theta_ = theta;
    isEmpty_ = isEmpty;
  }

  /**
   * This is to create an instance of a CompactSketch given a serialized form
   *
   * @param mem Memory object with serialized CompactSketch
   * @param deserializer the SummaryDeserializer
   */
  @SuppressWarnings({"unchecked"})
  CompactSketch(final Memory mem, final SummaryDeserializer deserializer) {
    int offset = 0;
    final byte preambleLongs = mem.getByte(offset++);
    final byte version = mem.getByte(offset++);
    final byte familyId = mem.getByte(offset++);
    SerializerDeserializer.validateFamily(familyId, preambleLongs);
    if (version > serialVersionUID) {
      throw new SketchesArgumentException(
          "Unsupported serial version. Expected: " + serialVersionUID + " or lower, actual: " + version);
    }
    SerializerDeserializer
      .validateType(mem.getByte(offset++), SerializerDeserializer.SketchType.CompactSketch);
    final byte flags = mem.getByte(offset++);
    final boolean isBigEndian = (flags & (1 << Flags.IS_BIG_ENDIAN.ordinal())) > 0;
    if (isBigEndian ^ ByteOrder.nativeOrder().equals(ByteOrder.BIG_ENDIAN)) {
      throw new SketchesArgumentException("Byte order mismatch");
    }
    isEmpty_ = (flags & (1 << Flags.IS_EMPTY.ordinal())) > 0;
    final boolean isThetaIncluded = (flags & (1 << Flags.IS_THETA_INCLUDED.ordinal())) > 0;
    if (isThetaIncluded) {
      theta_ = mem.getLong(offset);
      offset += Long.BYTES;
    } else {
      theta_ = Long.MAX_VALUE;
    }
    final boolean hasEntries = (flags & (1 << Flags.HAS_ENTRIES.ordinal())) > 0;
    if (hasEntries) {
      int classNameLength = 0;
      if (version == serialVersionWithSummaryClassNameUID) {
        classNameLength = mem.getByte(offset++);
      }
      final int count = mem.getInt(offset);
      offset += Integer.BYTES;
      if (version == serialVersionWithSummaryClassNameUID) {
        offset += classNameLength;
      }
      keys_ = new long[count];
      for (int i = 0; i < count; i++) {
        keys_[i] = mem.getLong(offset);
        offset += Long.BYTES;
      }
      for (int i = 0; i < count; i++) {
        final Memory memRegion = mem.region(offset, mem.getCapacity() - offset);
        final DeserializeResult result = deserializer.heapifySummary(memRegion);
        final S summary = result.getObject();
        offset += result.getSize();
        if (summaries_ == null) {
          summaries_ = (S[]) Array.newInstance(summary.getClass(), count);
        }
        summaries_[i] = summary;
      }
    }
  }

  @Override
  public int getRetainedEntries() {
    return keys_ == null ? 0 : keys_.length;
  }

  // Layout of first 8 bytes:
  // Long || Start Byte Adr:
  // Adr:
  //      ||    7   |    6   |    5   |    4   |    3   |    2   |    1   |     0              |
  //  0   ||                          |  Flags | SkType | FamID  | SerVer |  Preamble_Longs    |
  @SuppressWarnings("null")
  @Override
  public byte[] toByteArray() {
    int summariesBytesLength = 0;
    byte[][] summariesBytes = null;
    final int count = getRetainedEntries();
    if (count > 0) {
      summariesBytes = new byte[count][];
      for (int i = 0; i < count; i++) {
        summariesBytes[i] = summaries_[i].toByteArray();
        summariesBytesLength += summariesBytes[i].length;
      }
    }

    int sizeBytes =
        Byte.BYTES // preamble longs
      + Byte.BYTES // serial version
      + Byte.BYTES // family id
      + Byte.BYTES // sketch type
      + Byte.BYTES; // flags
    final boolean isThetaIncluded = theta_ < Long.MAX_VALUE;
    if (isThetaIncluded) {
      sizeBytes += Long.BYTES; // theta
    }
    if (count > 0) {
      sizeBytes +=
        + Integer.BYTES // count
        + (Long.BYTES * count) + summariesBytesLength;
    }
    final byte[] bytes = new byte[sizeBytes];
    int offset = 0;
    bytes[offset++] = PREAMBLE_LONGS;
    bytes[offset++] = serialVersionUID;
    bytes[offset++] = (byte) Family.TUPLE.getID();
    bytes[offset++] = (byte) SerializerDeserializer.SketchType.CompactSketch.ordinal();
    final boolean isBigEndian = ByteOrder.nativeOrder().equals(ByteOrder.BIG_ENDIAN);
    bytes[offset++] = (byte) (
      (isBigEndian ? 1 << Flags.IS_BIG_ENDIAN.ordinal() : 0)
      | (isEmpty_ ? 1 << Flags.IS_EMPTY.ordinal() : 0)
      | (count > 0 ? 1 << Flags.HAS_ENTRIES.ordinal() : 0)
      | (isThetaIncluded ? 1 << Flags.IS_THETA_INCLUDED.ordinal() : 0)
    );
    if (isThetaIncluded) {
      ByteArrayUtil.putLongLE(bytes, offset, theta_);
      offset += Long.BYTES;
    }
    if (count > 0) {
      ByteArrayUtil.putIntLE(bytes, offset, getRetainedEntries());
      offset += Integer.BYTES;
      for (int i = 0; i < count; i++) {
        ByteArrayUtil.putLongLE(bytes, offset, keys_[i]);
        offset += Long.BYTES;
      }
      for (int i = 0; i < count; i++) {
        System.arraycopy(summariesBytes[i], 0, bytes, offset, summariesBytes[i].length);
        offset += summariesBytes[i].length;
      }
    }
    return bytes;
  }

}