All downloads are free. Search and download functionality uses the official Maven repository.

org.apache.datasketches.hll.Union Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.datasketches.hll;

import static java.lang.Math.min;
import static org.apache.datasketches.hll.CurMode.HLL;
import static org.apache.datasketches.hll.HllUtil.EMPTY;
import static org.apache.datasketches.hll.TgtHllType.HLL_8;

import org.apache.datasketches.SketchesArgumentException;
import org.apache.datasketches.memory.Memory;
import org.apache.datasketches.memory.WritableMemory;

/**
 * This performs union operations for HLL sketches. This union operator is configured with a
 * lgMaxK instead of the normal lgConfigK.
 *
 * 

This union operator does permit the unioning of sketches with different values of * lgConfigK. The user should be aware that the resulting accuracy of a sketch returned * at the end of the unioning process will be a function of the smallest of lgMaxK and * lgConfigK that the union operator has seen. * *

This union operator also permits unioning of any of the three different target HllSketch * types. * *

Although the API for this union operator parallels many of the methods of the * HllSketch, the behavior of the union operator has some fundamental differences. * *

First, the user cannot specify the {@link TgtHllType} as an input parameter. * Instead, it is specified for the sketch returned with {@link #getResult(TgtHllType)}. * *

Second, the internal effective value of log-base-2 of K for the union operation can * change dynamically based on the smallest lgConfigK that the union operation has seen. * * @author Lee Rhodes * @author Kevin Lang */ public class Union extends BaseHllSketch { final int lgMaxK; private final HllSketch gadget; /** * Construct this Union operator with the default maximum log-base-2 of K. */ public Union() { this.lgMaxK = HllSketch.DEFAULT_LG_K; gadget = new HllSketch(lgMaxK, HLL_8); } /** * Construct this Union operator with a given maximum log-base-2 of K. * @param lgMaxK the desired maximum log-base-2 of K. This value must be * between 7 and 21 inclusively. */ public Union(final int lgMaxK) { this.lgMaxK = HllUtil.checkLgK(lgMaxK); gadget = new HllSketch(lgMaxK, HLL_8); } /** * Construct this Union operator with a given maximum log-base-2 of K and the given * WritableMemory as the destination for this Union. This WritableMemory is usually configured * for off-heap memory. What remains on the java heap is a thin wrapper object that reads and * writes to the given WritableMemory. * *

The given dstMem is checked for the required capacity as determined by * {@link HllSketch#getMaxUpdatableSerializationBytes(int, TgtHllType)}. * @param lgMaxK the desired maximum log-base-2 of K. This value must be * between 7 and 21 inclusively. * @param dstMem the destination memory for the sketch. */ public Union(final int lgMaxK, final WritableMemory dstMem) { this.lgMaxK = HllUtil.checkLgK(lgMaxK); gadget = new HllSketch(lgMaxK, HLL_8, dstMem); } Union(final HllSketch sketch) { lgMaxK = sketch.getLgConfigK(); final TgtHllType tgtHllType = sketch.getTgtHllType(); if (tgtHllType != TgtHllType.HLL_8) { throw new SketchesArgumentException("Union can only wrap HLL_8 sketches."); } gadget = sketch; } /** * Construct a union operator populated with the given byte array image of an HllSketch. * @param byteArray the given byte array * @return a union operator populated with the given byte array image of an HllSketch. */ public static final Union heapify(final byte[] byteArray) { return heapify(Memory.wrap(byteArray)); } /** * Construct a union operator populated with the given Memory image of an HllSketch. * @param mem the given Memory * @return a union operator populated with the given Memory image of an HllSketch. */ public static final Union heapify(final Memory mem) { final int lgK = HllUtil.checkLgK(mem.getByte(PreambleUtil.LG_K_BYTE)); final HllSketch sk = HllSketch.heapify(mem); final Union union = new Union(lgK); union.update(sk); return union; } /** * Wraps the given WritableMemory, which must be a image of a valid updatable HLL_8 sketch, * and may have data. What remains on the java heap is a * thin wrapper object that reads and writes to the given WritableMemory, which, depending on * how the user configures the WritableMemory, may actually reside on the Java heap or off-heap. * *

The given dstMem is checked for the required capacity as determined by * {@link HllSketch#getMaxUpdatableSerializationBytes(int, TgtHllType)}, and for the correct type. * @param wmem an writable image of a valid sketch with data. * @return a Union operator where the sketch data is in the given dstMem. */ public static final Union writableWrap(final WritableMemory wmem) { return new Union(HllSketch.writableWrap(wmem)); } @Override public double getCompositeEstimate() { return gadget.hllSketchImpl.getCompositeEstimate(); } @Override CurMode getCurMode() { return gadget.getCurMode(); } @Override public int getCompactSerializationBytes() { return gadget.getCompactSerializationBytes(); } @Override public double getEstimate() { return gadget.getEstimate(); } /** * Gets the effective lgConfigK for the union operator, which may be less than * lgMaxK. * @return the lgConfigK. */ @Override public int getLgConfigK() { return gadget.getLgConfigK(); } /** * Returns the maximum size in bytes that this union operator can grow to given a lgK. * * @param lgK The maximum Log2 of K for this union operator. This value must be * between 4 and 21 inclusively. * @return the maximum size in bytes that this union operator can grow to. */ public static int getMaxSerializationBytes(final int lgK) { return HllSketch.getMaxUpdatableSerializationBytes(lgK, TgtHllType.HLL_8); } @Override public double getLowerBound(final int numStdDev) { return gadget.getLowerBound(numStdDev); } /** * Return the result of this union operator as an HLL_4 sketch. * @return the result of this union operator as an HLL_4 sketch. 
*/ public HllSketch getResult() { return gadget.copyAs(HllSketch.DEFAULT_HLL_TYPE); } /** * Return the result of this union operator with the specified {@link TgtHllType} * @param tgtHllType the TgtHllType enum * @return the result of this union operator with the specified TgtHllType */ public HllSketch getResult(final TgtHllType tgtHllType) { return gadget.copyAs(tgtHllType); } @Override public TgtHllType getTgtHllType() { return TgtHllType.HLL_8; } @Override public int getUpdatableSerializationBytes() { return gadget.getUpdatableSerializationBytes(); } @Override public double getUpperBound(final int numStdDev) { return gadget.getUpperBound(numStdDev); } @Override public boolean isCompact() { return gadget.isCompact(); } @Override public boolean isEmpty() { return gadget.isEmpty(); } @Override public boolean isMemory() { return gadget.isMemory(); } @Override public boolean isOffHeap() { return gadget.isOffHeap(); } @Override boolean isOutOfOrderFlag() { return gadget.isOutOfOrderFlag(); } @Override public boolean isSameResource(final Memory mem) { return gadget.isSameResource(mem); } /** * Resets to empty and retains the current lgK, but does not change the configured value of * lgMaxK. */ @Override public void reset() { gadget.reset(); } /** * Gets the serialization of this union operator as a byte array in compact form, which is * designed to be heapified only. It is not directly updatable. * For the Union operator, this is the serialization of the internal state of * the union operator as a sketch. * @return the serialization of this union operator as a byte array. 
*/ @Override public byte[] toCompactByteArray() { return gadget.toCompactByteArray(); } @Override public byte[] toUpdatableByteArray() { return gadget.toUpdatableByteArray(); } @Override public String toString(final boolean summary, final boolean hllDetail, final boolean auxDetail, final boolean all) { return gadget.toString(summary, hllDetail, auxDetail, all); } /** * Update this union operator with the given sketch. * @param sketch the given sketch. */ public void update(final HllSketch sketch) { gadget.hllSketchImpl = unionImpl(sketch.hllSketchImpl, gadget.hllSketchImpl, lgMaxK); } @Override void couponUpdate(final int coupon) { if (coupon == EMPTY) { return; } gadget.hllSketchImpl = gadget.hllSketchImpl.couponUpdate(coupon); } // Union operator logic /** * Union the given source and destination sketches. This static method examines the state of * the current internal gadget and the incoming sketch and determines the optimum way to * perform the union. This may involve swapping, down-sampling, transforming, and / or * copying one of the arguments and may completely replace the internals of the union. * * @param incomingImpl the given incoming sketch, which may not be modified. * @param gadgetImpl the given gadget sketch, which must have a target of HLL_8 and may be * modified. * @param lgMaxK the maximum value of log2 K for this union. * @return the union of the two sketches in the form of the internal HllSketchImpl, which for * the union is always in HLL_8 form. */ private static final HllSketchImpl unionImpl(final HllSketchImpl incomingImpl, final HllSketchImpl gadgetImpl, final int lgMaxK) { assert gadgetImpl.getTgtHllType() == HLL_8; HllSketchImpl srcImpl = incomingImpl; //default HllSketchImpl dstImpl = gadgetImpl; //default if ((incomingImpl == null) || incomingImpl.isEmpty()) { return gadgetImpl; } final int hi2bits = (gadgetImpl.isEmpty()) ? 
3 : gadgetImpl.getCurMode().ordinal(); final int lo2bits = incomingImpl.getCurMode().ordinal(); final int sw = (hi2bits << 2) | lo2bits; switch (sw) { case 0: { //src: LIST, gadget: LIST final PairIterator srcItr = srcImpl.iterator(); //LIST while (srcItr.nextValid()) { dstImpl = dstImpl.couponUpdate(srcItr.getPair()); //assignment required } //whichever is True wins: dstImpl.putOutOfOrderFlag(dstImpl.isOutOfOrderFlag() | srcImpl.isOutOfOrderFlag()); break; } case 1: { //src: SET, gadget: LIST //consider a swap here final PairIterator srcItr = srcImpl.iterator(); //SET while (srcItr.nextValid()) { dstImpl = dstImpl.couponUpdate(srcItr.getPair()); //assignment required } dstImpl.putOutOfOrderFlag(true); //SET oooFlag is always true break; } case 2: { //src: HLL, gadget: LIST //swap so that src is gadget-LIST, tgt is HLL //use lgMaxK because LIST has effective K of 2^26 srcImpl = gadgetImpl; dstImpl = copyOrDownsampleHll(incomingImpl, lgMaxK); final PairIterator srcItr = srcImpl.iterator(); while (srcItr.nextValid()) { dstImpl = dstImpl.couponUpdate(srcItr.getPair()); //assignment required } //whichever is True wins: dstImpl.putOutOfOrderFlag(srcImpl.isOutOfOrderFlag() | dstImpl.isOutOfOrderFlag()); break; } case 4: { //src: LIST, gadget: SET final PairIterator srcItr = srcImpl.iterator(); //LIST while (srcItr.nextValid()) { dstImpl = dstImpl.couponUpdate(srcItr.getPair()); //assignment required } dstImpl.putOutOfOrderFlag(true); //SET oooFlag is always true break; } case 5: { //src: SET, gadget: SET final PairIterator srcItr = srcImpl.iterator(); //SET while (srcItr.nextValid()) { dstImpl = dstImpl.couponUpdate(srcItr.getPair()); //assignment required } dstImpl.putOutOfOrderFlag(true); //SET oooFlag is always true break; } case 6: { //src: HLL, gadget: SET //swap so that src is gadget-SET, tgt is HLL //use lgMaxK because LIST has effective K of 2^26 srcImpl = gadgetImpl; dstImpl = copyOrDownsampleHll(incomingImpl, lgMaxK); final PairIterator srcItr = 
srcImpl.iterator(); //LIST assert dstImpl.getCurMode() == HLL; while (srcItr.nextValid()) { dstImpl = dstImpl.couponUpdate(srcItr.getPair()); //assignment required } dstImpl.putOutOfOrderFlag(true); //merging SET into non-empty HLL -> true break; } case 8: { //src: LIST, gadget: HLL assert dstImpl.getCurMode() == HLL; final PairIterator srcItr = srcImpl.iterator(); //LIST while (srcItr.nextValid()) { dstImpl = dstImpl.couponUpdate(srcItr.getPair()); //assignment required } //whichever is True wins: dstImpl.putOutOfOrderFlag(dstImpl.isOutOfOrderFlag() | srcImpl.isOutOfOrderFlag()); break; } case 9: { //src: SET, gadget: HLL assert dstImpl.getCurMode() == HLL; final PairIterator srcItr = srcImpl.iterator(); //SET while (srcItr.nextValid()) { dstImpl = dstImpl.couponUpdate(srcItr.getPair()); //assignment required } dstImpl.putOutOfOrderFlag(true); //merging SET into existing HLL -> true break; } case 10: { //src: HLL, gadget: HLL final int srcLgK = srcImpl.getLgConfigK(); final int dstLgK = dstImpl.getLgConfigK(); if ((srcLgK < dstLgK) || (dstImpl.getTgtHllType() != HLL_8)) { dstImpl = copyOrDownsampleHll(dstImpl, min(dstLgK, srcLgK)); //TODO Fix for off-heap } final PairIterator srcItr = srcImpl.iterator(); //HLL while (srcItr.nextValid()) { dstImpl = dstImpl.couponUpdate(srcItr.getPair()); //assignment required } dstImpl.putOutOfOrderFlag(true); //union of two HLL modes is always true break; } case 12: { //src: LIST, gadget: empty final PairIterator srcItr = srcImpl.iterator(); //LIST while (srcItr.nextValid()) { dstImpl = dstImpl.couponUpdate(srcItr.getPair()); //assignment required } dstImpl.putOutOfOrderFlag(srcImpl.isOutOfOrderFlag()); //whatever source is break; } case 13: { //src: SET, gadget: empty final PairIterator srcItr = srcImpl.iterator(); //SET while (srcItr.nextValid()) { dstImpl = dstImpl.couponUpdate(srcItr.getPair()); //assignment required } dstImpl.putOutOfOrderFlag(true); //SET oooFlag is always true break; } case 14: { //src: HLL, gadget: empty 
dstImpl = copyOrDownsampleHll(srcImpl, lgMaxK); dstImpl.putOutOfOrderFlag(srcImpl.isOutOfOrderFlag()); //whatever source is. break; } } if (gadgetImpl.isMemory() && !dstImpl.isMemory()) { //dstImpl is on heap, gadget is Memory; we have to put dstImpl back into the gadget final WritableMemory gadgetWmem = gadgetImpl.getWritableMemory(); assert gadgetWmem != null; final int bytes = HllSketch.getMaxUpdatableSerializationBytes(dstImpl.getLgConfigK(), HLL_8); gadgetWmem.clear(0, bytes); final byte[] dstByteArr = dstImpl.toUpdatableByteArray(); gadgetWmem.putByteArray(0, dstByteArr, 0, dstByteArr.length); dstImpl = HllSketch.writableWrap(gadgetWmem).hllSketchImpl; } return dstImpl; } //Used by union operator. Always copies or downsamples to Heap HLL_8. //Caller must ultimately manage oooFlag, as caller has more info private static final HllSketchImpl copyOrDownsampleHll( final HllSketchImpl srcImpl, final int tgtLgK) { assert srcImpl.getCurMode() == HLL; final AbstractHllArray src = (AbstractHllArray) srcImpl; final int srcLgK = src.getLgConfigK(); if ((srcLgK <= tgtLgK) && (src.getTgtHllType() == TgtHllType.HLL_8)) { return src.copy(); } final int minLgK = Math.min(srcLgK, tgtLgK); final HllArray tgtHllArr = HllArray.newHeapHll(minLgK, TgtHllType.HLL_8); final PairIterator srcItr = src.iterator(); while (srcItr.nextValid()) { tgtHllArr.couponUpdate(srcItr.getPair()); } //both of these are required for isomorphism tgtHllArr.putHipAccum(src.getHipAccum()); tgtHllArr.putOutOfOrderFlag(src.isOutOfOrderFlag()); return tgtHllArr; } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy