All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.datasketches.theta.PairwiseSetOperations Maven / Gradle / Ivy

Go to download

Core sketch algorithms used alone and by other Java repositories in the DataSketches library.

There is a newer version: 7.0.0
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.datasketches.theta;

import static org.apache.datasketches.theta.SetOperation.createCompactSketch;

import java.util.Arrays;

import org.apache.datasketches.SketchesArgumentException;
import org.apache.datasketches.Util;

/**
 * Set Operations where the arguments are presented in pairs as in C = Op(A,B). These are
 * stateless operations and the result is returned immediately.
 *
 * 

These operations are designed for convenience and accept Sketches that may be either * Heap-based or Direct. * * @author Lee Rhodes */ public class PairwiseSetOperations { /** * This implements a stateless, pair-wise Intersect operation on sketches * that are either Heap-based or Direct. * If both inputs are null an EmptyCompactSketch is returned. * * @param skA The first Sketch argument. * @param skB The second Sketch argument. * @return the result as an ordered CompactSketch on the heap. */ public static CompactSketch intersect(final Sketch skA, final Sketch skB) { if (((skA == null) || (skA instanceof EmptyCompactSketch)) && ((skB == null) || (skB instanceof EmptyCompactSketch))) { return EmptyCompactSketch.getInstance(); } final short seedHash = (skA == null) ? skB.getSeedHash() : skA.getSeedHash(); final Intersection inter = new IntersectionImpl(seedHash); return inter.intersect(skA, skB, true, null); } /** * This implements a stateless, pair-wise A AND NOT B operation on Sketches * that are either Heap-based or Direct. * If both inputs are null an EmptyCompactSketch is returned. * * @param skA The first Sketch argument. * @param skB The second Sketch argument. * @return the result as an ordered CompactSketch on the heap. */ public static CompactSketch aNotB(final Sketch skA, final Sketch skB) { if (((skA == null) || (skA instanceof EmptyCompactSketch)) && ((skB == null) || (skB instanceof EmptyCompactSketch))) { return EmptyCompactSketch.getInstance(); } final short seedHash = ((skA == null) || (skA instanceof EmptyCompactSketch)) ? skB.getSeedHash() : skA.getSeedHash(); // lgtm [java/dereferenced-value-may-be-null] final HeapAnotB anotb = new HeapAnotB(seedHash); return anotb.aNotB(skA, skB, true, null); //handles null arguments just fine. } /** * This implements a stateless, pair-wise union operation on ordered, * CompactSketches that are either Heap-based or Direct. * If both inputs are null an EmptyCompactSketch is returned. * If one is null the other is returned, which can be either Heap-based or Direct. * This is equivalent to union(skA, skB, k) where k is the default of 4096. * * @param skA The first ordered, CompactSketch argument. * @param skB The second ordered, CompactSketch argument * @return the result as an ordered CompactSketch. */ public static CompactSketch union(final CompactSketch skA, final CompactSketch skB) { return union(skA, skB, Util.DEFAULT_NOMINAL_ENTRIES); } /** * This implements a stateless, pair-wise union operation on ordered, * CompactSketches that are either Heap-based or Direct. The returned sketch will be cutback to * k if required, similar to the regular Union operation. If a cutback is required, the returned * sketch will always be on the heap. * If both inputs are null a null is returned. If either sketch is empty its Theta is ignored. * If one is null the other is returned, which may be either Direct or heap-based if a cutback * is required. * * @param skA The first ordered, CompactSketch argument. * @param skB The second ordered, CompactSketch argument * @param k The upper bound of the number of entries to be retained by the sketch * @return the result as an ordered CompactSketch. */ @SuppressWarnings("null") public static CompactSketch union(final CompactSketch skA, final CompactSketch skB, final int k) { //Handle all corner cases with null or empty arguments //For backward compatibility, we must allow input empties with Theta < 1.0. final int swA, swB; swA = ((skA == null) || (skA instanceof EmptyCompactSketch)) ? 0 : 2; swB = ((skB == null) || (skB instanceof EmptyCompactSketch)) ? 0 : 1; final int sw = swA | swB; switch (sw) { case 0: { //skA == null/ECS; skB == null/ECS; return EmptyCompactSketch. return EmptyCompactSketch.getInstance(); } case 1: { //skA == null/ECS; skB == valid; return skB checkOrdered(skB); return maybeCutback(skB, k); } case 2: { //skA == valid; skB == null/ECS; return skA checkOrdered(skA); return maybeCutback(skA, k); } case 3: { //skA == valid; skB == valid; perform full union checkOrdered(skA); checkOrdered(skB); seedHashesCheck(skA, skB); break; } //default: cannot happen } //Both sketches are valid with matching seedhashes and ordered //Full Union operation: final long thetaLongA = skA.getThetaLong(); //lgtm [java/dereferenced-value-may-be-null] final long thetaLongB = skB.getThetaLong(); //lgtm [java/dereferenced-value-may-be-null] long thetaLong = Math.min(thetaLongA, thetaLongB); //Theta rule final long[] cacheA = (skA.hasMemory()) ? skA.getCache() : skA.getCache().clone(); final long[] cacheB = (skB.hasMemory()) ? skB.getCache() : skB.getCache().clone(); final int aLen = cacheA.length; final int bLen = cacheB.length; final long[] outCache = new long[aLen + bLen]; int indexA = 0; int indexB = 0; int indexOut = 0; long hashA = (aLen == 0) ? thetaLong : cacheA[indexA]; long hashB = (bLen == 0) ? thetaLong : cacheB[indexB]; while ((indexA < aLen) || (indexB < bLen)) { if (hashA == hashB) { if (hashA < thetaLong) { if (indexOut >= k) { thetaLong = hashA; break; } outCache[indexOut++] = hashA; hashA = (++indexA < aLen) ? cacheA[indexA] : thetaLong; hashB = (++indexB < bLen) ? cacheB[indexB] : thetaLong; continue; } break; } else if (hashA < hashB) { if (hashA < thetaLong) { if (indexOut >= k) { thetaLong = hashA; break; } outCache[indexOut++] = hashA; hashA = (++indexA < aLen) ? cacheA[indexA] : thetaLong; continue; } break; } else { //hashA > hashB if (hashB < thetaLong) { if (indexOut >= k) { thetaLong = hashB; break; } outCache[indexOut++] = hashB; hashB = (++indexB < bLen) ? cacheB[indexB] : thetaLong; continue; } break; } } int curCount = indexOut; final long[] outArr; if (indexOut > k) { //unlikely outArr = Arrays.copyOf(outCache, k); //cutback to k, just in case curCount = k; } else { outArr = Arrays.copyOf(outCache, curCount); //copy only valid items } return createCompactSketch(outArr, false, skA.getSeedHash(), curCount, thetaLong, true, null); } private static CompactSketch maybeCutback(final CompactSketch csk, final int k) { final boolean empty = csk.isEmpty(); int curCount = csk.getRetainedEntries(true); long thetaLong = csk.getThetaLong(); if (curCount > k) { //cutback to k final long[] cache = (csk.hasMemory()) ? csk.getCache() : csk.getCache().clone(); thetaLong = cache[k]; final long[] arr = Arrays.copyOf(cache, k); curCount = k; return createCompactSketch(arr, empty, csk.getSeedHash(), curCount, thetaLong, true, null); } return csk; } private static void checkOrdered(final CompactSketch csk) { if (!csk.isOrdered()) { throw new SketchesArgumentException("Given sketch must be ordered."); } } private static short seedHashesCheck(final Sketch skA, final Sketch skB) { final short seedHashA = skA.getSeedHash(); //lgtm [java/dereferenced-value-may-be-null] final short seedHashB = skB.getSeedHash(); //lgtm [java/dereferenced-value-may-be-null] return Util.checkSeedHashes(seedHashA, seedHashB); } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy