org.apache.datasketches.theta.PairwiseSetOperations Maven / Gradle / Ivy
Show all versions of datasketches-java Show documentation
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.datasketches.theta;
import static org.apache.datasketches.theta.SetOperation.createCompactSketch;
import java.util.Arrays;
import org.apache.datasketches.SketchesArgumentException;
import org.apache.datasketches.Util;
/**
* Set Operations where the arguments are presented in pairs as in C = Op(A,B). These are
* stateless operations and the result is returned immediately.
*
* These operations are designed for convenience and accept Sketches that may be either
* Heap-based or Direct.
*
* @author Lee Rhodes
*/
public class PairwiseSetOperations {
/**
* This implements a stateless, pair-wise Intersect operation on sketches
* that are either Heap-based or Direct.
* If both inputs are null an EmptyCompactSketch is returned.
*
* @param skA The first Sketch argument.
* @param skB The second Sketch argument.
* @return the result as an ordered CompactSketch on the heap.
*/
public static CompactSketch intersect(final Sketch skA, final Sketch skB) {
if (((skA == null) || (skA instanceof EmptyCompactSketch))
&& ((skB == null) || (skB instanceof EmptyCompactSketch))) {
return EmptyCompactSketch.getInstance();
}
final short seedHash = (skA == null) ? skB.getSeedHash() : skA.getSeedHash();
final Intersection inter = new IntersectionImpl(seedHash);
return inter.intersect(skA, skB, true, null);
}
/**
* This implements a stateless, pair-wise A AND NOT B operation on Sketches
* that are either Heap-based or Direct.
* If both inputs are null an EmptyCompactSketch is returned.
*
* @param skA The first Sketch argument.
* @param skB The second Sketch argument.
* @return the result as an ordered CompactSketch on the heap.
*/
public static CompactSketch aNotB(final Sketch skA, final Sketch skB) {
if (((skA == null) || (skA instanceof EmptyCompactSketch))
&& ((skB == null) || (skB instanceof EmptyCompactSketch))) {
return EmptyCompactSketch.getInstance();
}
final short seedHash = ((skA == null) || (skA instanceof EmptyCompactSketch))
? skB.getSeedHash() : skA.getSeedHash(); // lgtm [java/dereferenced-value-may-be-null]
final HeapAnotB anotb = new HeapAnotB(seedHash);
return anotb.aNotB(skA, skB, true, null); //handles null arguments just fine.
}
/**
* This implements a stateless, pair-wise union operation on ordered,
* CompactSketches that are either Heap-based or Direct.
* If both inputs are null an EmptyCompactSketch is returned.
* If one is null the other is returned, which can be either Heap-based or Direct.
* This is equivalent to union(skA, skB, k) where k is the default of 4096.
*
* @param skA The first ordered, CompactSketch argument.
* @param skB The second ordered, CompactSketch argument
* @return the result as an ordered CompactSketch.
*/
public static CompactSketch union(final CompactSketch skA, final CompactSketch skB) {
return union(skA, skB, Util.DEFAULT_NOMINAL_ENTRIES);
}
/**
* This implements a stateless, pair-wise union operation on ordered,
* CompactSketches that are either Heap-based or Direct. The returned sketch will be cutback to
* k if required, similar to the regular Union operation. If a cutback is required, the returned
* sketch will always be on the heap.
* If both inputs are null a null is returned. If either sketch is empty its Theta is ignored.
* If one is null the other is returned, which may be either Direct or heap-based if a cutback
* is required.
*
* @param skA The first ordered, CompactSketch argument.
* @param skB The second ordered, CompactSketch argument
* @param k The upper bound of the number of entries to be retained by the sketch
* @return the result as an ordered CompactSketch.
*/
@SuppressWarnings("null")
public static CompactSketch union(final CompactSketch skA, final CompactSketch skB, final int k) {
//Handle all corner cases with null or empty arguments
//For backward compatibility, we must allow input empties with Theta < 1.0.
final int swA, swB;
swA = ((skA == null) || (skA instanceof EmptyCompactSketch)) ? 0 : 2;
swB = ((skB == null) || (skB instanceof EmptyCompactSketch)) ? 0 : 1;
final int sw = swA | swB;
switch (sw) {
case 0: { //skA == null/ECS; skB == null/ECS; return EmptyCompactSketch.
return EmptyCompactSketch.getInstance();
}
case 1: { //skA == null/ECS; skB == valid; return skB
checkOrdered(skB);
return maybeCutback(skB, k);
}
case 2: { //skA == valid; skB == null/ECS; return skA
checkOrdered(skA);
return maybeCutback(skA, k);
}
case 3: { //skA == valid; skB == valid; perform full union
checkOrdered(skA);
checkOrdered(skB);
seedHashesCheck(skA, skB);
break;
}
//default: cannot happen
}
//Both sketches are valid with matching seedhashes and ordered
//Full Union operation:
final long thetaLongA = skA.getThetaLong(); //lgtm [java/dereferenced-value-may-be-null]
final long thetaLongB = skB.getThetaLong(); //lgtm [java/dereferenced-value-may-be-null]
long thetaLong = Math.min(thetaLongA, thetaLongB); //Theta rule
final long[] cacheA = (skA.hasMemory()) ? skA.getCache() : skA.getCache().clone();
final long[] cacheB = (skB.hasMemory()) ? skB.getCache() : skB.getCache().clone();
final int aLen = cacheA.length;
final int bLen = cacheB.length;
final long[] outCache = new long[aLen + bLen];
int indexA = 0;
int indexB = 0;
int indexOut = 0;
long hashA = (aLen == 0) ? thetaLong : cacheA[indexA];
long hashB = (bLen == 0) ? thetaLong : cacheB[indexB];
while ((indexA < aLen) || (indexB < bLen)) {
if (hashA == hashB) {
if (hashA < thetaLong) {
if (indexOut >= k) {
thetaLong = hashA;
break;
}
outCache[indexOut++] = hashA;
hashA = (++indexA < aLen) ? cacheA[indexA] : thetaLong;
hashB = (++indexB < bLen) ? cacheB[indexB] : thetaLong;
continue;
}
break;
}
else if (hashA < hashB) {
if (hashA < thetaLong) {
if (indexOut >= k) {
thetaLong = hashA;
break;
}
outCache[indexOut++] = hashA;
hashA = (++indexA < aLen) ? cacheA[indexA] : thetaLong;
continue;
}
break;
}
else { //hashA > hashB
if (hashB < thetaLong) {
if (indexOut >= k) {
thetaLong = hashB;
break;
}
outCache[indexOut++] = hashB;
hashB = (++indexB < bLen) ? cacheB[indexB] : thetaLong;
continue;
}
break;
}
}
int curCount = indexOut;
final long[] outArr;
if (indexOut > k) { //unlikely
outArr = Arrays.copyOf(outCache, k); //cutback to k, just in case
curCount = k;
} else {
outArr = Arrays.copyOf(outCache, curCount); //copy only valid items
}
return createCompactSketch(outArr, false, skA.getSeedHash(), curCount, thetaLong, true, null);
}
private static CompactSketch maybeCutback(final CompactSketch csk, final int k) {
final boolean empty = csk.isEmpty();
int curCount = csk.getRetainedEntries(true);
long thetaLong = csk.getThetaLong();
if (curCount > k) { //cutback to k
final long[] cache = (csk.hasMemory()) ? csk.getCache() : csk.getCache().clone();
thetaLong = cache[k];
final long[] arr = Arrays.copyOf(cache, k);
curCount = k;
return createCompactSketch(arr, empty, csk.getSeedHash(), curCount, thetaLong, true, null);
}
return csk;
}
private static void checkOrdered(final CompactSketch csk) {
if (!csk.isOrdered()) {
throw new SketchesArgumentException("Given sketch must be ordered.");
}
}
private static short seedHashesCheck(final Sketch skA, final Sketch skB) {
final short seedHashA = skA.getSeedHash(); //lgtm [java/dereferenced-value-may-be-null]
final short seedHashB = skB.getSeedHash(); //lgtm [java/dereferenced-value-may-be-null]
return Util.checkSeedHashes(seedHashA, seedHashB);
}
}