// com.groupon.lex.metrics.Histogram (Maven / Gradle / Ivy artifact source listing).
// The newest version.
package com.groupon.lex.metrics;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Arrays;
import static java.util.Collections.EMPTY_LIST;
import static java.util.Collections.sort;
import static java.util.Collections.unmodifiableList;
import java.util.Comparator;
import java.util.Iterator;
import java.util.List;
import java.util.ListIterator;
import java.util.Objects;
import java.util.Optional;
import java.util.function.BiFunction;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NonNull;
import lombok.Value;
/**
 * A histogram with buckets indexed by a value.
 *
 * A histogram contains zero or more buckets, each with a count of the number of
 * events in that bucket. Buckets are kept sorted by range and never overlap;
 * all event counts in one histogram share the same sign.
 *
 * The bucket list is built once in the constructor and wrapped in an
 * unmodifiable list.
 *
 * @author ariane
 */
public class Histogram implements Serializable, Comparable<Histogram> {
    /**
     * Buckets, sorted by range.
     */
    private final List<Bucket> buckets_;

    /**
     * Create a histogram from a set of ranges with associated event counters.
     *
     * @param items The ranges and their event counts; they are copied, not retained.
     * @throws IllegalArgumentException If the items contain mixed signs.
     */
    public Histogram(RangeWithCount... items) {
        this(Arrays.stream(items));
    }

    /**
     * Create a histogram from a set of ranges with associated event counters.
     *
     * Each element is cloned before processing, so the caller's objects are
     * not modified. Overlapping ranges are split and merged, and ranges with
     * a zero count are dropped.
     *
     * @param items The ranges and their event counts.
     * @throws IllegalArgumentException If the items contain mixed signs.
     */
    public Histogram(Stream<RangeWithCount> items) {
        // The clean-up step sorts and removes elements, so collect into a list
        // that is guaranteed to be mutable.
        final List<RangeWithCount> cleaned = cleanup_(items
                .map(RangeWithCount::clone)
                .collect(Collectors.toCollection(ArrayList::new)));

        // Zero counts were dropped by the clean-up, so more than one distinct
        // signum means positive and negative counts are mixed.
        if (cleaned.stream()
                .map(RangeWithCount::getCount)
                .map(Math::signum)
                .distinct()
                .count() > 1) {
            throw new IllegalArgumentException("mixed sign");
        }

        final List<Bucket> buckets = new ArrayList<>(cleaned.size());
        double runningTotal = 0;
        for (RangeWithCount rwc : cleaned) {
            runningTotal += rwc.getCount();
            buckets.add(new Bucket(rwc.getRange(), rwc.getCount(), runningTotal));
        }
        buckets_ = unmodifiableList(buckets);
    }

    /**
     * Returns a stream of (range, event count) pairs, in bucket order.
     *
     * The elements of the stream are freshly created, mutable copies of the
     * internal data; changing them does not affect this histogram.
     */
    public Stream<RangeWithCount> stream() {
        return buckets_.stream()
                .map(bucket -> new RangeWithCount(bucket.getRange(), bucket.getEvents()));
    }

    /**
     * Return the event count of this histogram.
     *
     * @return The running total of the last bucket, or 0 if there are no buckets.
     */
    public double getEventCount() {
        if (buckets_.isEmpty()) {
            return 0;
        }
        return buckets_.get(buckets_.size() - 1).getRunningEventsCount();
    }

    /**
     * Test if the histogram is empty.
     */
    public boolean isEmpty() {
        return buckets_.isEmpty();
    }

    /**
     * Return the minimum value in the histogram.
     *
     * @return The floor of the first bucket, or empty if there are no buckets.
     */
    public Optional<Double> min() {
        if (isEmpty()) {
            return Optional.empty();
        }
        return Optional.of(buckets_.get(0).getRange().getFloor());
    }

    /**
     * Return the maximum value in the histogram.
     *
     * @return The ceil of the last bucket, or empty if there are no buckets.
     */
    public Optional<Double> max() {
        if (isEmpty()) {
            return Optional.empty();
        }
        return Optional.of(buckets_.get(buckets_.size() - 1).getRange().getCeil());
    }

    /**
     * Return the median of the histogram.
     *
     * @return The value at the 50th percentile, or empty if there are no buckets.
     */
    public Optional<Double> median() {
        if (isEmpty()) {
            return Optional.empty();
        }
        return Optional.of(percentile(50));
    }

    /**
     * Return the average of the histogram.
     *
     * @return {@link #sum()} divided by {@link #getEventCount()}, or empty if
     * there are no buckets.
     */
    public Optional<Double> avg() {
        if (isEmpty()) {
            return Optional.empty();
        }
        return Optional.of(sum() / getEventCount());
    }

    /**
     * Return the sum of the histogram.
     *
     * Every event in a bucket is counted at the mid-point of the bucket range.
     */
    public double sum() {
        return buckets_.stream()
                .mapToDouble(b -> b.getRange().getMidPoint() * b.getEvents())
                .sum();
    }

    /**
     * Get the value at a given position.
     *
     * The position is located with a binary search over the running event
     * totals; the value is then linearly interpolated between the floor and
     * ceil of the bucket that contains the position. A position that lies
     * beyond the last bucket is extrapolated from the last bucket.
     *
     * @param index The event position, expected to lie between 0 and
     * {@link #getEventCount()}.
     * @return The interpolated value at that position.
     * @throws java.util.NoSuchElementException If the histogram is empty.
     */
    public double get(double index) {
        // Running totals decrease for a histogram with negative counts.
        // Multiplying by the sign turns them into increasing totals, so the
        // same comparisons work for both signs (for positive counts this is
        // a multiplication by 1 and changes nothing).
        final double sign = (getEventCount() < 0 ? -1d : 1d);
        final double position = sign * index;

        ListIterator<Bucket> b = buckets_.listIterator(0);
        ListIterator<Bucket> e = buckets_.listIterator(buckets_.size());

        // Narrow [b, e) while it still holds at least two buckets.
        while (b.nextIndex() < e.previousIndex()) {
            // Halving each index separately avoids overflow of the sum.
            final ListIterator<Bucket> mid = buckets_.listIterator(b.nextIndex() / 2 + e.nextIndex() / 2);
            final Bucket midBucket = mid.next();
            mid.previous(); // Undo position change made by mid.next().

            final double midRunning = sign * midBucket.getRunningEventsCount();
            final double midEvents = sign * midBucket.getEvents();

            if (midRunning == position && mid.nextIndex() >= e.previousIndex()) {
                // Position is exactly the end of the last bucket in the window.
                return midBucket.getRange().getCeil();
            } else if (midRunning <= position) {
                // Position lies after the mid bucket.
                b = mid;
                b.next();
            } else if (midRunning - midEvents > position) {
                // Position lies before the mid bucket.
                e = mid;
            } else {
                // Position lies inside the mid bucket.
                b = mid;
                break;
            }
        }

        // A position past the total walks the search off the end of the list;
        // fall back to the last bucket. On an empty list there is no previous
        // element, so b.next() below throws NoSuchElementException.
        if (!b.hasNext() && b.hasPrevious()) {
            b.previous();
        }
        final Bucket bucket = b.next();

        final double low = bucket.getRunningEventsCount() - bucket.getEvents();
        final double off = index - low;
        final double leftFraction = off / bucket.getEvents();
        final double rightFraction = 1 - leftFraction;
        return bucket.getRange().getCeil() * leftFraction + bucket.getRange().getFloor() * rightFraction;
    }

    /**
     * Get the value at the given percentile.
     *
     * @param percentile The percentile, on a scale where 100 is the full event count.
     * @throws java.util.NoSuchElementException If the histogram is empty.
     */
    public double percentile(double percentile) {
        return get(percentile * getEventCount() / 100);
    }

    /**
     * Create a new histogram, after applying the function on each of the event
     * counters.
     *
     * @param fn Receives the range and current event count of a bucket and
     * returns the new event count for that range.
     * @throws IllegalArgumentException If the result contains mixed signs.
     */
    public Histogram modifyEventCounters(BiFunction<Range, Double, Double> fn) {
        return new Histogram(buckets_.stream()
                .map(bucket -> new RangeWithCount(
                        bucket.getRange(),
                        fn.apply(bucket.getRange(), bucket.getEvents()))));
    }

    /**
     * Add two histograms together.
     *
     * @throws IllegalArgumentException If the result contains mixed signs.
     */
    public static Histogram add(Histogram x, Histogram y) {
        return new Histogram(Stream.concat(x.stream(), y.stream()));
    }

    /**
     * Negates the counters on the histogram.
     */
    public static Histogram negate(Histogram x) {
        return x.modifyEventCounters((r, d) -> -d);
    }

    /**
     * Subtracts two histograms.
     *
     * @throws IllegalArgumentException If the result contains mixed signs.
     */
    public static Histogram subtract(Histogram x, Histogram y) {
        final Stream<RangeWithCount> negatedY = y.stream()
                .map(rwc -> new RangeWithCount(rwc.getRange(), -rwc.getCount()));
        return new Histogram(Stream.concat(x.stream(), negatedY));
    }

    /**
     * Multiply histogram by scalar.
     */
    public static Histogram multiply(Histogram x, double y) {
        return x.modifyEventCounters((r, d) -> d * y);
    }

    /**
     * Divide histogram by scalar.
     */
    public static Histogram divide(Histogram x, double y) {
        return x.modifyEventCounters((r, d) -> d / y);
    }

    @Override
    public int hashCode() {
        int hash = 7;
        hash = 29 * hash + Objects.hashCode(this.buckets_);
        return hash;
    }

    @Override
    public boolean equals(Object obj) {
        if (this == obj) {
            return true;
        }
        if (obj == null || getClass() != obj.getClass()) {
            return false;
        }
        final Histogram other = (Histogram) obj;
        return Objects.equals(this.buckets_, other.buckets_);
    }

    /**
     * Compare two histograms.
     *
     * Buckets are compared pairwise, in order, by floor, then ceil, then event
     * count. If all shared positions compare equal, the histogram with more
     * buckets is the greater one.
     */
    @Override
    public int compareTo(Histogram o) {
        final Iterator<Bucket> iter = buckets_.iterator();
        final Iterator<Bucket> oIter = o.buckets_.iterator();

        int cmp = 0;
        while (cmp == 0 && iter.hasNext() && oIter.hasNext()) {
            final Bucket next = iter.next();
            final Bucket oNext = oIter.next();

            cmp = Double.compare(next.getRange().getFloor(), oNext.getRange().getFloor());
            if (cmp == 0) {
                cmp = Double.compare(next.getRange().getCeil(), oNext.getRange().getCeil());
            }
            if (cmp == 0) {
                cmp = Double.compare(next.getEvents(), oNext.getEvents());
            }
        }
        if (cmp == 0) {
            cmp = (iter.hasNext() ? 1 : (oIter.hasNext() ? -1 : 0));
        }
        return cmp;
    }

    @Override
    public String toString() {
        return buckets_.stream()
                .map(bucket -> bucket.getRange().getFloor() + ".." + bucket.getRange().getCeil() + "=" + bucket.getEvents())
                .collect(Collectors.joining(", ", "[ ", " ]"));
    }

    /**
     * A range of values, from floor to ceil.
     */
    @Value
    public static class Range implements Serializable {
        private final double floor, ceil;

        /**
         * Constructor.
         *
         * @throws IllegalArgumentException If the ceil is less than the floor.
         */
        public Range(double floor, double ceil) {
            if (floor > ceil) {
                throw new IllegalArgumentException("negative range");
            }
            this.floor = floor;
            this.ceil = ceil;
        }

        /**
         * Returns the width of this range.
         */
        public double getWidth() {
            return getCeil() - getFloor();
        }

        /**
         * Returns the mid-point of this range.
         */
        public double getMidPoint() {
            // Halve before adding, so the sum cannot overflow.
            return getFloor() / 2 + getCeil() / 2;
        }
    }

    /**
     * A bucket of the histogram: a range, the number of events in it, and the
     * total number of events in this bucket and all buckets before it.
     */
    @Value
    private static class Bucket implements Serializable {
        @NonNull
        private final Range range;
        private final double events;
        private final double runningEventsCount;
    }

    /**
     * A mutable pair of a range and the number of events in that range.
     */
    @Data
    @AllArgsConstructor
    public static class RangeWithCount implements Serializable, Cloneable {
        private Range range;
        private double count;

        /**
         * Convenience constructor that builds the range from its bounds.
         *
         * @throws IllegalArgumentException If the ceil is less than the floor.
         */
        public RangeWithCount(double floor, double ceil, double count) {
            this(new Range(floor, ceil), count);
        }

        /**
         * Returns a new instance with the same range and count.
         */
        @Override
        public RangeWithCount clone() {
            return new RangeWithCount(range, count);
        }
    }

    /**
     * Clean up an arbitrary collection of ranges.
     *
     * Ranges are split at their intersection point, with the event count
     * divided in proportion to the width of each part. Ranges with a count
     * of zero are omitted.
     *
     * @param imed An arbitrary collection of ranges. The operations on this
     * list will be destructive; it must be mutable.
     * @return An ordered list of ranges, none of which intersect each other.
     */
    private static List<RangeWithCount> cleanup_(List<RangeWithCount> imed) {
        final Comparator<RangeWithCount> cmp = Comparator
                .comparingDouble((RangeWithCount rangeCount) -> rangeCount.getRange().getFloor())
                .thenComparingDouble(rangeCount -> rangeCount.getRange().getCeil());
        final List<RangeWithCount> result = new ArrayList<>(imed.size());

        sort(imed, cmp);
        while (imed.size() >= 2) {
            final RangeWithCount head = imed.remove(0);
            final RangeWithCount succ = imed.get(0);

            // Merge identical ranges.
            if (head.getRange().equals(succ.getRange())) {
                succ.setCount(succ.getCount() + head.getCount());
                continue;
            }

            // Same floor, succ extends past head: move the overlapping share
            // of succ into head and shrink succ to the remainder.
            // A zero-width head is excluded: splitting succ at its own floor
            // would leave both entries unchanged and loop forever. Such a head
            // is emitted by the disjoint case below instead.
            if (head.getRange().getFloor() == succ.getRange().getFloor()
                    && head.getRange().getWidth() > 0) {
                final double mid = head.getRange().getCeil();
                final double ceil = succ.getRange().getCeil();
                final double succRange = succ.getRange().getWidth();
                final double succLeftFraction = (mid - succ.getRange().getFloor()) / succRange;
                final double succRightFraction = 1 - succLeftFraction;

                head.setCount(head.getCount() + succLeftFraction * succ.getCount());
                succ.setCount(succRightFraction * succ.getCount());
                succ.setRange(new Range(mid, ceil));
                imed.add(0, head);
                sort(imed, cmp);
                continue;
            }

            // Emit disjoint head range.
            if (head.getRange().getCeil() <= succ.getRange().getFloor()) {
                if (Math.signum(head.getCount()) != 0) {
                    result.add(head);
                }
                continue;
            }

            // head.floor < succ.floor < head.ceil
            assert (head.getRange().getFloor() < succ.getRange().getFloor());
            assert (succ.getRange().getFloor() < head.getRange().getCeil());

            // Head is intersected by succ, split it in two, at the succ.floor boundary.
            final double floor = head.getRange().getFloor();
            final double ceil = succ.getRange().getFloor();
            final double headRange = head.getRange().getWidth();
            final double headLeftFraction = (ceil - floor) / headRange;
            final double headRightFraction = 1 - headLeftFraction;

            imed.add(0, head);
            imed.add(0, new RangeWithCount(new Range(floor, ceil), headLeftFraction * head.getCount()));
            head.setRange(new Range(ceil, head.getRange().getCeil()));
            head.setCount(headRightFraction * head.getCount());
            sort(imed, cmp);
        }

        // At most one range is left; keep it unless its count is zero.
        for (RangeWithCount rwc : imed) {
            if (Math.signum(rwc.getCount()) != 0) {
                result.add(rwc);
            }
        }

        // Merge adjacent entries, if they have the same distribution
        // (the same number of events per unit of width).
        for (int i = 0; i < result.size() - 1;) {
            final RangeWithCount pred = result.get(i);
            final RangeWithCount succ = result.get(i + 1);
            final double predRange = pred.getRange().getWidth();
            final double succRange = succ.getRange().getWidth();

            if (pred.getRange().getCeil() == succ.getRange().getFloor()
                    && pred.getCount() * succRange == succ.getCount() * predRange) {
                // The merged entry takes pred's slot, so stay on index i
                // and compare it against the next entry.
                result.remove(i);
                succ.setRange(new Range(pred.getRange().getFloor(), succ.getRange().getCeil()));
                succ.setCount(succ.getCount() + pred.getCount());
            } else {
                ++i;
            }
        }
        return result;
    }
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy