All Downloads are FREE. Search and download functionality uses the official Maven repository.

com.amazon.randomcutforest.parkservices.threshold.BasicThresholder Maven / Gradle / Ivy

The newest version!
/*
 * Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License").
 * You may not use this file except in compliance with the License.
 * A copy of the License is located at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * or in the "license" file accompanying this file. This file is distributed
 * on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
 * express or implied. See the License for the specific language governing
 * permissions and limitations under the License.
 */

package com.amazon.randomcutforest.parkservices.threshold;

import static com.amazon.randomcutforest.CommonUtils.checkArgument;
import static com.amazon.randomcutforest.RandomCutForest.DEFAULT_SAMPLE_SIZE;
import static com.amazon.randomcutforest.RandomCutForest.DEFAULT_SAMPLE_SIZE_COEFFICIENT_IN_TIME_DECAY;
import static java.lang.Math.max;
import static java.lang.Math.min;
import static java.lang.Math.sqrt;

import java.util.List;

import com.amazon.randomcutforest.config.TransformMethod;
import com.amazon.randomcutforest.statistics.Deviation;
import com.amazon.randomcutforest.util.Weighted;

/**
 * Maintains running deviation statistics over a stream of scores and converts
 * a new score into a threshold and a grade.
 *
 * Three {@link Deviation} trackers are kept: one over the primary scores, one
 * over a secondary series supplied by the caller, and one over the gaps of
 * scores that fall below the running primary mean. A separate counter tracks
 * how many updates were seen, because the individual trackers may not all be
 * updated on every call.
 */
public class BasicThresholder {

    // NOTE(review): these defaults are declared as mutable statics; they are kept
    // that way here so that any code assigning to them continues to compile.
    public static double DEFAULT_SCORE_DIFFERENCING = 0.5;
    public static int DEFAULT_MINIMUM_SCORES = 10;
    public static double DEFAULT_FACTOR_ADJUSTMENT_THRESHOLD = 0.9;
    public static double DEFAULT_ABSOLUTE_THRESHOLD = 0.8;
    public static double DEFAULT_INITIAL_THRESHOLD = 1.5;
    public static double DEFAULT_Z_FACTOR = 3.0;
    public static double MINIMUM_Z_FACTOR = 2.0;
    public static boolean DEFAULT_AUTO_THRESHOLD = true;
    public static int DEFAULT_DEVIATION_STATES = 3;

    // keeping a count of the values seen because both deviation variables
    // primaryDeviation and secondaryDeviation may not be used always
    protected int count = 0;

    // mixing weight in [0,1] used by longTermDeviation when shingling or
    // differencing is in effect: 0 uses only the secondary deviation, 1 uses only
    // the (bounded) primary deviation
    protected double scoreDifferencing = DEFAULT_SCORE_DIFFERENCING;

    // below these many observations, deviation is not useful
    protected int minimumScores = DEFAULT_MINIMUM_SCORES;

    // statistics of the primary score
    protected Deviation primaryDeviation;

    // statistics of the secondary series passed to update
    protected Deviation secondaryDeviation;

    // statistics of (mean - score) for scores below the running primary mean
    protected Deviation thresholdDeviation;

    // when true, the absolute threshold is scaled down if the mean score is low
    protected boolean autoThreshold = DEFAULT_AUTO_THRESHOLD;

    // an absolute lower bound on the threshold
    protected double absoluteThreshold = DEFAULT_ABSOLUTE_THRESHOLD;

    // if the mean score is below this value, the factor (and, with autoThreshold,
    // the absolute threshold) is scaled down proportionally
    protected double factorAdjustmentThreshold = DEFAULT_FACTOR_ADJUSTMENT_THRESHOLD;

    // initial absolute threshold used to determine anomalies before sufficient
    // values are seen
    protected double initialThreshold = DEFAULT_INITIAL_THRESHOLD;

    // used to determine the surprise coefficient above which we can call a
    // potential anomaly
    protected double zFactor = DEFAULT_Z_FACTOR;

    /**
     * Creates a thresholder with explicit discounts.
     *
     * @param primaryDiscount   discount of the primary deviation; half of this
     *                          value is used for the threshold deviation
     * @param secondaryDiscount discount of the secondary deviation
     * @param adjust            initial value of the auto-threshold flag
     */
    public BasicThresholder(double primaryDiscount, double secondaryDiscount, boolean adjust) {
        primaryDeviation = new Deviation(primaryDiscount);
        secondaryDeviation = new Deviation(secondaryDiscount);
        // a longer horizon to adjust
        thresholdDeviation = new Deviation(primaryDiscount / 2);
        autoThreshold = adjust;
    }

    /**
     * Creates a thresholder that uses the same discount for the primary and
     * secondary deviations and has auto-thresholding switched off.
     *
     * @param discount the discount used for both deviations
     */
    public BasicThresholder(double discount) {
        this(discount, discount, false);
    }

    /**
     * Creates a thresholder from previously obtained deviations. The supplied
     * objects are used directly (they are not copied). If the array is null or
     * does not have exactly {@code DEFAULT_DEVIATION_STATES} entries, fresh
     * deviations based on the default sample size are created instead.
     *
     * @param deviations primary, secondary and threshold deviation, in that order
     */
    public BasicThresholder(Deviation[] deviations) {
        int length = (deviations == null) ? 0 : deviations.length;
        if (length != DEFAULT_DEVIATION_STATES) {
            double timeDecay = 1.0 / (DEFAULT_SAMPLE_SIZE * DEFAULT_SAMPLE_SIZE_COEFFICIENT_IN_TIME_DECAY);
            this.primaryDeviation = new Deviation(timeDecay);
            this.secondaryDeviation = new Deviation(timeDecay);
            this.thresholdDeviation = new Deviation(0.1 * timeDecay);
        } else {
            this.primaryDeviation = deviations[0];
            this.secondaryDeviation = deviations[1];
            this.thresholdDeviation = deviations[2];
        }
    }

    /**
     * Creates a thresholder primed with a list of scores. The scores are absorbed
     * with a discount of 0 (each score is used as both the primary and the
     * secondary value) and only afterwards are the discounts set from
     * {@code rate}.
     *
     * @param scores scores used for priming; may be null, in which case nothing
     *               is absorbed
     * @param rate   discount applied after priming; the threshold deviation uses
     *               one tenth of it
     */
    public BasicThresholder(List<Double> scores, double rate) {
        this.primaryDeviation = new Deviation(0);
        this.secondaryDeviation = new Deviation(0);
        this.thresholdDeviation = new Deviation(0);
        if (scores != null) {
            scores.forEach(s -> update(s, s));
        }
        primaryDeviation.setDiscount(rate);
        secondaryDeviation.setDiscount(rate);
        thresholdDeviation.setDiscount(0.1 * rate);
    }

    /**
     * a boolean that determines if enough values have been seen to be able to
     * discern deviations
     *
     * @return false if fewer than minimumScores updates were counted; otherwise,
     *         when scoreDifferencing is nonzero, whether the secondary deviation
     *         has also seen at least minimumScores values; true otherwise
     */
    public boolean isDeviationReady() {
        if (count < minimumScores) {
            return false;
        }

        if (scoreDifferencing != 0) {
            return secondaryDeviation.getCount() >= minimumScores;
        }
        return true;
    }

    /**
     * this function helps switch from the short term (not able to use deviation,
     * using absolute scores), which is the first minimumScores observations of
     * the scoring function, to using deviation (and not using absolute scores,
     * except as a lower bound) at 2*minimumScores. It is often the case that the
     * data has "run" effects and the initial scores can all come in low or can
     * all come in high
     *
     * @return a parameter in [0,1] that helps smooth the transition from initial
     *         to long term behavior: 0 below minimumScores, 1 from
     *         2*minimumScores onwards, linear in between
     */
    protected double intermediateTermFraction() {
        if (count < minimumScores) {
            return 0;
        } else if (count >= 2 * minimumScores) {
            // ">=" (rather than ">") gives the same value of 1 at exactly
            // 2*minimumScores and avoids evaluating 0/0 when minimumScores is 0
            return 1;
        } else {
            return (count - minimumScores) * 1.0 / minimumScores;
        }
    }

    /**
     * @return the primary threshold
     * @deprecated use {@link #getPrimaryThreshold()}
     */
    @Deprecated
    public double threshold() {
        return getPrimaryThreshold();
    }

    /**
     * @return mean + zFactor * deviation of the primary scores, or 0 if not
     *         enough values have been seen
     */
    public double getPrimaryThreshold() {
        if (!isDeviationReady()) {
            return 0;
        }
        return primaryDeviation.getMean() + zFactor * primaryDeviation.getDeviation();
    }

    /**
     * The simplest thresholder that does not use any auxiliary correction, and
     * can be used for multiple scoring capabilities.
     *
     * The grade is 0 while deviations are not ready. With a positive deviation d
     * it is max(0, (min(2*zFactor, (score - mean)/d) - zFactor) / zFactor). With a
     * zero deviation it is 1 for a score above the mean (by more than 1e-10) and
     * 0 otherwise.
     *
     * @param score the value being thresholded
     * @return a grade in [0,1] (for a positive zFactor); grades in the range
     *         (0,1] are to be considered anomalous
     */
    public double getPrimaryGrade(double score) {
        if (!isDeviationReady()) {
            return 0;
        }
        double tFactor = 2 * zFactor;
        double deviation = primaryDeviation.getDeviation();
        if (deviation > 0) {
            tFactor = min(tFactor, (score - primaryDeviation.getMean()) / deviation);
        } else {
            return (score > primaryDeviation.getMean() + 1e-10) ? 1.0 : 0;
        }
        double t = (tFactor - zFactor) / (zFactor);
        return max(0, t);
    }

    /**
     * Computes the primary threshold together with the grade of the score.
     *
     * @param score the value being thresholded
     * @return (threshold, grade); (0, 0) if deviations are not ready or the score
     *         is not positive; the grade is 0 unless the score exceeds a positive
     *         threshold
     */
    public Weighted<Double> getPrimaryThresholdAndGrade(double score) {
        if (!isDeviationReady() || score <= 0) {
            return new Weighted<>(0.0, 0.0f);
        }
        double threshold = getPrimaryThreshold();
        float grade = (threshold > 0 && score > threshold) ? (float) getPrimaryGrade(score) : 0f;
        return new Weighted<>(threshold, grade);
    }

    /**
     * @param score the value being thresholded
     * @param flag  ignored
     * @return the primary grade of the score
     * @deprecated use {@link #getPrimaryGrade(double)}
     */
    @Deprecated
    public double getAnomalyGrade(double score, boolean flag) {
        return getPrimaryGrade(score);
    }

    /**
     * The following adapts the notion of x-sigma (standard deviation) to admit the
     * case that RCF scores are asymmetric and values lower than 1 (closer to 0.5)
     * can be more common; whereas anomalies are typically larger. The x-factor is
     * automatically scaled in proportion to the average score when that average
     * is below factorAdjustmentThreshold and a transformation is in use; the
     * result is bounded below by MINIMUM_Z_FACTOR.
     *
     * @param factor    the factor being scaled
     * @param method    transformation method
     * @param dimension the dimension of the problem (currently unused)
     * @return a scaled value of the factor
     */
    protected double adjustedFactor(double factor, TransformMethod method, int dimension) {
        double correctedFactor = factor;
        double base = primaryDeviation.getMean();
        if (base < factorAdjustmentThreshold && method != TransformMethod.NONE) {
            correctedFactor = primaryDeviation.getMean() * factor / factorAdjustmentThreshold;
        }
        return max(correctedFactor, MINIMUM_Z_FACTOR);
    }

    /**
     * The following computes the standard deviation of the scores. But we have
     * multiple ways of measuring that -- if the scores are typically symmetric then
     * many of these measures coincide. However transformation of the values may
     * cause the score distribution to be unusual. For example, if NORMALIZATION is
     * used then the scores (below the average) end up being close to the average
     * (an example of the asymmetry) and thus only standard deviation is used. But
     * for other distributions we could directly estimate the deviation of the
     * scores below the dynamic mean in an online manner, and we do so in
     * thresholdDeviation. An orthogonal component is the effect of
     * shingling/differencing which connect up the scores from consecutive input.
     *
     * @param method      transformation method
     * @param shingleSize shingle size used
     * @return an estimate of long term deviation from mean of a stochastic series
     */
    protected double longTermDeviation(TransformMethod method, int shingleSize) {

        if (shingleSize == 1
                && !(method == TransformMethod.DIFFERENCE || method == TransformMethod.NORMALIZE_DIFFERENCE)) {
            // control the effect of large values above a threshold from raising the
            // threshold
            return min(sqrt(2.0) * thresholdDeviation.getDeviation(), primaryDeviation.getDeviation());
        } else {
            double first = primaryDeviation.getDeviation();
            first = min(first, max(secondaryDeviation.getDeviation(), sqrt(2.0) * thresholdDeviation.getDeviation()));
            // there is a role of differencing; either by shingling or by explicit
            // transformation
            return scoreDifferencing * first + (1 - scoreDifferencing) * secondaryDeviation.getDeviation();
        }

    }

    /**
     * Computes the threshold and grade of a score using the current zFactor.
     *
     * @param score       the value being thresholded
     * @param method      transformation method
     * @param dimension   the dimension of the problem
     * @param shingleSize shingle size used
     * @return (threshold, grade)
     */
    public Weighted<Double> getThresholdAndGrade(double score, TransformMethod method, int dimension,
            int shingleSize) {
        return getThresholdAndGrade(score, zFactor, method, dimension, shingleSize);
    }

    /**
     * Computes the threshold and grade of a score for an explicit factor.
     *
     * Before deviations are ready the threshold is the larger of the initial and
     * the absolute threshold. Afterwards it blends the initial threshold with
     * (mean + scaled deviation) according to intermediateTermFraction, bounded
     * below by the absolute threshold.
     *
     * @param score       the value being thresholded
     * @param factor      the factor (number of deviations) to be used, before
     *                    adjustment
     * @param method      transformation method
     * @param dimension   the dimension of the problem
     * @param shingleSize shingle size used
     * @return (threshold, grade); the grade is 0 if the score is below the
     *         threshold or the threshold is not positive, and otherwise a
     *         multiple of 1/16 that is at most 1
     */
    public Weighted<Double> getThresholdAndGrade(double score, double factor, TransformMethod method, int dimension,
            int shingleSize) {
        double intermediateFraction = intermediateTermFraction();
        double newFactor = adjustedFactor(factor, method, dimension);
        double longTerm = longTermDeviation(method, shingleSize);
        double scaledDeviation = (newFactor - 1) * longTerm + primaryDeviation.getDeviation();

        double absolute = absoluteThreshold;
        // scale the absolute bound down when scores are typically low, but only
        // once the transition out of the initial phase is complete
        if (autoThreshold && intermediateFraction >= 1.0 && primaryDeviation.getMean() < factorAdjustmentThreshold) {
            absolute = primaryDeviation.getMean() * absolute / factorAdjustmentThreshold;
        }
        double threshold = (!isDeviationReady()) ? max(initialThreshold, absolute)
                : max(absolute, intermediateFraction * (primaryDeviation.getMean() + scaledDeviation)
                        + (1 - intermediateFraction) * initialThreshold);
        if (score < threshold || threshold <= 0) {
            return new Weighted<>(threshold, 0);
        } else {
            double t = getSurpriseIndex(score, threshold, newFactor, scaledDeviation / newFactor);
            t = min((Math.floor(t * 20)) / 16, 1.0); // grade 1 at scaledDeviation at 4 sigma
            if (t == 0) {
                // round off errors
                threshold = score;
            }
            return new Weighted<>(threshold, (float) t);
        }
    }

    /**
     * how surprised we are at seeing a value from a series with mean base and the
     * given deviation, where factor controls the separation
     *
     * When deviations are ready and the deviation is positive, the result is
     * min(factor, (score - base)/deviation) / factor, bounded below by 0. When
     * the deviation is not positive the result is 2 (callers are expected to clip
     * it). When deviations are not ready, the result is the excess of the score
     * over absoluteThreshold relative to absoluteThreshold, clipped to [0,1].
     *
     * @param score     score
     * @param base      mean of series
     * @param factor    control parameter for determining surprise
     * @param deviation relevant deviation for the series
     * @return a value of the "surprise" index
     */
    protected float getSurpriseIndex(double score, double base, double factor, double deviation) {
        if (isDeviationReady()) {
            double tFactor = 2 * factor;
            if (deviation > 0) {
                tFactor = min(factor, (score - base) / deviation);
            }
            return max(0, (float) (tFactor / factor));
        } else {
            return (float) min(1, max(0, (score - absoluteThreshold) / absoluteThreshold));
        }
    }

    /**
     * Updates the threshold deviation with the gap between the current primary
     * mean and the score, but only for scores below the mean; uses the asymmetry
     * of the RCF score.
     *
     * @param score the new primary score
     */
    protected void updateThreshold(double score) {
        double gap = primaryDeviation.getMean() - score;
        if (gap > 0) {
            thresholdDeviation.update(gap);
        }
    }

    /**
     * Absorbs a primary score only (the secondary deviation is left untouched)
     * and increments the count.
     *
     * @param score the new primary score
     */
    protected void updatePrimary(double score) {
        // the threshold deviation is updated first so that the gap is measured
        // against the mean before it absorbs the new score
        updateThreshold(score);
        primaryDeviation.update(score);
        ++count;
    }

    /**
     * Absorbs a primary and a secondary value and increments the count.
     *
     * @param primary   the new primary score
     * @param secondary the new value of the secondary series
     */
    public void update(double primary, double secondary) {
        // the threshold deviation is updated first so that the gap is measured
        // against the mean before it absorbs the new score
        updateThreshold(primary);
        primaryDeviation.update(primary);
        secondaryDeviation.update(secondary);
        ++count;
    }

    /**
     * Absorbs a score capped at 2.0 as the primary value and the difference
     * (secondScore - lastScore) as the secondary value.
     *
     * @param score       the new primary score
     * @param secondScore the current value used for the secondary series
     * @param lastScore   the previous value used for the secondary series
     * @param method      transformation method (currently unused)
     */
    public void update(double score, double secondScore, double lastScore, TransformMethod method) {
        update(min(score, 2.0), secondScore - lastScore);
    }

    /**
     * @return the primary deviation object itself (not a copy)
     */
    public Deviation getPrimaryDeviation() {
        return primaryDeviation;
    }

    /**
     * @return the secondary deviation object itself (not a copy)
     */
    public Deviation getSecondaryDeviation() {
        return secondaryDeviation;
    }

    /**
     * @param factor the new zFactor; the value is stored without validation
     */
    public void setZfactor(double factor) {
        zFactor = factor;
    }

    /**
     * sets the lower threshold -- which is used to scale the factor variable
     *
     * @param lower the new factor adjustment threshold
     */
    public void setLowerThreshold(double lower) {
        factorAdjustmentThreshold = lower;
    }

    /**
     * sets the absolute lower bound of the threshold and turns off the auto
     * adjustment -- to respect the direct setting
     *
     * @param value the absolute lower bound of thresholds
     */
    public void setAbsoluteThreshold(double value) {
        autoThreshold = false;
        absoluteThreshold = value;
    }

    /**
     * @param initial the threshold used before enough scores are seen
     */
    public void setInitialThreshold(double initial) {
        initialThreshold = initial;
    }

    /**
     * @param scoreDifferencing the mixing weight, which must lie in [0,1]; the
     *                          argument check fails otherwise
     */
    public void setScoreDifferencing(double scoreDifferencing) {
        checkArgument(scoreDifferencing >= 0 && scoreDifferencing <= 1, "incorrect score differencing parameter");
        this.scoreDifferencing = scoreDifferencing;
    }

    /**
     * to be updated as more deviations are added
     *
     * @return copies of the primary, secondary and threshold deviations, in that
     *         order
     */
    public Deviation[] getDeviations() {
        Deviation[] deviations = new Deviation[DEFAULT_DEVIATION_STATES];
        deviations[0] = primaryDeviation.copy();
        deviations[1] = secondaryDeviation.copy();
        deviations[2] = thresholdDeviation.copy();
        return deviations;
    }

    public boolean isAutoThreshold() {
        return autoThreshold;
    }

    public int getCount() {
        return count;
    }

    public void setCount(int count) {
        this.count = count;
    }

    public double getAbsoluteThreshold() {
        return absoluteThreshold;
    }

    public double getLowerThreshold() {
        return factorAdjustmentThreshold;
    }

    public double getInitialThreshold() {
        return initialThreshold;
    }

    public double getScoreDifferencing() {
        return scoreDifferencing;
    }

    public double getZFactor() {
        return zFactor;
    }

    public int getMinimumScores() {
        return minimumScores;
    }

    /**
     * @param minimumScores the number of observations below which deviations are
     *                      not used; the value is stored without validation
     */
    public void setMinimumScores(int minimumScores) {
        this.minimumScores = minimumScores;
    }

    public void setAutoThreshold(boolean autoThreshold) {
        this.autoThreshold = autoThreshold;
    }

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy