/*
* Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License").
* You may not use this file except in compliance with the License.
* A copy of the License is located at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* or in the "license" file accompanying this file. This file is distributed
* on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
* express or implied. See the License for the specific language governing
* permissions and limitations under the License.
*/
package com.amazon.randomcutforest.parkservices;
import static com.amazon.randomcutforest.CommonUtils.checkArgument;
import static com.amazon.randomcutforest.CommonUtils.toFloatArray;
import static com.amazon.randomcutforest.RandomCutForest.DEFAULT_BOUNDING_BOX_CACHE_FRACTION;
import static com.amazon.randomcutforest.RandomCutForest.DEFAULT_CENTER_OF_MASS_ENABLED;
import static com.amazon.randomcutforest.RandomCutForest.DEFAULT_INITIAL_ACCEPT_FRACTION;
import static com.amazon.randomcutforest.RandomCutForest.DEFAULT_INTERNAL_SHINGLING_ENABLED;
import static com.amazon.randomcutforest.RandomCutForest.DEFAULT_NUMBER_OF_TREES;
import static com.amazon.randomcutforest.RandomCutForest.DEFAULT_OUTPUT_AFTER_FRACTION;
import static com.amazon.randomcutforest.RandomCutForest.DEFAULT_PARALLEL_EXECUTION_ENABLED;
import static com.amazon.randomcutforest.RandomCutForest.DEFAULT_SAMPLE_SIZE;
import static com.amazon.randomcutforest.RandomCutForest.DEFAULT_SHINGLE_SIZE;
import static com.amazon.randomcutforest.RandomCutForest.DEFAULT_STORE_SEQUENCE_INDEXES_ENABLED;
import static com.amazon.randomcutforest.config.ImputationMethod.RCF;
import static com.amazon.randomcutforest.parkservices.threshold.BasicThresholder.DEFAULT_ABSOLUTE_THRESHOLD;
import static com.amazon.randomcutforest.parkservices.threshold.BasicThresholder.DEFAULT_SCORE_DIFFERENCING;
import static com.amazon.randomcutforest.parkservices.threshold.BasicThresholder.DEFAULT_Z_FACTOR;
import static com.amazon.randomcutforest.preprocessor.Preprocessor.DEFAULT_START_NORMALIZATION;
import static com.amazon.randomcutforest.preprocessor.Preprocessor.DEFAULT_STOP_NORMALIZATION;
import static java.lang.Math.max;
import static java.lang.Math.min;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Optional;
import java.util.Random;
import java.util.function.Function;
import lombok.Getter;
import lombok.Setter;
import com.amazon.randomcutforest.RandomCutForest;
import com.amazon.randomcutforest.config.ForestMode;
import com.amazon.randomcutforest.config.ImputationMethod;
import com.amazon.randomcutforest.config.Precision;
import com.amazon.randomcutforest.config.TransformMethod;
import com.amazon.randomcutforest.parkservices.config.ScoringStrategy;
import com.amazon.randomcutforest.parkservices.returntypes.RCFComputeDescriptor;
import com.amazon.randomcutforest.parkservices.threshold.BasicThresholder;
import com.amazon.randomcutforest.preprocessor.IPreprocessor;
import com.amazon.randomcutforest.preprocessor.Preprocessor;
import com.amazon.randomcutforest.returntypes.DiVector;
import com.amazon.randomcutforest.returntypes.RangeVector;
import com.amazon.randomcutforest.returntypes.TimedRangeVector;
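// A minimal usage sketch (illustrative only; the builder settings below are
// assumptions, not defaults mandated by this class): configure a forest for a
// single metric that is shingled internally, then score points one at a time.
//
//   ThresholdedRandomCutForest trcf = ThresholdedRandomCutForest.builder()
//           .dimensions(8).shingleSize(8).sampleSize(256)
//           .transformMethod(TransformMethod.NORMALIZE).anomalyRate(0.005)
//           .randomSeed(42L).build();
//   AnomalyDescriptor result = trcf.process(new double[] { 42.0 }, System.currentTimeMillis());
//   if (result.getAnomalyGrade() > 0) {
//       // flagged as an anomaly; positive grades lie in (0, 1]
//   }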
/**
* This class provides a combined RCF and thresholder, both of which operate in
* a streaming manner and respect the arrow of time.
*/
@Getter
@Setter
public class ThresholdedRandomCutForest {
// saved description of the last seen anomaly
RCFComputeDescriptor lastAnomalyDescriptor;
// forestMode of operation
protected ForestMode forestMode = ForestMode.STANDARD;
protected TransformMethod transformMethod = TransformMethod.NONE;
protected ScoringStrategy scoringStrategy = ScoringStrategy.EXPECTED_INVERSE_DEPTH;
protected RandomCutForest forest;
protected PredictorCorrector predictorCorrector;
protected IPreprocessor preprocessor;
public ThresholdedRandomCutForest(Builder<?> builder) {
forestMode = builder.forestMode;
transformMethod = builder.transformMethod;
scoringStrategy = builder.scoringStrategy;
Preprocessor.Builder<?> preprocessorBuilder = Preprocessor.builder().shingleSize(builder.shingleSize)
.transformMethod(builder.transformMethod).forestMode(builder.forestMode);
int inputLength;
if (builder.forestMode == ForestMode.TIME_AUGMENTED) {
inputLength = builder.dimensions / builder.shingleSize;
preprocessorBuilder.inputLength(inputLength);
builder.dimensions += builder.shingleSize;
preprocessorBuilder.normalizeTime(builder.normalizeTime);
// force internal shingling for this option
builder.internalShinglingEnabled = Optional.of(true);
} else if (builder.forestMode == ForestMode.STREAMING_IMPUTE) {
// already validated
inputLength = builder.dimensions / builder.shingleSize;
preprocessorBuilder.inputLength(inputLength);
preprocessorBuilder.imputationMethod(builder.imputationMethod);
preprocessorBuilder.normalizeTime(true);
if (builder.fillValues != null) {
preprocessorBuilder.fillValues(builder.fillValues);
}
// force internal shingling; the preprocessor (not the caller) controls
// admittance of (possibly imputed) points
builder.internalShinglingEnabled = Optional.of(true);
preprocessorBuilder.useImputedFraction(builder.useImputedFraction.orElse(0.5));
} else {
// STANDARD
boolean smallInput = builder.internalShinglingEnabled.orElse(DEFAULT_INTERNAL_SHINGLING_ENABLED);
inputLength = (smallInput) ? builder.dimensions / builder.shingleSize : builder.dimensions;
preprocessorBuilder.inputLength(inputLength);
}
forest = builder.buildForest();
validateNonNegativeArray(builder.weights);
preprocessorBuilder.weights(builder.weights);
preprocessorBuilder.weightTime(builder.weightTime.orElse(1.0));
preprocessorBuilder.transformDecay(builder.transformDecay.orElse(1.0 / builder.sampleSize));
// to be used later
preprocessorBuilder.randomSeed(builder.randomSeed.orElse(0L) + 1);
preprocessorBuilder.dimensions(builder.dimensions);
preprocessorBuilder.stopNormalization(builder.stopNormalization.orElse(DEFAULT_STOP_NORMALIZATION));
preprocessorBuilder.startNormalization(builder.startNormalization.orElse(DEFAULT_START_NORMALIZATION));
preprocessor = preprocessorBuilder.build();
predictorCorrector = new PredictorCorrector(forest.getTimeDecay(), builder.anomalyRate, builder.autoAdjust,
builder.dimensions / builder.shingleSize, builder.randomSeed.orElse(0L));
lastAnomalyDescriptor = new RCFComputeDescriptor(null, 0, builder.forestMode, builder.transformMethod,
builder.imputationMethod);
predictorCorrector.setAbsoluteThreshold(builder.lowerThreshold.orElse(DEFAULT_ABSOLUTE_THRESHOLD));
predictorCorrector.setZfactor(builder.zFactor);
predictorCorrector.setScoreDifferencing(builder.scoreDifferencing.orElse(DEFAULT_SCORE_DIFFERENCING));
builder.ignoreNearExpectedFromAbove.ifPresent(predictorCorrector::setIgnoreNearExpectedFromAbove);
builder.ignoreNearExpectedFromBelow.ifPresent(predictorCorrector::setIgnoreNearExpectedFromBelow);
builder.ignoreNearExpectedFromAboveByRatio.ifPresent(predictorCorrector::setIgnoreNearExpectedFromAboveByRatio);
builder.ignoreNearExpectedFromBelowByRatio.ifPresent(predictorCorrector::setIgnoreNearExpectedFromBelowByRatio);
predictorCorrector.setLastStrategy(builder.scoringStrategy);
predictorCorrector.setIgnoreDrift(builder.alertOnceInDrift);
}
void validateNonNegativeArray(double[] array) {
if (array != null) {
for (double element : array) {
checkArgument(element >= 0, " array elements have to be non-negative");
}
}
}
// for mappers
public ThresholdedRandomCutForest(ForestMode forestMode, TransformMethod transformMethod,
ScoringStrategy scoringStrategy, RandomCutForest forest, PredictorCorrector predictorCorrector,
Preprocessor preprocessor, RCFComputeDescriptor descriptor) {
this.forestMode = forestMode;
this.transformMethod = transformMethod;
this.scoringStrategy = scoringStrategy;
this.forest = forest;
this.predictorCorrector = predictorCorrector;
this.preprocessor = preprocessor;
this.lastAnomalyDescriptor = descriptor;
}
// This constructor produces an internally shingled ThresholdedRCF model from an
// externally shingled RCF model -- possibly one that was part of an externally
// shingled ThresholdedRCF -- absent any transformations and augmentations.
// (Such externally shingled models may or may not be in use in the current
// version of OpenSearch.)
// A benefit of this conversion is that imputations become accessible to
// ThresholdedRCF: even if not every value of the input tuple is known, the
// process() function can still provide an anomaly score. That score is likely
// near the minimum, since RCF itself is used to fill in the missing values, so
// high anomaly scores continue to indicate likely anomalies.
// Note that the basic RandomCutForest cannot be changed easily, but the
// process() function only requires a fraction of the input.
// See ThresholdedRandomCutForestMapperTest.
public ThresholdedRandomCutForest(RandomCutForest forest, double futureAnomalyRate, List<Double> values,
double[] lastShingledInput) {
this.forest = forest;
int dimensions = forest.getDimensions();
int inputLength = dimensions / forest.getShingleSize();
Preprocessor preprocessor = new Preprocessor.Builder<>().transformMethod(TransformMethod.NONE)
.dimensions(dimensions).shingleSize(forest.getShingleSize()).inputLength(inputLength)
.initialShingledInput(lastShingledInput).initialPoint(toFloatArray(lastShingledInput))
.imputationMethod(RCF).startNormalization(0).build();
this.predictorCorrector = new PredictorCorrector(new BasicThresholder(values, futureAnomalyRate), inputLength);
preprocessor.setValuesSeen((int) forest.getTotalUpdates());
preprocessor.getDataQuality()[0].update(1.0);
this.preprocessor = preprocessor;
this.lastAnomalyDescriptor = new RCFComputeDescriptor(null, forest.getTotalUpdates());
}
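// A conversion sketch (illustrative; the local variables are assumptions):
// wrap an existing externally shingled forest together with its recently
// observed scores, so that process() can impute missing values via RCF.
//
//   RandomCutForest forest = ...; // externally shingled; dimensions = inputLength * shingleSize
//   List<Double> recentScores = ...; // scores seen so far, used to prime the thresholder
//   double[] lastShingle = ...; // the last shingled input known to the forest
//   ThresholdedRandomCutForest trcf = new ThresholdedRandomCutForest(forest, 0.005, recentScores,
//           lastShingle);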
protected <T extends AnomalyDescriptor> boolean saveDescriptor(T lastDescriptor) {
return (lastDescriptor.getAnomalyGrade() > 0);
}
protected <P extends AnomalyDescriptor> void augment(P description) {
description.setScoringStrategy(scoringStrategy);
initialSetup(description, lastAnomalyDescriptor, forest);
predictorCorrector.detect(description, lastAnomalyDescriptor, forest);
postProcess(description);
if (saveDescriptor(description)) {
lastAnomalyDescriptor = description.copyOf();
}
}
/**
* a single call that preprocesses data, computes score/grade and updates
* state
*
* @param inputPoint current input point
* @param timestamp time stamp of input
* @return anomaly descriptor for the current input point
*/
public AnomalyDescriptor process(double[] inputPoint, long timestamp) {
return process(inputPoint, timestamp, null);
}
/**
* a single call that preprocesses data, computes score/grade and updates
* state when the current input has potentially missing values
*
* @param inputPoint current input point
* @param timestamp time stamp of input
* @param missingValues indices of the input which are missing/questionable
* values
* @return anomaly descriptor for the current input point
*/
public AnomalyDescriptor process(double[] inputPoint, long timestamp, int[] missingValues) {
AnomalyDescriptor description = new AnomalyDescriptor(inputPoint, timestamp);
description.setScoringStrategy(scoringStrategy);
boolean cacheDisabled = (forest.getBoundingBoxCacheFraction() == 0);
try {
if (cacheDisabled) { // turn caching on temporarily
forest.setBoundingBoxCacheFraction(1.0);
}
if (missingValues != null) {
checkArgument(missingValues.length <= inputPoint.length, " number of missing values cannot exceed input length");
for (int i = 0; i < missingValues.length; i++) {
checkArgument(missingValues[i] >= 0, " missing values cannot be at negative position");
checkArgument(missingValues[i] < inputPoint.length,
"missing values cannot be at position larger than input length");
}
description.setMissingValues(missingValues);
}
augment(description);
} finally {
if (cacheDisabled) { // turn caching off
forest.setBoundingBoxCacheFraction(0);
}
}
if (saveDescriptor(description)) {
lastAnomalyDescriptor = description.copyOf();
}
return description;
}
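// A sketch of scoring with questionable values (illustrative; trcf and
// timestamp are assumed to be in scope): for an input of length 2, mark index
// 1 as missing and let the preprocessor impute it before scoring.
//
//   double[] point = new double[] { 17.0, 0.0 }; // second coordinate is unknown
//   AnomalyDescriptor d = trcf.process(point, timestamp, new int[] { 1 });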
/**
* the following function processes a list of vectors sequentially; the main
* benefit of this invocation is that caching persists from one data point to
* the next, and thus the execution is efficient. Moreover, in many scenarios
* where serialization/deserialization is expensive, it may be beneficial to
* invoke sequential processing on a contiguous chunk of input (we avoid the
* word "batch" -- the entire goal of this procedure is to provide sequential
* processing and not standard batch processing). The procedure avoids the
* transfer of ephemeral transient objects for non-anomalies and thereby can
* have additional benefits. At the moment the operation does not support
* external timestamps.
*
* @param data   a vector of vectors (each of which has to have the same
*               inputLength)
* @param filter a condition for keeping a descriptor (recommended filter:
*               anomalyGrade positive)
* @return collection of descriptors of the anomalies filtered by the condition
*/
public List<AnomalyDescriptor> processSequentially(double[][] data, Function<AnomalyDescriptor, Boolean> filter) {
ArrayList<AnomalyDescriptor> answer = new ArrayList<>();
if (data != null && data.length > 0) {
boolean cacheDisabled = (forest.getBoundingBoxCacheFraction() == 0);
try {
if (cacheDisabled) { // turn caching on temporarily
forest.setBoundingBoxCacheFraction(1.0);
}
long timestamp = preprocessor.getInternalTimeStamp();
int length = preprocessor.getInputLength();
for (double[] point : data) {
checkArgument(point.length == length, " nonuniform lengths ");
AnomalyDescriptor description = new AnomalyDescriptor(point, timestamp++);
augment(description);
if (saveDescriptor(description)) {
lastAnomalyDescriptor = description.copyOf();
}
if (filter.apply(description)) {
answer.add(description);
}
}
} finally {
if (cacheDisabled) { // turn caching off
forest.setBoundingBoxCacheFraction(0);
}
}
}
return answer;
}
// uses the recommended filter: keep descriptors with positive anomaly grade
public List<AnomalyDescriptor> processSequentially(double[][] data) {
return processSequentially(data, x -> x.getAnomalyGrade() > 0);
}
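// A sketch of sequential processing with a custom filter (illustrative; trcf
// and data are assumed to be in scope): retain every descriptor, not just
// anomalies, e.g. to inspect grades offline. The one-argument overload above
// is equivalent to passing the filter x -> x.getAnomalyGrade() > 0.
//
//   List<AnomalyDescriptor> all = trcf.processSequentially(data, x -> true);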
/**
* a function that extrapolates the data seen by the ThresholdedRCF model,
* using the transformations allowed (as opposed to just using RCFs). The
* forecasting also allows for a predictor-corrector pattern, which implies
* that some noise can be eliminated -- this can be important for various
* transformations. While the algorithm can function in STREAMING_IMPUTE mode,
* where missing data is imputed on the fly, it may require effort to validate
* that the internal imputation is reasonably consistent with extrapolation.
* In general, since STREAMING_IMPUTE can use non-RCF options to fill in
* missing data, the internal imputation and the extrapolation need not be
* consistent.
*
* @param horizon    the length of time in the future which is being forecast
* @param correct    a boolean indicating if the predictor-corrector
*                   subroutine should be turned on; this is especially
*                   helpful if there has been an anomaly in the recent past
* @param centrality in general RCF predicts the p50 value of conditional
*                   samples (centrality = 1). This parameter relaxes the
*                   conditional sampling. Using assumptions about input data
*                   (hence external to this code) it may be possible to use
*                   this parameter and the range information for confidence
*                   bounds.
* @return a timed range vector where values[i] corresponds to the forecast
*         for horizon (i+1). The upper and lower arrays indicate the
*         corresponding bounds based on the conditional sampling (and
*         transformation). Note that TRCF manages time in process() and thus
*         the forecasts always have timestamps associated, which makes it
*         easier to execute the same code for the various forest modes
*         STREAMING_IMPUTE, STANDARD and TIME_AUGMENTED. For STREAMING_IMPUTE
*         the time components of the prediction will be 0 because the time
*         information is already being used to fill in missing entries. For
*         STANDARD mode the time components correspond to the average arrival
*         difference. For TIME_AUGMENTED mode the time components are the
*         result of the joint prediction. Finally, note that setting the
*         weight of time or of any input column to 0 will also zero out the
*         corresponding forecast.
*/
public TimedRangeVector extrapolate(int horizon, boolean correct, double centrality) {
int shingleSize = preprocessor.getShingleSize();
checkArgument(shingleSize > 1, "extrapolation is not meaningful for shingle size = 1");
// note the forest may have external shingling ...
int dimensions = forest.getDimensions();
int blockSize = dimensions / shingleSize;
float[] lastPoint = preprocessor.getLastShingledPoint();
if (forest.isOutputReady()) {
int gap = (int) (preprocessor.getInternalTimeStamp() - lastAnomalyDescriptor.getInternalTimeStamp());
float[] newPoint = lastPoint;
// gap will be at least 1
if (gap <= shingleSize && correct && lastAnomalyDescriptor.getExpectedRCFPoint() != null) {
if (gap == 1) {
newPoint = lastAnomalyDescriptor.getExpectedRCFPoint();
} else {
newPoint = predictorCorrector.applyPastCorrector(newPoint, gap, shingleSize, blockSize,
preprocessor.getScale(), transformMethod, lastAnomalyDescriptor);
}
}
RangeVector answer = forest.extrapolateWithRanges(newPoint, horizon, blockSize, false, 0, centrality);
return preprocessor.invertForecastRange(answer, lastAnomalyDescriptor.getInputTimestamp(),
lastAnomalyDescriptor.getDeltaShift(), lastAnomalyDescriptor.getExpectedRCFPoint() != null,
lastAnomalyDescriptor.getExpectedTimeStamp());
} else {
return new TimedRangeVector(horizon * blockSize, horizon);
}
}
public TimedRangeVector extrapolate(int horizon) {
return extrapolate(horizon, true, 1.0);
}
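// A forecast sketch (illustrative; assumes a single input variable so that
// blockSize is 1, and assumes the public rangeVector/timeStamps fields of
// TimedRangeVector and RangeVector from the returntypes package):
//
//   TimedRangeVector forecast = trcf.extrapolate(4);
//   for (int i = 0; i < 4; i++) {
//       float mid = forecast.rangeVector.values[i]; // central (p50-style) forecast
//       float up = forecast.rangeVector.upper[i];   // upper bound
//       float low = forecast.rangeVector.lower[i];  // lower bound
//       long time = forecast.timeStamps[i];         // associated timestamp
//   }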
public RandomCutForest getForest() {
return forest;
}
public void setZfactor(double factor) {
predictorCorrector.setZfactor(factor);
}
public void setLowerThreshold(double lower) {
predictorCorrector.setAbsoluteThreshold(lower);
}
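// note: the deprecated "horizon" knob below is the complement of score
// differencing; setHorizon(h) is equivalent to setScoreDifferencing(1 - h)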
@Deprecated
public void setHorizon(double horizon) {
predictorCorrector.setScoreDifferencing(1 - horizon);
}
public void setScoreDifferencing(double scoreDifferencing) {
predictorCorrector.setScoreDifferencing(scoreDifferencing);
}
public void setIgnoreNearExpectedFromAbove(double[] ignoreSimilarFromAbove) {
predictorCorrector.setIgnoreNearExpectedFromAbove(ignoreSimilarFromAbove);
}
public void setIgnoreNearExpectedFromAboveByRatio(double[] ignoreSimilarFromAbove) {
predictorCorrector.setIgnoreNearExpectedFromAboveByRatio(ignoreSimilarFromAbove);
}
public void setIgnoreNearExpectedFromBelow(double[] ignoreSimilarFromBelow) {
predictorCorrector.setIgnoreNearExpectedFromBelow(ignoreSimilarFromBelow);
}
public void setIgnoreNearExpectedFromBelowByRatio(double[] ignoreSimilarFromBelow) {
predictorCorrector.setIgnoreNearExpectedFromBelowByRatio(ignoreSimilarFromBelow);
}
public void setScoringStrategy(ScoringStrategy strategy) {
this.scoringStrategy = strategy;
}
@Deprecated
public void setInitialThreshold(double initial) {
predictorCorrector.setInitialThreshold(initial);
}
/**
* sets up the AnomalyDescriptor object
*
* @param description description of the input point
* @param lastAnomalyDescriptor the descriptor of the last anomaly
* @param forest the RCF
* @return the descriptor to be used for anomaly scoring
*/
<P extends AnomalyDescriptor> P initialSetup(P description, RCFComputeDescriptor lastAnomalyDescriptor,
RandomCutForest forest) {
description.setForestMode(forestMode);
description.setTransformMethod(transformMethod);
description.setImputationMethod(preprocessor.getImputationMethod());
description.setNumberOfTrees(forest.getNumberOfTrees());
description.setTotalUpdates(forest.getTotalUpdates());
description.setLastAnomalyInternalTimestamp(lastAnomalyDescriptor.getInternalTimeStamp());
description.setLastExpectedRCFPoint(lastAnomalyDescriptor.getExpectedRCFPoint());
description.setDataConfidence(forest.getTimeDecay(), preprocessor.getValuesSeen(), forest.getOutputAfter(),
preprocessor.dataQuality());
description.setShingleSize(preprocessor.getShingleSize());
description.setInputLength(preprocessor.getInputLength());
description.setDimension(forest.getDimensions());
description.setReasonableForecast(forest.isOutputReady() && forest.getDimensions() >= 4);
description.setScale(preprocessor.getScale());
description.setShift(preprocessor.getShift());
description.setDeviations(preprocessor.getSmoothedDeviations());
description.setNumberOfNewImputes(preprocessor.numberOfImputes(description.getInputTimestamp()));
description.setInternalTimeStamp(preprocessor.getInternalTimeStamp() + description.getNumberOfNewImputes());
description.setRCFPoint(preprocessor.getScaledShingledInput(description.getCurrentInput(),
description.getInputTimestamp(), description.getMissingValues(), forest));
return description;
}
<P extends AnomalyDescriptor> void postProcess(P result) {
float[] point = result.getRCFPoint();
if (point != null) {
// first populate the description with current knowledge
// then update the preprocessor
// then update the RCF
if (result.getAnomalyGrade() > 0) {
/*
 * adds information about the expected point to the result descriptor
 * (provided it is marked anomalous). Note that it uses relativeIndex; that
 * is, it can determine that the anomaly occurred in the past (but within
 * the shingle) and not at the current point -- even though the detection
 * has triggered now. While this may appear to be improper, information
 * theoretically we may have a situation where an anomaly is only
 * discoverable after the "horse has bolted" -- suppose that we see a
 * random mixture of the triples {1, 2, 3} and {2, 4, 5} corresponding to
 * "slow weeks" and "busy weeks", for example 1, 2, 3, 1, 2, 3, 2, 4, 5,
 * 1, 2, 3, 2, 4, 5, ... If we see {2, 2, X} (at positions 0 and 1 (mod 3))
 * and are yet to see X, then we can infer that the pattern is anomalous --
 * but we cannot determine which of the 2's is to blame. If it were the
 * first 2, then the detection is late. If X = 3 then we know it is the
 * first 2 in that unfinished triple; and if X = 5 then it is the second 2.
 * In a sense we are only truly wiser once the bolted horse has returned!
 * But if we were to say that the anomaly was always at the second 2 then
 * that appears suboptimal -- one natural path can be based on the ratio of
 * the triples {1, 2, 3} and {2, 4, 5} seen before. Even better, we can
 * attempt to estimate a dynamic time-dependent ratio -- and that is what
 * RCF would do.
 */
int shingleSize = result.getShingleSize();
int dimension = result.getDimension();
int base = dimension / shingleSize;
double[] reference = result.getCurrentInput();
float[] newPoint = result.getExpectedRCFPoint();
int index = result.getRelativeIndex();
if (index < 0) {
reference = preprocessor.getShingledInput(shingleSize + index);
result.setPastTimeStamp(preprocessor.getTimeStamp(shingleSize + index));
}
result.setPastValues(reference);
if (newPoint != null) {
double[] values = preprocessor.getExpectedValue(index, reference, point, newPoint);
if (forestMode == ForestMode.TIME_AUGMENTED) {
int endPosition = (shingleSize + index) * base;
double timeGap = (newPoint[endPosition - 1] - point[endPosition - 1]);
long expectedTimestamp = (timeGap == 0) ? result.getInputTimestamp() : (long) values[base - 1];
if (index < 0) {
expectedTimestamp = (timeGap == 0) ? preprocessor.getTimeStamp(shingleSize - 1 + index)
: (long) values[base - 1];
}
result.setExpectedTimeStamp(expectedTimestamp);
double[] plausibleValues = Arrays.copyOf(values, base - 1);
result.setExpectedValues(0, plausibleValues, 1.0);
} else {
result.setExpectedValues(0, values, 1.0);
}
}
int startPosition = (shingleSize - 1 + result.getRelativeIndex()) * base;
DiVector attribution = result.getAttribution();
if (forestMode == ForestMode.TIME_AUGMENTED) {
--base;
}
double[] flattenedAttribution = new double[base];
for (int i = 0; i < base; i++) {
flattenedAttribution[i] = attribution.getHighLowSum(startPosition + i);
}
result.setRelevantAttribution(flattenedAttribution);
if (forestMode == ForestMode.TIME_AUGMENTED) {
result.setTimeAttribution(attribution.getHighLowSum(startPosition + base));
}
}
}
// will update the forest
preprocessor.update(result.getCurrentInput(), point, result.getInputTimestamp(), result.getMissingValues(),
forest);
if (point != null) {
if (result.getAnomalyGrade() > 0) {
double[] postShift = preprocessor.getShift(); // may have changed
result.setPostShift(postShift);
result.setTransformDecay(preprocessor.getTransformDecay());
}
}
if (preprocessor.isOutputReady()) {
result.setPostDeviations(preprocessor.getSmoothedDeviations());
}
}
/**
* @return a new builder.
*/
public static Builder<?> builder() {
return new Builder<>();
}
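// A configuration sketch for TIME_AUGMENTED mode (illustrative settings): the
// time gaps between points are appended as an extra dimension per shingle
// entry and can be normalized.
//
//   ThresholdedRandomCutForest trcf = ThresholdedRandomCutForest.builder()
//           .dimensions(4).shingleSize(4)
//           .forestMode(ForestMode.TIME_AUGMENTED).normalizeTime(true)
//           .transformMethod(TransformMethod.NORMALIZE).build();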
public static class Builder<T extends Builder<T>> {
// We use Optional types for optional primitive fields when it doesn't make
// sense to use a constant default.
protected int dimensions;
protected int sampleSize = DEFAULT_SAMPLE_SIZE;
protected Optional<Integer> outputAfter = Optional.empty();
protected Optional<Integer> startNormalization = Optional.empty();
protected Optional<Integer> stopNormalization = Optional.empty();
protected int numberOfTrees = DEFAULT_NUMBER_OF_TREES;
protected Optional<Double> timeDecay = Optional.empty();
protected Optional<Double> scoreDifferencing = Optional.empty();
protected Optional<Double> lowerThreshold = Optional.empty();
protected Optional<Double> weightTime = Optional.empty();
protected Optional<Long> randomSeed = Optional.empty();
protected boolean storeSequenceIndexesEnabled = DEFAULT_STORE_SEQUENCE_INDEXES_ENABLED;
protected boolean centerOfMassEnabled = DEFAULT_CENTER_OF_MASS_ENABLED;
protected boolean parallelExecutionEnabled = DEFAULT_PARALLEL_EXECUTION_ENABLED;
protected Optional<Integer> threadPoolSize = Optional.empty();
protected double boundingBoxCacheFraction = DEFAULT_BOUNDING_BOX_CACHE_FRACTION;
protected int shingleSize = DEFAULT_SHINGLE_SIZE;
protected Optional<Boolean> internalShinglingEnabled = Optional.empty();
protected double initialAcceptFraction = DEFAULT_INITIAL_ACCEPT_FRACTION;
protected double anomalyRate = 0.01;
protected TransformMethod transformMethod = TransformMethod.NONE;
protected ImputationMethod imputationMethod = RCF;
protected ForestMode forestMode = ForestMode.STANDARD;
protected ScoringStrategy scoringStrategy = ScoringStrategy.EXPECTED_INVERSE_DEPTH;
protected boolean normalizeTime = false;
protected double[] fillValues = null;
protected double[] weights = null;
protected Optional<Double> useImputedFraction = Optional.empty();
protected boolean autoAdjust = false;
protected double zFactor = DEFAULT_Z_FACTOR;
protected boolean alertOnceInDrift = false;
protected Optional<Double> transformDecay = Optional.empty();
protected Optional<double[]> ignoreNearExpectedFromAbove = Optional.empty();
protected Optional<double[]> ignoreNearExpectedFromBelow = Optional.empty();
protected Optional<double[]> ignoreNearExpectedFromAboveByRatio = Optional.empty();
protected Optional<double[]> ignoreNearExpectedFromBelowByRatio = Optional.empty();
void validate() {
if (forestMode == ForestMode.TIME_AUGMENTED) {
if (internalShinglingEnabled.isPresent()) {
checkArgument(shingleSize == 1 || internalShinglingEnabled.get(),
" shingle size has to be 1 or internal shingling must be turned on");
checkArgument(transformMethod == TransformMethod.NONE || internalShinglingEnabled.get(),
" internal shingling must be turned on for transforms");
} else {
internalShinglingEnabled = Optional.of(true);
}
if (useImputedFraction.isPresent()) {
throw new IllegalArgumentException(" imputation infeasible");
}
} else if (forestMode == ForestMode.STREAMING_IMPUTE) {
checkArgument(shingleSize > 1, "imputation with shingle size 1 is not meaningful");
internalShinglingEnabled.ifPresent(x -> checkArgument(x,
" the input cannot be externally shingled (even if the internal representation is different) "));
} else {
if (!internalShinglingEnabled.isPresent()) {
internalShinglingEnabled = Optional.of(true);
}
if (useImputedFraction.isPresent()) {
throw new IllegalArgumentException(" imputation infeasible");
}
}
if (startNormalization.isPresent()) {
// we should not be setting normalizations unless we are careful
if (outputAfter.isPresent()) {
// can be overspecified
checkArgument(outputAfter.get() + shingleSize - 1 > startNormalization.get(),
" output can only begin after normalization completes; reduce startNormalization");
} else {
int n = startNormalization.get();
checkArgument(n > 0, " startNormalization has to be positive");
// if start normalization is low then first few output can be 0
outputAfter = Optional
.of(max(max(1, (int) (sampleSize * DEFAULT_OUTPUT_AFTER_FRACTION)), n - shingleSize + 1));
}
} else {
if (outputAfter.isPresent()) {
startNormalization = Optional.of(min(DEFAULT_START_NORMALIZATION, outputAfter.get()));
}
}
}
public ThresholdedRandomCutForest build() {
validate();
return new ThresholdedRandomCutForest(this);
}
protected RandomCutForest buildForest() {
RandomCutForest.Builder builder = new RandomCutForest.Builder().dimensions(dimensions)
.sampleSize(sampleSize).numberOfTrees(numberOfTrees)
.storeSequenceIndexesEnabled(storeSequenceIndexesEnabled).centerOfMassEnabled(centerOfMassEnabled)
.parallelExecutionEnabled(parallelExecutionEnabled)
.boundingBoxCacheFraction(boundingBoxCacheFraction).shingleSize(shingleSize)
.internalShinglingEnabled(internalShinglingEnabled.get())
.initialAcceptFraction(initialAcceptFraction);
if (forestMode != ForestMode.STREAMING_IMPUTE) {
outputAfter.ifPresent(builder::outputAfter);
} else {
// adjusting outputAfter by shingleSize - 1 to account for the change
// between internal and external shingling
outputAfter.ifPresent(n -> {
int num = max(startNormalization.orElse(DEFAULT_START_NORMALIZATION), n) - shingleSize + 1;
checkArgument(num > 0, " max(start normalization, output after) should be at least " + shingleSize);
builder.outputAfter(num);
});
}
timeDecay.ifPresent(builder::timeDecay);
randomSeed.ifPresent(builder::randomSeed);
threadPoolSize.ifPresent(builder::threadPoolSize);
return builder.build();
}
public T dimensions(int dimensions) {
this.dimensions = dimensions;
return (T) this;
}
public T sampleSize(int sampleSize) {
this.sampleSize = sampleSize;
return (T) this;
}
public T startNormalization(int startNormalization) {
this.startNormalization = Optional.of(startNormalization);
return (T) this;
}
public T stopNormalization(int stopNormalization) {
this.stopNormalization = Optional.of(stopNormalization);
return (T) this;
}
public T outputAfter(int outputAfter) {
this.outputAfter = Optional.of(outputAfter);
return (T) this;
}
public T numberOfTrees(int numberOfTrees) {
this.numberOfTrees = numberOfTrees;
return (T) this;
}
public T shingleSize(int shingleSize) {
this.shingleSize = shingleSize;
return (T) this;
}
public T timeDecay(double timeDecay) {
this.timeDecay = Optional.of(timeDecay);
return (T) this;
}
public T transformDecay(double transformDecay) {
this.transformDecay = Optional.of(transformDecay);
return (T) this;
}
public T zFactor(double zFactor) {
this.zFactor = zFactor;
return (T) this;
}
public T useImputedFraction(double fraction) {
this.useImputedFraction = Optional.of(fraction);
return (T) this;
}
public T randomSeed(long randomSeed) {
this.randomSeed = Optional.of(randomSeed);
return (T) this;
}
public T centerOfMassEnabled(boolean centerOfMassEnabled) {
this.centerOfMassEnabled = centerOfMassEnabled;
return (T) this;
}
public T parallelExecutionEnabled(boolean parallelExecutionEnabled) {
this.parallelExecutionEnabled = parallelExecutionEnabled;
return (T) this;
}
public T threadPoolSize(int threadPoolSize) {
this.threadPoolSize = Optional.of(threadPoolSize);
return (T) this;
}
public T storeSequenceIndexesEnabled(boolean storeSequenceIndexesEnabled) {
this.storeSequenceIndexesEnabled = storeSequenceIndexesEnabled;
return (T) this;
}
@Deprecated
public T compact(boolean compact) {
return (T) this;
}
public T internalShinglingEnabled(boolean internalShinglingEnabled) {
this.internalShinglingEnabled = Optional.of(internalShinglingEnabled);
return (T) this;
}
@Deprecated
public T precision(Precision precision) {
return (T) this;
}
public T boundingBoxCacheFraction(double boundingBoxCacheFraction) {
this.boundingBoxCacheFraction = boundingBoxCacheFraction;
return (T) this;
}
public T initialAcceptFraction(double initialAcceptFraction) {
this.initialAcceptFraction = initialAcceptFraction;
return (T) this;
}
public Random getRandom() {
// If a random seed was given, use it to create a new Random. Otherwise, call
// the 0-argument constructor
return randomSeed.map(Random::new).orElseGet(Random::new);
}
public T anomalyRate(double anomalyRate) {
this.anomalyRate = anomalyRate;
return (T) this;
}
public T imputationMethod(ImputationMethod imputationMethod) {
this.imputationMethod = imputationMethod;
return (T) this;
}
public T fillValues(double[] values) {
// values cannot be null
this.fillValues = Arrays.copyOf(values, values.length);
return (T) this;
}
public T weights(double[] values) {
// values cannot be null
this.weights = Arrays.copyOf(values, values.length);
return (T) this;
}
public T normalizeTime(boolean normalizeTime) {
this.normalizeTime = normalizeTime;
return (T) this;
}
public T transformMethod(TransformMethod method) {
this.transformMethod = method;
return (T) this;
}
public T forestMode(ForestMode forestMode) {
this.forestMode = forestMode;
return (T) this;
}
public T scoreDifferencing(double persistence) {
this.scoreDifferencing = Optional.of(persistence);
return (T) this;
}
public T autoAdjust(boolean autoAdjust) {
this.autoAdjust = autoAdjust;
return (T) this;
}
public T weightTime(double value) {
this.weightTime = Optional.of(value);
return (T) this;
}
public T ignoreNearExpectedFromAbove(double[] ignoreSimilarFromAbove) {
this.ignoreNearExpectedFromAbove = Optional.ofNullable(ignoreSimilarFromAbove);
return (T) this;
}
public T ignoreNearExpectedFromBelow(double[] ignoreSimilarFromBelow) {
this.ignoreNearExpectedFromBelow = Optional.ofNullable(ignoreSimilarFromBelow);
return (T) this;
}
public T ignoreNearExpectedFromAboveByRatio(double[] ignoreSimilarFromAboveByRatio) {
this.ignoreNearExpectedFromAboveByRatio = Optional.ofNullable(ignoreSimilarFromAboveByRatio);
return (T) this;
}
public T ignoreNearExpectedFromBelowByRatio(double[] ignoreSimilarFromBelowByRatio) {
this.ignoreNearExpectedFromBelowByRatio = Optional.ofNullable(ignoreSimilarFromBelowByRatio);
return (T) this;
}
public T scoringStrategy(ScoringStrategy scoringStrategy) {
this.scoringStrategy = scoringStrategy;
return (T) this;
}
public T alertOnce(boolean alertOnceInDrift) {
this.alertOnceInDrift = alertOnceInDrift;
return (T) this;
}
}
}