
prerna.reactor.algorithms.RunMultiClusteringReactor Maven / Gradle / Ivy
The newest version!
package prerna.reactor.algorithms;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import org.apache.logging.log4j.Level;
import org.apache.logging.log4j.Logger;
import org.apache.logging.log4j.core.config.Configurator;
import prerna.algorithm.api.ITableDataFrame;
import prerna.algorithm.learning.util.Cluster;
import prerna.reactor.frame.AbstractFrameReactor;
import prerna.sablecc2.om.GenRowStruct;
import prerna.sablecc2.om.NounStore;
import prerna.sablecc2.om.PixelDataType;
import prerna.sablecc2.om.PixelOperationType;
import prerna.sablecc2.om.ReactorKeysEnum;
import prerna.sablecc2.om.nounmeta.NounMetadata;
import prerna.util.ArrayUtilityMethods;
import prerna.util.usertracking.AnalyticsTrackerHelper;
import prerna.util.usertracking.UserTrackerFactory;
public class RunMultiClusteringReactor extends AbstractFrameReactor {
// Fully-qualified name of this class; execute() passes it to getLogger(...).
private static final String CLASS_NAME = RunMultiClusteringReactor.class.getName();
// Input keys (registered in keysToGet by the constructor) for the lower and
// upper bound on the number of clusters to try.
private static final String MIN_NUM_CLUSTERS = "minNumClusters";
private static final String MAX_NUM_CLUSTERS = "maxNumClusters";
// parameters for the algorithm
// instanceIndex is reset to 0 at the start of every execute() call.
private int instanceIndex;
// Bounds on the cluster count, read via getMinNumClusters()/getMaxNumClusters();
// execute() caps maxNumClusters at the number of unique instance values.
private int minNumClusters;
private int maxNumClusters;
// Column whose unique values are counted and clustered; its name is also the
// base for the generated "<column>_CLUSTER" result column.
private String instanceColumn;
// values from data
// Attribute column names as an array (copied from attributeNamesList in execute()).
private String[] attributeNames;
// Attribute column names as returned by getColumns().
// NOTE(review): raw List here; elements look like Strings given the
// toArray(new String[] {}) call in execute() - confirm and parameterize.
private List attributeNamesList;
// isNumeric[i] holds dataFrame.isNumeric(attributeNames[i]).
private boolean[] isNumeric;
/**
* RunMultiClustering(instance = column, minNumClusters = min#, maxNumCluster = max#, col1, col2....);
*/
public RunMultiClusteringReactor() {
this.keysToGet = new String[]{ReactorKeysEnum.INSTANCE_KEY.getKey(), MIN_NUM_CLUSTERS, MAX_NUM_CLUSTERS, ReactorKeysEnum.ATTRIBUTES.getKey()};
}
/**
 * Clusters the unique values of the instance column and merges the resulting
 * cluster assignment back onto the frame as a new column.
 * <p>
 * Steps:
 * <ol>
 * <li>Read the instance column, cluster-count bounds and attribute columns
 *     from the reactor inputs, and record which attributes are numeric.</li>
 * <li>Cap both cluster-count bounds at the number of unique instance values,
 *     since asking for more clusters than instances cannot be satisfied.</li>
 * <li>Pick an unused column name of the form {@code <instance>_CLUSTER} or
 *     {@code <instance>_CLUSTER_<n>}.</li>
 * <li>If the bounds differ, search for the best cluster count via
 *     {@code runGoldenSelectionForNumberOfClusters}; otherwise run the
 *     clustering routine once with that single count.</li>
 * <li>Merge the result onto the frame and report the run to the analytics
 *     tracker.</li>
 * </ol>
 *
 * @return the modified frame wrapped as a {@code FRAME} noun, flagged with
 *         {@code FRAME_DATA_CHANGE} and {@code FRAME_HEADERS_CHANGE}
 * @throws IllegalArgumentException if the instance column has exactly one
 *         unique value (also propagated from the clustering helpers)
 */
@Override
public NounMetadata execute() {
    Logger logger = this.getLogger(CLASS_NAME);
    ITableDataFrame dataFrame = getFrame();
    dataFrame.setLogger(logger);

    // resolve reactor inputs into instance state
    this.instanceIndex = 0;
    this.instanceColumn = getInstanceColumn();
    this.minNumClusters = getMinNumClusters();
    this.maxNumClusters = getMaxNumClusters();
    this.attributeNamesList = getColumns();
    this.attributeNames = attributeNamesList.toArray(new String[] {});
    this.isNumeric = new boolean[this.attributeNames.length];
    for (int i = 0; i < this.attributeNames.length; i++) {
        this.isNumeric[i] = dataFrame.isNumeric(this.attributeNames[i]);
    }

    int numInstances = dataFrame.getUniqueInstanceCount(instanceColumn);
    if (numInstances == 1) {
        throw new IllegalArgumentException("Instance column only contains one unique value.");
    }
    // Neither bound may request more clusters than there are distinct
    // instances. Capping the minimum as well keeps the search range from
    // starting above the data size; when both bounds collapse to the same
    // value the single-run branch below handles it.
    if (maxNumClusters > numInstances) {
        maxNumClusters = numInstances;
    }
    if (minNumClusters > numInstances) {
        minNumClusters = numInstances;
    }

    // to avoid adding columns with same name, append a counter until unused
    String[] allColNames = dataFrame.getColumnHeaders();
    String newColName = instanceColumn + "_CLUSTER";
    int counter = 0;
    while (ArrayUtilityMethods.arrayContainsValue(allColNames, newColName)) {
        counter++;
        newColName = instanceColumn + "_CLUSTER_" + counter;
    }

    AlgorithmSingleColStore results = null;
    if (minNumClusters != maxNumClusters) {
        results = runGoldenSelectionForNumberOfClusters(dataFrame, minNumClusters, maxNumClusters, logger);
    } else {
        // usually occurs when there are too few data points
        results = runClusteringRoutine(dataFrame, minNumClusters, new ArrayList(), new HashMap(), logger);
    }

    // merge data back onto the frame
    AlgorithmMergeHelper.mergeSimpleAlgResult(dataFrame, instanceColumn, newColName, "NUMBER", results);

    // usage tracking
    UserTrackerFactory.getInstance().trackAnalyticsWidget(
            this.insight,
            dataFrame,
            "ClusterOptimization",
            AnalyticsTrackerHelper.getHashInputs(this.store, this.keysToGet));

    return new NounMetadata(dataFrame, PixelDataType.FRAME, PixelOperationType.FRAME_DATA_CHANGE, PixelOperationType.FRAME_HEADERS_CHANGE);
}
/**
 * Searches the cluster counts between {@code start} and {@code end} for the
 * one whose clustering gets the highest score from
 * {@code computeClusteringScore}, narrowing the interval with golden-ratio
 * probe points rounded to integers.
 * <p>
 * Two interior probes are evaluated per round; the interval end next to the
 * lower-scoring probe is discarded and the surviving probe is reused. The
 * loop stops once the interval endpoints are at most 1 apart.
 *
 * @param data   frame passed through to the clustering and scoring helpers
 * @param start  one end of the cluster-count interval
 * @param end    the other end of the cluster-count interval
 * @param logger logger used for progress output and passed to the helpers
 * @return the clustering result recorded for the best score seen
 * @throws IllegalArgumentException if both initial probes fail; the message
 *         joins the two underlying error messages
 */
private AlgorithmSingleColStore runGoldenSelectionForNumberOfClusters(ITableDataFrame data, int start, int end, Logger logger) {
logger.info("Start execution of golden selection logic to determine best cluster...");
AlgorithmSingleColStore bestResults = null;
// [a, b] is the current search interval
int a = start;
int b = end;
double phi = (double) (1 + Math.sqrt(5)) / 2;
// (phi - 1) ~ 0.618 and (2 - phi) ~ 0.382, so x1 sits nearer to a and x2 nearer to b
int x1 = (int) Math.round((phi - 1)*a + (2-phi)*b);
int x2 = (int) Math.round((2-phi)*a + (phi - 1)*b);
// starts below zero so that a score of 0 still replaces it
double bestVal = -1;
// cluster count -> score; shared with both helpers, computeClusteringScore
// returns the stored value when the count is already present
Map previousResults = new Hashtable();
// ---- first probe (x1)
List startClusterList = new ArrayList();
double startVal = 0;
AlgorithmSingleColStore startClusterResult = null;
String errorMessage1 = null;
try {
startClusterResult = runClusteringRoutine(data, x1, startClusterList, previousResults, logger);
startVal = computeClusteringScore(data, startClusterResult, startClusterList, previousResults, x1, logger);
previousResults.put(x1, startVal);
} catch (IllegalArgumentException ex) {
// tolerated for now: keep the message, startVal stays 0 and the result stays null
errorMessage1 = ex.getMessage();
}
// ---- second probe (x2)
List endClusterList = new ArrayList();
double endVal = 0;
AlgorithmSingleColStore endClusterResult = null;
String errorMessage2 = null;
try {
endClusterResult = runClusteringRoutine(data, x2, endClusterList, previousResults, logger);
endVal = computeClusteringScore(data, endClusterResult, endClusterList, previousResults, x2, logger);
previousResults.put(x2, endVal);
} catch (IllegalArgumentException ex) {
// tolerated for now: keep the message, endVal stays 0 and the result stays null
errorMessage2 = ex.getMessage();
}
// nothing to search from if neither probe produced a result
if(startClusterResult == null && endClusterResult == null) {
throw new IllegalArgumentException(errorMessage1 + ".\n" + errorMessage2);
}
// seed the best-so-far with the higher-scoring probe (ties go to the first probe)
// NOTE(review): if the first probe failed (value 0, result null) and the second
// scored <= 0, bestResults is seeded with null here - confirm scores are always > 0
if(startVal >= endVal) {
if(startVal > bestVal) {
bestVal = startVal;
bestResults = startClusterResult;
}
} else {
if(endVal > bestVal) {
bestVal = endVal;
bestResults = endClusterResult;
}
}
while(Math.abs(b - a) > 1) {
if(startVal < endVal) {
// second probe scored higher: move a up to x1, reuse x2 as the new x1, pick a new x2
a = x1;
x1 = x2;
x2 = (int) Math.round((2-phi)*a + (phi-1)*b);
} else {
// first probe scored at least as high: move b down to x2, reuse x1 as the new x2, pick a new x1
b = x2;
x2 = x1;
x1 = (int) Math.round((phi-1)*a + (2-phi)*b);
}
try {
startClusterList.clear();
startClusterResult = runClusteringRoutine(data, x1, startClusterList, previousResults, logger);
startVal = computeClusteringScore(data, startClusterResult, startClusterList, previousResults, x1, logger);
previousResults.put(x1, startVal);
} catch (IllegalArgumentException ex) {
// swallowed on purpose
// NOTE(review): on failure startVal/startClusterResult keep whatever the
// previous statements left in them, so the comparisons below may use a
// value that belongs to an earlier x1
}
try {
endClusterList.clear();
endClusterResult = runClusteringRoutine(data, x2, endClusterList, previousResults, logger);
endVal = computeClusteringScore(data, endClusterResult, endClusterList, previousResults, x2, logger);
previousResults.put(x2, endVal);
} catch (IllegalArgumentException ex) {
// swallowed on purpose; same stale-value caveat as for the first probe
}
// keep the best score seen and, when available, the result that produced it
if(startVal > endVal) {
if(startVal > bestVal) {
bestVal = startVal;
// presumably runClusteringRoutine returns null for a count already in
// previousResults, in which case bestResults was set earlier - verify
if(startClusterResult != null) { // if null, its already stored
bestResults = startClusterResult;
}
}
} else {
if(endVal > bestVal) {
bestVal = endVal;
if(endClusterResult != null) { // if null, its already stored
bestResults = endClusterResult;
}
}
}
}
return bestResults;
}
private double computeClusteringScore(
ITableDataFrame data,
AlgorithmSingleColStore startClusterResult,
List clusters,
Map previousResults,
int numClusters,
Logger logger) {
if(previousResults.containsKey(numClusters)) {
return previousResults.get(numClusters);
}
double innerClusterSimilairty = 0;
Configurator.setLevel(logger.getName(), Level.OFF);
Iterator> it = data.scaledUniqueIterator(instanceColumn, attributeNamesList);
while(it.hasNext()) {
List
© 2015 - 2025 Weber Informatics LLC | Privacy Policy