Please wait. This can take a few minutes ...
Many resources are needed to download a project. Please understand that we have to compensate our server costs. Thank you in advance.
Project price only 1 $
You can buy this project and download/modify it as often as you want.
prerna.algorithm.learning.util.Cluster Maven / Gradle / Ivy
package prerna.algorithm.learning.util;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.Vector;
/**
 * A cluster over instances that mix categorical and numerical attributes.
 *
 * <p>Categorical attributes are delegated to a {@link CategoricalCluster} and
 * numerical attributes to a {@link NumericalCluster}. An "instance" is given as a
 * list of rows ({@code Object[]}); every row uses the same column layout, described
 * by the parallel {@code names} / {@code isNumeric} arrays.
 */
public class Cluster {

    /**
     * Marker for a missing value. A {@code null} entry is treated exactly like this
     * string, and missing categorical values are recorded in the cluster under it.
     */
    private static final String EMPTY = "_____";

    private final CategoricalCluster categoricalCluster;
    private final NumericalCluster numericalCluster;

    /** Number of instances added through the un-weighted add/remove methods. */
    private int numInstances;

    /**
     * Creates a cluster in which every attribute has the same weight (1.0).
     *
     * @param attributeNames the name of each attribute (column)
     * @param isNumeric      {@code isNumeric[i]} is {@code true} when attribute {@code i} is numerical
     */
    public Cluster(String[] attributeNames, boolean[] isNumeric) {
        Map<String, Double> numericalWeights = new HashMap<>();
        Map<String, Double> categoricalWeights = new HashMap<>();
        // making all columns equally weighted
        for (int i = 0; i < isNumeric.length; i++) {
            if (isNumeric[i]) {
                numericalWeights.put(attributeNames[i], 1.0);
            } else {
                categoricalWeights.put(attributeNames[i], 1.0);
            }
        }
        this.categoricalCluster = new CategoricalCluster(categoricalWeights);
        this.numericalCluster = new NumericalCluster(numericalWeights);
    }

    /**
     * Creates a cluster that uses custom weighting on the columns.
     *
     * @param categoricalWeights weight per categorical attribute name
     * @param numericalWeights   weight per numerical attribute name
     */
    public Cluster(Map<String, Double> categoricalWeights, Map<String, Double> numericalWeights) {
        this.categoricalCluster = new CategoricalCluster(categoricalWeights);
        this.numericalCluster = new NumericalCluster(numericalWeights);
    }

    /**
     * Adds an instance to the cluster with the given weighting factor.
     *
     * <p>The instance count is intentionally not changed by weighted (partial) additions.
     * Each categorical value of each row is added with {@code factor}; each numerical
     * attribute is added once, as the mean over all rows, with {@code factor}.
     *
     * @param valuesList the rows that make up the instance; must contain at least one row
     * @param names      the attribute name for each column
     * @param isNumeric  whether each column is numerical
     * @param factor     the weight passed to the underlying clusters
     * @throws ClassCastException if a non-missing value in a numerical column is not a {@link Number}
     */
    public void addToCluster(List<Object[]> valuesList, String[] names, boolean[] isNumeric, double factor) {
        // Computed first so that a bad numeric value fails before anything is mutated.
        Double[] means = averageNumericValues(valuesList, isNumeric);
        for (Object[] values : valuesList) {
            for (int i = 0; i < values.length; i++) {
                if (!isNumeric[i]) {
                    categoricalCluster.addToCluster(names[i], toCategoricalValue(values[i]), factor);
                }
            }
        }
        for (int i = 0; i < isNumeric.length; i++) {
            if (isNumeric[i]) {
                numericalCluster.addToCluster(names[i], means[i], factor);
            }
        }
    }

    /**
     * Adds an instance to the cluster and increments the instance count.
     *
     * <p>Each categorical value of each row is added with weight 1.0; each numerical
     * attribute is added once, as the mean over all rows.
     *
     * @param valuesList the rows that make up the instance; must contain at least one row
     * @param names      the attribute name for each column
     * @param isNumeric  whether each column is numerical
     * @throws ClassCastException if a non-missing value in a numerical column is not a {@link Number}
     */
    public void addToCluster(List<Object[]> valuesList, String[] names, boolean[] isNumeric) {
        numInstances++;
        Double[] means = averageNumericValues(valuesList, isNumeric);
        for (Object[] values : valuesList) {
            for (int i = 0; i < values.length; i++) {
                if (!isNumeric[i]) {
                    categoricalCluster.addToCluster(names[i], toCategoricalValue(values[i]), 1.0);
                }
            }
        }
        for (int i = 0; i < isNumeric.length; i++) {
            if (isNumeric[i]) {
                numericalCluster.addToCluster(names[i], means[i]);
            }
        }
    }

    /**
     * Removes an instance from the cluster with the given weighting factor; the mirror
     * image of {@link #addToCluster(List, String[], boolean[], double)}.
     *
     * <p>The instance count is intentionally not changed by weighted (partial) removals.
     *
     * @param valuesList the rows that make up the instance; must contain at least one row
     * @param names      the attribute name for each column
     * @param isNumeric  whether each column is numerical
     * @param factor     the weight passed to the underlying clusters
     * @throws ClassCastException if a non-missing value in a numerical column is not a {@link Number}
     */
    public void removeFromCluster(List<Object[]> valuesList, String[] names, boolean[] isNumeric, double factor) {
        Double[] means = averageNumericValues(valuesList, isNumeric);
        for (Object[] values : valuesList) {
            for (int i = 0; i < values.length; i++) {
                if (!isNumeric[i]) {
                    // Same normalisation as on add, so a missing value removes the EMPTY entry.
                    categoricalCluster.removeFromCluster(names[i], toCategoricalValue(values[i]), factor);
                }
            }
        }
        for (int i = 0; i < isNumeric.length; i++) {
            if (isNumeric[i]) {
                numericalCluster.removeFromCluster(names[i], means[i], factor);
            }
        }
    }

    /**
     * Removes an instance from the cluster and decrements the instance count; the mirror
     * image of {@link #addToCluster(List, String[], boolean[])}.
     *
     * @param valuesList the rows that make up the instance; must contain at least one row
     * @param names      the attribute name for each column
     * @param isNumeric  whether each column is numerical
     * @throws ClassCastException if a non-missing value in a numerical column is not a {@link Number}
     */
    public void removeFromCluster(List<Object[]> valuesList, String[] names, boolean[] isNumeric) {
        numInstances--;
        Double[] means = averageNumericValues(valuesList, isNumeric);
        for (Object[] values : valuesList) {
            for (int i = 0; i < values.length; i++) {
                if (!isNumeric[i]) {
                    // Same normalisation as on add, so a missing value removes the EMPTY entry.
                    categoricalCluster.removeFromCluster(names[i], toCategoricalValue(values[i]), 1.0);
                }
            }
        }
        for (int i = 0; i < isNumeric.length; i++) {
            if (isNumeric[i]) {
                numericalCluster.removeFromCluster(names[i], means[i]);
            }
        }
    }

    /** Returns {@code true} when the value is {@code null} or the {@link #EMPTY} marker. */
    private static boolean isMissing(Object value) {
        return value == null || value.equals(EMPTY);
    }

    /** Returns the string stored in the categorical cluster for a raw value ({@link #EMPTY} if missing). */
    private static String toCategoricalValue(Object value) {
        return isMissing(value) ? EMPTY : String.valueOf(value);
    }

    /**
     * Collapses the rows of one instance into a single value per numerical column.
     *
     * <p>The result has one slot per column of the first row. A slot is {@code null} for
     * categorical columns and for numerical columns in which every row is missing.
     * Otherwise it is the sum of the present values divided by the total number of rows,
     * so rows with a missing value still count in the denominator.
     */
    private static Double[] averageNumericValues(List<Object[]> valuesList, boolean[] isNumeric) {
        //TODO: determine if should be using Mean/Max/Min/Sum/Median
        //implemented to always assume mean at the moment
        Double[] result = new Double[valuesList.get(0).length];
        for (Object[] values : valuesList) {
            for (int i = 0; i < values.length; i++) {
                if (isNumeric[i] && !isMissing(values[i])) {
                    double runningSum = (result[i] == null) ? 0.0 : result[i];
                    result[i] = runningSum + ((Number) values[i]).doubleValue();
                }
            }
        }
        int rowCount = valuesList.size();
        for (int i = 0; i < result.length; i++) {
            if (result[i] != null) {
                result[i] = result[i] / rowCount;
            }
        }
        return result;
    }

    /**
     * Sets the distance mode of each listed attribute from its distance measure.
     *
     * <p>Only {@code MEAN} is mapped to an implementation at the moment; for any other
     * measure the attribute's distance mode is set to {@code null}. A {@code null} map
     * is ignored.
     *
     * @param distanceMeasures distance measure per attribute name, may be {@code null}
     */
    public void setDistanceMode(Map<String, IClusterDistanceMode.DistanceMeasure> distanceMeasures) {
        if (distanceMeasures == null) {
            return;
        }
        for (Map.Entry<String, IClusterDistanceMode.DistanceMeasure> entry : distanceMeasures.entrySet()) {
            IClusterDistanceMode distance = null;
            if (entry.getValue() == IClusterDistanceMode.DistanceMeasure.MEAN) {
                distance = new MeanDistance();
            }
            //TODO: add other classes (MAX, MEDIAN) once they are done being implemented
            setDistanceMode(entry.getKey(), distance);
        }
    }

    /**
     * Sets the distance mode used by the numerical cluster for one attribute.
     *
     * @param attributeName   the attribute to configure
     * @param distanceMeasure the distance mode to use
     */
    public void setDistanceMode(String attributeName, IClusterDistanceMode distanceMeasure) {
        numericalCluster.setDistanceMode(attributeName, distanceMeasure);
    }

    /**
     * Returns the similarity score of a multi-row instance with this cluster, computed
     * as the average of the per-row similarity scores.
     *
     * @param instanceValues the rows of the instance
     * @param attributeNames the name associated with each value of a row, essentially the 'column name'
     * @param isNumeric      whether value {@code i} of a row is numeric
     * @param indexToSkip    index of the column to leave out of the comparison
     * @return the mean of the per-row similarity scores
     */
    public double getSimilarityForInstance(List<Object[]> instanceValues, String[] attributeNames, boolean[] isNumeric, int indexToSkip) {
        double similaritySum = 0;
        for (Object[] values : instanceValues) {
            similaritySum += getSimilarityForInstance(values, attributeNames, isNumeric, indexToSkip);
        }
        return similaritySum / instanceValues.size();
    }

    /**
     * Returns the similarity score of a single row with this cluster.
     *
     * <p>The categorical and numerical similarities are combined, each weighted by the
     * share of attributes of that kind. The column at {@code instanceIndex} is passed to
     * the matching underlying cluster as the index to skip and is left out of the
     * attribute counts.
     *
     * @param instanceValues the values of the row for each column
     * @param attributeNames the name associated with each value, essentially the 'column name'
     * @param isNumeric      whether value {@code i} is numeric
     * @param instanceIndex  index of the column to leave out of the comparison
     * @return the weighted sum of the categorical and numerical similarity
     */
    public double getSimilarityForInstance(Object[] instanceValues, String[] attributeNames, boolean[] isNumeric, int instanceIndex) {
        List<String> categoricalValues = new ArrayList<>();
        List<String> categoricalValueNames = new ArrayList<>();
        List<Double> numericalValues = new ArrayList<>();
        List<String> numericalValueNames = new ArrayList<>();

        // Position of the skipped column inside whichever of the two split lists contains it.
        int indexToSkip = -1;
        for (int i = 0; i < isNumeric.length; i++) {
            if (isNumeric[i]) {
                // NOTE(review): a null or EMPTY numeric value throws here even though
                // addToCluster accepts them - confirm whether NumericalCluster.getSimilarity
                // tolerates nulls before relaxing this.
                numericalValues.add(((Number) instanceValues[i]).doubleValue());
                numericalValueNames.add(attributeNames[i] + "");
                if (i == instanceIndex) {
                    indexToSkip = numericalValueNames.size() - 1;
                }
            } else {
                categoricalValues.add(instanceValues[i] + "");
                categoricalValueNames.add(attributeNames[i] + "");
                if (i == instanceIndex) {
                    indexToSkip = categoricalValueNames.size() - 1;
                }
            }
        }

        // since the attribute names includes the instance
        // we need to appropriately calculate the similarity
        // based on the number of categorical vs. numerical values
        // NOTE(review): with a single attribute this is 0 and the weights below become non-finite.
        int totalAttributes = attributeNames.length - 1;

        double categoricalSimVal = 0;
        if (!categoricalValues.isEmpty()) {
            boolean skippedIsNumeric = isNumeric[instanceIndex];
            int counted = skippedIsNumeric ? categoricalValues.size() : categoricalValues.size() - 1;
            int skip = skippedIsNumeric ? -1 : indexToSkip;
            categoricalSimVal = ((double) counted / totalAttributes)
                    * categoricalCluster.getSimilarity(categoricalValueNames, categoricalValues, skip);
        }

        double numericalSimVal = 0;
        if (!numericalValues.isEmpty()) {
            boolean skippedIsNumeric = isNumeric[instanceIndex];
            int counted = skippedIsNumeric ? numericalValues.size() - 1 : numericalValues.size();
            int skip = skippedIsNumeric ? indexToSkip : -1;
            numericalSimVal = ((double) counted / totalAttributes)
                    * numericalCluster.getSimilarity(numericalValueNames, numericalValues, skip);
        }

        return categoricalSimVal + numericalSimVal;
    }

    /**
     * Returns the similarity between this cluster and another one.
     *
     * <p>The numerical and categorical cluster similarities are combined, each weighted
     * by the share of weighted attributes of that kind, not counting {@code instanceType}.
     *
     * @param c2           the cluster to compare with
     * @param instanceType attribute name that is left out of the attribute counts
     * @return the weighted sum of the numerical and categorical cluster similarity
     * @throws IllegalArgumentException if no attribute other than {@code instanceType} has a weight
     */
    public double getClusterSimilarity(Cluster c2, String instanceType) {
        int numCategorical = 0;
        if (this.categoricalCluster.getWeights() != null) {
            numCategorical = countExcluding(this.categoricalCluster.getWeights().keySet(), instanceType);
        }
        int numNumeric = 0;
        if (this.numericalCluster.getWeights() != null) {
            numNumeric = countExcluding(this.numericalCluster.getWeights().keySet(), instanceType);
        }

        int totalAttributes = numCategorical + numNumeric;
        if (totalAttributes == 0) {
            throw new IllegalArgumentException("totalAttributes can not be 0");
        }

        double numericalClusterSim = ((double) numNumeric / totalAttributes)
                * this.numericalCluster.getClusterSimilarity(c2.numericalCluster, instanceType);
        double categoricalClusterSim = ((double) numCategorical / totalAttributes)
                * this.categoricalCluster.getClusterSimilarity(c2.categoricalCluster, instanceType);
        return numericalClusterSim + categoricalClusterSim;
    }

    /** Returns the number of keys, not counting {@code excluded} when it is present. */
    private static int countExcluding(Set<String> keys, String excluded) {
        int count = keys.size();
        if (keys.contains(excluded)) {
            count--;
        }
        return count;
    }

    /**
     * Returns the instance count maintained by the un-weighted add/remove methods.
     *
     * @return the number of instances
     */
    public int getNumInstances() {
        return this.numInstances;
    }

    /**
     * Sets the distance mode for every numerical attribute in the given arrays;
     * entries whose {@code isNumeric} flag is {@code false} are ignored.
     *
     * @param levelNames       attribute names
     * @param distanceMeasures distance mode for the attribute at the same index
     * @param isNumeric        whether the attribute at the same index is numerical
     */
    public void setDistanceMode(String[] levelNames, IClusterDistanceMode[] distanceMeasures, boolean[] isNumeric) {
        for (int i = 0; i < levelNames.length; i++) {
            if (isNumeric[i]) {
                setDistanceMode(levelNames[i], distanceMeasures[i]);
            }
        }
    }

    /**
     * Sets the distance mode for every attribute in the given arrays.
     *
     * @param levelNames       attribute names
     * @param distanceMeasures distance mode for the attribute at the same index
     */
    public void setDistanceMode(String[] levelNames, IClusterDistanceMode[] distanceMeasures) {
        for (int i = 0; i < levelNames.length; i++) {
            setDistanceMode(levelNames[i], distanceMeasures[i]);
        }
    }
}