
weka.core.stats.NominalStats Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of distributedWekaBase Show documentation
Show all versions of distributedWekaBase Show documentation
This package provides generic configuration class and distributed map/reduce style tasks for Weka
/*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
/*
* NominalStats
* Copyright (C) 2014 University of Waikato, Hamilton, New Zealand
*
*/
package weka.core.stats;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import weka.core.Attribute;
import weka.core.Utils;
import weka.distributed.CSVToARFFHeaderMapTask;
/**
* Class for computing nominal statistics (primarily frequency counts)
*
* @author Mark Hall (mhall{[at]}pentaho{[dot]}com)
* @version $Revision: 10929 $
*/
public class NominalStats extends Stats implements Serializable {

  /** A map of values (labels) to counts, kept sorted by label */
  protected Map<String, Count> m_counts = new TreeMap<String, Count>();

  /** The number of missing values for this nominal attribute */
  protected double m_numMissing;

  /** A "label" to use when storing the number of missing values */
  public static final String MISSING_LABEL = "**missing**";

  /** For serialization */
  private static final long serialVersionUID = -6176046647546730423L;

  /**
   * Class that encapsulates a count for nominal value
   *
   * @author Mark Hall (mhall{[at]}pentaho{[dot]}com)
   */
  public static class Count implements Serializable {

    /** For serialization */
    private static final long serialVersionUID = 4310467271632108735L;

    /** The value of the count */
    public double m_count;
  }

  /**
   * Constructs a new NominalStats
   *
   * @param attributeName the name of the data attribute that these stats
   *          pertain to
   */
  public NominalStats(String attributeName) {
    super(attributeName);
  }

  /**
   * Convert a summary meta attribute to a NominalStats. Each value of the
   * attribute is expected to have the form {@code <label>_<count>}, which is
   * the encoding written by {@link #makeAttribute()}. The text after the last
   * underscore is parsed as the count; a label equal to
   * {@link #MISSING_LABEL} is added to the missing count instead of the
   * label map.
   *
   * @param a the attribute to convert
   * @return a NominalStats
   * @throws IllegalArgumentException if the attribute is not nominal, if a
   *           value contains no underscore separator, or if the count part of
   *           a value cannot be parsed as a number
   */
  public static NominalStats attributeToStats(Attribute a)
    throws IllegalArgumentException {
    if (!a.isNominal()) {
      throw new IllegalArgumentException("Stats attribute is not nominal!");
    }

    NominalStats ns = new NominalStats(a.name());
    for (int j = 0; j < a.numValues(); j++) {
      String v = a.value(j);

      // Split on the LAST underscore so that labels which themselves contain
      // underscores are preserved intact.
      int separator = v.lastIndexOf('_');
      if (separator < 0) {
        // Without this check substring() would fail with a
        // StringIndexOutOfBoundsException rather than the documented
        // IllegalArgumentException.
        throw new IllegalArgumentException("Stats attribute value '" + v
          + "' is not of the form <label>_<count>");
      }
      String label = v.substring(0, separator);
      String freqCount = v.substring(separator + 1);

      try {
        double fC = Double.parseDouble(freqCount);
        if (label.equals(NominalStats.MISSING_LABEL)) {
          ns.add(null, fC);
        } else {
          ns.add(label, fC);
        }
      } catch (NumberFormatException n) {
        throw new IllegalArgumentException(n);
      }
    }

    return ns;
  }

  /**
   * Adds to the count for a given label. If the label is null then it adds to
   * the count for missing.
   *
   * @param label the label to add the count to
   * @param value the count to add
   */
  public void add(String label, double value) {
    if (label == null) {
      m_numMissing += value;
    } else {
      Count c = m_counts.get(label);
      if (c == null) {
        // first time this label has been seen
        c = new Count();
        m_counts.put(label, c);
      }
      c.m_count += value;
    }
  }

  /**
   * Get the set of labels seen by this NominalStats. The returned set is the
   * key set of the underlying count map (not a copy).
   *
   * @return the set of labels
   */
  public Set<String> getLabels() {
    return m_counts.keySet();
  }

  /**
   * Get the count for a given label
   *
   * @param label the label to get the count for
   * @return the count or missing value if the label is unknown
   */
  public double getCount(String label) {
    Count c = m_counts.get(label);
    if (c == null) {
      return Utils.missingValue();
    }
    return c.m_count;
  }

  /**
   * Get the number of missing values for this attribute
   *
   * @return the number of missing values seen
   */
  public double getNumMissing() {
    return m_numMissing;
  }

  /**
   * Get the index of the mode. When several labels share the largest count
   * the first one in iteration order is reported.
   *
   * @return the index (in the sorted list of labels) of the mode, or -1 if no
   *         label has been added
   */
  public int getMode() {
    double max = -1;
    int maxIndex = -1;
    int index = 0;
    for (Map.Entry<String, Count> e : m_counts.entrySet()) {
      if (e.getValue().m_count > max) {
        max = e.getValue().m_count;
        maxIndex = index;
      }
      index++;
    }

    return maxIndex;
  }

  /**
   * Get the most frequent label (not including missing values)
   *
   * @return the most frequent label, or the empty string if no label has been
   *         added
   */
  public String getModeLabel() {
    double max = -1;
    String maxLabel = "";
    for (Map.Entry<String, Count> e : m_counts.entrySet()) {
      if (e.getValue().m_count > max) {
        max = e.getValue().m_count;
        maxLabel = e.getKey();
      }
    }

    return maxLabel;
  }

  /**
   * Set the number of missing values for this attribute
   *
   * @param missing the number of missing values
   */
  public void setNumMissing(double missing) {
    m_numMissing = missing;
  }

  /**
   * Encode these stats as a nominal summary attribute. Every label becomes a
   * value of the form {@code <label>_<count>}, followed by a final value
   * holding the missing count under {@link #MISSING_LABEL}. The attribute
   * name is the summary attribute prefix followed by the name of the
   * attribute these stats pertain to.
   *
   * @return the summary attribute
   */
  @Override
  public Attribute makeAttribute() {
    ArrayList<String> vals = new ArrayList<String>();

    for (Map.Entry<String, Count> e : m_counts.entrySet()) {
      vals.add(e.getKey() + "_" + e.getValue().m_count);
    }
    // the missing count is always written, and always last
    vals.add(MISSING_LABEL + "_" + m_numMissing);

    Attribute a =
      new Attribute(CSVToARFFHeaderMapTask.ARFF_SUMMARY_ATTRIBUTE_PREFIX
        + m_attributeName, vals);
    return a;
  }
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy