![JAR search and dependency download from the Maven repository](/logo.png)
mulan.data.MultiLabelInstances Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of mulan Show documentation
Show all versions of mulan Show documentation
Mulan is an open-source Java library for learning from multi-label datasets.
The newest version!
/*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
/*
* MultiLabelInstances.java
* Copyright (C) 2009-2012 Aristotle University of Thessaloniki, Greece
*/
package mulan.data;
import java.io.*;
import java.util.*;
import mulan.core.ArgumentNullException;
import mulan.core.MulanRuntimeException;
import weka.core.Attribute;
import weka.core.Instance;
import weka.core.Instances;
/**
* Implements a multi-label instances data set. Multi-label data are stored in Weka's
* {@link Instances}. The class is a convenient wrapper. The data are loaded from a
* data file and checked for valid format. If a hierarchy of labels is specified via an
* XML meta-data file, the data file is cross-checked with the XML for consistency.
*
* Applied rules:
* - label names must be unique
* - all labels in the XML meta-data must also be defined in the ARFF data set
* - each label attribute must be nominal with binary values
* - if labels have a hierarchy and a child label indicates true for some
*   data instance, then all its parent labels must also indicate true
*   for that instance
*
* @author Jozef Vilcek
*/
public class MultiLabelInstances implements Serializable {
// Underlying Weka data set holding both the feature and the label attributes.
private Instances dataSet;
// Meta-data describing which attributes of the data set are labels; assigned once per constructor.
private final LabelsMetaData labelsMetaData;
/**
 * Builds a multi-label data set from an ARFF file whose trailing attributes are the labels.
 * The last {@code numLabelAttributes} attributes of the loaded data are treated as labels and
 * each of them is registered as a root node of the created {@link LabelsMetaData}. The loaded
 * data and the created meta-data are validated against each other before being stored.
 *
 * @param arffFilePath the path to the ARFF file containing the data
 * @param numLabelAttributes the number of trailing ARFF attributes which are labels
 * @throws ArgumentNullException if arffFilePath is null
 * @throws IllegalArgumentException if numLabelAttributes is less than 2 or the ARFF file does not exist
 * @throws InvalidDataFormatException if the format of the loaded multi-label data is invalid
 * @throws DataLoadException if the ARFF data file can not be loaded
 */
public MultiLabelInstances(String arffFilePath, int numLabelAttributes) throws InvalidDataFormatException {
    if (arffFilePath == null) {
        throw new ArgumentNullException("arffFilePath");
    }
    if (numLabelAttributes < 2) {
        throw new IllegalArgumentException("The number of label attributes must me at least 2 or higher.");
    }

    Instances loadedData = loadInstances(new File(arffFilePath));
    LabelsMetaData derivedMetaData = loadLabesMeta(loadedData, numLabelAttributes);
    validate(loadedData, derivedMetaData);

    // Fields are only assigned once loading and validation have succeeded.
    this.dataSet = loadedData;
    this.labelsMetaData = derivedMetaData;
}
/**
 * Builds a multi-label data set from an {@link InputStream} delivering data in ARFF format.
 * The last {@code numLabelAttributes} attributes of the loaded data are treated as labels and
 * each of them is registered as a root node of the created {@link LabelsMetaData}. The loaded
 * data and the created meta-data are validated against each other before being stored.
 *
 * @param arffDataStream the {@link InputStream} data source to load data in ARFF format
 * @param numLabelAttributes the number of trailing ARFF attributes which are labels
 * @throws ArgumentNullException if the {@link InputStream} data source is null
 * @throws IllegalArgumentException if the number of label attributes is less than 2
 * @throws InvalidDataFormatException if the format of the loaded multi-label data is invalid
 * @throws DataLoadException if the ARFF data can not be loaded
 */
public MultiLabelInstances(InputStream arffDataStream, int numLabelAttributes) throws InvalidDataFormatException {
    if (arffDataStream == null) {
        throw new ArgumentNullException("arffDataStream");
    }
    if (numLabelAttributes < 2) {
        throw new IllegalArgumentException("The number of label attributes must me at least 2 or higher.");
    }

    Instances streamedData = loadInstances(arffDataStream);
    LabelsMetaData derivedMetaData = loadLabesMeta(streamedData, numLabelAttributes);
    validate(streamedData, derivedMetaData);

    // Fields are only assigned once loading and validation have succeeded.
    this.dataSet = streamedData;
    this.labelsMetaData = derivedMetaData;
}
/**
 * Creates a new instance of {@link MultiLabelInstances} data from an already loaded
 * {@link Instances} object and an XML file with labels meta-data. A failure to load the
 * meta-data is indicated by {@link DataLoadException}. Once the meta-data are loaded,
 * validations are applied to ensure consistency between the data and the labels meta-data.
 * The supplied {@link Instances} object is stored by reference, not copied.
 *
 * @param data the Instances object containing the data
 * @param xmlLabelsDefFilePath the path to the XML file containing labels meta-data
 * @throws ArgumentNullException if data or xmlLabelsDefFilePath is null
 * @throws IllegalArgumentException presumably if the XML file does not exist (raised by the
 *         labels loader, which is not visible here -- confirm against LabelsBuilder)
 * @throws InvalidDataFormatException if the format of the multi-label data is invalid
 * @throws DataLoadException if the XML labels meta-data can not be loaded
 */
public MultiLabelInstances(Instances data, String xmlLabelsDefFilePath) throws InvalidDataFormatException {
    // Fail fast with the same exception type the sibling constructors use,
    // instead of a NullPointerException raised from inside validate().
    if (data == null) {
        throw new ArgumentNullException("data");
    }
    if (xmlLabelsDefFilePath == null) {
        throw new ArgumentNullException("xmlLabelsDefFilePath");
    }
    LabelsMetaData labelsData = loadLabesMeta(xmlLabelsDefFilePath);
    validate(data, labelsData);
    dataSet = data;
    labelsMetaData = labelsData;
}
/**
 * Builds a multi-label data set from an ARFF data file and a separate XML file with
 * labels meta-data. The two sources are loaded independently (ARFF first, then XML) and a
 * load failure is reported through {@link DataLoadException}. After loading, the data and
 * the meta-data are validated against each other.
 *
 * @param arffFilePath the path to the ARFF file containing the data
 * @param xmlLabelsDefFilePath the path to the XML file containing labels meta-data
 * @throws ArgumentNullException if any of the input parameters is null
 * @throws IllegalArgumentException if the ARFF file does not exist
 * @throws InvalidDataFormatException if the format of the loaded multi-label data is invalid
 * @throws DataLoadException if the ARFF data or the XML meta-data can not be loaded
 */
public MultiLabelInstances(String arffFilePath, String xmlLabelsDefFilePath) throws InvalidDataFormatException {
    if (arffFilePath == null) {
        throw new ArgumentNullException("arffFilePath");
    }
    if (xmlLabelsDefFilePath == null) {
        throw new ArgumentNullException("xmlLabelsDefFilePath");
    }

    Instances loadedData = loadInstances(new File(arffFilePath));
    LabelsMetaData loadedMetaData = loadLabesMeta(xmlLabelsDefFilePath);
    validate(loadedData, loadedMetaData);

    this.dataSet = loadedData;
    this.labelsMetaData = loadedMetaData;
}
/**
 * Builds a multi-label data set from two {@link InputStream} sources: one delivering the
 * data in ARFF format and one delivering the labels meta-data in XML format. The ARFF
 * stream is read first, then the XML stream. A load failure is reported through
 * {@link DataLoadException}. After loading, the data and the meta-data are validated
 * against each other.
 *
 * @param arffDataStream the {@link InputStream} data source to load data in ARFF format
 * @param xmlLabelsDefStream the {@link InputStream} data source to load XML labels meta-data
 * @throws ArgumentNullException if any of the input parameters is null
 * @throws InvalidDataFormatException if the format of the loaded multi-label data is invalid
 * @throws DataLoadException if the ARFF data or the XML meta-data can not be loaded
 */
public MultiLabelInstances(InputStream arffDataStream, InputStream xmlLabelsDefStream) throws InvalidDataFormatException {
    if (arffDataStream == null) {
        throw new ArgumentNullException("arffDataStream");
    }
    if (xmlLabelsDefStream == null) {
        throw new ArgumentNullException("xmlLabelsDefStream");
    }

    Instances streamedData = loadInstances(arffDataStream);
    LabelsMetaData streamedMetaData = loadLabesMeta(xmlLabelsDefStream);
    validate(streamedData, streamedMetaData);

    this.dataSet = streamedData;
    this.labelsMetaData = streamedMetaData;
}
/**
 * Wraps an existing {@link Instances} data set and existing {@link LabelsMetaData} into a
 * {@link MultiLabelInstances}. Neither argument is copied; only the references are kept.
 * Both are validated against each other first, and any violation of the validation
 * criteria results in an {@link InvalidDataFormatException}.
 *
 * @param dataSet the data set with data instances in multi-label format
 * @param labelsMetaData the meta-data about label attributes of the data set
 * @throws ArgumentNullException if any of the input parameters is null
 * @throws InvalidDataFormatException if the multi-label data format is not valid
 */
public MultiLabelInstances(Instances dataSet, LabelsMetaData labelsMetaData) throws InvalidDataFormatException {
    if (dataSet == null) {
        throw new ArgumentNullException("dataSet");
    }
    if (labelsMetaData == null) {
        throw new ArgumentNullException("labelsMetaData");
    }

    // Validate before touching any field so a rejected pair never gets stored.
    validate(dataSet, labelsMetaData);

    this.labelsMetaData = labelsMetaData;
    this.dataSet = dataSet;
}
/**
 * Reports how many labels (label attributes) the labels meta-data define.
 *
 * @return the number of labels
 */
public int getNumLabels() {
    final int labelCount = this.labelsMetaData.getNumLabels();
    return labelCount;
}
/**
 * Reports how many instances the underlying data set holds.
 *
 * @return the number of instances
 */
public int getNumInstances() {
    final int instanceCount = this.dataSet.numInstances();
    return instanceCount;
}
/**
 * Computes the label cardinality of the data set: the number of label values equal to
 * "1" summed over all instances, divided by the number of instances.
 *
 * @return the average number of labels set to "1" per instance
 */
public double getCardinality() {
    final int instanceCount = dataSet.numInstances();
    final int[] labelIndices = getLabelIndices();

    double activeLabels = 0;
    for (int row = 0; row < instanceCount; row++) {
        Instance current = dataSet.instance(row);
        for (int labelIndex : labelIndices) {
            if (current.stringValue(labelIndex).equals("1")) {
                activeLabels++;
            }
        }
    }
    return activeLabels / instanceCount;
}
/**
 * Collects the positions of the label attributes inside the underlying
 * {@link Instances} object. An attribute counts as a label when its name is one of
 * the label names held by the labels meta-data. Indices are reported in ascending
 * attribute order.
 *
 * @return an array with the indices of the label attributes inside the
 *         Instances object
 */
public int[] getLabelIndices() {
    int[] labelIndices = new int[labelsMetaData.getNumLabels()];
    Set<String> labelNames = labelsMetaData.getLabelNames();
    int numAttributes = dataSet.numAttributes();
    int counter = 0;
    for (int index = 0; index < numAttributes; index++) {
        if (labelNames.contains(dataSet.attribute(index).name())) {
            labelIndices[counter++] = index;
        }
    }
    return labelIndices;
}
/**
 * Builds a mapping from each label name to its ordinal position among the label
 * attributes, following the attribute order of the underlying {@link Instances}
 * object. The first label attribute encountered maps to 0, the second to 1, and so on.
 * Note that the value is the position among labels only, not the attribute index in
 * the data set.
 *
 * @return a mapping of label names to their order among the label attributes
 */
public Map<String, Integer> getLabelsOrder() {
    Set<String> labelNames = labelsMetaData.getLabelNames();
    Map<String, Integer> assoc = new HashMap<String, Integer>();
    int numAttributes = dataSet.numAttributes();
    int counter = 0;
    for (int index = 0; index < numAttributes; index++) {
        String attrName = dataSet.attribute(index).name();
        if (labelNames.contains(attrName)) {
            assoc.put(attrName, counter);
            counter++;
        }
    }
    return assoc;
}
/**
 * Gets the {@link Set} of label {@link Attribute} instances of
 * this {@link MultiLabelInstances} instance. An attribute is a label when its name
 * is one of the label names held by the labels meta-data. A new set is created on
 * every call.
 *
 * @return the Set of label Attribute instances
 */
public Set<Attribute> getLabelAttributes() {
    Set<String> labelNames = labelsMetaData.getLabelNames();
    Set<Attribute> labelAttributes = new HashSet<Attribute>(getNumLabels());
    int numAttributes = dataSet.numAttributes();
    for (int index = 0; index < numAttributes; index++) {
        Attribute attr = dataSet.attribute(index);
        if (labelNames.contains(attr.name())) {
            labelAttributes.add(attr);
        }
    }
    return labelAttributes;
}
/**
 * Gets the array with indices of feature attributes stored in the
 * underlying {@link Instances} data set. The feature attributes are those returned by
 * {@link #getFeatureAttributes()}; their indices are reported in ascending attribute order.
 *
 * @return an array with the indices of the feature attributes
 */
public int[] getFeatureIndices() {
    Set<Attribute> featureAttributes = getFeatureAttributes();
    int[] featureIndices = new int[featureAttributes.size()];
    int numAttributes = dataSet.numAttributes();
    int counter = 0;
    for (int index = 0; index < numAttributes; index++) {
        Attribute attr = dataSet.attribute(index);
        if (featureAttributes.contains(attr)) {
            featureIndices[counter++] = attr.index();
        }
    }
    return featureIndices;
}
/**
 * Gets the {@link Set} of feature {@link Attribute} instances of
 * this {@link MultiLabelInstances} instance. Every attribute whose name is not one of
 * the label names held by the labels meta-data is treated as a feature. A new set is
 * created on every call.
 *
 * @return the {@link Set} of feature {@link Attribute} instances
 */
public Set<Attribute> getFeatureAttributes() {
    Set<String> labelNames = labelsMetaData.getLabelNames();
    int numAttributes = dataSet.numAttributes();
    // Presize for the expected number of features (all attributes minus the labels);
    // clamp at zero because a negative capacity is rejected by HashSet.
    int expectedFeatures = Math.max(0, numAttributes - getNumLabels());
    Set<Attribute> featureAttributes = new HashSet<Attribute>(expectedFeatures);
    for (int index = 0; index < numAttributes; index++) {
        Attribute attr = dataSet.attribute(index);
        if (!labelNames.contains(attr.name())) {
            featureAttributes.add(attr);
        }
    }
    return featureAttributes;
}
/**
 * Exposes the {@link LabelsMetaData} held by this instance, i.e. the descriptive
 * meta-data about the label attributes of the underlying {@link Instances} data set.
 * The stored reference itself is returned, not a copy.
 *
 * @return descriptive meta-data about label attributes
 */
public LabelsMetaData getLabelsMetaData() {
    return this.labelsMetaData;
}
/**
 * Exposes the underlying {@link Instances} object which contains all data.
 * The stored reference itself is returned, not a copy.
 *
 * @return underlying Instances object which contains all data
 */
public Instances getDataSet() {
    return this.dataSet;
}
/**
 * If the {@link Instances} data set is retrieved from {@link MultiLabelInstances} and
 * post-processed or modified by custom code, it can be reintegrated into a
 * {@link MultiLabelInstances} again. A clone of the current {@link LabelsMetaData} is
 * adjusted to reflect the changes in the data set: every label whose attribute is no
 * longer present in the modified data set is removed from the cloned meta-data. This
 * instance itself is left untouched; a new {@link MultiLabelInstances} is created
 * from the modified data set and the adjusted meta-data.
 *
 * The supported changes are:
 * - removal of label {@link Attribute} from the existing {@link Instances}
 * - add/remove of {@link Instance} from the existing {@link Instances}
 * - add/remove of feature/predictor {@link Attribute} to the existing {@link Instances}
 *
 * @param modifiedDataSet the modified data set
 * @return a new {@link MultiLabelInstances} wrapping the modified data set
 * @throws IllegalArgumentException if the specified modified data set is null
 * @throws InvalidDataFormatException if the multi-label data format with the specified modifications is not valid
 */
public MultiLabelInstances reintegrateModifiedDataSet(Instances modifiedDataSet) throws InvalidDataFormatException {
    if (modifiedDataSet == null) {
        throw new IllegalArgumentException("The modified data set is null.");
    }
    //TODO: add support for addition of label attributes to modified data set if necessary
    LabelsMetaDataImpl newMetaData = (LabelsMetaDataImpl) labelsMetaData.clone();
    // Iterate the names of the original meta-data while removing from the clone.
    Set<String> origLabelNames = labelsMetaData.getLabelNames();
    for (String labelName : origLabelNames) {
        if (modifiedDataSet.attribute(labelName) == null) {
            newMetaData.removeLabelNode(labelName);
        }
    }
    return new MultiLabelInstances(modifiedDataSet, newMetaData);
}
/**
 * Returns a copy of the {@link MultiLabelInstances} instance. The copy is built from a
 * clone of the labels meta-data and from a new {@link Instances} object created from the
 * current data set via the {@link Instances} copy constructor.
 *
 * @return the copy of this instance
 * @throws MulanRuntimeException if the copied data fail validation
 */
@Override
public MultiLabelInstances clone() {
    LabelsMetaData metaDataCopy = labelsMetaData.clone();
    Instances dataSetCopy = new Instances(dataSet);
    try {
        return new MultiLabelInstances(dataSetCopy, metaDataCopy);
    } catch (InvalidDataFormatException ex) {
        // '%s' (not a bare '%') so that formatting the message itself can not fail
        // and mask the original validation error.
        throw new MulanRuntimeException(
                String.format("The cloning of '%s' class instance failed", getClass()), ex);
    }
}
/**
 * Loads the data set from the specified ARFF file. The file is opened, handed to
 * {@link #loadInstances(InputStream)} and closed again once loading has finished,
 * whether it succeeded or failed.
 *
 * @param arffFile the ARFF file to load
 * @return the loaded data set
 * @throws IllegalArgumentException if the file does not exist
 * @throws DataLoadException if the file can not be opened or the data can not be read
 */
private Instances loadInstances(File arffFile) {
    if (!arffFile.exists()) {
        throw new IllegalArgumentException(
                String.format("The arff data file does not exists under specified path '%s'.",
                        arffFile.getAbsolutePath()));
    }
    FileInputStream fileStream;
    try {
        fileStream = new FileInputStream(arffFile);
    } catch (FileNotFoundException exception) {
        throw new DataLoadException(
                String.format("The specified data file '%s' can not be found.", arffFile.getAbsolutePath()), exception);
    }
    try {
        return loadInstances(fileStream);
    } finally {
        try {
            fileStream.close();
        } catch (IOException ignored) {
            // Nothing useful can be done here: either the data are already loaded,
            // or a more relevant load exception is propagating.
        }
    }
}
/**
 * Loads the data set from the specified stream, which is expected to deliver data in
 * ARFF format. The stream is wrapped into an {@link InputStreamReader} using the
 * platform default charset.
 *
 * @param stream the stream to read the ARFF data from
 * @return the loaded data set
 * @throws DataLoadException if reading the data fails; the original {@link IOException}
 *         is attached as the cause
 */
private Instances loadInstances(InputStream stream) {
    InputStreamReader streamReader = new InputStreamReader(stream);
    try {
        return new Instances(streamReader);
    } catch (IOException exception) {
        // Plain concatenation instead of String.format: the exception text may contain
        // '%' characters, and the exception must be passed as the cause, not as a
        // format argument.
        throw new DataLoadException(
                "Error creating Instances data from supplied Reader data source: "
                        + exception.getMessage(), exception);
    }
}
/**
 * Loads labels meta-data from the XML file at the given path by delegating to
 * {@code LabelsBuilder.createLabels}.
 *
 * @param xmlLabelsDefFilePath the path to the XML file with labels meta-data
 * @return the loaded labels meta-data
 * @throws DataLoadException if the labels builder reports a failure
 */
private LabelsMetaData loadLabesMeta(String xmlLabelsDefFilePath) {
    try {
        return LabelsBuilder.createLabels(xmlLabelsDefFilePath);
    } catch (LabelsBuilderException cause) {
        String message = String.format(
                "Error loading labels meta-data from xml file '%s'.", xmlLabelsDefFilePath);
        throw new DataLoadException(message, cause);
    }
}
/**
 * Loads labels meta-data from the given XML stream by delegating to
 * {@code LabelsBuilder.createLabels}.
 *
 * @param xmlLabelsDefStream the stream delivering the XML labels meta-data
 * @return the loaded labels meta-data
 * @throws DataLoadException if the labels builder reports a failure
 */
private LabelsMetaData loadLabesMeta(InputStream xmlLabelsDefStream) {
    try {
        return LabelsBuilder.createLabels(xmlLabelsDefStream);
    } catch (LabelsBuilderException cause) {
        throw new DataLoadException("Error loading labels meta-data from input stream.", cause);
    }
}
/**
 * Creates labels meta-data from the last {@code numLabels} attributes of the data set.
 * Each of those attributes is registered, by name, as a root node of the created
 * meta-data.
 *
 * @param data the data set whose trailing attributes are the labels
 * @param numLabels the number of trailing attributes which are labels
 * @return the created labels meta-data
 * @throws InvalidDataFormatException if the data set has fewer attributes than
 *         {@code numLabels}, or if the created meta-data end up with fewer labels than
 *         requested (label attribute names are not unique)
 */
private LabelsMetaData loadLabesMeta(Instances data, int numLabels) throws InvalidDataFormatException {
    int numAttributes = data.numAttributes();
    // Without this guard the loop below would start at a negative attribute index.
    if (numLabels > numAttributes) {
        throw new InvalidDataFormatException(String.format(
                "The number of label attributes (%d) exceeds the number of attributes in the data set (%d).",
                numLabels, numAttributes));
    }
    LabelsMetaDataImpl labelsData = new LabelsMetaDataImpl();
    for (int index = numAttributes - numLabels; index < numAttributes; index++) {
        String attrName = data.attribute(index).name();
        labelsData.addRootNode(new LabelNodeImpl(attrName));
    }
    if (labelsData.getNumLabels() < numLabels) {
        throw new InvalidDataFormatException("The names of label attributes are not unique.");
    }
    return labelsData;
}
/**
 * Does validation and integrity checks between data set and meta-data. The appropriate
 * exception is thrown if any of the validation rules is breached:
 * - the meta-data must define at least 2 labels
 * - every attribute of the data set whose name is a label name must pass
 *   {@link #checkLabelAttributeFormat(Attribute)}
 * - the number of such attributes must equal the number of label names
 * - if the meta-data report a hierarchy, the labels of every instance must be
 *   consistent with it
 *
 * @param dataSet the data set to validate
 * @param labelsMetaData the labels meta-data to validate the data set against
 * @throws InvalidDataFormatException if any validation rule is breached
 */
private void validate(Instances dataSet, LabelsMetaData labelsMetaData) throws InvalidDataFormatException {
    Set<String> labelNames = labelsMetaData.getLabelNames();
    if (labelNames.size() < 2) {
        throw new InvalidDataFormatException(
                String.format("There must be at least 2 label attributes specified, but only '%s' are defined in metadata",
                        labelNames.size()));
    }

    int numAttributes = dataSet.numAttributes();
    int numMatches = 0;
    for (int index = 0; index < numAttributes; index++) {
        Attribute attribute = dataSet.attribute(index);
        if (!labelNames.contains(attribute.name())) {
            continue; // a feature attribute, nothing to check
        }
        numMatches++;
        if (!checkLabelAttributeFormat(attribute)) {
            throw new InvalidDataFormatException(
                    String.format("The format of label attribute '%s' is not valid.", attribute.name()));
        }
    }

    if (numMatches != labelNames.size()) {
        throw new InvalidDataFormatException(
                "Not all labels defined in meta-data are present in ARFF data file.");
    }

    if (labelsMetaData.isHierarchy()) {
        checkLabelsConsistency(dataSet, labelsMetaData.getRootLabels());
    }
}
/**
 * Checks whether a label attribute is nominal with exactly the two values
 * "0" and "1" (in any order).
 *
 * @param attribute the label attribute to check
 * @return true if the attribute is nominal and its values are exactly "0" and "1"
 */
private boolean checkLabelAttributeFormat(Attribute attribute) {
    if (!attribute.isNominal()) {
        return false;
    }
    Set<String> missingValues = new HashSet<String>(Arrays.asList("0", "1"));
    int numValues = attribute.numValues();
    if (numValues != missingValues.size()) {
        return false;
    }
    // Tick off each allowed value the attribute declares; with exactly two declared
    // values the set is empty afterwards only if they are "0" and "1".
    for (int index = 0; index < numValues; index++) {
        missingValues.remove(attribute.value(index));
    }
    return missingValues.isEmpty();
}
/**
 * Checks the consistency of labels if there is a hierarchy between them.
 * If a child label is 'true' for some instance, all its parent labels should
 * also be 'true' for that instance. Every instance of the data set is checked
 * against every tree rooted at one of the given root nodes.
 *
 * @param dataSet the data set whose instances are checked
 * @param rootLabelNodes the root nodes of the labels hierarchy
 * @throws InvalidDataFormatException if the hierarchy is breached for some instance
 */
private void checkLabelsConsistency(Instances dataSet, Set<LabelNode> rootLabelNodes) throws InvalidDataFormatException {
    // create an index for faster access to attribute based on name
    Map<String, Attribute> attributesIndex = new HashMap<String, Attribute>();
    int numAttributes = dataSet.numAttributes();
    for (int index = 0; index < numAttributes; index++) {
        Attribute attribute = dataSet.attribute(index);
        attributesIndex.put(attribute.name(), attribute);
    }

    int numInstances = dataSet.numInstances();
    for (int index = 0; index < numInstances; index++) {
        Instance instance = dataSet.instance(index);
        for (LabelNode labelNode : rootLabelNodes) {
            // A root label has no parent, so it is always allowed to be set.
            checkSubtreeConsistency(labelNode, instance, true, attributesIndex);
        }
    }
}
/**
 * Recursively checks one subtree of the labels hierarchy for a single instance.
 * A label may only be set when {@code canBeLabelSet} is true; the children of a node
 * may only be set when the node itself is set for the instance.
 *
 * @param node the root of the subtree to check
 * @param instance the instance whose label values are checked
 * @param canBeLabelSet whether the label of {@code node} is allowed to be set
 * @param attributesIndex lookup of data set attributes by name
 * @throws InvalidDataFormatException if a label is set although it is not allowed to be
 */
private void checkSubtreeConsistency(LabelNode node, Instance instance, boolean canBeLabelSet, Map<String, Attribute> attributesIndex) throws InvalidDataFormatException {
    boolean labelSet = isLabelSet(instance, node.getName(), attributesIndex);
    if (labelSet && !canBeLabelSet) {
        throw new InvalidDataFormatException(String.format("Consistency of labels hierarchy is breached for: Label='%s', Instance='%s'", node.getName(), instance.toString()));
    }
    if (node.hasChildren()) {
        Set<LabelNode> childNodes = node.getChildren();
        for (LabelNode child : childNodes) {
            checkSubtreeConsistency(child, instance, labelSet, attributesIndex);
        }
    }
}
/**
 * Tells whether the given label is set for the instance, i.e. whether the string
 * value of the label attribute is "1".
 *
 * @param instance the instance to inspect
 * @param labelName the name of the label attribute
 * @param attributesIndex lookup of data set attributes by name
 * @return true if the value of the label attribute is "1"
 */
private boolean isLabelSet(Instance instance, String labelName, Map<String, Attribute> attributesIndex) {
    Attribute labelAttribute = attributesIndex.get(labelName);
    return instance.stringValue(labelAttribute).equals("1");
}
/**
 * Creates a HashMap that contains every label attribute of the data set together with
 * its depth in the hierarchical tree of labels, as computed by {@link #getDepth(String)}.
 *
 * @return a HashMap mapping each label name to its depth in the hierarchical tree
 */
public HashMap<String, Integer> getLabelDepth() {
    Set<String> labelNames = labelsMetaData.getLabelNames();
    HashMap<String, Integer> assoc = new HashMap<String, Integer>();
    int numAttributes = dataSet.numAttributes();
    for (int index = 0; index < numAttributes; index++) {
        String attrName = dataSet.attribute(index).name();
        if (labelNames.contains(attrName)) {
            assoc.put(attrName, getDepth(attrName));
        }
    }
    return assoc;
}
/**
 * Determines how deep a label sits in the tree of labels by walking from the label up
 * through its parents. A label without a parent (a root label) has depth 1, and each
 * parent step on the way up adds one.
 *
 * @param labelName the name of the label whose depth is requested
 * @return the depth of the label
 */
public int getDepth(String labelName) {
    int depth = 1;
    String currentName = labelName;
    // Climb one parent per iteration, looking every node up by name.
    for (LabelNode node = labelsMetaData.getLabelNode(currentName);
            node.hasParent();
            node = labelsMetaData.getLabelNode(currentName)) {
        currentName = node.getParent().getName();
        depth++;
    }
    return depth;
}
/**
 * Returns the depths of the labels in the hierarchical tree of labels, one entry per
 * label attribute, ordered by the position of the label attributes in the underlying
 * {@link Instances} object. Each depth is computed by {@link #getDepth(String)}.
 *
 * @return an array with the depth of each label attribute
 */
public int[] getLabelDepthIndices() {
    int[] labelDepthIndices = new int[labelsMetaData.getNumLabels()];
    Set<String> labelNames = labelsMetaData.getLabelNames();
    int numAttributes = dataSet.numAttributes();
    int counter = 0;
    for (int index = 0; index < numAttributes; index++) {
        String attrName = dataSet.attribute(index).name();
        if (labelNames.contains(attrName)) {
            labelDepthIndices[counter++] = getDepth(attrName);
        }
    }
    return labelDepthIndices;
}
/**
 * Tells whether at least one label value of the given instance is missing.
 * The label positions are taken from {@link #getLabelIndices()}.
 *
 * @param instance one instance of this dataset
 * @return true if the instance has missing labels
 */
public boolean hasMissingLabels(Instance instance) {
    for (int labelIndex : getLabelIndices()) {
        if (instance.isMissing(labelIndex)) {
            return true;
        }
    }
    return false;
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy