weka.core.xml.XMLInstances Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of weka-stable Show documentation
Show all versions of weka-stable Show documentation
The Waikato Environment for Knowledge Analysis (WEKA), a machine
learning workbench. This is the stable version. Apart from bugfixes, this version
does not receive any other updates.
/*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
/*
* XMLInstances.java
* Copyright (C) 2006-2012 University of Waikato, Hamilton, New Zealand
*/
package weka.core.xml;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Enumeration;
import java.util.Properties;
import java.util.Vector;
import java.util.zip.GZIPInputStream;
import org.w3c.dom.Element;
import weka.core.Attribute;
import weka.core.DenseInstance;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.ProtectedProperties;
import weka.core.RevisionUtils;
import weka.core.SparseInstance;
import weka.core.Utils;
import weka.core.Version;
/**
* XML representation of the Instances class.
*
* @author fracpete (fracpete at waikato dot ac dot nz)
* @version $Revision: 10203 $
*/
public class XMLInstances extends XMLDocument implements Serializable {
/** serialization version identifier */
private static final long serialVersionUID = 3626821327547416099L;

/** file name extension used for XRFF files (including the leading dot) */
public static String FILE_EXTENSION = ".xrff";

// ---- element (tag) names ----

/** name of the document's root element */
public static final String TAG_DATASET = "dataset";

/** name of the header element */
public static final String TAG_HEADER = "header";

/** name of the body element */
public static final String TAG_BODY = "body";

/** name of the notes element */
public static final String TAG_NOTES = "notes";

/** name of the element grouping attribute definitions */
public static final String TAG_ATTRIBUTES = "attributes";

/** name of a single attribute definition element */
public static final String TAG_ATTRIBUTE = "attribute";

/** name of the element grouping the labels of a nominal attribute */
public static final String TAG_LABELS = "labels";

/** name of a single label element */
public static final String TAG_LABEL = "label";

/** name of the element grouping an attribute's meta-data */
public static final String TAG_METADATA = "metadata";

/** name of a single meta-data property element */
public static final String TAG_PROPERTY = "property";

/** name of the element grouping instance rows */
public static final String TAG_INSTANCES = "instances";

/** name of a single instance (row) element */
public static final String TAG_INSTANCE = "instance";

/** name of a single value element inside an instance */
public static final String TAG_VALUE = "value";

// ---- XML attribute names ----

/** XML attribute holding the version */
public static final String ATT_VERSION = "version";

/** XML attribute holding the type (of an attribute or an instance) */
public static final String ATT_TYPE = "type";

/** XML attribute holding the format (used for date attributes) */
public static final String ATT_FORMAT = "format";

/** XML attribute marking the class attribute */
public static final String ATT_CLASS = "class";

/** XML attribute holding the index of a value */
public static final String ATT_INDEX = "index";

/** XML attribute holding the weight of an instance */
public static final String ATT_WEIGHT = "weight";

/** XML attribute marking a missing value */
public static final String ATT_MISSING = "missing";

// ---- XML attribute values ----

/** type value for numeric attributes */
public static final String VAL_NUMERIC = "numeric";

/** type value for date attributes */
public static final String VAL_DATE = "date";

/** type value for nominal attributes */
public static final String VAL_NOMINAL = "nominal";

/** type value for string attributes */
public static final String VAL_STRING = "string";

/** type value for relational attributes */
public static final String VAL_RELATIONAL = "relational";

/** type value for normal instances */
public static final String VAL_NORMAL = "normal";

/** type value for sparse instances */
public static final String VAL_SPARSE = "sparse";
/**
 * the DOCTYPE declaration with the internal DTD subset of the XRFF format.
 * The element and attribute names are taken from the TAG_, ATT_ and VAL_
 * constants so that the DTD cannot drift from the names used when reading
 * and writing documents. The "version" attribute of the root element
 * defaults to the running Weka version.
 */
public final static String DOCTYPE = "<!DOCTYPE " + TAG_DATASET + "\n"
  + "[\n"
  // root element
  + "   <!ELEMENT " + TAG_DATASET + " (" + TAG_HEADER + "," + TAG_BODY + ")>\n"
  + "   <!ATTLIST " + TAG_DATASET + " " + ATT_NAME + " CDATA #REQUIRED>\n"
  + "   <!ATTLIST " + TAG_DATASET + " " + ATT_VERSION + " CDATA \""
  + Version.VERSION + "\">\n"
  + "\n"
  // header/body skeleton
  + "   <!ELEMENT " + TAG_HEADER + " (" + TAG_NOTES + "?," + TAG_ATTRIBUTES + ")>\n"
  + "   <!ELEMENT " + TAG_BODY + " (" + TAG_INSTANCES + ")>\n"
  + "   <!ELEMENT " + TAG_NOTES + " ANY>\n"
  + "\n"
  // attribute definitions (labels only apply to nominal attributes)
  + "   <!ELEMENT " + TAG_ATTRIBUTES + " (" + TAG_ATTRIBUTE + "+)>\n"
  + "   <!ELEMENT " + TAG_ATTRIBUTE + " (" + TAG_LABELS + "?," + TAG_METADATA
  + "?," + TAG_ATTRIBUTES + "?)>\n"
  + "   <!ATTLIST " + TAG_ATTRIBUTE + " " + ATT_NAME + " CDATA #REQUIRED>\n"
  + "   <!ATTLIST " + TAG_ATTRIBUTE + " " + ATT_TYPE + " (" + VAL_NUMERIC + "|"
  + VAL_DATE + "|" + VAL_NOMINAL + "|" + VAL_STRING + "|" + VAL_RELATIONAL
  + ") #REQUIRED>\n"
  + "   <!ATTLIST " + TAG_ATTRIBUTE + " " + ATT_FORMAT + " CDATA #IMPLIED>\n"
  // "no" is the counterpart of VAL_YES and the default
  + "   <!ATTLIST " + TAG_ATTRIBUTE + " " + ATT_CLASS + " (" + VAL_YES
  + "|no) \"no\">\n"
  + "   <!ELEMENT " + TAG_LABELS + " (" + TAG_LABEL + "*)>\n"
  + "   <!ELEMENT " + TAG_LABEL + " ANY>\n"
  + "   <!ELEMENT " + TAG_METADATA + " (" + TAG_PROPERTY + "*)>\n"
  + "   <!ELEMENT " + TAG_PROPERTY + " ANY>\n"
  + "   <!ATTLIST " + TAG_PROPERTY + " " + ATT_NAME + " CDATA #REQUIRED>\n"
  + "\n"
  // instance rows (the index attribute is 1-based and only used for sparse rows)
  + "   <!ELEMENT " + TAG_INSTANCES + " (" + TAG_INSTANCE + "*)>\n"
  + "   <!ELEMENT " + TAG_INSTANCE + " (" + TAG_VALUE + "*)>\n"
  + "   <!ATTLIST " + TAG_INSTANCE + " " + ATT_TYPE + " (" + VAL_NORMAL + "|"
  + VAL_SPARSE + ") \"" + VAL_NORMAL + "\">\n"
  + "   <!ATTLIST " + TAG_INSTANCE + " " + ATT_WEIGHT + " CDATA #IMPLIED>\n"
  + "   <!ELEMENT " + TAG_VALUE + " (#PCDATA|" + TAG_INSTANCES + ")*>\n"
  + "   <!ATTLIST " + TAG_VALUE + " " + ATT_INDEX + " CDATA #IMPLIED>\n"
  + "   <!ATTLIST " + TAG_VALUE + " " + ATT_MISSING + " (" + VAL_YES
  + "|no) \"no\">\n"
  + "]\n"
  + ">";
/**
* the precision handed to Utils.doubleToString when instance weights and
* numeric values are written to the XML structure
*/
protected int m_Precision = 6;
/**
* the underlying Instances; null until data has been set via
* setInstances(Instances) or parsed via setXML(Reader)
*/
protected Instances m_Instances;
/**
 * Creates an empty XML document without any underlying data; the document
 * is set up with the XRFF DOCTYPE, the "dataset" root node and validation
 * switched on.
 *
 * @throws Exception if XML initialization fails
 */
public XMLInstances() throws Exception {
  // the superclass constructor is invoked implicitly and m_Instances
  // starts out as null, so only the document setup remains
  setDocType(DOCTYPE);
  setRootNode(TAG_DATASET);
  setValidating(true);
}
/**
* generates the XML structure based on the given data: runs the default
* constructor and then hands the data to setInstances(Instances).
*
* @param data the data to build the XML structure from
* @throws Exception if initialization/generation fails
*/
public XMLInstances(Instances data) throws Exception {
this();
setInstances(data);
}
/**
* generates the Instances directly from the reader containing the XML data:
* runs the default constructor and then hands the reader to setXML(Reader).
* The reader is not closed here.
*
* @param reader the reader for the XML data
* @throws Exception if something goes wrong
*/
public XMLInstances(Reader reader) throws Exception {
this();
setXML(reader);
}
/**
* adds the attribute to the XML structure
*
* @param parent the parent node to add the attribute node as child
* @param att the attribute to add
*/
protected void addAttribute(Element parent, Attribute att) {
Element node;
Element child;
Element property;
Element label;
String tmpStr;
Enumeration> enm;
int i;
node = m_Document.createElement(TAG_ATTRIBUTE);
parent.appendChild(node);
// XML attributes
// name
node.setAttribute(ATT_NAME, validContent(att.name()));
// type
switch (att.type()) {
case Attribute.NUMERIC:
node.setAttribute(ATT_TYPE, VAL_NUMERIC);
break;
case Attribute.DATE:
node.setAttribute(ATT_TYPE, VAL_DATE);
break;
case Attribute.NOMINAL:
node.setAttribute(ATT_TYPE, VAL_NOMINAL);
break;
case Attribute.STRING:
node.setAttribute(ATT_TYPE, VAL_STRING);
break;
case Attribute.RELATIONAL:
node.setAttribute(ATT_TYPE, VAL_RELATIONAL);
break;
default:
node.setAttribute(ATT_TYPE, "???");
}
// labels
if (att.isNominal()) {
child = m_Document.createElement(TAG_LABELS);
node.appendChild(child);
enm = att.enumerateValues();
while (enm.hasMoreElements()) {
tmpStr = enm.nextElement().toString();
label = m_Document.createElement(TAG_LABEL);
child.appendChild(label);
label.appendChild(m_Document.createTextNode(validContent(tmpStr)));
}
}
// format
if (att.isDate()) {
node.setAttribute(ATT_FORMAT, validContent(att.getDateFormat()));
}
// class
if (m_Instances.classIndex() > -1) {
if (att == m_Instances.classAttribute()) {
node.setAttribute(ATT_CLASS, VAL_YES);
}
}
// add meta-data
if ((att.getMetadata() != null) && (att.getMetadata().size() > 0)) {
child = m_Document.createElement(TAG_METADATA);
node.appendChild(child);
enm = att.getMetadata().propertyNames();
while (enm.hasMoreElements()) {
tmpStr = enm.nextElement().toString();
property = m_Document.createElement(TAG_PROPERTY);
child.appendChild(property);
property.setAttribute(ATT_NAME, validContent(tmpStr));
property.appendChild(m_Document.createTextNode(validContent(att
.getMetadata().getProperty(tmpStr, ""))));
}
}
// relational attribute?
if (att.isRelationValued()) {
child = m_Document.createElement(TAG_ATTRIBUTES);
node.appendChild(child);
for (i = 0; i < att.relation().numAttributes(); i++) {
addAttribute(child, att.relation().attribute(i));
}
}
}
/**
* turns all <, > and &into character entities and returns that
* string. Necessary for TextNodes.
*
* @param content string to convert
* @return the valid content string
*/
protected String validContent(String content) {
String result;
result = content;
// these five entities are recognized by every XML processor
// see http://www.xml.com/pub/a/2001/03/14/trxml10.html
result = result.replaceAll("&", "&").replaceAll("\"", """)
.replaceAll("'", "'").replaceAll("<", "<")
.replaceAll(">", ">");
// in addition, replace some other entities as well
result = result.replaceAll("\n", "
").replaceAll("\r", "
")
.replaceAll("\t", " ");
return result;
}
/**
 * Adds an "instance" element for the given instance to the XML structure.
 * Only the values stored in the instance are written (for sparse instances
 * each value element carries its 1-based attribute index); relational values
 * are written recursively as nested "instances" elements.
 *
 * @param parent the parent node to add the instance node as child
 * @param inst the instance to add
 */
protected void addInstance(Element parent, Instance inst) {
  Element node = m_Document.createElement(TAG_INSTANCE);
  parent.appendChild(node);

  // sparse?
  boolean sparse = (inst instanceof SparseInstance);
  if (sparse) {
    node.setAttribute(ATT_TYPE, VAL_SPARSE);
  }

  // weight (only written if it differs from the default of 1.0)
  if (inst.weight() != 1.0) {
    node.setAttribute(ATT_WEIGHT,
      Utils.doubleToString(inst.weight(), m_Precision));
  }

  // values: i is the position among the stored values, index the
  // attribute index that position maps to
  for (int i = 0; i < inst.numValues(); i++) {
    int index = inst.index(i);
    Element value = m_Document.createElement(TAG_VALUE);
    node.appendChild(value);

    if (inst.isMissing(index)) {
      value.setAttribute(ATT_MISSING, VAL_YES);
    } else if (inst.attribute(index).isRelationValued()) {
      Element child = m_Document.createElement(TAG_INSTANCES);
      value.appendChild(child);
      // look the relation up by attribute index, like all other accessors
      // in this loop; the storage position i differs from it for sparse
      // instances
      Instances relation = inst.relationalValue(index);
      for (int n = 0; n < relation.numInstances(); n++) {
        addInstance(child, relation.instance(n));
      }
    } else if (inst.attribute(index).type() == Attribute.NUMERIC) {
      value.appendChild(m_Document.createTextNode(Utils.doubleToString(
        inst.value(index), m_Precision)));
    } else {
      value.appendChild(m_Document.createTextNode(validContent(inst
        .stringValue(index))));
    }

    if (sparse) {
      value.setAttribute(ATT_INDEX, "" + (index + 1));
    }
  }
}
/**
 * Writes the header part of the dataset into the XML document: relation
 * name and version on the root element, followed by a "header" element that
 * holds one "attribute" definition per attribute of the data.
 */
protected void headerToXML() {
  final Element dataset = m_Document.getDocumentElement();
  dataset.setAttribute(ATT_NAME, validContent(m_Instances.relationName()));
  dataset.setAttribute(ATT_VERSION, Version.VERSION);

  // dataset -> header
  final Element header = m_Document.createElement(TAG_HEADER);
  dataset.appendChild(header);

  // header -> attributes -> attribute*
  final Element attributes = m_Document.createElement(TAG_ATTRIBUTES);
  header.appendChild(attributes);

  int position = 0;
  while (position < m_Instances.numAttributes()) {
    addAttribute(attributes, m_Instances.attribute(position));
    position++;
  }
}
/**
 * Writes the rows of the dataset into the XML document: a "body" element
 * below the root that holds an "instances" element with one "instance"
 * element per row.
 */
protected void dataToXML() {
  final Element dataset = m_Document.getDocumentElement();

  // dataset -> body
  final Element body = m_Document.createElement(TAG_BODY);
  dataset.appendChild(body);

  // body -> instances -> instance*
  final Element rows = m_Document.createElement(TAG_INSTANCES);
  body.appendChild(rows);

  int position = 0;
  while (position < m_Instances.numInstances()) {
    addInstance(rows, m_Instances.instance(position));
    position++;
  }
}
/**
* builds up the XML structure based on the given data. A new Instances
* object constructed from the data is stored as the underlying dataset,
* then the header and the rows are written to the document.
*
* @param data data to generate the XML from
*/
public void setInstances(Instances data) {
m_Instances = new Instances(data);
// presumably resets the document to an empty state before it is rebuilt
// -- confirm against XMLDocument.clear()
clear();
// order matters: the header element has to precede the body element
headerToXML();
dataToXML();
}
/**
* returns the current instances, either the ones that were set or the ones
* that were generated from the XML structure. The internal object itself is
* returned, not a copy.
*
* @return the current instances, null if no data has been set or parsed yet
*/
public Instances getInstances() {
return m_Instances;
}
/**
 * Returns the meta-data stored directly underneath the given attribute node,
 * or null if there is none. Only the first "metadata" child is used; each
 * of its "property" children contributes one key (the name attribute) and
 * value (the element content).
 *
 * @param parent the attribute node
 * @return the metadata, or null if none found
 * @throws Exception if generation fails
 */
protected ProtectedProperties createMetadata(Element parent) throws Exception {
  // find metadata node directly underneath this attribute, but not in
  // deeper nested attributes (e.g., within relational attributes)
  Vector<Element> metanodes = getChildTags(parent, TAG_METADATA);
  if (metanodes.isEmpty()) {
    return null;
  }

  // generate properties
  Properties props = new Properties();
  Vector<Element> properties = getChildTags(metanodes.get(0), TAG_PROPERTY);
  for (int i = 0; i < properties.size(); i++) {
    Element node = properties.get(i);
    props.setProperty(node.getAttribute(ATT_NAME), getContent(node));
  }

  return new ProtectedProperties(props);
}
/**
 * Returns the labels listed underneath the given (nominal) attribute node,
 * in document order. Only the first "labels" child is used; the list is
 * empty if the node has none.
 *
 * @param parent the (nominal) attribute node
 * @return the list of labels, never null
 * @throws Exception if generation fails
 */
protected ArrayList<String> createLabels(Element parent) throws Exception {
  ArrayList<String> result = new ArrayList<String>();

  // find labels node directly underneath this attribute, but not in
  // deeper nested attributes (e.g., within relational attributes)
  Vector<Element> labelsnodes = getChildTags(parent, TAG_LABELS);
  if (labelsnodes.isEmpty()) {
    return result;
  }

  // retrieve all labels
  Vector<Element> labels = getChildTags(labelsnodes.get(0), TAG_LABEL);
  for (int i = 0; i < labels.size(); i++) {
    result.add(getContent(labels.get(i)));
  }

  return result;
}
/**
 * Creates an Attribute from the given XML node. The type attribute selects
 * the kind of Attribute; meta-data, labels, date format and nested attribute
 * definitions are read from the node as required by that type.
 *
 * @param node the node with the setup
 * @return the configured Attribute
 * @throws Exception if generation fails, e.g., due to unknown attribute type
 *           or a relational attribute without nested attribute definitions
 */
protected Attribute createAttribute(Element node) throws Exception {
  // name
  String name = node.getAttribute(ATT_NAME);

  // type
  String typeStr = node.getAttribute(ATT_TYPE);
  int type;
  if (typeStr.equals(VAL_NUMERIC)) {
    type = Attribute.NUMERIC;
  } else if (typeStr.equals(VAL_DATE)) {
    type = Attribute.DATE;
  } else if (typeStr.equals(VAL_NOMINAL)) {
    type = Attribute.NOMINAL;
  } else if (typeStr.equals(VAL_STRING)) {
    type = Attribute.STRING;
  } else if (typeStr.equals(VAL_RELATIONAL)) {
    type = Attribute.RELATIONAL;
  } else {
    throw new Exception("Attribute type '" + typeStr + "' is not supported!");
  }

  // metadata (null if the node has none)
  ProtectedProperties metadata = createMetadata(node);

  Attribute result = null;
  switch (type) {
    case Attribute.NUMERIC:
      if (metadata == null) {
        result = new Attribute(name);
      } else {
        result = new Attribute(name, metadata);
      }
      break;

    case Attribute.DATE:
      if (metadata == null) {
        result = new Attribute(name, node.getAttribute(ATT_FORMAT));
      } else {
        result = new Attribute(name, node.getAttribute(ATT_FORMAT), metadata);
      }
      break;

    case Attribute.NOMINAL: {
      ArrayList<String> values = createLabels(node);
      if (metadata == null) {
        result = new Attribute(name, values);
      } else {
        result = new Attribute(name, values, metadata);
      }
      break;
    }

    case Attribute.STRING:
      // the typed null selects the label-list constructor
      if (metadata == null) {
        result = new Attribute(name, (ArrayList<String>) null);
      } else {
        result = new Attribute(name, (ArrayList<String>) null, metadata);
      }
      break;

    case Attribute.RELATIONAL: {
      Vector<Element> nested = getChildTags(node, TAG_ATTRIBUTES);
      // nested definitions are optional in the DTD, so a document can omit
      // them; fail with a message instead of an index error
      if (nested.isEmpty()) {
        throw new Exception("Relational attribute '" + name
          + "' has no nested '" + TAG_ATTRIBUTES + "' element!");
      }
      // the class index of the nested attributes is not used
      ArrayList<Attribute> atts = createAttributes(nested.get(0), new int[1]);
      if (metadata == null) {
        result = new Attribute(name, new Instances(name, atts, 0));
      } else {
        result = new Attribute(name, new Instances(name, atts, 0), metadata);
      }
      break;
    }
  }

  return result;
}
/**
 * Returns the attributes generated from the "attribute" children of the
 * given node, in document order.
 *
 * @param parent the attributes node
 * @param classIndex array of length 1 to return the class index, if any;
 *          element 0 is set to -1 if no attribute is flagged as class and to
 *          the position of the last flagged attribute otherwise
 * @return the list with the generated attributes
 * @throws Exception if generation fails, e.g., due to unknown attribute type
 */
protected ArrayList<Attribute> createAttributes(Element parent,
  int[] classIndex) throws Exception {
  ArrayList<Attribute> result = new ArrayList<Attribute>();
  classIndex[0] = -1;

  Vector<Element> list = getChildTags(parent, TAG_ATTRIBUTE);
  for (int i = 0; i < list.size(); i++) {
    Element node = list.get(i);
    Attribute att = createAttribute(node);
    if (node.getAttribute(ATT_CLASS).equals(VAL_YES)) {
      classIndex[0] = i;
    }
    result.add(att);
  }

  return result;
}
/**
 * Creates an Instance from the given XML node. For sparse instances the
 * (1-based) index attribute of each value element determines the attribute
 * the value belongs to, otherwise the position of the value element does.
 * Attributes without a value element keep the value 0.
 *
 * @param header the data this instance will belong to
 * @param parent the instance node
 * @return the configured Instance
 * @throws Exception if generation fails, e.g., due to unknown attribute
 *           type, a value index outside the header's attribute range or a
 *           nominal value that is not declared in the header
 */
protected Instance createInstance(Instances header, Element parent)
  throws Exception {
  // sparse?
  boolean sparse = (parent.getAttribute(ATT_TYPE).equals(VAL_SPARSE));
  double[] values = new double[header.numAttributes()];

  // weight (defaults to 1.0 if the attribute is absent)
  double weight;
  if (parent.getAttribute(ATT_WEIGHT).length() != 0) {
    weight = Double.parseDouble(parent.getAttribute(ATT_WEIGHT));
  } else {
    weight = 1.0;
  }

  Vector<Element> list = getChildTags(parent, TAG_VALUE);
  for (int i = 0; i < list.size(); i++) {
    Element node = list.get(i);

    // determine index
    int index;
    if (sparse) {
      index = Integer.parseInt(node.getAttribute(ATT_INDEX)) - 1;
    } else {
      index = i;
    }
    // the index originates from the document, so report a bad one explicitly
    if ((index < 0) || (index >= values.length)) {
      throw new Exception("Value index " + (index + 1)
        + " is out of range (1-" + values.length + ")!");
    }

    // set value
    if (node.getAttribute(ATT_MISSING).equals(VAL_YES)) {
      values[index] = Utils.missingValue();
    } else {
      Attribute att = header.attribute(index);
      String content = getContent(node);
      switch (att.type()) {
        case Attribute.NUMERIC:
          values[index] = Double.parseDouble(content);
          break;
        case Attribute.DATE:
          values[index] = att.parseDate(content);
          break;
        case Attribute.NOMINAL: {
          int label = att.indexOfValue(content);
          // an undeclared label would otherwise end up as value -1
          if (label < 0) {
            throw new Exception("Nominal value '" + content
              + "' is not declared for attribute '" + att.name() + "'!");
          }
          values[index] = label;
          break;
        }
        case Attribute.STRING:
          values[index] = att.addStringValue(content);
          break;
        case Attribute.RELATIONAL: {
          Vector<Element> subList = getChildTags(node, TAG_INSTANCES);
          Element child = subList.get(0);
          Instances data = createInstances(att.relation(), child);
          values[index] = att.addRelation(data);
          break;
        }
        default:
          throw new Exception("Attribute type " + att.type()
            + " is not supported!");
      }
    }
  }

  // create instance
  Instance result;
  if (sparse) {
    result = new SparseInstance(weight, values);
  } else {
    result = new DenseInstance(weight, values);
  }

  return result;
}
/**
 * Creates Instances from the given XML node: a new, empty dataset with the
 * structure of the header, filled with one instance per "instance" child of
 * the node.
 *
 * @param header the header of this data
 * @param parent the instances node
 * @return the generated Instances
 * @throws Exception if generation fails, e.g., due to unknown attribute type
 */
protected Instances createInstances(Instances header, Element parent)
  throws Exception {
  Instances result = new Instances(header, 0);

  Vector<Element> list = getChildTags(parent, TAG_INSTANCE);
  for (int i = 0; i < list.size(); i++) {
    // the new dataset (not the header passed in) serves as the header of
    // each parsed instance
    result.add(createInstance(result, list.get(i)));
  }

  return result;
}
/**
 * Generates the header (relation name, attributes and class index, but no
 * rows) from the XML document. Prints a warning to stdout if the version
 * check against the document's version attribute reports a mismatch.
 *
 * @return the generated header
 * @throws Exception if generation fails
 */
protected Instances headerFromXML() throws Exception {
  Element root = m_Document.getDocumentElement();

  // check version
  Version version = new Version();
  if (version.isOlder(root.getAttribute(ATT_VERSION))) {
    System.out.println("WARNING: loading data of version "
      + root.getAttribute(ATT_VERSION) + " with version " + Version.VERSION);
  }

  // attributes: dataset -> header -> attributes
  Vector<Element> list = getChildTags(root, TAG_HEADER);
  Element node = list.get(0);
  list = getChildTags(node, TAG_ATTRIBUTES);
  node = list.get(0);
  int[] classIndex = new int[1];
  ArrayList<Attribute> atts = createAttributes(node, classIndex);

  // generate header
  Instances result = new Instances(root.getAttribute(ATT_NAME), atts, 0);
  result.setClassIndex(classIndex[0]);

  return result;
}
/**
 * Generates the complete dataset from the XML document by reading the rows
 * found under dataset -> body -> instances.
 *
 * @param header the header structure
 * @return the complete dataset
 * @throws Exception if generation fails
 */
protected Instances dataFromXML(Instances header) throws Exception {
  Vector<Element> list = getChildTags(m_Document.getDocumentElement(),
    TAG_BODY);
  Element node = list.get(0);
  list = getChildTags(node, TAG_INSTANCES);
  node = list.get(0);

  return createInstances(header, node);
}
/**
* reads the XML structure from the given reader and replaces the underlying
* Instances with the dataset generated from it. The reader is not closed
* here.
*
* @param reader the reader to get the XML from
* @throws Exception if reading the XML or generating the dataset from it
*           fails
*/
public void setXML(Reader reader) throws Exception {
read(reader);
// interpret XML structure: the header is built first and then used to
// parse the rows
m_Instances = dataFromXML(headerFromXML());
}
/**
* Returns the revision string, obtained by passing the version-control
* keyword string to RevisionUtils.extract.
*
* @return the revision
*/
@Override
public String getRevision() {
return RevisionUtils.extract("$Revision: 10203 $");
}
/**
 * Takes a file name as the only argument. A file whose name ends with the
 * extension of the Instances class is loaded as Instances and printed as
 * XML; any other file is parsed as XML and the summary of the resulting
 * Instances is printed. Files ending in ".gz" are decompressed on the fly.
 * Errors are reported on stderr.
 *
 * @param args the commandline options
 */
public static void main(String[] args) {
  InputStream in = null;
  try {
    if (args.length != 1) {
      throw (new Exception("Usage: XMLInstances <file>"));
    }

    in = new FileInputStream(args[0]);
    // compressed file?
    if (args[0].endsWith(".gz")) {
      in = new GZIPInputStream(in);
    }
    Reader r = new BufferedReader(new InputStreamReader(in));

    if (args[0].endsWith(Instances.FILE_EXTENSION)) {
      XMLInstances i = new XMLInstances(new Instances(r));
      System.out.println(i.toString());
    } else {
      Instances i = new XMLInstances(r).getInstances();
      System.out.println(i.toSummaryString());
    }
  } catch (Exception ex) {
    ex.printStackTrace();
    System.err.println(ex.getMessage());
  } finally {
    // release the file handle on success and on failure; closing the
    // outermost stream also closes the wrapped file stream
    if (in != null) {
      try {
        in.close();
      } catch (Exception ignored) {
        // nothing sensible can be done if closing fails
      }
    }
  }
}
}