com.yahoo.labs.samoa.instances.ArffLoader Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of moa Show documentation
Show all versions of moa Show documentation
Massive On-line Analysis is an environment for massive data mining. MOA
provides a framework for data stream mining and includes tools for evaluation
and a collection of machine learning algorithms. Related to the WEKA project,
also written in Java, while scaling to more demanding problems.
/*
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
* either express or implied. See the License for the specific
* language governing permissions and limitations under the
* License.
*/
package com.yahoo.labs.samoa.instances;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.Reader;
import java.io.StreamTokenizer;
import java.util.ArrayList;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;
/**
* The Class ArffLoader. Loads an Arff file with sparse or dense format.
*/
public class ArffLoader {
/**
* The instance information.
*/
protected InstanceInformation instanceInformation;
protected InstancesHeader streamHeader;
/**
* The stream tokenizer.
*/
protected StreamTokenizer streamTokenizer;
/**
* Instantiates a new arff loader.
*
* @param reader the reader
* @param size the size
* @param classAttribute the class attribute
*/
public ArffLoader(Reader reader, int size, int classAttribute) {
// size is not used
this(reader);
if (classAttribute < 0) {
this.instanceInformation.setClassIndex(this.instanceInformation.numAttributes() - 1);
//System.out.print(this.instanceInformation.classIndex());
} else if (classAttribute > 0) {
this.instanceInformation.setClassIndex(classAttribute - 1);
}
}
protected Range range;
/**
* Instantiates a new arff loader.
*
* @param reader the reader
* @param range
* @param size the size
* @param classAttribute the class attribute
*/
public ArffLoader(Reader reader) {
this(reader, null);
}
/**
* Instantiates a new arff loader.
*
* @param reader the reader
* @param range
* @param size the size
* @param classAttribute the class attribute
*/
public ArffLoader(Reader reader, Range range) {
this.range = range;
BufferedReader br = new BufferedReader(reader);
//Init streamTokenizer
streamTokenizer = new StreamTokenizer(br);
streamTokenizer.resetSyntax();
streamTokenizer.whitespaceChars(0, ' ');
streamTokenizer.wordChars(' ' + 1, '\u00FF');
streamTokenizer.whitespaceChars(',', ',');
streamTokenizer.commentChar('%');
streamTokenizer.quoteChar('"');
streamTokenizer.quoteChar('\'');
streamTokenizer.ordinaryChar('{');
streamTokenizer.ordinaryChar('}');
streamTokenizer.eolIsSignificant(true);
this.instanceInformation = this.getHeader();
if (range != null) { //is MultiLabel
this.instanceInformation.setRangeOutputIndices(range);
}
}
/**
* Gets the structure.
*
* @return the structure
*/
public InstanceInformation getStructure() {
return this.instanceInformation;
}
/**
* Reads instance. It detects if it is dense or sparse.
*
* @return the instance
*/
public Instance readInstance() {
while (streamTokenizer.ttype == StreamTokenizer.TT_EOL) {
try {
streamTokenizer.nextToken();
} catch (IOException ex) {
Logger.getLogger(ArffLoader.class.getName()).log(Level.SEVERE, null, ex);
}
}
if (streamTokenizer.ttype == '{') {
return readInstanceSparse();
// return readDenseInstanceSparse();
} else {
return readInstanceDense();
}
}
/**
* Reads a dense instance from the file.
*
* @return the instance
*/
public Instance readInstanceDense() {
Instance instance = newDenseInstance(this.instanceInformation.numAttributes());
//System.out.println(this.instanceInformation.numAttributes());
int numAttribute = 0;
try {
while (numAttribute == 0 && streamTokenizer.ttype != StreamTokenizer.TT_EOF) {
//For each line
while (streamTokenizer.ttype != StreamTokenizer.TT_EOL
&& streamTokenizer.ttype != StreamTokenizer.TT_EOF) {
//For each item
if (streamTokenizer.ttype == StreamTokenizer.TT_NUMBER) {
//System.out.println(streamTokenizer.nval + "Num ");
this.setValue(instance, numAttribute, streamTokenizer.nval, true);
numAttribute++;
} else if (streamTokenizer.sval != null && (streamTokenizer.ttype == StreamTokenizer.TT_WORD
|| streamTokenizer.ttype == 34)) {
//System.out.println(streamTokenizer.sval + "Str");
boolean isNumeric = this.instanceInformation.attribute(numAttribute).isNumeric();
double value;
if ("?".equals(streamTokenizer.sval)) {
value = Double.NaN; //Utils.missingValue();
} else if (isNumeric == true) {
value = Double.valueOf(streamTokenizer.sval).doubleValue();
} else {
value = this.instanceInformation.attribute(numAttribute).indexOfValue(streamTokenizer.sval);
}
this.setValue(instance, numAttribute, value, isNumeric);
numAttribute++;
}
streamTokenizer.nextToken();
}
streamTokenizer.nextToken();
//System.out.println("EOL");
}
} catch (IOException ex) {
Logger.getLogger(ArffLoader.class.getName()).log(Level.SEVERE, null, ex);
}
return (numAttribute > 0) ? instance : null;
}
protected void setValue(Instance instance, int numAttribute, double value, boolean isNumber) {
double valueAttribute;
if (isNumber && this.instanceInformation.attribute(numAttribute).isNominal) {
valueAttribute = this.instanceInformation.attribute(numAttribute).indexOfValue(Double.toString(value));
//System.out.println(value +"/"+valueAttribute+" ");
} else {
valueAttribute = value;
//System.out.println(value +"/"+valueAttribute+" ");
}
if (this.instanceInformation.classIndex() == numAttribute) {
setClassValue(instance, valueAttribute);
//System.out.println(value +"<"+this.instanceInformation.classIndex()+">");
} else {
//if(numAttribute>this.instanceInformation.classIndex())
// numAttribute--;
instance.setValue(numAttribute, valueAttribute);
}
}
/**
* Reads a sparse instance.
*
* @return the instance
*/
private Instance readInstanceSparse() {
//Return a Sparse Instance
Instance instance = newSparseInstance(1.0); //, null); //(this.instanceInformation.numAttributes() + 1);
//System.out.println(this.instanceInformation.numAttributes());
int numAttribute;
ArrayList attributeValues = new ArrayList();
List indexValues = new ArrayList();
try {
//while (streamTokenizer.ttype != StreamTokenizer.TT_EOF) {
streamTokenizer.nextToken(); // Remove the '{' char
//For each line
while (streamTokenizer.ttype != StreamTokenizer.TT_EOL
&& streamTokenizer.ttype != StreamTokenizer.TT_EOF) {
while (streamTokenizer.ttype != '}') {
//For each item
//streamTokenizer.nextToken();
//while (streamTokenizer.ttype != '}'){
//System.out.println(streamTokenizer.nval +"-"+ streamTokenizer.sval);
//numAttribute = (int) streamTokenizer.nval;
if (streamTokenizer.ttype == StreamTokenizer.TT_NUMBER) {
numAttribute = (int) streamTokenizer.nval;
} else {
numAttribute = Integer.parseInt(streamTokenizer.sval);
}
streamTokenizer.nextToken();
if (streamTokenizer.ttype == StreamTokenizer.TT_NUMBER) {
//System.out.print(streamTokenizer.nval + " ");
this.setSparseValue(instance, indexValues, attributeValues, numAttribute, streamTokenizer.nval, true);
//numAttribute++;
} else if (streamTokenizer.sval != null && (streamTokenizer.ttype == StreamTokenizer.TT_WORD
|| streamTokenizer.ttype == 34)) {
//System.out.print(streamTokenizer.sval + "-");
if (this.auxAttributes.get(numAttribute).isNumeric()) {
this.setSparseValue(instance, indexValues, attributeValues, numAttribute, Double.valueOf(streamTokenizer.sval).doubleValue(), true);
} else {
this.setSparseValue(instance, indexValues, attributeValues, numAttribute, this.instanceInformation.attribute(numAttribute).indexOfValue(streamTokenizer.sval), false);
}
}
streamTokenizer.nextToken();
}
streamTokenizer.nextToken(); //Remove the '}' char
}
streamTokenizer.nextToken();
//System.out.println("EOL");
//}
} catch (IOException ex) {
Logger.getLogger(ArffLoader.class.getName()).log(Level.SEVERE, null, ex);
}
int[] arrayIndexValues = new int[attributeValues.size()];
double[] arrayAttributeValues = new double[attributeValues.size()];
for (int i = 0; i < arrayIndexValues.length; i++) {
arrayIndexValues[i] = indexValues.get(i).intValue();
arrayAttributeValues[i] = attributeValues.get(i).doubleValue();
}
instance.addSparseValues(arrayIndexValues, arrayAttributeValues, this.instanceInformation.numAttributes());
return instance;
}
private void setSparseValue(Instance instance, List indexValues, List attributeValues, int numAttribute, double value, boolean isNumber) {
double valueAttribute;
if (isNumber && this.instanceInformation.attribute(numAttribute).isNominal) {
valueAttribute = this.instanceInformation.attribute(numAttribute).indexOfValue(Double.toString(value));
} else {
valueAttribute = value;
}
//if (this.instanceInformation.classIndex() == numAttribute) {
// setClassValue(instance, valueAttribute);
//} else {
//instance.setValue(numAttribute, valueAttribute);
indexValues.add(numAttribute);
attributeValues.add(valueAttribute);
//}
//System.out.println(numAttribute+":"+valueAttribute+","+this.instanceInformation.classIndex()+","+value);
}
/**
* Reads an instance sparse and returns a dense one.
*
* @return the instance
*/
private Instance readDenseInstanceSparse() {
//Returns a dense instance
Instance instance = newDenseInstance(this.instanceInformation.numAttributes());
//System.out.println(this.instanceInformation.numAttributes());
int numAttribute;
try {
//while (streamTokenizer.ttype != StreamTokenizer.TT_EOF) {
streamTokenizer.nextToken(); // Remove the '{' char
//For each line
while (streamTokenizer.ttype != StreamTokenizer.TT_EOL
&& streamTokenizer.ttype != StreamTokenizer.TT_EOF) {
while (streamTokenizer.ttype != '}') {
//For each item
//streamTokenizer.nextToken();
//while (streamTokenizer.ttype != '}'){
//System.out.print(streamTokenizer.nval+":");
numAttribute = (int) streamTokenizer.nval;
streamTokenizer.nextToken();
if (streamTokenizer.ttype == StreamTokenizer.TT_NUMBER) {
//System.out.print(streamTokenizer.nval + " ");
this.setValue(instance, numAttribute, streamTokenizer.nval, true);
//numAttribute++;
} else if (streamTokenizer.sval != null && (streamTokenizer.ttype == StreamTokenizer.TT_WORD
|| streamTokenizer.ttype == 34)) {
//System.out.print(streamTokenizer.sval + "/"+this.instanceInformation.attribute(numAttribute).indexOfValue(streamTokenizer.sval)+" ");
if (this.auxAttributes.get(numAttribute).isNumeric()) {
this.setValue(instance, numAttribute, Double.valueOf(streamTokenizer.sval).doubleValue(), true);
} else {
this.setValue(instance, numAttribute, this.instanceInformation.attribute(numAttribute).indexOfValue(streamTokenizer.sval), false);
//numAttribute++;
}
}
streamTokenizer.nextToken();
}
streamTokenizer.nextToken(); //Remove the '}' char
}
streamTokenizer.nextToken();
//System.out.println("EOL");
//}
} catch (IOException ex) {
Logger.getLogger(ArffLoader.class.getName()).log(Level.SEVERE, null, ex);
}
return instance;
}
//protected List inputAttributes;
// protected List outputAttributes;
protected List auxAttributes;
private InstanceInformation getHeader() {
//commented JD
//this.range.setUpper(10000); //TO DO: Create a new range object with isInRange that does not need the upper limit
String relation = "file stream";
//System.out.println("RELATION " + relation);
//inputAttributes = new ArrayList();
//outputAttributes = new ArrayList();
//ArrayList
auxAttributes = new ArrayList();//JD
int numAttributes = 0;
try {
streamTokenizer.nextToken();
while (streamTokenizer.ttype != StreamTokenizer.TT_EOF) {
//For each line
//if (streamTokenizer.ttype == '@') {
if (streamTokenizer.ttype == StreamTokenizer.TT_WORD && streamTokenizer.sval.startsWith("@") == true) {
//streamTokenizer.nextToken();
String token = streamTokenizer.sval.toUpperCase();
if (token.startsWith("@RELATION")) {
streamTokenizer.nextToken();
relation = streamTokenizer.sval;
// System.out.println("RELATION " + relation);
} else if (token.startsWith("@ATTRIBUTE")) {
streamTokenizer.nextToken();
String name = streamTokenizer.sval;
//System.out.println("* " + name);
if (name == null) {
name = Double.toString(streamTokenizer.nval);
}
streamTokenizer.nextToken();
String type = streamTokenizer.sval;
// System.out.println("* " + name + ":" + type + " ");
if (streamTokenizer.ttype == '{') {
streamTokenizer.nextToken();
List attributeLabels = new ArrayList();
while (streamTokenizer.ttype != '}') {
if (streamTokenizer.sval != null) {
attributeLabels.add(streamTokenizer.sval);
// System.out.print(streamTokenizer.sval + ",");
} else {
attributeLabels.add(Double.toString(streamTokenizer.nval));
//System.out.print(streamTokenizer.nval + ",");
}
streamTokenizer.nextToken();
}
// System.out.println();
//attributes.add(new Attribute(name, attributeLabels));
//commented JD
/* if (this.range.isInRange(numAttribute)) {
outputAttributes.add(new Attribute(name, attributeLabels));
} else {
inputAttributes.add(new Attribute(name, attributeLabels));
}*/
auxAttributes.add(new Attribute(name, attributeLabels));
numAttributes++;
} else {
// Add attribute
//commented JD
/*if (this.range.isInRange(numAttribute)) {
outputAttributes.add(new Attribute(name));
} else {
inputAttributes.add(new Attribute(name));
}*/
auxAttributes.add(new Attribute(name));
numAttributes++;
}
} else if (token.startsWith("@DATA")) {
//System.out.print("END");
streamTokenizer.nextToken();
break;
}
}
streamTokenizer.nextToken();
}
if (range != null) {
this.range.setUpper(numAttributes);
}
/*if (range==null) //is single-target. All instances should go to inputAtrributes (see setClassIndex(int) from InstanceInformation )
inputAttributes=auxAttributes;
else//is multi-target
{
this.range.setUpper(numAttribute);
for (int i=0; i
© 2015 - 2025 Weber Informatics LLC | Privacy Policy