// com.yahoo.labs.samoa.instances.ArffLoader (Maven / Gradle / Ivy)
// The newest version!
package com.yahoo.labs.samoa.instances;
/*
* #%L
* SAMOA
* %%
* Copyright (C) 2013 Yahoo! Inc.
* %%
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
* #L%
*/
import java.io.BufferedReader;
import java.io.IOException;
import java.io.Reader;
import java.io.Serializable;
import java.io.StreamTokenizer;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.logging.Level;
import java.util.logging.Logger;
/**
 * Reads instances from a character stream in ARFF format.
 *
 * <p>The header ({@code @relation}, {@code @attribute}, {@code @data}) is parsed
 * when the tokenizer is initialised; afterwards each call to
 * {@link #readInstance(Reader)} consumes one data line, in either dense
 * (comma-separated values) or sparse (<code>{index value, ...}</code>) notation.
 *
 * <p>I/O errors are logged at {@code SEVERE} level and are not rethrown.
 *
 * @author abifet
 */
public class ArffLoader implements Serializable {

    /** Token type the tokenizer reports for a double-quoted string. */
    private static final int DOUBLE_QUOTE = '"';

    /** Token type the tokenizer reports for a single-quoted string. */
    private static final int SINGLE_QUOTE = '\'';

    /** Relation name, attributes and class index parsed from the header. */
    protected InstanceInformation instanceInformation;

    /** Tokenizer over the ARFF stream; transient, so it is rebuilt lazily by {@link #readInstance(Reader)}. */
    transient protected StreamTokenizer streamTokenizer;

    /** Reader passed to the constructor. */
    protected Reader reader;

    /** Value passed to the constructor; not read anywhere in this class. */
    protected int size;

    /**
     * Class attribute selector: negative selects the last attribute, a positive
     * value is a 1-based attribute position, and zero leaves the class index
     * of the header untouched.
     */
    protected int classAttribute;

    /** Attributes declared in the header, in declaration order. */
    protected List<Attribute> attributes;

    /** Creates a loader without a stream; the header is parsed on the first {@link #readInstance(Reader)} call. */
    public ArffLoader() {
    }

    /**
     * Creates a loader and immediately parses the ARFF header from {@code reader}.
     *
     * @param reader source of the ARFF text
     * @param size stored in {@link #size}; not otherwise used by this class
     * @param classAttribute class attribute selector, see {@link #classAttribute}
     */
    public ArffLoader(Reader reader, int size, int classAttribute) {
        this.reader = reader;
        this.size = size;
        this.classAttribute = classAttribute;
        initStreamTokenizer(reader);
    }

    /**
     * @return the header information parsed so far, or {@code null} if the
     *         tokenizer has not been initialised yet
     */
    public InstanceInformation getStructure() {
        return this.instanceInformation;
    }

    /**
     * Reads the next instance from the stream.
     *
     * @param reader used only when the tokenizer has not been initialised yet
     *               (for example after deserialization); ignored otherwise
     * @return the next instance; {@code null} when a dense read finds no more
     *         values or when an I/O error occurs while skipping blank lines
     */
    public Instance readInstance(Reader reader) {
        if (streamTokenizer == null) {
            initStreamTokenizer(reader);
        }
        // Skip blank lines left over from the previous read.
        while (streamTokenizer.ttype == StreamTokenizer.TT_EOL) {
            try {
                streamTokenizer.nextToken();
            } catch (IOException ex) {
                // A failed read leaves ttype at TT_EOL; bail out instead of spinning forever.
                logIoError(ex);
                return null;
            }
        }
        if (streamTokenizer.ttype == '{') {
            return readInstanceSparse();
        }
        return readInstanceDense();
    }

    /**
     * Reads one dense data line starting at the current token. Empty lines are
     * skipped until a line with at least one value is found.
     *
     * @return the parsed instance, or {@code null} if the end of the stream was
     *         reached before any value was read
     */
    public Instance readInstanceDense() {
        Instance instance = new DenseInstance(this.instanceInformation.numAttributes() + 1);
        int numAttribute = 0;
        try {
            while (numAttribute == 0 && streamTokenizer.ttype != StreamTokenizer.TT_EOF) {
                // One line: every value token fills the next attribute position.
                while (streamTokenizer.ttype != StreamTokenizer.TT_EOL
                        && streamTokenizer.ttype != StreamTokenizer.TT_EOF) {
                    if (streamTokenizer.ttype == StreamTokenizer.TT_NUMBER) {
                        this.setValue(instance, numAttribute, streamTokenizer.nval, true);
                        numAttribute++;
                    } else if (isValueToken()) {
                        boolean isNumeric = attributes.get(numAttribute).isNumeric();
                        double value = parseValue(numAttribute, streamTokenizer.sval, isNumeric);
                        this.setValue(instance, numAttribute, value, isNumeric);
                        numAttribute++;
                    }
                    streamTokenizer.nextToken();
                }
                streamTokenizer.nextToken(); // step past the end-of-line token
            }
        } catch (IOException ex) {
            logIoError(ex);
        }
        return (numAttribute > 0) ? instance : null;
    }

    /**
     * Stores {@code value} into {@code instance}, routing it to the class value
     * when {@code numAttribute} is the class index.
     *
     * @param isNumber whether {@code value} came from a numeric token; if the
     *                 target attribute is nominal, the value's
     *                 {@link Double#toString(double)} form is looked up as a label
     */
    private void setValue(Instance instance, int numAttribute, double value, boolean isNumber) {
        double valueAttribute;
        if (isNumber && this.instanceInformation.attribute(numAttribute).isNominal) {
            valueAttribute = this.instanceInformation.attribute(numAttribute).indexOfValue(Double.toString(value));
        } else {
            valueAttribute = value;
        }
        if (this.instanceInformation.classIndex() == numAttribute) {
            instance.setClassValue(valueAttribute);
        } else {
            instance.setValue(numAttribute, valueAttribute);
        }
    }

    /**
     * Reads one sparse data line (<code>{index value, ...}</code>) into a sparse
     * instance. The current token must be the opening brace. If the stream ends
     * before the closing brace, the pairs read so far are kept.
     */
    private Instance readInstanceSparse() {
        Instance instance = new SparseInstance(1.0, null);
        List<Integer> indexValues = new ArrayList<Integer>();
        List<Double> attributeValues = new ArrayList<Double>();
        try {
            streamTokenizer.nextToken(); // consume '{'
            while (streamTokenizer.ttype != StreamTokenizer.TT_EOL
                    && streamTokenizer.ttype != StreamTokenizer.TT_EOF) {
                // Index/value pairs up to the closing brace (or a truncated stream).
                while (streamTokenizer.ttype != '}'
                        && streamTokenizer.ttype != StreamTokenizer.TT_EOF) {
                    int numAttribute = currentSparseIndex();
                    streamTokenizer.nextToken();
                    if (streamTokenizer.ttype == StreamTokenizer.TT_NUMBER) {
                        this.setSparseValue(instance, indexValues, attributeValues, numAttribute,
                                streamTokenizer.nval, true);
                    } else if (isValueToken()) {
                        boolean isNumeric = attributes.get(numAttribute).isNumeric();
                        double value = parseValue(numAttribute, streamTokenizer.sval, isNumeric);
                        this.setSparseValue(instance, indexValues, attributeValues, numAttribute,
                                value, isNumeric);
                    }
                    streamTokenizer.nextToken();
                }
                streamTokenizer.nextToken(); // consume '}'
            }
            streamTokenizer.nextToken(); // step past the end-of-line token
        } catch (IOException ex) {
            logIoError(ex);
        }
        int[] arrayIndexValues = new int[attributeValues.size()];
        double[] arrayAttributeValues = new double[attributeValues.size()];
        for (int i = 0; i < arrayIndexValues.length; i++) {
            arrayIndexValues[i] = indexValues.get(i).intValue();
            arrayAttributeValues[i] = attributeValues.get(i).doubleValue();
        }
        instance.addSparseValues(arrayIndexValues, arrayAttributeValues, this.instanceInformation.numAttributes());
        return instance;
    }

    /**
     * Records one sparse index/value pair. The class attribute is set directly
     * on {@code instance}; every other attribute is appended to the two
     * parallel lists.
     *
     * @param isNumber same meaning as in {@link #setValue}
     */
    private void setSparseValue(Instance instance, List<Integer> indexValues, List<Double> attributeValues,
            int numAttribute, double value, boolean isNumber) {
        double valueAttribute;
        if (isNumber && this.instanceInformation.attribute(numAttribute).isNominal) {
            valueAttribute = this.instanceInformation.attribute(numAttribute).indexOfValue(Double.toString(value));
        } else {
            valueAttribute = value;
        }
        if (this.instanceInformation.classIndex() == numAttribute) {
            instance.setClassValue(valueAttribute);
        } else {
            indexValues.add(numAttribute);
            attributeValues.add(valueAttribute);
        }
    }

    /**
     * Alternative to {@link #readInstanceSparse()} that reads a sparse data line
     * into a dense instance. Currently not called from within this class.
     */
    private Instance readDenseInstanceSparse() {
        Instance instance = new DenseInstance(this.instanceInformation.numAttributes() + 1);
        try {
            streamTokenizer.nextToken(); // consume '{'
            while (streamTokenizer.ttype != StreamTokenizer.TT_EOL
                    && streamTokenizer.ttype != StreamTokenizer.TT_EOF) {
                while (streamTokenizer.ttype != '}'
                        && streamTokenizer.ttype != StreamTokenizer.TT_EOF) {
                    // The tokenizer delivers digits as words, so the index cannot be taken from nval alone.
                    int numAttribute = currentSparseIndex();
                    streamTokenizer.nextToken();
                    if (streamTokenizer.ttype == StreamTokenizer.TT_NUMBER) {
                        this.setValue(instance, numAttribute, streamTokenizer.nval, true);
                    } else if (isValueToken()) {
                        boolean isNumeric = attributes.get(numAttribute).isNumeric();
                        double value = parseValue(numAttribute, streamTokenizer.sval, isNumeric);
                        this.setValue(instance, numAttribute, value, isNumeric);
                    }
                    streamTokenizer.nextToken();
                }
                streamTokenizer.nextToken(); // consume '}'
            }
            streamTokenizer.nextToken(); // step past the end-of-line token
        } catch (IOException ex) {
            logIoError(ex);
        }
        return instance;
    }

    /**
     * Parses the ARFF header up to and including the {@code @data} line and
     * fills {@link #attributes}.
     *
     * @return header information built from the relation name (default
     *         {@code "file stream"}) and the declared attributes
     */
    private InstanceInformation getHeader() {
        String relation = "file stream";
        attributes = new ArrayList<Attribute>();
        try {
            streamTokenizer.nextToken();
            while (streamTokenizer.ttype != StreamTokenizer.TT_EOF) {
                if (streamTokenizer.ttype == StreamTokenizer.TT_WORD && streamTokenizer.sval.startsWith("@")) {
                    // Locale.ROOT keeps keyword matching independent of the default locale
                    // (a Turkish locale would upper-case 'i' to a dotted capital I).
                    String token = streamTokenizer.sval.toUpperCase(Locale.ROOT);
                    if (token.startsWith("@RELATION")) {
                        streamTokenizer.nextToken();
                        relation = streamTokenizer.sval;
                    } else if (token.startsWith("@ATTRIBUTE")) {
                        streamTokenizer.nextToken();
                        String name = streamTokenizer.sval;
                        if (name == null) {
                            name = Double.toString(streamTokenizer.nval);
                        }
                        streamTokenizer.nextToken();
                        if (streamTokenizer.ttype == '{') {
                            // Nominal attribute: collect the labels between the braces.
                            streamTokenizer.nextToken();
                            List<String> attributeLabels = new ArrayList<String>();
                            // Stop at EOF as well, so a missing '}' cannot loop forever.
                            while (streamTokenizer.ttype != '}'
                                    && streamTokenizer.ttype != StreamTokenizer.TT_EOF) {
                                if (streamTokenizer.sval != null) {
                                    attributeLabels.add(streamTokenizer.sval);
                                } else {
                                    attributeLabels.add(Double.toString(streamTokenizer.nval));
                                }
                                streamTokenizer.nextToken();
                            }
                            attributes.add(new Attribute(name, attributeLabels));
                        } else {
                            // Any other declared type is registered by name only.
                            attributes.add(new Attribute(name));
                        }
                    } else if (token.startsWith("@DATA")) {
                        streamTokenizer.nextToken();
                        break;
                    }
                }
                streamTokenizer.nextToken();
            }
        } catch (IOException ex) {
            logIoError(ex);
        }
        return new InstanceInformation(relation, attributes);
    }

    /**
     * Builds the tokenizer over {@code reader}, parses the header and applies
     * the class index selected by {@link #classAttribute}.
     */
    private void initStreamTokenizer(Reader reader) {
        BufferedReader br = new BufferedReader(reader);

        streamTokenizer = new StreamTokenizer(br);
        streamTokenizer.resetSyntax();
        streamTokenizer.whitespaceChars(0, ' ');
        // Everything printable, digits included, is part of a word: numbers arrive as words.
        streamTokenizer.wordChars(' ' + 1, '\u00FF');
        streamTokenizer.whitespaceChars(',', ',');
        streamTokenizer.commentChar('%');
        streamTokenizer.quoteChar('"');
        streamTokenizer.quoteChar('\'');
        streamTokenizer.ordinaryChar('{');
        streamTokenizer.ordinaryChar('}');
        streamTokenizer.eolIsSignificant(true);

        this.instanceInformation = this.getHeader();
        if (classAttribute < 0) {
            this.instanceInformation.setClassIndex(this.instanceInformation.numAttributes() - 1);
        } else if (classAttribute > 0) {
            this.instanceInformation.setClassIndex(classAttribute - 1);
        }
    }

    /** Tells whether the current token carries a textual value (plain word or quoted string). */
    private boolean isValueToken() {
        if (streamTokenizer.sval == null) {
            return false;
        }
        int type = streamTokenizer.ttype;
        return type == StreamTokenizer.TT_WORD || type == DOUBLE_QUOTE || type == SINGLE_QUOTE;
    }

    /**
     * Converts a textual value token into the stored double: NaN for the
     * missing-value marker {@code ?}, the parsed number for numeric attributes,
     * and the label index otherwise.
     *
     * @throws NumberFormatException if a numeric attribute's token is not a number
     */
    private double parseValue(int numAttribute, String token, boolean isNumeric) {
        if ("?".equals(token)) {
            return Double.NaN;
        }
        if (isNumeric) {
            return Double.valueOf(token).doubleValue();
        }
        return this.instanceInformation.attribute(numAttribute).indexOfValue(token);
    }

    /**
     * Reads the attribute index of a sparse pair from the current token.
     *
     * @throws NumberFormatException if the token is not an integer
     */
    private int currentSparseIndex() {
        if (streamTokenizer.ttype == StreamTokenizer.TT_NUMBER) {
            return (int) streamTokenizer.nval;
        }
        return Integer.parseInt(streamTokenizer.sval);
    }

    /** Logs an I/O failure the same way throughout the class. */
    private void logIoError(IOException ex) {
        Logger.getLogger(ArffLoader.class.getName()).log(Level.SEVERE, null, ex);
    }
}