All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.yahoo.labs.samoa.instances.ArffLoader Maven / Gradle / Ivy

The newest version!
package com.yahoo.labs.samoa.instances;

/*
 * #%L
 * SAMOA
 * %%
 * Copyright (C) 2013 Yahoo! Inc.
 * %%
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *      http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * #L%
 */
import java.io.BufferedReader;
import java.io.IOException;
import java.io.Reader;
import java.io.Serializable;
import java.io.StreamTokenizer;
import java.util.ArrayList;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;

/**
 *
 * @author abifet
 */
public class ArffLoader implements Serializable {

    protected InstanceInformation instanceInformation;

    transient protected StreamTokenizer streamTokenizer;

    protected Reader reader;

    protected int size;

    protected int classAttribute;

    public ArffLoader() {
    }

    public ArffLoader(Reader reader, int size, int classAttribute) {
        this.reader = reader;
        this.size = size;
        this.classAttribute = classAttribute;
        initStreamTokenizer(reader);
    }

    public InstanceInformation getStructure() {
        return this.instanceInformation;
    }

    public Instance readInstance(Reader reader) {
        if (streamTokenizer == null) {
            initStreamTokenizer(reader);
        }
        while (streamTokenizer.ttype == StreamTokenizer.TT_EOL) {
            try {
                streamTokenizer.nextToken();
            } catch (IOException ex) {
                Logger.getLogger(ArffLoader.class.getName()).log(Level.SEVERE, null, ex);
            }
        }
        if (streamTokenizer.ttype == '{') {
            return readInstanceSparse();
            // return readDenseInstanceSparse();
        } else {
            return readInstanceDense();
        }

    }

    public Instance readInstanceDense() {
        Instance instance = new DenseInstance(this.instanceInformation.numAttributes() + 1);
        //System.out.println(this.instanceInformation.numAttributes());
        int numAttribute = 0;
        try {
            while (numAttribute == 0 && streamTokenizer.ttype != StreamTokenizer.TT_EOF) {
                //For each line
                while (streamTokenizer.ttype != StreamTokenizer.TT_EOL
                        && streamTokenizer.ttype != StreamTokenizer.TT_EOF) {
                    //For each item
                    if (streamTokenizer.ttype == StreamTokenizer.TT_NUMBER) {
                        //System.out.println(streamTokenizer.nval + "Num ");
                        this.setValue(instance, numAttribute, streamTokenizer.nval, true);
                        numAttribute++;

                    } else if (streamTokenizer.sval != null && (streamTokenizer.ttype == StreamTokenizer.TT_WORD
                            || streamTokenizer.ttype == 34)) {
                        //System.out.println(streamTokenizer.sval + "Str");
                        boolean isNumeric = attributes.get(numAttribute).isNumeric();
                        double value;
                        if ("?".equals(streamTokenizer.sval)) {
                            value = Double.NaN; //Utils.missingValue();
                        } else if (isNumeric == true) {
                            value = Double.valueOf(streamTokenizer.sval).doubleValue();
                        } else {
                            value = this.instanceInformation.attribute(numAttribute).indexOfValue(streamTokenizer.sval);
                        }

                        this.setValue(instance, numAttribute, value, isNumeric);
                        numAttribute++;
                    }
                    streamTokenizer.nextToken();
                }
                streamTokenizer.nextToken();
                //System.out.println("EOL");
            }


        } catch (IOException ex) {
            Logger.getLogger(ArffLoader.class.getName()).log(Level.SEVERE, null, ex);
        }
        return (numAttribute > 0) ? instance : null;
    }

    private void setValue(Instance instance, int numAttribute, double value, boolean isNumber) {
        double valueAttribute;
        if (isNumber && this.instanceInformation.attribute(numAttribute).isNominal) {
            valueAttribute = this.instanceInformation.attribute(numAttribute).indexOfValue(Double.toString(value));
            //System.out.println(value +"/"+valueAttribute+" ");

        } else {
            valueAttribute = value;
            //System.out.println(value +"/"+valueAttribute+" ");
        }
        if (this.instanceInformation.classIndex() == numAttribute) {
            instance.setClassValue(valueAttribute);
            //System.out.println(value +"<"+this.instanceInformation.classIndex()+">");
        } else {
            instance.setValue(numAttribute, valueAttribute);
        }
    }

    private Instance readInstanceSparse() {
        //Return a Sparse Instance
        Instance instance = new SparseInstance(1.0, null); //(this.instanceInformation.numAttributes() + 1);
        //System.out.println(this.instanceInformation.numAttributes());
        int numAttribute;
        ArrayList attributeValues = new ArrayList();
        List indexValues = new ArrayList();
        try {
            //while (streamTokenizer.ttype != StreamTokenizer.TT_EOF) {
            streamTokenizer.nextToken(); // Remove the '{' char
            //For each line
            while (streamTokenizer.ttype != StreamTokenizer.TT_EOL
                    && streamTokenizer.ttype != StreamTokenizer.TT_EOF) {
                while (streamTokenizer.ttype != '}') {
                    //For each item
                    //streamTokenizer.nextToken();
                    //while (streamTokenizer.ttype != '}'){
                    //System.out.println(streamTokenizer.nval +"-"+ streamTokenizer.sval);
                    //numAttribute = (int) streamTokenizer.nval;
                    if (streamTokenizer.ttype == StreamTokenizer.TT_NUMBER) {
                        numAttribute = (int) streamTokenizer.nval;
                    } else {
                        numAttribute = Integer.parseInt(streamTokenizer.sval);
                    }
                    streamTokenizer.nextToken();

                    if (streamTokenizer.ttype == StreamTokenizer.TT_NUMBER) {
                        //System.out.print(streamTokenizer.nval + " ");
                        this.setSparseValue(instance, indexValues, attributeValues, numAttribute, streamTokenizer.nval, true);
                        //numAttribute++;

                    } else if (streamTokenizer.sval != null && (streamTokenizer.ttype == StreamTokenizer.TT_WORD
                            || streamTokenizer.ttype == 34)) {
                        //System.out.print(streamTokenizer.sval + "-");
                        if (attributes.get(numAttribute).isNumeric()) {
                            this.setSparseValue(instance, indexValues, attributeValues, numAttribute, Double.valueOf(streamTokenizer.sval).doubleValue(), true);
                        } else {
                            this.setSparseValue(instance, indexValues, attributeValues, numAttribute, this.instanceInformation.attribute(numAttribute).indexOfValue(streamTokenizer.sval), false);
                        }
                    }
                    streamTokenizer.nextToken();
                }
                streamTokenizer.nextToken(); //Remove the '}' char
            }
            streamTokenizer.nextToken();
            //System.out.println("EOL");
            //}


        } catch (IOException ex) {
            Logger.getLogger(ArffLoader.class.getName()).log(Level.SEVERE, null, ex);
        }
        int[] arrayIndexValues = new int[attributeValues.size()];
        double[] arrayAttributeValues = new double[attributeValues.size()];
        for (int i = 0; i < arrayIndexValues.length; i++) {
            arrayIndexValues[i] = indexValues.get(i).intValue();
            arrayAttributeValues[i] = attributeValues.get(i).doubleValue();
        }
        instance.addSparseValues(arrayIndexValues, arrayAttributeValues, this.instanceInformation.numAttributes());
        return instance;

    }

    private void setSparseValue(Instance instance, List indexValues, List attributeValues, int numAttribute, double value, boolean isNumber) {
        double valueAttribute;
        if (isNumber && this.instanceInformation.attribute(numAttribute).isNominal) {
            valueAttribute = this.instanceInformation.attribute(numAttribute).indexOfValue(Double.toString(value));
        } else {
            valueAttribute = value;
        }
        if (this.instanceInformation.classIndex() == numAttribute) {
            instance.setClassValue(valueAttribute);
        } else {
            //instance.setValue(numAttribute, valueAttribute);
            indexValues.add(numAttribute);
            attributeValues.add(valueAttribute);
        }
        //System.out.println(numAttribute+":"+valueAttribute+","+this.instanceInformation.classIndex()+","+value);
    }

    private Instance readDenseInstanceSparse() {
        //Returns a dense instance
        Instance instance = new DenseInstance(this.instanceInformation.numAttributes() + 1);
        //System.out.println(this.instanceInformation.numAttributes());
        int numAttribute;
        try {
            //while (streamTokenizer.ttype != StreamTokenizer.TT_EOF) {
            streamTokenizer.nextToken(); // Remove the '{' char
            //For each line
            while (streamTokenizer.ttype != StreamTokenizer.TT_EOL
                    && streamTokenizer.ttype != StreamTokenizer.TT_EOF) {
                while (streamTokenizer.ttype != '}') {
                    //For each item
                    //streamTokenizer.nextToken();
                    //while (streamTokenizer.ttype != '}'){
                    //System.out.print(streamTokenizer.nval+":");
                    numAttribute = (int) streamTokenizer.nval;
                    streamTokenizer.nextToken();

                    if (streamTokenizer.ttype == StreamTokenizer.TT_NUMBER) {
                        //System.out.print(streamTokenizer.nval + " ");
                        this.setValue(instance, numAttribute, streamTokenizer.nval, true);
                        //numAttribute++;

                    } else if (streamTokenizer.sval != null && (streamTokenizer.ttype == StreamTokenizer.TT_WORD
                            || streamTokenizer.ttype == 34)) {
                        //System.out.print(streamTokenizer.sval + "/"+this.instanceInformation.attribute(numAttribute).indexOfValue(streamTokenizer.sval)+" ");
                        if (attributes.get(numAttribute).isNumeric()) {
                            this.setValue(instance, numAttribute, Double.valueOf(streamTokenizer.sval).doubleValue(), true);
                        } else {
                            this.setValue(instance, numAttribute, this.instanceInformation.attribute(numAttribute).indexOfValue(streamTokenizer.sval), false);
                            //numAttribute++;
                        }
                    }
                    streamTokenizer.nextToken();
                }
                streamTokenizer.nextToken(); //Remove the '}' char
            }
            streamTokenizer.nextToken();
            //System.out.println("EOL");
            //}


        } catch (IOException ex) {
            Logger.getLogger(ArffLoader.class.getName()).log(Level.SEVERE, null, ex);
        }
        return instance;
    }

    protected List attributes;

    private InstanceInformation getHeader() {

        String relation = "file stream";
        //System.out.println("RELATION " + relation);
        attributes = new ArrayList();
        try {
            streamTokenizer.nextToken();
            while (streamTokenizer.ttype != StreamTokenizer.TT_EOF) {
                //For each line
                //if (streamTokenizer.ttype == '@') {
                if (streamTokenizer.ttype == StreamTokenizer.TT_WORD && streamTokenizer.sval.startsWith("@") == true) {
                    //streamTokenizer.nextToken();
                    String token = streamTokenizer.sval.toUpperCase();
                    if (token.startsWith("@RELATION")) {
                        streamTokenizer.nextToken();
                        relation = streamTokenizer.sval;
                        //System.out.println("RELATION " + relation);
                    } else if (token.startsWith("@ATTRIBUTE")) {
                        streamTokenizer.nextToken();
                        String name = streamTokenizer.sval;
                        //System.out.println("* " + name);
                        if (name == null) {
                            name = Double.toString(streamTokenizer.nval);
                        }
                        streamTokenizer.nextToken();
                        String type = streamTokenizer.sval;
                        //System.out.println("* " + name + ":" + type + " ");
                        if (streamTokenizer.ttype == '{') {
                            streamTokenizer.nextToken();
                            List attributeLabels = new ArrayList();
                            while (streamTokenizer.ttype != '}') {

                                if (streamTokenizer.sval != null) {
                                    attributeLabels.add(streamTokenizer.sval);
                                    //System.out.print(streamTokenizer.sval + ",");
                                } else {
                                    attributeLabels.add(Double.toString(streamTokenizer.nval));
                                    //System.out.print(streamTokenizer.nval + ",");
                                }

                                streamTokenizer.nextToken();
                            }
                            //System.out.println();
                            attributes.add(new Attribute(name, attributeLabels));
                        } else {
                            // Add attribute
                            attributes.add(new Attribute(name));
                        }

                    } else if (token.startsWith("@DATA")) {
                        //System.out.print("END");
                        streamTokenizer.nextToken();
                        break;
                    }
                }
                streamTokenizer.nextToken();
            }

        } catch (IOException ex) {
            Logger.getLogger(ArffLoader.class.getName()).log(Level.SEVERE, null, ex);
        }
        return new InstanceInformation(relation, attributes);
    }

    private void initStreamTokenizer(Reader reader) {
        BufferedReader br = new BufferedReader(reader);

        //Init streamTokenizer
        streamTokenizer = new StreamTokenizer(br);

        streamTokenizer.resetSyntax();
        streamTokenizer.whitespaceChars(0, ' ');
        streamTokenizer.wordChars(' ' + 1, '\u00FF');
        streamTokenizer.whitespaceChars(',', ',');
        streamTokenizer.commentChar('%');
        streamTokenizer.quoteChar('"');
        streamTokenizer.quoteChar('\'');
        streamTokenizer.ordinaryChar('{');
        streamTokenizer.ordinaryChar('}');
        streamTokenizer.eolIsSignificant(true);

        this.instanceInformation = this.getHeader();
        if (classAttribute < 0) {
            this.instanceInformation.setClassIndex(this.instanceInformation.numAttributes() - 1);
            //System.out.print(this.instanceInformation.classIndex());
        } else if (classAttribute > 0) {
            this.instanceInformation.setClassIndex(classAttribute - 1);
        }
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy