net.sourceforge.cilib.io.ARFFFileReader Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of cilib-library Show documentation
A library of composable components enabling simpler Computational Intelligence
The newest version!
/**           __  __
 *    _____ _/ /_/ /_    Computational Intelligence Library (CIlib)
 *   / ___/ / / / __ \   (c) CIRG @ UP
 *  / /__/ / / / /_/ /   http://cilib.net
 *  \___/_/_/_/_.___/
 */
package net.sourceforge.cilib.io;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map.Entry;
import java.util.Set;
import net.sourceforge.cilib.io.exception.CIlibIOException;
import net.sourceforge.cilib.type.types.Int;
import net.sourceforge.cilib.type.types.Real;
import net.sourceforge.cilib.type.types.StringType;
import net.sourceforge.cilib.type.types.Type;

/**
 * An implementation of an ARFF file reader according to:
 * {@code http://www.cs.waikato.ac.nz/~ml/weka/arff.html}. Currently the reader
 * does not support the DATE format as no CIlib equivalent type exists.
 */
public class ARFFFileReader extends FileReader> {

    private ArrayList columnTypePrototypes;
    private ArrayList columnNames;
    private String relationName;
    private String headerDelimiter;
    private String dataDelimiter;
    private String commentPrefix;
    private HashMap> columnToNominalAttributesMap;

    /**
     * Initiates the reader, sets the default delimiters to a space (for header
     * fields) and comma (for data fields).
     * @throws CIlibIOException
     * {@inheritDoc}
     */
    @Override
    public void open() throws CIlibIOException {
        super.open();
        headerDelimiter = "[\\s]+";
        dataDelimiter = "\\,";
        commentPrefix = "%";
        columnTypePrototypes = new ArrayList();
        columnNames = new ArrayList();
        this.processHeader();
    }

    /**
     * Reads and returns the next data row in the file as a list of type
     * {@link Type}. Each line read is assumed to be a row and the tokens parsed
     * (by using the {@link #dataDelimiter dataDelimiter}) are best-effort
     * fashion according to the data header.
     * @return a list containing the row data.
     */
    @Override
    public List nextRow() {
        String line = this.nextLine();
        String[] tokens = line.split(dataDelimiter);
        if (tokens.length != columnTypePrototypes.size()) {
            throw new UnsupportedOperationException("Error: Not all attributes specified." +
                    "Expected @attribute declaration for each column");
        }
        ArrayList row = new ArrayList();
        for (int i = 0; i < tokens.length; i++) {
            String data = tokens[i];
            row.add(this.mapTokenToType(i, data));
        }
        return row;
    }

    /**
     * Processes the header to determine the required type information.
     * @throws CIlibIOException
     * {@inheritDoc}
     */
    private void processHeader() throws CIlibIOException {
        String line = getNextLineIgnoreComments();
        String[] tokens;
        if (line.toUpperCase().startsWith("@RELATION")) {
            tokens = line.split(headerDelimiter);
            if (tokens.length < 2) {
                throw new CIlibIOException("@RELATION declaration must be followed" +
                        "by whitespace and relation name.");
            }
            relationName = tokens[1];
        }

        int columnCount = 0;
        line = getNextLineIgnoreComments();
        while (!line.equalsIgnoreCase("@DATA")) {
            tokens = line.split(headerDelimiter);
            if (!tokens[0].equalsIgnoreCase("@ATTRIBUTE")) {
                throw new CIlibIOException("Expected @ATTRIBUTE declaration, found: " + tokens[0]);
            }
            if (tokens.length < 3) {
                throw new CIlibIOException("@ATTRIBUTE declaration must be followed" +
                        "by  ");
            }
            String attributeName = tokens[1];
            columnNames.add(attributeName);
            Type type = getTypeData(columnCount, tokens[2]);
            columnTypePrototypes.add(type);
            line = getNextLineIgnoreComments();
            columnCount++;
        }
    }

    /**
     * Gets the next line, ignoring comment lines. As determined by the comment
     * prefix.
     * @return the next non-empty non-comment line.
     */
    private String getNextLineIgnoreComments() {
        String line = this.nextLine();
        while (line.startsWith(commentPrefix) || line.isEmpty()) {
            line = this.nextLine();
        }
        return line.trim();
    }

    /**
     * Given the datatype key, determines the datatype for a given column. Also
     * sets up the HashTables to map a nominal attribute to a corresponding integer
     * number.
     * @param columnn the column to map to a datatype.
     * @param datatype the datatype key.
     * @return a corresponding CIlib type.
     * @throws CIlibIOException
     * {@inheritDoc}
     */
    private Type getTypeData(int columnn, String datatype) throws CIlibIOException {
        if (datatype.equalsIgnoreCase("NUMERIC")) {
            return Real.valueOf(0.0);
        }
        if (datatype.equalsIgnoreCase("STRING")) {
            return new StringType("");
        }
        if (datatype.equalsIgnoreCase("DATE")) {
            throw new UnsupportedOperationException("Date format currently not supported" +
                    " in CIlib.");
        }

        //If none of the above, has to be a nominal attribute.
        if (columnToNominalAttributesMap == null) {
            columnToNominalAttributesMap = new HashMap>();
        }
        HashMap nominalMap = new HashMap();
        datatype = datatype.replaceAll("[{}]", "");
        String[] nominalAttributes = datatype.split("\\,");
        if (nominalAttributes.length == 0) {
            throw new CIlibIOException("Nominal attributes must be comma separated:" +
                    "{, , , ...} ");
        }
        for (int i = 0; i < nominalAttributes.length; i++) {
            String nominalAttribute = nominalAttributes[i];
            nominalMap.put(nominalAttribute, i);
        }
        columnToNominalAttributesMap.put(columnn, nominalMap);
        return Int.valueOf(0);
    }

    /**
     * Puts a token into a new object of the correct CIlib type.
     * @param index the index of the token in the row (its column).
     * @param token the token to be typed.
     * @return a new CIlib object of the correct type.
     */
    private Type mapTokenToType(int index, String token) {
        Type type = columnTypePrototypes.get(index);
        if (type instanceof Real) {
            return Real.valueOf(Double.parseDouble(token));
        }

        if (type instanceof StringType) {
            return new StringType(token);
        }

        //If none of the above, has to be a nominal attribute.
        HashMap nominalMap = this.columnToNominalAttributesMap.get(index);
        return Int.valueOf(nominalMap.get(token));
    }

    /**
     * Gets the delimiter used to separate fields in the file's header.
     * @return the header delimiter.
     */
    public String getHeaderDelimiter() {
        return headerDelimiter;
    }

    /**
     * Sets the delimiter used to separate fields in the file's header.
     * @param headerDelimiter the header delimiter.
     */
    public void setHeaderDelimiter(String headerDelimiter) {
        this.headerDelimiter = headerDelimiter;
    }

    /**
     * The name of the ARFF relation (dataset).
     * @return the relation's name.
     */
    public String getRelationName() {
        return relationName;
    }

    /**
     * Sets the ARFF relation's name.
     * @param relationName the relation's name.
     */
    public void setRelationName(String relationName) {
        this.relationName = relationName;
    }

    /**
     * Gets the names of the columns of the dataset.
     * @return the column's names.
     */
    @Override
    public ArrayList getColumnNames() {
        return columnNames;
    }

    /**
     * Sets the names of the columns of the dataset.
     * @param columnNames the column's names.
     */
    public void setColumnNames(ArrayList columnNames) {
        this.columnNames = columnNames;
    }

    /**
     * Gets the delimiter used to identify fields in the data.
     * @return the delimiter used in the data section of the file.
     */
    public String getDataDelimiter() {
        return dataDelimiter;
    }

    /**
     * Sets the delimiter used to identify fields in the data.
     * @param dataDelimiter the delimiter used in the data section of the file.
     */
    public void setDataDelimiter(String dataDelimiter) {
        this.dataDelimiter = dataDelimiter;
    }

    /**
     * For a specific nominal integer value in a nominal attribute column, this
     * method retrieves the value's corresponding nominal string representation.
     * @param column the column of the nominal attribute.
     * @param nominalKey the integer nominal key.
     * @return the string nominal value.
     */
    public String getNominalString(int column, Type nominalKey) {
        int nominalKeyInt;
        try {
            nominalKeyInt = ((Int) nominalKey).intValue();
        } catch (ClassCastException ex) {
            throw new ClassCastException("Nominal key must be CIlib Int object.");
        }
        Set> nominalKeySet = this.columnToNominalAttributesMap.get(column).entrySet();
        for (Entry entry : nominalKeySet) {
            if (entry.getValue().compareTo(nominalKeyInt) == 0) {
                return entry.getKey();
            }
        }
        return "Nominal Key not found.";
    }
}