/*******************************************************************************
* Copyright (c) 2010-2020 Haifeng Li. All rights reserved.
*
* Smile is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as
* published by the Free Software Foundation, either version 3 of
* the License, or (at your option) any later version.
*
* Smile is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with Smile. If not, see <https://www.gnu.org/licenses/>.
******************************************************************************/
package smile.io;
import java.io.*;
import java.net.URISyntaxException;
import java.nio.charset.Charset;
import java.nio.file.Files;
import java.nio.file.Path;
import java.text.ParseException;
import java.util.*;
import java.util.function.Function;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import smile.data.DataFrame;
import smile.data.Tuple;
import smile.data.measure.NominalScale;
import smile.data.type.*;
/**
* Weka ARFF (attribute-relation file format) is an ASCII
* text file format that is essentially a CSV file with a header that
* describes the meta-data. ARFF was developed for use in the Weka
* machine learning software.
*
* A dataset is first described, beginning with the name of the dataset
* (or the relation in ARFF terminology). Each of the variables (or
* attributes in ARFF terminology) used to describe the observations
* is then identified, together with its data type, each definition
* on a single line.
* The actual observations are then listed, each on a single line, with fields
* separated by commas, much like a CSV file.
*
* Missing values in an ARFF dataset are identified using the question mark '?'.
*
* Comments can be included in the file, introduced at the beginning of a line
* with a '%', whereby the remainder of the line is ignored.
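*
* For example, a tiny hypothetical ARFF file, showing a comment, the
* header, and a missing value marked by '?':
* <pre>{@code
* % A toy weather data set.
* @relation weather
* @attribute outlook {sunny, overcast, rainy}
* @attribute temperature numeric
* @attribute play {yes, no}
* @data
* sunny, 85, no
* overcast, ?, yes
* }</pre>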
*
* A significant advantage of an ARFF file over a CSV file is the
* meta-data information: the schema travels with the data.
*
* Also, the ability to include comments ensures we can record extra
* information about the data set, including how it was derived, where
* it came from, and how it might be cited.
*
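* A minimal sketch of reading an ARFF file into a DataFrame (the file
* name here is hypothetical):
* <pre>{@code
* try (Arff arff = new Arff(Paths.get("weather.arff"))) {
*     DataFrame df = arff.read();
* }
* }</pre>
*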
* @author Haifeng Li
*/
public class Arff implements AutoCloseable {
private static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(Arff.class);
/** The keyword used to denote the start of an arff header */
private static final String ARFF_RELATION = "@relation";
/** The keyword used to denote the start of the arff data section */
private static final String ARFF_DATA = "@data";
/** The keyword used to denote the start of an arff attribute declaration */
private static final String ARFF_ATTRIBUTE = "@attribute";
/** A keyword used to denote a numeric attribute */
private static final String ARFF_ATTRIBUTE_INTEGER = "integer";
/** A keyword used to denote a numeric attribute */
private static final String ARFF_ATTRIBUTE_REAL = "real";
/** A keyword used to denote a numeric attribute */
private static final String ARFF_ATTRIBUTE_NUMERIC = "numeric";
/** The keyword used to denote a string attribute */
private static final String ARFF_ATTRIBUTE_STRING = "string";
/** The keyword used to denote a date attribute */
private static final String ARFF_ATTRIBUTE_DATE = "date";
/** The keyword used to denote a relation-valued attribute */
private static final String ARFF_ATTRIBUTE_RELATIONAL = "relational";
/** The keyword used to denote the end of the declaration of a subrelation */
private static final String ARFF_END_SUBRELATION = "@end";
private static final String PREMATURE_END_OF_FILE = "premature end of file";
/** Buffered file reader. */
private Reader reader;
/** The tokenizer to parse the file. */
private StreamTokenizer tokenizer;
/** The name of ARFF relation. */
private String name;
/** The schema of ARFF relation. */
private StructType schema;
/** The lambda to parse fields. */
private List<Function<String, Object>> parser;
/** Attribute name path in case of sub-relations. */
private String path = "";
/**
* Constructor.
* @param path the input file path or URI.
*/
public Arff(String path) throws IOException, ParseException, URISyntaxException {
this(Input.reader(path));
}
/**
* Constructor.
* @param path the input file path or URI.
* @param charset the charset of the file.
*/
public Arff(String path, Charset charset) throws IOException, ParseException, URISyntaxException {
this(Input.reader(path, charset));
}
/**
* Constructor.
* @param path the input file path.
*/
public Arff(Path path) throws IOException, ParseException {
this(Files.newBufferedReader(path));
}
/**
* Constructor.
* @param path the input file path.
* @param charset the charset of the file.
*/
public Arff(Path path, Charset charset) throws IOException, ParseException {
this(Files.newBufferedReader(path, charset));
}
/**
* Constructor.
* @param reader the reader of the ARFF file.
*/
public Arff(Reader reader) throws IOException, ParseException {
this.reader = reader;
tokenizer = new StreamTokenizer(reader);
tokenizer.resetSyntax();                // start from a clean syntax table
tokenizer.whitespaceChars(0, ' ');      // control characters and spaces are whitespace
tokenizer.wordChars(' ' + 1, '\u00FF'); // printable characters form words
tokenizer.whitespaceChars(',', ',');    // commas separate fields
tokenizer.commentChar('%');             // '%' starts a comment to the end of line
tokenizer.quoteChar('"');               // double-quoted strings
tokenizer.quoteChar('\'');              // single-quoted strings
tokenizer.ordinaryChar('{');            // braces delimit nominal specs and sparse instances
tokenizer.ordinaryChar('}');
tokenizer.eolIsSignificant(true);       // report end-of-line as a token
readHeader();
}
@Override
public void close() throws IOException {
reader.close();
}
/** Returns the name of the relation. */
public String name() {
return name;
}
/**
* Returns the schema of the relation.
*/
public StructType schema() {
return schema;
}
/**
* Gets next token, skipping empty lines.
*
* @throws IOException if reading the next token fails
*/
private void getFirstToken() throws IOException {
while (tokenizer.nextToken() == StreamTokenizer.TT_EOL) {
// empty lines
}
if ((tokenizer.ttype == '\'') || (tokenizer.ttype == '"')) {
tokenizer.ttype = StreamTokenizer.TT_WORD;
} else if ((tokenizer.ttype == StreamTokenizer.TT_WORD) && (tokenizer.sval.equals("?"))) {
tokenizer.ttype = '?';
}
}
/**
* Gets token and checks if it's end of line.
*
* @param endOfFileOk true if EOF is OK
* @throws ParseException if it doesn't find an end of line
*/
private void getLastToken(boolean endOfFileOk) throws IOException, ParseException {
if ((tokenizer.nextToken() != StreamTokenizer.TT_EOL) && ((tokenizer.ttype != StreamTokenizer.TT_EOF) || !endOfFileOk)) {
throw new ParseException("end of line expected", tokenizer.lineno());
}
}
/**
* Gets next token, checking for a premature end of line.
*
* @throws ParseException if it finds a premature end of line
*/
private void getNextToken() throws IOException, ParseException {
if (tokenizer.nextToken() == StreamTokenizer.TT_EOL) {
throw new ParseException("premature end of line", tokenizer.lineno());
}
if (tokenizer.ttype == StreamTokenizer.TT_EOF) {
throw new ParseException(PREMATURE_END_OF_FILE, tokenizer.lineno());
} else if ((tokenizer.ttype == '\'') || (tokenizer.ttype == '"')) {
tokenizer.ttype = StreamTokenizer.TT_WORD;
} else if ((tokenizer.ttype == StreamTokenizer.TT_WORD) && (tokenizer.sval.equals("?"))) {
tokenizer.ttype = '?';
}
}
/**
* Reads and stores header of an ARFF file.
*
* @throws ParseException if the header is not read successfully
*/
private void readHeader() throws IOException, ParseException {
List<StructField> fields = new ArrayList<>();
// Get name of relation.
getFirstToken();
if (tokenizer.ttype == StreamTokenizer.TT_EOF) {
throw new ParseException(PREMATURE_END_OF_FILE, tokenizer.lineno());
}
if (ARFF_RELATION.equalsIgnoreCase(tokenizer.sval)) {
getNextToken();
name = tokenizer.sval;
logger.info("Read ARFF relation {}", name);
getLastToken(false);
} else {
throw new ParseException("keyword " + ARFF_RELATION + " expected", tokenizer.lineno());
}
// Get attribute declarations.
getFirstToken();
if (tokenizer.ttype == StreamTokenizer.TT_EOF) {
throw new ParseException(PREMATURE_END_OF_FILE, tokenizer.lineno());
}
while (ARFF_ATTRIBUTE.equalsIgnoreCase(tokenizer.sval)) {
StructField attribute = nextAttribute();
// We may meet a relational attribute, for which nextAttribute returns
// null as it flattens the relational attribute out.
if (attribute != null) {
fields.add(attribute);
}
}
// Check if data part follows. We can't easily check for EOL.
if (!ARFF_DATA.equalsIgnoreCase(tokenizer.sval)) {
throw new ParseException("keyword " + ARFF_DATA + " expected", tokenizer.lineno());
}
// Check if any attributes have been declared.
if (fields.isEmpty()) {
throw new ParseException("no attributes declared", tokenizer.lineno());
}
schema = DataTypes.struct(fields);
parser = schema.parser();
}
/**
* Reads the attribute declaration.
*
* @return an attribute in this relation, or null for a relational
* attribute whose sub-attributes are flattened to the top level
* @throws IOException if the information is not read successfully
*/
private StructField nextAttribute() throws IOException, ParseException {
StructField attribute = null;
// Get attribute name.
getNextToken();
String name = attributeName(tokenizer.sval);
getNextToken();
// Check if attribute is nominal.
if (tokenizer.ttype == StreamTokenizer.TT_WORD) {
// Attribute is numeric, real, integer, string, date, or relational.
if (tokenizer.sval.equalsIgnoreCase(ARFF_ATTRIBUTE_NUMERIC)) {
attribute = new StructField(name, DataTypes.DoubleType);
readTillEOL();
} else if (tokenizer.sval.equalsIgnoreCase(ARFF_ATTRIBUTE_REAL)) {
attribute = new StructField(name, DataTypes.FloatType);
readTillEOL();
} else if (tokenizer.sval.equalsIgnoreCase(ARFF_ATTRIBUTE_INTEGER)) {
attribute = new StructField(name, DataTypes.IntegerType);
readTillEOL();
} else if (tokenizer.sval.equalsIgnoreCase(ARFF_ATTRIBUTE_STRING)) {
attribute = new StructField(name, DataTypes.StringType);
readTillEOL();
} else if (tokenizer.sval.equalsIgnoreCase(ARFF_ATTRIBUTE_DATE)) {
if (tokenizer.nextToken() != StreamTokenizer.TT_EOL) {
if ((tokenizer.ttype != StreamTokenizer.TT_WORD) && (tokenizer.ttype != '\'') && (tokenizer.ttype != '\"')) {
throw new ParseException("not a valid date format", tokenizer.lineno());
}
attribute = new StructField(name, DataTypes.datetime(tokenizer.sval));
readTillEOL();
} else {
attribute = new StructField(name, DataTypes.DateTimeType);
tokenizer.pushBack();
}
readTillEOL();
} else if (tokenizer.sval.equalsIgnoreCase(ARFF_ATTRIBUTE_RELATIONAL)) {
logger.info("Encounter relational attribute {}. Flat out its attributes to top level.", name);
path = path + "." + name;
readTillEOL();
} else if (tokenizer.sval.equalsIgnoreCase(ARFF_END_SUBRELATION)) {
path = path.substring(0, path.lastIndexOf('.'));
getNextToken();
} else {
throw new ParseException("Invalid attribute type or invalid enumeration", tokenizer.lineno());
}
} else {
// Attribute is nominal.
List<String> attributeValues = new ArrayList<>();
tokenizer.pushBack();
// Get values for nominal attribute.
if (tokenizer.nextToken() != '{') {
throw new ParseException("{ expected at beginning of enumeration", tokenizer.lineno());
}
while (tokenizer.nextToken() != '}') {
if (tokenizer.ttype == StreamTokenizer.TT_EOL) {
throw new ParseException("} expected at end of enumeration", tokenizer.lineno());
} else {
attributeValues.add(tokenizer.sval.trim());
}
}
NominalScale scale = new NominalScale(attributeValues);
attribute = new StructField(name, scale.type(), scale);
}
getLastToken(false);
getFirstToken();
if (tokenizer.ttype == StreamTokenizer.TT_EOF) {
throw new ParseException(PREMATURE_END_OF_FILE, tokenizer.lineno());
}
return attribute;
}
/** Returns the attribute name, qualified by the sub-relation path if any. */
private String attributeName(String name) {
return path.length() == 0 ? name : path + "." + name;
}
/**
* Reads and skips all tokens before next end of line token.
*
* @throws IOException in case something goes wrong
*/
private void readTillEOL() throws IOException {
while (tokenizer.nextToken() != StreamTokenizer.TT_EOL) {
// skip all the tokens before EOL
}
// push back the EOL token
tokenizer.pushBack();
}
/**
* Reads all the records.
* @return the data frame.
*/
public DataFrame read() throws IOException, ParseException {
return read(Integer.MAX_VALUE);
}
/**
* Reads a limited number of records.
* @param limit the maximum number of records to read.
* @return the data frame.
*/
public DataFrame read(int limit) throws IOException, ParseException {
if (limit <= 0) {
throw new IllegalArgumentException("Invalid limit: " + limit);
}
List<Tuple> rows = new ArrayList<>();
for (int i = 0; i < limit; i++) {
// Check if end of file reached.
getFirstToken();
if (tokenizer.ttype == StreamTokenizer.TT_EOF) {
break;
}
// Parse instance
Object[] row = tokenizer.ttype == '{' ? readSparseInstance() : readInstance();
rows.add(Tuple.of(row, schema));
}
schema = schema.boxed(rows);
return DataFrame.of(rows, schema);
}
/**
* Reads a single instance.
* @throws ParseException if the information is not read successfully
*/
private Object[] readInstance() throws IOException, ParseException {
StructField[] fields = schema.fields();
int p = fields.length;
Object[] x = new Object[p];
// Get values for all attributes.
for (int i = 0; i < p; i++) {
// Get next token
if (i > 0) {
getNextToken();
}
if (tokenizer.ttype != '?') {
x[i] = parser.get(i).apply(tokenizer.sval);
}
}
return x;
}
/**
* Reads a sparse instance using the tokenizer.
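* In the sparse ARFF format, each data line lists only the entries
* that differ from the default, written as
* {@code {index value, index value, ...}}, e.g.
* {@code {1 X, 3 Y, 4 "class A"}}. Omitted entries are filled with
* zero by this method.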
* @throws ParseException if the information is not read successfully
*/
private Object[] readSparseInstance() throws IOException, ParseException {
StructField[] fields = schema.fields();
int p = fields.length;
Object[] x = new Object[p];
// Get values for all attributes.
do {
getNextToken();
// end of instance
if (tokenizer.ttype == '}') {
break;
}
int i = Integer.parseInt(tokenizer.sval.trim());
if (i < 0 || i >= p) {
throw new ParseException("Invalid attribute index: " + i, tokenizer.lineno());
}
getNextToken();
String val = tokenizer.sval.trim();
if (!val.equals("?")) {
x[i] = parser.get(i).apply(val);
}
} while (tokenizer.ttype == StreamTokenizer.TT_WORD);
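// Entries not listed in a sparse instance default to zero,
// boxed to match the field's primitive type.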
for (int i = 0; i < x.length; i++) {
if (x[i] == null) {
StructField field = schema.field(i);
if (field.type.isByte()) {
x[i] = (byte) 0;
} else if (field.type.isShort()) {
x[i] = (short) 0;
} else if (field.type.isInt()) {
x[i] = 0;
} else if (field.type.isFloat()) {
x[i] = 0.0f;
} else {
x[i] = 0.0;
}
}
}
return x;
}
/**
* Writes the data frame to an ARFF file.
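* A minimal usage sketch (the output path and relation name here are
* hypothetical):
* <pre>{@code
* Arff.write(df, Paths.get("weather.arff"), "weather");
* }</pre>
*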
* @param df the data frame.
* @param path the file path.
* @param relation the relation name of ARFF.
*/
public static void write(DataFrame df, Path path, String relation) throws IOException {
try (PrintWriter writer = new PrintWriter(Files.newOutputStream(path))) {
writer.print("@RELATION ");
writer.println(relation);
for (StructField field : df.schema().fields()) {
writeField(writer, field);
}
writer.println("@DATA");
int p = df.ncols();
df.stream().forEach(t -> {
// Write the i-th field value of the tuple, not the whole tuple.
String line = IntStream.range(0, p).mapToObj(i -> String.valueOf(t.get(i))).collect(Collectors.joining(","));
writer.println(line);
});
}
}
/** Writes the field declaration to the ARFF file. */
private static void writeField(PrintWriter writer, StructField field) throws IOException {
writer.print("@ATTRIBUTE ");
writer.print(field.name);
if (field.type.isFloating()) writer.println(" REAL");
else if (field.type.isString()) writer.println(" STRING");
else if (field.type.id() == DataType.ID.DateTime) writer.println(" DATE \"yyyy-MM-dd HH:mm:ss\"");
else if (field.type.isIntegral()) {
if (field.measure instanceof NominalScale) {
NominalScale scale = (NominalScale) field.measure;
String levels = Arrays.stream(scale.levels()).collect(Collectors.joining(",", " {", "}"));
writer.println(levels);
} else {
writer.println(" REAL");
}
}
}
}