/*******************************************************************************
* Copyright (c) 2010 Haifeng Li
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
package smile.data.parser;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StreamTokenizer;
import java.net.URI;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.List;
import smile.data.Attribute;
import smile.data.AttributeDataset;
import smile.data.DateAttribute;
import smile.data.Datum;
import smile.data.NominalAttribute;
import smile.data.NumericAttribute;
import smile.data.StringAttribute;
/**
* Weka ARFF (attribute relation file format) file parser. ARFF is an ASCII
* text file format that is essentially a CSV file with a header that describes
 * the metadata. ARFF was developed for use in the Weka machine learning
* software.
*
 * A dataset is first described, beginning with the name of the dataset
 * (or the relation in ARFF terminology). Each of the variables (or
 * attributes in ARFF terminology) used to describe the observations is then
 * identified, together with its data type, each definition on a single line.
* The actual observations are then listed, each on a single line, with fields
* separated by commas, much like a CSV file.
*
* Missing values in an ARFF dataset are identified using the question mark '?'.
*
* Comments can be included in the file, introduced at the beginning of a line
* with a '%', whereby the remainder of the line is ignored.
*
 * A significant advantage of the ARFF data file over the CSV data file is
 * the metadata information.
 *
 * Also, the ability to include comments ensures we can record extra
 * information about the dataset, including how it was derived, where it
 * came from, and how it might be cited.
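 *
 * For example, a minimal ARFF file (a hypothetical weather dataset) might
 * look like this:
 * <pre>
 * % Example ARFF file.
 * &#64;relation weather
 *
 * &#64;attribute outlook {sunny, overcast, rainy}
 * &#64;attribute temperature numeric
 * &#64;attribute humidity numeric
 * &#64;attribute play {yes, no}
 *
 * &#64;data
 * sunny, 85, 85, no
 * overcast, 83, 86, yes
 * rainy, 70, 96, ?
 * </pre>
 *
 * A typical use of this parser is sketched below; the file name and the
 * choice of the last column as the response are assumptions for the example:
 * <pre>{@code
 * ArffParser parser = new ArffParser();
 * parser.setResponseIndex(3); // the "play" column is the class label
 * AttributeDataset weather = parser.parse("weather.arff");
 * }</pre>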
*
* @author Haifeng Li
*/
public class ArffParser {
/** The keyword used to denote the start of an arff header */
private static final String ARFF_RELATION = "@relation";
/** The keyword used to denote the start of the arff data section */
private static final String ARFF_DATA = "@data";
/** The keyword used to denote the start of an arff attribute declaration */
private static final String ARFF_ATTRIBUTE = "@attribute";
/** A keyword used to denote a numeric attribute */
private static final String ARFF_ATTRIBUTE_INTEGER = "integer";
/** A keyword used to denote a numeric attribute */
private static final String ARFF_ATTRIBUTE_REAL = "real";
/** A keyword used to denote a numeric attribute */
private static final String ARFF_ATTRIBUTE_NUMERIC = "numeric";
/** The keyword used to denote a string attribute */
private static final String ARFF_ATTRIBUTE_STRING = "string";
/** The keyword used to denote a date attribute */
private static final String ARFF_ATTRIBUTE_DATE = "date";
/** The keyword used to denote a relation-valued attribute */
private static final String ARFF_ATTRIBUTE_RELATIONAL = "relational";
/** The keyword used to denote the end of the declaration of a subrelation */
private static final String ARFF_END_SUBRELATION = "@end";
/**
 * The column index of the dependent/response variable.
*/
private int responseIndex = -1;
/**
* Constructor.
*/
public ArffParser() {
}
/**
 * Returns the column index (starting at 0) of the dependent/response variable.
*/
public int getResponseIndex() {
return responseIndex;
}
/**
 * Sets the column index (starting at 0) of the dependent/response variable.
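 *
 * For example, if the class label is the last of five columns (a
 * hypothetical layout):
 * <pre>{@code
 * ArffParser parser = new ArffParser();
 * parser.setResponseIndex(4);
 * }</pre>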
*/
public void setResponseIndex(int index) {
this.responseIndex = index;
}
/**
* Initializes the StreamTokenizer used for reading the ARFF file.
*/
private void initTokenizer(StreamTokenizer tokenizer) {
    tokenizer.resetSyntax();
    // Control characters and blanks are whitespace.
    tokenizer.whitespaceChars(0, ' ');
    // Any other printable character may be part of a word.
    tokenizer.wordChars(' ' + 1, '\u00FF');
    // Commas separate fields.
    tokenizer.whitespaceChars(',', ',');
    // '%' introduces a comment that runs to the end of the line.
    tokenizer.commentChar('%');
    // Values may be quoted with double or single quotes.
    tokenizer.quoteChar('"');
    tokenizer.quoteChar('\'');
    // Braces delimit nominal value lists and sparse instances.
    tokenizer.ordinaryChar('{');
    tokenizer.ordinaryChar('}');
    // End-of-line tokens mark the end of an instance or declaration.
    tokenizer.eolIsSignificant(true);
}
/**
* Gets next token, skipping empty lines.
*
* @throws IOException if reading the next token fails
*/
private void getFirstToken(StreamTokenizer tokenizer) throws IOException {
    // Skip empty lines.
    while (tokenizer.nextToken() == StreamTokenizer.TT_EOL) {
    }
    if ((tokenizer.ttype == '\'') || (tokenizer.ttype == '"')) {
        // Treat quoted tokens as plain words.
        tokenizer.ttype = StreamTokenizer.TT_WORD;
    } else if ((tokenizer.ttype == StreamTokenizer.TT_WORD) && (tokenizer.sval.equals("?"))) {
        // A bare '?' marks a missing value.
        tokenizer.ttype = '?';
    }
}
/**
* Gets token and checks if it's end of line.
*
* @param endOfFileOk true if EOF is OK
 * @throws ParseException if it doesn't find an end of line
*/
private void getLastToken(StreamTokenizer tokenizer, boolean endOfFileOk) throws IOException, ParseException {
if ((tokenizer.nextToken() != StreamTokenizer.TT_EOL) && ((tokenizer.ttype != StreamTokenizer.TT_EOF) || !endOfFileOk)) {
throw new ParseException("end of line expected", tokenizer.lineno());
}
}
/**
 * Gets next token, checking for a premature end of line.
*
 * @throws ParseException if it finds a premature end of line or file
*/
private void getNextToken(StreamTokenizer tokenizer) throws IOException, ParseException {
if (tokenizer.nextToken() == StreamTokenizer.TT_EOL) {
throw new ParseException("premature end of line", tokenizer.lineno());
}
if (tokenizer.ttype == StreamTokenizer.TT_EOF) {
throw new ParseException("premature end of file", tokenizer.lineno());
} else if ((tokenizer.ttype == '\'') || (tokenizer.ttype == '"')) {
tokenizer.ttype = StreamTokenizer.TT_WORD;
} else if ((tokenizer.ttype == StreamTokenizer.TT_WORD) && (tokenizer.sval.equals("?"))) {
tokenizer.ttype = '?';
}
}
/**
 * Reads and stores the header of an ARFF file.
*
* @param attributes the set of attributes in this relation.
* @return the name of relation.
 * @throws ParseException if the information is not read successfully
*/
private String readHeader(StreamTokenizer tokenizer, List<Attribute> attributes) throws IOException, ParseException {
    // The name of the dataset.
String relationName = null;
// clear attribute set, which may be from previous parsing of other datasets.
attributes.clear();
// Get name of relation.
getFirstToken(tokenizer);
if (tokenizer.ttype == StreamTokenizer.TT_EOF) {
throw new ParseException("premature end of file", tokenizer.lineno());
}
if (ARFF_RELATION.equalsIgnoreCase(tokenizer.sval)) {
getNextToken(tokenizer);
relationName = tokenizer.sval;
getLastToken(tokenizer, false);
} else {
throw new ParseException("keyword " + ARFF_RELATION + " expected", tokenizer.lineno());
}
// Get attribute declarations.
getFirstToken(tokenizer);
if (tokenizer.ttype == StreamTokenizer.TT_EOF) {
throw new ParseException("premature end of file", tokenizer.lineno());
}
while (ARFF_ATTRIBUTE.equalsIgnoreCase(tokenizer.sval)) {
attributes.add(parseAttribute(tokenizer));
}
// Check if data part follows. We can't easily check for EOL.
if (!ARFF_DATA.equalsIgnoreCase(tokenizer.sval)) {
throw new ParseException("keyword " + ARFF_DATA + " expected", tokenizer.lineno());
}
// Check if any attributes have been declared.
if (attributes.isEmpty()) {
throw new ParseException("no attributes declared", tokenizer.lineno());
}
if (responseIndex >= attributes.size()) {
throw new ParseException("Invalid response variable index", responseIndex);
}
return relationName;
}
/**
* Parses the attribute declaration.
*
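 * For reference, ARFF attribute declarations take forms like the following
 * (hypothetical examples):
 * <pre>
 * &#64;attribute temperature numeric
 * &#64;attribute outlook {sunny, overcast, rainy}
 * &#64;attribute name string
 * &#64;attribute timestamp date "yyyy-MM-dd HH:mm:ss"
 * </pre>
 *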
 * @return an attribute in this relation
 * @throws IOException if the information is not read successfully
*/
private Attribute parseAttribute(StreamTokenizer tokenizer) throws IOException, ParseException {
Attribute attribute = null;
// Get attribute name.
getNextToken(tokenizer);
String attributeName = tokenizer.sval;
getNextToken(tokenizer);
// Check if attribute is nominal.
if (tokenizer.ttype == StreamTokenizer.TT_WORD) {
// Attribute is real, integer, or string.
if (tokenizer.sval.equalsIgnoreCase(ARFF_ATTRIBUTE_REAL) ||
tokenizer.sval.equalsIgnoreCase(ARFF_ATTRIBUTE_INTEGER) ||
tokenizer.sval.equalsIgnoreCase(ARFF_ATTRIBUTE_NUMERIC)) {
attribute = new NumericAttribute(attributeName);
readTillEOL(tokenizer);
} else if (tokenizer.sval.equalsIgnoreCase(ARFF_ATTRIBUTE_STRING)) {
attribute = new StringAttribute(attributeName);
readTillEOL(tokenizer);
} else if (tokenizer.sval.equalsIgnoreCase(ARFF_ATTRIBUTE_DATE)) {
String format = null;
if (tokenizer.nextToken() != StreamTokenizer.TT_EOL) {
if ((tokenizer.ttype != StreamTokenizer.TT_WORD) && (tokenizer.ttype != '\'') && (tokenizer.ttype != '\"')) {
throw new ParseException("not a valid date format", tokenizer.lineno());
}
format = tokenizer.sval;
readTillEOL(tokenizer);
} else {
tokenizer.pushBack();
}
attribute = new DateAttribute(attributeName, null, format);
readTillEOL(tokenizer);
} else if (tokenizer.sval.equalsIgnoreCase(ARFF_ATTRIBUTE_RELATIONAL)) {
readTillEOL(tokenizer);
} else if (tokenizer.sval.equalsIgnoreCase(ARFF_END_SUBRELATION)) {
getNextToken(tokenizer);
} else {
throw new ParseException("Invalid attribute type or invalid enumeration", tokenizer.lineno());
}
} else {
// Attribute is nominal.
List<String> attributeValues = new ArrayList<>();
tokenizer.pushBack();
// Get values for nominal attribute.
if (tokenizer.nextToken() != '{') {
throw new ParseException("{ expected at beginning of enumeration", tokenizer.lineno());
}
while (tokenizer.nextToken() != '}') {
if (tokenizer.ttype == StreamTokenizer.TT_EOL) {
throw new ParseException("} expected at end of enumeration", tokenizer.lineno());
} else {
attributeValues.add(tokenizer.sval.trim());
}
}
attribute = new NominalAttribute(attributeName, attributeValues.toArray(new String[0]));
}
getLastToken(tokenizer, false);
getFirstToken(tokenizer);
if (tokenizer.ttype == StreamTokenizer.TT_EOF) {
throw new ParseException("premature end of file", tokenizer.lineno());
}
return attribute;
}
/**
* Reads and skips all tokens before next end of line token.
*
* @throws IOException in case something goes wrong
*/
private void readTillEOL(StreamTokenizer tokenizer) throws IOException {
    // Consume tokens up to the end-of-line token, then push it back.
    while (tokenizer.nextToken() != StreamTokenizer.TT_EOL) {
    }
    tokenizer.pushBack();
}
/**
 * Returns the attribute set of the given URI.
 * @throws java.io.FileNotFoundException if the file does not exist
*/
public static Attribute[] getAttributes(URI uri) throws FileNotFoundException, IOException, ParseException {
return getAttributes(new File(uri));
}
/**
 * Returns the attribute set of the given file.
 * @throws java.io.FileNotFoundException if the file does not exist
*/
public static Attribute[] getAttributes(String path) throws FileNotFoundException, IOException, ParseException {
return getAttributes(new File(path));
}
/**
 * Returns the attribute set of the given file.
 * @throws java.io.FileNotFoundException if the file does not exist
*/
public static Attribute[] getAttributes(File file) throws FileNotFoundException, IOException, ParseException {
return getAttributes(new FileInputStream(file));
}
/**
 * Returns the attribute set of the given stream.
*/
public static Attribute[] getAttributes(InputStream stream) throws IOException, ParseException {
    try (Reader r = new BufferedReader(new InputStreamReader(stream))) {
        StreamTokenizer tokenizer = new StreamTokenizer(r);
        ArffParser parser = new ArffParser();
        parser.initTokenizer(tokenizer);
        List<Attribute> attributes = new ArrayList<>();
        parser.readHeader(tokenizer, attributes);
        return attributes.toArray(new Attribute[0]);
    }
}
/**
 * Parses a dataset from the given URI.
 * @throws java.io.FileNotFoundException if the file does not exist
*/
public AttributeDataset parse(URI uri) throws FileNotFoundException, IOException, ParseException {
return parse(new File(uri));
}
/**
 * Parses a dataset from the given file.
 * @throws java.io.FileNotFoundException if the file does not exist
*/
public AttributeDataset parse(String path) throws FileNotFoundException, IOException, ParseException {
return parse(new File(path));
}
/**
 * Parses a dataset from the given file.
 * @throws java.io.FileNotFoundException if the file does not exist
*/
public AttributeDataset parse(File file) throws FileNotFoundException, IOException, ParseException {
return parse(new FileInputStream(file));
}
/**
 * Parses a dataset from the given stream.
*/
public AttributeDataset parse(InputStream stream) throws IOException, ParseException {
try (Reader r = new BufferedReader(new InputStreamReader(stream))) {
StreamTokenizer tokenizer = new StreamTokenizer(r);
initTokenizer(tokenizer);
List<Attribute> attributes = new ArrayList<>();
String relationName = readHeader(tokenizer, attributes);
if (attributes.isEmpty()) {
throw new IOException("no header information available");
}
Attribute response = null;
// attr keeps the full attribute list, including the response variable.
Attribute[] attr = attributes.toArray(new Attribute[0]);
if (responseIndex >= 0) {
    response = attributes.remove(responseIndex);
}
AttributeDataset data = new AttributeDataset(relationName, attributes.toArray(new Attribute[attributes.size()]), response);
while (true) {
// Check if end of file reached.
getFirstToken(tokenizer);
if (tokenizer.ttype == StreamTokenizer.TT_EOF) {
break;
}
// Parse instance
if (tokenizer.ttype == '{') {
data.add(getSparseInstance(tokenizer, attr));
} else {
data.add(getInstance(tokenizer, attr));
}
}
for (Attribute attribute : attributes) {
if (attribute instanceof NominalAttribute) {
NominalAttribute a = (NominalAttribute) attribute;
a.setOpen(false);
}
if (attribute instanceof StringAttribute) {
StringAttribute a = (StringAttribute) attribute;
a.setOpen(false);
}
}
return data;
}
}
/**
* Reads a single instance.
* @throws ParseException if the information is not read successfully
*/
private Datum getInstance(StreamTokenizer tokenizer, Attribute[] attributes) throws IOException, ParseException {
double[] x = responseIndex >= 0 ? new double[attributes.length - 1] : new double[attributes.length];
double y = Double.NaN;
// Get values for all attributes.
for (int i = 0, k = 0; i < attributes.length; i++) {
// Get next token
if (i > 0) {
getNextToken(tokenizer);
}
if (i == responseIndex) {
if (tokenizer.ttype == '?') {
y = Double.NaN;
} else {
y = attributes[i].valueOf(tokenizer.sval);
}
} else {
if (tokenizer.ttype == '?') {
x[k++] = Double.NaN;
} else {
x[k++] = attributes[i].valueOf(tokenizer.sval);
}
}
}
return new Datum(x, y);
}
/**
* Reads a sparse instance using the tokenizer.
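 *
 * In the sparse ARFF format, a data row lists only the entries whose value
 * differs from zero, as zero-based {index value} pairs inside braces, e.g.
 * (a hypothetical row):
 * <pre>
 * {1 85, 2 85, 3 no}
 * </pre>
 * Entries omitted from a sparse row are left at zero by this implementation,
 * since the value array is zero-initialized.
 *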
* @throws ParseException if the information is not read successfully
*/
private Datum getSparseInstance(StreamTokenizer tokenizer, Attribute[] attributes) throws IOException, ParseException {
double[] x = responseIndex >= 0 ? new double[attributes.length - 1] : new double[attributes.length];
double y = Double.NaN;
int index = -1;
// Get values for all attributes.
do {
getNextToken(tokenizer);
// end of instance
if (tokenizer.ttype == '}') {
break;
}
String s = tokenizer.sval.trim();
if (index < 0) {
index = Integer.parseInt(s);
if (index < 0 || index >= attributes.length) {
throw new ParseException("Invalid attribute index: " + index, tokenizer.lineno());
}
} else {
    String val = s;
    if (index != responseIndex) {
        // Map the attribute index to its position in x, skipping over
        // the response column if one was set.
        int j = (responseIndex >= 0 && index > responseIndex) ? index - 1 : index;
        if (val.equals("?")) {
            x[j] = Double.NaN;
        } else {
            x[j] = attributes[index].valueOf(val);
        }
    } else {
        if (val.equals("?")) {
            y = Double.NaN;
        } else {
            y = attributes[index].valueOf(val);
        }
    }
    index = -1;
}
} while (tokenizer.ttype == StreamTokenizer.TT_WORD);
return new Datum(x, y);
}
}