
/*
* Copyright (c) 2010-2021 Haifeng Li. All rights reserved.
*
* Smile is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Smile is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Smile. If not, see <https://www.gnu.org/licenses/>.
*/
package smile.io;
import java.io.BufferedReader;
import java.io.InputStream;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.net.URISyntaxException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import org.apache.commons.csv.CSVFormat;
import smile.data.DataFrame;
import smile.data.SampleInstance;
import smile.data.SparseDataset;
import smile.data.type.StructType;
import smile.util.SparseArray;
import smile.util.Strings;
/**
* Reads data from external storage systems.
*
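* <p>
* A minimal usage sketch (file paths are illustrative):
* <pre>{@code
* DataFrame iris = Read.csv("data/iris.csv", "header=true");
* DataFrame weather = Read.arff("data/weather.nominal.arff");
* }</pre>
*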
* @author Haifeng Li
*/
public interface Read {
/**
* Reads a serialized object from a file.
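* <p>
* For example, to restore a previously serialized model (the file name is
* illustrative):
* <pre>{@code
* Object model = Read.object(java.nio.file.Paths.get("model.ser"));
* }</pre>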
* @param path the file path.
* @return the serialized object.
* @throws IOException when fails to read the stream.
* @throws ClassNotFoundException when fails to load the class.
*/
static Object object(Path path) throws IOException, ClassNotFoundException {
try (InputStream file = Files.newInputStream(path);
ObjectInputStream in = new ObjectInputStream(file)) {
return in.readObject();
}
}
/**
* Reads a data file. Infers the data format by the file name extension.
* @param path the input file path.
* @throws IOException when fails to read the file.
* @throws ParseException when fails to parse the file.
* @throws URISyntaxException when the file path syntax is wrong.
* @return the data frame.
*/
static DataFrame data(String path) throws IOException, URISyntaxException, ParseException {
return data(path, null);
}
/**
* Reads a data file. Infers the data format by the file name extension.
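* <p>
* For example (file names are illustrative):
* <pre>{@code
* DataFrame iris = Read.data("data/iris.csv", "header=true");
* DataFrame users = Read.data("data/users.avro", "data/users.avsc");
* }</pre>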
* @param path the input file path.
* @param format the optional file format specification. For csv files,
* it is a key-value list such as
* {@code delimiter=\t,header=true,comment=#,escape=\,quote="}.
* For json files, it is the file mode (single-line or
* multi-line). For avro files, it is the path to the schema
* file.
* @throws IOException when fails to read the file.
* @throws ParseException when fails to parse the file.
* @throws URISyntaxException when the file path syntax is wrong.
* @return the data frame.
*/
static DataFrame data(String path, String format) throws IOException, URISyntaxException, ParseException {
int dotIndex = path.lastIndexOf(".");
String ext = dotIndex < 0 ? "csv" : path.substring(dotIndex + 1);
switch (ext) {
case "dat":
case "txt":
case "csv": return csv(path, format);
case "arff": return arff(path);
case "json":
JSON.Mode mode = format == null ? JSON.Mode.SINGLE_LINE : JSON.Mode.valueOf(format);
return json(path, mode, null);
case "sas7bdat": return sas(path);
case "avro": return avro(path, format);
case "parquet": return parquet(path);
case "feather": return arrow(path);
default:
if (format != null) {
if (format.equals("csv")) {
return csv(path);
} else if (format.startsWith("csv,")) {
return csv(path, format.substring(4));
}
}
}
throw new UnsupportedOperationException("Unsupported data format: " + ext);
}
/**
* Reads a CSV file.
* @param path the input file path.
* @throws IOException when fails to read the file.
* @throws URISyntaxException when the file path syntax is wrong.
* @return the data frame.
*/
static DataFrame csv(String path) throws IOException, URISyntaxException {
return csv(path, CSVFormat.DEFAULT);
}
/**
* Reads a CSV file.
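* <p>
* For example, a hypothetical tab-delimited file with a header row could be
* read as:
* <pre>{@code
* DataFrame df = Read.csv("data/table.tsv", "delimiter=\\t,header=true");
* }</pre>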
* @param path the input file path.
* @param format the format specification in key-value pairs such as
* {@code delimiter=\t,header=true,comment=#,escape=\,quote="}.
* @throws IOException when fails to read the file.
* @throws URISyntaxException when the file path syntax is wrong.
* @return the data frame.
*/
static DataFrame csv(String path, String format) throws IOException, URISyntaxException {
CSVFormat.Builder formatBuilder = CSVFormat.Builder.create();
for (String token : format.split(",")) {
String[] option = token.split("=");
if (option.length != 2) {
throw new IllegalArgumentException("Invalid csv format specifier: " + token);
}
switch (option[0].toLowerCase(Locale.ROOT)) {
case "delimiter":
formatBuilder.setDelimiter(Strings.unescape(option[1]));
break;
case "quote":
String quote = Strings.unescape(option[1]);
if (quote.length() != 1) {
throw new IllegalArgumentException("Unknown csv quote: " + quote);
}
formatBuilder.setQuote(quote.charAt(0));
break;
case "escape":
String escape = Strings.unescape(option[1]);
if (escape.length() != 1) {
throw new IllegalArgumentException("Unknown csv escape: " + escape);
}
formatBuilder.setEscape(escape.charAt(0));
break;
case "comment":
String comment = Strings.unescape(option[1]);
if (comment.length() != 1) {
throw new IllegalArgumentException("Unknown csv comment marker: " + comment);
}
formatBuilder.setCommentMarker(comment.charAt(0));
break;
case "header":
if ("true".equalsIgnoreCase(option[1])) {
formatBuilder.setHeader().setSkipHeaderRecord(true);
} else {
String[] header = option[1].split("\\|");
formatBuilder.setHeader(header);
}
break;
default:
throw new IllegalArgumentException("Unknown csv format specifier: " + option[0]);
}
}
return csv(path, formatBuilder.build());
}
/**
* Reads a CSV file.
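* <p>
* For example, with an explicit Commons CSV format (the file name is
* illustrative):
* <pre>{@code
* CSVFormat format = CSVFormat.Builder.create()
*         .setDelimiter(';')
*         .setHeader()
*         .setSkipHeaderRecord(true)
*         .build();
* DataFrame df = Read.csv("data/table.csv", format);
* }</pre>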
* @param path the input file path.
* @param format the CSV file format.
* @throws IOException when fails to read the file.
* @throws URISyntaxException when the file path syntax is wrong.
* @return the data frame.
*/
static DataFrame csv(String path, CSVFormat format) throws IOException, URISyntaxException {
return csv(path, format, null);
}
/**
* Reads a CSV file.
* @param path the input file path.
* @param format the CSV file format.
* @param schema the data schema.
* @throws IOException when fails to read the file.
* @throws URISyntaxException when the file path syntax is wrong.
* @return the data frame.
*/
static DataFrame csv(String path, CSVFormat format, StructType schema) throws IOException, URISyntaxException {
CSV csv = new CSV(format);
if (schema != null) csv.schema(schema);
return csv.read(path);
}
/**
* Reads a CSV file.
* @param path the input file path.
* @throws IOException when fails to read the file.
* @return the data frame.
*/
static DataFrame csv(Path path) throws IOException {
return csv(path, CSVFormat.DEFAULT);
}
/**
* Reads a CSV file.
* @param path the input file path.
* @param format the CSV file format.
* @throws IOException when fails to read the file.
* @return the data frame.
*/
static DataFrame csv(Path path, CSVFormat format) throws IOException {
return csv(path, format, null);
}
/**
* Reads a CSV file.
* @param path the input file path.
* @param format the CSV file format.
* @param schema the data schema.
* @throws IOException when fails to read the file.
* @return the data frame.
*/
static DataFrame csv(Path path, CSVFormat format, StructType schema) throws IOException {
CSV csv = new CSV(format);
if (schema != null) csv.schema(schema);
return csv.read(path);
}
/**
* Reads a JSON file.
* @param path the input file path.
* @throws IOException when fails to read the file.
* @throws URISyntaxException when the file path syntax is wrong.
* @return the data frame.
*/
static DataFrame json(String path) throws IOException, URISyntaxException {
return json(path, JSON.Mode.SINGLE_LINE, null);
}
/**
* Reads a JSON file.
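* <p>
* For example, assuming the multi-line mode constant is named
* {@code JSON.Mode.MULTI_LINE} (the file name is illustrative):
* <pre>{@code
* DataFrame df = Read.json("data/books.json", JSON.Mode.MULTI_LINE, null);
* }</pre>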
* @param path the input file path.
* @param mode the file mode (single-line or multi-line).
* @param schema the data schema.
* @throws IOException when fails to read the file.
* @throws URISyntaxException when the file path syntax is wrong.
* @return the data frame.
*/
static DataFrame json(String path, JSON.Mode mode, StructType schema) throws IOException, URISyntaxException {
JSON json = new JSON().mode(mode);
if (schema != null) json.schema(schema);
return json.read(path);
}
/**
* Reads a JSON file.
* @param path the input file path.
* @throws IOException when fails to read the file.
* @return the data frame.
*/
static DataFrame json(Path path) throws IOException {
return json(path, JSON.Mode.SINGLE_LINE, null);
}
/**
* Reads a JSON file.
* @param path the input file path.
* @param mode the file mode (single-line or multi-line).
* @param schema the data schema.
* @throws IOException when fails to read the file.
* @return the data frame.
*/
static DataFrame json(Path path, JSON.Mode mode, StructType schema) throws IOException {
JSON json = new JSON().mode(mode);
if (schema != null) json.schema(schema);
return json.read(path);
}
/**
* Reads an ARFF file. Weka ARFF (attribute relation file format) is an ASCII
* text file format that is essentially a CSV file with a header that describes
* the meta-data. ARFF was developed for use in the Weka machine learning
* software.
*
* A dataset is first described, beginning with the name of the dataset
* (or the relation in ARFF terminology). Each of the variables (or attributes
* in ARFF terminology) used to describe the observations is then identified,
* together with its data type, each definition on a single line.
* The actual observations are then listed, each on a single line, with fields
* separated by commas, much like a CSV file.
*
* Missing values in an ARFF dataset are identified using the question mark '?'.
*
* Comments can be included in the file, introduced at the beginning of a line
* with a '%', whereby the remainder of the line is ignored.
*
* A significant advantage of the ARFF data file over the CSV data file is
* the metadata information.
*
* Also, the ability to include comments ensures we can record extra information
* about the data set, including how it was derived, where it came from, and
* how it might be cited.
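* <p>
* For example (the file name is illustrative):
* <pre>{@code
* DataFrame weather = Read.arff("data/weather.nominal.arff");
* }</pre>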
*
* @param path the input file path.
* @throws IOException when fails to read the file.
* @throws ParseException when fails to parse the file.
* @throws URISyntaxException when the file path syntax is wrong.
* @return the data frame.
*/
static DataFrame arff(String path) throws IOException, ParseException, URISyntaxException {
try (var arff = new Arff(path)) {
return arff.read();
}
}
/**
* Reads an ARFF file. Weka ARFF (attribute relation file format) is an ASCII
* text file format that is essentially a CSV file with a header that describes
* the meta-data. ARFF was developed for use in the Weka machine learning
* software.
*
* A dataset is first described, beginning with the name of the dataset
* (or the relation in ARFF terminology). Each of the variables (or attributes
* in ARFF terminology) used to describe the observations is then identified,
* together with its data type, each definition on a single line.
* The actual observations are then listed, each on a single line, with fields
* separated by commas, much like a CSV file.
*
* Missing values in an ARFF dataset are identified using the question mark '?'.
*
* Comments can be included in the file, introduced at the beginning of a line
* with a '%', whereby the remainder of the line is ignored.
*
* A significant advantage of the ARFF data file over the CSV data file is
* the metadata information.
*
* Also, the ability to include comments ensures we can record extra information
* about the data set, including how it was derived, where it came from, and
* how it might be cited.
*
* @param path the input file path.
* @throws IOException when fails to read the file.
* @throws ParseException when fails to parse the file.
* @return the data frame.
*/
static DataFrame arff(Path path) throws IOException, ParseException {
try (var arff = new Arff(path)) {
return arff.read();
}
}
/**
* Reads a SAS7BDAT file.
*
* @param path the input file path.
* @throws IOException when fails to read the file.
* @throws URISyntaxException when the file path syntax is wrong.
* @return the data frame.
*/
static DataFrame sas(String path) throws IOException, URISyntaxException {
return SAS.read(path);
}
/**
* Reads a SAS7BDAT file.
*
* @param path the input file path.
* @throws IOException when fails to read the file.
* @return the data frame.
*/
static DataFrame sas(Path path) throws IOException {
return SAS.read(path);
}
/**
* Reads an Apache Arrow file.
* Apache Arrow is a cross-language development platform for in-memory data.
* It specifies a standardized language-independent columnar memory format
* for flat and hierarchical data, organized for efficient analytic
* operations on modern hardware.
*
* @param path the input file path.
* @throws IOException when fails to read the file.
* @throws URISyntaxException when the file path syntax is wrong.
* @return the data frame.
*/
static DataFrame arrow(String path) throws IOException, URISyntaxException {
Arrow arrow = new Arrow();
return arrow.read(path);
}
/**
* Reads an Apache Arrow file.
* Apache Arrow is a cross-language development platform for in-memory data.
* It specifies a standardized language-independent columnar memory format
* for flat and hierarchical data, organized for efficient analytic
* operations on modern hardware.
*
* @param path the input file path.
* @throws IOException when fails to read the file.
* @return the data frame.
*/
static DataFrame arrow(Path path) throws IOException {
Arrow arrow = new Arrow();
return arrow.read(path);
}
/**
* Reads an Apache Avro file.
*
* @param path the input file path.
* @param schema the input stream of data schema.
* @throws IOException when fails to read the file.
* @throws URISyntaxException when the file path syntax is wrong.
* @return the data frame.
*/
static DataFrame avro(String path, InputStream schema) throws IOException, URISyntaxException {
Avro avro = new Avro(schema);
return avro.read(path);
}
/**
* Reads an Apache Avro file.
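* <p>
* This variant loads the data schema from a separate file. For example (file
* names are illustrative):
* <pre>{@code
* DataFrame df = Read.avro("data/users.avro", "data/users.avsc");
* }</pre>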
*
* @param path the input file path.
* @param schema the data schema file path.
* @throws IOException when fails to read the file.
* @throws URISyntaxException when the file path syntax is wrong.
* @return the data frame.
*/
static DataFrame avro(String path, String schema) throws IOException, URISyntaxException {
Avro avro = new Avro(HadoopInput.stream(schema));
return avro.read(path);
}
/**
* Reads an Apache Avro file.
*
* @param path the input file path.
* @param schema the input stream of data schema.
* @throws IOException when fails to read the file.
* @return the data frame.
*/
static DataFrame avro(Path path, InputStream schema) throws IOException {
Avro avro = new Avro(schema);
return avro.read(path);
}
/**
* Reads an Apache Avro file.
*
* @param path the input file path.
* @param schema the data schema file path.
* @throws IOException when fails to read the file.
* @return the data frame.
*/
static DataFrame avro(Path path, Path schema) throws IOException {
Avro avro = new Avro(schema);
return avro.read(path);
}
/**
* Reads an Apache Parquet file.
*
* @param path the input file path.
* @throws IOException when fails to read the file.
* @throws URISyntaxException when the file path syntax is wrong.
* @return the data frame.
*/
static DataFrame parquet(String path) throws IOException, URISyntaxException {
return Parquet.read(path);
}
/**
* Reads an Apache Parquet file.
*
* @param path the input file path.
* @throws IOException when fails to read the file.
* @return the data frame.
*/
static DataFrame parquet(Path path) throws IOException {
return Parquet.read(path);
}
/**
* Reads a libsvm sparse dataset. The format of a libsvm file is:
* <pre>{@code
* <label> <index1>:<value1> <index2>:<value2> ...
* }</pre>
*
* where {@code label} is the target value of the training data.
* For classification, it should be an integer which identifies a class
* (multi-class classification is supported). For regression, it's any real
* number. For one-class SVM, it's not used, so it can be any number.
* {@code index} is an integer starting from 1, and {@code value}
* is a real number. The indices must be in ascending order. The labels in
* the testing data file are only used to calculate accuracy or error. If they
* are unknown, just fill this column with a number.
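* <p>
* For example (the file name is illustrative):
* <pre>{@code
* SparseDataset data = Read.libsvm("data/news20.dat");
* }</pre>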
*
* @param path the input file path.
* @throws IOException when fails to read the file.
* @throws URISyntaxException when the file path syntax is wrong.
* @return the sparse dataset.
*/
static SparseDataset libsvm(String path) throws IOException, URISyntaxException {
return libsvm(Input.reader(path));
}
/**
* Reads a libsvm sparse dataset. The format of a libsvm file is:
* <pre>{@code
* <label> <index1>:<value1> <index2>:<value2> ...
* }</pre>
*
* where {@code label} is the target value of the training data.
* For classification, it should be an integer which identifies a class
* (multi-class classification is supported). For regression, it's any real
* number. For one-class SVM, it's not used, so it can be any number.
* {@code index} is an integer starting from 1, and {@code value}
* is a real number. The indices must be in ascending order. The labels in
* the testing data file are only used to calculate accuracy or error. If they
* are unknown, just fill this column with a number.
*
* @param path the input file path.
* @throws IOException when fails to read the file.
* @return the sparse dataset.
*/
static SparseDataset libsvm(Path path) throws IOException {
return libsvm(Files.newBufferedReader(path));
}
/**
* Reads a libsvm sparse dataset. The format of a libsvm file is:
* <pre>{@code
* <label> <index1>:<value1> <index2>:<value2> ...
* }</pre>
*
* where {@code label} is the target value of the training data.
* For classification, it should be an integer which identifies a class
* (multi-class classification is supported). For regression, it's any real
* number. For one-class SVM, it's not used, so it can be any number.
* {@code index} is an integer starting from 1, and {@code value}
* is a real number. The indices must be in ascending order. The labels in
* the testing data file are only used to calculate accuracy or error. If they
* are unknown, just fill this column with a number.
*
* @param reader the file reader.
* @throws IOException when fails to read the file.
* @return the sparse dataset.
*/
static SparseDataset libsvm(BufferedReader reader) throws IOException {
try (reader) {
List<SampleInstance<SparseArray>> data = new ArrayList<>();
String line = reader.readLine();
while (line != null) {
String[] tokens = line.trim().split("\\s+");
int y = Integer.parseInt(tokens[0]);
SparseArray row = new SparseArray();
for (int k = 1; k < tokens.length; k++) {
String[] pair = tokens[k].split(":");
if (pair.length != 2) {
throw new NumberFormatException("Invalid token: " + tokens[k]);
}
int j = Integer.parseInt(pair[0]) - 1; // convert one-based libsvm index to zero-based
double x = Double.parseDouble(pair[1]);
row.set(j, x);
}
data.add(new SampleInstance<>(row, y));
line = reader.readLine();
}
return SparseDataset.of(data);
}
}
}