All Downloads are FREE. Search and download functionalities are using the official Maven repository.

dataloader.CsvDataLoader Maven / Gradle / Ivy

The newest version!
package dataloader;

import base.Configuration;
import dataloader.utils.ParseUtils;
import datasets.VectorDouble;
import datastructs.IVector;
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVParser;
import org.apache.commons.csv.CSVRecord;
import tech.tablesaw.api.Table;
import tech.tablesaw.columns.Column;
import tech.tablesaw.io.csv.CsvReadOptions;

import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;


/**
 * Class that loads data from a
 * specific csv file
 * and creates datasets in two formats
 * HashMap and Table from tablesaw.
 */

public class CsvDataLoader {


    public static class MapLoader {


        /**
         * Create a NumericsSample from the given column in the given Map
         *
         * @param dataSet The name of the data set
         * @param colName The column name
         * @return A numeric sample
         */
        public static IVector buildNumericSample(Map> dataSet, String colName) {

            if (dataSet == null) {

                throw new IllegalArgumentException("Null data set given");
            }

            VectorDouble numericSample;

            if (!dataSet.containsKey(colName)) {

                if (Configuration.ENABLE_WARNINGS) {
                    Configuration.Logging.printWarning("Column " + colName + " not in dataset");
                }

                numericSample = new VectorDouble(0);

            } else {

                List data = ParseUtils.parseAsDouble(dataSet.get(colName));
                numericSample = new VectorDouble(data);
            }
            return numericSample;
        }




        /**
         * Simple method that parses data set from a csv file
         * The CSV file should NOT have the last column ending with comma.
         * The CSV column names should NOT have white space
         *
         * @param csvFile A given csv file
         * @return a data set
         * @throws IOException When invalid file is given
         */
        public static Map> parseFile(File csvFile) throws IOException {

            Reader csvData = new FileReader(csvFile);
            Map> dataSet = new HashMap>();


            CSVParser parser = CSVParser.parse(csvData, CSVFormat.DEFAULT);

            int lineCounter = 0;

            ArrayList colNames = new ArrayList();

            for (CSVRecord record : parser) {

                // the first record is the header
                if (lineCounter == 0) {

                    for (String field : record) {

                        if (dataSet.containsKey(field)) {

                            if (Configuration.ENABLE_WARNINGS) {
                                Configuration.Logging.printWarning("Column: " + field + " already exists");
                            }
                        } else {
                            // add a new column
                            colNames.add(field);
                            dataSet.put(field, new ArrayList());
                        }
                    }

                    lineCounter++;
                } else {

                    int colCounter = 0;
                    for (String field : record) {

                        String colName = colNames.get(colCounter);
                        dataSet.get(colName).add(field);
                        colCounter++;

                    }
                }

            }
            return dataSet;
        }


        /**
         * Simple method that parses a
         * data set from a csv file
         * without headers
         *
         * @param csvFile A given csv file
         * @return a data set
         * @throws IOException When invalid file is given
         */
        public static Map> parseFile(String csvFile) throws IOException {
            File file = new File(csvFile);
            return MapLoader.parseFile(file);
        }
    }

    /**
     * Handles the loading for tablesaw
     */
    public static class TableLoader {


        /**
         * Reads from csv in file system with columns separated by commas
         * and the file has a header row.
         * Missing values are treated with an indicator.
         * A Table dataSet is returned
         *
         * @param csvFile A given csv file
         * @return A data set
         * @throws IOException When invalid file is given
         */
        public static Table parseFile(String csvFile) throws IOException {

            File file = new File(csvFile);
            return TableLoader.parseFile(file);
        }

        /**
         * Reads from a csv file with columns separated by commas
         * and the file has a header row.
         * Missing values are treated with an indicator.
         * A Table dataSet is returned
         *
         * @param csvFile A given csv file
         * @return A data set
         * @throws IOException When invalid file is given
         */
        public static Table parseFile(File csvFile) throws IOException {

            CsvReadOptions options = CsvReadOptions.builder(csvFile).missingValueIndicator("-").build();
            Table dataSet = Table.read().usingOptions(options);
            return dataSet;
        }


        /**
         * Create a NumericsSample from the given column of the given Map
         *
         * @param dataSet A given data set in Tablesaw format
         * @param colName The name of the column
         * @return A numeric sample
         */
        public static IVector buildNumericSample(Table dataSet, String colName) {

            if (dataSet == null) {

                throw new IllegalArgumentException("Null data set given");
            }

            VectorDouble sample;
            Column col = dataSet.column(colName);

            if (col == null) {

                if (Configuration.ENABLE_WARNINGS) {
                    Configuration.Logging.printWarning("Column " + colName + " not in dataset");
                }

                sample = new VectorDouble(0);
            }
            else {

                List data = ParseUtils.parseAsDouble(col);
                sample = new VectorDouble(data);
            }

            return sample;
        }
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy