data.CsvData Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of java-timeseries Show documentation
Time Series Analysis in Java
The newest version!
/*
 * Copyright (c) 2016 Jacob Rachiele
 * 
 */
package data;

import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVParser;
import org.apache.commons.csv.CSVRecord;

import java.io.File;
import java.io.IOException;
import java.net.URL;
import java.nio.charset.Charset;
import java.time.LocalDateTime;
import java.time.OffsetDateTime;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

/**
 * Data from a CSV file.
 *
 * @author Jacob Rachiele
 */
final class CsvData {

    private final File csvFile;
    private final List parsedRecords;
    private final List header;

    /**
     * Create a new CSV data object using the given file path. This constructor assumes no header row is present.
     *
     * @param csvFilePath the path to the file.
     */
    CsvData(final String csvFilePath) {
        this(csvFilePath, false);
    }

    /**
     * Create a new CSV data object using the given file path and header flag.
     *
     * @param csvFilePath the path to the file.
     * @param headerRow   indicates whether the file contains a header row.
     */
    CsvData(final String csvFilePath, final boolean headerRow) {
        URL resource = getClass().getClassLoader().getResource(csvFilePath);
        if (resource == null) {
            throw new IllegalArgumentException(
                    "The resource given by " + csvFilePath + " could not be found on the file system.");
        } else {
            this.csvFile = new File(resource.getFile());
        }
        final CSVParser parser = getParser(csvFile);
        List records;
        if (parser == null) {
            throw new IllegalArgumentException(
                    "The resource given by " + csvFilePath + " could not be found on the file system.");
        } else {
            try {
                records = parser.getRecords();
            } catch (IOException ie) {
                throw new RuntimeException("The csv parser failed to retrieve the records.");
            } finally {
                try {
                    parser.close();
                } catch (IOException ie) {
                    ie.printStackTrace();
                }
            }
            // TODO: Add method for large csv files where we don't read all records at once.
            if (headerRow) {
                this.header = new ArrayList<>();
                CSVRecord headerRecord = records.remove(0);
                for (String s : headerRecord) {
                    this.header.add(s);
                }
            } else {
                this.header = Collections.emptyList();
            }
            this.parsedRecords = records;
        }
    }

    private CSVParser getParser(final File csvFile) {
        try {
            return CSVParser.parse(csvFile, Charset.defaultCharset(), CSVFormat.DEFAULT);
        } catch (IOException ie) {
            ie.printStackTrace();
            System.out.println("Error parsing csv file. Path : " + csvFile.getAbsolutePath());
            return null;
        }
    }

    /**
     * The csv file.
     *
     * @return the csv file.
     */
    File csvFile() {
        return this.csvFile;
    }

    /**
     * The data row-by-row.
     *
     * @return the data row-by-row.
     */
    List getRows() {
        return this.parsedRecords;
    }

    /**
     * The ith row of data.
     *
     * @param i the index of the row.
     * @return the ith row of data.
     */
    CSVRecord getRow(final int i) {
        return this.parsedRecords.get(i);
    }

    /**
     * The ith column of data.
     *
     * @param i the index of the column.
     * @return the ith column of data.
     */
    List getColumn(final int i) {
        List column = new ArrayList<>(this.parsedRecords.size());
        for (CSVRecord record : parsedRecords) {
            column.add(record.get(i));
        }
        return column;
    }

    /**
     * The header row.
     *
     * @return the header row.
     */
    List getHeader() {
        return Collections.unmodifiableList(this.header);
    }

    /**
     * The data column-by-column.
     *
     * @return the data column-by-column.
     */
    List> getColumns() {
        int nCols = this.parsedRecords.get(0).size();
        List> columns = new ArrayList<>(nCols);
        List column;
        for (int i = 0; i < nCols; i++) {
            column = new ArrayList<>(this.parsedRecords.size());
            for (CSVRecord row : this.parsedRecords) {
                column.add(row.get(i));
            }
            columns.add(column);
        }
        return columns;
        //return Collections.unmodifiableList(columns);
    }

    /**
     * The column with the given name.
     *
     * @param columnName the name of the column.
     * @return the column with the given name.
     */
    List getColumn(final String columnName) {
        List headerRow = getHeader();
        for (int i = 0; i < headerRow.size(); i++) {
            if (headerRow.get(i).equals(columnName)) {
                return getColumn(i);
            }
        }
        throw new IllegalArgumentException("A column with the name " + columnName + " was not found.");
    }

    /**
     * Create a new dataframe with the column types set according to the given schema.
     *
     * @param schema a list of data types for each column in the dataframe.
     * @return a new dataframe with the column types set according to the given schema.
     */
    DataFrame createDataFrame(Schema schema) {
        List> columns = getColumns();
        if (schema.size() != columns.size()) {
            throw new IllegalArgumentException(
                    "The schema size was " + schema.size() + " but the number of columns was " + columns.size() +
                    ". There must be one data type for each column.");
        }
        DataFrame df = new DataFrame(schema.size());
        int colId = 0;
        String stringColId;
        DataType type;
        List column;
        List header = this.getHeader();
        for (int i = 0; i < schema.size(); i++) {
            type = schema.get(i);
            stringColId = (header.isEmpty()) ? Integer.toString(colId++) : header.get(i);
            column = columns.get(i);
            switch (type) {
                case DOUBLE:
                    Double[] doubles = Types.toDoubleArray(column);
                    df.add(stringColId, Double[].class, doubles);
                    break;
                case BOOLEAN:
                    Boolean[] booleans = Types.toBooleanArray(column);
                    df.add(stringColId, Boolean[].class, booleans);
                    break;
                case LOCAL_DATE_TIME:
                    LocalDateTime[] localDateTimes = Types.toLocalDateTimeArray(column);
                    df.add(stringColId, LocalDateTime[].class, localDateTimes);
                    break;
                case OFFSET_DATE_TIME:
                    OffsetDateTime[] offsetDateTimes = Types.toOffsetDateTimeArray(column);
                    df.add(stringColId, OffsetDateTime[].class, offsetDateTimes);
                    break;
                case STRING:
                    df.add(stringColId, String[].class, Types.toStringArray(column));
                    break;
                default:
                    df.add(stringColId, String[].class, Types.toStringArray(column));
            }
        }
        return df;
    }

    /**
     * Create a dataframe from this csv data with each column type set to String.
     *
     * @return a new dataframe from this csv data with each column type set to String.
     */
    DataFrame createDataFrame() {
        List> columns = getColumns();
        DataFrame df = new DataFrame(columns.size());
        int colId = 0;
        String stringColId;
        List column;
        List header = this.getHeader();
        for (int i = 0; i < columns.size(); i++) {
            stringColId = (header.isEmpty()) ? Integer.toString(colId++) : header.get(i);
            column = columns.get(i);
            df.add(stringColId, String[].class, Types.toStringArray(column));
        }
        return df;
    }


}