repicea.stats.data.DataSet Maven / Gradle / Ivy
Show all versions of repicea-mathstats Show documentation
/*
* This file is part of the repicea-mathstats library.
*
* Copyright (C) 2009-2019 Mathieu Fortin for Rouge-Epicea
* Copyright (C) 2024 His Majesty the King in Right of Canada
* Author: Mathieu Fortin, Canadian Forest Service
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 3 of the License, or (at your option) any later version.
*
* This library is distributed with the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied
* warranty of MERCHANTABILITY or FITNESS FOR A
* PARTICULAR PURPOSE. See the GNU Lesser General Public
* License for more details.
*
* Please see the license at http://www.gnu.org/copyleft/lesser.html.
*/
package repicea.stats.data;
import java.io.IOException;
import java.security.InvalidParameterException;
import java.text.NumberFormat;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import com.fasterxml.jackson.annotation.JsonIgnore;
import repicea.data.Record;
import repicea.data.Table;
import repicea.gui.REpiceaUIObject;
import repicea.gui.components.REpiceaTable;
import repicea.gui.components.REpiceaTableModel;
import repicea.io.FormatField;
import repicea.io.FormatReader;
import repicea.io.FormatWriter;
import repicea.io.GExportFieldDetails;
import repicea.io.Saveable;
import repicea.math.Matrix;
/**
* The DataSet class contains Observation instances and implements the methods
* to read a dataset with a FormatReader instance or to write data to file.
* @author Mathieu Fortin - November 2012
*
*/
public class DataSet implements Table, Saveable, REpiceaUIObject {
// Names of the fields (columns), in column order.
// NOTE(review): the type arguments of the collections declared here appear to have been lost
// in this listing -- confirm against the original file.
protected List fieldNames;
// Class registered for each field through setFieldType; refreshed by indexFieldType() and addField().
protected List> fieldTypes;
// The rows of the dataset, one Observation instance per row.
protected List observations;
// Filename received by the constructor and read by load(); null when the no-arg constructor is used.
private final String originalFilename;
// Table component, created on the first call to getUI().
@JsonIgnore
private transient REpiceaTable table;
// Formatters registered through setFormatter, keyed by field index; created on first access.
@JsonIgnore
private Map formatters;
/**
 * Base constructor to which the public constructors delegate.
 * It records the filename and creates the empty collections that hold the field names,
 * the field types and the observations. Nothing is read from the file at this point.
 * @param filename the name of the file the data come from, or null if there is none
 */
protected DataSet(String filename) {
    this.originalFilename = filename;
    // The diamond operator takes the element types from the field declarations.
    fieldNames = new ArrayList<>();
    fieldTypes = new ArrayList<>();
    observations = new ArrayList<>();
}
/**
 * Constructor for reading.
 * The file, typically in a csv format, is read into a List of Observation instances. The
 * type of each field is determined automatically by scanning the values and using type coercion.
 *
 * @param filename the name of the file to be read
 * @param autoLoad true if the file is to be read now, false to only record the filename. Typically,
 * this boolean is set to false when the swingworker is launched from a window that retrieves some events.
 * @throws Exception if an error has occurred during the loading
 */
public DataSet(String filename, boolean autoLoad) throws Exception {
    this(filename);
    if (!autoLoad) {
        return;
    }
    load();
}
/**
 * Constructor for writing.
 *
 * The constructor registers the field names. A name that is already taken receives a numeric
 * suffix so that all the field names remain distinct. Observations can then be added using the
 * {@link DataSet#addObservation(Object[])} method. When all the observations have been added, it is
 * good practice to call the {@link DataSet#indexFieldType()} method to set the field types. The
 * DataSet instance can be saved to file using the {@link DataSet#save(String)} method.
 * @param fieldNames a List of String instances that represent the field names
 */
public DataSet(List<String> fieldNames) {
    this();
    for (String fieldName : fieldNames) {
        addFieldName(fieldName);
    }
}
/**
 * Create an empty DataSet that is not associated with any file.
 * The instance is meant to be populated afterwards, for instance using the addField method.
 */
public DataSet() {
    this((String) null);
}
/**
 * Provide the object stored in the dataset at row i and column j.
 * @param i the index of the row (observation)
 * @param j the index of the column (field)
 * @return an Object instance
 */
protected Object getValueAt(int i, int j) {
    final Object cell = observations.get(i).values.get(j);
    return cell;
}
/**
 * Provide the value stored at row i for the field with the given name.
 * @param i the row index.
 * @param fieldName the name of the field.
 * @return an Object instance, or null if the dataset has no field with this name
 */
public Object getValueAt(int i, String fieldName) {
    final int fieldIndex = getIndexOfThisField(fieldName);
    return fieldIndex == -1 ? null : getValueAt(i, fieldIndex);
}
/**
 * Replace the value stored at row i and column j.
 * The replacement only takes place if the class of the new value is exactly the class registered
 * for field j; otherwise the call leaves the dataset unchanged.
 * @param i the index of the row
 * @param j the index of the column
 * @param value the new value, which must not be null (its class is read)
 */
protected final void setValueAt(int i, int j, Object value) {
    if (value.getClass().equals(fieldTypes.get(j))) {
        // In-place replacement: same result as remove(j) followed by add(j, value),
        // without shifting the remaining elements twice.
        observations.get(i).values.set(j, value);
    }
}
/**
 * Index the different field types.
 * The registered types are discarded, and the appropriate class is then determined again for
 * each field, column by column. This method should be called after adding all the observations.
 */
public void indexFieldType() {
    fieldTypes.clear();
    int fieldIndex = 0;
    while (fieldIndex < fieldNames.size()) {
        setClassOfThisField(fieldIndex);
        fieldIndex++;
    }
}
/**
 * Determine and register the class of a field, then align the stored values with it.
 * A field that only contains Integer instances is an Integer field. Otherwise, a field that only
 * contains Number instances is a Double field and its Integer values are converted to Double.
 * Any other field is a String field and its Number values are converted to String.
 * A field without any observation is registered as an Integer field.
 * @param fieldIndex the index of the field
 */
private void setClassOfThisField(int fieldIndex) {
    if (isInteger(fieldIndex)) {
        setFieldType(fieldIndex, Integer.class);
        return;
    }
    if (isDouble(fieldIndex)) {
        // The type must be registered first: setValueAt only accepts values of the registered class.
        setFieldType(fieldIndex, Double.class);
        reconvertToDoubleIfNeedsBe(fieldIndex);
        return;
    }
    setFieldType(fieldIndex, String.class);
    reconvertToStringIfNeedsBe(fieldIndex);
}
private void setFieldType(int fieldIndex, Class> clazz) {
if (fieldIndex < fieldTypes.size()) {
fieldTypes.set(fieldIndex, clazz);
} else if (fieldIndex == fieldTypes.size()) {
fieldTypes.add(clazz);
} else {
throw new InvalidParameterException("The field type cannot be set!");
}
}
/**
 * Check whether all the values of a field are Integer instances.
 * @param j the index of the field
 * @return true if no value of another type is found, which includes the case of an empty dataset
 */
private boolean isInteger(int j) {
    final int nbRows = getNumberOfObservations();
    for (int row = 0; row < nbRows; row++) {
        if (!(getValueAt(row, j) instanceof Integer)) {
            return false;
        }
    }
    return true;
}
/**
 * Replace the Number instances of a field by their String representation.
 * @param j the index of the field
 */
private void reconvertToStringIfNeedsBe(int j) {
    final int nbRows = getNumberOfObservations();
    for (int row = 0; row < nbRows; row++) {
        final Object current = getValueAt(row, j);
        if (!(current instanceof Number)) {
            continue;
        }
        setValueAt(row, j, current.toString());
    }
}
/**
 * Replace the Integer instances of a field by the equivalent Double instances.
 * @param j the index of the field
 */
private void reconvertToDoubleIfNeedsBe(int j) {
    final int nbRows = getNumberOfObservations();
    for (int row = 0; row < nbRows; row++) {
        final Object current = getValueAt(row, j);
        if (!(current instanceof Integer)) {
            continue;
        }
        // The primitive double is boxed into a Double, the class expected by setValueAt for this field.
        final double widened = ((Integer) current).doubleValue();
        setValueAt(row, j, widened);
    }
}
/**
 * Check whether all the values of a field are Number instances.
 * @param indexJ the index of the field
 * @return true if no value of another type is found, which includes the case of an empty dataset
 */
private boolean isDouble(int indexJ) {
    final int nbRows = getNumberOfObservations();
    for (int row = 0; row < nbRows; row++) {
        if (!(getValueAt(row, indexJ) instanceof Number)) {
            return false;
        }
    }
    return true;
}
/**
 * Provide the index of a particular field.
 * @param fieldName the name of the field
 * @return the position of the name in the list of field names, or -1 if the name is not in the list
 */
public int getIndexOfThisField(String fieldName) {
    return fieldNames.indexOf(fieldName);
}
/**
 * This method sorts the data according to the fields represented by the indices in fieldIndices parameter.
 * NOTE(review): the indices are handed over through Observation.ComparableFields, which is accessed
 * through the class rather than through an instance. Sorting several DataSet instances concurrently
 * would presumably interfere -- confirm against the Observation class.
 * @param fieldIndices a List of field indices
 */
@SuppressWarnings("unchecked")
public void sortObservations(List fieldIndices) {
// The sort keys must be set before sorting.
Observation.ComparableFields = fieldIndices;
Collections.sort(observations);
}
/**
 * Provide the number of observations (rows) currently stored in the dataset.
 * @return an integer
 */
public int getNumberOfObservations() {
    final int nbObservations = observations.size();
    return nbObservations;
}
/**
 * Provide the class registered for the field at a given index.
 * @param i the index of the field
 * @return a Class instance
 */
@SuppressWarnings("rawtypes")
protected Class getFieldTypeOfThisField(int i) {
    return fieldTypes.get(i);
}
/**
 * Provide the class registered for the field with a given name.
 * @param fieldName the name of the field
 * @return a Class instance
 * @throws InvalidParameterException if the dataset has no field with this name
 */
@SuppressWarnings("rawtypes")
protected Class getFieldTypeOfThisField(String fieldName) {
    final int fieldIndex = getIndexOfThisField(fieldName);
    if (fieldIndex != -1) {
        return getFieldTypeOfThisField(fieldIndex);
    }
    throw new InvalidParameterException("Field " + fieldName + " cannot be found in the dataset!");
}
/**
 * Provide the values of a numeric field as a column vector.
 * @param fieldName the name of the field
 * @return a Matrix instance with one row per observation and a single column
 * @throws InvalidParameterException if the dataset has no field with this name
 */
protected Matrix getVectorOfThisField(String fieldName) {
    final int fieldIndex = getIndexOfThisField(fieldName);
    if (fieldIndex == -1) {
        // Same failure mode as getFieldTypeOfThisField(String), instead of an index error further down.
        throw new InvalidParameterException("Field " + fieldName + " cannot be found in the dataset!");
    }
    return getVectorOfThisField(fieldIndex);
}
/**
 * Provide the values of a numeric field as a column vector.
 * Each value is cast to Number, so the call fails with a ClassCastException if the field
 * contains a value of another type.
 * @param j the index of the field
 * @return a Matrix instance with one row per observation and a single column
 */
protected Matrix getVectorOfThisField(int j) {
    final int nbRows = observations.size();
    Matrix columnVector = new Matrix(nbRows, 1);
    for (int row = 0; row < nbRows; row++) {
        final Number cell = (Number) getValueAt(row, j);
        columnVector.setValueAt(row, 0, cell.doubleValue());
    }
    return columnVector;
}
/**
 * Return all possible values in this field.
 * Duplicates are identified using equals, and the distinct values are then sorted in their
 * natural order.
 * @param j the index of the field.
 * @return a SORTED list of all the possible value.
 */
@SuppressWarnings({ "rawtypes", "unchecked" })
protected List getPossibleValuesInThisField(int j) {
    // A hash-based set removes the duplicates in a single pass instead of scanning a list
    // for every observation. The linked variant keeps the order of first appearance.
    Set<Object> distinctValues = new LinkedHashSet<Object>();
    for (int i = 0; i < observations.size(); i++) {
        distinctValues.add(getValueAt(i, j));
    }
    List possibleValues = new ArrayList(distinctValues);
    Collections.sort(possibleValues);
    return possibleValues;
}
protected Matrix getDummyMatrix(List> possibleValues, int fieldIndex) {
Matrix outputMatrix = new Matrix(getNumberOfObservations(), possibleValues.size());
for (int i = 0; i < getNumberOfObservations(); i++) {
int position = possibleValues.indexOf(getValueAt(i, fieldIndex));
if (position >= 0 && position < outputMatrix.m_iCols) {
outputMatrix.setValueAt(i, position, 1d);
}
}
return outputMatrix;
}
/**
 * Add an observation to the list of observations.
 * The values are first parsed into Integer, Double or String instances. This parsing is done
 * in place, which means the content of the array received as argument is modified.
 * @param observationFrame an array of Object instances.
 */
public void addObservation(Object[] observationFrame) {
    parseDifferentFields(observationFrame);
    final Observation newObservation = new Observation(observationFrame);
    observations.add(newObservation);
}
/**
 * Register a new field name.
 * If the name is already taken, the suffixes 0, 1, 2, ... are appended to the original name
 * in turn until a name that is not yet in use is found.
 * @param originalName the requested name
 */
private void addFieldName(String originalName) {
    String candidate = originalName;
    for (int suffix = 0; fieldNames.contains(candidate); suffix++) {
        candidate = originalName + suffix;
    }
    fieldNames.add(candidate);
}
/**
 * Add a field and its values to the DataSet instance.
 * If the dataset already contains observations, the field argument must have as many entries
 * as there are observations and each entry is appended to the matching observation. If the
 * dataset has no observation yet, one observation is created for each entry. The values are
 * stored as received, without going through parseDifferentFields. The type of the new field
 * is determined at the end.
 * @param name the name of the new field.
 * @param field an array of Object instances
 * @throws InvalidParameterException if the size of the array does not match the number of existing observations
 */
public void addField(String name, Object[] field) {
    final boolean hasObservations = !observations.isEmpty();
    if (hasObservations && field.length != observations.size()) {
        throw new InvalidParameterException("The number of observations in the new field does not match the number of observations in the dataset!");
    }
    addFieldName(name);
    if (hasObservations) {
        // The sizes match at this point, so every entry has its observation.
        for (int row = 0; row < field.length; row++) {
            observations.get(row).values.add(field[row]);
        }
    } else {
        for (Object value : field) {
            observations.add(new Observation(new Object[] {value}));
        }
    }
    setClassOfThisField(fieldNames.size() - 1);
}
/**
 * Write the dataset to file.
 * The fields of the file are defined from the field names and the values of the first
 * observation. The observations are then written one record at a time. If the dataset has no
 * observation, no field and no record are passed to the writer.
 * @param filename the name of the output file
 * @throws IOException if the writer cannot be created or an error occurs while writing
 */
@Override
public void save(String filename) throws IOException {
    FormatWriter<?> writer = FormatWriter.createFormatWriter(false, filename);
    try {
        final int nbFields = fieldNames.size();
        for (int i = 0; i < observations.size(); i++) {
            Object[] record = new Object[nbFields];
            for (int j = 0; j < nbFields; j++) {
                record[j] = getValueAt(i, j);
            }
            if (i == 0) {
                // The first observation provides the sample values used to define the fields.
                List<FormatField> headerFields = new ArrayList<FormatField>();
                for (int j = 0; j < nbFields; j++) {
                    GExportFieldDetails exportField = new GExportFieldDetails(fieldNames.get(j), record[j]);
                    headerFields.add(writer.convertGExportFieldDetailsToFormatField(exportField));
                }
                writer.setFields(headerFields);
            }
            writer.addRecord(record);
        }
    } finally {
        // The writer is closed even if a record cannot be written, and the exception
        // reaches the caller instead of being printed and dropped.
        writer.close();
    }
}
/**
 * Read the file whose name was given to the constructor.
 * The current field names and observations are discarded. The field names are then taken from
 * the header of the file, every record is added as an observation and the field types are
 * indexed at the end.
 * @throws Exception if the file cannot be opened or read
 */
private void load() throws Exception {
    fieldNames.clear();
    observations.clear();
    // Reading errors are propagated to the caller, as documented on the reading constructor,
    // instead of being printed and dropped.
    FormatReader<?> reader = FormatReader.createFormatReader(originalFilename);
    // NOTE(review): the reader is never released here; if FormatReader exposes a close()
    // method it should presumably be called in a finally block -- confirm against its API.
    final int nbFields = reader.getHeader().getNumberOfFields();
    for (int i = 0; i < nbFields; i++) {
        fieldNames.add(reader.getHeader().getField(i).getName());
    }
    for (Object[] lineRead = reader.nextRecord(); lineRead != null; lineRead = reader.nextRecord()) {
        addObservation(lineRead);
    }
    indexFieldType();
}
/**
 * Convert the values of a record in place.
 * Each of the first n entries, where n is the number of fields, is replaced by an Integer if
 * its String representation can be parsed as an integer, otherwise by a Double if it can be
 * parsed as a double, and otherwise by the String representation itself.
 * @param lineRead the record, whose entries are overwritten
 */
private void parseDifferentFields(Object[] lineRead) {
    final int nbFields = fieldNames.size();
    for (int i = 0; i < nbFields; i++) {
        final String text = lineRead[i].toString();
        Object parsed;
        try {
            parsed = Integer.valueOf(text);
        } catch (NumberFormatException notAnInteger) {
            try {
                parsed = Double.valueOf(text);
            } catch (NumberFormatException notADouble) {
                parsed = text;
            }
        }
        lineRead[i] = parsed;
    }
}
/**
 * Provide the names of the fields.
 * @return a new List that contains the field names; changes to this list do not affect the dataset
 */
@Override
public List getFieldNames() {
    return new ArrayList<>(this.fieldNames);
}
@Override
public List> getFieldTypes() {
List> fieldTypes = new ArrayList>();
fieldTypes.addAll(this.fieldTypes);
return fieldTypes;
}
/**
 * Provide the table that displays this dataset.
 * The table is created on the first call and the same instance is returned afterwards.
 * @return a REpiceaTable instance
 */
@JsonIgnore
@Override
public REpiceaTable getUI() {
    if (table != null) {
        return table;
    }
    final REpiceaTableModel tableModel = new REpiceaTableModel(this);
    table = new REpiceaTable(tableModel, false); // no pop up
    return table;
}
/**
 * Check whether the table that displays this dataset is visible.
 * @return true if the table has been created and is visible, false otherwise
 */
@JsonIgnore
@Override
public boolean isVisible() {
    // The table only exists once getUI() has been called; before that nothing can be visible.
    return table != null && table.isVisible();
}
/**
 * Provide the observations of the DataSet instance.
 * The list returned is the one held by the dataset, not a copy.
 * @return a List of Observation instances
 */
public List getObservations() {
    return this.observations;
}
/**
 * Pad a string with spaces so that it fills a column of the text display.
 * A number is right-aligned within desiredLength characters whereas any other value is
 * left-aligned. Trailing spaces are then added until the result is desiredLength + buffer
 * characters long. The string is never truncated: if it is already that long, it is returned as is.
 * @param str the string to be padded
 * @param desiredLength the width of the column
 * @param buffer the number of extra spaces that separate this column from the next one
 * @param isNumber true to align the string to the right
 * @return the padded string
 */
private static String addSpacesUpTo(String str, int desiredLength, int buffer, boolean isNumber) {
    final int alignmentGap = desiredLength - str.length();
    final StringBuilder padded = new StringBuilder();
    if (isNumber) {
        for (int k = 0; k < alignmentGap; k++) {
            padded.append(' ');
        }
        padded.append(str);
    } else {
        padded.append(str);
        for (int k = 0; k < alignmentGap; k++) {
            padded.append(' ');
        }
    }
    final int fullWidth = desiredLength + buffer;
    while (padded.length() < fullWidth) {
        padded.append(' ');
    }
    return padded.toString();
}
/**
 * Provide the map of the formatters, which is created on the first call.
 * @return a Map instance, never null
 */
@JsonIgnore
private Map getFormatters() {
    if (formatters != null) {
        return formatters;
    }
    formatters = new HashMap<>();
    return formatters;
}
/**
 * Provide the formatter registered for a particular field.
 * @param j the index of the field
 * @return a NumberFormat instance or null if no formatter has been set for this field
 */
@JsonIgnore
private NumberFormat getFormatter(int j) {
    return getFormatters().get(j);
}
/**
 * Associate a NumberFormat instance with a particular field.
 * The formatter is applied to the Double values of the field when the DataSet instance is
 * displayed through the {@link DataSet#toString()} method.
 * @param fieldId the id of the field
 * @param formatter a NumberFormat instance
 * @throws InvalidParameterException if the id is negative or not smaller than the number of fields
 */
public void setFormatter(int fieldId, NumberFormat formatter) {
    final boolean isValidId = fieldId >= 0 && fieldId < this.getFieldNames().size();
    if (!isValidId) {
        throw new InvalidParameterException("The fieldId argument must be positive (>= 0) and smaller than the number of fields!");
    }
    getFormatters().put(fieldId, formatter);
}
/**
 * Remove all the NumberFormat instances that were associated with the fields.
 */
public void clearFormatters() {
    getFormatters().clear();
}
/**
 * Provide a String representation of the DataSet instance.
 * The representation is a table with one line for the field names followed by one line per
 * observation. The columns are as wide as their longest entry, numbers are aligned to the
 * right and other values to the left. At most 100 observations are printed; if the dataset
 * contains more, a final line reports how many were printed.
 * @return a String instance
 */
@Override
public String toString() {
    final int maxRowsDisplayed = 100;
    final int nbObservations = getNumberOfObservations();
    final boolean exceeds = nbObservations > maxRowsDisplayed;
    final int maxLength = exceeds ? maxRowsDisplayed : nbObservations;

    // getFieldNames() returns a fresh copy on each call, so it is called once only.
    final List<String> names = getFieldNames();
    final int nbFields = names.size();

    // First pass: the width of each column is that of its longest entry, header included.
    int[] maxSizes = new int[nbFields];
    for (int j = 0; j < nbFields; j++) {
        maxSizes[j] = names.get(j).length();
    }
    for (int i = 0; i < maxLength; i++) {
        for (int j = 0; j < nbFields; j++) {
            Object o = getObservations().get(i).getValueAt(j);
            int currentLength = formatCellForDisplay(o, j).length();
            if (maxSizes[j] < currentLength) {
                maxSizes[j] = currentLength;
            }
        }
    }

    // Second pass: header line, then one line per observation.
    StringBuilder output = new StringBuilder();
    for (int j = 0; j < nbFields; j++) {
        output.append(addSpacesUpTo(names.get(j), maxSizes[j], 2, false)); // not number instances
    }
    output.append(System.lineSeparator());
    for (int i = 0; i < maxLength; i++) {
        for (int j = 0; j < nbFields; j++) {
            Object o = getObservations().get(i).getValueAt(j);
            output.append(addSpacesUpTo(formatCellForDisplay(o, j), maxSizes[j], 2, o instanceof Number));
        }
        output.append(System.lineSeparator());
    }
    if (exceeds) {
        output.append("Only " + maxLength + " out of " + nbObservations + " observations printed!");
    }
    return output.toString();
}

/**
 * Convert a value into the text displayed by the toString method.
 * A Double is formatted with the formatter registered for its field, if there is one.
 * Any other value is converted through String.valueOf, so that a null value is displayed
 * as "null" instead of raising an exception.
 * @param value the value to be displayed, possibly null
 * @param j the index of the field the value belongs to
 * @return a String instance
 */
private String formatCellForDisplay(Object value, int j) {
    if (value instanceof Double) {
        NumberFormat formatter = getFormatter(j);
        if (formatter != null) {
            return formatter.format((Double) value);
        }
    }
    return String.valueOf(value);
}
/**
* Return a list of the values in a particular field.
* @param i the field id, that is an integer
* @return a List of Object instance
*/
public List