// edu.pitt.dbmi.data.reader.tabular.MixedTabularDatasetFileReader (Maven / Gradle / Ivy)
/*
* Copyright (C) 2019 University of Pittsburgh.
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
* MA 02110-1301 USA
*/
package edu.pitt.dbmi.data.reader.tabular;
import edu.pitt.dbmi.data.reader.*;
import java.io.IOException;
import java.nio.file.Path;
import java.util.Arrays;
import java.util.Collections;
import java.util.Set;
/**
 * Reads a delimited tabular dataset file and returns its contents as
 * {@link MixedTabularData}, i.e. a dataset whose columns may be continuous,
 * discrete, or a mix of both.
 *
 * <p>
 * Columns are first read as non-discrete. The data reader is then asked to
 * determine which of them are discrete, using the number of discrete
 * categories supplied at construction time. Whatever kind of data the reader
 * produces is finally normalized into a {@link MixedTabularData} instance.
 *
 * <p>
 * Dec 14, 2018 1:54:31 PM
 *
 * @author Kevin V. Bui ([email protected])
 * @version $Id: $Id
 */
public class MixedTabularDatasetFileReader extends DatasetFileReader implements MixedTabularDatasetReader {

    /** Threshold handed to the data reader when it decides which columns are discrete. */
    private final int numberOfDiscreteCategories;

    /** Whether the first data line of the file is a header; defaults to {@code true}. */
    private boolean hasHeader;

    /** Character used to quote values in the file; defaults to the double quote. */
    private char quoteChar;

    /**
     * Constructor.
     *
     * @param dataFile The data file.
     * @param delimiter The delimiter.
     * @param numberOfDiscreteCategories The number of discrete categories.
     */
    public MixedTabularDatasetFileReader(Path dataFile, Delimiter delimiter, int numberOfDiscreteCategories) {
        super(dataFile, delimiter);
        this.numberOfDiscreteCategories = numberOfDiscreteCategories;
        this.hasHeader = true;
        this.quoteChar = '"';
    }

    /**
     * {@inheritDoc}
     *
     * Reads in the data without excluding any column.
     */
    @Override
    public Data readInData() throws IOException {
        // Collections.emptySet() is the type-safe replacement for the raw EMPTY_SET constant.
        return readInData(Collections.emptySet());
    }

    /**
     * {@inheritDoc}
     *
     * Reads in the data, excluding the columns with the given names. When the
     * file has no header, columns are generated with an empty exclusion list,
     * so the given names have no effect.
     */
    @Override
    public Data readInData(Set namesOfColumnsToExclude) throws IOException {
        TabularColumnReader columnReader = newConfiguredColumnReader();

        // Every column starts out as non-discrete; discreteness is determined later from the data.
        final boolean isDiscrete = false;
        DataColumn[] dataColumns = this.hasHeader
                ? columnReader.readInDataColumns(namesOfColumnsToExclude, isDiscrete)
                : columnReader.generateColumns(new int[0], isDiscrete);

        return readAndConvert(dataColumns);
    }

    /**
     * {@inheritDoc}
     *
     * Reads in the data, excluding the columns identified by the given column
     * numbers. The same exclusion list is used whether or not the file has a
     * header.
     */
    @Override
    public Data readInData(int[] columnsToExclude) throws IOException {
        TabularColumnReader columnReader = newConfiguredColumnReader();

        // Every column starts out as non-discrete; discreteness is determined later from the data.
        final boolean isDiscrete = false;
        DataColumn[] dataColumns = this.hasHeader
                ? columnReader.readInDataColumns(columnsToExclude, isDiscrete)
                : columnReader.generateColumns(columnsToExclude, isDiscrete);

        return readAndConvert(dataColumns);
    }

    /**
     * Creates a column reader for this reader's file and delimiter, configured
     * with the current comment marker and quote character.
     *
     * @return the configured column reader
     * @throws IOException propagated from the underlying reader, if it raises one
     */
    private TabularColumnReader newConfiguredColumnReader() throws IOException {
        TabularColumnReader columnReader = new TabularColumnFileReader(this.dataFile, this.delimiter);
        columnReader.setCommentMarker(this.commentMarker);
        columnReader.setQuoteCharacter(this.quoteChar);

        return columnReader;
    }

    /**
     * Determines which of the given columns are discrete, reads the data for
     * those columns and converts the result to mixed tabular data.
     *
     * @param dataColumns the columns to read
     * @return the data as mixed tabular data, or {@code null} if the reader
     * produced a kind of data that {@link #toMixedData(Data)} does not handle
     * @throws IOException if reading the file fails
     */
    private Data readAndConvert(DataColumn[] dataColumns) throws IOException {
        TabularDataReader dataReader = new TabularDataFileReader(this.dataFile, this.delimiter);
        dataReader.setCommentMarker(this.commentMarker);
        dataReader.setQuoteCharacter(this.quoteChar);
        dataReader.setMissingDataMarker(this.missingDataMarker);
        dataReader.determineDiscreteDataColumns(dataColumns, this.numberOfDiscreteCategories, this.hasHeader);

        return toMixedData(dataReader.read(dataColumns, this.hasHeader));
    }

    /**
     * Normalizes the given data into {@link MixedTabularData}.
     *
     * <ul>
     * <li>Continuous data is transposed from {@code [row][column]} to
     * {@code [column][row]} and paired with an empty discrete array.</li>
     * <li>Discrete data is used as given, with its row count taken from the
     * length of the first array, and paired with an empty continuous
     * array.</li>
     * <li>Mixed tabular data is copied into a new instance that shares the
     * same columns and arrays.</li>
     * </ul>
     *
     * @param data the data produced by the tabular data reader
     * @return the mixed tabular data, or {@code null} if {@code data} is none
     * of the kinds listed above
     */
    private Data toMixedData(Data data) {
        if (data instanceof ContinuousData continuousData) {
            double[][] contData = continuousData.getData();
            int numOfRows = contData.length;

            // convert to mixed variables
            DiscreteDataColumn[] columns = Arrays.stream(continuousData.getDataColumns())
                    .map(MixedTabularDataColumn::new)
                    .toArray(DiscreteDataColumn[]::new);

            // With zero rows there is no first row to measure; fall back to the
            // number of columns instead of indexing into an empty array.
            int numOfCols = (numOfRows == 0) ? columns.length : contData[0].length;

            // transpose the data
            double[][] vertContData = new double[numOfCols][numOfRows];
            for (int row = 0; row < numOfRows; row++) {
                for (int col = 0; col < numOfCols; col++) {
                    vertContData[col][row] = contData[row][col];
                }
            }

            return new MixedTabularData(numOfRows, columns, vertContData, new int[0][0]);
        } else if (data instanceof DiscreteData verticalDiscreteData) {
            int[][] discreteData = verticalDiscreteData.getData();

            // Guard against data without any column before reading the first array's length.
            int numOfRows = (discreteData.length == 0) ? 0 : discreteData[0].length;

            // convert to mixed variables
            DiscreteDataColumn[] columns = Arrays.stream(verticalDiscreteData.getDataColumns())
                    .map(e -> {
                        DiscreteDataColumn column = new MixedTabularDataColumn(e.getDataColumn());
                        e.getCategories().forEach(column::setValue);

                        // Recategorize the NEW column that just received the values,
                        // not the source column the values were copied from.
                        column.recategorize();

                        return column;
                    }).toArray(DiscreteDataColumn[]::new);

            return new MixedTabularData(numOfRows, columns, new double[0][0], discreteData);
        } else if (data instanceof MixedTabularData mixedTabularData) {
            DiscreteDataColumn[] columns = mixedTabularData.getDataColumns();
            double[][] continuousData = mixedTabularData.getContinuousData();
            int[][] discreteData = mixedTabularData.getDiscreteData();
            int numOfRows = mixedTabularData.getNumOfRows();

            return new MixedTabularData(numOfRows, columns, continuousData, discreteData);
        } else {
            // Unrecognized kind of data (including null): nothing to convert.
            return null;
        }
    }

    /**
     * {@inheritDoc}
     *
     * Sets whether the data file has a header.
     */
    @Override
    public void setHasHeader(boolean hasHeader) {
        this.hasHeader = hasHeader;
    }

    /**
     * {@inheritDoc}
     *
     * Sets the quote character.
     */
    @Override
    public void setQuoteCharacter(char quoteCharacter) {
        this.quoteChar = quoteCharacter;
    }

}