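// NOTE: The lines below are page text from the Maven repository browser this source was copied from; they are not part of the program.
// All downloads are free. Search and download functionality uses the official Maven repository.
//
// edu.pitt.dbmi.data.reader.tabular.TabularDataFileReader (Maven / Gradle / Ivy)
//
// The newest version!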
/*
 * Copyright (C) 2018 University of Pittsburgh.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
 * MA 02110-1301  USA
 */
package edu.pitt.dbmi.data.reader.tabular;

import edu.pitt.dbmi.data.reader.*;
import edu.pitt.dbmi.data.reader.metadata.ColumnMetadata;
import edu.pitt.dbmi.data.reader.metadata.Metadata;

import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;

/**
 * Nov 15, 2018 5:22:50 PM
 *
 * @author Kevin V. Bui ([email protected])
 * @version $Id: $Id
 */
public final class TabularDataFileReader extends DatasetFileReader implements TabularDataReader {

    /**
     * Constructor.
     *
     * @param dataFile  The data file.
     * @param delimiter The delimiter.
     */
    public TabularDataFileReader(Path dataFile, Delimiter delimiter) {
        super(dataFile, delimiter);
    }

    /**
     * {@inheritDoc}
     * 

* Reads in the data. */ @Override public void determineDiscreteDataColumns(DataColumn[] dataColumns, int numberOfCategories, boolean hasHeader) throws IOException { int numOfColsInDataFile = 0; for (DataColumn dataColumn : dataColumns) { if (!dataColumn.isGenerated()) { numOfColsInDataFile++; } } Set[] columnCategories = new Set[numOfColsInDataFile]; for (int i = 0; i < numOfColsInDataFile; i++) { columnCategories[i] = new HashSet<>(); } try (InputStream in = Files.newInputStream(this.dataFile, StandardOpenOption.READ)) { boolean skipHeader = hasHeader; boolean skip = false; boolean hasSeenNonblankChar = false; boolean hasQuoteChar = false; byte delimChar = this.delimiter.getByteValue(); // comment marker check byte[] comment = this.commentMarker.getBytes(); int cmntIndex = 0; boolean checkForComment = comment.length > 0; int colNum = 0; int lineNum = 1; int columnIndex = 0; int maxCategoryToAdd = numberOfCategories + 1; StringBuilder dataBuilder = new StringBuilder(); byte prevChar = -1; byte[] buffer = new byte[DataFileReader.BUFFER_SIZE]; int len; while ((len = in.read(buffer)) != -1 && !Thread.currentThread().isInterrupted()) { int i = 0; // buffer array index if (skipHeader) { boolean finished = false; for (; i < len && !finished && !Thread.currentThread().isInterrupted(); i++) { byte currChar = buffer[i]; if (currChar == DataFileReader.CARRIAGE_RETURN || currChar == DataFileReader.LINE_FEED) { if (currChar == DataFileReader.LINE_FEED && prevChar == DataFileReader.CARRIAGE_RETURN) { prevChar = DataFileReader.LINE_FEED; continue; } finished = hasSeenNonblankChar && !skip; if (finished) { skipHeader = false; } lineNum++; // reset states skip = false; hasSeenNonblankChar = false; cmntIndex = 0; checkForComment = comment.length > 0; } else if (!skip) { if (currChar > DataFileReader.SPACE_CHAR) { hasSeenNonblankChar = true; } // skip blank chars at the begining of the line if (currChar <= DataFileReader.SPACE_CHAR && !hasSeenNonblankChar) { continue; } // check for 
comment marker to skip line if (checkForComment) { if (currChar == comment[cmntIndex]) { cmntIndex++; if (cmntIndex == comment.length) { skip = true; prevChar = currChar; continue; } } else { checkForComment = false; } } } prevChar = currChar; } } for (; i < len && !Thread.currentThread().isInterrupted(); i++) { byte currChar = buffer[i]; if (currChar == DataFileReader.CARRIAGE_RETURN || currChar == DataFileReader.LINE_FEED) { if (currChar == DataFileReader.LINE_FEED && prevChar == DataFileReader.CARRIAGE_RETURN) { prevChar = DataFileReader.LINE_FEED; continue; } if (hasSeenNonblankChar && !skip) { colNum++; DataColumn dataColumn = dataColumns[columnIndex]; if (dataColumn.getColumnNumber() == colNum) { String value = dataBuilder.toString().trim(); if (!(value.isEmpty() || value.equals(this.missingDataMarker))) { Set categories = columnCategories[columnIndex]; if (categories.size() < maxCategoryToAdd) { categories.add(value); } } columnIndex++; } // ensure we have enough data if (columnIndex < numOfColsInDataFile) { String errMsg = String.format("Insufficient data on line %d. 
Extracted %d value(s) but expected %d.", lineNum, columnIndex, numOfColsInDataFile); // TabularDataFileReader.LOGGER.error(errMsg); throw new DataReaderException(errMsg); } } lineNum++; // clear data dataBuilder.delete(0, dataBuilder.length()); // reset states skip = false; hasSeenNonblankChar = false; cmntIndex = 0; checkForComment = comment.length > 0; columnIndex = 0; colNum = 0; } else if (!skip) { if (currChar > DataFileReader.SPACE_CHAR) { hasSeenNonblankChar = true; } // skip blank chars at the begining of the line if (currChar <= DataFileReader.SPACE_CHAR && !hasSeenNonblankChar) { continue; } // check for comment marker to skip line if (checkForComment) { if (currChar == comment[cmntIndex]) { cmntIndex++; if (cmntIndex == comment.length) { skip = true; prevChar = currChar; continue; } } else { checkForComment = false; } } if (currChar == this.quoteCharacter) { hasQuoteChar = !hasQuoteChar; } else { if (hasQuoteChar) { dataBuilder.append((char) currChar); } else { boolean isDelimiter; if (this.delimiter == Delimiter.WHITESPACE) { isDelimiter = (currChar <= DataFileReader.SPACE_CHAR) && (prevChar > DataFileReader.SPACE_CHAR); } else { isDelimiter = (currChar == delimChar); } if (isDelimiter) { colNum++; DataColumn dataColumn = dataColumns[columnIndex]; if (dataColumn.getColumnNumber() == colNum) { String value = dataBuilder.toString().trim(); if (!(value.isEmpty() || value.equals(this.missingDataMarker))) { Set categories = columnCategories[columnIndex]; if (categories.size() < maxCategoryToAdd) { categories.add(value); } } columnIndex++; if (columnIndex == numOfColsInDataFile) { skip = true; } } // clear data dataBuilder.delete(0, dataBuilder.length()); } else { dataBuilder.append((char) currChar); } } } } prevChar = currChar; } } if (!skipHeader && hasSeenNonblankChar && !skip) { colNum++; DataColumn dataColumn = dataColumns[columnIndex]; if (dataColumn.getColumnNumber() == colNum) { String value = dataBuilder.toString().trim(); if (!(value.isEmpty() || 
value.equals(this.missingDataMarker))) { Set categories = columnCategories[columnIndex]; if (categories.size() < maxCategoryToAdd) { categories.add(value); } } columnIndex++; } // ensure we have enough data if (columnIndex < numOfColsInDataFile) { String errMsg = String.format("Insufficient data on line %d. Extracted %d value(s) but expected %d.", lineNum, columnIndex, numOfColsInDataFile); // TabularDataFileReader.LOGGER.error(errMsg); throw new DataReaderException(errMsg); } } } for (int i = 0; i < numOfColsInDataFile; i++) { dataColumns[i].setDiscrete(columnCategories[i].size() <= numberOfCategories); } } /** * {@inheritDoc} */ @Override public Data read(DataColumn[] dataColumns, boolean hasHeader) throws IOException { if (dataColumns == null) { return null; } int numOfColsInDataFile = 0; boolean isDiscrete = false; boolean isContinuous = false; for (DataColumn dataColumn : dataColumns) { if (dataColumn.isDiscrete()) { isDiscrete = true; } else { isContinuous = true; } if (!dataColumn.isGenerated()) { numOfColsInDataFile++; } } if (isDiscrete && isContinuous) { return readInMixedData(dataColumns, hasHeader, numOfColsInDataFile); } else if (isContinuous) { return readInContinuousData(dataColumns, hasHeader, numOfColsInDataFile); } else if (isDiscrete) { return readInDiscreteData(dataColumns, hasHeader, numOfColsInDataFile); } else { return null; } } /** * {@inheritDoc} */ @Override public Data read(DataColumn[] dataColumns, boolean hasHeader, Metadata metadata) throws IOException { Data data = read(dataColumns, hasHeader); if (metadata != null) { if (data instanceof ContinuousData continuousData) { double[][] contData = continuousData.getData(); metadata.getInterventionalColumns().forEach(column -> { ColumnMetadata valCol = column.getValueColumn(); ColumnMetadata statCol = column.getStatusColumn(); int valColNum = valCol.getColumnNumber() - 1; int statColNum = statCol.getColumnNumber() - 1; double[] val = contData[valColNum]; double[] stat = contData[statColNum]; 
for (int i = 0; i < val.length; i++) { if (Double.isNaN(val[i])) { val[i] = 0.0; stat[i] = 0.0; } else if (dataColumns[statColNum].isGenerated()) { stat[i] = 1.0; } } }); } else if (data instanceof DiscreteData verticalDiscreteData) { int[][] discreteData = verticalDiscreteData.getData(); metadata.getInterventionalColumns().forEach(column -> { ColumnMetadata valCol = column.getValueColumn(); ColumnMetadata statCol = column.getStatusColumn(); int valColNum = valCol.getColumnNumber() - 1; int statColNum = statCol.getColumnNumber() - 1; int[] val = discreteData[valColNum]; int[] stat = discreteData[statColNum]; for (int i = 0; i < val.length; i++) { if (val[i] == DatasetReader.DISCRETE_MISSING_VALUE) { val[i] = 0; stat[i] = 0; } else if (dataColumns[statColNum].isGenerated()) { stat[i] = 1; } } }); } else if (data instanceof MixedTabularData mixedTabularData) { double[][] continuousData = mixedTabularData.getContinuousData(); int[][] discreteData = mixedTabularData.getDiscreteData(); metadata.getInterventionalColumns().forEach(column -> { ColumnMetadata valCol = column.getValueColumn(); ColumnMetadata statCol = column.getStatusColumn(); int valColNum = valCol.getColumnNumber() - 1; int statColNum = statCol.getColumnNumber() - 1; if (valCol.isDiscrete()) { int[] val = discreteData[valColNum]; if (statCol.isDiscrete()) { int[] stat = discreteData[statColNum]; for (int i = 0; i < val.length; i++) { if (val[i] == DatasetReader.DISCRETE_MISSING_VALUE) { val[i] = 0; stat[i] = 0; } else if (dataColumns[statColNum].isGenerated()) { stat[i] = 1; } } } else { double[] stat = continuousData[statColNum]; for (int i = 0; i < val.length; i++) { if (val[i] == DatasetReader.DISCRETE_MISSING_VALUE) { val[i] = 0; stat[i] = 0.0; } else if (dataColumns[statColNum].isGenerated()) { stat[i] = 1.0; } } } } else { double[] val = continuousData[valColNum]; if (statCol.isDiscrete()) { int[] stat = discreteData[statColNum]; for (int i = 0; i < val.length; i++) { if (Double.isNaN(val[i])) { 
val[i] = 0.0; stat[i] = 0; } else if (dataColumns[statColNum].isGenerated()) { stat[i] = 1; } } } else { double[] stat = continuousData[statColNum]; for (int i = 0; i < val.length; i++) { if (Double.isNaN(val[i])) { val[i] = 0.0; stat[i] = 0.0; } else if (dataColumns[statColNum].isGenerated()) { stat[i] = 1.0; } } } } }); } } return data; } private Data readInMixedData(DataColumn[] dataColumns, boolean hasHeader, int numOfColsInDataFile) throws IOException { int numOfCols = dataColumns.length; int numOfRows = hasHeader ? countNumberOfLines() - 1 : countNumberOfLines(); DiscreteDataColumn[] discreteDataColumns = new DiscreteDataColumn[numOfCols]; double[][] continuousData = new double[numOfCols][]; int[][] discreteData = new int[numOfCols][]; for (int i = 0; i < numOfCols; i++) { DataColumn dataColumn = dataColumns[i]; // initialize data if (dataColumn.isDiscrete()) { discreteData[i] = new int[numOfRows]; } else { continuousData[i] = new double[numOfRows]; } // initialize columns discreteDataColumns[i] = new MixedTabularDataColumn(dataColumn); } readInDiscreteCategorizes(discreteDataColumns, hasHeader, numOfColsInDataFile); readInMixedData(discreteDataColumns, hasHeader, continuousData, discreteData, numOfColsInDataFile); return new MixedTabularData(numOfRows, discreteDataColumns, continuousData, discreteData); } private void readInMixedData(DiscreteDataColumn[] dataColumns, boolean hasHeader, double[][] continuousData, int[][] discreteData, int numOfColsInDataFile) throws IOException { int numOfCols = dataColumns.length; try (InputStream in = Files.newInputStream(this.dataFile, StandardOpenOption.READ)) { boolean skipHeader = hasHeader; boolean skip = false; boolean hasSeenNonblankChar = false; boolean hasQuoteChar = false; byte delimChar = this.delimiter.getByteValue(); // comment marker check byte[] comment = this.commentMarker.getBytes(); int cmntIndex = 0; boolean checkForComment = comment.length > 0; int colNum = 0; int lineNum = 1; int columnIndex = 0; int 
row = 0; // array row number int col = 0; // array column number StringBuilder dataBuilder = new StringBuilder(); byte prevChar = -1; byte[] buffer = new byte[DataFileReader.BUFFER_SIZE]; int len; while ((len = in.read(buffer)) != -1 && !Thread.currentThread().isInterrupted()) { int i = 0; // buffer array index if (skipHeader) { boolean finished = false; for (; i < len && !finished && !Thread.currentThread().isInterrupted(); i++) { byte currChar = buffer[i]; if (currChar == DataFileReader.CARRIAGE_RETURN || currChar == DataFileReader.LINE_FEED) { if (currChar == DataFileReader.LINE_FEED && prevChar == DataFileReader.CARRIAGE_RETURN) { prevChar = DataFileReader.LINE_FEED; continue; } finished = hasSeenNonblankChar && !skip; if (finished) { skipHeader = false; } lineNum++; // reset states skip = false; hasSeenNonblankChar = false; cmntIndex = 0; checkForComment = comment.length > 0; } else if (!skip) { if (currChar > DataFileReader.SPACE_CHAR) { hasSeenNonblankChar = true; } // skip blank chars at the begining of the line if (currChar <= DataFileReader.SPACE_CHAR && !hasSeenNonblankChar) { continue; } // check for comment marker to skip line if (checkForComment) { if (currChar == comment[cmntIndex]) { cmntIndex++; if (cmntIndex == comment.length) { skip = true; prevChar = currChar; continue; } } else { checkForComment = false; } } } prevChar = currChar; } } for (; i < len && !Thread.currentThread().isInterrupted(); i++) { byte currChar = buffer[i]; if (currChar == DataFileReader.CARRIAGE_RETURN || currChar == DataFileReader.LINE_FEED) { if (currChar == DataFileReader.LINE_FEED && prevChar == DataFileReader.CARRIAGE_RETURN) { prevChar = DataFileReader.LINE_FEED; continue; } if (hasSeenNonblankChar && !skip) { colNum++; DiscreteDataColumn discreteDataColumn = dataColumns[columnIndex]; DataColumn dataColumn = discreteDataColumn.getDataColumn(); if (dataColumn.getColumnNumber() == colNum) { String value = dataBuilder.toString().trim(); if (dataColumn.isDiscrete()) { if 
(value.isEmpty() || value.equals(this.missingDataMarker)) { discreteData[col++][row] = DatasetReader.DISCRETE_MISSING_VALUE; } else { discreteData[col++][row] = discreteDataColumn.getEncodeValue(value); } } else { if (value.isEmpty() || value.equals(this.missingDataMarker)) { continuousData[col++][row] = DatasetReader.CONTINUOUS_MISSING_VALUE; } else { try { continuousData[col++][row] = Double.parseDouble(value); } catch (NumberFormatException exception) { String errMsg = String.format("Invalid number %s on line %d at column %d.", value, lineNum, colNum); // TabularDataFileReader.LOGGER.error(errMsg, exception); throw new DataReaderException(errMsg); } } } columnIndex++; } // ensure we have enough data if (columnIndex < numOfColsInDataFile) { String errMsg = String.format("Insufficient data on line %d. Extracted %d value(s) but expected %d.", lineNum, columnIndex, numOfColsInDataFile); throw new DataReaderException(errMsg); } row++; } lineNum++; // clear data dataBuilder.delete(0, dataBuilder.length()); // reset states skip = false; hasSeenNonblankChar = false; cmntIndex = 0; checkForComment = comment.length > 0; columnIndex = 0; colNum = 0; col = 0; } else if (!skip) { if (currChar > DataFileReader.SPACE_CHAR) { hasSeenNonblankChar = true; } // skip blank chars at the begining of the line if (currChar <= DataFileReader.SPACE_CHAR && !hasSeenNonblankChar) { continue; } // check for comment marker to skip line if (checkForComment) { if (currChar == comment[cmntIndex]) { cmntIndex++; if (cmntIndex == comment.length) { skip = true; prevChar = currChar; continue; } } else { checkForComment = false; } } if (currChar == this.quoteCharacter) { hasQuoteChar = !hasQuoteChar; } else { if (hasQuoteChar) { dataBuilder.append((char) currChar); } else { boolean isDelimiter; if (this.delimiter == Delimiter.WHITESPACE) { isDelimiter = (currChar <= DataFileReader.SPACE_CHAR) && (prevChar > DataFileReader.SPACE_CHAR); } else { isDelimiter = (currChar == delimChar); } if 
(isDelimiter) { colNum++; DiscreteDataColumn discreteDataColumn = dataColumns[columnIndex]; DataColumn dataColumn = discreteDataColumn.getDataColumn(); if (dataColumn.getColumnNumber() == colNum) { String value = dataBuilder.toString().trim(); if (dataColumn.isDiscrete()) { if (value.isEmpty() || value.equals(this.missingDataMarker)) { discreteData[col++][row] = DatasetReader.DISCRETE_MISSING_VALUE; } else { discreteData[col++][row] = discreteDataColumn.getEncodeValue(value); } } else { if (value.isEmpty() || value.equals(this.missingDataMarker)) { continuousData[col++][row] = DatasetReader.CONTINUOUS_MISSING_VALUE; } else { try { continuousData[col++][row] = Double.parseDouble(value); } catch (NumberFormatException exception) { String errMsg = String.format("Invalid number %s on line %d at column %d.", value, lineNum, colNum); // TabularDataFileReader.LOGGER.error(errMsg, exception); throw new DataReaderException(errMsg); } } } columnIndex++; if (columnIndex == numOfCols) { row++; skip = true; } } // clear data dataBuilder.delete(0, dataBuilder.length()); } else { dataBuilder.append((char) currChar); } } } } prevChar = currChar; } } if (!skipHeader && hasSeenNonblankChar && !skip) { colNum++; DiscreteDataColumn discreteDataColumn = dataColumns[columnIndex]; DataColumn dataColumn = discreteDataColumn.getDataColumn(); if (dataColumn.getColumnNumber() == colNum) { String value = dataBuilder.toString().trim(); if (dataColumn.isDiscrete()) { if (value.isEmpty() || value.equals(this.missingDataMarker)) { discreteData[col++][row] = DatasetReader.DISCRETE_MISSING_VALUE; } else { discreteData[col++][row] = discreteDataColumn.getEncodeValue(value); } } else { if (value.isEmpty() || value.equals(this.missingDataMarker)) { continuousData[col++][row] = DatasetReader.CONTINUOUS_MISSING_VALUE; } else { try { continuousData[col++][row] = Double.parseDouble(value); } catch (NumberFormatException exception) { String errMsg = String.format("Invalid number %s on line %d at column 
%d.", value, lineNum, colNum); // TabularDataFileReader.LOGGER.error(errMsg, exception); throw new DataReaderException(errMsg); } } } columnIndex++; } // ensure we have enough data if (columnIndex < numOfColsInDataFile) { String errMsg = String.format("Insufficient data on line %d. Extracted %d value(s) but expected %d.", lineNum, columnIndex, numOfColsInDataFile); throw new DataReaderException(errMsg); } } } } private Data readInContinuousData(DataColumn[] dataColumns, boolean hasHeader, int numOfColsInDataFile) throws IOException { int numOfCols = dataColumns.length; int numOfRows = hasHeader ? countNumberOfLines() - 1 : countNumberOfLines(); double[][] data = new double[numOfRows][numOfCols]; try (InputStream in = Files.newInputStream(this.dataFile, StandardOpenOption.READ)) { boolean skipHeader = hasHeader; boolean skip = false; boolean hasSeenNonblankChar = false; boolean hasQuoteChar = false; byte delimChar = this.delimiter.getByteValue(); // comment marker check byte[] comment = this.commentMarker.getBytes(); int cmntIndex = 0; boolean checkForComment = comment.length > 0; int colNum = 0; int lineNum = 1; int columnIndex = 0; int row = 0; // array row number int col = 0; // array column number StringBuilder dataBuilder = new StringBuilder(); byte prevChar = -1; byte[] buffer = new byte[DataFileReader.BUFFER_SIZE]; int len; while ((len = in.read(buffer)) != -1 && !Thread.currentThread().isInterrupted()) { int i = 0; // buffer array index if (skipHeader) { boolean finished = false; for (; i < len && !finished && !Thread.currentThread().isInterrupted(); i++) { byte currChar = buffer[i]; if (currChar == DataFileReader.CARRIAGE_RETURN || currChar == DataFileReader.LINE_FEED) { if (currChar == DataFileReader.LINE_FEED && prevChar == DataFileReader.CARRIAGE_RETURN) { prevChar = DataFileReader.LINE_FEED; continue; } finished = hasSeenNonblankChar && !skip; if (finished) { skipHeader = false; } lineNum++; // reset states skip = false; hasSeenNonblankChar = false; 
cmntIndex = 0; checkForComment = comment.length > 0; } else if (!skip) { if (currChar > DataFileReader.SPACE_CHAR) { hasSeenNonblankChar = true; } // skip blank chars at the begining of the line if (currChar <= DataFileReader.SPACE_CHAR && !hasSeenNonblankChar) { continue; } // check for comment marker to skip line if (checkForComment) { if (currChar == comment[cmntIndex]) { cmntIndex++; if (cmntIndex == comment.length) { skip = true; prevChar = currChar; continue; } } else { checkForComment = false; } } } prevChar = currChar; } } for (; i < len && !Thread.currentThread().isInterrupted(); i++) { byte currChar = buffer[i]; if (currChar == DataFileReader.CARRIAGE_RETURN || currChar == DataFileReader.LINE_FEED) { if (currChar == DataFileReader.LINE_FEED && prevChar == DataFileReader.CARRIAGE_RETURN) { prevChar = DataFileReader.LINE_FEED; continue; } if (hasSeenNonblankChar && !skip) { colNum++; DataColumn dataColumn = dataColumns[columnIndex]; if (dataColumn.getColumnNumber() == colNum) { String value = dataBuilder.toString().trim(); if (value.isEmpty() || value.equals(this.missingDataMarker)) { data[row][col++] = DatasetReader.CONTINUOUS_MISSING_VALUE; } else { try { data[row][col++] = Double.parseDouble(value); } catch (NumberFormatException exception) { String errMsg = String.format("Non-continuous number %s on line %d at column %d.", value, lineNum, colNum); // TabularDataFileReader.LOGGER.error(errMsg, exception); throw new DataReaderException(errMsg); } } columnIndex++; } // ensure we have enough data if (columnIndex < numOfColsInDataFile) { String errMsg = String.format("Insufficient data on line %d. 
Extracted %d value(s) but expected %d.", lineNum, columnIndex, numOfColsInDataFile); // TabularDataFileReader.LOGGER.error(errMsg); throw new DataReaderException(errMsg); } row++; } lineNum++; // clear data dataBuilder.delete(0, dataBuilder.length()); // reset states skip = false; hasSeenNonblankChar = false; cmntIndex = 0; checkForComment = comment.length > 0; columnIndex = 0; colNum = 0; col = 0; } else if (!skip) { if (currChar > DataFileReader.SPACE_CHAR) { hasSeenNonblankChar = true; } // skip blank chars at the begining of the line if (currChar <= DataFileReader.SPACE_CHAR && !hasSeenNonblankChar) { continue; } // check for comment marker to skip line if (checkForComment) { if (currChar == comment[cmntIndex]) { cmntIndex++; if (cmntIndex == comment.length) { skip = true; prevChar = currChar; continue; } } else { checkForComment = false; } } if (currChar == this.quoteCharacter) { hasQuoteChar = !hasQuoteChar; } else { if (hasQuoteChar) { dataBuilder.append((char) currChar); } else { boolean isDelimiter; if (this.delimiter == Delimiter.WHITESPACE) { isDelimiter = (currChar <= DataFileReader.SPACE_CHAR) && (prevChar > DataFileReader.SPACE_CHAR); } else { isDelimiter = (currChar == delimChar); } if (isDelimiter) { colNum++; DataColumn dataColumn = dataColumns[columnIndex]; if (dataColumn.getColumnNumber() == colNum) { String value = dataBuilder.toString().trim(); if (value.isEmpty() || value.equals(this.missingDataMarker)) { data[row][col++] = DatasetReader.CONTINUOUS_MISSING_VALUE; } else { try { data[row][col++] = Double.parseDouble(value); } catch (NumberFormatException exception) { String errMsg = String.format("Non-continuous number %s on line %d at column %d.", value, lineNum, colNum); // TabularDataFileReader.LOGGER.error(errMsg, exception); throw new DataReaderException(errMsg); } } columnIndex++; if (columnIndex == numOfCols) { row++; skip = true; } } // clear data dataBuilder.delete(0, dataBuilder.length()); } else { dataBuilder.append((char) currChar); 
} } } } prevChar = currChar; } } if (!skipHeader && hasSeenNonblankChar && !skip) { colNum++; DataColumn dataColumn = dataColumns[columnIndex]; if (dataColumn.getColumnNumber() == colNum) { String value = dataBuilder.toString().trim(); if (value.isEmpty() || value.equals(this.missingDataMarker)) { data[row][col++] = DatasetReader.CONTINUOUS_MISSING_VALUE; } else { try { data[row][col++] = Double.parseDouble(value); } catch (NumberFormatException exception) { String errMsg = String.format("Non-continuous number %s on line %d at column %d.", value, lineNum, colNum); // TabularDataFileReader.LOGGER.error(errMsg, exception); throw new DataReaderException(errMsg); } } columnIndex++; } // ensure we have enough data if (columnIndex < numOfColsInDataFile) { String errMsg = String.format("Insufficient data on line %d. Extracted %d value(s) but expected %d.", lineNum, columnIndex, numOfColsInDataFile); // TabularDataFileReader.LOGGER.error(errMsg); throw new DataReaderException(errMsg); } } } return new ContinuousTabularData(dataColumns, data); } private Data readInDiscreteData(DataColumn[] dataColumns, boolean hasHeader, int numOfColsInDataFile) throws IOException { DiscreteDataColumn[] discreteDataColumns = Arrays.stream(dataColumns) .map(DiscreteTabularDataColumn::new) .toArray(DiscreteDataColumn[]::new); readInDiscreteCategorizes(discreteDataColumns, hasHeader, numOfColsInDataFile); int[][] data = readInDiscreteData(discreteDataColumns, hasHeader, numOfColsInDataFile); return new VerticalDiscreteTabularData(discreteDataColumns, data); } private int[][] readInDiscreteData(DiscreteDataColumn[] dataColumns, boolean hasHeader, int numOfColsInDataFile) throws IOException { int numOfCols = dataColumns.length; int numOfRows = hasHeader ? 
countNumberOfLines() - 1 : countNumberOfLines(); int[][] data = new int[numOfCols][numOfRows]; try (InputStream in = Files.newInputStream(this.dataFile, StandardOpenOption.READ)) { boolean skipHeader = hasHeader; boolean skip = false; boolean hasSeenNonblankChar = false; boolean hasQuoteChar = false; byte delimChar = this.delimiter.getByteValue(); // comment marker check byte[] comment = this.commentMarker.getBytes(); int cmntIndex = 0; boolean checkForComment = comment.length > 0; int colNum = 0; int lineNum = 1; int columnIndex = 0; int row = 0; // array row number int col = 0; // array column number StringBuilder dataBuilder = new StringBuilder(); byte prevChar = -1; byte[] buffer = new byte[DataFileReader.BUFFER_SIZE]; int len; while ((len = in.read(buffer)) != -1 && !Thread.currentThread().isInterrupted()) { int i = 0; // buffer array index if (skipHeader) { boolean finished = false; for (; i < len && !finished && !Thread.currentThread().isInterrupted(); i++) { byte currChar = buffer[i]; if (currChar == DataFileReader.CARRIAGE_RETURN || currChar == DataFileReader.LINE_FEED) { if (currChar == DataFileReader.LINE_FEED && prevChar == DataFileReader.CARRIAGE_RETURN) { prevChar = DataFileReader.LINE_FEED; continue; } finished = hasSeenNonblankChar && !skip; if (finished) { skipHeader = false; } lineNum++; // reset states skip = false; hasSeenNonblankChar = false; cmntIndex = 0; checkForComment = comment.length > 0; } else if (!skip) { if (currChar > DataFileReader.SPACE_CHAR) { hasSeenNonblankChar = true; } // skip blank chars at the begining of the line if (currChar <= DataFileReader.SPACE_CHAR && !hasSeenNonblankChar) { continue; } // check for comment marker to skip line if (checkForComment) { if (currChar == comment[cmntIndex]) { cmntIndex++; if (cmntIndex == comment.length) { skip = true; prevChar = currChar; continue; } } else { checkForComment = false; } } } prevChar = currChar; } } for (; i < len && !Thread.currentThread().isInterrupted(); i++) { byte 
currChar = buffer[i]; if (currChar == DataFileReader.CARRIAGE_RETURN || currChar == DataFileReader.LINE_FEED) { if (currChar == DataFileReader.LINE_FEED && prevChar == DataFileReader.CARRIAGE_RETURN) { prevChar = DataFileReader.LINE_FEED; continue; } if (hasSeenNonblankChar && !skip) { colNum++; DiscreteDataColumn discreteDataColumn = dataColumns[columnIndex]; DataColumn dataColumn = discreteDataColumn.getDataColumn(); if (dataColumn.getColumnNumber() == colNum) { String value = dataBuilder.toString().trim(); if (value.isEmpty() || value.equals(this.missingDataMarker)) { data[col++][row] = DatasetReader.DISCRETE_MISSING_VALUE; } else { data[col++][row] = discreteDataColumn.getEncodeValue(value); } columnIndex++; } // ensure we have enough data if (columnIndex < numOfColsInDataFile) { String errMsg = String.format("Insufficient data on line %d. Extracted %d value(s) but expected %d.", lineNum, columnIndex, numOfColsInDataFile); // TabularDataFileReader.LOGGER.error(errMsg); throw new DataReaderException(errMsg); } row++; } lineNum++; // clear data dataBuilder.delete(0, dataBuilder.length()); // reset states skip = false; hasSeenNonblankChar = false; cmntIndex = 0; checkForComment = comment.length > 0; columnIndex = 0; colNum = 0; col = 0; } else if (!skip) { if (currChar > DataFileReader.SPACE_CHAR) { hasSeenNonblankChar = true; } // skip blank chars at the begining of the line if (currChar <= DataFileReader.SPACE_CHAR && !hasSeenNonblankChar) { continue; } // check for comment marker to skip line if (checkForComment) { if (currChar == comment[cmntIndex]) { cmntIndex++; if (cmntIndex == comment.length) { skip = true; prevChar = currChar; continue; } } else { checkForComment = false; } } if (currChar == this.quoteCharacter) { hasQuoteChar = !hasQuoteChar; } else { if (hasQuoteChar) { dataBuilder.append((char) currChar); } else { boolean isDelimiter; if (this.delimiter == Delimiter.WHITESPACE) { isDelimiter = (currChar <= DataFileReader.SPACE_CHAR) && (prevChar > 
DataFileReader.SPACE_CHAR); } else { isDelimiter = (currChar == delimChar); } if (isDelimiter) { colNum++; DiscreteDataColumn discreteDataColumn = dataColumns[columnIndex]; DataColumn dataColumn = discreteDataColumn.getDataColumn(); if (dataColumn.getColumnNumber() == colNum) { String value = dataBuilder.toString().trim(); if (value.isEmpty() || value.equals(this.missingDataMarker)) { data[col++][row] = DatasetReader.DISCRETE_MISSING_VALUE; } else { data[col++][row] = discreteDataColumn.getEncodeValue(value); } columnIndex++; if (columnIndex == numOfCols) { row++; skip = true; } } // clear data dataBuilder.delete(0, dataBuilder.length()); } else { dataBuilder.append((char) currChar); } } } } prevChar = currChar; } } if (!skipHeader && hasSeenNonblankChar && !skip) { colNum++; DiscreteDataColumn discreteDataColumn = dataColumns[columnIndex]; DataColumn dataColumn = discreteDataColumn.getDataColumn(); if (dataColumn.getColumnNumber() == colNum) { String value = dataBuilder.toString().trim(); if (value.isEmpty() || value.equals(this.missingDataMarker)) { data[col++][row] = DatasetReader.DISCRETE_MISSING_VALUE; } else { data[col++][row] = discreteDataColumn.getEncodeValue(value); } columnIndex++; } // ensure we have enough data if (columnIndex < numOfColsInDataFile) { String errMsg = String.format("Insufficient data on line %d. 
Extracted %d value(s) but expected %d.", lineNum, columnIndex, numOfColsInDataFile); // TabularDataFileReader.LOGGER.error(errMsg); throw new DataReaderException(errMsg); } } } return data; }

    /**
     * Scans the data file once, byte by byte, and feeds every usable value of
     * each discrete column to its {@link DiscreteDataColumn} so that the
     * column can build up its set of categories.
     * <p>
     * Parsing rules implemented below:
     * <ul>
     * <li>CR, LF and CR+LF each end a line; a CR+LF pair is counted once,
     * even when the two bytes arrive in different buffer reads.</li>
     * <li>Blank lines and lines that start (after leading blanks) with the
     * comment marker are ignored.</li>
     * <li>When {@code hasHeader} is true, the first line that is neither
     * blank nor a comment is skipped.</li>
     * <li>Bytes between a pair of quote characters are taken literally
     * (delimiters included); the quote characters themselves are dropped.
     * Line terminators still end the line inside quotes.</li>
     * <li>File columns are numbered from 1 and matched against
     * {@code getColumnNumber()} of the entries of {@code dataColumns}, which
     * are walked in array order.</li>
     * <li>A value is passed to {@code setValue} only when its column reports
     * {@code isDiscrete()} and the trimmed text is neither empty nor equal to
     * the missing-data marker.</li>
     * </ul>
     * After the file has been read, every column whose data column reports
     * {@code isGenerated()} receives the values "0" and "1", and
     * {@code recategorize()} is called on every column.
     * <p>
     * A {@code DataReaderException} is thrown when a data line yields fewer
     * matched columns than {@code numOfColsInDataFile}. It is not listed in
     * the {@code throws} clause (presumably it is unchecked - confirm against
     * its declaration).
     *
     * @param dataColumns         columns to collect category values for
     * @param hasHeader           whether the first non-blank, non-comment
     *                            line is a header to be skipped
     * @param numOfColsInDataFile minimum number of matched columns required
     *                            on each data line (by its name, the number
     *                            of entries of {@code dataColumns} that come
     *                            from the file - confirm against the caller)
     * @throws IOException if the data file cannot be opened or read
     */
    private void readInDiscreteCategorizes(DiscreteDataColumn[] dataColumns, boolean hasHeader, int numOfColsInDataFile) throws IOException {
        int numOfCols = dataColumns.length;
        try (InputStream in = Files.newInputStream(this.dataFile, StandardOpenOption.READ)) {
            // Parser state. All of it lives outside the read loop so that it
            // survives from one buffer refill to the next.
            boolean skipHeader = hasHeader;       // still looking for the end of the header line
            boolean skip = false;                 // ignore the remainder of the current line
            boolean hasSeenNonblankChar = false;  // current line has non-blank content
            boolean hasQuoteChar = false;         // currently inside a quoted section

            byte delimChar = this.delimiter.getByteValue();

            // comment marker check
            // NOTE(review): getBytes() uses the platform default charset.
            byte[] comment = this.commentMarker.getBytes();
            int cmntIndex = 0;                            // number of comment-marker bytes matched so far
            boolean checkForComment = comment.length > 0; // false once the line start stops matching the marker

            int colNum = 0;       // delimiters/line ends seen on the current line; equals the 1-based column just completed
            int lineNum = 1;      // current line number, used in error messages
            int columnIndex = 0;  // index of the next entry of dataColumns to match

            StringBuilder dataBuilder = new StringBuilder(); // text of the value being read
            byte prevChar = -1;

            byte[] buffer = new byte[DataFileReader.BUFFER_SIZE];
            int len;
            while ((len = in.read(buffer)) != -1 && !Thread.currentThread().isInterrupted()) {
                int i = 0; // buffer array index

                // Phase 1: consume bytes until the header line has ended.
                // Blank and comment lines in front of the header are passed
                // over without ending the search.
                if (skipHeader) {
                    boolean finished = false;
                    for (; i < len && !finished && !Thread.currentThread().isInterrupted(); i++) {
                        byte currChar = buffer[i];

                        if (currChar == DataFileReader.CARRIAGE_RETURN || currChar == DataFileReader.LINE_FEED) {
                            // LF directly after CR belongs to the same line terminator
                            if (currChar == DataFileReader.LINE_FEED && prevChar == DataFileReader.CARRIAGE_RETURN) {
                                prevChar = DataFileReader.LINE_FEED;
                                continue;
                            }

                            // only a line with real, non-comment content counts as the header
                            finished = hasSeenNonblankChar && !skip;
                            if (finished) {
                                skipHeader = false;
                            }

                            lineNum++;

                            // reset states
                            skip = false;
                            hasSeenNonblankChar = false;
                            cmntIndex = 0;
                            checkForComment = comment.length > 0;
                        } else if (!skip) {
                            if (currChar > DataFileReader.SPACE_CHAR) {
                                hasSeenNonblankChar = true;
                            }

                            // skip blank chars at the beginning of the line
                            // (prevChar is deliberately left untouched here)
                            if (currChar <= DataFileReader.SPACE_CHAR && !hasSeenNonblankChar) {
                                continue;
                            }

                            // check for comment marker to skip line
                            if (checkForComment) {
                                if (currChar == comment[cmntIndex]) {
                                    cmntIndex++;
                                    if (cmntIndex == comment.length) {
                                        skip = true;
                                        prevChar = currChar;
                                        continue;
                                    }
                                } else {
                                    checkForComment = false;
                                }
                            }
                        }

                        prevChar = currChar;
                    }
                }

                // Phase 2: data lines. Continues at index i, i.e. right after
                // the header if the header ended inside this buffer.
                for (; i < len && !Thread.currentThread().isInterrupted(); i++) {
                    byte currChar = buffer[i];

                    if (currChar == DataFileReader.CARRIAGE_RETURN || currChar == DataFileReader.LINE_FEED) {
                        // LF directly after CR belongs to the same line terminator
                        if (currChar == DataFileReader.LINE_FEED && prevChar == DataFileReader.CARRIAGE_RETURN) {
                            prevChar = DataFileReader.LINE_FEED;
                            continue;
                        }

                        // The line end also terminates the last value of the line.
                        if (hasSeenNonblankChar && !skip) {
                            colNum++;

                            DiscreteDataColumn discreteDataColumn = dataColumns[columnIndex];
                            DataColumn dataColumn = discreteDataColumn.getDataColumn();
                            if (dataColumn.getColumnNumber() == colNum) {
                                if (dataColumn.isDiscrete()) {
                                    String value = dataBuilder.toString().trim();
                                    if (!value.isEmpty() && !value.equals(this.missingDataMarker)) {
                                        discreteDataColumn.setValue(value);
                                    }
                                }

                                columnIndex++;
                            }

                            // ensure we have enough data
                            if (columnIndex < numOfColsInDataFile) {
                                String errMsg = String.format("Insufficient data on line %d. Extracted %d value(s) but expected %d.", lineNum, columnIndex, numOfColsInDataFile);
                                throw new DataReaderException(errMsg);
                            }
                        }

                        lineNum++;

                        // clear data
                        dataBuilder.delete(0, dataBuilder.length());

                        // reset states
                        // NOTE(review): hasQuoteChar is not reset here, so an
                        // unbalanced quote carries over into the next line -
                        // confirm whether that is intended.
                        skip = false;
                        hasSeenNonblankChar = false;
                        cmntIndex = 0;
                        checkForComment = comment.length > 0;
                        columnIndex = 0;
                        colNum = 0;
                    } else if (!skip) {
                        // NOTE(review): Java bytes are signed, so bytes >= 0x80
                        // (e.g. UTF-8 multi-byte sequences) compare as "blank"
                        // here if SPACE_CHAR is the ASCII space - confirm.
                        if (currChar > DataFileReader.SPACE_CHAR) {
                            hasSeenNonblankChar = true;
                        }

                        // skip blank chars at the beginning of the line
                        // (prevChar is deliberately left untouched here)
                        if (currChar <= DataFileReader.SPACE_CHAR && !hasSeenNonblankChar) {
                            continue;
                        }

                        // check for comment marker to skip line
                        if (checkForComment) {
                            if (currChar == comment[cmntIndex]) {
                                cmntIndex++;
                                if (cmntIndex == comment.length) {
                                    skip = true;
                                    prevChar = currChar;
                                    continue;
                                }
                            } else {
                                checkForComment = false;
                            }
                        }

                        if (currChar == this.quoteCharacter) {
                            // toggle quoted mode; the quote itself is not stored
                            hasQuoteChar = !hasQuoteChar;
                        } else {
                            if (hasQuoteChar) {
                                // inside quotes every byte is data, delimiters included
                                dataBuilder.append((char) currChar);
                            } else {
                                boolean isDelimiter;
                                if (this.delimiter == Delimiter.WHITESPACE) {
                                    // only the first blank after a non-blank byte delimits,
                                    // so a run of blanks acts as a single delimiter
                                    isDelimiter = (currChar <= DataFileReader.SPACE_CHAR) && (prevChar > DataFileReader.SPACE_CHAR);
                                } else {
                                    isDelimiter = (currChar == delimChar);
                                }

                                if (isDelimiter) {
                                    colNum++;

                                    DiscreteDataColumn discreteDataColumn = dataColumns[columnIndex];
                                    DataColumn dataColumn = discreteDataColumn.getDataColumn();
                                    if (dataColumn.getColumnNumber() == colNum) {
                                        if (dataColumn.isDiscrete()) {
                                            String value = dataBuilder.toString().trim();
                                            if (!value.isEmpty() && !value.equals(this.missingDataMarker)) {
                                                discreteDataColumn.setValue(value);
                                            }
                                        }

                                        columnIndex++;
                                        // every requested column has been matched:
                                        // ignore whatever is left on this line
                                        if (columnIndex == numOfCols) {
                                            skip = true;
                                        }
                                    }

                                    // clear data
                                    dataBuilder.delete(0, dataBuilder.length());
                                } else {
                                    dataBuilder.append((char) currChar);
                                }
                            }
                        }
                    }

                    prevChar = currChar;
                }
            }

            // The input ended without a closing line terminator: flush the
            // value that is still pending for the last line.
            // NOTE(review): this block also runs when the loops above stopped
            // because the thread was interrupted - confirm that is acceptable.
            if (!skipHeader && hasSeenNonblankChar && !skip) {
                colNum++;

                DiscreteDataColumn discreteDataColumn = dataColumns[columnIndex];
                DataColumn dataColumn = discreteDataColumn.getDataColumn();
                if (dataColumn.getColumnNumber() == colNum) {
                    if (dataColumn.isDiscrete()) {
                        String value = dataBuilder.toString().trim();
                        if (!value.isEmpty() && !value.equals(this.missingDataMarker)) {
                            discreteDataColumn.setValue(value);
                        }
                    }

                    columnIndex++;
                }

                // ensure we have enough data
                if (columnIndex < numOfColsInDataFile) {
                    String errMsg = String.format("Insufficient data on line %d. Extracted %d value(s) but expected %d.", lineNum, columnIndex, numOfColsInDataFile);
                    // TabularDataFileReader.LOGGER.error(errMsg);
                    throw new DataReaderException(errMsg);
                }
            }
        }

        // recategorize values
        for (DiscreteDataColumn discreteDataColumn : dataColumns) {
            // generated columns are seeded with the two values "0" and "1"
            if (discreteDataColumn.getDataColumn().isGenerated()) {
                discreteDataColumn.setValue("0");
                discreteDataColumn.setValue("1");
            }
            discreteDataColumn.recategorize();
        }
    }

}





© 2015 - 2025 Weber Informatics LLC | Privacy Policy