
// io.deephaven.csv.reading.CsvReader
package io.deephaven.csv.reading;
import io.deephaven.csv.CsvSpecs;
import io.deephaven.csv.containers.ByteSlice;
import io.deephaven.csv.densestorage.DenseStorageReader;
import io.deephaven.csv.densestorage.DenseStorageWriter;
import io.deephaven.csv.parsers.Parser;
import io.deephaven.csv.sinks.Sink;
import io.deephaven.csv.sinks.SinkFactory;
import io.deephaven.csv.util.CsvReaderException;
import io.deephaven.csv.util.MutableBoolean;
import io.deephaven.csv.util.MutableObject;
import io.deephaven.csv.util.Renderer;
import java.io.InputStream;
import java.io.Reader;
import java.util.*;
import java.util.concurrent.*;
/**
 * A class for reading CSV data. Typical usage is:
 *
 * <ol>
 * <li>Construct a {@link CsvSpecs} configured with the desired options (quote character, delimiter, parsers,
 * headers, concurrency, etc.).</li>
 * <li>Arrange for the input text to be in an {@link InputStream}, encoded in UTF-8.</li>
 * <li>Prepare a {@link SinkFactory} which can in turn provide Sink&lt;T&gt; objects for the output data.</li>
 * <li>Call the static {@link #read} method.</li>
 * </ol>
 *
 * Example:
 *
 * <pre>
 * final CsvSpecs specs = ...;
 * final InputStream stream = ...;
 * final SinkFactory sinkFactory = ...;
 * final CsvReader.Result result = CsvReader.read(specs, stream, sinkFactory);
 * </pre>
 */
public final class CsvReader {
/**
* Utility class. Do not instantiate.
*/
private CsvReader() {}
/**
* Read the data.
*
* @param specs A {@link CsvSpecs} object providing options for the parse.
* @param stream The input data, encoded in UTF-8.
* @param sinkFactory A factory that can provide Sink<T> of all appropriate types for the output data. Once
* the CsvReader determines what the column type is, it will use the {@link SinkFactory} to create an
* appropriate Sink<T> for the type. Note that the CsvReader might guess wrong, so it might create a
* Sink, partially populate it, and then abandon it. The final set of fully-populated Sinks will be returned
* in in the CsvReader.Result. Thread safety: The {@link SinkFactory} may be invoked concurrently, therefore
* it must be thread safe.
* @return A CsvReader.Result containing the column names, the number of columns, and the final set of
* fully-populated Sinks.
*/
public static Result read(final CsvSpecs specs, final InputStream stream, final SinkFactory sinkFactory)
throws CsvReaderException {
final byte quoteAsByte = check7BitAscii("quote", specs.quote());
final byte delimiterAsByte = check7BitAscii("delimiter", specs.delimiter());
final CellGrabber grabber =
new CellGrabber(stream, quoteAsByte, delimiterAsByte, specs.ignoreSurroundingSpaces(),
specs.trim());
// For an "out" parameter
final MutableObject firstDataRowHolder = new MutableObject<>();
final String[] headersTemp = determineHeadersToUse(specs, grabber, firstDataRowHolder);
final byte[][] firstDataRow = firstDataRowHolder.getValue();
final int numInputCols = headersTemp.length;
// If the final column in the header row is blank, we assume that the final column in all the data rows
// is also blank (we confirm this assumption in ParseInputToDenseStorage, as we're reading the file).
// This corresponds to a file with trailing delimiters.
final String[] headersTemp2;
if (numInputCols != 0 && headersTemp[numInputCols - 1].isEmpty()) {
headersTemp2 = Arrays.copyOf(headersTemp, numInputCols - 1);
} else {
headersTemp2 = headersTemp;
}
final int numOutputCols = headersTemp2.length;
final String[] headersToUse = canonicalizeHeaders(specs, headersTemp2);
// Create a DenseStorageWriter and two readers for each column.
final DenseStorageWriter[] dsws = new DenseStorageWriter[numInputCols];
final DenseStorageReader[] dsr0s = new DenseStorageReader[numInputCols];
final DenseStorageReader[] dsr1s = new DenseStorageReader[numInputCols];
// The arrays are sized to "numInputCols" but only populated up to "numOutputCols".
// The code in ParseInputToDenseStorge knows that a null DenseStorageWriter means that the
// column is all-empty and (once the data is confirmed to be empty) just drop the data.
for (int ii = 0; ii < numOutputCols; ++ii) {
final DenseStorageWriter dsw = new DenseStorageWriter();
dsws[ii] = dsw;
dsr0s[ii] = dsw.newReader();
dsr1s[ii] = dsw.newReader();
}
// Select an Excecutor based on whether the user wants the code to run asynchronously
// or not.
final ExecutorService exec =
specs.concurrent()
? Executors.newFixedThreadPool(numOutputCols + 1)
: Executors.newSingleThreadExecutor();
final ArrayList>> sinkFutures = new ArrayList<>();
try {
final Future numRowsFuture =
exec.submit(
() -> ParseInputToDenseStorage.doit(firstDataRow, specs.nullValueLiteral(), grabber, dsws));
for (int ii = 0; ii < numOutputCols; ++ii) {
final List> parsersToUse = calcParsersToUse(specs, headersToUse[ii], ii + 1);
final String nullValueLiteralToUse = calcNullValueLiteralToUse(specs, headersToUse[ii], ii + 1);
final int iiCopy = ii;
final Future> fcb =
exec.submit(
() -> ParseDenseStorageToColumn.doit(
dsr0s[iiCopy],
dsr1s[iiCopy],
parsersToUse,
specs.nullParser(),
specs.customTimeZoneParser(),
nullValueLiteralToUse,
sinkFactory));
sinkFutures.add(fcb);
}
final long numRows;
final Sink>[] sinks = new Sink>[numOutputCols];
numRows = numRowsFuture.get();
for (int ii = 0; ii < numOutputCols; ++ii) {
sinks[ii] = sinkFutures.get(ii).get();
}
return new Result(numRows, headersToUse, sinks);
} catch (Exception inner) {
throw new CsvReaderException("Caught exception", inner);
} finally {
// Cancel the sinks (interrupting them if necessary). It is harmless to do this if the sinks
// have already exited normally.
for (Future> sf : sinkFutures) {
sf.cancel(true); // Result ignored.
}
exec.shutdown();
}
}
/**
* Determine which list of parsers to use for type inference. Returns {@link CsvSpecs#parsers} unless the user has
* set an override on a column name or column number basis.
*/
private static List> calcParsersToUse(final CsvSpecs specs,
final String columnName, final int oneBasedColumnNumber) {
Parser> specifiedParser = specs.parserForName().get(columnName);
if (specifiedParser != null) {
return List.of(specifiedParser);
}
specifiedParser = specs.parserForIndex().get(oneBasedColumnNumber);
if (specifiedParser != null) {
return List.of(specifiedParser);
}
return specs.parsers();
}
/**
* Determine which null value literal to use. Returns {@link CsvSpecs#nullValueLiteral()} unless the user has set an
* override on a column name or column number basis.
*/
private static String calcNullValueLiteralToUse(final CsvSpecs specs,
final String columnName, final int oneBasedColumnNumber) {
String result = specs.nullValueLiteralForName().get(columnName);
if (result != null) {
return result;
}
result = specs.nullValueLiteralForIndex().get(oneBasedColumnNumber);
if (result != null) {
return result;
}
return specs.nullValueLiteral();
}
/**
* Determine which headers to use. The result comes from either the first row of the file or the user-specified
* overrides.
*/
private static String[] determineHeadersToUse(final CsvSpecs specs,
final CellGrabber grabber, final MutableObject firstDataRowHolder)
throws CsvReaderException {
String[] headersToUse = null;
if (specs.hasHeaderRow()) {
final byte[][] firstRow = tryReadOneRow(grabber);
if (firstRow == null) {
throw new CsvReaderException(
"Can't proceed because hasHeaders is set but input file is empty");
}
headersToUse = Arrays.stream(firstRow).map(String::new).toArray(String[]::new);
}
// Whether or not the input had headers, maybe override with client-specified headers.
if (specs.headers().size() != 0) {
headersToUse = specs.headers().toArray(new String[0]);
}
// If we still have nothing, try generate synthetic column headers (works only if the file is
// non-empty,
// because we need to infer the column count).
final byte[][] firstDataRow;
if (headersToUse == null) {
firstDataRow = tryReadOneRow(grabber);
if (firstDataRow == null) {
throw new CsvReaderException(
"Can't proceed because input file is empty and client has not specified headers");
}
headersToUse = new String[firstDataRow.length];
for (int ii = 0; ii < headersToUse.length; ++ii) {
headersToUse[ii] = "Column" + (ii + 1);
}
} else {
firstDataRow = null;
}
// Apply column specific overrides.
for (Map.Entry entry : specs.headerForIndex().entrySet()) {
headersToUse[entry.getKey() - 1] = entry.getValue();
}
firstDataRowHolder.setValue(firstDataRow);
return headersToUse;
}
private static String[] canonicalizeHeaders(CsvSpecs specs, final String[] headers) throws CsvReaderException {
final String[] legalized = specs.headerLegalizer().apply(headers);
final Set unique = new HashSet<>();
final List repeats = new ArrayList<>();
final List invalidNames = new ArrayList<>();
for (String header : legalized) {
if (!unique.add(header)) {
repeats.add(header);
} else if (!specs.headerValidator().test(header)) {
// Using an "else if" because we only want to run each unique name once through the
// validator.
invalidNames.add(header);
}
}
if (repeats.isEmpty() && invalidNames.isEmpty()) {
return legalized;
}
final StringBuilder sb = new StringBuilder("Some column headers are invalid.");
if (!repeats.isEmpty()) {
sb.append(" Repeated headers: ");
sb.append(Renderer.renderList(repeats));
}
if (!invalidNames.isEmpty()) {
sb.append(" Invalid headers: ");
sb.append(Renderer.renderList(invalidNames));
}
throw new CsvReaderException(sb.toString());
}
/**
* Try to read one row from the input. Returns false if the input ends before one row has been read.
*
* @return The first row as a byte[][] or null if the input was exhausted.
*/
private static byte[][] tryReadOneRow(final CellGrabber grabber) throws CsvReaderException {
final List headers = new ArrayList<>();
// Grab the header
final ByteSlice slice = new ByteSlice();
final MutableBoolean lastInRow = new MutableBoolean();
do {
if (!grabber.grabNext(slice, lastInRow)) {
return null;
}
final byte[] item = new byte[slice.size()];
slice.copyTo(item, 0);
headers.add(item);
} while (!lastInRow.booleanValue());
return headers.toArray(new byte[0][]);
}
private static byte check7BitAscii(String what, char c) throws CsvReaderException {
if (c > 0x7f) {
final String message = String.format("%s is set to '%c' but is required to be 7-bit ASCII",
what, c);
throw new CsvReaderException(message);
}
return (byte) c;
}
/** Result of {@link #read}. */
public static final class Result {
private final long numRows;
private final String[] columnNames;
private final Sink>[] columns;
public Result(final long numRows, final String[] columnNames, final Sink>[] columns) {
this.numRows = numRows;
this.columnNames = columnNames;
this.columns = columns;
}
/** Number of rows in the input. */
public long numRows() {
return numRows;
}
/** The column names. */
public String[] columnNames() {
return columnNames;
}
/**
* Data for each column. Each Sink was constructed by some method in the SinkFactory that the caller passed to
* {@link #read}.
*/
public Sink>[] columns() {
return columns;
}
/** The number of columns. */
public int numCols() {
return columns.length;
}
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy