// io.github.vmzakharov.ecdataframe.dataset.CsvDataSet
// From dataframe-ec, a tabular data structure based on the Eclipse Collections framework.
package io.github.vmzakharov.ecdataframe.dataset;
import io.github.vmzakharov.ecdataframe.dataframe.DataFrame;
import io.github.vmzakharov.ecdataframe.dataframe.DfColumn;
import io.github.vmzakharov.ecdataframe.dataframe.DfDateColumn;
import io.github.vmzakharov.ecdataframe.dataframe.DfDateTimeColumn;
import io.github.vmzakharov.ecdataframe.dataframe.DfDoubleColumn;
import io.github.vmzakharov.ecdataframe.dataframe.DfFloatColumn;
import io.github.vmzakharov.ecdataframe.dataframe.DfIntColumn;
import io.github.vmzakharov.ecdataframe.dataframe.DfLongColumn;
import io.github.vmzakharov.ecdataframe.dsl.value.ValueType;
import org.eclipse.collections.api.block.procedure.Procedure;
import org.eclipse.collections.api.list.ListIterable;
import org.eclipse.collections.api.list.MutableList;
import org.eclipse.collections.impl.factory.Lists;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.Reader;
import java.io.Writer;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.time.LocalDate;
import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;
import java.time.format.DateTimeParseException;
import java.time.format.ResolverStyle;
import java.util.zip.GZIPInputStream;
import java.util.zip.ZipInputStream;
import static io.github.vmzakharov.ecdataframe.dsl.value.ValueType.DATE;
import static io.github.vmzakharov.ecdataframe.dsl.value.ValueType.DATE_TIME;
import static io.github.vmzakharov.ecdataframe.dsl.value.ValueType.DOUBLE;
import static io.github.vmzakharov.ecdataframe.dsl.value.ValueType.LONG;
import static io.github.vmzakharov.ecdataframe.dsl.value.ValueType.STRING;
import static io.github.vmzakharov.ecdataframe.util.ExceptionFactory.exceptionByKey;
public class CsvDataSet
extends DataSetAbstract
{
public static final int BUFFER_SIZE = 65_536;
public static final int LINE_COUNT_FOR_TYPE_INFERENCE = 100;
private final Path dataFilePath;
private boolean emptyElementsConvertedToNulls = false;
private CsvSchema schema;
private DateTimeFormatter[] formatters;
public CsvDataSet(String newDataFileName, String newName)
{
this(newDataFileName, newName, null);
}
public CsvDataSet(Path newDataFilePath, String newName)
{
this(newDataFilePath, newName, null);
}
public CsvDataSet(String newDataFileName, String newName, CsvSchema newSchema)
{
super(newName);
this.dataFilePath = Paths.get(newDataFileName);
this.schema = newSchema;
}
public CsvDataSet(Path newDataFilePath, String newName, CsvSchema newSchema)
{
super(newName);
this.dataFilePath = newDataFilePath;
this.schema = newSchema;
}
private String getDataFileName()
{
return this.dataFilePath.toString();
}
@Override
public void openFileForReading()
{
// Not needed for CSV files
}
/**
* Empty values in the source file (two adjacent separators) will be loaded as null values when this data set is
* loaded into a data frame. By default, they are treated as empty values of the corresponding column type (e.g.,
* empty strings for string columns, zeroes for numeric columns, and so on).
* @return this data set
*/
public CsvDataSet convertEmptyElementsToNulls()
{
this.emptyElementsConvertedToNulls = true;
return this;
}
/**
* Empty values in the source file (two adjacent separators) will be converted to the respective zero or empty
* values, depending on the column type, when this data set is loaded into a data frame. This is the default behavior.
* @return this data set
*/
public CsvDataSet convertEmptyElementsToValues()
{
this.emptyElementsConvertedToNulls = false;
return this;
}
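// A minimal usage sketch for the two toggles above (the file name "people.csv" and the
// data set name "People" are hypothetical; the default CsvSchema settings are assumed):
//
//   CsvDataSet dataSet = new CsvDataSet("people.csv", "People")
//       .convertEmptyElementsToNulls();
//   DataFrame people = dataSet.loadAsDataFrame(); // empty cells load as nulls rather than ""/zero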
@Override
public Object next()
{
return null;
}
@Override
public boolean hasNext()
{
return false;
}
@Override
public void close()
{
// Not needed for CSV files
}
public void write(DataFrame dataFrame)
{
if (this.schemaIsNotDefined())
{
this.schema = this.schemaFromDataFrame(dataFrame);
}
try (BufferedWriter writer = new BufferedWriter(this.createWriter(), BUFFER_SIZE))
{
int columnCount = dataFrame.columnCount();
if (this.schema.hasHeaderLine())
{
for (int columnIndex = 0; columnIndex < columnCount; columnIndex++)
{
writer.write(this.schema.columnAt(columnIndex).getName());
if (columnIndex < columnCount - 1)
{
writer.write(this.getSchema().getSeparator());
}
}
writer.write('\n');
}
int rowCount = dataFrame.rowCount();
for (int rowIndex = 0; rowIndex < rowCount; rowIndex++)
{
for (int columnIndex = 0; columnIndex < columnCount; columnIndex++)
{
this.writeValue(writer, dataFrame, rowIndex, columnIndex);
if (columnIndex < columnCount - 1)
{
writer.write(this.getSchema().getSeparator());
}
}
writer.write('\n');
}
}
catch (IOException e)
{
throw exceptionByKey("CSV_FILE_WRITE_FAIL").get(e);
}
}
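// A usage sketch for write(). The file name and columns are hypothetical; when no schema
// has been set on this data set, write() derives one from the data frame itself (see
// schemaFromDataFrame below), so a simple round trip needs no explicit CsvSchema:
//
//   DataFrame df = new DataFrame("People");
//   df.addColumn("Name", ValueType.STRING);
//   df.addColumn("Age", ValueType.LONG);
//   // ... populate rows ...
//   new CsvDataSet("people_out.csv", "PeopleOut").write(df);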
private CsvSchema schemaFromDataFrame(DataFrame dataFrame)
{
CsvSchema dfSchema = new CsvSchema();
dataFrame.getColumns().forEach(e -> dfSchema.addColumn(e.getName(), e.getType()));
return dfSchema;
}
private void writeValue(Writer writer, DataFrame dataFrame, int rowIndex, int columnIndex)
throws IOException
{
DfColumn dfColumn = dataFrame.getColumnAt(columnIndex);
CsvSchemaColumn schemaColumn = this.getSchema().columnAt(columnIndex);
String valueAsLiteral;
if (dfColumn.isNull(rowIndex))
{
valueAsLiteral = this.schema.hasNullMarker() ? this.schema.getNullMarker() : "";
}
else
{
valueAsLiteral = switch (dfColumn.getType())
{
case LONG ->
{
long longValue = ((DfLongColumn) dfColumn).getLong(rowIndex);
yield schemaColumn.getLongFormatter().format(longValue);
}
case DOUBLE ->
{
double doubleValue = ((DfDoubleColumn) dfColumn).getDouble(rowIndex);
yield schemaColumn.getDoubleFormatter().format(doubleValue);
}
case INT ->
{
int intValue = ((DfIntColumn) dfColumn).getInt(rowIndex);
yield schemaColumn.getIntFormatter().format(intValue);
}
case FLOAT ->
{
float floatValue = ((DfFloatColumn) dfColumn).getFloat(rowIndex);
yield schemaColumn.getFloatFormatter().format(floatValue);
}
case STRING ->
{
String stringValue = dfColumn.getValueAsString(rowIndex);
yield this.schema.getQuoteCharacter() + stringValue + this.schema.getQuoteCharacter();
}
case DATE ->
{
LocalDate dateValue = ((DfDateColumn) dfColumn).getTypedObject(rowIndex);
yield this.formatterForColumn(columnIndex).format(dateValue);
}
case DATE_TIME ->
{
LocalDateTime dateTimeValue = ((DfDateTimeColumn) dfColumn).getTypedObject(rowIndex);
yield this.formatterForColumn(columnIndex).format(dateTimeValue);
}
default -> throw exceptionByKey("CSV_UNSUPPORTED_VAL_TO_STR")
.with("valueType", dfColumn.getType())
.get();
};
}
writer.write(valueAsLiteral);
}
private DateTimeFormatter formatterForColumn(int columnIndex)
{
if (this.formatters == null)
{
this.formatters = new DateTimeFormatter[this.schema.columnCount()];
}
if (this.formatters[columnIndex] == null)
{
String pattern = this.schema.columnAt(columnIndex).getPattern();
this.formatters[columnIndex] = DateTimeFormatter.ofPattern(pattern);
}
return this.formatters[columnIndex];
}
protected Writer createWriter()
throws IOException
{
return new OutputStreamWriter(Files.newOutputStream(this.dataFilePath));
}
protected Reader createReader()
throws IOException
{
String fileName = this.dataFilePath.getFileName().toString();
InputStream fis = Files.newInputStream(this.dataFilePath);
if (fileName.endsWith(".zip"))
{
ZipInputStream zis = new ZipInputStream(fis);
zis.getNextEntry(); // advance to the first entry in the archive, which is assumed to hold the CSV data
return new InputStreamReader(zis);
}
if (fileName.endsWith(".gz"))
{
GZIPInputStream gzis = new GZIPInputStream(fis);
return new InputStreamReader(gzis);
}
return new InputStreamReader(fis);
}
/**
* Loads the entire data set as a data frame.
* If a schema is not specified for this data set, an attempt will be made to infer the schema from the data in
* the first several lines of the data set.
* @return a data frame representing the contents of the data set
*/
public DataFrame loadAsDataFrame()
{
return this.loadAsDataFrame(0, true);
}
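// A usage sketch for loading ("orders.csv" is a hypothetical file with a header line;
// with no schema set, column types are inferred from the first 100 data lines):
//
//   CsvDataSet dataSet = new CsvDataSet("orders.csv", "Orders");
//   DataFrame allRows = dataSet.loadAsDataFrame();   // the whole file
//   DataFrame preview = dataSet.loadAsDataFrame(10); // just the first 10 data rows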
/**
* Loads the first lines of the data set as a data frame.
* @param headLineCount the number of lines to load from the beginning of the data set
* @return a data frame representing the first {@code headLineCount} lines in the data set
*/
public DataFrame loadAsDataFrame(int headLineCount)
{
return this.loadAsDataFrame(headLineCount, false);
}
/**
* Infers a schema from the data file backing this data set. This is done by reading the first several lines of
* the file and attempting to parse their elements using different formats.
* If the data set already has a schema defined, it is overridden with the inferred one.
* By default, the first 100 lines of the file are used to infer column types.
* @return the inferred schema
*/
public CsvSchema inferSchema()
{
return this.inferSchema(LINE_COUNT_FOR_TYPE_INFERENCE);
}
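// A usage sketch for schema inference (hypothetical file name). The returned schema is
// also installed on the data set, so a subsequent load reuses it:
//
//   CsvDataSet dataSet = new CsvDataSet("orders.csv", "Orders");
//   CsvSchema inferred = dataSet.inferSchema(); // samples the first 100 lines
//   DataFrame df = dataSet.loadAsDataFrame();   // loads using the inferred schema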
/**
* Infers a schema from the data file backing this data set. This is done by reading the first several lines of
* the file and attempting to parse their elements using different formats. The inferred schema becomes the
* schema of the data set.
* If the data set already has a schema defined, it is overridden with the inferred one.
* @param numberOfLinesToInferFrom the number of lines from the beginning of the file to be used
* to infer column types
* @return the inferred data set schema
*/
public CsvSchema inferSchema(int numberOfLinesToInferFrom)
{
this.schema = new CsvSchema();
try (BufferedReader reader = new BufferedReader(this.createReader(), BUFFER_SIZE))
{
MutableList<String> headers = this.splitMindingQs(reader.readLine()).collect(this::removeSurroundingQuotes);
MutableList<String> lineBuffer = Lists.mutable.withInitialCapacity(numberOfLinesToInferFrom);
String dataRow;
for (int loadedLineCount = 0;
loadedLineCount < numberOfLinesToInferFrom && (dataRow = reader.readLine()) != null;
loadedLineCount++)
{
lineBuffer.add(dataRow);
}
this.inferSchema(headers, lineBuffer);
}
catch (IOException e)
{
exceptionByKey("CSV_INFER_SCHEMA_FAIL").with("fileName", this.getDataFileName()).fire(e);
}
return this.schema;
}
private DataFrame loadAsDataFrame(int headLineCount, boolean loadAllLines)
{
DataFrame df = new DataFrame(this.getName());
df.enablePooling();
/*
expected format:
header1,header2,header3,...
"String",123,45.67,...
*/
if (this.schemaIsNotDefined())
{
this.schema = new CsvSchema(); // provides default separators, quote characters, etc.
}
try (BufferedReader reader = new BufferedReader(this.createReader(), BUFFER_SIZE))
{
MutableList<String> headers;
if (this.schema.hasHeaderLine())
{
headers = this.splitMindingQs(reader.readLine()).collect(this::removeSurroundingQuotes);
if (headers.anySatisfy(String::isEmpty))
{
exceptionByKey("CSV_MISSING_COL_HEADER").fire();
}
}
else
{
headers = this.schema.getColumns().collect(CsvSchemaColumn::getName);
}
String dataRow = reader.readLine();
if (dataRow == null) // no data, just headers
{
if (this.getSchema().columnCount() == 0) // schema does not have columns predefined and there is no data
{
headers.forEach(header -> this.schema.addColumn(header, STRING));
}
this.getSchema().getColumns().forEach(col -> df.addColumn(col.getName(), col.getType()));
return df;
}
MutableList<String> lineBuffer = Lists.mutable.withInitialCapacity(LINE_COUNT_FOR_TYPE_INFERENCE);
lineBuffer.add(dataRow);
// the schema is empty, so infer column properties from the first LINE_COUNT_FOR_TYPE_INFERENCE lines
if (this.getSchema().columnCount() == 0)
{
int loadedLineCount = 1; // already have one in the buffer
while (loadedLineCount++ < LINE_COUNT_FOR_TYPE_INFERENCE
&& (dataRow = reader.readLine()) != null)
{
lineBuffer.add(dataRow);
}
this.inferSchema(headers, lineBuffer);
}
else if (headers.size() != this.schema.columnCount())
{
exceptionByKey("CSV_SCHEMA_HEADER_SIZE_MISMATCH")
.with("headerCount", headers.size())
.with("schemaColumnCount", this.schema.columnCount())
.fire();
}
else
{
MutableList<String> schemaColumnNames = this.schema.getColumns().collect(CsvSchemaColumn::getName);
if (!headers.equals(schemaColumnNames))
{
exceptionByKey("CSV_SCHEMA_HEADER_NAME_MISMATCH")
.with("headerColumnList", headers.makeString("[", ",", "]"))
.with("schemaColumnList", schemaColumnNames.makeString("[", ",", "]"))
.fire();
}
}
MutableList<Procedure<String>> columnPopulators = Lists.mutable.of();
this.getSchema().getColumns().forEach(col -> this.addDataFrameColumn(df, col, columnPopulators));
int columnCount = this.getSchema().columnCount();
MutableList<String> lineElements = Lists.mutable.withInitialCapacity(columnCount);
int lineNumber = 0;
while (
(dataRow = this.getNextLine(lineBuffer, reader, lineNumber)) != null
&& (loadAllLines || (lineNumber < headLineCount))
)
{
this.parseAndAddLineToDataFrame(dataRow, lineElements, columnCount, columnPopulators);
lineNumber++;
}
df.seal();
}
catch (IOException e)
{
exceptionByKey("CSV_FILE_LOAD_FAIL").with("fileName", this.getDataFileName()).fire(e);
}
return df;
}
private String getNextLine(MutableList<String> lineBuffer, BufferedReader reader, int lineNumber)
throws IOException
{
if (lineNumber < lineBuffer.size())
{
return lineBuffer.get(lineNumber);
}
return reader.readLine();
}
private String removeSurroundingQuotes(String aString)
{
int size = aString.length();
if (size > 1)
{
if (this.isQuote(aString.charAt(0)) && this.isQuote(aString.charAt(size - 1)))
{
return aString.substring(1, size - 1);
}
}
return aString;
}
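// For example (with the default double-quote character):
//   removeSurroundingQuotes("\"abc\"") -> "abc"
//   removeSurroundingQuotes("abc")     -> "abc" (unchanged)
//   removeSurroundingQuotes("\"")      -> "\"" (too short to be a quoted pair)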
private void addDataFrameColumn(DataFrame df, CsvSchemaColumn schemaCol, MutableList<Procedure<String>> columnPopulators)
{
ValueType columnType = schemaCol.getType();
DfColumn lastColumn = df.newColumn(schemaCol.getName(), columnType);
Procedure<String> populator = switch (columnType)
{
case LONG -> s -> schemaCol.parseAsLongAndAdd(s, lastColumn);
case DOUBLE -> s -> schemaCol.parseAsDoubleAndAdd(s, lastColumn);
case INT -> s -> schemaCol.parseAsIntAndAdd(s, lastColumn);
case FLOAT -> s -> schemaCol.parseAsFloatAndAdd(s, lastColumn);
case STRING -> s -> lastColumn.addObject(schemaCol.parseAsString(s));
case DATE -> s -> lastColumn.addObject(schemaCol.parseAsLocalDate(s));
case DATE_TIME -> s -> lastColumn.addObject(schemaCol.parseAsLocalDateTime(s));
case DECIMAL -> s -> lastColumn.addObject(schemaCol.parseAsDecimal(s));
default -> throw exceptionByKey("CSV_POPULATING_BAD_COL_TYPE").with("columnType", columnType)
.get();
};
columnPopulators.add(populator);
}
private void inferSchema(MutableList<String> headers, MutableList<String> topDataLines)
{
int columnCount = headers.size();
ValueType[] types = new ValueType[columnCount];
String[] formats = new String[columnCount];
for (int lineIndex = 0; lineIndex < topDataLines.size(); lineIndex++)
{
String dataLine = topDataLines.get(lineIndex);
MutableList<String> elements = this.splitMindingQs(dataLine);
if (headers.size() != elements.size())
{
exceptionByKey("CSV_HEADER_ROW_SIZE_MISMATCH")
.with("rowIndex", lineIndex + 1)
.with("headerElementCount", headers.size())
.with("rowElementCount", elements.size())
.fire();
}
for (int columnIndex = 0; columnIndex < columnCount; columnIndex++)
{
String element = elements.get(columnIndex);
String matchingFormat = null;
ValueType guessedType;
// element can be null if it is a null marker or if it is an empty element (0-length)
if (element == null || element.isEmpty())
{
continue;
}
if (this.getSchema().surroundedByQuotes(element))
{
guessedType = STRING;
}
else
{
matchingFormat = this.findMatchingDateFormat(element);
if (matchingFormat != null)
{
guessedType = DATE;
}
else
{
matchingFormat = this.findMatchingDateTimeFormat(element);
if (matchingFormat != null)
{
guessedType = DATE_TIME;
}
else if (this.canParseAsLong(element))
{
guessedType = LONG;
}
else if (this.canParseAsDouble(element))
{
guessedType = DOUBLE;
}
else
{
guessedType = STRING;
}
}
}
if (types[columnIndex] == null)
{
types[columnIndex] = guessedType;
formats[columnIndex] = matchingFormat;
}
else if (guessedType == STRING)
{
types[columnIndex] = guessedType;
}
else if (guessedType == DOUBLE && types[columnIndex] == LONG)
{
types[columnIndex] = DOUBLE;
}
else if (((guessedType == DATE || guessedType == DATE_TIME) && types[columnIndex] == guessedType)
&& !matchingFormat.equals(formats[columnIndex]))
{
// still a date or dateTime but mismatched formats
types[columnIndex] = STRING;
formats[columnIndex] = null;
}
}
}
for (int columnIndex = 0; columnIndex < columnCount; columnIndex++)
{
this.schema.addColumn(
headers.get(columnIndex),
types[columnIndex] == null ? STRING : types[columnIndex],
formats[columnIndex]);
}
}
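// An illustration of the inference rules above, with hypothetical column values:
//   "12", "345"            -> LONG
//   "12", "3.5"            -> DOUBLE (LONG is widened when a floating point value appears)
//   "2024/1/2", "2024/3/4" -> DATE with the "uuuu/M/d" pattern
//   "2024/1/2", "2024-3-4" -> STRING (date values with mismatched patterns fall back to STRING)
//   "\"123\""              -> STRING (a quoted value is a string no matter what it contains)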
private boolean schemaIsNotDefined()
{
return this.schema == null;
}
public CsvSchema getSchema()
{
return this.schema;
}
private void parseAndAddLineToDataFrame(String line, MutableList<String> elements, int columnCount, MutableList<Procedure<String>> columnPopulators)
{
this.splitMindingQsInto(line, elements);
if (this.getSchema().columnCount() != elements.size())
{
exceptionByKey("CSV_SCHEMA_ROW_SIZE_MISMATCH")
.with("schemaColumnCount", this.getSchema().columnCount())
.with("rowElementCount", elements.size())
.with("dataRow", line)
.fire();
}
for (int i = 0; i < columnCount; i++)
{
String element = elements.get(i);
if (this.getSchema().hasNullMarker())
{
if (this.getSchema().getNullMarker().equals(element))
{
element = null;
}
}
columnPopulators.get(i).accept(element);
}
}
private boolean canParseAsLong(String aString)
{
try
{
Long.parseLong(aString);
return true;
}
catch (NumberFormatException e)
{
return false;
}
}
private boolean canParseAsDouble(String aString)
{
try
{
Double.parseDouble(aString);
return true;
}
catch (NumberFormatException e)
{
return false;
}
}
private String findMatchingDateFormat(String aString)
{
ListIterable<String> dateFormats = Lists.immutable.of("uuuu/M/d", "uuuu-M-d", "M/d/uuuu");
String trimmed = aString.trim();
for (int i = 0; i < dateFormats.size(); i++)
{
String pattern = dateFormats.get(i);
try
{
DateTimeFormatter candidateFormatter = DateTimeFormatter.ofPattern(pattern).withResolverStyle(ResolverStyle.STRICT);
LocalDate.parse(trimmed, candidateFormatter);
return pattern;
}
catch (DateTimeParseException e)
{
// ignore
}
}
return null;
}
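// For example, findMatchingDateFormat("2024-1-15") returns "uuuu-M-d", while a value like
// "01/02/03" matches none of the candidate patterns, so the method returns null.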
private String findMatchingDateTimeFormat(String aString)
{
ListIterable<String> dateFormats = Lists.immutable.of("uuuu-M-d'T'H:m:s");
String trimmed = aString.trim();
for (int i = 0; i < dateFormats.size(); i++)
{
String pattern = dateFormats.get(i);
try
{
DateTimeFormatter candidateFormatter = DateTimeFormatter.ofPattern(pattern).withResolverStyle(ResolverStyle.STRICT);
LocalDateTime.parse(trimmed, candidateFormatter);
return pattern;
}
catch (DateTimeParseException e)
{
// ignore
}
}
return null;
}
private MutableList<String> splitMindingQs(String aString)
{
MutableList elements = Lists.mutable.of();
this.splitMindingQsInto(aString, elements);
return elements;
}
public void splitMindingQsInto(String aString, MutableList<String> elements)
{
elements.clear();
int currentTokenStart = 0;
boolean insideQuotes = false;
boolean initialBlanks = true;
boolean closedQuote = false;
int charCount = aString.length();
for (int index = 0; index < charCount; index++)
{
char curChar = aString.charAt(index);
boolean endOfLine = index == charCount - 1;
if (endOfLine)
{
if (insideQuotes && !this.isQuote(curChar))
{
throw exceptionByKey("CSV_UNBALANCED_QUOTES").with("index", index).with("string", aString).get();
}
if (this.isTokenSeparator(curChar))
{
if (!closedQuote) // unquoted token followed by an empty token, add the current token first
{
elements.add(this.substringOrNull(aString, currentTokenStart, index));
}
// a comma right after a token, so add an empty value
elements.add(this.substringOrNull(aString, index + 1, index + 1));
}
else
{
elements.add(this.substringOrNull(aString, currentTokenStart, index + 1));
}
}
else if (insideQuotes)
{
if (this.isQuote(curChar))
{
insideQuotes = false;
closedQuote = true;
elements.add(this.substringOrNull(aString, currentTokenStart, index + 1));
currentTokenStart = index + 1;
}
}
else if (this.isTokenSeparator(curChar))
{
if (!closedQuote)
{
elements.add(this.substringOrNull(aString, currentTokenStart, index));
}
closedQuote = false;
initialBlanks = true;
currentTokenStart = index + 1;
}
else if (initialBlanks)
{
if (!Character.isSpaceChar(curChar))
{
initialBlanks = false;
if (this.isQuote(curChar))
{
insideQuotes = true;
currentTokenStart = index;
}
}
}
}
}
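// An illustration of the quote-minding split (hypothetical input, comma separator,
// double-quote character). Note that a quoted element keeps its surrounding quotes;
// removeSurroundingQuotes strips them where needed (e.g., for header values):
//
//   this.splitMindingQsInto("abc,\"d,e\",f", elements);
//   // elements: ["abc", "\"d,e\"", "f"]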
private boolean isTokenSeparator(char aChar)
{
return aChar == this.getSchema().getSeparator();
}
private boolean isQuote(char aChar)
{
return aChar == this.getSchema().getQuoteCharacter();
}
private String substringOrNull(String aString, int beginIndex, int endIndex)
{
if (beginIndex < endIndex)
{
return aString.substring(beginIndex, endIndex);
}
return this.emptyElementsConvertedToNulls ? null : "";
}
}