sirius.kernel.commons.CSVReader Maven / Gradle / Ivy
Show all versions of sirius-kernel Show documentation
* Made with all the love in the world
* by scireum in Remshalden, Germany
* Copyright by scireum GmbH
* http://www.scireum.de - [email protected]
package sirius.kernel.commons;
import com.google.common.collect.Lists;
import sirius.kernel.async.TaskContext;
import javax.annotation.Nonnull;
import java.io.IOException;
import java.io.Reader;
import java.util.List;
import java.util.function.Consumer;
* Provides a simple reader which parses given CSV (comma separated values) data into rows.
* By default ; is used to separate columns and a line break (either Windows or Unix) is used
* to separate rows. Also columns can be enclosed in quotations, especially if line breaks occur within
* a value. The default character used to signal quotation is ". Note that the quotation symbol has to be
* the first non-whitespace character in the column to be detected as such (or the very first character if
* {@link #notIgnoringWhitespaces()} was called during initialisation).
* Furthermore escaping can be used to embed a column separator or a quotation character in a column value.
* By default \ is used as escape character.
* Empty columns will be represented as empty strings. Values will not be trimmed, as this can be easily achieved
* using the {@link Values} which is used to represent a parsed row.
* An example use case would be:
* {@code new CSVReader(someInput).execute(row -> doSomethingSmartPerRow(row)); }
* Note that this class checks the {@link TaskContext} during execution. Therefore if the underlying task is cancelled,
* the parser will stop after the current row has been processed.
public class CSVReader {
// Character source to parse; assigned in the constructor and consumed by read().
private Reader input;
// Column separator; the current character is compared against it in readRow() and shouldContinueField().
private char separator = ';';
// Quotation character; readField() treats a field that starts with it as quoted.
private char quotation = '"';
// When true, skipLeadingWhitespaces() is active and readField() resets its result once a quote is found.
private boolean ignoreWhitespaces = true;
// Escape character; readFieldValue() checks the current character against it.
private char escape = '\\';
// Callback handed to execute().
// NOTE(review): raw type - the type argument was probably lost when this copy of the file was
// extracted (the class description says rows are represented as Values); confirm against the original.
private Consumer consumer;
// Most recently read character as returned by Reader.read(); -1 signals the end of the input (see isEOF()).
private int buffer;
// Consulted via shouldContinue() in execute() and nextRow() in readRow(); defaults to no restriction.
private Limit limit = Limit.UNLIMITED;
* Creates a new reader which processes the given input.
* Note that the given input is consumed character by character so using a {@link java.io.BufferedReader}
* might be a good idea as most devices rather exchange larger blocks of data (e.g. 8kb).
* If {@link #execute(Consumer)} is invoked, the given input will be closed once all data has been parsed or if an
* IO error occurs.
* @param input the input to parse
public CSVReader(@Nonnull Reader input) {
// Only stores the reader; no character is read from it at this point.
this.input = input;
* Specifies the separator character to use.
* By default this is ;.
* @param separator the separator to use
* @return the reader itself for fluent method calls
public CSVReader withSeparator(char separator) {
// Replaces the default separator (';') used to split columns.
this.separator = separator;
// Returning the instance allows configuration calls to be chained.
return this;
* Specifies the quotation character to use.
* By default this is ". Use \0 to disable quotation entirely.
* @param quotation the quotation character to use
* @return the reader itself for fluent method calls
public CSVReader withQuotation(char quotation) {
// Replaces the default quotation character ('"') that readField() looks for at the start of a field.
this.quotation = quotation;
// Returning the instance allows configuration calls to be chained.
return this;
* Specifies the escape character to use.
* By default this is \. Use \0 to disable escaping entirely.
* @param escape the escape character to use
* @return the reader itself for fluent method calls
public CSVReader withEscape(char escape) {
// Replaces the default escape character ('\\') that readFieldValue() checks for.
this.escape = escape;
// Returning the instance allows configuration calls to be chained.
return this;
* Disables the flexible whitespace behaviour.
* If a column starts with whitespaces (space or tab characters) and is then quoted, the whitespaces
* around the quotes are simply ignored. Therefore ;"a";, ; "a" ; and ;a; will
* yield the same result. However ; a ; will keep the whitespaces and has to be manually trimmed.
* Calling this method will disable this behavior and ; "a" ; will yield "a"
* as column value instead of a.
* @return the reader itself for fluent method calls
public CSVReader notIgnoringWhitespaces() {
// Turns the flag off: skipLeadingWhitespaces() becomes a no-op and readField() no longer
// resets its result when it encounters a quotation character.
this.ignoreWhitespaces = false;
// Returning the instance allows configuration calls to be chained.
return this;
* Can set a limit to read only a specific range of rows from the file.
* By default all rows are read.
* @param limit the limit to use reading the rows from the file.
* @return the reader itself for fluent method calls
public CSVReader withLimit(Limit limit) {
// The value is stored without a null check; execute() and readRow() dereference it later,
// so passing null would fail there rather than here.
this.limit = limit;
// Returning the instance allows configuration calls to be chained.
return this;
* Parses the previously supplied input and calls the given consumer for each row.
* Note that this method will close the given input.
* @param consumer the consumer to call for each line
* @throws IOException if an IO error occurs while reading from the given input
public void execute(Consumer consumer) throws IOException {
try {
// Keep the callback in a field so the other methods of this instance can reach it.
this.consumer = consumer;
TaskContext tc = TaskContext.get();
// Continue only while the task context reports active, the end of the input has not been
// reached and limit.shouldContinue() still holds.
while (tc.isActive() && !isEOF() && limit.shouldContinue()) {
// NOTE(review): no statements are visible between the loop header and the finally clause in
// this copy of the file. The per-row work (presumably readRow() followed by consumeNewLine())
// and an initial read() before the loop appear to have been lost - confirm against the original.
} finally {
// NOTE(review): the body of the finally clause is not visible here. The method description
// states that the input is closed by this method, so presumably the close call belongs here - confirm.
* Consumes a windows or unix style line break.
private void consumeNewLine() throws IOException {
// Carriage return and line feed are tested one after the other, which covers "\r\n" as well as a lone "\n".
if (buffer == '\r') {
// NOTE(review): the statement that moves past the matched character (presumably read()) is not
// visible in this copy, neither for this check nor for the one below - confirm against the original.
if (buffer == '\n') {
* Fills the internal buffer by reading from the stream.
private void read() throws IOException {
// Reader.read() returns the next character as an int, or -1 once the end of the stream is
// reached; isEOF() relies on that sentinel.
buffer = input.read();
* Reads a single row from the stream. This might be multiple lines from the
* input as quoted columns may contain line breaks.
private void readRow() throws IOException {
// NOTE(review): raw List - the element type argument was probably lost during extraction
// (readField() returns String); confirm against the original.
List row = Lists.newArrayList();
// Loops until a line break or the end of the input is reached.
while (!isEOF() && !isAtNewline()) {
// NOTE(review): nothing that fills 'row' is visible in this copy; presumably readField() is
// called here and its result added - confirm.
if (buffer == separator) {
// NOTE(review): the body of the separator check and the end of the loop are not visible.
if (limit.nextRow()) {
// NOTE(review): the body is not visible; presumably the collected row is handed to 'consumer'
// when limit.nextRow() returns true - confirm.
* Reads a single column.
private String readField() throws IOException {
StringBuilder result = new StringBuilder();
// NOTE(review): no call to skipLeadingWhitespaces(result) is visible in this copy, although the
// reset of 'result' below suggests one precedes the quote check - confirm.
boolean inQuote = false;
// A quotation character at the current position marks the field as quoted.
if (buffer == quotation) {
inQuote = true;
if (ignoreWhitespaces) {
// Start over with an empty builder, dropping whatever was appended to 'result' before the quote.
result = new StringBuilder();
// NOTE(review): the step that moves past the opening quote (presumably read()) is not visible.
// The actual content is collected by readFieldValue(), which appends to 'result'.
readFieldValue(result, inQuote);
// NOTE(review): skipTrailingWhitespaces(inQuote) is defined in this class but no call is visible
// here; presumably it was invoked before returning - confirm.
return result.toString();
private void skipLeadingWhitespaces(StringBuilder result) throws IOException {
// Does nothing unless flexible whitespace handling is enabled.
if (ignoreWhitespaces) {
// Only space and tab count as whitespace here.
while (buffer == ' ' || buffer == '\t') {
// The whitespace is appended rather than dropped; readField() discards it later only if the
// field turns out to be quoted.
result.append((char) buffer);
// NOTE(review): as shown, nothing advances 'buffer' inside the loop; presumably a read() call
// was lost from this copy - confirm against the original.
private void skipTrailingWhitespaces(boolean inQuote) throws IOException {
// Only acts for quoted fields; unquoted fields are left alone by this method.
if (inQuote) {
// Only space and tab count as whitespace here.
while (buffer == ' ' || buffer == '\t') {
// NOTE(review): the loop body is not visible in this copy; presumably it calls read() to move
// past the whitespace - confirm against the original.
private void readFieldValue(StringBuilder result, boolean inQuote) throws IOException {
// shouldContinueField() decides, per character, whether the field goes on.
while (shouldContinueField(inQuote)) {
if (buffer == escape) {
// NOTE(review): presumably the escape character itself is skipped via read() before the check
// below, so that the following character is taken literally - that call is not visible here.
if (!isEOF()) {
result.append((char) buffer);
// NOTE(review): closing braces are missing from this copy, so it cannot be determined from here
// which 'if' the following else belongs to - confirm against the original.
} else {
result.append((char) buffer);
// NOTE(review): as shown, nothing advances 'buffer' at the end of an iteration; presumably a
// trailing read() was lost - confirm.
* Determines if the current buffer value should be added to the field (column) content.
private boolean shouldContinueField(boolean inQuote) throws IOException {
// Nothing can be added once the end of the input is reached.
if (isEOF()) {
return false;
if (inQuote) {
if (buffer == quotation) {
// NOTE(review): the method declares IOException, so a read() presumably happens here and the
// comparison below then tests the character after the quote (a doubled quote) - the call is
// not visible in this copy; confirm against the original.
return buffer == quotation;
// NOTE(review): with the closing braces missing it is unclear under which condition this
// return is reached - confirm.
return buffer != quotation;
} else {
// Unquoted field: it ends at the separator or at a line break.
return buffer != separator && !isAtNewline();
* Determines if the current buffer indicates a line break.
private boolean isAtNewline() {
// Either half of a Windows line break, or a Unix line feed, counts as a line break.
return buffer == '\r' || buffer == '\n';
* Determines if we reached the end of the stream / reader.
public boolean isEOF() {
// -1 is the value Reader.read() returns at the end of the stream. Before the first read()
// 'buffer' still holds its default of 0, so this reports false until something has been read.
return buffer == -1;