/*
 * de.siegmar.fastcsv.reader.CsvReader Maven / Gradle / Ivy
 *
 * Go to download
 * Show more of this group Show more artifacts with this name
 * Show all versions of fastcsv Show documentation
 * Lightning-fast, dependency-free CSV library that conforms to RFC standards.
 * There is a newer version: 3.4.0
 */
package de.siegmar.fastcsv.reader;

import java.io.Closeable;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.UncheckedIOException;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.NoSuchElementException;
import java.util.Objects;
import java.util.Spliterator;
import java.util.StringJoiner;
import java.util.function.Consumer;
import java.util.stream.Stream;
import java.util.stream.StreamSupport;

/**
 * This is the main class for reading CSV data.
 * <p>
 * Example use:
 * {@snippet :
 * try (CsvReader<CsvRecord> csv = CsvReader.builder().ofCsvRecord(file)) {
 *     for (CsvRecord csvRecord : csv) {
 *         // ...
 *     }
 * }
 *}
 * <p>
 * Example for named records:
 * {@snippet :
 * try (CsvReader<NamedCsvRecord> csv = CsvReader.builder().ofNamedCsvRecord(file)) {
 *     for (NamedCsvRecord csvRecord : csv) {
 *         // ...
 *     }
 * }
 *}
 *
 * @param <T> the type of the CSV record.
 */
public final class CsvReader<T> implements Iterable<T>, Closeable {

    private final CsvParser csvParser;
    private final CsvCallbackHandler<T> callbackHandler;
    private final CommentStrategy commentStrategy;
    private final boolean skipEmptyLines;
    private final boolean ignoreDifferentFieldCount;

    // A single iterator instance is shared by all calls to iterator().
    private final CloseableIterator<T> csvRecordIterator = new CsvRecordIterator();

    // Field count of the first checked record; -1 until such a record has been seen.
    private int firstRecordFieldCount = -1;

    @SuppressWarnings("checkstyle:ParameterNumber")
    CsvReader(final CsvParser csvParser, final CsvCallbackHandler<T> callbackHandler,
              final CommentStrategy commentStrategy, final boolean skipEmptyLines,
              final boolean ignoreDifferentFieldCount) {

        this.csvParser = csvParser;
        this.callbackHandler = callbackHandler;
        this.commentStrategy = commentStrategy;
        this.skipEmptyLines = skipEmptyLines;
        this.ignoreDifferentFieldCount = ignoreDifferentFieldCount;
    }

    /**
     * Constructs a {@link CsvReaderBuilder} to configure and build instances of this class.
     *
     * @return a new {@link CsvReaderBuilder} instance.
     */
    public static CsvReaderBuilder builder() {
        return new CsvReaderBuilder();
    }

    /**
     * Returns an iterator over the records of this reader.
     * <p>
     * The returned iterator is not thread-safe.
     * Don't forget to close the returned iterator when you're done.
     * Alternatively, use {@link #stream()}.
     * <p>
     * This method is idempotent: every call returns the same iterator instance.
     *
     * @return an iterator over the CSV records.
     * @throws UncheckedIOException if an I/O error occurs.
     * @throws CsvParseException    if any other problem occurs when parsing the CSV data.
     * @see #stream()
     */
    @Override
    public CloseableIterator<T> iterator() {
        return csvRecordIterator;
    }

    /**
     * Returns a {@link Spliterator} over the records of this reader.
     * <p>
     * The returned spliterator is not thread-safe.
     * Don't forget to invoke {@link #close()} when you're done.
     * Alternatively, use {@link #stream()}.
     * <p>
     * This method is idempotent.
     *
     * @return a spliterator over the CSV records.
     * @throws UncheckedIOException if an I/O error occurs.
     * @throws CsvParseException    if any other problem occurs when parsing the CSV data.
     * @see #stream()
     */
    @Override
    public Spliterator<T> spliterator() {
        return new CsvSpliterator();
    }

    /**
     * Returns a sequential {@code Stream} with this reader as its source.
     * <p>
     * The returned stream is not thread-safe.
     * Don't forget to close the returned stream when you're done; closing the stream
     * closes this reader.
     * <p>
     * This method is idempotent.
     *
     * @return a sequential {@code Stream} over the CSV records.
     * @throws UncheckedIOException if an I/O error occurs.
     * @throws CsvParseException    if any other problem occurs when parsing the CSV data.
     * @see #iterator()
     */
    public Stream<T> stream() {
        return StreamSupport.stream(spliterator(), false)
            .onClose(() -> {
                try {
                    close();
                } catch (final IOException e) {
                    // Runnable cannot throw checked exceptions.
                    throw new UncheckedIOException(e);
                }
            });
    }

    /**
     * Parses until a record is available that is not filtered out by {@link #processRecord()}.
     * Once the parser reports no more data, the callback handler is terminated.
     *
     * @return the next record or {@code null} if no more data is available.
     * @throws IOException if the parser fails to read.
     */
    @SuppressWarnings({
        "PMD.AvoidBranchingStatementAsLastInLoop",
        "PMD.AssignmentInOperand"
    })
    private T fetchRecord() throws IOException {
        while (csvParser.parse()) {
            final T csvRecord = processRecord();

            if (csvRecord != null) {
                return csvRecord;
            }
        }

        callbackHandler.terminate();

        return null;
    }

    /**
     * Builds the record for the data just parsed and applies the comment, empty line and
     * field count settings of this reader.
     *
     * @return the record to hand out or {@code null} if the parsed data has to be skipped.
     */
    @SuppressWarnings("checkstyle:ReturnCount")
    private T processRecord() {
        final RecordWrapper<T> recordWrapper = callbackHandler.buildRecord();

        // handle consumed records (e.g. header for named records)
        if (recordWrapper == null) {
            return null;
        }

        // handle comment lines
        if (recordWrapper.isComment()) {
            return commentStrategy == CommentStrategy.SKIP ? null : recordWrapper.getWrappedRecord();
        }

        // handle empty lines
        if (recordWrapper.isEmptyLine()) {
            return skipEmptyLines ? null : recordWrapper.getWrappedRecord();
        }

        // check field count consistency (comments and empty lines never reach this point)
        if (!ignoreDifferentFieldCount) {
            checkFieldCountConsistency(recordWrapper.getFieldCount());
        }

        return recordWrapper.getWrappedRecord();
    }

    /**
     * Remembers the field count of the first checked record and verifies that every
     * subsequent record has the same number of fields.
     *
     * @param fieldCount the number of fields of the current record.
     * @throws CsvParseException if the field count differs from the one of the first record.
     */
    private void checkFieldCountConsistency(final int fieldCount) {
        if (firstRecordFieldCount == -1) {
            firstRecordFieldCount = fieldCount;
        } else if (fieldCount != firstRecordFieldCount) {
            // NOTE: the first number reported is the starting line number of the record.
            throw new CsvParseException(
                String.format("Record %d has %d fields, but first record had %d fields",
                    csvParser.getStartingLineNumber(), fieldCount, firstRecordFieldCount));
        }
    }

    /**
     * Closes the underlying parser.
     *
     * @throws IOException if closing the parser fails.
     */
    @Override
    public void close() throws IOException {
        csvParser.close();
    }

    @Override
    public String toString() {
        return new StringJoiner(", ", CsvReader.class.getSimpleName() + "[", "]")
            .add("commentStrategy=" + commentStrategy)
            .add("skipEmptyLines=" + skipEmptyLines)
            .add("ignoreDifferentFieldCount=" + ignoreDifferentFieldCount)
            .toString();
    }

    /**
     * Fetches the next record and converts any failure into an unchecked exception whose
     * message names the line in which the failing record started.
     *
     * @return the next record or {@code null} if no more data is available.
     * @throws UncheckedIOException if an I/O error occurs.
     * @throws CsvParseException    if any other problem occurs.
     */
    @SuppressWarnings({"checkstyle:IllegalCatch", "PMD.AvoidCatchingThrowable"})
    private T fetch() {
        try {
            return fetchRecord();
        } catch (final IOException e) {
            throw new UncheckedIOException(buildExceptionMessage(), e);
        } catch (final Throwable t) {
            throw new CsvParseException(buildExceptionMessage(), t);
        }
    }

    private String buildExceptionMessage() {
        return (csvParser.getStartingLineNumber() == 1)
            ? "Exception when reading first record"
            : String.format("Exception when reading record that started in line %d",
            csvParser.getStartingLineNumber());
    }

    /**
     * Non-splittable spliterator of unknown size that pulls records via {@link #fetch()}.
     */
    private class CsvSpliterator implements Spliterator<T> {

        @Override
        public boolean tryAdvance(final Consumer<? super T> action) {
            final T t = fetch();
            if (t != null) {
                action.accept(t);
                return true;
            }
            return false;
        }

        @Override
        public Spliterator<T> trySplit() {
            // splitting is not supported
            return null;
        }

        @Override
        public long estimateSize() {
            // the number of records is unknown
            return Long.MAX_VALUE;
        }

        @Override
        public int characteristics() {
            return ORDERED | NONNULL;
        }

    }

    /**
     * Iterator that fetches one record ahead in {@link #hasNext()} and hands it out
     * in {@link #next()}.
     */
    private class CsvRecordIterator implements CloseableIterator<T> {

        // Record fetched last; null once no more data is available.
        private T fetchedRecord;

        // Whether fetchedRecord has been fetched but not yet handed out by next().
        private boolean fetched;

        @Override
        public boolean hasNext() {
            if (!fetched) {
                fetchedRecord = fetch();
                fetched = true;
            }
            return fetchedRecord != null;
        }

        @Override
        public T next() {
            if (!fetched) {
                fetchedRecord = fetch();
            }
            if (fetchedRecord == null) {
                throw new NoSuchElementException();
            }

            fetched = false;
            return fetchedRecord;
        }

        @Override
        public void close() throws IOException {
            CsvReader.this.close();
        }

    }

    /**
     * This builder is used to create configured instances of {@link CsvReader}. The default
     * configuration of this class complies with RFC 4180:
     * <ul>
     *     <li>Field separator: {@code ,} (comma)</li>
     *     <li>Quote character: {@code "} (double quotes)</li>
     *     <li>Comment strategy: {@link CommentStrategy#NONE} (as RFC doesn't handle comments)</li>
     *     <li>Comment character: {@code #} (hash) (in case comment strategy is enabled)</li>
     *     <li>Skip empty lines: {@code true}</li>
     *     <li>Ignore different field count: {@code true}</li>
     *     <li>Detect BOM header: {@code false}</li>
     * </ul>
     * <p>
     * The line delimiter (line-feed, carriage-return or the combination of both) is detected
     * automatically and thus not configurable.
     */
    @SuppressWarnings({"checkstyle:HiddenField", "PMD.AvoidFieldNameMatchingMethodName"})
    public static final class CsvReaderBuilder {

        private char fieldSeparator = ',';
        private char quoteCharacter = '"';
        private CommentStrategy commentStrategy = CommentStrategy.NONE;
        private char commentCharacter = '#';
        private boolean skipEmptyLines = true;
        private boolean ignoreDifferentFieldCount = true;
        private boolean detectBomHeader;

        private CsvReaderBuilder() {
        }

        /**
         * Sets the {@code fieldSeparator} used when reading CSV data.
         *
         * @param fieldSeparator the field separator character (default: {@code ,} - comma).
         * @return This updated object, allowing additional method calls to be chained together.
         */
        public CsvReaderBuilder fieldSeparator(final char fieldSeparator) {
            this.fieldSeparator = fieldSeparator;
            return this;
        }

        /**
         * Sets the {@code quoteCharacter} used when reading CSV data.
         *
         * @param quoteCharacter the character used to enclose fields
         *                       (default: {@code "} - double quotes).
         * @return This updated object, allowing additional method calls to be chained together.
         */
        public CsvReaderBuilder quoteCharacter(final char quoteCharacter) {
            this.quoteCharacter = quoteCharacter;
            return this;
        }

        /**
         * Sets the strategy that defines how (and if) commented lines should be handled
         * (default: {@link CommentStrategy#NONE} as comments are not defined in RFC 4180).
         * <p>
         * If a comment strategy other than {@link CommentStrategy#NONE} is used, special parsing rules are
         * applied for commented lines. FastCSV defines a comment as a line that starts with a comment character.
         * No (whitespace) character is allowed before the comment character. Everything after the comment character
         * until the end of the line is considered the comment value.
         *
         * @param commentStrategy the strategy for handling comments.
         * @return This updated object, allowing additional method calls to be chained together.
         * @see #commentCharacter(char)
         */
        public CsvReaderBuilder commentStrategy(final CommentStrategy commentStrategy) {
            this.commentStrategy = commentStrategy;
            return this;
        }

        /**
         * Sets the {@code commentCharacter} used to comment lines.
         *
         * @param commentCharacter the character used to comment lines (default: {@code #} - hash)
         * @return This updated object, allowing additional method calls to be chained together.
         * @see #commentStrategy(CommentStrategy)
         */
        public CsvReaderBuilder commentCharacter(final char commentCharacter) {
            this.commentCharacter = commentCharacter;
            return this;
        }

        /**
         * Defines whether empty lines should be skipped when reading data.
         * <p>
         * The default implementation interprets empty lines as lines that do not contain any data.
         * This includes lines that consist only of opening and closing quote characters.
         * <p>
         * A line that only contains whitespace characters is not considered empty.
         * However, the determination of empty lines is done after field modifiers have been applied.
         * If you use a field trimming modifier (like {@link FieldModifiers#TRIM}), lines that only contain whitespaces
         * are considered empty.
         * <p>
         * Commented lines are not considered empty lines. Use {@link #commentStrategy(CommentStrategy)} for handling
         * commented lines.
         *
         * @param skipEmptyLines Whether empty lines should be skipped (default: {@code true}).
         * @return This updated object, allowing additional method calls to be chained together.
         */
        public CsvReaderBuilder skipEmptyLines(final boolean skipEmptyLines) {
            this.skipEmptyLines = skipEmptyLines;
            return this;
        }

        /**
         * Defines if a {@link CsvParseException} should be thrown if records contain a
         * different number of fields.
         *
         * @param ignoreDifferentFieldCount if exception should be suppressed, when CSV data contains
         *                                  different field count (default: {@code true}).
         * @return This updated object, allowing additional method calls to be chained together.
         */
        public CsvReaderBuilder ignoreDifferentFieldCount(final boolean ignoreDifferentFieldCount) {
            this.ignoreDifferentFieldCount = ignoreDifferentFieldCount;
            return this;
        }

        /**
         * Defines if an optional BOM (Byte order mark) header should be detected.
         * BOM detection only applies for direct file access.
         * <p>
         * Supported BOMs are: UTF-8, UTF-16LE, UTF-16BE, UTF-32LE, UTF-32BE.
         *
         * @param detectBomHeader if detection should be enabled (default: {@code false})
         * @return This updated object, allowing additional method calls to be chained together.
         */
        public CsvReaderBuilder detectBomHeader(final boolean detectBomHeader) {
            this.detectBomHeader = detectBomHeader;
            return this;
        }

        /**
         * Constructs a new {@link CsvReader} that uses {@link CsvRecord} as record type.
         * <p>
         * This is a convenience method for calling {@link #build(CsvCallbackHandler, Reader)} with
         * {@link CsvRecordHandler} as callback handler.
         *
         * @param reader the data source to read from.
         * @return a new CsvReader of CsvRecord - never {@code null}.
         * @throws NullPointerException if reader is {@code null}
         */
        public CsvReader<CsvRecord> ofCsvRecord(final Reader reader) {
            return build(new CsvRecordHandler(), reader);
        }

        /**
         * Constructs a new {@link CsvReader} that uses {@link CsvRecord} as record type.
         * <p>
         * This is a convenience method for calling {@link #build(CsvCallbackHandler, String)} with
         * {@link CsvRecordHandler} as callback handler.
         *
         * @param data the data to read.
         * @return a new CsvReader of CsvRecord - never {@code null}.
         * @throws NullPointerException if data is {@code null}
         */
        public CsvReader<CsvRecord> ofCsvRecord(final String data) {
            return build(new CsvRecordHandler(), data);
        }

        /**
         * Constructs a new {@link CsvReader} of {@link CsvRecord} for the specified file.
         * <p>
         * This is a convenience method for calling {@link #build(CsvCallbackHandler, Path)} with
         * {@link CsvRecordHandler} as callback handler.
         *
         * @param file the file to read data from.
         * @return a new CsvReader of CsvRecord - never {@code null}. Don't forget to close it!
         * @throws IOException          if an I/O error occurs.
         * @throws NullPointerException if file is {@code null}
         */
        public CsvReader<CsvRecord> ofCsvRecord(final Path file) throws IOException {
            return build(new CsvRecordHandler(), file);
        }

        /**
         * Constructs a new {@link CsvReader} of {@link CsvRecord} for the specified file.
         * <p>
         * This is a convenience method for calling {@link #build(CsvCallbackHandler, Path, Charset)} with
         * {@link CsvRecordHandler} as callback handler.
         *
         * @param file    the file to read data from.
         * @param charset the character set to use. If BOM header detection is enabled
         *                (via {@link #detectBomHeader(boolean)}), this acts as a default
         *                when no BOM header was found.
         * @return a new CsvReader of CsvRecord - never {@code null}. Don't forget to close it!
         * @throws IOException          if an I/O error occurs.
         * @throws NullPointerException if file or charset is {@code null}
         */
        public CsvReader<CsvRecord> ofCsvRecord(final Path file, final Charset charset) throws IOException {
            return build(new CsvRecordHandler(), file, charset);
        }

        /**
         * Constructs a new {@link CsvReader} that uses {@link NamedCsvRecord} as record type.
         * <p>
         * This is a convenience method for calling {@link #build(CsvCallbackHandler, Reader)} with
         * {@link NamedCsvRecordHandler} as callback handler.
         *
         * @param reader the data source to read from.
         * @return a new CsvReader of NamedCsvRecord - never {@code null}.
         * @throws NullPointerException if reader is {@code null}
         */
        public CsvReader<NamedCsvRecord> ofNamedCsvRecord(final Reader reader) {
            return build(new NamedCsvRecordHandler(), reader);
        }

        /**
         * Constructs a new {@link CsvReader} that uses {@link NamedCsvRecord} as record type.
         * <p>
         * This is a convenience method for calling {@link #build(CsvCallbackHandler, String)} with
         * {@link NamedCsvRecordHandler} as callback handler.
         *
         * @param data the data to read.
         * @return a new CsvReader of NamedCsvRecord - never {@code null}.
         * @throws NullPointerException if data is {@code null}
         */
        public CsvReader<NamedCsvRecord> ofNamedCsvRecord(final String data) {
            return build(new NamedCsvRecordHandler(), data);
        }

        /**
         * Constructs a new {@link CsvReader} of {@link NamedCsvRecord} for the specified file.
         * <p>
         * This is a convenience method for calling {@link #build(CsvCallbackHandler, Path)} with
         * {@link NamedCsvRecordHandler} as callback handler.
         *
         * @param file the file to read data from.
         * @return a new CsvReader of NamedCsvRecord - never {@code null}. Don't forget to close it!
         * @throws IOException          if an I/O error occurs.
         * @throws NullPointerException if file is {@code null}
         */
        public CsvReader<NamedCsvRecord> ofNamedCsvRecord(final Path file) throws IOException {
            return build(new NamedCsvRecordHandler(), file);
        }

        /**
         * Constructs a new {@link CsvReader} of {@link NamedCsvRecord} for the specified file.
         * <p>
         * This is a convenience method for calling {@link #build(CsvCallbackHandler, Path, Charset)} with
         * {@link NamedCsvRecordHandler} as callback handler.
         *
         * @param file    the file to read data from.
         * @param charset the character set to use. If BOM header detection is enabled
         *                (via {@link #detectBomHeader(boolean)}), this acts as a default
         *                when no BOM header was found.
         * @return a new CsvReader of NamedCsvRecord - never {@code null}. Don't forget to close it!
         * @throws IOException          if an I/O error occurs.
         * @throws NullPointerException if file or charset is {@code null}
         */
        public CsvReader<NamedCsvRecord> ofNamedCsvRecord(final Path file, final Charset charset)
            throws IOException {
            return build(new NamedCsvRecordHandler(), file, charset);
        }

        /**
         * Constructs a new {@link CsvReader} for the specified arguments.
         * <p>
         * This library uses built-in buffering, so you do not need to pass in a buffered Reader
         * implementation such as {@link java.io.BufferedReader}. Performance is likely even
         * better if you do not.
         * <p>
         * Use {@link #build(CsvCallbackHandler, Path)} for optimal performance when
         * reading files and {@link #build(CsvCallbackHandler, String)} when reading Strings.
         *
         * @param <T>             the type of the CSV record.
         * @param callbackHandler the record handler to use. Do not reuse a handler after it has been used!
         * @param reader          the data source to read from.
         * @return a new CsvReader - never {@code null}.
         * @throws NullPointerException if callbackHandler or reader is {@code null}
         */
        public <T> CsvReader<T> build(final CsvCallbackHandler<T> callbackHandler, final Reader reader) {
            Objects.requireNonNull(callbackHandler, "callbackHandler must not be null");
            Objects.requireNonNull(reader, "reader must not be null");

            final CsvParser csvParser = new CsvParser(fieldSeparator, quoteCharacter, commentStrategy,
                commentCharacter, callbackHandler, reader);

            return newReader(callbackHandler, csvParser);
        }

        /**
         * Constructs a new {@link CsvReader} for the specified arguments.
         *
         * @param <T>             the type of the CSV record.
         * @param callbackHandler the record handler to use. Do not reuse a handler after it has been used!
         * @param data            the data to read.
         * @return a new CsvReader - never {@code null}.
         * @throws NullPointerException if callbackHandler or data is {@code null}
         */
        public <T> CsvReader<T> build(final CsvCallbackHandler<T> callbackHandler, final String data) {
            Objects.requireNonNull(callbackHandler, "callbackHandler must not be null");
            Objects.requireNonNull(data, "data must not be null");

            final CsvParser csvParser = new CsvParser(fieldSeparator, quoteCharacter, commentStrategy,
                commentCharacter, callbackHandler, data);

            return newReader(callbackHandler, csvParser);
        }

        /**
         * Constructs a new {@link CsvReader} for the specified file.
         * <p>
         * This is a convenience method for calling {@link #build(CsvCallbackHandler, Path, Charset)}
         * with {@link StandardCharsets#UTF_8} as charset.
         *
         * @param <T>             the type of the CSV record.
         * @param callbackHandler the record handler to use. Do not reuse a handler after it has been used!
         * @param file            the file to read data from.
         * @return a new CsvReader - never {@code null}. Don't forget to close it!
         * @throws IOException          if an I/O error occurs.
         * @throws NullPointerException if callbackHandler or file is {@code null}
         */
        public <T> CsvReader<T> build(final CsvCallbackHandler<T> callbackHandler, final Path file)
            throws IOException {
            return build(callbackHandler, file, StandardCharsets.UTF_8);
        }

        /**
         * Constructs a new {@link CsvReader} for the specified arguments.
         *
         * @param <T>             the type of the CSV record.
         * @param callbackHandler the record handler to use. Do not reuse a handler after it has been used!
         * @param file            the file to read data from.
         * @param charset         the character set to use. If BOM header detection is enabled
         *                        (via {@link #detectBomHeader(boolean)}), this acts as a default
         *                        when no BOM header was found.
         * @return a new CsvReader - never {@code null}. Don't forget to close it!
         * @throws IOException          if an I/O error occurs.
         * @throws NullPointerException if callbackHandler, file or charset is {@code null}
         */
        public <T> CsvReader<T> build(final CsvCallbackHandler<T> callbackHandler,
                                      final Path file, final Charset charset) throws IOException {
            Objects.requireNonNull(callbackHandler, "callbackHandler must not be null");
            Objects.requireNonNull(file, "file must not be null");
            Objects.requireNonNull(charset, "charset must not be null");

            final Reader reader = detectBomHeader
                ? BomUtil.openReader(file, charset)
                : new InputStreamReader(Files.newInputStream(file), charset);

            try {
                return build(callbackHandler, reader);
            } catch (final RuntimeException e) {
                // The file has been opened by this method; if no CsvReader can be handed out
                // (presumably when the parser rejects the configuration), nobody else is able
                // to close it.
                try {
                    reader.close();
                } catch (final IOException closeException) {
                    e.addSuppressed(closeException);
                }
                throw e;
            }
        }

        private <T> CsvReader<T> newReader(final CsvCallbackHandler<T> callbackHandler,
                                           final CsvParser csvParser) {
            return new CsvReader<>(csvParser, callbackHandler,
                commentStrategy, skipEmptyLines, ignoreDifferentFieldCount);
        }

        @Override
        public String toString() {
            return new StringJoiner(", ", CsvReaderBuilder.class.getSimpleName() + "[", "]")
                .add("fieldSeparator=" + fieldSeparator)
                .add("quoteCharacter=" + quoteCharacter)
                .add("commentStrategy=" + commentStrategy)
                .add("commentCharacter=" + commentCharacter)
                .add("skipEmptyLines=" + skipEmptyLines)
                .add("ignoreDifferentFieldCount=" + ignoreDifferentFieldCount)
                .add("detectBomHeader=" + detectBomHeader)
                .toString();
        }

    }

}