com.univocity.parsers.csv.CsvParser Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of univocity-parsers Show documentation
Show all versions of univocity-parsers Show documentation
uniVocity's open source parsers for processing different text formats using a consistent API
/*******************************************************************************
* Copyright 2014 Univocity Software Pty Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
package com.univocity.parsers.csv;
import com.univocity.parsers.common.*;
import com.univocity.parsers.common.input.EOFException;
import com.univocity.parsers.common.input.*;
import java.io.*;
import static com.univocity.parsers.csv.UnescapedQuoteHandling.*;
/**
* A very fast CSV parser implementation.
*
* @author Univocity Software Pty Ltd - [email protected]
* @see CsvFormat
* @see CsvParserSettings
* @see CsvWriter
* @see AbstractParser
*/
public final class CsvParser extends AbstractParser {
private boolean parseUnescapedQuotes;
private boolean parseUnescapedQuotesUntilDelimiter;
private boolean backToDelimiter;
private final boolean doNotEscapeUnquotedValues;
private final boolean keepEscape;
private final boolean keepQuotes;
private boolean unescaped;
private char prev;
private char delimiter;
private char[] multiDelimiter;
private char quote;
private char quoteEscape;
private char escapeEscape;
private char newLine;
private final DefaultCharAppender whitespaceAppender;
private final boolean normalizeLineEndingsInQuotes;
private UnescapedQuoteHandling quoteHandling;
private final String nullValue;
private final int maxColumnLength;
private final String emptyValue;
private final boolean trimQuotedLeading;
private final boolean trimQuotedTrailing;
private char[] delimiters;
private int match = 0;
private int formatDetectorRowSampleCount;
/**
* The CsvParser supports all settings provided by {@link CsvParserSettings}, and requires this configuration to be properly initialized.
*
* @param settings the parser configuration
*/
public CsvParser(CsvParserSettings settings) {
super(settings);
parseUnescapedQuotes = settings.isParseUnescapedQuotes();
parseUnescapedQuotesUntilDelimiter = settings.isParseUnescapedQuotesUntilDelimiter();
doNotEscapeUnquotedValues = !settings.isEscapeUnquotedValues();
keepEscape = settings.isKeepEscapeSequences();
keepQuotes = settings.getKeepQuotes();
normalizeLineEndingsInQuotes = settings.isNormalizeLineEndingsWithinQuotes();
nullValue = settings.getNullValue();
emptyValue = settings.getEmptyValue();
maxColumnLength = settings.getMaxCharsPerColumn();
trimQuotedTrailing = settings.getIgnoreTrailingWhitespacesInQuotes();
trimQuotedLeading = settings.getIgnoreLeadingWhitespacesInQuotes();
formatDetectorRowSampleCount = settings.getFormatDetectorRowSampleCount();
updateFormat(settings.getFormat());
whitespaceAppender = new ExpandingCharAppender(10, "", whitespaceRangeStart);
this.quoteHandling = settings.getUnescapedQuoteHandling();
if (quoteHandling == null) {
if (parseUnescapedQuotes) {
if (parseUnescapedQuotesUntilDelimiter) {
quoteHandling = STOP_AT_DELIMITER;
} else {
quoteHandling = STOP_AT_CLOSING_QUOTE;
}
} else {
quoteHandling = RAISE_ERROR;
}
} else {
backToDelimiter = quoteHandling == BACK_TO_DELIMITER;
parseUnescapedQuotesUntilDelimiter = quoteHandling == STOP_AT_DELIMITER || quoteHandling == SKIP_VALUE || backToDelimiter;
parseUnescapedQuotes = quoteHandling != RAISE_ERROR;
}
}
@Override
protected final void parseRecord() {
if (multiDelimiter == null) {
parseSingleDelimiterRecord();
} else {
parseMultiDelimiterRecord();
}
}
private final void parseSingleDelimiterRecord() {
if (ch <= ' ' && ignoreLeadingWhitespace && whitespaceRangeStart < ch) {
ch = input.skipWhitespace(ch, delimiter, quote);
}
while (ch != newLine) {
if (ch <= ' ' && ignoreLeadingWhitespace && whitespaceRangeStart < ch) {
ch = input.skipWhitespace(ch, delimiter, quote);
}
if (ch == delimiter || ch == newLine) {
output.emptyParsed();
} else {
unescaped = false;
prev = '\0';
if (ch == quote) {
input.enableNormalizeLineEndings(normalizeLineEndingsInQuotes);
int len = output.appender.length();
if (len == 0) {
String value = input.getQuotedString(quote, quoteEscape, escapeEscape, maxColumnLength, delimiter, newLine, keepQuotes, keepEscape, trimQuotedLeading, trimQuotedTrailing);
if (value != null) {
output.valueParsed(value == "" ? emptyValue : value);
input.enableNormalizeLineEndings(true);
try {
ch = input.nextChar();
if (ch == delimiter) {
try {
ch = input.nextChar();
if (ch == newLine) {
output.emptyParsed();
}
} catch (EOFException e) {
output.emptyParsed();
return;
}
}
} catch (EOFException e) {
return;
}
continue;
}
} else if (len == -1 && input.skipQuotedString(quote, quoteEscape, delimiter, newLine)) {
output.valueParsed();
try {
ch = input.nextChar();
if (ch == delimiter) {
try {
ch = input.nextChar();
if (ch == newLine) {
output.emptyParsed();
}
} catch (EOFException e) {
output.emptyParsed();
return;
}
}
} catch (EOFException e) {
return;
}
continue;
}
output.trim = trimQuotedTrailing;
parseQuotedValue();
input.enableNormalizeLineEndings(true);
if (!(unescaped && quoteHandling == BACK_TO_DELIMITER && output.appender.length() == 0)) {
output.valueParsed();
}
} else if (doNotEscapeUnquotedValues) {
String value = null;
int len = output.appender.length();
if (len == 0) {
value = input.getString(ch, delimiter, ignoreTrailingWhitespace, nullValue, maxColumnLength);
}
if (value != null) {
output.valueParsed(value);
ch = input.getChar();
} else {
if (len != -1) {
output.trim = ignoreTrailingWhitespace;
ch = output.appender.appendUntil(ch, input, delimiter, newLine);
} else {
if (input.skipString(ch, delimiter)) {
ch = input.getChar();
} else {
ch = output.appender.appendUntil(ch, input, delimiter, newLine);
}
}
output.valueParsed();
}
} else {
output.trim = ignoreTrailingWhitespace;
parseValueProcessingEscape();
output.valueParsed();
}
}
if (ch != newLine) {
ch = input.nextChar();
if (ch == newLine) {
output.emptyParsed();
}
}
}
}
private void skipValue() {
output.appender.reset();
output.appender = NoopCharAppender.getInstance();
if (multiDelimiter == null) {
ch = NoopCharAppender.getInstance().appendUntil(ch, input, delimiter, newLine);
} else {
for (; match < multiDelimiter.length && ch != newLine; ch = input.nextChar()) {
if (multiDelimiter[match] == ch) {
match++;
} else {
match = 0;
}
}
}
}
private void handleValueSkipping(boolean quoted) {
switch (quoteHandling) {
case SKIP_VALUE:
skipValue();
break;
case RAISE_ERROR:
throw new TextParsingException(context, "Unescaped quote character '" + quote
+ "' inside " + (quoted ? "quoted" : "") + " value of CSV field. To allow unescaped quotes, set 'parseUnescapedQuotes' to 'true' in the CSV parser settings. Cannot parse CSV input.");
}
}
private void handleUnescapedQuoteInValue() {
switch (quoteHandling) {
case BACK_TO_DELIMITER:
case STOP_AT_CLOSING_QUOTE:
case STOP_AT_DELIMITER:
output.appender.append(quote);
prev = ch;
parseValueProcessingEscape();
break;
default:
handleValueSkipping(false);
break;
}
}
private int nextDelimiter() {
if (multiDelimiter == null) {
return output.appender.indexOfAny(delimiters, 0);
} else {
int lineEnd = output.appender.indexOf(newLine, 0);
int delimiter = output.appender.indexOf(multiDelimiter, 0);
return lineEnd != -1 && lineEnd < delimiter ? lineEnd : delimiter;
}
}
private boolean handleUnescapedQuote() {
unescaped = true;
switch (quoteHandling) {
case BACK_TO_DELIMITER:
int pos;
int lastPos = 0;
while ((pos = nextDelimiter()) != -1) {
lastPos = pos;
String value = output.appender.substring(0, pos);
if (keepQuotes && output.appender.charAt(pos - 1) == quote) {
value += quote;
}
output.valueParsed(value);
if (output.appender.charAt(pos) == newLine) {
output.pendingRecords.add(output.rowParsed());
output.appender.remove(0, pos + 1);
continue;
}
if (multiDelimiter == null) {
output.appender.remove(0, pos + 1);
} else {
output.appender.remove(0, pos + multiDelimiter.length);
}
}
if (keepQuotes && input.lastIndexOf(quote) > lastPos) {
output.appender.append(quote);
}
output.appender.append(ch);
prev = '\0';
if (multiDelimiter == null) {
parseQuotedValue();
} else {
parseQuotedValueMultiDelimiter();
}
return true;
case STOP_AT_CLOSING_QUOTE:
case STOP_AT_DELIMITER:
output.appender.append(quote);
output.appender.append(ch);
prev = ch;
if (multiDelimiter == null) {
parseQuotedValue();
} else {
parseQuotedValueMultiDelimiter();
}
return true; //continue;
default:
handleValueSkipping(true);
return false;
}
}
private void processQuoteEscape() {
if (ch == quoteEscape && prev == escapeEscape && escapeEscape != '\0') {
if (keepEscape) {
output.appender.append(escapeEscape);
}
output.appender.append(quoteEscape);
ch = '\0';
} else if (prev == quoteEscape) {
if (ch == quote) {
if (keepEscape) {
output.appender.append(quoteEscape);
}
output.appender.append(quote);
ch = '\0';
} else {
output.appender.append(prev);
}
} else if (ch == quote && prev == quote) {
output.appender.append(quote);
} else if (prev == quote) { //unescaped quote detected
handleUnescapedQuoteInValue();
}
}
private void parseValueProcessingEscape() {
while (ch != delimiter && ch != newLine) {
if (ch != quote && ch != quoteEscape) {
if (prev == quote) { //unescaped quote detected
handleUnescapedQuoteInValue();
return;
}
output.appender.append(ch);
} else {
processQuoteEscape();
}
prev = ch;
ch = input.nextChar();
}
}
private void parseQuotedValue() {
if (prev != '\0' && parseUnescapedQuotesUntilDelimiter) {
if (quoteHandling == SKIP_VALUE) {
skipValue();
return;
}
if (!keepQuotes) {
output.appender.prepend(quote);
}
ch = input.nextChar();
output.trim = ignoreTrailingWhitespace;
ch = output.appender.appendUntil(ch, input, delimiter, newLine);
} else {
if (keepQuotes && prev == '\0') {
output.appender.append(quote);
}
ch = input.nextChar();
if (trimQuotedLeading && ch <= ' ' && output.appender.length() == 0) {
while ((ch = input.nextChar()) <= ' ') ;
}
while (true) {
if (prev == quote && (ch <= ' ' && whitespaceRangeStart < ch || ch == delimiter || ch == newLine)) {
break;
}
if (ch != quote && ch != quoteEscape) {
if (prev == quote) { //unescaped quote detected
if (handleUnescapedQuote()) {
if (quoteHandling == SKIP_VALUE) {
break;
} else {
return;
}
} else {
return;
}
}
if (prev == quoteEscape && quoteEscape != '\0') {
output.appender.append(quoteEscape);
}
ch = output.appender.appendUntil(ch, input, quote, quoteEscape, escapeEscape);
prev = ch;
ch = input.nextChar();
} else {
processQuoteEscape();
prev = ch;
ch = input.nextChar();
if (unescaped && (ch == delimiter || ch == newLine)) {
return;
}
}
}
// handles whitespaces after quoted value: whitespaces are ignored. Content after whitespaces may be parsed if 'parseUnescapedQuotes' is enabled.
if (ch != delimiter && ch != newLine && ch <= ' ' && whitespaceRangeStart < ch) {
whitespaceAppender.reset();
do {
//saves whitespaces after value
whitespaceAppender.append(ch);
ch = input.nextChar();
//found a new line, go to next record.
if (ch == newLine) {
if (keepQuotes) {
output.appender.append(quote);
}
return;
}
} while (ch <= ' ' && whitespaceRangeStart < ch && ch != delimiter);
//there's more stuff after the quoted value, not only empty spaces.
if (ch != delimiter && parseUnescapedQuotes) {
if (output.appender instanceof DefaultCharAppender) {
//puts the quote before whitespaces back, then restores the whitespaces
output.appender.append(quote);
((DefaultCharAppender) output.appender).append(whitespaceAppender);
}
//the next character is not the escape character, put it there
if (parseUnescapedQuotesUntilDelimiter || ch != quote && ch != quoteEscape) {
output.appender.append(ch);
}
//sets this character as the previous character (may be escaping)
//calls recursively to keep parsing potentially quoted content
prev = ch;
parseQuotedValue();
} else if (keepQuotes) {
output.appender.append(quote);
}
} else if (keepQuotes) {
output.appender.append(quote);
}
if (ch != delimiter && ch != newLine) {
throw new TextParsingException(context, "Unexpected character '" + ch + "' following quoted value of CSV field. Expecting '" + delimiter + "'. Cannot parse CSV input.");
}
}
}
@Override
protected final InputAnalysisProcess getInputAnalysisProcess() {
if (settings.isDelimiterDetectionEnabled() || settings.isQuoteDetectionEnabled()) {
return new CsvFormatDetector(formatDetectorRowSampleCount, settings, whitespaceRangeStart) {
@Override
void apply(char delimiter, char quote, char quoteEscape) {
if (settings.isDelimiterDetectionEnabled()) {
CsvParser.this.delimiter = delimiter;
CsvParser.this.delimiters[0] = delimiter;
}
if (settings.isQuoteDetectionEnabled()) {
CsvParser.this.quote = quote;
CsvParser.this.quoteEscape = quoteEscape;
}
}
};
}
return null;
}
/**
* Returns the CSV format detected when one of the following settings is enabled:
*
* - {@link CommonParserSettings#isLineSeparatorDetectionEnabled()}
* - {@link CsvParserSettings#isDelimiterDetectionEnabled()}
* - {@link CsvParserSettings#isQuoteDetectionEnabled()}
*
*
* The detected format will be available once the parsing process is initialized (i.e. when {@link AbstractParser#beginParsing(Reader) runs}.
*
* @return the detected CSV format, or {@code null} if no detection has been enabled or if the parsing process has not been started yet.
*/
public final CsvFormat getDetectedFormat() {
CsvFormat out = null;
if (settings.isDelimiterDetectionEnabled()) {
out = settings.getFormat().clone();
out.setDelimiter(this.delimiter);
}
if (settings.isQuoteDetectionEnabled()) {
out = out == null ? settings.getFormat().clone() : out;
out.setQuote(quote);
out.setQuoteEscape(quoteEscape);
}
if (settings.isLineSeparatorDetectionEnabled()) {
out = out == null ? settings.getFormat().clone() : out;
out.setLineSeparator(input.getLineSeparator());
}
return out;
}
@Override
protected final boolean consumeValueOnEOF() {
if (ch == quote) {
if (prev == quote) {
if (keepQuotes) {
output.appender.append(quote);
}
return true;
} else {
if (!unescaped) {
output.appender.append(quote);
}
}
}
boolean out = prev != '\0' && ch != delimiter && ch != newLine && ch != comment;
ch = prev = '\0';
if (match > 0) {
saveMatchingCharacters();
return true;
}
return out;
}
/**
* Allows changing the format of the input on the fly.
*
* @param format the new format to use.
*/
public final void updateFormat(CsvFormat format) {
newLine = format.getNormalizedNewline();
multiDelimiter = format.getDelimiterString().toCharArray();
if (multiDelimiter.length == 1) {
multiDelimiter = null;
delimiter = format.getDelimiter();
delimiters = new char[]{delimiter, newLine};
} else {
delimiters = new char[]{multiDelimiter[0], newLine};
}
quote = format.getQuote();
quoteEscape = format.getQuoteEscape();
escapeEscape = format.getCharToEscapeQuoteEscaping();
}
private void skipWhitespace() {
while (ch <= ' ' && match < multiDelimiter.length && ch != newLine && ch != quote && whitespaceRangeStart < ch) {
ch = input.nextChar();
if (multiDelimiter[match] == ch) {
if (matchDelimiter()) {
output.emptyParsed();
ch = input.nextChar();
}
}
}
saveMatchingCharacters();
}
private void saveMatchingCharacters() {
if (match > 0) {
if (match < multiDelimiter.length) {
output.appender.append(multiDelimiter, 0, match);
}
match = 0;
}
}
private boolean matchDelimiter() {
while (ch == multiDelimiter[match]) {
match++;
if (match == multiDelimiter.length) {
break;
}
ch = input.nextChar();
}
if (multiDelimiter.length == match) {
match = 0;
return true;
}
if (match > 0) {
saveMatchingCharacters();
}
return false;
}
private boolean matchDelimiterAfterQuote() {
while (ch == multiDelimiter[match]) {
match++;
if (match == multiDelimiter.length) {
break;
}
ch = input.nextChar();
}
if (multiDelimiter.length == match) {
match = 0;
return true;
}
return false;
}
private void parseMultiDelimiterRecord() {
if (ch <= ' ' && ignoreLeadingWhitespace && whitespaceRangeStart < ch) {
skipWhitespace();
}
while (ch != newLine) {
if (ch <= ' ' && ignoreLeadingWhitespace && whitespaceRangeStart < ch) {
skipWhitespace();
}
if (ch == newLine || matchDelimiter()) {
output.emptyParsed();
} else {
unescaped = false;
prev = '\0';
if (ch == quote && output.appender.length() == 0) {
input.enableNormalizeLineEndings(normalizeLineEndingsInQuotes);
output.trim = trimQuotedTrailing;
parseQuotedValueMultiDelimiter();
input.enableNormalizeLineEndings(true);
if (!(unescaped && quoteHandling == BACK_TO_DELIMITER && output.appender.length() == 0)) {
output.valueParsed();
}
} else if (doNotEscapeUnquotedValues) {
appendUntilMultiDelimiter();
if (ignoreTrailingWhitespace) {
output.appender.updateWhitespace();
}
output.valueParsed();
} else {
output.trim = ignoreTrailingWhitespace;
parseValueProcessingEscapeMultiDelimiter();
output.valueParsed();
}
}
if (ch != newLine) {
ch = input.nextChar();
if (ch == newLine) {
output.emptyParsed();
}
}
}
}
private void appendUntilMultiDelimiter() {
while (match < multiDelimiter.length && ch != newLine) {
if (multiDelimiter[match] == ch) {
match++;
if (match == multiDelimiter.length) {
break;
}
} else {
if (match > 0) {
saveMatchingCharacters();
continue;
}
output.appender.append(ch);
}
ch = input.nextChar();
}
saveMatchingCharacters();
}
private void parseQuotedValueMultiDelimiter() {
if (prev != '\0' && parseUnescapedQuotesUntilDelimiter) {
if (quoteHandling == SKIP_VALUE) {
skipValue();
return;
}
if (!keepQuotes) {
output.appender.prepend(quote);
}
ch = input.nextChar();
output.trim = ignoreTrailingWhitespace;
appendUntilMultiDelimiter();
} else {
if (keepQuotes && prev == '\0') {
output.appender.append(quote);
}
ch = input.nextChar();
if (trimQuotedLeading && ch <= ' ' && output.appender.length() == 0) {
while ((ch = input.nextChar()) <= ' ') ;
}
while (true) {
if (prev == quote && (ch <= ' ' && whitespaceRangeStart < ch || ch == newLine)) {
break;
}
if (prev == quote && matchDelimiter()) {
if (keepQuotes) {
output.appender.append(quote);
}
return;
}
if (ch != quote && ch != quoteEscape) {
if (prev == quote) { //unescaped quote detected
if (handleUnescapedQuote()) {
if (quoteHandling == SKIP_VALUE) {
break;
} else {
return;
}
} else {
return;
}
}
if (prev == quoteEscape && quoteEscape != '\0') {
output.appender.append(quoteEscape);
}
ch = output.appender.appendUntil(ch, input, quote, quoteEscape, escapeEscape);
prev = ch;
ch = input.nextChar();
} else {
processQuoteEscape();
prev = ch;
ch = input.nextChar();
if (unescaped && (ch == newLine || matchDelimiter())) {
return;
}
}
}
}
// handles whitespaces after quoted value: whitespaces are ignored. Content after whitespaces may be parsed if 'parseUnescapedQuotes' is enabled.
if (ch != newLine && ch <= ' ' && whitespaceRangeStart < ch && !matchDelimiterAfterQuote()) {
whitespaceAppender.reset();
do {
//saves whitespaces after value
whitespaceAppender.append(ch);
ch = input.nextChar();
//found a new line, go to next record.
if (ch == newLine) {
if (keepQuotes) {
output.appender.append(quote);
}
return;
}
if (matchDelimiterAfterQuote()) {
return;
}
} while (ch <= ' ' && whitespaceRangeStart < ch);
//there's more stuff after the quoted value, not only empty spaces.
if (parseUnescapedQuotes && !matchDelimiterAfterQuote()) {
if (output.appender instanceof DefaultCharAppender) {
//puts the quote before whitespaces back, then restores the whitespaces
output.appender.append(quote);
((DefaultCharAppender) output.appender).append(whitespaceAppender);
}
//the next character is not the escape character, put it there
if (parseUnescapedQuotesUntilDelimiter || ch != quote && ch != quoteEscape) {
output.appender.append(ch);
}
//sets this character as the previous character (may be escaping)
//calls recursively to keep parsing potentially quoted content
prev = ch;
parseQuotedValue();
} else if (keepQuotes) {
output.appender.append(quote);
}
} else if (keepQuotes && (!unescaped || quoteHandling == STOP_AT_CLOSING_QUOTE)) {
output.appender.append(quote);
}
}
private void parseValueProcessingEscapeMultiDelimiter() {
while (ch != newLine && !matchDelimiter()) {
if (ch != quote && ch != quoteEscape) {
if (prev == quote) { //unescaped quote detected
handleUnescapedQuoteInValue();
return;
}
output.appender.append(ch);
} else {
processQuoteEscape();
}
prev = ch;
ch = input.nextChar();
}
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy