// com.univocity.parsers.csv.CsvParserSettings Maven / Gradle / Ivy
//
// Show more of this group Show more artifacts with this name
// Show all versions of univocity-parsers Show documentation
// univocity's open source parsers for processing different text formats using a consistent API
// There is a newer version: 2.9.1
/*******************************************************************************
 * Copyright 2014 uniVocity Software Pty Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 ******************************************************************************/
package com.univocity.parsers.csv;

import com.univocity.parsers.common.*;
import com.univocity.parsers.common.input.*;

import java.util.*;

/**
 * This is the configuration class used by the CSV parser ({@link CsvParser})
 *
 * <p>In addition to the configuration options provided by {@link CommonParserSettings}, the CsvParserSettings include:
 *
 * <ul>
 * <li><b>emptyValue <i>(defaults to null)</i>:</b> Defines a replacement string to signify an empty value (which is not a null value).
 * When reading, if the parser does not read any character from the input, and the input is within quotes, the empty value is used instead of an empty string</li>
 * </ul>
 *
 * @author uniVocity Software Pty Ltd - [email protected]
 * @see com.univocity.parsers.csv.CsvParser
 * @see com.univocity.parsers.csv.CsvFormat
 * @see com.univocity.parsers.common.CommonParserSettings
 */
public class CsvParserSettings extends CommonParserSettings<CsvFormat> {

	private String emptyValue = null;

	// Legacy (deprecated) switches for unescaped quote handling; see unescapedQuoteHandling below.
	private boolean parseUnescapedQuotes = true;
	private boolean parseUnescapedQuotesUntilDelimiter = true;

	private boolean escapeUnquotedValues = false;
	private boolean keepEscapeSequences = false;
	private boolean keepQuotes = false;
	private boolean normalizeLineEndingsWithinQuotes = true;

	private boolean ignoreTrailingWhitespacesInQuotes = false;
	private boolean ignoreLeadingWhitespacesInQuotes = false;

	private boolean delimiterDetectionEnabled = false;
	private boolean quoteDetectionEnabled = false;
	private UnescapedQuoteHandling unescapedQuoteHandling = null;
	// Always a private copy of what the caller supplied (or null); never handed out directly.
	private char[] delimitersForDetection = null;

	/**
	 * Returns the String representation of an empty value (defaults to null)
	 *
	 * <p>When reading, if the parser does not read any character from the input, and the input is within quotes, the empty value is used instead of an empty string
	 *
	 * @return the String representation of an empty value
	 */
	public String getEmptyValue() {
		return emptyValue;
	}

	/**
	 * Sets the String representation of an empty value (defaults to null)
	 *
	 * <p>When reading, if the parser does not read any character from the input, and the input is within quotes, the empty value is used instead of an empty string
	 *
	 * @param emptyValue the String representation of an empty value
	 */
	public void setEmptyValue(String emptyValue) {
		this.emptyValue = emptyValue;
	}

	/**
	 * Returns an instance of CharAppender with the configured limit of maximum characters per column and the default value used to represent an empty value (when the String parsed from the input, within quotes, is empty)
	 *
	 * <p>This overrides the parent's version because the CSV parser does not rely on the appender to identify null values, but on the other hand, the appender is required to identify empty values.
	 *
	 * <p>A {@link DefaultCharAppender} is created when {@code getMaxCharsPerColumn()} is anything other than {@code -1};
	 * otherwise an {@link ExpandingCharAppender} is created.
	 *
	 * @return an instance of CharAppender with the configured limit of maximum characters per column and the default value used to represent an empty value (when the String parsed from the input, within quotes, is empty)
	 */
	@Override
	protected CharAppender newCharAppender() {
		int chars = getMaxCharsPerColumn();
		if (chars != -1) {
			return new DefaultCharAppender(chars, emptyValue, getWhitespaceRangeStart());
		} else {
			return new ExpandingCharAppender(emptyValue, getWhitespaceRangeStart());
		}
	}

	/**
	 * Returns the default CsvFormat configured to handle CSV inputs compliant to the RFC4180 standard.
	 *
	 * @return an instance of CsvFormat configured to handle CSV inputs compliant to the RFC4180 standard.
	 */
	@Override
	protected CsvFormat createDefaultFormat() {
		return new CsvFormat();
	}

	/**
	 * Indicates whether the CSV parser should accept unescaped quotes inside quoted values and parse them normally. Defaults to {@code true}.
	 *
	 * <p>This getter returns {@code true} when the legacy flag set through {@link #setParseUnescapedQuotes(boolean)} is {@code true},
	 * <em>or</em> when {@link #getUnescapedQuoteHandling()} is non-null and different from {@link UnescapedQuoteHandling#RAISE_ERROR}.
	 * Note that, because the legacy flag defaults to {@code true}, this method can still return {@code true} when the handling is
	 * {@link UnescapedQuoteHandling#RAISE_ERROR}; consult {@link #getUnescapedQuoteHandling()} first to determine the effective configuration.
	 *
	 * @return a flag indicating whether or not the CSV parser should accept unescaped quotes inside quoted values.
	 *
	 * @deprecated use {@link #getUnescapedQuoteHandling()} instead. The configuration returned by {@link #getUnescapedQuoteHandling()} will override this setting if not null.
	 */
	@Deprecated
	public boolean isParseUnescapedQuotes() {
		return parseUnescapedQuotes || (unescapedQuoteHandling != null && unescapedQuoteHandling != UnescapedQuoteHandling.RAISE_ERROR);
	}

	/**
	 * Configures how to handle unescaped quotes inside quoted values. If set to {@code true}, the parser will parse the quote normally as part of the value.
	 * If set to {@code false}, a {@link TextParsingException} will be thrown. Defaults to {@code true}.
	 *
	 * @param parseUnescapedQuotes indicates whether or not the CSV parser should accept unescaped quotes inside quoted values.
	 *
	 * @deprecated use {@link #setUnescapedQuoteHandling(UnescapedQuoteHandling)} instead. The configuration returned by {@link #getUnescapedQuoteHandling()} will override this setting if not null.
	 */
	@Deprecated
	public void setParseUnescapedQuotes(boolean parseUnescapedQuotes) {
		this.parseUnescapedQuotes = parseUnescapedQuotes;
	}

	/**
	 * Configures the parser to process values with unescaped quotes, and stop accumulating characters and consider the value parsed when a delimiter is found.
	 * (defaults to {@code true})
	 *
	 * <p>Passing {@code true} also switches on the legacy flag configured by {@link #setParseUnescapedQuotes(boolean)}.
	 * Passing {@code false} leaves that flag untouched.
	 *
	 * @param parseUnescapedQuotesUntilDelimiter a flag indicating that the parser should stop accumulating values when a field delimiter character is
	 *                                           found when parsing unquoted and unescaped values.
	 *
	 * @deprecated use {@link #setUnescapedQuoteHandling(UnescapedQuoteHandling)} instead. The configuration returned by {@link #getUnescapedQuoteHandling()} will override this setting if not null.
	 */
	@Deprecated
	public void setParseUnescapedQuotesUntilDelimiter(boolean parseUnescapedQuotesUntilDelimiter) {
		if (parseUnescapedQuotesUntilDelimiter) {
			// stopping at the delimiter only makes sense if unescaped quotes are accepted in the first place
			parseUnescapedQuotes = true;
		}
		this.parseUnescapedQuotesUntilDelimiter = parseUnescapedQuotesUntilDelimiter;
	}

	/**
	 * When parsing unescaped quotes, indicates the parser should stop accumulating characters and consider the value parsed when a delimiter is found.
	 * (defaults to {@code true})
	 *
	 * <p>This getter returns {@code true} when the legacy flag set through {@link #setParseUnescapedQuotesUntilDelimiter(boolean)} is {@code true}
	 * and {@link #isParseUnescapedQuotes()} also evaluates to {@code true}, <em>or</em> when {@link #getUnescapedQuoteHandling()} is
	 * {@link UnescapedQuoteHandling#STOP_AT_DELIMITER} or {@link UnescapedQuoteHandling#SKIP_VALUE}.
	 *
	 * @return a flag indicating that the parser should stop accumulating values when a field delimiter character is
	 * found when parsing unquoted and unescaped values.
	 *
	 * @deprecated use {@link #getUnescapedQuoteHandling()} instead. The configuration returned by {@link #getUnescapedQuoteHandling()} will override this setting if not null.
	 */
	@Deprecated
	public boolean isParseUnescapedQuotesUntilDelimiter() {
		return (parseUnescapedQuotesUntilDelimiter && isParseUnescapedQuotes()) || (unescapedQuoteHandling == UnescapedQuoteHandling.STOP_AT_DELIMITER || unescapedQuoteHandling == UnescapedQuoteHandling.SKIP_VALUE);
	}

	/**
	 * Indicates whether escape sequences should be processed in unquoted values. Defaults to {@code false}.
	 *
	 * <p>By default, this is disabled and if the input is {@code A""B,C}, the resulting value will be
	 * {@code [A""B] and [C]} (i.e. the content is read as-is). However, if the parser is configured
	 * to process escape sequences in unquoted values, the result will be {@code [A"B] and [C]}
	 *
	 * @return true if escape sequences should be processed in unquoted values, otherwise false
	 */
	public boolean isEscapeUnquotedValues() {
		return escapeUnquotedValues;
	}

	/**
	 * Configures the parser to process escape sequences in unquoted values. Defaults to {@code false}.
	 *
	 * <p>By default, this is disabled and if the input is {@code A""B,C}, the resulting value will be
	 * {@code [A""B] and [C]} (i.e. the content is read as-is). However, if the parser is configured
	 * to process escape sequences in unquoted values, the result will be {@code [A"B] and [C]}
	 *
	 * @param escapeUnquotedValues a flag indicating whether escape sequences should be processed in unquoted values
	 */
	public void setEscapeUnquotedValues(boolean escapeUnquotedValues) {
		this.escapeUnquotedValues = escapeUnquotedValues;
	}

	/**
	 * Indicates whether the parser should keep any escape sequences if they are present in the input (i.e. a quote escape sequence such as two double quotes {@code ""} won't be replaced by a single double quote {@code "}).
	 * <p>This is disabled by default
	 *
	 * @return a flag indicating whether escape sequences should be kept (and not replaced) by the parser.
	 */
	public final boolean isKeepEscapeSequences() {
		return keepEscapeSequences;
	}

	/**
	 * Configures the parser to keep any escape sequences if they are present in the input (i.e. a quote escape sequence such as 2 double quotes {@code ""} won't be replaced by a single double quote {@code "}).
	 * <p>This is disabled by default
	 *
	 * @param keepEscapeSequences the flag indicating whether escape sequences should be kept (and not replaced) by the parser.
	 */
	public final void setKeepEscapeSequences(boolean keepEscapeSequences) {
		this.keepEscapeSequences = keepEscapeSequences;
	}

	/**
	 * Returns a flag indicating whether the parser should analyze the input to discover the column delimiter character.
	 * <p>Note that the detection process is not guaranteed to discover the correct column delimiter. In this case the delimiter provided by {@link CsvFormat#getDelimiter()} will be used
	 *
	 * @return a flag indicating whether the parser should analyze the input to discover the column delimiter character.
	 */
	public final boolean isDelimiterDetectionEnabled() {
		return delimiterDetectionEnabled;
	}

	/**
	 * Configures the parser to analyze the input before parsing to discover the column delimiter character.
	 * <p>Note that the detection process is not guaranteed to discover the correct column delimiter.
	 * <p>The first character in the list of delimiters allowed for detection will be used, if available, otherwise
	 * the delimiter returned by {@link CsvFormat#getDelimiter()} will be used.
	 *
	 * <p>The given array is copied, so later changes made by the caller to the array do not affect these settings.
	 *
	 * @param separatorDetectionEnabled the flag to enable/disable discovery of the column delimiter character.
	 * @param delimitersForDetection possible delimiters for detection when {@link #isDelimiterDetectionEnabled()} evaluates
	 * to {@code true}, in order of priority.
	 */
	public final void setDelimiterDetectionEnabled(boolean separatorDetectionEnabled, char... delimitersForDetection) {
		this.delimiterDetectionEnabled = separatorDetectionEnabled;
		// defensive copy: a varargs call may be handed an explicit, caller-owned array (or null)
		this.delimitersForDetection = delimitersForDetection == null ? null : delimitersForDetection.clone();
	}

	/**
	 * Returns a flag indicating whether the parser should analyze the input to discover the quote character. The quote escape will also be detected as part of this process.
	 * <p>Note that the detection process is not guaranteed to discover the correct quote and escape.
	 * In this case the characters provided by {@link CsvFormat#getQuote()} and {@link CsvFormat#getQuoteEscape()} will be used
	 *
	 * @return a flag indicating whether the parser should analyze the input to discover the quote character. The quote escape will also be detected as part of this process.
	 */
	public final boolean isQuoteDetectionEnabled() {
		return quoteDetectionEnabled;
	}

	/**
	 * Configures the parser to analyze the input before parsing to discover the quote character. The quote escape will also be detected as part of this process.
	 * <p>Note that the detection process is not guaranteed to discover the correct quote and escape.
	 * In this case the characters provided by {@link CsvFormat#getQuote()} and {@link CsvFormat#getQuoteEscape()} will be used
	 *
	 * @param quoteDetectionEnabled the flag to enable/disable discovery of the quote character. The quote escape will also be detected as part of this process.
	 */
	public final void setQuoteDetectionEnabled(boolean quoteDetectionEnabled) {
		this.quoteDetectionEnabled = quoteDetectionEnabled;
	}

	/**
	 * Convenience method to turn on all format detection features in a single method call, namely:
	 * <ul>
	 * <li>{@link #setDelimiterDetectionEnabled(boolean, char[])}</li>
	 * <li>{@link #setQuoteDetectionEnabled(boolean)}</li>
	 * <li>{@link #setLineSeparatorDetectionEnabled(boolean)}</li>
	 * </ul>
	 *
	 * @param delimitersForDetection possible delimiters for detection, in order of priority.
	 */
	public final void detectFormatAutomatically(char... delimitersForDetection) {
		this.setDelimiterDetectionEnabled(true, delimitersForDetection);
		this.setQuoteDetectionEnabled(true);
		this.setLineSeparatorDetectionEnabled(true);
	}

	/**
	 * Flag indicating whether the parser should replace line separators, specified in {@link Format#getLineSeparator()}
	 * by the normalized line separator character specified in {@link Format#getNormalizedNewline()}, even on quoted values.
	 *
	 * <p>This is enabled by default and is used to ensure data be read on any platform without introducing unwanted blank lines.
	 *
	 * <p>For example, consider the quoted value {@code "Line1 \r\n Line2"}. If this is parsed using {@code "\r\n"} as
	 * the line separator sequence, and the normalized new line is set to {@code '\n'} (the default), the output will be:
	 *
	 * <p>{@code [Line1 \n Line2]}
	 *
	 * <p>However, if the value is meant to be kept untouched, and the original line separator should be maintained, call
	 * {@link #setNormalizeLineEndingsWithinQuotes(boolean)} with {@code false}. This will make the parser read the value as-is, producing:
	 *
	 * <p>{@code [Line1 \r\n Line2]}
	 *
	 * @return {@code true} if line separators in quoted values will be normalized, {@code false} otherwise
	 */
	public boolean isNormalizeLineEndingsWithinQuotes() {
		return normalizeLineEndingsWithinQuotes;
	}

	/**
	 * Configures the parser to replace line separators, specified in {@link Format#getLineSeparator()}
	 * by the normalized line separator character specified in {@link Format#getNormalizedNewline()}, even on quoted values.
	 *
	 * <p>This is enabled by default and is used to ensure data be read on any platform without introducing unwanted blank lines.
	 *
	 * <p>For example, consider the quoted value {@code "Line1 \r\n Line2"}. If this is parsed using {@code "\r\n"} as
	 * the line separator sequence, and the normalized new line is set to {@code '\n'} (the default), the output will be:
	 *
	 * <p>{@code [Line1 \n Line2]}
	 *
	 * <p>However, if the value is meant to be kept untouched, and the original line separator should be maintained, call
	 * this method with {@code false}. This will make the parser read the value as-is, producing:
	 *
	 * <p>{@code [Line1 \r\n Line2]}
	 *
	 * @param normalizeLineEndingsWithinQuotes flag indicating whether line separators in quoted values should be replaced by
	 *                                         the character specified in {@link Format#getNormalizedNewline()}.
	 */
	public void setNormalizeLineEndingsWithinQuotes(boolean normalizeLineEndingsWithinQuotes) {
		this.normalizeLineEndingsWithinQuotes = normalizeLineEndingsWithinQuotes;
	}

	/**
	 * Configures the handling of values with unescaped quotes.
	 * Defaults to {@code null}, for backward compatibility with {@link #isParseUnescapedQuotes()} and {@link #isParseUnescapedQuotesUntilDelimiter()}.
	 * If set to a non-null value, this setting will override the configuration of {@link #isParseUnescapedQuotes()} and {@link #isParseUnescapedQuotesUntilDelimiter()}.
	 *
	 * @param unescapedQuoteHandling the handling method to be used when unescaped quotes are found in the input.
	 */
	public void setUnescapedQuoteHandling(UnescapedQuoteHandling unescapedQuoteHandling) {
		this.unescapedQuoteHandling = unescapedQuoteHandling;
	}

	/**
	 * Returns the method of handling values with unescaped quotes.
	 * Defaults to {@code null}, for backward compatibility with {@link #isParseUnescapedQuotes()} and {@link #isParseUnescapedQuotesUntilDelimiter()}
	 * If set to a non-null value, this setting will override the configuration of {@link #isParseUnescapedQuotes()} and {@link #isParseUnescapedQuotesUntilDelimiter()}.
	 *
	 * @return the handling method to be used when unescaped quotes are found in the input, or {@code null} if not set.
	 */
	public UnescapedQuoteHandling getUnescapedQuoteHandling() {
		return this.unescapedQuoteHandling;
	}


	/**
	 * Flag indicating whether the parser should keep enclosing quote characters in the values parsed from the input.
	 * <p>Defaults to {@code false}
	 *
	 * @return a flag indicating whether enclosing quotes should be maintained when parsing quoted values.
	 */
	public boolean getKeepQuotes() {
		return keepQuotes;
	}

	/**
	 * Configures the parser to keep enclosing quote characters in the values parsed from the input.
	 * <p>Defaults to {@code false}
	 *
	 * @param keepQuotes flag indicating whether enclosing quotes should be maintained when parsing quoted values.
	 */
	public void setKeepQuotes(boolean keepQuotes) {
		this.keepQuotes = keepQuotes;
	}

	/**
	 * Adds the CSV-specific settings held by this object to the given map, after the entries added by the parent class.
	 * Each entry maps a human-readable label to the current value of the corresponding setting.
	 *
	 * @param out the map that receives the configuration entries
	 */
	@Override
	protected void addConfiguration(Map<String, Object> out) {
		super.addConfiguration(out);
		out.put("Empty value", emptyValue);
		out.put("Unescaped quote handling", unescapedQuoteHandling);
		out.put("Escape unquoted values", escapeUnquotedValues);
		out.put("Keep escape sequences", keepEscapeSequences);
		out.put("Keep quotes", keepQuotes);
		out.put("Normalize escaped line separators", normalizeLineEndingsWithinQuotes);
		out.put("Autodetect column delimiter", delimiterDetectionEnabled);
		out.put("Autodetect quotes", quoteDetectionEnabled);
		// Arrays.toString handles a null array by producing the text "null"
		out.put("Delimiters for detection", Arrays.toString(delimitersForDetection));
		out.put("Ignore leading whitespaces in quotes", ignoreLeadingWhitespacesInQuotes);
		out.put("Ignore trailing whitespaces in quotes", ignoreTrailingWhitespacesInQuotes);
	}

	/**
	 * Clones this configuration object by delegating to the parent's {@code clone()} and narrowing the result to this type.
	 *
	 * @return the cloned settings, typed as {@code CsvParserSettings}
	 */
	@Override
	public final CsvParserSettings clone() {
		return (CsvParserSettings) super.clone();
	}

	/**
	 * Clones this configuration object by delegating to the parent's {@code clone(boolean)} and narrowing the result to this type.
	 *
	 * @param clearInputSpecificSettings forwarded unchanged to the parent implementation
	 *
	 * @return the cloned settings, typed as {@code CsvParserSettings}
	 */
	@Override
	public final CsvParserSettings clone(boolean clearInputSpecificSettings) {
		return (CsvParserSettings) super.clone(clearInputSpecificSettings);
	}

	/**
	 * Returns the sequence of possible delimiters for detection when {@link #isDelimiterDetectionEnabled()} evaluates
	 * to {@code true}, in order of priority.
	 *
	 * <p>The returned array is a copy; modifying it does not affect these settings.
	 *
	 * @return the possible delimiter characters, in order of priority, or {@code null} if none were configured.
	 */
	public final char[] getDelimitersForDetection() {
		return this.delimitersForDetection == null ? null : this.delimitersForDetection.clone();
	}

	/**
	 * Returns whether or not trailing whitespaces from within quoted values should be skipped  (defaults to false)
	 *
	 * <p>Note: if {@link #getKeepQuotes()} evaluates to {@code true}, values won't be trimmed.
	 *
	 * @return true if trailing whitespaces from quoted values should be skipped, false otherwise
	 */
	public boolean getIgnoreTrailingWhitespacesInQuotes() {
		return ignoreTrailingWhitespacesInQuotes;
	}

	/**
	 * Defines whether or not trailing whitespaces from quoted values should be skipped  (defaults to false)
	 *
	 * <p>Note: if {@link #getKeepQuotes()} evaluates to {@code true}, values won't be trimmed.
	 *
	 * @param ignoreTrailingWhitespacesInQuotes whether trailing whitespaces from quoted values should be skipped
	 */
	public void setIgnoreTrailingWhitespacesInQuotes(boolean ignoreTrailingWhitespacesInQuotes) {
		this.ignoreTrailingWhitespacesInQuotes = ignoreTrailingWhitespacesInQuotes;
	}

	/**
	 * Returns whether or not leading whitespaces from quoted values should be skipped  (defaults to false)
	 *
	 * <p>Note: if {@link #getKeepQuotes()} evaluates to {@code true}, values won't be trimmed.
	 *
	 * @return true if leading whitespaces from quoted values should be skipped, false otherwise
	 */
	public boolean getIgnoreLeadingWhitespacesInQuotes() {
		return ignoreLeadingWhitespacesInQuotes;
	}

	/**
	 * Defines whether or not leading whitespaces from quoted values should be skipped  (defaults to false)
	 *
	 * <p>Note: if {@link #getKeepQuotes()} evaluates to {@code true}, values won't be trimmed.
	 *
	 * @param ignoreLeadingWhitespacesInQuotes whether leading whitespaces from quoted values should be skipped
	 */
	public void setIgnoreLeadingWhitespacesInQuotes(boolean ignoreLeadingWhitespacesInQuotes) {
		this.ignoreLeadingWhitespacesInQuotes = ignoreLeadingWhitespacesInQuotes;
	}

	/**
	 * Configures the parser to trim any whitespaces around values extracted from within quotes. Shorthand for
	 * {@link #setIgnoreLeadingWhitespacesInQuotes(boolean)} and {@link #setIgnoreTrailingWhitespacesInQuotes(boolean)}
	 *
	 * <p>Note: if {@link #getKeepQuotes()} evaluates to {@code true}, values won't be trimmed.
	 *
	 * @param trim a flag indicating whether whitespaces around values extracted from a quoted field should be removed
	 */
	public final void trimQuotedValues(boolean trim) {
		setIgnoreTrailingWhitespacesInQuotes(trim);
		setIgnoreLeadingWhitespacesInQuotes(trim);
	}
}