All Downloads are FREE. Search and download functionalities are using the official Maven repository.

es.prodevelop.pui9.csv.CsvReader Maven / Gradle / Ivy

The newest version!
/*
 * Java CSV is a stream based library for reading and writing
 * CSV and other delimited data.
 *   
 * Copyright (C) Bruce Dunwiddie [email protected]
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA
 */
package es.prodevelop.pui9.csv;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.text.NumberFormat;
import java.util.LinkedHashMap;
import java.util.Map;

/**
 * A stream based parser for parsing delimited text data from a file or a
 * stream.
 */
public class CsvReader {

	/**
	 * Double up the text qualifier to represent an occurance of the text qualifier.
	 */
	public static final int ESCAPE_MODE_DOUBLED = 1;

	/**
	 * Use a backslash character before the text qualifier to represent an occurance
	 * of the text qualifier.
	 */
	public static final int ESCAPE_MODE_BACKSLASH = 2;

	private Reader inputStream;

	// this holds all the values for switches that the user is allowed to set
	private UserSettings userSettings;
	private boolean useCustomRecordDelimiter;
	// this will be our working buffer to hold data chunks
	// read in from the data file
	private DataBuffer dataBuffer;
	private ColumnBuffer columnBuffer;
	private RawRecordBuffer rawBuffer;
	private boolean[] isQualified;
	private String rawRecord;
	private HeadersHolder headersHolder;
	// these are all more or less global loop variables
	// to keep from needing to pass them all into various
	// methods during parsing
	private boolean startedColumn;
	private boolean startedWithQualifier;
	private boolean hasMoreData;
	private char lastLetter;
	private boolean hasReadNextLine;
	private int columnsCount;
	private long currentRecord;
	private String[] values;
	private boolean closed;

	/**
	 * Creates a {@link CsvReader CsvReader} object using a file as the data source.
	 * 
	 * @param fileName  The path to the file to use as the data source.
	 * @param delimiter The character to use as the column delimiter.
	 * @param charset   The {@link java.nio.charset.Charset Charset} to use while
	 *                  parsing the data.
	 * 
	 * @throws FileNotFoundException if the file doesn't exist
	 */
	public CsvReader(String fileName, char delimiter, Charset charset) throws FileNotFoundException {
		if (fileName == null) {
			throw new IllegalArgumentException("Parameter fileName can not be null.");
		}

		if (charset == null) {
			throw new IllegalArgumentException("Parameter charset can not be null.");
		}

		if (!new File(fileName).exists()) {
			throw new FileNotFoundException("File " + fileName + " does not exist.");
		}

		this.inputStream = new BufferedReader(new InputStreamReader(new FileInputStream(fileName), charset),
				StaticSettings.MAX_FILE_BUFFER_SIZE);
		init(delimiter);
	}

	/**
	 * Creates a {@link CsvReader CsvReader} object using a file as the data
	 * source. Uses UTF8 as the {@link java.nio.charset.Charset Charset}.
	 * 
	 * @param fileName  The path to the file to use as the data source.
	 * @param delimiter The character to use as the column delimiter.
	 * 
	 * @throws FileNotFoundException if the file doesn't exist
	 */
	public CsvReader(String fileName, char delimiter) throws FileNotFoundException {
		this(fileName, delimiter, StandardCharsets.UTF_8);
	}

	/**
	 * Creates a {@link CsvReader CsvReader} object using a file as the data
	 * source. Uses a comma as the column delimiter and UTF8 as the
	 * {@link java.nio.charset.Charset Charset}.
	 * 
	 * @param fileName The path to the file to use as the data source.
	 * 
	 * @throws FileNotFoundException if the file doesn't exist
	 */
	public CsvReader(String fileName) throws FileNotFoundException {
		this(fileName, Letters.SEMICOLON.code);
	}

	/**
	 * Constructs a {@link CsvReader CsvReader} object using a {@link java.io.Reader
	 * Reader} object as the data source.
	 * 
	 * @param inputStream The stream to use as the data source.
	 * @param delimiter   The character to use as the column delimiter.
	 */
	public CsvReader(Reader inputStream, char delimiter) {
		if (inputStream == null) {
			throw new IllegalArgumentException("Parameter inputStream can not be null.");
		}

		this.inputStream = inputStream;
		init(delimiter);
	}

	/**
	 * Constructs a {@link CsvReader CsvReader} object using a {@link java.io.Reader
	 * Reader} object as the data source. Uses a comma as the column delimiter.
	 * 
	 * @param inputStream The stream to use as the data source.
	 */
	public CsvReader(Reader inputStream) {
		this(inputStream, Letters.SEMICOLON.code);
	}

	/**
	 * Constructs a {@link CsvReader CsvReader} object using an
	 * {@link java.io.InputStream InputStream} object as the data source.
	 * 
	 * @param inputStream The stream to use as the data source.
	 * @param delimiter   The character to use as the column delimiter.
	 * @param charset     The {@link java.nio.charset.Charset Charset} to use while
	 *                    parsing the data.
	 */
	public CsvReader(InputStream inputStream, char delimiter, Charset charset) {
		this(new InputStreamReader(inputStream, charset), delimiter);
	}

	/**
	 * Constructs a {@link CsvReader CsvReader} object using an
	 * {@link java.io.InputStream InputStream} object as the data source. Uses
	 * a comma as the column delimiter.
	 * 
	 * @param inputStream The stream to use as the data source.
	 * @param charset     The {@link java.nio.charset.Charset Charset} to use while
	 *                    parsing the data.
	 */
	public CsvReader(InputStream inputStream, Charset charset) {
		this(new InputStreamReader(inputStream, charset));
	}

	public CsvReader(InputStream inputStream, Charset charset, boolean guessSeparator) {
		this(new InputStreamReader(inputStream, charset));

		if (guessSeparator) {
			Letters delimiter = null;
			try {
				readHeaders();
				if (getHeaderCount() == 0) {
					return;
				}

				for (Letters letter : Letters.values()) {
					if (getHeaders()[0].indexOf(letter.code) >= 0) {
						delimiter = letter;
						break;
					}
				}

				if (delimiter == null) {
					delimiter = Letters.SEMICOLON;
				}

				close();
				inputStream.reset();
				this.inputStream = new InputStreamReader(inputStream, charset);
				init(delimiter.code);
			} catch (IOException e) {
				// do nothing
			}
		}
	}

	private void init(char delimiter) {
		this.userSettings = new UserSettings();
		this.useCustomRecordDelimiter = false;
		this.dataBuffer = new DataBuffer();
		this.columnBuffer = new ColumnBuffer();
		this.rawBuffer = new RawRecordBuffer();
		this.rawRecord = "";
		this.headersHolder = new HeadersHolder();
		this.startedColumn = false;
		this.startedWithQualifier = false;
		this.hasMoreData = true;
		this.lastLetter = '\0';
		this.hasReadNextLine = false;
		this.columnsCount = 0;
		this.currentRecord = 0;
		this.values = new String[StaticSettings.INITIAL_COLUMN_COUNT];
		this.isQualified = new boolean[values.length];
		this.closed = false;

		this.userSettings.Delimiter = delimiter;
	}

	public boolean getCaptureRawRecord() {
		return userSettings.CaptureRawRecord;
	}

	public void setCaptureRawRecord(boolean captureRawRecord) {
		userSettings.CaptureRawRecord = captureRawRecord;
	}

	public String getRawRecord() {
		return rawRecord;
	}

	/**
	 * Gets whether leading and trailing whitespace characters are being trimmed
	 * from non-textqualified column data. Default is true.
	 * 
	 * @return Whether leading and trailing whitespace characters are being trimmed
	 *         from non-textqualified column data.
	 */
	public boolean getTrimWhitespace() {
		return userSettings.TrimWhitespace;
	}

	/**
	 * Sets whether leading and trailing whitespace characters should be trimmed
	 * from non-textqualified column data or not. Default is true.
	 * 
	 * @param trimWhitespace Whether leading and trailing whitespace characters
	 *                       should be trimmed from non-textqualified column data or
	 *                       not.
	 */
	public void setTrimWhitespace(boolean trimWhitespace) {
		userSettings.TrimWhitespace = trimWhitespace;
	}

	/**
	 * Gets the character being used as the column delimiter. Default is comma, ','.
	 * 
	 * @return The character being used as the column delimiter.
	 */
	public char getDelimiter() {
		return userSettings.Delimiter;
	}

	/**
	 * Sets the character to use as the column delimiter. Default is comma, ','.
	 * 
	 * @param delimiter The character to use as the column delimiter.
	 */
	public void setDelimiter(char delimiter) {
		userSettings.Delimiter = delimiter;
	}

	public char getRecordDelimiter() {
		return userSettings.RecordDelimiter;
	}

	/**
	 * Sets the character to use as the record delimiter.
	 * 
	 * @param recordDelimiter The character to use as the record delimiter. Default
	 *                        is combination of standard end of line characters for
	 *                        Windows, Unix, or Mac.
	 */
	public void setRecordDelimiter(char recordDelimiter) {
		useCustomRecordDelimiter = true;
		userSettings.RecordDelimiter = recordDelimiter;
	}

	/**
	 * Gets the character to use as a text qualifier in the data.
	 * 
	 * @return The character to use as a text qualifier in the data.
	 */
	public char getTextQualifier() {
		return userSettings.TextQualifier;
	}

	/**
	 * Sets the character to use as a text qualifier in the data.
	 * 
	 * @param textQualifier The character to use as a text qualifier in the data.
	 */
	public void setTextQualifier(char textQualifier) {
		userSettings.TextQualifier = textQualifier;
	}

	/**
	 * Whether text qualifiers will be used while parsing or not.
	 * 
	 * @return Whether text qualifiers will be used while parsing or not.
	 */
	public boolean getUseTextQualifier() {
		return userSettings.UseTextQualifier;
	}

	/**
	 * Sets whether text qualifiers will be used while parsing or not.
	 * 
	 * @param useTextQualifier Whether to use a text qualifier while parsing or not.
	 */
	public void setUseTextQualifier(boolean useTextQualifier) {
		userSettings.UseTextQualifier = useTextQualifier;
	}

	/**
	 * Gets the character being used as a comment signal.
	 * 
	 * @return The character being used as a comment signal.
	 */
	public char getComment() {
		return userSettings.Comment;
	}

	/**
	 * Sets the character to use as a comment signal.
	 * 
	 * @param comment The character to use as a comment signal.
	 */
	public void setComment(char comment) {
		userSettings.Comment = comment;
	}

	/**
	 * Gets whether comments are being looked for while parsing or not.
	 * 
	 * @return Whether comments are being looked for while parsing or not.
	 */
	public boolean getUseComments() {
		return userSettings.UseComments;
	}

	/**
	 * Sets whether comments are being looked for while parsing or not.
	 * 
	 * @param useComments Whether comments are being looked for while parsing or
	 *                    not.
	 */
	public void setUseComments(boolean useComments) {
		userSettings.UseComments = useComments;
	}

	/**
	 * Gets the current way to escape an occurance of the text qualifier inside
	 * qualified data.
	 * 
	 * @return The current way to escape an occurance of the text qualifier inside
	 *         qualified data.
	 */
	public int getEscapeMode() {
		return userSettings.EscapeMode;
	}

	/**
	 * Sets the current way to escape an occurance of the text qualifier inside
	 * qualified data.
	 * 
	 * @param escapeMode The way to escape an occurance of the text qualifier inside
	 *                   qualified data.
	 * @exception IllegalArgumentException When an illegal value is specified for
	 *                                     escapeMode.
	 */
	public void setEscapeMode(int escapeMode) throws IllegalArgumentException {
		if (escapeMode != ESCAPE_MODE_DOUBLED && escapeMode != ESCAPE_MODE_BACKSLASH) {
			throw new IllegalArgumentException("Parameter escapeMode must be a valid value.");
		}

		userSettings.EscapeMode = escapeMode;
	}

	public boolean getSkipEmptyRecords() {
		return userSettings.SkipEmptyRecords;
	}

	public void setSkipEmptyRecords(boolean skipEmptyRecords) {
		userSettings.SkipEmptyRecords = skipEmptyRecords;
	}

	/**
	 * Safety caution to prevent the parser from using large amounts of memory in
	 * the case where parsing settings like file encodings don't end up matching the
	 * actual format of a file. This switch can be turned off if the file format is
	 * known and tested. With the switch off, the max column lengths and max column
	 * count per record supported by the parser will greatly increase. Default is
	 * true.
	 * 
	 * @return The current setting of the safety switch.
	 */
	public boolean getSafetySwitch() {
		return userSettings.SafetySwitch;
	}

	/**
	 * Safety caution to prevent the parser from using large amounts of memory in
	 * the case where parsing settings like file encodings don't end up matching the
	 * actual format of a file. This switch can be turned off if the file format is
	 * known and tested. With the switch off, the max column lengths and max column
	 * count per record supported by the parser will greatly increase. Default is
	 * true.
	 * 
	 * @param safetySwitch if want safety switch or not
	 */
	public void setSafetySwitch(boolean safetySwitch) {
		userSettings.SafetySwitch = safetySwitch;
	}

	/**
	 * Gets the count of columns found in this record.
	 * 
	 * @return The count of columns found in this record.
	 */
	public int getColumnCount() {
		return columnsCount;
	}

	/**
	 * Gets the index of the current record.
	 * 
	 * @return The index of the current record.
	 */
	public long getCurrentRecord() {
		return currentRecord - 1;
	}

	/**
	 * Gets the count of headers read in by a previous call to
	 * {@link CsvReader#readHeaders readHeaders()}.
	 * 
	 * @return The count of headers read in by a previous call to
	 *         {@link CsvReader#readHeaders readHeaders()}.
	 */
	public int getHeaderCount() {
		return headersHolder.Length;
	}

	/**
	 * Returns the header values as a string array.
	 * 
	 * @return The header values as a String array.
	 * @exception IOException Thrown if this object has already been closed.
	 */
	public String[] getHeaders() throws IOException {
		checkClosed();

		if (headersHolder.Headers == null) {
			return null;
		} else {
			// use clone here to prevent the outside code from
			// setting values on the array directly, which would
			// throw off the index lookup based on header name
			String[] clone = new String[headersHolder.Length];
			System.arraycopy(headersHolder.Headers, 0, clone, 0, headersHolder.Length);
			return clone;
		}
	}

	public void setHeaders(String[] headers) {
		headersHolder.Headers = headers;

		headersHolder.IndexByName.clear();

		if (headers != null) {
			headersHolder.Length = headers.length;
		} else {
			headersHolder.Length = 0;
		}

		// use headersHolder.Length here in case headers is null
		if (headers == null) {
			return;
		}

		for (int i = 0; i < headersHolder.Length; i++) {
			headersHolder.IndexByName.put(headers[i], Integer.valueOf(i));
		}
	}

	public String[] getValues() throws IOException {
		checkClosed();

		// need to return a clone, and can't use clone because values.Length
		// might be greater than columnsCount
		String[] clone = new String[columnsCount];
		System.arraycopy(values, 0, clone, 0, columnsCount);
		return clone;
	}

	/**
	 * Returns the current column value for a given column index.
	 * 
	 * @param columnIndex The index of the column.
	 * @return The current column value.
	 * @exception IOException Thrown if this object has already been closed.
	 */
	public String get(int columnIndex) throws IOException {
		checkClosed();

		if (columnIndex > -1 && columnIndex < columnsCount) {
			return values[columnIndex];
		} else {
			return "";
		}
	}

	/**
	 * Returns the current column value for a given column header name.
	 * 
	 * @param headerName The header name of the column.
	 * @return The current column value.
	 * @exception IOException Thrown if this object has already been closed.
	 */
	public String get(String headerName) throws IOException {
		checkClosed();

		return get(getIndex(headerName));
	}

	/**
	 * Creates a {@link CsvReader CsvReader} object using a string of data as the
	 * source. Uses UTF8 as the {@link java.nio.charset.Charset Charset}.
	 * 
	 * @param data The String of data to use as the source.
	 * @return A {@link CsvReader CsvReader} object using the String of data as the
	 *         source.
	 */
	public static CsvReader parse(String data) {
		if (data == null) {
			throw new IllegalArgumentException("Parameter data can not be null.");
		}

		return new CsvReader(new StringReader(data));
	}

	/**
	 * Reads another record.
	 * 
	 * @return Whether another record was successfully read or not.
	 * @exception IOException Thrown if an error occurs while reading data from the
	 *                        source stream.
	 */
	public boolean readRecord() throws IOException {
		checkClosed();

		columnsCount = 0;
		rawBuffer.Position = 0;

		dataBuffer.LineStart = dataBuffer.Position;

		hasReadNextLine = false;

		// check to see if we've already found the end of data

		if (hasMoreData) {
			// loop over the data stream until the end of data is found
			// or the end of the record is found

			do {
				if (dataBuffer.Position == dataBuffer.Count) {
					checkDataLength();
				} else {
					startedWithQualifier = false;

					// grab the current letter as a char

					char currentLetter = dataBuffer.Buffer[dataBuffer.Position];

					if (userSettings.UseTextQualifier && currentLetter == userSettings.TextQualifier) {
						// this will be a text qualified column, so
						// we need to set startedWithQualifier to make it
						// enter the seperate branch to handle text
						// qualified columns

						lastLetter = currentLetter;

						// read qualified
						startedColumn = true;
						dataBuffer.ColumnStart = dataBuffer.Position + 1;
						startedWithQualifier = true;
						boolean lastLetterWasQualifier = false;

						char escapeChar = userSettings.TextQualifier;

						if (userSettings.EscapeMode == ESCAPE_MODE_BACKSLASH) {
							escapeChar = Letters.BACKSLASH.code;
						}

						boolean eatingTrailingJunk = false;
						boolean lastLetterWasEscape = false;
						boolean readingComplexEscape = false;
						int escape = ComplexEscape.UNICODE;
						int escapeLength = 0;
						char escapeValue = (char) 0;

						dataBuffer.Position++;

						do {
							if (dataBuffer.Position == dataBuffer.Count) {
								checkDataLength();
							} else {
								// grab the current letter as a char

								currentLetter = dataBuffer.Buffer[dataBuffer.Position];

								if (eatingTrailingJunk) {
									dataBuffer.ColumnStart = dataBuffer.Position + 1;

									if (currentLetter == userSettings.Delimiter) {
										endColumn();
									} else if ((!useCustomRecordDelimiter
											&& (currentLetter == Letters.CR.code || currentLetter == Letters.LF.code))
											|| (useCustomRecordDelimiter
													&& currentLetter == userSettings.RecordDelimiter)) {
										endColumn();

										endRecord();
									}
								} else if (readingComplexEscape) {
									escapeLength++;

									switch (escape) {
									case ComplexEscape.UNICODE:
										escapeValue *= (char) 16;
										escapeValue += hexToDec(currentLetter);

										if (escapeLength == 4) {
											readingComplexEscape = false;
										}

										break;
									case ComplexEscape.OCTAL:
										escapeValue *= (char) 8;
										escapeValue += (char) (currentLetter - '0');

										if (escapeLength == 3) {
											readingComplexEscape = false;
										}

										break;
									case ComplexEscape.DECIMAL:
										escapeValue *= (char) 10;
										escapeValue += (char) (currentLetter - '0');

										if (escapeLength == 3) {
											readingComplexEscape = false;
										}

										break;
									case ComplexEscape.HEX:
										escapeValue *= (char) 16;
										escapeValue += hexToDec(currentLetter);

										if (escapeLength == 2) {
											readingComplexEscape = false;
										}

										break;
									}

									if (!readingComplexEscape) {
										appendLetter(escapeValue);
									} else {
										dataBuffer.ColumnStart = dataBuffer.Position + 1;
									}
								} else if (currentLetter == userSettings.TextQualifier) {
									if (lastLetterWasEscape) {
										lastLetterWasEscape = false;
										lastLetterWasQualifier = false;
									} else {
										updateCurrentValue();

										if (userSettings.EscapeMode == ESCAPE_MODE_DOUBLED) {
											lastLetterWasEscape = true;
										}

										lastLetterWasQualifier = true;
									}
								} else if (userSettings.EscapeMode == ESCAPE_MODE_BACKSLASH && lastLetterWasEscape) {
									switch (currentLetter) {
									case 'n':
										appendLetter(Letters.LF.code);
										break;
									case 'r':
										appendLetter(Letters.CR.code);
										break;
									case 't':
										appendLetter(Letters.TAB.code);
										break;
									case 'b':
										appendLetter(Letters.BACKSPACE.code);
										break;
									case 'f':
										appendLetter(Letters.FORM_FEED.code);
										break;
									case 'e':
										appendLetter(Letters.ESCAPE.code);
										break;
									case 'v':
										appendLetter(Letters.VERTICAL_TAB.code);
										break;
									case 'a':
										appendLetter(Letters.ALERT.code);
										break;
									case '0':
									case '1':
									case '2':
									case '3':
									case '4':
									case '5':
									case '6':
									case '7':
										escape = ComplexEscape.OCTAL;
										readingComplexEscape = true;
										escapeLength = 1;
										escapeValue = (char) (currentLetter - '0');
										dataBuffer.ColumnStart = dataBuffer.Position + 1;
										break;
									case 'u':
									case 'x':
									case 'o':
									case 'd':
									case 'U':
									case 'X':
									case 'O':
									case 'D':
										switch (currentLetter) {
										case 'u':
										case 'U':
											escape = ComplexEscape.UNICODE;
											break;
										case 'x':
										case 'X':
											escape = ComplexEscape.HEX;
											break;
										case 'o':
										case 'O':
											escape = ComplexEscape.OCTAL;
											break;
										case 'd':
										case 'D':
											escape = ComplexEscape.DECIMAL;
											break;
										}

										readingComplexEscape = true;
										escapeLength = 0;
										escapeValue = (char) 0;
										dataBuffer.ColumnStart = dataBuffer.Position + 1;

										break;
									default:
										break;
									}

									lastLetterWasEscape = false;

									// can only happen for ESCAPE_MODE_BACKSLASH
								} else if (currentLetter == escapeChar) {
									updateCurrentValue();
									lastLetterWasEscape = true;
								} else {
									if (lastLetterWasQualifier) {
										if (currentLetter == userSettings.Delimiter) {
											endColumn();
										} else if ((!useCustomRecordDelimiter && (currentLetter == Letters.CR.code
												|| currentLetter == Letters.LF.code))
												|| (useCustomRecordDelimiter
														&& currentLetter == userSettings.RecordDelimiter)) {
											endColumn();

											endRecord();
										} else {
											dataBuffer.ColumnStart = dataBuffer.Position + 1;

											eatingTrailingJunk = true;
										}

										// make sure to clear the flag for next
										// run of the loop

										lastLetterWasQualifier = false;
									}
								}

								// keep track of the last letter because we need
								// it for several key decisions

								lastLetter = currentLetter;

								if (startedColumn) {
									dataBuffer.Position++;

									if (userSettings.SafetySwitch && dataBuffer.Position - dataBuffer.ColumnStart
											+ columnBuffer.Position > 100000) {
										close();

										throw new IOException("Maximum column length of 100,000 exceeded in column "
												+ NumberFormat.getIntegerInstance().format(columnsCount) + " in record "
												+ NumberFormat.getIntegerInstance().format(currentRecord)
												+ ". Set the SafetySwitch property to false"
												+ " if you're expecting column lengths greater than 100,000 characters to"
												+ " avoid this error.");
									}
								}
							} // end else

						} while (hasMoreData && startedColumn);
					} else if (currentLetter == userSettings.Delimiter) {
						// we encountered a column with no data, so
						// just send the end column

						lastLetter = currentLetter;

						endColumn();
					} else if (useCustomRecordDelimiter && currentLetter == userSettings.RecordDelimiter) {
						// this will skip blank lines
						if (startedColumn || columnsCount > 0 || !userSettings.SkipEmptyRecords) {
							endColumn();

							endRecord();
						} else {
							dataBuffer.LineStart = dataBuffer.Position + 1;
						}

						lastLetter = currentLetter;
					} else if (!useCustomRecordDelimiter
							&& (currentLetter == Letters.CR.code || currentLetter == Letters.LF.code)) {
						// this will skip blank lines
						if (startedColumn || columnsCount > 0 || (!userSettings.SkipEmptyRecords
								&& (currentLetter == Letters.CR.code || lastLetter != Letters.CR.code))) {
							endColumn();

							endRecord();
						} else {
							dataBuffer.LineStart = dataBuffer.Position + 1;
						}

						lastLetter = currentLetter;
					} else if (userSettings.UseComments && columnsCount == 0 && currentLetter == userSettings.Comment) {
						// encountered a comment character at the beginning of
						// the line so just ignore the rest of the line

						lastLetter = currentLetter;

						skipLine();
					} else if (userSettings.TrimWhitespace
							&& (currentLetter == Letters.SPACE.code || currentLetter == Letters.TAB.code)) {
						// do nothing, this will trim leading whitespace
						// for both text qualified columns and non

						startedColumn = true;
						dataBuffer.ColumnStart = dataBuffer.Position + 1;
					} else {
						// since the letter wasn't a special letter, this
						// will be the first letter of our current column

						startedColumn = true;
						dataBuffer.ColumnStart = dataBuffer.Position;
						boolean lastLetterWasBackslash = false;
						boolean readingComplexEscape = false;
						int escape = ComplexEscape.UNICODE;
						int escapeLength = 0;
						char escapeValue = (char) 0;

						boolean firstLoop = true;

						do {
							if (!firstLoop && dataBuffer.Position == dataBuffer.Count) {
								checkDataLength();
							} else {
								if (!firstLoop) {
									// grab the current letter as a char
									currentLetter = dataBuffer.Buffer[dataBuffer.Position];
								}

								if (!userSettings.UseTextQualifier && userSettings.EscapeMode == ESCAPE_MODE_BACKSLASH
										&& currentLetter == Letters.BACKSLASH.code) {
									if (lastLetterWasBackslash) {
										lastLetterWasBackslash = false;
									} else {
										updateCurrentValue();
										lastLetterWasBackslash = true;
									}
								} else if (readingComplexEscape) {
									escapeLength++;

									switch (escape) {
									case ComplexEscape.UNICODE:
										escapeValue *= (char) 16;
										escapeValue += hexToDec(currentLetter);

										if (escapeLength == 4) {
											readingComplexEscape = false;
										}

										break;
									case ComplexEscape.OCTAL:
										escapeValue *= (char) 8;
										escapeValue += (char) (currentLetter - '0');

										if (escapeLength == 3) {
											readingComplexEscape = false;
										}

										break;
									case ComplexEscape.DECIMAL:
										escapeValue *= (char) 10;
										escapeValue += (char) (currentLetter - '0');

										if (escapeLength == 3) {
											readingComplexEscape = false;
										}

										break;
									case ComplexEscape.HEX:
										escapeValue *= (char) 16;
										escapeValue += hexToDec(currentLetter);

										if (escapeLength == 2) {
											readingComplexEscape = false;
										}

										break;
									}

									if (!readingComplexEscape) {
										appendLetter(escapeValue);
									} else {
										dataBuffer.ColumnStart = dataBuffer.Position + 1;
									}
								} else if (userSettings.EscapeMode == ESCAPE_MODE_BACKSLASH && lastLetterWasBackslash) {
									switch (currentLetter) {
									case 'n':
										appendLetter(Letters.LF.code);
										break;
									case 'r':
										appendLetter(Letters.CR.code);
										break;
									case 't':
										appendLetter(Letters.TAB.code);
										break;
									case 'b':
										appendLetter(Letters.BACKSPACE.code);
										break;
									case 'f':
										appendLetter(Letters.FORM_FEED.code);
										break;
									case 'e':
										appendLetter(Letters.ESCAPE.code);
										break;
									case 'v':
										appendLetter(Letters.VERTICAL_TAB.code);
										break;
									case 'a':
										appendLetter(Letters.ALERT.code);
										break;
									case '0':
									case '1':
									case '2':
									case '3':
									case '4':
									case '5':
									case '6':
									case '7':
										escape = ComplexEscape.OCTAL;
										readingComplexEscape = true;
										escapeLength = 1;
										escapeValue = (char) (currentLetter - '0');
										dataBuffer.ColumnStart = dataBuffer.Position + 1;
										break;
									case 'u':
									case 'x':
									case 'o':
									case 'd':
									case 'U':
									case 'X':
									case 'O':
									case 'D':
										switch (currentLetter) {
										case 'u':
										case 'U':
											escape = ComplexEscape.UNICODE;
											break;
										case 'x':
										case 'X':
											escape = ComplexEscape.HEX;
											break;
										case 'o':
										case 'O':
											escape = ComplexEscape.OCTAL;
											break;
										case 'd':
										case 'D':
											escape = ComplexEscape.DECIMAL;
											break;
										}

										readingComplexEscape = true;
										escapeLength = 0;
										escapeValue = (char) 0;
										dataBuffer.ColumnStart = dataBuffer.Position + 1;

										break;
									default:
										break;
									}

									lastLetterWasBackslash = false;
								} else {
									if (currentLetter == userSettings.Delimiter) {
										endColumn();
									} else if ((!useCustomRecordDelimiter
											&& (currentLetter == Letters.CR.code || currentLetter == Letters.LF.code))
											|| (useCustomRecordDelimiter
													&& currentLetter == userSettings.RecordDelimiter)) {
										endColumn();

										endRecord();
									}
								}

								// keep track of the last letter because we need
								// it for several key decisions

								lastLetter = currentLetter;
								firstLoop = false;

								if (startedColumn) {
									dataBuffer.Position++;

									if (userSettings.SafetySwitch && dataBuffer.Position - dataBuffer.ColumnStart
											+ columnBuffer.Position > 100000) {
										close();

										throw new IOException("Maximum column length of 100,000 exceeded in column "
												+ NumberFormat.getIntegerInstance().format(columnsCount) + " in record "
												+ NumberFormat.getIntegerInstance().format(currentRecord)
												+ ". Set the SafetySwitch property to false"
												+ " if you're expecting column lengths greater than 100,000 characters to"
												+ " avoid this error.");
									}
								}
							} // end else
						} while (hasMoreData && startedColumn);
					}

					if (hasMoreData) {
						dataBuffer.Position++;
					}
				} // end else
			} while (hasMoreData && !hasReadNextLine);

			// check to see if we hit the end of the file
			// without processing the current record

			if (startedColumn || lastLetter == userSettings.Delimiter) {
				endColumn();

				endRecord();
			}
		}

		if (userSettings.CaptureRawRecord) {
			if (hasMoreData) {
				if (rawBuffer.Position == 0) {
					rawRecord = new String(dataBuffer.Buffer, dataBuffer.LineStart,
							dataBuffer.Position - dataBuffer.LineStart - 1);
				} else {
					rawRecord = new String(rawBuffer.Buffer, 0, rawBuffer.Position) + new String(dataBuffer.Buffer,
							dataBuffer.LineStart, dataBuffer.Position - dataBuffer.LineStart - 1);
				}
			} else {
				// for hasMoreData to ever be false, all data would have had to
				// have been
				// copied to the raw buffer
				rawRecord = new String(rawBuffer.Buffer, 0, rawBuffer.Position);
			}
		} else {
			rawRecord = "";
		}

		return hasReadNextLine;
	}

	/**
	 * @exception IOException Thrown if an error occurs while reading data from the
	 *                        source stream.
	 */
	private void checkDataLength() throws IOException {
		updateCurrentValue();

		if (userSettings.CaptureRawRecord && dataBuffer.Count > 0) {
			if (rawBuffer.Buffer.length - rawBuffer.Position < dataBuffer.Count - dataBuffer.LineStart) {
				int newLength = rawBuffer.Buffer.length
						+ Math.max(dataBuffer.Count - dataBuffer.LineStart, rawBuffer.Buffer.length);

				char[] holder = new char[newLength];

				System.arraycopy(rawBuffer.Buffer, 0, holder, 0, rawBuffer.Position);

				rawBuffer.Buffer = holder;
			}

			System.arraycopy(dataBuffer.Buffer, dataBuffer.LineStart, rawBuffer.Buffer, rawBuffer.Position,
					dataBuffer.Count - dataBuffer.LineStart);

			rawBuffer.Position += dataBuffer.Count - dataBuffer.LineStart;
		}

		try {
			dataBuffer.Count = inputStream.read(dataBuffer.Buffer, 0, dataBuffer.Buffer.length);
		} catch (IOException ex) {
			close();

			throw ex;
		}

		// if no more data could be found, set flag stating that
		// the end of the data was found

		if (dataBuffer.Count == -1) {
			hasMoreData = false;
		}

		dataBuffer.Position = 0;
		dataBuffer.LineStart = 0;
		dataBuffer.ColumnStart = 0;
	}

	/**
	 * Read the first record of data as column headers.
	 * 
	 * @return Whether the header record was successfully read or not.
	 * @exception IOException Thrown if an error occurs while reading data from the
	 *                        source stream.
	 */
	public boolean readHeaders() throws IOException {
		boolean result = readRecord();

		// copy the header data from the column array
		// to the header string array

		headersHolder.Length = columnsCount;

		headersHolder.Headers = new String[columnsCount];

		for (int i = 0; i < headersHolder.Length; i++) {
			String columnValue = get(i);

			headersHolder.Headers[i] = columnValue;

			// if there are duplicate header names, we will save the last one
			headersHolder.IndexByName.put(columnValue, new Integer(i));
		}

		if (result) {
			currentRecord--;
		}

		columnsCount = 0;

		return result;
	}

	/**
	 * Returns the column header value for a given column index.
	 * 
	 * @param columnIndex The index of the header column being requested.
	 * @return The value of the column header at the given column index.
	 * @exception IOException Thrown if this object has already been closed.
	 */
	public String getHeader(int columnIndex) throws IOException {
		checkClosed();

		// check to see if we have read the header record yet

		// check to see if the column index is within the bounds
		// of our header array

		if (columnIndex > -1 && columnIndex < headersHolder.Length) {
			// return the processed header data for this column

			return headersHolder.Headers[columnIndex];
		} else {
			return "";
		}
	}

	public boolean isQualified(int columnIndex) throws IOException {
		checkClosed();

		if (columnIndex < columnsCount && columnIndex > -1) {
			return isQualified[columnIndex];
		} else {
			return false;
		}
	}

	/**
	 * @exception IOException Thrown if a very rare extreme exception occurs during
	 *                        parsing, normally resulting from improper data format.
	 */
	private void endColumn() throws IOException {
		String currentValue = "";

		// must be called before setting startedColumn = false
		if (startedColumn) {
			if (columnBuffer.Position == 0) {
				if (dataBuffer.ColumnStart < dataBuffer.Position) {
					int lastLetter = dataBuffer.Position - 1;

					if (userSettings.TrimWhitespace && !startedWithQualifier) {
						while (lastLetter >= dataBuffer.ColumnStart
								&& (dataBuffer.Buffer[lastLetter] == Letters.SPACE.code
										|| dataBuffer.Buffer[lastLetter] == Letters.TAB.code)) {
							lastLetter--;
						}
					}

					currentValue = new String(dataBuffer.Buffer, dataBuffer.ColumnStart,
							lastLetter - dataBuffer.ColumnStart + 1);
				}
			} else {
				updateCurrentValue();

				int lastLetter = columnBuffer.Position - 1;

				if (userSettings.TrimWhitespace && !startedWithQualifier) {
					while (lastLetter >= 0 && (columnBuffer.Buffer[lastLetter] == Letters.SPACE.code)) {
						lastLetter--;
					}
				}

				currentValue = new String(columnBuffer.Buffer, 0, lastLetter + 1);
			}
		}

		columnBuffer.Position = 0;

		startedColumn = false;

		if (columnsCount >= 100000 && userSettings.SafetySwitch) {
			close();

			throw new IOException("Maximum column count of 100,000 exceeded in record "
					+ NumberFormat.getIntegerInstance().format(currentRecord)
					+ ". Set the SafetySwitch property to false"
					+ " if you're expecting more than 100,000 columns per record to" + " avoid this error.");
		}

		// check to see if our current holder array for
		// column chunks is still big enough to handle another
		// column chunk

		if (columnsCount == values.length) {
			// holder array needs to grow to be able to hold another column
			int newLength = values.length * 2;

			String[] holder = new String[newLength];

			System.arraycopy(values, 0, holder, 0, values.length);

			values = holder;

			boolean[] qualifiedHolder = new boolean[newLength];

			System.arraycopy(isQualified, 0, qualifiedHolder, 0, isQualified.length);

			isQualified = qualifiedHolder;
		}

		values[columnsCount] = currentValue;

		isQualified[columnsCount] = startedWithQualifier;

		columnsCount++;
	}

	private void appendLetter(char letter) {
		if (columnBuffer.Position == columnBuffer.Buffer.length) {
			int newLength = columnBuffer.Buffer.length * 2;

			char[] holder = new char[newLength];

			System.arraycopy(columnBuffer.Buffer, 0, holder, 0, columnBuffer.Position);

			columnBuffer.Buffer = holder;
		}
		columnBuffer.Buffer[columnBuffer.Position++] = letter;
		dataBuffer.ColumnStart = dataBuffer.Position + 1;
	}

	private void updateCurrentValue() {
		if (startedColumn && dataBuffer.ColumnStart < dataBuffer.Position) {
			if (columnBuffer.Buffer.length - columnBuffer.Position < dataBuffer.Position - dataBuffer.ColumnStart) {
				int newLength = columnBuffer.Buffer.length
						+ Math.max(dataBuffer.Position - dataBuffer.ColumnStart, columnBuffer.Buffer.length);

				char[] holder = new char[newLength];

				System.arraycopy(columnBuffer.Buffer, 0, holder, 0, columnBuffer.Position);

				columnBuffer.Buffer = holder;
			}

			System.arraycopy(dataBuffer.Buffer, dataBuffer.ColumnStart, columnBuffer.Buffer, columnBuffer.Position,
					dataBuffer.Position - dataBuffer.ColumnStart);

			columnBuffer.Position += dataBuffer.Position - dataBuffer.ColumnStart;
		}

		dataBuffer.ColumnStart = dataBuffer.Position + 1;
	}

	/**
	 * @exception IOException Thrown if an error occurs while reading data from the
	 *                        source stream.
	 */
	private void endRecord() throws IOException {
		// this flag is used as a loop exit condition
		// during parsing

		hasReadNextLine = true;

		currentRecord++;
	}

	/**
	 * Gets the corresponding column index for a given column header name.
	 * 
	 * @param headerName The header name of the column.
	 * @return The column index for the given column header name. Returns -1 if
	 *         not found.
	 * @exception IOException Thrown if this object has already been closed.
	 */
	public int getIndex(String headerName) throws IOException {
		checkClosed();

		Object indexValue = headersHolder.IndexByName.get(headerName);

		if (indexValue != null) {
			return ((Integer) indexValue).intValue();
		} else {
			return -1;
		}
	}

	/**
	 * Skips the next record of data by parsing each column. Does not increment
	 * {@link CsvReader#getCurrentRecord getCurrentRecord()}.
	 * 
	 * @return Whether another record was successfully skipped or not.
	 * @exception IOException Thrown if an error occurs while reading data from the
	 *                        source stream.
	 */
	public boolean skipRecord() throws IOException {
		checkClosed();

		boolean recordRead = false;

		if (hasMoreData) {
			recordRead = readRecord();

			if (recordRead) {
				currentRecord--;
			}
		}

		return recordRead;
	}

	/**
	 * Skips the next line of data using the standard end of line characters and
	 * does not do any column delimited parsing.
	 * 
	 * @return Whether a line was successfully skipped or not.
	 * @exception IOException Thrown if an error occurs while reading data from the
	 *                        source stream.
	 */
	public boolean skipLine() throws IOException {
		checkClosed();

		// clear public column values for current line

		columnsCount = 0;

		boolean skippedLine = false;

		if (hasMoreData) {
			boolean foundEol = false;

			do {
				if (dataBuffer.Position == dataBuffer.Count) {
					checkDataLength();
				} else {
					skippedLine = true;

					// grab the current letter as a char

					char currentLetter = dataBuffer.Buffer[dataBuffer.Position];

					if (currentLetter == Letters.CR.code || currentLetter == Letters.LF.code) {
						foundEol = true;
					}

					// keep track of the last letter because we need
					// it for several key decisions

					lastLetter = currentLetter;

					if (!foundEol) {
						dataBuffer.Position++;
					}

				} // end else
			} while (hasMoreData && !foundEol);

			columnBuffer.Position = 0;

			dataBuffer.LineStart = dataBuffer.Position + 1;
		}

		rawBuffer.Position = 0;
		rawRecord = "";

		return skippedLine;
	}

	/**
	 * Closes and releases all related resources.
	 */
	public void close() {
		if (!closed) {
			close(true);

			closed = true;
		}
	}

	/**
	 * 
	 */
	private void close(boolean closing) {
		if (!closed) {
			if (closing) {
				headersHolder.Headers = null;
				headersHolder.IndexByName = null;
				dataBuffer.Buffer = null;
				columnBuffer.Buffer = null;
				rawBuffer.Buffer = null;
			}

			try {
				inputStream.close();
			} catch (Exception e) {
				// just eat the exception
			}

			inputStream = null;

			closed = true;
		}
	}

	/**
	 * @exception IOException Thrown if this object has already been closed.
	 */
	private void checkClosed() throws IOException {
		if (closed) {
			throw new IOException("This instance of the CsvReader class has already been closed.");
		}
	}

	/**
	 * 
	 */
	protected void finalize() {
		close(false);
	}

	private class ComplexEscape {
		private static final int UNICODE = 1;

		private static final int OCTAL = 2;

		private static final int DECIMAL = 3;

		private static final int HEX = 4;
	}

	private static char hexToDec(char hex) {
		char result;

		if (hex >= 'a') {
			result = (char) (hex - 'a' + 10);
		} else if (hex >= 'A') {
			result = (char) (hex - 'A' + 10);
		} else {
			result = (char) (hex - '0');
		}

		return result;
	}

	private class DataBuffer {
		public char[] Buffer;

		public int Position;

		// / 
		// / How much usable data has been read into the stream,
		// / which will not always be as long as Buffer.Length.
		// / 
		public int Count;

		// / 
		// / The position of the cursor in the buffer when the
		// / current column was started or the last time data
		// / was moved out to the column buffer.
		// / 
		public int ColumnStart;

		public int LineStart;

		public DataBuffer() {
			Buffer = new char[StaticSettings.MAX_BUFFER_SIZE];
			Position = 0;
			Count = 0;
			ColumnStart = 0;
			LineStart = 0;
		}
	}

	private class ColumnBuffer {
		public char[] Buffer;

		public int Position;

		public ColumnBuffer() {
			Buffer = new char[StaticSettings.INITIAL_COLUMN_BUFFER_SIZE];
			Position = 0;
		}
	}

	private class RawRecordBuffer {
		public char[] Buffer;

		public int Position;

		public RawRecordBuffer() {
			Buffer = new char[StaticSettings.INITIAL_COLUMN_BUFFER_SIZE * StaticSettings.INITIAL_COLUMN_COUNT];
			Position = 0;
		}
	}

	private enum Letters {
		LF('\n'),

		CR('\r'),

		QUOTE('"'),

		SEMICOLON(';'),

		COMMA(','),

		SPACE(' '),

		TAB('\t'),

		POUND('#'),

		BACKSLASH('\\'),

		NULL('\0'),

		BACKSPACE('\b'),

		FORM_FEED('\f'),

		ESCAPE('\u001B'), // ASCII/ANSI escape

		VERTICAL_TAB('\u000B'),

		ALERT('\u0007');

		public final char code;

		private Letters(char code) {
			this.code = code;
		}
	}

	private class UserSettings {
		// having these as publicly accessible members will prevent
		// the overhead of the method call that exists on properties
		public char TextQualifier;

		public boolean TrimWhitespace;

		public boolean UseTextQualifier;

		public char Delimiter;

		public char RecordDelimiter;

		public char Comment;

		public boolean UseComments;

		public int EscapeMode;

		public boolean SafetySwitch;

		public boolean SkipEmptyRecords;

		public boolean CaptureRawRecord;

		public UserSettings() {
			TextQualifier = Letters.QUOTE.code;
			TrimWhitespace = true;
			UseTextQualifier = true;
			Delimiter = Letters.SEMICOLON.code;
			RecordDelimiter = Letters.NULL.code;
			UseComments = false;
			EscapeMode = CsvReader.ESCAPE_MODE_DOUBLED;
			SafetySwitch = true;
			SkipEmptyRecords = true;
			CaptureRawRecord = true;
		}
	}

	private class HeadersHolder {
		public String[] Headers;

		public int Length;

		public Map IndexByName;

		public HeadersHolder() {
			Headers = null;
			Length = 0;
			IndexByName = new LinkedHashMap<>();
		}
	}

	private class StaticSettings {
		// these are static instead of final so they can be changed in unit test
		// isn't visible outside this class and is only accessed once during
		// CsvReader construction
		public static final int MAX_BUFFER_SIZE = 1024;

		public static final int MAX_FILE_BUFFER_SIZE = 4 * 1024;

		public static final int INITIAL_COLUMN_COUNT = 10;

		public static final int INITIAL_COLUMN_BUFFER_SIZE = 50;
	}
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy