All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.relique.jdbc.csv.CsvRawReader Maven / Gradle / Ivy

There is a newer version: 1.0.43
Show newest version
/*
 *	CsvJdbc - a JDBC driver for CSV files
 *	Copyright (C) 2001	Jonathan Ackerman
 *
 *	This library is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU Lesser General Public
 *	License as published by the Free Software Foundation; either
 *	version 2.1 of the License, or (at your option) any later version.
 *	This library is distributed in the hope that it will be useful,
 *	but WITHOUT ANY WARRANTY; without even the implied warranty of
 *	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 *	Lesser General Public License for more details.
 *	You should have received a copy of the GNU Lesser General Public
 *	License along with this library; if not, write to the Free Software
 *	Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 */
package org.relique.jdbc.csv;

import java.io.IOException;
import java.io.LineNumberReader;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.Set;
import java.util.Vector;

import org.relique.io.DataReader;

/**
 * This class is a helper class that handles the reading and parsing of data
 * from a .csv file.
 * 
 * @author Jonathan Ackerman
 * @author Sander Brienen
 * @author Stuart Mottram (fritto)
 * @author Jason Bedell
 * @author Tomasz Skutnik
 * @author Christoph Langer
 * @author Chetan Gupta
 */

public class CsvRawReader
{
	// Shared instance returned by createStringValue() for zero-length values.
	private static final String EMPTY_STRING = "";
	// Shared instance returned by createStringValue() for the one-character value "0".
	private static final String ZERO_STRING = "0";

	// Source of the lines to parse; its line number is also used in error messages.
	private LineNumberReader input;
	// Table name and alias as passed to the constructor.
	private String tableName;
	private String tableAlias;
	// Column names, taken from the header line or generated as COLUMN1, COLUMN2, ...
	private String[] columnNames;
	// NOTE(review): presumably the values of the current row -- not used in the visible code.
	private String[] fieldValues;
	// First data line, kept when it had to be read early to count the columns.
	private String firstLineBuffer = null;
	// Field separator; may be longer than one character (see atSeparator()).
	private String separator = ",";
	// Header line supplied by the caller; parsed for column names when headers are suppressed.
	private String headerLine = "";
	// When true the file has no header row of its own.
	private boolean suppressHeaders = false;
	// NOTE(review): looks like it selects fixed-width parsing of the header -- confirm.
	private boolean isHeaderFixedWidth = true;
	// Quote character; null means no character is ever treated as a quote (see isQuoteChar()).
	private Character quoteChar = Character.valueOf('"');
	// Trim setting passed to the constructor.
	private boolean trimValues = true;
	// NOTE(review): presumably the prefix marking comment lines -- confirm against getNextDataLine().
	private String comment = null;
	// NOTE(review): presumably makes unparseable lines be skipped instead of failing -- confirm.
	private boolean ignoreUnparseableLines;
	// NOTE(review): presumably the value substituted for missing fields -- confirm.
	private String missingValue;
	// Escaping convention used by parseCsvLine(): "C" (backslash escapes) or "SQL" (doubled quotes).
	private String quoteStyle;
	// NOTE(review): appears to hold start/end index pairs for fixed-width columns -- confirm.
	private ArrayList fixedWidthColumns;
	// Lines consumed while readingAhead is set; parseCsvLine() replays them before reading new input.
	private LinkedList readAheadLines;
	// True while lines are being read ahead and must be remembered in readAheadLines.
	private boolean readingAhead;
	// Values of the previous row; createStringValue() reuses equal Strings from it.
	private String[] previousFieldValues = null;
	public CsvRawReader(LineNumberReader in,
		String tableName,
		String tableAlias,
		String separator,
		boolean suppressHeaders,
		boolean isHeaderFixedWidth,
		Character quoteChar,
		String comment,
		String headerLine,
		boolean trimHeaders,
		boolean trimValues,
		int skipLeadingLines,
		boolean ignoreUnparseableLines,
		String missingValue,
		boolean defectiveHeaders,
		int skipLeadingDataLines,
		String quoteStyle,
		ArrayList fixedWidthColumns) throws IOException, SQLException
	{
		this.tableName = tableName;
		this.tableAlias = tableAlias;
		this.separator = separator;
		this.suppressHeaders = suppressHeaders;
		this.isHeaderFixedWidth = isHeaderFixedWidth;
		this.quoteChar = quoteChar;
		this.comment = comment;
		this.headerLine = headerLine;
		this.trimValues = trimValues;
		this.input = in;
		this.ignoreUnparseableLines = ignoreUnparseableLines;
		this.missingValue = missingValue;
		this.quoteStyle = quoteStyle;
		this.fixedWidthColumns = fixedWidthColumns;
		this.readAheadLines = new LinkedList();
		this.readingAhead = false;

		for (int i = 0; i < skipLeadingLines; i++)
		{
			in.readLine();
		}

		if (this.suppressHeaders)
		{
			// column names specified by property are available. Read and use.
			if (this.headerLine != null)
			{
				this.columnNames = parseHeaderLine(this.headerLine, trimHeaders);
			}
			else
			{
				// No column names available. Read first data line and determine
				// number of columns.
				firstLineBuffer = getNextDataLine();
				String[] data = parseHeaderLine(firstLineBuffer, trimValues);
				this.columnNames = new String[data.length];
				for (int i = 0; i < data.length; i++)
				{
					this.columnNames[i] = "COLUMN" + String.valueOf(i + 1);
				}
				data = null;
				// throw away.
			}
		}
		else
		{
			String tmpHeaderLine = getNextDataLine();
			this.columnNames = parseHeaderLine(tmpHeaderLine, trimHeaders);
			// some column names may be missing and should be corrected
			if (defectiveHeaders)
				fixDefectiveHeaders();
			Set uniqueNames = new HashSet();
			for (int i = 0; i < this.columnNames.length; i++)
				uniqueNames.add(this.columnNames[i]);
			if (uniqueNames.size() != this.columnNames.length)
				throw new SQLException(CsvResources.getString("duplicateColumns"));
		}

		for (int i=0; i= line.length())
				values[i] = "";
			else if (columnIndexes[1] >= line.length())
				values[i] = line.substring(columnIndexes[0], line.length());
			else
				values[i] = line.substring(columnIndexes[0], columnIndexes[1] + 1);

			values[i] = values[i].trim();
		}
		return values;
	}

	/**
	 * Strips trailing whitespace from a string.
	 *
	 * @param s the string to strip, must not be null
	 * @return the same instance when nothing had to be removed, otherwise a
	 *         new string without the trailing whitespace characters.
	 */
	private String rtrim(String s)
	{
		final int fullLength = s.length();

		// Walk backwards until the character just before 'end' is not whitespace.
		int end = fullLength;
		for (; end > 0; end--)
		{
			if (!Character.isWhitespace(s.charAt(end - 1)))
				break;
		}

		return (end != fullLength) ? s.substring(0, end) : s;
	}

	/**
	 * Tests whether a character is the configured quote character.
	 *
	 * @param c the character to test
	 * @return true if a quote character is configured and equals c; always
	 *         false when no quote character is configured.
	 */
	private boolean isQuoteChar(char c)
	{
		if (quoteChar == null)
		{
			return false;
		}
		return quoteChar.charValue() == c;
	}

	/**
	 * Turns the characters collected for one field into a String while
	 * avoiding the allocation of duplicate String objects where possible.
	 *
	 * @param columnValue the characters collected for the field
	 * @param columnIndex zero-based index of the field within its row
	 * @return the field value; a shared constant for the empty string and
	 *         for "0", or the String instance from the previous row when
	 *         that row held an equal value in the same column.
	 */
	private String createStringValue(StringBuilder columnValue, int columnIndex)
	{
		final int valueLength = columnValue.length();

		// The two most frequent values in CSV files map to shared constants.
		if (valueLength == 0)
			return EMPTY_STRING;
		if (valueLength == 1 && columnValue.charAt(0) == '0')
			return ZERO_STRING;

		final String text = columnValue.toString();
		if (previousFieldValues == null || columnIndex >= previousFieldValues.length)
			return text;

		// Hand back the previous row's instance when it is equal, so that
		// the freshly built String can be garbage collected.
		final String earlier = previousFieldValues[columnIndex];
		return (earlier != null && earlier.equals(text)) ? earlier : text;
	}

	/**
	 * Splits one logical CSV record into the field values it contains.
	 * Stuart Mottram added the code for handling line breaks in fields.
	 *
	 * A quoted field may contain line breaks. When the end of the line is
	 * reached inside a quoted field, further physical lines are fetched
	 * (from the read-ahead buffer first, otherwise from the input) and
	 * joined to the field with a newline character until the closing quote
	 * is found.
	 *
	 * With quoteStyle "C" a backslash escapes the character that follows it.
	 * With quoteStyle "SQL" a doubled quote character inside a quoted field
	 * collapses to a single quote character.
	 *
	 * @param line the first physical line of the record to parse
	 * @param trimValues when true, whitespace before a value, after an
	 *        unquoted value, and between a closing quote and the next
	 *        separator is discarded
	 * @return line split into fields.
	 * @throws SQLException if a closing quote is not followed by a separator,
	 *         if the input ends inside a quoted field, or if reading a
	 *         continuation line fails
	 */
	private String[] parseCsvLine(String line, boolean trimValues)
			throws SQLException
	{
		// TODO: quoteChar should be recognized ONLY when close to separator.
		ArrayList<String> values = new ArrayList<String>();
		boolean inQuotedString = false;
		int quotedLineNumber = 0;
		StringBuilder value = new StringBuilder(32);
		String orgLine = line;

		/*
		 * Deliberate identity comparison: are we parsing the buffered first
		 * record itself? Decided once up front, because firstLineBuffer is
		 * replaced by a new String each time a continuation line is appended
		 * to it, so a comparison made later would fail after the first
		 * continuation line.
		 */
		final boolean parsingFirstLineBuffer = (line == firstLineBuffer);
		boolean fullLine = false;

		while (!fullLine)
		{
			int currentPos = 0;
			line += separator; // this way all fields are separator-terminated
			while (currentPos < line.length())
			{
				char currentChar = line.charAt(currentPos);
				if (value.length() == 0 && isQuoteChar(currentChar)
						&& !inQuotedString)
				{
					// acknowledge quoteChar only at beginning of value.
					inQuotedString = true;
					quotedLineNumber = input.getLineNumber();
				}
				else if (currentChar == '\\' && "C".equals(quoteStyle))
				{
					// in C quoteStyle \\ escapes any character.
					char nextChar = line.charAt(currentPos + 1);
					value.append(nextChar);
					currentPos++;
				}
				else if (isQuoteChar(currentChar))
				{
					char nextChar = line.charAt(currentPos + 1);
					if (!inQuotedString)
					{
						// accepting the single quoteChar because the whole
						// value is not quoted.
						value.append(quoteChar.charValue());
					}
					else if (isQuoteChar(nextChar))
					{
						value.append(quoteChar.charValue());
						if ("SQL".equals(quoteStyle))
						{
							// doubled quoteChar in quoted strings collapse to
							// one single quoteChar in SQL quotestyle
							currentPos++;
						}
					}
					else
					{
						while (trimValues &&
							!atSeparator(line, currentPos + 1) &&
							Character.isWhitespace(nextChar) &&
							currentPos + 2 < line.length())
						{
							// Skip trailing whitespace after quoted value before next separator
							nextChar = line.charAt(currentPos + 2);
							currentPos++;
						}
						if (!atSeparator(line, currentPos + 1))
						{
							throw new SQLException(CsvResources.getString("expectedSeparator") + ": " +
								input.getLineNumber() + " " + (currentPos + 1) +
								": " + orgLine);
						}

						// Closing quote: the quoted value is complete.
						values.add(createStringValue(value, values.size()));
						value.setLength(0);
						inQuotedString = false;
						currentPos += separator.length();
					}
				}
				else
				{
					if (atSeparator(line, currentPos))
					{
						if (inQuotedString)
						{
							// A separator inside quotes is ordinary data.
							value.append(currentChar);
						}
						else
						{
							String fieldValue = createStringValue(value, values.size());
							values.add(trimValues ? rtrim(fieldValue) : fieldValue);
							value.setLength(0);

							if (separator.length() > 1)
							{
								/*
								 * Skip other characters in separator too.
								 */
								currentPos += separator.length() - 1;
							}
						}
					}
					else if (trimValues &&
						Character.isWhitespace(currentChar) &&
						value.length() == 0 &&
						!inQuotedString)
					{
						// Skip leading whitespace in field
					}
					else
					{
						// default action
						value.append(currentChar);
					}
				}
				currentPos++;
			}
			if (inQuotedString)
			{
				// Line ended while looking for matching quoteChar. This means
				// we are inside of a field (not yet fullLine).
				// Remove the extra separator added at start. Inside quotes
				// every character of the separator was copied into the value,
				// so the whole separator has to be removed, not just one
				// character.
				value.setLength(Math.max(0, value.length() - separator.length()));
				try
				{
					String additionalLine;
					if (readingAhead)
					{
						additionalLine = input.readLine();

						/*
						 * Remember each line we read ahead -- we may have to re-read
						 * these lines later.
						 */
						if (additionalLine != null)
							readAheadLines.addLast(additionalLine);
					}
					else
					{
						// The cast is valid whether or not the list field is
						// declared with a type argument.
						if (!readAheadLines.isEmpty())
							additionalLine = (String) readAheadLines.removeFirst();
						else
							additionalLine = input.readLine();
					}

					if (additionalLine == null)
					{
						throw new SQLException(CsvResources.getString("eofInQuotes") + ": " +
							quotedLineNumber);
					}
					line = "\n" + additionalLine;
					if (parsingFirstLineBuffer)
					{
						// We are reading and remembering the first record to
						// determine the number of columns in the file.
						// Append any extra lines we read for first record to
						// the buffer too.
						firstLineBuffer += "\n" + additionalLine;
					}
				}
				catch (IOException e)
				{
					// Same message as before, but keep the cause for diagnosis.
					throw new SQLException(e.toString(), e);
				}
			}
			else
			{
				fullLine = true;
			}
		}
		return values.toArray(new String[values.size()]);
	}

	/**
	 * Tests whether the separator starts at a given position of a line.
	 *
	 * @param line the line being parsed
	 * @param currentPos index into line at which to look for the separator
	 * @return true if the complete separator is found at currentPos.
	 */
	private boolean atSeparator(String line, int currentPos)
	{
		final int separatorLength = separator.length();

		if (separatorLength != 1)
		{
			return line.regionMatches(currentPos, separator, 0, separatorLength);
		}

		// Usual case of a one character separator: a plain character
		// comparison is cheaper than a region match.
		return line.charAt(currentPos) == separator.charAt(0);
	}
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy