org.relique.io.FileSetInputStream Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of csvjdbc Show documentation
a Java JDBC driver for reading comma-separated-value files
The newest version!
/*
 *  CsvJdbc - a JDBC driver for CSV files
 *  Copyright (C) 2008  Mario Frasca
 *
 *  This library is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU Lesser General Public
 *  License as published by the Free Software Foundation; either
 *  version 2.1 of the License, or (at your option) any later version.
 *
 *  This library is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 *  Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public
 *  License along with this library; if not, write to the Free Software
 *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 */

package org.relique.io;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.relique.jdbc.csv.CsvResources;


/**
 * Class that collapses a set of files into one input stream. All files matching
 * a given pattern are collected; parts of each file name contain part of the
 * data, and the values from the file name are appended (or prepended) to each
 * data line.
 * 
 * @author Mario Frasca
 * 
 */
public class FileSetInputStream extends InputStream
{
	/** The only charset name that receives special two-byte handling in this class. */
	private static final String UTF_16LE = "UTF-16LE";

	/** Directory prefix that is concatenated directly with each file name. */
	private final String dirName;
	/** Sorted names of the matching files that have not been opened yet. */
	private final List<String> fileNames;
	/** File currently being read, or null when there is no more input. */
	private EncryptedFileInputStream currentFile;
	/** True until the first tail (the header tail) has been completely emitted. */
	private boolean readingHeader;
	/** Text currently being glued to each line (header names or file name values). */
	private String tail;
	/** Index of the next unit of {@link #tail} to return from {@link #readFromTail()}. */
	private int pos;
	private Pattern fileNameRE;
	private final String separator;
	private final Character quotechar;
	private final String quoteStyle;
	/** Tail holding the values of the first file, used once the header tail is done. */
	private String dataTail;
	private final boolean prepend;
	/** Next byte of the current file, read one step ahead of what is returned. */
	private int lookahead = '\n';
	/** Extra byte read while skipping a carriage return in UTF-16LE, or -1 if none. */
	private int lookahead2 = -1;
	private int quoteCountForRecord = 0;
	/** True while bytes are being served from {@link #tail} rather than the file. */
	private boolean doingTail;
	private int currentLineLength;
	private final CryptoFilter filter;
	/** Lines skipped at the start of every file after the first (header included). */
	private final int skipLeadingDataLines;
	private final String charset;
	private boolean isClosed = false;

	/**
	 * Collects the files in a directory whose names match a pattern and opens
	 * the first of them (in sorted name order) for reading.
	 * 
	 * @param dirName
	 *            the containing directory; it is concatenated directly with
	 *            each file name when the file is opened.
	 * @param fileNamePattern
	 *            the regular expression describing the file name and the extra
	 *            fields.
	 * @param fieldsInName
	 *            the names of the fields contained in the file name.
	 * @param separator
	 *            the separator to use when faking output (typically the ",").
	 * @param quotechar the character used to quote column values containing the separator, or null for no quoting.
	 * @param quoteStyle either "SQL" or "C" for rule for escaping quote characters in column values.
	 * @param prepend
	 *            whether the extra fields should precede the ones from the file
	 *            content.
	 * @param headerless if true, then there is no header on first line of file.
	 * @param filter filter for decoding bytes read from file.
	 * @param skipLeadingDataLines number of lines to skip at start of file before header line.
	 * @param charset name of the character set of the files, or null. Only
	 *            "UTF-16LE" is treated specially by this class.
	 * @throws IOException if a file cannot be opened or read.
	 */
	public FileSetInputStream(String dirName, String fileNamePattern,
			String[] fieldsInName, String separator, Character quotechar, String quoteStyle, boolean prepend,
			boolean headerless, CryptoFilter filter, int skipLeadingDataLines,
			String charset)
			throws IOException
	{
		this.dirName = dirName;
		this.filter = filter;
		// A header line is one more line to skip in every file after the first.
		this.skipLeadingDataLines = headerless ? skipLeadingDataLines : skipLeadingDataLines + 1;
		this.charset = charset;
		this.prepend = prepend;
		this.separator = separator;
		this.quotechar = quotechar;
		this.quoteStyle = quoteStyle;

		// Initialising tail for header...
		StringBuilder headerTail = new StringBuilder();
		if (prepend)
		{
			headerTail.append('\n');
		}
		else if (fieldsInName != null)
		{
			headerTail.append(separator);
		}
		if (fieldsInName != null)
		{
			for (int i = 0; i < fieldsInName.length; i++)
			{
				headerTail.append(fieldsInName[i]);
				if (i + 1 < fieldsInName.length)
					headerTail.append(separator);
			}
		}
		if (prepend)
		{
			if (fieldsInName != null)
				headerTail.append(separator);
		}
		else
		{
			headerTail.append('\n');
		}
		tail = headerTail.toString();

		fileNames = new ArrayList<>();
		File root = new File(dirName);
		File[] candidates = root.listFiles();

		if (candidates == null)
		{
			throw new IOException(CsvResources.getString("dirNotFound") + ": " + dirName);
		}

		fileNameRE = Pattern.compile(fileNamePattern);

		for (File candidate : candidates)
		{
			if (!candidate.isDirectory())
			{
				String candidateName = candidate.getName();
				if (fileNameRE.matcher(candidateName).matches())
				{
					fileNames.add(candidateName);
				}
			}
		}
		Collections.sort(fileNames);
		if (fileNames.isEmpty())
		{
			// Nothing to read: currentFile stays null, so read() returns -1.
			return;
		}

		// From here on the pattern is matched against dirName + file name.
		fileNameRE = Pattern.compile(".*" + fileNamePattern);
		readingHeader = true;
		String currentFileName = fileNames.remove(0);
		dataTail = getTailFromName(dirName + currentFileName);
		if (headerless)
			tail = dataTail;
		currentFile = new EncryptedFileInputStream(dirName + currentFileName, filter);
		boolean initialised = false;
		try
		{
			lookahead = currentFile.read();
			if (lookahead == 0xFF && isUtf16Le())
			{
				// Skip any 0xFFFE Byte Order Mark at start of UTF-16LE file.
				currentFile.read();
				lookahead = currentFile.read();
			}
			initialised = true;
		}
		finally
		{
			if (!initialised)
			{
				// The constructor is failing, so the caller will never get
				// an object to close; release the file here.
				currentFile.close();
			}
		}
		doingTail = prepend;
		if (doingTail)
		{
			// Start after the leading '\n' of the tail.
			// NOTE(review): pos indexes bytes when charset is UTF-16LE, where
			// the newline occupies two bytes -- confirm prepend is meant to
			// be used together with UTF-16LE.
			pos = 1;
		}
	}

	/**
	 * Marks this stream as closed and closes the file currently being read,
	 * if any. Subsequent calls to {@link #read()} throw an IOException.
	 * 
	 * @throws IOException
	 *             if closing the current file fails.
	 */
	@Override
	public void close() throws IOException
	{
		isClosed = true;
		if (currentFile != null)
		{
			currentFile.close();
			currentFile = null;
		}
	}

	/**
	 * Reads the next byte of data from the input stream. The value byte is
	 * returned as an int in the range 0 to 255. If the end of the current
	 * source file is reached, the next single file is opened. If all input has
	 * been used, -1 is returned.
	 * 
	 * To output the tail, we just glue it before each '\n'.
	 * 
	 * To output the lead, we have to look ahead and append it to all '\n' that
	 * are not followed by '\n' or -1.
	 * 
	 * @return the next byte of data, or -1 if the end of the stream is reached.
	 * 
	 * @throws IOException
	 *             if an I/O error occurs or this stream has been closed.
	 */
	@Override
	public int read() throws IOException
	{
		if (isClosed)
			throw new IOException(CsvResources.getString("streamClosed"));

		// run out of input on all subfiles
		if (currentFile == null)
			return -1;

		int ch;
		if (doingTail)
		{
			ch = readFromTail();
			if (ch != -1)
			{
				return ch;
			}
			// Tail exhausted: a new record starts in the file content.
			doingTail = false;
			currentLineLength = 0;
			quoteCountForRecord = 0;
		}

		// shift the lookahead into the current char and get the new lookahead.
		ch = lookahead;
		advanceLookahead();

		if (quotechar != null)
		{
			// Keep a count of the number of quotes in current record, so we can
			// detect whether the record is split across multiple lines (an odd
			// number of quotes when the end of the line is reached), and avoid
			// adding the file tail in this situation.
			if (ch == quotechar.charValue())
			{
				quoteCountForRecord++;
			}
			else if (ch == '\\' && lookahead == quotechar.charValue() && "C".equals(quoteStyle))
			{
				// Also count both '\' and '"' as quote characters, so that quote
				// count remains even and escaped quote does not count as a quote.
				quoteCountForRecord++;
			}
		}

		// if we met a line border we have to output the lead/tail
		if (prepend)
		{
			// prepending a non empty line...
			if (ch == '\n' && !(lookahead == '\n' || lookahead == -1) && quoteCountForRecord % 2 == 0)
			{
				doingTail = true;
				return readFromTail();
			}
			if (lookahead == -1 && ch != '\n' && ch != -1)
			{
				// If last line of file does not end with a newline,
				// then add a newline, so prepended fields for the
				// next file begin on a new line.
				lookahead = '\n';
			}
		}
		else
		{
			// appending to the end of just any line
			if (currentLineLength > 0 && (ch == '\n' || ch == -1) && quoteCountForRecord % 2 == 0)
			{
				doingTail = true;
				if (ch == '\n' && lookahead == 0 && isUtf16Le())
				{
					// Skip '\n' and '\0' for UTF-16LE charset.
					lookahead = currentFile.read();
				}

				return readFromTail();
			}
		}
		if (ch < 0)
		{
			if (!openNextFile())
			{
				return -1;
			}
			return read();
		}
		currentLineLength++;
		return ch;
	}

	/**
	 * Tells whether the configured charset is UTF-16LE, the only charset for
	 * which two-byte line terminators and a byte order mark are handled.
	 */
	private boolean isUtf16Le()
	{
		return UTF_16LE.equals(charset);
	}

	/**
	 * Loads the next byte of the current file into {@link #lookahead},
	 * skipping carriage returns.
	 */
	private void advanceLookahead() throws IOException
	{
		do
		{
			if (lookahead2 != -1)
			{
				// Use the byte that was read ahead on the previous call.
				lookahead = lookahead2;
				lookahead2 = -1;
			}
			else
			{
				lookahead = currentFile.read();
			}
			// we ignore \r, which breaks things on files created with MacOS9
			if (lookahead == '\r' && isUtf16Le())
			{
				// Skip '\r' and '\0' for UTF-16LE charset.
				int skip = currentFile.read();
				if (skip != 0)
				{
					// Next char is some other valid UTF-16LE character,
					// not a carriage return, so we need to keep
					// it for later.
					lookahead2 = skip;
					return;
				}
			}
		}
		while (lookahead == '\r');
	}

	/**
	 * Closes the exhausted file and opens the next one, skipping its leading
	 * lines and priming the lookahead.
	 * 
	 * @return false if there is no further file, in which case
	 *         {@link #currentFile} is set to null.
	 */
	private boolean openNextFile() throws IOException
	{
		currentFile.close();
		// open next file and possibly skip header
		pos = 0;
		if (fileNames.isEmpty())
		{
			currentFile = null;
			return false;
		}
		String currentFileName = fileNames.remove(0);
		tail = getTailFromName(dirName + currentFileName);
		currentFile = new EncryptedFileInputStream(dirName + currentFileName, filter);
		// if files do contain a header, skip it
		for (int i = 0; i < this.skipLeadingDataLines; i++)
		{
			int ch2;
			do
			{
				ch2 = currentFile.read();

				if (ch2 == '\n' && isUtf16Le())
				{
					// Skip '\n' and '\0' for UTF-16LE charset.
					int skip = currentFile.read();
					if (skip != 0)
					{
						// Next char is not a newline, so keep reading.
						ch2 = 0;
					}
				}
			}
			while (ch2 != '\n' && ch2 != -1);
		}
		quoteCountForRecord = 0;
		doingTail = prepend;
		if (doingTail)
			pos = 1;
		lookahead = currentFile.read();
		if (lookahead == 0xFF && skipLeadingDataLines == 0 && isUtf16Le())
		{
			// Skip any 0xFFFE Byte Order Mark at start of UTF-16LE file.
			// (When lines were skipped above, the mark went with them.)
			currentFile.read();
			lookahead = currentFile.read();
		}
		return true;
	}

	/**
	 * Builds the text to glue to each data line of a file from the capturing
	 * groups of the file name pattern.
	 * 
	 * @param currentName
	 *            directory prefix plus file name, matched against the pattern.
	 * @return the group values joined by the separator. When prepending, the
	 *         text starts with a newline and ends with a separator; otherwise
	 *         it starts with a separator and ends with a newline. The
	 *         separator at the edge is only added if the pattern has groups.
	 */
	private String getTailFromName(String currentName)
	{
		Matcher m = fileNameRE.matcher(currentName);
		// The result is not checked; group() below throws
		// IllegalStateException if the name did not match.
		m.matches();
		int groupCount = m.groupCount();
		StringBuilder result = new StringBuilder();
		if (prepend)
		{
			result.append('\n');
		}
		else if (groupCount > 0)
		{
			result.append(separator);
		}
		for (int i = 1; i <= groupCount; i++)
		{
			result.append(m.group(i));
			if (i < groupCount)
				result.append(separator);
		}
		if (prepend)
		{
			if (groupCount > 0)
				result.append(separator);
		}
		else
		{
			result.append('\n');
		}
		return result.toString();
	}

	/**
	 * Returns the next unit of the current tail, or -1 once the tail has been
	 * completely emitted. At that point the position is rewound and, after
	 * the header tail, the tail switches to the values of the first file.
	 * 
	 * @return the next byte (UTF-16LE) or character (any other charset) of
	 *         the tail, or -1 at the end of the tail.
	 * @throws IOException
	 *             if the tail cannot be encoded in the configured charset.
	 */
	private int readFromTail() throws IOException
	{
		if (isUtf16Le())
		{
			byte[] b = tail.getBytes(charset);
			if (pos < b.length)
			{
				// Mask to 0..255: a plain byte-to-int conversion sign-extends,
				// so 0xFF would be mistaken for the -1 end marker.
				return b[pos++] & 0xFF;
			}
		}
		else
		{
			if (pos < tail.length())
				return tail.charAt(pos++);
		}
		pos = 0;
		if (readingHeader)
		{
			readingHeader = false;
			tail = dataTail;
		}
		return -1;
	}

	/**
	 * Delegates to the superclass implementation; this class adds no
	 * mark/reset support of its own.
	 * 
	 * @throws IOException
	 *             as thrown by {@link InputStream#reset()}.
	 */
	@Override
	public synchronized void reset() throws IOException
	{
		super.reset();
	}
}