// org.databene.document.csv.CSVTokenizer Maven / Gradle / Ivy
// Go to download
// 'databene webdecs' is an open source software library for
// WEB Data Extraction, Conversion and Scripting, written by Volker Bergmann.
// The newest version!
/*
* (c) Copyright 2007-2009 by Volker Bergmann. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, is permitted under the terms of the
* GNU General Public License.
*
* For redistributing this software or a derivative work under a license other
* than the GPL-compatible Free Software License as defined by the Free
* Software Foundation or approved by OSI, you must first obtain a commercial
* license to this software product from Volker Bergmann.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* WITHOUT A WARRANTY OF ANY KIND. ALL EXPRESS OR IMPLIED CONDITIONS,
* REPRESENTATIONS AND WARRANTIES, INCLUDING ANY IMPLIED WARRANTY OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT, ARE
* HEREBY EXCLUDED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
package org.databene.document.csv;
import static org.databene.document.csv.CSVTokenType.*;
import org.databene.commons.IOUtil;
import org.databene.commons.SystemInfo;
import java.io.*;
/**
 * Parses a CSV file token by token as specified in RFC 4180.
 * Each call to {@link #next()} returns a {@link CSVTokenType} of type CELL, EOL or EOF.
 * The current cell content is accessible by the public attribute 'cell',
 * the position of the current token by 'line', 'startColumn' and 'endColumn'.
 * <p>
 * As an extension to RFC 4180, a backslash inside a cell starts an escape sequence:
 * <code>\t</code>, <code>\r</code> and <code>\n</code> are replaced by tab, carriage return
 * and line feed, any other escaped character is taken over unchanged (the backslash is dropped).
 * Structural characters (separator, quote, line terminators) keep their meaning
 * even when preceded by a backslash.
 *
 * Created: 26.08.2006 17:19:35
 * @see CSVTokenType
 */
public class CSVTokenizer implements Closeable {

    /**
     * The default separator to use
     */
    public static final char DEFAULT_SEPARATOR = ',';

    /**
     * the source to read from; null once the tokenizer has been closed
     */
    private PushbackReader reader;

    /**
     * the actual separator
     */
    private final char separator;

    /**
     * The token at the cursor position
     */
    public CSVTokenType ttype;

    /**
     * The token that was current before the most recent call to {@link #next()}
     */
    public CSVTokenType lastType;

    /**
     * String representation of the cell at the cursor position.
     * If the cursor is at a EOL/EOF position, this is null
     */
    public String cell;

    /**
     * 1-based number of the line the current token is located in
     */
    public int line;

    /**
     * 1-based column at which the current token starts
     */
    public int startColumn;

    /**
     * 1-based column of the last character of the current cell
     * (for quoted cells this is the column of the closing quote)
     */
    public int endColumn;

    /**
     * 1-based column of the next character to be read from the current line
     */
    private int cursor;

    // constructors ----------------------------------------------------------------------------------------------------

    /**
     * Creates a tokenizer that reads from a URL, using the default separator.
     *
     * @param uri the URL to read from
     * @throws IOException if the source cannot be opened
     */
    public CSVTokenizer(String uri) throws IOException {
        this(uri, DEFAULT_SEPARATOR);
    }

    /**
     * Creates a tokenizer that reads from a uri, using the encoding
     * reported by {@code SystemInfo.getFileEncoding()}.
     *
     * @param uri       the uri to read from
     * @param separator character used for separating CSV cells
     * @throws IOException if the source cannot be opened
     */
    public CSVTokenizer(String uri, char separator) throws IOException {
        this(uri, separator, SystemInfo.getFileEncoding());
    }

    /**
     * Creates a tokenizer that reads from a uri with an explicit encoding.
     *
     * @param uri       the uri to read from
     * @param separator character used for separating CSV cells
     * @param encoding  the character encoding passed to {@code IOUtil.getReaderForURI}
     * @throws IOException if the source cannot be opened
     */
    public CSVTokenizer(String uri, char separator, String encoding) throws IOException {
        this(IOUtil.getReaderForURI(uri, encoding), separator);
    }

    /**
     * Creates a tokenizer that reads from a java.io.Reader, using the default separator.
     *
     * @param reader the reader to use as input
     */
    public CSVTokenizer(Reader reader) {
        this(reader, DEFAULT_SEPARATOR);
    }

    /**
     * Creates a tokenizer that reads from a java.io.Reader.
     * The reader is wrapped with a BufferedReader and a PushbackReader
     * (one character of push-back is all the tokenizer needs).
     *
     * @param reader    the reader to use as input
     * @param separator character used for separating CSV cells
     */
    public CSVTokenizer(Reader reader, char separator) {
        this.reader = new PushbackReader(new BufferedReader(reader));
        this.separator = separator;
        this.line = 1;
        this.cursor = 1;
    }

    // interface -------------------------------------------------------------------------------------------------------

    /**
     * Returns the next token and updates 'ttype', 'lastType', 'cell' and the position attributes.
     * When the end of the input is reached, the source is closed and EOF is returned;
     * every further call returns EOF as well.
     * <p>
     * A separator found at the start of a token yields an empty cell. A separator that is
     * directly followed by a line terminator or the end of input does not yield an
     * additional empty cell.
     *
     * @return the next token
     * @throws IOException if source access fails
     */
    public CSVTokenType next() throws IOException {
        this.lastType = this.ttype;
        if (reader == null) {
            return setState(EOF, null);
        }
        if (lastType == EOL) {
            // the line counter is advanced lazily, so that the EOL token itself
            // is still reported on the line it terminates
            line++;
            startColumn = endColumn = cursor = 1;
        } else {
            startColumn = endColumn = cursor;
        }
        int c = read();
        if (c == -1) {
            close();
            return setState(EOF, null);
        } else if (c == separator) {
            return setState(CELL, "");
        } else if (c == '\r') {
            // accept "\r\n" as well as a lone "\r" as line terminator
            c = read();
            if (c != '\n') {
                unread(c);
            }
            return setState(EOL, null);
        } else if (c == '\n') {
            return setState(EOL, null);
        } else if (c == '"') {
            unread(c);
            return parseQuotes();
        } else {
            return parsePlainCell(c);
        }
    }

    /**
     * Parses an unquoted cell. The cell ends at the next separator (which is consumed),
     * at a line terminator (which is left in the input) or at the end of input.
     * The first character is taken over literally, escape processing
     * only applies to the characters following it.
     *
     * @param first the already consumed first character of the cell
     * @return the token type CELL
     * @throws IOException if source access fails
     */
    private CSVTokenType parsePlainCell(int first) throws IOException {
        StringBuilder buffer = new StringBuilder().append((char) first);
        boolean escapeMode = false;
        int c;
        while ((c = read()) != -1 && c != '\r' && c != '\n') {
            if (escapeMode) {
                c = unescape((char) c);
                escapeMode = false;
            } else if (c == '\\') {
                escapeMode = true;
                continue;
            }
            if (c == separator) {
                // cursor already points behind the separator
                endColumn = cursor - 2;
                return setState(CELL, buffer.toString());
            }
            buffer.append((char) c);
        }
        if (c == '\r' || c == '\n') {
            // leave the line terminator for the next call, which reports it as EOL
            unread(c);
        }
        endColumn = cursor - 1;
        return setState(CELL, buffer.toString());
    }

    /**
     * Maps the character following a backslash to the character it stands for.
     *
     * @param c the character that followed the backslash
     * @return tab, carriage return or line feed for 't', 'r' and 'n', otherwise c itself
     */
    private char unescape(char c) { // this is more efficient than StringUtil.unescape(String)
        switch (c) {
            case 't': return '\t';
            case 'r': return '\r';
            case 'n': return '\n';
            default: return c;
        }
    }

    /**
     * Parses a quoted cell; the opening quote is the next character in the input.
     * A doubled quote inside the cell stands for a single quote character.
     * The character that follows the closing quote is consumed like a separator,
     * unless it is a line terminator, which is left in the input.
     * If the input ends before a closing quote is found, the content read so far is returned.
     *
     * @return the token type CELL
     * @throws IOException if source access fails
     */
    private CSVTokenType parseQuotes() throws IOException {
        read(); // skip leading quote
        StringBuilder buffer = new StringBuilder();
        int c;
        boolean escapeMode = false;
        boolean done;
        do {
            while ((c = read()) != -1 && c != '"') {
                if (escapeMode) {
                    c = unescape((char) c);
                    escapeMode = false;
                } else if (c == '\\') {
                    escapeMode = true;
                    continue;
                }
                buffer.append((char) c);
            }
            if (c == '"') {
                c = read();
                if (c == '"') {
                    // escaped quote
                    buffer.append('"');
                    done = false;
                } else {
                    done = true;
                }
            } else {
                done = true;
            }
        } while (!done);
        if (c == '\r' || c == '\n') {
            unread(c);
            endColumn = cursor - 1;
        } else if (c == -1) {
            // a failed read does not advance the cursor,
            // so it points directly behind the last character of the cell
            endColumn = cursor - 1;
        } else {
            // the character behind the closing quote has been consumed
            endColumn = cursor - 2;
        }
        return setState(CELL, buffer.toString());
    }

    /**
     * Skips the rest of the current line including its line terminator
     * ("\n", "\r\n" or a lone "\r"). 'ttype' and 'cell' are left unchanged,
     * 'line' is advanced so that following tokens are reported with correct positions.
     * Does nothing if the tokenizer has already been closed.
     *
     * @throws IOException if source access fails
     */
    public void skipLine() throws IOException {
        if (reader == null) {
            return;
        }
        int c;
        do {
            c = read();
        } while (c != -1 && c != '\r' && c != '\n');
        if (c == -1) {
            return; // end of input reached without a line terminator
        }
        if (c == '\r') {
            int c2 = read();
            if (c2 != '\n') {
                unread(c2);
            }
        }
        // a line terminator has been consumed: continue at the start of the following line
        line++;
        cursor = 1;
    }

    /**
     * Pushes a character back to the source and moves the cursor back accordingly.
     * The end-of-input marker -1 is ignored: it was never consumed, and
     * PushbackReader would otherwise store it as the character '\uFFFF'.
     *
     * @param c the character to push back, or -1
     * @throws IOException if source access fails
     */
    private void unread(int c) throws IOException {
        if (c == -1) {
            return;
        }
        reader.unread(c);
        cursor--;
    }

    /**
     * Reads the next character from the source and advances the cursor
     * unless the end of input has been reached.
     *
     * @return the character read, or -1 at the end of input
     * @throws IOException if source access fails
     */
    private int read() throws IOException {
        int c = reader.read();
        if (c != -1) {
            cursor++;
        }
        return c;
    }

    /**
     * Closes the source. Calling this method more than once has no further effect.
     */
    public void close() {
        if (reader != null) {
            IOUtil.close(reader);
        }
        reader = null;
    }

    /**
     * Returns the token type that was current before the most recent call to {@link #next()}.
     *
     * @return the previous token type, or null if there is none
     */
    public CSVTokenType lastTtype() {
        return lastType;
    }

    // private helpers -------------------------------------------------------------------------------------------------

    /**
     * sets the state of the tokenizer to the given tokenType and cell content.
     *
     * @param tokenType the tokenType to use
     * @param cell      the cell content
     * @return the token type
     */
    private CSVTokenType setState(CSVTokenType tokenType, String cell) {
        this.cell = cell;
        this.ttype = tokenType;
        return this.ttype;
    }
}