// com.Ostermiller.util.CSVParser Maven / Gradle / Ivy
/*
* Read files in comma separated value format.
* Copyright (C) 2001-2004 Stephen Ostermiller
* http://ostermiller.org/contact.pl?regarding=Java+Utilities
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* See COPYING.TXT for details.
*/
package com.Ostermiller.util;
import java.io.*;
import java.util.*;
/**
* Read files in comma separated value format.
* More information about this class is available from ostermiller.org.
*
* CSV is a file format used as a portable representation of a database.
* Each line is one entry or record and the fields in a record are separated by commas.
* Commas may be preceded or followed by arbitrary space and/or tab characters which are
* ignored.
*
* If a field includes a comma or a new line, the whole field must be surrounded with double quotes.
* When the field is in quotes, any quote literals must be escaped by \". Backslash
* literals must be escaped by \\. Otherwise a backslash and the character following it
* will be treated as the following character, i.e. "\n" is equivalent to "n". Other escape
* sequences may be set using the setEscapes() method. Text that comes after quotes that have
* been closed but comes before the next comma will be ignored.
*
* Empty fields are returned as a String of length zero: "". The following line has three empty
* fields and three non-empty fields in it. There is an empty field on each end, and one in the
* middle. One token is returned as a space.
*
,second,," ",fifth,
*
* Blank lines are always ignored. Other lines will be ignored if they start with a
* comment character as set by the setCommentStart() method.
*
* An example of how CSVParser might be used:
*
* CSVParser shredder = new CSVParser(System.in);
* shredder.setCommentStart("#;!");
* shredder.setEscapes("nrtf", "\n\r\t\f");
* String t;
* while ((t = shredder.nextValue()) != null){
* System.out.println("" + shredder.lastLineNumber() + " " + t);
* }
*
*
* Some applications do not output CSV according to the generally accepted standards and this parser may
* not be able to handle it. One such application is the Microsoft Excel spreadsheet. A
* separate class must be used to read
* Excel CSV.
*
* @see com.Ostermiller.util.ExcelCSVParser
*
* @author Stephen Ostermiller http://ostermiller.org/contact.pl?regarding=Java+Utilities
* @since ostermillerutils 1.00.00
*/
public class CSVParser implements CSVParse {

    /**
     * InputStream on which this parser is based, or null if the
     * parser was created from a Reader.
     *
     * @since ostermillerutils 1.02.22
     */
    private final InputStream inStream;

    /**
     * Reader on which this parser is based, or null if the
     * parser was created from an InputStream.
     *
     * @since ostermillerutils 1.02.22
     */
    private final Reader inReader;

    /**
     * Does all the dirty work.
     * Calls for new tokens are routed through
     * this object.
     *
     * @since ostermillerutils 1.00.00
     */
    private final CSVLexer lexer;

    /**
     * Token cache. Used for when we request a token
     * from the lexer but can't return it because it is
     * on the next line.
     *
     * @since ostermillerutils 1.00.00
     */
    private String tokenCache;

    /**
     * Line cache. The line number that goes along with
     * the tokenCache. Not valid if the tokenCache is
     * null.
     *
     * @since ostermillerutils 1.00.00
     */
    private int lineCache;

    /**
     * The line number the last token came from, or -1 if
     * no tokens have been returned.
     *
     * @since ostermillerutils 1.00.00
     */
    private int lastLine = -1;

    /**
     * Create a parser to parse comma separated values from
     * an InputStream.
     *
     * Byte to character conversion is done using the platform
     * default locale.
     *
     * @param in stream that contains comma separated values.
     *
     * @since ostermillerutils 1.00.00
     */
    public CSVParser(InputStream in) {
        inStream = in;
        inReader = null;
        lexer = new CSVLexer(in);
    }

    /**
     * Create a parser to parse delimited values from
     * an InputStream.
     *
     * Byte to character conversion is done using the platform
     * default locale.
     *
     * @param in stream that contains delimited values.
     * @param delimiter field separator to use instead of a comma.
     *
     * @throws BadDelimiterException if the specified delimiter cannot be used
     *
     * @since ostermillerutils 1.02.24
     */
    public CSVParser(InputStream in, char delimiter) throws BadDelimiterException {
        this(in);
        // Configure the lexer directly so no overridable method runs during construction.
        lexer.changeDelimiter(delimiter);
    }

    /**
     * Create a parser to parse comma separated values from
     * a Reader.
     *
     * @param in reader that contains comma separated values.
     *
     * @since ostermillerutils 1.00.00
     */
    public CSVParser(Reader in) {
        inStream = null;
        inReader = in;
        lexer = new CSVLexer(in);
    }

    /**
     * Create a parser to parse delimited values from
     * a Reader.
     *
     * @param in reader that contains delimited values.
     * @param delimiter field separator to use instead of a comma.
     *
     * @throws BadDelimiterException if the specified delimiter cannot be used
     *
     * @since ostermillerutils 1.02.24
     */
    public CSVParser(Reader in, char delimiter) throws BadDelimiterException {
        this(in);
        lexer.changeDelimiter(delimiter);
    }

    /**
     * Create a parser to parse delimited values from
     * an InputStream.
     *
     * Byte to character conversion is done using the platform
     * default locale.
     *
     * @param in stream that contains delimited values.
     * @param delimiter field separator to use instead of a comma.
     * @param escapes a list of characters that will represent escape sequences.
     * @param replacements the list of replacement characters for those escape sequences.
     * @param commentDelims list of characters a comment line may start with.
     *
     * @throws BadDelimiterException if the specified delimiter cannot be used
     *
     * @since ostermillerutils 1.02.24
     */
    public CSVParser(InputStream in, char delimiter, String escapes, String replacements, String commentDelims) throws BadDelimiterException {
        // Escapes and comment characters are applied first, then the delimiter.
        this(in, escapes, replacements, commentDelims);
        lexer.changeDelimiter(delimiter);
    }

    /**
     * Create a parser to parse comma separated values from
     * an InputStream.
     *
     * Byte to character conversion is done using the platform
     * default locale.
     *
     * @param in stream that contains comma separated values.
     * @param escapes a list of characters that will represent escape sequences.
     * @param replacements the list of replacement characters for those escape sequences.
     * @param commentDelims list of characters a comment line may start with.
     *
     * @since ostermillerutils 1.00.00
     */
    public CSVParser(InputStream in, String escapes, String replacements, String commentDelims) {
        this(in);
        lexer.setEscapes(escapes, replacements);
        lexer.setCommentStart(commentDelims);
    }

    /**
     * Create a parser to parse delimited values from
     * a Reader.
     *
     * @param in reader that contains delimited values.
     * @param delimiter field separator to use instead of a comma.
     * @param escapes a list of characters that will represent escape sequences.
     * @param replacements the list of replacement characters for those escape sequences.
     * @param commentDelims list of characters a comment line may start with.
     *
     * @throws BadDelimiterException if the specified delimiter cannot be used
     *
     * @since ostermillerutils 1.02.24
     */
    public CSVParser(Reader in, char delimiter, String escapes, String replacements, String commentDelims) throws BadDelimiterException {
        // Escapes and comment characters are applied first, then the delimiter.
        this(in, escapes, replacements, commentDelims);
        lexer.changeDelimiter(delimiter);
    }

    /**
     * Create a parser to parse comma separated values from
     * a Reader.
     *
     * @param in reader that contains comma separated values.
     * @param escapes a list of characters that will represent escape sequences.
     * @param replacements the list of replacement characters for those escape sequences.
     * @param commentDelims list of characters a comment line may start with.
     *
     * @since ostermillerutils 1.00.00
     */
    public CSVParser(Reader in, String escapes, String replacements, String commentDelims) {
        this(in);
        lexer.setEscapes(escapes, replacements);
        lexer.setCommentStart(commentDelims);
    }

    /**
     * Close any stream upon which this parser is based.
     *
     * @since ostermillerutils 1.02.22
     * @throws IOException if an error occurs while closing the stream.
     */
    public void close() throws IOException {
        // Only one of the two is ever non-null, depending on the constructor used.
        if (inStream != null) {
            inStream.close();
        }
        if (inReader != null) {
            inReader.close();
        }
    }

    /**
     * Get the next value.
     *
     * @return the next value or null if there are no more values.
     * @throws IOException if an error occurs while reading.
     *
     * @since ostermillerutils 1.00.00
     */
    public String nextValue() throws IOException {
        if (tokenCache == null) {
            // Nothing left over from getLine(); ask the lexer for a fresh token.
            tokenCache = lexer.getNextToken();
            lineCache = lexer.getLineNumber();
        }
        lastLine = lineCache;
        String result = tokenCache;
        tokenCache = null;
        return result;
    }

    /**
     * Get the line number that the last token came from.
     *
     * New line breaks that occur in the middle of a token are not
     * counted in the line number count.
     *
     * @return line number or -1 if no tokens have been returned yet.
     *
     * @since ostermillerutils 1.00.00
     */
    public int lastLineNumber() {
        return lastLine;
    }

    /**
     * Get all the values from a line.
     *
     * If the line has already been partially read, only the
     * values that have not already been read will be included.
     *
     * @return all the values from the line or null if there are no more values.
     * @throws IOException if an error occurs while reading.
     *
     * @since ostermillerutils 1.00.00
     */
    public String[] getLine() throws IOException {
        int lineNumber = -1;
        List<String> values = new ArrayList<String>();
        if (tokenCache != null) {
            // A token read ahead by an earlier call is the first value of this line.
            values.add(tokenCache);
            lineNumber = lineCache;
        }
        // Keep reading until the lexer runs out of tokens or reports a different
        // line; the token that ended the loop stays in tokenCache for the next call.
        while ((tokenCache = lexer.getNextToken()) != null
                && (lineNumber == -1 || lexer.getLineNumber() == lineNumber)) {
            values.add(tokenCache);
            lineNumber = lexer.getLineNumber();
        }
        if (values.isEmpty()) {
            return null;
        }
        lastLine = lineNumber;
        lineCache = lexer.getLineNumber();
        return values.toArray(new String[values.size()]);
    }

    /**
     * Get all the values from the file.
     *
     * If the file has already been partially read, only the
     * values that have not already been read will be included.
     *
     * Each line of the file that has at least one value will be
     * represented. Comments and empty lines are ignored.
     *
     * The resulting double array may be jagged.
     *
     * @return all the values from the file or null if there are no more values.
     * @throws IOException if an error occurs while reading.
     *
     * @since ostermillerutils 1.00.00
     */
    public String[][] getAllValues() throws IOException {
        List<String[]> lines = new ArrayList<String[]>();
        String[] line;
        while ((line = getLine()) != null) {
            lines.add(line);
        }
        if (lines.isEmpty()) {
            return null;
        }
        return lines.toArray(new String[lines.size()][]);
    }

    /**
     * Specify escape sequences and their replacements.
     * Escape sequences set here are in addition to \\ and \".
     * \\ and \" are always valid escape sequences. This method
     * allows standard escape sequences to be used. For example
     * "\n" can be set to be a newline rather than an 'n'.
     * A common way to call this method might be:
     * setEscapes("nrtf", "\n\r\t\f");
     * which would set the escape sequences to be the Java escape
     * sequences. Characters that follow a \ that are not escape
     * sequences will still be interpreted as that character.
     * The two arguments to this method must be the same length. If
     * they are not, the longer of the two will be truncated.
     *
     * @param escapes a list of characters that will represent escape sequences.
     * @param replacements the list of replacement characters for those escape sequences.
     *
     * @since ostermillerutils 1.00.00
     */
    public void setEscapes(String escapes, String replacements) {
        lexer.setEscapes(escapes, replacements);
    }

    /**
     * Change this parser so that it uses a new delimiter.
     *
     * The initial character is a comma, the delimiter cannot be changed
     * to a quote or other character that has special meaning in CSV.
     *
     * @param newDelim delimiter to which to switch.
     * @throws BadDelimiterException if the character cannot be used as a delimiter.
     *
     * @since ostermillerutils 1.02.08
     */
    public void changeDelimiter(char newDelim) throws BadDelimiterException {
        lexer.changeDelimiter(newDelim);
    }

    /**
     * Change this parser so that it uses a new character for quoting.
     *
     * The initial character is a double quote ("), the quote cannot be changed
     * to a comma or other character that has special meaning in CSV.
     *
     * @param newQuote character to use for quoting.
     * @throws BadQuoteException if the character cannot be used as a quote.
     *
     * @since ostermillerutils 1.02.16
     */
    public void changeQuote(char newQuote) throws BadQuoteException {
        lexer.changeQuote(newQuote);
    }

    /**
     * Set the characters that indicate a comment at the beginning of the line.
     * For example if the string "#;!" were passed in, all of the following lines
     * would be comments:
     *
     * <pre>
     * # Comment
     * ; Another Comment
     * ! Yet another comment
     * </pre>
     *
     * By default there are no comments in CSV files. Commas and quotes may not be
     * used to indicate comment lines.
     *
     * @param commentDelims list of characters a comment line may start with.
     *
     * @since ostermillerutils 1.00.00
     */
    public void setCommentStart(String commentDelims) {
        lexer.setCommentStart(commentDelims);
    }

    /**
     * Get the number of the line from which the last value was retrieved.
     *
     * @return line number or -1 if no tokens have been returned.
     *
     * @since ostermillerutils 1.00.00
     */
    public int getLastLineNumber() {
        return lastLine;
    }

    /**
     * Parse the comma delimited data from a string.
     *
     * Only escaped backslashes and quotes will be recognized as escape sequences.
     * The data will be treated as having no comments.
     *
     * @param s string with comma delimited data to parse.
     * @return parsed data, or null if the string contains no values or an
     *         IOException occurs while reading it.
     *
     * @since ostermillerutils 1.02.03
     */
    public static String[][] parse(String s) {
        try {
            return (new CSVParser(new StringReader(s))).getAllValues();
        } catch (IOException x) {
            // The signature declares no checked exception, so a read failure is reported as null.
            return null;
        }
    }

    /**
     * Parse the delimited data from a string.
     *
     * Only escaped backslashes and quotes will be recognized as escape sequences.
     * The data will be treated as having no comments.
     *
     * @param s string with delimited data to parse.
     * @param delimiter field separator to use instead of a comma.
     * @return parsed data, or null if the string contains no values or an
     *         IOException occurs while reading it.
     * @throws BadDelimiterException if the character cannot be used as a delimiter.
     *
     * @since ostermillerutils 1.02.24
     */
    public static String[][] parse(String s, char delimiter) throws BadDelimiterException {
        try {
            return (new CSVParser(new StringReader(s), delimiter)).getAllValues();
        } catch (IOException x) {
            // The signature declares no IOException, so a read failure is reported as null.
            return null;
        }
    }

    /**
     * Parse the comma delimited data from a string.
     * Escaped backslashes and quotes will always be recognized as escape sequences.
     *
     * @param s string with comma delimited data to parse.
     * @param escapes a list of additional characters that will represent escape sequences.
     * @param replacements the list of replacement characters for those escape sequences.
     * @param commentDelims list of characters a comment line may start with.
     * @return parsed data, or null if the string contains no values or an
     *         IOException occurs while reading it.
     *
     * @since ostermillerutils 1.02.03
     */
    public static String[][] parse(String s, String escapes, String replacements, String commentDelims) {
        try {
            return (new CSVParser(new StringReader(s), escapes, replacements, commentDelims)).getAllValues();
        } catch (IOException x) {
            // The signature declares no checked exception, so a read failure is reported as null.
            return null;
        }
    }

    /**
     * Parse the delimited data from a string.
     * Escaped backslashes and quotes will always be recognized as escape sequences.
     *
     * @param s string with delimited data to parse.
     * @param delimiter field separator to use instead of a comma.
     * @param escapes a list of additional characters that will represent escape sequences.
     * @param replacements the list of replacement characters for those escape sequences.
     * @param commentDelims list of characters a comment line may start with.
     * @return parsed data, or null if the string contains no values or an
     *         IOException occurs while reading it.
     * @throws BadDelimiterException if the character cannot be used as a delimiter.
     *
     * @since ostermillerutils 1.02.24
     */
    public static String[][] parse(String s, char delimiter, String escapes, String replacements, String commentDelims) throws BadDelimiterException {
        try {
            return (new CSVParser(new StringReader(s), delimiter, escapes, replacements, commentDelims)).getAllValues();
        } catch (IOException x) {
            // The signature declares no IOException, so a read failure is reported as null.
            return null;
        }
    }

    /**
     * Parse the delimited data from a stream.
     *
     * Only escaped backslashes and quotes will be recognized as escape sequences.
     * The data will be treated as having no comments.
     * This method does not call close() on the parser it creates.
     *
     * @param in Reader with delimited data to parse.
     * @param delimiter field separator to use instead of a comma.
     * @return parsed data, or null if there are no values.
     * @throws BadDelimiterException if the character cannot be used as a delimiter.
     * @throws IOException if an error occurs while reading.
     *
     * @since ostermillerutils 1.02.24
     */
    public static String[][] parse(Reader in, char delimiter) throws IOException, BadDelimiterException {
        return (new CSVParser(in, delimiter)).getAllValues();
    }

    /**
     * Parse the comma delimited data from a stream.
     *
     * Only escaped backslashes and quotes will be recognized as escape sequences.
     * The data will be treated as having no comments.
     * This method does not call close() on the parser it creates.
     *
     * @param in Reader with comma delimited data to parse.
     * @return parsed data, or null if there are no values.
     * @throws IOException if an error occurs while reading.
     *
     * @since ostermillerutils 1.02.03
     */
    public static String[][] parse(Reader in) throws IOException {
        return (new CSVParser(in)).getAllValues();
    }

    /**
     * Parse the delimited data from a stream.
     * Escaped backslashes and quotes will always be recognized as escape sequences.
     * This method does not call close() on the parser it creates.
     *
     * @param in Reader with delimited data to parse.
     * @param delimiter field separator to use instead of a comma.
     * @param escapes a list of additional characters that will represent escape sequences.
     * @param replacements the list of replacement characters for those escape sequences.
     * @param commentDelims list of characters a comment line may start with.
     * @return parsed data, or null if there are no values.
     * @throws BadDelimiterException if the character cannot be used as a delimiter.
     * @throws IOException if an error occurs while reading.
     *
     * @since ostermillerutils 1.02.24
     */
    public static String[][] parse(Reader in, char delimiter, String escapes, String replacements, String commentDelims) throws IOException, BadDelimiterException {
        return (new CSVParser(in, delimiter, escapes, replacements, commentDelims)).getAllValues();
    }

    /**
     * Parse the comma delimited data from a stream.
     * Escaped backslashes and quotes will always be recognized as escape sequences.
     * This method does not call close() on the parser it creates.
     *
     * @param in Reader with comma delimited data to parse.
     * @param escapes a list of additional characters that will represent escape sequences.
     * @param replacements the list of replacement characters for those escape sequences.
     * @param commentDelims list of characters a comment line may start with.
     * @return parsed data, or null if there are no values.
     * @throws IOException if an error occurs while reading.
     *
     * @since ostermillerutils 1.02.03
     */
    public static String[][] parse(Reader in, String escapes, String replacements, String commentDelims) throws IOException {
        return (new CSVParser(in, escapes, replacements, commentDelims)).getAllValues();
    }
}