// net.freeutils.util.CSVParser Maven / Gradle / Ivy
// Show all versions of jelementary Show documentation
/*
* Copyright © 2003-2024 Amichai Rothman
*
* This file is part of JElementary - the Java Elementary Utilities package.
*
* JElementary is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 2 of the License, or
* (at your option) any later version.
*
* JElementary is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with JElementary. If not, see <https://www.gnu.org/licenses/>.
*
* For additional info see https://www.freeutils.net/source/jelementary/
*/
package net.freeutils.util;
import static net.freeutils.util.CSVParser.State.*;
import java.io.*;
import java.util.*;
/**
 * The {@code CSVParser} class parses CSV (comma separated values) content.
 * <p>
 * Note: there are many variations on the CSV format used in existing applications.
 * This implementation follows the format defined in RFC 4180, with the addition
 * of allowing a single LF character to terminate a line, in addition to the standard CRLF.
 * <p>
 * A parser reads its underlying stream sequentially, one record per call to
 * {@link #nextRecord}, and should be {@link #close closed} when no longer needed.
 */
public class CSVParser implements Iterable<String[]>, Closeable {

    /** The scanning state within the value currently being parsed. */
    protected enum State {
        /** at the start of a value, before any of its characters have been seen */
        START,
        /** inside an unquoted value */
        MIDDLE,
        /** inside a quoted value */
        QUOTE,
        /** right after a quote seen inside a quoted value (closing quote or first half of an escaped quote) */
        QUOTE2
    }

    protected BufferedReader in;
    protected boolean read;
    protected String[] header;
    protected int[] columns;

    // distinguishes "header not requested yet" from "header requested but the stream was empty"
    private boolean headerRead;

    /**
     * Escapes the given value by surrounding it with double quotes, and escaping
     * any double-quote character within it with another double-quote character,
     * in accordance with the CSV format.
     *
     * @param value the value to escape (null is treated as an empty value)
     * @return the escaped value
     */
    public static String escape(String value) {
        if (value == null || value.isEmpty())
            return "\"\"";
        return '"' + value.replace("\"", "\"\"") + '"';
    }

    /**
     * Converts the given individual values into a single CSV-formatted line.
     * Every value is quoted using {@link #escape}, values are separated by
     * commas, and the line is terminated by CRLF.
     *
     * @param values the values (columns)
     * @return the CSV-formatted line
     * @throws NullPointerException if {@code values} is null
     */
    public static String toLine(String... values) {
        StringBuilder line = new StringBuilder();
        for (int i = 0; i < values.length; i++) {
            if (i > 0)
                line.append(',');
            line.append(escape(values[i]));
        }
        return line.append("\r\n").toString();
    }

    /**
     * Creates a new CSV parser.
     *
     * @param in the underlying CSV data stream; it is used as is if it is
     *        already a {@code BufferedReader}, and wrapped in one otherwise
     * @throws NullPointerException if {@code in} is null
     */
    public CSVParser(Reader in) {
        this.in = in instanceof BufferedReader ? (BufferedReader)in : new BufferedReader(in);
    }

    /**
     * Creates a new CSV parser.
     *
     * @param in the underlying CSV data stream in UTF-8 encoding
     * @throws NullPointerException if {@code in} is null
     */
    public CSVParser(InputStream in) {
        // using the Charset constant avoids the checked exception of the by-name lookup
        this.in = new BufferedReader(
            new InputStreamReader(in, java.nio.charset.StandardCharsets.UTF_8));
    }

    /**
     * Creates a new CSV parser. The file content is read as UTF-8.
     *
     * @param file the CSV file to parse
     * @throws FileNotFoundException if the file cannot be found
     * @throws NullPointerException if {@code file} is null
     */
    public CSVParser(File file) throws FileNotFoundException {
        this(new FileInputStream(file));
    }

    /**
     * Sets which columns will be returned by the parser and in which order.
     * The given array is copied, so later changes to it do not affect the parser.
     *
     * @param columns the ordered column indices (zero-based) to be returned,
     *        or null to return all columns of every record as is
     */
    public void setColumns(int... columns) {
        this.columns = columns == null ? null : columns.clone();
    }

    /**
     * Sets which columns will be returned by the parser and in which order.
     * The header is read (via {@link #getHeader}) if it has not been read yet.
     *
     * @param columnNames the ordered column names to be returned. These names
     *        must correspond to the values of the CSV header, which is
     *        the first line of CSV data
     * @throws IOException if the header cannot be read
     * @throws IllegalArgumentException if a given name does not appear in the header
     * @throws IllegalStateException if data has been read before the header
     */
    public void setColumns(String... columnNames) throws IOException {
        String[] names = getHeader();
        // an empty stream has no header at all, so no name can be found in it
        int count = names == null ? 0 : names.length;
        int[] indices = new int[columnNames.length];
        for (int i = 0; i < indices.length; i++) {
            int found = -1;
            for (int j = 0; found < 0 && j < count; j++) {
                if (names[j].equals(columnNames[i]))
                    found = j;
            }
            if (found < 0)
                throw new IllegalArgumentException("column '" + columnNames[i] + "' not found");
            indices[i] = found;
        }
        setColumns(indices);
    }

    /**
     * Filters the given full record, returning only the columns specified
     * by one of the {@link #setColumns} methods in their requested order.
     * If no columns were specified, the record is returned as is.
     *
     * @param record a full record (parsed CSV line)
     * @return the filtered column values
     * @throws ArrayIndexOutOfBoundsException if a requested column index
     *         does not exist in the given record
     */
    protected String[] filterColumns(String[] record) {
        if (columns == null)
            return record;
        String[] data = new String[columns.length];
        for (int i = 0; i < data.length; i++) {
            int index = columns[i];
            if (index < 0 || index >= record.length)
                throw new ArrayIndexOutOfBoundsException("column index " + index
                    + " is out of range for a record with " + record.length + " values");
            data[i] = record[index];
        }
        return data;
    }

    /**
     * Returns the header values. The header is the first CSV line,
     * and is interpreted as the column names. This method must be
     * called before any data has been read. Further invocations
     * are allowed, and will always return the original header.
     * <p>
     * The header is read using {@link #nextRecord}, so a column selection
     * that is already in effect when it is first read applies to it as well.
     *
     * @return the header values, or null if the stream is empty
     * @throws IOException if the header cannot be read
     * @throws IllegalStateException if data has been read before
     *         the first call to this method
     */
    public String[] getHeader() throws IOException {
        if (header == null && !headerRead) {
            ensureNotRead();
            header = nextRecord();
            headerRead = true; // remember the attempt even if the stream had no header
        }
        return header;
    }

    /**
     * Ensures that no data has been read yet.
     *
     * @throws IllegalStateException if data has already been read
     */
    protected void ensureNotRead() throws IllegalStateException {
        if (read)
            throw new IllegalStateException("data has already been read");
    }

    /**
     * Reads and returns the values of the next record in the CSV stream.
     * <p>
     * A record ends at an LF character outside of a quoted value, or at the
     * end of the stream. CR characters outside of a quoted value are discarded.
     * A blank line is returned as an empty array, to which the column
     * selection is not applied.
     *
     * @return the values of the next record, or null if the stream end has been reached
     * @throws IOException if an error occurs, or if the content is malformed
     *         (a quote in the middle of an unquoted value, a character following
     *         the closing quote of a value, or a missing closing quote)
     * @throws ArrayIndexOutOfBoundsException if a selected column index
     *         does not exist in the record
     */
    public String[] nextRecord() throws IOException {
        read = true;
        List<String> values = new ArrayList<>();
        StringBuilder sb = new StringBuilder(64);
        State state = State.START;
        for (;;) {
            int c = in.read();
            switch (c) {
                case ',':
                    if (state == State.QUOTE) {
                        sb.append((char)c); // a literal comma within a quoted value
                    } else {
                        values.add(sb.toString());
                        sb.setLength(0);
                        state = State.START;
                    }
                    break;
                case '"':
                    if (state == State.MIDDLE)
                        throw new IOException("illegal quote in middle of value");
                    if (state == State.START) {
                        state = State.QUOTE; // opening quote
                    } else if (state == State.QUOTE) {
                        state = State.QUOTE2; // closing quote, or first half of an escaped quote
                    } else if (state == State.QUOTE2) {
                        sb.append((char)c); // second half of an escaped quote
                        state = State.QUOTE;
                    }
                    break;
                case '\r':
                    if (state == State.QUOTE)
                        sb.append((char)c);
                    // otherwise the CR is dropped, so that CRLF and LF both end a line
                    break;
                case '\n':
                    if (state == State.START && values.isEmpty())
                        return new String[0]; // blank line
                    if (state == State.QUOTE) {
                        sb.append((char)c); // a line break within a quoted value
                    } else {
                        values.add(sb.toString());
                        return filterColumns(values.toArray(new String[0]));
                    }
                    break;
                case -1:
                    if (state == State.QUOTE)
                        throw new IOException("missing end quote");
                    if (state == State.START && values.isEmpty())
                        return null; // nothing left to read
                    values.add(sb.toString());
                    return filterColumns(values.toArray(new String[0]));
                default:
                    if (state == State.QUOTE2)
                        throw new IOException("illegal character after quoted value");
                    sb.append((char)c);
                    if (state == State.START)
                        state = State.MIDDLE;
                    break;
            }
        }
    }

    /**
     * Closes the underlying Reader.
     *
     * @throws IOException if the operation fails
     */
    @Override
    public void close() throws IOException {
        in.close();
    }

    /**
     * Returns an iterator over the CSV records (lines). Records are iterated
     * using the {@link #nextRecord} method. Exceptions thrown while iterating
     * are wrapped inside a {@code RuntimeException}. Note that the {@link #close}
     * method must be called when the iterator is no longer needed.
     * <p>
     * The iterator reads from this parser's stream, so it consumes the same
     * records that direct calls to {@link #nextRecord} would return.
     * It does not support removal.
     *
     * @return an iterator over the CSV records (lines)
     */
    @Override
    public Iterator<String[]> iterator() {
        return new Iterator<String[]>() {

            // the record that was read ahead by hasNext() and not yet returned
            private String[] next;

            @Override
            public boolean hasNext() {
                try {
                    return next != null || (next = nextRecord()) != null;
                } catch (IOException ioe) {
                    throw new RuntimeException("error reading record", ioe);
                }
            }

            @Override
            public String[] next() {
                if (!hasNext())
                    throw new NoSuchElementException();
                String[] next = this.next;
                this.next = null;
                return next;
            }

            @Override
            public void remove() {
                throw new UnsupportedOperationException();
            }
        };
    }

    /**
     * Returns the entire remaining CSV data contents as an ordered map, with
     * the first column as the key and the entire record (including
     * the first column) as the value. Records with no values at all
     * (such as blank lines) have no key and are skipped. If a key
     * appears more than once, its last record is the one retained.
     * <p>
     * Note that the {@link #setColumns} methods can be used to
     * filter the output, as well as order the record in such
     * a way that the first column becomes the desired key.
     *
     * @return the entire CSV data contents as an ordered map
     * @throws IOException if an error occurs
     */
    public Map<String, String[]> toMap() throws IOException {
        Map<String, String[]> map = new LinkedHashMap<>();
        String[] record;
        while ((record = nextRecord()) != null) {
            if (record.length > 0)
                map.put(record[0], record);
        }
        return map;
    }
}