// org.diirt.util.text.CsvParser Maven / Gradle / Ivy
/**
* Copyright (C) 2010-14 diirt developers. See COPYRIGHT.TXT
* All rights reserved. Use is subject to license terms. See LICENSE.TXT
*/
package org.diirt.util.text;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.Reader;
import java.util.AbstractList;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.diirt.util.array.ArrayDouble;
import org.diirt.util.array.ListDouble;
import static org.diirt.util.text.StringUtil.DOUBLE_REGEX_WITH_NAN;
/**
* Utility class to parse CSV text. The parser is thread safe: it includes an
* immutable set of parameters and the state for each parsing is kept separate.
* A change in the parser parameters will create a new parser, so to create
* your configuration take the closest matching as a template and apply the
* difference.
*
* Since there is no CSV strict format, this parser honors as best it
* can the suggestions found in RFC4180,
* in the CSV wikipedia article
* and other sources.
*
* The parser can try multiple separators, so that it can auto-detect the
* likely correct one. It does so by trying them one by one, checking
* that it finds more than one column and that all the rows have the same
* number of columns. If not, proceeds to the next separator.
*
* Typical use of the parser:
*
* CsvParserResult result = CsvParser.AUTOMATIC
* .withHeader(CsvParser.Header.NONE)
* .parse(new FileReader("table.csv"));
*
* The parsing of each line is based on code and insights found in
* Mastering Regular Expressions.
*
* @author carcassi
*/
public class CsvParser {
// Configuration
// Both fields are final and are only assigned in the constructor.
// Candidate separators: each character is tried in turn while parsing.
private final String separators;
// How the first line of the input is treated (header or data).
private final Header header;
/**
 * Strategies the parser can use to interpret the first line of the input.
 */
public enum Header {

    /**
     * Decide automatically whether the first line is a header or data.
     *
     * The first line is treated as data only when that can be safely
     * established:
     * - if every column contains strings, the first line is always taken
     *   as a header;
     * - if the types of the first line do not match the rest of the column
     *   (e.g. a string on top of numbers), the first line is a header;
     * - if the types match and at least one of them is not a string
     *   (e.g. a number), the first line is data.
     */
    AUTO,

    /**
     * Always treat the first line as the header.
     */
    FIRST_LINE,

    /**
     * The input has no header: the first line is already data.
     *
     * Column names are generated following the spreadsheet convention:
     * A, B, ..., Y, Z, AA, AB, ..., AZ, BA, and so on.
     */
    NONE
}
private class State {
// Parser state
private int nColumns;
private boolean columnMismatch = false;
private List columnNames;
private List columnNumberParsable;
private List columnTimestampParsable;
private List> columnTokens;
private String currentSeparator;
// Regex object used for parsing
private Matcher mLineTokens;
private final Matcher mQuote = pQuote.matcher("");
private final Matcher mDouble = pDouble.matcher("");
// Keep data on best matched separator
private String bestSeparator;
private int bestNLines = -1;
}
// Matches two consecutive double quotes (""). Presumably used to unescape
// doubled quotes inside quoted fields -- TODO confirm against the token
// parsing code.
private static final Pattern pQuote = Pattern.compile("\"\"");
// Compiled from StringUtil.DOUBLE_REGEX_WITH_NAN. Presumably recognizes tokens
// that can be read as a double, NaN included -- TODO confirm.
private static final Pattern pDouble = Pattern.compile(DOUBLE_REGEX_WITH_NAN);
/**
 * Automatic parser: auto-detects whether the first line is a header or not
 * and tries the most common separators, in this order:
 * ',' ';' 'TAB' 'SPACE'.
 *
 * Parsers are immutable, so this instance can be used as the template
 * for other configurations through the with... methods.
 */
public static final CsvParser AUTOMATIC = new CsvParser(",;\t ", Header.AUTO);
/**
 * Builds a parser with the given configuration. The constructor is private:
 * parsers are obtained from {@link #AUTOMATIC} and the with... methods.
 *
 * @param separators the candidate separators, one per character
 * @param header how the first line of the input is handled
 */
private CsvParser(String separators, Header header) {
    this.header = header;
    this.separators = separators;
}
/**
 * The candidate separators of this parser. Each character of the returned
 * string is one separator; they are tried in order while parsing.
 *
 * @return a string with all the possible separators
 */
public String getSeparators() {
    return this.separators;
}
/**
 * Derives a parser that tries the given separators and keeps the header
 * handling of this one. This parser is left unchanged.
 *
 * Each character of the string is one separator; they are tried one after
 * the other until the parsing is successful.
 *
 * @param separators the new list of separators
 * @return a new parser
 */
public CsvParser withSeparators(String separators) {
    return new CsvParser(separators, this.header);
}
/**
 * How this parser treats the header, i.e. the first line of the csv file.
 *
 * @return the header configuration of the parser
 */
public Header getHeader() {
    return this.header;
}
/**
 * Derives a parser with the given header handling that keeps the
 * separators of this one. This parser is left unchanged.
 *
 * @param header the header configuration for the parser
 * @return a new parser
 */
public CsvParser withHeader(Header header) {
    return new CsvParser(this.separators, header);
}
/**
* Parses the text provided by the reader with the format defined in this
* parser. This method is thread-safe.
*
* If the parsing fails, this method does not throw an exception but
* will have information in the result. The idea is that, in the future,
* the parser can provide multiple reasons as to why the parsing failed or
* even incomplete results.
*
* @param reader a reader
* @return the parsed information
*/
public CsvParserResult parse(Reader reader) {
// State used for parsing. Since each call has its own state,
// the parsing is thread safe.
State state = new State();
// Divide into lines.
// Note that means we are going to keep in memory the whole file.
// This is not very memory efficient. But since we have to do multiple
// passes to find the right separator, we don't have much choice.
// Also: the actual parsed result will need to stay in memory anyway.
List lines = csvLines(reader);
// Try each separator
separatorLoop:
for(int nSeparator = 0; nSeparator < getSeparators().length(); nSeparator++) {
state.currentSeparator = getSeparators().substring(nSeparator, nSeparator+1);
// Taken from Mastering Regular Expressions
// Disabled comments so that space could work as possible separator
String regex = // puts a doublequoted field in group(1) and an unquoted field into group(2)
// Start with beginning of line or separator
"\\G(?:^|" + state.currentSeparator + ")" +
// Match a quoted string
"(?:" +
"\"" +
"((?:[^\"]++|\"\")*+)" +
"\"" +
// Or match a string without the separator
"|" +
"([^\"" + state.currentSeparator + "]*)" +
")";
// Compile the matcher once for all the parsing
state.mLineTokens = Pattern.compile(regex).matcher("");
// Try to parse the first line (the titles)
// If only one column is found, proceed to next separator
state.columnNames = parseTitles(state, lines.get(0));
state.nColumns = state.columnNames.size();
if (state.nColumns == 1) {
continue;
}
// Prepare the data structures to hold column data while parsing
state.columnMismatch = false;
state.columnNumberParsable = new ArrayList<>(state.nColumns);
state.columnTimestampParsable = new ArrayList<>(state.nColumns);
state.columnTokens = new ArrayList<>();
for (int i = 0; i < state.nColumns; i++) {
state.columnNumberParsable.add(true);
state.columnTimestampParsable.add(false);
state.columnTokens.add(new ArrayList());
}
// Parse each line
// If one line does not match the number of columns found in the first
// line, pass to the next separator
for (int i = 1; i < lines.size(); i++) {
parseLine(state, lines.get(i));
if (state.columnMismatch) {
if (i > state.bestNLines) {
state.bestSeparator = state.currentSeparator;
state.bestNLines = i;
}
continue separatorLoop;
}
}
// The parsing succeeded! No need to try other separator
break;
}
// We are out of the loop: did we end because we parsed correctly,
// or because even the last separator was a mismatch?
if (state.columnMismatch) {
return new CsvParserResult(null, null, null, 0, false, "Parsing failed: number of columns not constant. Using separator '"
+ state.bestSeparator + "', line " + (state.bestNLines + 1));
}
// Parsing was successful.
// Should the first line be used as data?
if (header == Header.NONE || (header == Header.AUTO && isFirstLineData(state, state.columnNames))) {
for (int i = 0; i < state.nColumns; i++) {
state.columnTokens.set(i, joinList(state.columnNames.get(i), state.columnTokens.get(i)));
state.columnNames.set(i, alphabeticName(i));
}
}
// Now it's time to convert the tokens to the actual type.
List