org.diirt.util.text.CsvParser Maven / Gradle / Ivy

Go to download
/**
 * Copyright (C) 2010-14 diirt developers. See COPYRIGHT.TXT
 * All rights reserved. Use is subject to license terms. See LICENSE.TXT
 */
package org.diirt.util.text;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.Reader;
import java.util.AbstractList;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.diirt.util.array.ArrayDouble;
import org.diirt.util.array.ListDouble;
import static org.diirt.util.text.StringUtil.DOUBLE_REGEX_WITH_NAN;

/**
 * Utility class to parse CSV text. The parser is thread safe: it includes an
 * immutable set of parameters and the state for each parsing is kept separate.
 * A change in the parser parameters will create a new parser, so to create
 * your configuration take the closest matching as a template and apply the
 * difference.
 * 
 * <p>
 * Since there is no strict CSV format, this parser honors as best it
 * can the suggestions found in RFC 4180,
 * in the CSV Wikipedia article
 * and other sources.
 * <p>
 * The parser can try multiple separators, so that it can auto-detect the
 * likely correct one. It does so by trying them one by one, checking
 * that it finds more than one column and that all the rows have the same
 * number of columns. If not, proceeds to the next separator.
 * 

 * <p>
 * Typical use of the parser:
 * <pre>{@code
 * CsvParserResult result = CsvParser.AUTOMATIC
 *   .withHeader(CsvParser.Header.NONE)
 *   .parse(new FileReader("table.csv"));
 * }</pre>
 * <p>
 * The parsing of each line is based on code and insights found in
 *  Mastering Regular Expressions.
 *
 * @author carcassi
 */
public class CsvParser {
    
    // Configuration
    // Candidate separators: parse() tries one character at a time, in string order.
    private final String separators;
    // How the first line of the input is treated; see the Header enum.
    private final Header header;

    /**
     * Strategies for deciding how the first line of the CSV text is treated.
     */
    public enum Header {

        /**
         * Detects automatically whether the first line is a header.
         * <p>
         * The first line is taken as data only when that can be decided
         * safely:
         * <ul>
         *   <li>if every column contains strings, the first line is always
         *       taken as a header;</li>
         *   <li>if the types in the first line do not match the column
         *       (e.g. first line is a string, the rest are numbers), the
         *       first line is taken as a header;</li>
         *   <li>if the types match and at least one of them is not a string
         *       (e.g. a number), the first line is taken as data.</li>
         * </ul>
         */
        AUTO,

        /**
         * The first line is the header.
         */
        FIRST_LINE,

        /**
         * There is no header in the data: the first line is data.
         * <p>
         * Column names are generated following the spreadsheet convention:
         * A, B, ..., Y, Z, AA, AB, ..., AZ, BA, and so on.
         */
        NONE
    }
    
    private class State {
        // Parser state
        private int nColumns;
        private boolean columnMismatch = false;
        private List columnNames;
        private List columnNumberParsable;
        private List columnTimestampParsable;
        private List> columnTokens;
        private String currentSeparator;
        
        // Regex object used for parsing
        private Matcher mLineTokens;
        private final Matcher mQuote = pQuote.matcher("");
        private final Matcher mDouble = pDouble.matcher("");
        
        // Keep data on best matched separator
        private String bestSeparator;
        private int bestNLines = -1;
    }
    
    
    // Matches two consecutive double-quote characters, i.e. the doubled quote
    // that the line regex built in parse() accepts inside a quoted field.
    private static final Pattern pQuote = Pattern.compile("\"\"");
    // Compiled from StringUtil.DOUBLE_REGEX_WITH_NAN; presumably matches the
    // text form of a double, including NaN -- confirm against StringUtil.
    private static final Pattern pDouble = Pattern.compile(DOUBLE_REGEX_WITH_NAN);
    
    /**
     * Automatic parser: auto-detects whether the first line is a header or not
     * ({@link Header#AUTO}) and tries the most common separators, in this
     * order: ',' ';' 'TAB' 'SPACE'.
     * <p>
     * Use it as the starting template for other configurations through
     * {@link #withSeparators(String)} and {@link #withHeader(Header)}.
     */
    public static final CsvParser AUTOMATIC = new CsvParser(",;\t ", Header.AUTO);

    /**
     * Creates a parser with the given configuration. Private: new
     * configurations are derived from an existing parser through the
     * {@code with...} methods.
     *
     * @param separators the candidate separators, one per character
     * @param header how the first line is handled
     */
    private CsvParser(String separators, Header header) {
        this.header = header;
        this.separators = separators;
    }

    /**
     * Gives the candidate separators of this parser. Each character of the
     * returned string is one separator; they are attempted in string order.
     *
     * @return a string with all the possible separators
     */
    public String getSeparators() {
        return this.separators;
    }

    /**
     * Derives a parser that tries the given separators and keeps the header
     * handling of this parser. This parser is left unchanged.
     * <p>
     * Each character of the string is tried, one after the other, until the
     * parsing is successful.
     *
     * @param separators the new list of separators
     * @return a new parser
     */
    public CsvParser withSeparators(String separators) {
        final CsvParser derived = new CsvParser(separators, this.header);
        return derived;
    }

    /**
     * Tells how this parser treats the header, that is the first line of
     * the csv file.
     *
     * @return the header configuration of the parser
     */
    public Header getHeader() {
        return this.header;
    }
    
    /**
     * Derives a parser that uses the given header handling and keeps the
     * separators of this parser. This parser is left unchanged.
     *
     * @param header the header configuration for the parser
     * @return a new parser
     */
    public CsvParser withHeader(Header header) {
        final CsvParser derived = new CsvParser(this.separators, header);
        return derived;
    }

    
    /**
     * Parses the text provided by the reader with the format defined in this
     * parser. This method is thread-safe.
     * <p>
     * If the parsing fails, this method does not throw an exception but
     * will have information in the result. The idea is that, in the future,
     * the parser can provide multiple reasons as to why the parsing failed or
     * even incomplete results.
     * 
     * @param reader a reader
     * @return the parsed information
     */
    public CsvParserResult parse(Reader reader) {
        // State used for parsing. Since each call has its own state,
        // the parsing is thread safe.
        State state = new State();
        
        // Divide into lines.
        // Note that means we are going to keep in memory the whole file.
        // This is not very memory efficient. But since we have to do multiple
        // passes to find the right separator, we don't have much choice.
        // Also: the actual parsed result will need to stay in memory anyway.
        List lines = csvLines(reader);
        
        // Try each separator
        separatorLoop:
        for(int nSeparator = 0; nSeparator < getSeparators().length(); nSeparator++) {
            state.currentSeparator = getSeparators().substring(nSeparator, nSeparator+1);
            
            // Taken from Mastering Regular Expressions
            // Disabled comments so that space could work as possible separator
            String regex = // puts a doublequoted field in group(1) and an unquoted field into group(2)
                    // Start with beginning of line or separator
                    "\\G(?:^|" + state.currentSeparator + ")" +
                    // Match a quoted string
                    "(?:" +
                    "\"" +
                    "((?:[^\"]++|\"\")*+)" +
                    "\"" +
                    // Or match a string without the separator
                    "|" +
                    "([^\"" + state.currentSeparator + "]*)" +
                    ")";
            // Compile the matcher once for all the parsing
            state.mLineTokens = Pattern.compile(regex).matcher("");
            
            // Try to parse the first line (the titles)
            // If only one column is found, proceed to the next separator
            state.columnNames = parseTitles(state, lines.get(0));
            state.nColumns = state.columnNames.size();
            if (state.nColumns == 1) {
                continue;
            }
            
            // Prepare the data structures to hold column data while parsing
            state.columnMismatch = false;
            state.columnNumberParsable = new ArrayList<>(state.nColumns);
            state.columnTimestampParsable = new ArrayList<>(state.nColumns);
            state.columnTokens = new ArrayList<>();
            for (int i = 0; i < state.nColumns; i++) {
                state.columnNumberParsable.add(true);
                state.columnTimestampParsable.add(false);
                state.columnTokens.add(new ArrayList());
            }
            
            // Parse each line
            // If one line does not match the number of columns found in the first
            // line, pass to the next separator
            for (int i = 1; i < lines.size(); i++) {
                parseLine(state, lines.get(i));
                if (state.columnMismatch) {
                    if (i > state.bestNLines) {
                        state.bestSeparator =  state.currentSeparator;
                        state.bestNLines = i;
                    }
                    continue separatorLoop;
                }
            }
            
            // The parsing succeeded! No need to try other separator
            break;
            
        }
        
        // We are out of the loop: did we end because we parsed correctly,
        // or because even the last separator was a mismatch?
        if (state.columnMismatch) {
            return new CsvParserResult(null, null, null, 0, false, "Parsing failed: number of columns not constant. Using separator '"
                    + state.bestSeparator + "', line " + (state.bestNLines + 1));
        }
        
        // Parsing was successful.
        // Should the first line be used as data?
        if (header == Header.NONE || (header == Header.AUTO && isFirstLineData(state, state.columnNames))) {
            for (int i = 0; i < state.nColumns; i++) {
                state.columnTokens.set(i, joinList(state.columnNames.get(i), state.columnTokens.get(i)));
                state.columnNames.set(i, alphabeticName(i));
            }
        }
        
        // Now it's time to convert the tokens to the actual type.
        List