/*
 * com.databasesandlife.util.CsvParser (Maven / Gradle / Ivy)
 *
 * Go to download
 * Show more of this group; show more artifacts with this name
 * Show all versions of java-common; show documentation
 * Utility classes developed at Adrian Smith Software (A.S.S.)
 * There is a newer version: 21.0.1
 */
package com.databasesandlife.util;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.function.Supplier;
import java.util.regex.Pattern;

import com.databasesandlife.util.gwtsafe.ConfigurationException;
import com.google.gdata.util.io.base.UnicodeReader;

import javax.annotation.Nonnull;

import static com.databasesandlife.util.gwtsafe.ConfigurationException.prefixExceptionMessage;

/**
 * Parses CSV files.
 *
 * <p>The CSV file is assumed to have a first line containing the column headings.
 * Does not handle quotes in fields (e.g. as generated by Excel).
 * Field names are case-sensitive.
 * Files have a default character set (by default UTF-8) which can be changed by calling {@link #setDefaultCharset},
 * however if the file has a Unicode BOM then this is accepted in preference to the default charset.</p>
 *
 * <h3>Usage</h3>
 * <p>Create an object and set attributes such as the field-separator, list of acceptable columns, etc.
 * Then either call parseAndCallHandler or parseToListOfMaps.</p>
 *
 * <pre>{@code
 *    CsvLineHandler myHandler = new CsvLineHandler() {
 *        void processCsvLine(Map<String,String> line) { .. }
 *    };
 *    CsvParser csvParser = new CsvParser();
 *    csvParser.setDesiredFields("abc","def"); // field set in file must be this set
 *    csvParser.setNonEmptyFields("abc");      // all of these fields must have non-empty values
 *    csvParser.parseAndCallHandler(myHandler, aFile);
 *    csvParser.parseAndCallHandler(myHandler, aReader);
 *    csvParser.parseAndCallHandler(myHandler, aClass);  // reads "aClass.csv" from classloader
 *    List<Map<String,String>> contents = csvParser.parseToListOfMaps(aFile);
 * }</pre>
 *
 * <h3>Glossary</h3>
 * <ul>
 * <li>Field - name of column</li>
 * <li>Column index - e.g. 0 is the left-most column</li>
 * <li>Line - a row of data or header</li>
 * </ul>
 *
 * @author This source is copyright Adrian Smith and licensed under the LGPL 3.
 * @see "Project on GitHub"
 */
@SuppressWarnings("serial")
public class CsvParser {

    public interface CsvLineHandler {
        /** @param line this object can be re-used between calls to reduce GC; extract values from it but do not store the object anywhere */
        void processCsvLine(Map line) throws MalformedCsvException;
    }

    public static class MalformedCsvException extends Exception {  // checked ex. because it's always possible CSV invalid, must handle it
        public MalformedCsvException(String msg) { super(msg); }
        public MalformedCsvException(String prefix, Throwable e) { super(prefixExceptionMessage(prefix, e), e); }
        public MalformedCsvException(Throwable e) { super(e); }
    }

    protected class ArrayOfMapsLineHandler implements CsvLineHandler {
        List> result = new ArrayList<>();
        public void processCsvLine(Map line) {
            Map mapCopy = mapProducer.get();
            mapCopy.putAll(line);
            result.add(mapCopy);
        }
    }

    protected Charset defaultCharset = StandardCharsets.UTF_8;
    protected Pattern fieldSeparatorRegexp = Pattern.compile(Pattern.quote(","));
    protected String fieldSeparator = ",(?=([^\"]*\"[^\"]*\")*[^\"]*$)";
    protected Set mandatoryFields = null;
    protected Set allowedFields = null;
    protected Set nonEmptyFields = null;
    protected Pattern endOfDataRegex = null;
    protected Pattern skipLinePattern = null;
    
    protected Supplier> mapProducer = HashMap::new;

    public void setEndOfDataRegex(Pattern p){ this.endOfDataRegex = p;}
    public void setSkipLinePattern(Pattern p){ this.skipLinePattern = p;}
    public void setDefaultCharset(Charset c) { defaultCharset = c; }
    public void setFieldSeparatorRegexp(Pattern p) { fieldSeparatorRegexp = Pattern.compile(fieldSeparator.replace(",", p.toString()));}
    public void setFieldSeparator(String x) { setFieldSeparatorRegexp(Pattern.compile(Pattern.quote(x))); }

    /** If the CSV file is missing any columns from this list, that's an error */
    public void setMandatoryFields(@Nonnull String... f) { mandatoryFields = new HashSet<>(Arrays.asList(f)); }

    /** If the CSV file contains any columns not on this list, that's an error */
    public void setAllowedFields(@Nonnull String... f) { allowedFields = new HashSet<>(Arrays.asList(f)); }

    /** CSV header row must contain exactly these columns (not necessarily in this order) */
    public void setDesiredFields(@Nonnull String... f) {
        setMandatoryFields(f);
        setAllowedFields(f);
    }

    /** Any fields here must be present and have non-empty values */ 
    public void setNonEmptyFields(@Nonnull String... f) { nonEmptyFields = new HashSet<>(Arrays.asList(f)); }

    /** By default use a fast map, but the client may require other maps for example one that preserves the field order */
    public void setMapProducer(@Nonnull Supplier> mapProducer) { this.mapProducer = mapProducer; }

    public void parseAndCallHandler(CsvLineHandler lineHandler, BufferedReader r) throws MalformedCsvException {
        try {
            var headerLine = r.readLine();
            if (headerLine == null) throw new MalformedCsvException("File was empty (header line is mandatory)");
            String[] fieldForColIdx = fieldSeparatorRegexp.split(headerLine);
            if (mandatoryFields != null)
                for (var f : mandatoryFields)
                    if ( ! containsField(Arrays.asList(fieldForColIdx),f))
                        throw new MalformedCsvException("Column '" + f + "' is missing");
            if (allowedFields != null)
                for (var csvField : fieldForColIdx)
                    if ( ! allowedFields.contains(csvField))
                        throw new MalformedCsvException("Column '" + csvField + "' unexpected");

            var lineNumber = 1;
            Map valueForField = mapProducer.get();
            while (true) {
                try {
                    lineNumber++;
                    var line = r.readLine();
                    if (line == null || (endOfDataRegex != null && endOfDataRegex.matcher(line).matches())) break; // end of data
                    if(skipLinePattern != null && skipLinePattern.matcher(line).matches()) continue;
                    String[] valueForColIdx = fieldSeparatorRegexp.split(line,-1);
                    if (valueForColIdx.length != fieldForColIdx.length) throw new MalformedCsvException("Expected " +
                        fieldForColIdx.length + " fields but found " + valueForColIdx.length + " fields; line was '"+line+"'");
                    valueForField.clear();
                    for (var c = 0; c < valueForColIdx.length; c++) {
                        var field = fieldForColIdx[c].replaceAll("\"", "");
                        var val = valueForColIdx[c].replaceAll("\"", "");
                        if (nonEmptyFields != null && nonEmptyFields.contains(field))
                            if (val.length() == 0) throw new MalformedCsvException("Column " + c + ", field '" + field + "': value may not be empty");
                        valueForField.put(field, val);
                    }
                    lineHandler.processCsvLine(valueForField);
                }
                catch (MalformedCsvException e) { throw new MalformedCsvException(getLineNumberText(lineNumber), e); }
            }
        }
        catch (IOException e) { throw new RuntimeException(e); }
    }

    public void parseAndCallHandler(CsvLineHandler lineHandler, File f) throws MalformedCsvException {
        try {
            try (var is = new FileInputStream(f)) {
                var r = new UnicodeReader(is, defaultCharset.name());
                var br = new BufferedReader(r);
                parseAndCallHandler(lineHandler, br);
            }
        }
        catch (FileNotFoundException e) { throw new MalformedCsvException("CSV file '"+f+"' doesn't exist"); }
        catch (IOException e) { throw new RuntimeException(prefixExceptionMessage("CSV file '" + f + "'", e), e); }
        catch (MalformedCsvException e) { throw new MalformedCsvException("CSV file '" + f + "'", e); }
    }
    
    public void parseAndCallHandler(CsvLineHandler lineHandler, Class cl) throws MalformedCsvException {
        var name = cl.getName().replaceAll("\\.", "/"); // e.g. "com/offerready/MyClass"
        try (var csvStream = cl.getClassLoader().getResourceAsStream(name + ".csv")) {
            if (csvStream == null) throw new IllegalArgumentException("No CSV file for class '" + cl.getName() + "'");
            parseAndCallHandler(lineHandler, new BufferedReader(new InputStreamReader(csvStream, defaultCharset)));
        }
        catch (IOException e) { throw new RuntimeException(prefixExceptionMessage("CSV file for class " + cl, e), e); }
        catch (MalformedCsvException e) { throw new MalformedCsvException("CSV file for class " + cl, e); }
    }

    public List> parseToListOfMaps(BufferedReader r) throws MalformedCsvException {
        var lineHandler = new ArrayOfMapsLineHandler();
        parseAndCallHandler(lineHandler, r);
        return lineHandler.result;
    }

    public List> parseToListOfMaps(File f) throws MalformedCsvException {
        var lineHandler = new ArrayOfMapsLineHandler();
        parseAndCallHandler(lineHandler, f);
        return lineHandler.result;
    }

    public List> parseToListOfMaps(Class cl) throws MalformedCsvException {
        var lineHandler = new ArrayOfMapsLineHandler();
        parseAndCallHandler(lineHandler, cl);
        return lineHandler.result;
    }
    
    private boolean containsField(List desired,String field){
        for(var s : desired){
                if(s.replaceAll("\"", "").equals(field)) return true;
        }
        return false;
    }
    
    protected String getLineNumberText(int lineNumber) {
        return "Line " + lineNumber;
    }
}