All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.databasesandlife.util.CsvParser Maven / Gradle / Ivy

There is a newer version: 21.0.1
Show newest version
package com.databasesandlife.util;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.function.Supplier;
import java.util.regex.Pattern;

import com.databasesandlife.util.gwtsafe.ConfigurationException;
import com.google.gdata.util.io.base.UnicodeReader;

import javax.annotation.Nonnull;

import static com.databasesandlife.util.gwtsafe.ConfigurationException.prefixExceptionMessage;

/**
 * Parses CSV files.
 *
 * 

The CSV file is assumed to have a first line containing the column headings. * Does not handle quotes in fields (e.g. as generated by Excel). * Field names are case-sensitive. * Files have a default character set (by default UTF-8) which can be changed by calling {@link #setDefaultCharset}, * however if the file has a Unicode BOM then this is accepted in preference to the default charset. * *

Usage

*

Create an object and set attributes such as the field-separator, list of acceptable columns, etc. * Then either call parseAndCallHandler or parseToListOfMaps.

* *
 *    CsvLineHandler myHandler = new CsvLineHandler() {
 *        void processCsvLine(Map<String,String> line) { .. }
 *    };
 *    CsvParser csvParser = new CsvParser();
 *    csvParser.setDesiredFields("abc","def"); // field set in file must be this set
 *    csvParser.setNonEmptyFields("abc");      // all of these fields must have non-empty values
 *    csvParser.parseAndCallHandler(myHandler, aFile);
 *    csvParser.parseAndCallHandler(myHandler, aReader);
 *    csvParser.parseAndCallHandler(myHandler, aClass);  // reads "aClass.csv" from classloader
 *    List<Map<String,String>> contents = csvParser.parseToListOfMaps(aFile);
*

Glossary

*
    *
  • Field - name of column *
  • Column index - e.g. 0 is the left-most column *
  • Line - a row of data or header *
* * @author This source is copyright Adrian Smith and licensed under the LGPL 3. * @see Project on GitHub */ @SuppressWarnings("serial") public class CsvParser { public interface CsvLineHandler { /** @param line this object can be re-used between calls to reduce GC; extract values from it but do not store the object anywhere */ void processCsvLine(Map line) throws MalformedCsvException; } public static class MalformedCsvException extends Exception { // checked ex. because it's always possible CSV invalid, must handle it public MalformedCsvException(String msg) { super(msg); } public MalformedCsvException(String prefix, Throwable e) { super(prefixExceptionMessage(prefix, e), e); } public MalformedCsvException(Throwable e) { super(e); } } protected class ArrayOfMapsLineHandler implements CsvLineHandler { List> result = new ArrayList<>(); public void processCsvLine(Map line) { Map mapCopy = mapProducer.get(); mapCopy.putAll(line); result.add(mapCopy); } } protected Charset defaultCharset = StandardCharsets.UTF_8; protected Pattern fieldSeparatorRegexp = Pattern.compile(Pattern.quote(",")); protected String fieldSeparator = ",(?=([^\"]*\"[^\"]*\")*[^\"]*$)"; protected Set mandatoryFields = null; protected Set allowedFields = null; protected Set nonEmptyFields = null; protected Pattern endOfDataRegex = null; protected Pattern skipLinePattern = null; protected Supplier> mapProducer = HashMap::new; public void setEndOfDataRegex(Pattern p){ this.endOfDataRegex = p;} public void setSkipLinePattern(Pattern p){ this.skipLinePattern = p;} public void setDefaultCharset(Charset c) { defaultCharset = c; } public void setFieldSeparatorRegexp(Pattern p) { fieldSeparatorRegexp = Pattern.compile(fieldSeparator.replace(",", p.toString()));} public void setFieldSeparator(String x) { setFieldSeparatorRegexp(Pattern.compile(Pattern.quote(x))); } /** If the CSV file is missing any columns from this list, that's an error */ public void setMandatoryFields(@Nonnull String... f) { mandatoryFields = new HashSet<>(Arrays.asList(f)); } /** If the CSV file contains any columns not on this list, that's an error */ public void setAllowedFields(@Nonnull String... f) { allowedFields = new HashSet<>(Arrays.asList(f)); } /** CSV header row must contain exactly these columns (not necessarily in this order) */ public void setDesiredFields(@Nonnull String... f) { setMandatoryFields(f); setAllowedFields(f); } /** Any fields here must be present and have non-empty values */ public void setNonEmptyFields(@Nonnull String... f) { nonEmptyFields = new HashSet<>(Arrays.asList(f)); } /** By default use a fast map, but the client may require other maps for example one that preserves the field order */ public void setMapProducer(@Nonnull Supplier> mapProducer) { this.mapProducer = mapProducer; } public void parseAndCallHandler(CsvLineHandler lineHandler, BufferedReader r) throws MalformedCsvException { try { var headerLine = r.readLine(); if (headerLine == null) throw new MalformedCsvException("File was empty (header line is mandatory)"); String[] fieldForColIdx = fieldSeparatorRegexp.split(headerLine); if (mandatoryFields != null) for (var f : mandatoryFields) if ( ! containsField(Arrays.asList(fieldForColIdx),f)) throw new MalformedCsvException("Column '" + f + "' is missing"); if (allowedFields != null) for (var csvField : fieldForColIdx) if ( ! allowedFields.contains(csvField)) throw new MalformedCsvException("Column '" + csvField + "' unexpected"); var lineNumber = 1; Map valueForField = mapProducer.get(); while (true) { try { lineNumber++; var line = r.readLine(); if (line == null || (endOfDataRegex != null && endOfDataRegex.matcher(line).matches())) break; // end of data if(skipLinePattern != null && skipLinePattern.matcher(line).matches()) continue; String[] valueForColIdx = fieldSeparatorRegexp.split(line,-1); if (valueForColIdx.length != fieldForColIdx.length) throw new MalformedCsvException("Expected " + fieldForColIdx.length + " fields but found " + valueForColIdx.length + " fields; line was '"+line+"'"); valueForField.clear(); for (var c = 0; c < valueForColIdx.length; c++) { var field = fieldForColIdx[c].replaceAll("\"", ""); var val = valueForColIdx[c].replaceAll("\"", ""); if (nonEmptyFields != null && nonEmptyFields.contains(field)) if (val.length() == 0) throw new MalformedCsvException("Column " + c + ", field '" + field + "': value may not be empty"); valueForField.put(field, val); } lineHandler.processCsvLine(valueForField); } catch (MalformedCsvException e) { throw new MalformedCsvException(getLineNumberText(lineNumber), e); } } } catch (IOException e) { throw new RuntimeException(e); } } public void parseAndCallHandler(CsvLineHandler lineHandler, File f) throws MalformedCsvException { try { try (var is = new FileInputStream(f)) { var r = new UnicodeReader(is, defaultCharset.name()); var br = new BufferedReader(r); parseAndCallHandler(lineHandler, br); } } catch (FileNotFoundException e) { throw new MalformedCsvException("CSV file '"+f+"' doesn't exist"); } catch (IOException e) { throw new RuntimeException(prefixExceptionMessage("CSV file '" + f + "'", e), e); } catch (MalformedCsvException e) { throw new MalformedCsvException("CSV file '" + f + "'", e); } } public void parseAndCallHandler(CsvLineHandler lineHandler, Class cl) throws MalformedCsvException { var name = cl.getName().replaceAll("\\.", "/"); // e.g. "com/offerready/MyClass" try (var csvStream = cl.getClassLoader().getResourceAsStream(name + ".csv")) { if (csvStream == null) throw new IllegalArgumentException("No CSV file for class '" + cl.getName() + "'"); parseAndCallHandler(lineHandler, new BufferedReader(new InputStreamReader(csvStream, defaultCharset))); } catch (IOException e) { throw new RuntimeException(prefixExceptionMessage("CSV file for class " + cl, e), e); } catch (MalformedCsvException e) { throw new MalformedCsvException("CSV file for class " + cl, e); } } public List> parseToListOfMaps(BufferedReader r) throws MalformedCsvException { var lineHandler = new ArrayOfMapsLineHandler(); parseAndCallHandler(lineHandler, r); return lineHandler.result; } public List> parseToListOfMaps(File f) throws MalformedCsvException { var lineHandler = new ArrayOfMapsLineHandler(); parseAndCallHandler(lineHandler, f); return lineHandler.result; } public List> parseToListOfMaps(Class cl) throws MalformedCsvException { var lineHandler = new ArrayOfMapsLineHandler(); parseAndCallHandler(lineHandler, cl); return lineHandler.result; } private boolean containsField(List desired,String field){ for(var s : desired){ if(s.replaceAll("\"", "").equals(field)) return true; } return false; } protected String getLineNumberText(int lineNumber) { return "Line " + lineNumber; } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy