All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.sfj.RFC4180CSVParser Maven / Gradle / Ivy

/*
 * Copyright 2020 C. Schanck
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.sfj;

import java.io.BufferedReader;
import java.io.Closeable;
import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.NoSuchElementException;

/**
 * This attempts to implement a very tiny RFX4180 compliant CSV parser.
 * It relies on {@link BufferedReader}'s end of line handling, so it will
 * swallow '\n', '\r' or '\r\n' line endings.
 * 

It attempts to implement the subset of RFV4180 called out on wikipedia: *

    *
  • MS-DOS-style lines that end with (CR/LF) characters (optional for the last line). *
  • An optional header record (there is no sure way to detect whether it is present, so care * is required when importing). *
  • Each record should contain the same number of comma-separated fields. *
  • Any field may be quoted (with double quotes). *
  • Fields containing a line-break, double-quote or commas should be quoted. (If they * are not, the file will likely be impossible to process correctly.) *
  • If double-quotes are used to enclose fields, then a double-quote must be * represented by two double-quote characters. *
* Additionally, it will ignore blank lines. The "header" line is not actually handled * at all, you must handle it explicitly. It should handle quoted multi-line fields * properly. You can, in the constructor, specify to parse a single header, * which is then available via the getHeader() method. * *

* Mainly, I implemented this simple subset because it is ... useful. If you * need more than this, use a real library, like the excellent Apache one or * OpenCSV, or, or, or. * @author cschanck */ public class RFC4180CSVParser implements Iterable, Closeable { private final BufferedReader bReader; private int index = 0; private String line; private final char sep; private String[] header = null; /** * Constructor, no header, comma seperator * @param reader reader to read. * @throws IOException on IO exception */ public RFC4180CSVParser(Reader reader) throws IOException { this(false, reader); } /** * Constructor, specifying if there is a header to read. * @param hasHeader true if there is a header to read * @param reader reader * @throws IOException on IO exception */ public RFC4180CSVParser(boolean hasHeader, Reader reader) throws IOException { this(',', hasHeader, reader); } /** * Constructor, specifying field separator, and if there is a header to read. * @param sep character separator * @param hasHeader true if there is a header to read * @param reader reader * @throws IOException on IO exception */ public RFC4180CSVParser(char sep, boolean hasHeader, Reader reader) throws IOException { this.sep = sep; this.bReader = new BufferedReader(reader); // preload line = bReader.readLine(); if (hasHeader) { header = record(); } } /** * If reading a header was specified, then it can be retrieved via this * method. If not specified, this will return null. * @return header or null */ public String[] getHeader() { return header; } /** * Close this, close the reader. Idempotent, swallows exceptions. */ @Override public void close() { try { // aggressively close it, ignore failures bReader.close(); } catch (Exception e) { } } @Override public Iterator iterator() { return new Iterator() { private String[] next; { try { next = record(); } catch (IOException e) { next = null; throw new IllegalStateException(e); } } @Override public boolean hasNext() { return next != null; } @Override public String[] next() { if (hasNext()) { try { String[] ret = next; next = record(); if (next == null) { close(); } return ret; } catch (IOException e) { throw new IllegalStateException(e); } } throw new NoSuchElementException(); } }; } private int next() throws IOException { if (line == null) { close(); return -1; } if (index == line.length()) { do { // eat blank lines. might as well do it here. line = bReader.readLine(); index = 0; } while (line != null && line.length() == 0); // return line end char. return '\n'; } return line.charAt(index++); } /** * Retrieve the next record in the CSV file. Used as basis for * iteration on this class, can be used directly if that is preferred. * * @return next record or null if at end of file. * @throws IOException on exception */ public String[] record() throws IOException { // read one record... ArrayList list = new ArrayList<>(); boolean[] eol = new boolean[] { false }; StringBuilder whiteSpace = new StringBuilder(); // consume initial whitespace int p; for (p = next(); p >= 0; p = next()) { char ch = (char) p; if (ch == '\n') { // restart, skip line. whiteSpace.setLength(0); } else if (ch == sep) { break; } else if (Character.isWhitespace(ch)) { whiteSpace.append(ch); } else { break; } } // eof give up and go home if (p < 0) { return null; } // process chars for (; p >= 0; p = next()) { char ch = (char) p; if (ch == '"') { // start of double quote field eol[0] = false; list.add(doubleQuote(eol)); if (eol[0]) { // handle if field was at end of line break; } } else if (ch == sep) { // empty field list.add(""); } else if (ch == '\n') { // eol, we done with this record break; } else { // unquoted field, throw the whitespace in, it counts. whiteSpace.append(ch); eol[0] = false; list.add(noQuote(whiteSpace, eol)); if (eol[0]) { // handle if field was at end of line break; } whiteSpace.setLength(0); } } return list.toArray(new String[0]); } private String noQuote(StringBuilder sb, boolean[] eol) throws IOException { // grab a single unquoted value. for (int p = next(); p >= 0; p = next()) { char ch = (char) p; if (ch == sep) { // sep we done break; } else if (ch == '\n') { // eol we done eol[0] = true; break; } else if (ch == '"') { // gulp throw new IllegalStateException("Double quote char detected in unquoted field"); } sb.append(ch); } return sb.toString(); } private String doubleQuote(boolean[] eol) throws IOException { // process double quoted field StringBuilder sb = new StringBuilder(); for (int p = next(); p >= 0; p = next()) { char ch = (char) p; if (ch != '"') { // easy case sb.append(ch); } else { // check next char for eof p = next(); if (p < 0) { throw new IllegalStateException("EOL in double quoted field encountered."); } // check for second consecutive double quote ch = (char) p; if (ch == '"') { sb.append('"'); } else { // eat whitespace and next comma, or eol for (; p >= 0; p = next()) { ch = (char) p; if (ch == sep) { break; } else if (ch == '\n') { eol[0] = true; break; } else if (!Character.isWhitespace(ch)) { throw new IllegalStateException("Non-white space character detected after double quote!"); } } return sb.toString(); } } } return sb.toString(); } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy