// io.continual.util.data.csv.CsvCallbackReader (Maven / Gradle / Ivy)
/*
* Copyright 2019, Continual.io
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.continual.util.data.csv;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
public class CsvCallbackReader
{
// String keys exposed by this class.
// NOTE(review): none of these constants is referenced in the visible part of
// this file; presumably they name configuration settings used by callers --
// confirm against the call sites.
// Note that kLineField and kLineField_Default currently hold the same value ("line").
public static final String kLineField = "line";
public static final String kLineField_Default = "line";
public static final String kHasHeaderRow = "header";
public static final String kDelimiter = "delimiter";
public static final String kQuote = "quote";
public static final String kPassThru = "passthru";
/**
 * Callback invoked once per parsed CSV record.
 *
 * @param <E> the exception type the handler may throw; it propagates
 * unchanged out of the read call that invoked the handler
 */
public interface RecordHandler<E extends Exception>
{
	/**
	 * Handle one CSV record.
	 * @param fields the field values for the record, keyed by field name
	 * @return true to continue reading, false to stop
	 * @throws E if the handler cannot process the record
	 */
	boolean handler ( Map<String, String> fields ) throws E;
}
/**
 * Construct a reader that uses a double quote (") as the quote character and
 * a comma as the field separator.
 * @param header true if the input starts with a header row naming the columns
 */
public CsvCallbackReader ( boolean header )
{
this ( '"', ',', header );
}
/**
 * Construct a reader with explicit quote and separator characters. The reader
 * starts with no known column names and a line count of zero.
 * @param quoteChar the character that opens and closes a quoted field
 * @param fieldSepChar the character that separates fields on a line
 * @param header true if the input starts with a header row naming the columns
 */
public CsvCallbackReader ( char quoteChar, char fieldSepChar, boolean header )
{
	// parsing configuration
	fQuote = quoteChar;
	fDelimiter = fieldSepChar;
	fHasHeaderRow = header;

	// parsing state
	fLineCount = 0;
	fColumns = null;
}
/**
 * Reset the line counter to zero. When the reader was constructed with a
 * header row, a zero count makes the next read parse a header line again
 * (see readNextLine). Column names from an earlier header are not cleared here.
 */
public void reset ()
{
fLineCount = 0;
}
/**
 * Read CSV records from a byte stream, decoding it as UTF-8, and pass each
 * record to the handler.
 * @param <E> the exception type the handler may throw
 * @param is the stream to read; must not be null
 * @param rh the handler invoked for each record; returning false stops the read
 * @throws IOException if the stream is null or cannot be read
 * @throws E if the handler throws
 */
public <E extends Exception> void read ( InputStream is, RecordHandler<E> rh ) throws IOException, E
{
	if ( is == null )
	{
		throw new IOException ( "No CSV stream provided" );
	}
	read ( new InputStreamReader ( is, StandardCharsets.UTF_8 ), rh );
}
/**
 * Read CSV records from a reader and pass each one to the handler until the
 * input is exhausted or the handler returns false.
 *
 * The reader is closed only when the end of the input is reached. If the
 * handler stops the read early (or throws), the reader is left open and the
 * caller remains responsible for it.
 *
 * @param <E> the exception type the handler may throw
 * @param isr the reader to pull characters from
 * @param rh the handler invoked for each record; returning false stops the read
 * @throws IOException if the reader cannot be read or closed
 * @throws E if the handler throws
 */
public <E extends Exception> void read ( InputStreamReader isr, RecordHandler<E> rh ) throws IOException, E
{
	Map<String, String> record = readNextLine ( isr );
	while ( record != null )
	{
		if ( !rh.handler ( record ) )
		{
			// the handler asked to stop; leave the reader open for the caller
			return;
		}
		record = readNextLine ( isr );
	}

	// no more records: the input is exhausted
	isr.close ();
}
/**
 * Get the column names taken from the header row.
 * @return a new list holding the column names in header order; empty when no
 * header has been parsed. Changes to the returned list do not affect the reader.
 */
public List<String> getColumnNames ()
{
	final LinkedList<String> result = new LinkedList<String> ();
	if ( fColumns != null )
	{
		for ( String c : fColumns )
		{
			result.add ( c );
		}
	}
	return result;
}
/**
 * Get the current value of the reader's line counter.
 * NOTE(review): the code that increments the counter is outside the visible
 * part of this file -- confirm whether header and skipped lines are counted.
 * @return the line count since construction or the last reset()
 */
public int getLinesParsed ()
{
return fLineCount;
}
/**
 * Report whether this reader was configured to expect a header row.
 * @return the header flag given to the constructor
 */
public boolean hasHeader ()
{
return fHasHeaderRow;
}
// line counter; zeroed by the constructor and by reset()
private int fLineCount;
// character that separates fields on a line
private final char fDelimiter;
// character that opens and closes a quoted field
private final char fQuote;
// column names from the header row; null until a header has been parsed
private String[] fColumns;
// true when the input is expected to start with a header row
private boolean fHasHeaderRow;
// output of the most recent readTerm() call: the field text, or null when
// the call found no field (immediate line ending or end of stream)
String fLastToken;
// output of the most recent readTerm() call: true when that call consumed
// the end of the current line (or hit the end of the stream)
boolean fLastOnLine;
// true when the most recent readTerm() call ran into the end of the stream.
// readTerm() reports end-of-stream the same way as a blank line (null token,
// last-on-line), so the header scan needs this flag to know when to give up.
private boolean fAtEof = false;

/**
 * Read a single field from the stream. The result is left in fLastToken
 * (null when no field was found) and fLastOnLine (true when the line ending
 * or the end of the stream was consumed).
 *
 * A field that starts with the quote character is read in quoted mode: the
 * opening quote is dropped, a doubled quote yields one literal quote, and
 * delimiters and newlines are kept as text until a quote is followed by a
 * delimiter or line ending. Any other characters that follow the closing
 * quote are appended to the field. In an unquoted field, quote characters
 * are kept as ordinary text.
 *
 * @param is the reader to pull characters from
 * @throws IOException if the underlying reader fails
 */
protected void readTerm ( InputStreamReader is ) throws IOException
{
	fLastOnLine = false;
	fLastToken = null;
	fAtEof = false;

	final StringBuilder sb = new StringBuilder ();

	int current = is.read ();
	if ( current < 0 )
	{
		// nothing left at all: no token, and the "line" is over
		fAtEof = true;
		fLastOnLine = true;
		return;
	}
	if ( isLineEnding ( current ) )
	{
		// the line ends right here: no token
		fLastOnLine = true;
		return;
	}
	if ( current == fDelimiter )
	{
		// an empty field followed by more fields
		fLastToken = "";
		return;
	}

	final boolean quoted = ( current == fQuote );
	if ( !quoted )
	{
		sb.append ( (char) current );
	}
	// else: ignore the opening quote

	// now read until the terminal character is seen. note that
	// the quote character is escaped by doubling it.
	boolean terminated = false;
	boolean lastWasQuote = false;
	while ( !terminated )
	{
		current = is.read ();
		if ( current < 0 )
		{
			// end of stream. return what we had.
			fAtEof = true;
			fLastToken = sb.toString ();
			fLastOnLine = true;
			return;
		}

		if ( current == fQuote )
		{
			if ( quoted )
			{
				if ( lastWasQuote )
				{
					// doubled quote: emit one literal quote
					sb.append ( (char) current );
					lastWasQuote = false;
				}
				else
				{
					// either the closing quote or the first half of a doubled quote
					lastWasQuote = true;
				}
			}
			else
			{
				// quotes inside an unquoted field are plain text
				sb.append ( (char) current );
			}
		}
		else if ( current == fDelimiter || isLineEnding ( current ) )
		{
			if ( quoted )
			{
				if ( lastWasQuote )
				{
					// the end
					terminated = true;
					fLastOnLine = isLineEnding ( current );
				}
				else
				{
					// just a delim/newline in the middle
					sb.append ( (char) current );
				}
			}
			else
			{
				// when not quoting, this ends the term
				terminated = true;
				fLastOnLine = isLineEnding ( current );
			}
		}
		else
		{
			sb.append ( (char) current );
		}
	}
	fLastToken = sb.toString ();
}

/**
 * Test whether a character ends a line.
 * @param c the character code returned by the reader
 * @return true only for '\n'
 */
private static boolean isLineEnding ( int c )
{
	// FIXME: only '\n' is recognized. For input saved with "\r\n" line endings
	// the '\r' is treated as ordinary field text and stays in the last field
	// of each line unless the caller trims it.
	return c == '\n';
}

/**
 * Read all remaining fields on the current line.
 * @param is the reader to pull characters from
 * @param withTrim true to trim leading and trailing whitespace from each field
 * @return the field values in order; empty for a blank line or at end of stream
 * @throws IOException if the underlying reader fails
 */
private List<String> readLineValues ( InputStreamReader is, boolean withTrim ) throws IOException
{
	final LinkedList<String> result = new LinkedList<String> ();
	do
	{
		readTerm ( is );
		if ( fLastToken != null )
		{
			result.add ( withTrim ? fLastToken.trim () : fLastToken );
		}
	}
	while ( !fLastOnLine );
	return result;
}

/**
 * Read the next record, parsing the header first when one is expected and
 * the line counter is still zero.
 * @param is the reader to pull characters from
 * @return the record produced by parseLine
 * @throws IOException if the underlying reader fails
 */
private HashMap<String, String> readNextLine ( InputStreamReader is ) throws IOException
{
	if ( fHasHeaderRow && fLineCount == 0 )
	{
		// read the header line
		parseHeader ( is );
	}
	return parseLine ( is );
}

/**
 * Read the header row into fColumns, skipping empty lines and lines whose
 * first field starts with "#". Header fields are trimmed. If the stream ends
 * before a header line is found, the column list is set to empty rather than
 * scanning forever.
 * @param is the reader to pull characters from
 * @throws IOException if the underlying reader fails
 */
private void parseHeader ( InputStreamReader is ) throws IOException
{
	List<String> headers = readLineValues ( is, true );

	// skip empty lines or those that start with "#", but stop at end of
	// stream: past that point every read yields another empty line.
	while ( !fAtEof && isSkippableHeaderLine ( headers ) )
	{
		headers = readLineValues ( is, true );
	}

	if ( isSkippableHeaderLine ( headers ) )
	{
		// the stream ended without a usable header line
		fColumns = new String [0];
		return;
	}
	fColumns = headers.toArray ( new String [headers.size ()] );
}

// true for a line with no fields or one whose first field starts with "#"
private static boolean isSkippableHeaderLine ( List<String> values )
{
	return values.size () == 0 || values.iterator ().next ().startsWith ( "#" );
}
/**
* Read the next line from the CSV, providing header values into a map.
* @param is
* @return null if the EOF is reached
* @throws IOException
*/
private HashMap parseLine ( InputStreamReader is ) throws IOException
{
final HashMap result = new HashMap ();
int colNum = 0;
// a line that ends in an empty field can get parsed incorrectly (e.g. "foo,")
// because the "remainder" after the first field is empty, looking just like
// the usual end case. so, if we have a set of columns, set each value to the
// empty string ahead of time.
if ( fColumns != null )
{
for ( String col : fColumns )
{
result.put ( col, "" );
}
}
boolean didOne = false;
do
{
readTerm ( is );
if ( fLastToken != null )
{
didOne = true;
final String name = ( fColumns != null && colNum