
// net.anotheria.util.csv.CSVParser (Maven / Gradle / Ivy)
package net.anotheria.util.csv;
import net.anotheria.util.StringUtils;
import net.anotheria.util.datatable.DataCell;
import net.anotheria.util.datatable.DataHeader;
import net.anotheria.util.datatable.DataRow;
import net.anotheria.util.datatable.DataTable;
import java.util.ArrayList;
import java.util.List;
/**
 * Static helper that parses CSV text into a {@link net.anotheria.util.datatable.DataTable}.
 *
 * <p>Values are separated by a configurable character (default {@code ','}) and rows by a
 * configurable character (default {@code '\n'}). Values may be enclosed in double quotes;
 * inside a quoted value a double quote is written as two double quotes, and the row
 * separator may appear literally.</p>
 *
 * @author another
 * @version $Id: $Id
 */
public class CSVParser {

    /** Default separator between the values of one row: {@code ','}. */
    public static final char DEFAULT_VALUES_SEPARATOR = ',';

    /** Default separator between rows: {@code '\n'}. */
    public static final char DEFAULT_ROWS_SEPARATOR = '\n';

    /** The character used both to quote a value and to escape a quote inside a value. */
    private static final char QUOTE = '"';

    /**
     * Removes all carriage returns so that {@code "\r\n"} line ends become {@code "\n"}.
     *
     * @param csvSource raw CSV text
     * @return the text with every {@code '\r'} removed
     */
    private static String normalize(String csvSource) {
        return StringUtils.removeChar(csvSource, '\r');
    }

    /**
     * Parses CSV text that starts with a header row, using the default separators.
     *
     * @param csvSource a {@link java.lang.String} object.
     * @return a {@link net.anotheria.util.datatable.DataTable} object.
     */
    public static DataTable parse(String csvSource) {
        return parse(csvSource, true);
    }

    /**
     * Parses CSV text using the default separators.
     *
     * @param csvSource a {@link java.lang.String} object.
     * @param hasHeader {@code true} if the first row holds the column names.
     * @return a {@link net.anotheria.util.datatable.DataTable} object.
     */
    public static DataTable parse(String csvSource, boolean hasHeader) {
        // The delegate strips carriage returns itself, so no normalization is needed here.
        return parse(csvSource, DEFAULT_VALUES_SEPARATOR, hasHeader);
    }

    /**
     * Parses CSV text using a custom values separator and the default rows separator.
     * Carriage returns are removed from the input before parsing.
     *
     * @param csvSource a {@link java.lang.String} object.
     * @param valuesSeparator character that separates the values of one row.
     * @param hasHeader {@code true} if the first row holds the column names.
     * @return a {@link net.anotheria.util.datatable.DataTable} object.
     */
    public static DataTable parse(String csvSource, char valuesSeparator, boolean hasHeader) {
        return parse(normalize(csvSource), valuesSeparator, DEFAULT_ROWS_SEPARATOR, hasHeader);
    }

    /**
     * Parses CSV text using custom separators. Unlike the shorter overloads, this method
     * does not remove carriage returns from the input.
     *
     * @param csvSource a {@link java.lang.String} object.
     * @param valuesSeparator character that separates the values of one row.
     * @param rowsSeparator character that separates rows.
     * @param hasHeader {@code true} if the first row holds the column names.
     * @return a {@link net.anotheria.util.datatable.DataTable} object.
     * @throws RuntimeException if the source contains no rows or a row cannot be parsed.
     */
    public static DataTable parse(String csvSource, char valuesSeparator, char rowsSeparator, boolean hasHeader) {
        String[] rows = StringUtils.tokenize(csvSource, rowsSeparator);
        if (rows.length == 0) {
            throw new RuntimeException("No rows found!");
        }

        // Splitting on the row separator also cuts through quoted values that contain it;
        // glue those pieces back together before the rows are parsed.
        rows = normalizeEscapedNewlines(rows, rowsSeparator);

        DataTable ret = new DataTable(rows.length);
        int firstDataRow = 0;
        if (hasHeader) {
            ret.setHeader(toDataHeader(parseRow(rows[0], valuesSeparator, true)));
            firstDataRow = 1;
        }
        for (int i = firstDataRow; i < rows.length; i++) {
            ret.addRow(parseRow(rows[i], valuesSeparator, true));
        }
        return ret;
    }

    /**
     * Splits one row into its values and wraps each value into a {@link DataCell}.
     *
     * @param row text of a single row
     * @param valuesSeparator character that separates the values
     * @param unescape whether values containing a double quote should be unescaped
     * @return the parsed row
     * @throws RuntimeException if tokenizing or unescaping fails; the offending row is part of the message
     */
    private static DataRow parseRow(String row, char valuesSeparator, boolean unescape) {
        try {
            // Double quotes act as both the escape start and the escape end for the tokenizer.
            List<String> tokens = StringUtils._tokenize(row, QUOTE, QUOTE, false, valuesSeparator);
            DataRow ret = new DataRow();
            for (String token : tokens) {
                String value = token;
                if (unescape && value.indexOf(QUOTE) >= 0) {
                    value = unescape(value);
                }
                ret.addCell(new DataCell(value));
            }
            return ret;
        } catch (RuntimeException e) {
            throw new RuntimeException("Could not parse CSV Row: " + row, e);
        }
    }

    /**
     * Reverts CSV escaping of a single value: every pair of double quotes is collapsed into
     * one double quote and, if the result is still enclosed in double quotes, the enclosing
     * quotes are removed.
     *
     * <p>The escape symbol itself has to be escaped too. commons-lang's
     * {@code StringEscapeUtils.escapeCsv/unescapeCsv} would be an alternative, but then both
     * the surrounding quotes and the doubled quotes have to be handled by the same routine on
     * both sides, so the unescaping is done here by hand.</p>
     *
     * @param t raw value as returned by the tokenizer
     * @return the unescaped value
     */
    private static String unescape(String t) {
        // Collapse non-overlapping "" pairs from left to right, keeping one quote per pair.
        StringBuilder collapsed = new StringBuilder(t.length());
        int from = 0;
        int pair;
        while ((pair = t.indexOf("\"\"", from)) >= 0) {
            collapsed.append(t, from, pair + 1); // text before the pair plus its first quote
            from = pair + 2;                     // skip the second quote of the pair
        }
        collapsed.append(t, from, t.length());
        t = collapsed.toString();

        if (StringUtils.isSurroundedWith(t, QUOTE, QUOTE)
                && (t.indexOf(',') >= 0 || t.indexOf(QUOTE) >= 0 || t.indexOf(DEFAULT_ROWS_SEPARATOR) >= 0)) {
            t = StringUtils.removeSurround(t);
        }
        return t;
    }

    /**
     * Tells whether the text contains an odd number of double quotes.
     *
     * <p>A row separator inside a quoted value makes the tokenizer split that value over two
     * or more rows. Such a cut is detected by counting the quotes of a row: the count is odd
     * in the piece where the quoted value starts and in the piece where it ends.</p>
     *
     * @param text text to inspect
     * @return {@code true} if the number of double quotes in {@code text} is odd
     */
    private static boolean hasOddNumberOfQuotes(String text) {
        boolean odd = false;
        int pos = text.indexOf(QUOTE);
        while (pos >= 0) {
            odd = !odd;
            pos = text.indexOf(QUOTE, pos + 1);
        }
        return odd;
    }

    /**
     * Re-joins rows that were split inside a quoted value.
     *
     * <p>A row with an odd number of quotes opens a quoted value that continues in the
     * following rows; rows are appended, re-inserting the separator that the split removed,
     * up to and including the next row that also has an odd number of quotes.</p>
     *
     * @param rows rows as produced by splitting the source on {@code rowsSeparator}
     * @param rowsSeparator the separator the source was split on; it is put back between merged pieces
     * @return the rows with split quoted values merged back into single rows
     */
    private static String[] normalizeEscapedNewlines(String[] rows, char rowsSeparator) {
        List<String> result = new ArrayList<>(rows.length);
        int i = 0;
        while (i < rows.length) {
            String row = rows[i++];
            if (hasOddNumberOfQuotes(row)) {
                StringBuilder merged = new StringBuilder(row);
                boolean endFound = false;
                while (i < rows.length && !endFound) {
                    String next = rows[i++];
                    merged.append(rowsSeparator).append(next);
                    endFound = hasOddNumberOfQuotes(next);
                }
                row = merged.toString();
            }
            result.add(row);
        }
        return result.toArray(new String[0]);
    }

    /**
     * Builds a table header from the cells of the first row.
     *
     * @param headerRow cells holding the column names
     * @return header with one entry per cell, in iteration order
     */
    private static DataHeader toDataHeader(Iterable<DataCell> headerRow) {
        DataHeader ret = new DataHeader();
        for (DataCell cell : headerRow) {
            ret.addHeader(cell.getValueAsString());
        }
        return ret;
    }
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy