// com.hfg.util.io.DelimitedTextParser Maven / Gradle / Ivy
// Go to download
// Show more of this group Show more artifacts with this name
// Show all versions of com_hfg Show documentation
// Show all versions of com_hfg Show documentation
// com.hfg xml, html, svg, and bioinformatics utility library
package com.hfg.util.io;
import com.hfg.exception.DataParsingException;
import com.hfg.util.StringUtil;
import com.hfg.util.collection.DataColumn;
import com.hfg.util.collection.DataTable;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
//------------------------------------------------------------------------------
/**
Base class for CSV (comma-separated value) and TSV (tab-separated value) parsers.
@author J. Alex Taylor, hairyfatguy.com
*/
//------------------------------------------------------------------------------
// com.hfg Library
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
// J. Alex Taylor, President, Founder, CEO, COO, CFO, OOPS hairyfatguy.com
// [email protected]
//------------------------------------------------------------------------------
public class DelimitedTextParser
{
   // The character that separates fields on a line. Fixed at construction.
   private final char mDelimiter;

   //###########################################################################
   // CONSTRUCTORS
   //###########################################################################

   //---------------------------------------------------------------------------
   /**
    Creates a parser for lines whose fields are separated by the specified character.
    @param inDelimiter the field separator character
    */
   public DelimitedTextParser(char inDelimiter)
   {
      mDelimiter = inDelimiter;
   }

   //###########################################################################
   // PUBLIC METHODS
   //###########################################################################

   //---------------------------------------------------------------------------
   /**
    Prepares a value for output as a single field of a delimited line.
    A value accepted by StringUtil.isSet() that contains a double quote or the
    delimiter is wrapped in double quotes, with every embedded double quote doubled.
    Any other value is returned unchanged.
    @param inField the raw field value
    @return the field value, quoted and escaped if necessary
    */
   public String escapeField(String inField)
   {
      String result = inField;
      if (StringUtil.isSet(result)
          && (result.indexOf('"') >= 0 || result.indexOf(mDelimiter) >= 0))
      {
         // A literal replacement is all that is needed here; no regex involved.
         result = "\"" + result.replace("\"", "\"\"") + "\"";
      }

      return result;
   }

   //---------------------------------------------------------------------------
   /**
    Parses delimited text into a DataTable.
    Blank lines before the header are skipped. The first non-blank line supplies
    the column names (trimmed). Every subsequent line, blank or not, becomes a row
    keyed by its 1-based position after the header (as a string). A row may have
    fewer fields than there are columns but not more. Values that StringUtil.isNumber()
    accepts are stored as Double, Long, or Integer; everything else is stored as a
    trimmed String. Empty fields are stored as null.
    @param inReader the source of the delimited text; it is consumed via parse()
    @return the populated DataTable
    @throws IOException if reading from inReader fails
    @throws DataParsingException if a header field is empty or if a row has more
            fields than the header
    */
   public DataTable parseToDataTable(Reader inReader)
      throws IOException
   {
      List<String[]> lines = parse(inReader);

      DataTable dataTable = new DataTable();
      Map<Integer, DataColumn> colMap = new HashMap<>(10);
      boolean headerParsed = false;
      int rowIndex = 0;
      for (String[] fields : lines)
      {
         if (! headerParsed)
         {
            // Skip blank lines preceding the header
            if (1 == fields.length
                && ! StringUtil.isSet(fields[0]))
            {
               continue;
            }

            for (int i = 0; i < fields.length; i++)
            {
               // parseLine() reports an empty field as null. Fail with a useful
               // message instead of a NullPointerException from trim().
               if (null == fields[i])
               {
                  throw new DataParsingException("Header column " + (i + 1) + " has no name!");
               }

               colMap.put(i, new DataColumn(fields[i].trim()));
            }

            headerParsed = true;
         }
         else
         {
            rowIndex++;

            if (fields.length > colMap.size())
            {
               throw new DataParsingException("Row " + rowIndex + " has more fields (" + fields.length + ") than the number of columns (" + colMap.size() + ")!");
            }

            for (int i = 0; i < fields.length; i++)
            {
               dataTable.put(rowIndex + "", colMap.get(i), convertField(fields[i]));
            }
         }
      }

      return dataTable;
   }

   //---------------------------------------------------------------------------
   /**
    Reads every line from the specified reader and splits each one into fields
    with parseLine().
    The reader (wrapped in a BufferedReader if it is not one already) is handed to
    StreamUtil.close() once reading ends, whether or not it ended normally.
    @param inReader the source of the delimited text
    @return one String[] of fields per line read, in the order read
    @throws IOException if reading a line fails
    */
   public List<String[]> parse(Reader inReader)
      throws IOException
   {
      List<String[]> parsedLines = new ArrayList<>();

      BufferedReader bufferedReader = null;
      try
      {
         if (inReader instanceof BufferedReader)
         {
            bufferedReader = (BufferedReader) inReader;
         }
         else
         {
            bufferedReader = new BufferedReader(inReader);
         }

         String line;
         while ((line = bufferedReader.readLine()) != null)
         {
            parsedLines.add(parseLine(line));
         }
      }
      finally
      {
         StreamUtil.close(bufferedReader);
      }

      return parsedLines;
   }

   //---------------------------------------------------------------------------
   /**
    Splits a single line into its fields.
    <ul>
      <li>Whitespace preceding a value is skipped and every completed value is trimmed.</li>
      <li>A field with no characters is returned as null.</li>
      <li>A single or double quote at the start of a field opens a quoted value in which
          the delimiter is treated as ordinary text. The same quote character closes it.</li>
      <li>Inside a quoted value, a doubled quote character yields one quote character.</li>
      <li>Inside a quoted value, a quote character preceded by a backslash does not close
          the value. When the value is closed, backslash-quote sequences are passed through
          StringUtil.replaceAll() to be replaced by the bare quote character.</li>
      <li>If the line ends inside a quoted value, the text collected so far is returned as
          the last field and no error is raised.</li>
    </ul>
    @param inLine the line to split; must not be null
    @return the fields of the line; always contains at least one element
    @throws IOException as declared by the signature
    */
   public String[] parseLine(String inLine)
      throws IOException
   {
      List<String> fields = new ArrayList<>();

      boolean inQuotedValue = false;
      // Counts consecutive quote characters seen inside a quoted value so that
      // the second quote of a doubled pair can be dropped.
      int quoteCount = 0;
      char currentQuoteChar = ' ';
      StringBuilder currentValue = new StringBuilder();

      int index = 0;
      while (index < inLine.length())
      {
         char theChar = inLine.charAt(index);

         if (inQuotedValue)
         {
            if (theChar == currentQuoteChar)
            {
               quoteCount++;
               if (2 == quoteCount)
               {
                  // Second quote of a doubled pair; the first was already appended.
                  quoteCount = 0;
               }
               else if ((index == inLine.length() - 1 || inLine.charAt(index + 1) != currentQuoteChar)
                        && (0 == currentValue.length() || currentValue.charAt(currentValue.length() - 1) != '\\'))
               {
                  // A lone quote that is not backslash-escaped closes the quoted value.
                  inQuotedValue = false;
                  String unescapedValue = StringUtil.replaceAll(currentValue, "\\" + currentQuoteChar, currentQuoteChar + "");
                  currentValue.setLength(0);
                  currentValue.append(unescapedValue);
               }
               else
               {
                  // First quote of a doubled pair or a backslash-escaped quote.
                  currentValue.append(theChar);
               }
            }
            else
            {
               currentValue.append(theChar);
               quoteCount = 0;
            }
         }
         else if (theChar == mDelimiter)
         {
            fields.add(finishField(currentValue));
            currentValue.setLength(0);
         }
         else if (Character.isWhitespace(theChar)
                  && 0 == currentValue.length())
         {
            // Skip whitespace between the delimiter and the value
         }
         else if ((theChar == '\''
                   || theChar == '\"')
                  && 0 == currentValue.length())
         {
            // Start of a quoted value
            inQuotedValue = true;
            quoteCount = 0;
            currentQuoteChar = theChar;
         }
         else
         {
            currentValue.append(theChar);
         }

         index++;
      }

      // Whatever remains after the last delimiter is the final field.
      fields.add(finishField(currentValue));

      return fields.toArray(new String[0]);
   }

   //###########################################################################
   // PRIVATE METHODS
   //###########################################################################

   //---------------------------------------------------------------------------
   // Returns the trimmed content of the buffer or null if the buffer is empty.
   private static String finishField(StringBuilder inValue)
   {
      return (inValue.length() > 0 ? inValue.toString().trim() : null);
   }

   //---------------------------------------------------------------------------
   // Converts a raw field into the value stored in the DataTable: null stays null,
   // values accepted by StringUtil.isNumber() become Double (if they contain '.'),
   // Long (if longer than 9 characters), or Integer. Anything else, including a
   // value that fails numeric parsing, is kept as the trimmed String.
   private static Comparable convertField(String inFieldString)
   {
      if (null == inFieldString)
      {
         return null;
      }

      String trimmed = inFieldString.trim();
      if (! StringUtil.isNumber(trimmed))
      {
         return trimmed;
      }

      try
      {
         if (trimmed.contains("."))
         {
            return Double.valueOf(trimmed);
         }
         else if (trimmed.length() > 9)
         {
            // Too many characters to be sure the value fits in an int
            return Long.valueOf(trimmed);
         }
         else
         {
            return Integer.valueOf(trimmed);
         }
      }
      catch (NumberFormatException e)
      {
         // isNumber() accepted a form the JDK parsers do not; keep the text.
         return trimmed;
      }
   }
}