// it.uniroma2.art.sheet2rdf.sheet.CSVSheetManager Maven / Gradle / Ivy
package it.uniroma2.art.sheet2rdf.sheet;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVParser;
import org.apache.commons.csv.CSVRecord;
public class CSVSheetManager implements SheetManager {
/** Delimiters tried, in this order, when none of the predefined CSV formats fits the file. */
private static final char[] popularAlternativeDelimiters = {'|', ';'};
/**
 * All records parsed from the CSV file. The record at index 0 is used as the header row;
 * data rows start at index 1.
 */
private List<CSVRecord> records;
/**
 * Loads every record of the given CSV file, auto-detecting the CSV dialect.
 * <p>
 * The file is read with {@link FileReader}, i.e. with the platform default charset.
 * If an {@link IOException} occurs the stack trace is printed and the manager is left
 * with an empty record list, so that later calls do not fail on a null list.
 *
 * @param file the CSV file to load
 */
public CSVSheetManager(File file){
    try {
        CSVFormat format = detectFormat(file);
        Reader in = new FileReader(file);
        try {
            records = format.parse(in).getRecords();
        } finally {
            // getRecords() has consumed the whole input, so the reader can be released
            in.close();
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
    if (records == null) {
        // loading failed: fall back to an empty sheet instead of a null list
        records = new ArrayList<CSVRecord>();
    }
}

/**
 * Picks the first CSV format under which the file passes {@code testFormat}.
 * The predefined formats are tried first (MYSQL, RFC4180, TDF, EXCEL, DEFAULT), then
 * custom formats built on the popular alternative delimiters; when nothing matches,
 * {@link CSVFormat#DEFAULT} is used.
 *
 * @param file the CSV file to inspect
 * @return the format to parse the file with, never null
 * @throws IOException if the file cannot be read while testing a format
 */
private static CSVFormat detectFormat(File file) throws IOException {
    CSVFormat[] predefinedFormats = {
        CSVFormat.MYSQL, CSVFormat.RFC4180, CSVFormat.TDF, CSVFormat.EXCEL, CSVFormat.DEFAULT
    };
    for (CSVFormat candidate : predefinedFormats) {
        if (testFormat(file, candidate)) {
            return candidate;
        }
    }
    for (char delim : popularAlternativeDelimiters) {
        CSVFormat customFormat = CSVFormat.newFormat(delim)
                .withEscape('\\').withQuote('"').withRecordSeparator('\n');
        if (testFormat(file, customFormat)) {
            return customFormat;
        }
    }
    return CSVFormat.DEFAULT;
}
/**
 * Returns the header names, i.e. the values of the first record of the file.
 *
 * @param includeDuplicate if {@code true} the header of every column is returned, repeated
 *        names included; if {@code false} each distinct name is returned only once, at the
 *        position of its first occurrence
 * @return the headers in column order; an empty list when the file has no records
 */
@Override
public List<String> getHeaders(boolean includeDuplicate) {
    List<String> headers = new ArrayList<String>();
    Iterator<CSVRecord> it = records.iterator();
    if (it.hasNext()) {
        // only the first record carries the headers
        for (String header : it.next()) {
            if (includeDuplicate || !headers.contains(header)) {
                headers.add(header);
            }
        }
    }
    return headers;
}
@Override
public ArrayList> getDataTable() {
int rows = getDataSheetRowCount();
int columns = getDataSheetColumnCount();
ArrayList> table = new ArrayList>();
for (int r = 1; r < rows; r++){
ArrayList arrayListRow = new ArrayList();
for (int c = 0; c < columns; c++){
String data = getCellValueInDataSheet(r, c);
arrayListRow.add(data);
}
table.add(arrayListRow);
}
return table;
}
/**
 * Reads a single cell of the parsed file.
 *
 * @param row index of the record in the parsed file (the record at index 0 is the header row)
 * @param column index of the value inside that record
 * @return the value stored at the given position
 */
@Override
public String getCellValueInDataSheet(int row, int column) {
    CSVRecord rowRecord = records.get(row);
    return rowRecord.get(column);
}
/**
 * Returns the prefix-namespace mappings of the sheet. A CSV file has no prefix mapping
 * sheet, so the result is always a new, empty map.
 *
 * @return an empty map
 */
@Override
public Map getPrefixNamespaceMapping() {
    // NOTE(review): the type arguments of Map/HashMap are not visible here; confirm them
    // against the SheetManager interface before parameterizing.
    Map noMappings = new HashMap();
    return noMappings;
}
/**
 * Tells whether the given header spans more than one column, i.e. whether the column
 * right after its first occurrence carries the same header.
 *
 * @param headerValue the header to look for
 * @return {@code true} if the first column named {@code headerValue} is immediately
 *         followed by a column with the same header; {@code false} otherwise, also when
 *         the header is not present at all
 */
@Override
public boolean isMultipleHeader(String headerValue) {
    if (headerValue == null) {
        return false;
    }
    List<String> headers = getHeaders(true);
    // only the first occurrence is examined
    int first = headers.indexOf(headerValue);
    if (first < 0 || first + 1 == headers.size()) {
        return false; // header missing, or it sits in the last column
    }
    return headerValue.equals(headers.get(first + 1));
}
/**
 * Tells whether a prefix mapping sheet exists.
 *
 * @return always {@code false} in this implementation
 */
@Override
public boolean existsPrefixMappingSheet() {
    return false;
}
/**
 * Returns the number of columns of the data sheet, measured on the first record.
 *
 * @return the size of the first record, or 0 when the file has no records
 */
@Override
public int getDataSheetColumnCount() {
    if (records.isEmpty()) {
        // nothing to measure: avoid the IndexOutOfBoundsException of records.get(0)
        return 0;
    }
    return records.get(0).size();
}
/**
 * Returns the number of rows of the data sheet.
 *
 * @return the number of parsed records, the header record included
 */
@Override
public int getDataSheetRowCount() {
    final int recordCount = records.size();
    return recordCount;
}
/**
 * Returns the number of columns of the prefix mapping sheet.
 *
 * @return always 0 in this implementation
 */
@Override
public int getPrefixSheetColumnCount() {
    return 0;
}
/**
 * Returns the number of rows of the prefix mapping sheet.
 *
 * @return always 0 in this implementation
 */
@Override
public int getPrefixSheetRowCount() {
    return 0;
}
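/**
 * Checks whether the given file can be read consistently with the given CSV format,
 * i.e. whether the format has the correct delimiter and quotation settings for it.
 * The first row must have at least 2 columns and all the other rows must have the same
 * number of columns as the first one (only the first 50 rows are checked).
 *
 * @param file CSV file to be checked
 * @param format CSV format to be verified
 * @return true if the file is consistent with the format, false otherwise
 * @throws IOException if the file cannot be read
 */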
private static boolean testFormat(File file, CSVFormat format) throws IOException {
boolean valid = true;
Reader reader = new FileReader(file);
int MIN_COLUMNS = 2;
int MAX_ROWS = 50;
CSVParser parser = format.parse(reader);
List records = parser.getRecords();
// System.out.println("Rows: " + records.size());
if (records.size() == 0)
valid = false;
//get number of headers columns (min 2)
// System.out.println("Columns 1st row: " + records.get(0).size());
int headCols = records.get(0).size();
if (headCols < MIN_COLUMNS)
valid = false;
int checkRows = MAX_ROWS;
if (records.size() < checkRows)
checkRows = records.size();
//check if every row has the same columns of header (limit to first 50 rows if there are more than that)
for (int i=1; i