// fr.boreal.io.csv.CSVParser - Maven / Gradle / Ivy
// Show all versions of integraal-io - Show documentation
package fr.boreal.io.csv;
import java.io.File;
import java.io.FileReader;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Locale;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import com.opencsv.CSVParserBuilder;
import com.opencsv.CSVReader;
import com.opencsv.CSVReaderBuilder;
import fr.boreal.io.api.Parser;
import fr.boreal.io.dlgp.ParserResult;
import fr.boreal.model.logicalElements.api.Atom;
import fr.boreal.model.logicalElements.api.Predicate;
import fr.boreal.model.logicalElements.api.Term;
import fr.boreal.model.logicalElements.factory.api.PredicateFactory;
import fr.boreal.model.logicalElements.factory.api.TermFactory;
import fr.boreal.model.logicalElements.factory.impl.SameObjectPredicateFactory;
import fr.boreal.model.logicalElements.factory.impl.SameObjectTermFactory;
import fr.boreal.model.logicalElements.impl.AtomImpl;
import fr.lirmm.boreal.util.stream.ArrayBlockingStream;
/**
 * Parses a single CSV file into atoms.
 * <p>
 * Each line of the file represents one atom, and every atom of the file has
 * the same predicate, which is either given to the constructor or deduced
 * from the file name (prefix + lower-cased file name up to its first dot) and
 * the number of columns of the first data line.
 * <p>
 * Please note that all the terms are seen as constants.
 * <p>
 * The file is read by a background task started from the constructor; the
 * atoms it builds are handed over through an internal buffer and consumed
 * with {@link #hasNext()} / {@link #next()} or all at once with
 * {@link #parse()}.
 *
 * @author Florent Tornil
 */
public class CSVParser implements Parser, AutoCloseable {

    /** Capacity given to the buffer placed between the reading task and the consumer. */
    private static final int BUFFER_CAPACITY = 512;

    /**
     * Executor shared by every parser instance. It starts one virtual thread per
     * submitted task, so it is never shut down by an individual parser: doing so
     * would stop the other parsers and reject any parser created afterwards.
     */
    private static final ExecutorService executor = Executors.newVirtualThreadPerTaskExecutor();

    private final ArrayBlockingStream<Atom> buffer = new ArrayBlockingStream<>(BUFFER_CAPACITY);

    /** Handle on this instance's reading task, used to cancel it on {@link #close()}. */
    private final Future<?> producerTask;

    /**
     * Parses the given CSV file using default values
     * @param filePath path of the csv file to parse
     */
    public CSVParser(String filePath) {
        this(new File(filePath), CSVConstants.CSVSEPARATOR, CSVConstants.CSVPREFIX, CSVConstants.CSVHEADERSIZE);
    }

    /**
     * Parses the given CSV file using the given parsing arguments
     * Uses the filename as predicate label
     * @param file csv file to parse
     * @param separator csv separator
     * @param prefix (rdf) prefix prepended to the predicate label deduced from the file name
     * @param headerSize size of the csv header, i.e. number of leading lines to skip
     */
    public CSVParser(File file, char separator, String prefix, int headerSize) {
        this.producerTask = executor.submit(new Producer(file, this.buffer, separator, prefix, headerSize));
    }

    /**
     * Parses the given CSV file using the given predicate to create atoms
     * @param predicateName label of the predicate
     * @param arity arity of the predicate
     * @param file csv file to parse
     * @param separator csv separator
     * @param prefix (rdf) prefix
     * @param headerSize size of the csv header, i.e. number of leading lines to skip
     */
    public CSVParser(String predicateName, int arity, File file, char separator, String prefix, int headerSize) {
        // Same executor as the other constructors, so that close() can cancel the task.
        this.producerTask = executor.submit(
                new Producer(predicateName, arity, file, this.buffer, separator, prefix, headerSize));
    }

    /**
     * Parses the given CSV file using the given predicate to create atoms
     * and the default separator, prefix and header size
     * @param predicateName label of the predicate
     * @param arity arity of the predicate
     * @param file csv file to parse
     */
    public CSVParser(String predicateName, int arity, File file) {
        this(predicateName, arity, file,
                CSVConstants.CSVSEPARATOR, CSVConstants.CSVPREFIX, CSVConstants.CSVHEADERSIZE);
    }

    @Override
    public boolean hasNext() {
        return buffer.hasNext();
    }

    @Override
    public Atom next() {
        return buffer.next();
    }

    /**
     * Closes the buffer and cancels this parser's reading task if it is still
     * running. Other parser instances are not affected.
     */
    @Override
    public void close() {
        this.buffer.close();
        // Interrupts only this instance's producer; a no-op if it already finished.
        this.producerTask.cancel(true);
    }

    /**
     * Consumes every remaining atom of this parser.
     * @return a result holding the atoms read; its three other collections are empty
     */
    @Override
    public ParserResult parse() {
        Collection<Atom> atoms = new ArrayList<>();
        while (this.hasNext()) {
            atoms.add(this.next());
        }
        return new ParserResult(atoms, List.of(), List.of(), List.of());
    }

    //
    // Private class Producer
    //

    /**
     * Task reading the CSV file line by line and writing one atom per line
     * into the buffer shared with the enclosing parser.
     */
    static class Producer implements Runnable {

        private final PredicateFactory pf = SameObjectPredicateFactory.instance();
        private final TermFactory tf = SameObjectTermFactory.instance();

        private final File file;
        private final ArrayBlockingStream<Atom> buffer;
        private final char separator;
        private final String prefix;
        private final int headerSize;

        /** Predicate of every produced atom; deduced from the first data line when null. */
        private Predicate predicate = null;

        /**
         * Creates a producer deducing the predicate from the file name
         * @param file csv file to read
         * @param buffer buffer receiving the atoms
         * @param separator csv separator
         * @param prefix prefix prepended to the deduced predicate label
         * @param headerSize number of leading lines to skip
         */
        public Producer(File file, ArrayBlockingStream<Atom> buffer, char separator, String prefix, int headerSize) {
            this.file = file;
            this.buffer = buffer;
            this.separator = separator;
            this.prefix = prefix;
            this.headerSize = headerSize;
        }

        /**
         * Creates a producer using the given predicate for every atom
         * @param predicateName label of the predicate
         * @param arity arity of the predicate
         * @param file csv file to read
         * @param buffer buffer receiving the atoms
         * @param separator csv separator
         * @param prefix (rdf) prefix
         * @param headerSize number of leading lines to skip
         */
        public Producer(String predicateName, int arity, File file, ArrayBlockingStream<Atom> buffer, char separator,
                String prefix, int headerSize) {
            this(file, buffer, separator, prefix, headerSize);
            this.predicate = this.pf.createOrGetPredicate(predicateName, arity);
        }

        /**
         * Reads the whole file and writes one atom per data line into the buffer.
         * The buffer is always closed when this method returns, including when
         * reading fails, so the consuming side is not left waiting on a stream
         * that will never be completed.
         */
        @Override
        public void run() {
            try {
                com.opencsv.CSVParser csvParser = new CSVParserBuilder().withSeparator(this.separator).build();
                try (CSVReader csvReader = new CSVReaderBuilder(new FileReader(this.file))
                        .withCSVParser(csvParser) // custom CSV parser
                        .withSkipLines(this.headerSize) // skip the headerSize first lines, header info
                        .build()) {
                    String[] row;
                    while ((row = csvReader.readNext()) != null) {
                        this.buffer.write(this.toAtom(row));
                    }
                }
            } catch (Exception e) {
                if (e instanceof InterruptedException) {
                    // Expected when the parser is closed before the file is fully read.
                    Thread.currentThread().interrupt();
                } else {
                    e.printStackTrace();
                }
            } finally {
                // NOTE(review): relies on ArrayBlockingStream.close() tolerating repeated
                // calls, as the original code already did (producer + CSVParser.close()).
                this.buffer.close();
            }
        }

        /** Builds the atom of one CSV line, deducing the predicate first if it is not known yet. */
        private Atom toAtom(String[] row) {
            if (this.predicate == null) {
                // If the predicate isn't known, we deduce it from the csv filename and
                // first line size. Locale.ROOT keeps the label independent of the
                // default locale of the machine.
                int arity = row.length;
                String baseName = this.file.getName().split("\\.")[0].toLowerCase(Locale.ROOT);
                this.predicate = this.pf.createOrGetPredicate(this.prefix + baseName, arity);
            }
            List<Term> terms = new ArrayList<>(this.predicate.arity());
            for (String value : row) {
                Term t = this.tf.createOrGetConstant(value);
                terms.add(t);
            }
            return new AtomImpl(this.predicate, terms);
        }
    }
}