package fr.boreal.io.csv;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.List;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import fr.boreal.io.csv.encoding.EncodedRLS;
import fr.boreal.io.csv.encoding.RLSEncoder;
import fr.boreal.model.kb.api.CSVCopyable;
import fr.boreal.model.kb.api.FactBase;
import fr.boreal.model.logicalElements.api.Atom;
import fr.boreal.model.logicalElements.api.Predicate;
import fr.boreal.model.logicalElements.api.Term;
import fr.boreal.model.logicalElements.impl.AtomImpl;
import fr.boreal.model.logicalElements.impl.PredicateImpl;
import fr.boreal.model.logicalElements.impl.VariableImpl;
import fr.boreal.storage.external.rdbms.RDBMSStore;
import fr.boreal.storage.external.rdbms.driver.HSQLDBDriver;
/**
* Loader for CSV files referenced by an RLS configuration file.
*/
public class CSVLoader {
static final Logger LOG = LoggerFactory.getLogger(CSVLoader.class);
/**
* Loads atoms from CSV files into a given factbase.
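*
* <p>Usage sketch (illustrative only; {@code factBase} stands for any available
* {@link FactBase} implementation and {@code "data.rls"} for an existing RLS file):</p>
* <pre>{@code
* FactBase factBase = ...; // any FactBase implementation
* FactBase loaded = CSVLoader.parseAndLoad(factBase, new File("data.rls"), false);
* }</pre>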
*
* @param f factbase to load into
* @param rlsFile RLS configuration file to parse
* @param encode true if the CSV data must be encoded before loading
* @return the factbase with loaded atoms
*/
public static FactBase parseAndLoad(FactBase f, File rlsFile, boolean encode) {
return parseAndLoad(f, rlsFile, CSVConstants.CSVSEPARATOR, CSVConstants.CSVPREFIX, CSVConstants.CSVHEADERSIZE,
encode);
}
/**
* Loads atoms from CSV files into a given factbase.
*
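* <p>Sketch of a call with explicit CSV parameters (all values are illustrative only;
* {@code factBase} is any available {@link FactBase} implementation):</p>
* <pre>{@code
* FactBase loaded = CSVLoader.parseAndLoad(factBase, new File("data.rls"), ';', "ex:", 1, true);
* }</pre>
*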
* @param factbase factbase to load with parsed files
* @param rlsFile input RLS file
* @param separator column separator used in the CSV files
* @param prefix prefix used for the parsed CSV values
* @param headerSize number of header lines in the CSV files
* @param encode_flag true if the data must be encoded before loading
* @return the factbase with loaded atoms
*/
public static FactBase parseAndLoad(FactBase factbase, File rlsFile, char separator, String prefix, int headerSize,
Boolean encode_flag) {
return switch (encode_flag) {
// Encoding requested and the storage supports copy loading: encode, then bulk-load
case Boolean encode when encode && canDoCopyLoadingOn(factbase) ->
parseEncodeAndLoad(factbase, rlsFile, separator, prefix, headerSize);
// No encoding requested but copy loading is available: bulk-load the CSV files directly
case Boolean encode when !encode && canDoCopyLoadingOn(factbase) ->
copyLoadingRLS((CSVCopyable) factbase, rlsFile, separator, prefix, headerSize);
// Storage without copy loading support: parse the files and add the resulting atoms
case Boolean ignored when !canDoCopyLoadingOn(factbase) -> standardLoading(factbase, rlsFile, false);
default -> throw new IllegalArgumentException("Should not happen.");
};
}
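/**
* Encodes the CSV data referenced by the RLS file, bulk-loads the encoded data and the
* encoding dictionary into the underlying RDBMS, then repairs terms that received more
* than one encoding by executing SQL update and delete queries.
* The given factbase is expected to be an {@link RDBMSStore} supporting copy loading.
*/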
private static FactBase parseEncodeAndLoad(FactBase factbase, File rlsFile, char separator, String prefix,
int headerSize) {
RDBMSStore c = (RDBMSStore) factbase;
LOG.info("Encode CSV data");
RLSEncoder optimistic = new RLSEncoder(separator, prefix, headerSize);
EncodedRLS encoded_csv = optimistic.encode(rlsFile.getAbsolutePath());
LOG.debug("Load encoded CSV data directly into the DBMS");
LOG.debug("factbase size before loading encoded data: {} \t {}", c.size(), c.dictionarySize());
CSVLoader.parseAndLoad(factbase, new File(encoded_csv.rlsFile()), false);
LOG.debug("factbase size after loading encoded data: {} \t {}", c.size(), c.dictionarySize());
LOG.debug("Load encoding dictionary directly into the DBMS");
String storeDictionaryTable = c.getStrategy().get_terms_table_name();
if (!canDoCopyLoadingOn(factbase)) {
try (var parser = new CSVParser(storeDictionaryTable, 3, new File(encoded_csv.dictionaryFile()))) {
factbase.addAll(parser.parse().atoms());
}
} else {
try {
c.copy(encoded_csv.dictionaryFile(), ',', 0, new AtomImpl(
// We can bypass all factories and safeties here as we only care about the
// predicate label and arity.
// The full Atom is only used to respect the method signature
new PredicateImpl(storeDictionaryTable, 3), new VariableImpl("X"), new VariableImpl("Y"),
new VariableImpl("Z")));
} catch (SQLException e) {
LOG.error("[CSVLoader] Error while copying encoded dictionary into RDBMS", e);
}
}
LOG.debug("factbase size after loading dictionary : {} \t {}.\nRepairing data.", c.size(), c.dictionarySize());
File sortedRepare = new File(encoded_csv.repareFile() + "_sorted.csv");
try {
LOG.info("Sort the repair file using a system call");
// Relies on the external 'sort' utility being available on the system
ProcessBuilder pb = new ProcessBuilder("sort", encoded_csv.repareFile()).redirectOutput(sortedRepare);
Process p = pb.start();
p.waitFor();
LOG.info("Convert the repair file to SQL queries");
File repareSQL = CSVLoader.sortedRepareToSQLQueries(sortedRepare, c);
LOG.info("Execute the SQL repair queries");
c.getEvaluator().execute(repareSQL.getAbsolutePath());
} catch (Exception e) {
LOG.error("[CSVLoader] Error while repairing the encoding", e);
return factbase;
}
LOG.debug("factbase size after repairing: {} \t dictionary size: {}", c.size(), c.dictionarySize());
optimistic.deleteAllTempFiles();
return factbase;
}
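/**
* Fallback strategy: parses every CSV file referenced by the RLS file and adds the
* resulting atoms to the factbase, without using any DBMS copy facility.
*/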
private static FactBase standardLoading(FactBase factbase, File rlsFile, boolean encode) {
LOG.debug("Cannot do copy loading for this storage. Using standard loading strategy. ");
try (RLSCSVsParser parser = new RLSCSVsParser(rlsFile.getAbsolutePath(), encode)) {
factbase.addAll(parser.parse().atoms());
return factbase;
}
}
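/**
* Copy-loads each CSV file referenced by the RLS file directly into the storage.
* Files for which copy loading fails are parsed and added with the standard strategy instead.
*/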
private static FactBase copyLoadingRLS(CSVCopyable factbase, File rlsFile, char separator, String prefix,
int headerSize) {
List<RLSCSVResult> loading_failures = new ArrayList<>();
try (RLSCSVParser rlsParser = new RLSCSVParser(rlsFile)) {
while (rlsParser.hasNext()) {
RLSCSVResult rls = rlsParser.next();
boolean loaded = copyLoadCSV(factbase, rls, separator, prefix, headerSize);
if (!loaded) {
loading_failures.add(rls);
}
}
}
for (RLSCSVResult r : loading_failures) {
LOG.info("Cannot do copy loading for {}. Using standard strategy.", r.csvFilepath());
try (var parser = new CSVParser(r.predicateName(), r.predicateArity(), new File(r.csvFilepath()),
separator, prefix, headerSize)) {
factbase.addAll(parser.parse().atoms());
}
}
return (FactBase) factbase;
}
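/**
* Tries to copy-load a single CSV file into the storage. A placeholder atom built from
* the predicate name and arity is used to describe the target relation.
*
* @return true if the copy succeeded, false otherwise
*/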
private static boolean copyLoadCSV(CSVCopyable factbase, RLSCSVResult rls, char separator, String prefix, int headerSize) {
List<Term> atom_terms = new ArrayList<>();
for (int i = 0; i < rls.predicateArity(); i++) {
atom_terms.add(new VariableImpl("X_" + i));
}
Predicate p = new PredicateImpl(rls.predicateName(), rls.predicateArity());
Atom a = new AtomImpl(p, atom_terms);
boolean loaded = false;
try {
loaded = factbase.copy(rls.csvFilepath(), separator, headerSize, a);
} catch (Exception e) {
LOG.error("Could not copy-load CSV file {}", rls.csvFilepath(), e);
}
return loaded;
}
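/**
* Copy loading is only available for {@link CSVCopyable} {@link RDBMSStore} instances
* that are not backed by an HSQLDB driver.
*/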
private static boolean canDoCopyLoadingOn(FactBase f) {
return (f instanceof CSVCopyable) && (f instanceof RDBMSStore s && !(s.getDriver() instanceof HSQLDBDriver));
}
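// Illustrative shape of the repair step below (all concrete values are hypothetical).
// Each line of the sorted repair file is expected to look like
//   <id>,<term>,<encoding>,<predicate>,<position>,<arity>
// When two consecutive lines carry the same term with different encodings, e.g.
//   12,alice,42,person,0,2
//   37,alice,57,person,0,2
// the second one is repaired with SQL of the form (table and column names are
// resolved through the store's strategy and are made up here):
//   UPDATE person SET term0 = '42' WHERE term0 = '57';
//   DELETE FROM terms WHERE encoding = '57';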
/**
* Generates a new file containing the SQL queries used to apply the needed repairs.
*
* @param repareFile initial repair file (assumed to be sorted)
* @param s the RDBMS store whose strategy provides the table and column names
* @return the SQL repair query file
* @throws IOException if an error occurs while reading or writing on disk
* @throws SQLException if an SQL error occurs
*/
private static File sortedRepareToSQLQueries(File repareFile, RDBMSStore s) throws IOException, SQLException {
int lastEncoding = -1;
String lastTerm = null;
File sqlRepareFile = new File(repareFile.getAbsolutePath() + "repare.sql");
// try-with-resources so that the reader and writer are closed even if an exception occurs
try (BufferedReader repareReader = new BufferedReader(new FileReader(repareFile));
BufferedWriter repareWriter = new BufferedWriter(new FileWriter(sqlRepareFile))) {
String csvLine;
while ((csvLine = repareReader.readLine()) != null) {
String[] line = csvLine.split(",");
String term = line[1];
int encoding = Integer.parseInt(line[2]);
String predicate = line[3];
int position = Integer.parseInt(line[4]);
int arity = Integer.parseInt(line[5]);
if (lastTerm == null || !lastTerm.equals(term)) {
// First occurrence of this term: keep its encoding as the canonical one
lastTerm = term;
lastEncoding = encoding;
} else {
// Same term as the previous line but with another encoding: repair it
List<Term> terms = new ArrayList<>();
for (int i = 0; i < arity; i++) {
terms.add(new VariableImpl("X" + i));
}
// Update data: replace the duplicate encoding with the canonical one
Atom witness = new AtomImpl(new PredicateImpl(predicate, arity), terms);
String tableName = s.getStrategy().getTableName(witness);
String updateQuery = "UPDATE " + tableName + " SET "
+ s.getStrategy().getColumnName(tableName, position) + " = '" + lastEncoding + "' WHERE "
+ s.getStrategy().getColumnName(tableName, position) + " = '" + encoding + "';";
repareWriter.write(updateQuery);
repareWriter.newLine();
// Update dictionary: remove the duplicate entry
tableName = s.getStrategy().get_terms_table_name();
String deleteQuery = "DELETE FROM " + tableName + " WHERE encoding = '" + encoding + "';";
repareWriter.write(deleteQuery);
repareWriter.newLine();
}
}
}
return sqlRepareFile;
}
}