All Downloads are FREE. Search and download functionalities are using the official Maven repository.

fr.boreal.io.csv.CSVLoader Maven / Gradle / Ivy

The newest version!
package fr.boreal.io.csv;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.List;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import fr.boreal.io.csv.encoding.EncodedRLS;
import fr.boreal.io.csv.encoding.RLSEncoder;
import fr.boreal.model.kb.api.CSVCopyable;
import fr.boreal.model.kb.api.FactBase;
import fr.boreal.model.logicalElements.api.Atom;
import fr.boreal.model.logicalElements.api.Predicate;
import fr.boreal.model.logicalElements.api.Term;
import fr.boreal.model.logicalElements.impl.AtomImpl;
import fr.boreal.model.logicalElements.impl.PredicateImpl;
import fr.boreal.model.logicalElements.impl.VariableImpl;
import fr.boreal.storage.external.rdbms.RDBMSStore;
import fr.boreal.storage.external.rdbms.driver.HSQLDBDriver;

/**
 * Loader for CSV files.
 */
/**
 * Loader for CSV files.
 * <p>
 * Entry point is {@link #parseAndLoad(FactBase, File, boolean)}: it parses an RLS
 * configuration file and loads the referenced CSV data into the given factbase,
 * choosing between three strategies:
 * <ul>
 *   <li>encode-then-copy (RDBMS stores, {@code encode == true}),</li>
 *   <li>direct copy-loading (RDBMS stores, {@code encode == false}),</li>
 *   <li>standard atom-by-atom loading (everything else).</li>
 * </ul>
 */
public class CSVLoader {

	static final Logger LOG = LoggerFactory.getLogger(CSVLoader.class);

	/**
	 * Loads atoms from CSV files into a given factbase, using the default CSV
	 * separator, prefix and header size from {@link CSVConstants}.
	 *
	 * @param f       factbase
	 * @param rlsFile RLS configuration file to parse
	 * @param encode  a boolean true iff the CSV has to be encoded first
	 * @return the factbase with loaded atoms
	 */
	public static FactBase parseAndLoad(FactBase f, File rlsFile, boolean encode) {
		return parseAndLoad(f, rlsFile, CSVConstants.CSVSEPARATOR, CSVConstants.CSVPREFIX, CSVConstants.CSVHEADERSIZE,
				encode);
	}

	/**
	 * Loads atoms from CSV files into a given factbase.
	 *
	 * @param factbase    factbase to load with parsed files
	 * @param rlsFile     input RLS file
	 * @param separator   for CSV
	 * @param prefix      for CSV
	 * @param headerSize  for CSV
	 * @param encode_flag true iff the data must be encoded; must not be null
	 * @return the factbase with loaded atoms
	 * @throws NullPointerException if {@code encode_flag} is null
	 */
	public static FactBase parseAndLoad(FactBase factbase, File rlsFile, char separator, String prefix, int headerSize,
			Boolean encode_flag) {

		// Unboxing throws NPE on null, matching the original pattern-switch semantics.
		boolean encode = encode_flag;

		if (!canDoCopyLoadingOn(factbase)) {
			// Copy-loading is not supported by this storage: fall back to the
			// generic loading path (encoding is not applied in this case).
			return standardLoading(factbase, rlsFile, false);
		}

		return encode
				? parseEncodeAndLoad(factbase, rlsFile, separator, prefix, headerSize)
				: copyLoadingRLS((CSVCopyable) factbase, rlsFile, separator, prefix, headerSize);
	}

	/**
	 * Encodes the CSV data referenced by the RLS file, loads the encoded data and
	 * its dictionary into the RDBMS, then repairs encoding collisions via SQL.
	 * <p>
	 * Only called when {@link #canDoCopyLoadingOn(FactBase)} holds, so the cast to
	 * {@link RDBMSStore} is safe.
	 *
	 * @return the factbase (possibly partially loaded if the repair step failed)
	 */
	private static FactBase parseEncodeAndLoad(FactBase factbase, File rlsFile, char separator, String prefix,
			int headerSize) {

		RDBMSStore c = (RDBMSStore) factbase;

		LOG.info("Encode CSV data");
		RLSEncoder optimistic = new RLSEncoder(separator, prefix, headerSize);
		EncodedRLS encoded_csv = optimistic.encode(rlsFile.getAbsolutePath());

		LOG.debug("Load encoded CSV data directly into the DBMS");
		LOG.debug("factbase size before loading encoded data : {} \t {}", c.size(), c.dictionarySize());

		CSVLoader.parseAndLoad(factbase, new File(encoded_csv.rlsFile()), false);
		LOG.debug("factbase size after loading encoded data : {} \t {}", c.size(), c.dictionarySize());

		LOG.debug("Load encoding dictionary directly into the DBMS");
		String storeDictionaryTable = c.getStrategy().get_terms_table_name();

		// Defensive re-check: in the current call graph this method is only reached
		// when copy loading is possible, so the first branch should be dead code.
		if (!canDoCopyLoadingOn(factbase)) {

			try (var parser = new CSVParser(storeDictionaryTable, 3, new File(encoded_csv.dictionaryFile()))) {
				factbase.addAll(parser.parse().atoms());
			}

		} else {
			try {
				c.copy(encoded_csv.dictionaryFile(), ',', 0, new AtomImpl(
						// We can bypass all factories and safeties here as we only care about the
						// predicate label and arity.
						// The full Atom is only used to respect the method signature
						new PredicateImpl(storeDictionaryTable, 3), new VariableImpl("X"), new VariableImpl("Y"),
						new VariableImpl("Z")));
			} catch (SQLException e) {
				LOG.error("[RLSCSVsParser] Error while copying encoded dictionary into RDBMS", e);
			}
		}

		LOG.debug("factbase size after loading dictionary : {} \t {}.\nRepairing data.", c.size(), c.dictionarySize());

		File sortedRepare = new File(encoded_csv.repareFile() + "_sorted.csv");
		try {

			LOG.info("sort the repare file using system call");
			// Relies on the external "sort" command being available on the host.
			ProcessBuilder pb = new ProcessBuilder("sort", encoded_csv.repareFile()).redirectOutput(sortedRepare);
			Process p = pb.start();
			int exitCode = p.waitFor();
			if (exitCode != 0) {
				LOG.warn("external sort exited with status {}", exitCode);
			}

			LOG.info("Convert repare file to SQL queries");
			File repareSQL = CSVLoader.sortedRepareToSQLQueries(sortedRepare, c);

			LOG.info("Execute SQL repare queries");
			c.getEvaluator().execute(repareSQL.getAbsolutePath());
		} catch (InterruptedException e) {
			// Restore the interrupt flag so callers can observe the interruption.
			Thread.currentThread().interrupt();
			LOG.error("[RLSCSVsParser] Interrupted while reparing the encoding", e);
			return factbase;
		} catch (Exception e) {
			LOG.error("[RLSCSVsParser] Error while reparing the encoding", e);
			return factbase;
		}

		LOG.debug("factbase size after loading dictionary : {} \t dictionary size {} ", c.size(), c.dictionarySize());

		// Temp files are kept on failure (early returns above) to ease debugging.
		optimistic.deleteAllTempFiles();

		return factbase;
	}

	/**
	 * Generic loading path: parses every CSV referenced by the RLS file into atoms
	 * and adds them to the factbase one batch at a time.
	 *
	 * @param factbase target factbase
	 * @param rlsFile  input RLS file
	 * @param encode   whether the parser should encode terms while parsing
	 * @return the factbase with loaded atoms
	 */
	private static FactBase standardLoading(FactBase factbase, File rlsFile, boolean encode) {

		LOG.debug("Cannot do copy loading for this storage. Using standard loading strategy. ");

		try (RLSCSVsParser parser = new RLSCSVsParser(rlsFile.getAbsolutePath(), encode)) {

			factbase.addAll(parser.parse().atoms());

			return factbase;

		}
	}

	/**
	 * Copy-loads every CSV referenced by the RLS file; files that cannot be
	 * copy-loaded are retried with the standard (atom-by-atom) strategy.
	 *
	 * @return the factbase with loaded atoms
	 */
	private static FactBase copyLoadingRLS(CSVCopyable factbase, File rlsFile, char separator, String prefix,
			int headerSize) {

		List<RLSCSVResult> loading_failures = new ArrayList<>();

		try (RLSCSVParser rlsParser = new RLSCSVParser(rlsFile)) {
			while (rlsParser.hasNext()) {
				RLSCSVResult rls = rlsParser.next();
				boolean loaded = copyLoadCSV(factbase, rls, separator, prefix, headerSize);
				if (!loaded) {
					loading_failures.add(rls);
				}
			}
		}

		// Fallback: parse the failed files in-process instead of copy-loading.
		for (RLSCSVResult r : loading_failures) {
			LOG.info("Cannot do copy loading for {}. Using standard strategy.", r.csvFilepath());
			try (var parser = new CSVParser(r.predicateName(), r.predicateArity(), new File(r.csvFilepath()),
					separator, prefix, headerSize)) {
				factbase.addAll(parser.parse().atoms());
			}
		}

		return (FactBase) factbase;

	}

	/**
	 * Attempts to copy-load a single CSV file into the store.
	 * <p>
	 * The placeholder atom only conveys the predicate name and arity expected by
	 * {@link CSVCopyable#copy}; its variable terms carry no meaning.
	 *
	 * @return true iff the copy succeeded
	 */
	private static boolean copyLoadCSV(CSVCopyable factbase, RLSCSVResult rls, char separator, String prefix, int headerSize) {

		List<Term> atom_terms = new ArrayList<>();
		for (int i = 0; i < rls.predicateArity(); i++) {
			atom_terms.add(new VariableImpl("X_" + i));
		}

		Predicate p = new PredicateImpl(rls.predicateName(), rls.predicateArity());
		Atom a = new AtomImpl(p, atom_terms);
		boolean loaded = false;
		try {
			loaded = factbase.copy(rls.csvFilepath(), separator, headerSize, a);
		} catch (Exception e) {
			LOG.error("could not copy-load CSV file", e);
		}
		return loaded;
	}

	/**
	 * @return true iff the factbase supports direct CSV copy-loading
	 *         (an RDBMS-backed {@link CSVCopyable} store, excluding HSQLDB whose
	 *         driver does not support the copy operation)
	 */
	private static boolean canDoCopyLoadingOn(FactBase f) {

		return f instanceof CSVCopyable && f instanceof RDBMSStore s && !(s.getDriver() instanceof HSQLDBDriver);
	}

	/**
	 * Generate a new file with the SQL queries used to apply the needed repares.
	 * <p>
	 * The input file is assumed sorted so that all rows for the same term are
	 * contiguous: the first row of a group fixes the canonical encoding, and every
	 * following row of the group produces an UPDATE (remap data to the canonical
	 * encoding) plus a DELETE (drop the duplicate dictionary entry).
	 *
	 * @param repareFile initial repare file (assumed sorted)
	 * @param s          the RDBMS store whose naming strategy provides table and
	 *                   column names
	 * @return the SQL repare query file
	 * @throws IOException  iff an error occur while reading or writing on disc
	 * @throws SQLException iff an SQL error occur
	 */
	private static File sortedRepareToSQLQueries(File repareFile, RDBMSStore s) throws IOException, SQLException {
		int lastEncoding = -1;
		String lastTerm = null;

		File sqlRepareFile = new File(repareFile.getAbsolutePath() + "repare.sql");

		// try-with-resources: the original leaked both streams on any exception.
		try (BufferedReader repareReader = new BufferedReader(new FileReader(repareFile));
				BufferedWriter repareWriter = new BufferedWriter(new FileWriter(sqlRepareFile))) {

			String csvLine;
			while ((csvLine = repareReader.readLine()) != null) {
				// Column 0 is skipped — presumably a sort key; TODO confirm repare-file layout.
				String[] line = csvLine.split(",");
				String term = line[1];
				int encoding = Integer.parseInt(line[2]);
				String predicate = line[3];
				int position = Integer.parseInt(line[4]);
				int arity = Integer.parseInt(line[5]);

				if (lastTerm == null || !lastTerm.equals(term)) {
					// First occurrence of this term: its encoding becomes canonical.
					lastTerm = term;
					lastEncoding = encoding;
				} else {
					List<Term> terms = new ArrayList<>();
					for (int i = 0; i < arity; i++) {
						terms.add(new VariableImpl("X" + i));
					}
					// Update data: remap the duplicate encoding to the canonical one.
					// NOTE: values interpolated below are parsed ints and
					// strategy-provided identifiers, not untrusted user input.
					Atom witness = new AtomImpl(new PredicateImpl(predicate, arity), terms);
					String tableName = s.getStrategy().getTableName(witness);
					String updateQuery = "UPDATE " + tableName + " SET "
							+ s.getStrategy().getColumnName(tableName, position) + " = '" + lastEncoding + "' WHERE "
							+ s.getStrategy().getColumnName(tableName, position) + " = '" + encoding + "';";
					repareWriter.write(updateQuery);
					repareWriter.newLine();

					// Update dictionary: remove the duplicate entry.
					tableName = s.getStrategy().get_terms_table_name();
					String deleteQuery = "DELETE FROM " + tableName + " WHERE encoding = '" + encoding + "';";
					repareWriter.write(deleteQuery);
					repareWriter.newLine();
				}
			}
		}
		return sqlRepareFile;
	}

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy