
prerna.reactor.database.upload.rdf.RdfLoaderSheetUploadReactor Maven / Gradle / Ivy
The newest version!
package prerna.reactor.database.upload.rdf;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Date;
import java.util.Hashtable;
import java.util.StringTokenizer;
import java.util.Vector;
import org.apache.logging.log4j.Level;
import org.apache.logging.log4j.core.config.Configurator;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.CellType;
import org.apache.poi.ss.usermodel.DateUtil;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.ss.usermodel.Workbook;
import org.apache.poi.ss.usermodel.WorkbookFactory;
import org.openrdf.model.vocabulary.RDF;
import org.openrdf.sail.SailException;
import prerna.auth.User;
import prerna.engine.api.IDatabaseEngine;
import prerna.engine.api.IEngine;
import prerna.engine.api.ISesameRdfEngine;
import prerna.engine.impl.owl.WriteOWLEngine;
import prerna.engine.impl.rdf.RDFDefaultDatabaseTypeFactory;
import prerna.engine.impl.rdf.RdfUploadReactorUtility;
import prerna.poi.main.helper.excel.ExcelParsing;
import prerna.reactor.database.upload.AbstractDatabaseUploadFileReactor;
import prerna.sablecc2.om.PixelDataType;
import prerna.sablecc2.om.PixelOperationType;
import prerna.sablecc2.om.execptions.SemossPixelException;
import prerna.sablecc2.om.nounmeta.NounMetadata;
import prerna.util.Constants;
import prerna.util.DIHelper;
import prerna.util.UploadInputUtility;
import prerna.util.UploadUtilities;
import prerna.util.Utility;
public class RdfLoaderSheetUploadReactor extends AbstractDatabaseUploadFileReactor {
public RdfLoaderSheetUploadReactor() {
this.keysToGet = new String[] {
UploadInputUtility.DATABASE,
UploadInputUtility.FILE_PATH,
UploadInputUtility.ADD_TO_EXISTING,
UploadInputUtility.CUSTOM_BASE_URI
};
}
public void generateNewDatabase(User user, String newDatabaseName, String filePath) throws Exception {
if(!ExcelParsing.isExcelFile(filePath)) {
NounMetadata error = new NounMetadata("Invalid file. Must be .xlsx, .xlsm or .xls", PixelDataType.CONST_STRING, PixelOperationType.ERROR);
SemossPixelException e = new SemossPixelException(error);
e.setContinueThreadOfExecution(false);
throw e;
}
String baseUri = UploadInputUtility.getCustomBaseURI(this.store);
int stepCounter = 1;
logger.info(stepCounter + ". Create metadata for database...");
File owlFile = UploadUtilities.generateOwlFile(IEngine.CATALOG_TYPE.DATABASE, this.databaseId, newDatabaseName);
logger.info(stepCounter + ". Complete");
stepCounter++;
// need instance to write to smss
this.database = RDFDefaultDatabaseTypeFactory.getDefaultSesameEngine();
logger.info(stepCounter + ". Create properties file for database...");
this.tempSmss = UploadUtilities.createTemporaryRdfSmss(this.database, this.databaseId, newDatabaseName, owlFile, baseUri);
DIHelper.getInstance().setEngineProperty(this.databaseId + "_" + Constants.STORE, this.tempSmss.getAbsolutePath());
logger.info(stepCounter + ". Complete");
stepCounter++;
logger.info(stepCounter + ". Create database store...");
this.database.setEngineId(this.databaseId);
this.database.setEngineName(newDatabaseName);
this.database.open(this.tempSmss.getAbsolutePath());
String semossURI = DIHelper.getInstance().getProperty(Constants.SEMOSS_URI);
String sub = semossURI + "/" + Constants.DEFAULT_NODE_CLASS;
String typeOf = RDF.TYPE.stringValue();
String obj = Constants.CLASS_URI;
this.database.doAction(IDatabaseEngine.ACTION_TYPE.ADD_STATEMENT, new Object[] { sub, typeOf, obj, true });
sub = semossURI + "/" + Constants.DEFAULT_RELATION_CLASS;
obj = Constants.DEFAULT_PROPERTY_URI;
this.database.doAction(IDatabaseEngine.ACTION_TYPE.ADD_STATEMENT, new Object[] { sub, typeOf, obj, true });
logger.info(stepCounter + ". Complete");
stepCounter++;
/*
* Load Data
*/
logger.info(stepCounter + ". Parsing file metadata...");
WriteOWLEngine owlEngine = this.database.getOWLEngineFactory().getWriteOWL();
owlEngine.addCustomBaseURI(baseUri);
importFile(this.database, owlEngine, filePath, baseUri);
RdfUploadReactorUtility.loadMetadataIntoEngine(this.database, owlEngine);
owlEngine.commit();
owlEngine.export();
owlEngine.close();
// commit the created database
this.database.commit();
((ISesameRdfEngine) this.database).infer();
((ISesameRdfEngine) this.database).exportDB();
}
public void addToExistingDatabase(String filePath) throws Exception {
if(!ExcelParsing.isExcelFile(filePath)) {
NounMetadata error = new NounMetadata("Invalid file. Must be .xlsx, .xlsm or .xls", PixelDataType.CONST_STRING, PixelOperationType.ERROR);
SemossPixelException e = new SemossPixelException(error);
e.setContinueThreadOfExecution(false);
throw e;
}
int stepCounter = 1;
if (!(this.database instanceof ISesameRdfEngine)) {
throw new IllegalArgumentException("Invalid database type");
}
Configurator.setLevel(logger.getName(), Level.ERROR);
WriteOWLEngine owlEngine = this.database.getOWLEngineFactory().getWriteOWL();
importFile(this.database,owlEngine, filePath, this.database.getNodeBaseUri());
RdfUploadReactorUtility.loadMetadataIntoEngine(this.database, owlEngine);
owlEngine.commit();
owlEngine.export();
owlEngine.close();
// commit the created database
this.database.commit();
((ISesameRdfEngine) this.database).infer();
((ISesameRdfEngine) this.database).exportDB();
logger.info(stepCounter + ". Complete");
}
@Override
public void closeFileHelpers() {
}
/**
* Load the excel workbook, determine which sheets to load in workbook from the Loader tab
* @param fileName String containing the absolute path to the excel workbook to load
* @throws FileNotFoundException
* @throws IOException
*/
public void importFile(IDatabaseEngine database, WriteOWLEngine owlEngine, String fileName, String baseUri) throws FileNotFoundException, IOException {
Workbook workbook = null;
FileInputStream poiReader = null;
try {
poiReader = new FileInputStream(fileName);
workbook = WorkbookFactory.create(poiReader);
// load the Loader tab to determine which sheets to load
Sheet lSheet = workbook.getSheet("Loader");
if (lSheet == null) {
throw new IOException("Could not find Loader Sheet in Excel file " + fileName);
}
// check if user is loading subclassing relationships
Sheet subclassSheet = workbook.getSheet("Subclass");
if (subclassSheet != null) {
createSubClassing(database, owlEngine, subclassSheet);
}
// determine number of sheets to load
int lastRow = lSheet.getLastRowNum();
// first sheet name in second row
for (int rIndex = 1; rIndex <= lastRow; rIndex++) {
Row row = lSheet.getRow(rIndex);
// check to make sure cell is not null
if (row != null) {
Cell sheetNameCell = row.getCell(0);
Cell sheetTypeCell = row.getCell(1);
if (sheetNameCell != null) {
// get the name of the sheet
String sheetToLoad = sheetNameCell.getStringCellValue().trim();
// determine the type of load
String loadTypeName = "";
if(sheetTypeCell != null) {
loadTypeName = sheetTypeCell.getStringCellValue();
}
if (!sheetToLoad.isEmpty()) {
this.logger.debug("Cell Content is " + sheetToLoad);
// this is a relationship
if (loadTypeName.contains("Matrix")) {
loadMatrixSheet(database, owlEngine, sheetToLoad, workbook, baseUri);
database.commit();
} else {
loadSheet(database, owlEngine, sheetToLoad, workbook, baseUri);
database.commit();
}
}
}
}
}
} catch (FileNotFoundException e) {
if (e.getMessage() != null && !e.getMessage().isEmpty()) {
logger.error(e.getMessage());
}
logger.error(Constants.STACKTRACE, e);
throw new FileNotFoundException("Could not find Excel file located at " + fileName);
} catch (IOException e) {
if (e.getMessage() != null && !e.getMessage().isEmpty()) {
logger.error(e.getMessage());
}
logger.error(Constants.STACKTRACE, e);
throw new IOException("Could not read Excel file located at " + fileName);
} catch (Exception e) {
if (e.getMessage() != null && !e.getMessage().isEmpty()) {
logger.error(e.getMessage());
}
logger.error(Constants.STACKTRACE, e);
throw new IOException("File: " + fileName + " is not a valid Microsoft Excel (.xlsx, .xlsm) file");
} finally {
if (poiReader != null) {
try {
poiReader.close();
} catch (IOException e) {
logger.error(Constants.STACKTRACE, e);
throw new IOException("Could not close Excel file stream");
}
}
if (workbook != null) {
try {
workbook.close();
} catch (IOException e) {
logger.error(Constants.STACKTRACE, e);
//throw new IOException("Could not close Excel file stream");
}
}
}
}
/**
* Load subclassing information into the db and the owl file Requires the data to be in specific excel tab labeled "Subclass", with Parent nodes
* in the first column and child nodes in the second column
*
* @param subclassSheet
* Excel sheet with the subclassing information
* @throws IOException
* @throws EngineException
* @throws SailException
*/
private void createSubClassing(IDatabaseEngine database, WriteOWLEngine owlEngine, Sheet subclassSheet) throws IOException {
// URI for subclass
String pred = Constants.SUBCLASS_URI;
// check parent and child nodes in correct position
Row row = subclassSheet.getRow(0);
String parentNode = row.getCell(0).toString().trim().toLowerCase();
String childNode = row.getCell(1).toString().trim().toLowerCase();
// check to make sure parent column is in the correct column
if (!parentNode.equalsIgnoreCase("parent")) {
throw new IOException("Error with Subclass Sheet.\nError in parent node column.");
}
// check to make sure child column is in the correct column
if (!childNode.equalsIgnoreCase("child")) {
throw new IOException("Error with Subclass Sheet.\nError in child node column.");
}
// loop through and create all the triples for subclassing
int lastRow = subclassSheet.getLastRowNum();
for (int i = 1; i <= lastRow; i++) {
row = subclassSheet.getRow(i);
String parentURI = owlEngine.addConcept(Utility.cleanString(row.getCell(0).toString(), true), "STRING");
String childURI = owlEngine.addConcept(Utility.cleanString(row.getCell(1).toString(), true), "STRING");
// add triples to database
database.doAction(IDatabaseEngine.ACTION_TYPE.ADD_STATEMENT, new Object[] { childURI, pred, parentURI, true });
// add triples to OWL
owlEngine.addSubclass(childNode, parentNode);
}
database.commit();
owlEngine.commit();
}
/**
* Load specific sheet in workbook
* @param sheetToLoad String containing the name of the sheet to load
* @param workbook XSSFWorkbook containing the sheet to load
* @throws IOException
*/
public void loadSheet(IDatabaseEngine database, WriteOWLEngine owlEngine, String sheetToLoad, Workbook workbook, String baseUri) throws IOException {
Sheet lSheet = workbook.getSheet(sheetToLoad);
if (lSheet == null) {
throw new IOException("Could not find sheet " + sheetToLoad + " in workbook.");
}
logger.info("Loading Sheet: " + sheetToLoad);
int lastRow = lSheet.getLastRowNum() + 1;
// Get the first row to get column names
Row row = lSheet.getRow(0);
// initialize variables
String objectNode = "";
String relName = "";
Vector propNames = new Vector();
// determine if relationship or property sheet
String sheetType = row.getCell(0).getStringCellValue();
String subjectNode = row.getCell(1).getStringCellValue();
int currentColumn = 0;
if (sheetType.equalsIgnoreCase("Relation")) {
objectNode = row.getCell(2).getStringCellValue();
// if relationship, properties start at column 2
currentColumn++;
}
// determine property names for the relationship or node
// colIndex starts at currentColumn+1 since if relationship, the object
// node name is in the second column
int lastColumn = 0;
for (int colIndex = currentColumn + 1; colIndex < row.getLastCellNum(); colIndex++) {
// add property name to vector
if (row.getCell(colIndex) != null) {
propNames.addElement(row.getCell(colIndex).getStringCellValue());
lastColumn = colIndex;
}
}
logger.info(sheetToLoad + " has number of columns: " + (lastColumn + 1));
// processing starts
logger.info(sheetToLoad + " has number of rows: " + lastRow);
for (int rowIndex = 1; rowIndex < lastRow; rowIndex++) {
// first cell is the name of relationship
Row nextRow = lSheet.getRow(rowIndex);
if (nextRow == null) {
continue;
}
// get the name of the relationship
if (rowIndex == 1) {
Cell relCell = nextRow.getCell(0);
if (relCell != null && !relCell.getStringCellValue().isEmpty()) {
relName = nextRow.getCell(0).getStringCellValue();
} else {
if (sheetType.equalsIgnoreCase("Relation")) {
throw new IOException("Need to define the relationship on sheet " + sheetToLoad);
}
relName = "Ignore";
}
}
// set the name of the subject instance node to be a string
if (nextRow.getCell(1) != null && nextRow.getCell(1).getCellType() != CellType.BLANK) {
nextRow.getCell(1).setCellType(CellType.STRING);
}
// to prevent errors when java thinks there is a row of data when
// the row is empty
Cell instanceSubjectNodeCell = nextRow.getCell(1);
String instanceSubjectNode = "";
if (instanceSubjectNodeCell != null && instanceSubjectNodeCell.getCellType() != CellType.BLANK
&& !instanceSubjectNodeCell.toString().isEmpty()) {
instanceSubjectNode = nextRow.getCell(1).getStringCellValue();
} else {
continue;
}
// get the name of the object instance node if relationship
String instanceObjectNode = "";
int startCol = 1;
int offset = 1;
if (sheetType.equalsIgnoreCase("Relation")) {
if (nextRow.getCell(2) != null) {
// make it a string so i can easily parse it
nextRow.getCell(2).setCellType(CellType.STRING);
Cell instanceObjectNodeCell = nextRow.getCell(2);
// if empty, ignore
if (ExcelParsing.isEmptyCell(instanceObjectNodeCell)) {
continue;
}
instanceObjectNode = nextRow.getCell(2).getStringCellValue();
}
startCol++;
offset++;
}
Hashtable propHash = new Hashtable();
// process properties
for (int colIndex = (startCol + 1); colIndex < nextRow.getLastCellNum(); colIndex++) {
if (propNames.size() <= (colIndex - offset)) {
continue;
}
String propName = propNames.elementAt(colIndex - offset).toString();
// ignore bad data
if (ExcelParsing.isEmptyCell(nextRow.getCell(colIndex))) {
continue;
}
Object propValue = ExcelParsing.getCell(nextRow.getCell(colIndex));
if(propValue == null || propValue.toString().trim().isEmpty()) {
continue;
}
propHash.put(propName, propValue);
}
if (sheetType.equalsIgnoreCase("Relation")) {
if(rowIndex % 100 == 0) {
logger.info("Processing Relationship Sheet: " + sheetToLoad + ", row = " + rowIndex);
}
RdfUploadReactorUtility.createRelationship(database, owlEngine, baseUri, subjectNode, objectNode, instanceSubjectNode, instanceObjectNode, relName, propHash);
} else {
if(rowIndex % 100 == 0) {
logger.info("Processing Node Sheet: " + sheetToLoad + ", row = " + rowIndex);
}
RdfUploadReactorUtility.addNodeProperties(database, owlEngine, baseUri, subjectNode, instanceSubjectNode, propHash);
}
}
logger.info("Done processing: " + sheetToLoad + ". Total rows processed = " + lastRow);
}
/**
* Load excel sheet in matrix format
* @param sheetToLoad String containing the name of the excel sheet to load
* @param workbook XSSFWorkbook containing the name of the excel workbook
* @throws EngineException
*/
public void loadMatrixSheet(IDatabaseEngine database, WriteOWLEngine owlEngine, String sheetToLoad, Workbook workbook, String baseUri) {
Sheet lSheet = workbook.getSheet(sheetToLoad);
logger.info("Loading Sheet: " + sheetToLoad);
int lastRow = lSheet.getLastRowNum();
logger.info("Number of Rows: " + lastRow);
// Get the first row to get column names
Row row = lSheet.getRow(0);
// initialize variables
String objectNodeType = "";
String relName = "";
boolean propExists = false;
String sheetType = row.getCell(0).getStringCellValue();
// Get the string that contains the subject node type, object node type, and properties
String nodeMap = row.getCell(1).getStringCellValue();
// check to see if properties exist
String propertyName = "";
StringTokenizer tokenProperties = new StringTokenizer(nodeMap, "@");
String triple = tokenProperties.nextToken();
if (tokenProperties.hasMoreTokens()) {
propertyName = tokenProperties.nextToken();
propExists = true;
}
StringTokenizer tokenTriple = new StringTokenizer(triple, "_");
String subjectNodeType = tokenTriple.nextToken();
if (sheetType.equalsIgnoreCase("Relation")) {
relName = tokenTriple.nextToken();
objectNodeType = tokenTriple.nextToken();
}
// determine object instance names for the relationship
ArrayList objectInstanceArray = new ArrayList();
int lastColumn = 0;
for (int colIndex = 2; colIndex < row.getLastCellNum(); colIndex++) {
objectInstanceArray.add(row.getCell(colIndex).getStringCellValue());
lastColumn = colIndex;
}
// fix number of columns due to data shift in excel sheet
lastColumn--;
logger.info("Number of Columns: " + lastColumn);
// process all rows (contains subject instances) in the matrix
for (int rowIndex = 1; rowIndex <= lastRow; rowIndex++) {
// boolean to determine if a mapping exists
boolean mapExists = false;
Row nextRow = lSheet.getRow(rowIndex);
// get the name subject instance
String instanceSubjectName = nextRow.getCell(1).getStringCellValue();
// see what relationships are mapped between subject instances
// and object instances
for (int colIndex = 2; colIndex <= lastColumn; colIndex++) {
String instanceObjectName = objectInstanceArray.get(colIndex - 2);
Hashtable propHash = new Hashtable();
// store value in cell between instance subject and object
// in current iteration of loop
Cell matrixContent = nextRow.getCell(colIndex);
// if any value in cell, there should be a mapping
if (matrixContent != null) {
if (propExists) {
if (matrixContent.getCellType() == CellType.NUMERIC) {
if (DateUtil.isCellDateFormatted(matrixContent)) {
propHash.put(propertyName, (Date) matrixContent.getDateCellValue());
mapExists = true;
} else {
propHash.put(propertyName, new Double(matrixContent.getNumericCellValue()));
mapExists = true;
}
} else {
// if not numeric, assume it is a string and
// check to make sure it is not empty
if (!matrixContent.getStringCellValue().isEmpty()) {
propHash.put(propertyName, matrixContent.getStringCellValue());
mapExists = true;
}
}
} else {
mapExists = true;
}
}
if (sheetType.equalsIgnoreCase("Relation") && mapExists) {
if(rowIndex % 100 == 0) {
logger.info("Processing" + sheetToLoad + " Row " + rowIndex + " Column " + colIndex);
}
RdfUploadReactorUtility.createRelationship(database, owlEngine, baseUri, subjectNodeType, objectNodeType, instanceSubjectName, instanceObjectName, relName, propHash);
} else {
if(rowIndex % 100 == 0) {
logger.info("Processing" + sheetToLoad + " Row " + rowIndex + " Column " + colIndex);
}
RdfUploadReactorUtility.addNodeProperties(database, owlEngine, baseUri, subjectNodeType, instanceSubjectName, propHash);
}
}
}
logger.info("Done processing: " + sheetToLoad + ". Total rows processed = " + lastRow);
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy