package prerna.ds.py;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.UUID;
import javax.crypto.Cipher;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import prerna.algorithm.api.DataFrameTypeEnum;
import prerna.algorithm.api.SemossDataType;
import prerna.cache.CachePropFileFrameObject;
import prerna.ds.OwlTemporalEngineMeta;
import prerna.ds.shared.AbstractTableDataFrame;
import prerna.ds.shared.CachedIterator;
import prerna.ds.shared.RawCachedWrapper;
import prerna.ds.util.flatfile.CsvFileIterator;
import prerna.ds.util.flatfile.ParquetFileIterator;
import prerna.engine.api.IHeadersDataRow;
import prerna.engine.api.IRawSelectWrapper;
import prerna.om.HeadersException;
import prerna.om.IStringExportProcessor;
import prerna.om.Insight;
import prerna.poi.main.helper.excel.ExcelSheetFileIterator;
import prerna.query.interpreters.IQueryInterpreter;
import prerna.query.interpreters.PandasInterpreter;
import prerna.query.querystruct.CsvQueryStruct;
import prerna.query.querystruct.ExcelQueryStruct;
import prerna.query.querystruct.HardSelectQueryStruct;
import prerna.query.querystruct.ParquetQueryStruct;
import prerna.query.querystruct.SelectQueryStruct;
import prerna.query.querystruct.transform.QSAliasToPhysicalConverter;
import prerna.reactor.imports.ImportUtility;
import prerna.sablecc2.om.task.BasicIteratorTask;
import prerna.ui.components.playsheets.datamakers.DataMakerComponent;
import prerna.util.Constants;
import prerna.util.Settings;
import prerna.util.Utility;
public class PandasFrame extends AbstractTableDataFrame {
private static final Logger classLogger = LogManager.getLogger(PandasFrame.class);
public static final String DATA_MAKER_NAME = "PandasFrame";
public static final String PANDAS_IMPORT_VAR = "pandas_import_var";
public static final String PANDAS_IMPORT_STRING = "import pandas as " + PANDAS_IMPORT_VAR;
public static final String NUMPY_IMPORT_VAR = "np_import_var";
public static final String NUMPY_IMPORT_STRING = "import numpy as " + NUMPY_IMPORT_VAR;
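// every generated script imports pandas/numpy under these fixed aliases, so helper-built
// python is independent of whatever the user has already imported into the session;
// e.g. a generated load runs "import pandas as pandas_import_var" and then calls
// pandas_import_var.read_csv(...) rather than relying on a pre-existing "pd" alias
// (the exact generated python comes from PandasSyntaxHelper)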
// lookup maps: pandas dtype -> SemossDataType, pandas dtype -> java class,
// and SemossDataType / pandas dtype -> numpy cast target used for astype()
static Map<String, SemossDataType> pyS = new Hashtable<>();
static Map<String, Class<?>> pyJ = new Hashtable<>();
static Map<Object, String> spy = new Hashtable<>();
// gets all the commands in one fell swoop
List<String> commands = new ArrayList<>();
private String wrapperFrameName = null;
private String originalWrapperFrameName = null;
private PyTranslator pyt = null;
public boolean cache = true;
public String sqliteConnectionName = null;
// list of caches
public List<String> keyCache = new ArrayList<>();
static {
pyS.put("object", SemossDataType.STRING);
pyS.put("category", SemossDataType.STRING);
pyS.put("int64", SemossDataType.INT);
pyS.put("float64", SemossDataType.DOUBLE);
pyS.put("datetime64", SemossDataType.DATE);
pyS.put("bool", SemossDataType.BOOLEAN);
pyJ.put("object", java.lang.String.class);
pyJ.put("category", java.lang.String.class);
pyJ.put("int64", java.lang.Integer.class);
pyJ.put("float64", java.lang.Double.class);
pyJ.put("datetime64", java.util.Date.class);
pyJ.put("bool", java.lang.Boolean.class);
spy.put(SemossDataType.STRING, "'str'");
spy.put(SemossDataType.INT, "np.int64");
spy.put(SemossDataType.DOUBLE, "np.float64");
// note: numpy's dtype is datetime64 (there is no datetime32)
spy.put(SemossDataType.DATE, "np.datetime64");
spy.put(SemossDataType.TIMESTAMP, "np.datetime64");
// spy.put(SemossDataType.BOOLEAN, "np.bool");
spy.put("float64", "np.float32");
spy.put("int64", "np.int32");
spy.put("datetime64", "np.datetime32");
spy.put("dtype('O')", "'str'");
spy.put("dtype('int64')", "int32");
spy.put("dtype('float64')", "float32");
}
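// worked example of the lookup tables above: a pandas dtype of "int64" maps to
// SemossDataType.INT via pyS and java.lang.Integer via pyJ; going the other way,
// a column proposed as SemossDataType.INT is coerced in python via astype(np.int64) per spy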
public PandasFrame() {
this(null);
}
public PandasFrame(String tableName) {
if(tableName == null || tableName.trim().isEmpty()) {
tableName = "PYFRAME_" + UUID.randomUUID().toString().replace("-", "_");
}
this.frameName = tableName;
this.wrapperFrameName = PandasSyntaxHelper.createFrameWrapperName(tableName);
this.originalName = this.frameName;
this.originalWrapperFrameName = wrapperFrameName;
}
@Override
public void setName(String name) {
if(name != null && !name.isEmpty()) {
this.frameName = name;
this.wrapperFrameName = PandasSyntaxHelper.createFrameWrapperName(name);
}
}
/**
 * Get the name of the frame wrapper object
 * @return the python variable name of the wrapper around this frame
 */
public String getWrapperName() {
return this.wrapperFrameName;
}
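/**
 * Load rows into this frame using the iterator; data types come from the frame meta.
 * The meta qualifies headers as frameName__column, so the frame prefix is stripped before
 * the load; e.g. a (hypothetical) meta key "myFrame__Sales" becomes the pandas column "Sales".
 * @param it
 */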
public void addRowsViaIterator(Iterator<IHeadersDataRow> it) {
// we really need another way to get the data types....
Map<String, SemossDataType> rawDataTypeMap = this.metaData.getHeaderToTypeMap();
// TODO: this is annoying, need to get the frame on the same page as the meta
Map<String, SemossDataType> dataTypeMap = new HashMap<>();
for(String rawHeader : rawDataTypeMap.keySet()) {
dataTypeMap.put(rawHeader.split("__")[1], rawDataTypeMap.get(rawHeader));
}
}
this.addRowsViaIterator(it, this.frameName, dataTypeMap);
}
/**
* Generate a table from an iterator
* @param it
* @param tableName
* @param dataTypeMap
*/
public void addRowsViaIterator(Iterator<IHeadersDataRow> it, String tableName, Map<String, SemossDataType> dataTypeMap) {
boolean loaded = false;
long limit = -1;
if(it instanceof CsvFileIterator) {
addRowsViaCsvIterator((CsvFileIterator) it, tableName);
loaded = true;
}
// just flush the excel to a grid through the iterator
// using the below logic
else if(it instanceof ExcelSheetFileIterator) {
addRowsViaExcelIterator((ExcelSheetFileIterator) it, tableName);
loaded = true;
}
else if(it instanceof ParquetFileIterator) {
// do something
addRowsViaParquetIterator((ParquetFileIterator) it, tableName);
loaded = true;
}
if(!loaded) {
// default behavior is to just write this to a json file
// and read it back in
String newFileLoc = Utility.getInsightCacheDir() + "/" + Utility.getRandomString(6) + ".json";
if(Boolean.parseBoolean(Utility.getDIHelperProperty(Constants.CHROOT_ENABLE))) {
Insight in = this.pyt.insight;
String insightFolder = in.getInsightFolder();
new File(Utility.normalizePath(insightFolder)).mkdirs();
if(in.getUser() != null) {
in.getUser().getUserSymlinkHelper().symlinkFolder(insightFolder);
}
newFileLoc = insightFolder + "/" + Utility.getRandomString(6) + ".json";
}
File newFile = Utility.writeResultToJson(newFileLoc, it, dataTypeMap, new IStringExportProcessor() {
// we need to escape all inner double quotes
@Override
public String processString(String input) {
return input.replace("\"", "\\\"");
}
});
String importPandasS = PANDAS_IMPORT_STRING;
String importNumpyS = NUMPY_IMPORT_STRING;
// generate the script
String fileLocation = newFile.getAbsolutePath();
String loadS = PandasSyntaxHelper.getJsonFileRead(PANDAS_IMPORT_VAR, NUMPY_IMPORT_VAR, fileLocation, tableName, dataTypeMap);
//String loadS = PandasSyntaxHelper.getCsvFileRead(PANDAS_IMPORT_VAR, NUMPY_IMPORT_VAR,
// fileLocation, tableName, ",", "\"", "\\\\", pyt.getCurEncoding(), dataTypeMap);
// what if it's not above 10,000 but there is still a limit (limit stays -1 in this path)
if (limit > -1) {
String rowLimits = String.valueOf(limit);
loadS = loadS + "[:" + rowLimits + "]";
}
String modHeaders = null;
String[] cleanHeaders = null;
if(it instanceof IRawSelectWrapper) {
String[] headers = ((IRawSelectWrapper) it).getHeaders();
cleanHeaders = HeadersException.getInstance().getCleanHeaders(headers);
modHeaders = PandasSyntaxHelper.alterColumnNames(tableName, headers, cleanHeaders);
} else if(it instanceof BasicIteratorTask) {
List<Map<String, Object>> taskHeaders = ((BasicIteratorTask) it).getHeaderInfo();
int numHeaders = taskHeaders.size();
String[] headers = new String[numHeaders];
for(int i = 0; i < numHeaders; i++) {
Map<String, Object> headerInfo = taskHeaders.get(i);
String alias = (String) headerInfo.get("alias");
headers[i] = alias;
}
cleanHeaders = HeadersException.getInstance().getCleanHeaders(headers);
modHeaders = PandasSyntaxHelper.alterColumnNames(tableName, headers, cleanHeaders);
}
String makeWrapper = PandasSyntaxHelper.makeWrapper(PandasSyntaxHelper.createFrameWrapperName(tableName), tableName);
// execute the script
//pyt.runScript(importS, loadS);
//pyt.runScript(makeWrapper);
pyt.runEmptyPy(importPandasS, importNumpyS, loadS, modHeaders, makeWrapper);
// if nothing was loaded, create an empty frame with the expected columns
long rowCount = pyt.getLong(tableName + ".shape[0]");
if(rowCount == 0 && cleanHeaders != null) {
String frameColumns = "columns = " + "['" + String.join("','", cleanHeaders) + "']";
String createDataFrame = tableName + " = " + PANDAS_IMPORT_VAR + ".DataFrame(" + frameColumns + ")";
this.pyt.runScript(createDataFrame);
}
// delete the generated staging file
newFile.delete();
}
// if(isEmpty(tableName)) {
// throw new EmptyIteratorException("Unable to load data into pandas frame");
// }
syncHeaders();
// compare the pandas frame types against the requested types
// and adjust any columns where the load inferred them differently
if(!isEmpty(tableName)) {
adjustDataTypes(tableName, dataTypeMap);
}
}
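/*
 * Illustrative sketch (not the exact PandasSyntaxHelper output) of the python executed by
 * the default json path above, using "t_abc123" as a hypothetical table name:
 *
 *   import pandas as pandas_import_var
 *   import numpy as np_import_var
 *   t_abc123 = pandas_import_var.read_json("<staging file>.json", ...)
 *   t_abc123.rename(columns={...}, inplace=True)            # alterColumnNames
 *   <wrapper var> = <wrapper construction around t_abc123>  # makeWrapper
 *
 * The real read_json arguments, rename map, and wrapper construction come from
 * PandasSyntaxHelper.getJsonFileRead, alterColumnNames, and makeWrapper respectively.
 */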
/**
* Generate a table from a CSV file iterator
* @param it
* @param tableName
*/
private void addRowsViaCsvIterator(CsvFileIterator it, String tableName) {
// generate the script
CsvQueryStruct qs = it.getQs();
String importPandasS = PANDAS_IMPORT_STRING;
String importNumpyS = NUMPY_IMPORT_STRING;
String fileLocation = it.getFileLocation();
Map<String, String> columnTypes = qs.getColumnTypes();
// apply limit for import
long limit = qs.getLimit();
String loadS = PandasSyntaxHelper.getCsvFileRead(PANDAS_IMPORT_VAR, NUMPY_IMPORT_VAR,
fileLocation, tableName, qs.getDelimiter() + "", "\"", "\\\\", null, columnTypes, limit);
// run import of packages and df
pyt.runEmptyPy(importPandasS, importNumpyS, loadS);
// need a clean headers call
String[] colNames = pyt.getColumns(tableName);
String cleanHeaders = PandasSyntaxHelper.cleanFrameHeaders(tableName, colNames);
pyt.runEmptyPy(cleanHeaders);
// De-select section:
// map any renamed headers back to their original column names
Map<String, String> newHeaders = qs.getNewHeaderNames();
String[] selectedHeaders = it.getHeaders();
// copy so we do not mutate the selected headers array in place
String[] cleanNewHeaders = Arrays.copyOf(selectedHeaders, selectedHeaders.length);
if(newHeaders != null) {
int i = 0;
for(String newColName : selectedHeaders) {
String oldColName = newHeaders.get(newColName);
if (oldColName != null) {
cleanNewHeaders[i] = oldColName;
} else {
cleanNewHeaders[i] = newColName;
}
i++;
}
}
String selectedColumns = PandasSyntaxHelper.filterByColumn(tableName, tableName, Arrays.asList(cleanNewHeaders) );
String headerS = PandasSyntaxHelper.setColumnNames(tableName, selectedHeaders);
String makeWrapper = PandasSyntaxHelper.makeWrapper(PandasSyntaxHelper.createFrameWrapperName(tableName), tableName);
pyt.runEmptyPy(selectedColumns, headerS, makeWrapper);
}
/**
 * Generate a table from an Excel file iterator
 * @param it
 * @param tableName
 */
private void addRowsViaExcelIterator(ExcelSheetFileIterator it, String tableName) {
ExcelQueryStruct qs = it.getQs();
String sheetName = qs.getSheetName();
String filePath = qs.getFilePath();
String sheetRange = qs.getSheetRange();
it.getSheet();
// generate the script
String importPandasS = PANDAS_IMPORT_STRING;
String importNumpyS = NUMPY_IMPORT_STRING;
// run import of packages
pyt.runEmptyPy(importPandasS,importNumpyS);
String loadS = PandasSyntaxHelper.loadExcelSheet(PANDAS_IMPORT_VAR, filePath, tableName, sheetName, sheetRange);
long limit = qs.getLimit();
if (limit > -1) {
String rowLimits = String.valueOf(limit);
loadS = loadS + "[:" + rowLimits + "]";
}
// run import df
pyt.runEmptyPy(loadS);
// need a clean headers call
String[] colNames = pyt.getColumns(tableName);
String cleanHeaders = PandasSyntaxHelper.cleanFrameHeaders(tableName, colNames);
pyt.runEmptyPy(cleanHeaders);
// De-select section:
// map any renamed headers back to their original column names
Map<String, String> newHeaders = qs.getNewHeaderNames();
String[] selectedHeaders = it.getHeaders();
String[] cleanNewHeaders = new String[selectedHeaders.length];
int i = 0;
for(String newColName : selectedHeaders) {
String oldColName = (newHeaders == null) ? null : newHeaders.get(newColName);
if (oldColName != null) {
cleanNewHeaders[i] = oldColName;
} else {
cleanNewHeaders[i] = newColName;
}
i++;
}
String selectedColumns = PandasSyntaxHelper.filterByColumn(tableName, tableName, Arrays.asList(cleanNewHeaders) );
String headerS = PandasSyntaxHelper.setColumnNames(tableName, selectedHeaders);
String makeWrapper = PandasSyntaxHelper.makeWrapper(PandasSyntaxHelper.createFrameWrapperName(tableName), tableName);
pyt.runEmptyPy(selectedColumns, headerS, makeWrapper);
}
/**
* Generate a table from a Parquet file iterator
* @param it
* @param tableName
*/
private void addRowsViaParquetIterator(ParquetFileIterator it, String tableName) {
// generate the script
ParquetQueryStruct qs = it.getQs();
String importPandasS = PANDAS_IMPORT_STRING;
String importNumpyS = NUMPY_IMPORT_STRING;
String fileLocation = it.getFileLocation();
String loadS = PandasSyntaxHelper.getParquetFileRead(PANDAS_IMPORT_VAR, NUMPY_IMPORT_VAR,
fileLocation, tableName);
// apply limit for import
long limit = qs.getLimit();
if (limit > -1) {
String rowLimits = String.valueOf(limit);
loadS = loadS + "[:" + rowLimits + "]";
}
pyt.runEmptyPy(importPandasS, importNumpyS, loadS);
// need a clean headers call
String[] colNames = pyt.getColumns(tableName);
String cleanHeaders = PandasSyntaxHelper.cleanFrameHeaders(tableName, colNames);
pyt.runEmptyPy(cleanHeaders);
// De-select section:
// map any renamed headers back to their original column names
Map<String, String> newHeaders = qs.getNewHeaderNames();
String[] selectedHeaders = it.getHeaders();
String[] cleanNewHeaders = new String[selectedHeaders.length];
int i = 0;
for(String newColName : selectedHeaders) {
String oldColName = (newHeaders == null) ? null : newHeaders.get(newColName);
if (oldColName != null) {
cleanNewHeaders[i] = oldColName;
} else {
cleanNewHeaders[i] = newColName;
}
i++;
}
String selectedColumns = PandasSyntaxHelper.filterByColumn(tableName, tableName, Arrays.asList(cleanNewHeaders) );
String headerS = PandasSyntaxHelper.setColumnNames(tableName, selectedHeaders);
String makeWrapper = PandasSyntaxHelper.makeWrapper(PandasSyntaxHelper.createFrameWrapperName(tableName), tableName);
pyt.runEmptyPy(selectedColumns, headerS, makeWrapper);
}
/**
 * Merge the pandas frame with another frame. For a non-equi join, performs a cross product and then
 * filters the results. For non-equi joins, if the left and right join column names are equal, the
 * right column name is changed so that it can be dropped later.
 *
 * @param returnTable
 * @param leftTableName
 * @param rightTableName
 * @param joinType
 * @param joinCols
 */
public void merge(String returnTable, String leftTableName, String rightTableName, String joinType, List<Map<String, String>> joinCols,
List<String> joinComparators, boolean nonEqui) {
String mergeString = PandasSyntaxHelper.getMergeSyntax(PANDAS_IMPORT_VAR, returnTable, leftTableName, rightTableName,
joinType, joinCols, nonEqui);
if (!nonEqui) {
pyt.runScript(mergeString);
} else {
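// non-equi path: the merge above was generated as a cross product; rename any right-hand
// join column that collides with the left-hand name (suffix "_CTD") so the filter can
// reference both sides, conceptually something like
//   returnTable = returnTable[returnTable['a'] <= returnTable['a_CTD']]
// (column name and comparator are hypothetical; the real syntax comes from getMergeFilterSyntax)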
for (int i = 0; i < joinCols.size(); i++) {
Map<String, String> joinMap = joinCols.get(i);
for (String lColumn : joinMap.keySet()) {
if (lColumn.equals(joinMap.get(lColumn))) {
String newColumn = joinMap.get(lColumn) + "_CTD";
pyt.runScript(PandasSyntaxHelper.alterColumnName(rightTableName, joinMap.get(lColumn), newColumn));
joinMap.replace(lColumn, newColumn);
joinCols.set(i, joinMap);
}
}
}
String filterSyntax = PandasSyntaxHelper.getMergeFilterSyntax(returnTable, joinCols,joinComparators);
pyt.runScript(mergeString);
pyt.runScript(filterSyntax);
}
syncHeaders();
}
@Override
public void syncHeaders() {
super.syncHeaders();
if(sqliteConnectionName != null) {
pyt.runScript("del " + sqliteConnectionName);
sqliteConnectionName = null;
}
}
/**
 * Adjust the data types of the frame in case the initial load inferred them incorrectly
 * @param tableName
 * @param dataTypeMap
 */
private void adjustDataTypes(String tableName, Map<String, SemossDataType> dataTypeMap) {
String wrapperTableName = PandasSyntaxHelper.createFrameWrapperName(tableName);
String colScript = PandasSyntaxHelper.getColumns(wrapperTableName + ".cache['data']");
String typeScript = PandasSyntaxHelper.getTypes(wrapperTableName + ".cache['data']");
List<String> headerList = (List<String>) pyt.runScript(colScript);
String[] headers = headerList.toArray(new String[headerList.size()]);
List<String> types = (List<String>) pyt.runScript(typeScript);
StringBuffer allTypes = new StringBuffer();
// here we run and see if the types are good
// or if we messed up, we perform a switch
for(int colIndex = 0; colIndex < headers.length; colIndex++) {
String colName = headers[colIndex];
String colType = (types == null) ? null : types.get(colIndex);
if(colType == null) {
colType = "STRING";
}
SemossDataType pysColType = pyS.get(colType);
SemossDataType proposedType = dataTypeMap.get(frameName + "__" + colName);
if(proposedType == null) {
proposedType = dataTypeMap.get(colName);
}
String pyproposedType = (proposedType != null) ? spy.get(proposedType) : spy.get(colType);
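// e.g. a column proposed as INT whose dtype came back as "object" appends a line like
// myFrame['age'] = myFrame['age'].astype(np.int64, errors='ignore') (names hypothetical)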
//if(proposedType != null && pysColType != proposedType) {
if(proposedType!=null && pyproposedType!=null && !pyproposedType.equalsIgnoreCase(colType)) {
// create and execute the type
if(proposedType == SemossDataType.DATE) {
String typeChanger = tableName + "['" + colName + "'] = pd.to_datetime(" + tableName + "['" + colName + "'], errors='ignore').dt.date";
allTypes.append(typeChanger).append("\n");
//pyt.runScript(typeChanger);
} else if(proposedType == SemossDataType.TIMESTAMP) {
String typeChanger = tableName + "['" + colName + "'] = pd.to_datetime(" + tableName + "['" + colName + "'], errors='ignore')";
allTypes.append(typeChanger).append("\n");
//pyt.runScript(typeChanger);
} else {
String typeChanger = tableName + "['" + colName + "'] = " + tableName + "['" + colName + "'].astype(" + pyproposedType + ", errors='ignore')";
allTypes.append(typeChanger).append("\n");
//pyt.runScript(typeChanger);
}
}
}
// execute all at once
if(allTypes.length() > 0) {
pyt.runEmptyPy(allTypes.toString());
}
}
// checks whether the column order pandas returns matches the requested header order
public boolean sync(String[] headers, List<String> actHeaders) {
boolean sync = true;
for(int headerIndex = 0; headerIndex < headers.length && sync; headerIndex++) {
sync = sync && (headers[headerIndex].equals(actHeaders.get(headerIndex)));
}
return sync;
}
// get the headers and their types
public Object [] getHeaderAndTypes(String targetFrame) {
String colScript = PandasSyntaxHelper.getColumns(targetFrame);
String typeScript = PandasSyntaxHelper.getTypes(targetFrame);
/*
Hashtable response = (Hashtable)pyt.runScript(colScript, typeScript);
String [] headers = (String [])((ArrayList)response.get(colScript)).toArray();
SemossDataType [] stypes = new SemossDataType[headers.length];
*/
List<String> headerList = (List<String>) pyt.runScript(colScript);
String [] headers = new String[headerList.size()];
headerList.toArray(headers);
SemossDataType [] stypes = new SemossDataType[headerList.size()];
List<String> types = (List<String>) pyt.runScript(typeScript);
for(int colIndex = 0; colIndex < headers.length; colIndex++) {
String colType = types.get(colIndex);
stypes[colIndex] = pyS.get(colType);
}
Object [] retObject = new Object[2];
retObject[0] = stypes;
retObject[1] = headers;
return retObject;
}
@Override
public IRawSelectWrapper query(SelectQueryStruct qs) {
// pandas does not support relations in general
// so we are going to remove any that may have been added
// this is important for when the BE changes the frame without
// the FE knowing and that frame was native and had joins
qs.getRelations().clear();
// at this point try to see if the cache already has it and if so pass that iterator instead
// the cache is sitting in the insight
qs = QSAliasToPhysicalConverter.getPhysicalQs(qs, this.metaData);
if(qs.getPragmap() != null && qs.getPragmap().containsKey("xCache")) {
this.cache = ((String) qs.getPragmap().get("xCache")).equalsIgnoreCase("true");
}
PandasInterpreter interp = new PandasInterpreter();
interp.setDataTableName(this.frameName, this.wrapperFrameName + ".cache['data']");
interp.setDataTypeMap(this.metaData.getHeaderToTypeMap());
interp.setQueryStruct(qs);
interp.setKeyCache(keyCache);
// I should also possibly set up pytranslator so I can run command for creating filter
interp.setPyTranslator(pyt);
// need to do this for subqueries where we flush the values into a filter
interp.setPandasFrame(this);
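// the interpreter composes a pandas expression against the wrapper's cached frame,
// conceptually (frame/column names hypothetical):
//   myFrameW.cache['data'][['col1','col2']].drop_duplicates().iloc[0:2000].to_dict('split')
// see the worked example in the comments inside processInterpreter below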
return processInterpreter(interp, qs);
}
@Override
public IRawSelectWrapper query(String query) {
//TODO: this only works if you have an interp!
// need to redo this when you have a pandas script you want to run
// need to grab the headers and types via the output object
Object output = pyt.runScript(query);
List response = null;
PandasInterpreter interp = new PandasInterpreter();
String [] headers = interp.getHeaders();
SemossDataType [] types = interp.getTypes();
List<String> actHeaders = null;
boolean sync = true;
// get the types for headers also
if(interp.isScalar()) {
List<Object> val = new ArrayList<>();
val.add(output);
response = new ArrayList<>();
response.add(val);
} else if(output instanceof HashMap) {
Map<String, Object> map = (Map<String, Object>) output;
response = (List) map.get("data");
// get the columns
List<Object> columns = (List<Object>) map.get("columns");
actHeaders = mapColumns(interp, columns);
sync = sync(headers, actHeaders);
}
else if(output instanceof List) {
response = (List) output;
actHeaders = null;
sync = true;
}
PandasIterator pi = new PandasIterator(headers, response, types);
// set the actual headers so that it can be processed
// if not in sync then transform
pi.setTransform(actHeaders, !sync);
RawPandasWrapper rpw = new RawPandasWrapper();
rpw.setPandasIterator(pi);
return rpw ;
}
// create a subframe for the purposes of variables
@Override
public String createVarFrame() {
PandasInterpreter interp = new PandasInterpreter();
SelectQueryStruct qs = getMetaData().getFlatTableQs(true);
// add all the frame filters
qs.setExplicitFilters(this.grf.copy());
// convert to physical
qs = QSAliasToPhysicalConverter.getPhysicalQs(qs, this.metaData);
interp.setDataTableName(this.frameName, this.wrapperFrameName + ".cache['data']");
interp.setDataTypeMap(this.metaData.getHeaderToTypeMap());
interp.setQueryStruct(qs);
interp.setKeyCache(keyCache);
// I should also possibly set up pytranslator so I can run command for creating filter
interp.setPyTranslator(pyt);
// need to do this for subqueries where we flush the values into a filter
interp.setPandasFrame(this);
String query = interp.composeQuery();
query = query.substring(0, query.indexOf(".drop_duplicates"));
String newFrame = Utility.getRandomString(6);
String command = newFrame + " = " + query;
pyt.runScript(command);
return newFrame;
}
private IRawSelectWrapper processInterpreter(PandasInterpreter interp, SelectQueryStruct qs) {
String query = null;
// make it into a full frame
String targetFrame = Utility.getRandomString(6);
if(qs instanceof HardSelectQueryStruct) // this is a hard select query struct
{
// escape the quotes
String sql = ((HardSelectQueryStruct)qs).getQuery();
sql = sql.replace("\"", "\\\"");
boolean pandasImported = (boolean) this.pyt.runScript("'pd' in dir()");
if (!pandasImported) {
this.pyt.runEmptyPy("import pandas as pd");
}
String frameMaker = targetFrame + " = pd.read_sql(\"" + sql + "\", " + getSQLite() + ")";
// String loadsqlDF = "from pandasql import sqldf";
// this.pyt.runEmptyPy(loadsqlDF);
// query = targetFrame + "= sqldf('" + ((HardSelectQueryStruct)qs).getQuery() + "')";
this.pyt.runEmptyPy(frameMaker);
query = targetFrame + ".to_dict('split')";
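// to_dict('split') returns {'index': [...], 'columns': [...], 'data': [[row values], ...]};
// the grid handling below consumes the 'columns' and 'data' entries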
}
else
{
query = interp.composeQuery();
}
// assign query to frame
// the command that is coming in has the sort values and other things attached to it
// need to get rid of it before I can get the frame
// and then slap it back to get the values back
//ascszng = d[['bp.1d','bp.1s','bp.2d','bp.2s','chol','Drug','frame','gender','glyhb','hdl','height','hip','id','location','ratio','stab.glu','time.ppn','waist','weight']]
// .drop_duplicates().sort_values(['bp.1d'],ascending=[True]).iloc[0:2000].to_dict('split')
// need to get rid of everything from drop_duplicates
// if that is not available then sort values
// make the query to be just the new frameName
// query = frameName;
CachedIterator ci = null;
IRawSelectWrapper retWrapper = null;
if(!queryCache.containsKey(query) || !cache)
{
// run the query
Object output = pyt.runScript(query);
// if using the native py server and we can't structure the output, try to convert it
if (Utility.getDIHelperProperty(Settings.NATIVE_PY_SERVER) != null
&& Utility.getDIHelperProperty(Settings.NATIVE_PY_SERVER).equalsIgnoreCase("true")
&& output instanceof String) {
output = PandasTimestampDeserializer.MAPPER.convertValue(output, Object.class);
}
// need to see if this is a parquet format as well
String format = "grid";
if(qs.getPragmap() != null && qs.getPragmap().containsKey("format"))
format = (String)qs.getPragmap().get("format");
String [] headers = interp.getHeaders();
SemossDataType [] types = interp.getTypes();
if(format.equalsIgnoreCase("grid"))
{
List response = null;
List<String> actHeaders = null;
boolean sync = true;
// get the types for headers also
if(interp.isScalar()) // not much to do here
{
List<Object> val = new ArrayList<>();
val.add(output);
response = new ArrayList<>();
response.add(val);
}
//else if(output instanceof HashMap) // this is our main map
else if(output instanceof Map) // this is our main map
{
Map<String, Object> map = (Map<String, Object>) output;
response = (List) map.get("data");
// get the columns
List<Object> columns = (List<Object>) map.get("columns");
actHeaders = mapColumns(interp, columns);
if(headers != null) // regular compose query
sync = sync(headers, actHeaders);
else if(qs instanceof HardSelectQueryStruct)
{
Object [] typesAndHeaders = getHeaderAndTypes(targetFrame);
// types and headers
types = (SemossDataType []) typesAndHeaders[0];
headers = (String []) typesAndHeaders[1];
sync = true;
}
}
else if(output instanceof List)
{
response = (List) output;
actHeaders = null;
sync = true;
}
PandasIterator pi = new PandasIterator(headers, response, types);
pi.setQuery(query);
pi.setTransform(actHeaders, !sync);
RawPandasWrapper rpw = new RawPandasWrapper();
rpw.setPandasIterator(pi);
retWrapper = rpw;
}
else // handling parquet format here
{
PandasParquetIterator ppi = new PandasParquetIterator(headers, output, types);
ppi.setQuery(query);
RawPandasParquetWrapper rpw = new RawPandasParquetWrapper();
rpw.setPandasParquetIterator(ppi);
retWrapper = rpw;
}
// clear it if it was !cache
if(!cache) {
// clean up the cache
// I have to do this everytime since I keep the keys at the py level as well
clearQueryCache();
}
}
else if(cache)
{
ci = queryCache.get(query);
RawCachedWrapper rcw = new RawCachedWrapper();
rcw.setIterator(ci);
retWrapper = rcw;
}
// set the actual headers so that it can be processed
// if not in sync then transform
return retWrapper;
}
private List<String> mapColumns(PandasInterpreter interp, List<Object> columns) {
List<String> newHeaders = new ArrayList<>();
Map<String, String> funcMap = interp.functionMap();
for(int colIndex = 0; colIndex < columns.size(); colIndex++) {
// every element here is either a plain column name or a [column, function] pair
Object item = columns.get(colIndex);
if(item instanceof List) {
List<Object> elem = (List<Object>) columns.get(colIndex);
String key = elem.get(1) + "" + elem.get(0);
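// e.g. an aggregate column arrives as ['Sales', 'sum'] -> key "sumSales"; functionMap is
// expected to resolve that key to the alias the interpreter assigned (names hypothetical)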
if(funcMap.containsKey(key)) {
newHeaders.add(funcMap.get(key));
} else {
newHeaders.add(elem.get(0)+"");
}
} else {
newHeaders.add(item+"");
}
}
return newHeaders;
}
@Override
public IQueryInterpreter getQueryInterpreter() {
PandasInterpreter interp = new PandasInterpreter();
interp.setDataTableName(this.getName(), this.wrapperFrameName);
interp.setDataTypeMap(this.metaData.getHeaderToTypeMap());
return interp;
}
/**
 * Run a script through the python translator and return its result
 * @param script
 * @return
 */
public Object runScript(String script) {
/*
py.command = script;
Object monitor = py.getMonitor();
Object response = null;
synchronized(monitor) {
try {
monitor.notify();
monitor.wait();
} catch (Exception ignored) {
}
if(script.length == 1) {
response = py.response.get(script[0]);
} else {
response = py.response;
}
}
commands.add(script[0]);
return response;
*/
//if(script.length == 1)
return pyt.runScript(script);
//else
// return pyt.runScript(script);
}
@Override
public long size(String tableName) {
if(isEmpty(tableName)) {
return 0;
}
String command = "len(" + tableName + ")";
Number num = (Number) pyt.runScript(command);
return num.longValue();
}
@Override
public void close() {
super.close();
// this should take the variable name and kill it
// if the user has created others, nothing can be done
logger.info("Removing variable " + this.frameName);
pyt.runScript("del " + this.frameName);
pyt.runScript("del " + this.wrapperFrameName);
if(!this.originalName.equals(this.frameName)) {
pyt.runScript("del " + this.originalName);
pyt.runScript("del " + this.originalWrapperFrameName);
}
// clear all the other frames added through sqlite
if(sqliteConnectionName != null)
pyt.runScript("del " + sqliteConnectionName);
pyt.runScript("gc.collect()");
}
@Override
public CachePropFileFrameObject save(String folderDir, Cipher cipher) throws IOException {
CachePropFileFrameObject cf = new CachePropFileFrameObject();
// save frame
String frameFilePath = folderDir + DIR_SEPARATOR + this.frameName + ".pkl";
cf.setFrameCacheLocation(frameFilePath);
// trying to write the pickle instead
frameFilePath = frameFilePath.replaceAll("\\\\", "/");
/*
pyt.runScript("import pickle");
String command = PandasSyntaxHelper.getWritePandasToPickle("pickle", this.frameName, frameFilePath);
pyt.runScript(command);
*/
String [] commands = new String[]{"import pickle", PandasSyntaxHelper.getWritePandasToPickle("pickle", this.frameName, frameFilePath)};
pyt.runEmptyPy(commands);
// also save the meta details
this.saveMeta(cf, folderDir, this.frameName, cipher);
return cf;
}
@Override
public void open(CachePropFileFrameObject cf, Cipher cipher) {
// open the meta details
this.openCacheMeta(cf, cipher);
// this will get set when we open the cf
setName(this.frameName);
// set the wrapper frame name once the frame name is set
this.wrapperFrameName = getWrapperName();
String [] commands = new String[]{PANDAS_IMPORT_STRING, "import pickle", "import smssutil",
PandasSyntaxHelper.getReadPickleToPandas(PANDAS_IMPORT_VAR, cf.getFrameCacheLocation(),
this.frameName),PandasSyntaxHelper.makeWrapper(this.wrapperFrameName, this.frameName)};
pyt.runEmptyPy(commands);
}
@Override
public boolean isEmpty() {
return isEmpty(this.frameName);
}
public boolean isEmpty(String tableName) {
String wrapperName = PandasSyntaxHelper.createFrameWrapperName(tableName);
String command = "( ('"+wrapperName+"' in vars() or '"+wrapperName+"' in globals()) and len("+wrapperName+".cache['data'])>=0 )";
Object notEmpty = pyt.runScript(command);
Boolean notEmptyResult = null;
try {
notEmptyResult = (Boolean) notEmpty;
} catch (java.lang.ClassCastException e) {
notEmptyResult = Boolean.valueOf((String) notEmpty);
}
return !notEmptyResult;
}
@Override
public DataFrameTypeEnum getFrameType() {
return DataFrameTypeEnum.PYTHON;
}
@Override
public String getDataMakerName() {
return DATA_MAKER_NAME;
}
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
@Override
public void addRow(Object[] cleanCells, String[] headers) {
// TODO Auto-generated method stub
}
@Override
public void removeColumn(String columnHeader) {
// TODO Auto-generated method stub
}
@Override
public void processDataMakerComponent(DataMakerComponent component) {
// TODO Auto-generated method stub
}
public void setTranslator(PyTranslator pyt) {
this.pyt = pyt;
pyt.setLogger(logger);
}
public PyTranslator getTranslator()
{
return pyt;
}
/**
* Recreate the metadata for this existing frame
*/
public void recreateMeta() {
logger.info("Getting the columns for :" + frameName);
String[] colNames = pyt.getStringArray(PandasSyntaxHelper.getColumns(frameName));;
pyt.runScript(PandasSyntaxHelper.cleanFrameHeaders(frameName, colNames));
logger.info("Getting the column types for :" + frameName);
String[] colTypes = pyt.getStringArray(PandasSyntaxHelper.getTypes(frameName));
//clean headers
HeadersException headerChecker = HeadersException.getInstance();
colNames = headerChecker.getCleanHeaders(colNames);
// grab the existing metadata from the frame
Map additionalDataTypes = this.metaData.getHeaderToAdtlTypeMap();
Map<String, List<String>> sources = this.metaData.getHeaderToSources();
Map complexSelectors = this.metaData.getComplexSelectorsMap();
// close existing meta
// make a new one
// load into it
this.metaData.close();
this.metaData = new OwlTemporalEngineMeta();
ImportUtility.parseTableColumnsAndTypesToFlatTable(this.metaData, colNames, colTypes, getName(), additionalDataTypes, sources, complexSelectors);
// clear the cached info
this.clearCachedMetrics();
this.clearQueryCache();
}
/**
* Update the wrapper frame with the actual frame object
*/
public void replaceWrapperDataFromFrame() {
pyt.runScript(wrapperFrameName + ".cache['data'] = " + frameName );
}
@Override
public Object querySQL(String sql)
{
// columns
// types
// data
sql = org.apache.commons.text.StringEscapeUtils.unescapeHtml3(sql);
if(sql.startsWith("NLP:") || sql.startsWith("nlp:"))
{
// convert this to sql
sql = getSQLFromNLP(sql);
}
if(sql.trim().toUpperCase().startsWith("SELECT")) {
Map<String, Object> retMap = new HashMap<>();
String tempFrameName = Utility.getRandomString(5);
String loadsqlDF = "";
String makeNewFrame = "";
sql = sql.replace("\"", "\\\"");
/* alternate to use sqlite directly */
String connName = getSQLite();
makeNewFrame = tempFrameName + " = pd.read_sql(\"" + sql + "\", " + connName +")";
/********/
// don't load the sql df every time
//loadsqlDF = "from pandasql import sqldf";
//makeNewFrame = tempFrameName + "= sqldf(\"" + sql.replace("\"", "\\\"") + "\")";
String addColumnTypes = tempFrameName + "_types = " + tempFrameName + ".dtypes.to_dict()";
String dict = tempFrameName + "_dict = " + tempFrameName + ".to_dict('split')";
String dictColumns = tempFrameName + "_dict['types'] = " + tempFrameName + "_types";
String deleteAll = "del " + tempFrameName + ", " + tempFrameName + "_types, " + tempFrameName + "_dict";
pyt.runEmptyPy(loadsqlDF, makeNewFrame, addColumnTypes, dict, dictColumns);
Object retObject = pyt.runScript(tempFrameName + "_dict"); // get the dictionary back
// will delete later
pyt.runEmptyPy(deleteAll);
if(retObject instanceof Map) {
System.err.println("Valid Output");
retMap = (Map<String, Object>) retObject;
}
// convert the pandas dtype names to java classes
Map<String, Object> typeMap = (Map<String, Object>) retMap.get("types");
Iterator<String> keys = typeMap.keySet().iterator();
while(keys.hasNext()) {
String column = keys.next();
String value = (String) typeMap.get(column);
if(pyJ.containsKey(value)) {
typeMap.put(column, pyJ.get(value));
} else {
typeMap.put(column, java.lang.String.class);
}
}
List<String> columns = (List<String>) retMap.get("columns");
String [] colArray = new String[columns.size()];
Object [] typesArray = new Object[columns.size()];
for(int columnIndex = 0;columnIndex < columns.size();columnIndex++)
{
String thisColumn = columns.get(columnIndex);
Object colType = typeMap.get(thisColumn);
typesArray[columnIndex] = colType;
colArray[columnIndex] = thisColumn;
}
retMap.put("columns", colArray);
retMap.put("types", typesArray);
return retMap;
} else {
Map<String, Object> retMap = new HashMap<>();
String [] commands = sql.split("\\R");
// execute each command and collect the result
String [] columns = new String [] {"Command", "Output"};
Object [] types = new Object [] {java.lang.String.class, java.lang.String.class};
List<List<Object>> data = new ArrayList<>();
for(int commandIndex = 0; commandIndex < commands.length; commandIndex++)
{
List<Object> row = new ArrayList<>();
String thisCommand = commands[commandIndex];
Object output = pyt.runScript(thisCommand);
row.add(thisCommand);
row.add(output);
data.add(row);
}
retMap.put("data", data);
retMap.put("types", types);
retMap.put("columns", columns);
return retMap;
}
}
public Object queryCSV(String sql)
{
// columns
// types
// data
sql = org.apache.commons.text.StringEscapeUtils.unescapeHtml3(sql);
if(sql.startsWith("NLP:") || sql.startsWith("nlp:"))
{
// convert this to sql
sql = getSQLFromNLP(sql);
}
if(sql.toUpperCase().startsWith("SELECT"))
{
try
{
String frameName = Utility.getRandomString(5);
File fileName = new File(Utility.getInsightCacheDir(), frameName + ".csv");
String fileNameStr = fileName.getAbsolutePath().replace("\\", "/");
// old way
String loadsqlDF = "";
//String loadsqlDF = "from pandasql import sqldf";
//String newFrame = "sqldf('" + sql + "').to_csv('" + fileNameStr + "', index=False)";
/* alternate to use sqlite directly */
// new way
sql = sql.replace("\"", "\\\"");
String connName = getSQLite();
String newFrame = "pd.read_sql(\"" + sql + "\", " + connName +").to_csv('"+ fileNameStr + "', index=False)";
// nothing to delete
pyt.runEmptyPy(loadsqlDF, newFrame);
Object retObject = "no data";
if(fileName.exists())
{
//retObject = new String(Files.readAllBytes(fileName.toPath())); // get the dictionary back
//fileName.delete(); // delete the generated file
return fileName;
}
//return retObject;
}catch(Exception ex)
{
classLogger.error(Constants.STACKTRACE, ex);
}
}
else
{
String frameName = Utility.getRandomString(5);
File fileName = new File(Utility.getInsightCacheDir(), frameName + ".csv");
try {
PrintWriter bw = new PrintWriter(new FileWriter(fileName));
bw.write("Command, Output");
bw.println();
String [] commands = sql.split("\\R");
// execute each command and write out a command,output row per command
for(int commandIndex = 0; commandIndex < commands.length; commandIndex++)
{
String thisCommand = commands[commandIndex];
Object output = this.pyt.runPyAndReturnOutput(thisCommand);
bw.write(thisCommand);
bw.print(", ");
bw.print(output);
bw.println();
}
bw.flush();
bw.close();
} catch (IOException e) {
// TODO Auto-generated catch block
classLogger.error(Constants.STACKTRACE, e);
}
return fileName;
}
return null;
}
// write this as a file and pump it out
// no need to serialize and deserialize this
public Object queryJSON(String sql)
{
// columns
// types
// data
sql = org.apache.commons.text.StringEscapeUtils.unescapeHtml3(sql);
if(sql.startsWith("NLP:") || sql.startsWith("nlp:"))
{
// convert this to sql
sql = getSQLFromNLP(sql);
}
if(sql.toUpperCase().startsWith("SELECT"))
{
File fileName = new File(Utility.getInsightCacheDir(), frameName + ".json");
String fileNameStr = fileName.getAbsolutePath().replace("\\", "/");
//String loadsqlDF = "from pandasql import sqldf";
//String frameName = Utility.getRandomString(5);
//String newFrame = frameName + "= sqldf('" + sql + "')";
//String deleteAll = "delete " + frameName;
//pyt.runEmptyPy(loadsqlDF, newFrame, dict);
//String dict = frameName + ".to_json('" + fileNameStr + "', orient='records')";
//pyt.runEmptyPy(deleteAll);
// new way
sql = sql.replace("\"", "\\\"");
String connName = getSQLite();
String newFrame = "pd.read_sql(\"" + sql + "\", " + connName +").to_json('"+ fileNameStr + "', orient='records')";
pyt.runEmptyPy(newFrame);
if(fileName.exists())
return fileName;
}
else
{
Map<String, Object> retMap = new HashMap<>();
String [] commands = sql.split("\\R");
// execute each command and collect the result
String [] columns = new String [] {"Command", "Output"};
Object [] types = new Object [] {java.lang.String.class, java.lang.String.class};
List<List<Object>> data = new ArrayList<>();
for(int commandIndex = 0; commandIndex < commands.length; commandIndex++)
{
List<Object> row = new ArrayList<>();
String thisCommand = commands[commandIndex];
Object output = pyt.runScript(thisCommand);
row.add(thisCommand);
row.add(output);
data.add(row);
}
retMap.put("data", data);
retMap.put("types", types);
retMap.put("columns", columns);
return retMap;
}
return null;
}
// recalibrate variables
public void recalculateVariables(String [] formulas, String oldName, String newName)
{
// this is a string replacement unfortunately
// this is again why we may need better frame variable names
for(int varIndex = 0; varIndex < formulas.length; varIndex++)
{
String thisVar = formulas[varIndex];
thisVar = thisVar.replace(oldName, newName);
System.err.println("Running command " + thisVar);
pyt.runScript(thisVar);
}
}
public String getSQLite()
{
// create a full sqlite instance
// so that we can start querying it without connection issues
if(sqliteConnectionName == null)
{
sqliteConnectionName = "conn_" + frameName;
String [] commands = new String[3];
commands[0] = "import sqlite3";
commands[1] = sqliteConnectionName + " = sqlite3.connect(':memory:', check_same_thread=False)";
commands[2] = frameName + ".to_sql('" + frameName + "', " + sqliteConnectionName + ", if_exists='replace', index=False)";
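// once built, sql can be run against the frame through pandas, e.g. (query hypothetical):
//   pd.read_sql("SELECT * FROM myFrame", conn_myFrame)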
this.pyt.runEmptyPy(commands);
}
return sqliteConnectionName;
}
private String getSQLFromNLP(String query)
{
//ITableDataFrame thisFrame = frameIterator.next();
query = query.substring(query.indexOf(":") + 1);
StringBuffer finalDbString = new StringBuffer();
logger.info("Processing frame " + this.getName());
finalDbString.append("#").append(this.getName()).append("(");
String [] columns = this.getColumnHeaders();
// if the frame is pandas frame get the data
// we will get to this shortly
for(int columnIndex = 0;columnIndex < columns.length;columnIndex++)
{
if(columnIndex == 0)
finalDbString.append(columns[columnIndex]);
else
finalDbString.append(" , ").append(columns[columnIndex]);
}
finalDbString.append(")\\n");
finalDbString.append("#\\n").append("### A query to list ").append(query).append("\\n").append("SELECT");
logger.info("executing query " + finalDbString);
Object output = this.pyt.runScript("smssutil.run_gpt_3(\"" + finalDbString + "\", " + 150 + ")");
// get the string
// make a frame
// load the frame into insight
logger.info("SQL query is " + output);
//Create a new SQL Data Frame
String sqlDFQuery = output.toString().trim();
// remove the new line
sqlDFQuery = sqlDFQuery.replace("\n", " ");
return sqlDFQuery;
}
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
/*
* Unused methods...
*/
// private void alignColumns(String tableName, Map dataTypeMap) {
// // look at what are the column types
// // see if they are the same as data type map
//
// String colScript = PandasSyntaxHelper.getColumns(tableName);
// String typeScript = PandasSyntaxHelper.getTypes(tableName);
//
// Hashtable columnTypes = (Hashtable)pyt.runScript(colScript, typeScript);
// // get the column names
// ArrayList cols = (ArrayList)columnTypes.get(colScript);
// ArrayList types = (ArrayList)columnTypes.get(typeScript);
//
// List colChanger = new ArrayList();
//
// for(int colIndex = 0;colIndex < cols.size();colIndex++)
// {
// String colName = cols.get(colIndex);
// String colType = types.get(colIndex);
//
// SemossDataType pysColType = (SemossDataType)pyS.get(colType);
//
// SemossDataType sColType = dataTypeMap.get(colName);
//
// colChanger = checkColumn(tableName, colName, pysColType, sColType, colChanger);
// }
//
// pyt.runScript((String [])colChanger.toArray());
// }
// private List checkColumn(String tableName, String colName, SemossDataType curType, SemossDataType newType, List colChanger) {
// if(curType == newType) {
// return colChanger;
// } else {
// // get the pytype
// String pyType = (String)spy.get(curType);
//
// // get the as type
// String asType = (String)spy.get(pyType);
//
// // column change
// String script = PandasSyntaxHelper.getColumnChange(tableName, colName, asType);
// colChanger.add(script);
//
// return colChanger;
// }
// }
// public void setDataTypeMap(Map dataTypeMap) {
// this.dataTypeMap = dataTypeMap;
// interp.setDataTypeMap(dataTypeMap);
// }
}