Please wait. This can take some minutes ...
Many resources are needed to download a project. Please understand that we have to compensate our server costs. Thank you in advance.
Project price only 1 $
You can buy this project and download/modify it how often you want.
prerna.ds.py.PandasSyntaxHelper Maven / Gradle / Ivy
package prerna.ds.py;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Vector;
import prerna.algorithm.api.SemossDataType;
import prerna.om.HeadersException;
import prerna.poi.main.helper.CSVFileHelper;
import prerna.poi.main.helper.excel.ExcelRange;
import prerna.sablecc2.om.Join;
public class PandasSyntaxHelper {
public static final List OPERATOR_LIST = new ArrayList(6);
static {
OPERATOR_LIST.add(">");
OPERATOR_LIST.add("<");
OPERATOR_LIST.add(">=");
OPERATOR_LIST.add("<=");
OPERATOR_LIST.add("==");
OPERATOR_LIST.add("!=");
}
private static final String NAN_VALUES = "[\"\",\"#N/A\",\"#N/A N/A\",\"#NA\",\"-1.#IND\",\"\",\"N/A\",\"NULL\""
+ "\"n/a\",\"null\",\"-1.#QNAN\",\"-NaN\",\"-nan\",\"1.#IND\",\"1.#Q.#NAN\",\"NA\",\"NaN\",\"nan\"]";
private static final String EQUAL = "==";
private static final String NOT_EQUAL = "!=";
private PandasSyntaxHelper() {
}
/**
* Get the syntax to construct get the wrapper for a pandas frame
*
* @param wrapper
* @param tableName
* @return
*/
public static String makeWrapper(String wrapper, String tableName) {
return wrapper + " = PyFrame.makefm(" + tableName + ")";
}
/**
* Get the name of the wrapper variable to be used
*
* @param frameName
* @return
*/
public static String createFrameWrapperName(String frameName) {
return frameName + "w";
}
/**
* Execute a .py file
*
* @param fileLocation
* @return
*/
public static String execFile(String fileLocation) {
return "execfile('" + fileLocation.replaceAll("\\\\+", "/") + "')";
}
/**
* Get the syntax to load a csv file Defaults the encoding to utf-8
*
* @param pandasImportVar
* @param numpyImportVar
* @param fileLocation
* @param tableName
* @return
*/
public static String getCsvFileRead(String pandasImportVar, String numpyImportVar, String fileLocation, String tableName) {
return getCsvFileRead(pandasImportVar, numpyImportVar, fileLocation, tableName, null, null, null);
}
/**
* Get the syntax to load a csv file
*
* @param pandasImportVar
* @param numpyImportVar
* @param fileLocation
* @param tableName
* @param sep
* @param quoteChar
* @param escapeChar
* @return
*/
public static String getCsvFileRead(String pandasImportVar, String numpyImportVar, String fileLocation,
String tableName, String sep, String quoteChar, String escapeChar) {
if (sep == null || sep.isEmpty()) {
sep = ",";
}
// default to double quote
if(quoteChar == null) {
quoteChar = "\"";
}
// default to \\ in python
if(escapeChar == null) {
escapeChar = "\\\\";
}
return getCsvFileRead(pandasImportVar, numpyImportVar, fileLocation, tableName, sep, quoteChar, escapeChar, null);
}
/**
* Get the syntax to load a CSV file that preserves leading zeros.
*
* @param pandasImportVar
* @param numpyImportVar
* @param fileLocation
* @param tableName
* @param sep
* @param quoteChar
* @param escapeChar
* @param encoding
* @return
*/
public static String getCsvFileRead(String pandasImportVar, String numpyImportVar, String fileLocation,
String tableName, String sep, String quoteChar, String escapeChar, String encoding) {
if (encoding == null || encoding.isEmpty()) {
encoding = "utf-8";
}
StringBuffer readCsv = buildReadCsv(pandasImportVar, numpyImportVar, fileLocation, tableName, sep, quoteChar, escapeChar, encoding);
return readCsv.toString();
}
/**
* Builds the script to read in a CSV file to a pandas data frame. Code preserves leading zeros of integers. Returned data frame can have mixed
* columns of floats and ints, but if there is a string present, the column type is returned as an object.
*
* @param pandasImportVar
* @param numpyImportVar
* @param fileLocation
* @param tableName
* @param sep
* @param quoteChar
* @param escapeChar
* @param encoding
* @return
*/
public static StringBuffer buildReadCsv(String pandasImportVar, String numpyImportVar, String fileLocation,
String tableName, String sep, String quoteChar, String escapeChar, String encoding) {
StringBuffer script = new StringBuffer();
StringBuffer replace = new StringBuffer(".replace(");
StringBuffer converter = new StringBuffer("converters={i: lambda x: x.strip() if (isinstance(");
StringBuffer numeric = new StringBuffer("");
numeric.append(pandasImportVar).append(".to_numeric(x,errors='ignore')");
replace.append(NAN_VALUES).append(",").append(numpyImportVar).append(".nan)");
converter.append(numeric).append(",").append(numpyImportVar).append(".integer) and x.strip()[0] == '0' and len(x.strip()) > 1) else ")
.append(numeric).append(" for i in range(" + numberOfColumns(fileLocation) + ")}");
script.append(tableName).append(" =").append(pandasImportVar).append(".read_csv('").append(fileLocation.replaceAll("\\\\+", "/"))
.append("',sep='" + sep + "',encoding='" + encoding + "',")
.append(converter + ")").append(replace)
.append(".apply(lambda x: x.astype(str) if (any(x.map(type) == str)) else x)").append(replace);
return script;
}
/**
* Returns the number of columns in a csv file.
*
* @param filePath
* @return
*/
public static Number numberOfColumns(String filePath) {
CSVFileHelper csv = new CSVFileHelper();
csv.parse(filePath);
return csv.getHeaders().length;
}
/**
* Reads a CSV using pandas. less greedy / dynamic version.
* @param pandasImportVar
* @param numpyImportVar
* @param fileLocation
* @param tableName
* @param sep
* @param encoding
* @param dataTypeMaps
* @return
*/
public static String getCsvFileRead(String pandasImportVar, String numpyImportVar, String fileLocation, String tableName, String sep,
String quoteChar, String escapeChar, String encoding, Map dataTypeMaps, long limit) {
fileLocation=fileLocation.replaceAll("\\\\+", "/");
StringBuffer script = new StringBuffer();
if (encoding == null || encoding.isEmpty()) {
encoding = ",encoding='utf-8',encoding_errors='backslashreplace'";
} else {
// we will wrap this
encoding = ",encoding='"+encoding+"',encoding_errors='backslashreplace'";
}
if (dataTypeMaps == null || dataTypeMaps.isEmpty()) {
String nrows = "";
if(limit > 0) {
nrows = ",nrows="+limit;
}
script.append(tableName).append("=").append(pandasImportVar).append(".read_csv('").append(fileLocation)
.append("',sep='"+sep+"',quotechar='"+quoteChar+"',escapechar='"+escapeChar+"'"+encoding+nrows+")");
} else {
script = buildReadCsv(pandasImportVar, numpyImportVar, fileLocation, tableName,
sep, quoteChar, escapeChar, encoding, dataTypeMaps, limit);
}
return script.toString();
}
/**
* Builds the pandas CSV read method. Less greedy approach that accounts for date parsing.
* https://pandas.pydata.org/pandas-docs/version/0.15.1/generated/pandas.read_csv.html
* @param pandasImportVar
* @param numpyImportVar
* @param fileLocation
* @param tableName
* @param sep
* @param quoteChar
* @param escapeChar
* @param encoding
* @param dataTypeMaps
* @return
*/
private static StringBuffer buildReadCsv(String pandasImportVar, String numpyImportVar, String fileLocation,
String tableName, String sep, String quoteChar, String escapeChar, String encoding, Map dataTypeMaps, long limit) {
StringBuffer dataMap = new StringBuffer();
StringBuffer dateList = new StringBuffer();
for (String column : dataTypeMaps.keySet()) {
Object inputType = dataTypeMaps.get(column);
SemossDataType dataType = null;
if(inputType instanceof SemossDataType) {
dataType = (SemossDataType) inputType;
} else {
dataType = SemossDataType.convertStringToDataType(inputType + "");
}
if (dataType == null) {
continue;
}
if (dataType == SemossDataType.DATE || dataType == SemossDataType.TIMESTAMP) {
if (dateList.length() > 0) {
dateList.append(",");
}
dateList.append("'").append(column).append("'");
} else {
if (dataMap.length() > 0) {
dataMap.append(",");
}
dataMap.append("'").append(column).append("':").append(convertSemossDataType(numpyImportVar, dataType));
}
}
dataMap = new StringBuffer("{").append(dataMap).append("}");
dateList = new StringBuffer("[").append(dateList).append("]");
String nrows = "";
if(limit > 0) {
nrows = ",nrows="+limit;
}
// we assume file location is already clean in this method
StringBuffer sb = new StringBuffer();
sb.append(tableName).append("=").append(pandasImportVar).append(".read_csv('").append(fileLocation)
.append("',sep='"+sep+"',quotechar='"+quoteChar+"',escapechar='"+escapeChar+"',dtype="
+ dataMap.toString()).append(",parse_dates=").append(dateList)
// note, we assume encoding is the actual ,encoding='value' format
.append(",infer_datetime_format=True"+encoding+nrows+")");
return sb;
}
/**
* Reads a JSON using pandas. less greedy / dynamic version.
* @param pandasImportVar
* @param numpyImportVar
* @param fileLocation
* @param tableName
* @param dataTypeMaps
* @return
*/
public static String getJsonFileRead(String pandasImportVar, String numpyImportVar, String fileLocation, String tableName, Map dataTypeMaps) {
StringBuffer script = new StringBuffer();
if (dataTypeMaps == null || dataTypeMaps.isEmpty()) {
script.append(tableName).append("=").append(pandasImportVar).append(".read_json('").append(fileLocation.replaceAll("\\\\+", "/"));
} else {
script = buildReadJson(pandasImportVar, numpyImportVar, fileLocation, tableName, dataTypeMaps);
}
return script.toString();
}
/**
* Builds the pandas JSON read method. Less greedy approach that accounts for date parsing.
* https://pandas.pydata.org/pandas-docs/version/0.15.1/generated/pandas.read_csv.html
* @param pandasImportVar
* @param numpyImportVar
* @param fileLocation
* @param tableName
* @param sep
* @param quoteChar
* @param escapeChar
* @param encoding
* @param dataTypeMaps
* @return
*/
public static StringBuffer buildReadJson(String pandasImportVar, String numpyImportVar, String fileLocation, String tableName, Map dataTypeMaps) {
StringBuffer sb = new StringBuffer();
StringBuffer dataMap = new StringBuffer();
StringBuffer dateList = new StringBuffer();
for (String column : dataTypeMaps.keySet()) {
Object inputType = dataTypeMaps.get(column);
SemossDataType dataType = null;
if(inputType instanceof SemossDataType) {
dataType = (SemossDataType) inputType;
} else {
dataType = SemossDataType.convertStringToDataType(inputType + "");
}
if (dataType == null) {
continue;
}
if (dataType == SemossDataType.DATE || dataType == SemossDataType.TIMESTAMP) {
if (dateList.length() > 0) {
dateList.append(",");
}
dateList.append("'").append(column).append("'");
} else {
if (dataMap.length() > 0) {
dataMap.append(",");
}
dataMap.append("'").append(column).append("':").append(convertSemossDataType(numpyImportVar, dataType));
}
}
dataMap = new StringBuffer("{").append(dataMap).append("}");
dateList = new StringBuffer("[").append(dateList).append("]");
sb.append(tableName).append("=").append(pandasImportVar).append(".read_json('").append(fileLocation.replaceAll("\\\\+", "/"))
.append("',dtype="+ dataMap.toString()).append(",convert_dates=").append(dateList).append(")");
return sb;
}
/**
* Maps Semoss data types to Pandas / Numpy data types.
* @param numpyImportVar
* @param type
* @return
*/
public static String convertSemossDataType(String numpyImportVar, Object inputType) {
// account for string or enum type
SemossDataType type = null;
if(inputType instanceof SemossDataType) {
type = (SemossDataType) inputType;
} else {
type = SemossDataType.convertStringToDataType(inputType + "");
}
if (type == SemossDataType.INT) {
return numpyImportVar + ".float64";
} else if (type == SemossDataType.DOUBLE) {
return numpyImportVar + ".float64";
} else if(type == SemossDataType.BOOLEAN) {
return "object";
// return numpyImportVar + ".bool";
} if (type == SemossDataType.DATE) {
return "object";
//return numpyImportVar + ".datetime64";
} else if (type == SemossDataType.TIMESTAMP) {
//return numpyImportVar + ".datetime64";
return "object";
} else {
return "object";
}
}
/**
*
* @param tableName
* @param fileLocation
* @return
*/
public static String getWriteCsvFile(String tableName, String fileLocation) {
return getWriteCsvFile(tableName, fileLocation, null);
}
/**
*
* @param tableName
* @param fileLocation
* @param sep
* @return
*/
public static String getWriteCsvFile(String tableName, String fileLocation, String sep) {
if (sep == null || sep.isEmpty()) {
sep = ",";
}
return getWriteCsvFile(tableName, fileLocation, sep, null);
}
/**
*
* @param tableName
* @param fileLocation
* @param sep
* @param encoding
* @return
*/
public static String getWriteCsvFile(String tableName, String fileLocation, String sep, String encoding) {
if (encoding == null || encoding.isEmpty()) {
encoding = "utf-8";
}
String readCsv = tableName + ".to_csv('" + fileLocation.replaceAll("\\\\+", "/") + "', sep='" + sep
+ "', encoding='" + encoding + "', index=False)";
return readCsv;
}
public static String loadExcelSheet(String pandasImportVar, String fileLocation, String tableName, String sheetName,
String sheetRange) {
int[] rangeIndicies = ExcelRange.getSheetRangeIndex(sheetRange);
int startCol = rangeIndicies[0];
int startRow = rangeIndicies[1];
int endCol = rangeIndicies[2];
int endRow = rangeIndicies[3];
fileLocation = fileLocation.replace("\\", "/");
StringBuffer sb = new StringBuffer();
sb.append(tableName + " = " + pandasImportVar + ".read_excel('" + fileLocation + "',");
// add column range
sb.append("sheet_name='" + sheetName + "', usecols=range(" + (startCol - 1) + ", " + endCol + "), ");
// add row range
int rowNum = endRow - startRow;
sb.append("skiprows = " + (startRow - 1) + ", nrows=" + rowNum + ")");
return sb.toString();
}
public static String getWritePandasToPickle(String pickleVarName, String tableName, String fileLocation) {
return pickleVarName + ".dump(" + tableName + ", open(\"" + fileLocation.replaceAll("\\\\+", "/") + "\", \"wb\"))";
}
public static String getReadPickleToPandas(String pandasImportVar, String fileLocation, String tableName) {
return tableName + " = " + pandasImportVar + ".read_pickle(\"" + fileLocation.replaceAll("\\\\+", "/") + "\")";
// return tableName + " = " + pickleVarName + ".load(open(\"" + fileLocation.replaceAll("\\\\+", "/") + "\", \"wb\"))";
}
/**
* Reads a Parquet File using pandas. less greedy / dynamic version.
* @param pandasImportVar
* @param numpyImportVar
* @param fileLocation
* @param tableName
* @param dataTypeMaps
* @return
*/
public static String getParquetFileRead(String pandasImportVar, String numpyImportVar, String fileLocation, String tableName) {
StringBuffer script = new StringBuffer();
script.append(tableName).append("=").append(pandasImportVar).append(".read_parquet('").append(fileLocation.replaceAll("\\\\+", "/"))
.append("')");
return script.toString();
}
/** Merges two dataframes. if nonEqui is true, implicitly performs a join by taking the cross product.
*
* @param leftTableName
* @param rightTableName
* @param joinType
* @param joinCols
* @return
*/
public static String getMergeSyntax(String pandasFrameVar, String returnTable, String leftTableName,
String rightTableName, String joinType, List> joinCols,boolean nonEqui) {
/*
* joinCols = [ {leftTable.Title -> rightTable.Movie} , {leftTable.Genre ->,
* rightTable.Genre} ]
*/
StringBuffer builder = new StringBuffer();
if (!nonEqui) {
builder.append(returnTable).append(" = ").append(pandasFrameVar).append(".merge(").append(leftTableName)
.append(", ").append(rightTableName).append(", left_on=[");
getMergeColsSyntax(builder, joinCols, true);
builder.append("], right_on=[");
getMergeColsSyntax(builder, joinCols, false);
if (joinType.equals("inner.join")) {
builder.append("], how=\"inner\")");
} else if (joinType.equals("left.outer.join")) {
builder.append("], how=\"left\")");
} else if (joinType.equals("right.outer.join")) {
builder.append("], how=\"right\")");
} else if (joinType.equals("outer.join")) {
builder.append("], how=\"outer\")");
}
} else {
builder.append(returnTable).append("=").append(pandasFrameVar).append(".merge(").append(leftTableName).append(".assign(key=0),")
.append(rightTableName).append(".assign(key=0),on='key').drop('key',axis=1)");
}
return builder.toString();
}
public static String getMergeFilterSyntax(String tableName, List> joinCols, List joinComparators) {
StringBuffer builder = new StringBuffer();
StringBuffer dropSyntax = new StringBuffer();
for (int i = 0; i < joinCols.size(); i++) {
if (builder.length() > 0) {
builder.append(" & ");
}
if (dropSyntax.length() > 0) {
dropSyntax.append(", ");
}
Map joinMap = joinCols.get(i);
for (String lColumn : joinMap.keySet()) {
String rColumn = joinMap.get(lColumn);
builder.append("(").append(tableName).append("['").append(lColumn).append("']")
.append(joinComparators.get(i)).append(tableName).append("['").append(rColumn).append("'])");
if (rColumn.equals(lColumn + "_CTD")) {
dropSyntax.append("'" + rColumn + "'");
}
}
}
builder = new StringBuffer(tableName).append("=").append(tableName).append(".loc[(").append(builder).append(")]");
if (dropSyntax.length() > 0) {
builder.append(".drop([").append(dropSyntax).append("],axis=1)");
}
return builder.toString();
}
/**
* Get the correct py syntax for the table join columns This uses the join
* information keys since it is {leftCol -> rightCol}
*
* @param builder
* @param colNames
*/
public static void getMergeColsSyntax(StringBuffer builder, List> colNames, boolean grabKeys) {
// iterate through the map
boolean firstLoop = true;
int numJoins = colNames.size();
for (int i = 0; i < numJoins; i++) {
Map joinMap = colNames.get(i);
// just in case an empty map is passed
// we do not want to modify the firstLoop boolean
if (joinMap.isEmpty()) {
continue;
}
if (!firstLoop) {
builder.append(",");
}
// this should really be 1
// since each join between 2 columns is its own map
Collection tableCols = null;
if (grabKeys) {
tableCols = joinMap.keySet();
} else {
tableCols = joinMap.values();
}
// keep track of where to add a ","
int counter = 0;
int numCols = tableCols.size();
for (String colName : tableCols) {
builder.append("\"").append(colName).append("\"");
if (counter + 1 != numCols) {
builder.append(",");
}
counter++;
}
firstLoop = false;
}
}
public static String alterColumnName(String tableName, String oldHeader, String newHeader) {
return tableName + ".rename(columns={'" + oldHeader + "':'" + newHeader + "'}, inplace=True)";
}
public static String alterColumnNames(String tableName, String[] oldHeaders, String[] newHeaders) {
// create loop to generate key value pair
Map headerMapping = new HashMap<>();
for(int i = 0; i < oldHeaders.length; i++) {
headerMapping.put(oldHeaders[i], newHeaders[i]);
}
return alterColumnNames(tableName, headerMapping);
}
public static String alterColumnNames(String tableName, Map oldNewHeaders) {
// create loop to generate key value pair
int numRenames = oldNewHeaders.size();
StringBuilder keyValues = new StringBuilder();
int i = 1;
for (String oldHeader : oldNewHeaders.keySet()) {
String newHeader = oldNewHeaders.get(oldHeader);
if (i == numRenames) {
keyValues.append("'" + oldHeader + "':'" + newHeader + "'");
break;
} else {
keyValues.append("'" + oldHeader + "':'" + newHeader + "',");
}
i++;
}
return tableName + ".rename(columns={" + keyValues.toString() + "}, inplace=True)";
}
/**
* Generate an alter statement to add new columns, taking into consideration
* joins and new column alias's
*
* @param tableName
* @param existingColumns
* @param newColumns
* @param joins
* @param newColumnAlias
* @return
*/
public static String alterMissingColumns(String tableName, String[] curHeaders,
Map newColumnsToTypeMap, List joins, Map newColumnAlias) {
List newColumnsToAdd = new Vector();
List newColumnsToAddTypes = new Vector();
// get all the join columns
List joinColumns = new Vector();
for (Join j : joins) {
String columnName = j.getRColumn();
if (columnName.contains("__")) {
columnName = columnName.split("__")[1];
}
joinColumns.add(columnName);
}
for (String newColumn : newColumnsToTypeMap.keySet()) {
SemossDataType newColumnType = newColumnsToTypeMap.get(newColumn);
// modify the header
if (newColumn.contains("__")) {
newColumn = newColumn.split("__")[1];
}
// if its a join column, ignore it
if (joinColumns.contains(newColumn)) {
continue;
}
// not a join column
// check if it has an alias
// and then add
if (newColumnAlias.containsKey(newColumn)) {
newColumnsToAdd.add(newColumnAlias.get(newColumn));
} else {
newColumnsToAdd.add(newColumn);
}
// and store the type at the same index
// in its list
newColumnsToAddTypes.add(newColumnType);
}
// TODO: account for column types
StringBuffer command = new StringBuffer(tableName).append(".reindex(columns=[");
// add current headers
for (int i = 0; i < curHeaders.length; i++) {
command.append("\"").append(curHeaders[i]).append("\",");
}
// add the new headers
int numNew = newColumnsToAdd.size();
for (int i = 0; i < numNew; i++) {
String newCol = newColumnsToAdd.get(i);
SemossDataType newColType = newColumnsToAddTypes.get(i);
command.append("\"").append(newCol).append("\"");
if ((i + 1) < numNew) {
command.append(",");
}
// if(newColType == SemossDataType.DOUBLE) {
// command.append(newColSyntax).append(" <- as.numeric(").append(newColSyntax).append(");");
// } else if(newColType == SemossDataType.INT) {
// command.append(newColSyntax).append(" <- as.integer(").append(newColSyntax).append(");");
// } else {
// command.append(newColSyntax).append(" <- as.character(").append(newColSyntax).append(");");
// }
}
command.append("])");
return command.toString();
}
// gets the number of rows in a given data frame
public static String getDFLength(String tableName) {
String dfLength = "len(" + tableName + ".index)";
return dfLength;
}
// gets all the columns as an array
public static String getColumns(String tableName) {
String cols = "list(" + tableName + ".columns)";
return cols;
}
// get all types
// this needs to be used in combination with get columns
public static String getTypes(String tableName) {
String types = "list(" + tableName + ".dtypes.astype(str))";
return types;
}
public static String getColumnType(String tableName, String column) {
String types = tableName + "['" + column + "'].dtype.name";
return types;
}
// gets the record
public static String getColumnChange(String tableName, String colName, String type) {
String typeChanger = tableName + "['" + colName + "'] = " + tableName + "['" + colName + "'].astype('" + type
+ "')";
return typeChanger;
}
/**
* Remove duplicate columns in a frame based on column names
*
* @param tableName the frame with the duplicate column names
* @param newTable the new frame to set
* @return Python syntax to remove duplicate columns
*/
public static String removeDuplicateColumns(String tableName, String newTable) {
String script = newTable + " = " + tableName + ".loc[:,~" + tableName + ".columns.duplicated()]";
return script;
}
/**
* Create a pandas vector from a java vector
*
* @param row The object[] to convert
* @param dataType The data type for each entry in the object[]
* @return String containing the equivalent r column vector
*/
public static String createPandasColVec(List row, SemossDataType dataType) {
StringBuffer str = new StringBuffer("([");
int i = 0;
int size = row.size();
for (; i < size; i++) {
if (SemossDataType.STRING == dataType) {
String escaper = row.get(i) + "";
escaper = escaper.replace("'", "\\'");
str.append("'").append(escaper).append("'");
} else if (SemossDataType.INT == dataType || SemossDataType.DOUBLE == dataType) {
str.append(row.get(i).toString());
} else if (SemossDataType.DATE == dataType) {
str.append("np.datetime64(\"" + row.get(i).toString() + "\")");
} else if (SemossDataType.TIMESTAMP == dataType) {
str.append("np.datetime64(\"" + row.get(i).toString() + "\")");
} else {
// just in case this is not defined yet...
// see the type of the value and add it in based on that
if (dataType == null) {
if (row.get(i) instanceof String) {
String escaper = row.get(i) + "";
escaper = escaper.replace("'", "\\'");
str.append("'").append(escaper).append("'");
} else {
str.append(row.get(i));
}
} else {
str.append(row.get(i));
}
}
// if not the last entry, append a "," to separate entries
if ((i + 1) != size) {
str.append(",");
}
}
str.append("])");
return str.toString();
}
public static String formatFilterValue(Object value, SemossDataType dataType) {
if (SemossDataType.STRING == dataType) {
return "'" + value + "'";
} else if (SemossDataType.INT == dataType || SemossDataType.DOUBLE == dataType) {
return value.toString();
} else if (SemossDataType.DATE == dataType) {
return "np.datetime64(\"" + value.toString() + "\", format='%Y-%m-%d')";
} else if (SemossDataType.TIMESTAMP == dataType) {
return "np.datetime64(\"" + value.toString() + "\", format='%Y-%m-%d %H:%M:%S')";
} else {
// just in case this is not defined yet...
// see the type of the value and add it in based on that
if (dataType == null) {
if (value instanceof String) {
return "'" + value + "'";
} else {
return value + "";
}
} else {
return value + "";
}
}
}
public static String cleanFrameHeaders(String frameName, String[] colNames) {
HeadersException headerChecker = HeadersException.getInstance();
String[] pyColNames = headerChecker.getCleanHeaders(colNames);
// update frame header names in R
StringBuffer colRen = new StringBuffer("{");
for (int i = 0; i < colNames.length; i++) {
colRen.append("'").append(colNames[i]).append("':'").append(pyColNames[i]).append("'");
if (i < colNames.length - 1) {
colRen.append(", ");
}
}
colRen.append("}");
String script = frameName + ".rename(columns=" + colRen + ", inplace=True)";
return script;
}
/**
* Set the column names for a frame
*
* @param tableName
* @param headers the new column names to use
* @return
*/
public static String setColumnNames(String tableName, String[] headers) {
StringBuffer sb = new StringBuffer();
for (int headerIndex = 0; headerIndex < headers.length; headerIndex++) {
if (sb.length() == 0) {
sb.append("[");
} else {
sb.append(",");
}
sb.append("'").append(headers[headerIndex]).append("'");
}
sb.append("]");
String headerS = tableName + ".columns=" + sb.toString();
return headerS;
}
/**
* Filters a data frame on specified columns.
* @param tableName
* @param newTableName
* @param columns
* @return
*/
public static String filterByColumn(String tableName, String newTableName, List columns) {
StringBuffer sb = new StringBuffer();
if (columns.isEmpty() || columns == null) {
return newTableName + "=" + tableName;
}
sb.append("[");
if (columns.size() == 1) {
sb.append("'" + columns.get(0) + "'");
} else {
for (int i = 0; i < columns.size() - 1; i++) {
sb.append("'" + columns.get(i) + "', ");
}
sb.append("'" + columns.get(columns.size() - 1) + "'");
}
sb.append("]");
String script = newTableName + " = " + tableName + "[" + sb.toString() + "]";
return script;
}
/**
* Filters a data frame by taking a slice based on index values.
* @param tableName
* @param newTableName
* @param rowEnd
* @param rowStart
* @return
*/
public static String filterRowBySlice(String tableName, String newTableName, Object indexEnd, Object indexStart) {
boolean nullPassed = false;
try {
if (indexStart == null) {
indexStart = "0";
nullPassed = true;
}
Integer.parseInt((String) indexStart);
Integer.parseInt((String) indexEnd);
String script = newTableName + " = " + tableName + ".iloc[" + indexStart + ":" + indexEnd + "]";
return script;
}
catch (NumberFormatException e) {
if (nullPassed) {
throw new IllegalArgumentException("Incorrect input of 'null' when filtering by label.");
}
String script = newTableName + " = " + tableName + ".loc['" + indexStart + "':'" + indexEnd + "']";
return script;
}
}
/**
* Filters data frame by rows, either numeric or label.
* @param tableName
* @param newTableName
* @param rowIndex
* @return
*/
public static String filterByRow(String tableName, String newTableName, List rowIndex) {
StringBuffer sb = new StringBuffer();
if (rowIndex.isEmpty() || rowIndex == null) {
String script = newTableName + " = " + tableName;
return script;
}
try {
sb.append("[");
if (rowIndex.size() == 1) {
Integer.parseInt(rowIndex.get(0));
sb.append(rowIndex.get(0) + "]");
String script = newTableName + " = " + tableName + ".iloc[" + sb.toString() + "]";
return script;
}
else {
for (int i = 0; i < rowIndex.size() - 1; i++) {
Integer.parseInt(rowIndex.get(i));
sb.append(rowIndex.get(i) + ", ");
}
Integer.parseInt(rowIndex.get(rowIndex.size() - 1));
sb.append(rowIndex.get(rowIndex.size() - 1) + "]");
String script = newTableName + " = " + tableName + ".iloc[" + sb.toString() + "]";
return script;
}
}
catch (NumberFormatException e) {
sb.delete(0, sb.length());
sb.append("['");
if (rowIndex.size() == 1) {
sb.append(rowIndex.get(0) + "']");
String script = newTableName + " = " + tableName + ".loc[" + sb.toString() + "]";
return script;
}
else {
for (int i = 0; i < rowIndex.size() - 1; i++) {
sb.append(rowIndex.get(i) + "', '");
}
sb.append(rowIndex.get(rowIndex.size() - 1) + "']");
String script = newTableName + " = " + tableName + ".loc[" + sb.toString() + "]";
return script;
}
}
}
/**
* Filter pandas data frame by specified value and operator. Handles both numeric and object value types.
* @param tableName
* @param newTableName
* @param columnName
* @param operator
* @param value
* @return
*/
public static String filterBySingleValue(String tableName, String newTableName, String columnName, String operator, Object value) {
try {
Double.parseDouble((String) value);
// check to ensure valid operator
if (!OPERATOR_LIST.contains(operator.trim())) {
throw new IllegalArgumentException("Operator " + operator.trim() + " is not a valid operator.");
}
String script = newTableName + " = " + tableName + "[" + tableName + "['" + columnName + "']"
+ operator.trim() + value + "]";
return script;
}
catch (NumberFormatException e) {
if (!(operator.trim().equals(EQUAL) || operator.trim().equals(NOT_EQUAL))) {
throw new IllegalArgumentException("Operator " + operator.trim() + " is not a valid operator.");
}
String script = newTableName + " = " + tableName + "[" + tableName + "['" + columnName + "']"
+ operator.trim() + "'" + value + "']";
return script;
}
}
/**
* Filters a data frame using boolean expressions on multiple values in a single column.
* @param tableName
* @param newTableName
* @param columnName
* @param values
* @param negation -- default to null
* @return
*/
public static String filterByMultipleValues(String tableName, String newTableName, String columnName, List values,
Object negation) {
if (negation == null) {
negation = "";
}
StringBuffer sb = new StringBuffer();
if (values.isEmpty() || values == null) {
String script = newTableName + " = " + tableName;
return script;
}
try {
sb.append("[");
if (values.size() == 1) {
Double.parseDouble(values.get(0));
sb.append(values.get(0) + "]");
String script = newTableName + " = " + tableName + "[" + negation + tableName + "['" + columnName
+ "'].isin(" + sb.toString() + ")]";
return script;
}
else {
for (int i = 0; i < values.size() - 1; i++) {
Double.parseDouble(values.get(i));
sb.append(values.get(i) + ", ");
}
Double.parseDouble(values.get(values.size() - 1));
sb.append(values.get(values.size() - 1) + "]");
String script = newTableName + " = " + tableName + "[" + negation + tableName + "['" + columnName
+ "'].isin(" + sb.toString() + ")]";
return script;
}
}
catch (NumberFormatException e) {
sb.delete(0, sb.length());
sb.append("['");
if (values.size() == 1) {
sb.append(values.get(0) + "']");
String script = newTableName + " = " + tableName + "[" + negation + tableName + "['" + columnName
+ "'].isin(" + sb.toString() + ")]";
return script;
}
else {
for (int i = 0; i < values.size() - 1; i++) {
sb.append(values.get(i) + "', '");
}
sb.append(values.get(values.size() - 1) + "']");
String script = newTableName + " = " + tableName + "[" + negation + tableName + "['" + columnName
+ "'].isin(" + sb.toString() + ")]";
return script;
}
}
}
/**
* Filters a data frame by whether a specified column contains an expression. Requirements are that columnType is object.
* @param tableName
* @param newTableName
* @param columnName
* @param expression
* @param negation -- default to null
* @return
*/
public static String filterByExpression(String tableName, String newTableName, String columnName, Object expression,
Object negation) {
if (negation == null) {
negation = "";
}
String script = newTableName + " = " + tableName + "[" + negation + tableName + "['" + columnName
+ "'].str.contains('" + expression + "')]";
return script;
}
/**
* Filters a data frame by selecting all rows between 2 values.
* @param tableName
* @param newTableName
* @param columnName
* @param floor
* @param ceiling
* @param negation -- default to null
* @return
*/
public static String filterBetweenNumbers(String tableName, String newTableName, String columnName, Object floor,
Object ceiling, Object negation) {
if (negation == null) {
negation = "";
}
String script = newTableName + " = " + tableName + "[" + negation + tableName + "['" + columnName
+ "'].between(" + floor.toString() + ", " + ceiling.toString() + ")]";
return script;
}
}