Please wait. This can take some minutes ...
Many resources are needed to download a project. Please understand that we have to compensate our server costs. Thank you in advance.
Project price only 1 $
You can buy this project and download/modify it how often you want.
prerna.ds.r.RSyntaxHelper Maven / Gradle / Ivy
package prerna.ds.r;
import java.text.DecimalFormat;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Vector;
import java.util.regex.Pattern;
import org.apache.commons.lang3.StringUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import prerna.algorithm.api.SemossDataType;
import prerna.om.HeadersException;
import prerna.sablecc2.om.GenRowStruct;
import prerna.sablecc2.om.Join;
import prerna.util.Utility;
public class RSyntaxHelper {
private static final Logger logger = LogManager.getLogger(RSyntaxHelper.class);
private static Map javaRDatTimeTranslationMap = new HashMap();
static {
javaRDatTimeTranslationMap.put("y2", "%y"); //yr without century (2 digits)
javaRDatTimeTranslationMap.put("Y2", "%y"); //yr without century (2 digits)
javaRDatTimeTranslationMap.put("y4", "%Y"); //full yr (4 digits)
javaRDatTimeTranslationMap.put("Y4", "%Y"); //full yr (4 digits)
// javaRDatTimeTranslationMap.put("G", value); //Era
javaRDatTimeTranslationMap.put("M1", "%m"); //Numerical month
javaRDatTimeTranslationMap.put("M2", "%m"); //Numerical month
javaRDatTimeTranslationMap.put("M3", "%b"); //Abbreviated month name
javaRDatTimeTranslationMap.put("M+", "%B"); //Full month name
javaRDatTimeTranslationMap.put("d", "%d"); //Day in month
javaRDatTimeTranslationMap.put("D", "%j"); //Day in year
// javaRDatTimeTranslationMap.put("w", value); //Week in year
// javaRDatTimeTranslationMap.put("W", value); //Week in month
// javaRDatTimeTranslationMap.put("F", value); //Day of week in month
javaRDatTimeTranslationMap.put("E3", "%a"); //Abbreviate day name in week
javaRDatTimeTranslationMap.put("E", "%A"); //Full day name in week
javaRDatTimeTranslationMap.put("u", "%u"); //Day number of week (1 = Monday, ..., 7 = Sunday)
javaRDatTimeTranslationMap.put("a", "%p"); //AM/PM --- need to be used with %I not %H
javaRDatTimeTranslationMap.put("H", "%H"); //Hour in day (0-23)
// javaRDatTimeTranslationMap.put("k", value); //Hour in day (1-24)
// javaRDatTimeTranslationMap.put("K", value); //Hour in am/pm (0-11)
javaRDatTimeTranslationMap.put("h", "%I"); //Hour in am/pm (1-12)
javaRDatTimeTranslationMap.put("m", "%M"); //Minute in hour
javaRDatTimeTranslationMap.put("s", "%S"); //Second in minute
// javaRDatTimeTranslationMap.put("s", "%OS"); //Handles milliseconds as well
javaRDatTimeTranslationMap.put("S", "%OS"); //Millisecond
// javaRDatTimeTranslationMap.put("z", "%Z"); //tz (Pacific Standard Time; PST; GMT-08:00) ???? - TODO needs to be handled via the tz param
javaRDatTimeTranslationMap.put("Z", "%z"); //tz (-0800)
javaRDatTimeTranslationMap.put("X", "%z"); //tz (-08; -0800; -08:00)
}
private RSyntaxHelper() {
}
/**
* Convert a java object[] into a r column vector
* @param row The object[] to convert
* @param dataType The data type for each entry in the object[]
* @return String containing the equivalent r column vector
*/
public static String createRColVec(Object[] row, SemossDataType[] dataType) {
StringBuilder str = new StringBuilder("c(");
int i = 0;
int size = row.length;
for(; i < size; i++) {
if(dataType[i] == SemossDataType.STRING || dataType[i] == SemossDataType.FACTOR) {
str.append("\"").append(row[i]).append("\"");
} else {
str.append(row[i]);
}
// if not the last entry, append a "," to separate entries
if( (i+1) != size) {
str.append(",");
}
}
str.append(")");
return str.toString();
}
/**
* Convert a r vector from a java vector
* @param row The object[] to convert
* @param dataType The data type for each entry in the object[]
* @param additionalParameter Useful on timestamp where we need to define the timezone, tz()
* @return String containing the equivalent r column vector
*/
public static String createRColVec(List row, SemossDataType dataType, String additionalParameter) {
DecimalFormat decimalFormat = null;
StringBuilder str = new StringBuilder("c(");
int i = 0;
int size = row.size();
for(; i < size; i++) {
if(SemossDataType.STRING == dataType || SemossDataType.FACTOR == dataType) {
str.append("\"").append(row.get(i)).append("\"");
} else if(SemossDataType.INT == dataType || SemossDataType.DOUBLE == dataType) {
if(decimalFormat == null) {
decimalFormat = new DecimalFormat("0.0000000000");
}
try {
decimalFormat = new DecimalFormat("0.0000000000");
str.append(decimalFormat.format(row.get(i)));
} catch(Exception e1) {
logger.warn("Invalid object passed for numeric field", e1);
str.append(row.get(i) + "");
}
} else if(SemossDataType.DATE == dataType) {
str.append("as.Date(\"").append(row.get(i).toString()).append("\", format='%Y-%m-%d')");
} else if(SemossDataType.TIMESTAMP == dataType) {
if(additionalParameter != null && !additionalParameter.isEmpty()) {
str.append("as.POSIXct(\"").append(row.get(i).toString()).append("\", format='%Y-%m-%d %H:%M:%OS', ")
.append(additionalParameter).append(")");
} else {
str.append("as.POSIXct(\"").append(row.get(i).toString()).append("\", format='%Y-%m-%d %H:%M:%OS')");
}
} else {
// just in case this is not defined yet...
// see the type of the value and add it in based on that
if(dataType == null) {
if(row.get(i) instanceof String) {
str.append("\"").append(row.get(i)).append("\"");
} else {
str.append(row.get(i));
}
} else {
str.append(row.get(i));
}
}
// if not the last entry, append a "," to separate entries
if( (i+1) != size) {
str.append(",");
}
}
str.append(")");
return str.toString();
}
/**
* Convert a list to a r column vector of strings
* @param values
* @return
*/
public static String createStringRColVec(GenRowStruct grs) {
StringBuilder str = new StringBuilder("c(");
int i = 0;
int size = grs.size();
for(; i < size; i++) {
str.append("\"").append(grs.get(i)).append("\"");
// if not the last entry, append a "," to separate entries
if( (i+1) != size) {
str.append(",");
}
}
str.append(")");
return str.toString();
}
/**
* Convert a list to a r column vector of strings
* @param values
* @return
*/
public static String createStringRColVec(List values) {
StringBuilder str = new StringBuilder("c(");
int i = 0;
int size = values.size();
for(; i < size; i++) {
str.append("\"").append(values.get(i)).append("\"");
// if not the last entry, append a "," to separate entries
if( (i+1) != size) {
str.append(",");
}
}
str.append(")");
return str.toString();
}
/**
* Convert a list to a r column vector of strings
* @param values
* @return
*/
public static String createStringRColVec(Collection values) {
StringBuilder str = new StringBuilder("c(");
// add the first element
Iterator iterator = values.iterator();
if(iterator.hasNext()) {
str.append("\"").append(iterator.next()).append("\"");
}
// add a comma and then the first element
while(iterator.hasNext()) {
str.append(", \"").append(iterator.next()).append("\"");
}
str.append(")");
return str.toString();
}
/**
* Convert a java object[] into a r column vector
* @param row The object[] to convert
* @param dataType The data type for each entry in the object[]
* @return String containing the equivalent r column vector
*/
public static String createStringRColVec(Object[] row) {
StringBuilder str = new StringBuilder("c(");
int i = 0;
int size = row.length;
for(; i < size; i++) {
str.append("\"").append(row[i]).append("\"");
// if not the last entry, append a "," to separate entries
if( (i+1) != size) {
str.append(",");
}
}
str.append(")");
return str.toString();
}
/**
* Convert a java object[] into a r column vector
* @param row The object[] to convert
* @param dataType The data type for each entry in the object[]
* @return String containing the equivalent r column vector
*/
public static String createStringRColVec(Integer[] row) {
StringBuilder str = new StringBuilder("c(");
int i = 0;
int size = row.length;
for(; i < size; i++) {
str.append(row[i]);
// if not the last entry, append a "," to separate entries
if( (i+1) != size) {
str.append(",");
}
}
str.append(")");
return str.toString();
}
/**
* Convert a java object[] into a r column vector
* @param row The object[] to convert
* @param dataType The data type for each entry in the object[]
* @return String containing the equivalent r column vector
*/
public static String createStringRColVec(Double[] row) {
StringBuilder str = new StringBuilder("c(");
int i = 0;
int size = row.length;
for(; i < size; i++) {
str.append(row[i]);
// if not the last entry, append a "," to separate entries
if( (i+1) != size) {
str.append(",");
}
}
str.append(")");
return str.toString();
}
/**
* Convert a java object[] into a r ordered factor col
* @param row The object[] to convert
* @param orderedLevels The ordering of the factor
* @return String containing the equivalent r column vector
*/
public static String createOrderedRFactor(Object[] row, String orderedLevels) {
StringBuilder str = new StringBuilder();
String vectorParameter = createStringRColVec(row);
Object[] orderedLevelsSplit = orderedLevels.split("+++");
if (orderedLevelsSplit.length < 2) {
throw new RuntimeException("Ordered levels of a factor must contain 2 or more items.");
} else {
String orderLevelsVector = createStringRColVec(orderedLevelsSplit);
str.append("factor(" + vectorParameter + ", ordered = TRUE, levels = " + orderLevelsVector + ");");
}
return str.toString();
}
/**
* Convert a java object[] into a r ordered factor col
* @param row The object[] to convert
* @param orderedLevels The ordering of the factor
* @param orderedLevelLabels Corresponding labels of the ordered levels of the factor
* @return String containing the equivalent r column vector
*/
public static String createOrderedRFactor(Object[] row, String orderedLevels, String orderedLevelLabels) {
StringBuilder str = new StringBuilder();
String vectorParameter = createStringRColVec(row);
Object[] orderedLevelsSplit = orderedLevels.split("+++");
Object[] orderedLevelLabelsSplit = orderedLevelLabels.split("+++");
if (orderedLevelsSplit.length != orderedLevelLabelsSplit.length) {
throw new RuntimeException("Counts of ordered levels and the corresponding labels must be equal.");
} else if (orderedLevelsSplit.length < 2) {
throw new RuntimeException("Ordered levels/labels of a factor must contain 2 or more items.");
} else {
String orderLevelsVector = createStringRColVec(orderedLevelsSplit);
str.append("factor(" + vectorParameter + ", ordered = TRUE, levels = " + orderLevelsVector + "), labels = " + orderedLevelLabelsSplit +");");
}
return str.toString();
}
public static String getOrderedLevelsFromRFactorCol(String tableName, String colName) {
StringBuilder str = new StringBuilder();
str.append("paste(levels("+ tableName + "$" + colName + "), collapse = '+++');");
return str.toString();
}
////////////////////////////////////////////////
// Data frame changes
/////////////////////////////////////////////////
public static String cleanFrameHeaders(String frameName, String[] colNames) {
HeadersException headerChecker = HeadersException.getInstance();
colNames = headerChecker.getCleanHeaders(colNames);
// update frame header names in R
String rColNames = "";
for (int i = 0; i < colNames.length; i++) {
rColNames += "\"" + colNames[i] + "\"";
if (i < colNames.length - 1) {
rColNames += ", ";
}
}
String script = "colnames(" + frameName + ") <- c(" + rColNames + ");";
return script;
}
public static String alterColumnName(String tableName, String oldHeader, String newHeader) {
StringBuilder str = new StringBuilder();
str.append("colnames("+tableName+")[which(names("+tableName+") == \""+oldHeader+"\")] <- \""+newHeader+"\";");
return str.toString();
}
public static String alterColumnNames(String tableName, String[] oldHeaders, String[] newHeaders) {
StringBuilder str = new StringBuilder();
str.append("setnames("+tableName+",old=" + createStringRColVec(oldHeaders) + ",new=" + createStringRColVec(newHeaders) + ");");
return str.toString();
}
/**
* Convert the column into the correct type
* @param tableName
* @param colName
* @param type
* @return
*/
public static String alterColumnType(String tableName, String colName, SemossDataType type) {
if(type == SemossDataType.INT) {
return alterColumnTypeToInteger(tableName, colName);
} else if (type == SemossDataType.DOUBLE) {
return alterColumnTypeToNumeric(tableName, colName);
} else if(type == SemossDataType.STRING) {
return alterColumnTypeToCharacter(tableName, colName);
} else if(type == SemossDataType.FACTOR) {
return alterColumnTypeToFactor(tableName, colName);
} else if(type == SemossDataType.DATE) {
return alterColumnTypeToDate(tableName, null, colName);
} else if(type == SemossDataType.TIMESTAMP) {
return alterColumnTypeToDateTime(tableName, null, colName);
}
throw new IllegalArgumentException("Unable to convert column to specified type");
}
public static String alterColumnTypeToCharacter(String tableName, String colName) {
// will generate a string similar to
// "datatable$Revenue_International <- as.numeric(as.character(datatable$Revenue_International))"
StringBuilder builder = new StringBuilder();
builder.append(tableName).append("$").append(colName).append(" <- ").append("as.character(")
.append(tableName).append("$").append(colName).append(")");
return builder.toString();
}
public static String alterColumnTypeToCharacter(String tableName, List colName) {
StringBuilder builder = new StringBuilder();
builder.append(tableName + "[,(c('" + StringUtils.join(colName,"','") + "')) := lapply(.SD, as.character), .SDcols = c('" + StringUtils.join(colName,"','") + "')]");
return builder.toString();
}
public static String alterColumnTypeToFactor(String tableName, String colName) {
// will generate a string similar to
// "datatable$Revenue_International <- as.factor(datatable$Revenue_International)"
StringBuilder builder = new StringBuilder();
builder.append(tableName).append("$").append(colName).append(" <- ").append("as.factor(")
.append(tableName).append("$").append(colName).append(")");
return builder.toString();
}
public static String alterColumnTypeToFactor(String tableName, List colName) {
StringBuilder builder = new StringBuilder();
builder.append(tableName + "[,(c('" + StringUtils.join(colName,"','") + "')) := lapply(.SD, as.factor), .SDcols = c('" + StringUtils.join(colName,"','") + "')]");
return builder.toString();
}
/**
* Converts a R column type to numeric
* @param tableName The name of the R table
* @param colName The name of the column
* @return The r script to execute
*/
public static String alterColumnTypeToNumeric(String tableName, String colName) {
// will generate a string similar to
// "datatable$Revenue_International <- as.numeric(as.character(datatable$Revenue_International))"
StringBuilder builder = new StringBuilder();
builder.append(tableName).append("$").append(colName).append(" <- ").append("as.numeric(as.character(")
.append(tableName).append("$").append(colName).append("))");
return builder.toString();
}
public static String alterColumnTypeToNumeric(String tableName, List colName) {
StringBuilder builder = new StringBuilder();
builder.append(tableName + "[,(c('" + StringUtils.join(colName,"','") + "')) := lapply(.SD, as.numeric), .SDcols = c('" + StringUtils.join(colName,"','") + "')]");
return builder.toString();
}
/**
* Converts a R column type to boolean
* @param tableName The name of the R table
* @param colName The name of the column
* @return The r script to execute
*/
public static String alterColumnTypeToBoolean(String tableName, String colName) {
// will generate a string similar to
// "datatable$Revenue_International <- as.boolean(datatable$Revenue_International)"
StringBuilder builder = new StringBuilder();
builder.append(tableName).append("$").append(colName).append(" <- ").append("as.logical(")
.append(tableName).append("$").append(colName).append(")");
return builder.toString();
}
public static String alterColumnTypeToBoolean(String tableName, List colName) {
StringBuilder builder = new StringBuilder();
builder.append(tableName + "[,(c('" + StringUtils.join(colName,"','") + "')) := lapply(.SD, as.logical), .SDcols = c('" + StringUtils.join(colName,"','") + "')]");
return builder.toString();
}
/**
* Converts a R column type to numeric
* @param tableName The name of the R table
* @param colName The name of the column
* @return The r script to execute
*/
public static String alterColumnTypeToInteger(String tableName, String colName) {
// will generate a string similar to
// "datatable$Revenue_International <- as.integer(as.character(datatable$Revenue_International))"
StringBuilder builder = new StringBuilder();
builder.append(tableName).append("$").append(colName).append(" <- ").append("as.integer(as.character(")
.append(tableName).append("$").append(colName).append("))");
return builder.toString();
}
public static String alterColumnTypeToInteger(String tableName, List colName) {
StringBuilder builder = new StringBuilder();
builder.append(tableName + "[,(c('" + StringUtils.join(colName,"','") + "')) := lapply(.SD, as.integer), .SDcols = c('" + StringUtils.join(colName,"','") + "')]");
return builder.toString();
}
/**
* Converts a R column type to date
* @param tableName The name of the R table
* @param colName The name of the column
* @return The r script to execute
*/
public static String alterColumnTypeToDate(String tableName, String format, String colName) {
if(format == null) {
format = "%Y-%m-%d";
}
String[] parsedFormat = format.split("\\|");
// will generate a string similar to
// "datatable$Birthday <- as.Date(as.character(datatable$Birthday), format = '%m/%d/%Y')"
StringBuilder builder = new StringBuilder();
builder.append(tableName + "$" + colName + "<- as.Date("
+ tableName + "$" + colName + ", format = '" + parsedFormat[0] +"')");
return builder.toString();
}
public static String alterColumnTypeToDate(String tableName, String format, List cols) {
if(format == null) {
format = "%Y-%m-%d";
}
String[] parsedFormat = format.split("\\|");
StringBuilder builder = new StringBuilder();
//parse out the milliseconds options
builder.append(tableName + "[,(c('" + StringUtils.join(cols,"','") + "')) := "
+ "lapply(.SD, function(x) as.Date(x, format='" + parsedFormat[0] + "')), "
+ ".SDcols = c('" + StringUtils.join(cols,"','") + "')]");
return builder.toString();
}
public static List alterColumnTypeToDate_Excel(String tableName, List cols) {
List dateExcelR= new ArrayList();
StringBuilder builder = new StringBuilder();
String convertedDateCols_R = "convertedDateCols" + Utility.getRandomString(6);
String convertedDateColsList_R = "convertedDateColsList" + Utility.getRandomString(6);
//append the function that will handle dates that have been translated into a numerical value during read.xlsx2 upload
builder.append("clean_convertToDate_function <- function(x){ cleanCol <- gsub('^\\\\s*$', 'NA', " + tableName + "[[x]]) "
+ "%>% .[!is.na(.) & . != 'NA' & . != 'null' & . != 'NULL'];"
+ "if (all(!is.na(as.numeric(cleanCol)))) as.Date('1900-01-01') + as.numeric(" + tableName + "[[x]]) - 2 else 'NOTHING' };");
//call the function above and sort columns by whether they were handled by the function
builder.append(convertedDateCols_R + " <- lapply(c('" + StringUtils.join(cols, "','") + "'), clean_convertToDate_function);");
builder.append(convertedDateColsList_R + " <- c('" + StringUtils.join(cols,"','") + "')[-grep('NOTHING'," + convertedDateCols_R + ")];");
//update columns via the outputs of the function
builder.append("if (length(" + convertedDateColsList_R + ") > 0) "
+ tableName + "[, (" + convertedDateColsList_R + ") := lapply("
+ "setdiff(seq(1,length(" + convertedDateCols_R + ")), grep('NOTHING'," + convertedDateCols_R + ")), "
+ "function(x) unlist(" + convertedDateCols_R + "[[x]] ))];");
//clean up variables/functions
builder.append("rm(clean_convertToDate_function," + convertedDateCols_R + "); gc()");
//prep return list
dateExcelR.add(builder.toString());
dateExcelR.add(convertedDateColsList_R);
return dateExcelR;
}
public static String alterColumnTypeToDateTime(String tableName, String format,String colName) {
if(format == null) {
format = "%Y-%m-%d %H:%M:%OS|NULL";
}
String[] parsedFormat = format.split("\\|");
StringBuilder builder = new StringBuilder();
builder.append("options(digits.secs=" + parsedFormat[1] + ");");
builder.append(tableName + "$" + colName + "<- as.POSIXct(fast_strptime("
+ tableName + "$" + colName + ", format = '" + parsedFormat[0] +"'))");
return builder.toString();
}
public static String alterColumnTypeToDateTime(String tableName, String format, List cols) {
if(format == null) {
format = "%Y-%m-%d %H:%M:%OS|NULL";
}
//parse out the milliseconds options
String[] parsedFormat = format.split("\\|");
StringBuilder builder = new StringBuilder();
builder.append("options(digits.secs=" + parsedFormat[1] + ");");
builder.append(tableName + "[,(c('" + StringUtils.join(cols,"','") + "')) := "
+ "lapply(.SD, function(x) as.POSIXct(fast_strptime(x, format='" + parsedFormat[0] + "'))), "
+ ".SDcols = c('" + StringUtils.join(cols,"','") + "')]");
return builder.toString();
}
public static List alterColumnTypeToDateTime_Excel(String tableName, List cols) {
List dateTimeExcelR= new ArrayList();
StringBuilder builder = new StringBuilder();
String convertedDTCols_R = "convertedDTCols" + Utility.getRandomString(6);
String convertedDTColsList_R = "convertedDTColsList" + Utility.getRandomString(6);
//TODO need to make this based on an input parameter instead of being hardcoded
builder.append("options(digits.secs = 3);");
//append the function that will handle datetimes that have been translated into a numerical value during read.xlsx2 upload
builder.append("clean_convertToPOSIXct_function <- function(x){ cleanCol <- gsub('^\\\\s*$', 'NA', " + tableName + "[[x]]) "
+ "%>% .[!is.na(.) & . != 'NA' & . != 'null' & . != 'NULL'];"
+ "if (all(!is.na(as.numeric(cleanCol)))) as.POSIXct(as.numeric(" + tableName + "[[x]])*86400, origin = '1899-12-30', tz='UTC') else 'NOTHING' };");
//call the function above and sort columns by whether they were handled by the function
builder.append(convertedDTCols_R + " <- lapply(c('" + StringUtils.join(cols, "','") + "'), clean_convertToPOSIXct_function);");
builder.append(convertedDTColsList_R + " <- c('" + StringUtils.join(cols,"','") + "')[-grep('NOTHING'," + convertedDTCols_R + ")];");
//update columns via the outputs of the function
builder.append("if (length(" + convertedDTColsList_R + ") > 0) "
+ tableName + "[, (" + convertedDTColsList_R + ") := lapply("
+ "setdiff(seq(1,length(" + convertedDTCols_R + ")), grep('NOTHING'," + convertedDTCols_R + ")), "
+ "function(x) unlist(" + convertedDTCols_R + "[[x]] ))];");
//clean up variables/functions
builder.append("rm(clean_convertToDate_function," + convertedDTCols_R + "); gc()");
//prep return list
dateTimeExcelR.add(builder.toString());
dateTimeExcelR.add(convertedDTColsList_R);
return dateTimeExcelR;
}
public static String alterEmptyTableColumnTypeToDateTime(String tableName, List cols) {
//parse out the milliseconds options
StringBuilder builder = new StringBuilder();
builder.append(tableName + "[,(c('" + StringUtils.join(cols,"','") + "')) := "
+ "lapply(.SD, function(x) as.POSIXct(x)), "
+ ".SDcols = c('" + StringUtils.join(cols,"','") + "')]");
return builder.toString();
}
/**
* Get specific columns from a frame
* @param resultingFrame
* @param dataframe
* @param cols
* @return
*/
public static String getFrameSubset(String resultingFrame, String dataframe, Object[] cols) {
StringBuilder rsb = new StringBuilder();
String rColsVec = RSyntaxHelper.createStringRColVec(cols);
rsb.append(resultingFrame + "<- subset(" + dataframe + ", select=" + rColsVec + ");");
return rsb.toString();
}
/**
* Return new frame object
* @param oldFrame
* @param newFrame
* @return
*/
public static String asDataTable(String newFrame, String oldFrame) {
StringBuilder rsb = new StringBuilder();
rsb.append(newFrame + "<- as.data.table(" + oldFrame + ");");
return rsb.toString();
}
/**
* Return new frame object
* @param oldFrame
* @param newFrame
* @return
*/
public static String asDataFrame(String newFrame, String oldFrame) {
StringBuilder rsb = new StringBuilder();
rsb.append(newFrame + "<- as.data.frame(" + oldFrame + ");");
return rsb.toString();
}
/**
* Creates a numeric column by removing numbers from a column
* @param dataframe
* @param column
* @return
*/
public static String extractNumbers(String dataframe, String column) {
StringBuilder rsb = new StringBuilder();
rsb.append(dataframe + "$" + column + " <- as.numeric(gsub('[^-\\\\.0-9]', '', " + dataframe + "$" + column + "));");
return rsb.toString();
}
/**
*
* @param leftTableName
* @param rightTableName
* @param joinType
* @param joinCols
* @return
*/
public static String getMergeSyntax(String returnTable, String leftTableName, String rightTableName, String joinType, List> joinCols) {
/*
* joinCols = [ {leftTable.Title -> rightTable.Movie} , {leftTable.Genre -> rightTable.Genre} ]
*/
StringBuilder builder = new StringBuilder();
builder.append(returnTable).append(" <- merge(").append(leftTableName).append(", ")
.append(rightTableName).append(", by.x = c(");
// want to grab the keys since it is left table -> right table
getMergeColsSyntax(builder, joinCols, true);
builder.append("), by.y = c(");
// here is the reverse, grab the values of the map
getMergeColsSyntax(builder, joinCols, false);
if (joinType.equals("inner.join")) {
builder.append("), all = FALSE, allow.cartesian = TRUE)");
} else if (joinType.equals("left.outer.join")) {
builder.append("), all.x = TRUE, all.y = FALSE, allow.cartesian = TRUE)");
} else if (joinType.equals("right.outer.join")) {
builder.append("), all.x = FALSE, all.y = TRUE, allow.cartesian = TRUE)");
} else if (joinType.equals("outer.join")) {
builder.append("), all = TRUE, allow.cartesian = TRUE)");
}
return builder.toString();
}
/**
* Get the correct r syntax for the table join columns
* This uses the join information keys since it is {leftCol -> rightCol}
* @param builder
* @param colNames
*/
public static void getMergeColsSyntax(StringBuilder builder, List> colNames, boolean grabKeys){
// iterate through the map
boolean firstLoop = true;
int numJoins = colNames.size();
for(int i = 0; i < numJoins; i++) {
Map joinMap = colNames.get(i);
// just in case an empty map is passed
// we do not want to modify the firstLoop boolean
if(joinMap.isEmpty()) {
continue;
}
if(!firstLoop) {
builder.append(",");
}
// this should really be 1
// since each join between 2 columns is its own map
Collection tableCols = null;
if(grabKeys) {
tableCols = joinMap.keySet();
} else {
tableCols = joinMap.values();
}
// keep track of where to add a ","
int counter = 0;
int numCols = tableCols.size();
for(String colName : tableCols){
builder.append("\"").append(colName).append("\"");
if(counter+1 != numCols) {
builder.append(",");
}
counter++;
}
firstLoop = false;
}
}
/**
* Generate an alter statement to add new columns, taking into consideration joins
* and new column alias's
* @param tableName
* @param existingColumns
* @param newColumns
* @param joins
* @param newColumnAlias
* @return
*/
public static String alterMissingColumns(String tableName, Map newColumnsToTypeMap, List joins, Map newColumnAlias) {
List newColumnsToAdd = new Vector();
List newColumnsToAddTypes = new Vector();
// get all the join columns
List joinColumns = new Vector();
for(Join j : joins) {
String columnName = j.getRColumn();
if(columnName.contains("__")) {
columnName = columnName.split("__")[1];
}
joinColumns.add(columnName);
}
for(String newColumn : newColumnsToTypeMap.keySet()) {
SemossDataType newColumnType = newColumnsToTypeMap.get(newColumn);
// modify the header
if(newColumn.contains("__")) {
newColumn = newColumn.split("__")[1];
}
// if its a join column, ignore it
if(joinColumns.contains(newColumn)) {
continue;
}
// not a join column
// check if it has an alias
// and then add
if(newColumnAlias.containsKey(newColumn)) {
newColumnsToAdd.add(newColumnAlias.get(newColumn));
} else {
newColumnsToAdd.add(newColumn);
}
// and store the type at the same index
// in its list
newColumnsToAddTypes.add(newColumnType);
}
StringBuilder rExec = new StringBuilder();
for(int i = 0; i < newColumnsToAdd.size(); i++) {
String newCol = newColumnsToAdd.get(i);
SemossDataType newColType = newColumnsToAddTypes.get(i);
String newColSyntax = tableName + "$" + newCol;
if(newColType == SemossDataType.DOUBLE) {
rExec.append(newColSyntax).append(" <- as.numeric(").append(newColSyntax).append(");");
} else if(newColType == SemossDataType.INT) {
rExec.append(newColSyntax).append(" <- as.integer(").append(newColSyntax).append(");");
} else {
rExec.append(newColSyntax).append(" <- as.character(").append(newColSyntax).append(");");
}
}
return rExec.toString();
}
//////////////////////////////////////////
// File I/O
/////////////////////////////////////////
/**
* Generate the syntax to perform a fRead to ingest a file
* @param tableName
* @param absolutePath
* @return
*/
public static String getFReadSyntax(String tableName, String absolutePath) {
return getFReadSyntax(tableName, absolutePath, ",");
}
/**
* Generate the syntax to perform a fRead to ingest a file
* @param tableName
* @param absolutePath
* @param delimiter
* @return
*/
public static String getFReadSyntax(String tableName, String absolutePath, String delimiter) {
StringBuilder builder = new StringBuilder();
builder.append(tableName).append(" <- fread(\"").append(absolutePath.replace("\\", "/"))
.append("\", sep=\"").append(delimiter)
.append("\", encoding=\"UTF-8\", blank.lines.skip=TRUE, fill=TRUE, na.strings=NULL, keepLeadingZeros=TRUE, integer64='numeric');");
return builder.toString();
}
/**
* Write dataframe to csv
* @param dataframe
* @param absolutePath
* @return
*/
public static String getFWriteSyntax(String dataframe, String absolutePath) {
StringBuilder rsb = new StringBuilder();
rsb.append("fwrite(" + dataframe + ",file=\"" + absolutePath.replace("\\", "/") + "\");");
return rsb.toString();
}
public static String getExcelReadSheetSyntax(String tableName, String absolutePath, int sheetIndex, List colIndices, boolean uploadSubset) {
StringBuilder builder = new StringBuilder();
builder.append(tableName).append(" <- as.data.table(read.xlsx2(\"").append(absolutePath.replace("\\", "/"))
.append("\", sheetIndex=").append(sheetIndex).append(", colClasses = c(rep('character',").append(colIndices.size())
.append(")), stringsAsFactors=FALSE");
if (uploadSubset) {
builder.append(", colIndex = c(").append(StringUtils.join(colIndices,",")).append(")");
}
builder.append("))");
return builder.toString();
}
/**
* Load excel sheet to dataframe
* @param filePath The file path of the excel file
* @param frameName The name of the frame to load excel sheet
* @param sheetName The name of the sheet in the excel workbook
* @param sheetRange The desired range to load
* @return
*/
public static String loadExcelSheet(String filePath, String frameName, String sheetName, String sheetRange) {
StringBuilder rsb = new StringBuilder();
rsb.append("library(readxl);library(cellranger);");
filePath = filePath.replace("\\", "/");
rsb.append(frameName + " <- read_excel(path = \"" + filePath + "\", col_names = TRUE, sheet = \"" + sheetName + "\", range='"
+ sheetRange + "');");
rsb.append(frameName + " <- as.data.table(" + frameName + ");");
return rsb.toString();
}
/**
* Load parquet file to dataframe
* @param filePath The file path of the parquet file
* @param frameName The name of the frame to create
* @return
*/
public static String loadParquetFile(String filePath, String frameName) {
StringBuilder rsb = new StringBuilder();
rsb.append("library(arrow);");
filePath = filePath.replace("\\", "/");
rsb.append(frameName + " <- read_parquet(file = \"" + filePath + "\", col_select = NULL, as_data_frame = TRUE);");
rsb.append(frameName + " <- as.data.table(" + frameName + ");");
return rsb.toString();
}
/**
*
* @param value
* @param dataType
* @param additionalParameter
* @return
*/
public static String formatFilterValue(Object value, SemossDataType dataType, String additionalParameter) {
DecimalFormat decimalFormat = null;
if(SemossDataType.STRING == dataType || SemossDataType.FACTOR == dataType) {
return "\"" + value + "\"";
} else if(SemossDataType.INT == dataType || SemossDataType.DOUBLE == dataType) {
try {
decimalFormat = new DecimalFormat("0.0000000000");
return decimalFormat.format(value);
} catch(Exception e1) {
logger.warn("Invalid object passed for numeric field", e1);
return value + "";
}
} else if(SemossDataType.DATE == dataType) {
return "as.Date(\"" + value.toString() + "\", format='%Y-%m-%d')";
} else if(SemossDataType.TIMESTAMP == dataType) {
if(additionalParameter != null && !additionalParameter.isEmpty()) {
return "as.POSIXct(\"" + value.toString() + "\", format='%Y-%m-%d %H:%M:%OS', " + additionalParameter + ")";
} else {
return "as.POSIXct(\"" + value.toString() + "\", format='%Y-%m-%d %H:%M:%OS')";
}
} else {
// just in case this is not defined yet...
// see the type of the value and add it in based on that
if(dataType == null) {
if(value instanceof String) {
return "\"" + value + "\"";
} else {
return value + "";
}
} else {
return value + "";
}
}
}
public static String escapeRegexR(String expr){
if (Pattern.matches("^(grepl).*?", expr)){
String [] parsedRegex = (String[]) expr.split("\\\\\"");
String regex = "";
for (int k=1; k < parsedRegex.length - 1; k++){
String escapedParsedRegex = parsedRegex[k].replace("\\", "\\\\");
regex = regex + escapedParsedRegex;
}
expr = parsedRegex[0] + "\\\"" + regex + "\\\"" + parsedRegex[parsedRegex.length - 1];
}
return expr;
}
/**
* Get the syntax to read rds file and load to a variable
* @param var rds variable to set
* @param file rds file to read
* @return
*/
public static String readRDS(String var, String filePath) {
return new StringBuilder(var + " <-readRDS(\"" + filePath.replace("\\", "/") + "\");").toString();
}
/**
* Get the syntax to qread
* @param var
* @param filePath
* @return
*/
public static String qread(String var, String filePath) {
return new StringBuilder(var + " <-qread(\"" + filePath.replace("\\", "/") + "\");").toString();
}
/////////////////////////////////////////////////////////
// Date formatting
/////////////////////////////////////////////////////////
/**
* Converts Java datetime format to R datetime format
* @param javaFormat
* @return rFormat
*/
public static String translateJavaRDateTimeFormat(String javaFormat){
String rFormat = "";
String substr = "";
String optionsMiliSeconds = "";
String datetimeRegex = "[yYMDdEuaHhmsSZX]";
int lastIndxSubstr = 0;
for (int i = 0; i < javaFormat.length(); i++){
String ch = (i == javaFormat.length()-1) ? javaFormat.substring(i) : javaFormat.substring(i, i + 1);
if (!ch.matches(datetimeRegex) && !ch.equals("'")){
//handle delimiters/whitespaces
rFormat += ch;
} else {
lastIndxSubstr = javaFormat.lastIndexOf(ch);
substr = javaFormat.substring(i, lastIndxSubstr + 1);
if (ch.equals("'")) {
//if character is a single quote (SQ), then need to properly parse to the nearest closing single quote
if (substr.equals("''")) {
rFormat += substr.replaceAll("''","'");
i = lastIndxSubstr;
continue;
}
String substr_trimmed = substr.substring(1, substr.length()-1).replaceAll("''","");
int nextSQIndx = substr_trimmed.indexOf("'");
int multiSQIndx = substr.substring(1, substr.length()-1).indexOf("''");
if (nextSQIndx > 0) {
if (multiSQIndx > 0 && multiSQIndx < nextSQIndx) {
rFormat += substr.substring(1, nextSQIndx + 3).replaceAll("''","'");
i = substr.substring(1, nextSQIndx + 3).length() + i + 1;
continue;
} else {
rFormat += substr.substring(1, nextSQIndx + 1);
i = substr.substring(1, nextSQIndx + 1).length() + i + 1;
continue;
}
} else {
rFormat += substr.substring(1, substr.length()-1).replaceAll("''","'");
}
} else {
//check if there are any other characters in the substr other than the character (ch)
for (int j = 1; j < substr.length(); j++) {
char ch_substr = substr.toCharArray()[j];
if (!String.valueOf(ch_substr).equals(ch)) {
substr = substr.substring(0, j);
lastIndxSubstr = i + j - 1;
break;
}
}
if (ch.equalsIgnoreCase("y") || ch.equals("M") || ch.equals("E")) {
// for these ch, it needs to be concatenated with the length of the substr to identify the appropriate R syntax
int lengthSubstr = substr.length();
if (ch.equalsIgnoreCase("e") && lengthSubstr != 3) {
rFormat += javaRDatTimeTranslationMap.get("E");
} else if (ch.equals("M") && lengthSubstr > 3) {
rFormat += javaRDatTimeTranslationMap.get("M+");
} else if (javaRDatTimeTranslationMap.get(ch + lengthSubstr) != null) {
rFormat += javaRDatTimeTranslationMap.get(ch + lengthSubstr);
} else {
throw new RuntimeException("Associated R date/time format is undefined.");
}
} else if (ch.equals("S")) {
// need to retain how many digits the millisecond request is
optionsMiliSeconds = Integer.toString(substr.length());
// for second, need to check for millisecond to properly translate to R syntax
if (rFormat.indexOf("%S") > 1) {
rFormat = rFormat.replaceAll("%S", "%OS");
rFormat = rFormat.substring(0, rFormat.indexOf("%OS") + 3);
} else {
throw new RuntimeException(
"R timestamps cannot support milliseconds without the presence of seconds.");
}
} else if (javaRDatTimeTranslationMap.get(ch) != null) {
// for these ch, it can be used to directly search for the appropriate R syntax
rFormat += javaRDatTimeTranslationMap.get(ch);
} else {
throw new RuntimeException("Associated R date/time format is undefined.");
}
}
i = lastIndxSubstr;
}
}
//persist the millisecond decimal place option alongside the R date format
if (!optionsMiliSeconds.isEmpty()) {
rFormat += "|" + optionsMiliSeconds ;
} else {
rFormat += "|NULL" ;
}
return rFormat;
}
public static String getValueJavaRDatTimeTranslationMap(String key){
String value = javaRDatTimeTranslationMap.get(key);
return value;
}
/**
* Determine the limit/offset given the size of the frame
* @param tableName
* @param numRows
* @param limit
* @param offset
* @return
*/
public static String determineLimitOffsetSyntax(String tableName, long numRows, long limit, long offset) {
StringBuilder query = new StringBuilder();
query.append(tableName).append(" <- ").append(tableName);
if(limit > 0) {
if(offset > 0) {
// we have limit + offset
long lastRIndex = offset + limit;
// r is 1 based so we will increase the offset by 1
// since FE sends back limit/offset 0 based
offset++;
if(numRows < lastRIndex) {
if(numRows > offset) {
query.append("[").append(offset).append(":").append(numRows).append("]");
} else {
throw new IllegalArgumentException("Limit + Offset result in no data");
}
} else {
query.append("[").append(offset).append(":").append((lastRIndex)).append("]");
}
} else {
// we just have a limit
if(numRows < limit) {
query.append("[1:").append(numRows).append("]");
} else {
query.append("[1:").append(limit).append("]");
}
}
} else if(offset > 0) {
// r is 1 based so we will increase the offset by 1
// since FE sends back limit/offset 0 based
offset++;
query.append("[").append(offset).append(":").append(numRows).append("]");
}
return query.toString();
}
/////////////////////////////////////////////////////////
// R Environment
/////////////////////////////////////////////////////////
/**
* Generate R syntax to set the working directory
* @param directory
* @return
*/
public static String setWorkingDirectory(String directory) {
return new StringBuilder("setwd(\"" + directory.replace("\\", "/") + "\");").toString();
}
/**
* Generate R syntax to set the working directory
* @return
*/
public static String getWorkingDirectory() {
return new StringBuilder("getwd();").toString();
}
/**
* Generate R syntax to load r packages
*
* @param packages
* @return
*/
public static String loadPackages(String[] packages) {
StringBuilder sb = new StringBuilder();
for (String lib : packages) {
sb.append(RSyntaxHelper.loadLibrary(lib));
}
return sb.toString();
}
/**
* Generate R synatx to load a single library
*
* @param library
* @return
*/
public static String loadLibrary(String library) {
StringBuilder rsb = new StringBuilder();
rsb.append("library(" + library + ");");
return rsb.toString();
}
/**
* Replace na values with the string "NA"
* @param tableName
* @param colName
* @return
*/
public static String replaceNAString(String tableName, List colName) {
// dt[,c("FRAME","AGE")] <- replace(dt[,c("FRAME","AGE")], is.na(dt[,c("FRAME","AGE")]), "NA");
String colVector = createStringRColVec(colName);
String subsetTable = tableName + "[," + colVector + "]";
StringBuilder rsb = new StringBuilder(subsetTable)
.append(" <- replace(").append(subsetTable).append(",is.na(")
.append(subsetTable).append("), \"NA\")");
return rsb.toString();
}
public static void main(String[] args) {
// // testing inner
// System.out.println("testing inner...");
// System.out.println("testing inner...");
// System.out.println("testing inner...");
// System.out.println("testing inner...");
// System.out.println("testing inner...");
//
// String returnTable = "tableToTest";
// String leftTableName = "x";
// String rightTableName = "y";
// String joinType = "inner.join"; // left.outer.join, right.outer.join, outer.join
// List> joinCols = new Vector>();
// Map join1 = new HashMap();
// join1.put("", "");
// joinCols.add(join1);
//
// // when you get to multi
// // Map join2 = new HashMap();
// // join2.put("", "");
// // joinCols.add(join2);
//
// System.out.println(getMergeSyntax(returnTable, leftTableName, rightTableName, joinType, joinCols));
//
// System.out.println("testing left...");
// System.out.println("testing left...");
// System.out.println("testing left...");
// System.out.println("testing left...");
// System.out.println("testing left...");
//
// returnTable = "tableToTest";
// joinType = "left.outer.join"; // left.outer.join, right.outer.join, outer.join
//
// System.out.println(getMergeSyntax(returnTable, leftTableName, rightTableName, joinType, joinCols));
}
}