All Downloads are FREE. Search and download functionalities are using the official Maven repository.

prerna.reactor.frame.FrameProfileReactor Maven / Gradle / Ivy

The newest version!
package prerna.reactor.frame;

import java.io.IOException;
import java.util.Map;

import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;

import prerna.algorithm.api.DataFrameTypeEnum;
import prerna.algorithm.api.ITableDataFrame;
import prerna.algorithm.api.SemossDataType;
import prerna.ds.OwlTemporalEngineMeta;
import prerna.engine.api.IHeadersDataRow;
import prerna.engine.api.IRawSelectWrapper;
import prerna.query.querystruct.SelectQueryStruct;
import prerna.query.querystruct.filters.OrQueryFilter;
import prerna.query.querystruct.filters.SimpleQueryFilter;
import prerna.query.querystruct.selectors.QueryColumnSelector;
import prerna.query.querystruct.selectors.QueryFunctionHelper;
import prerna.query.querystruct.selectors.QueryFunctionSelector;
import prerna.reactor.imports.ImportUtility;
import prerna.sablecc2.om.PixelDataType;
import prerna.sablecc2.om.PixelOperationType;
import prerna.sablecc2.om.ReactorKeysEnum;
import prerna.sablecc2.om.nounmeta.NounMetadata;
import prerna.util.Constants;

/**
 * This class lets user profile the data in a frame. It analyzes the data in a
 * frame and then provides a comprehensive summary on Min, Max, Sum, count of
 * blanks, count of unique values, Avg and count of null values in each column.
 * 
 * The input to this routine will be the current frame and a new frame will be
 * the output with the profiling data.
 *
 */

public class FrameProfileReactor extends AbstractFrameReactor{

	private static final Logger classLogger = LogManager.getLogger(FrameProfileReactor.class);
	
	private static final String DOUBLE_UNDERSCORE = "__";
	private static final String NA = "NA";
	private static final String DOUBLE_EQUALS = "==";
	
	public FrameProfileReactor() {
		this.keysToGet = new String[] {ReactorKeysEnum.FRAME.getKey()};
	}
	
	@Override
	public NounMetadata execute() {
		classLogger.info("Starting Frame Profile execution.");
		
		organizeKeys();
		ITableDataFrame actFrame = getFrame();
		String tableName = actFrame.getName();
		
		/*
		 * Get the datatype of the columns and store it in a map for further processing.
		 */
		OwlTemporalEngineMeta metaData = actFrame.getMetaData();
		Map headerDatatypeMap = metaData.getHeaderToTypeMap();
		String[] actFrameHeaders = actFrame.getQsHeaders();
		
		/*
		 * Create a new frame of type ITableDataFrame. When dealing with R/Python frame we 
		 * want to use the existing user connection. For this we use the FrameFactory, as this
		 * takes in the insight which contains the user space and gives back the required frame. 
		 */
		ITableDataFrame newFrame = null;
		try {
			// Pass the frame type as grid and blank in the alias param so that you get a new frame.
			newFrame = FrameFactory.getFrame(this.insight, DataFrameTypeEnum.GRID.getTypeAsString(), "");
		}catch(Exception e) {
			throw new IllegalArgumentException("Error occurred trying to create frame of type " + DataFrameTypeEnum.GRID.getTypeAsString(), e);
		}
		String newFrameAlias = newFrame.getName();
		
		
		/*
		 * The new headers and data types that will be included in the frame. This can change
		 * afterwards and is just for the first iteration of the functionality.
		 */
		String[] newHeaders = new String[] { "TABLE_NAME", "COLUMN_NAME", "NUMBER_OF_BLANKS", 
				"NUM_OF_UNIQUE_VALUES", "MIN", "AVG", "MAX", "SUM", 
				"NUMBER_OF_NULLS" };
		String[] dataTypes = new String[] { "String", "String", "Double", "Double", 
				"Double", "Double", "Double", "Double" , "Double" };
		
		/*
		 * Update metadata for new frame
		 */
		OwlTemporalEngineMeta newFrameMetadata = newFrame.getMetaData();
		String newTableName = newFrame.getName();
		/*
		 * Using below utility to fill up the metadata in the new frame.
		 */
		ImportUtility.parseTableColumnsAndTypesToFlatTable(newFrameMetadata, newHeaders, dataTypes, newTableName);
		 
		/*
		 * Iterate over the headers and profile each column based on the datatype. If
		 * column is numeric, then call getNumericalProfileDataOnFrame() because we have
		 * to perform min/max/avg etc on the column data. Else treat every other col as
		 * string and call getStringProfileDataOnFrame().
		 */
		for(String header : actFrameHeaders) {
			String[] row = null;
			String colName = header.split(DOUBLE_UNDERSCORE)[1];
			if(SemossDataType.isNotString(headerDatatypeMap.get(header))){
				row = getNumericalProfileDataOnFrame( actFrame, tableName, colName);
			}else {
				row = getStringProfileDataOnFrame(actFrame, tableName, colName); 
			}
			newFrame.addRow(row, newHeaders);
			classLogger.info("New row added to the new frame for column " + colName);
		}
		
		/*
		 * Create the NounMetadata and store it in the varStore object.
		 */
		NounMetadata noun = new NounMetadata(newFrame, PixelDataType.FRAME, PixelOperationType.FRAME);
		this.insight.getVarStore().put(newFrameAlias, noun);
		classLogger.info("New Frame created with alias and has been set in the insight**********" + newFrameAlias);
		classLogger.info("Completed execution of FrameProfileReactor.");
		return noun;
	}
	
	
	/**
	 * Profile data on columns which are of Numerical datatype in the frame.
	 * @param frame
	 * @param tableName
	 * @param colName
	 * @return
	 */
	private String[] getNumericalProfileDataOnFrame(ITableDataFrame frame, String tableName, String colName) {
		//----FORMAT------
		//"table_name", "column_name", "numOfBlanks", "numOfUniqueValues", "minValue", "averageValue", "maxValue", "sumValue", "numOfNullValues"
		//"String", 		"String", 		"Double", 		"Double",		  "Double", 	"Double", 	  "Double",   "Double" , 		"Double" 
		String[] row = new String[9];
		row[0] = tableName;
		row[1] = colName;
		row[2] = getNumOfBlanks(frame, colName);
		computeNumericalProfilingData(row, frame, colName);
		row[8] = getNumOfNullValues(frame, colName);
		return row;
	}
	
	/**
	 * Profile data on columns which are of String datatype in the frame.
	 * @param frame
	 * @param tableName
	 * @param colName
	 * @return
	 */
	private String[] getStringProfileDataOnFrame(ITableDataFrame frame, String tableName, String colName) {
		//----FORMAT------
		//"table_name", "column_name", "numOfBlanks", "numOfUniqueValues", "minValue", "averageValue", "maxValue", "sumValue", "numOfNullValues"
		//"String", 		"String", 		"Double", 		"Double",		  "Double", 	"Double", 	  "Double",   "Double" , 		"Double" 
		String[] row = new String[9];
		
		row[0] = tableName;
		row[1] = colName;
		row[2] = getNumOfBlanks(frame, colName);
		row[3] = getNumOfUniqueValues(frame, colName);
		row[4] = NA;
		row[5] = NA;
		row[6] = NA;
		row[7] = NA;
		row[8] = getNumOfNullValues(frame, colName); // Need to debug. Getting exception
		return row;
	}
	
	/*
	 * We query on the frame in the below utility methods, hence we use the query method in the frame. 
	 * This helps us to run the queries in a frametype agnostic manner.
	 */
	
	/*
	 * ----------------------------Profile routines-----------------------------
	 */
	
	/**
	 * Get the number of blank values in a column which is of String datatype
	 * @param frame
	 * @param colName
	 * @return
	 */
	private String getNumOfBlanks(ITableDataFrame frame, String colName) {
		
		SelectQueryStruct qs = new SelectQueryStruct();
		QueryFunctionSelector uniqueCountSelector = new QueryFunctionSelector();
		uniqueCountSelector.setFunction(QueryFunctionHelper.COUNT);
		QueryColumnSelector innerSelector = new QueryColumnSelector(colName);
		uniqueCountSelector.addInnerSelector(innerSelector);
		qs.addSelector(uniqueCountSelector);
		
		QueryColumnSelector colSelector = new QueryColumnSelector(colName);
		SimpleQueryFilter blankFilter = new SimpleQueryFilter(new NounMetadata(colSelector, PixelDataType.COLUMN),
				DOUBLE_EQUALS, new NounMetadata("", PixelDataType.CONST_STRING));
		qs.addExplicitFilter(blankFilter);
		
		int blankCnts = 0;
		IRawSelectWrapper blanksWrapper = null;
		try {
			blanksWrapper = frame.query(qs);
			if(blanksWrapper.hasNext()) {
				blankCnts = ((Number)blanksWrapper.next().getValues()[0]).intValue();
			}else {
				classLogger.info("Blanks Wrapper is empty. No blanks!");
			}
		}catch(Exception e) {
			classLogger.warn("Exception during execution of query." + e.getMessage());
			classLogger.error(Constants.STACKTRACE, e);
		}finally {
			//Cleanup.
			if(blanksWrapper != null) {
				try {
					blanksWrapper.close();
				} catch (IOException e) {
					classLogger.error(Constants.STACKTRACE, e);
				}
			}
		}
		
		return Integer.toString(blankCnts);
	}
	
	/**
	 * Get the number of unique values in a column which is of String datatype.
	 * @param frame
	 * @param colName
	 * @return
	 */
	private String getNumOfUniqueValues(ITableDataFrame frame, String colName) {

		SelectQueryStruct qs_unique = new SelectQueryStruct();
		QueryFunctionSelector functionSelector = new QueryFunctionSelector();
		functionSelector.setFunction(QueryFunctionHelper.UNIQUE_COUNT);
		QueryColumnSelector inSelector = new QueryColumnSelector(colName);
		functionSelector.addInnerSelector(inSelector);
		functionSelector.setDistinct(true);
		qs_unique.addSelector(functionSelector);
		qs_unique.setQsType(SelectQueryStruct.QUERY_STRUCT_TYPE.FRAME);

		IRawSelectWrapper uniqueWrapper = null;
		Object value = null;
		try {
			uniqueWrapper = frame.query(qs_unique);
			if(uniqueWrapper.hasNext()) {
				value = uniqueWrapper.next().getValues()[0];
			}else {
				classLogger.info("Unique Wrapper is empty. No unique elements.");
			}
		} catch (Exception e) {
			classLogger.warn("Exception during execution of query." + e.getMessage());
			classLogger.error(Constants.STACKTRACE, e);
		} finally {
			if (uniqueWrapper != null) {
				try {
					uniqueWrapper.close();
				} catch (IOException e) {
					classLogger.error(Constants.STACKTRACE, e);
				}
			}
		}

		return String.valueOf(value);
	}
	
	/**
	 * Get the number of null values in a column with String datatype.
	 * @param frame
	 * @param colName
	 * @return
	 */
	private String getNumOfNullValues(ITableDataFrame frame, String colName) {

		SelectQueryStruct qs_null = new SelectQueryStruct();
		QueryFunctionSelector uniqueCountSelector = new QueryFunctionSelector();
		uniqueCountSelector.setFunction(QueryFunctionHelper.COUNT);
		QueryColumnSelector innerSelector = new QueryColumnSelector(colName);
		uniqueCountSelector.addInnerSelector(innerSelector);
		qs_null.addSelector(uniqueCountSelector);
		QueryColumnSelector col = new QueryColumnSelector(colName);
		SimpleQueryFilter nullFilter = new SimpleQueryFilter(new NounMetadata(col, PixelDataType.COLUMN), 
				DOUBLE_EQUALS,
				new NounMetadata(null, PixelDataType.NULL_VALUE));
		SimpleQueryFilter stringNullFilter = new SimpleQueryFilter(new NounMetadata(col, PixelDataType.COLUMN), 
				DOUBLE_EQUALS,
				new NounMetadata("null", PixelDataType.NULL_VALUE));
		OrQueryFilter orfilter = new OrQueryFilter();
		orfilter.addFilter(nullFilter);
		orfilter.addFilter(stringNullFilter);
		qs_null.addExplicitFilter(orfilter);

		int nullCount = 0;
		IRawSelectWrapper nullWrapper = null;
		try {
			nullWrapper = frame.query(qs_null);
			if(nullWrapper.hasNext()) {
				nullCount = ((Number) nullWrapper.next().getValues()[0]).intValue();
			}else {
				classLogger.info("Null Wrapper is empty. No null values.");
			}
		} catch (Exception e) {
			classLogger.warn("Exception during execution of query." + e.getMessage());
			classLogger.error(Constants.STACKTRACE, e);
		}finally {
			if(nullWrapper != null) {
				try {
					nullWrapper.close();
				} catch (IOException e) {
					classLogger.error(Constants.STACKTRACE, e);
				}
			}
		}

		return Integer.toString(nullCount);
	}
	
	/*
	 * ----------------------------Profile routines for numerical columns-----------------------------
	 */
	
	/**
	 * Get unique/min/max/avg/sum over the numerical columns
	 * @param row
	 * @param frame
	 * @param colName
	 */
	private void computeNumericalProfilingData(String[] row, ITableDataFrame frame, String colName) {
		SelectQueryStruct qs = new SelectQueryStruct();
		QueryColumnSelector innerSelector = new QueryColumnSelector(colName);
		qs.addSelector(getUniqueCountFunctionSelector(innerSelector, colName));
		qs.addSelector(getMinFunctionSelector(innerSelector, colName));
		qs.addSelector(getAvgFunctionSelector(innerSelector, colName));
		qs.addSelector(getMaxFunctionSelector(innerSelector, colName));
		qs.addSelector(getSumFunctionSelector(innerSelector, colName));
		
		qs.setQsType(SelectQueryStruct.QUERY_STRUCT_TYPE.FRAME);
		IRawSelectWrapper resultWrapper = null;
		try {
			resultWrapper = frame.query(qs);
			if(resultWrapper.hasNext()) {
				IHeadersDataRow datarow = resultWrapper.next();
				Object[] dataArr = datarow.getValues();
				int offset = 3;
				for(int i = 0; i < dataArr.length; i++) {
					row[offset + i] = String.valueOf(dataArr[i]);
				}
			}else {
				classLogger.info("Wrapper for calculating unique/min/max/avg/sum is empty for column name " + colName);
			}
		}catch(Exception e) {
			classLogger.warn("Exception during execution of query." + e.getMessage());
			classLogger.error(Constants.STACKTRACE, e);
		}
	}
	
	/**
	 * Create and return a Unique Count QueryFunctionSelector
	 * @param innerSelector
	 * @param colName
	 * @return
	 */
	private static QueryFunctionSelector getUniqueCountFunctionSelector(QueryColumnSelector innerSelector, String colName) {
		QueryFunctionSelector uniquefuncSelector = new QueryFunctionSelector();
		uniquefuncSelector.setFunction(QueryFunctionHelper.UNIQUE_COUNT);
		uniquefuncSelector.addInnerSelector(innerSelector);
		return uniquefuncSelector;
	}
	
	/**
	 * Create and return a Min QueryFunctionSelector
	 * @param innerSelector
	 * @param colName
	 * @return
	 */
	private static QueryFunctionSelector getMinFunctionSelector(QueryColumnSelector innerSelector, String colName) {
		QueryFunctionSelector minFuncSelector = new QueryFunctionSelector();
		minFuncSelector.setFunction(QueryFunctionHelper.MIN);
		minFuncSelector.addInnerSelector(innerSelector);
		return minFuncSelector;
	}
	
	/**
	 * Create and return an Average QueryFunctionSelector
	 * @param innerSelector
	 * @param colName
	 * @return
	 */
	private static QueryFunctionSelector getAvgFunctionSelector(QueryColumnSelector innerSelector, String colName) {
		QueryFunctionSelector avgFuncSelector = new QueryFunctionSelector();
		avgFuncSelector.setFunction(QueryFunctionHelper.AVERAGE_1);
		avgFuncSelector.addInnerSelector(innerSelector);
		return avgFuncSelector;
	}
	
	/**
	 * Create and return a Max QueryFunctionSelector
	 * @param innerSelector
	 * @param colName
	 * @return
	 */
	private static QueryFunctionSelector getMaxFunctionSelector(QueryColumnSelector innerSelector, String colName) {
		QueryFunctionSelector maxFuncSelector = new QueryFunctionSelector();
		maxFuncSelector.setFunction(QueryFunctionHelper.MAX);
		maxFuncSelector.addInnerSelector(innerSelector);
		return maxFuncSelector;
	}
	
	/**
	 * Create and return a Sum QueryFunctionSelector
	 * @param innerSelector
	 * @param colName
	 * @return
	 */
	private static QueryFunctionSelector getSumFunctionSelector(QueryColumnSelector innerSelector, String colName) {
		QueryFunctionSelector sumFuncSelector = new QueryFunctionSelector();
		sumFuncSelector.setFunction(QueryFunctionHelper.SUM);
		sumFuncSelector.addInnerSelector(innerSelector);
		return sumFuncSelector;
	}
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy