/*
 * prerna.reactor.imports.MergeFramesReactor
 *
 * Source listing taken from the SEMOSS ("semoss") artifact as published for
 * Maven / Gradle / Ivy; this is the newest published version.
 */
package prerna.reactor.imports;

import java.io.File;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.Vector;

import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;

import prerna.algorithm.api.ITableDataFrame;
import prerna.algorithm.api.SemossDataType;
import prerna.ds.OwlTemporalEngineMeta;
import prerna.ds.TinkerFrame;
import prerna.ds.nativeframe.NativeFrame;
import prerna.ds.py.PandasFrame;
import prerna.ds.r.RDataTable;
import prerna.ds.r.RSyntaxHelper;
import prerna.engine.api.IHeadersDataRow;
import prerna.engine.api.IRawSelectWrapper;
import prerna.om.Insight;
import prerna.om.InsightFile;
import prerna.query.querystruct.AbstractQueryStruct;
import prerna.query.querystruct.CsvQueryStruct;
import prerna.query.querystruct.ExcelQueryStruct;
import prerna.query.querystruct.SelectQueryStruct;
import prerna.query.querystruct.filters.SimpleQueryFilter;
import prerna.query.querystruct.selectors.IQuerySelector;
import prerna.query.querystruct.selectors.QueryColumnSelector;
import prerna.reactor.AbstractReactor;
import prerna.sablecc2.om.GenRowStruct;
import prerna.sablecc2.om.Join;
import prerna.sablecc2.om.PixelDataType;
import prerna.sablecc2.om.PixelOperationType;
import prerna.sablecc2.om.execptions.SemossPixelException;
import prerna.sablecc2.om.nounmeta.NounMetadata;
import prerna.util.Constants;
import prerna.util.usertracking.UserTrackerFactory;

public class MergeFramesReactor extends AbstractReactor {
	
	// class-level logger; execute() uses it to record exceptions raised while merging
	private static final Logger classLogger = LogManager.getLogger(MergeFramesReactor.class);
	
	// key names that the constructor collects into keysToGet
	// NOTE: execute() declares locals named sourceFrame, targetFrame and joinType
	// which shadow these constants inside that method
	private static final String sourceFrame = "source";
	private static final String targetFrame = "target";
	private static final String sourceCols = "sourceCols";
	private static final String targetCols = "targetCols";
	private static final String joinType = "jType";

	// fully-qualified name of this class; passed to getLogger(...) in execute()
	private static final String CLASS_NAME = MergeFramesReactor.class.getName();
	
	public MergeFramesReactor() {
		this.keysToGet = new String[]{sourceFrame, targetFrame, sourceCols, targetCols, joinType}; 

	}

	@Override
	public NounMetadata execute()  {
		ITableDataFrame sourceFrame = getSourceFrame();
		ITableDataFrame targetFrame = getTargetFrame();
		
		if(sourceFrame.isEmpty() || targetFrame.isEmpty()) {
			throw new IllegalArgumentException("Attempting to merge with an empty frame");
		}
		
		// set the logger into the frames
		Logger logger = getLogger(CLASS_NAME);
		sourceFrame.setLogger(logger);
		targetFrame.setLogger(logger);

		// first convert the join to use the physical frame name in the selector
		List joins = getJoins();
		joins = convertJoins(joins, sourceFrame.getMetaData(), targetFrame.getMetaData());
		
		// let us try to optimize 
		// if the frames are the same type
		
		// 1) are both R frames
		// 2) are both Python frames
		// 3) are both Native frames and using the same engine
		// 4) have go through a task + CSV - if one is native, must flush out to another frame
		
		// TODO: need to add for tracking...
		boolean optimized = false;
		// they are the same type
		if(sourceFrame.getFrameType() == targetFrame.getFrameType()) {
			// 1) are they both R
			if(sourceFrame instanceof RDataTable) {
				optimized = true;
				
				String joinType = null;
				List> joinCols = new ArrayList>();
				for(Join joinItem : joins) {
					joinType = joinItem.getJoinType();
					// in R, the existing column is referenced as frame__column
					// but the R syntax only wants the col
					Map joinColMapping = new HashMap();
					String jLeftColumn = joinItem.getLColumn();
					if(jLeftColumn.contains("__")) {
						jLeftColumn = jLeftColumn.split("__")[1];
					}
					String jRightColumn = joinItem.getRColumn();
					if(jRightColumn.contains("__")) {
						jRightColumn = jRightColumn.split("__")[1];
					}
					joinColMapping.put(jLeftColumn, jRightColumn);
					joinCols.add(joinColMapping);
				}
				
				// few steps to perform within this
				// a) need to rename the columns if they exist in both source and target and not part of the join
				// b) need to perform the merge
				// c) need to update the metadata
				
				String sourceFrameName = sourceFrame.getName();
				
				Map leftTableTypes = targetFrame.getMetaData().getHeaderToTypeMap();
				Map rightTableTypes = sourceFrame.getMetaData().getHeaderToTypeMap();
				
				Set leftTableHeaders = leftTableTypes.keySet();
				Set rightTableHeaders = rightTableTypes.keySet();
				Set rightTableJoinCols = AbstractImporter.getRightJoinColumns(joins);
				
				Map rightTableAlias = new HashMap();

				// note, we are not going to modify the existing headers
				// even though the query builder code allows for it
				for(String leftTableHeader : leftTableHeaders) {
					if(leftTableHeader.contains("__")) {
						leftTableHeader = leftTableHeader.split("__")[1];
					}
					// instead of making the method return a boolean and then having to perform
					// another ignore case match later on
					// we return the match and do a null check
					String dupRightTableHeader = AbstractImporter.setIgnoreCaseMatch(leftTableHeader, rightTableHeaders, rightTableJoinCols);
					if(dupRightTableHeader != null) {
						rightTableAlias.put(dupRightTableHeader, leftTableHeader + "_1");
					}
				}
				
				
				
				
				String mergeString = RSyntaxHelper.getMergeSyntax(targetFrame.getName(), targetFrame.getName(), sourceFrame.getName(),  joinType, joinCols);
				((RDataTable) targetFrame).executeRScript(mergeString);
				((RDataTable) targetFrame).recreateMeta();

			}
			
			// 2) are they both Py
			else if(sourceFrame instanceof PandasFrame) {
				optimized = true;
				
				String joinType = null;
				List> joinCols = new ArrayList>();
				List joinComparators = new ArrayList<>();
				boolean nonEqui = false;
				for(Join joinItem : joins) {
					joinType = joinItem.getJoinType();
					// in R, the existing column is referenced as frame__column
					// but the R syntax only wants the col
					Map joinColMapping = new HashMap();
					String jLeftColumn = joinItem.getLColumn();
					if(jLeftColumn.contains("__")) {
						jLeftColumn = jLeftColumn.split("__")[1];
					}
					String jRightColumn = joinItem.getRColumn();
					if(jRightColumn.contains("__")) {
						jRightColumn = jRightColumn.split("__")[1];
					}
					joinColMapping.put(jLeftColumn, jRightColumn);
					joinCols.add(joinColMapping);
					String joinComparator = joinItem.getComparator();
					if (!joinComparator.equals("=")) {
						nonEqui = true;
					}
					joinComparators.add(joinComparator);
				}
				
				// few steps to perform within this
				// a) need to rename the columns if they exist in both source and target and not part of the join
				// b) need to perform the merge
				// c) need to update the metadata
				
				((PandasFrame) targetFrame).merge(targetFrame.getName(), targetFrame.getName(), sourceFrame.getName(), joinType, joinCols, joinComparators, nonEqui);
				((PandasFrame) targetFrame).recreateMeta();
				((PandasFrame) targetFrame).replaceWrapperDataFromFrame();

			}
			
			// 3) are they both native
			else if(sourceFrame instanceof NativeFrame) {
				// need to ensure they are the same engine
				NativeFrame sourceNFrame = (NativeFrame) sourceFrame;
				NativeFrame targetNFrame = (NativeFrame) targetFrame;
				if(sourceNFrame.getEngineId().equals(targetNFrame.getEngineId())) {
					
					SelectQueryStruct sourceQs = sourceNFrame.getQueryStruct();
					SelectQueryStruct targetQs = targetNFrame.getQueryStruct();
					
					// at the moment, cannot merge from 2 custom froms
					if(sourceQs.getCustomFrom() == null && targetQs.getCustomFrom() == null) {
						optimized = true;
						targetQs.merge(sourceQs);
						//targetQs.addRelation(fromConcept, toConcept, joinType);
						
						NativeImporter importer = (NativeImporter) ImportFactory.getImporter(targetNFrame, targetQs, null);
						// we reassign the frame because it might have changed
						// this only happens for native frame
						try {
							importer.appendNecessaryRels(joins);
							importer.insertData();
						} catch (Exception e) {
							// TODO Auto-generated catch block
							classLogger.error(Constants.STACKTRACE, e);
						}
					}
				}
			}
		}
		
		if(!optimized) {
			// we will query and flush this out ...
			SelectQueryStruct qs = sourceFrame.getMetaData().getFlatTableQs(true);
			qs.setFrame(sourceFrame);
			qs.setQsType(AbstractQueryStruct.QUERY_STRUCT_TYPE.FRAME);
			ITableDataFrame mergeFrame = null;
			if(targetFrame instanceof NativeFrame) {
				try {
					SelectQueryStruct nativeQs = ((NativeFrame)targetFrame).getQueryStruct();
					//the qs's frame on the target frame is the sourceFrame. Later on during the import process for 
					// taking this native frame to an R frame, we read the qs then get qs.getFrame to generate metadata
					// from frame.getMeta. this will break since it has the sourceFrame's meta.
					// unless we reset the qs's frame to the targetFrame, we won't hold the meta anywhere
					((NativeFrame) targetFrame).getQueryStruct().setFrame(targetFrame);
					mergeFrame = mergeNative(targetFrame, qs, joins);
				} catch (Exception e) {
					classLogger.error(Constants.STACKTRACE, e);
					throw new SemossPixelException(e.getMessage());
				}
			} else if(qs != null) {
				try {
					mergeFrame = mergeFromQs(targetFrame, qs, joins);
				} catch (Exception e) {
					classLogger.error(Constants.STACKTRACE, e);
					throw new SemossPixelException(e.getMessage());
				}
			}
			// clear cached info after merge
			targetFrame.clearCachedMetrics();
			targetFrame.clearQueryCache();
			
			NounMetadata noun = new NounMetadata(mergeFrame, PixelDataType.FRAME, PixelOperationType.FRAME_DATA_CHANGE, PixelOperationType.FRAME_HEADERS_CHANGE);
			// in case we generated a new frame
			// update existing references
			if(mergeFrame != targetFrame) {
				if(targetFrame.getName() != null) {
					this.insight.getVarStore().put(targetFrame.getName(), noun);
				} 
				if(targetFrame == this.insight.getVarStore().get(Insight.CUR_FRAME_KEY).getValue()) {
					this.insight.setDataMaker(mergeFrame);
				}
			}
			
			return noun;
		}
		
		// clear cached info after merge
		targetFrame.clearCachedMetrics();
		targetFrame.clearQueryCache();
		
		NounMetadata noun = new NounMetadata(targetFrame, PixelDataType.FRAME, PixelOperationType.FRAME_DATA_CHANGE, PixelOperationType.FRAME_HEADERS_CHANGE);
		return noun;
	}
	
	/**
	 * Merge the data described by a query struct into a frame through the
	 * importer that {@link ImportFactory} selects for that frame and query struct
	 * @param frame		the frame to merge into
	 * @param qs		the query struct describing the data to merge
	 * @param joins		the joins to apply for the merge
	 * @return			the frame returned by the importer; it might not be the frame passed in
	 * @throws Exception 
	 */
	private ITableDataFrame mergeNative(ITableDataFrame frame, SelectQueryStruct qs, List<Join> joins) throws Exception {
		// track GA data
		UserTrackerFactory.getInstance().trackDataImport(this.insight, qs);

		IImporter importer = ImportFactory.getImporter(frame, qs);
		importer.setInsight(this.insight);
		// we return the importer's frame because it might have changed
		// this only happens for native frame
		return importer.mergeData(joins);
	}

	/**
	 * Merge via a QS that we will execute into an iterator
	 * @param frame
	 * @param qs
	 * @param joins
	 * @return
	 * @throws Exception 
	 */
	private ITableDataFrame mergeFromQs(ITableDataFrame frame, SelectQueryStruct qs, List joins) throws Exception {
		// track GA data
		UserTrackerFactory.getInstance().trackDataImport(this.insight, qs);

		// if we have an inner join, add the current values as a filter on the query
		// important for performance on large dbs when the user has already 
		// filtered to small subset
		boolean noDataError = false;
		try {
			for(Join j : joins) {
				// the join format is
				// LHS = COLUMN NAME OF THE FRAME I AM MERGING INTO 
				// RHS = COLUMN NAME OF THE NEW DATA WE ARE JOINING TO
				// LHS IS WHAT IS MAINTAINED AFTER THE JOIN
				// RHS IS THE NAME IN THE QUERY
				String leftColumnJoin = j.getLColumn();
				String rColumnJoin = j.getRColumn();
				String type = j.getJoinType();

				if(leftColumnJoin.contains("__")) {
					leftColumnJoin = leftColumnJoin.split("__")[1];
				}
				if(rColumnJoin.contains("__")) {
					rColumnJoin = rColumnJoin.split("__")[1];
				}
				
				if(type.equals("inner.join") || type.equals("left.outer.join")) {
					// we need to make sure we apply the filter correctly!
					// remember, RHS is the alias we provide the selector
					// but might not match the physical
					if(!qs.hasColumn(rColumnJoin)) {
						IQuerySelector selector = qs.findSelectorFromAlias(rColumnJoin);
						// get the correct q
						if(selector == null) {
							throw new IllegalArgumentException("There is an error with the join. Please make sure the columns are matched appropriately based on the frame you want to maintain");
						}
						rColumnJoin = selector.getQueryStructName();
					}
					// we will add a filter frame existing values in frame
					// but wait... need to make sure an existing filter isn't there
					if(qs.hasFiltered(rColumnJoin)) {
						continue;
					}

					// if current frame is empty
					// well, you will end up with no data
					// unless you are on a graph, which will just append nodes
					// as there is no real concept of joins currently
					if(frame.isEmpty()) {
						noDataError = true;
						throw new IllegalArgumentException("Attempting to join new data with an empty frame. End result is still an empty frame.");
					}

					SelectQueryStruct filterQs = new SelectQueryStruct();
					QueryColumnSelector column = new QueryColumnSelector(leftColumnJoin);
					filterQs.addSelector(column);
					try {
						Iterator it = frame.query(filterQs);
						List