All downloads are free. Search and download functionality uses the official Maven repository.

prerna.reactor.algorithms.xray.GenerateXRayMatchingReactor Maven / Gradle / Ivy

The newest version!
package prerna.reactor.algorithms.xray;

import java.util.List;
import java.util.Map;
import java.util.Vector;

import org.apache.logging.log4j.Logger;

import prerna.ds.r.RDataTable;
import prerna.ds.r.RSyntaxHelper;
import prerna.masterdatabase.utility.MasterDatabaseUtility;
import prerna.reactor.frame.r.AbstractRFrameReactor;
import prerna.sablecc2.om.PixelDataType;
import prerna.sablecc2.om.PixelOperationType;
import prerna.sablecc2.om.ReactorKeysEnum;
import prerna.sablecc2.om.nounmeta.NounMetadata;
import prerna.util.Constants;
import prerna.util.DIHelper;
import prerna.util.Utility;

/**
 * Reactor that runs the X-Ray matching routine in R.
 * <p>
 * It first delegates to {@link GenerateXRayHashingReactor} (sharing this
 * reactor's noun store and insight) to produce the hash files, then sources
 * {@code R/XRay/matching.R} and calls {@code run_lsh_matching} on those files.
 * The resulting R variable is converted to a data.table, wrapped in an
 * {@link RDataTable} and returned as a frame noun.
 */
public class GenerateXRayMatchingReactor extends AbstractRFrameReactor {

	public static final String CLASS_NAME = GenerateXRayMatchingReactor.class.getName();
	
	/** Input key for the similarity threshold passed to {@code run_lsh_matching}. */
	public static final String SIMILARITY_KEY = "similarity";
	/** Input key for the candidate threshold used to pick the minhash/band configuration. */
	public static final String CANDIDATE_KEY = "candidate";
	/** Input key for the boolean controlling whether columns of the same database are matched. */
	public static final String MATCH_SAME_DB_KEY = "matchSameDb";
	/** Input key registered in {@code keysToGet}; it is not read directly in this class. */
	public static final String ROW_MATCHING = "rowComparison";

	/** Message used whenever the hashing step did not yield usable output. */
	private static final String HASH_ERROR_MESSAGE = "Error occurred trying to generate hash for xray";
	/** Fallback used when a threshold is missing, unparsable or outside [0, 1]. */
	private static final double DEFAULT_THRESHOLD = 0.01;

	public GenerateXRayMatchingReactor() {
		this.keysToGet = new String[] {ReactorKeysEnum.FILE_PATH.getKey(), ReactorKeysEnum.SPACE.getKey(), 
				ReactorKeysEnum.DATABASE.getKey(), ReactorKeysEnum.OVERRIDE.getKey(), ReactorKeysEnum.CONFIG.getKey(),
				SIMILARITY_KEY, CANDIDATE_KEY, MATCH_SAME_DB_KEY, ROW_MATCHING};
	}
	
	/**
	 * Runs hashing followed by LSH matching and returns the matching frame.
	 *
	 * @return a FRAME noun wrapping the new {@link RDataTable}; it carries a
	 *         success message and, when only one database was selected while
	 *         same-database matching was disabled, an additional warning
	 * @throws IllegalArgumentException if the hashing reactor did not return a
	 *         map containing a non-empty file list and a database id list
	 */
	@Override
	public NounMetadata execute() {
		GenerateXRayHashingReactor hashReactor = new GenerateXRayHashingReactor();
		hashReactor.In();
		hashReactor.setNounStore(this.store);
		hashReactor.setInsight(this.insight);
		NounMetadata successfulHash = hashReactor.execute();
		
		// validate the hashing output explicitly instead of relying on a catch-all
		Object hashValue = (successfulHash == null) ? null : successfulHash.getValue();
		if(!(hashValue instanceof Map)) {
			throw new IllegalArgumentException(HASH_ERROR_MESSAGE);
		}
		@SuppressWarnings("unchecked")
		Map<String, Object> filesHash = (Map<String, Object>) hashValue;
		// specify the specific files to use
		@SuppressWarnings("unchecked")
		List<String> fileNames = (List<String>) filesHash.get(GenerateXRayHashingReactor.FILES_KEY);
		if(fileNames == null || fileNames.isEmpty()) {
			throw new IllegalArgumentException(HASH_ERROR_MESSAGE);
		}
		@SuppressWarnings("unchecked")
		List<String> databaseIds = (List<String>) filesHash.get(GenerateXRayHashingReactor.DATABASE_IDS_KEY);
		if(databaseIds == null) {
			throw new IllegalArgumentException(HASH_ERROR_MESSAGE);
		}
		
		// grab values from the hashReactor
		// since it already had to grab from store
		init();
		Logger logger = this.getLogger(CLASS_NAME);
		this.keyValue = hashReactor.keyValue;
		// get the exact files that were generated
		String folderPath = hashReactor.getFolderPath();
		List<String> filePaths = new Vector<>(fileNames.size());
		for(String fileName : fileNames) {
			filePaths.add(folderPath + "/" + fileName);
		}
		
		// get other parameters for xray script
		int instancesThreshold = 1;
		double similarityThreshold = getSimiliarityThreshold();
		double candidateThreshold = getCandidateThreshold();
		int[] lshConfig = getMinhashAndBands(candidateThreshold);
		int nMinhash = lshConfig[0];
		int nBands = lshConfig[1];
		
		// check if user wants to compare columns from the same database
		// this is the boolean value passed into R script
		boolean matchSameDB = true;
		String matchSameDbInput = this.keyValue.get(MATCH_SAME_DB_KEY);
		if(matchSameDbInput != null) {
			matchSameDB = Boolean.parseBoolean(matchSameDbInput);
		}
		// with a single database the only possible matches are within it,
		// so force same-database matching and remember to warn the user
		boolean addSameDbWarn = false;
		if(databaseIds.size() == 1) {
			addSameDbWarn = !matchSameDB;
			matchSameDB = true;
		}
		
		// source the R script
		this.rJavaTranslator.executeEmptyR("source(\"" 
				+ DIHelper.getInstance().getProperty(Constants.BASE_FOLDER).replace('\\', '/') 
				+ "/R/XRay/matching.R\", local=TRUE);");

		logger.info("Running matching routine");
		String rFrameName = "xray_" + Utility.getRandomString(4);
		String script = rFrameName + " <- run_lsh_matching(" 
				+ RSyntaxHelper.createStringRColVec(filePaths) + ", " 
				+ nMinhash
				+ ", " + nBands 
				+ ", " + similarityThreshold 
				+ ", " + instancesThreshold 
				+ ", \";\", " 
				+ (matchSameDB ? "TRUE" : "FALSE") + ");";
		this.rJavaTranslator.executeEmptyR(script);
		logger.info("Done matching");
		
		this.rJavaTranslator.executeEmptyR(rFrameName + "<- as.data.table(" + rFrameName + ");");
		
		// see if we can replace database ids with database name
		// only safe when every id has an alias and all aliases are distinct
		boolean replaceIds = true;
		List<String> databaseNames = new Vector<>(databaseIds.size());
		for(String databaseId : databaseIds) {
			String databaseName = MasterDatabaseUtility.getDatabaseAliasForId(databaseId);
			if(databaseName == null || databaseNames.contains(databaseName)) {
				replaceIds = false;
				break;
			}
			databaseNames.add(databaseName);
		}
		if(replaceIds) {
			StringBuilder replaceSyntax = new StringBuilder();
			String sourceDbId = rFrameName + "$Source_Database_Id";
			String targetDbId = rFrameName + "$Target_Database_Id";
			for(int i = 0; i < databaseIds.size(); i++) {
				// escape so quotes/backslashes in an alias cannot break the R statement
				String databaseId = escapeForRString(databaseIds.get(i));
				String databaseName = escapeForRString(databaseNames.get(i));
				
				replaceSyntax.append(sourceDbId + "[" + sourceDbId + " == \"" + databaseId + "\"] <- \"" + databaseName + "\";");
				replaceSyntax.append(targetDbId + "[" + targetDbId + " == \"" + databaseId + "\"] <- \"" + databaseName + "\";");
			}
			this.rJavaTranslator.executeEmptyR(replaceSyntax.toString());
		}
		
		RDataTable matchingFrame = createNewFrameFromVariable(rFrameName);
		NounMetadata noun = new NounMetadata(matchingFrame, PixelDataType.FRAME, PixelOperationType.FRAME, PixelOperationType.FRAME_HEADERS_CHANGE, PixelOperationType.FRAME_DATA_CHANGE);
		noun.addAdditionalReturn(NounMetadata.getSuccessNounMessage("Successfully ran LSH for matching column values"));
		if(addSameDbWarn) {
			noun.addAdditionalReturn(NounMetadata.getWarningNounMessage("Since only one database was selected, altered input value to perform same database matching"));
		}
		// store the frame in the insight for use
		this.insight.getVarStore().put(rFrameName, noun);
		// set as default frame
		if(this.insight.getDataMaker() == null) {
			this.insight.setDataMaker(matchingFrame);
		}
		
		return noun;
	}
	
	/**
	 * Picks the number of minhashes and bands for a candidate threshold.
	 * <p>
	 * The cut-offs are tested in ascending order so that every row of the
	 * table is reachable; previously {@code <= 0.03} was tested first, which
	 * made the {@code <= 0.02} and {@code <= 0.01} rows dead code.
	 *
	 * @param candidateThreshold the candidate threshold
	 * @return a two element array: {@code [nMinhash, nBands]}
	 */
	private static int[] getMinhashAndBands(double candidateThreshold) {
		if (candidateThreshold <= 0.01) {
			return new int[] {34480, 17240};
		} else if (candidateThreshold <= 0.02) {
			return new int[] {8620, 4310};
		} else if (candidateThreshold <= 0.03) {
			return new int[] {3640, 1820};
		} else if (candidateThreshold <= 0.05) {
			return new int[] {1340, 670};
		} else if (candidateThreshold <= 0.1) {
			return new int[] {400, 200};
		} else if (candidateThreshold <= 0.2) {
			return new int[] {200, 100};
		} else if (candidateThreshold <= 0.4) {
			return new int[] {210, 70};
		} else if (candidateThreshold <= 0.5) {
			return new int[] {200, 50};
		}
		return new int[] {200, 40};
	}
	
	/**
	 * Escapes backslashes and double quotes so the value can be embedded in a
	 * double-quoted R string literal. A null value is rendered as the text
	 * {@code null}, matching plain string concatenation.
	 *
	 * @param value the raw value
	 * @return the escaped value
	 */
	private static String escapeForRString(String value) {
		return String.valueOf(value).replace("\\", "\\\\").replace("\"", "\\\"");
	}
	
	/**
	 * Reads a threshold from {@code keyValue}.
	 *
	 * @param key the input key to read
	 * @return the parsed value when it is a number within [0, 1]; otherwise
	 *         {@link #DEFAULT_THRESHOLD} (missing, unparsable, NaN or out of range)
	 */
	private double readThreshold(String key) {
		String raw = this.keyValue.get(key);
		if (raw == null) {
			return DEFAULT_THRESHOLD;
		}
		double parsed;
		try {
			parsed = Double.parseDouble(raw);
		} catch(NumberFormatException ignored) {
			// not a number - fall back to the default
			return DEFAULT_THRESHOLD;
		}
		// written as a positive range test so NaN is rejected as well
		if (parsed >= 0 && parsed <= 1) {
			return parsed;
		}
		return DEFAULT_THRESHOLD;
	}
	
	/**
	 * Get xray param to set the candidate threshold to match data
	 * @return candidateThreshold, defaulting to 0.01 when absent or invalid
	 */
	private double getCandidateThreshold() {
		return readThreshold(CANDIDATE_KEY);
	}

	/**
	 * Get xray param to get similarity threshold
	 * @return similarityThreshold, defaulting to 0.01 when absent or invalid
	 */
	private double getSimiliarityThreshold() {
		return readThreshold(SIMILARITY_KEY);
	}
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy