All Downloads are FREE. Search and download functionalities are using the official Maven repository.

prerna.reactor.algorithms.RunDocumentSummarizationReactor Maven / Gradle / Ivy

The newest version!
package prerna.reactor.algorithms;

import java.io.File;

import org.apache.logging.log4j.Logger;

import prerna.algorithm.api.ITableDataFrame;
import prerna.ds.r.RDataTable;
import prerna.ds.r.RSyntaxHelper;
import prerna.reactor.frame.r.AbstractRFrameReactor;
import prerna.sablecc2.om.PixelDataType;
import prerna.sablecc2.om.PixelOperationType;
import prerna.sablecc2.om.nounmeta.NounMetadata;
import prerna.util.DIHelper;
import prerna.util.UploadInputUtility;
import prerna.util.Utility;

public class RunDocumentSummarizationReactor extends AbstractRFrameReactor {

	/**
	 * Generates a dashboard summarizing a chosen document that is entered by the
	 * user
	 */

	// CreateFrame ( R ) .as ( [ 'FRAME950929' ] ) | RunDocumentSummarization(desiredOutput = ["Summarize Text"], fileOrigin = ["File Path"], userInput = ["C:/Users/chrilong/Desktop/GoTAppeal.doc"],numSentences = ["5"],numTopics = ["5"],numTopicTerms=["6"])

	protected static final String CLASS_NAME = RunDocumentSummarizationReactor.class.getName();
	private static final String DIR_SEPARATOR = java.nio.file.FileSystems.getDefault().getSeparator();
	private static final String FILE_ORIGIN = "fileOrigin";
	private static final String USER_INPUT = "userInput";
	private static final String NUM_SENTENCES = "numSentences";
	private static final String NUM_TOPICS = "numTopics";
	private static final String NUM_TOPIC_TERMS = "numTopicTerms";

	public RunDocumentSummarizationReactor() {
		this.keysToGet = new String[] { FILE_ORIGIN, USER_INPUT, NUM_SENTENCES,NUM_TOPICS, NUM_TOPIC_TERMS };
	}

	@Override
	public NounMetadata execute() {
		// initialize reactor
		init();
		organizeKeys();
		int stepCounter = 1;
		Logger logger = this.getLogger(CLASS_NAME);

		// get inputs
		String fileOrigin = this.keyValue.get(FILE_ORIGIN);
		String userInput = this.keyValue.get(USER_INPUT);
		String numSentences = this.keyValue.get(NUM_SENTENCES);
		String numTopics = this.keyValue.get(NUM_TOPICS);
		String numTopicTerms = this.keyValue.get(NUM_TOPIC_TERMS);

		// set up frame names
		String summaryFrame = "Summary" + Utility.getRandomString(5);
		String topicsFrame = "Topics" + Utility.getRandomString(5);
		String keywordsFrame = "Keywords" + Utility.getRandomString(5);

		// set up the return R frame
		ITableDataFrame table = getFrame();
		if (!(table instanceof RDataTable)) {
			throw new IllegalArgumentException("Frame must be an R Frame for Document Summarizer");
		}
		RDataTable frame = (RDataTable) table;
		String frameName = frame.getName();
		
		// make sure that user has correct R version
		if(checkRVersion(3,4)) {
			throw new IllegalArgumentException("Document Summary requires at least R 3.4");
		}
		
		// Check Packages
		logger.info(stepCounter + ". Checking R Packages and Necessary Files");
		stepCounter++;
		System.out.println("");
		String[] packages = new String[] { "readtext", "xml2", "rvest", "lexRankr", "textrank", "udpipe", "textreuse",
				"stringr", "textmineR", "textreadr", "pdftools", "antiword", "dplyr" , "tm" };
		this.rJavaTranslator.checkPackages(packages);

		// start the R script
		logger.info(stepCounter + ". Building script to summarize document");
		stepCounter++;
		StringBuilder rsb = new StringBuilder();
		String baseFolder = DIHelper.getInstance().getProperty("BaseFolder");
		String wd = "wd" + Utility.getRandomString(5);
		rsb.append(frameName + " <- NULL;");
		rsb.append(wd + "<- getwd();");
		rsb.append(("setwd(\"" + baseFolder + DIR_SEPARATOR + "R" + DIR_SEPARATOR + "AnalyticsRoutineScripts\");").replace("\\", "/"));
		rsb.append("source(\"text_summary.R\");");

		// if file path, make sure that the file exists
		if (fileOrigin.equals("File Path")) {
			userInput = UploadInputUtility.getFilePath(this.store, this.insight, USER_INPUT);
			userInput = userInput.replace("\\", "/");
			File f = new File(Utility.normalizePath(userInput));
			if (!f.exists()) {
				throw new IllegalArgumentException("File does not exist at that location");
			}
		}

		// adjust the file origin parameter to the R syntax
		fileOrigin = adjustFileOriginParameter(fileOrigin);

		// create the summary, topics, and keywords table
		rsb.append(summaryFrame + " <- summarize_text(" + fileOrigin + " = \"" + userInput + "\", topN = " + numSentences + ");");
		rsb.append(topicsFrame + " <- summarize_topics_text(" + fileOrigin + " = \"" + userInput + "\" , topTopics = " + numTopics + ", " + "topTerms = " + numTopicTerms + ");");
		rsb.append(keywordsFrame + " <- text_keywords(" + fileOrigin + " = \"" + userInput + "\");");

		// adjust the keyword and freq column naming issue
		rsb.append("setnames(" + topicsFrame + ", old=c(\"freq\",\"keyword\", \"text\"), new=c(\"topic_frequency\", \"topic_keywords\", \"topic_summary\"));");
		rsb.append("setnames(" + keywordsFrame + ", old=c(\"freq\"), new=c(\"keyword_frequency\"));");

		// now lets put it all into one frame for dashboard purposes
		rsb.append(frameName + "<- bind_rows(as.data.frame(" + summaryFrame + ")," + topicsFrame + ");");
		rsb.append(frameName + "<- bind_rows(" + frameName + "," + keywordsFrame + ");");

		// change the summary frame column
		rsb.append("setnames(" + frameName + ", old=c(\"" + summaryFrame + "\"), new=c(\"summary\"));");
		
		// need to change the integer columns to numeric
		rsb.append(frameName + "$ngram <- as.numeric("+frameName+"$ngram);");
		rsb.append(frameName + "$keyword_frequency <- as.numeric("+frameName+"$keyword_frequency);");
		rsb.append(frameName + "$topic_frequency <- as.numeric("+frameName+"$topic_frequency);");
		
		// reset wd and clean up
		rsb.append("setwd(" + wd + ");");
		rsb.append("rm(" + summaryFrame + "," + topicsFrame + "," + keywordsFrame + ");");
		
		// make sure that script ran correctly, and throw helpful error if not
		String isError = "documentSummaryError" + Utility.getRandomString(5);
		rsb.append(isError + " <- \"\";");
		
		// run the r script
		logger.info(stepCounter + ". Summarizing document");
		stepCounter++;
		this.rJavaTranslator.runR(rsb.toString());
		this.rJavaTranslator.executeEmptyR(RSyntaxHelper.asDataTable(frameName, frameName));
		logger.info(stepCounter + ". Visualizing Data");
		stepCounter++;
		
		// make sure script ran correctly
		Boolean errorCheck = this.rJavaTranslator.getBoolean("!exists(\"" + isError + "\")");
		if(errorCheck) {
			throw new IllegalArgumentException("Document could not be summarized");
		}

		// now create the frame and return the noun
		RDataTable newTable = createNewFrameFromVariable(frameName);
		this.insight.setDataMaker(newTable);
		NounMetadata noun = new NounMetadata(newTable, PixelDataType.FRAME, PixelOperationType.FRAME_DATA_CHANGE, PixelOperationType.FRAME_HEADERS_CHANGE);
		this.insight.getVarStore().put(frameName, noun);
		return noun;
	}

	///////////////////////// ORGANIZING PARAMS /////////////////////////
	
	private String adjustFileOriginParameter(String fileOrigin) {
		if (fileOrigin.equals("File Path")) {
			fileOrigin = "filename";
		} else if (fileOrigin.equals("URL")) {
			fileOrigin = "page_url";
		} else if (fileOrigin.equals("Enter Manually")) {
			fileOrigin = "content";
		} else {
			throw new IllegalArgumentException("Improper File Origin");
		}
		return fileOrigin;
	}
	
	private boolean checkRVersion(int majorReq, int minorReq) {
		// relies on the following R format (seems like its been this format for > 6 years
		// R version 3.5.0 (2018-04-23)
		String rVersion = this.rJavaTranslator.getString("R.version.string");
		int major = Integer.parseInt(rVersion.substring(10, 11));
		int minor = Integer.parseInt(rVersion.substring(12, 13));
		if(major < 3 || (major <= majorReq && minor < minorReq)) {
			return true;
		} else {
			return false;
		}
	}

	///////////////////////// KEYS /////////////////////////////////////

	@Override
	protected String getDescriptionForKey(String key) {
		if (key.equals(FILE_ORIGIN)) {
			return "The format of the document to be summarized (file path, url, or free text)";
		} else if(key.equals(USER_INPUT)) {
			return "The actual file, whether that is a URL, file path, or free text";
		} else if (key.equals(NUM_SENTENCES)) {
			return "The number of sentences to be returned in the summary";
		} else if(key.equals(NUM_TOPICS)) {
			return "The number of major topics to be returned in the summary";
		} else if (key.equals(NUM_TOPIC_TERMS)) {
			return "The number of keywords within each major topic to be returned in the summary";
		} else {
			return super.getDescriptionForKey(key);
		}
	}
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy