All Downloads are FREE. Search and download functionalities are using the official Maven repository.

prerna.engine.impl.function.AWSTextractFunctionEngine Maven / Gradle / Ivy

The newest version!
package prerna.engine.impl.function;

import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import java.util.TreeMap;

import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;

import com.amazonaws.auth.AWSStaticCredentialsProvider;
import com.amazonaws.auth.BasicAWSCredentials;
import com.amazonaws.services.s3.AmazonS3;
import com.amazonaws.services.s3.AmazonS3ClientBuilder;
import com.amazonaws.services.s3.model.AmazonS3Exception;
import com.amazonaws.services.s3.model.GetObjectMetadataRequest;
import com.amazonaws.services.s3.model.HeadBucketRequest;
import com.amazonaws.services.s3.model.ObjectMetadata;
import com.amazonaws.services.s3.model.PutObjectRequest;
import com.amazonaws.services.textract.AmazonTextract;
import com.amazonaws.services.textract.AmazonTextractClientBuilder;
import com.amazonaws.services.textract.model.Block;
import com.amazonaws.services.textract.model.DocumentLocation;
import com.amazonaws.services.textract.model.GetDocumentTextDetectionRequest;
import com.amazonaws.services.textract.model.GetDocumentTextDetectionResult;
import com.amazonaws.services.textract.model.S3Object;
import com.amazonaws.services.textract.model.StartDocumentTextDetectionRequest;
import com.amazonaws.services.textract.model.StartDocumentTextDetectionResult;

import prerna.engine.api.IStorageEngine;
import prerna.util.Constants;
import prerna.util.Utility;

public class AWSTextractFunctionEngine extends AbstractFunctionEngine {

	private static final Logger classLogger = LogManager.getLogger(AWSTextractFunctionEngine.class);

	public static final String ACCESS_KEY = "ACCESS_KEY";
	public static final String SECRET_KEY = "SECRET_KEY";
	public static final String REGION = "REGION";
	public static final String BUCKETNAME = "BUCKETNAME";

	private String accessKey;
	private String secretKey;	
	private String region;
	private String bucketPath;	

	private AmazonTextract textractClient = null;

	@Override
	public void open(Properties smssProp) throws Exception {
		super.open(smssProp);

		this.accessKey = smssProp.getProperty(ACCESS_KEY);
		this.secretKey = smssProp.getProperty(SECRET_KEY);
		this.region = smssProp.getProperty(REGION);
		this.bucketPath = smssProp.getProperty(BUCKETNAME);

		if(this.requiredParameters == null || (this.requiredParameters.isEmpty())) {
			throw new RuntimeException("Must define the requiredParameters");
		}
		if(this.accessKey == null || this.accessKey.isEmpty()){
			throw new RuntimeException("Must pass in an access key");
		}		
		if(this.secretKey == null || this.secretKey.isEmpty()){
			throw new RuntimeException("Must pass in a secret key");
		}	
		if(this.region == null || this.region.isEmpty()){
			throw new RuntimeException("Must pass in a region");
		}
		if(this.bucketPath == null || this.bucketPath.isEmpty()) {
			throw new RuntimeException("Must pass in a S3BucketPath");		
		}		
		try {
			BasicAWSCredentials awsCreds = new BasicAWSCredentials(this.accessKey, this.secretKey);
			this.textractClient = AmazonTextractClientBuilder.standard()
					.withCredentials(new AWSStaticCredentialsProvider(awsCreds))
					.withRegion(this.region) 
					.build(); 
		} catch (Exception e) {
			classLogger.error(Constants.STACKTRACE, e);	
		} 
	}

	@Override
	public Object execute(Map parameterValues) {
		Object output = null;
		String documentKeyName = null;		
		String S3BucketEngineId = null;

		// validate all the required keys are set
		if(this.requiredParameters != null && !this.requiredParameters.isEmpty()) {
			Set missingPs = new HashSet<>();
			for(String requiredP : this.requiredParameters) {
				if(!parameterValues.containsKey(requiredP)) {
					missingPs.add(requiredP);
				}
			}
			if(!missingPs.isEmpty()) {
				throw new IllegalArgumentException("Must define required keys = " + missingPs);
			}
		}

		try {
			for(String k : parameterValues.keySet()) {
				if (k.contains("filepathInS3")) {	
					File file = new File(parameterValues.get(k).toString());
					documentKeyName = file.getName();
					String filePath = parameterValues.get(k).toString();
					int startIndex = filePath.indexOf('/') + 1;
					int endIndex = filePath.lastIndexOf('/');
					String folderPath;			        
					if (startIndex <= endIndex && startIndex < filePath.length()) {
						folderPath = filePath.substring(startIndex, endIndex);
						folderPath += "/"+documentKeyName;
					} else {
						folderPath = documentKeyName; // Handle the case where there is no subfolder
					}			     
					System.out.println("folderName: " + folderPath);

					int endIndex1 = filePath.indexOf('/');
					String bucketname = filePath.substring(0, endIndex1);
					System.out.println("bucketname"+bucketname);

					boolean identifyBucket = listObjects(bucketname, folderPath);
					if(identifyBucket) {
						output = textractFromDocument(documentKeyName,bucketname);		
					}else {			        	
						output = "Must provide the valid path";
						throw new RuntimeException("Must provide the valid path");
					}

				} else if(k.contains("uploadedfilepath") && k.contains("S3BucketEngineId")){
					if(parameterValues.containsKey("uploadedfilepath")) {
						File file = new File(parameterValues.get(k).toString());
						documentKeyName = file.getName(); // The name of the file in the bucket      
					} else if(parameterValues.containsKey("S3BucketEngineId")){
						S3BucketEngineId = parameterValues.get(k).toString();
					}

					/* 
			        BasicAWSCredentials awsCreds = new BasicAWSCredentials(this.accessKey, this.secretKey);
			        AmazonS3 s3Client = AmazonS3ClientBuilder.standard()
		                    .withRegion(this.region)
		                    .withCredentials(new AWSStaticCredentialsProvider(awsCreds))
		                    .build();

		            // Upload the file to the bucket
		            s3Client.putObject(new PutObjectRequest(this.S3BucketName, documentKeyName, file));       
					 */

					int startIndex = this.bucketPath.indexOf('/')+1;
					int endIndex = this.bucketPath.lastIndexOf('/');
					String folderS3 = null;
					String folderPath;			        
					if (startIndex <= endIndex && startIndex < this.bucketPath.length()) {
						folderPath = this.bucketPath.substring(startIndex, endIndex);
						folderS3 = folderPath;
						folderPath += "/"+documentKeyName;
					} else {
						folderPath = documentKeyName; // Handle the case where there is no subfolder
					}			     
					System.out.println("folderName: " + folderPath);


					int endIndex1 = this.bucketPath.indexOf('/');
					String bucketname = this.bucketPath.substring(0, endIndex1);
					System.out.println("bucketname"+bucketname);
					boolean identifyBucket = listObjects(bucketname, folderPath);			        

					IStorageEngine storage = Utility.getStorage(S3BucketEngineId);
					Map map = new HashMap<>();
					map.put("functionalityUsed",documentKeyName+"-textract_functionality");

					if(identifyBucket) {
						storage.syncLocalToStorage(folderPath,bucketname, map);	      
						output = textractFromDocument(documentKeyName,bucketname);		
					} else {
						createFolderinS3(bucketname, folderS3);
						storage.syncLocalToStorage(folderPath,bucketname, map);	      
						output = textractFromDocument(documentKeyName,bucketname);
					} 	            
				}
			}			
		} catch (Exception e) {
			classLogger.error(Constants.STACKTRACE, e);	
		}
		return output;
	}

	public List textractFromDocument(String documentName, String S3BucketPath){
		List extractedTextFromDoc = new ArrayList();    	
		try { 
			// Create the StartDocumentTextDetection request
			StartDocumentTextDetectionRequest request = new StartDocumentTextDetectionRequest()
					.withDocumentLocation(new DocumentLocation()
							.withS3Object(new S3Object()
									.withBucket(S3BucketPath)
									.withName(documentName)));

			// Start text detection
			StartDocumentTextDetectionResult result = this.textractClient.startDocumentTextDetection(request);

			//results
			GetDocumentTextDetectionRequest getRequest = new GetDocumentTextDetectionRequest().withJobId(result.getJobId());
			GetDocumentTextDetectionResult getResult;
			String nextToken = null;            

			do {
				getRequest.setNextToken(nextToken);
				do {
					getResult = this.textractClient.getDocumentTextDetection(getRequest);  
				} while (!getResult.getJobStatus().equals("SUCCEEDED"));
				nextToken = getResult.getNextToken();

				for (Block block : getResult.getBlocks()) {
					if ("PAGE".equals(block.getBlockType())) {
						int pageNumber = block.getPage();
						StringBuilder pageText = new StringBuilder();
						for (Block item : getResult.getBlocks()) {
							if (item.getPage() == pageNumber && "LINE".equals(item.getBlockType())) {
								pageText.append(item.getText());
							}
						}
						extractedTextFromDoc.add(pageText.toString());
					}
				}
			} while (nextToken != null);
			
			System.out.println(extractedTextFromDoc.size());
			System.out.println(extractedTextFromDoc);			        
		} catch (Exception e) {
			classLogger.error(Constants.STACKTRACE, e);	
			e.printStackTrace();
		}	
		return extractedTextFromDoc;
	}

	public Boolean listObjects(String bucketName, String folderPath){
		BasicAWSCredentials awsCreds = new BasicAWSCredentials(this.accessKey, this.secretKey);
		AmazonS3 s3Client = AmazonS3ClientBuilder.standard()
				.withRegion(this.region)
				.withCredentials(new AWSStaticCredentialsProvider(awsCreds))
				.build();
		boolean result = false;
		boolean bucketExists = doesBucketExist(s3Client, bucketName);
		System.out.println("Bucket exists: " + bucketExists);

		if (bucketExists) {
			try {
				// Check if the object exists
				s3Client.getObjectMetadata(new GetObjectMetadataRequest(bucketName, folderPath));
				System.out.println("File exists.");
				result = true;
			} catch (com.amazonaws.services.s3.model.AmazonS3Exception e) {
				if (e.getStatusCode() == 404) {
					result = false;
				} else {
					e.printStackTrace();
				}
			}
		} else {
			result = false;
		}

		return result;
	}

	private static boolean doesBucketExist(AmazonS3 s3Client, String bucketName) {
		try {
			s3Client.headBucket(new HeadBucketRequest(bucketName));
			return true;
		} catch (Exception e) {
			return false;
		}
	}

	private void createFolderinS3(String bucketName, String folderPath) {       
		BasicAWSCredentials awsCreds = new BasicAWSCredentials(this.accessKey, this.secretKey);
		AmazonS3 s3Client = AmazonS3ClientBuilder.standard()
				.withRegion(this.region)
				.withCredentials(new AWSStaticCredentialsProvider(awsCreds))
				.build();

		ByteArrayInputStream emptyInputStream = new ByteArrayInputStream(new byte[0]); 
		// Create an empty object (folder) in S3 
		s3Client.putObject(new PutObjectRequest(bucketName, folderPath, emptyInputStream, null));    	
	}

	@Override
	public void close() throws IOException {
		// TODO Auto-generated method stub

	}
	
	@Override
	public String getCatalogSubType(Properties smssProp) {
		return "AWS";
	}

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy