// com.adobe.platform.operation.pdfops.ExtractPDFOperation (Maven / Gradle / Ivy)
/*
* Copyright 2020 Adobe
* All Rights Reserved.
*
* NOTICE: Adobe permits you to use, modify, and distribute this file in
* accordance with the terms of the Adobe license agreement accompanying
* it. If you have received this file from a source other than Adobe,
* then your use, modification, or distribution of it requires the prior
* written permission of Adobe.
*/
package com.adobe.platform.operation.pdfops;
import com.adobe.platform.operation.ExecutionContext;
import com.adobe.platform.operation.Operation;
import com.adobe.platform.operation.exception.ServiceApiException;
import com.adobe.platform.operation.exception.ServiceUsageException;
import com.adobe.platform.operation.internal.ExtensionMediaTypeMapping;
import com.adobe.platform.operation.internal.FileRefImpl;
import com.adobe.platform.operation.internal.InternalExecutionContext;
import com.adobe.platform.operation.internal.exception.OperationException;
import com.adobe.platform.operation.internal.service.ExtractPDFAPI;
import com.adobe.platform.operation.internal.util.FileUtil;
import com.adobe.platform.operation.internal.util.PathUtil;
import com.adobe.platform.operation.internal.util.StringUtil;
import com.adobe.platform.operation.io.FileRef;
import com.adobe.platform.operation.pdfops.constants.PDFElementType;
import com.adobe.platform.operation.pdfops.constants.TableStructureType;
import org.apache.commons.collections4.CollectionUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Objects;
/**
* An Operation that extracts pdf elements such as text, images, tables in a structured format from a PDF.
*/
public class ExtractPDFOperation implements Operation {
private static final Logger LOGGER = LoggerFactory.getLogger(ExtractPDFOperation.class);
/**
* Variable to check if the operation instance was invoked more than once
*/
private boolean isInvoked = false;
/**
* Field representing the extension of the operation result
*/
private static ExtensionMediaTypeMapping resultFormat = ExtensionMediaTypeMapping.ZIP;
/* Following are the list of inputs being set by client */
/**
* Reference to input file to be extracted
*/
private FileRefImpl sourceFileRef;
/**
* List of pdf element types to be extracted in a structured format from input file
*/
private List elementsToExtract = new ArrayList();
/**
* List of pdf element types whose renditions needs to be extracted from input file
*/
private List elementsToExtractRenditions = new ArrayList();
/**
* export table in specified format - csv / xlsx
*/
private TableStructureType tableOutFormat = null;
/**
* Boolean specifying whether to add character level bounding boxes to output json
*/
private Boolean getCharInfo = Boolean.FALSE;
private ExtractPDFOperation() {
}
/**
* Constructs a {@code ExtractPDFOperation} instance.
*
* @return a new {@code ExtractPDFOperation} instance
*/
public static ExtractPDFOperation createNew() {
return new ExtractPDFOperation();
}
/**
* Sets an input file.
*
* @param sourceFileRef an input file
* @return current {@code ExtractPDFOperation} instance
*/
public ExtractPDFOperation setInputFile(FileRef sourceFileRef) {
Objects.requireNonNull(sourceFileRef, "No input was set for operation");
this.sourceFileRef = (FileRefImpl) sourceFileRef;
return this;
}
/**
* Adds a pdf element type for extracting structured information.
*
* @param elementToExtract list of pdf elements to be extracted
* @return current {@code ExtractPDFOperation} instance
*/
public ExtractPDFOperation addElementToExtract(PDFElementType elementToExtract) {
if (elementToExtract != null) {
this.elementsToExtract.add(elementToExtract);
}
return this;
}
/**
* Specifies the format for extracting table data information.
*
* @param tableStructure type of format to output table data
* @return current {@code ExtractPDFOperation} instance
*/
public ExtractPDFOperation addTableStructureFormat(TableStructureType tableStructure){
if(tableStructure != null) {
this.tableOutFormat = tableStructure;
}
return this;
}
/**
* Boolean to get bounding boxes for characters present in text blocks(paragraphs, list, headings).
*
* @param charInfo boolean
* @return current {@code ExtractPDFOperation} instance
*/
public ExtractPDFOperation addCharInfo(Boolean charInfo) {
this.getCharInfo = charInfo;
return this;
}
/**
* Adds a pdf element type for extracting rendition.
*
* @param elementToExtractRenditions list of pdf elements to be extracted
* @return current {@code ExtractPDFOperation} instance
*/
public ExtractPDFOperation addElementToExtractRenditions(PDFElementType elementToExtractRenditions) {
if (elementToExtractRenditions != null) {
this.elementsToExtractRenditions.add(elementToExtractRenditions);
}
return this;
}
/**
* Add pdf element types for extracting structured information.
*
* @param elementsToExtract list of pdf elements to be extracted
* @return current {@code ExtractPDFOperation} instance
*/
public ExtractPDFOperation addElementsToExtract(List elementsToExtract) {
if (CollectionUtils.isNotEmpty(elementsToExtract)) {
this.elementsToExtract.addAll(elementsToExtract);
}
return this;
}
/**
* Add pdf element types for extracting renditions.
*
* @param elementsToExtractRenditions list of pdf elements for extracting renditions
* @return current {@code ExtractPDFOperation} instance
*/
public ExtractPDFOperation addElementsToExtractRenditions(List elementsToExtractRenditions) {
if (CollectionUtils.isNotEmpty(elementsToExtractRenditions)) {
this.elementsToExtractRenditions.addAll(elementsToExtractRenditions);
}
return this;
}
/**
* Executes this operation synchronously using the supplied context and returns a new FileRef instance for the resulting Zip file.
*
* The resulting file may be stored in the system temporary directory (per java.io.tmpdir System property).
* See {@link FileRef} for how temporary resources are cleaned up.
*
* @param context the context in which to execute the operation
* @return the resulting Zip file containing elements information and renditions
* @throws ServiceApiException if an API call results in an error response
* @throws IOException if there is an error in reading either the input source or the resulting Zip file
* @throws ServiceUsageException if service usage limits have been reached or credentials quota has been exhausted.
*/
public FileRef execute(ExecutionContext context) throws ServiceApiException, IOException, ServiceUsageException {
validateInvocationCount();
InternalExecutionContext internalExecutionContext = (InternalExecutionContext) context;
try {
LOGGER.info("All validations successfully done. Beginning ExtractPDF operation execution");
long startTimeMs = System.currentTimeMillis();
String location = ExtractPDFAPI.extractPdf(internalExecutionContext, sourceFileRef, elementsToExtract,
elementsToExtractRenditions, tableOutFormat, getCharInfo);
String targetFileName = FileUtil.getRandomFileName(resultFormat.getExtension());
String temporaryDestinationPath = PathUtil.getTemporaryDestinationPath(targetFileName,
resultFormat.getExtension());
ExtractPDFAPI.downloadAndSave(internalExecutionContext, location, temporaryDestinationPath,
resultFormat);
isInvoked = true;
LOGGER.info("Extract Operation Successful - Transaction ID: {}", StringUtil.getRequestIdFromLocation(location));
return FileRef.createFromLocalFile(temporaryDestinationPath);
} catch (OperationException oe) {
throw new ServiceApiException(oe.getErrorMessage(), oe.getRequestTrackingId(), oe.getStatusCode());
}
}
private void validateInvocationCount() {
if (isInvoked) {
LOGGER.error("Operation instance must only be invoked once");
throw new IllegalStateException("Operation instance must not be reused, can only be invoked once");
}
}
}