// com.adobe.platform.operation.pdfops.ExtractPDFOperation Maven / Gradle / Ivy
//
// Go to download
// Show more of this group Show more artifacts with this name
// Show all versions of pdftools-extract-sdk Show documentation
// PDFTools Extract SDK
// The newest version!
/*
 * Copyright 2020 Adobe
 * All Rights Reserved.
 *
 * NOTICE: Adobe permits you to use, modify, and distribute this file in
 * accordance with the terms of the Adobe license agreement accompanying
 * it. If you have received this file from a source other than Adobe,
 * then your use, modification, or distribution of it requires the prior
 * written permission of Adobe.
 */

package com.adobe.platform.operation.pdfops;

import com.adobe.platform.operation.ExecutionContext;
import com.adobe.platform.operation.Operation;
import com.adobe.platform.operation.exception.ServiceApiException;
import com.adobe.platform.operation.exception.ServiceUsageException;
import com.adobe.platform.operation.internal.ExtensionMediaTypeMapping;
import com.adobe.platform.operation.internal.FileRefImpl;
import com.adobe.platform.operation.internal.InternalExecutionContext;
import com.adobe.platform.operation.internal.exception.OperationException;
import com.adobe.platform.operation.internal.service.ExtractPDFAPI;
import com.adobe.platform.operation.internal.util.FileUtil;
import com.adobe.platform.operation.internal.util.PathUtil;
import com.adobe.platform.operation.internal.util.StringUtil;
import com.adobe.platform.operation.io.FileRef;
import com.adobe.platform.operation.pdfops.constants.PDFElementType;
import com.adobe.platform.operation.pdfops.constants.TableStructureType;
import org.apache.commons.collections4.CollectionUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Objects;

/**
 * An Operation that extracts pdf elements such as text, images, tables in a structured format from a PDF.
 * <p>
 * Instances are created with {@link #createNew()}, configured through the fluent {@code set...}/{@code add...}
 * methods and run with {@link #execute(ExecutionContext)}. Once an execution has completed successfully the
 * instance cannot be executed again. The class performs no synchronization of its own.
 */
public class ExtractPDFOperation implements Operation {

    private static final Logger LOGGER = LoggerFactory.getLogger(ExtractPDFOperation.class);

    /**
     * Extension / media type of the operation result; the result is always a Zip file.
     */
    private static final ExtensionMediaTypeMapping RESULT_FORMAT = ExtensionMediaTypeMapping.ZIP;

    /**
     * Set to {@code true} after a successful execution; used to reject a second invocation.
     */
    private boolean isInvoked = false;


    /* Following are the list of inputs being set by client */

    /**
     * Reference to input file to be extracted
     */
    private FileRefImpl sourceFileRef;

    /**
     * List of pdf element types to be extracted in a structured format from input file
     */
    private final List<PDFElementType> elementsToExtract = new ArrayList<>();

    /**
     * List of pdf element types whose renditions needs to be extracted from input file
     */
    private final List<PDFElementType> elementsToExtractRenditions = new ArrayList<>();

    /**
     * Export table in specified format - csv / xlsx; {@code null} until the client sets one
     */
    private TableStructureType tableOutFormat = null;

    /**
     * Boolean specifying whether to add character level bounding boxes to output json
     */
    private Boolean getCharInfo = Boolean.FALSE;

    private ExtractPDFOperation() {
        // Instances are obtained through createNew().
    }

    /**
     * Constructs a {@code ExtractPDFOperation} instance.
     *
     * @return a new {@code ExtractPDFOperation} instance
     */
    public static ExtractPDFOperation createNew() {
        return new ExtractPDFOperation();
    }

    /**
     * Sets an input file.
     *
     * @param sourceFileRef an input file
     * @return current {@code ExtractPDFOperation} instance
     * @throws NullPointerException if {@code sourceFileRef} is {@code null}
     * @throws ClassCastException   if {@code sourceFileRef} is not a {@code FileRefImpl}
     */
    public ExtractPDFOperation setInputFile(FileRef sourceFileRef) {
        Objects.requireNonNull(sourceFileRef, "No input was set for operation");
        this.sourceFileRef = (FileRefImpl) sourceFileRef;
        return this;
    }

    /**
     * Adds a pdf element type for extracting structured information.
     *
     * @param elementToExtract pdf element type to be extracted; {@code null} is ignored
     * @return current {@code ExtractPDFOperation} instance
     */
    public ExtractPDFOperation addElementToExtract(PDFElementType elementToExtract) {
        if (elementToExtract != null) {
            this.elementsToExtract.add(elementToExtract);
        }
        return this;
    }

    /**
     * Specifies the format for extracting table data information.
     *
     * @param tableStructure type of format to output table data; {@code null} is ignored and any
     *                       previously set format is kept
     * @return current {@code ExtractPDFOperation} instance
     */
    public ExtractPDFOperation addTableStructureFormat(TableStructureType tableStructure) {
        if (tableStructure != null) {
            this.tableOutFormat = tableStructure;
        }
        return this;
    }

    /**
     * Boolean to get bounding boxes for characters present in text blocks(paragraphs, list, headings).
     *
     * @param charInfo whether character level bounding boxes are requested; {@code null} is ignored and
     *                 the current setting (initially {@code false}) is kept
     * @return current {@code ExtractPDFOperation} instance
     */
    public ExtractPDFOperation addCharInfo(Boolean charInfo) {
        // Mirror the other setters: a null argument must not overwrite a valid setting with null.
        if (charInfo != null) {
            this.getCharInfo = charInfo;
        }
        return this;
    }

    /**
     * Adds a pdf element type for extracting rendition.
     *
     * @param elementToExtractRenditions pdf element type whose renditions are to be extracted;
     *                                   {@code null} is ignored
     * @return current {@code ExtractPDFOperation} instance
     */
    public ExtractPDFOperation addElementToExtractRenditions(PDFElementType elementToExtractRenditions) {
        if (elementToExtractRenditions != null) {
            this.elementsToExtractRenditions.add(elementToExtractRenditions);
        }
        return this;
    }

    /**
     * Add pdf element types for extracting structured information.
     *
     * @param elementsToExtract list of pdf elements to be extracted; a {@code null} or empty list is
     *                          ignored, as are {@code null} entries inside the list
     * @return current {@code ExtractPDFOperation} instance
     */
    public ExtractPDFOperation addElementsToExtract(List<PDFElementType> elementsToExtract) {
        if (CollectionUtils.isNotEmpty(elementsToExtract)) {
            // Skip null entries so the bulk adder behaves like repeated single-element adds.
            for (PDFElementType elementToExtract : elementsToExtract) {
                if (elementToExtract != null) {
                    this.elementsToExtract.add(elementToExtract);
                }
            }
        }
        return this;
    }

    /**
     * Add pdf element types for extracting renditions.
     *
     * @param elementsToExtractRenditions list of pdf elements for extracting renditions; a {@code null}
     *                                    or empty list is ignored, as are {@code null} entries inside
     *                                    the list
     * @return current {@code ExtractPDFOperation} instance
     */
    public ExtractPDFOperation addElementsToExtractRenditions(List<PDFElementType> elementsToExtractRenditions) {
        if (CollectionUtils.isNotEmpty(elementsToExtractRenditions)) {
            // Skip null entries so the bulk adder behaves like repeated single-element adds.
            for (PDFElementType elementToExtractRenditions : elementsToExtractRenditions) {
                if (elementToExtractRenditions != null) {
                    this.elementsToExtractRenditions.add(elementToExtractRenditions);
                }
            }
        }
        return this;
    }

    /**
     * Executes this operation synchronously using the supplied context and returns a new FileRef instance for the resulting Zip file.
     * <p>
     * The resulting file may be stored in the system temporary directory (per java.io.tmpdir System property).
     * See {@link FileRef} for how temporary resources are cleaned up.
     * <p>
     * The instance is marked as invoked only after the result has been downloaded, so an execution that
     * fails with an exception does not prevent a later call.
     *
     * @param context the context in which to execute the operation
     * @return the resulting Zip file containing elements information and renditions
     * @throws ServiceApiException   if an API call results in an error response
     * @throws IOException           if there is an error in reading either the input source or the resulting Zip file
     * @throws ServiceUsageException if service usage limits have been reached or credentials quota has been exhausted.
     * @throws IllegalStateException if this instance has already been executed successfully
     * @throws NullPointerException  if no input file was set
     * @throws ClassCastException    if {@code context} is not an {@code InternalExecutionContext}
     */
    public FileRef execute(ExecutionContext context) throws ServiceApiException, IOException, ServiceUsageException {
        validateInvocationCount();
        // Fail fast, with the same message setInputFile uses, before any service call is attempted.
        Objects.requireNonNull(sourceFileRef, "No input was set for operation");
        InternalExecutionContext internalExecutionContext = (InternalExecutionContext) context;

        try {
            LOGGER.info("All validations successfully done. Beginning ExtractPDF operation execution");

            String location = ExtractPDFAPI.extractPdf(internalExecutionContext, sourceFileRef, elementsToExtract,
                    elementsToExtractRenditions, tableOutFormat, getCharInfo);

            String targetFileName = FileUtil.getRandomFileName(RESULT_FORMAT.getExtension());
            String temporaryDestinationPath = PathUtil.getTemporaryDestinationPath(targetFileName,
                    RESULT_FORMAT.getExtension());

            ExtractPDFAPI.downloadAndSave(internalExecutionContext, location, temporaryDestinationPath,
                    RESULT_FORMAT);

            isInvoked = true;
            LOGGER.info("Extract Operation Successful - Transaction ID: {}", StringUtil.getRequestIdFromLocation(location));

            return FileRef.createFromLocalFile(temporaryDestinationPath);
        } catch (OperationException oe) {
            // Translate the internal exception into the public API exception type.
            throw new ServiceApiException(oe.getErrorMessage(), oe.getRequestTrackingId(), oe.getStatusCode());
        }
    }

    /**
     * Rejects the call when this instance has already completed a successful execution.
     *
     * @throws IllegalStateException if the operation was already invoked
     */
    private void validateInvocationCount() {
        if (isInvoked) {
            LOGGER.error("Operation instance must only be invoked once");
            throw new IllegalStateException("Operation instance must not be reused, can only be invoked once");
        }
    }
}