com.shaft.tools.io.PdfFileManager Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of SHAFT_ENGINE Show documentation
SHAFT is a unified test automation engine. Powered by best-in-class frameworks, SHAFT provides a wizard-like syntax to drive your automation efficiently, maximize your ROI, and minimize your learning curve. Stop reinventing the wheel. Upgrade now!
There is a newer version: 8.2.20240402
Show newest version
package com.shaft.tools.io;

import com.shaft.cli.FileActions;
import com.shaft.tools.internal.support.JavaHelper;
import com.shaft.tools.io.internal.FailureReporter;
import org.apache.commons.io.FileUtils;
import org.apache.pdfbox.cos.COSDocument;
import org.apache.pdfbox.io.RandomAccessBufferedFileInputStream;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;

import java.io.File;
import java.io.IOException;

@SuppressWarnings("unused")
public class PdfFileManager {

    private final File file;
    private RandomAccessBufferedFileInputStream stream = null;
    private PDFParser parser = null;
    private COSDocument cosDoc = null;
    private PDFTextStripper strip = null;

    public PdfFileManager(String folderName, String fileName, int numberOfRetries) {

        boolean doesFileExist = FileActions.getInstance().doesFileExist(folderName, fileName, numberOfRetries);

        file = new File(FileActions.getInstance().getAbsolutePath(folderName, fileName));

        if (!doesFileExist) {
            FailureReporter.fail("Couldn't find the provided file [" + file
                    + "]. It might need to wait more to download or the path isn't correct");
        }
    }

    public PdfFileManager(String pdfFilePath) {
        pdfFilePath = JavaHelper.appendTestDataToRelativePath(pdfFilePath);
        boolean doesFileExist = FileActions.getInstance().doesFileExist(pdfFilePath);
        file = new File(FileActions.getInstance().getAbsolutePath(pdfFilePath));
        if (!doesFileExist) {
            FailureReporter.fail("Couldn't find the provided file [" + file
                    + "]. It might need to wait more to download or the path isn't correct");
        }
    }

    public String readFileContent() {
        return PdfFileManager.readFileContent(file.getPath());
    }

    /**
     * Read PDF file content given relative path and optionally delete the file after reading it
     *
     * @param relativeFilePath       relative path to the PDF file
     * @param deleteFileAfterReading optional boolean to delete the file after reading it or not, default is to leave the file as is
     * @return a string value representing the entire content of the pdf file
     */
    public static String readFileContent(String relativeFilePath, boolean... deleteFileAfterReading) {
        if (FileActions.getInstance().doesFileExist(relativeFilePath)) {
            try {
                var randomAccessBufferedFileInputStream = new RandomAccessBufferedFileInputStream(new File(FileActions.getInstance().getAbsolutePath(relativeFilePath)));
                var pdfParser = new PDFParser(randomAccessBufferedFileInputStream);
                pdfParser.parse();
                var pdfTextStripper = new PDFTextStripper();
                pdfTextStripper.setSortByPosition(true);
                var fileContent = pdfTextStripper.getText(new PDDocument(pdfParser.getDocument()));
                randomAccessBufferedFileInputStream.close();

                if (deleteFileAfterReading != null
                        && deleteFileAfterReading.length > 0
                        && deleteFileAfterReading[0]) {
                    FileActions.getInstance().deleteFile(relativeFilePath);
                }
                return fileContent;
            } catch (java.io.IOException rootCauseException) {
                FailureReporter.fail(PdfFileManager.class, "Failed to read this PDF file [" + relativeFilePath + "].", rootCauseException);
            }

        } else {
            FailureReporter.fail("This PDF file [" + relativeFilePath + "] doesn't exist.");
        }
        return "";
    }

    /**
     * @param startPageNumber                 the starting page for the document to
     *                                        be validated
     * @param endPageNumber                   the ending page for the document to be
     *                                        validated
     * @param deleteFileAfterValidationStatus the status of deleting the file after
     *                                        get the document text
     * @return returns the pdf content in string so that can be validated
     */
    public String readPDFContentFromDownloadedPDF(int startPageNumber, int endPageNumber,
                                                  DeleteFileAfterValidationStatus deleteFileAfterValidationStatus) {

        stream = readFileInputStream(file);
        parser = parseStreamDocument(stream);

        cosDoc = getParsedDocument(parser);
        String content = getPdfText(cosDoc, startPageNumber, endPageNumber);
        closeStreamAndDeleteFile(file, stream, deleteFileAfterValidationStatus);

        return content;
    }

    public String readPDFContentFromDownloadedPDF(DeleteFileAfterValidationStatus deleteFileAfterValidationStatus) {

        stream = readFileInputStream(file);
        parser = parseStreamDocument(stream);

        cosDoc = getParsedDocument(parser);
        String content = getPdfText(cosDoc);
        closeStreamAndDeleteFile(file, stream, deleteFileAfterValidationStatus);

        return content;
    }

    private RandomAccessBufferedFileInputStream readFileInputStream(File file) {
        try {
            stream = new RandomAccessBufferedFileInputStream(file);
        } catch (IOException rootCauseException) {
            FailureReporter.fail(PdfFileManager.class, "Couldn't read the data from the provided file [" + file + "].", rootCauseException);
        }
        return stream;
    }

    private PDFParser parseStreamDocument(RandomAccessBufferedFileInputStream stream) {
        try {
            parser = new PDFParser(stream);
            parser.parse();
        } catch (IOException rootCauseException) {
            FailureReporter.fail(PdfFileManager.class, "Couldn't parse the stream that opened the document to be prepared to populate the COSDocument object.", rootCauseException);
        }
        return parser;
    }

    private COSDocument getParsedDocument(PDFParser parser) {
        try {
            cosDoc = parser.getDocument();
        } catch (IOException rootCauseException) {
            FailureReporter.fail(PdfFileManager.class, "Couldn't get the document that was parsed. Check that the document parsed before get the document.", rootCauseException);
        }
        return cosDoc;
    }

    private String getPdfText(COSDocument cosDoc, int startPageNumber, int endPageNumber) {
        try {
            strip = new PDFTextStripper();
            // By default, text extraction is done in the same sequence as the text in the
            // PDF page content stream. PDF is a graphic format, not a text format, and
            // unlike HTML, it has no requirements that text one on page be rendered in a
            // certain order. The order is the one that was determined by the software that
            // created the PDF
            // To get text sorted from left to right and top to bottom
            strip.setSortByPosition(true);
        } catch (IOException rootCauseException) {
            FailureReporter.fail(PdfFileManager.class, "Couldn't load PDFTextStripper properties.", rootCauseException);
        }

        strip.setStartPage(startPageNumber);
        strip.setEndPage(endPageNumber);
        PDDocument pdDoc = new PDDocument(cosDoc);

        String content = null;
        try {
            content = strip.getText(pdDoc);
        } catch (IOException rootCauseException) {
            FailureReporter.fail(PdfFileManager.class, "Couldn't get document text. Document state is invalid or it is encrypted.", rootCauseException);
        }
        return content;
    }

    private String getPdfText(COSDocument cosDoc) {
        try {
            strip = new PDFTextStripper();
            strip.setSortByPosition(true);
        } catch (IOException rootCauseException) {
            FailureReporter.fail(PdfFileManager.class, "Couldn't load PDFTextStripper properties.", rootCauseException);
        }

        PDDocument pdDoc = new PDDocument(cosDoc);

        String content = null;
        try {
            content = strip.getText(pdDoc);
        } catch (IOException rootCauseException) {
            FailureReporter.fail(PdfFileManager.class, "Couldn't get document text. Document state is invalid or it is encrypted.", rootCauseException);
        }
        return content;
    }

    private void closeStreamAndDeleteFile(File file, RandomAccessBufferedFileInputStream stream,
                                          DeleteFileAfterValidationStatus deleteFileAfterValidation) {
        try {
            stream.close();
        } catch (IOException rootCauseException) {
            FailureReporter.fail(PdfFileManager.class, "Couldn't close the stream, check if it already opened.", rootCauseException);
        }

        // Delete the file from target folder for next run

        if (deleteFileAfterValidation == DeleteFileAfterValidationStatus.TRUE) {
            try {
                FileUtils.forceDelete(file);
            } catch (IOException rootCauseException) {
                FailureReporter.fail(PdfFileManager.class, "Couldn't find the file, File directory may be null or file is not found.", rootCauseException);
            }
        }

    }

    @SuppressWarnings("unused")
    public enum DeleteFileAfterValidationStatus {
        TRUE, FALSE
    }

    // Will implement same for preview pdf file
}