/*
 * org.eadge.extractpdfexcel.PdfConverter Maven / Gradle / Ivy
 *
 * Go to download
 * Show more of this group Show more artifacts with this name
 * Show all versions of pdf-to-excel Show documentation
 * A Java library used to convert PDF to Excel.
 * The newest version!
 */
package org.eadge.extractpdfexcel;

import com.itextpdf.text.pdf.PdfReader;
import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.eadge.extractpdfexcel.data.*;
import org.eadge.extractpdfexcel.data.array.My2DArray;
import org.eadge.extractpdfexcel.data.block.Block;
import org.eadge.extractpdfexcel.data.lane.Lanes;
import org.eadge.extractpdfexcel.debug.display.FrameCreator;
import org.eadge.extractpdfexcel.exception.IncorrectFileTypeException;
import org.eadge.extractpdfexcel.models.TextBlockIdentifier;
import org.eadge.extractpdfexcel.process.arrangement.BlockSorter;
import org.eadge.extractpdfexcel.process.extraction.PdfParser;

import javax.swing.*;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;

/**
 * Created by eadgyo on 12/07/16.
 *
 * Extracts data from a PDF and converts it to Excel pages.
 *
 * The full conversion process is divided into 4 operations:
 *   1. Extract extractedData from the PDF
 *   2. Sort extractedData into columns and lines
 *   3. Create Excel pages from the sorted data
 *   4. Add the xcl pages to your Excel workbook
 *
 * Operations must be done in the designated order.
 */
public class PdfConverter
{
    /** Multiplier applied to an XclPage line height before it is passed to {@link HSSFRow#setHeight(short)}. */
    private static final int LINE_HEIGHT_SCALE = 50;

    /** Multiplier applied to an XclPage column width before it is passed to {@link HSSFSheet#setColumnWidth(int, int)}. */
    private static final int COLUMN_WIDTH_SCALE = 20;

    /** Largest width accepted by POI's setColumnWidth: 255 characters, expressed in 1/256th of a character. */
    private static final int MAX_COLUMN_WIDTH = 255 * 256;

    /**
     * Extract data from a PDF using the default text block identifier.
     *
     * Delegates to {@link #extractFromFile(String, TextBlockIdentifier)} with a newly created
     * {@link TextBlockIdentifier}.
     *
     * @param path pdf file location
     *
     * @return extractedData separate in blocks containing text and position for each page.
     *
     * @throws FileNotFoundException      if no file exists at {@code path}
     * @throws IncorrectFileTypeException if the file exists but could not be opened as a pdf
     */
    public static ExtractedData extractFromFile(String path) throws
                                                             FileNotFoundException,
                                                             IncorrectFileTypeException
    {
        return extractFromFile(path, new TextBlockIdentifier());
    }

    /**
     * Extract data from a PDF.
     *
     * Reads the pdf file at the given location and extracts the content of all its pages. Depending on the
     * identifier settings, duplicated data is then cleaned ({@code cleanDuplicated}) and blocks are merged
     * (only when {@code mergeFactor} is greater than 1.0).
     *
     * @param path                pdf file location
     * @param textBlockIdentifier TextBlockIdentifier define text separation parameters (space between two letters to
     *                            create a space, height between two letters creating new line).
     *
     * @return extractedData separate in blocks containing text and position for each page.
     *
     * @throws FileNotFoundException      if no file exists at {@code path}
     * @throws IncorrectFileTypeException if the file exists but could not be opened as a pdf
     */
    public static ExtractedData extractFromFile(String path,
                                                TextBlockIdentifier textBlockIdentifier) throws
                                                                                         FileNotFoundException,
                                                                                         IncorrectFileTypeException
    {
        // Guard clause: a missing file is reported before any pdf parsing is attempted
        if (!new File(path).exists())
        {
            throw new FileNotFoundException(path);
        }

        PdfReader pdf;
        try
        {
            pdf = new PdfReader(path);
        }
        catch (IOException e)
        {
            // The file exists but the reader could not open it: reported as a wrong file type.
            // NOTE(review): the IOException cause is dropped here; attach it if IncorrectFileTypeException
            // offers a (message, cause) constructor.
            throw new IncorrectFileTypeException(path);
        }

        // Create parser to extract data from pdf
        PdfParser parser = new PdfParser(pdf, textBlockIdentifier);
        try
        {
            // Extract all data
            parser.readAllPage();

            if (textBlockIdentifier.cleanDuplicated)
            {
                parser.cleanDuplicatedData();
            }
            if (textBlockIdentifier.mergeFactor > 1.0)
            {
                parser.mergeBlocks(textBlockIdentifier.mergeFactor);
            }
        }
        finally
        {
            // Always release the parser, even when reading, cleaning or merging fails
            parser.close();
        }

        // The data is fetched after close(), in the same order as before this change
        return parser.getExtractedData();
    }

    /**
     * Remove duplicated blocks.
     *
     * @param extractedData modified collection of blocks
     */
    public static void removeDuplicatedData(ExtractedData extractedData)
    {
        extractedData.cleanDuplicatedData();
    }

    /**
     * Merge near blocks following fonts and orientation rules.
     *
     * @param extractedData modified collection of blocks
     * @param mergeFactor   factor forwarded to {@code ExtractedData.mergeBlocks}
     */
    public static void mergeData(ExtractedData extractedData, double mergeFactor)
    {
        extractedData.mergeBlocks(mergeFactor);
    }

    /**
     * Remove duplicated data, then merge near blocks following fonts and orientation rules.
     *
     * @param extractedData modified collection of blocks
     * @param mergeFactor   factor forwarded to {@code ExtractedData.mergeBlocks}
     */
    public static void cleanAndMergeData(ExtractedData extractedData, double mergeFactor)
    {
        extractedData.cleanDuplicatedData();
        extractedData.mergeBlocks(mergeFactor);
    }

    /**
     * Sort extractedData in both columns and lines, using {@code SortedPage.DEFAULT_LINE_AXIS} and
     * {@code SortedPage.DEFAULT_COLUMN_AXIS} as axes.
     *
     * @param extractedData extracted pages from one pdf file
     *
     * @return Sorted Data from extracted pages. Data in extractedData are sorted in the right column and line. It
     * keeps page separation. Columns and lines contained sorted blocks according to Y-axis for columns and X-axis
     * for lines.
     */
    public static SortedData sortExtractedData(ExtractedData extractedData)
    {
        return sortExtractedData(extractedData, SortedPage.DEFAULT_LINE_AXIS, SortedPage.DEFAULT_COLUMN_AXIS);
    }

    /**
     * Sort extractedData in both columns and lines.
     *
     * Pages are visited from index 1 to {@code extractedData.numberOfPages()} inclusive. Pages for which
     * {@code getExtractedPage} returns null are skipped; every other page is sorted with
     * {@link #sortExtractedPage(ExtractedPage, int, int)} and inserted at the same index in the result.
     *
     * @param extractedData extracted pages from one pdf file
     * @param axisIndex     axis of lane, 0 for Line and 1 for Column
     * @param oppositeIndex opposite axis of lane, 1 for Line and 0 for Column
     *
     * @return Sorted Data from extracted pages. Data in extractedData are sorted in the right column and line. It
     * keeps page separation. Columns and lines contained sorted blocks according to Y-axis for columns and X-axis
     * for lines.
     */
    public static SortedData sortExtractedData(ExtractedData extractedData, int axisIndex, int oppositeIndex)
    {
        // Grouping all sortedPage
        SortedData sortedData = new SortedData();

        // Page indices start at one
        int pageCount = extractedData.numberOfPages();
        for (int pageIndex = 1; pageIndex <= pageCount; pageIndex++)
        {
            ExtractedPage extractedPage = extractedData.getExtractedPage(pageIndex);

            // Skip pages that have not been extracted
            if (extractedPage == null)
            {
                continue;
            }

            sortedData.insertPage(pageIndex, sortExtractedPage(extractedPage, axisIndex, oppositeIndex));
        }

        return sortedData;
    }

    /**
     * Sort extractedPage in both columns and lines.
     *
     * Each block in extractedPage is processed one by one. The block is first inserted in the lines (using
     * axisIndex as axis and oppositeIndex as opposite axis), then in the columns (with both indexes swapped).
     * The same insertion routine is therefore shared by the Line and Column process.
     *
     * @param extractedPage extracted page from one pdf file
     * @param axisIndex     axis of lane, 0 for Line and 1 for Column
     * @param oppositeIndex opposite axis of lane, 1 for Line and 0 for Column
     *
     * @return Sorted page linked to its extracted page. Columns and lines contained sorted blocks according to
     * Y-axis for columns and X-axis for lines.
     */
    public static SortedPage sortExtractedPage(ExtractedPage extractedPage,
                                               int axisIndex,
                                               int oppositeIndex)
    {
        // Start creating sortedPage data
        Lanes      columns    = new Lanes();
        Lanes      lines      = new Lanes();
        SortedPage sortedPage = new SortedPage(columns, lines);

        // Sort each block
        for (Block block : extractedPage.getBlocks())
        {
            // Insert in the correct line or create new one
            BlockSorter.insertInLanes(axisIndex, oppositeIndex, block, lines);

            // Insert in the correct column or create new one
            BlockSorter.insertInLanes(oppositeIndex, axisIndex, block, columns);
        }

        // NOTE: the optional post-pass BlockSorter.reinsertBlockMoreCollidingHigherLane is currently not run.

        // Link sortedPage to its extractedPage
        sortedPage.setLinkExtractedPage(extractedPage);

        return sortedPage;
    }

    /**
     * Create excel pages from sorted data.
     *
     * Pages are visited from index 1 to {@code sortedData.numberOfPages()} inclusive; pages for which
     * {@code getSortedPage} returns null are skipped.
     *
     * @param sortedData data sorted in column and line
     *
     * @return list of excel pages, one per sorted page found.
     */
    public static ArrayList<XclPage> createExcelPages(SortedData sortedData)
    {
        ArrayList<XclPage> xclPages = new ArrayList<>();

        int pageCount = sortedData.numberOfPages();
        for (int pageIndex = 1; pageIndex <= pageCount; pageIndex++)
        {
            SortedPage sortedPage = sortedData.getSortedPage(pageIndex);

            // Only pages that exist (have been loaded) are converted
            if (sortedPage != null)
            {
                xclPages.add(createExcelPage(sortedPage));
            }
        }

        return xclPages;
    }

    /**
     * Create one excel page from a sorted page, using its 2D array of blocks, columns width and lines height.
     *
     * @param sortedPage data sorted in column and line
     *
     * @return created excel page.
     */
    public static XclPage createExcelPage(SortedPage sortedPage)
    {
        return new XclPage(sortedPage.create2DArrayOfBlocks(),
                           sortedPage.getColumnsWidth(),
                           sortedPage.getLinesHeight());
    }

    /**
     * Create excel sheet using workbook and xclPage, filling box with formatted text.
     *
     * A row is created for every line of the page; a cell is created only where the page holds a block.
     *
     * NOTE(review): lineFactor and columnFactor currently only enable (non-zero) or disable (zero) the sizing of
     * rows and columns; the multipliers actually applied are fixed. Confirm whether the factors were meant to
     * scale the sizes.
     *
     * @param sheetName    name of the created excel sheet.
     * @param wb           used excel workBook to insert page
     * @param xclPage      excel page information. Containing cells and dimensions of columns and lines.
     * @param lineFactor   line size factor; rows keep their default height when 0
     * @param columnFactor column size factor; columns keep their default width when 0
     *
     * @return created excel sheet.
     */
    public static HSSFSheet createExcelSheet(String sheetName, HSSFWorkbook wb, XclPage xclPage, double lineFactor, double
            columnFactor)
    {
        // Create excel sheet
        HSSFSheet sheet = wb.createSheet(sheetName);

        int lineCount   = xclPage.numberOfLines();
        int columnCount = xclPage.numberOfColumns();

        // Parse columns and lines
        for (int line = 0; line < lineCount; line++)
        {
            HSSFRow createdLine = sheet.createRow(line);
            if (lineFactor != 0)
            {
                // Clamp before narrowing so a very tall line cannot wrap around to a negative height
                createdLine.setHeight((short) Math.min(xclPage.getLineHeight(line) * LINE_HEIGHT_SCALE,
                                                       Short.MAX_VALUE));
            }

            for (int col = 0; col < columnCount; col++)
            {
                Block block = xclPage.getBlockAt(col, line);

                // If there is a block at the given position
                if (block != null)
                {
                    // Create a new cell and add the block content in this new cell
                    HSSFCell createdCell = createdLine.createCell(col);
                    createdCell.setCellValue(block.getFormattedText());
                }
            }
        }

        // Set the width of each lane
        if (columnFactor != 0)
        {
            for (int col = 0; col < columnCount; col++)
            {
                // Scale first, then truncate (the cast used to apply before the multiplication), and stay
                // within the maximum width POI accepts
                sheet.setColumnWidth(col,
                                     (int) Math.min(xclPage.getColumnWidth(col) * COLUMN_WIDTH_SCALE,
                                                    MAX_COLUMN_WIDTH));
            }
        }

        return sheet;
    }

    /**
     * Convert a pdf file into excel sheets.
     *
     * The pdf is extracted, sorted and converted to excel pages; one sheet is then created in the workbook for
     * each page. Sheets are named "page 1", "page 2", ... following the order of the created pages.
     *
     * @param sourcePDFPath       path for the used source pdf
     * @param workbook            used workBook for created sheets
     * @param textBlockIdentifier defines the text separation parameters used during extraction
     * @param lineAxis            0 if the pdf is in portrait mode
     * @param columnAxis          1 if the pdf is in portrait mode
     * @param lineFactor          line size factor
     * @param columnFactor        column size factor
     *
     * @return created sheets, in page order.
     *
     * @throws FileNotFoundException      if no file exists at {@code sourcePDFPath}
     * @throws IncorrectFileTypeException if the file exists but could not be opened as a pdf
     */
    public static ArrayList<HSSFSheet> createExcelSheets(String sourcePDFPath,
                                                         HSSFWorkbook workbook,
                                                         TextBlockIdentifier textBlockIdentifier,
                                                         int lineAxis,
                                                         int columnAxis,
                                                         double lineFactor,
                                                         double columnFactor) throws FileNotFoundException,
                                                                                     IncorrectFileTypeException
    {
        // Extract data from the source pdf file
        ExtractedData extractedData = PdfConverter.extractFromFile(sourcePDFPath, textBlockIdentifier);

        // Sort Data
        SortedData sortedData = PdfConverter.sortExtractedData(extractedData, lineAxis, columnAxis);

        // Create 2D array pages containing information
        ArrayList<XclPage> excelPages = PdfConverter.createExcelPages(sortedData);

        // Create sheets for each pages
        ArrayList<HSSFSheet> sheets = new ArrayList<>(excelPages.size());
        int page = 1;
        for (XclPage excelPage : excelPages)
        {
            sheets.add(PdfConverter.createExcelSheet("page " + page, workbook, excelPage, lineFactor, columnFactor));
            page++;
        }

        return sheets;
    }

    /**
     * Convert a pdf file into excel sheets using a default TextBlockIdentifier, line axis 0, column axis 1 and
     * both size factors set to 0.
     *
     * @param sourcePDFPath path for the used source pdf
     * @param workbook      used workbook for creating sheets
     *
     * @return created sheets, in page order.
     *
     * @throws FileNotFoundException      if no file exists at {@code sourcePDFPath}
     * @throws IncorrectFileTypeException if the file exists but could not be opened as a pdf
     */
    public static ArrayList<HSSFSheet> createExcelSheets(String sourcePDFPath, HSSFWorkbook workbook) throws
                                                                                                    FileNotFoundException,
                                                                                                    IncorrectFileTypeException
    {
        return createExcelSheets(sourcePDFPath, workbook, new TextBlockIdentifier(), 0, 1, 0, 0);
    }

    /**
     * Create Excel file from pdf source.
     *
     * The pdf is converted into sheets of a new workbook, which is then written to xclPath. The output file is
     * only opened once the conversion has succeeded.
     *
     * @param sourcePDFPath       path for the used source pdf
     * @param xclPath             path for the created excel file
     * @param textBlockIdentifier defines the text separation parameters used during extraction
     * @param lineAxis            0 if the pdf is in portrait mode
     * @param columnAxis          1 if the pdf is in portrait mode
     * @param lineFactor          line size factor
     * @param columnFactor        column size factor
     *
     * @throws IOException if the pdf cannot be read or the excel file cannot be written
     */
    public static void createExcelFile(String sourcePDFPath, String xclPath,
                                       TextBlockIdentifier textBlockIdentifier,
                                       int lineAxis,
                                       int columnAxis,
                                       double lineFactor,
                                       double columnFactor) throws IOException
    {
        HSSFWorkbook workbook = new HSSFWorkbook();

        // The sheets are registered in the workbook; the returned list is not needed here
        createExcelSheets(sourcePDFPath,
                          workbook,
                          textBlockIdentifier,
                          lineAxis,
                          columnAxis,
                          lineFactor,
                          columnFactor);

        // try-with-resources closes the stream even when writing the workbook fails
        try (FileOutputStream out = new FileOutputStream(xclPath))
        {
            workbook.write(out);
        }
    }

    /**
     * Create Excel file from pdf source using a default TextBlockIdentifier, line axis 0, column axis 1 and both
     * size factors set to 0.
     *
     * @param sourcePDFPath path for the used source pdf
     * @param xclPath       path for the created excel file
     *
     * @throws IOException if the pdf cannot be read or the excel file cannot be written
     */
    public static void createExcelFile(String sourcePDFPath, String xclPath) throws IOException
    {
        createExcelFile(sourcePDFPath, xclPath, new TextBlockIdentifier(), 0, 1, 0, 0);
    }

    /**
     * Display the xclPage through {@code FrameCreator.displayXclPage}, with title "Xcl" and size arguments
     * 800 and 600.
     *
     * @param xclPage displayed xclPage
     */
    public static void displayXCLPage(XclPage xclPage)
    {
        FrameCreator.displayXclPage("Xcl", 800, 600, xclPage);
    }

    /**
     * Display the xclPages through {@code FrameCreator.displayXclPages}, with title "XCL" and size arguments
     * 800 and 600.
     *
     * @param xclPages displayed xclPages
     *
     * @return a new, empty collection of frames (see the review note in the body).
     */
    public static Collection<JFrame> displayXCLPages(Collection<XclPage> xclPages)
    {
        FrameCreator.displayXclPages("XCL", 800, 600, xclPages);

        // NOTE(review): nothing returned by FrameCreator.displayXclPages is collected, so the result is always
        // empty. Confirm whether the created frames were meant to be returned to the caller.
        return new ArrayList<JFrame>();
    }

    /**
     * Convert the pdf to xcl pages and display them.
     *
     * @param sourcePdf displayed pdf converted to xcl
     *
     * @return the collection returned by {@link #displayXCLPages(Collection)}.
     *
     * @throws FileNotFoundException      if no file exists at {@code sourcePdf}
     * @throws IncorrectFileTypeException if the file exists but could not be opened as a pdf
     */
    public static Collection<JFrame> displayXCLPage(String sourcePdf) throws FileNotFoundException, IncorrectFileTypeException
    {
        return displayXCLPages(convertFileToXclPages(sourcePdf));
    }

    /**
     * Convert a pdf file to xcl pages using a default TextBlockIdentifier, line axis 0 and column axis 1.
     *
     * @param sourcePdf path for the used source pdf
     *
     * @return list of excel pages created from the pdf.
     *
     * @throws FileNotFoundException      if no file exists at {@code sourcePdf}
     * @throws IncorrectFileTypeException if the file exists but could not be opened as a pdf
     */
    public static ArrayList<XclPage> convertFileToXclPages(String sourcePdf) throws FileNotFoundException,
                                                                                    IncorrectFileTypeException
    {
        // Extract data from the source pdf file
        ExtractedData extractedData = PdfConverter.extractFromFile(sourcePdf, new TextBlockIdentifier());

        // Sort Data
        SortedData sortedData = PdfConverter.sortExtractedData(extractedData, 0, 1);

        // Create 2D array pages containing information
        return PdfConverter.createExcelPages(sortedData);
    }
}