All downloads are free. The search and download features use the official Maven repository.

org.eadge.extractpdfexcel.PdfConverter Maven / Gradle / Ivy

The newest version!
package org.eadge.extractpdfexcel;

import com.itextpdf.text.pdf.PdfReader;
import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.eadge.extractpdfexcel.data.*;
import org.eadge.extractpdfexcel.data.array.My2DArray;
import org.eadge.extractpdfexcel.data.block.Block;
import org.eadge.extractpdfexcel.data.lane.Lanes;
import org.eadge.extractpdfexcel.debug.display.FrameCreator;
import org.eadge.extractpdfexcel.exception.IncorrectFileTypeException;
import org.eadge.extractpdfexcel.models.TextBlockIdentifier;
import org.eadge.extractpdfexcel.process.arrangement.BlockSorter;
import org.eadge.extractpdfexcel.process.extraction.PdfParser;

import javax.swing.*;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;

/**
 * Created by eadgyo on 12/07/16.
 * 

* Extract data from pdf and convert it to excel page * *

* Full conversion process is divided in 4 operations
* 1-Extract extractedData from PDF
* 2-Sort extractedData in columns and lines
* 3-Create Excel page from Sorted Data
* 4-Add xcl Page to your excel workBook. *

* *

* Operations must be done in designated order. *

*/ public class PdfConverter { /** * Extract transformedData from PDF *

* Read pdf file from path location. If file exists and is in pdf format, extract all text from all pages in pdf * file. If file exists but is not a pdf throw IncorrectFileException. If file doesn't exist or is not * readable throw NoSuchFileException. * * Using the default text block identifier *

* * @param path pdf file location * * @return extractedData separate in blocks containing text and position for each page. */ public static ExtractedData extractFromFile(String path) throws FileNotFoundException, IncorrectFileTypeException { return extractFromFile(path, new TextBlockIdentifier()); } /** * Extract transformedData from PDF *

* Read pdf file from path location. If file exists and is in pdf format, extract all text from all pages in pdf * file. If file exists but is not a pdf throw IncorrectFileException. If file doesn't exist or is not * readable throw NoSuchFileException. *

* * @param path pdf file location * @param textBlockIdentifier TextBlockIdentifier define text separation parameters (space between to letters to * create a space, height between to letters creating new line). * * @return extractedData separate in blocks containing text and position for each page. */ public static ExtractedData extractFromFile(String path, TextBlockIdentifier textBlockIdentifier) throws FileNotFoundException, IncorrectFileTypeException { // Create file reader File file = new File(path); if (file.exists()) { // Create Pdf Extractor using path location PdfReader pdf = null; try { pdf = new PdfReader(path); } catch (IOException e) { // pdf is not readable throw new IncorrectFileTypeException(path); } // Pdf is readable // Create parser to extract data from pdf PdfParser parser = new PdfParser(pdf, textBlockIdentifier); // Extract all data parser.readAllPage(); if (textBlockIdentifier.cleanDuplicated) parser.cleanDuplicatedData(); if (textBlockIdentifier.mergeFactor > 1.0) parser.mergeBlocks(textBlockIdentifier.mergeFactor); parser.close(); // return extractedData extracted with parser return parser.getExtractedData(); } else { throw new FileNotFoundException(path); } } /** * Remove duplicated blocks * @param extractedData modified collection of blocks */ public static void removeDuplicatedData(ExtractedData extractedData) { extractedData.cleanDuplicatedData(); } /** * Merge near blocks following fonts and orientation rules * @param extractedData modified colleciton of blocks */ public static void mergeData(ExtractedData extractedData, double mergeFactor) { extractedData.mergeBlocks(mergeFactor); } /** * Remove duplicated data * Merge near blocks following fonts and orientation rules * @param extractedData modified colleciton of blocks */ public static void cleanAndMergeData(ExtractedData extractedData, double mergeFactor) { extractedData.cleanDuplicatedData(); extractedData.mergeBlocks(mergeFactor); } /** * Sort extractedData in both columns and 
lines. * *

* Each block in extractedData are processed one by one. Block is first added to the right column, then in * the right line. *

* *

* axisIndex and oppositeIndex are used to merge Line and Column process. *

* * @param extractedData extracted pages from one pdf file * //@param reinsertBlockMoreCollidingHigherLane if true, at the end of insert process, move block to higher lane, * if colliding percent between block and higher lane is higher than * block and actual block lane. * * @return Sorted Data from extracted pages. Data in extractedData are sorted in the right column and line. It * keeps page separation. Columns and lines contained sorted blocks according to Y-axis for columns and X-axis * for lines. */ public static SortedData sortExtractedData(ExtractedData extractedData) { return sortExtractedData(extractedData, SortedPage.DEFAULT_LINE_AXIS, SortedPage.DEFAULT_COLUMN_AXIS); } /** * Sort extractedData in both columns and lines. * *

* Each block in extractedData are processed one by one. Block is first added to the right column, then in * the right line. *

* *

* axisIndex and oppositeIndex are used to merge Line and Column process. *

* * @param extractedData extracted pages from one pdf file * @param axisIndex axis of lane, 0 for Line and 1 for Column * @param oppositeIndex opposite axis of lane, 1 for Line and 0 for Column * //@param reinsertBlockMoreCollidingHigherLane if true, at the end of insert process, move block to higher lane, * if colliding percent between block and higher lane is higher than * block and actual block lane. * * @return Sorted Data from extracted pages. Data in extractedData are sorted in the right column and line. It * keeps page separation. Columns and lines contained sorted blocks according to Y-axis for columns and X-axis * for lines. */ public static SortedData sortExtractedData(ExtractedData extractedData, int axisIndex, int oppositeIndex) { // Grouping all sortedPage SortedData sortedData = new SortedData(); // For each extractedPage // Start at one for (int i = 1; i <= extractedData.numberOfPages(); i++) { ExtractedPage extractedPage = extractedData.getExtractedPage(i); // If page has been extracted if (extractedPage != null) { // Create sortedPage SortedPage sortedPage = sortExtractedPage(extractedPage, axisIndex, oppositeIndex); // Add sortedPage to sortedData sortedData.insertPage(i, sortedPage); } } return sortedData; } /** * Sort extractedPage in both columns and lines. * *

* Each block in extractedPage are processed one by one. Block is first added to the right column, then in * the right line. *

* *

* axisIndex and oppositeIndex are used to merge Line and Column process. *

* * @param extractedPage extracted page from one pdf file * @param axisIndex axis of lane, 0 for Line and 1 for Column * @param oppositeIndex opposite axis of lane, 1 for Line and 0 for Column * //@param reinsertBlockMoreCollidingHigherLane if true, at the end of insert process, move block to higher lane, * if colliding percent between block and higher lane is higher than * block and actual block lane. * * @return Sorted Data from extracted pages. Data in extractedData are sorted in the right column and line. It * keeps page separation. Columns and lines contained sorted blocks according to Y-axis for columns and X-axis * for lines. */ public static SortedPage sortExtractedPage(ExtractedPage extractedPage, int axisIndex, int oppositeIndex) { // Start creating sortedPage data Lanes columns = new Lanes(); Lanes lines = new Lanes(); SortedPage sortedPage = new SortedPage(columns, lines); Collection blocks = extractedPage.getBlocks(); // Sort each block for (Block block : blocks) { // Insert in the correct line or create new one BlockSorter.insertInLanes(axisIndex, oppositeIndex, block, lines); // Insert in the correct column or create new one BlockSorter.insertInLanes(oppositeIndex, axisIndex, block, columns); } // If end reinserting block option is activated /* if (reinsertBlockMoreCollidingHigherLane) { BlockSorter.reinsertBlockMoreCollidingHigherLane(axisIndex, oppositeIndex, lines); BlockSorter.reinsertBlockMoreCollidingHigherLane(oppositeIndex, axisIndex, columns); }*/ // Link sortedPage to his extractedPage sortedPage.setLinkExtractedPage(extractedPage); return sortedPage; } /** * Create excel pages from sorted data, using 2D array created, * * @param sortedData data sorted in column and line * * @return list of excel pages. 
*/ public static ArrayList createExcelPages(SortedData sortedData) { ArrayList xclPages = new ArrayList<>(); for (int pageIndex = 1; pageIndex <= sortedData.numberOfPages(); pageIndex++) { // Get sortedPage SortedPage sortedPage = sortedData.getSortedPage(pageIndex); // If page exists (has been loaded) if (sortedPage != null) { xclPages.add(createExcelPage(sortedPage)); } } // return created Excel page return xclPages; } /** * Create excel page from sorted data, using 2D array created, * * @param sortedPage data sorted in column and line * * @return list of excel pages. */ public static XclPage createExcelPage(SortedPage sortedPage) { // Create 2D array containing blocks using sorted lines and columns from sortedData My2DArray arrayOfBlocks = sortedPage.create2DArrayOfBlocks(); ArrayList columnsSize = sortedPage.getColumnsWidth(); ArrayList linesSize = sortedPage.getLinesHeight(); // return created Excel page return new XclPage(arrayOfBlocks, columnsSize, linesSize); } /** * Create excel sheet using workbook and xclPage, filling box with formatted text. * * @param sheetName name of the created excel sheet. * @param wb used excel workBook to insert page * @param xclPage excel page information. Containing cells and dimensions of columns and lines. * @param lineFactor line size factor * @param columnFactor column size factor * @return created excel sheet. 
*/ public static HSSFSheet createExcelSheet(String sheetName, HSSFWorkbook wb, XclPage xclPage, double lineFactor, double columnFactor) { // Create excel sheet HSSFSheet sheet = wb.createSheet(sheetName); // Parse columns and lines for (int line = 0; line < xclPage.numberOfLines(); line++) { HSSFRow createdLine = sheet.createRow(line); if (lineFactor != 0) createdLine.setHeight((short) (xclPage.getLineHeight(line) * 50)); for (int col = 0; col < xclPage.numberOfColumns(); col++) { Block block = xclPage.getBlockAt(col, line); // If there is a block a the given position if (block != null) { // Create a new cell and add the block content in this new cell HSSFCell createdCell = createdLine.createCell(col); createdCell.setCellValue(block.getFormattedText()); } } } // Set the width of each lane if (columnFactor != 0) { for (int col = 0; col < xclPage.numberOfColumns(); col++) { sheet.setColumnWidth(col, (int) xclPage.getColumnWidth(col) * 20); } } return null; } /** * Convert a pdf file into an excel sheets * @param sourcePDFPath path for the used source pdf * @param workbook used workBook for created sheets * @param textBlockIdentifier defines parameter used to * @param lineAxis 0 if the pdf is in portray mode * @param columnAxis 1 if the pdf is in portray mode * @param lineFactor line size factor * @param columnFactor column size factor */ public static ArrayList createExcelSheets(String sourcePDFPath, HSSFWorkbook workbook, TextBlockIdentifier textBlockIdentifier, int lineAxis, int columnAxis, double lineFactor, double columnFactor) throws FileNotFoundException, IncorrectFileTypeException { ArrayList sheets = new ArrayList<>(); // Extract data from the source pdf file ExtractedData extractedData = PdfConverter.extractFromFile(sourcePDFPath, textBlockIdentifier); // Sort Data SortedData sortedData = PdfConverter.sortExtractedData(extractedData, lineAxis, columnAxis); // Create 2D array pages containing information ArrayList excelPages = 
PdfConverter.createExcelPages(sortedData); // Create sheets for each pages int page = 1; for (XclPage excelPage : excelPages) { HSSFSheet excelSheet = PdfConverter.createExcelSheet("page " + page, workbook, excelPage, lineFactor, columnFactor); sheets.add(excelSheet); page++; } return sheets; } /** * Convert a pdf file into an excel sheets * @param sourcePDFPath path for the used source pdf * @param workbook used workbook for creating sheets */ public static ArrayList createExcelSheets(String sourcePDFPath, HSSFWorkbook workbook) throws FileNotFoundException, IncorrectFileTypeException { return createExcelSheets(sourcePDFPath, workbook, new TextBlockIdentifier(), 0, 1, 0, 0); } /** * Create Excel file from pdf source * @param sourcePDFPath path for the used source pdf * @param xclPath path for the created excel file * @param textBlockIdentifier defines parameter used to * @param lineAxis 0 if the pdf is in portray mode * @param columnAxis 1 if the pdf is in portray mode * @param lineFactor line size factor * @param columnFactor column size factor */ public static void createExcelFile(String sourcePDFPath, String xclPath, TextBlockIdentifier textBlockIdentifier, int lineAxis, int columnAxis, double lineFactor, double columnFactor) throws IOException { HSSFWorkbook workbook = new HSSFWorkbook(); ArrayList excelSheets = createExcelSheets(sourcePDFPath, workbook, textBlockIdentifier, lineAxis, columnAxis, lineFactor, columnFactor); FileOutputStream out = new FileOutputStream(xclPath); workbook.write(out); out.close(); } /** * Create Excel file from pdf source * @param sourcePDFPath path for the used source pdf * @param xclPath path for the created excel file */ public static void createExcelFile(String sourcePDFPath, String xclPath) throws IOException { createExcelFile(sourcePDFPath, xclPath, new TextBlockIdentifier(),0, 1,0, 0); } /** * Display the xclPage * @param xclPage displayed xclPage */ public static void displayXCLPage(XclPage xclPage) { 
FrameCreator.displayXclPage("Xcl", 800, 600, xclPage); } /** * Display the xclPages * @param xclPages displayed xclPages */ public static Collection displayXCLPages(Collection xclPages) { ArrayList frames = new ArrayList<>(); FrameCreator.displayXclPages("XCL", 800, 600, xclPages); return frames; } /** * Display the xclPage * @param sourcePdf displayed pdf converted to xcl */ public static Collection displayXCLPage(String sourcePdf) throws FileNotFoundException, IncorrectFileTypeException { ArrayList xclPages = convertFileToXclPages(sourcePdf); Collection frames = displayXCLPages(xclPages); return frames; } public static ArrayList convertFileToXclPages(String sourcePdf) throws FileNotFoundException, IncorrectFileTypeException { // Extract data from the source pdf file ExtractedData extractedData = PdfConverter.extractFromFile(sourcePdf, new TextBlockIdentifier()); // Sort Data SortedData sortedData = PdfConverter.sortExtractedData(extractedData, 0, 1); // Create 2D array pages containing information return PdfConverter.createExcelPages(sortedData); } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy