All downloads are free. Search and download functionality uses the official Maven repository.

io.github.perplexhub.pdf2excel.Pdf2Excel Maven / Gradle / Ivy

The newest version!
package io.github.perplexhub.pdf2excel;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;

import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.eadge.extractpdfexcel.PdfConverter;
import org.eadge.extractpdfexcel.data.ExtractedData;
import org.eadge.extractpdfexcel.data.SortedData;
import org.eadge.extractpdfexcel.data.XclPage;
import org.eadge.extractpdfexcel.models.TextBlockIdentifier;
import org.eadge.extractpdfexcel.process.extraction.PdfParser;

import com.itextpdf.text.pdf.PdfReader;

/**
 * Static entry points for converting a PDF document into an Excel workbook.
 *
 * <p>The PDF is read with iText's {@link PdfReader}, its text blocks are
 * extracted and sorted by the extractpdfexcel library, and each resulting
 * {@link XclPage} is written to its own sheet of an {@link HSSFWorkbook}.
 */
public abstract class Pdf2Excel {

	/**
	 * Converts an unprotected PDF into an Excel workbook.
	 *
	 * <p>Equivalent to calling {@link #convert(InputStream, String)} with a
	 * {@code null} password.
	 *
	 * @param inputStream stream supplying the PDF document
	 * @return a buffer holding the serialized workbook
	 * @throws IOException if reading the PDF or writing the workbook fails
	 */
	public static ByteArrayOutputStream convert(InputStream inputStream) throws IOException {
		return convert(inputStream, null);
	}

	/**
	 * Converts a PDF, optionally password-protected, into an Excel workbook.
	 *
	 * <p>One sheet is created per extracted page, named {@code "page 1"},
	 * {@code "page 2"}, and so on.
	 *
	 * @param inputStream stream supplying the PDF document
	 * @param password    password used to open the PDF; {@code null} or an
	 *                    empty string means the document is opened without one
	 * @return a buffer holding the serialized workbook
	 * @throws IOException if reading the PDF or writing the workbook fails
	 */
	public static ByteArrayOutputStream convert(InputStream inputStream, String password) throws IOException {
		// NOTE(review): getBytes() uses the platform default charset, so a
		// non-ASCII password may be encoded differently on different machines.
		PdfReader pdf = password != null && password.length() > 0
				? new PdfReader(inputStream, password.getBytes())
				: new PdfReader(inputStream);
		TextBlockIdentifier textBlockIdentifier = new TextBlockIdentifier();
		PdfParser parser = new PdfParser(pdf, textBlockIdentifier);
		try {
			parser.readAllPage();

			if (textBlockIdentifier.cleanDuplicated) {
				parser.cleanDuplicatedData();
			}
			if (textBlockIdentifier.mergeFactor > 1.0) {
				parser.mergeBlocks(textBlockIdentifier.mergeFactor);
			}
		} finally {
			// Release the parser even when reading or post-processing fails.
			parser.close();
		}

		// The extracted data is fetched after the parser is closed, as before.
		ExtractedData extractedData = parser.getExtractedData();
		SortedData sortedData = PdfConverter.sortExtractedData(extractedData, 0, 1);
		// The element type is required for the typed for-each loop below.
		ArrayList<XclPage> excelPages = PdfConverter.createExcelPages(sortedData);

		HSSFWorkbook workbook = new HSSFWorkbook();
		int page = 1;
		for (XclPage excelPage : excelPages) {
			// createExcelSheet attaches the new sheet to the workbook it is given,
			// so the returned sheet does not need to be retained here.
			PdfConverter.createExcelSheet("page " + page, workbook, excelPage, 0, 0);
			page++;
		}

		ByteArrayOutputStream outStream = new ByteArrayOutputStream();
		workbook.write(outStream);
		return outStream;
	}

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy