com.mindee.pdf.PDFUtils Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of mindee-api-java Show documentation
Show all versions of mindee-api-java Show documentation
Java Library to call Mindee's Off-The-Shelf and Custom APIs
The newest version!
package com.mindee.pdf;
import com.mindee.input.LocalInputSource;
import java.awt.Color;
import java.awt.image.BufferedImage;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.pdfbox.cos.COSDictionary;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageContentStream;
import org.apache.pdfbox.pdmodel.PDResources;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.font.PDType1Font;
import org.apache.pdfbox.pdmodel.graphics.color.PDColor;
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
import org.apache.pdfbox.rendering.ImageType;
import org.apache.pdfbox.rendering.PDFRenderer;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.TextPosition;
/**
* Utilities for working with PDFs.
*/
public final class PDFUtils {
// Private, empty constructor: this final class only exposes static helpers.
private PDFUtils() {
}
/**
 * Get the number of pages in the PDF.
 *
 * @param inputSource The PDF file.
 * @return The number of pages in the document.
 * @throws IOException If loading or closing the document fails.
 */
public static int getNumberOfPages(LocalInputSource inputSource) throws IOException {
  // try-with-resources releases the document even when reading it fails.
  try (PDDocument document = PDDocument.load(inputSource.getFile())) {
    return document.getNumberOfPages();
  }
}
/**
 * Create a copy of a page without its annotations.
 *
 * @param page The page to copy.
 * @return A new page backed by a copy of the original page dictionary,
 *     with the annotations entry removed.
 */
private static PDPage clonePage(PDPage page) {
  COSDictionary copiedDict = new COSDictionary(page.getCOSObject());
  copiedDict.removeItem(COSName.ANNOTS);
  PDPage copy = new PDPage(copiedDict);
  return copy;
}
/**
 * Build a new PDF from selected pages of an existing document.
 *
 * @param document The source document.
 * @param pageNumbers Zero-based indexes of the pages to copy, in output order.
 *     Indexes at or beyond the page count are skipped.
 * @param closeOriginal Whether to close the source document before returning.
 * @return The new PDF, as bytes.
 * @throws IOException If saving or closing a document fails.
 */
private static byte[] createPdfFromExistingPdf(
    PDDocument document,
    List<Integer> pageNumbers,
    boolean closeOriginal
) throws IOException {
  try (
      ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
      PDDocument newDocument = new PDDocument()
  ) {
    int pageCount = document.getNumberOfPages();
    for (Integer pageNumber : pageNumbers) {
      if (pageNumber < pageCount) {
        newDocument.addPage(clonePage(document.getPage(pageNumber)));
      }
    }
    // Save before the source is closed: the copied pages presumably still
    // reference objects owned by the source document.
    newDocument.save(outputStream);
    return outputStream.toByteArray();
  } finally {
    // Honour the caller's request even when building or saving failed.
    if (closeOriginal) {
      document.close();
    }
  }
}
/**
 * Merge specified PDF pages together.
 *
 * @param file The PDF file.
 * @param pageNumbers List of zero-based page indexes to merge together.
 *     Indexes at or beyond the page count are ignored.
 * @return The merged PDF, as bytes.
 * @throws IOException If the file cannot be loaded, or the merged PDF cannot be saved.
 */
public static byte[] mergePdfPages(File file, List<Integer> pageNumbers) throws IOException {
  // The document is opened here, so it is closed here, even when the merge fails.
  try (PDDocument document = PDDocument.load(file)) {
    return createPdfFromExistingPdf(document, pageNumbers, false);
  }
}
/**
 * Merge specified pages of an already loaded PDF together.
 * The source document is closed once the merged PDF has been produced.
 *
 * @param document The loaded PDF document.
 * @param pageNumbers List of zero-based page indexes to merge together.
 * @return The merged PDF, as bytes.
 * @throws IOException If the merged PDF cannot be saved.
 */
public static byte[] mergePdfPages(PDDocument document, List<Integer> pageNumbers)
    throws IOException {
  return mergePdfPages(document, pageNumbers, true);
}
/**
 * Merge specified pages of an already loaded PDF together.
 *
 * @param document The loaded PDF document.
 * @param pageNumbers List of zero-based page indexes to merge together.
 * @param closeOriginal Whether to close the source document afterwards.
 * @return The merged PDF, as bytes.
 * @throws IOException If the merged PDF cannot be saved.
 */
public static byte[] mergePdfPages(
    PDDocument document,
    List<Integer> pageNumbers,
    boolean closeOriginal
) throws IOException {
  return createPdfFromExistingPdf(document, pageNumbers, closeOriginal);
}
/**
 * Check whether a PDF file has no content.
 * The loaded document is handed to {@code checkIfPdfIsEmpty}, which closes it.
 *
 * @param file The PDF file.
 * @return {@code true} if no page declares any XObject or font resource.
 * @throws IOException If the file cannot be loaded or closed.
 */
public static boolean isPdfEmpty(File file) throws IOException {
  PDDocument document = PDDocument.load(file);
  return checkIfPdfIsEmpty(document);
}
/**
 * Check whether a document has no content, then close it.
 * A document counts as empty when none of its pages declares an XObject
 * or a font in its resources. Pages without resources are skipped.
 *
 * @param document The loaded PDF document. Always closed by this method.
 * @return {@code true} if no page declares any XObject or font resource.
 * @throws IOException If closing the document fails.
 */
private static boolean checkIfPdfIsEmpty(PDDocument document) throws IOException {
  try {
    for (PDPage page : document.getPages()) {
      PDResources resources = page.getResources();
      if (resources == null) {
        continue;
      }
      // Probe the iterators directly: a spliterator's exact size is -1 when
      // unknown, which a "!= 0" comparison would misread as "has content".
      if (resources.getXObjectNames().iterator().hasNext()
          || resources.getFontNames().iterator().hasNext()) {
        return false;
      }
    }
    return true;
  } finally {
    document.close();
  }
}
/**
 * Render all pages of a PDF as images.
 * Converting PDFs with hundreds of pages may result in a heap space error.
 *
 * @param filePath The path to the PDF file.
 * @return List of all pages as images.
 * @throws IOException If the file cannot be read or rendered.
 */
public static List<PdfPageImage> pdfToImages(String filePath) throws IOException {
  return pdfToImages(new LocalInputSource(filePath));
}
/**
 * Render all pages of a PDF as images.
 * Converting PDFs with hundreds of pages may result in a heap space error.
 *
 * @param source The PDF file.
 * @return List of all pages as images, in page order.
 * @throws IOException If the document cannot be loaded or a page cannot be rendered.
 */
public static List<PdfPageImage> pdfToImages(LocalInputSource source) throws IOException {
  // try-with-resources closes the document even when rendering a page fails.
  try (PDDocument document = PDDocument.load(source.getFile())) {
    PDFRenderer pdfRenderer = new PDFRenderer(document);
    int pageCount = document.getNumberOfPages();
    List<PdfPageImage> pdfPageImages = new ArrayList<>(pageCount);
    for (int i = 0; i < pageCount; i++) {
      BufferedImage imageBuffer = pdfPageToImageBuffer(i, document, pdfRenderer);
      pdfPageImages.add(new PdfPageImage(imageBuffer, i, source.getFilename(), "jpg"));
    }
    return pdfPageImages;
  }
}
/**
 * Render a single page of a PDF as an image.
 * Main use case is for processing PDFs with hundreds of pages.
 * If you need to only render some pages from the PDF, use {@code mergePdfPages}
 * and then {@code pdfToImages}.
 *
 * @param filePath The path to the PDF file.
 * @param pageNumber The page number to render, first page is 1.
 * @return The page as an image.
 */
public static PdfPageImage pdfPageToImage(String filePath, int pageNumber) throws IOException {
  LocalInputSource source = new LocalInputSource(filePath);
  return pdfPageToImage(source, pageNumber);
}
/**
 * Render a single page of a PDF as an image.
 * Main use case is for processing PDFs with hundreds of pages.
 * If you need to only render some pages from the PDF, use {@code mergePdfPages}
 * and then {@code pdfToImages}.
 *
 * @param source The PDF file.
 * @param pageNumber The page number to render, first page is 1.
 * @return The page as an image.
 * @throws IOException If the document cannot be loaded or the page cannot be rendered.
 */
public static PdfPageImage pdfPageToImage(
    LocalInputSource source,
    int pageNumber
) throws IOException {
  // Callers count pages from 1; the renderer and the resulting image use a zero-based index.
  int index = pageNumber - 1;
  BufferedImage imageBuffer;
  // try-with-resources closes the document even when the page lookup or rendering throws.
  try (PDDocument document = PDDocument.load(source.getFile())) {
    PDFRenderer pdfRenderer = new PDFRenderer(document);
    imageBuffer = pdfPageToImageBuffer(index, document, pdfRenderer);
  }
  return new PdfPageImage(imageBuffer, index, source.getFilename(), "jpg");
}
/**
 * Render one page to an RGB image, choosing the DPI from the page's area.
 *
 * @param index Zero-based index of the page to render.
 * @param document The loaded document, used to read the page's bounding box.
 * @param pdfRenderer The renderer bound to the document.
 * @return The rendered page.
 * @throws IOException If rendering fails.
 */
private static BufferedImage pdfPageToImageBuffer(
    int index,
    PDDocument document,
    PDFRenderer pdfRenderer
) throws IOException {
  PDRectangle pageBox = document.getPage(index).getBBox();
  float area = pageBox.getWidth() * pageBox.getHeight();
  // The larger the page area, the lower the DPI: 300, then 250, then 200.
  int dpi = area < 200000 ? 300 : (area < 300000 ? 250 : 200);
  return pdfRenderer.renderImageWithDPI(index, dpi, ImageType.RGB);
}
/**
 * Serialize a document to a byte array.
 * The document is not closed by this method.
 *
 * @param document The document to save.
 * @return The saved PDF, as bytes.
 * @throws IOException If saving the document fails.
 */
public static byte[] documentToBytes(PDDocument document) throws IOException {
  ByteArrayOutputStream buffer = new ByteArrayOutputStream();
  document.save(buffer);
  byte[] bytes = buffer.toByteArray();
  return bytes;
}
/**
 * Re-draw the text of one page of a document onto a content stream.
 * Each string reported by the text stripper is written with the font and size
 * of its first character, at that character's position.
 *
 * @param inputDoc The document to extract text from.
 * @param contentStream The content stream to write the text to.
 * @param pageIndex Zero-based index of the page to extract text from.
 * @param disableSourceText When {@code true}, nothing is extracted or written.
 * @throws IOException If text extraction or writing to the content stream fails.
 */
public static void extractAndAddText(
    PDDocument inputDoc,
    PDPageContentStream contentStream,
    int pageIndex,
    boolean disableSourceText
) throws IOException {
  if (disableSourceText) {
    return;
  }
  PDFTextStripper stripper = new PDFTextStripper() {
    @Override
    protected void writeString(String text, List<TextPosition> textPositions)
        throws IOException {
      if (textPositions.isEmpty()) {
        return;
      }
      // The first character decides font, size and position for the whole string.
      TextPosition firstPosition = textPositions.get(0);
      float fontSize = firstPosition.getFontSizeInPt();
      PDColor color = getGraphicsState().getNonStrokingColor();
      contentStream.beginText();
      contentStream.setFont(firstPosition.getFont(), fontSize);
      contentStream.setNonStrokingColor(convertToAwtColor(color));
      float x = firstPosition.getXDirAdj();
      // Presumably flips the stripper's top-down y into the content stream's
      // bottom-up coordinate space.
      float y = firstPosition.getPageHeight() - firstPosition.getYDirAdj();
      contentStream.newLineAtOffset(x, y);
      try {
        contentStream.showText(text);
      } catch (IllegalArgumentException | UnsupportedOperationException e) {
        // The source font could not write this text: retry with Helvetica.
        contentStream.setFont(PDType1Font.HELVETICA, fontSize);
        contentStream.showText(text);
      }
      contentStream.endText();
    }
  };
  // The stripper is given pageIndex + 1 for both bounds, limiting it to one page.
  stripper.setStartPage(pageIndex + 1);
  stripper.setEndPage(pageIndex + 1);
  // The returned text is discarded: the call only drives the writeString callbacks.
  stripper.getText(inputDoc);
}
/**
 * Convert a PDF color to an AWT color, based on its number of components.
 * One component is treated as grayscale, three as RGB and four as CMYK.
 * Any other component count yields black.
 * Components are clamped to [0, 1] because {@link Color#Color(float, float, float)}
 * throws {@link IllegalArgumentException} for values outside that range.
 *
 * @param pdColor The PDF color.
 * @return The matching AWT color.
 */
private static Color convertToAwtColor(PDColor pdColor) {
  float[] components = pdColor.getComponents();
  if (components.length == 1) {
    // Grayscale
    float gray = clampToUnit(components[0]);
    return new Color(gray, gray, gray);
  } else if (components.length == 3) {
    // RGB
    return new Color(
        clampToUnit(components[0]),
        clampToUnit(components[1]),
        clampToUnit(components[2])
    );
  } else if (components.length == 4) {
    // CMYK (simplified conversion)
    float c = components[0];
    float m = components[1];
    float y = components[2];
    float k = components[3];
    float r = clampToUnit(1 - Math.min(1, c + k));
    float g = clampToUnit(1 - Math.min(1, m + k));
    float b = clampToUnit(1 - Math.min(1, y + k));
    return new Color(r, g, b);
  }
  return Color.BLACK;
}

/** Limit a color component to the [0, 1] range. */
private static float clampToUnit(float value) {
  return Math.max(0f, Math.min(1f, value));
}
/**
 * Draw an image over a whole page.
 * The image is placed at the origin and stretched to the page's width and height.
 *
 * @param contentStream The content stream of the page to draw on.
 * @param pdImage The image to draw.
 * @param pageSize The size of the page.
 * @throws IOException If writing to the content stream fails.
 */
public static void addImageToPage(
    PDPageContentStream contentStream,
    PDImageXObject pdImage,
    PDRectangle pageSize
) throws IOException {
  float width = pageSize.getWidth();
  float height = pageSize.getHeight();
  contentStream.drawImage(pdImage, 0, 0, width, height);
}
}