net.sourceforge.tess4j.util.PdfGsUtilities Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of tess4j Show documentation
Show all versions of tess4j Show documentation
# Tess4J
## Description:
A Java JNA wrapper for Tesseract OCR API.
Tess4J is released and distributed under the Apache License, v2.0.
## Features:
The library provides optical character recognition (OCR) support for:
- TIFF, JPEG, GIF, PNG, and BMP image formats
- Multi-page TIFF images
- PDF document format
/**
* Copyright @ 2009 Quan Nguyen
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package net.sourceforge.tess4j.util;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FilenameFilter;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.List;
import org.ghost4j.Ghostscript;
import org.ghost4j.GhostscriptException;
import org.slf4j.LoggerFactory;
/**
 * PDF utilities based on Ghostscript.
 * <p>
 * Each operation builds a Ghostscript argument list, runs the interpreter
 * obtained from {@link Ghostscript#getInstance()} and deletes that instance
 * afterwards. Ghostscript failures are reported as {@link RuntimeException}s
 * that carry the original exception as their cause.
 */
public class PdfGsUtilities {

    public static final String GS_INSTALL = "\nPlease download, install GPL Ghostscript from http://www.ghostscript.com\nand/or set the appropriate path variable.";
    private static final org.slf4j.Logger logger = LoggerFactory.getLogger(new LoggHelper().toString());

    /**
     * Converts PDF to TIFF format.
     * <p>
     * The PDF is first rendered to temporary PNG images, which are merged into
     * a single temporary TIFF file. The PNG images and their directory are
     * removed before returning, whether or not the conversion succeeded.
     *
     * @param inputPdfFile input file
     * @return a multi-page TIFF image
     * @throws IOException if a temporary file cannot be created or the merge
     * fails
     */
    public static File convertPdf2Tiff(File inputPdfFile) throws IOException {
        File[] pngFiles = null;
        File tiffFile = null;
        boolean merged = false;

        try {
            pngFiles = convertPdf2Png(inputPdfFile);
            tiffFile = File.createTempFile("multipage", ".tif");

            // put PNG images into a single multi-page TIFF image for return
            ImageIOHelper.mergeTiff(pngFiles, tiffFile);
            merged = true;
            return tiffFile;
        } finally {
            // do not leave a half-written TIFF behind when merging failed
            if (!merged && tiffFile != null) {
                tiffFile.delete();
            }

            if (pngFiles != null && pngFiles.length > 0) {
                // get the working directory of the PNG files
                File pngDirectory = pngFiles[0].getParentFile();

                // delete temporary PNG images
                for (File tempFile : pngFiles) {
                    tempFile.delete();
                }

                if (pngDirectory != null) {
                    pngDirectory.delete();
                }
            }
        }
    }

    /**
     * Converts PDF to PNG format.
     * <p>
     * Pages are rendered into a freshly created temporary directory as
     * <code>workingimageNNNN.png</code> files. The caller is responsible for
     * deleting the returned files and their directory.
     *
     * @param inputPdfFile input file
     * @return an array of PNG images sorted by file name; empty if Ghostscript
     * produced no image
     * @throws java.io.IOException if the temporary directory cannot be created
     */
    public static synchronized File[] convertPdf2Png(File inputPdfFile) throws IOException {
        Path path = Files.createTempDirectory("tessimages");
        File imageDir = path.toFile();

        //prepare Ghostscript interpreter parameters
        //refer to Ghostscript documentation for parameter usage
        List<String> gsArgs = new ArrayList<String>();
        // NOTE(review): presumably fills the program-name slot (argv[0]) — confirm against the Ghostscript API docs
        gsArgs.add("-gs");
        gsArgs.add("-dNOPAUSE");
        gsArgs.add("-dQUIET");
        gsArgs.add("-dBATCH");
        gsArgs.add("-dSAFER");
        gsArgs.add("-sDEVICE=pnggray");
        gsArgs.add("-r300");
        gsArgs.add("-dGraphicsAlphaBits=4");
        gsArgs.add("-dTextAlphaBits=4");
        gsArgs.add("-sOutputFile=" + imageDir.getPath() + "/workingimage%04d.png");
        gsArgs.add(inputPdfFile.getPath());

        try {
            runGhostscript(gsArgs);
        } finally {
            // drop the working directory right away when nothing was rendered
            String[] produced = imageDir.list();
            if (produced != null && produced.length == 0) {
                imageDir.delete();
            }
        }

        // find working files
        File[] workingFiles = imageDir.listFiles(new FilenameFilter() {
            @Override
            public boolean accept(File dir, String name) {
                return name.toLowerCase().matches("workingimage\\d{4}\\.png$");
            }
        });

        // listFiles yields null once the (empty) directory has been deleted above
        if (workingFiles == null) {
            return new File[0];
        }

        Arrays.sort(workingFiles, new Comparator<File>() {
            @Override
            public int compare(File f1, File f2) {
                return f1.getName().compareTo(f2.getName());
            }
        });

        return workingFiles;
    }

    /**
     * Splits PDF.
     * <p>
     * A page bound that is zero or negative is not passed to Ghostscript.
     *
     * @param inputPdfFile input file
     * @param outputPdfFile output file
     * @param firstPage begin page
     * @param lastPage end page
     * @throws RuntimeException if Ghostscript cannot be loaded or fails
     */
    public static void splitPdf(File inputPdfFile, File outputPdfFile, int firstPage, int lastPage) {
        //prepare Ghostscript interpreter parameters
        //refer to Ghostscript documentation for parameter usage
        //gs -sDEVICE=pdfwrite -dNOPAUSE -dQUIET -dBATCH -dFirstPage=m -dLastPage=n -sOutputFile=out.pdf in.pdf
        List<String> gsArgs = new ArrayList<String>();
        gsArgs.add("-gs");
        gsArgs.add("-dNOPAUSE");
        gsArgs.add("-dQUIET");
        gsArgs.add("-dBATCH");
        gsArgs.add("-sDEVICE=pdfwrite");
        if (firstPage > 0) {
            gsArgs.add("-dFirstPage=" + firstPage);
        }
        if (lastPage > 0) {
            gsArgs.add("-dLastPage=" + lastPage);
        }
        gsArgs.add("-sOutputFile=" + outputPdfFile.getPath());
        gsArgs.add(inputPdfFile.getPath());

        runGhostscript(gsArgs);
    }

    /**
     * Gets PDF Page Count.
     *
     * @param inputPdfFile input file
     * @return number of pages, or 0 if the interpreter output could not be
     * read as a number
     * @throws RuntimeException if Ghostscript cannot be loaded or fails
     */
    public static int getPdfPageCount(File inputPdfFile) {
        String inputFile = inputPdfFile.getPath().replace('\\', '/');

        //get Ghostscript instance
        Ghostscript gs = Ghostscript.getInstance();

        //prepare Ghostscript interpreter parameters
        //refer to Ghostscript documentation for parameter usage
        //gs -q -dNODISPLAY -c "(input.pdf) (r) file runpdfbegin pdfpagecount = quit"
        List<String> gsArgs = new ArrayList<String>();
        gsArgs.add("-gs");
        gsArgs.add("-dNOPAUSE");
        gsArgs.add("-dQUIET");
        gsArgs.add("-dNODISPLAY");
        gsArgs.add("-dBATCH");
        gsArgs.add("--permit-file-read=" + inputFile);
        gsArgs.add("-c");
        // the path is embedded in a PostScript string literal, so its
        // parentheses must be escaped to keep the literal well-formed
        gsArgs.add(String.format("(%s) (r) file runpdfbegin pdfpagecount = quit",
                escapePostScriptString(inputFile)));

        int pageCount = 0;

        //execute interpreter; the PostScript program above ends with "quit"
        try {
            synchronized (gs) {
                // capture interpreter output, which carries the page count
                ByteArrayOutputStream os = new ByteArrayOutputStream();
                gs.setStdOut(os);
                gs.initialize(gsArgs.toArray(new String[0]));
                pageCount = Integer.parseInt(os.toString().trim());
            }
        } catch (UnsatisfiedLinkError | NoClassDefFoundError e) {
            logger.error(e.getMessage(), e);
            throw new RuntimeException(getMessage(e.getMessage()), e);
        } catch (GhostscriptException e) {
            logger.error(e.getMessage(), e);
            throw new RuntimeException(e.getMessage(), e);
        } catch (Exception e) {
            // best effort: unparsable output is logged and reported as 0 pages
            logger.error(e.getMessage(), e);
        } finally {
            deleteInstanceQuietly();
        }

        return pageCount;
    }

    /**
     * Merges PDF files.
     *
     * @param inputPdfFiles array of input files
     * @param outputPdfFile output file
     * @throws RuntimeException if Ghostscript cannot be loaded or fails
     */
    public static void mergePdf(File[] inputPdfFiles, File outputPdfFile) {
        //prepare Ghostscript interpreter parameters
        //refer to Ghostscript documentation for parameter usage
        //gs -sDEVICE=pdfwrite -dNOPAUSE -dQUIET -dBATCH -sOutputFile=out.pdf in1.pdf in2.pdf in3.pdf
        List<String> gsArgs = new ArrayList<String>();
        gsArgs.add("-gs");
        gsArgs.add("-dNOPAUSE");
        gsArgs.add("-dQUIET");
        gsArgs.add("-dBATCH");
        gsArgs.add("-sDEVICE=pdfwrite");
        gsArgs.add("-sOutputFile=" + outputPdfFile.getPath());
        for (File inputPdfFile : inputPdfFiles) {
            gsArgs.add(inputPdfFile.getPath());
        }

        runGhostscript(gsArgs);
    }

    /**
     * Appends the Ghostscript installation hint to messages that point at a
     * missing Ghostscript library.
     *
     * @param message original error message, may be <code>null</code>
     * @return the message, followed by {@link #GS_INSTALL} when it mentions
     * the <code>gs</code> library or <code>ghost4j</code>; <code>null</code>
     * if the message is <code>null</code>
     */
    static String getMessage(String message) {
        // errors may carry no message; pass that through instead of failing
        if (message == null) {
            return null;
        }
        if (message.contains("library 'gs") || message.contains("ghost4j")) {
            return message + GS_INSTALL;
        }
        return message;
    }

    /**
     * Runs the Ghostscript interpreter with the given arguments, exits it and
     * deletes the interpreter instance.
     *
     * @param gsArgs interpreter arguments
     * @throws RuntimeException if Ghostscript cannot be loaded or fails
     */
    private static void runGhostscript(List<String> gsArgs) {
        //get Ghostscript instance
        Ghostscript gs = Ghostscript.getInstance();

        //execute and exit interpreter
        try {
            synchronized (gs) {
                gs.initialize(gsArgs.toArray(new String[0]));
                gs.exit();
            }
        } catch (UnsatisfiedLinkError | NoClassDefFoundError e) {
            logger.error(e.getMessage(), e);
            throw new RuntimeException(getMessage(e.getMessage()), e);
        } catch (GhostscriptException e) {
            logger.error(e.getMessage(), e);
            throw new RuntimeException(e.getMessage(), e);
        } finally {
            deleteInstanceQuietly();
        }
    }

    /**
     * Deletes the interpreter instance (safer), ignoring a failure to do so.
     */
    private static void deleteInstanceQuietly() {
        try {
            Ghostscript.deleteInstance();
        } catch (GhostscriptException ignored) {
            // cleanup only; there is nothing useful to do if it fails
        }
    }

    /**
     * Escapes parentheses and backslashes so the value can be placed inside a
     * PostScript <code>( ... )</code> string literal.
     *
     * @param value raw text
     * @return text with each <code>(</code>, <code>)</code> and
     * <code>\</code> preceded by a backslash
     */
    private static String escapePostScriptString(String value) {
        StringBuilder escaped = new StringBuilder(value.length() + 8);
        for (int i = 0; i < value.length(); i++) {
            char c = value.charAt(i);
            if (c == '(' || c == ')' || c == '\\') {
                escaped.append('\\');
            }
            escaped.append(c);
        }
        return escaped.toString();
    }
}
© 2015 - 2024 Weber Informatics LLC | Privacy Policy