org.overviewproject.pdfocr.pdf.PdfDocument.scala Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of pdfocr_2.10 Show documentation
Show all versions of pdfocr_2.10 Show documentation
Library that shells to Tesseract to make PDFs searchable
The newest version!
package org.overviewproject.pdfocr.pdf
import java.io.{FileNotFoundException,IOException}
import java.nio.file.{Files,Path}
import org.apache.fontbox.ttf.{TTFParser,TrueTypeFont}
import org.apache.pdfbox.io.MemoryUsageSetting
import org.apache.pdfbox.pdfparser.PDFParser
import org.apache.pdfbox.pdmodel.{PDDocument,PDPage}
import org.apache.pdfbox.pdmodel.encryption.InvalidPasswordException
import org.apache.pdfbox.pdmodel.font.{PDFont,PDType0Font}
import scala.concurrent.{ExecutionContext,Future,blocking}
import org.overviewproject.pdfocr.exceptions._
/** A PDF document.
*
* This class is intended to be created via `PdfDocument.load(...)`, and its
* asynchronous methods use the ExecutionContext passed to
* `PdfDocument.load(...)`.
*
* You must call PdfDocument.close when finished.
*/
class PdfDocument(
val path: Path,
val pdDocument: PDDocument
)(implicit ec: ExecutionContext) {
/** Releases resources associated with this document.
*
* You must call this method when you're done with the object.
*/
def close: Unit = pdDocument.close
/** The number of pages in the document.
*
* This method will not block or throw an exception.
*/
val nPages: Int = pdDocument.getNumberOfPages
/** Iterates over the pages of the document.
*
* Each returned element has yet to be parsed; that's why `iterator.next`
* returns a `Future`. Here's an example usage:
*
* val it = pdfDocument.pages
* def step(result: Seq[String])(implicit ec: ExecutionContext): Future[Seq[String]] = {
* if (it.hasNext) {
* it.next // May be a Future.failed[PdfInvalidException]
* .flatMap { page =>
* step(result :+ page.toText) // May throw PdfInvalidException
* }
* } else {
* Future.successful(result)
* }
* }
* val pageTexts: Seq[String] = step(Seq())
* .recover { case ex: PdfInvalidException => ... }
*/
def pages: Iterator[Future[PdfPage]] = new PdfDocument.PdfPageIterator(this)(ec)
/** Writes the document to a file.
*
* The intended usage is:
*
* 1. Load a PdfDocument.
* 2. Mutate its pages.
* 3. Write it to a new location (this method).
*/
def write(path: Path): Future[Unit] = Future(blocking(pdDocument.save(path.toFile)))
/** Font we use to write OCR-ed text to the document.
*
* This variable is initialized in `PdfPage.addHocr()`. If you never call
* that method, this font will never be loaded.
*/
private[pdf] lazy val hocrFont: PDFont = {
val ret = PDType0Font.load(pdDocument, PdfDocument.Unifont, true)
ret.getFontDescriptor.setFontName("pdfocr-ocr-text") // For PdfPage.isFromOcr()
ret
}
}
object PdfDocument {
private val PdfParserMainMemoryBytes = 50 * 1024 * 1024;
private lazy val Unifont: TrueTypeFont = {
val parser = new TTFParser
parser.parse(getClass.getResourceAsStream("/pdfocr-stub-font.ttf"))
}
private class PdfPageIterator(pdfDocument: PdfDocument)(implicit ec: ExecutionContext)
extends Iterator[Future[PdfPage]] {
var nextPageNumber = 0
override def hasNext: Boolean = nextPageNumber < pdfDocument.nPages
override def next: Future[PdfPage] = {
Future(blocking {
val pageNumber = nextPageNumber
nextPageNumber += 1
val pdPage: PDPage = try {
pdfDocument.pdDocument.getPage(pageNumber)
} catch {
case ex: NullPointerException => {
throw new PdfInvalidException(ex)
}
}
new PdfPage(pdfDocument, pdPage, pageNumber)
})(ec)
}
}
/** Opens and returns a PDF document.
*
* The return value may be a failed Future:
*
* * PdfInvalidException: the main dictionary could not be found.
* * PdfEncryptedException: the PDF is encrypted and so can't be loaded.
* * IOException: a low-level file error occurred.
*
* Otherwise, the return value will be valid ... but any page in the `pages`
* iterator may still return a failed Future.
*
* This will only parse enough of the document to figure out how many pages
* there are. The pages method will parse the rest.
*
* Be sure close the returned PdfDocument.
*/
def load(path: Path)(implicit ec: ExecutionContext): Future[PdfDocument] = Future(blocking {
val memoryUsageSetting = MemoryUsageSetting.setupMixed(PdfParserMainMemoryBytes)
// Read enough of the document to produce an error if it isn't a PDF
try {
val pdDocument = PDDocument.load(path.toFile, memoryUsageSetting) // Only reads trailer+xref
pdDocument.setAllSecurityToBeRemoved(true)
try {
new PdfDocument(path, pdDocument)
} catch {
case ex: Throwable => {
pdDocument.close
throw ex
}
}
} catch {
case ex: InvalidPasswordException => throw new PdfEncryptedException(ex)
case ex: FileNotFoundException => throw ex
case ex: SecurityException => throw ex
case ex: IllegalArgumentException => throw new PdfInvalidException(ex)
case ex: IOException => throw new PdfInvalidException(ex)
}
})
}
© 2015 - 2024 Weber Informatics LLC | Privacy Policy