org.overviewproject.pdfocr.pdf.PdfPage.scala Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of pdfocr_2.12 Show documentation
Library that shells to Tesseract to make PDFs searchable
The newest version!
package org.overviewproject.pdfocr.pdf

import java.awt.Rectangle
import java.awt.geom.AffineTransform
import java.awt.image.BufferedImage
import java.io.{ByteArrayInputStream,ByteArrayOutputStream}
import org.apache.pdfbox.cos.COSName
import org.apache.pdfbox.pdmodel.{PDDocument,PDPage,PDPageContentStream}
import org.apache.pdfbox.pdmodel.common.{PDMetadata,PDRectangle}
import org.apache.pdfbox.pdmodel.font.PDFont
import org.apache.pdfbox.pdmodel.graphics.color.{PDColor,PDDeviceRGB}
import org.apache.pdfbox.pdmodel.interactive.action.PDPageAdditionalActions
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationLine
import org.apache.pdfbox.io.MemoryUsageSetting
import org.apache.pdfbox.rendering.{ImageType,PageDrawer,PageDrawerParameters,PDFRenderer}
import org.apache.pdfbox.text.PDFTextStripper
import org.apache.pdfbox.util.{Matrix,Vector}
import scala.collection.JavaConverters.iterableAsScalaIterableConverter

import org.overviewproject.pdfocr.exceptions.PdfInvalidException

/** A page of a PDF document.
  *
  * A PDF document can throw a PdfInvalidException at any time during reading.
  *
  * A PdfPage will only be valid so long as its parent PdfDocument's `close`
  * method has not been called.
  *
  * @param pdfDocument A PdfDocument
  * @param pdPage A PDPage (PDFBox internal representation)
  * @param pageNumber 0-based page index
  */
class PdfPage(val pdfDocument: PdfDocument, val pdPage: PDPage, val pageNumber: Int) {
  private val ImageDpi: Int = 300       // What we send to Tesseract. Arbitrary.
  private val MaxResolution: Int = 4000 // To ensure Tesseract finishes promptly. Picked by trying a few.
  private val pdDocument: PDDocument = pdfDocument.pdDocument

  /** Returns all the text we can read from the document.
    *
    * After you addHocr() and before you write the PDFDocument, toText() will
    * have undefined behavior.
    */
  @throws(classOf[PdfInvalidException]) // Dunno if it can even throw this
  def toText: String = {
    val stripper = new PDFTextStripper
    stripper.setStartPage(pageNumber + 1)
    stripper.setEndPage(pageNumber + 1)

    try {
      stripper.getText(pdDocument)
    } catch {
      case ex: NullPointerException => throw new PdfInvalidException(ex)
    }
  }

  /** Returns how many dots-per-inch we should render an image.
    *
    * The result will be 300, unless the PDF is large. If the PDF is large, the
    * DPI will max out at the largest integer that makes the output image
    * smaller than 4000x4000.
    *
    * If the page is missing a media box, the DPI will be 1.
    */
  private def bestDpi: Int = {
    var dpi = ImageDpi

    Option(pdPage.getMediaBox) match {
      case Some(rect) => {
        var dpi = ImageDpi

        if (rect.getWidth * dpi / PdfPage.PdfDpi > MaxResolution) {
          dpi = MaxResolution * PdfPage.PdfDpi / rect.getWidth.toInt
        }
        if (rect.getHeight * dpi / PdfPage.PdfDpi > MaxResolution) {
          dpi = MaxResolution * PdfPage.PdfDpi / rect.getHeight.toInt
        }

        dpi
      }
      case None => 1
    }
  }

  /** Renders the page to an image. */
  @throws(classOf[PdfInvalidException])
  def toImage: BufferedImage = {
    val renderer = new PDFRenderer(pdDocument)

    try {
      renderer.renderImageWithDPI(pageNumber, bestDpi, ImageType.GRAY)
    } catch {
      case ex: NullPointerException => throw new PdfInvalidException(ex)
    }
  }

  /** Renders the page to an image, omitting all text.
    *
    * In other words, if a PDF has a stream with a rectangle, some text and an
    * image, this method will return an imaage with a rectangle and an image.
    */
  @throws(classOf[PdfInvalidException])
  def toImageWithoutText: BufferedImage = {
    val renderer = new PdfPage.PDFRendererWithoutText(pdDocument)

    try {
      renderer.renderImageWithDPI(pageNumber, bestDpi, ImageType.GRAY)
    } catch {
      case ex: NullPointerException => throw new PdfInvalidException(ex)
    }
  }

  /** Uses hOCR data to add invisible text to the page.
    *
    * This will only work with Tesseract 3.03's hOCR output. It assumes the
    * hOCR output uses the same resolution as returned by `bestDpi` -- that is,
    * the resolution of the `toImage` output.
    *
    * After you addHocr() and before you write the PDFDocument, toText() will
    * have undefined behavior.
    */
  def addHocr(hocr: Array[Byte]): Unit = {
    val input = new ByteArrayInputStream(hocr)
    val parser = new HocrParser(input)

    val handler = new PdfPage.HocrHandler(this)

    parser.foreach(handler.renderLine)

    handler.close

    // Add an invisible LINE annotation with RBB 0xd0cd0c.
    // split-pdf-and-extract-text will recognize it as an OCR flag.
    val ocrAnnotation = new PDAnnotationLine()
    ocrAnnotation.setHidden(true)
    ocrAnnotation.setInvisible(true)
    ocrAnnotation.setRectangle(new PDRectangle(0, 0, 0, 0))
    ocrAnnotation.setColor(new PDColor(
      Array(0xd0, 0xcd, 0x0c).map(_ / 255.0f),
      PDDeviceRGB.INSTANCE)
    )

    val annotations = pdPage.getAnnotations()
    annotations.add(ocrAnnotation)
    pdPage.setAnnotations(annotations)
  }

  /** Returns true iff there is a /Line annotation with RGB 0xd0cd0c.
    *
    * (Perhaps cleaner would be to check for our font. But pdfium does not
    * give an interface for reading fonts yet, and we want our pdfium-based
    * splitter to recognize this mark.)
    */
  def isFromOcr: Boolean = Option(pdPage.getAnnotations) match {
    case Some(annotations) => {
      annotations.size match {
        case 0 => false
        case _ => {
          val annot = annotations.get(annotations.size - 1)
          return (
            (annot.getAnnotationFlags & 0x3) == 0x3
            && annot.getSubtype == "Line"
            && annot.getColor.toRGB == 0xd0cd0c
          )
        }
      }
    }
    case None => false
  }

  /** Returns a one-page PDF, as a byte array.
    *
    * Even if the original `pdDocument` is a single page, this method will
    * return a whole new page. (That's so we can avoid blocking re-reading the
    * original input file.)
    *
    * The purpose of this output PDF is *display*. (Another image format would
    * be more ideal; we output PDF because we always have, not because we
    * should.)
    */
  def toPdf: Array[Byte] = {
    // Mostly copied from pdfbox/.../multipdf/Splitter.java, but without the
    // horrendous API.
    val newDocument = new PDDocument(MemoryUsageSetting.setupMainMemoryOnly)

    try {
      newDocument.getDocument.setVersion(pdDocument.getVersion)
      newDocument.setDocumentInformation(pdDocument.getDocumentInformation)
      newDocument.getDocumentCatalog.setViewerPreferences(pdDocument.getDocumentCatalog.getViewerPreferences)
      val newPage = newDocument.importPage(pdPage)
      newPage.setCropBox(pdPage.getCropBox)
      newPage.setMediaBox(pdPage.getMediaBox)
      newPage.setResources(pdPage.getResources) // only the resources of the page will be copied
      newPage.setRotation(pdPage.getRotation)

      // Remove PDF features we don't "like". (We'd prefer PNG to PDF, so
      // we should nix anything "dynamic", such as links, to save space and
      // time.)
      newPage.setAnnotations(null)
      newPage.setActions(null)
      newPage.setMetadata(null)

      val outputStream = new ByteArrayOutputStream
      newDocument.save(outputStream)
      outputStream.toByteArray
    } finally {
      newDocument.close
    }
  }
}

object PdfPage {
  private val PdfDpi: Int = 72          // always. Part of the PDF spec.

  private class HocrHandler(pdfPage: PdfPage) {
    private def pdRectangleToRectangle(pdRectangle: PDRectangle) = new Rectangle(
      pdRectangle.getLowerLeftX.toInt,
      pdRectangle.getLowerLeftY.toInt,
      pdRectangle.getWidth.toInt,
      pdRectangle.getHeight.toInt
    )

    private val cropBox: Rectangle = pdRectangleToRectangle(pdfPage.pdPage.getCropBox)
    private val dpiScale: Double = PdfDpi.toDouble / pdfPage.bestDpi
    private val FontSize: Double = 12 // It's always 12; then we scale it

    private def font = pdfPage.pdfDocument.hocrFont

    private lazy val fontAscent = font.getFontDescriptor.getAscent * FontSize / 1000
    private var mustCloseStream = false
    private lazy val stream: PDPageContentStream = {
      mustCloseStream = true
      val ret = new PDPageContentStream(
        pdfPage.pdDocument,
        pdfPage.pdPage,
        PDPageContentStream.AppendMode.APPEND,
        true,
        true
      )
      ret.beginText
      ret
    }

    def close: Unit = {
      if (mustCloseStream) {
        stream.endText
        stream.close
      }
    }

    def renderLine(line: HocrLine): Unit = {
      val words = line.words

      /*
       * When Tesseract finds a "line", it gives the line's dimensions as a
       * bbox. However, the line might be slightly crooked, in which case the
       * height of the bbox will be far greater than the desired font size. So
       * we can't use the line's height to determine font size.
       *
       * When Tesseract finds a "word", it gives a bounding box that won't
       * include a font's descent or ascent if the word doesn't contain them.
       * (The word "no" is smaller, vertially, than the word "yes".) So we
       * can't use the word's height to determine font size.
       *
       * A good strategy: take the maximum word height in the line. Assume
       * Tesseract's notion of a "line" means "all the same font size". (I have
       * no idea whether that's correct.)
       *
       * This forces us to put all words at the same `y`. Tesseract's bboxes
       * don't tell us where the baseline is, and PDF spec needs a baseline. We
       * know `baseline = top - ascent`. We'll calculate `top` by centering the
       * `middle` at the `lineBbox` middle.
       */
      val maxWordHeight: Double = words.map(_.boundingBox.height).max // in hOCR coordinates
      val scaleY: Double = maxWordHeight / FontSize * dpiScale

      val lineTop: Double = line.boundingBox.y + (line.boundingBox.height - maxWordHeight) * 0.5 // hOCR coordinates
      val baseline: Double = cropBox.height - lineTop * dpiScale - fontAscent * scaleY - cropBox.y // in PDF coordinates

      words.foreach { word =>
        val bbox = word.boundingBox

        stream.setFont(font, FontSize.toFloat)

        val fontWidth: Double = font.getStringWidth(word.text) * FontSize / 1000 // width without scaling
        val scaleX: Double = bbox.width / fontWidth * dpiScale
        val leftX: Double = bbox.x * dpiScale - cropBox.x // in PDF coordinates

        val transform = new AffineTransform
        transform.scale(scaleX, scaleY)
        transform.translate(leftX / scaleX, baseline / scaleY)
        stream.setTextMatrix(new Matrix(transform))

        stream.showText(word.text)
      }
    }
  }

  private class PDFRendererWithoutText(document: PDDocument) extends PDFRenderer(document) {
    override protected def createPageDrawer(parameters: PageDrawerParameters): PageDrawer = {
      new PageDrawerWithoutText(parameters)
    }
  }

  private class PageDrawerWithoutText(parameters: PageDrawerParameters) extends PageDrawer(parameters) {
    override def beginText: Unit = ()
    override def endText: Unit = ()
    override protected def showFontGlyph(textRenderingMatrix: Matrix, font: PDFont, code: Int, unicode: String, displacement: Vector): Unit = ()
  }
}