All downloads are free. Search and download functionality uses the official Maven repository.

io.earcam.utilitarian.site.search.offline.PdfContentProcessor Maven / Gradle / Ivy

/*-
 * #%L
 * io.earcam.utilitarian.site.search.offline
 * %%
 * Copyright (C) 2017 earcam
 * %%
 * SPDX-License-Identifier: (BSD-3-Clause OR EPL-1.0 OR Apache-2.0 OR MIT)
 *
 * You must choose to accept, in full - any individual or combination of
 * the following licenses:
 * 
 * #L%
 */
package io.earcam.utilitarian.site.search.offline;

import java.io.IOException;
import java.io.UncheckedIOException;

import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentInformation;
import org.apache.pdfbox.text.PDFTextStripper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import io.earcam.unexceptional.Closing;

/**
 * A {@link Processor} that fills in a {@link Document}'s title and raw text
 * from the PDF file the document points at.
 * <p>
 * A document is only touched when its content type is {@code application/pdf}
 * and {@link Document#hasRaw()} reports {@code false}; any other document is
 * left exactly as it was.
 * </p>
 */
public class PdfContentProcessor implements Processor {

	private static final Logger LOG = LoggerFactory.getLogger(PdfContentProcessor.class);

	/** Content type a document must report to be handled by this processor. */
	private static final String PDF_CONTENT_TYPE = "application/pdf";


	/**
	 * Loads the PDF behind {@code document} and copies its title and text
	 * into the document's fields.
	 * <p>
	 * An {@link UncheckedIOException} raised while doing so is not propagated:
	 * it is reported at WARN level (file and message) and at DEBUG level
	 * (with the underlying cause), and the method then returns normally.
	 * </p>
	 *
	 * @param document the document to enrich; ignored unless it is a PDF
	 * without raw content
	 */
	@Override
	public void process(Document document)
	{
		if(!isPdf(document) || document.hasRaw()) {
			return;
		}

		try {
			// NOTE(review): going by its name, the helper presumably closes the loaded
			// PDDocument once consume() has run - confirm against io.earcam.unexceptional
			Closing.closeAfterAccepting(PDDocument::load, document.file().toFile(), document, this::consume);
		} catch(UncheckedIOException failure) {
			LOG.warn("Failed to process PDF {} due to: {}", document.file(), failure.getMessage());
			LOG.debug("Failed to process PDF", failure.getCause());
		}
	}


	/**
	 * @param document the document to test
	 * @return {@code true} only if the document's content type is exactly
	 * {@code application/pdf}
	 */
	private boolean isPdf(Document document)
	{
		return PDF_CONTENT_TYPE.equals(document.contentType());
	}


	/**
	 * Transfers metadata and text from an opened PDF to the target document.
	 *
	 * @param pdf the loaded PDF to read from
	 * @param document receives the {@link Document#TITLE} and
	 * {@link Document#RAW_TEXT} fields
	 * @throws IOException if creating the text stripper or extracting the
	 * text fails
	 */
	private void consume(PDDocument pdf, Document document) throws IOException
	{
		// The title is written first, so it is already set if text extraction fails below
		document.field(Document.TITLE, pdf.getDocumentInformation().getTitle());
		document.field(Document.RAW_TEXT, new PDFTextStripper().getText(pdf));
	}
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy