// Class: org.dspace.content.packager.PDFPackager (dspace-api artifact; Maven / Gradle / Ivy).
// Listing captured from an artifact browser page ("Show all versions of dspace-api", "Show documentation").
/**
* The contents of this file are subject to the license and copyright
* detailed in the LICENSE and NOTICE files at the root of the source
* tree and available online at
*
* http://www.dspace.org/license/
*/
package org.dspace.content.packager;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.sql.SQLException;
import java.util.Calendar;
import java.util.List;
import org.apache.commons.lang3.ArrayUtils;
import org.apache.logging.log4j.Logger;
import org.apache.pdfbox.cos.COSDocument;
import org.apache.pdfbox.io.MemoryUsageSetting;
import org.apache.pdfbox.io.RandomAccessBufferedFileInputStream;
import org.apache.pdfbox.io.ScratchFile;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentInformation;
import org.dspace.authorize.AuthorizeException;
import org.dspace.content.Bitstream;
import org.dspace.content.BitstreamFormat;
import org.dspace.content.Bundle;
import org.dspace.content.Collection;
import org.dspace.content.DCDate;
import org.dspace.content.DSpaceObject;
import org.dspace.content.Item;
import org.dspace.content.MetadataSchemaEnum;
import org.dspace.content.WorkspaceItem;
import org.dspace.content.crosswalk.CrosswalkException;
import org.dspace.content.crosswalk.MetadataValidationException;
import org.dspace.content.factory.ContentServiceFactory;
import org.dspace.content.service.BitstreamFormatService;
import org.dspace.content.service.BitstreamService;
import org.dspace.content.service.BundleService;
import org.dspace.content.service.ItemService;
import org.dspace.content.service.WorkspaceItemService;
import org.dspace.core.Constants;
import org.dspace.core.Context;
import org.dspace.core.LogHelper;
import org.dspace.core.SelfNamedPlugin;
import org.dspace.core.Utils;
import org.dspace.workflow.WorkflowException;
/**
* Accept a PDF file by itself as a SIP.
*
* This is mainly a proof-of-concept to demonstrate the flexibility
* of the packager and crosswalk plugins.
*
* To import, open up the PDF and try to extract sufficient metadata
* from its InfoDict.
*
* Export is a crude hack: if the item has a bitstream containing PDF,
* send that, otherwise it fails. Do not attempt to insert metadata.
*
* @author Larry Stone
* @version $Revision$
* @see PackageIngester
* @see PackageDisseminator
*/
public class PDFPackager
extends SelfNamedPlugin
implements PackageIngester, PackageDisseminator {
/**
* log4j category
*/
private static final Logger log = org.apache.logging.log4j.LogManager.getLogger(PDFPackager.class);
protected static final String BITSTREAM_FORMAT_NAME = "Adobe PDF";
protected static String aliases[] = {"PDF", "Adobe PDF", "pdf", "application/pdf"};
/**
 * Returns the plugin names of this packager.
 *
 * @return a copy of the alias array, so that callers cannot alter the shared one
 */
public static String[] getPluginNames() {
    final String[] namesCopy = ArrayUtils.clone(aliases);
    return namesCopy;
}
// Content services used by this packager. Each one is looked up through
// ContentServiceFactory.getInstance() when a packager instance is created.
protected final BitstreamService bitstreamService = ContentServiceFactory.getInstance().getBitstreamService();
protected final BundleService bundleService = ContentServiceFactory.getInstance().getBundleService();
protected final BitstreamFormatService bitstreamFormatService = ContentServiceFactory.getInstance()
.getBitstreamFormatService();
protected final ItemService itemService = ContentServiceFactory.getInstance().getItemService();
protected final WorkspaceItemService workspaceItemService = ContentServiceFactory.getInstance()
.getWorkspaceItemService();
/**
 * Utility to grovel bitstream formats: assigns to the bitstream the first
 * non-internal format whose MIME type equals the requested one, ignoring
 * case. When no format matches, the bitstream's format is left untouched.
 *
 * @param context  DSpace context.
 * @param bs       bitstream whose format is to be set.
 * @param mimeType MIME type to look for, e.g. "application/pdf".
 * @throws SQLException if database error
 */
protected void setFormatToMIMEType(Context context, Bitstream bs, String mimeType)
    throws SQLException {
    // The element type has to be spelled out: iterating a raw List yields
    // Object, which cannot be bound to a BitstreamFormat loop variable.
    List<BitstreamFormat> formats = bitstreamFormatService.findNonInternal(context);
    for (BitstreamFormat format : formats) {
        String candidate = format.getMIMEType();
        // A format without a MIME type can never match; skip it instead of failing.
        if (candidate != null && candidate.equalsIgnoreCase(mimeType)) {
            bs.setFormat(context, format);
            break;
        }
    }
}
/**
 * Create new Item out of the ingested package, in the indicated
 * collection. A workspace item is created first and then handed to
 * {@code PackageUtils.finishCreateItem}.
 *
 * This is a VERY crude import of a single Adobe PDF (Portable
 * Document Format) file, using the document's embedded metadata
 * for package metadata. If the PDF file hasn't got the minimal
 * metadata available, it is rejected.
 *
 * If the ingest fails before the workspace item has been updated, the
 * bitstream and workspace item created so far are removed again. The
 * context is completed in every case, success or failure.
 *
 * @param context DSpace context.
 * @param parent  collection under which to create new item.
 * @param pkgFile The package file to ingest
 * @param params  package parameters (none recognized)
 * @param license not consulted by this implementation.
 * @return the Item returned by {@code PackageUtils.finishCreateItem}.
 * @throws PackageValidationException if package invalid, or if parent is not a Collection
 * @throws CrosswalkException         if crosswalking fails
 * @throws AuthorizeException         if authorization error
 * @throws SQLException               if database error
 * @throws IOException                if IO error
 * @throws WorkflowException          if workflow error
 */
@Override
public DSpaceObject ingest(Context context, DSpaceObject parent,
                           File pkgFile, PackageParameters params,
                           String license)
    throws PackageValidationException, CrosswalkException,
    AuthorizeException, SQLException, IOException, WorkflowException {
    boolean success = false;
    Bundle original = null;
    Bitstream bs = null;
    WorkspaceItem wi = null;

    try {
        // Report a wrong kind of parent as a validation problem rather than
        // letting the cast below fail with a bare ClassCastException.
        if (!(parent instanceof Collection)) {
            throw new PackageValidationException(
                "This ingester can only create an Item within a parent of type COLLECTION.");
        }

        // Save the PDF in a bitstream first, since the parser
        // has to read it as well, and we cannot "rewind" it after that.
        wi = workspaceItemService.create(context, (Collection) parent, false);
        Item myitem = wi.getItem();
        original = bundleService.create(context, myitem, "ORIGINAL");

        try (InputStream fileStream = new FileInputStream(pkgFile)) {
            bs = bitstreamService.create(context, original, fileStream);
        }
        bs.setName(context, "package.pdf");
        setFormatToMIMEType(context, bs, "application/pdf");
        bitstreamService.update(context, bs);
        if (log.isDebugEnabled()) {
            log.debug("Created bitstream ID=" + String.valueOf(bs.getID()) + ", parsing...");
        }

        // This method opened the stream, so it also makes sure the stream is
        // closed once the metadata has been read, even if parsing fails.
        try (InputStream pdfStream = bitstreamService.retrieve(context, bs)) {
            crosswalkPDF(context, myitem, pdfStream);
        }
        workspaceItemService.update(context, wi);
        success = true;
        log.info(LogHelper.getHeader(context, "ingest",
                                     "Created new Item, db ID=" + String.valueOf(myitem.getID()) +
                                         ", WorkspaceItem ID=" + String.valueOf(wi.getID())));

        myitem = PackageUtils.finishCreateItem(context, wi, null, params);
        return myitem;
    } finally {
        // get rid of bitstream and item if ingest fails
        if (!success) {
            if (original != null && bs != null) {
                bundleService.removeBitstream(context, original, bs);
            }
            if (wi != null) {
                workspaceItemService.deleteAll(context, wi);
            }
        }
        context.complete();
    }
}
/**
 * IngestAll() cannot be implemented for a PDF ingester, because there's only one PDF to ingest.
 * This method always throws {@link UnsupportedOperationException}.
 *
 * @param context DSpace context (unused).
 * @param parent  parent object (unused).
 * @param pkgFile package file (unused).
 * @param params  package parameters (unused).
 * @param license license text (unused).
 * @return never returns normally.
 * @throws UnsupportedOperationException always
 * @throws PackageException              if package error
 * @throws IOException                   if IO error
 * @throws SQLException                  if database error
 * @throws AuthorizeException            if authorization error
 * @throws CrosswalkException            if crosswalk error
 */
@Override
public List<String> ingestAll(Context context, DSpaceObject parent, File pkgFile,
                              PackageParameters params, String license)
    throws PackageException, UnsupportedOperationException,
    CrosswalkException, AuthorizeException,
    SQLException, IOException {
    throw new UnsupportedOperationException(
        "PDF packager does not support the ingestAll() operation at this time.");
}
/**
 * Replacing an existing object from a PDF is not implemented; every call
 * is rejected with an {@link UnsupportedOperationException}.
 *
 * @param context DSpace context (unused).
 * @param dso     object that would be replaced (unused).
 * @param pkgFile package file (unused).
 * @param params  package parameters (unused).
 * @return never returns normally.
 * @throws UnsupportedOperationException always
 * @throws PackageException              if package error
 * @throws IOException                   if IO error
 * @throws SQLException                  if database error
 * @throws AuthorizeException            if authorization error
 * @throws CrosswalkException            if crosswalk error
 */
@Override
public DSpaceObject replace(Context context, DSpaceObject dso,
                            File pkgFile, PackageParameters params)
    throws PackageException, UnsupportedOperationException,
    CrosswalkException, AuthorizeException,
    SQLException, IOException {
    final String reason = "PDF packager does not support the replace() operation at this time.";
    throw new UnsupportedOperationException(reason);
}
/**
 * ReplaceAll() cannot be implemented for a PDF ingester, because there's only one PDF to ingest.
 * This method always throws {@link UnsupportedOperationException}.
 *
 * @param context DSpace context (unused).
 * @param dso     object that would be replaced (unused).
 * @param pkgFile package file (unused).
 * @param params  package parameters (unused).
 * @return never returns normally.
 * @throws UnsupportedOperationException always
 * @throws PackageException              if package error
 * @throws IOException                   if IO error
 * @throws SQLException                  if database error
 * @throws AuthorizeException            if authorization error
 * @throws CrosswalkException            if crosswalk error
 */
@Override
public List<String> replaceAll(Context context, DSpaceObject dso,
                               File pkgFile, PackageParameters params)
    throws PackageException, UnsupportedOperationException,
    CrosswalkException, AuthorizeException,
    SQLException, IOException {
    throw new UnsupportedOperationException(
        "PDF packager does not support the replaceAll() operation at this time.");
}
/**
 * VERY crude dissemination: just look for the first
 * bitstream with the PDF package type, and toss it out.
 * Works on packages imported with this packager, and maybe some others.
 *
 * The bitstream is looked up in the default bundle by the format named
 * {@code BITSTREAM_FORMAT_NAME}; its content is copied into pkgFile,
 * which is created first if it does not exist yet.
 *
 * @param context DSpace context.
 * @param dso     DSpaceObject to disseminate; must be of type ITEM.
 * @param params  package parameters (not used by this method).
 * @param pkgFile file that receives the PDF content.
 * @throws PackageValidationException if dso is not an Item, or the PDF format
 *                                    or a bitstream of that format cannot be found
 * @throws CrosswalkException         if crosswalk error
 * @throws AuthorizeException         if authorization error
 * @throws SQLException               if database error
 * @throws IOException                if IO error
 */
@Override
public void disseminate(Context context, DSpaceObject dso,
                        PackageParameters params, File pkgFile)
    throws PackageValidationException, CrosswalkException,
    AuthorizeException, SQLException, IOException {
    if (dso.getType() != Constants.ITEM) {
        throw new PackageValidationException("This disseminator can only handle objects of type ITEM.");
    }

    Item item = (Item) dso;

    BitstreamFormat pdff = bitstreamFormatService.findByShortDescription(context,
                                                                         BITSTREAM_FORMAT_NAME);
    if (pdff == null) {
        throw new PackageValidationException("Cannot find BitstreamFormat \"" + BITSTREAM_FORMAT_NAME + "\"");
    }
    Bitstream pkgBs = PackageUtils.getBitstreamByFormat(context, item, pdff, Constants.DEFAULT_BUNDLE_NAME);
    if (pkgBs == null) {
        throw new PackageValidationException("Cannot find Bitstream with format \"" + BITSTREAM_FORMAT_NAME + "\"");
    }

    //Make sure our package file exists
    if (!pkgFile.exists()) {
        PackageUtils.createFile(pkgFile);
    }

    // Copy the bitstream to the file. Both streams are managed here so that
    // the bitstream's input stream is released as well as the output file.
    try (FileOutputStream out = new FileOutputStream(pkgFile);
         InputStream in = bitstreamService.retrieve(context, pkgBs)) {
        Utils.copy(in, out);
    }
}
/**
 * disseminateAll() cannot be implemented for a PDF disseminator, because there's only one PDF to disseminate.
 * This method always throws {@link UnsupportedOperationException}.
 *
 * @param context DSpace context (unused).
 * @param dso     object that would be disseminated (unused).
 * @param params  package parameters (unused).
 * @param pkgFile package file (unused).
 * @return never returns normally.
 * @throws UnsupportedOperationException always
 * @throws PackageException              if package error
 * @throws CrosswalkException            if crosswalk error
 * @throws AuthorizeException            if authorization error
 * @throws SQLException                  if database error
 * @throws IOException                   if IO error
 */
@Override
public List<File> disseminateAll(Context context, DSpaceObject dso,
                                 PackageParameters params, File pkgFile)
    throws PackageException, CrosswalkException,
    AuthorizeException, SQLException, IOException {
    throw new UnsupportedOperationException(
        "PDF packager does not support the disseminateAll() operation at this time.");
}
/**
 * Reports the MIME type of the packages handled here, which is always
 * "application/pdf" regardless of the parameters passed in.
 *
 * @param params package params (not consulted)
 * @return the MIME type (content-type header) of the package to be returned
 */
@Override
public String getMIMEType(PackageParameters params) {
    final String pdfContentType = "application/pdf";
    return pdfContentType;
}
/**
 * Parses a PDF and copies entries of its Info dictionary into Dublin Core
 * metadata on the item, then updates the item.
 *
 * @param context  DSpace context.
 * @param item     item that receives the metadata.
 * @param metadata stream delivering the PDF document.
 * @throws CrosswalkException if the PDF is encrypted or has no Title entry
 *                            (reported as MetadataValidationException)
 * @throws IOException        if IO error
 * @throws SQLException       if database error
 * @throws AuthorizeException if authorization error
 */
private void crosswalkPDF(Context context, Item item, InputStream metadata)
    throws CrosswalkException, IOException, SQLException, AuthorizeException {
    COSDocument cos = null;
    // Kept in a variable so it can be closed below: closing the COSDocument
    // obtained from parser.getDocument() does not close the parser's source.
    RandomAccessBufferedFileInputStream source = null;

    try {
        ScratchFile scratchFile = null;
        try {
            long useRAM = Runtime.getRuntime().freeMemory() * 80 / 100; // use up to 80% of JVM free memory
            scratchFile = new ScratchFile(
                MemoryUsageSetting.setupMixed(useRAM)); // then fallback to temp file (unlimited size)
        } catch (IOException ioe) {
            // Best effort: parsing continues without a scratch file.
            log.warn("Error initializing scratch file: " + ioe.getMessage(), ioe);
        }

        source = new RandomAccessBufferedFileInputStream(metadata);
        PDFParser parser = new PDFParser(source, scratchFile);
        parser.parse();
        cos = parser.getDocument();

        // sanity check: PDFBox breaks on encrypted documents, so give up.
        if (cos.getEncryptionDictionary() != null) {
            throw new MetadataValidationException("This packager cannot accept an encrypted PDF document.");
        }

        /* PDF to DC "crosswalk":
         *
         * NOTE: This is not in a crosswalk plugin because (a) it isn't
         * useful anywhere else, and more importantly, (b) the source
         * data is not XML so it doesn't fit the plugin's interface.
         *
         * pattern of crosswalk -- PDF dict entries to DC:
         *   Title        -> title.null
         *   Author       -> contributor.author
         *   CreationDate -> date.created
         *   ModDate      -> date.created (only when CreationDate is absent)
         *   Creator      -> description.provenance (application that created orig)
         *   Producer     -> description.provenance (convertor to pdf)
         *   Subject      -> description.abstract
         *   Keywords     -> subject.other
         *  date is java.util.Calendar
         */
        PDDocument pd = new PDDocument(cos);
        PDDocumentInformation docinfo = pd.getDocumentInformation();
        String title = docinfo.getTitle();

        // sanity check: item must have a title.
        if (title == null) {
            throw new MetadataValidationException(
                "This PDF file is unacceptable, it does not have a value for \"Title\" in its Info dictionary.");
        }
        if (log.isDebugEnabled()) {
            log.debug("PDF Info dict title=\"" + title + "\"");
        }
        itemService.addMetadata(context, item, MetadataSchemaEnum.DC.getName(), "title", null, "en", title);

        String value = docinfo.getAuthor();
        if (value != null) {
            itemService.addMetadata(context, item, MetadataSchemaEnum.DC.getName(),
                                    "contributor", "author", null, value);
            if (log.isDebugEnabled()) {
                log.debug("PDF Info dict author=\"" + value + "\"");
            }
        }

        value = docinfo.getCreator();
        if (value != null) {
            itemService.addMetadata(context, item, MetadataSchemaEnum.DC.getName(),
                                    "description", "provenance", "en",
                                    "Application that created the original document: " + value);
        }

        value = docinfo.getProducer();
        if (value != null) {
            itemService.addMetadata(context, item, MetadataSchemaEnum.DC.getName(),
                                    "description", "provenance", "en",
                                    "Original document converted to PDF by: " + value);
        }

        value = docinfo.getSubject();
        if (value != null) {
            itemService
                .addMetadata(context, item, MetadataSchemaEnum.DC.getName(),
                             "description", "abstract", null, value);
        }

        value = docinfo.getKeywords();
        if (value != null) {
            itemService.addMetadata(context, item, MetadataSchemaEnum.DC.getName(),
                                    "subject", "other", null, value);
        }

        // Take either CreationDate or ModDate as "date.created",
        // Too bad there's no place to put "last modified" in the DC.
        Calendar calValue = docinfo.getCreationDate();
        if (calValue == null) {
            calValue = docinfo.getModificationDate();
        }

        if (calValue != null) {
            itemService.addMetadata(context, item, MetadataSchemaEnum.DC.getName(), "date", "created", null,
                                    (new DCDate(calValue.getTime())).toString());
        }
        itemService.update(context, item);
    } finally {
        try {
            if (cos != null) {
                cos.close();
            }
        } finally {
            // Release the random-access source even if closing the document
            // failed. NOTE(review): per the PDFBox 2.x docs this also removes
            // the temporary copy the wrapper makes of the stream -- confirm
            // against the PDFBox version in use.
            if (source != null) {
                source.close();
            }
        }
    }
}
/**
 * Returns a user help string which should describe the
 * additional valid command-line options that this packager
 * implementation will accept when using the {@code -o} or
 * {@code --option} flags with the Packager script.
 * This packager's help text states that there are none.
 *
 * @return a string describing additional command-line options available
 * with this packager
 */
@Override
public String getParameterHelp() {
    final String helpText = "No additional options available.";
    return helpText;
}
}