// org.lockss.pdf.PdfUtil Maven / Gradle / Ivy
// Show all versions of lockss-core Show documentation
/*
* $Id$
*/
/*
Copyright (c) 2000-2016 Board of Trustees of Leland Stanford Jr. University,
all rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
STANFORD UNIVERSITY BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
Except as contained in this notice, the name of Stanford University shall not
be used in advertising or otherwise to promote the sale, use or other dealings
in this Software without prior written authorization from Stanford University.
*/
package org.lockss.pdf;
import java.io.*;
import java.util.*;
import java.util.regex.*;
import org.apache.commons.lang3.StringEscapeUtils;
import org.lockss.config.*;
import org.lockss.plugin.*;
import org.lockss.plugin.definable.DefinableArchivalUnit;
import org.lockss.util.*;
import org.lockss.util.io.DeferredTempFileOutputStream;
/**
*
* PDF-related utilities.
*
*
* @author Thib Guicherd-Callin
* @since 1.56
*/
public class PdfUtil {
/**
 * The configuration prefix for this class ({@value}): the global
 * {@link Configuration#PREFIX} followed by {@code "pdf."}. Parameter names
 * defined by this class are built by appending to this prefix.
 *
 * @since 1.56
 */
public static final String CONFIG_PREFIX = Configuration.PREFIX + "pdf.";
/**
 * Default value used for {@link #PARAM_PDF_MEMORY_LIMIT} when it is not set:
 * 5 MB, written as a byte count (5 * 1024 * 1024).
 */
public static final int DEFAULT_PDF_MEMORY_LIMIT = 5 * 1024 * 1024;
/**
 * Name of the configuration parameter holding the size above which a filtered
 * PDF file is transferred from memory to a temporary file. May also be used
 * by other PDF operations that require in-memory processing.
 * <p>
 * NOTE(review): this value has historically been described as a number of
 * megabytes, but the default ({@link #DEFAULT_PDF_MEMORY_LIMIT}) is a byte
 * count, so the unit is presumably bytes -- confirm against
 * {@link DeferredTempFileOutputStream}.
 */
public static final String PARAM_PDF_MEMORY_LIMIT =
CONFIG_PREFIX + "pdfMemoryLimit";
/**
 * A suggested prefix for title database attributes conveying hints about PDF
 * filter factories. {@link #getPdfHint(ArchivalUnit)} uses it as the first
 * component of the attribute name it looks up.
 *
 * @see DefinableArchivalUnit#SUFFIX_FILTER_FACTORY
 * @see DefinableArchivalUnit#SUFFIX_HASH_FILTER_FACTORY
 * @see #getPdfHint(ArchivalUnit)
 */
public static final String PREFIX_PDF_FILTER_FACTORY_HINT = "hint_";
/**
 * A logger for use by this class; currently used to report failures when
 * closing a PDF document in {@link #safeClose(PdfDocument)}.
 *
 * @since 1.56
 */
private static final Logger log = Logger.getLogger(PdfUtil.class);
/**
 * Saves the given PDF document and returns the result as an input stream,
 * using the memory limit currently configured through
 * {@link #PARAM_PDF_MEMORY_LIMIT} (as reported by
 * {@link #getPdfMemoryLimit()}).
 *
 * @param pdfDocument
 *          A PDF document.
 * @return The saved PDF document, as an input stream.
 * @throws PdfException
 *           If processing fails at the PDF level.
 * @throws IOException
 *           If processing fails at the I/O level.
 * @see #asInputStream(PdfDocument, int)
 * @see #getPdfMemoryLimit()
 */
public static InputStream asInputStream(PdfDocument pdfDocument)
    throws PdfException, IOException {
  // Resolve the configured limit first, then delegate to the general form.
  final int configuredLimit = getPdfMemoryLimit();
  return asInputStream(pdfDocument, configuredLimit);
}
/**
 * Saves the given PDF document and returns the result as an input stream,
 * staying in memory if possible up to the given limit.
 * <p>
 * The document is written to a {@link DeferredTempFileOutputStream} created
 * with the given limit. The returned stream is the one produced by
 * {@link DeferredTempFileOutputStream#getDeleteOnCloseInputStream()}.
 * <p>
 * If saving or closing fails, the output stream is closed (best effort) and
 * its temporary file, if any, is deleted before the original exception is
 * rethrown. A failure during that cleanup close is attached to the original
 * exception as a suppressed exception.
 * <p>
 * NOTE(review): the parameter name and earlier documentation speak of
 * megabytes, but the value is passed unchanged to the stream constructor and
 * the configured default ({@link #DEFAULT_PDF_MEMORY_LIMIT}) is a byte count,
 * so the unit is presumably bytes -- confirm against
 * {@link DeferredTempFileOutputStream}.
 *
 * @param pdfDocument
 *          A PDF document.
 * @param memoryLimitMb
 *          The amount of memory up to which processing can be done entirely
 *          in memory (see the note above regarding the unit).
 * @return The saved PDF document, as an input stream.
 * @throws PdfException
 *           If processing fails at the PDF level.
 * @throws IOException
 *           If processing fails at the I/O level.
 * @see DeferredTempFileOutputStream
 */
public static InputStream asInputStream(PdfDocument pdfDocument,
                                        int memoryLimitMb)
    throws PdfException, IOException {
  DeferredTempFileOutputStream os = new DeferredTempFileOutputStream(memoryLimitMb);
  try {
    pdfDocument.save(os);
    os.close();
    return os.getDeleteOnCloseInputStream();
  }
  catch (PdfException | IOException | RuntimeException e) {
    // Release the stream before removing its backing file: when save() fails
    // the stream has not been closed yet, and leaving it open would leak the
    // underlying handle.
    try {
      os.close();
    }
    catch (Exception closeExc) {
      // Cleanup must never mask the original failure; keep it as suppressed.
      if (closeExc != e) {
        e.addSuppressed(closeExc);
      }
    }
    finally {
      os.deleteTempFile();
    }
    throw e;
  }
}
/**
 * Returns the token factory associated with the given document factory.
 * Equivalent to {@code pdfDocumentFactory.getTokenFactory()}.
 *
 * @param pdfDocumentFactory
 *          A PDF document factory instance.
 * @return A PDF token factory instance.
 * @since 1.70
 */
public static PdfTokenFactory getTokenFactory(PdfDocumentFactory pdfDocumentFactory) {
  final PdfTokenFactory tokenFactory = pdfDocumentFactory.getTokenFactory();
  return tokenFactory;
}
/**
 * Returns the token factory associated with the given document, by way of
 * the document's document factory. Equivalent to
 * {@code pdfDocument.getDocumentFactory().getTokenFactory()}.
 *
 * @param pdfDocument
 *          A PDF document instance.
 * @return A PDF token factory instance.
 * @since 1.70
 */
public static PdfTokenFactory getTokenFactory(PdfDocument pdfDocument) {
  final PdfDocumentFactory documentFactory = pdfDocument.getDocumentFactory();
  return getTokenFactory(documentFactory);
}
/**
 * Returns the token factory associated with the given page, by way of the
 * document the page belongs to. Equivalent to
 * {@code pdfPage.getDocument().getDocumentFactory().getTokenFactory()}.
 *
 * @param pdfPage
 *          A PDF page instance.
 * @return A PDF token factory instance.
 * @since 1.70
 */
public static PdfTokenFactory getTokenFactory(PdfPage pdfPage) {
  final PdfDocument owningDocument = pdfPage.getDocument();
  return getTokenFactory(owningDocument);
}
/**
 * Returns the token factory associated with the given token stream, by way
 * of the page the stream belongs to. Equivalent to
 * {@code pdfTokenStream.getPage().getDocument().getDocumentFactory().getTokenFactory()},
 * except that a token stream without a page yields {@code null}.
 *
 * @param pdfTokenStream
 *          A PDF token stream instance.
 * @return A PDF token factory instance, or {@code null} if the token stream
 *         reports no page.
 * @since 1.70
 */
public static PdfTokenFactory getTokenFactory(PdfTokenStream pdfTokenStream) {
  PdfPage owningPage = pdfTokenStream.getPage();
  // Fake token streams such as MockPdfTokenStream may not have a page.
  if (owningPage == null) {
    return null;
  }
  return getTokenFactory(owningPage);
}
/**
 * Retrieves from the title database the value of a special attribute the
 * given AU may have, that is used by convention to direct a PDF filter
 * factory to use a particular PDF transformation for that AU. The attribute
 * name is the concatenation of {@link #PREFIX_PDF_FILTER_FACTORY_HINT},
 * {@link Constants#MIME_TYPE_PDF} and
 * {@link DefinableArchivalUnit#SUFFIX_HASH_FILTER_FACTORY}.
 *
 * @param au
 *          An archival unit.
 * @return The value of the PDF hint attribute, or {@code null} if unset.
 */
public static String getPdfHint(ArchivalUnit au) {
  // Assemble the attribute name piece by piece: prefix, MIME type, suffix.
  StringBuilder attributeName = new StringBuilder(PREFIX_PDF_FILTER_FACTORY_HINT);
  attributeName.append(Constants.MIME_TYPE_PDF);
  attributeName.append(DefinableArchivalUnit.SUFFIX_HASH_FILTER_FACTORY);
  return AuUtil.getTitleAttribute(au, attributeName.toString());
}
/**
 * Reads the PDF memory limit from the current configuration: the value of
 * {@link #PARAM_PDF_MEMORY_LIMIT}, with {@link #DEFAULT_PDF_MEMORY_LIMIT}
 * supplied as the default.
 *
 * @return The value of {@link #PARAM_PDF_MEMORY_LIMIT}/
 *         {@link #DEFAULT_PDF_MEMORY_LIMIT} in the current configuration.
 * @since 1.56
 */
public static int getPdfMemoryLimit() {
  final Configuration currentConfig = CurrentConfig.getCurrentConfig();
  return currentConfig.getInt(PARAM_PDF_MEMORY_LIMIT, DEFAULT_PDF_MEMORY_LIMIT);
}
/**
 * Normalizes all the token streams of every page of the given PDF document,
 * by applying {@link #normalizeAllTokenStreams(PdfPage)} to each page in
 * turn.
 *
 * @param pdfDocument
 *          A PDF document.
 * @throws PdfException
 *           If PDF processing fails.
 * @since 1.56
 * @see #normalizeAllTokenStreams(PdfPage)
 * @see #normalizeTokenStream(PdfTokenStream)
 */
public static void normalizeAllTokenStreams(PdfDocument pdfDocument)
    throws PdfException {
  for (PdfPage page : pdfDocument.getPages()) {
    normalizeAllTokenStreams(page);
  }
}
/**
 * Normalizes every token stream reported by the given PDF page, by applying
 * {@link #normalizeTokenStream(PdfTokenStream)} to each one in turn.
 *
 * @param pdfPage
 *          A PDF page.
 * @throws PdfException
 *           If PDF processing fails.
 * @since 1.56
 * @see #normalizeTokenStream(PdfTokenStream)
 */
public static void normalizeAllTokenStreams(PdfPage pdfPage)
    throws PdfException {
  for (PdfTokenStream stream : pdfPage.getAllTokenStreams()) {
    normalizeTokenStream(stream);
  }
}
/**
 * Normalizes only the page token stream of the given PDF page (as opposed to
 * all of the page's token streams).
 *
 * @param pdfPage
 *          A PDF page.
 * @throws PdfException
 *           If PDF processing fails.
 * @since 1.56
 * @see #normalizeTokenStream(PdfTokenStream)
 */
public static void normalizePageTokenStream(PdfPage pdfPage)
    throws PdfException {
  final PdfTokenStream pageStream = pdfPage.getPageTokenStream();
  normalizeTokenStream(pageStream);
}
/**
 * Normalizes the page token stream of each page of the given PDF document.
 *
 * @param pdfDocument
 *          A PDF document.
 * @throws PdfException
 *           If PDF processing fails.
 * @since 1.56
 * @see #normalizePageTokenStream(PdfPage)
 * @see #normalizeTokenStream(PdfTokenStream)
 */
public static void normalizePageTokenStreams(PdfDocument pdfDocument)
    throws PdfException {
  for (PdfPage page : pdfDocument.getPages()) {
    // Same work as normalizePageTokenStream(page), written out in place.
    normalizeTokenStream(page.getPageTokenStream());
  }
}
/**
 * Reads all the tokens from the given stream, then writes them straight back
 * to it.
 * <p>
 * The purpose of this seemingly idempotent operation is to force the
 * underlying stream implementation to unravel any parts of the stream it may
 * have been able to delay interpreting until the stream is accessed, such as
 * decoding a filtered stream.
 *
 * @param pdfTokenStream
 *          A token stream.
 * @throws PdfException
 *           If PDF processing fails.
 * @since 1.56
 */
public static void normalizeTokenStream(PdfTokenStream pdfTokenStream)
    throws PdfException {
  // Read first, then write back exactly what was read.
  final List<PdfToken> currentTokens = pdfTokenStream.getTokens();
  pdfTokenStream.setTokens(currentTokens);
}
/**
 * Sets the ID array of the given PDF document to one consisting of the
 * arbitrary ID string {@code "12345678901234567890123456789012"} twice.
 *
 * @param pdfDocument
 *          A PDF document.
 * @throws PdfException
 *           If PDF processing fails.
 * @since 1.56
 * @see #setTrailerId(PdfDocument, String, String)
 */
public static void normalizeTrailerId(PdfDocument pdfDocument)
    throws PdfException {
  // The same fixed value is used for both entries of the ID array.
  final String fixedId = "12345678901234567890123456789012";
  setTrailerId(pdfDocument, fixedId, fixedId);
}
/**
 * Renders the given PDF token as a human-readable string, using
 * {@link #prettyPrint(StringBuilder, PdfToken)} on a fresh string builder.
 *
 * @param pdfToken
 *          A PDF token.
 * @return A string representing the token.
 * @see #prettyPrint(StringBuilder, PdfToken)
 */
public static String prettyPrint(PdfToken pdfToken) {
  final StringBuilder rendered = new StringBuilder();
  prettyPrint(rendered, pdfToken);
  return rendered.toString();
}
/**
 * Appends a human-readable version of the given token to the given string
 * builder.
 * <p>
 * The output is always wrapped in square brackets. Inside, a type label is
 * followed by the value: {@code array:} followed by each element rendered
 * recursively, {@code boolean:}, {@code dictionary:} followed by
 * {@code key=value} pairs separated by {@code ;} (values rendered
 * recursively), {@code float:}, {@code integer:}, {@code name:},
 * {@code null}, {@code object:} followed by the wrapped token rendered
 * recursively, {@code operator:}, or {@code string:} followed by the
 * double-quoted value. Dictionary keys, names, operators and strings are
 * escaped with {@link StringEscapeUtils#escapeJava(String)}. A token that
 * matches none of these types is rendered as {@code []}.
 *
 * @param sb
 *          A string builder.
 * @param pdfToken
 *          A PDF token.
 * @since 1.57
 */
public static void prettyPrint(StringBuilder sb, PdfToken pdfToken) {
  sb.append("[");
  if (pdfToken.isArray()) {
    sb.append("array:");
    // Elements need no separator: each one carries its own brackets.
    for (PdfToken arrayToken : pdfToken.getArray()) {
      prettyPrint(sb, arrayToken);
    }
  } else if (pdfToken.isBoolean()) {
    sb.append("boolean:");
    sb.append(Boolean.toString(pdfToken.getBoolean()));
  } else if (pdfToken.isDictionary()) {
    boolean first = true;
    sb.append("dictionary:");
    // Typed entries: keys are escaped as strings, values rendered as tokens.
    for (Map.Entry<String, PdfToken> entry : pdfToken.getDictionary().entrySet()) {
      if (first) {
        first = false;
      } else {
        sb.append(";");
      }
      sb.append(StringEscapeUtils.escapeJava(entry.getKey()));
      sb.append("=");
      prettyPrint(sb, entry.getValue());
    }
  } else if (pdfToken.isFloat()) {
    sb.append("float:");
    sb.append(Float.toString(pdfToken.getFloat()));
  } else if (pdfToken.isInteger()) {
    sb.append("integer:");
    sb.append(Long.toString(pdfToken.getInteger()));
  } else if (pdfToken.isName()) {
    sb.append("name:");
    sb.append(StringEscapeUtils.escapeJava(pdfToken.getName()));
  } else if (pdfToken.isNull()) {
    sb.append("null");
  } else if (pdfToken.isObject()) {
    sb.append("object:");
    prettyPrint(sb, pdfToken.getObject());
  } else if (pdfToken.isOperator()) {
    sb.append("operator:");
    sb.append(StringEscapeUtils.escapeJava(pdfToken.getOperator()));
  } else if (pdfToken.isString()) {
    sb.append("string:\"");
    sb.append(StringEscapeUtils.escapeJava(pdfToken.getString()));
    sb.append("\"");
  }
  sb.append("]");
}
/**
 * Closes the given PDF document if it is not {@code null}. A
 * {@link PdfException} thrown by {@link PdfDocument#close()} is logged at
 * the {@code debug2} level and otherwise ignored.
 *
 * @param pdfDocument
 *          A PDF document, possibly {@code null}.
 * @since 1.56
 * @see PdfDocument#close()
 */
public static void safeClose(PdfDocument pdfDocument) {
  if (pdfDocument == null) {
    return; // Nothing to close
  }
  try {
    pdfDocument.close();
  } catch (PdfException closeFailure) {
    log.debug2("Error closing a PDF document", closeFailure);
  }
}
/**
 * Sets the trailer ID array of the given PDF document to one consisting of
 * the two given strings.
 * <p>
 * The trailer mapping obtained from the document is updated so that its
 * {@link PdfNames#ID} entry is a two-element array made of the given
 * strings, and the mapping is then handed back to the document with
 * {@link PdfDocument#setTrailer}.
 *
 * @param pdfDocument
 *          A PDF document.
 * @param id0
 *          The first string in the trailer ID array.
 * @param id1
 *          The second string in the trailer ID array.
 * @throws PdfException
 *           If PDF processing fails.
 * @since 1.56
 */
public static void setTrailerId(PdfDocument pdfDocument,
                                String id0,
                                String id1)
    throws PdfException {
  PdfTokenFactory pdfTokenFactory = PdfUtil.getTokenFactory(pdfDocument);
  Map<String, PdfToken> trailerMapping = pdfDocument.getTrailer();
  trailerMapping.remove(PdfNames.ID);
  // Exactly two entries: the first and second trailer ID strings.
  List<PdfToken> idArray = new ArrayList<>(2);
  idArray.add(pdfTokenFactory.makeString(id0));
  idArray.add(pdfTokenFactory.makeString(id1));
  trailerMapping.put(PdfNames.ID, pdfTokenFactory.makeArray(idArray));
  pdfDocument.setTrailer(trailerMapping);
}
/**
 * Private constructor: this class only offers static members and cannot be
 * instantiated.
 *
 * @since 1.56
 */
private PdfUtil() {
// Prevent instantiation
}
}