com.sun.pdfview.PDFFile Maven / Gradle / Ivy
Show all versions of pdf-renderer Show documentation
/*
* $Id: PDFFile.java,v 1.19 2010-05-23 22:07:05 lujke Exp $
*
* Copyright 2004 Sun Microsystems, Inc., 4150 Network Circle,
* Santa Clara, California 95054, U.S.A. All rights reserved.
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
package com.sun.pdfview;
import java.awt.geom.Rectangle2D;
import java.io.*;
import java.nio.ByteBuffer;
import java.util.*;
import com.sun.pdfview.action.*;
import com.sun.pdfview.decode.PDFDecoder;
import com.sun.pdfview.decrypt.*;
/**
* An encapsulation of a .pdf file. The methods of this class
* can parse the contents of a PDF file, but those methods are
* hidden. Instead, the public methods of this class allow
* access to the pages in the PDF file. Typically, you create
* a new PDFFile, ask it for the number of pages, and then
* request one or more PDFPages.
* @author Mike Wessler
*/
public class PDFFile {
public final static int NUL_CHAR = 0;
public final static int FF_CHAR = 12;
private String versionString = "1.1";
private int majorVersion = 1;
private int minorVersion = 1;
/** the end of line character */
/** the comment text to begin the file to determine it's version */
private final static String VERSION_COMMENT = "%PDF-";
/**
* A ByteBuffer containing the file data
*/
ByteBuffer fileBuf;
/**
* the cross reference table mapping object numbers to locations
* in the PDF file
*/
PDFXrefEntry[] xrefEntries;
/** the root PDFObject, as specified in the PDF file */
PDFObject root = null;
/** the Encrypt PDFObject, from the trailer */
PDFObject encrypt = null;
/** The Info PDFPbject, from the trailer, for simple metadata */
PDFObject info = null;
/** a mapping of page numbers to parsed PDF commands */
Cache cache;
/**
* whether the file is printable or not (trailer -> Encrypt -> P & 0x4)
*/
private boolean printable = true;
/**
* whether the file is saveable or not (trailer -> Encrypt -> P & 0x10)
*/
private boolean saveable = true;
/**
* The default decrypter for streams and strings. By default, no
* encryption is expected, and thus the IdentityDecrypter is used.
*/
private PDFDecrypter defaultDecrypter = IdentityDecrypter.getInstance();
/**
* The file identifier, as found in a trailer/xref stream dictionary,
* and as used for encryption
*/
private PDFObject fileIdentifier = null;
/**
* get a PDFFile from a .pdf file. The file must me a random access file
* at the moment. It should really be a file mapping from the nio package.
*
* Use the getPage(...) methods to get a page from the PDF file.
* @param buf the RandomAccessFile containing the PDF.
* @throws IOException if there's a problem reading from the buffer
* @throws PDFParseException if the document appears to be malformed, or
* its features are unsupported. If the file is encrypted in a manner that
* the product or platform does not support then the exception's {@link
* PDFParseException#getCause() cause} will be an instance of {@link
* UnsupportedEncryptionException}.
* @throws PDFAuthenticationFailureException if the file is password
* protected and requires a password
*/
public PDFFile(ByteBuffer buf) throws IOException {
this(buf, null);
}
/**
* get a PDFFile from a .pdf file. The file must me a random access file
* at the moment. It should really be a file mapping from the nio package.
*
* Use the getPage(...) methods to get a page from the PDF file.
* @param buf the RandomAccessFile containing the PDF.
* @param password the user or owner password
* @throws IOException if there's a problem reading from the buffer
* @throws PDFParseException if the document appears to be malformed, or
* its features are unsupported. If the file is encrypted in a manner that
* the product or platform does not support then the exception's {@link
* PDFParseException#getCause() cause} will be an instance of {@link
* UnsupportedEncryptionException}.
* @throws PDFAuthenticationFailureException if the file is password
* protected and the supplied password does not decrypt the document
*/
public PDFFile(ByteBuffer buf, PDFPassword password) throws IOException {
this.fileBuf = buf;
cache = new Cache();
parseFile(password);
}
/**
* Gets whether the owner of the file has given permission to print
* the file.
* @return true if it is okay to print the file
*/
public boolean isPrintable() {
return printable;
}
/**
* Gets whether the owner of the file has given permission to save
* a copy of the file.
* @return true if it is okay to save the file
*/
public boolean isSaveable() {
return saveable;
}
/**
* get the root PDFObject of this PDFFile. You generally shouldn't need
* this, but we've left it open in case you want to go spelunking.
*/
public PDFObject getRoot() {
return root;
}
/**
* return the number of pages in this PDFFile. The pages will be
* numbered from 1 to getNumPages(), inclusive.
*/
public int getNumPages() {
try {
return root.getDictRef("Pages").getDictRef("Count").getIntValue();
} catch (Exception ioe) {
return 0;
}
}
/**
* Get metadata (e.g., Author, Title, Creator) from the Info dictionary
* as a string.
* @param name the name of the metadata key (e.g., Author)
* @return the info
* @throws IOException if the metadata cannot be read
*/
public String getStringMetadata(String name)
throws IOException {
if (info != null) {
final PDFObject meta = info.getDictRef(name);
return meta != null ? meta.getTextStringValue() : null;
} else {
return null;
}
}
/**
* Get the keys into the Info metadata, for use with
* {@link #getStringMetadata(String)}
* @return the keys present into the Info dictionary
* @throws IOException if the keys cannot be read
*/
public Iterator getMetadataKeys()
throws IOException {
if (info != null) {
return info.getDictKeys();
} else {
return Collections.emptyList().iterator();
}
}
/**
* Used internally to track down PDFObject references. You should never
* need to call this.
*
* Since this is the only public method for tracking down PDF objects,
* it is synchronized. This means that the PDFFile can only hunt down
* one object at a time, preventing the file's location from getting
* messed around.
*
* This call stores the current buffer position before any changes are made
* and restores it afterwards, so callers need not know that the position
* has changed.
*
*/
public synchronized PDFObject dereference(PDFXref ref, PDFDecrypter decrypter)
throws IOException {
int id = ref.getObjectNumber();
// make sure the id is valid and has been read
if (id >= xrefEntries.length || id < 0) {
return PDFObject.nullObj;
}
// if there is an entry, make sure that it can resolve to the
// requested generation number and that it's not a free entry; if
// so, we should return the null object
final PDFXrefEntry entry = xrefEntries[id];
if (entry == null || !entry.resolves(ref)) {
return PDFObject.nullObj;
}
// check to see if this is already dereferenced
PDFObject obj = entry.getObject();
if (obj != null) {
return obj;
}
switch (entry.getType()) {
case OBJ_IN_BODY:
int loc = entry.getOffset();
if (loc < 0) {
return PDFObject.nullObj;
}
// store the current position in the buffer
int startPos = fileBuf.position();
// move to where this object is
fileBuf.position(loc);
// read the object and cache the reference
obj= readObject(fileBuf, ref.getObjectNumber(), ref.getGeneration(), decrypter);
if (obj == null) {
obj = PDFObject.nullObj;
}
entry.setObject(obj);
// reset to the previous position
fileBuf.position(startPos);
return obj;
case OBJ_IN_STREAM:
final PDFObject stream =
dereference(entry.getStream(), getDefaultDecrypter());
if (stream == null || stream.getType() != PDFObject.STREAM || !"ObjStm".equals(stream.getDictRef("Type").getStringValue())) {
throw new PDFParseException(entry.getStream().getObjectNumber() +
" is not an object stream, but was referenced in " +
"the xref stream as one");
}
final ByteBuffer streamBuf = stream.getStreamBuffer();
final PDFXrefEntry streamSourceEntry = xrefEntries[entry.getStream().getObjectNumber()];
int[] offsets = streamSourceEntry.getObjectIndexOffsets();
if (offsets == null) {
offsets = new int[stream.getDictionary().get("N").getIntValue()];
int first = stream.getDictionary().get("First").getIntValue();
for (int i = 0; i < offsets.length; ++i) {
// we don't need the object number
final PDFObject objNum =
readObject(streamBuf, -1, -1,
IdentityDecrypter.getInstance());
// add in the initial offset represented by First here
offsets[i] = first +
readObject(streamBuf, -1, -1,
IdentityDecrypter.getInstance()).
getIntValue();
}
streamSourceEntry.setObjectIndexOffsets(offsets);
}
if (entry.getOffset() < 0 || entry.getOffset() >= offsets.length) {
throw new PDFParseException("Xref references index that does not exist in stream");
}
streamBuf.position(offsets[entry.getOffset()]);
// According to the PDF spec:
// "Any strings that are inside streams such as content streams
// and compressed object streams, which themselves are
// encrypted"
// So, we figure out whether the containing stream was
// encrypted or not; unfortunately, we don't have this
// cached anywhere. If the stream was encrypted, we make
// sure we don't attempt to decrypt any strings within.
obj= readObject(streamBuf, ref.getObjectNumber(), ref.getGeneration(),
PDFDecoder.isEncrypted(stream) ?
IdentityDecrypter.getInstance() :
getDefaultDecrypter());
if (obj == null) {
obj = PDFObject.nullObj;
}
entry.setObject(obj);
return obj;
case FREE:
// this case should in practice be covered by the
// call to entry.resolves() above
return PDFObject.nullObj;
default:
throw new UnsupportedOperationException(
"Don't know how to handle xref type " +
entry.getType());
}
}
/**
* Is the argument a white space character according to the PDF spec?.
* ISO Spec 32000-1:2008 - Table 1
*/
public static boolean isWhiteSpace(int c) {
switch (c) {
case NUL_CHAR: // Null (NULL)
case '\t': // Horizontal Tab (HT)
case '\n': // Line Feed (LF)
case FF_CHAR: // Form Feed (FF)
case '\r': // Carriage Return (CR)
case ' ': // Space (SP)
return true;
default:
return false;
}
}
/**
* Is the argument a delimiter according to the PDF spec?
*
* ISO 32000-1:2008 - Table 2
*
* @param c the character to test
*/
public static boolean isDelimiter(int c) {
switch (c) {
case '(': // LEFT PARENTHESIS
case ')': // RIGHT PARENTHESIS
case '<': // LESS-THAN-SIGN
case '>': // GREATER-THAN-SIGN
case '[': // LEFT SQUARE BRACKET
case ']': // RIGHT SQUARE BRACKET
case '{': // LEFT CURLY BRACKET
case '}': // RIGHT CURLY BRACKET
case '/': // SOLIDUS
case '%': // PERCENT SIGN
return true;
default:
return false;
}
}
/**
* return true if the character is neither a whitespace or a delimiter.
*
* @param c the character to test
* @return boolean
*/
public static boolean isRegularCharacter (int c) {
return !(isWhiteSpace(c) || isDelimiter(c));
}
/**
* read the next object from the file
* @param buf the buffer to read from
* @param objNum the object number of the object containing the object
* being read; negative only if the object number is unavailable (e.g., if
* reading from the trailer, or reading at the top level, in which
* case we can expect to be reading an object description)
* @param objGen the object generation of the object containing the object
* being read; negative only if the objNum is unavailable
* @param decrypter the decrypter to use
*/
private PDFObject readObject(
ByteBuffer buf, int objNum, int objGen, PDFDecrypter decrypter) throws IOException {
return readObject(buf, objNum, objGen, false, decrypter);
}
/**
* read the next object with a special catch for numbers
* @param buf the buffer to read from
* @param objNum the object number of the object containing the object
* being read; negative only if the object number is unavailable (e.g., if
* reading from the trailer, or reading at the top level, in which
* case we can expect to be reading an object description)
* @param objGen the object generation of the object containing the object
* being read; negative only if the objNum is unavailable
* @param numscan if true, don't bother trying to see if a number is
* an object reference (used when already in the middle of testing for
* an object reference, and not otherwise)
* @param decrypter the decrypter to use
*/
private PDFObject readObject(
ByteBuffer buf, int objNum, int objGen,
boolean numscan, PDFDecrypter decrypter) throws IOException {
// skip whitespace
int c;
PDFObject obj = null;
while (obj == null) {
c = nextNonWhitespaceChar(buf);
// check character for special punctuation:
if (c == '<') {
// could be start of , or start of <>
c = buf.get();
if (c == '<') {
// it's a dictionary
obj= readDictionary(buf, objNum, objGen, decrypter);
} else {
buf.position(buf.position() - 1);
obj= readHexString(buf, objNum, objGen, decrypter);
}
} else if (c == '(') {
obj= readLiteralString(buf, objNum, objGen, decrypter);
} else if (c == '[') {
// it's an array
obj= readArray(buf, objNum, objGen, decrypter);
} else if (c == '/') {
// it's a name
obj = readName(buf);
} else if (c == '%') {
// it's a comment
readLine(buf);
} else if ((c >= '0' && c <= '9') || c == '-' || c == '+' || c == '.') {
// it's a number
obj = readNumber(buf, (char) c);
if (!numscan) {
// It could be the start of a reference.
// Check to see if there's another number, then "R".
//
// We can't use mark/reset, since this could be called
// from dereference, which already is using a mark
int startPos = buf.position();
PDFObject testnum= readObject(buf, -1, -1, true, decrypter);
if (testnum != null &&
testnum.getType() == PDFObject.NUMBER) {
PDFObject testR= readObject(buf, -1, -1, true, decrypter);
if (testR != null &&
testR.getType() == PDFObject.KEYWORD &&
testR.getStringValue().equals("R")) {
// yup. it's a reference.
PDFXref xref = new PDFXref(obj.getIntValue(),
testnum.getIntValue());
// Create a placeholder that will be dereferenced
// as needed
obj = new PDFObject(this, xref);
} else if (testR != null &&
testR.getType() == PDFObject.KEYWORD &&
testR.getStringValue().equals("obj")) {
// it's an object description
obj= readObjectDescription(
buf, obj.getIntValue(),
testnum.getIntValue(),
decrypter);
} else {
buf.position(startPos);
}
} else {
buf.position(startPos);
}
}
} else if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')) {
// it's a keyword
obj = readKeyword(buf, (char) c);
} else {
// it's probably a closing character.
// throwback
buf.position(buf.position() - 1);
break;
}
}
return obj;
}
/**
* Get the next non-white space character
* @param buf the buffer to read from
* @return the next non-whitespace character
*/
private int nextNonWhitespaceChar(ByteBuffer buf) {
int c;
while (isWhiteSpace(c = buf.get())) {
// nothing
}
return c;
}
/**
* Consume all sequential whitespace from the current buffer position,
* leaving the buffer positioned at non-whitespace
* @param buf the buffer to read from
*/
private void consumeWhitespace(ByteBuffer buf) {
nextNonWhitespaceChar(buf);
buf.position(buf.position() - 1);
}
/**
* requires the next few characters (after whitespace) to match the
* argument.
* @param buf the buffer to read from
* @param match the next few characters after any whitespace that
* must be in the file
* @return true if the next characters match; false otherwise.
*/
private boolean nextItemIs(ByteBuffer buf, String match) throws IOException {
// skip whitespace
int c = nextNonWhitespaceChar(buf);
for (int i = 0; i < match.length(); i++) {
if (i > 0) {
c = buf.get();
}
if (c != match.charAt(i)) {
return false;
}
}
return true;
}
/**
* process a version string, to determine the major and minor versions
* of the file.
*
* @param versionString
*/
private void processVersion(String versionString) {
try {
StringTokenizer tokens = new StringTokenizer(versionString, ".");
majorVersion = Integer.parseInt(tokens.nextToken());
minorVersion = Integer.parseInt(tokens.nextToken());
this.versionString = versionString;
} catch (Exception e) {
// ignore
}
}
/**
* return the major version of the PDF header.
*
* @return int
*/
public int getMajorVersion() {
return majorVersion;
}
/**
* return the minor version of the PDF header.
*
* @return int
*/
public int getMinorVersion() {
return minorVersion;
}
/**
* return the version string from the PDF header.
*
* @return String
*/
public String getVersionString() {
return versionString;
}
/**
* read an entire << dictionary >>. The initial
* << has already been read.
* @param buf the buffer to read from
* @param objNum the object number of the object containing the dictionary
* being read; negative only if the object number is unavailable, which
* should only happen if we're reading a dictionary placed directly
* in the trailer
* @param objGen the object generation of the object containing the object
* being read; negative only if the objNum is unavailable
* @param decrypter the decrypter to use
* @return the Dictionary as a PDFObject.
*/
private PDFObject readDictionary(
ByteBuffer buf, int objNum, int objGen, PDFDecrypter decrypter) throws IOException {
HashMap hm = new HashMap();
// we've already read the <<. Now get /Name obj pairs until >>
PDFObject name;
while ((name= readObject(buf, objNum, objGen, decrypter))!=null) {
// make sure first item is a NAME
if (name.getType() != PDFObject.NAME) {
throw new PDFParseException("First item in dictionary must be a /Name. (Was " + name + ")");
}
PDFObject value= readObject(buf, objNum, objGen, decrypter);
if (value != null) {
hm.put(name.getStringValue(), value);
}
}
// System.out.println("End of dictionary at location "+raf.getFilePointer());
if (!nextItemIs(buf, ">>")) {
throw new PDFParseException("End of dictionary wasn't '>>'");
}
// System.out.println("Dictionary closed at location "+raf.getFilePointer());
return new PDFObject(this, PDFObject.DICTIONARY, hm);
}
/**
* read a character, and return its value as if it were a hexidecimal
* digit.
* @return a number between 0 and 15 whose value matches the next
* hexidecimal character. Returns -1 if the next character isn't in
* [0-9a-fA-F]
* @param buf the buffer to read from
*/
private int readHexDigit(ByteBuffer buf) throws IOException {
int a = nextNonWhitespaceChar(buf);
switch (a) {
case '0': case '1': case '2': case '3': case '4':
case '5': case '6': case '7': case '8': case '9':
a -= '0';
break;
case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
a -= 'a' - 10;
break;
case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
a -= 'A' - 10;
break;
default:
a = -1;
break;
}
return a;
}
/**
* return the 8-bit value represented by the next two hex characters.
* If the next two characters don't represent a hex value, return -1
* and reset the read head. If there is only one hex character,
* return its value as if there were an implicit 0 after it.
* @param buf
*/
private int readHexPair(ByteBuffer buf) throws IOException {
int first = readHexDigit(buf);
if (first < 0) {
buf.position(buf.position() - 1);
return -1;
}
int second = readHexDigit(buf);
if (second < 0) {
buf.position(buf.position() - 1);
return (first << 4);
} else {
return (first << 4) + second;
}
}
/**
* read a < hex string >. The initial < has already been read.
* @param buf the buffer to read from
* @param objNum the object number of the object containing the dictionary
* being read; negative only if the object number is unavailable, which
* should only happen if we're reading a string placed directly
* in the trailer
* @param objGen the object generation of the object containing the object
* being read; negative only if the objNum is unavailable
* @param decrypter the decrypter to use
*/
private PDFObject readHexString(
ByteBuffer buf, int objNum, int objGen, PDFDecrypter decrypter) throws IOException {
// we've already read the <. Now get the hex bytes until >
int val;
StringBuffer sb = new StringBuffer();
while ((val = readHexPair(buf)) >= 0) {
sb.append((char) val);
}
if (buf.get() != '>') {
throw new PDFParseException("Bad character in Hex String");
}
return new PDFObject(this, PDFObject.STRING,
decrypter.decryptString(objNum, objGen, sb.toString()));
}
/**
* read a ( character string ). The initial ( has already been read.
* Read until a *balanced* ) appears.
*
* Section 3.2.3 of PDF Refernce version 1.7 defines the format of
* String objects. Regarding literal strings:
*
* Within a literal string, the backslash (\) is used as an
* escape character for various purposes, such as to include newline
* characters, nonprinting ASCII characters, unbalanced parentheses, or
* the backslash character itself in the string. The character
* immediately following the backslash determines its precise
* interpretation (see Table 3.2). If the character following the
* backslash is not one of those shown in the table, the backslash
* is ignored.
*
* * This only reads 8 bit basic character 'strings' so as to avoid a
* text string interpretation when one is not desired (e.g., for byte
* strings, as used by the decryption mechanism). For an interpretation of
* a string returned from this method, where the object type is defined
* as a 'text string' as per Section 3.8.1, Table 3.31 "PDF Data Types",
* {@link PDFStringUtil#asTextString} ()} or
* {@link PDFObject#getTextStringValue()} must be employed.
*
* @param buf the buffer to read from
* @param objNum the object number of the object containing the dictionary
* being read; negative only if the object number is unavailable, which
* should only happen if we're reading a dictionary placed directly
* in the trailer
* @param objGen the object generation of the object containing the object
* being read; negative only if the objNum is unavailable
* @param decrypter the decrypter to use
*/
private PDFObject readLiteralString(
ByteBuffer buf, int objNum, int objGen, PDFDecrypter decrypter) throws IOException {
int c;
// we've already read the (. now get the characters until a
// *balanced* ) appears. Translate \r \n \t \b \f \( \) \\ \ddd
// if a cr/lf follows a backslash, ignore the cr/lf
int parencount = 1;
StringBuffer sb = new StringBuffer();
while (parencount > 0) {
c = buf.get() & 0xFF;
// process unescaped parenthesis
if (c == '(') {
parencount++;
} else if (c == ')') {
parencount--;
if (parencount == 0) {
c = -1;
break;
}
} else if (c == '\\') {
// From the spec:
// Within a literal string, the backslash (\) is used as an
// escape character for various purposes, such as to include
// newline characters, nonprinting ASCII characters,
// unbalanced parentheses, or the backslash character itself
// in the string. The character immediately following the
// backslash determines its precise interpretation (see
// Table 3.2). If the character following the backslash is not
// one of those shown in the table, the backslash is ignored.
//
// summary of rules:
//
// \n \r \t \b \f 2-char sequences are used to represent their
// 1-char counterparts
//
// \( and \) are used to escape parenthesis
//
// \\ for a literal backslash
//
// \ddd (1-3 octal digits) for a character code
//
// \ is used to put formatting newlines into the
// file, but aren't actually part of the string; EOL may be
// CR, LF or CRLF
//
// any other sequence should see the backslash ignored
// grab the next character to see what we're dealing with
c = buf.get() & 0xFF;
if (c >= '0' && c < '8') {
// \ddd form - one to three OCTAL digits
int count = 0;
int val = 0;
while (c >= '0' && c < '8' && count < 3) {
val = val * 8 + c - '0';
c = buf.get() & 0xFF;
count++;
}
// we'll have read one character too many
buf.position(buf.position() - 1);
c = val;
} else if (c == 'n') {
c = '\n';
} else if (c == 'r') {
c = '\r';
} else if (c == 't') {
c = '\t';
} else if (c == 'b') {
c = '\b';
} else if (c == 'f') {
c = '\f';
} else if (c == '\r') {
// escaped CR to be ignored; look for a following LF
c = buf.get() & 0xFF;
if (c != '\n') {
// not an LF, we'll consume this character on
// the next iteration
buf.position(buf.position() - 1);
}
c = -1;
} else if (c == '\n') {
// escaped LF to be ignored
c = -1;
}
// any other c should be used as is, as it's either
// one of ()\ in which case it should be used literally,
// or the backslash should just be ignored
}
if (c >= 0) {
sb.append((char) c);
}
}
return new PDFObject(this, PDFObject.STRING,
decrypter.decryptString(objNum, objGen, sb.toString()));
}
/**
* Read a line of text. This follows the semantics of readLine() in
* DataInput -- it reads character by character until a '\n' is
* encountered. If a '\r' is encountered, it is discarded.
* @param buf the buffer to read from
*/
private String readLine(ByteBuffer buf) {
StringBuffer sb = new StringBuffer();
while (buf.remaining() > 0) {
char c = (char) buf.get();
if (c == '\r') {
if (buf.remaining() > 0) {
char n = (char) buf.get(buf.position());
if (n == '\n') {
buf.get();
}
}
break;
} else if (c == '\n') {
break;
}
sb.append(c);
}
return sb.toString();
}
/**
* read an [ array ]. The initial [ has already been read. PDFObjects
* are read until ].
* @param buf the buffer to read from
* @param objNum the object number of the object containing the dictionary
* being read; negative only if the object number is unavailable, which
* should only happen if we're reading an array placed directly
* in the trailer
* @param objGen the object generation of the object containing the object
* being read; negative only if the objNum is unavailable
* @param decrypter the decrypter to use
*/
private PDFObject readArray(
ByteBuffer buf, int objNum, int objGen, PDFDecrypter decrypter) throws IOException {
// we've already read the [. Now read objects until ]
ArrayList ary = new ArrayList();
PDFObject obj;
while((obj= readObject(buf, objNum, objGen, decrypter))!=null) {
ary.add(obj);
}
if (buf.get() != ']') {
throw new PDFParseException("Array should end with ']'");
}
PDFObject[] objlist = new PDFObject[ary.size()];
for (int i = 0; i < objlist.length; i++) {
objlist[i] = (PDFObject) ary.get(i);
}
return new PDFObject(this, PDFObject.ARRAY, objlist);
}
/**
* read a /name. The / has already been read.
* @param buf the buffer to read from
*/
private PDFObject readName(ByteBuffer buf) throws IOException {
// we've already read the / that begins the name.
// all we have to check for is #hh hex notations.
StringBuffer sb = new StringBuffer();
int c;
while (isRegularCharacter(c = buf.get())) {
if (c < '!' && c > '~') {
break; // out-of-range, should have been hex
}
// H.3.2.4 indicates version 1.1 did not do hex escapes
if (c == '#' && (majorVersion != 1 && minorVersion != 1)) {
int hex = readHexPair(buf);
if (hex >= 0) {
c = hex;
} else {
throw new PDFParseException("Bad #hex in /Name");
}
}
sb.append((char) c);
}
buf.position(buf.position() - 1);
return new PDFObject(this, PDFObject.NAME, sb.toString());
}
/**
* read a number. The initial digit or . or - is passed in as the
* argument.
*/
private PDFObject readNumber(ByteBuffer buf, char start) throws IOException {
// we've read the first digit (it's passed in as the argument)
boolean neg = start == '-';
boolean sawdot = start == '.';
double dotmult = sawdot ? 0.1 : 1;
double value = (start >= '0' && start <= '9') ? start - '0' : 0;
while (true) {
int c = buf.get();
if (c == '.') {
if (sawdot) {
throw new PDFParseException("Can't have two '.' in a number");
}
sawdot = true;
dotmult = 0.1;
} else if (c >= '0' && c <= '9') {
int val = c - '0';
if (sawdot) {
value += val * dotmult;
dotmult *= 0.1;
} else {
value = value * 10 + val;
}
} else {
buf.position(buf.position() - 1);
break;
}
}
if (neg) {
value = -value;
}
return new PDFObject(this, PDFObject.NUMBER, new Double(value));
}
/**
* read a bare keyword. The initial character is passed in as the
* argument.
*/
private PDFObject readKeyword(ByteBuffer buf, char start) throws IOException {
// we've read the first character (it's passed in as the argument)
StringBuffer sb = new StringBuffer(String.valueOf(start));
int c;
while (isRegularCharacter(c = buf.get())) {
sb.append((char) c);
}
buf.position(buf.position() - 1);
return new PDFObject(this, PDFObject.KEYWORD, sb.toString());
}
/**
* read an entire PDFObject. The intro line, which looks something
* like "4 0 obj" has already been read.
* @param buf the buffer to read from
* @param objNum the object number of the object being read, being
* the first number in the intro line (4 in "4 0 obj")
* @param objGen the object generation of the object being read, being
* the second number in the intro line (0 in "4 0 obj").
* @param decrypter the decrypter to use
*/
private PDFObject readObjectDescription(
ByteBuffer buf, int objNum, int objGen, PDFDecrypter decrypter) throws IOException {
// we've already read the 4 0 obj bit. Next thing up is the object.
// object descriptions end with the keyword endobj
long debugpos = buf.position();
PDFObject obj= readObject(buf, objNum, objGen, decrypter);
// see if it's a dictionary. If so, this could be a stream.
PDFObject endkey= readObject(buf, objNum, objGen, decrypter);
if (endkey.getType() != PDFObject.KEYWORD) {
throw new PDFParseException("Expected 'stream' or 'endobj'");
}
if (obj.getType() == PDFObject.DICTIONARY && endkey.getStringValue().equals("stream")) {
// skip until we see \n
readLine(buf);
ByteBuffer data = readStream(buf, obj);
if (data == null) {
data = ByteBuffer.allocate(0);
}
obj.setStream(data);
endkey= readObject(buf, objNum, objGen, decrypter);
}
// at this point, obj is the object, keyword should be "endobj"
String endcheck = endkey.getStringValue();
if (endcheck == null || !endcheck.equals("endobj")) {
System.out.println("WARNING: object at " + debugpos + " didn't end with 'endobj'");
//throw new PDFParseException("Object musst end with 'endobj'");
}
obj.setObjectId(objNum, objGen);
return obj;
}
/**
* read the stream portion of a PDFObject. Calls decodeStream to
* un-filter the stream as necessary.
*
* @param buf the buffer to read from
* @param dict the dictionary associated with this stream.
* @return a ByteBuffer with the encoded stream data
*/
private ByteBuffer readStream(ByteBuffer buf, PDFObject dict) throws IOException {
// pointer is at the start of a stream. read the stream and
// decode, based on the entries in the dictionary
PDFObject lengthObj = dict.getDictRef("Length");
int length = -1;
if (lengthObj != null) {
length = lengthObj.getIntValue();
}
if (length < 0) {
throw new PDFParseException("Unknown length for stream");
}
// slice the data
int start = buf.position();
ByteBuffer streamBuf = buf.slice();
streamBuf.limit(length);
// move the current position to the end of the data
buf.position(buf.position() + length);
int ending = buf.position();
if (!nextItemIs(buf, "endstream")) {
System.out.println("read " + length + " chars from " + start + " to " +
ending);
throw new PDFParseException("Stream ended inappropriately");
}
return streamBuf;
// now decode stream
// return PDFDecoder.decodeStream(dict, streamBuf);
}
/**
* read the cross reference table from a PDF file. When this method
* is called, the file pointer must point to the start of an xref table
* (i.e., to the start of the "xref" keyword) or an xref stream object.
* Reads the xref entries and populate xrefEntries. Also reads the
* trailer/xref stream dictionary to set root, fileIdentifier and encryption
* parameters. If /Prev entries are present, proceeds to read previous
* trailers and xrefs, too.
* @param password the password to use for decryption
*/
private void readTrailersAndXrefs(PDFPassword password)
throws
IOException,
PDFAuthenticationFailureException,
EncryptionUnsupportedByProductException,
EncryptionUnsupportedByPlatformException {
// the table of xrefs
// read a bunch of nested trailer tables
boolean furtherCrossrefsToRead = true;
while (furtherCrossrefsToRead) {
PDFObject header =
readObject(fileBuf, -1, -1, IdentityDecrypter.getInstance());
if (header.getType() == PDFObject.KEYWORD &&
"xref".equals(header.getStringValue())) {
furtherCrossrefsToRead = readCrossrefTableAndTrailer(password);
} else if (isXrefStream(header)) {
furtherCrossrefsToRead = readCrossrefStream(header, true);
} else {
throw new PDFParseException(
"Expected xref table or xref stream, but found " +
header);
}
}
// make sure we found a root
if (root == null) {
throw new PDFParseException("No /Root key found in trailer dictionary");
}
if (root.getDictRef("Version") != null) {
processVersion(root.getDictRef("Version").getStringValue());
}
// check what permissions are relevant
if (encrypt != null) {
defaultDecrypter =
PDFDecrypterFactory.createDecryptor(
encrypt,
fileIdentifier,
password);
PDFObject permissions = encrypt.getDictRef("P");
if (permissions!=null && !defaultDecrypter.isOwnerAuthorised()) {
int perms= permissions != null ? permissions.getIntValue() : 0;
if (permissions!=null) {
printable = (perms & 4) != 0;
saveable = (perms & 16) != 0;
}
}
}
// dereference the root object
root.dereference();
}
/**
* Identify whether a given PDFObject identifies itself as a crossreference
* stream
* @param header the object to test
* @return whether the object is an xref stream
* @throws IOException if there's a problem reading the header
*/
private boolean isXrefStream(PDFObject header) throws IOException {
return header.getType() == PDFObject.STREAM &&
"XRef".equals(header.getDictRef("Type").getStringValue());
}
/**
* Read entries from a xref table, and its trailer dictionary, which
* is expected to follow it
* @param password the password
* @return whether a previous crossref table/stream should be read; the
* buffer will have been positioned at its start point
* @throws IOException in case of a bad format, or IO problems
*/
private boolean readCrossrefTableAndTrailer(PDFPassword password) throws IOException {
// we're positioned at the start of a cross reference table
PDFObject headerObject;
while (true) {
// read until the word "trailer"
headerObject = readObject(fileBuf, -1, -1, IdentityDecrypter.getInstance());
if (headerObject.getType() != PDFObject.NUMBER) {
// we must be out of the cross-ref table!
break;
}
// each subsection will start with
//
//
// read them now:
int objNumStart = headerObject.getIntValue();
// read the size of the reference table
PDFObject sizeObj =
readObject(fileBuf, -1, -1, IdentityDecrypter.getInstance());
if (sizeObj.getType() != PDFObject.NUMBER) {
throw new PDFParseException("Expected number for length of xref table");
}
int numEntries = sizeObj.getIntValue();
final int lastObjNum = objNumStart + numEntries;
ensureXrefEntriesCapacity(lastObjNum + 1);
consumeWhitespace(fileBuf);
// read entry lines
final byte[] refline = new byte[20];
for (int objNum = objNumStart; objNum < lastObjNum; objNum++) {
// each reference line is 20 bytes long
fileBuf.get(refline);
// if xrefEntries already contains an entry for this
// object number then we've earlier read a xref
// for this object number from a later incremental
// upgrade
if (xrefEntries[objNum] == null) {
PDFXrefEntry entry;
final byte entryType = refline[17];
if (entryType == 'n') {
// active entry
int offset = Integer.parseInt(new String(refline, 0, 10));
int generation = Integer.parseInt(new String(refline, 11, 5));
final PDFXref ref = new PDFXref(objNum, generation);
entry = PDFXrefEntry.toBodyObject(generation, offset);
} else if (entryType == 'f') {
// freed entry
entry = PDFXrefEntry.forFreedObject();
} else {
throw new PDFParseException("Unknown xref entry type: "
+ entryType);
}
xrefEntries[objNum] = entry;
}
}
}
// at this point, the "trailer" word (not EOL) has been read, hopefully!
if (headerObject.getType() != PDFObject.KEYWORD ||
!"trailer".equals(headerObject.getStringValue())) {
throw new PDFParseException(
"Expected to find trailer immediately after xref table, " +
"but found " + headerObject + " instead");
}
PDFObject trailerdict = readObject(fileBuf, -1, -1, IdentityDecrypter.getInstance());
if (trailerdict.getType() != PDFObject.DICTIONARY) {
throw new PDFParseException("Expected dictionary after \"trailer\"");
}
return processTrailerDict(trailerdict, false, true);
}
/**
* Process a trailer or xref stream dictionary, recording root, info,
* encrypt and fileIdentifier members as appropriate. If a Prev entry
* is found, and followPrev is true, the buffer position is set to the
* location of a further xref table/stream to read
* @param trailerdict the trailer/xref-stream dictionary
* @param xrefStreamSource if the trailer comes from an xref stream, as
* opposed to an xref table
* @param followPrev whether Prev entries should be followed
* @return whether followPrev was set and a Prev entry was found, indicating
* that the buffer is now positioned to have another xref stream/table read
* @throws IOException if the file is badly formed, or in case of IO
* difficulties
*/
private boolean processTrailerDict(
PDFObject trailerdict,
boolean xrefStreamSource,
boolean followPrev) throws IOException {
// read the root object location
if (root == null) {
root = trailerdict.getDictRef("Root");
if (root != null) {
root.setObjectId(PDFObject.OBJ_NUM_TRAILER,
PDFObject.OBJ_NUM_TRAILER);
}
}
if (fileIdentifier == null) {
fileIdentifier = trailerdict.getDictRef("ID");
}
// read the encryption information
if (encrypt == null) {
encrypt = trailerdict.getDictRef("Encrypt");
if (encrypt != null) {
encrypt.setObjectId(PDFObject.OBJ_NUM_TRAILER,
PDFObject.OBJ_NUM_TRAILER);
}
}
if (info == null) {
info = trailerdict.getDictRef("Info");
if (info != null) {
if (!info.isIndirect()) {
throw new PDFParseException(
"Info in trailer must be an indirect reference");
}
info.setObjectId(PDFObject.OBJ_NUM_TRAILER,
PDFObject.OBJ_NUM_TRAILER);
}
}
if (!xrefStreamSource) {
PDFObject xrefStm = trailerdict.getDictRef("XRefStm");
if (xrefStm != null) {
// this is a hybrid reference file, read the
// cross-reference stream before any Prevs
fileBuf.position(xrefStm.getIntValue());
readCrossrefStream(null, false);
}
}
PDFObject prevloc = null;
if (followPrev) {
// read the location of the previous xref table
prevloc = trailerdict.getDictRef("Prev");
if (prevloc != null) {
fileBuf.position(prevloc.getIntValue());
}
}
return prevloc != null;
}
/**
* Read a Cross Reference Stream from the document
* @param xrefStream the xrefStream; if null
, the stream
* will be read from the current fileBuf position
* @param followPrev if the Prev entry from the dictionary should be
* followed to read a previous xref stream
* @return whether a Prev reference has been found and should be followed,
* in which case fileBuf will have been positioned to the start of the
* prev xref table/stream
* @throws IOException if the PDF is poorly formed
*/
private boolean readCrossrefStream(PDFObject xrefStream, boolean followPrev) throws IOException {
// the xref stream will have an object number but since there's no
// decryption involved, it doesn't matter
if (xrefStream == null) {
xrefStream = readObject(fileBuf, -1, -1, IdentityDecrypter.getInstance());
if (!isXrefStream(xrefStream)) {
throw new PDFParseException("Object found at offset for cross" +
" reference stream is not a cross reference stream");
}
}
final int size = xrefStream.getDictRef("Size").getIntValue();
ensureXrefEntriesCapacity(size);
final PDFObject[] wObjs = xrefStream.getDictRef("W").getArray();
final int[] fieldLengths = new int[3];
int entryLength = 0;
for (int i = 0; i < 3; ++i) {
fieldLengths[i] = wObjs[i].getIntValue();
entryLength += fieldLengths[i];
}
final PDFObject[] index;
final PDFObject indexObj = xrefStream.getDictRef("Index");
if (indexObj != null) {
index = indexObj.getArray();
} else {
index = new PDFObject[] {
new PDFObject(0),
new PDFObject(size)
};
}
final ByteBuffer table = xrefStream.getStreamBuffer();
for (int i = 0; i < index.length; i += 2) {
final int start = index[i].getIntValue();
final int end = start + index[i + 1].getIntValue();
for (int objNum = start; objNum < end; ++objNum) {
if (xrefEntries[objNum] == null) {
PDFXrefEntry.Type type;
if (fieldLengths[0] == 0) {
type = PDFXrefEntry.Type.OBJ_IN_BODY;
} else {
type = PDFXrefEntry.Type.forTypeField(
readInt(table, fieldLengths[0]));
}
int field2 = readInt(table, fieldLengths[1]);
// note that this is supposed to default to 0 if field 3
// length is 0 for type 1 entries, and that will work just fine
int field3 = readInt(table, fieldLengths[2]);
xrefEntries[objNum] =
type.makeXrefStreamEntry(field2, field3);
} else {
table.position(table.position() + entryLength);
}
}
}
return processTrailerDict(xrefStream, true, followPrev);
}
/**
* Read an numBytes-bytes big-endian unsigned int from a table
* @param table the table to read from
* @param numBytes the number of bytes to read
* @return the integer read; 0 if numBytes is 0
*/
private int readInt(ByteBuffer table, int numBytes) {
int val = 0;
while (numBytes-- > 0) {
final int b = table.get() & 0xFF;
val = (val << 8) | b;
}
return val;
}
/**
* Ensure that the xrefEntries table will support a given number of
* objects from 0-size. If we were to read the Size entry from a
* cross reference table before processing cross reference tables
* then we could immediately set it to the correct size, but until we
* do so, we'll just have to resize every now and then, though for
* most documents, no resizes should be required.
* @param size the required size of the xref table (i.e., the maximum
* object number plus 1)
*/
private void ensureXrefEntriesCapacity(int size) {
if (xrefEntries == null || xrefEntries.length < size) {
final PDFXrefEntry[] newXrefEntries = new PDFXrefEntry[size];
if (xrefEntries != null) {
System.arraycopy(xrefEntries, 0, newXrefEntries, 0, xrefEntries.length);
}
xrefEntries = newXrefEntries;
}
}
/**
* build the PDFFile reference table. Nothing in the PDFFile actually
* gets parsed, despite the name of this function. Things only get
* read and parsed when they're needed.
* @param password
*/
private void parseFile(PDFPassword password) throws IOException {
// start at the begining of the file
fileBuf.rewind();
String versionLine = readLine(fileBuf);
if (versionLine.startsWith(VERSION_COMMENT)) {
processVersion(versionLine.substring(VERSION_COMMENT.length()));
}
fileBuf.rewind();
fileBuf.position(fileBuf.limit() - 1);
if (!backscan(fileBuf, "startxref")) {
throw new PDFParseException("This may not be a PDF File");
}
int postStartXrefPos = fileBuf.position();
// ensure that we've got at least one piece of whitespace here, which
// should be a carriage return
if (!isWhiteSpace(fileBuf.get())) {
throw new PDFParseException("Found suspicious startxref without " +
"trialing whitespace");
}
final StringBuilder xrefBuf = new StringBuilder();
char c = (char) nextNonWhitespaceChar(fileBuf);
while (c >= '0' && c <= '9') {
xrefBuf.append(c);
c = (char) fileBuf.get();
}
int xrefpos = Integer.parseInt(xrefBuf.toString());
fileBuf.position(xrefpos);
try {
readTrailersAndXrefs(password);
} catch (UnsupportedEncryptionException e) {
throw new PDFParseException(e.getMessage(), e);
}
}
/**
* Scans backwards from the current buffer position, looking for
* the given scan token, which must exist in its entirety before
* the current buffer position. When successful, the buffer position is
* at the point immediately after the token. If not found, the buffer
* position will be at 0.
* @param buf the buffer to scan, positioned appropriately
* @param scanToken the token to scan for
* @return whether the token was found
*/
private boolean backscan(ByteBuffer buf, String scanToken) {
byte[] scanbuf = new byte[32];
if (scanToken.length() * 2 > scanbuf.length) {
// should be fine, though less than optimal, for current usages
throw new IllegalArgumentException("scanToken is too long - " +
"adjust buffer length");
}
int scanPos = buf.position() - scanbuf.length;
if (scanPos < 0) {
// use a shorter scanbuf to do a single, most likely failing, scan
scanbuf = new byte[buf.position()];
scanPos = 0;
}
while (scanPos >= 0) {
buf.position(scanPos);
buf.get(scanbuf);
// find startxref in scan
String scans = new String(scanbuf);
int loc = scans.lastIndexOf(scanToken);
if (loc >= 0) {
buf.position(scanPos + loc + scanToken.length());
return true;
}
int newScanPos = scanPos - scanbuf.length + scanToken.length() - 1;
if (newScanPos < 0) {
scanPos = scanPos == 0 ? -1 : newScanPos;
} else {
scanPos = newScanPos;
}
}
return false;
}
/**
* Gets the outline tree as a tree of OutlineNode, which is a subclass
* of DefaultMutableTreeNode. If there is no outline tree, this method
* returns null.
*/
public OutlineNode getOutline() throws IOException {
// find the outlines entry in the root object
PDFObject oroot = root.getDictRef("Outlines");
OutlineNode work = null;
OutlineNode outline = null;
if (oroot != null) {
// find the first child of the outline root
PDFObject scan = oroot.getDictRef("First");
outline = work = new OutlineNode("");
// scan each sibling in turn
while (scan != null) {
// add the new node with it's name
String title = scan.getDictRef("Title").getTextStringValue();
OutlineNode build = new OutlineNode(title);
work.add(build);
// find the action
PDFAction action = null;
PDFObject actionObj = scan.getDictRef("A");
if (actionObj != null) {
action = PDFAction.getAction(actionObj, getRoot());
} else {
// try to create an action from a destination
PDFObject destObj = scan.getDictRef("Dest");
if (destObj != null) {
try {
PDFDestination dest =
PDFDestination.getDestination(destObj, getRoot());
action = new GoToAction(dest);
} catch (IOException ioe) {
// oh well
}
}
}
// did we find an action? If so, add it
if (action != null) {
build.setAction(action);
}
// find the first child of this node
PDFObject kid = scan.getDictRef("First");
if (kid != null) {
work = build;
scan = kid;
} else {
// no child. Process the next sibling
PDFObject next = scan.getDictRef("Next");
while (next == null) {
scan = scan.getDictRef("Parent");
next = scan.getDictRef("Next");
work = (OutlineNode) work.getParent();
if (work == null) {
break;
}
}
scan = next;
}
}
}
return outline;
}
/**
* Gets the page number (starting from 1) of the page represented by
* a particular PDFObject. The PDFObject must be a Page dictionary or
* a destination description (or an action).
* @return a number between 1 and the number of pages indicating the
* page number, or 0 if the PDFObject is not in the page tree.
*/
public int getPageNumber(PDFObject page) throws IOException {
if (page.getType() == PDFObject.ARRAY) {
page = page.getAt(0);
}
// now we've got a page. Make sure.
PDFObject typeObj = page.getDictRef("Type");
if (typeObj == null || !typeObj.getStringValue().equals("Page")) {
return 0;
}
int count = 0;
while (true) {
PDFObject parent = page.getDictRef("Parent");
if (parent == null) {
break;
}
PDFObject kids[] = parent.getDictRef("Kids").getArray();
for (int i = 0; i < kids.length; i++) {
if (kids[i].equals(page)) {
break;
} else {
PDFObject kcount = kids[i].getDictRef("Count");
if (kcount != null) {
count += kcount.getIntValue();
} else {
count += 1;
}
}
}
page = parent;
}
return count;
}
/**
* Get the page commands for a given page in a separate thread.
*
* @param pagenum the number of the page to get commands for
*/
public PDFPage getPage(int pagenum) {
return getPage(pagenum, false);
}
/**
* Get the page commands for a given page.
*
* @param pagenum the number of the page to get commands for
* @param wait if true, do not exit until the page is complete.
*/
public PDFPage getPage(int pagenum, boolean wait) {
Integer key = new Integer(pagenum);
HashMap resources = null;
PDFObject pageObj = null;
boolean needread = false;
PDFPage page = cache.getPage(key);
PDFParser parser = cache.getPageParser(key);
if (page == null) {
try {
// hunt down the page!
resources = new HashMap();
PDFObject topPagesObj = root.getDictRef("Pages");
pageObj = findPage(topPagesObj, 0, pagenum, resources);
if (pageObj == null) {
return null;
}
page = createPage(pagenum, pageObj);
byte[] stream = getContents(pageObj);
parser = new PDFParser(page, stream, resources);
cache.addPage(key, page, parser);
} catch (IOException ioe) {
System.out.println("GetPage inner loop:");
ioe.printStackTrace();
return null;
}
}
if (parser != null && !parser.isFinished()) {
parser.go(wait);
}
return page;
}
/**
* Stop the rendering of a particular image on this page
*/
public void stop(int pageNum) {
PDFParser parser = cache.getPageParser(new Integer(pageNum));
if (parser != null) {
// stop it
parser.stop();
}
}
/**
* get the stream representing the content of a particular page.
*
* @param pageObj the page object to get the contents of
* @return a concatenation of any content streams for the requested
* page.
*/
private byte[] getContents(PDFObject pageObj) throws IOException {
// concatenate all the streams
PDFObject contentsObj = pageObj.getDictRef("Contents");
if (contentsObj == null) {
throw new IOException("No page contents!");
}
PDFObject contents[] = contentsObj.getArray();
// see if we have only one stream (the easy case)
if (contents.length == 1) {
return contents[0].getStream();
}
// first get the total length of all the streams
int len = 0;
for (int i = 0; i < contents.length; i++) {
byte[] data = contents[i].getStream();
if (data == null) {
throw new PDFParseException("No stream on content " + i +
": " + contents[i]);
}
len += data.length;
}
// now assemble them all into one object
byte[] stream = new byte[len];
len = 0;
for (int i = 0; i < contents.length; i++) {
byte data[] = contents[i].getStream();
System.arraycopy(data, 0, stream, len, data.length);
len += data.length;
}
return stream;
}
/**
* Create a PDF Page object by finding the relevant inherited
* properties
*
* @param pageObj the PDF object for the page to be created
*/
private PDFPage createPage(int pagenum, PDFObject pageObj)
throws IOException {
int rotation = 0;
Rectangle2D mediabox = null; // second choice, if no crop
Rectangle2D cropbox = null; // first choice
PDFObject mediaboxObj = getInheritedValue(pageObj, "MediaBox");
if (mediaboxObj != null) {
mediabox = parseNormalisedRectangle(mediaboxObj);
}
PDFObject cropboxObj = getInheritedValue(pageObj, "CropBox");
if (cropboxObj != null) {
cropbox = parseNormalisedRectangle(cropboxObj);
}
PDFObject rotateObj = getInheritedValue(pageObj, "Rotate");
if (rotateObj != null) {
rotation = rotateObj.getIntValue();
}
Rectangle2D bbox = ((cropbox == null) ? mediabox : cropbox);
return new PDFPage(pagenum, bbox, rotation, cache);
}
/**
* Get the PDFObject representing the content of a particular page. Note
* that the number of the page need not have anything to do with the
* label on that page. If there are two blank pages, and then roman
* numerals for the page number, then passing in 6 will get page (iv).
*
* @param pagedict the top of the pages tree
* @param start the page number of the first page in this dictionary
* @param getPage the number of the page to find; NOT the page's label.
* @param resources a HashMap that will be filled with any resource
* definitions encountered on the search for the page
*/
private PDFObject findPage(PDFObject pagedict, int start, int getPage,
Map resources) throws IOException {
PDFObject rsrcObj = pagedict.getDictRef("Resources");
if (rsrcObj != null) {
resources.putAll(rsrcObj.getDictionary());
}
PDFObject typeObj = pagedict.getDictRef("Type");
if (typeObj != null && typeObj.getStringValue().equals("Page")) {
// we found our page!
return pagedict;
}
// find the first child for which (start + count) > getPage
PDFObject kidsObj = pagedict.getDictRef("Kids");
if (kidsObj != null) {
PDFObject[] kids = kidsObj.getArray();
for (int i = 0; i < kids.length; i++) {
int count = 1;
// BUG: some PDFs (T1Format.pdf) don't have the Type tag.
// use the Count tag to indicate a Pages dictionary instead.
PDFObject countItem = kids[i].getDictRef("Count");
// if (kids[i].getDictRef("Type").getStringValue().equals("Pages")) {
if (countItem != null) {
count = countItem.getIntValue();
}
if (start + count >= getPage) {
return findPage(kids[i], start, getPage, resources);
}
start += count;
}
}
return null;
}
/**
* Find a property value in a page that may be inherited. If the value
* is not defined in the page itself, follow the page's "parent" links
* until the value is found or the top of the tree is reached.
*
* @param pageObj the object representing the page
* @param propName the name of the property we are looking for
*/
private PDFObject getInheritedValue(PDFObject pageObj, String propName)
throws IOException {
// see if we have the property
PDFObject propObj = pageObj.getDictRef(propName);
if (propObj != null) {
return propObj;
}
// recursively see if any of our parent have it
PDFObject parentObj = pageObj.getDictRef("Parent");
if (parentObj != null) {
return getInheritedValue(parentObj, propName);
}
// no luck
return null;
}
public static Rectangle2D parseNormalisedRectangle(PDFObject obj)
throws IOException {
if (obj != null) {
if (obj.getType() == PDFObject.ARRAY) {
PDFObject bounds[] = obj.getArray();
if (bounds.length == 4) {
final double x0 = bounds[0].getDoubleValue();
final double y0 = bounds[1].getDoubleValue();
final double x1 = bounds[2].getDoubleValue();
final double y1 = bounds[3].getDoubleValue();
final double minX;
final double maxY;
final double maxX;
final double minY;
if (x0 < x1) {
minX = x0;
maxX = x1;
} else {
minX = x1;
maxX = x0;
}
if (y0 < y1) {
minY = y0;
maxY = y1;
} else {
minY = y1;
maxY = y0;
}
return new Rectangle2D.Double(minX, minY, Math.abs(maxX - minX), Math.abs(maxY - minY));
} else {
throw new PDFParseException("Rectangle definition didn't have 4 elements");
}
} else {
throw new PDFParseException("Rectangle definition not an array");
}
} else {
throw new PDFParseException("Rectangle not present");
}
}
/**
* Get the default decrypter for the document
* @return the default decrypter; never null, even for documents that
* aren't encrypted
*/
public PDFDecrypter getDefaultDecrypter() {
return defaultDecrypter;
}
}