All downloads are free. Search and download functionality uses the official Maven repository.

de.intarsys.pdf.parser.COSDocumentParser Maven / Gradle / Ivy

Go to download

This is a fork of http://sourceforge.net/projects/jpodlib/, as development there seems to be frozen. We provide some bug fixes along with deployments to Maven.

There is a newer version: 2.0
Show newest version
/*
 * Copyright (c) 2007, intarsys consulting GmbH
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * - Redistributions of source code must retain the above copyright notice,
 *   this list of conditions and the following disclaimer.
 *
 * - Redistributions in binary form must reproduce the above copyright notice,
 *   this list of conditions and the following disclaimer in the documentation
 *   and/or other materials provided with the distribution.
 *
 * - Neither the name of intarsys nor the names of its contributors may be used
 *   to endorse or promote products derived from this software without specific
 *   prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
package de.intarsys.pdf.parser;

import de.intarsys.pdf.cos.COSDictionary;
import de.intarsys.pdf.cos.COSDocumentElement;
import de.intarsys.pdf.cos.COSIndirectObject;
import de.intarsys.pdf.cos.COSInteger;
import de.intarsys.pdf.cos.COSName;
import de.intarsys.pdf.cos.COSNull;
import de.intarsys.pdf.cos.COSObject;
import de.intarsys.pdf.cos.COSObjectKey;
import de.intarsys.pdf.crypt.ISystemSecurityHandler;
import de.intarsys.pdf.st.STDocument;
import de.intarsys.tools.randomaccess.IRandomAccess;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

/**
 * A parser for PDF data streams.
 * 

*

* The parser will create a object representation of the pdf document using COS * level objects. *

*

*

* The parser is a one pass, read everything implementation. *

*/ public class COSDocumentParser extends PDFParser { /** * use a buffer large than specified by the spec. we already had documents * with whitespace padding > 1024 bytes! */ public static final int SEARCH_BUFFER_SIZE = 2048; private STDocument doc; public COSDocumentParser(STDocument doc) { this.doc = doc; } /* * create a COS reference out of the objects in the parsers lookahead see * PDF Reference v1.4, chapter 3.2.9 Indirect objects COSReference ::= * objNum genNum "R" * * @return the reference object created * * @throws IOException @throws COSLoadException */ @Override protected COSIndirectObject createObjectReference(IRandomAccess input) throws IOException, COSLoadException { COSObject obj1 = lookaheadPop(); COSObject obj2 = lookaheadPop(); if (!(obj1 instanceof COSInteger) || !(obj2 instanceof COSInteger)) { COSLoadError e = new COSLoadError("illegal reference at character index " + input.getOffset()); handleError(e); } int objectNumber = ((COSInteger) obj1).intValue(); int generationNumber = ((COSInteger) obj2).intValue(); return getDoc().getObjectReference(objectNumber, generationNumber); } public STDocument getDoc() { return doc; } public boolean isTokenXRefAt(IRandomAccess input, int offset) throws IOException { input.seek(offset); readSpaces(input); byte[] token = new byte[4]; input.read(token); return Arrays.equals(token, PDFParser.TOKEN_xref); } /** * read a pdf style object from the input. see PDF Reference v1.4, chapter * 3.2.9 Indirect Objects COSIndirectObject ::= ObjNum GenNum "obj" Object * "endobj" * * @return The parsed object. 
* @throws IOException * @throws COSLoadException */ public COSObject parseIndirectObject(IRandomAccess input, ISystemSecurityHandler securityHandler) throws IOException, COSLoadException { COSObjectKey key = parseIndirectObjectKey(input); // this may be called recursive in a stream for its length COSObjectKey oldObjectKey = getObjectKey(); ISystemSecurityHandler oldSecurityHandler = getSecurityHandler(); try { setObjectKey(key); setSecurityHandler(securityHandler); COSObject object = parseIndirectObjectBody(input); if (object == null) { object = COSNull.NULL; } return object; } finally { setObjectKey(oldObjectKey); setSecurityHandler(oldSecurityHandler); } } protected COSObject parseIndirectObjectBody(IRandomAccess input) throws IOException, COSLoadException { byte[] token; COSDocumentElement element = parseObject(input); if (element == null) { COSLoadError e = new COSLoadError("object expected at character index " + input.getOffset()); handleError(e); } else { if (element.isReference()) { COSLoadError e = new COSLoadError("object reference not allowed in a indirect object at character index " + input .getOffset()); handleError(e); } if (check) { List messages = new ArrayList(); token = readToken(input, messages, false); // todo 2 kkr add check for additional whitespace after // endstream // before endobj if (!messages.isEmpty() && !(messages.size() == 1 && messages.contains(C_TOKEN_ADDWSB)) && !(messages.size() == 1 && messages.contains(C_TOKEN_ADDWSA2))) { COSLoadWarning pwarn = new COSLoadWarning(C_WARN_SINGLEEOL_OBJ); pwarn.setHint(Long.valueOf(input.getOffset())); handleWarning(pwarn); } } else { token = readToken(input); } if (token == null) { COSLoadError e = new COSLoadError("unexpected end of file"); handleError(e); } if (!Arrays.equals(token, TOKEN_endobj)) { input.seekBy(-token.length); COSLoadWarning w = new COSLoadWarning(C_WARN_ENDOBJ_MISSING); handleWarning(w); } } return (COSObject) element; } protected COSObjectKey 
parseIndirectObjectKey(IRandomAccess input) throws IOException, COSLoadException { byte[] token; // pdfa compliance here, must verify the existence of a single space // between // object number, generation and obj keyword List messages = new ArrayList(); // object number if (check) { token = readToken(input, messages, true); } else { token = readToken(input); } if (token == null) { COSLoadError e = new COSLoadError("unexpected end of file"); handleError(e); } int objNumber = 0; try { objNumber = Integer.parseInt(new String(token)); if (messages.size() > 1) { COSLoadWarning pwarn = new COSLoadWarning(C_WARN_SINGLESPACE_OBJ); pwarn.setHint(Long.valueOf(input.getOffset())); handleWarning(pwarn); } } catch (NumberFormatException ignored) { COSLoadError e = new COSLoadError("invalid object number at character index " + input.getOffset()); handleError(e); } // generation number messages.clear(); if (check) { token = readToken(input, messages, true); } else { token = readToken(input); } if (token == null) { COSLoadError e = new COSLoadError("unexpected end of file"); handleError(e); } int genNumber = 0; try { genNumber = Integer.parseInt(new String(token)); if (messages.size() > 1) { COSLoadWarning pwarn = new COSLoadWarning(C_WARN_SINGLESPACE_OBJ); pwarn.setHint(Long.valueOf(input.getOffset())); handleWarning(pwarn); } } catch (NumberFormatException ignored) { COSLoadError e = new COSLoadError("invalid generation number at character index " + input.getOffset()); handleError(e); } // obj keyword messages.clear(); if (check) { token = readToken(input, messages, true); } else { token = readToken(input); } if (token == null) { COSLoadError e = new COSLoadError("unexpected end of file"); handleError(e); } if (!Arrays.equals(token, TOKEN_obj)) { input.seekBy(-token.length); COSLoadError e = new COSLoadError("file format error, obj expected at character index " + input.getOffset()); handleError(e); } if (check) { if (!messages.isEmpty()) { COSLoadWarning pwarn = new 
COSLoadWarning(C_WARN_SINGLESPACE_OBJ); pwarn.setHint(Long.valueOf(input.getOffset())); handleWarning(pwarn); } if (readEOL(input) > 1) { COSLoadWarning pwarn = new COSLoadWarning(C_WARN_SINGLEEOL_OBJ); pwarn.setHint(Long.valueOf(input.getOffset())); handleWarning(pwarn); } } else { readSpaces(input); } return new COSObjectKey(objNumber, genNumber); } /** * the startxref value. * * @return the startxref value * @throws IOException * @throws COSLoadException */ public int parseStartXRef(IRandomAccess input) throws IOException, COSLoadException { readSpaces(input); byte[] token = new byte[9]; input.read(token); if (!Arrays.equals(token, PDFParser.TOKEN_startxref)) { COSLoadError e = new COSLoadError("file format error. 'startxref' expected at offset:" + (input.getOffset() - 9)); handleError(e); } return readInteger(input, true); } /** * parse the trailer section from the current stream position. see PDF * Reference v1.4, chapter 3.4.4 File Trailer DocumentTrailer ::= "trailer" * COSDict "startxref" COSNumber * * @return the trailer dictionary * @throws IOException * @throws COSLoadException */ public COSDictionary parseTrailer(IRandomAccess input) throws IOException, COSLoadException { byte[] token = new byte[7]; int bytesRead = input.read(token); if (!Arrays.equals(token, TOKEN_trailer)) { if (bytesRead > 0) { input.seekBy(-bytesRead); } COSLoadError e = new COSLoadError("file format error. 'trailer' expected at character index " + input.getOffset()); handleError(e); } readSpaces(input); COSDictionary trailerDict = (COSDictionary) parseObjectDictionary(input); readSpaces(input); return trailerDict; } /** * Searches the offset to the first trailer in the last SEARCH_BUFFER_SIZE * bytes of the document. The search goes backwards starting with the last * byte. 
* * @return the offset to the first trailer found * @throws IOException * @throws COSLoadException */ public int searchLastStartXRef(IRandomAccess input) throws IOException, COSLoadException { long startOffset = input.getLength() - SEARCH_BUFFER_SIZE; if (startOffset < 0) { startOffset = 0; } input.seek(startOffset); byte[] buffer = new byte[SEARCH_BUFFER_SIZE]; int bytesRead = input.read(buffer); boolean found = false; int bufferOffset; for (bufferOffset = bytesRead - TOKEN_startxref.length; bufferOffset > 0; bufferOffset--) { for (int j = 0; j < TOKEN_startxref.length; j++) { if (buffer[bufferOffset + j] == TOKEN_startxref[j]) { found = true; } else { found = false; break; } } if (found) { break; } } if (found) { long startXRefOffset = startOffset + bufferOffset; input.seek(startXRefOffset); return parseStartXRef(input); } COSLoadError e = new COSLoadError("no startxref found in the last " + SEARCH_BUFFER_SIZE + " bytes of the document"); handleError(e); return -1; } /** * @param input * @return Returns the offset of the dictionary with linearization * parameters if any. * @throws IOException * @throws COSLoadException * @deprecated Don't use this anymore *

* Returns the offset of the dictionary with linearization * parameters if any. Returns -1 otherwise. */ @Deprecated public int searchLinearized(IRandomAccess input) throws IOException, COSLoadException { long oldOffset = input.getOffset(); int result = -1; input.seek(0); parseComment(input); // file header int next = input.read(); while (true) { if (next == -1) { break; } if ((next == ' ') || isWhitespace(next)) { // performance shortcut next = input.read(); continue; } input.seekBy(-1); if (isDigit(next)) { result = (int) input.getOffset(); COSDocumentElement cosobj = parseIndirectObject(input, null); COSName linearized = COSName.constant("Linearized"); if (cosobj instanceof COSDictionary && ((COSDictionary) cosobj).containsKey(linearized)) { return result; } result = -1; break; } if (next == '%') { parseComment(input); // this is the binary comment } else { break; } next = input.read(); } // reset randomaccess input.seek(oldOffset); return result; } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy