org.apache.pdfbox.pdfparser.PDFParser Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of pdfbox Show documentation
Show all versions of pdfbox Show documentation
The Apache PDFBox library is an open source Java tool for working with PDF documents.
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.pdfbox.pdfparser;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Pattern;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.cos.COSDictionary;
import org.apache.pdfbox.cos.COSDocument;
import org.apache.pdfbox.cos.COSInteger;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.cos.COSNumber;
import org.apache.pdfbox.cos.COSObject;
import org.apache.pdfbox.cos.COSStream;
import org.apache.pdfbox.exceptions.WrappedIOException;
import org.apache.pdfbox.io.RandomAccess;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.fdf.FDFDocument;
import org.apache.pdfbox.persistence.util.COSObjectKey;
/**
* This class will handle the parsing of the PDF document.
*
* @author Ben Litchfield
* @version $Revision: 1.53 $
*/
public class PDFParser extends BaseParser
{
/**
* Log instance.
*/
private static final Log LOG = LogFactory.getLog(PDFParser.class);
private static final int SPACE_BYTE = 32;
private static final String PDF_HEADER = "%PDF-";
private static final String FDF_HEADER = "%FDF-";
protected boolean isFDFDocment = false;
private static final String PDF_DEFAULT_VERSION = "1.4";
private static final String FDF_DEFAULT_VERSION = "1.0";
/**
* A list of duplicate objects found when Parsing the PDF
* File.
*/
private List conflictList = new ArrayList();
/**
* A list of COSStream objects to check for length correctness
*/
private final HashSet streamLengthCheckSet = new HashSet();
/** Collects all Xref/trailer objects and resolves them into single
* object using startxref reference.
*/
protected XrefTrailerResolver xrefTrailerResolver = new XrefTrailerResolver();
/**
* Temp file directory.
*/
private File tempDirectory = null;
private RandomAccess raf = null;
/**
* Constructor.
*
* @param input The input stream that contains the PDF document.
*
* @throws IOException If there is an error initializing the stream.
*/
public PDFParser( InputStream input ) throws IOException
{
this(input, null, FORCE_PARSING);
}
/**
* Constructor to allow control over RandomAccessFile.
* @param input The input stream that contains the PDF document.
* @param rafi The RandomAccessFile to be used in internal COSDocument
*
* @throws IOException If there is an error initializing the stream.
*/
public PDFParser(InputStream input, RandomAccess rafi) throws IOException
{
this(input, rafi, FORCE_PARSING);
}
/**
* Constructor to allow control over RandomAccessFile.
* Also enables parser to skip corrupt objects to try and force parsing
* @param input The input stream that contains the PDF document.
* @param rafi The RandomAccessFile to be used in internal COSDocument
* @param force When true, the parser will skip corrupt pdf objects and
* will continue parsing at the next object in the file
*
* @throws IOException If there is an error initializing the stream.
*/
public PDFParser(InputStream input, RandomAccess rafi, boolean force) throws IOException
{
super(input, force);
this.raf = rafi;
}
/**
* This is the directory where pdfbox will create a temporary file
* for storing pdf document stream in. By default this directory will
* be the value of the system property java.io.tmpdir.
*
* @param tmpDir The directory to create scratch files needed to store
* pdf document streams.
*/
public void setTempDirectory( File tmpDir )
{
tempDirectory = tmpDir;
}
/**
* Returns true if parsing should be continued. By default, forceParsing is returned.
* This can be overridden to add application specific handling (for example to stop
* parsing when the number of exceptions thrown exceed a certain number).
*
* @param e The exception if vailable. Can be null if there is no exception available
* @return true if parsing could be continued, otherwise false
*/
protected boolean isContinueOnError(Exception e)
{
return forceParsing;
}
/**
* This will parse the stream and populate the COSDocument object. This will close
* the stream when it is done parsing.
*
* @throws IOException If there is an error reading from the stream or corrupt data
* is found.
*/
public void parse() throws IOException
{
try
{
if ( raf == null )
{
if( tempDirectory != null )
{
document = new COSDocument( tempDirectory );
}
else
{
document = new COSDocument();
}
}
else
{
document = new COSDocument( raf );
}
setDocument( document );
parseHeader();
//Some PDF files have garbage between the header and the
//first object
skipToNextObj();
boolean wasLastParsedObjectEOF = false;
while(true)
{
if(pdfSource.isEOF())
{
break;
}
try
{
// don't reset flag to false if it is already true
wasLastParsedObjectEOF |= parseObject();
}
catch(IOException e)
{
/*
* PDF files may have random data after the EOF marker. Ignore errors if
* last object processed is EOF.
*/
if( wasLastParsedObjectEOF )
{
break;
}
if(isContinueOnError(e))
{
/*
* Warning is sent to the PDFBox.log and to the Console that
* we skipped over an object
*/
LOG.warn("Parsing Error, Skipping Object", e);
skipSpaces();
long lastOffset = pdfSource.getOffset();
skipToNextObj();
/* the nextObject is the one we want to skip
* so read the 'Object Number' without interpret it
* in order to force the skipObject
*/
if (lastOffset == pdfSource.getOffset()) {
readStringNumber();
skipToNextObj();
}
}
else
{
throw e;
}
}
skipSpaces();
}
// set xref to start with
xrefTrailerResolver.setStartxref( document.getStartXref() );
// get resolved xref table + trailer
document.setTrailer( xrefTrailerResolver.getTrailer() );
document.addXRefTable( xrefTrailerResolver.getXrefTable() );
fixStreamsLength();
if( !document.isEncrypted() )
{
document.dereferenceObjectStreams();
}
else
{
LOG.info("Document is encrypted");
}
ConflictObj.resolveConflicts(document, conflictList);
}
catch( Throwable t )
{
//so if the PDF is corrupt then close the document and clear
//all resources to it
if( document != null )
{
document.close();
document = null;
}
if( t instanceof IOException )
{
throw (IOException)t;
}
else
{
throw new WrappedIOException( t );
}
}
finally
{
pdfSource.close();
}
}
/**
* Check whether streams with previously unknown length have the correct
* length and fix that length if needed.
*
* @throws IOException
*/
private void fixStreamsLength() throws IOException
{
for (COSObject obj : document.getObjects())
{
if (obj.getObject() instanceof COSStream
&& streamLengthCheckSet.contains((COSStream) obj.getObject()))
{
COSStream stream = (COSStream) obj.getObject();
long filteredLength = stream.getFilteredLength();
long filteredLengthWritten = stream.getFilteredLengthWritten();
if (Math.abs(filteredLength - filteredLengthWritten) > 2)
{
// adjust the length, but only if the difference is > 2,
// i.e. don't bother with CR LF differences
LOG.warn("/Length of " + obj + " corrected from " + filteredLength + " to " + filteredLengthWritten);
stream.setLong(COSName.LENGTH, filteredLengthWritten);
stream.setFilteredLength(filteredLengthWritten);
}
}
}
}
/**
* Skip to the start of the next object. This is used to recover
* from a corrupt object. This should handle all cases that parseObject
* supports. This assumes that the next object will
* start on its own line.
*
* @throws IOException
*/
private void skipToNextObj() throws IOException
{
byte[] b = new byte[16];
Pattern p = Pattern.compile("\\d+\\s+\\d+\\s+obj.*", Pattern.DOTALL);
/* Read a buffer of data each time to see if it starts with a
* known keyword. This is not the most efficient design, but we should
* rarely be needing this function. We could update this to use the
* circular buffer, like in readUntilEndStream().
*/
while(!pdfSource.isEOF())
{
int l = pdfSource.read(b);
if(l < 1)
{
break;
}
String s = new String(b, "US-ASCII");
if(s.startsWith("trailer") ||
s.startsWith("xref") ||
s.startsWith("startxref") ||
s.startsWith("stream") ||
p.matcher(s).matches())
{
pdfSource.unread(b);
break;
}
else
{
pdfSource.unread(b, 1, l-1);
}
}
}
protected void parseHeader() throws IOException
{
// read first line
String header = readLine();
// some pdf-documents are broken and the pdf-version is in one of the following lines
if (!header.contains(PDF_HEADER) && !header.contains(FDF_HEADER))
{
header = readLine();
while (!header.contains(PDF_HEADER) && !header.contains(FDF_HEADER))
{
// if a line starts with a digit, it has to be the first one with data in it
if ((header.length() > 0) && (Character.isDigit(header.charAt(0))))
{
break;
}
header = readLine();
}
}
// nothing found
if ((header.indexOf( PDF_HEADER ) == -1) && (header.indexOf( FDF_HEADER ) == -1))
{
throw new IOException( "Error: Header doesn't contain versioninfo" );
}
//sometimes there are some garbage bytes in the header before the header
//actually starts, so lets try to find the header first.
int headerStart = header.indexOf( PDF_HEADER );
if (headerStart == -1)
{
headerStart = header.indexOf(FDF_HEADER);
}
//greater than zero because if it is zero then
//there is no point of trimming
if ( headerStart > 0 )
{
//trim off any leading characters
header = header.substring( headerStart, header.length() );
}
/*
* This is used if there is garbage after the header on the same line
*/
if (header.startsWith(PDF_HEADER))
{
if (!header.matches(PDF_HEADER + "\\d.\\d"))
{
if (header.length() < PDF_HEADER.length() + 3)
{
// No version number at all, set to 1.4 as default
header = PDF_HEADER + PDF_DEFAULT_VERSION;
LOG.debug("No pdf version found, set to " + PDF_DEFAULT_VERSION + " as default.");
}
else
{
String headerGarbage = header.substring(PDF_HEADER.length() + 3, header.length()) + "\n";
header = header.substring(0, PDF_HEADER.length() + 3);
pdfSource.unread(headerGarbage.getBytes("ISO-8859-1"));
}
}
}
else
{
isFDFDocment = true;
if (!header.matches(FDF_HEADER + "\\d.\\d"))
{
if (header.length() < FDF_HEADER.length() + 3)
{
// No version number at all, set to 1.0 as default
header = FDF_HEADER + FDF_DEFAULT_VERSION;
LOG.debug("No fdf version found, set to " + FDF_DEFAULT_VERSION + " as default.");
}
else
{
String headerGarbage = header.substring(FDF_HEADER.length() + 3, header.length()) + "\n";
header = header.substring(0, FDF_HEADER.length() + 3);
pdfSource.unread(headerGarbage.getBytes("ISO-8859-1"));
}
}
}
document.setHeaderString(header);
try
{
if (header.startsWith( PDF_HEADER ))
{
float pdfVersion = Float. parseFloat(
header.substring( PDF_HEADER.length(), Math.min( header.length(), PDF_HEADER .length()+3) ) );
document.setVersion( pdfVersion );
}
else
{
float pdfVersion = Float. parseFloat(
header.substring( FDF_HEADER.length(), Math.min( header.length(), FDF_HEADER.length()+3) ) );
document.setVersion( pdfVersion );
}
}
catch ( NumberFormatException e )
{
throw new IOException( "Error getting pdf version:" + e );
}
}
/**
* This will get the document that was parsed. parse() must be called before this is called.
* When you are done with this document you must call close() on it to release
* resources.
*
* @return The document that was parsed.
*
* @throws IOException If there is an error getting the document.
*/
public COSDocument getDocument() throws IOException
{
if( document == null )
{
throw new IOException( "You must call parse() before calling getDocument()" );
}
return document;
}
/**
* This will get the PD document that was parsed. When you are done with
* this document you must call close() on it to release resources.
*
* @return The document at the PD layer.
*
* @throws IOException If there is an error getting the document.
*/
public PDDocument getPDDocument() throws IOException
{
return new PDDocument( getDocument(), this );
}
/**
* This will get the FDF document that was parsed. When you are done with
* this document you must call close() on it to release resources.
*
* @return The document at the PD layer.
*
* @throws IOException If there is an error getting the document.
*/
public FDFDocument getFDFDocument() throws IOException
{
return new FDFDocument( getDocument() );
}
/**
* This will parse the next object from the stream and add it to
* the local state.
*
* @return Returns true if the processed object had an endOfFile marker
*
* @throws IOException If an IO error occurs.
*/
private boolean parseObject() throws IOException
{
long currentObjByteOffset = pdfSource.getOffset();
boolean isEndOfFile = false;
skipSpaces();
//peek at the next character to determine the type of object we are parsing
char peekedChar = (char)pdfSource.peek();
//ignore endobj and endstream sections.
while( peekedChar == 'e' )
{
//there are times when there are multiple endobj, so lets
//just read them and move on.
readString();
skipSpaces();
currentObjByteOffset = pdfSource.getOffset();
peekedChar = (char)pdfSource.peek();
}
if( pdfSource.isEOF())
{
//"Skipping because of EOF" );
//end of file we will return a false and call it a day.
}
//xref table. Note: The contents of the Xref table are currently ignored
else if( peekedChar == 'x')
{
parseXrefTable( currentObjByteOffset );
}
// Note: startxref can occur in either a trailer section or by itself
else if (peekedChar == 't' || peekedChar == 's')
{
if(peekedChar == 't')
{
parseTrailer();
peekedChar = (char)pdfSource.peek();
}
if (peekedChar == 's')
{
parseStartXref();
// readString() calls skipSpaces() will skip comments... that's
// bad for us b/c the %%EOF flag is a comment
while(isWhitespace(pdfSource.peek()) && !pdfSource.isEOF())
{
pdfSource.read(); // read (get rid of) all the whitespace
}
String eof = "";
if(!pdfSource.isEOF())
{
eof = readLine(); // if there's more data to read, get the EOF flag
}
// verify that EOF exists (see PDFBOX-979 for documentation on special cases)
if(!"%%EOF".equals(eof))
{
if(eof.startsWith("%%EOF"))
{
// content after marker -> unread with first space byte for read newline
pdfSource.unread(SPACE_BYTE); // we read a whole line; add space as newline replacement
pdfSource.unread(eof.substring(5).getBytes("ISO-8859-1"));
}
else
{
// PDF does not conform to spec, we should warn someone
LOG.warn("expected='%%EOF' actual='" + eof + "'");
// if we're not at the end of a file, just put it back and move on
if(!pdfSource.isEOF())
{
pdfSource.unread( SPACE_BYTE ); // we read a whole line; add space as newline replacement
pdfSource.unread(eof.getBytes("ISO-8859-1"));
}
}
}
isEndOfFile = true;
}
}
//we are going to parse an normal object
else
{
long number = -1;
int genNum;
String objectKey;
boolean missingObjectNumber = false;
try
{
char peeked = (char)pdfSource.peek();
if( peeked == '<' )
{
missingObjectNumber = true;
}
else
{
number = readObjectNumber();
}
}
catch( IOException e )
{
//ok for some reason "GNU Ghostscript 5.10" puts two endobj
//statements after an object, of course this is nonsense
//but because we want to support as many PDFs as possible
//we will simply try again
number = readObjectNumber();
}
if( !missingObjectNumber )
{
skipSpaces();
genNum = readGenerationNumber();
objectKey = readString( 3 );
//System.out.println( "parseObject() num=" + number +
//" genNumber=" + genNum + " key='" + objectKey + "'" );
if( !objectKey.equals( "obj" ) )
{
if (!isContinueOnError(null) || !objectKey.equals("o"))
{
throw new IOException("expected='obj' actual='" + objectKey + "' " + pdfSource);
}
//assume that "o" was meant to be "obj" (this is a workaround for
// PDFBOX-773 attached PDF Andersens_Fairy_Tales.pdf).
}
}
else
{
number = -1;
genNum = -1;
}
skipSpaces();
COSBase pb = parseDirObject();
String endObjectKey = readString();
if( endObjectKey.equals( "stream" ) )
{
pdfSource.unread( endObjectKey.getBytes("ISO-8859-1") );
pdfSource.unread( ' ' );
if( pb instanceof COSDictionary )
{
pb = parseCOSStream( (COSDictionary)pb, getDocument().getScratchFile() );
// test for XRef type
final COSStream strmObj = (COSStream) pb;
// remember streams without length to check them later
COSBase streamLength = strmObj.getItem(COSName.LENGTH);
int length = -1;
if (streamLength instanceof COSNumber)
{
length = ((COSNumber) streamLength).intValue();
}
if (length == -1)
{
streamLengthCheckSet.add(strmObj);
}
if (COSName.XREF.equals(strmObj.getItem(COSName.TYPE)))
{
// XRef stream
parseXrefStream( strmObj, currentObjByteOffset );
}
}
else
{
// this is not legal
// the combination of a dict and the stream/endstream forms a complete stream object
throw new IOException("stream not preceded by dictionary");
}
skipSpaces();
endObjectKey = readLine();
}
COSObjectKey key = new COSObjectKey( number, genNum );
COSObject pdfObject = document.getObjectFromPool( key );
if(pdfObject.getObject() == null)
{
pdfObject.setObject(pb);
}
/*
* If the object we returned already has a baseobject, then we have a conflict
* which we will resolve using information after we parse the xref table.
*/
else
{
addObjectToConflicts(currentObjByteOffset, key, pb);
}
if( !endObjectKey.equals( "endobj" ) )
{
if (endObjectKey.startsWith( "endobj" ) )
{
/*
* Some PDF files don't contain a new line after endobj so we
* need to make sure that the next object number is getting read separately
* and not part of the endobj keyword. Ex. Some files would have "endobj28"
* instead of "endobj"
*/
pdfSource.unread( SPACE_BYTE ); // add a space first in place of the newline consumed by readline()
pdfSource.unread( endObjectKey.substring( 6 ).getBytes("ISO-8859-1") );
}
else if(endObjectKey.trim().endsWith("endobj"))
{
/*
* Some PDF files contain junk (like ">> ", in the case of a PDF
* I found which was created by Exstream Dialogue Version 5.0.039)
* in which case we ignore the data before endobj and just move on
*/
LOG.warn("expected='endobj' actual='" + endObjectKey + "' ");
}
else if( !pdfSource.isEOF() )
{
//It is possible that the endobj is missing, there
//are several PDFs out there that do that so. Unread
//and assume that endobj was missing
pdfSource.unread( SPACE_BYTE ); // add a space first in place of the newline consumed by readline()
pdfSource.unread( endObjectKey.getBytes("ISO-8859-1") );
}
}
skipSpaces();
}
return isEndOfFile;
}
/**
* Adds a new ConflictObj to the conflictList.
* @param offset the offset of the ConflictObj
* @param key The COSObjectKey of this object
* @param pb The COSBase of this conflictObj
* @throws IOException
*/
private void addObjectToConflicts(long offset, COSObjectKey key, COSBase pb) throws IOException
{
COSObject obj = new COSObject(null);
obj.setObjectNumber( COSInteger.get( key.getNumber() ) );
obj.setGenerationNumber( COSInteger.get( key.getGeneration() ) );
obj.setObject(pb);
ConflictObj conflictObj = new ConflictObj(offset, key, obj);
conflictList.add(conflictObj);
}
/**
* This will parse the startxref section from the stream.
* The startxref value is ignored.
*
* @return false on parsing error
* @throws IOException If an IO error occurs.
*/
protected boolean parseStartXref() throws IOException
{
if(pdfSource.peek() != 's')
{
return false;
}
String startXRef = readString();
if( !startXRef.trim().equals( "startxref" ) )
{
return false;
}
skipSpaces();
/* This integer is the byte offset of the first object referenced by the xref or xref stream
* Needed for the incremental update (PREV)
*/
getDocument().setStartXref(readLong());
return true;
}
/**
* This will parse the xref table from the stream and add it to the state
* The XrefTable contents are ignored.
* @param startByteOffset the offset to start at
* @return false on parsing error
* @throws IOException If an IO error occurs.
*/
protected boolean parseXrefTable( long startByteOffset ) throws IOException
{
if(pdfSource.peek() != 'x')
{
return false;
}
String xref = readString();
if( !xref.trim().equals( "xref" ) )
{
return false;
}
// check for trailer after xref
String str = readString();
byte[] b = str.getBytes("ISO-8859-1");
pdfSource.unread(b, 0, b.length);
// signal start of new XRef
xrefTrailerResolver.nextXrefObj( startByteOffset );
if (str.startsWith("trailer"))
{
LOG.warn("skipping empty xref table");
return false;
}
/*
* Xref tables can have multiple sections.
* Each starts with a starting object id and a count.
*/
while(true)
{
long currObjID = readObjectNumber(); // first obj id
long count = readLong(); // the number of objects in the xref table
skipSpaces();
for(int i = 0; i < count; i++)
{
if(pdfSource.isEOF() || isEndOfName((char)pdfSource.peek()))
{
break;
}
if(pdfSource.peek() == 't')
{
break;
}
//Ignore table contents
String currentLine = readLine();
String[] splitString = currentLine.split("\\s");
if (splitString.length < 3)
{
LOG.warn("invalid xref line: " + currentLine);
break;
}
/* This supports the corrupt table as reported in
* PDFBOX-474 (XXXX XXX XX n) */
if(splitString[splitString.length-1].equals("n"))
{
try
{
long currOffset = Long.parseLong(splitString[0]);
int currGenID = Integer.parseInt(splitString[1]);
COSObjectKey objKey = new COSObjectKey(currObjID, currGenID);
xrefTrailerResolver.setXRef(objKey, currOffset);
}
catch(NumberFormatException e)
{
throw new IOException(e.getMessage());
}
}
else if(!splitString[2].equals("f"))
{
throw new IOException("Corrupt XRefTable Entry - ObjID:" + currObjID);
}
currObjID++;
skipSpaces();
}
skipSpaces();
char c = (char)pdfSource.peek();
if(c < '0' || c > '9')
{
break;
}
}
return true;
}
/**
* This will parse the trailer from the stream and add it to the state.
*
* @return false on parsing error
* @throws IOException If an IO error occurs.
*/
protected boolean parseTrailer() throws IOException
{
if(pdfSource.peek() != 't')
{
return false;
}
//read "trailer"
String nextLine = readLine();
if( !nextLine.trim().equals( "trailer" ) )
{
// in some cases the EOL is missing and the trailer immediately
// continues with "<<" or with a blank character
// even if this does not comply with PDF reference we want to support as many PDFs as possible
// Acrobat reader can also deal with this.
if (nextLine.startsWith("trailer"))
{
byte[] b = nextLine.getBytes("ISO-8859-1");
int len = "trailer".length();
pdfSource.unread('\n');
pdfSource.unread(b, len, b.length-len);
}
else
{
return false;
}
}
// in some cases the EOL is missing and the trailer continues with " <<"
// even if this does not comply with PDF reference we want to support as many PDFs as possible
// Acrobat reader can also deal with this.
skipSpaces();
COSDictionary parsedTrailer = parseCOSDictionary();
xrefTrailerResolver.setTrailer( parsedTrailer );
// The version can also be specified within the document /Catalog
readVersionInTrailer(parsedTrailer);
skipSpaces();
return true;
}
/**
* The document catalog can also have a /Version parameter which overrides the version specified
* in the header if, and only if it is greater.
*
* @param parsedTrailer the parsed catalog in the trailer
*/
protected void readVersionInTrailer(COSDictionary parsedTrailer)
{
COSObject root = (COSObject) parsedTrailer.getItem(COSName.ROOT);
if (root != null)
{
COSBase item = root.getItem(COSName.VERSION);
if (item instanceof COSName)
{
COSName version = (COSName) item;
float trailerVersion = Float.valueOf(version.getName());
if (trailerVersion > document.getVersion())
{
document.setVersion(trailerVersion);
}
}
else if (item != null)
{
LOG.warn("Incorrect /Version entry is ignored: " + item);
}
}
}
/**
* Fills XRefTrailerResolver with data of given stream.
* Stream must be of type XRef.
* @param stream the stream to be read
* @param objByteOffset the offset to start at
* @throws IOException if there is an error parsing the stream
*/
public void parseXrefStream( COSStream stream, long objByteOffset ) throws IOException
{
parseXrefStream(stream, objByteOffset, true);
}
/**
* Fills XRefTrailerResolver with data of given stream.
* Stream must be of type XRef.
* @param stream the stream to be read
* @param objByteOffset the offset to start at
* @param isStandalone should be set to true if the stream is not part of a hybrid xref table
* @throws IOException if there is an error parsing the stream
*/
public void parseXrefStream( COSStream stream, long objByteOffset, boolean isStandalone ) throws IOException
{
// the cross reference stream of a hybrid xref table will be added to the existing one
// and we must not override the offset and the trailer
if (isStandalone)
{
xrefTrailerResolver.nextXrefObj( objByteOffset );
xrefTrailerResolver.setTrailer( stream );
}
PDFXrefStreamParser parser =
new PDFXrefStreamParser( stream, document, forceParsing, xrefTrailerResolver );
parser.parse();
}
/**
* Used to resolve conflicts when a PDF Document has multiple objects with
* the same id number. Ideally, we could use the Xref table when parsing
* the document to be able to determine which of the objects with the same ID
* is correct, but we do not have access to the Xref Table during parsing.
* Instead, we queue up the conflicts and resolve them after the Xref has
* been parsed. The Objects listed in the Xref Table are kept and the
* others are ignored.
*/
private static class ConflictObj
{
private final long offset;
private final COSObjectKey objectKey;
private final COSObject object;
ConflictObj(long offsetValue, COSObjectKey key, COSObject pdfObject)
{
this.offset = offsetValue;
this.objectKey = key;
this.object = pdfObject;
}
@Override
public String toString()
{
return "Object(" + offset + ", " + objectKey + ")";
}
/**
* Sometimes pdf files have objects with the same ID number yet are
* not referenced by the Xref table and therefore should be excluded.
* This method goes through the conflicts list and replaces the object stored
* in the objects array with this one if it is referenced by the xref
* table.
* @throws IOException
*/
private static void resolveConflicts(COSDocument document, List conflictList) throws IOException
{
Iterator conflicts = conflictList.iterator();
if (conflicts.hasNext())
{
Collection values = document.getXrefTable().values();
do
{
ConflictObj o = conflicts.next();
if (tolerantConflicResolver(values, o.offset, 4))
{
COSObject pdfObject = document.getObjectFromPool(o.objectKey);
if (pdfObject.getObjectNumber() != null
&& pdfObject.getObjectNumber().equals(o.object.getObjectNumber()))
{
pdfObject.setObject(o.object.getObject());
}
else
{
LOG.debug("Conflict object [" + o.objectKey + "] at offset " + o.offset
+" found in the xref table, but the object numbers differ. Ignoring this object."
+ " The document is maybe malformed.");
}
}
}
while (conflicts.hasNext());
}
}
}
/**
* Check if the given object offset can be find in the xref table. If not, we try to search the table
* again with the given tolerance and check the given bytes before and after the xref table offset.
*
* @param values are the unsorted values from the xref table
* @param offset is the offset that should be found in the xref table
* @param tolerance is the allowed tolerance in bytes.
* @return true if the offset was found inside the xref table
*/
private static boolean tolerantConflicResolver(Collection values, long offset, int tolerance)
{
if (values.contains(offset))
{
return true;
}
else
{
for ( Long integer : values )
{
if (Math.abs(integer - offset) <= tolerance)
{
return true;
}
}
}
return false;
}
/**
* {@inheritDoc}
*/
@Override
public void clearResources()
{
super.clearResources();
if (conflictList != null)
{
conflictList.clear();
conflictList = null;
}
if (xrefTrailerResolver != null)
{
xrefTrailerResolver.clearResources();
xrefTrailerResolver = null;
}
}
}