// org.jpedal.io.types.RefTable (listing header: Maven / Gradle / Ivy)

/*
 * ===========================================
 * Java Pdf Extraction Decoding Access Library
 * ===========================================
 *
 * Project Info:  http://www.idrsolutions.com
 * Help section for developers at http://www.idrsolutions.com/support/
 *
 * (C) Copyright 1997-2017 IDRsolutions and Contributors.
 *
 * This file is part of JPedal/JPDF2HTML5
 *
     This library is free software; you can redistribute it and/or
    modify it under the terms of the GNU Lesser General Public
    License as published by the Free Software Foundation; either
    version 2.1 of the License, or (at your option) any later version.

    This library is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
    Lesser General Public License for more details.

    You should have received a copy of the GNU Lesser General Public
    License along with this library; if not, write to the Free Software
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA


 *
 * ---------------
 * RefTable.java
 * ---------------
 */
package org.jpedal.io.types;

import java.io.IOException;

import org.jpedal.exception.PdfException;
import org.jpedal.io.ObjectDecoder;
import org.jpedal.io.PdfFileReader;
import org.jpedal.io.RandomAccessBuffer;
import org.jpedal.objects.raw.CompressedObject;
import org.jpedal.objects.raw.PageObject;
import org.jpedal.objects.raw.PdfDictionary;
import org.jpedal.objects.raw.PdfObject;
import org.jpedal.utils.LogWriter;
import org.jpedal.utils.NumberUtils;

/**
 * Reads the cross-reference information of a PDF file held in a
 * {@link RandomAccessBuffer}.
 * <p>
 * The class handles both the legacy {@code xref}/{@code trailer} layout and the
 * compressed cross-reference streams, records the object positions it finds in
 * the supplied {@code Offsets} instance, and remembers the Encrypt dictionary,
 * Info dictionary and file ID taken from the first trailer it processes. When
 * the tables cannot be read it falls back to {@code BrokenRefTable} to scan the
 * file for objects.
 */
public class RefTable {


    /**
     * Encrypt dictionary read from the trailer (or from a recovered root
     * dictionary); stays null when none has been read
     */
    PdfObject encryptObj;

    /**
     * holds file ID - the first entry of the trailer ID array, only stored
     * when an Encrypt dictionary is present
     */
    private byte[] ID;

    /**
     * keyword searched for by getObjectName to find the end of an object name
     */
    static final String pattern = "obj";

    /**
     * Info dictionary read from the trailer
     */
    private PdfObject infoObject;

    // result states used by isCompressedStream while classifying a table
    static final int UNSET = -1;
    static final int COMPRESSED = 1;
    static final int LEGACY = 2;

    /**
     * raw PDF data; set to null by closeFile()
     */
    private RandomAccessBuffer pdf_datafile;

    /**
     * keyword marking a legacy reference table
     */
    static final byte[] oldPattern = {'x', 'r', 'e', 'f'};

    /**
     * end-of-file position supplied to the constructor
     */
    private final long eof;

    /**
     * store receiving the xref positions and object offsets read by this class
     */
    final Offsets offset;

    /**
     * Creates a reader bound to an already opened PDF data source.
     *
     * @param pdf_datafile buffer giving random access to the raw PDF bytes
     * @param eof          position this class treats as the end of the file
     * @param offset       store that receives the object offsets read from the tables
     */
    public RefTable(final RandomAccessBuffer pdf_datafile, final long eof, final Offsets offset) {
        this.offset = offset;
        this.eof = eof;
        this.pdf_datafile = pdf_datafile;
    }

    /**
     * Finds the {@code startxref} keyword in the tail of the file and returns
     * the number written after it.
     * <p>
     * The end of the file is first scanned backwards in 255 byte steps for the
     * bytes {@code %%EO}; a block of up to 1024 bytes is then read and searched
     * backwards for {@code txref} (or {@code rtref}, because some files write
     * {@code startref}). Resets the invalid-table flag on {@link #offset}.
     *
     * @return the value following the keyword
     * @throws PdfException if the tail cannot be read, the keyword is missing,
     *                      or no usable number follows it (the file is closed
     *                      first in the last two cases)
     */
    private int readFirstStartRef() throws PdfException {

        //reset flag
        offset.setRefTableInvalid(false);

        int pointer = -1;
        int i = 1019;
        final StringBuilder startRef = new StringBuilder(10);

        /* move to end of file and read last 1024 bytes*/
        final int block = 1024;
        int tailLength = block;
        byte[] lastBytes;
        long end;

        /*
         * set endpoint, losing null chars and other trash after the EOF marker
         */
        final int[] EndOfFileMarker = {37, 37, 69, 79}; // "%%EO", matched back to front
        int valReached = 3;
        boolean EOFFound = false;
        try {
            end = eof;

            final int bufSize = 255;
            while (true) {
                final byte[] buffer = getBytes(end - bufSize, bufSize);

                //counts DOWN by one for every byte examined in this buffer
                //(named so it no longer hides the 'offset' field)
                int examined = 0;

                for (int ii = bufSize - 1; ii > -1; ii--) {

                    //see if we can decrement EOF tracker or restart check
                    if (!EOFFound) {
                        valReached = 3;
                    }

                    if (buffer[ii] == EndOfFileMarker[valReached]) {
                        valReached--;
                        EOFFound = true;
                    } else {
                        EOFFound = false;
                    }

                    //move to next byte
                    examined--;

                    if (valReached < 0) {
                        ii = -1;
                    }

                }

                //exit if found values on loop
                if (valReached < 0) {
                    // NOTE(review): 'examined' is negative, so this moves 'end' FORWARD by the
                    // number of bytes examined rather than back to the marker. Kept as in the
                    // original because the 1024 byte window still covers the keyword - confirm intent.
                    end -= examined;
                    break;
                } else {
                    end -= bufSize;
                }

                //allow for no eof
                if (end < 0) {
                    end = eof;
                    break;
                }
            }

            //allow for very small file
            int count = (int) (end - block);

            if (count < 0) {
                count = 0;
                tailLength = (int) eof;
                i = tailLength + 3; //force reset below
            }

            lastBytes = getBytes(count, tailLength);

        } catch (final Exception e) {
            LogWriter.writeLog("Exception " + e + " reading last 1024 bytes");

            throw new PdfException(e + " reading last 1024 bytes");
        }

        //look for tref as end of startxref
        final int fileSize = lastBytes.length;

        if (i > fileSize) {
            i = fileSize - 5;
        }

        while (i > -1) {

            //first check is because startref works as well a startxref !!
            if (((lastBytes[i] == 116 && lastBytes[i + 1] == 120) || (lastBytes[i] == 114 && lastBytes[i + 1] == 116))
                    && (lastBytes[i + 2] == 114)
                    && (lastBytes[i + 3] == 101)
                    && (lastBytes[i + 4] == 102)) {
                break;
            }

            i--;
        }

        /*trap buggy files - also covers buffers too short to hold the keyword, where i starts below -1*/
        if (i < 0) {
            try {
                closeFile();
            } catch (final IOException e1) {
                LogWriter.writeLog("Exception " + e1 + " closing file");
            }
            throw new PdfException("No Startxref found in last 1024 bytes ");
        }

        i += 5; //allow for word length

        //move to start of value ignoring spaces or returns
        //(bounded by the real buffer length, which is below 1024 for small files)
        while (i < fileSize && (lastBytes[i] == 10 || lastBytes[i] == 32 || lastBytes[i] == 13)) {
            i++;
        }

        //collect the value up to the next space or return
        while (i < fileSize
                && lastBytes[i] != 10
                && lastBytes[i] != 32
                && lastBytes[i] != 13) {
            startRef.append((char) lastBytes[i]);
            i++;
        }

        /*convert xref to string to get pointer*/
        if (startRef.length() > 0) {
            try {
                pointer = Integer.parseInt(startRef.toString());
            } catch (final NumberFormatException e) {
                //garbage after the keyword - report it through the normal "not found" path
                LogWriter.writeLog("Exception " + e + " reading startxref value");
                pointer = -1;
            }
        }

        if (pointer == -1) {
            LogWriter.writeLog("No Startref found in last 1024 bytes ");

            try {
                closeFile();
            } catch (final IOException e1) {
                LogWriter.writeLog("Exception " + e1 + " closing file");
            }
            throw new PdfException("No Startref found in last 1024 bytes ");
        }

        return pointer;
    }


    /**
     * Locates the first reference table, works out whether it is a compressed
     * cross-reference stream or a legacy xref table, and reads it.
     * <p>
     * Without a linearized object the position comes from
     * {@link #readFirstStartRef()}. With one, the buffer returned by
     * {@code pdf_datafile.getPdfBuffer()} is searched for the first
     * {@code xref} keyword, or for {@code XRef}, in which case the position
     * just after the preceding {@code endobj} is used. If the position is
     * unusable or reading throws, the table is flagged invalid on
     * {@link #offset} and the file is scanned through {@code BrokenRefTable}.
     *
     * @param linearObj      linearized dictionary, or null when the file is not treated as linearized
     * @param currentPdfFile reader used to decode dictionaries and read objects
     * @param objectReader   reader used to fetch the raw data of a cross-reference stream
     * @return the root object found or rebuilt
     * @throws PdfException if the start of the table cannot be found or the manual scan fails
     */
    public final PdfObject readReferenceTable(final PdfObject linearObj, final PdfFileReader currentPdfFile, final ObjectReader objectReader) throws PdfException {

        int pointer = -1;
        final int eof = (int) this.eof;

        boolean islinearizedCompressed = false;

        if (linearObj == null) {
            pointer = readFirstStartRef();
        } else { //find at start of Linearized
            final byte[] data = pdf_datafile.getPdfBuffer();

            final int count = data.length;
            int ptr = 5;
            for (int i = 0; i < count; i++) {

                //track start of this object (needed for compressed)
                if (hasKeywordAt(data, i, "endobj")) {
                    ptr = i + 6;
                }

                if (hasKeywordAt(data, i, "xref")) {
                    pointer = i;
                    break;
                } else if (hasKeywordAt(data, i, "XRef")) {

                    islinearizedCompressed = true;

                    //object holding the stream starts after the previous endobj
                    pointer = ptr;
                    while (pointer < count && (data[pointer] == 10 || data[pointer] == 13 || data[pointer] == 32)) {
                        pointer++;
                    }

                    break;
                }
            }
        }

        offset.addXref(pointer);

        PdfObject rootObj = null;

        boolean isInvalid = true;

        if (pointer >= eof || pointer == 0) {

            LogWriter.writeLog("Pointer not if file - trying to manually find startref");

        } else {
            try {
                if (islinearizedCompressed || isCompressedStream(pointer, eof)) {
                    rootObj = readCompressedStream(rootObj, pointer, currentPdfFile, objectReader, linearObj);
                    isInvalid = false;

                } else {
                    rootObj = readLegacyReferenceTable(rootObj, pointer, eof, currentPdfFile);
                    isInvalid = false;
                }
            } catch (final Exception e) {
                //any failure (including runtime exceptions from the parsers) drops through to the manual scan
                LogWriter.writeLog("[PDF] Exception reading reg table " + e + " - trying to manually find startref");
            }
        }

        if (isInvalid) {
            offset.setRefTableInvalid(true);

            try {
                rootObj = new PageObject(BrokenRefTable.findOffsets(pdf_datafile, offset));

                final byte[] rootDictBytes = BrokenRefTable.findFirstRootDict(pdf_datafile); //fix for 28184
                if (rootDictBytes != null) {
                    final PdfObject pdfObject = new CompressedObject("0 0 R");
                    Dictionary.readDictionary(pdfObject, 0, rootDictBytes, -1, currentPdfFile);
                    encryptObj = pdfObject.getDictionary(PdfDictionary.Encrypt);
                    if (encryptObj != null) {

                        final byte[][] IDs = pdfObject.getStringArray(PdfDictionary.ID);
                        if (IDs != null && this.ID == null) {
                            // only the first encountered ID should be used as a fileID for decryption
                            this.ID = IDs[0];
                        }
                    }
                    infoObject = pdfObject.getDictionary(PdfDictionary.Info);
                }
            } catch (final Error err) {
                throw new PdfException(err.getMessage() + " attempting to manually scan file for objects");
            }

            currentPdfFile.readObject(rootObj);

        }

        return rootObj;
    }

    /**
     * Tests whether the ASCII {@code keyword} occurs in {@code data} starting at
     * {@code pos}, returning false (never throwing) when it would run past
     * either end of the array.
     */
    private static boolean hasKeywordAt(final byte[] data, final int pos, final String keyword) {
        final int len = keyword.length();
        if (pos < 0 || pos + len > data.length) {
            return false;
        }
        for (int k = 0; k < len; k++) {
            if (data[pos + k] != keyword.charAt(k)) {
                return false;
            }
        }
        return true;
    }


    /**
     * Reads one or more legacy reference tables (an {@code xref} section
     * followed by a {@code trailer} dictionary), starting at {@code pointer}
     * and following the trailer's Prev entry to earlier tables.
     * <p>
     * For each table the entries are handed to {@code offset.readXRefs} and the
     * trailer is passed to {@link #processTrailer}, which only takes Root,
     * Encrypt, ID and Info while no root has been found yet. If the data does
     * not start with {@code xref}, or no root object results, the file is
     * scanned through {@code BrokenRefTable} and the table is flagged invalid.
     *
     * @param rootObj        root object found so far, or null
     * @param pointer        file position of the first table to read
     * @param eof            end-of-file position passed on to the helpers
     * @param currentPdfFile reader used to decode dictionaries and read objects
     * @return the root object (never null on normal return - it is rebuilt by scanning if needed)
     * @throws PdfException if the trailer data cannot be read (the file is closed first)
     */
    private PdfObject readLegacyReferenceTable(PdfObject rootObj, int pointer, final int eof, final PdfFileReader currentPdfFile) throws PdfException {


        int endTable, current = 0; //current object number
        byte[] Bytes;
        int bufSize = 1024;

        /*read and decode 1 or more trailers*/
        while (true) {

            try {

                //allow for pointer outside file
                Bytes = Trailer.readTrailer(bufSize, pointer, eof, pdf_datafile);

            } catch (final Exception e) {

                try {
                    closeFile();
                } catch (final IOException e1) {
                    LogWriter.writeLog("Exception " + e + " closing file " + e1);
                }
                throw new PdfException("Exception " + e + " reading trailer");
            }

            if (Bytes == null) { //safety catch
                break;
            }

            //get trailer
            int i = 0;

            final int maxLen = Bytes.length;
            boolean trailerNotFound = true;

            while (i < maxLen - 7) { //look for trailer keyword (116,114,97,105,108,101,114 = "trailer")
                if (Bytes[i] == 116 && Bytes[i + 1] == 114 && Bytes[i + 2] == 97 && Bytes[i + 3] == 105 &&
                        Bytes[i + 4] == 108 && Bytes[i + 5] == 101 && Bytes[i + 6] == 114) {
                    trailerNotFound = false;
                    break;
                }

                i++;
            }

            //save endtable position for later - the xref entries end where the trailer keyword starts
            endTable = i;

            if (i == Bytes.length || trailerNotFound) {
                break;
            }

            //move to beyond << - the loop stops on the first '<' (60), the increment below steps onto the second
            //NOTE(review): no bounds check, so data without '<' after the keyword throws ArrayIndexOutOfBoundsException
            while (Bytes[i] != 60 && Bytes[i - 1] != 60) {
                i++;
            }

            i++;
            //holder for the trailer dictionary values
            final PdfObject pdfObject = new CompressedObject("0 0 R");
            Dictionary.readDictionary(pdfObject, i, Bytes, -1, currentPdfFile);

            //move to beyond >>
            i = skipToEnd(Bytes, i);

            //handle optional XRefStm
            final int XRefStm = pdfObject.getInt(PdfDictionary.XRefStm);

            //the value chosen here is only tested against -1 below; when the table is read,
            //pointer is replaced by the Prev entry
            if (XRefStm != -1) {
                pointer = XRefStm;
            } else { //usual way
                pointer = getPointer(pointer, Bytes, i, maxLen);
            }

            //back to the start of the data - presumably the first non-space byte; verify against StreamReaderUtils
            i = StreamReaderUtils.skipSpaces(Bytes, 0);

            if (pointer == -1) {
                LogWriter.writeLog("No startRef");

                /*now read the objects for the trailers*/
            } else if (Bytes[i] == 120 && Bytes[i + 1] == 114 && Bytes[i + 2] == 101 && Bytes[i + 3] == 102) { //make sure starts xref

                //continue from index 5 of the data, skipping any spaces
                i = StreamReaderUtils.skipSpaces(Bytes, 5);

                current = offset.readXRefs(current, Bytes, endTable, i, eof, pdf_datafile);

                /*now process trailer values - only first set of table values for root, encryption and info*/
                rootObj = processTrailer(rootObj, pdfObject);

                //make sure first values used if several tables and code for prev
                pointer = pdfObject.getInt(PdfDictionary.Prev);

                //see if other trailers
                if (pointer != -1 && pointer < this.eof) {
                    //reset values for loop
                    bufSize = 1024;

                    //track ref table so we can work out object length
                    offset.addXref(pointer);

                } else { //reset if fails second test above
                    pointer = -1;
                }

            } else {
                //data does not begin with xref - give up on the table and index the file manually
                pointer = -1;

                //needs to be read to pick up potential /Pages value
                //noinspection ObjectAllocationInLoop
                rootObj = new PageObject(BrokenRefTable.findOffsets(pdf_datafile, offset));
                currentPdfFile.readObject(rootObj);

                offset.setRefTableInvalid(true);


            }
            //no further table to follow
            if (pointer == -1) {
                break;
            }
        }

        if (encryptObj == null && rootObj != null) { //manual check for broken file (ignore if Encrypted)
            rootObj = handleBrokenFile(rootObj, currentPdfFile);
        }

        //something gone wrong so manually index
        if (rootObj == null) { //see 21382

            //discard whatever offsets were collected before rescanning
            offset.clear();
            offset.reuse();

            //needs to be read to pick up potential /Pages value
            //noinspection ObjectAllocationInLoop
            rootObj = new PageObject(BrokenRefTable.findOffsets(pdf_datafile, offset));
            currentPdfFile.readObject(rootObj);

            offset.setRefTableInvalid(true);

        }

        return rootObj;
    }

    /**
     * Sanity check on the root object taken from the trailer: resolves it and
     * looks at its Type. A root whose Type is Font is rejected (see 21153 - ref
     * table in wrong order) by returning null so the caller re-indexes the file.
     * <p>
     * If resolving throws, the object's status and unresolved data are put back
     * as they were, the problem is logged, and the object is returned unchanged.
     *
     * @param rootObj        candidate root object (must not be null)
     * @param currentPdfFile reader used to resolve the object
     * @return {@code rootObj}, or null when it turned out to be a Font
     */
    private static PdfObject handleBrokenFile(PdfObject rootObj, final PdfFileReader currentPdfFile) {

        //snapshot so a failed resolve can be undone
        final int savedStatus = rootObj.getStatus();
        final byte[] savedData = rootObj.getUnresolvedData();

        int rootType;
        try {
            new ObjectDecoder(currentPdfFile).checkResolved(rootObj);
            rootType = rootObj.getParameterConstant(PdfDictionary.Type);
        } catch (final Exception e) { //we need to ignore so just catch, put back as was and log
            rootType = -1;
            rootObj.setStatus(savedStatus);
            rootObj.setUnresolvedData(savedData, savedStatus);
            LogWriter.writeLog("[PDF] Exception reading type on root object " + e);
        }

        //null makes the calling code index the file manually
        return rootType == PdfDictionary.Font ? null : rootObj;
    }

    /**
     * Reads the number that follows the {@code startxref} keyword after a
     * trailer dictionary.
     * <p>
     * NOTE(review): the scan below stops as soon as ANY one of the five bytes at
     * the current position matches its letter of {@code txref}, not when all of
     * them do. On the usual {@code startxref} text that is the first {@code t}
     * (index 1 of the keyword), which is why 8 is added afterwards to step past
     * the keyword. The two are tuned to each other, so neither is changed here.
     *
     * @param pointer value returned unchanged when no number is found
     * @param bytes   trailer data
     * @param i       position just after the trailer dictionary
     * @param maxLen  not used
     * @return the parsed value, or {@code pointer} if an {@code obj} keyword is
     *         met first or nothing follows the keyword
     */
    private int getPointer(int pointer, final byte[] bytes, int i, final int maxLen) {

        int pos = StreamReaderUtils.skipSpaces(bytes, i);

        //step over comment lines, each running up to and including its line feed
        while (bytes[pos] == '%') {
            do {
                pos++;
            } while (bytes[pos] != 10);
            pos++;
        }

        //look for xref as end of startref (see note above on how this really stops)
        boolean objSeenFirst = false;
        while (!(bytes[pos] == 116 || bytes[pos + 1] == 120
                || bytes[pos + 2] == 114 || bytes[pos + 3] == 101 || bytes[pos + 4] == 102)) {

            //ran into the next object, so there is no startxref here
            if (bytes[pos] == 'o' && bytes[pos + 1] == 'b' && bytes[pos + 2] == 'j') {
                objSeenFirst = true;
                break;
            }
            pos++;
        }

        if (objSeenFirst) {
            return pointer;
        }

        //move to start of value ignoring spaces or returns
        final int valueStart = StreamReaderUtils.skipSpaces(bytes, pos + 8);
        final int valueEnd = StreamReaderUtils.skipToEndOfKey(bytes, valueStart);

        //convert the digits to the pointer, keeping the incoming value if there are none
        if (valueStart != valueEnd) {
            return NumberUtils.parseInt(valueStart, valueEnd, bytes);
        }
        return pointer;
    }

    /**
     * Steps over a dictionary, including nested dictionaries and arrays.
     * <p>
     * {@code i} is expected to sit on the second {@code <} of the opening
     * {@code <<}. Nesting is tracked by counting {@code <<} and {@code >>}
     * pairs; anything inside {@code [...]} is skipped without being inspected.
     * If {@code i} is not on such a position the nesting count is already zero
     * and {@code i} comes straight back.
     *
     * @param bytes data holding the dictionary
     * @param i     index of the second opening bracket
     * @return index just past the {@code >>} that closes the dictionary
     */
    private static int skipToEnd(final byte[] bytes, int i) {

        final int total = bytes.length;
        int depth = 0;

        do {
            final byte here = bytes[i];

            if (here == '<' && bytes[i - 1] == '<') {
                //opening pair
                depth++;
                i++;
            } else if (here == '[') {
                //jump to the closing bracket of the array
                i++;
                while (bytes[i] != ']') {
                    i++;
                    if (i == total) {
                        break;
                    }
                }
            } else if (here == '>' && bytes[i - 1] == '>') {
                //closing pair
                depth--;
                i++;
            }

            if (depth == 0) {
                return i;
            }

            i++;
        } while (true);
    }


    /**
     * Reads a chain of compressed (PDF 1.5 style) cross-reference streams.
     * <p>
     * Each stream is read at the current position and its trailer values are
     * offered to {@link #processTrailer}. For a linearized file only the first
     * stream is read. Otherwise the Prev entry is followed, and as soon as it
     * leads to a table that is not a compressed stream the rest of the chain is
     * handed to {@link #readLegacyReferenceTable}.
     *
     * @param rootObj        root object found so far, or null
     * @param pointer        file position of the first stream, -1 for none
     * @param currentPdfFile reader used to decode the stream object
     * @param objectReader   reader used to fetch the raw object data
     * @param linearObj      linearized dictionary, or null
     * @return the root object after all tables have been processed
     * @throws PdfException if a table cannot be read or classified
     */
    private PdfObject readCompressedStream(PdfObject rootObj, int pointer, final PdfFileReader currentPdfFile, final ObjectReader objectReader, final PdfObject linearObj) throws PdfException {

        final boolean followPrev = linearObj == null;
        int next = pointer;

        while (next != -1) {

            // position the file on the stream object and read it
            movePointer(next);
            final PdfObject xrefStream = readStreamTableData(currentPdfFile, objectReader, offset, pdf_datafile);

            // only the first set of table values supplies root, encryption and info
            rootObj = processTrailer(rootObj, xrefStream);

            //linearized files stop after the first table - may need adjusting as more examples turn up
            if (!followPrev) {
                break;
            }

            next = xrefStream.getInt(PdfDictionary.Prev);

            //a non-compressed object table can follow a compressed one so we need to allow for this
            if (next != -1 && !isCompressedStream(next, (int) eof)) {
                return readLegacyReferenceTable(rootObj, next, (int) eof, currentPdfFile);
            }
        }

        return rootObj;
    }

    /**
     * Reads the cross-reference stream object at the reader's current position
     * and stores its entries in {@code offset}.
     * <p>
     * The field widths come from the W entry. Without an Index entry a single
     * run of Size entries starting at object 0 is read; otherwise Index is
     * taken as (first object, count) pairs read one after another from the
     * same decoded data.
     *
     * @param currentPdfFile reader used to decode the dictionary and, if needed, the stream
     * @param objectReader   supplies the raw bytes of the object
     * @param offset         store receiving the entries
     * @param pdf_datafile   raw file data passed through to the entry reader
     * @return the decoded stream object, whose dictionary doubles as the trailer
     * @throws PdfException if the object cannot be read
     */
    private static PdfObject readStreamTableData(final PdfFileReader currentPdfFile, final ObjectReader objectReader,
                                                 final Offsets offset,
                                                 final RandomAccessBuffer pdf_datafile) throws PdfException {

        final byte[] rawObject = objectReader.readObjectData(-1, null);

        final PdfObject pdfObject = new CompressedObject(getObjectName(rawObject));
        pdfObject.setCompressedStream(true);
        new ObjectDecoder(currentPdfFile).readDictionaryAsObject(pdfObject, 0, rawObject);

        //read the field sizes
        final int[] widths = pdfObject.getIntArray(PdfDictionary.W);

        //read the xrefs stream, decoding it now if that has not happened yet
        byte[] entries = pdfObject.getDecodedStream();
        if (entries == null) {
            entries = currentPdfFile.readStream(pdfObject, true, true, false, false, true, null);
        }

        final int[] ranges = pdfObject.getIntArray(PdfDictionary.Index);

        if (ranges == null) { //single set of values
            CompressedObjects.readCompressedOffsets(0, 0, pdfObject.getInt(PdfDictionary.Size), widths, entries, offset, pdf_datafile);
            return pdfObject;
        }

        //pairs of values in Index[] array, each run continuing where the last one stopped
        int position = 0;
        for (int pair = 0; pair < ranges.length; pair += 2) {
            position = CompressedObjects.readCompressedOffsets(position, ranges[pair], ranges[pair + 1], widths, entries, offset, pdf_datafile);
        }

        return pdfObject;
    }

    /**
     * Takes Root, Encrypt, ID and Info from a trailer dictionary, but only
     * while no root object has been found yet, so the first table read wins.
     *
     * @param rootObj   root object found so far, or null
     * @param pdfObject object holding the trailer values
     * @return {@code rootObj} if it was already set, otherwise the Root entry of {@code pdfObject}
     */
    private PdfObject processTrailer(PdfObject rootObj, final PdfObject pdfObject) {

        //a later (older) table never replaces values already taken
        if (rootObj != null) {
            return rootObj;
        }

        final PdfObject root = pdfObject.getDictionary(PdfDictionary.Root);

        encryptObj = pdfObject.getDictionary(PdfDictionary.Encrypt);

        if (encryptObj != null) {
            final byte[][] fileIds = pdfObject.getStringArray(PdfDictionary.ID);

            // only the first encountered ID should be used as a fileID for decryption
            final boolean firstId = fileIds != null && this.ID == null;
            if (firstId) {
                this.ID = fileIds[0];
            }
        }

        infoObject = pdfObject.getDictionary(PdfDictionary.Info);

        return root;
    }

    /**
     * Builds a reference name such as {@code "12 0 R"} from the start of raw
     * object data such as {@code "12 0 obj..."}.
     * <p>
     * Line feeds and carriage returns count as spaces, leading and repeated
     * spaces are dropped, and copying stops once the three characters of
     * {@link #pattern} have been matched in a row. Characters consumed by a
     * match attempt are not copied, even if the attempt then fails.
     *
     * @param raw raw object data
     * @return the characters collected before the keyword, followed by {@code R}
     */
    static String getObjectName(final byte[] raw) {

        final StringBuilder name = new StringBuilder();
        char previous = ' ';
        int hits = 0;

        for (final byte rawByte : raw) {

            //treat returns same as spaces
            char ch = (char) rawByte;
            if (ch == 10 || ch == 13) {
                ch = ' ';
            }

            final boolean repeatedSpace = ch == ' ' && previous == ' ';

            if (!repeatedSpace && ch == pattern.charAt(hits)) { //looking for obj at end
                hits++;
                if (hits == 3) {
                    break;
                }
            } else {
                hits = 0;
                if (!repeatedSpace) { //lose duplicate spaces
                    name.append(ch);
                }
            }

            previous = ch;
        }

        //add end to turn it into a reference
        return name.append('R').toString();
    }

    /**
     * Reads {@code count} bytes from the data file starting at {@code start}.
     * <p>
     * A negative {@code start} is not read at all. An {@link IOException} is
     * logged rather than thrown. In both cases the array comes back with
     * whatever was filled in, the rest left as zeros.
     *
     * @param start file position to read from
     * @param count size of the array to return
     * @return a new array of length {@code count}
     */
    byte[] getBytes(final long start, final int count) {

        final byte[] result = new byte[count];

        //nothing to read before the start of the file
        if (start < 0) {
            return result;
        }

        try {
            pdf_datafile.seek(start);
            pdf_datafile.read(result); //get next chars
        } catch (final IOException e) {
            LogWriter.writeLog("Exception: " + e.getMessage());
        }

        return result;
    }

    /**
     * Closes the data file and drops the reference to it. Does nothing when
     * the file has already been closed through this method.
     *
     * @throws IOException if closing fails, in which case the reference is kept
     */
    void closeFile() throws IOException {

        if (pdf_datafile == null) {
            return;
        }

        pdf_datafile.close();
        pdf_datafile = null;
    }


    /**
     * Scans forward from {@code pointer} to decide whether the reference table
     * there is a compressed stream or a legacy table.
     * <p>
     * The data is read in chunks of up to 50 bytes. The scan stops on the first
     * of: the first three letters of {@code xref} (legacy), the first four
     * letters of {@code ObjStm}, or the first three letters of {@code XRef}
     * (both compressed). A negative {@code pointer} is first advanced in chunk
     * sized steps until it is no longer negative.
     * <p>
     * The scan ends when the end of the file is reached. Previously it spun
     * forever there, because the chunk size dropped to zero and the position
     * stopped advancing, which also left the exception below unreachable.
     *
     * @param pointer file position to start from
     * @param eof     end-of-file position
     * @return true for a compressed stream, false for a legacy table
     * @throws PdfException if none of the keywords is found before the end of
     *                      the file (the file is closed first)
     */
    private boolean isCompressedStream(int pointer, final int eof) throws PdfException {

        int bufSize = 50, charReached_legacy = 0, charReached_comp1 = 0, charReached_comp2 = 0;

        final int[] objStm = {'O', 'b', 'j', 'S', 't', 'm'};
        final int[] XRef = {'X', 'R', 'e', 'f'};

        int type = UNSET;

        //flag to show if at start of data for check
        boolean firstRead = true;

        //set once one of the keywords has been matched far enough
        boolean found = false;

        while (!found) {

            //nothing left to scan - stop instead of re-reading an empty chunk forever
            if (pointer >= eof) {
                break;
            }

            /* adjust buffer if less than a full chunk left in file */
            if (pointer + bufSize > eof) {
                bufSize = eof - pointer;
            }

            if (bufSize < 0) {
                bufSize = 50;
            }

            if (pointer < 0) {
                pointer += bufSize;
                continue;
            }

            final byte[] buffer = getBytes(pointer, bufSize);

            //allow for fact sometimes start of data wrong (pointer lands on "ref" instead of "xref");
            //only checked when the chunk really holds three bytes
            if (firstRead && bufSize > 2 && buffer[0] == 'r' && buffer[1] == 'e' && buffer[2] == 'f') {
                charReached_legacy = 1;
            }

            firstRead = false; //switch off

            /*look for xref or obj */
            for (int i = 0; i < bufSize && !found; i++) {

                final byte currentByte = buffer[i];

                /* check for xref OR end - reset if not */
                if (currentByte == oldPattern[charReached_legacy] && type != COMPRESSED) {
                    charReached_legacy++;
                    type = LEGACY;
                } else if ((currentByte == objStm[charReached_comp1]) && (charReached_comp1 == 0 || type == COMPRESSED)) {

                    charReached_comp1++;
                    type = COMPRESSED;
                } else if ((currentByte == XRef[charReached_comp2]) && (charReached_comp2 == 0 || type == COMPRESSED)) {

                    charReached_comp2++;
                    type = COMPRESSED;
                } else {

                    charReached_legacy = 0;
                    charReached_comp1 = 0;
                    charReached_comp2 = 0;

                    type = UNSET;
                }

                found = charReached_legacy == 3 || charReached_comp1 == 4 || charReached_comp2 == 3;
            }

            //update pointer
            if (!found) {
                pointer += bufSize;
            }
        }

        //a partial match cut off by the end of the file does not count
        if (!found) {
            type = UNSET;
        }

        /*
         * throw exception if no match or tell user which type
         */
        if (type == UNSET) {
            try {
                closeFile();
            } catch (final IOException e1) {
                LogWriter.writeLog("Exception " + e1 + " closing file");
            }
            throw new PdfException("Exception unable to find ref or obj in trailer");
        }

        return type == COMPRESSED;
    }

    /**
     * @return the Info dictionary taken from the trailer, or null if none has been read
     */
    public PdfObject getInfoObject() {
        return infoObject;
    }

    /**
     * @return the Encrypt dictionary taken from the trailer, or null if none has been read
     */
    public PdfObject getEncryptionObject() {
        return encryptObj;
    }

    /**
     * @return the stored file ID (the internal array itself, not a copy), or null
     *         if none was stored - it is only stored when an Encrypt dictionary was found
     */
    public byte[] getID() {
        return ID;
    }

    //////////////////////////////////////////////////////////////////////////

    /**
     * Moves the read position of the data file to {@code pointer}.
     * <p>
     * A position beyond the length of the file is not applied and is only
     * logged. Any exception raised while checking or seeking is also logged
     * rather than thrown, so the position may be unchanged on return.
     *
     * @param pointer new file position
     */
    public void movePointer(final long pointer) {
        try {
            //make sure inside file
            if (pointer <= pdf_datafile.length()) {
                pdf_datafile.seek(pointer);
            } else {
                LogWriter.writeLog("Attempting to access ref outside file");
            }
        } catch (final Exception e) {
            LogWriter.writeLog("Exception " + e + " moving pointer to  " + pointer + " in file.");
        }
    }
}