All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.itextpdf.text.pdf.PRTokeniser Maven / Gradle / Ivy

There is a newer version: 5.5.13.3
Show newest version
/*
 *
 * This file is part of the iText (R) project.
 * Copyright (c) 1998-2017 iText Group NV
 * Authors: Bruno Lowagie, Paulo Soares, et al.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License version 3
 * as published by the Free Software Foundation with the addition of the
 * following permission added to Section 15 as permitted in Section 7(a):
 * FOR ANY PART OF THE COVERED WORK IN WHICH THE COPYRIGHT IS OWNED BY
 * ITEXT GROUP. ITEXT GROUP DISCLAIMS THE WARRANTY OF NON INFRINGEMENT
 * OF THIRD PARTY RIGHTS
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 * or FITNESS FOR A PARTICULAR PURPOSE.
 * See the GNU Affero General Public License for more details.
 * You should have received a copy of the GNU Affero General Public License
 * along with this program; if not, see http://www.gnu.org/licenses or write to
 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
 * Boston, MA, 02110-1301 USA, or download the license from the following URL:
 * http://itextpdf.com/terms-of-use/
 *
 * The interactive user interfaces in modified source and object code versions
 * of this program must display Appropriate Legal Notices, as required under
 * Section 5 of the GNU Affero General Public License.
 *
 * In accordance with Section 7(b) of the GNU Affero General Public License,
 * a covered work must retain the producer line in every PDF that is created
 * or manipulated using iText.
 *
 * You can be released from the requirements of the license by purchasing
 * a commercial license. Buying such a license is mandatory as soon as you
 * develop commercial activities involving the iText software without
 * disclosing the source code of your own applications.
 * These activities include: offering paid services to customers as an ASP,
 * serving PDFs on the fly in a web application, shipping iText with a closed
 * source product.
 *
 * For more information, please contact iText Software Corp. at this
 * address: sales@itextpdf.com
 */
package com.itextpdf.text.pdf;

import com.itextpdf.text.error_messages.MessageLocalization;
import com.itextpdf.text.exceptions.InvalidPdfException;
import com.itextpdf.text.io.RandomAccessSourceFactory;

import java.io.IOException;
/**
 *
 * @author  Paulo Soares
 */
public class PRTokeniser {

    private final StringBuilder outBuf = new StringBuilder();

    /**
     * Enum representing the possible token types
     * @since 5.0.1
     */ 
    public enum TokenType {
        NUMBER,
        STRING,
        NAME,
        COMMENT,
        START_ARRAY,
        END_ARRAY,
        START_DIC,
        END_DIC,
        REF,
        OTHER,
        ENDOFFILE
    }
    
    public static final boolean delims[] = {
        true,  true,  false, false, false, false, false, false, false, false,
        true,  true,  false, true,  true,  false, false, false, false, false,
        false, false, false, false, false, false, false, false, false, false,
        false, false, false, true,  false, false, false, false, true,  false,
        false, true,  true,  false, false, false, false, false, true,  false,
        false, false, false, false, false, false, false, false, false, false,
        false, true,  false, true,  false, false, false, false, false, false,
        false, false, false, false, false, false, false, false, false, false,
        false, false, false, false, false, false, false, false, false, false,
        false, false, true,  false, true,  false, false, false, false, false,
        false, false, false, false, false, false, false, false, false, false,
        false, false, false, false, false, false, false, false, false, false,
        false, false, false, false, false, false, false, false, false, false,
        false, false, false, false, false, false, false, false, false, false,
        false, false, false, false, false, false, false, false, false, false,
        false, false, false, false, false, false, false, false, false, false,
        false, false, false, false, false, false, false, false, false, false,
        false, false, false, false, false, false, false, false, false, false,
        false, false, false, false, false, false, false, false, false, false,
        false, false, false, false, false, false, false, false, false, false,
        false, false, false, false, false, false, false, false, false, false,
        false, false, false, false, false, false, false, false, false, false,
        false, false, false, false, false, false, false, false, false, false,
        false, false, false, false, false, false, false, false, false, false,
        false, false, false, false, false, false, false, false, false, false,
        false, false, false, false, false, false, false};
    
    static final String EMPTY = "";

    
    private final RandomAccessFileOrArray file;
    
    protected TokenType type;
    protected String stringValue;
    protected int reference;
    protected int generation;
    protected boolean hexString;

    /**
     * Creates a PRTokeniser for the specified {@link RandomAccessFileOrArray}.
     * The beginning of the file is read to determine the location of the header, and the data source is adjusted
     * as necessary to account for any junk that occurs in the byte source before the header
     * @param file the source
     */
    public PRTokeniser(RandomAccessFileOrArray file) {
    	this.file = file;
	}
    
    public void seek(long pos) throws IOException {
        file.seek(pos);
    }
    
    public long getFilePointer() throws IOException {
        return file.getFilePointer();
    }

    public void close() throws IOException {
        file.close();
    }
    
    public long length() throws IOException {
        return file.length();
    }

    public int read() throws IOException {
        return file.read();
    }
    
    public RandomAccessFileOrArray getSafeFile() {
        return new RandomAccessFileOrArray(file);
    }
    
    //TODO: is this really necessary?  Seems like exposing this detail opens us up to all sorts of potential problems
    public RandomAccessFileOrArray getFile() {
        return file;
    }
    
    public String readString(int size) throws IOException {
    	StringBuilder buf = new StringBuilder();
        int ch;
        while ((size--) > 0) {
            ch = read();
            if (ch == -1)
                break;
            buf.append((char)ch);
        }
        return buf.toString();
    }

    /**
     * Is a certain character a whitespace? Currently checks on the following: '0', '9', '10', '12', '13', '32'.
     * 
The same as calling {@link #isWhitespace(int, boolean) isWhiteSpace(ch, true)}. * @param ch int * @return boolean * @since 5.5.1 */ public static final boolean isWhitespace(int ch) { return isWhitespace(ch, true); } /** * Checks whether a character is a whitespace. Currently checks on the following: '0', '9', '10', '12', '13', '32'. * @param ch int * @param isWhitespace boolean * @return boolean * @since 5.5.1 */ public static final boolean isWhitespace(int ch, boolean isWhitespace) { return ( ( isWhitespace && ch == 0 ) || ch == 9 || ch == 10 || ch == 12 || ch == 13 || ch == 32); } public static final boolean isDelimiter(int ch) { return (ch == '(' || ch == ')' || ch == '<' || ch == '>' || ch == '[' || ch == ']' || ch == '/' || ch == '%'); } public static final boolean isDelimiterWhitespace(int ch) { return delims[ch + 1]; } public TokenType getTokenType() { return type; } public String getStringValue() { return stringValue; } public int getReference() { return reference; } public int getGeneration() { return generation; } public void backOnePosition(int ch) { if (ch != -1) file.pushBack((byte)ch); } public void throwError(String error) throws IOException { throw new InvalidPdfException(MessageLocalization.getComposedMessage("1.at.file.pointer.2", error, String.valueOf(file.getFilePointer()))); } public int getHeaderOffset() throws IOException{ String str = readString(1024); int idx = str.indexOf("%PDF-"); if (idx < 0){ idx = str.indexOf("%FDF-"); if (idx < 0) throw new InvalidPdfException(MessageLocalization.getComposedMessage("pdf.header.not.found")); } return idx; } public char checkPdfHeader() throws IOException { file.seek(0); String str = readString(1024); int idx = str.indexOf("%PDF-"); if (idx != 0) throw new InvalidPdfException(MessageLocalization.getComposedMessage("pdf.header.not.found")); return str.charAt(7); } public void checkFdfHeader() throws IOException { file.seek(0); String str = readString(1024); int idx = str.indexOf("%FDF-"); if (idx != 0) 
throw new InvalidPdfException(MessageLocalization.getComposedMessage("fdf.header.not.found")); } public long getStartxref() throws IOException { int arrLength = 1024; long fileLength = file.length(); long pos = fileLength - arrLength; if (pos < 1) pos = 1; while (pos > 0){ file.seek(pos); String str = readString(arrLength); int idx = str.lastIndexOf("startxref"); if (idx >= 0) return pos + idx; pos = pos - arrLength + 9; // 9 = "startxref".length() } throw new InvalidPdfException(MessageLocalization.getComposedMessage("pdf.startxref.not.found")); } public static int getHex(int v) { if (v >= '0' && v <= '9') return v - '0'; if (v >= 'A' && v <= 'F') return v - 'A' + 10; if (v >= 'a' && v <= 'f') return v - 'a' + 10; return -1; } public void nextValidToken() throws IOException { int level = 0; String n1 = null; String n2 = null; long ptr = 0; while (nextToken()) { if (type == TokenType.COMMENT) continue; switch (level) { case 0: { if (type != TokenType.NUMBER) return; ptr = file.getFilePointer(); n1 = stringValue; ++level; break; } case 1: { if (type != TokenType.NUMBER) { file.seek(ptr); type = TokenType.NUMBER; stringValue = n1; return; } n2 = stringValue; ++level; break; } default: { if (type != TokenType.OTHER || !stringValue.equals("R")) { file.seek(ptr); type = TokenType.NUMBER; stringValue = n1; return; } type = TokenType.REF; reference = Integer.parseInt(n1); generation = Integer.parseInt(n2); return; } } } if (level == 1){ // if the level 1 check returns EOF, then we are still looking at a number - set the type back to NUMBER type = TokenType.NUMBER; } // if we hit here, the file is either corrupt (stream ended unexpectedly), // or the last token ended exactly at the end of a stream. This last // case can occur inside an Object Stream. 
} public boolean nextToken() throws IOException { int ch = 0; do { ch = file.read(); } while (ch != -1 && isWhitespace(ch)); if (ch == -1){ type = TokenType.ENDOFFILE; return false; } // Note: We have to initialize stringValue here, after we've looked for the end of the stream, // to ensure that we don't lose the value of a token that might end exactly at the end // of the stream outBuf.setLength(0); stringValue = EMPTY; switch (ch) { case '[': type = TokenType.START_ARRAY; break; case ']': type = TokenType.END_ARRAY; break; case '/': { outBuf.setLength(0); type = TokenType.NAME; while (true) { ch = file.read(); if (delims[ch + 1]) break; if (ch == '#') { ch = (getHex(file.read()) << 4) + getHex(file.read()); } outBuf.append((char)ch); } backOnePosition(ch); break; } case '>': ch = file.read(); if (ch != '>') throwError(MessageLocalization.getComposedMessage("greaterthan.not.expected")); type = TokenType.END_DIC; break; case '<': { int v1 = file.read(); if (v1 == '<') { type = TokenType.START_DIC; break; } outBuf.setLength(0); type = TokenType.STRING; hexString = true; int v2 = 0; while (true) { while (isWhitespace(v1)) v1 = file.read(); if (v1 == '>') break; v1 = getHex(v1); if (v1 < 0) break; v2 = file.read(); while (isWhitespace(v2)) v2 = file.read(); if (v2 == '>') { ch = v1 << 4; outBuf.append((char)ch); break; } v2 = getHex(v2); if (v2 < 0) break; ch = (v1 << 4) + v2; outBuf.append((char)ch); v1 = file.read(); } if (v1 < 0 || v2 < 0) throwError(MessageLocalization.getComposedMessage("error.reading.string")); break; } case '%': type = TokenType.COMMENT; do { ch = file.read(); } while (ch != -1 && ch != '\r' && ch != '\n'); break; case '(': { outBuf.setLength(0); type = TokenType.STRING; hexString = false; int nesting = 0; while (true) { ch = file.read(); if (ch == -1) break; if (ch == '(') { ++nesting; } else if (ch == ')') { --nesting; } else if (ch == '\\') { boolean lineBreak = false; ch = file.read(); switch (ch) { case 'n': ch = '\n'; break; case 'r': ch 
= '\r'; break; case 't': ch = '\t'; break; case 'b': ch = '\b'; break; case 'f': ch = '\f'; break; case '(': case ')': case '\\': break; case '\r': lineBreak = true; ch = file.read(); if (ch != '\n') backOnePosition(ch); break; case '\n': lineBreak = true; break; default: { if (ch < '0' || ch > '7') { break; } int octal = ch - '0'; ch = file.read(); if (ch < '0' || ch > '7') { backOnePosition(ch); ch = octal; break; } octal = (octal << 3) + ch - '0'; ch = file.read(); if (ch < '0' || ch > '7') { backOnePosition(ch); ch = octal; break; } octal = (octal << 3) + ch - '0'; ch = octal & 0xff; break; } } if (lineBreak) continue; if (ch < 0) break; } else if (ch == '\r') { ch = file.read(); if (ch < 0) break; if (ch != '\n') { backOnePosition(ch); ch = '\n'; } } if (nesting == -1) break; outBuf.append((char)ch); } if (ch == -1) throwError(MessageLocalization.getComposedMessage("error.reading.string")); break; } default: { outBuf.setLength(0); if (ch == '-' || ch == '+' || ch == '.' || (ch >= '0' && ch <= '9')) { type = TokenType.NUMBER; boolean isReal = false; int numberOfMinuses = 0; if (ch == '-') { // Take care of number like "--234". If Acrobat can read them so must we. do { ++numberOfMinuses; ch = file.read(); } while (ch == '-'); outBuf.append('-'); } else { outBuf.append((char)ch); // We don't need to check if the number is real over here // as we need to know that fact only in case if there are any minuses. ch = file.read(); } while (ch != -1 && ((ch >= '0' && ch <= '9') || ch == '.')) { if (ch == '.') isReal = true; outBuf.append((char)ch); ch = file.read(); } if (numberOfMinuses > 1 && !isReal) { // Numbers of integer type and with more than one minus before them // are interpreted by Acrobat as zero. 
outBuf.setLength(0); outBuf.append('0'); } } else { type = TokenType.OTHER; do { outBuf.append((char)ch); ch = file.read(); } while (!delims[ch + 1]); } if(ch != -1) backOnePosition(ch); break; } } if (outBuf != null) stringValue = outBuf.toString(); return true; } public long longValue() { return Long.parseLong(stringValue); } public int intValue() { return Integer.parseInt(stringValue); } /** * Reads data into the provided byte[]. Checks on leading whitespace. * See {@link #isWhitespace(int) isWhiteSpace(int)} or {@link #isWhitespace(int, boolean) isWhiteSpace(int, boolean)} * for a list of whitespace characters. *
The same as calling {@link #readLineSegment(byte[], boolean) readLineSegment(input, true)}. * * @param input byte[] * @return boolean * @throws IOException * @since 5.5.1 */ public boolean readLineSegment(byte[] input) throws IOException { return readLineSegment(input, true); } /** * Reads data into the provided byte[]. Checks on leading whitespace. * See {@link #isWhitespace(int) isWhiteSpace(int)} or {@link #isWhitespace(int, boolean) isWhiteSpace(int, boolean)} * for a list of whitespace characters. * * @param input byte[] * @param isNullWhitespace boolean to indicate whether '0' is whitespace or not. * If in doubt, use true or overloaded method {@link #readLineSegment(byte[]) readLineSegment(input)} * @return boolean * @throws IOException * @since 5.5.1 */ public boolean readLineSegment(byte input[], boolean isNullWhitespace) throws IOException { int c = -1; boolean eol = false; int ptr = 0; int len = input.length; // ssteward, pdftk-1.10, 040922: // skip initial whitespace; added this because PdfReader.rebuildXref() // assumes that line provided by readLineSegment does not have init. whitespace; if ( ptr < len ) { while ( isWhitespace( (c = read()), isNullWhitespace ) ); } while ( !eol && ptr < len ) { switch (c) { case -1: case '\n': eol = true; break; case '\r': eol = true; long cur = getFilePointer(); if ((read()) != '\n') { seek(cur); } break; default: input[ptr++] = (byte)c; break; } // break loop? 
do it before we read() again if ( eol || len <= ptr ) { break; } else { c = read(); } } if (ptr >= len) { eol = false; while (!eol) { switch (c = read()) { case -1: case '\n': eol = true; break; case '\r': eol = true; long cur = getFilePointer(); if ((read()) != '\n') { seek(cur); } break; } } } if ((c == -1) && (ptr == 0)) { return false; } if (ptr + 2 <= len) { input[ptr++] = (byte)' '; input[ptr] = (byte)'X'; } return true; } public static long[] checkObjectStart(byte line[]) { try { PRTokeniser tk = new PRTokeniser(new RandomAccessFileOrArray(new RandomAccessSourceFactory().createSource(line))); int num = 0; int gen = 0; if (!tk.nextToken() || tk.getTokenType() != TokenType.NUMBER) return null; num = tk.intValue(); if (!tk.nextToken() || tk.getTokenType() != TokenType.NUMBER) return null; gen = tk.intValue(); if (!tk.nextToken()) return null; if (!tk.getStringValue().equals("obj")) return null; return new long[]{num, gen}; } catch (Exception ioe) { // empty on purpose } return null; } public boolean isHexString() { return this.hexString; } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy