com.itextpdf.text.pdf.PRTokeniser Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of itextpdf Show documentation
Show all versions of itextpdf Show documentation
iText, a free Java-PDF library
The newest version!
/*
*
* This file is part of the iText (R) project.
Copyright (c) 1998-2022 iText Group NV
* Authors: Bruno Lowagie, Paulo Soares, et al.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License version 3
* as published by the Free Software Foundation with the addition of the
* following permission added to Section 15 as permitted in Section 7(a):
* FOR ANY PART OF THE COVERED WORK IN WHICH THE COPYRIGHT IS OWNED BY
* ITEXT GROUP. ITEXT GROUP DISCLAIMS THE WARRANTY OF NON INFRINGEMENT
* OF THIRD PARTY RIGHTS
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE.
* See the GNU Affero General Public License for more details.
* You should have received a copy of the GNU Affero General Public License
* along with this program; if not, see http://www.gnu.org/licenses or write to
* the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
* Boston, MA, 02110-1301 USA, or download the license from the following URL:
* http://itextpdf.com/terms-of-use/
*
* The interactive user interfaces in modified source and object code versions
* of this program must display Appropriate Legal Notices, as required under
* Section 5 of the GNU Affero General Public License.
*
* In accordance with Section 7(b) of the GNU Affero General Public License,
* a covered work must retain the producer line in every PDF that is created
* or manipulated using iText.
*
* You can be released from the requirements of the license by purchasing
* a commercial license. Buying such a license is mandatory as soon as you
* develop commercial activities involving the iText software without
* disclosing the source code of your own applications.
* These activities include: offering paid services to customers as an ASP,
* serving PDFs on the fly in a web application, shipping iText with a closed
* source product.
*
* For more information, please contact iText Software Corp. at this
* address: sales@itextpdf.com
*/
package com.itextpdf.text.pdf;
import com.itextpdf.text.error_messages.MessageLocalization;
import com.itextpdf.text.exceptions.InvalidPdfException;
import com.itextpdf.text.io.RandomAccessSourceFactory;
import java.io.IOException;
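/**
* Low-level tokeniser for PDF syntax. It reads bytes from a
* {@link RandomAccessFileOrArray} and splits them into tokens such as numbers,
* strings, names, array/dictionary markers and indirect references.
*
* @author Paulo Soares
*/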
public class PRTokeniser {
// Scratch buffer reused by nextToken() to accumulate the text of the token being read.
private final StringBuilder outBuf = new StringBuilder();
/**
* Enum representing the possible token types produced by the tokeniser.
* @since 5.0.1
*/
public enum TokenType {
// a numeric token (optional sign, digits and '.')
NUMBER,
// a literal string "(...)" or a hex string "<...>"
STRING,
// a name introduced by '/'
NAME,
// a comment introduced by '%', running to the end of the line
COMMENT,
// '['
START_ARRAY,
// ']'
END_ARRAY,
// "<<"
START_DIC,
// ">>"
END_DIC,
// an indirect reference "n g R", assembled by nextValidToken()
REF,
// any other run of regular characters (keywords, operators)
OTHER,
// the end of the input was reached
ENDOFFILE
}
/**
* Lookup table telling whether a byte ends a name, number or keyword token.
* The table is indexed by {@code ch + 1} so that the end-of-file value -1 maps to index 0;
* it has 257 entries and therefore covers -1 and the byte values 0 to 255.
* Entries are true for end of file (-1), the whitespace codes 0, 9, 10, 12, 13 and 32,
* and the delimiters '%', '(', ')', '/', '&lt;', '&gt;', '[' and ']'.
*/
public static final boolean delims[] = {
true, true, false, false, false, false, false, false, false, false,
true, true, false, true, true, false, false, false, false, false,
false, false, false, false, false, false, false, false, false, false,
false, false, false, true, false, false, false, false, true, false,
false, true, true, false, false, false, false, false, true, false,
false, false, false, false, false, false, false, false, false, false,
false, true, false, true, false, false, false, false, false, false,
false, false, false, false, false, false, false, false, false, false,
false, false, false, false, false, false, false, false, false, false,
false, false, true, false, true, false, false, false, false, false,
false, false, false, false, false, false, false, false, false, false,
false, false, false, false, false, false, false, false, false, false,
false, false, false, false, false, false, false, false, false, false,
false, false, false, false, false, false, false, false, false, false,
false, false, false, false, false, false, false, false, false, false,
false, false, false, false, false, false, false, false, false, false,
false, false, false, false, false, false, false, false, false, false,
false, false, false, false, false, false, false, false, false, false,
false, false, false, false, false, false, false, false, false, false,
false, false, false, false, false, false, false, false, false, false,
false, false, false, false, false, false, false, false, false, false,
false, false, false, false, false, false, false, false, false, false,
false, false, false, false, false, false, false, false, false, false,
false, false, false, false, false, false, false, false, false, false,
false, false, false, false, false, false, false, false, false, false,
false, false, false, false, false, false, false};
// Value given to stringValue at the start of every token, before its text is collected.
static final String EMPTY = "";
// Byte source all tokens are read from.
private final RandomAccessFileOrArray file;
// Type of the most recently read token.
protected TokenType type;
// Text of the most recently read token (empty for tokens that carry no text).
protected String stringValue;
// Object number of the last REF token; -1 if the number could not be parsed.
protected int reference;
// Generation number of the last REF token; 0 if the numbers could not be parsed.
protected int generation;
// True when the last STRING token was a hex string "<...>", false for a literal "(...)".
protected boolean hexString;
/**
* Creates a PRTokeniser that reads from the specified {@link RandomAccessFileOrArray}.
* The constructor only stores the reference: nothing is read and the source is not
* repositioned. Use {@link #getHeaderOffset()} to locate the header when the source
* may start with junk bytes.
* @param file the source
*/
public PRTokeniser(RandomAccessFileOrArray file) {
this.file = file;
}
/**
* Moves the read position of the underlying file.
* @param pos the position to seek to, passed unchanged to the underlying file
* @throws IOException if the underlying file reports an error
*/
public void seek(long pos) throws IOException {
file.seek(pos);
}
/**
* Gets the current read position of the underlying file.
* @return the file pointer as reported by the underlying file
* @throws IOException if the underlying file reports an error
*/
public long getFilePointer() throws IOException {
return file.getFilePointer();
}
/**
* Closes the underlying file.
* @throws IOException if the underlying file reports an error
*/
public void close() throws IOException {
file.close();
}
/**
* Gets the length of the underlying file.
* @return the length as reported by the underlying file
* @throws IOException if the underlying file reports an error
*/
public long length() throws IOException {
return file.length();
}
/**
* Reads a single value from the underlying file.
* @return the value read; the rest of this class treats -1 as end of input
* @throws IOException if the underlying file reports an error
*/
public int read() throws IOException {
return file.read();
}
/**
* Creates a new {@link RandomAccessFileOrArray} from the one used by this tokeniser,
* using its copy constructor.
* NOTE(review): presumably the copy has an independent read position - confirm against
* RandomAccessFileOrArray.
* @return a new RandomAccessFileOrArray built from the underlying file
*/
public RandomAccessFileOrArray getSafeFile() {
return new RandomAccessFileOrArray(file);
}
//TODO: is this really necessary? Seems like exposing this detail opens us up to all sorts of potential problems
/**
* Gets the underlying file itself (not a copy), so reads and seeks performed on the
* returned object go through the same instance this tokeniser uses.
* @return the file this tokeniser reads from
*/
public RandomAccessFileOrArray getFile() {
return file;
}
/**
 * Reads up to {@code size} values from the current position and returns them as a
 * string, one char per value read. Reading stops early when the end of the input
 * (-1) is reached.
 *
 * @param size the maximum number of values to read; nothing is read if it is not positive
 * @return the characters read, possibly fewer than {@code size}
 * @throws IOException if the underlying file reports an error
 */
public String readString(int size) throws IOException {
    final StringBuilder text = new StringBuilder();
    for (int remaining = size; remaining > 0; remaining--) {
        final int next = read();
        if (next == -1) {
            break;
        }
        text.append((char) next);
    }
    return text.toString();
}
/**
* Is a certain character a whitespace? Currently checks on the following: '0', '9', '10', '12', '13', '32'.
* <br />
* The same as calling {@link #isWhitespace(int, boolean) isWhiteSpace(ch, true)}.
* @param ch the character code to test
* @return true if ch is one of the codes listed above
* @since 5.5.1
*/
public static final boolean isWhitespace(int ch) {
return isWhitespace(ch, true);
}
/**
 * Checks whether a character code is whitespace. The codes 9, 10, 12, 13 and 32 always
 * count as whitespace; the code 0 counts only when the flag is set.
 *
 * @param ch the character code to test
 * @param isWhitespace whether the code 0 is to be treated as whitespace
 * @return true if {@code ch} is a whitespace code under the given rule
 * @since 5.5.1
 */
public static final boolean isWhitespace(int ch, boolean isWhitespace) {
    switch (ch) {
        case 0:
            // the null byte is whitespace only on request
            return isWhitespace;
        case 9:
        case 10:
        case 12:
        case 13:
        case 32:
            return true;
        default:
            return false;
    }
}
/**
 * Checks whether a character code is one of the delimiter characters
 * '(', ')', '&lt;', '&gt;', '[', ']', '/' or '%'.
 *
 * @param ch the character code to test
 * @return true if {@code ch} is one of the listed delimiters
 */
public static final boolean isDelimiter(int ch) {
    switch (ch) {
        case '(':
        case ')':
        case '<':
        case '>':
        case '[':
        case ']':
        case '/':
        case '%':
            return true;
        default:
            return false;
    }
}
/**
 * Checks whether a character code ends a token, i.e. whether it is end of file (-1),
 * whitespace or a delimiter according to the {@code delims} table.
 * Codes outside the range covered by the table (-1 to 255) are neither and yield
 * false instead of an {@link ArrayIndexOutOfBoundsException}.
 *
 * @param ch the character code to test
 * @return true if {@code ch} is flagged in the {@code delims} table
 */
public static final boolean isDelimiterWhitespace(int ch) {
    // the table is shifted by one so that -1 (end of file) lands on index 0
    final int index = ch + 1;
    return index >= 0 && index < delims.length && delims[index];
}
/**
* Gets the type of the most recently read token.
* @return the current token type; null if no token has been read yet
*/
public TokenType getTokenType() {
return type;
}
/**
* Gets the text of the most recently read token.
* @return the current token text; null if no token has been read yet
*/
public String getStringValue() {
return stringValue;
}
/**
* Gets the object number of the last REF token read by {@link #nextValidToken()}.
* If parsing the numbers failed with a NumberFormatException, -1 is returned.
*
* @return the parsed object number, or -1 if it could not be parsed
*/
public int getReference() {
return reference;
}
/**
* Gets the generation number of the last REF token read by {@link #nextValidToken()}.
* If parsing the numbers failed with a NumberFormatException, 0 is returned.
*
* @return the parsed generation number, or 0 if parsing failed
*/
public int getGeneration() {
return generation;
}
/**
 * Pushes a previously read value back onto the underlying file so that it is
 * available to the next read. The end-of-file marker -1 is never pushed back.
 *
 * @param ch the value to push back; it is narrowed to a byte
 */
public void backOnePosition(int ch) {
    if (ch == -1) {
        return;
    }
    file.pushBack((byte) ch);
}
/**
 * Always throws an {@link InvalidPdfException} whose message combines the given error
 * text with the current file pointer.
 *
 * @param error the description of the problem
 * @throws IOException always (the InvalidPdfException, or an error raised while
 * querying the file pointer)
 */
public void throwError(String error) throws IOException {
    final String position = String.valueOf(file.getFilePointer());
    final String message = MessageLocalization.getComposedMessage("1.at.file.pointer.2", error, position);
    throw new InvalidPdfException(message);
}
/**
 * Looks for a PDF or FDF header in the next 1024 values read from the current position.
 * "%PDF-" is searched first; "%FDF-" is only considered when no PDF header is present.
 *
 * @return the index of the header within the data read
 * @throws IOException if neither header is found (InvalidPdfException) or reading fails
 */
public int getHeaderOffset() throws IOException {
    final String head = readString(1024);
    final int pdfAt = head.indexOf("%PDF-");
    if (pdfAt >= 0) {
        return pdfAt;
    }
    final int fdfAt = head.indexOf("%FDF-");
    if (fdfAt >= 0) {
        return fdfAt;
    }
    throw new InvalidPdfException(MessageLocalization.getComposedMessage("pdf.header.not.found"));
}
/**
 * Verifies that the file starts with a "%PDF-" header and returns the character at
 * index 7 of the file (the minor version digit of a "%PDF-1.x" header).
 * A header that is too short to contain that character is reported as a missing
 * header instead of failing with a {@link StringIndexOutOfBoundsException}.
 *
 * @return the character at index 7 of the file
 * @throws IOException if the header is missing or truncated (InvalidPdfException)
 * or reading fails
 */
public char checkPdfHeader() throws IOException {
    file.seek(0);
    final String head = readString(1024);
    // index 7 must exist, so at least 8 characters are required
    if (!head.startsWith("%PDF-") || head.length() < 8) {
        throw new InvalidPdfException(MessageLocalization.getComposedMessage("pdf.header.not.found"));
    }
    return head.charAt(7);
}
/**
 * Verifies that the file starts with a "%FDF-" header.
 *
 * @throws IOException if the header is not at the very start of the file
 * (InvalidPdfException) or reading fails
 */
public void checkFdfHeader() throws IOException {
    file.seek(0);
    final String head = readString(1024);
    if (!head.startsWith("%FDF-")) {
        throw new InvalidPdfException(MessageLocalization.getComposedMessage("fdf.header.not.found"));
    }
}
/**
 * Searches backwards from the end of the file for the last occurrence of the keyword
 * "startxref" and returns its position.
 * The file is scanned in windows of 1024 values that overlap by the keyword length, so
 * a keyword straddling a window boundary is still found. The last window is clamped to
 * start at offset 1, so the whole range from offset 1 to the end of the file is covered
 * (previously a leading part shorter than one window was never searched).
 *
 * @return the file position of the keyword
 * @throws IOException if the keyword is not found (InvalidPdfException) or reading fails
 */
public long getStartxref() throws IOException {
    final String keyword = "startxref";
    final int windowSize = 1024;
    long pos = Math.max(1, file.length() - windowSize);
    while (true) {
        file.seek(pos);
        final int idx = readString(windowSize).lastIndexOf(keyword);
        if (idx >= 0) {
            return pos + idx;
        }
        if (pos <= 1) {
            // the window starting at offset 1 was the last one to try
            break;
        }
        // step back one window, keeping an overlap of keyword.length() characters
        pos = Math.max(1, pos - windowSize + keyword.length());
    }
    throw new InvalidPdfException(MessageLocalization.getComposedMessage("pdf.startxref.not.found"));
}
/**
 * Converts the character code of a hexadecimal digit to its numeric value.
 *
 * @param v the character code ('0'-'9', 'A'-'F' or 'a'-'f')
 * @return the value 0 to 15, or -1 if {@code v} is not a hexadecimal digit
 */
public static int getHex(int v) {
    final int digit;
    if (v >= '0' && v <= '9') {
        digit = v - '0';
    } else if (v >= 'A' && v <= 'F') {
        digit = 10 + (v - 'A');
    } else if (v >= 'a' && v <= 'f') {
        digit = 10 + (v - 'a');
    } else {
        digit = -1;
    }
    return digit;
}
/**
 * Reads the next token that is not a comment, combining the sequence
 * "number number R" into a single REF token.
 * When a number is not followed by a second number and an "R", the file is rewound to
 * the position just after the first number and that number is reported as a NUMBER
 * token. For a REF token the two numbers are parsed into the reference and generation
 * fields; if parsing fails they are set to -1 and 0.
 *
 * @throws IOException if reading or tokenising fails
 */
public void nextValidToken() throws IOException {
    int numbersSeen = 0;
    String firstNumber = null;
    String secondNumber = null;
    long afterFirstNumber = 0;
    while (nextToken()) {
        if (type == TokenType.COMMENT) {
            continue;
        }
        if (numbersSeen == 0) {
            // anything but a number is returned as is
            if (type != TokenType.NUMBER) {
                return;
            }
            afterFirstNumber = file.getFilePointer();
            firstNumber = stringValue;
            numbersSeen = 1;
        } else if (numbersSeen == 1) {
            if (type != TokenType.NUMBER) {
                // not a reference: go back and report the first number only
                file.seek(afterFirstNumber);
                type = TokenType.NUMBER;
                stringValue = firstNumber;
                return;
            }
            secondNumber = stringValue;
            numbersSeen = 2;
        } else {
            final boolean isReferenceMarker = type == TokenType.OTHER && stringValue.equals("R");
            if (!isReferenceMarker) {
                // two numbers not followed by "R": go back and report the first number only
                file.seek(afterFirstNumber);
                type = TokenType.NUMBER;
                stringValue = firstNumber;
                return;
            }
            type = TokenType.REF;
            try {
                reference = Integer.parseInt(firstNumber);
                generation = Integer.parseInt(secondNumber);
            } catch (NumberFormatException ex) {
                reference = -1;
                generation = 0;
            }
            return;
        }
    }
    if (numbersSeen == 1) { // if the level 1 check returns EOF, then we are still looking at a number - set the type back to NUMBER
        type = TokenType.NUMBER;
    }
    // if we hit here, the file is either corrupt (stream ended unexpectedly),
    // or the last token ended exactly at the end of a stream. This last
    // case can occur inside an Object Stream.
}
/**
* Reads the next token from the underlying file, skipping leading whitespace.
* On success the type and stringValue fields describe the token; for STRING tokens the
* hexString field tells whether the string was written in hex or in literal form.
* Comments are returned as COMMENT tokens with an empty string value.
*
* @return true if a token was read; false if the end of the input was reached, in which
* case the type is set to ENDOFFILE and stringValue is left untouched
* @throws IOException on read errors, on a lone greater-than sign, or when a string is
* malformed or not terminated
*/
public boolean nextToken() throws IOException {
int ch = 0;
// skip whitespace (including the null byte) in front of the token
do {
ch = file.read();
} while (ch != -1 && isWhitespace(ch));
if (ch == -1){
type = TokenType.ENDOFFILE;
return false;
}
// Note: We have to initialize stringValue here, after we've looked for the end of the stream,
// to ensure that we don't lose the value of a token that might end exactly at the end
// of the stream
outBuf.setLength(0);
stringValue = EMPTY;
// the first significant character decides the token type
switch (ch) {
case '[':
type = TokenType.START_ARRAY;
break;
case ']':
type = TokenType.END_ARRAY;
break;
case '/':
{
// name: collect characters up to the next delimiter, whitespace or end of file
outBuf.setLength(0);
type = TokenType.NAME;
while (true) {
ch = file.read();
if (delims[ch + 1])
break;
if (ch == '#') {
// '#' is followed by two hex digits giving the character code;
// the digits are not validated (getHex returns -1 for a non-hex value)
ch = (getHex(file.read()) << 4) + getHex(file.read());
}
outBuf.append((char)ch);
}
// the terminating character belongs to the next token
backOnePosition(ch);
break;
}
case '>':
// only valid as the first half of ">>"
ch = file.read();
if (ch != '>')
throwError(MessageLocalization.getComposedMessage("greaterthan.not.expected"));
type = TokenType.END_DIC;
break;
case '<':
{
int v1 = file.read();
if (v1 == '<') {
type = TokenType.START_DIC;
break;
}
// hex string: pairs of hex digits, whitespace between digits is ignored
outBuf.setLength(0);
type = TokenType.STRING;
hexString = true;
int v2 = 0;
while (true) {
while (isWhitespace(v1))
v1 = file.read();
if (v1 == '>')
break;
v1 = getHex(v1);
if (v1 < 0)
break;
v2 = file.read();
while (isWhitespace(v2))
v2 = file.read();
if (v2 == '>') {
// odd number of digits: the missing low digit counts as 0
ch = v1 << 4;
outBuf.append((char)ch);
break;
}
v2 = getHex(v2);
if (v2 < 0)
break;
ch = (v1 << 4) + v2;
outBuf.append((char)ch);
v1 = file.read();
}
// a negative value means a non-hex character or the end of the file was hit
if (v1 < 0 || v2 < 0)
throwError(MessageLocalization.getComposedMessage("error.reading.string"));
break;
}
case '%':
// comment: skip to the end of the line; the comment text is not kept
type = TokenType.COMMENT;
do {
ch = file.read();
} while (ch != -1 && ch != '\r' && ch != '\n');
break;
case '(':
{
// literal string: runs to the matching ')' with nested parentheses allowed
outBuf.setLength(0);
type = TokenType.STRING;
hexString = false;
int nesting = 0;
while (true) {
ch = file.read();
if (ch == -1)
break;
if (ch == '(') {
++nesting;
}
else if (ch == ')') {
--nesting;
}
else if (ch == '\\') {
// escape sequence
boolean lineBreak = false;
ch = file.read();
switch (ch) {
case 'n':
ch = '\n';
break;
case 'r':
ch = '\r';
break;
case 't':
ch = '\t';
break;
case 'b':
ch = '\b';
break;
case 'f':
ch = '\f';
break;
case '(':
case ')':
case '\\':
break;
case '\r':
// backslash followed by a line end: the line end is dropped from the string
lineBreak = true;
ch = file.read();
if (ch != '\n')
backOnePosition(ch);
break;
case '\n':
lineBreak = true;
break;
default:
{
// up to three octal digits; any other character is taken literally
if (ch < '0' || ch > '7') {
break;
}
int octal = ch - '0';
ch = file.read();
if (ch < '0' || ch > '7') {
backOnePosition(ch);
ch = octal;
break;
}
octal = (octal << 3) + ch - '0';
ch = file.read();
if (ch < '0' || ch > '7') {
backOnePosition(ch);
ch = octal;
break;
}
octal = (octal << 3) + ch - '0';
// three digits can exceed 255; only the low byte is kept
ch = octal & 0xff;
break;
}
}
if (lineBreak)
continue;
// end of file right after the backslash
if (ch < 0)
break;
}
else if (ch == '\r') {
// an unescaped CR or CR LF inside the string is stored as a single LF
ch = file.read();
if (ch < 0)
break;
if (ch != '\n') {
backOnePosition(ch);
ch = '\n';
}
}
// the ')' that closes the string itself takes nesting below zero
if (nesting == -1)
break;
outBuf.append((char)ch);
}
// the loop was left because the file ended before the closing ')'
if (ch == -1)
throwError(MessageLocalization.getComposedMessage("error.reading.string"));
break;
}
default:
{
outBuf.setLength(0);
if (ch == '-' || ch == '+' || ch == '.' || (ch >= '0' && ch <= '9')) {
type = TokenType.NUMBER;
boolean isReal = false;
int numberOfMinuses = 0;
if (ch == '-') {
// Take care of number like "--234". If Acrobat can read them so must we.
do {
++numberOfMinuses;
ch = file.read();
} while (ch == '-');
outBuf.append('-');
}
else {
outBuf.append((char)ch);
// We don't need to check if the number is real over here
// as we need to know that fact only in case if there are any minuses.
ch = file.read();
}
while (ch != -1 && ((ch >= '0' && ch <= '9') || ch == '.')) {
if (ch == '.')
isReal = true;
outBuf.append((char)ch);
ch = file.read();
}
if (numberOfMinuses > 1 && !isReal) {
// Numbers of integer type and with more than one minus before them
// are interpreted by Acrobat as zero.
outBuf.setLength(0);
outBuf.append('0');
}
}
else {
// keyword or operator: collect up to the next delimiter, whitespace or end of file
type = TokenType.OTHER;
do {
outBuf.append((char)ch);
ch = file.read();
} while (!delims[ch + 1]);
}
// the character that ended the token belongs to the next token
if(ch != -1)
backOnePosition(ch);
break;
}
}
// outBuf is a final field initialised at declaration, so this check always passes
if (outBuf != null)
stringValue = outBuf.toString();
return true;
}
/**
* Parses the text of the current token as a long.
* @return the value of the current token
* @throws NumberFormatException if the token text is not a valid long
*/
public long longValue() {
return Long.parseLong(stringValue);
}
/**
* Parses the text of the current token as an int.
* @return the value of the current token
* @throws NumberFormatException if the token text is not a valid int
*/
public int intValue() {
return Integer.parseInt(stringValue);
}
/**
* Reads data into the provided byte[]. Checks on leading whitespace.
* See {@link #isWhitespace(int) isWhiteSpace(int)} or {@link #isWhitespace(int, boolean) isWhiteSpace(int, boolean)}
* for a list of whitespace characters.
* <br />
* The same as calling {@link #readLineSegment(byte[], boolean) readLineSegment(input, true)}.
*
* @param input the buffer that receives the line data
* @return the result of {@link #readLineSegment(byte[], boolean) readLineSegment(input, true)}
* @throws IOException if reading fails
* @since 5.5.1
*/
public boolean readLineSegment(byte[] input) throws IOException {
return readLineSegment(input, true);
}
/**
* Reads one line into the provided byte[]. Leading whitespace is skipped.
* See {@link #isWhitespace(int) isWhiteSpace(int)} or {@link #isWhitespace(int, boolean) isWhiteSpace(int, boolean)}
* for a list of whitespace characters.
* A line ends at '\n', '\r', "\r\n" or the end of the input. If the line is longer than
* the buffer, the buffer is filled and the rest of the line is read and discarded.
* If at least two bytes of the buffer remain free after the line data, a space and an
* 'X' are stored right behind it.
*
* @param input the buffer that receives the line data, without the line terminator
* @param isNullWhitespace boolean to indicate whether '0' is whitespace or not.
* If in doubt, use true or overloaded method {@link #readLineSegment(byte[]) readLineSegment(input)}
* @return false if the end of the input was reached without storing any byte, true otherwise
* @throws IOException if reading fails
* @since 5.5.1
*/
public boolean readLineSegment(byte input[], boolean isNullWhitespace) throws IOException {
int c = -1;
boolean eol = false;
int ptr = 0;
int len = input.length;
// ssteward, pdftk-1.10, 040922:
// skip initial whitespace; added this because PdfReader.rebuildXref()
// assumes that line provided by readLineSegment does not have init. whitespace;
if ( ptr < len ) {
while ( isWhitespace( (c = read()), isNullWhitespace ) );
}
// copy bytes into the buffer until the line ends or the buffer is full;
// c already holds the first non-whitespace value
while ( !eol && ptr < len ) {
switch (c) {
case -1:
case '\n':
eol = true;
break;
case '\r':
eol = true;
// consume the '\n' of a "\r\n" pair; any other value is left unread
long cur = getFilePointer();
if ((read()) != '\n') {
seek(cur);
}
break;
default:
input[ptr++] = (byte)c;
break;
}
// break loop? do it before we read() again
if ( eol || len <= ptr ) {
break;
} else {
c = read();
}
}
// buffer full (or zero-length): read and discard the rest of the line
if (ptr >= len) {
eol = false;
while (!eol) {
switch (c = read()) {
case -1:
case '\n':
eol = true;
break;
case '\r':
eol = true;
long cur = getFilePointer();
if ((read()) != '\n') {
seek(cur);
}
break;
}
}
}
// end of input reached and nothing was stored
if ((c == -1) && (ptr == 0)) {
return false;
}
// NOTE(review): presumably " X" is a sentinel so that a tokeniser run over the buffer
// sees the last real token terminated - confirm against the callers
if (ptr + 2 <= len) {
input[ptr++] = (byte)' ';
input[ptr] = (byte)'X';
}
return true;
}
/**
 * Checks whether the given line starts with an object header of the form
 * "number number obj".
 * The line is tokenised with a temporary tokeniser; any exception raised while doing
 * so (including numbers that do not fit an int) makes the line count as "no match".
 *
 * @param line the bytes of the line to inspect
 * @return a two-element array holding the object number and the generation number,
 * or null if the line does not start with an object header
 */
public static long[] checkObjectStart(byte line[]) {
    try {
        final PRTokeniser tokens = new PRTokeniser(new RandomAccessFileOrArray(new RandomAccessSourceFactory().createSource(line)));
        if (!tokens.nextToken() || tokens.getTokenType() != TokenType.NUMBER) {
            return null;
        }
        final int objectNumber = tokens.intValue();
        if (!tokens.nextToken() || tokens.getTokenType() != TokenType.NUMBER) {
            return null;
        }
        final int generationNumber = tokens.intValue();
        if (tokens.nextToken() && tokens.getStringValue().equals("obj")) {
            return new long[]{objectNumber, generationNumber};
        }
        return null;
    }
    catch (Exception ioe) {
        // empty on purpose
        return null;
    }
}
/**
* Tells how the most recently read STRING token was written.
* @return true if it was a hex string, false if it was a literal string in parentheses
*/
public boolean isHexString() {
return this.hexString;
}
}