
// com.itextpdf.io.source.PdfTokenizer (Maven / Gradle / Ivy)
/*
This file is part of the iText (R) project.
Copyright (c) 1998-2023 Apryse Group NV
Authors: Apryse Software.
This program is offered under a commercial and under the AGPL license.
For commercial licensing, contact us at https://itextpdf.com/sales. For AGPL licensing, see below.
AGPL licensing:
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
package com.itextpdf.io.source;
import com.itextpdf.commons.utils.MessageFormatUtil;
import com.itextpdf.io.exceptions.IOException;
import com.itextpdf.io.exceptions.IoExceptionMessageConstant;
import com.itextpdf.io.logs.IoLogMessageConstant;
import java.io.Closeable;
import java.util.Arrays;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
public class PdfTokenizer implements Closeable {
/**
 * Kinds of token this tokenizer can report through {@link #getTokenType()}.
 */
public enum TokenType {
// numeric token: starts with a sign, a dot or a digit
Number,
// literal "(...)" string or hexadecimal "<...>" string
String,
// name token introduced by '/'
Name,
// comment introduced by '%' and running to the end of the line
Comment,
// '['
StartArray,
// ']'
EndArray,
// "<<"
StartDic,
// ">>"
EndDic,
// "<number> <number> R" sequence, recognised by nextValidToken()
Ref,
// "<number> <number> obj" sequence, recognised by nextValidToken()
Obj,
// NOTE(review): never assigned inside this class; presumably set by callers - confirm
EndObj,
// any other run of non-delimiter characters (keywords, operators)
Other,
// the end of the source was reached before a token started
EndOfFile
}
/**
 * Lookup table telling whether a character ends a name/keyword token.
 * The table is indexed by {@code ch + 1}, so index 0 stands for the end-of-input value -1
 * and the 257 entries cover -1 and the byte values 0..255.
 * Entries are {@code true} for end of input, the whitespace characters 0, 9, 10, 12, 13 and 32,
 * and the delimiters '%', '(', ')', '/', '&lt;', '&gt;', '[' and ']'.
 */
public static final boolean[] delims = {
true, true, false, false, false, false, false, false, false, false,
true, true, false, true, true, false, false, false, false, false,
false, false, false, false, false, false, false, false, false, false,
false, false, false, true, false, false, false, false, true, false,
false, true, true, false, false, false, false, false, true, false,
false, false, false, false, false, false, false, false, false, false,
false, true, false, true, false, false, false, false, false, false,
false, false, false, false, false, false, false, false, false, false,
false, false, false, false, false, false, false, false, false, false,
false, false, true, false, true, false, false, false, false, false,
false, false, false, false, false, false, false, false, false, false,
false, false, false, false, false, false, false, false, false, false,
false, false, false, false, false, false, false, false, false, false,
false, false, false, false, false, false, false, false, false, false,
false, false, false, false, false, false, false, false, false, false,
false, false, false, false, false, false, false, false, false, false,
false, false, false, false, false, false, false, false, false, false,
false, false, false, false, false, false, false, false, false, false,
false, false, false, false, false, false, false, false, false, false,
false, false, false, false, false, false, false, false, false, false,
false, false, false, false, false, false, false, false, false, false,
false, false, false, false, false, false, false, false, false, false,
false, false, false, false, false, false, false, false, false, false,
false, false, false, false, false, false, false, false, false, false,
false, false, false, false, false, false, false, false, false, false,
false, false, false, false, false, false, false};
// Byte forms of PDF keywords, as produced by ByteUtils.getIsoBytes, for comparison against
// raw token bytes (see tokenValueEqualsTo). The arrays are public and mutable: do not modify them.
// "obj": ends an "<number> <number> obj" object header
public static final byte[] Obj = ByteUtils.getIsoBytes("obj");
// "R": ends an "<number> <number> R" indirect reference
public static final byte[] R = ByteUtils.getIsoBytes("R");
// "xref" keyword
public static final byte[] Xref = ByteUtils.getIsoBytes("xref");
// "startxref" keyword
public static final byte[] Startxref = ByteUtils.getIsoBytes("startxref");
// "stream" keyword
public static final byte[] Stream = ByteUtils.getIsoBytes("stream");
// "trailer" keyword (see checkTrailer)
public static final byte[] Trailer = ByteUtils.getIsoBytes("trailer");
// "n" keyword; NOTE(review): presumably the in-use marker of cross-reference entries - confirm
public static final byte[] N = ByteUtils.getIsoBytes("n");
// "f" keyword; NOTE(review): presumably the free marker of cross-reference entries - confirm
public static final byte[] F = ByteUtils.getIsoBytes("f");
// "null" keyword
public static final byte[] Null = ByteUtils.getIsoBytes("null");
// "true" keyword
public static final byte[] True = ByteUtils.getIsoBytes("true");
// "false" keyword
public static final byte[] False = ByteUtils.getIsoBytes("false");
// type of the most recently read token
protected TokenType type;
// object number parsed by nextValidToken() for the last Ref/Obj token
protected int reference;
// generation number parsed by nextValidToken() for the last Ref/Obj token
protected int generation;
// true when the last String token was written in hexadecimal form "<...>"
protected boolean hexString;
// raw bytes of the current token; reset at the start of every nextToken() call
protected ByteBuffer outBuf;
// the source all reads, seeks and push-backs are delegated to
private final RandomAccessFileOrArray file;
/**
* Whether {@link #close()} also closes the underlying source. Enabled by default.
*/
private boolean closeStream = true;
/**
 * Creates a PdfTokenizer that reads from the specified {@link RandomAccessFileOrArray}.
 * Nothing is read from the source here; only an empty token buffer is allocated.
 *
 * @param file the source to tokenize
 */
public PdfTokenizer(RandomAccessFileOrArray file) {
    this.outBuf = new ByteBuffer();
    this.file = file;
}
/**
 * Moves the read position of the underlying source.
 *
 * @param pos the position to continue reading from
 */
public void seek(long pos) {
    this.file.seek(pos);
}
/**
 * Delegates to {@code readFully} of the underlying source to fill the given array.
 *
 * @param bytes the array to read into
 * @throws java.io.IOException in case of any reading error
 */
public void readFully(byte[] bytes) throws java.io.IOException {
    this.file.readFully(bytes);
}
/**
 * Gets the current read position of the underlying source.
 *
 * @return the position reported by the source
 */
public long getPosition() {
    final long position = file.getPosition();
    return position;
}
/**
 * Closes the underlying source, unless closing was disabled via {@link #setCloseStream(boolean)}.
 *
 * @throws java.io.IOException if closing the source fails
 */
@Override
public void close() throws java.io.IOException {
    if (!closeStream) {
        return;
    }
    file.close();
}
/**
 * Gets the length of the underlying source.
 *
 * @return the length reported by the source
 */
public long length() {
    final long sourceLength = file.length();
    return sourceLength;
}
/**
 * Reads the next value from the underlying source.
 *
 * @return the value returned by the source; the callers in this class treat -1 as end of input
 * @throws java.io.IOException in case of any reading error
 */
public int read() throws java.io.IOException {
    final int next = file.read();
    return next;
}
/**
 * Reads up to {@code size} values from the source and turns each one into a single character.
 * Reading stops early when the source reports end of input (-1).
 *
 * @param size the maximum number of characters to read
 * @return the characters read, possibly fewer than {@code size}
 * @throws java.io.IOException in case of any reading error
 */
public String readString(int size) throws java.io.IOException {
    StringBuilder result = new StringBuilder();
    for (int remaining = size; remaining > 0; remaining--) {
        int next = read();
        if (next == -1) {
            break;
        }
        result.append((char) next);
    }
    return result.toString();
}
/**
 * Gets the type of the most recently read token.
 *
 * @return the current token type
 */
public TokenType getTokenType() {
    return this.type;
}
/**
 * Gets the raw bytes of the current token.
 *
 * @return the result of {@code toByteArray()} on the token buffer
 */
public byte[] getByteContent() {
    final byte[] tokenBytes = outBuf.toByteArray();
    return tokenBytes;
}
/**
 * Gets the current token as a string, mapping every byte to the character with the same code.
 * ISO-8859-1 is used explicitly so the result does not depend on the platform default charset;
 * this matches how {@link #readString(int)} turns bytes into characters.
 *
 * @return the token content as a string
 */
public String getStringValue() {
    return new String(outBuf.getInternalBuffer(), 0, outBuf.size(),
            java.nio.charset.StandardCharsets.ISO_8859_1);
}
/**
 * Decodes the current string token, resolving escapes or hexadecimal digits
 * depending on {@link #isHexString()}.
 *
 * @return the decoded bytes of the current token
 */
public byte[] getDecodedStringContent() {
    byte[] raw = outBuf.getInternalBuffer();
    int lastIndex = outBuf.size() - 1;
    return decodeStringContent(raw, 0, lastIndex, isHexString());
}
/**
 * Compares the raw bytes of the current token with the given bytes.
 *
 * @param cmp the bytes to compare with; {@code null} never matches
 * @return true if the token has the same length and the same bytes as {@code cmp}
 */
public boolean tokenValueEqualsTo(byte[] cmp) {
    if (cmp == null || cmp.length != outBuf.size()) {
        return false;
    }
    byte[] tokenBytes = outBuf.getInternalBuffer();
    for (int idx = 0; idx < cmp.length; idx++) {
        if (tokenBytes[idx] != cmp[idx]) {
            return false;
        }
    }
    return true;
}
/**
 * Gets the object number stored by {@link #nextValidToken()} for the last Ref or Obj token.
 *
 * @return the object number
 */
public int getObjNr() {
    return this.reference;
}
/**
 * Gets the generation number stored by {@link #nextValidToken()} for the last Ref or Obj token.
 *
 * @return the generation number
 */
public int getGenNr() {
    return this.generation;
}
/**
 * Pushes the given value back to the source so that it is read again.
 * The end-of-input value -1 is ignored.
 *
 * @param ch the value that was read last
 */
public void backOnePosition(int ch) {
    if (ch == -1) {
        return;
    }
    file.pushBack((byte) ch);
}
/**
 * Reads up to 1024 characters from the current position and looks for a "%PDF-" marker,
 * falling back to "%FDF-" when no PDF marker is present.
 *
 * @return the index of the marker within the characters read
 * @throws java.io.IOException in case of any reading error
 */
public int getHeaderOffset() throws java.io.IOException {
    String head = readString(1024);
    int offset = head.indexOf("%PDF-");
    if (offset >= 0) {
        return offset;
    }
    offset = head.indexOf("%FDF-");
    if (offset >= 0) {
        return offset;
    }
    throw new IOException(IoExceptionMessageConstant.PDF_HEADER_NOT_FOUND, this);
}
/**
 * Checks that the source starts with a complete "%PDF-x.y" header.
 *
 * @return the seven characters following the '%', e.g. "PDF-1.7"
 * @throws java.io.IOException in case of any reading error
 */
public String checkPdfHeader() throws java.io.IOException {
    file.seek(0);
    String str = readString(1024);
    // "%PDF-x.y" takes 8 characters; a shorter source cannot hold a complete header and
    // would otherwise make substring() fail with StringIndexOutOfBoundsException
    if (!str.startsWith("%PDF-") || str.length() < 8) {
        throw new IOException(IoExceptionMessageConstant.PDF_HEADER_NOT_FOUND, this);
    }
    return str.substring(1, 8);
}
/**
 * Checks that the source starts with the "%FDF-" marker.
 *
 * @throws java.io.IOException in case of any reading error
 */
public void checkFdfHeader() throws java.io.IOException {
    file.seek(0);
    String head = readString(1024);
    boolean startsWithFdfMarker = head.indexOf("%FDF-") == 0;
    if (!startsWithFdfMarker) {
        // NOTE(review): the message constant refers to startxref although the header is what
        // is missing here - confirm whether a header-specific constant should be used
        throw new IOException(IoExceptionMessageConstant.FDF_STARTXREF_NOT_FOUND, this);
    }
}
/**
 * Searches backwards from the end of the source for the last "startxref" keyword.
 * The source is scanned in windows of 1024 characters that overlap by the keyword length,
 * so a keyword straddling a window boundary is still found. Position 0 is never scanned.
 *
 * @return the position of the keyword within the source
 * @throws java.io.IOException in case of any reading error
 */
public long getStartxref() throws java.io.IOException {
    final int arrLength = 1024;
    final String keyword = "startxref";
    final int step = arrLength - keyword.length();
    long pos = Math.max(1, file.length() - arrLength);
    while (true) {
        file.seek(pos);
        String str = readString(arrLength);
        int idx = str.lastIndexOf(keyword);
        if (idx >= 0) {
            return pos + idx;
        }
        if (pos == 1) {
            break;
        }
        // clamp instead of stopping when the step overshoots the start, otherwise the
        // leading part of the source would never be scanned
        pos = Math.max(1, pos - step);
    }
    throw new IOException(IoExceptionMessageConstant.PDF_STARTXREF_NOT_FOUND, this);
}
/**
 * Reads the next token that is not a comment and merges "number number R" and
 * "number number obj" sequences into a single Ref or Obj token.
 * When a number is not followed by such a sequence, the source is moved back to just after
 * that number and the number itself is reported.
 *
 * @throws java.io.IOException in case of any reading error
 */
public void nextValidToken() throws java.io.IOException {
    byte[] firstNumber = null;
    byte[] secondNumber = null;
    long afterFirstNumber = 0;
    int numbersSeen = 0;
    while (nextToken()) {
        if (type == TokenType.Comment) {
            continue;
        }
        if (numbersSeen == 0) {
            if (type != TokenType.Number) {
                return;
            }
            afterFirstNumber = file.getPosition();
            firstNumber = getByteContent();
            numbersSeen = 1;
        } else if (numbersSeen == 1) {
            if (type != TokenType.Number) {
                restoreAsNumber(afterFirstNumber, firstNumber);
                return;
            }
            secondNumber = getByteContent();
            numbersSeen = 2;
        } else {
            if (type == TokenType.Other) {
                if (tokenValueEqualsTo(R)) {
                    assert secondNumber != null;
                    type = TokenType.Ref;
                    try {
                        reference = Integer.parseInt(new String(firstNumber));
                        generation = Integer.parseInt(new String(secondNumber));
                    } catch (Exception ex) {
                        // warn about incorrect reference number
                        // Exception: NumberFormatException for java,
                        // FormatException or OverflowException for .NET
                        Logger logger = LoggerFactory.getLogger(PdfTokenizer.class);
                        logger.error(MessageFormatUtil.format(IoLogMessageConstant.INVALID_INDIRECT_REFERENCE,
                                new String(firstNumber), new String(secondNumber)));
                        reference = -1;
                        generation = 0;
                    }
                    return;
                }
                if (tokenValueEqualsTo(Obj)) {
                    assert secondNumber != null;
                    type = TokenType.Obj;
                    reference = Integer.parseInt(new String(firstNumber));
                    generation = Integer.parseInt(new String(secondNumber));
                    return;
                }
            }
            restoreAsNumber(afterFirstNumber, firstNumber);
            return;
        }
    }
    // End of input right after the first number: that number is still the current token.
    // NOTE(review): when the input ends after two numbers the token stays EndOfFile.
    if (numbersSeen == 1) {
        type = TokenType.Number;
        outBuf.reset().append(firstNumber);
    }
    // Getting here means the source ended unexpectedly, or the last token ended exactly
    // at the end of a stream, which can happen inside an object stream.
}

// Moves the source back to the given position and reports the given bytes as a Number token.
private void restoreAsNumber(long position, byte[] numberBytes) {
    file.seek(position);
    type = TokenType.Number;
    outBuf.reset().append(numberBytes);
}
/**
 * Reads the next token from the source. Leading whitespace is skipped, the token type is
 * stored in {@code type} and the raw token bytes (without the surrounding delimiters)
 * are stored in {@code outBuf}.
 *
 * @return false if the end of the source was reached before a token started, true otherwise
 * @throws java.io.IOException in case of any reading error
 */
public boolean nextToken() throws java.io.IOException {
int ch;
outBuf.reset();
// skip whitespace in front of the token
do {
ch = file.read();
} while (ch != -1 && isWhitespace(ch));
if (ch == -1) {
type = TokenType.EndOfFile;
return false;
}
switch (ch) {
case '[': {
type = TokenType.StartArray;
break;
}
case ']': {
type = TokenType.EndArray;
break;
}
case '/': {
// name: collect bytes up to the next delimiter, whitespace or end of input;
// delims is indexed by ch + 1, so the end-of-input value -1 hits index 0
type = TokenType.Name;
while (true) {
ch = file.read();
if (delims[ch + 1])
break;
outBuf.append(ch);
}
// the terminating character belongs to the next token
backOnePosition(ch);
break;
}
case '>': {
// a single '>' is only valid as the end of a hex string, which is consumed below
ch = file.read();
if (ch != '>')
throwError(IoExceptionMessageConstant.GT_NOT_EXPECTED);
type = TokenType.EndDic;
break;
}
case '<': {
int v1 = file.read();
if (v1 == '<') {
type = TokenType.StartDic;
break;
}
// hex string: the hex digit characters are stored as read, not decoded;
// whitespace between digits is skipped
type = TokenType.String;
hexString = true;
int v2 = 0;
while (true) {
while (isWhitespace(v1))
v1 = file.read();
if (v1 == '>')
break;
outBuf.append(v1);
v1 = ByteBuffer.getHex(v1);
// a negative value marks a character that is not a hex digit
if (v1 < 0)
break;
v2 = file.read();
while (isWhitespace(v2))
v2 = file.read();
if (v2 == '>') {
break;
}
outBuf.append(v2);
v2 = ByteBuffer.getHex(v2);
if (v2 < 0)
break;
v1 = file.read();
}
if (v1 < 0 || v2 < 0)
throwError(IoExceptionMessageConstant.ERROR_READING_STRING);
break;
}
case '%': {
// comment: skipped up to and including the first line-end character; not stored
type = TokenType.Comment;
do {
ch = file.read();
} while (ch != -1 && ch != '\r' && ch != '\n');
break;
}
case '(': {
// literal string: balanced inner parentheses are part of the content, and escape
// sequences are stored undecoded (backslash plus the following byte)
type = TokenType.String;
hexString = false;
int nesting = 0;
while (true) {
ch = file.read();
if (ch == -1)
break;
if (ch == '(') {
++nesting;
} else if (ch == ')') {
--nesting;
// the parenthesis closing the string itself is not stored
if (nesting == -1)
break;
} else if (ch == '\\') {
outBuf.append('\\');
ch = file.read();
if (ch < 0)
break;
}
outBuf.append(ch);
}
// the source ended before the closing parenthesis
if (ch == -1)
throwError(IoExceptionMessageConstant.ERROR_READING_STRING);
break;
}
default: {
if (ch == '-' || ch == '+' || ch == '.' || (ch >= '0' && ch <= '9')) {
type = TokenType.Number;
boolean isReal = false;
int numberOfMinuses = 0;
if (ch == '-') {
// Take care of number like "--234". If Acrobat can read them so must we.
do {
++numberOfMinuses;
ch = file.read();
} while (ch == '-');
// a run of minus signs is stored as a single '-'
outBuf.append('-');
} else {
outBuf.append(ch);
// We don't need to check if the number is real over here
// as we need to know that fact only in case if there are any minuses.
ch = file.read();
}
// integer part
while (ch >= '0' && ch <= '9') {
outBuf.append(ch);
ch = file.read();
}
if ( ch == '.'){
isReal = true;
outBuf.append(ch);
ch = file.read();
//verify if there is minus after '.'
//In that case just ignore minus chars and everything after as Adobe Reader does
int numberOfMinusesAfterDot = 0;
if (ch == '-') {
numberOfMinusesAfterDot++;
ch = file.read();
}
// fraction digits are consumed but only stored when no minus followed the dot
while (ch >= '0' && ch <= '9') {
if (numberOfMinusesAfterDot == 0) {
outBuf.append(ch);
}
ch = file.read();
}
}
if (numberOfMinuses > 1 && !isReal) {
// Numbers of integer type and with more than one minus before them
// are interpreted by Acrobat as zero.
outBuf.reset();
outBuf.append('0');
}
} else {
// keyword or operator: collect bytes up to the next delimiter, whitespace or end of input
type = TokenType.Other;
do {
outBuf.append(ch);
ch = file.read();
} while (!delims[ch + 1]);
}
// the character that ended the token belongs to the next token
if (ch != -1)
backOnePosition(ch);
break;
}
}
return true;
}
/**
 * Parses the current token as a {@code long}.
 *
 * @return the parsed value
 * @throws NumberFormatException if the token is not a valid long
 */
public long getLongValue() {
    String text = getStringValue();
    return Long.parseLong(text);
}
/**
 * Parses the current token as an {@code int}.
 *
 * @return the parsed value
 * @throws NumberFormatException if the token is not a valid int
 */
public int getIntValue() {
    String text = getStringValue();
    return Integer.parseInt(text);
}
/**
 * Tells whether the last string token was written in hexadecimal form.
 *
 * @return true for a "&lt;...&gt;" string, false for a "(...)" string
 */
public boolean isHexString() {
    return hexString;
}
/**
 * Tells whether {@link #close()} closes the underlying source.
 *
 * @return true if the source is closed together with this tokenizer
 */
public boolean isCloseStream() {
    return this.closeStream;
}
/**
 * Sets whether {@link #close()} closes the underlying source.
 *
 * @param closeStream true to close the source together with this tokenizer
 */
public void setCloseStream(boolean closeStream) {
    this.closeStream = closeStream;
}
/**
 * Creates a view of the underlying source via {@code createView()}.
 *
 * @return the newly created view
 */
public RandomAccessFileOrArray getSafeFile() {
    return this.file.createView();
}
/**
 * Resolve escape symbols or hexadecimal symbols.
 *
 * NOTE Due to PdfReference 1.7 part 3.2.3 String value contain ASCII characters,
 * so we can convert it directly to byte array.
 *
 * In a literal string an octal escape consists of one to three octal digits; the first
 * character that is not an octal digit ends the escape and is decoded on its own.
 * A backslash that is the very last byte of the range is ignored.
 *
 * @param content string bytes to be decoded
 * @param from given start index (inclusive)
 * @param to given end index (inclusive)
 * @param hexWriting true if given string is hex-encoded, e.g. '&lt;69546578…&gt;'.
 * False otherwise, e.g. '((iText( some version)…)'
 * @return byte[] for decrypting or for creating {@link java.lang.String}.
 */
protected static byte[] decodeStringContent(byte[] content, int from, int to, boolean hexWriting) {
    ByteBuffer buffer = new ByteBuffer(to - from + 1);
    if (hexWriting) {
        // <6954657874ae...>
        int i = from;
        while (i <= to) {
            int v1 = ByteBuffer.getHex(content[i++]);
            if (i > to) {
                // odd number of digits: the missing low nibble counts as 0
                buffer.append(v1 << 4);
                break;
            }
            int v2 = ByteBuffer.getHex(content[i++]);
            buffer.append((v1 << 4) + v2);
        }
    } else {
        // ((iText\( some version)...)
        int i = from;
        while (i <= to) {
            int ch = content[i++];
            if (ch == '\\') {
                if (i > to) {
                    // a backslash with nothing after it escapes nothing: ignore it
                    // instead of reading past the end of the range
                    break;
                }
                boolean lineBreak = false;
                ch = content[i++];
                switch (ch) {
                    case 'n':
                        ch = '\n';
                        break;
                    case 'r':
                        ch = '\r';
                        break;
                    case 't':
                        ch = '\t';
                        break;
                    case 'b':
                        ch = '\b';
                        break;
                    case 'f':
                        ch = '\f';
                        break;
                    case '(':
                    case ')':
                    case '\\':
                        break;
                    case '\r':
                        // escaped line break (CR or CR LF) produces no output
                        lineBreak = true;
                        if (i <= to && content[i++] != '\n') {
                            i--;
                        }
                        break;
                    case '\n':
                        lineBreak = true;
                        break;
                    default: {
                        if (ch < '0' || ch > '7') {
                            // unknown escape: the backslash is dropped, the character is kept
                            break;
                        }
                        int octal = ch - '0';
                        // take at most two more digits, looking at each one before consuming it
                        for (int extraDigits = 0; extraDigits < 2 && i <= to; extraDigits++) {
                            int next = content[i];
                            if (next < '0' || next > '7') {
                                break;
                            }
                            octal = (octal << 3) + (next - '0');
                            i++;
                        }
                        // high-order overflow of a three-digit escape is ignored
                        ch = octal & 0xff;
                        break;
                    }
                }
                if (lineBreak) {
                    continue;
                }
            } else if (ch == '\r') {
                // an unescaped CR or CR LF is read as a single '\n'
                ch = '\n';
                if (i <= to && content[i++] != '\n') {
                    i--;
                }
            }
            buffer.append(ch);
        }
    }
    return buffer.toByteArray();
}
/**
 * Resolves escape sequences or hexadecimal digits over the whole given array.
 *
 * NOTE Due to PdfReference 1.7 part 3.2.3 String value contain ASCII characters,
 * so we can convert it directly to byte array.
 *
 * @param content string bytes to be decoded
 * @param hexWriting true if given string is hex-encoded, e.g. '&lt;69546578…&gt;'.
 * False otherwise, e.g. '((iText( some version)…)'
 * @return byte[] for decrypting or for creating {@link java.lang.String}.
 */
public static byte[] decodeStringContent(byte[] content, boolean hexWriting) {
    final int lastIndex = content.length - 1;
    return decodeStringContent(content, 0, lastIndex, hexWriting);
}
/**
 * Checks whether a character is a whitespace, counting the null character as one.
 * The whitespace codes are 0, 9, 10, 12, 13 and 32.
 *
 * The same as calling {@link #isWhitespace(int, boolean) isWhiteSpace(ch, true)}.
 *
 * @param ch the character code to check
 * @return true if {@code ch} is a whitespace
 */
public static boolean isWhitespace(int ch) {
    final boolean nullCountsAsWhitespace = true;
    return isWhitespace(ch, nullCountsAsWhitespace);
}
/**
 * Checks whether a character is a whitespace. The codes 9, 10, 12, 13 and 32 always are;
 * the null character (0) is one only when the flag says so.
 *
 * @param ch the character code to check
 * @param isWhitespace whether the null character counts as whitespace
 * @return true if {@code ch} is a whitespace
 */
protected static boolean isWhitespace(int ch, boolean isWhitespace) {
    switch (ch) {
        case 0:
            return isWhitespace;
        case 9:
        case 10:
        case 12:
        case 13:
        case 32:
            return true;
        default:
            return false;
    }
}
/**
 * Checks whether a character is one of the delimiters
 * '(', ')', '&lt;', '&gt;', '[', ']', '/' and '%'.
 *
 * @param ch the character code to check
 * @return true if {@code ch} is a delimiter
 */
protected static boolean isDelimiter(int ch) {
    switch (ch) {
        case '(':
        case ')':
        case '<':
        case '>':
        case '[':
        case ']':
        case '/':
        case '%':
            return true;
        default:
            return false;
    }
}
/**
 * Looks the character up in {@link #delims}, which is indexed by {@code ch + 1}.
 *
 * @param ch the character code to check; values outside -1..255 are outside the table
 * @return the table entry for {@code ch}
 */
protected static boolean isDelimiterWhitespace(int ch) {
    final int tableIndex = ch + 1;
    return delims[tableIndex];
}
/**
 * Helper method to handle content errors. Always throws: the given error is wrapped as the
 * cause of an {@code IOException} that carries the current file position.
 *
 * @param error message.
 * @param messageParams error params.
 * @throws IOException always; its message parameter is the position in file.
 */
public void throwError(String error, Object... messageParams) {
    IOException cause = new IOException(error).setMessageParams(messageParams);
    throw new IOException(IoExceptionMessageConstant.ERROR_AT_FILE_POINTER, cause)
            .setMessageParams(file.getPosition());
}
/**
 * Checks whether {@code line} starts with the bytes of 'trailer'.
 *
 * @param line for check
 *
 * @return true, if the first bytes of line are 'trailer', otherwise false
 */
public static boolean checkTrailer(ByteBuffer line) {
    final int keywordLength = Trailer.length;
    if (line.size() < keywordLength) {
        return false;
    }
    int matched = 0;
    while (matched < keywordLength && Trailer[matched] == line.get(matched)) {
        matched++;
    }
    return matched == keywordLength;
}
/**
 * Reads one line into the provided buffer, skipping leading whitespace.
 * See {@link #isWhitespace(int) isWhiteSpace(int)} or {@link #isWhitespace(int, boolean) isWhiteSpace(int, boolean)}
 * for a list of whitespace characters.
 *
 * The same as calling {@link #readLineSegment(com.itextpdf.io.source.ByteBuffer, boolean) readLineSegment(input, true)}.
 *
 * @param buffer a {@link ByteBuffer} to which the result of reading will be saved
 * @return true, if something was read or if the end of the input stream is not reached
 * @throws java.io.IOException in case of any reading error
 */
public boolean readLineSegment(ByteBuffer buffer) throws java.io.IOException {
    final boolean nullCountsAsWhitespace = true;
    return readLineSegment(buffer, nullCountsAsWhitespace);
}
/**
 * Reads one line into the provided buffer. Leading whitespace is skipped and runs of
 * tab, form feed and space inside the line are stored as their first character only.
 * When the buffer fills up, the rest of the line is read and discarded.
 * See {@link #isWhitespace(int) isWhiteSpace(int)} or {@link #isWhitespace(int, boolean) isWhiteSpace(int, boolean)}
 * for a list of whitespace characters.
 *
 * @param buffer a {@link ByteBuffer} to which the result of reading will be saved
 * @param isNullWhitespace boolean to indicate whether '0' is whitespace or not.
 * If in doubt, use true or overloaded method {@link #readLineSegment(com.itextpdf.io.source.ByteBuffer) readLineSegment(input)}
 * @return true, if something was read or if the end of the input stream is not reached
 * @throws java.io.IOException in case of any reading error
 */
public boolean readLineSegment(ByteBuffer buffer, boolean isNullWhitespace) throws java.io.IOException {
    // ssteward, pdftk-1.10, 040922:
    // skip initial whitespace; PdfReader.rebuildXref() assumes that the line provided
    // by readLineSegment does not start with whitespace
    int c = read();
    while (isWhitespace(c, isNullWhitespace)) {
        c = read();
    }
    boolean previousWasBlank = false;
    boolean lineEnded = false;
    while (!lineEnded) {
        if (c == -1 || c == '\n') {
            lineEnded = true;
        } else if (c == '\r') {
            lineEnded = true;
            skipLineFeedAfterCarriageReturn();
        } else if (c == 9 || c == 12 || c == 32) {
            // only the first character of a run of blanks is kept
            if (!previousWasBlank) {
                previousWasBlank = true;
                buffer.append((byte) c);
            }
        } else {
            previousWasBlank = false;
            buffer.append((byte) c);
        }
        // stop before reading again once the line ended or the buffer is full
        if (lineEnded || buffer.size() == buffer.capacity()) {
            lineEnded = true;
        } else {
            c = read();
        }
    }
    if (buffer.size() == buffer.capacity()) {
        // the buffer is full: consume the remainder of the line without storing it
        lineEnded = false;
        while (!lineEnded) {
            c = read();
            if (c == -1 || c == '\n') {
                lineEnded = true;
            } else if (c == '\r') {
                lineEnded = true;
                skipLineFeedAfterCarriageReturn();
            }
        }
    }
    return c != -1 || !buffer.isEmpty();
}

// After a carriage return, consumes a directly following line feed; anything else is left unread.
private void skipLineFeedAfterCarriageReturn() throws java.io.IOException {
    long afterCarriageReturn = getPosition();
    if (read() != '\n') {
        seek(afterCarriageReturn);
    }
}
/**
 * Check whether line starts with object declaration, i.e. two numbers followed by 'obj'.
 * @param lineTokenizer tokenizer, built by single line.
 * @return object number and generation if check is successful, otherwise - null.
 */
public static int[] checkObjectStart(PdfTokenizer lineTokenizer) {
    try {
        lineTokenizer.seek(0);
        boolean hasObjectNumber = lineTokenizer.nextToken()
                && lineTokenizer.getTokenType() == TokenType.Number;
        if (!hasObjectNumber) {
            return null;
        }
        int objectNumber = lineTokenizer.getIntValue();
        boolean hasGeneration = lineTokenizer.nextToken()
                && lineTokenizer.getTokenType() == TokenType.Number;
        if (!hasGeneration) {
            return null;
        }
        int generationNumber = lineTokenizer.getIntValue();
        if (!lineTokenizer.nextToken()) {
            return null;
        }
        return Arrays.equals(Obj, lineTokenizer.getByteContent())
                ? new int[]{objectNumber, generationNumber}
                : null;
    } catch (Exception ignored) {
        // empty on purpose: any failure simply means the line is not an object start
        return null;
    }
}
}