org.sejda.sambox.input.SourceReader Maven / Gradle / Ivy
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.sejda.sambox.input;
import static java.util.Arrays.asList;
import static org.sejda.sambox.util.CharUtils.ASCII_BACKSPACE;
import static org.sejda.sambox.util.CharUtils.ASCII_CARRIAGE_RETURN;
import static org.sejda.sambox.util.CharUtils.ASCII_FORM_FEED;
import static org.sejda.sambox.util.CharUtils.ASCII_HORIZONTAL_TAB;
import static org.sejda.sambox.util.CharUtils.ASCII_LINE_FEED;
import static org.sejda.sambox.util.CharUtils.isCarriageReturn;
import static org.sejda.sambox.util.CharUtils.isDigit;
import static org.sejda.sambox.util.CharUtils.isEOL;
import static org.sejda.sambox.util.CharUtils.isEndOfName;
import static org.sejda.sambox.util.CharUtils.isHexDigit;
import static org.sejda.sambox.util.CharUtils.isLineFeed;
import static org.sejda.sambox.util.CharUtils.isOctalDigit;
import static org.sejda.sambox.util.CharUtils.isWhitespace;
import static org.sejda.util.RequireUtils.requireIOCondition;
import static org.sejda.util.RequireUtils.requireNotNullArg;
import java.io.Closeable;
import java.io.IOException;
import org.sejda.io.SeekableSource;
import org.sejda.sambox.SAMBox;
import org.sejda.sambox.cos.COSObjectKey;
import org.sejda.sambox.util.CharUtils;
import org.sejda.sambox.util.Charsets;
import org.sejda.sambox.util.Pool;
import org.sejda.util.IOUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Component responsible for reading a {@link SeekableSource}. Methods to read expected kind of tokens are available as
* well as methods to skip them. This implementation uses a pool of {@link StringBuilder}s to minimize garbage
* collection.
*
* @author Andrea Vacondio
*/
class SourceReader implements Closeable
{
private static final Logger LOG = LoggerFactory.getLogger(SourceReader.class);
private static final long OBJECT_NUMBER_THRESHOLD = 10000000000L;
private static final int GENERATION_NUMBER_THRESHOLD = 65535;
public static final String OBJ = "obj";
private Pool pool = new Pool<>(StringBuilder::new,
Integer.getInteger(SAMBox.BUFFERS_POOL_SIZE_PROPERTY, 10)).onGive(b -> {
b.setLength(0);
b.trimToSize();
});
private SeekableSource source;
public SourceReader(SeekableSource source)
{
requireNotNullArg(source, "Cannot read a null source");
this.source = source;
}
/**
* @return the source for this reader
*/
public SeekableSource source()
{
return source;
}
/**
* @return the current position
* @throws IOException
* @see {@link SeekableSource#position()}
*/
public long position() throws IOException
{
return source.position();
}
/**
* seeks to the given offset
*
* @param offset the new offset
* @throws IOException
* @see {@link SeekableSource#position(long)}
*/
public void position(long offset) throws IOException
{
source.position(offset);
}
/**
* @return the source length
* @see {@link SeekableSource#size()}
*/
public long length()
{
return source.size();
}
/**
* Skips the expected given String
*
* @param expectedString the String value that is expected.
* @throws IOException if the String char is not the expected value or if an I/O error occurs.
*/
public final void skipExpected(String expected) throws IOException
{
for (char c : expected.toCharArray())
{
skipExpected(c);
}
}
/**
* Skips one char and throws an exception if it is not the expected value.
*
* @param ec the char value that is expected.
* @throws IOException if the read char is not the expected value or if an I/O error occurs.
*/
public void skipExpected(char ec) throws IOException
{
char c = (char) source.read();
if (c != ec)
{
throw new IOException(
"expected='" + ec + "' actual='" + c + "' at offset " + (position() - 1));
}
}
/**
* Skips the next token if it's value is one of the given ones
*
* @param values the values to skip
* @return true if the token is found and skipped, false otherwise.
* @throws IOException if there is an error reading from the stream
*/
public boolean skipTokenIfValue(String... values) throws IOException
{
long pos = position();
String token = readToken();
if (!asList(values).contains(token))
{
source.position(pos);
return false;
}
return true;
}
/**
* Skips an indirect object definition open tag (Ex. "12 0 obj") as defined in the chap 7.3.10 PDF 32000-1:2008.
*
* @throws IOException if we are reading a not valid indirect object definition open tag
*/
public void skipIndirectObjectDefinition() throws IOException
{
readObjectNumber();
readGenerationNumber();
skipSpaces();
skipExpected(OBJ);
}
/**
* Skips an indirect object definition open tag (Ex. "12 0 obj") as defined in the chap 7.3.10 PDF 32000-1:2008.
*
* @param expected object we are expecting to find
* @throws IOException if we are reading a not valid indirect object definition open tag or the object number or
* generation number don't match the expected object
*/
public void skipExpectedIndirectObjectDefinition(COSObjectKey expected) throws IOException
{
long objNumOffset = position();
long number = readObjectNumber();
if (number != expected.objectNumber())
{
throw new IOException(
String.format("Expected '%d' object number at offset %d but was '%d'",
expected.objectNumber(), objNumOffset, number));
}
long genNumOffset = position();
long generation = readGenerationNumber();
if (generation != expected.generation())
{
throw new IOException(
String.format("Expected '%d' generation number at offset %d but was '%d'",
expected.generation(), genNumOffset, number));
}
skipSpaces();
skipExpected(OBJ);
}
/**
* @return The next token that was read from the stream.
*
* @throws IOException If there is an error reading from the stream.
* @see CharUtils#isEndOfName(int)
*/
public String readToken() throws IOException
{
skipSpaces();
StringBuilder builder = pool.borrow();
try
{
int c;
while (((c = source.read()) != -1) && !isEndOfName(c))
{
builder.append((char) c);
}
unreadIfValid(c);
return builder.toString();
}
finally
{
pool.give(builder);
}
}
/**
* Unreads white spaces
*
* @throws IOException
*/
public void unreadSpaces() throws IOException
{
int c;
while ((c = source.peekBack()) != -1 && isWhitespace(c))
{
source.back();
}
}
/**
* Unreads characters until it finds a white space
*
* @throws IOException
*/
public void unreadUntilSpaces() throws IOException
{
int c;
while ((c = source.peekBack()) != -1 && !isWhitespace(c))
{
source.back();
}
}
/**
* @param valid values for the next token.
* @return true if the next token is one of the given values. false otherwise.
* @throws IOException if there is an error reading from the stream
*/
public boolean isNextToken(String... values) throws IOException
{
long pos = position();
String token = readToken();
position(pos);
return asList(values).contains(token);
}
/**
* Reads bytes until the first end of line marker occurs. NOTE: The EOL marker may consists of 1 (CR or LF) or 2 (CR
* and CL) bytes which is an important detail if one wants to unread the line.
*
* @return The characters between the current position and the end of the line.
* @throws IOException If there is an error reading from the stream.
*/
public String readLine() throws IOException
{
requireIOCondition(source.peek() != -1, "Expected line but was end of file");
StringBuilder builder = pool.borrow();
try
{
int c;
while ((c = source.read()) != -1 && !isEOL(c))
{
builder.append((char) c);
}
if (isCarriageReturn(c) && isLineFeed(source.peek()))
{
source.read();
}
return builder.toString();
}
finally
{
pool.give(builder);
}
}
/**
* Reads a long and throws an {@link IOException} if the long value is negative or has more than 10 digits (i.e. :
* bigger than {@link #OBJECT_NUMBER_THRESHOLD})
*
* @return the object number being read.
* @throws IOException if an I/O error occurs
*/
public long readObjectNumber() throws IOException
{
long retval = readLong();
if (retval < 0 || retval >= OBJECT_NUMBER_THRESHOLD)
{
throw new IOException(
"Object Number '" + retval + "' has more than 10 digits or is negative");
}
return retval;
}
/**
* reads an integer and throws an {@link IOException} if the integer value has more than the maximum object revision
* (i.e. : bigger than {@link #GENERATION_NUMBER_THRESHOLD})
*
* @return the generation number being read.
* @throws IOException if an I/O error occurs
*/
public int readGenerationNumber() throws IOException
{
int retval = readInt();
if (retval < 0 || retval > GENERATION_NUMBER_THRESHOLD)
{
throw new IOException("Generation Number '" + retval + "' has more than 5 digits");
}
return retval;
}
/**
* Reads a token conforming with PDF Name Objects chap 7.3.5 PDF 32000-1:2008.
*
* @return the generation number being read.
* @throws IOException if an I/O error occurs
*/
public String readName() throws IOException
{
skipExpected('/');
StringBuilder builder = pool.borrow();
try
{
int i;
while (((i = source.read()) != -1) && !isEndOfName(i))
{
char c = (char) i;
if (c == '#')
{
char ch1 = (char) source.read();
char ch2 = (char) source.read();
// Prior to PDF v1.2, the # was not a special character. Also,
// it has been observed that various PDF tools do not follow the
// spec with respect to the # escape, even though they report
// PDF versions of 1.2 or later. The solution here is that we
// interpret the # as an escape only when it is followed by two
// valid hex digits.
//
if (isHexDigit(ch1) && isHexDigit(ch2))
{
String hex = "" + ch1 + ch2;
c = (char) Integer.parseInt(hex, 16);
}
else
{
source.back(2);
LOG.warn(
"Found NUMBER SIGN (#) not used as escaping char while reading name at "
+ position());
}
}
builder.append(c);
}
unreadIfValid(i);
return builder.toString();
}
finally
{
pool.give(builder);
}
}
/**
* @return The integer that was read from the stream.
* @throws IOException If there is an error reading from the stream.
*/
public int readInt() throws IOException
{
String intBuffer = readIntegerNumber();
try
{
return Integer.parseInt(intBuffer);
}
catch (NumberFormatException e)
{
source.back(intBuffer.getBytes(Charsets.ISO_8859_1).length);
throw new IOException(
String.format("Expected an integer type at offset %d but was '%s'", position(),
intBuffer),
e);
}
}
/**
* @return The long that was read from the stream.
* @throws IOException If there is an error reading from the stream.
*/
public long readLong() throws IOException
{
String longBuffer = readIntegerNumber();
try
{
return Long.parseLong(longBuffer);
}
catch (NumberFormatException e)
{
source.back(longBuffer.getBytes(Charsets.ISO_8859_1).length);
throw new IOException(String.format("Expected a long type at offset %d but was '%s'",
position(), longBuffer), e);
}
}
/**
* Reads a a token conforming with a PDF Integer object defined in Numeric Objects chap 7.3.3 PDF 32000-1:2008.
*
* @return the token to parse as {@link Integer} or {@link Long}.
* @throws IOException If there is an error reading from the stream.
*/
public final String readIntegerNumber() throws IOException
{
skipSpaces();
StringBuilder builder = pool.borrow();
try
{
int c = source.read();
if (c != -1 && (isDigit(c) || c == '+' || c == '-'))
{
builder.append((char) c);
while ((c = source.read()) != -1 && isDigit(c))
{
builder.append((char) c);
}
}
unreadIfValid(c);
return builder.toString();
}
finally
{
pool.give(builder);
}
}
/**
* Reads a token conforming with PDF Numeric Objects chap 7.3.3 PDF 32000-1:2008.
*
* @return the token to parse as integer or real number.
* @throws IOException If there is an error reading from the stream.
*/
public final String readNumber() throws IOException
{
StringBuilder builder = pool.borrow();
try
{
int c = source.read();
if (c != -1 && (isDigit(c) || c == '+' || c == '-' || c == '.'))
{
builder.append((char) c);
while ((c = source.read()) != -1
&& (isDigit(c) || c == '.' || c == 'E' || c == 'e'))
{
builder.append((char) c);
}
}
unreadIfValid(c);
return builder.toString();
}
finally
{
pool.give(builder);
}
}
/**
* Reads a token conforming with PDF Hexadecimal Strings chap 7.3.4.3 PDF 32000-1:2008. Any non hexadecimal char
* found while parsing the token is replace with the default '0' hex char.
*
* @return the token to parse as an hexadecimal string
* @throws IOException If there is an error reading from the stream.
*/
public final String readHexString() throws IOException
{
skipExpected('<');
StringBuilder builder = pool.borrow();
try
{
int c;
while (((c = source.read()) != -1) && c != '>')
{
if (isHexDigit(c))
{
builder.append((char) c);
}
else if (isWhitespace(c))
{
continue;
}
else
{
// this differs from original PDFBox implementation. It replaces the wrong char with a default value
// and goes on.
LOG.warn(String.format(
"Expected an hexadecimal char at offset %d but was '%c'. Replaced with default 0.",
position() - 1, c));
builder.append('0');
}
}
requireIOCondition(c != -1,
"Unexpected EOF. Missing closing bracket for hexadecimal string.");
return builder.toString();
}
finally
{
pool.give(builder);
}
}
/**
* Reads a token conforming with PDF Literal Strings chap 7.3.4.2 PDF 32000-1:2008.
*
* @return the token to parse as a literal string
* @throws IOException If there is an error during parsing.
*/
public String readLiteralString() throws IOException
{
skipExpected('(');
int bracesCounter = 1;
StringBuilder builder = pool.borrow();
try
{
int i;
while ((i = source.read()) != -1 && bracesCounter > 0)
{
char c = (char) i;
switch (c)
{
case '(':
bracesCounter++;
builder.append(c);
break;
case ')':
bracesCounter--;
// TODO PDFBox 276
// this differs from the PDFBox 2.0.0 impl.
// consider if we want to take care of this. Maybe investigate Acrobat to see how they do it
if (bracesCounter > 0)
{
builder.append(c);
}
break;
case '\\':
{
char next = (char) source.read();
switch (next)
{
case 'n':
builder.append((char) ASCII_LINE_FEED);
break;
case 'r':
builder.append((char) ASCII_CARRIAGE_RETURN);
break;
case 't':
builder.append((char) ASCII_HORIZONTAL_TAB);
break;
case 'b':
builder.append((char) ASCII_BACKSPACE);
break;
case 'f':
builder.append((char) ASCII_FORM_FEED);
break;
case ')':
// TODO PDFBox 276
// this differs from the PDFBox 2.0.0 impl.
// consider if we want to take care of this. Maybe investigate Acrobat to see how they do it
case '(':
case '\\':
builder.append(next);
break;
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
{
StringBuilder octal = pool.borrow();
try
{
octal.append(next);
next = (char) source.read();
if (isOctalDigit(next))
{
octal.append(next);
next = (char) source.read();
if (isOctalDigit(next))
{
octal.append(next);
}
else
{
unreadIfValid(next);
}
}
else
{
unreadIfValid(next);
}
builder.append((char) Integer.parseInt(octal.toString(), 8));
}
finally
{
pool.give(octal);
}
break;
}
case ASCII_LINE_FEED:
case ASCII_CARRIAGE_RETURN:
{
// this is a break in the line so ignore it and the newline and continue
while ((c = (char) source.read()) != -1 && isEOL(c))
{
// NOOP
}
unreadIfValid(c);
break;
}
default:
// dropping the backslash
unreadIfValid(c);
}
break;
}
case ASCII_LINE_FEED:
builder.append((char) ASCII_LINE_FEED);
break;
case ASCII_CARRIAGE_RETURN:
{
builder.append((char) ASCII_LINE_FEED);
if (!CharUtils.isLineFeed(source.read()))
{
unreadIfValid(c);
}
break;
}
default:
builder.append(c);
}
}
unreadIfValid(i);
return builder.toString();
}
finally
{
pool.give(builder);
}
}
/**
* Skips all spaces and comments that are present.
*
* @throws IOException If there is an error reading from the stream.
*/
public void skipSpaces() throws IOException
{
int c = source.read();
// 37 is the % character, a comment
while (isWhitespace(c) || c == 37)
{
if (c == 37)
{
// skip past the comment section
while ((c = source.read()) != -1 && !isEOL(c))
{
// NOOP
}
}
else
{
c = source.read();
}
}
unreadIfValid(c);
}
/**
* Unreads the given character if it's not -1
*
* @param c
* @throws IOException
*/
public void unreadIfValid(int c) throws IOException
{
if (c != -1)
{
source.back();
}
}
/**
* Closes the {@link SeekableSource} this reader was created from.
*/
@Override
public void close() throws IOException
{
IOUtils.close(source);
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy