// org.sejda.sambox.input.BaseCOSParser Maven / Gradle / Ivy
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.sejda.sambox.input;
import static org.sejda.sambox.util.CharUtils.isCarriageReturn;
import static org.sejda.sambox.util.CharUtils.isLineFeed;
import static org.sejda.sambox.util.CharUtils.isSpace;
import java.io.IOException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.sejda.io.SeekableSource;
import org.sejda.sambox.cos.COSArray;
import org.sejda.sambox.cos.COSBase;
import org.sejda.sambox.cos.COSBoolean;
import org.sejda.sambox.cos.COSDictionary;
import org.sejda.sambox.cos.COSInteger;
import org.sejda.sambox.cos.COSName;
import org.sejda.sambox.cos.COSNull;
import org.sejda.sambox.cos.COSNumber;
import org.sejda.sambox.cos.COSStream;
import org.sejda.sambox.cos.COSString;
import org.sejda.sambox.util.Charsets;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Base parser for COS objects providing methods to get parsed objects from the given {@link SeekableSource}
*
* @author Andrea Vacondio
*/
abstract class BaseCOSParser extends SourceReader
{
// Logger used by this class to report recoverable syntax problems found while parsing.
private static final Logger LOG = LoggerFactory.getLogger(BaseCOSParser.class);
// Keyword that closes an object; also used as a recovery marker for unterminated dictionaries, arrays and streams.
public static final String ENDOBJ = "endobj";
// Keyword that follows the two integers of an object definition (see nextArray recovery logic).
public static final String OBJ = "obj";
// Keyword expected right before the stream data (see nextStream).
public static final String STREAM = "stream";
// Keyword expected right after the stream data (see nextStream).
public static final String ENDSTREAM = "endstream";
/**
 * Creates a parser reading from the given source.
 *
 * @param source the source to parse, handed over to the {@link SourceReader} constructor
 */
BaseCOSParser(SeekableSource source)
{
super(source);
}
/**
 * Parses the next token of the source as a basic type object. Concrete parsers provide the implementation; this
 * class uses it to read dictionary values and array items.
 *
 * @return The next parsed basic type object from the stream or null if the next token is not a COSBase. Basic types
 * are defined in Chap 7.3 of PDF 32000-1:2008
 * @throws IOException If there is an error during parsing.
 */
public abstract COSBase nextParsedToken() throws IOException;
/**
 * Parses a dictionary object. Dictionary objects are defined in Chap 7.3.7 of PDF 32000-1:2008.
 * <p>
 * The opening {@code <<} is expected at the current position, then key/value pairs are read until a {@code >} or
 * the end of the source is met. A key that does not start with {@code /} is reported and skipped; if the parser
 * cannot recover from it, the entries collected so far are returned. A key whose value cannot be parsed is reported
 * and left out of the dictionary.
 *
 * @return The next parsed dictionary object from the stream.
 * @throws IOException If there is an error during parsing.
 */
public COSDictionary nextDictionary() throws IOException
{
    skipExpected("<<");
    skipSpaces();
    COSDictionary dictionary = new COSDictionary();
    int c;
    while ((c = source().peek()) != -1 && c != '>')
    {
        if (c != '/')
        {
            // not a name: try to skip the garbage and resume from the next key or the end of the dictionary
            LOG.warn("Invalid dictionary key, expected '/' but was '{}' at {}", (char) c,
                    position());
            if (!consumeInvalidDictionaryKey())
            {
                return dictionary;
            }
        }
        else
        {
            COSName key = nextName();
            COSBase value = nextParsedToken();
            if (value == null)
            {
                LOG.warn("Bad dictionary declaration for key '{}'", key);
            }
            else
            {
                dictionary.setItem(key, value);
            }
        }
        skipSpaces();
    }
    skipExpected(">>");
    return dictionary;
}
/**
 * Consumes an invalid dictionary key, reading one character at a time until a {@code /} (start of the next key) or
 * a {@code >} (end of the dictionary) is about to be read.
 *
 * @return true if the dictionary has been recovered and parsing can go on, false if 'endobj' or 'endstream' was
 * found first or the end of the source was reached
 * @throws IOException If there is an error reading from the source.
 */
private boolean consumeInvalidDictionaryKey() throws IOException
{
    int c;
    while ((c = source().peek()) != -1 && c != '>' && c != '/')
    {
        // in addition to stopping when we find / or >, we also want
        // to stop when we find endstream or endobj.
        if (isNextToken(ENDOBJ, ENDSTREAM))
        {
            LOG.warn(
                    "Found unexpected 'endobj' or 'endstream' at position {}, assuming end of dictionary",
                    position());
            return false;
        }
        source().read();
    }
    // -1 means the source ended before a key or the dictionary end was found
    return c != -1;
}
/**
 * Parses an array object. Array objects are defined in Chap 7.3.6 of PDF 32000-1:2008.
 * <p>
 * The opening {@code [} is expected at the current position, then items are read until a {@code ]} or the end of
 * the source is met. Two kinds of unterminated arrays are tolerated: when 'endobj' or 'endstream' is met the items
 * read so far are returned; when 'obj' is met right after two integers, those two integers are considered the start
 * of the next object definition, they are unread and removed from the returned array.
 *
 * @return The next parsed array object from the stream.
 * @throws IOException If there is an error during parsing.
 */
public COSArray nextArray() throws IOException
{
    skipExpected('[');
    COSArray array = new COSArray();
    skipSpaces();
    int c;
    while (((c = source().peek()) != -1) && c != ']')
    {
        COSBase item = nextParsedToken();
        if (item != null)
        {
            array.add(item);
        }
        else if (isNextToken(ENDOBJ, ENDSTREAM))
        {
            // This could be an "endobj" or "endstream" which means we can assume that
            // the array has ended.
            LOG.warn(
                    "Found unexpected 'endobj' or 'endstream' at position {}, assuming end of array",
                    position());
            return array;
        }
        else if (isNextToken(OBJ) && array.size() >= 2
                && (array.getObject(array.size() - 1) instanceof COSInteger)
                && (array.getObject(array.size() - 2) instanceof COSInteger))
        {
            // the next token is "obj" and the latest two are two integer. We assume the array wasn't
            // correctly terminated and we read the object definition as part of the array.
            // We have to unread the latest two int and remove them from the array
            unreadSpaces();
            unreadUntilSpaces();
            unreadSpaces();
            unreadUntilSpaces();
            array.removeLast();
            array.removeLast();
            LOG.warn("Found unexpected object definition at position {}, assuming end of array",
                    position());
            return array;
        }
        skipSpaces();
    }
    skipExpected(']');
    return array;
}
/**
 * Parses a boolean object. Boolean objects are defined in Chap 7.3.2 of PDF 32000-1:2008. The keyword to expect is
 * chosen by looking at the next character only: 't' selects "true", anything else selects "false".
 *
 * @return The next parsed boolean object from the stream.
 * @throws IOException If there is an error during parsing.
 */
public COSBoolean nextBoolean() throws IOException
{
    final boolean startsLikeTrue = ((char) source().peek()) == 't';
    if (!startsLikeTrue)
    {
        skipExpected("false");
        return COSBoolean.FALSE;
    }
    skipExpected("true");
    return COSBoolean.TRUE;
}
/**
 * Parses a numeric object. Numeric objects are defined in Chap 7.3.3 of PDF 32000-1:2008.
 *
 * @return The next parsed numeric object from the stream, built from the number read at the current position.
 * @throws IOException If there is an error during parsing.
 */
public COSNumber nextNumber() throws IOException
{
    final String literal = readNumber();
    return COSNumber.get(literal);
}
/**
 * Consumes the "null" keyword expected at the current position.
 *
 * @return The next parsed null object from the stream. Null object is defined in Chap 7.3.9 of PDF 32000-1:2008
 * @throws IOException If there is an error during parsing.
 */
public COSNull nextNull() throws IOException
{
// the keyword itself carries no data, the shared COSNull.NULL instance is returned
skipExpected("null");
return COSNull.NULL;
}
/**
 * Parses a name object. Name objects are defined in Chap 7.3.5 of PDF 32000-1:2008.
 *
 * @return The next parsed name object from the stream, built from the name read at the current position.
 * @throws IOException If there is an error during parsing.
 */
public COSName nextName() throws IOException
{
    final String rawName = readName();
    return COSName.getPDFName(rawName);
}
/**
 * Parses a literal string object. Literal string objects are defined in Chap 7.3.4.2 of PDF 32000-1:2008. The text
 * read from the source is converted to bytes using ISO-8859-1.
 *
 * @return The next parsed literal string object from the stream.
 * @throws IOException If there is an error during parsing.
 */
public COSString nextLiteralString() throws IOException
{
    final byte[] content = readLiteralString().getBytes(Charsets.ISO_8859_1);
    return COSString.newInstance(content);
}
/**
 * Parses a hexadecimal string object. Hexadecimal string objects are defined in Chap 7.3.4.3 of PDF 32000-1:2008.
 *
 * @return The next parsed hexadecimal string object from the stream.
 * @throws IOException If there is an error during parsing.
 */
public COSString nextHexadecimalString() throws IOException
{
    final String hexDigits = readHexString();
    return COSString.parseHex(hexDigits);
}
/**
 * Parses a string object, either literal or hexadecimal depending on the next character. String objects are
 * defined in Chap 7.3.4 of PDF 32000-1:2008.
 *
 * @return The next parsed string object from the stream.
 * @throws IOException If the next character is neither '(' nor '<', if the end of the source has been reached or
 * if there is an error during parsing.
 */
public COSString nextString() throws IOException
{
    int next = source().peek();
    if (next == -1)
    {
        // report the end of the source explicitly instead of printing the -1 marker as a character
        throw new IOException(String.format(
                "Expected '(' or '<' at offset %d but reached the end of the source", position()));
    }
    switch ((char) next)
    {
    case '(':
        return nextLiteralString();
    case '<':
        return nextHexadecimalString();
    default:
        throw new IOException(String.format("Expected '(' or '<' at offset %d but was '%c'",
                position(), (char) next));
    }
}
/**
 * Reads a COSStream from the source. The 'stream' keyword is expected at the current position (leading spaces are
 * skipped), followed by an end of line: a CR LF pair or a single LF are consumed, unexpected space characters
 * before it and a CR not followed by LF are tolerated and reported.
 * <p>
 * The stream length is taken from the given dictionary and, when missing or invalid, it is found searching for the
 * 'endstream' or 'endobj' keywords. When the resulting length is greater than zero the stream data is not copied:
 * the returned stream refers to the source at the current position. Otherwise a stream with the dictionary only is
 * returned. After the data, the 'endstream' keyword is skipped if present; a warning is logged if 'endobj' is found
 * in its place.
 *
 * @param streamDictionary dictionary that goes with this stream.
 *
 * @return parsed pdf stream.
 *
 * @throws IOException if an error occurred reading the stream, like problems with reading length attribute.
 */
public COSStream nextStream(COSDictionary streamDictionary) throws IOException
{
    skipSpaces();
    skipExpected(STREAM);
    int c = source().read();
    while (isSpace(c))
    {
        LOG.warn("Found unexpected space character after 'stream' keyword");
        c = source().read();
    }
    if (isCarriageReturn(c))
    {
        c = source().read();
        if (!isLineFeed(c))
        {
            // lone CR: the character just read belongs to the stream data, put it back
            source().back();
            LOG.warn("Couldn't find expected LF following CR after 'stream' keyword at {}",
                    position());
        }
    }
    else if (!isLineFeed(c))
    {
        // no end of line at all: the character just read belongs to the stream data, put it back
        source().back();
    }
    final COSStream stream;
    long length = streamLength(streamDictionary);
    if (length > 0)
    {
        stream = new COSStream(streamDictionary, source(), position(), length);
    }
    else
    {
        stream = new COSStream(streamDictionary);
    }
    // move past the stream data
    source().forward(stream.getFilteredLength());
    if (!skipTokenIfValue(ENDSTREAM) && isNextToken(ENDOBJ))
    {
        LOG.warn("Expected 'endstream' at {} but was 'endobj'", position());
    }
    return stream;
}
/**
 * Retrieves the stream length. It gets it from the dictionary; if the value found there is not greater than zero
 * (missing, invalid or empty) it applies a fallback strategy searching for the 'endstream' or 'endobj' keywords.
 *
 * @param streamDictionary the dictionary of the stream whose data starts at the current position
 * @return the length
 * @throws IOException If there is an error reading from the source.
 */
private long streamLength(COSDictionary streamDictionary) throws IOException
{
    long length = streamLengthFrom(streamDictionary);
    if (length <= 0)
    {
        LOG.info(
                "Using fallback strategy reading until 'endstream' or 'endobj' is found. Starting at offset {}",
                position());
        length = findStreamLength();
    }
    return length;
}
/**
 * Reads the stream length from the {@link COSName#LENGTH} entry of the given dictionary. Whatever the outcome, the
 * source is moved back to the position it had when this method was called.
 *
 * @param streamDictionary the dictionary of the stream whose data starts at the current position
 * @return the stream length if found in the dictionary. -1 if nothing is found or if the length is incorrect.
 * @throws IOException If the length entry is not a number or there is an error reading from the source.
 */
private long streamLengthFrom(COSDictionary streamDictionary) throws IOException
{
    final long rewindTo = position();
    final COSBase declaredLength = streamDictionary.getItem(COSName.LENGTH);
    try
    {
        final long validated = doStreamLengthFrom(declaredLength);
        return validated;
    }
    finally
    {
        // validation moves around the source, always go back where we started
        position(rewindTo);
    }
}
/**
 * Validates the given length object against the source: the length must be a non negative number, it must not go
 * past the end of the source and the 'endstream' keyword must be found at the offset where the stream data is
 * supposed to end. The current position is taken as the start of the stream data and it is NOT restored by this
 * method.
 *
 * @param lengthBaseObj the value of the length entry of the stream dictionary, can be null
 * @return the validated length or -1 if the length is missing, out of range or not followed by 'endstream'
 * @throws IOException If the length does not resolve to a number or there is an error reading from the source.
 */
private long doStreamLengthFrom(COSBase lengthBaseObj) throws IOException
{
    long startingOffset = position();
    if (lengthBaseObj == null)
    {
        LOG.warn("Invalid stream length. No length provided");
        return -1;
    }
    COSBase retVal = lengthBaseObj.getCOSObject();
    if (!(retVal instanceof COSNumber))
    {
        // guard the message against a null value so that callers get the IOException and not a NPE
        throw new IOException("Invalid stream length. Expected number instance but was "
                + (retVal == null ? "null" : retVal.getClass().getSimpleName()));
    }
    long length = ((COSNumber) retVal).longValue();
    // compare against the remaining bytes instead of summing first, so that a huge length cannot overflow and
    // slip through; a negative length would point before the stream data and is rejected as well
    if (length < 0 || length > length() - startingOffset)
    {
        LOG.warn("Invalid stream length. Out of range");
        return -1;
    }
    long endStreamOffset = startingOffset + length;
    position(endStreamOffset);
    if (!isNextToken(ENDSTREAM))
    {
        LOG.warn("Invalid stream length. Expected '{}' at {}", ENDSTREAM, endStreamOffset);
        return -1;
    }
    return length;
}
/**
 * Searches forward from the current position for the "endstream" keyword, which marks the end of this stream
 * object. Some pdf files, however, forget to write the endstream tag and just close off the object with "endobj",
 * so that keyword is accepted as well. Whatever the outcome, the source is moved back to the position it had when
 * this method was called.
 *
 * @return the length from the current position to the position where "endstream" or "endobj" was found
 * @throws IOException If there is an error reading from the source.
 */
private long findStreamLength() throws IOException
{
    final long dataStart = position();
    try
    {
        final long measured = doFindStreamLength(dataStart);
        return measured;
    }
    finally
    {
        // the search consumes the source, always go back where we started
        position(dataStart);
    }
}
/** Keywords marking the end of the stream data, compiled once since the pattern never changes. */
private static final Pattern END_OF_STREAM_DATA = Pattern.compile("endstream|endobj");

/**
 * Reads the source line by line from the current position until a line containing "endstream" or "endobj" is
 * found, then computes the length of the data between the given start offset and the keyword. An end of line (CR,
 * LF or CR LF) right before the keyword is not counted as part of the data. The current position is NOT restored
 * by this method.
 * <p>
 * NOTE(review): the loop only ends when a keyword is found or when readLine() fails, presumably at the end of the
 * source; confirm against the SourceReader implementation.
 *
 * @param start the offset where the stream data starts
 * @return the length of the stream data, never negative
 * @throws IOException If there is an error reading from the source.
 */
private long doFindStreamLength(long start) throws IOException
{
    while (true)
    {
        long currentPosition = position();
        String currentLine = readLine();
        Matcher matcher = END_OF_STREAM_DATA.matcher(currentLine);
        if (matcher.find())
        {
            position(currentPosition + matcher.start());
            long length = position() - start;
            if (length <= 0)
            {
                // the keyword is right at the start: there is no data and no end of line of ours to strip
                return 0;
            }
            int prevChar = source().back().peek();
            if (isCarriageReturn(prevChar))
            {
                return length - 1;
            }
            if (isLineFeed(prevChar))
            {
                // look for the CR of a CR LF pair only if that byte still belongs to the stream data
                if (length > 1)
                {
                    prevChar = source().back().peek();
                    if (isCarriageReturn(prevChar))
                    {
                        return length - 2;
                    }
                }
                return length - 1;
            }
            return length;
        }
    }
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy