edu.harvard.hul.ois.jhove.module.pdf.Literal Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of pdf-hul Show documentation
Show all versions of pdf-hul Show documentation
PDF module developed by Harvard University Library
/**********************************************************************
* Jhove - JSTOR/Harvard Object Validation Environment
* Copyright 2003 by JSTOR and the President and Fellows of Harvard College
**********************************************************************/
package edu.harvard.hul.ois.jhove.module.pdf;
import java.util.*;
import java.text.*;
import java.io.EOFException;
import java.io.IOException;
/**
* Class for Tokens which represent PDF strings. The class maintains
* a field for determining whether the string is encoded as PDF encoding
* or UTF-16. This is determined in the course of analyzing the
* characters for the token.
*/
public class Literal
extends StringValuedToken
{
/** True if literal is in PDFDocEncoding; false if UTF-16.
    Read and written through isPDFDocEncoding / setPDFDocEncoding. */
private boolean _pdfDocEncoding;
/** Accumulates the decoded characters of a parenthesis-delimited
    literal; processLiteral copies its contents into the token value
    when the closing parenthesis is reached. */
private StringBuffer buffer;
/** Indicates if a character for the first half of a hex byte
    has already been buffered by appendHex and is waiting for its
    second half. */
private boolean haveHi;
/** The high half-byte character (the raw hex digit character code,
    not its numeric value). Meaningful only while haveHi is true. */
private int hi;
/** First byte of a UTF-16 character.
    NOTE(review): not referenced by any code visible in this class. */
int firstByte;
/* First digit of a hexadecimal string value (declaration disabled). */
//int h1;
/** The state of the tokenization. Only the subset of States which
    pertain to Literals are used here. Reset to State.LITERAL at the
    start of every processLiteral call. */
private State _state;
/** True if no discrepancies with PDF/A requirements have been found,
    false if there is a discrepancy in this literal.
    readBackslashSequence clears it when an octal escape has fewer
    than three digits. */
private boolean _pdfACompliant;
/** Depth of parenthesis nesting: the number of unescaped open
    parentheses seen inside the literal and not yet closed. */
private int _parenLevel;
/**
 * Mapping between PDFDocEncoding and Unicode code points.
 * The array is indexed by the PDFDocEncoding byte value (0-255) and
 * yields the corresponding Unicode character.
 *
 * Rows are eight entries wide, so row k starts at byte value 8*k.
 * Byte values that PDFDocEncoding leaves undefined (for example 0x9F
 * and 0xAD) are mapped to the Unicode character with the same number.
 */
public static char [] PDFDOCENCODING = {
    '\u0000','\u0001','\u0002','\u0003','\u0004','\u0005','\u0006','\u0007',
    '\b'    ,'\t'    ,'\n'    ,'\u000b','\f'    ,'\r'    ,'\u000e','\u000f',
    '\u0010','\u0011','\u0012','\u0013','\u0014','\u0015','\u0016','\u0017',
    // 0x18-0x1F: breve, caron, circumflex, dot accent, double acute,
    // ogonek, ring, small tilde
    '\u02d8','\u02c7','\u02c6','\u02d9','\u02dd','\u02db','\u02da','\u02dc',
    '\u0020','\u0021','\"'    ,'\u0023','\u0024','\u0025','\u0026','\'',
    '\u0028','\u0029','\u002a','\u002b','\u002c','\u002d','\u002e','\u002f',
    '\u0030','\u0031','\u0032','\u0033','\u0034','\u0035','\u0036','\u0037',
    '\u0038','\u0039','\u003a','\u003b','\u003c','\u003d','\u003e','\u003f',
    '\u0040','\u0041','\u0042','\u0043','\u0044','\u0045','\u0046','\u0047',
    '\u0048','\u0049','\u004a','\u004b','\u004c','\u004d','\u004e','\u004f',
    '\u0050','\u0051','\u0052','\u0053','\u0054','\u0055','\u0056','\u0057',
    '\u0058','\u0059','\u005a','\u005b','\\'    ,'\u005d','\u005e','\u005f',
    '\u0060','\u0061','\u0062','\u0063','\u0064','\u0065','\u0066','\u0067',
    '\u0068','\u0069','\u006a','\u006b','\u006c','\u006d','\u006e','\u006f',
    '\u0070','\u0071','\u0072','\u0073','\u0074','\u0075','\u0076','\u0077',
    '\u0078','\u0079','\u007a','\u007b','\u007c','\u007d','\u007e','\u007f',
    // 0x80-0x87: bullet, dagger, double dagger, ellipsis, EM DASH (U+2014),
    // EN DASH (U+2013), florin, fraction slash.  The dashes were formerly
    // mapped to the em/en SPACE code points, which is not what the PDF
    // specification's PDFDocEncoding table defines.
    '\u2022','\u2020','\u2021','\u2026','\u2014','\u2013','\u0192','\u2044',
    '\u2039','\u203a','\u2212','\u2030','\u201e','\u201c','\u201d','\u2018',
    '\u2019','\u201a','\u2122','\ufb01','\ufb02','\u0141','\u0152','\u0160',
    '\u0178','\u017d','\u0131','\u0142','\u0153','\u0161','\u017e','\u009f',
    // 0xA0 is the Euro sign in PDFDocEncoding (not a no-break space).
    '\u20ac','\u00a1','\u00a2','\u00a3','\u00a4','\u00a5','\u00a6','\u00a7',
    '\u00a8','\u00a9','\u00aa','\u00ab','\u00ac','\u00ad','\u00ae','\u00af',
    '\u00b0','\u00b1','\u00b2','\u00b3','\u00b4','\u00b5','\u00b6','\u00b7',
    '\u00b8','\u00b9','\u00ba','\u00bb','\u00bc','\u00bd','\u00be','\u00bf',
    '\u00c0','\u00c1','\u00c2','\u00c3','\u00c4','\u00c5','\u00c6','\u00c7',
    '\u00c8','\u00c9','\u00ca','\u00cb','\u00cc','\u00cd','\u00ce','\u00cf',
    '\u00d0','\u00d1','\u00d2','\u00d3','\u00d4','\u00d5','\u00d6','\u00d7',
    '\u00d8','\u00d9','\u00da','\u00db','\u00dc','\u00dd','\u00de','\u00df',
    '\u00e0','\u00e1','\u00e2','\u00e3','\u00e4','\u00e5','\u00e6','\u00e7',
    // 0xEE is i-circumflex (U+00EE); it used to duplicate 0xEF.
    '\u00e8','\u00e9','\u00ea','\u00eb','\u00ec','\u00ed','\u00ee','\u00ef',
    '\u00f0','\u00f1','\u00f2','\u00f3','\u00f4','\u00f5','\u00f6','\u00f7',
    '\u00f8','\u00f9','\u00fa','\u00fb','\u00fc','\u00fd','\u00fe','\u00ff'
};
/* Byte values that receive special treatment while scanning a literal. */
/* Carriage return; result of the \r escape. */
private static final int CR = 0x0D;
/* Line feed; result of the \n escape. */
private static final int LF = 0x0A;
/* Backspace; result of the \b escape. */
private static final int BS = 0x08;
/* Horizontal tab; result of the \t escape. */
private static final int HT = 0x09;
/* Form feed; result of the \f escape. */
private static final int FORMFEED = 0x0C;
/* Escape; in a UTF-16 string it brackets a language code. */
private static final int ESC = 0X1B;
/* '(' -- opens a nested level inside a literal. */
private static final int OPEN_PARENTHESIS = 0x28;
/* ')' -- closes a nested level or terminates the literal. */
private static final int CLOSE_PARENTHESIS = 0x29;
/* '\' -- introduces an escape sequence. */
private static final int BACKSLASH = 0x5C;
/* FE followed by FF at the very start of a string marks it as UTF-16. */
private static final int FE = 0xFE;
private static final int FF = 0xFF;
/**
 * Creates an instance of a string literal.
 * The new literal is assumed to be PDFDocEncoded and PDF/A compliant
 * until the characters fed to it show otherwise.
 */
public Literal ()
{
    super ();
    _pdfDocEncoding = true;
    // Start out compliant: the flag is only ever cleared when a
    // violation is seen, so leaving it at the Java default (false)
    // would make isPDFACompliant() report a violation for every literal.
    _pdfACompliant = true;
    buffer = new StringBuffer ();
    haveHi = false;
}
/**
 * Feeds one hex digit of a hex literal (a literal that starts with
 * '<') to this token. Digits are consumed in pairs: the first digit
 * of a pair is remembered, and the second one completes a byte which
 * is appended to the raw byte list.
 *
 * @param ch The integer 8-bit code for a hex character
 * @throws edu.harvard.hul.ois.jhove.module.pdf.PdfException
 *         if the completed pair contains a character that is not a
 *         hex digit
 */
public void appendHex (int ch) throws PdfException
{
    if (_rawBytes == null) {
        _rawBytes = new Vector<> (32);
    }
    if (!haveHi) {
        // First digit of a pair: hold on to it until its partner arrives.
        hi = ch;
        haveHi = true;
        return;
    }
    // Second digit: combine with the pending one into a byte.
    _rawBytes.add (hexToInt (hi, ch));
    haveHi = false;
}
/**
 * Process the incoming characters into a string literal. This is used
 * for literals delimited by parentheses, as opposed to hex strings.
 *
 * The method reads characters until the parenthesis that terminates
 * the literal. While doing so it decides whether the string is
 * PDFDocEncoded or UTF-16 (a string is UTF-16 only if its first two
 * bytes, literal or escaped, are FE FF), stores the decoded text as
 * the token value, and records the bytes it saw in _rawBytes.
 *
 * @param tok The tokenizer, passed to give access to its getChar
 *            function.
 * @return the number of characters this method read directly with
 *         tok.readChar(), including the terminating parenthesis.
 *         Characters consumed while decoding an escape sequence or a
 *         language code are not counted.
 * @throws IOException if reading fails; an EOFException is thrown if
 *         the input ends before the literal is terminated
 */
public long processLiteral (Tokenizer tok) throws IOException
{
    /* Assembled UTF-16 code unit. */
    int utfch;
    /* First (high-order) byte of the UTF-16 code unit being assembled. */
    int b1 = 0x00;
    /* Character read from tokenizer. */
    int ch;
    _parenLevel = 0;
    _rawBytes = new Vector<> (32);
    _state = State.LITERAL;
    long offset = 0;
    for (;;) {
        ch = tok.readChar ();
        // If we get -1, then we've hit an EOF without proper termination of
        // the literal. Throw an exception.
        if (ch < 0) {
            throw new EOFException (MessageConstants.PDF_HUL_10.getMessage()); // PDF-HUL-10
        }
        offset++;
        if (_state == State.LITERAL) {
            // Nothing stored yet: we are still determining the encoding.
            if (ch == FE) {
                _state = State.LITERAL_FE;
            }
            else if (ch == CLOSE_PARENTHESIS && --_parenLevel < 0) {
                // We have an empty string
                setPDFDocEncoding (true);
                setValue(buffer.toString());
                return offset;
            }
            else if (ch == BACKSLASH) {
                ch = readBackslashSequence (false, tok);
                switch (ch) {
                    case 0:
                        continue; // invalid character, ignore
                    case FE:
                        _state = State.LITERAL_FE;
                        break;
                    default:
                        // Any other escaped character is ordinary text,
                        // which settles the encoding question: a byte
                        // order mark can only be the first two bytes.
                        // Without leaving the LITERAL state here, a later
                        // FE FF pair would wrongly switch to UTF-16.
                        _state = State.LITERAL_PDF;
                        setPDFDocEncoding (true);
                        buffer.append (PDFDOCENCODING[ch]);
                        break;
                }
            }
            else {
                // We now know we're in 8-bit PDF encoding.
                // Append the character to the buffer.
                if (ch == OPEN_PARENTHESIS) {
                    // Count open parens to be matched by close parens.
                    // Backslash-quoted parens won't get here.
                    ++_parenLevel;
                }
                _state = State.LITERAL_PDF;
                setPDFDocEncoding (true);
                buffer.append (PDFDOCENCODING[ch]);
            }
        }
        else if (_state == (State.LITERAL_FE)) {
            // The first byte was FE; FF next means a UTF-16 byte order mark.
            switch (ch) {
                case FF:
                    _state = State.LITERAL_UTF16_1;
                    setPDFDocEncoding (false);
                    break;
                case BACKSLASH:
                    ch = readBackslashSequence (false, tok);
                    if (ch == 0) {
                        continue; // invalid character, ignore
                    }
                    if (ch == FF) {
                        _state = State.LITERAL_UTF16_1;
                        setPDFDocEncoding (false);
                    }
                    else {
                        // any other char is treated nonspecially.
                        // The FE has now been accounted for, so move on
                        // to plain PDF text; staying in LITERAL_FE would
                        // emit a second, spurious FE for the next byte.
                        _state = State.LITERAL_PDF;
                        setPDFDocEncoding (true);
                        // The FE is just an FE, put it in the buffer
                        buffer.append (PDFDOCENCODING[FE]);
                        buffer.append (PDFDOCENCODING[ch]);
                    }
                    break;
                default:
                    _state = State.LITERAL_PDF;
                    setPDFDocEncoding (true);
                    // The FE is just an FE, put it in the buffer
                    buffer.append (PDFDOCENCODING[FE]);
                    if (ch == CLOSE_PARENTHESIS) {
                        // No parenthesis can be open yet, so this one
                        // terminates a literal consisting of a lone FE.
                        setValue(buffer.toString());
                        return offset;
                    }
                    if (ch == OPEN_PARENTHESIS) {
                        // Must be balanced by a later close paren.
                        ++_parenLevel;
                    }
                    buffer.append (PDFDOCENCODING[ch]);
                    break;
            }
        }
        else if (_state == (State.LITERAL_PDF)) {
            if (ch == OPEN_PARENTHESIS) {
                // Count open parens to be matched by close parens.
                // Backslash-quoted parens won't get here.
                ++_parenLevel;
                buffer.append (PDFDOCENCODING[ch]);
            }
            else if (ch == CLOSE_PARENTHESIS && --_parenLevel < 0) {
                setValue(buffer.toString());
                return offset;
            }
            else if (ch == BACKSLASH) {
                ch = readBackslashSequence (false, tok);
                if (ch == 0) {
                    continue; // invalid character, ignore
                }
                // any other char is treated nonspecially
                buffer.append (PDFDOCENCODING[ch]);
            }
            else {
                buffer.append (PDFDOCENCODING[ch]);
            }
        }
        else if (_state == (State.LITERAL_UTF16_1)) {
            // First byte of a UTF16 character. But a close
            // paren or backslash is a single-byte character.
            // Parens within the string are double-byte characters,
            // so we don't have to worry about them.
            switch (ch) {
                case CLOSE_PARENTHESIS:
                    setValue(buffer.toString());
                    return offset;
                case BACKSLASH:
                    // NOTE(review): the decoded value is only tested for
                    // validity here; it is not added to the buffer.
                    utfch = readBackslashSequence (true, tok);
                    if (utfch == 0) {
                        continue; // invalid character, ignore
                    }
                    break;
                default:
                    _state = State.LITERAL_UTF16_2;
                    b1 = ch;
                    break;
            }
        }
        else if (_state == (State.LITERAL_UTF16_2)) {
            // Second byte of a UTF16 character.
            /* It turns out that a backslash may be double-byte,
             * rather than the assumed single.byte. The following
             * allows for this. Suggested by Justin Litman, Library
             * of Congress, 2006-03-17.
             */
            if (ch == BACKSLASH) {
                ch = readBackslashSequence (false, tok);
                if (ch == 0) {
                    _state = State.LITERAL_UTF16_2; // skip the wrong char and reset to previous state
                    continue; /* Invalid character, ignore. */
                }
            }
            utfch = 256 * b1 + ch;
            _state = State.LITERAL_UTF16_1;
            // an ESC may appear at any point to signify
            // a language code. Remove the language code
            // from the stream and save it in a list of codes.
            if (utfch == ESC) {
                readUTFLanguageCode (tok);
            }
            else {
                buffer.append ((char) utfch);
            }
        }
        // Reached for every character that was neither a terminator nor
        // an ignored escape; for an escape, ch is the decoded value.
        _rawBytes.add (ch);
    }
}
/**
 * Convert the raw hex data. Two buffers are saved: _rawBytes
 * for the untranslated hex-encoded data, and _value for the
 * PDF or UTF encoded string.
 *
 * If the bytes start with FE FF the remainder is combined pairwise
 * into UTF-16 code units; otherwise every byte is mapped through the
 * tokenizer's PDFDocEncoding table. The encoding flag reported by
 * isPDFDocEncoding() is updated to match. Does nothing if no hex
 * digit has been appended yet.
 *
 * @throws edu.harvard.hul.ois.jhove.module.pdf.PdfException
 *         if a left-over single hex digit is not a valid hex character
 */
public void convertHex () throws PdfException
{
    if (_rawBytes == null) {
        return;
    }
    // If a high byte is left hanging, complete it with a '0'
    if (haveHi) {
        _rawBytes.add (hexToInt (hi, '0'));
        // The pending digit has been consumed; without this a second
        // call would append the padded byte again.
        haveHi = false;
    }
    boolean utf = _rawBytes.size () >= 2
            && rawByte (0) == FE
            && rawByte (1) == FF;
    // Keep the encoding flag in step with what is actually decoded,
    // as processLiteral does for parenthesised literals.
    setPDFDocEncoding (!utf);

    StringBuilder localBuffer = new StringBuilder (_rawBytes.size ());
    if (utf) {
        // Gather pairs of bytes into characters without conversion,
        // skipping the two-byte order mark. rawByte supplies 0 for a
        // missing final low byte.
        for (int i = 2; i < _rawBytes.size (); i += 2) {
            localBuffer.append
                ((char) (rawByte (i) * 256 + rawByte (i + 1)));
        }
    }
    else {
        // Convert single bytes to PDF encoded characters.
        for (int i = 0; i < _rawBytes.size (); i++) {
            localBuffer.append (Tokenizer.PDFDOCENCODING[rawByte (i)]);
        }
    }
    _value = localBuffer.toString ();
}
/* Combines two hex digit characters into the byte they spell:
   c1 supplies the high four bits, c2 the low four bits. */
private static int hexToInt (int c1, int c2) throws PdfException
{
    final int highNibble = hexValue (c1);
    final int lowNibble = hexValue (c2);
    return (highNibble << 4) + lowNibble;
}
/* Returns the numeric value (0-15) of a single hex digit character,
   accepting 0-9, A-F and a-f. Any other character makes the
   literal malformed. */
private static int hexValue (int h) throws PdfException
{
    if (h >= '0' && h <= '9') {
        return h - '0';
    }
    if (h >= 'A' && h <= 'F') {
        return h - 'A' + 10;
    }
    if (h >= 'a' && h <= 'f') {
        return h - 'a' + 10;
    }
    throw new PdfMalformedException (MessageConstants.PDF_HUL_11); // PDF-HUL-11
}
/* Fetches the byte at position idx of _rawBytes. A position past the
   end yields 0, so callers may read one byte beyond a short buffer
   without a bounds check. */
private int rawByte (int idx)
{
    return (idx < _rawBytes.size ()) ? _rawBytes.elementAt (idx) : 0;
}
/**
 * Reports the encoding recorded for this string.
 *
 * @return true if the string is in PDFDocEncoding,
 *         false if it is UTF-16
 */
public boolean isPDFDocEncoding ()
{
    return this._pdfDocEncoding;
}
/**
 * Records the encoding of this string.
 *
 * @param pdfDocEncoding true if the string is in PDFDocEncoding,
 *                       false if it is UTF-16
 */
public void setPDFDocEncoding (boolean pdfDocEncoding)
{
    this._pdfDocEncoding = pdfDocEncoding;
}
/**
 * Tells whether the string value can be parsed as a date, i.e.
 * whether parseDate() produces a result for it.
 * The expected form is the ASN.1 date format D:YYYYMMDDHHmmSSOHH'mm'
 * where everything before and after YYYY is optional.
 * If we take this literally, the format is frighteningly ambiguous
 * (imagine, for instance, leaving out hours but not minutes and
 * seconds), so the checking is a bit loose.
 *
 * @return true if the string value is a parsable date
 */
public boolean isDate ()
{
    Date parsed = parseDate ();
    return parsed != null;
}
/**
 * Parse the string value to a date. PDF dates conform to
 * the ASN.1 date format. This consists of
 * D:YYYYMMDDHHmmSSOHH'mm'
 * where everything before and after YYYY is optional.
 * Adobe doesn't actually say so, but I'm assuming that if a
 * field is included, everything to its left must be included,
 * e.g., you can't have seconds but leave out minutes.
 *
 * Omitted fields take their PDF defaults: month and day 01, time
 * 00:00:00. If no time zone is given the JVM's default zone is used.
 * Every numeric field must consist of decimal digits and lie within
 * its natural range (month 01-12, day 01-31, hour 00-23, minute and
 * second 00-59); the day is not checked against the month's length.
 * Both time zone fields must be followed by an apostrophe. Text after
 * a complete date is ignored.
 *
 * @return the date denoted by the string value, with the millisecond
 *         part set to zero, or null if the value is missing or is not
 *         a well-formed date
 */
public Date parseDate ()
{
    int year = 0;
    // PDF defines 01 as the value of an omitted month or day. Starting
    // from 0 would make a year-only date roll back into the previous year.
    int month = 1;
    int day = 1;
    int hour = 0;
    int minute = 0;
    int second = 0;
    char timezonechar = '?'; // +, -, or Z once a zone has been seen
    // Magnitudes only; the direction of the offset stays in timezonechar.
    int timezonehour = 0;
    int timezoneminute = 0;

    String str = getValue ();
    if (str == null) {
        return null;
    }
    str = str.trim ();
    if (str.length () < 4) {
        return null;
    }
    int datestate = 0;
    int charidx = 0;
    wloop:
    while (charidx < str.length ()) {
        // We parse the date using a simple state machine,
        // with a state for each date component.
        switch (datestate) {
            case 0: // starting state, may start with "D:"
                if (str.startsWith ("D:")) {
                    charidx += 2;
                }
                datestate = 1; // advance regardless
                break;
            case 1: // expecting year
                year = parseDateField (str, charidx, 4, 0, 9999);
                if (year < 0) {
                    return null;
                }
                charidx += 4;
                datestate = 2;
                break;
            case 2: // expecting month (01-12)
                month = parseDateField (str, charidx, 2, 1, 12);
                if (month < 0) {
                    return null;
                }
                charidx += 2;
                datestate = 3;
                break;
            case 3: // expecting day of month (01-31)
                day = parseDateField (str, charidx, 2, 1, 31);
                if (day < 0) {
                    return null;
                }
                charidx += 2;
                datestate = 4;
                break;
            case 4: // expecting hour (00-23)
                hour = parseDateField (str, charidx, 2, 0, 23);
                if (hour < 0) {
                    return null;
                }
                charidx += 2;
                datestate = 5;
                break;
            case 5: // expecting minute (00-59)
                minute = parseDateField (str, charidx, 2, 0, 59);
                if (minute < 0) {
                    return null;
                }
                charidx += 2;
                datestate = 6;
                break;
            case 6: // expecting second (00-59)
                second = parseDateField (str, charidx, 2, 0, 59);
                if (second < 0) {
                    return null;
                }
                charidx += 2;
                datestate = 7;
                break;
            case 7: // expecting time zone ('+', '-', or 'Z')
                timezonechar = str.charAt (charidx);
                if (timezonechar != 'Z' && timezonechar != '+' &&
                        timezonechar != '-') {
                    return null;
                }
                charidx++;
                datestate = 8;
                break;
            case 8: // expecting time zone hour.
                // ignore if timezonechar is 'Z'
                if (timezonechar != 'Z') {
                    timezonehour = parseDateField (str, charidx, 2, 0, 23);
                    // Time zone hour must have trailing quote
                    if (timezonehour < 0
                            || !hasApostropheAt (str, charidx + 2)) {
                        return null;
                    }
                    charidx += 3;
                }
                datestate = 9;
                break;
            case 9: // expecting time zone minute, followed by a quote
                // ignore if timezonechar is 'Z'
                if (timezonechar != 'Z') {
                    // charidx is on the first minute digit here (the
                    // hour's quote was skipped in state 8).
                    timezoneminute =
                        parseDateField (str, charidx, 2, 0, 59);
                    // Time zone minute must have trailing quote
                    if (timezoneminute < 0
                            || !hasApostropheAt (str, charidx + 2)) {
                        return null;
                    }
                }
                break wloop;
        }
    }
    if (datestate < 2) {
        return null; // not enough fields
    }

    // Pick the zone the fields are expressed in.
    TimeZone tz;
    if (timezonechar == '?') {
        // time zone is unspecified
        tz = TimeZone.getDefault ();
    }
    else if (timezonechar == 'Z') {
        tz = TimeZone.getTimeZone ("GMT+0000");
    }
    else {
        // Custom zone IDs have the form GMT(+|-)hhmm. The sign appears
        // once, in front, and the digits must be ASCII regardless of
        // the default locale; a malformed ID silently means GMT.
        tz = TimeZone.getTimeZone (String.format (Locale.ROOT,
                "GMT%c%02d%02d",
                timezonechar, timezonehour, timezoneminute));
    }
    // PDF dates are Gregorian whatever the default locale's calendar is.
    Calendar cal = new GregorianCalendar (tz);
    // Drop the current clock's fields (notably milliseconds) so that
    // the result depends on the parsed text only.
    cal.clear ();
    // Note that Java months are 0-based.
    cal.set (year, month - 1, day, hour, minute, second);
    return cal.getTime ();
}

/* Reads a fixed-width decimal field of a date string. Returns its
   value, or -1 if the field runs past the end of the string, contains
   anything but the digits 0-9, or lies outside [min, max]. */
private static int parseDateField (String str, int start, int len,
                                   int min, int max)
{
    if (start + len > str.length ()) {
        return -1;
    }
    int value = 0;
    for (int i = start; i < start + len; i++) {
        char c = str.charAt (i);
        if (c < '0' || c > '9') {
            return -1;
        }
        value = value * 10 + (c - '0');
    }
    return (value < min || value > max) ? -1 : value;
}

/* Tells whether the character at idx exists and is an apostrophe. */
private static boolean hasApostropheAt (String str, int idx)
{
    return idx < str.length () && str.charAt (idx) == '\'';
}
/**
 * Reports the PDF/A compliance flag kept for this token.
 *
 * @return true if this token doesn't violate any PDF/A rules,
 *         false if it does
 */
public boolean isPDFACompliant ()
{
    return this._pdfACompliant;
}
/* private void beginBackslashState ()
{
octalBufLen = 0;
backslashFlag = true;
}
*/
/**
 * After a backslash, read characters into an escape sequence and
 * return the byte value it denotes.
 *
 * Recognized sequences are one to three octal digits and the single
 * characters n, r, t, b, f, '(', ')' and backslash. An octal sequence
 * of fewer than three digits is accepted, but marks the literal as
 * not PDF/A compliant, and the character that ended it is pushed back
 * to the tokenizer.
 *
 * @param utf16 passed through to the tokenizer's readChar1 to select
 *              how characters are read
 * @param tok   the tokenizer to read from
 * @return the decoded value in the range 0-255, or 0 if the sequence
 *         is not a valid escape. Note that a sequence which decodes
 *         to 0 is therefore indistinguishable from an invalid one.
 * @throws IOException if the tokenizer fails to read
 */
private int readBackslashSequence (boolean utf16, Tokenizer tok)
        throws IOException
{
    int ch = tok.readChar1 (utf16);
    if (ch >= 0X30 && ch <= 0X37) {
        int num = ch - 0X30;
        // Read octal sequence. We may get 1, 2, or 3 characters.
        // If we get a non-numeric character, we're done and we
        // put it back.
        for (int i = 0; i < 2; i++) {
            int ch1 = tok.readChar1 (utf16);
            if (ch1 >= 0X30 && ch1 <= 0X37) {
                num = num * 8 + (ch1 - 0X30);
            }
            else {
                tok.backupChar ();
                _pdfACompliant = false; // octal sequences must be 3 chars in PDF/A
                // At most two digits were read, so num cannot exceed 077.
                return num;
            }
        }
        // Three octal digits can spell values up to 0777, but a string
        // element is a single byte: the PDF specification says the
        // high-order overflow is ignored. Callers index 256-entry
        // tables with the result, so it must not exceed 255.
        return num & 0xFF;
    }
    switch (ch) {
        case 0X6E: // n
            return LF;
        case 0X72: // r
            return CR;
        case 0xd: // this is an error for CR
            return 0;
        case 0X74: // t
            return HT;
        case 0X62: // b
            return BS;
        case 0X66: // f
            return FORMFEED;
        case OPEN_PARENTHESIS:
            return OPEN_PARENTHESIS;
        case CLOSE_PARENTHESIS:
            return CLOSE_PARENTHESIS;
        case BACKSLASH:
            return BACKSLASH;
        default:
            // Not an escape we know: report it as invalid.
            return 0;
    }
}
/**
 * Called right after an ESC has been read in a UTF string.
 * Collects every character up to, but not including, the next ESC
 * and hands the collected text to the tokenizer as a language code.
 * The closing ESC is consumed.
 *
 * @param tok the tokenizer to read from and to report the code to
 * @throws IOException if reading fails; an EOFException is thrown if
 *         the input ends before the closing ESC
 */
private static void readUTFLanguageCode (Tokenizer tok) throws IOException
{
    StringBuilder code = new StringBuilder ();
    int next = tok.readChar1 (true);
    while (next != ESC) {
        // A negative value means EOF was hit without proper
        // termination of the literal.
        if (next < 0) {
            throw new EOFException (MessageConstants.PDF_HUL_10.getMessage()); // PDF-HUL-10
        }
        code.append ((char) next);
        next = tok.readChar1 (true);
    }
    tok.addLanguageCode (code.toString ());
}
/** If we're in the backslash substate (backslashFlag = true), then call
this to process characters. It will accumulate octal digits into
octalBuf and process other escaped characters. If the accumulation
produces a character, it will return that character code, otherwise
it will return 0 to indicate no character is available yet.
Although the backslash itself is a byte, even in a 16-bit
string, the characters which follow it are 16-bit characters,
not bytes. So we call this only after applying UTF-16 encoding
where applicable.
*/
/* DEPRECATED for the current millisecond */
/* private int backslashProcess (int ch)
{
if (ch >= 0X30 && ch <= 0X37) {
int num = ch - 0X30;
// An octal sequence may have 1, 2, or 3 characters.
// If we get a non-numeric character, we're done and
// return the character, and put the character we
// just received into a holding buffer.
octalBuf[octalBufLen++] = num;
if (octalBufLen == 3) {
return octalBufValue ();
}
for (int i = 0; i < 2; i++) {
int ch1 = readChar1 (utf16);
if (ch1 >= 0X30 && ch1 <= 0X37) {
num = num * 8 + (ch1 - 0X30);
}
else {
holdChar = ch;
_pdfACompliant = false; // octal sequences must be 3 chars in PDF/A
return num;
}
}
return num;
}
// If no octal characters have been seen yet, look for an
// escaped character.
if (octalBufLen == 0) {
switch (ch) {
case 0X6E: // n
return LF;
case 0X72: // r
return CR;
case 0X74: // t
return HT;
case 0X68: // h
return BS;
case 0X66: // f
return FORMFEED;
case OPEN_PARENTHESIS:
return OPEN_PARENTHESIS;
case CLOSE_PARENTHESIS:
return CLOSE_PARENTHESIS;
case BACKSLASH:
return BACKSLASH;
default:
// illegal escape -- dump the character
return 0;
}
else {
// We have one or two buffered octal characters,
// but this isn't one. Put the current character
// in a holding buffer, and return the octal value.
holdCh = ch;
return octalBufValue ();
}
}
}
*/
}