org.apache.pdfbox.util.DateConverter Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of pdfbox Show documentation
The Apache PDFBox library is an open source Java tool for working with PDF documents.
There is a newer version: 3.0.2
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.pdfbox.util;

import java.io.IOException;
import java.text.ParsePosition;
import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.Date;
import java.util.GregorianCalendar;
import java.util.Locale;
import java.util.SimpleTimeZone;
import java.util.TimeZone;

import org.apache.pdfbox.cos.COSString;

/**
 * Date format is described in PDF Reference 1.7 section 3.8.2
 * (www.adobe.com/devnet/acrobat/pdfs/pdf_reference_1-7.pdf)
 * and also in PDF 32000-1:2008 
 * (http://www.adobe.com/devnet/acrobat/pdfs/PDF32000_2008.pdf),
 * although the latter inexplicably omits the trailing apostrophe.
 * 
 * The interpretation of dates without timezones is unclear. 
 * The code below assumes that such dates are in UTC+00 (aka GMT).
 * This is in keeping with the PDF Reference's assertion that:
 *      numerical fields default to zero values. 
 * However, the Reference does go on to make the cryptic remark:
 *      If no UT information is specified, the relationship of the specified  
 *      time to UT is considered to be unknown. Whether or not the time 
 *      zone is known, the rest of the date should be specified in local time.
 * I understand this to refer to _creating_ a pdf date value. That is, 
 * code that can get the wall clock time and cannot get the timezone 
 * should write the wall clock time with a time zone of zero.
 * When _parsing_ a PDF date, the statement talks about "the rest of the date"
 * being local time, thus explicitly excluding the use of the local time
 * for the time zone.
*/ 

/**
 * This class is used to convert dates to strings and back using the PDF
 * date standard in section 3.8.2 of PDF Reference 1.7.  
 *
 * @author Ben Litchfield
 * @author Fred Hansen
 * 
 * TODO Move members of this class elsewhere for shared use in pdfbox, xmpbox, and jempbox.
 */
public class DateConverter
{
    // milliseconds/1000 = seconds; seconds / 60 = minutes; minutes/60 = hours
    private static final int MINUTES_PER_HOUR = 60;
    private static final int SECONDS_PER_MINUTE = 60;
    // one minute and one hour, expressed in milliseconds
    private static final int MILLIS_PER_MINUTE = 1000 * SECONDS_PER_MINUTE;
    private static final int MILLIS_PER_HOUR = MILLIS_PER_MINUTE * MINUTES_PER_HOUR;
    // twelve and twenty-four hours in milliseconds; used when wrapping time zone offsets
    private static final int HALF_DAY = 12 * MILLIS_PER_HOUR;
    private static final int DAY = 2 * HALF_DAY;
    
    /**
     * Error value if date is invalid. Parsing is done with 
     * GregorianCalendar.setLenient(false), so every date field value
     * must be within bounds. If an attempt is made to parse an invalid date 
     * field, toCalendar(String, String[]) returns Jan 1 in year INVALID_YEAR.
     */
    public static final int INVALID_YEAR = 999;
    
    
    /**
     * The Date format is supposed to be the PDF_DATE_FORMAT, but other
     * forms appear. These lists offer alternatives to be tried 
     * if parseBigEndianDate fails.  
     * 
     * The time zone offset generally trails the date string, so it is processed
     * separately with parseTZoffset. (This does not preclude having time
     * zones in the elements below; one does.)
     * 
     * Alas, SimpleDateFormat is badly non-reentrant -- it modifies its 
     * calendar field (PDFBox-402), so these lists are strings to create
     * SimpleDate format as needed.
     * 
     * Some past entries have been elided because they duplicate existing 
     * entries. See the API for SimpleDateFormat, which says 
     *      "For parsing, the number of pattern letters is ignored 
     *      unless it's needed to separate two adjacent fields."
     * 
     * toCalendar(String, String[]) tests to see that the entire input text
     * has been consumed. Therefore the ordering of formats is important. 
     * If one format begins with the entirety of another, the longer
     * must precede the other in the list.
     * 
     * HH is for 0-23 hours and hh for 1-12 hours; an "a" field must follow "hh"
     * Where year is yy, four digit years are accepted 
     * and two digit years are converted to four digits in the range
     *      [thisyear-79...thisyear+20]
     */
    // SimpleDateFormat patterns that parseDate tries when the text does not
    // start with a digit. Every pattern begins with a day-of-week name (EEEE).
    // Order matters: parseSimpleDate returns the first pattern that parses.
    private static final String[] ALPHA_START_FORMATS = 
    {
            "EEEE, dd MMM yy hh:mm:ss a",
            "EEEE, MMM dd, yy hh:mm:ss a",
            "EEEE, MMM dd, yy 'at' hh:mma", // Acrobat Net Distiller 1.0 for Windows
            "EEEE, MMM dd, yy", // Acrobat Distiller 1.0.2 for Macintosh  && PDFBOX-465
            "EEEE MMM dd, yy HH:mm:ss", // ECMP5
            "EEEE MMM dd HH:mm:ss z yy", // GNU Ghostscript 7.0.7 (the only pattern with a time zone field)
            "EEEE MMM dd HH:mm:ss yy", // GNU Ghostscript 7.0.7 variant
    };
    
    // SimpleDateFormat patterns that parseDate tries when the text starts with
    // a digit (after the big-endian parse has been attempted).
    // Order matters: parseSimpleDate returns the first pattern that parses.
    private static final String[] DIGIT_START_FORMATS = 
    {
        "dd MMM yy HH:mm:ss",  // for 26 May 2000 11:25:00
        "dd MMM yy HH:mm",  // for 26 May 2000 11:25
        "yyyy MMM d",   // ambiguity resolved only by omitting time
        // NOTE(review): in SimpleDateFormat "mm" is minutes and "hh" is the 1-12 hour,
        // so the pattern below never reads a month field -- confirm this is intended.
        "yyyymmddhh:mm:ss", // test case "200712172:2:3"
        "H:m M/d/yy", // test case "9:47 5/12/2008"
        "M/d/yy HH:mm:ss",
        "M/d/yy HH:mm",
        "M/d/yy",

        // proposed rule that is unreachable due to "dd MMM yy HH:mm:ss" 
        //     "yyyy MMM d HH:mm:ss", 

        // rules made unreachable by "M/d/yy HH:mm:ss" "M/d/yy HH:mm"  "M/d/yy",
        // (incoming digit strings do not mark themselves as y, m, or d!)
            // "d/MM/yyyy HH:mm:ss", // PDFBOX-164 and PDFBOX-170 
            // "M/dd/yyyy hh:mm:ss",
            // "MM/d/yyyy hh:mm:ss",
            // "M/d/yyyy HH:mm:ss",
            // "M/dd/yyyy",
            // "MM/d/yyyy",
            // "M/d/yyyy",
            // "M/d/yyyy HH:mm:ss",
            // "M/d/yy HH:mm:ss",
        // subsumed by big-endian parse
            // "yyyy-MM-dd'T'HH:mm:ss",
            // "yyyy-MM-dd'T'HH:mm:ss",
            // "yyyymmdd hh:mm:ss", 
            // "yyyymmdd", 
            // "yyyymmddX''00''",  // covers 24 cases 
            //    (originally the above ended with '+00''00'''; 
            //      the first apostrophe quoted the plus, 
            //      '' mapped to a single ', and the ''' was invalid)
    };


    private DateConverter()
    {
        // Not instantiable: this class only offers static helper methods.
    }

    ////////////////////////////////////////////
    // C o n v e r t   t o   S t r i n g   Methods
     
    /**
     * Get all known formats.
     * 
     * @return an array containing all known formats
     */
    public static String[] getFormats() 
    {
        // A fresh array is built on every call: the alpha-start patterns
        // come first and the digit-start patterns follow.
        final int alphaCount = ALPHA_START_FORMATS.length;
        final int digitCount = DIGIT_START_FORMATS.length;
        String[] combined = new String[alphaCount + digitCount];
        int next = 0;
        for (String pattern : ALPHA_START_FORMATS)
        {
            combined[next++] = pattern;
        }
        for (String pattern : DIGIT_START_FORMATS)
        {
            combined[next++] = pattern;
        }
        return combined;
    }

    /**
     * Converts a Calendar to a string formatted as:
     *     D:yyyyMMddHHmmss#hh'mm'  where # is Z, +, or -.
     * 
     * @param cal The date to convert to a string. May be null.
     * The DST_OFFSET is included when computing the output time zone.
     *
     * @return The date as a String to be used in a PDF document, 
     *      or null if the cal value is null
     */
    public static String toString(Calendar cal)
    {
        if (cal != null)
        {
            // zone offset and daylight saving offset together give the wall-clock offset
            int totalOffset = cal.get(Calendar.ZONE_OFFSET) + cal.get(Calendar.DST_OFFSET);
            String zone = formatTZoffset(totalOffset, "'");
            // "D:" + yyyyMMdd + HHmmss + offset (#hh'mm) + trailing apostrophe
            String pattern = "D:%1$4tY%1$2tm%1$2td%1$2tH%1$2tM%1$2tS%2$s'";
            return String.format(Locale.US, pattern, cal, zone);
        }
        return null;
    }

    /**
     * Converts the date to ISO 8601 string format:
     *     yyyy-mm-ddThh:MM:ss#hh:mm    (where '#" is '+' or '-').
     *
     * @param cal The date to convert.  Must not be null.
     * The DST_OFFSET is included in the output value.
     * 
     * @return The date represented as an ISO 8601 string.
     */
    public static String toISO8601(Calendar cal)
    {
        // zone offset and daylight saving offset together give the wall-clock offset
        int totalOffset = cal.get(Calendar.ZONE_OFFSET) + cal.get(Calendar.DST_OFFSET);
        String zone = formatTZoffset(totalOffset, ":");
        // yyyy-mm-dd (the %tm conversion prints the month as 01..12),
        // then 'T', HH:MM:SS and finally the offset as #hh:mm
        String pattern = "%1$4tY-%1$2tm-%1$2tdT%1$2tH:%1$2tM:%1$2tS%2$s";
        return String.format(Locale.US, pattern, cal, zone);
    }
    
    /**
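     * Constrain a timezone offset to a plausible range. An offset within
     * [-14:00 thru +14:00] is returned unchanged; any other value is wrapped
     * into the range [-11:59 thru +12:00].
     * @param proposedOffset A value intended to be a timezone offset, in milliseconds.
     * @return The proposed value if it lies within [-14:00 thru +14:00]; otherwise
     * the value reduced to [-11:59 thru +12:00] by adding or subtracting
     * multiples of a full day.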
     */
    public static int restrainTZoffset(long proposedOffset) 
    {
        // https://www.w3.org/TR/xmlschema-2/#dateTime-timezones
        // Timezones between 14:00 and -14:00 are valid and pass through untouched
        final long limit = 14 * MILLIS_PER_HOUR;
        if (proposedOffset >= -limit && proposedOffset <= limit)
        {
            return (int) proposedOffset;
        }
        // Anything else is wrapped into [-11:59 thru +12:00]:
        // shift by half a day and take the non-negative remainder of a full day
        long shifted = (proposedOffset + HALF_DAY) % DAY;
        if (shifted < 0)
        {
            shifted += DAY;
        }
        // 0 <= shifted < DAY; zero means the offset is exactly half a day off
        if (shifted == 0)
        {
            return HALF_DAY;
        }
        // undo the shift: -HALF_DAY < result < HALF_DAY
        return (int) (shifted - HALF_DAY);
    }
    
    /** 
     * Formats a time zone offset as #hh^mm
     * where # is + or -, hh is hours, ^ is a separator, and mm is minutes.
     * Any separator may be specified by the second argument;
     * the usual values are ":" (ISO 8601), "" (RFC 822), and "'" (PDF).
     * The offset is first normalized with restrainTZoffset: values within
     * -14:00 ... +14:00 are formatted as given, others are wrapped into -11:59 ... +12:00.
     * For offset of 0 millis, the String returned is "+00^00", never "Z".
     * To get a "general" offset in form GMT#hh:mm, write
     *      "GMT"+DateConverter.formatTZoffset(offset, ":");
     * 
     * Take thought in choosing the source for the millis value. 
     * It can come from calendarValue.getTimeZone() or from 
     * calendarValue.get(Calendar.ZONE_OFFSET).  If a TimeZone was created
     * from a valid time zone ID, then it may have a daylight savings rule.
     * (As of July 4, 2013, the data base at http://www.iana.org/time-zones 
     * recognized 629 time zone regions.) But a TimeZone created as 
     *      new SimpleTimeZone(millisOffset, "ID"), 
     * will not have a daylight savings rule. (Not even if there is a
     * known time zone with the given ID. To get the TimeZone named "xDT"
     * with its DST rule, use an ID of EST5EDT, CST6CDT, MST7MDT, or PST8PDT.)
     * 

     * When parsing PDF dates, the incoming value DOES NOT have a TIMEZONE value.
     * At most it has an OFFSET value like -04'00'. It is generally impossible to 
     * determine what TIMEZONE corresponds to a given OFFSET. If the date is
     * in the summer when daylight savings is in effect, an offset of -0400
     * might correspond to any one of the 38 regions (of 53) with standard time 
     * offset -0400 and no daylight saving. Or it might correspond to 
     * any one of the 31 regions (out of 43) that observe daylight savings 
     * and have standard time offset of -0500.
     * 

     * If a Calendar has not been assigned a TimeZone with setTimeZone(), 
     * it will have by default the local TIMEZONE, not just the OFFSET.  In the
     * USA, this TimeZone will have a daylight savings rule.
     * 

     * The offset assigned with calVal.set(Calendar.ZONE_OFFSET) differs
     * from the offset in the TimeZone set by Calendar.setTimeZone(). Example:
     * Suppose my local TimeZone is America/New_York. It has an offset of -05'00'.
     * And suppose I set a GregorianCalendar's ZONE_OFFSET to -07'00'
     *     calVal = new GregorianCalendar();   // TimeZone is the local default
     *     calVal.set(Calendar.ZONE_OFFSET, -7* MILLIS_PER_HOUR);
     * Four different offsets can be computed from calVal:
     *     calVal.get(Calendar.ZONE_OFFSET)  =>  -07:00
     *     calVal.get(Calendar.ZONE_OFFSET) + calVal.get(Calendar.DST_OFFSET) => -06:00
     *     calVal.getTimeZone().getRawOffset()  =>  -05:00
     *     calVal.getTimeZone().getOffset(calVal.getTimeInMillis())  =>  -04:00
     * 

     * Which is correct??? I dunno, though setTimeZone() does seem to affect
     * ZONE_OFFSET, and not vice versa.  One cannot even test whether TimeZone 
     * or ZONE_OFFSET has been set; both have been set by initialization code.
     * TimeZone is initialized to the local default time zone 
     * and ZONE_OFFSET is set from it.
     * 
     * My choice in this DateConverter class has been to set the 
     * initial TimeZone of a GregorianCalendar to GMT. Thereafter
     * the TimeZone is modified with {@link #adjustTimeZoneNicely}. 
     * 
     * @param millis a time zone offset expressed in milliseconds.
     *      Any value is accepted; it is normalized with restrainTZoffset before formatting.
     * @param sep a String to insert between hh and mm. May be empty.
     * @return the formatted String for the offset
     */
    public static String formatTZoffset(long millis, String sep) 
    {
        // a throw-away zone that carries nothing but the normalized offset
        TimeZone offsetZone = new SimpleTimeZone(restrainTZoffset(millis), "unknown");
        // pattern "Z" prints the offset as a sign followed by hhmm
        SimpleDateFormat zoneFormat = new SimpleDateFormat("Z");
        zoneFormat.setTimeZone(offsetZone);
        String signedHhmm = zoneFormat.format(new Date());
        // split after the sign and the two hour digits to insert the separator
        String signAndHours = signedHhmm.substring(0, 3);
        String minutes = signedHhmm.substring(3);
        return signAndHours + sep + minutes;
    }

    //////////////////////////////////////////////
    // P A R S E   Methods

     /**
     * Parses an integer from a string, starting at and advancing a ParsePosition.
     * 
     * @param text The string being parsed. If null, the remedy value is returned.
     * @param where The ParsePosition to start the search. This value 
     *      will be incremented by the number of digits found, but no 
     *      more than maxlen.  That is, the ParsePosition will 
     *      advance across at most maxlen initial digits in text.
     *      The error index is ignored and unchanged.
     * @param maxlen The maximum length of the integer to parse. 
     *      Usually 2, but 4 for year fields.
     *      If the field of length maxlen begins with a digit, 
     *      but contains a non-digit, no error is signaled 
     *      and the integer value is returned.
     * @param remedy Value to be assigned if no digit is found at the
     *      initial parse position; that is, if the field is empty.
     * @return The integer that was at the given parse position. Or
     *      the remedy value if no digits were found.
     */
    public static int parseTimeField(String text, ParsePosition where, 
            int maxlen, int remedy) 
    {
        if (text == null) 
        {
            return remedy; 
        }
        // (DecimalFormat.parse() is not an option here: it ignores
        //     setMaximumIntegerDigits, so the field width is enforced by hand)
        final int start = where.getIndex();
        final int available = text.length() - start;
        final int end = start + Math.min(maxlen, available);
        int value = 0;
        int pos = start;
        while (pos < end)
        {
            char ch = text.charAt(pos);
            if (ch < '0' || ch > '9')
            {
                break;   // the field ends at the first non-digit
            }
            value = 10 * value + (ch - '0');   // shift left and append this digit
            pos++;
        }
        if (pos == start)
        {
            // empty field: the position is left where it was
            return remedy;
        }
        where.setIndex(pos);
        return value;
    }
 
    /**
     * Advances the ParsePosition past any and all the characters 
     *      that match those in the optionals list.
     *      In particular, a space will skip all spaces.
     * @param text The text to examine
     * @param where index to start looking. 
     *      The value is incremented by the number of optionals found.
     *      The error index is ignored and unchanged.
     * @param optionals A String listing all the optional characters 
     *      to be skipped.
     * @return The last non-space character passed over. 
     *      Returns a space if no non-space character was found 
     *      (even if space is not in the optionals list.)
     */
    public static char skipOptionals(String text, ParsePosition where, 
            String optionals) 
    {
        char lastNonSpace = ' ';
        if (text == null)
        {
            // nothing to scan; the position is not touched
            return lastNonSpace;
        }
        final int end = text.length();
        int pos = where.getIndex();
        while (pos < end)
        {
            char current = text.charAt(pos);
            if (optionals.indexOf(current) < 0)
            {
                break;   // first character that is not optional
            }
            if (current != ' ')
            {
                lastNonSpace = current;   // spaces never replace a remembered character
            }
            pos++;
        }
        where.setIndex(pos);
        return lastNonSpace;
    }
    
    /**
     * If the victim string is at the given position in the text,
     * this method advances the position past that string. 
     * 
     * @param text The text to examine
     * @param victim The string to look for
     * @param where The initial position to look at. After return, this will
     *      have been incremented by the length of the victim if it was found.
     *      The error index is ignored and unchanged.
     * @return true if victim was found; otherwise false.
     */
    public static boolean skipString(String text, String victim, ParsePosition where) 
    {
        // A missing text or victim can never match. Returning false (instead of
        // failing with a NullPointerException) keeps this helper in line with
        // parseTimeField and skipOptionals, which both tolerate a null text.
        if (text == null || victim == null)
        {
            return false;
        }
        final int start = where.getIndex();
        // startsWith() returns false for an out-of-range start, so no bounds check is needed
        if (!text.startsWith(victim, start))
        {
            return false;
        }
        where.setIndex(start + victim.length());
        return true;
    }

    /** 
     * Construct a new GregorianCalendar and set defaults.
     * Locale is ENGLISH.
     * TimeZone is "UTC" (zero offset and no DST).
     * Parsing is NOT lenient. Milliseconds are zero.
     * 
     * @return a new gregorian calendar
     */
    public static GregorianCalendar newGreg()  
    {
        GregorianCalendar greg = new GregorianCalendar(Locale.ENGLISH);
        // fixed zero-offset zone: no raw offset and no daylight saving rule
        TimeZone utc = new SimpleTimeZone(0, "UTC");
        greg.setTimeZone(utc);
        // strict mode: out-of-range field values are rejected, not rolled over
        greg.setLenient(false);
        greg.set(Calendar.MILLISECOND, 0);
        return greg;
    }
    
    /**
     * Install a TimeZone on a GregorianCalendar without changing the 
     * hours value. A plain GregorianCalendar.setTimeZone() 
     * adjusts the Calendar.HOUR value to compensate. This is *BAD*
     * (not to say *EVIL*) when we have already set the time.
     * @param cal The GregorianCalendar whose TimeZone to change.
     * @param tz The new TimeZone.
     */
    public static void adjustTimeZoneNicely(GregorianCalendar cal, TimeZone tz) 
    {
        cal.setTimeZone(tz);
        // total offset the new zone applies to this calendar, in whole minutes
        int zoneMillis = cal.get(Calendar.ZONE_OFFSET);
        int dstMillis = cal.get(Calendar.DST_OFFSET);
        int shiftMinutes = (zoneMillis + dstMillis) / MILLIS_PER_MINUTE;
        // move the instant back by that offset so the displayed fields stay as they were
        cal.add(Calendar.MINUTE, -shiftMinutes);
    }
    
    /**
     * Parses the end of a date string for a time zone and, if one is found,
     * sets the time zone of the GregorianCalendar. Otherwise the calendar 
     * time zone is unchanged.
     * 
     * The text is parsed as
     *      (Z|GMT|UTC)? [+- ]* h [': ]? m '?
     * where the leading String is optional, h is two digits by default, 
     * but may be a single digit if followed by one of space, apostrophe, 
     * colon, or the end of string. Similarly, m is one or two digits. 
     * This scheme accepts the format of PDF, RFC 822, and ISO8601. 
     * If none of these applies (as for a time zone name), we try
     * TimeZone.getTimeZone().
     * 
     * @param text The text expected to begin with a time zone value,
     * possibly with leading or trailing spaces.
     * @param cal The Calendar whose TimeZone to set. 
     * @param initialWhere Scanning begins at the index of initialWhere. After success,
     *      the index is that of the next character after the recognized string.
     *      The error index is ignored and unchanged.
     * @return true if parsed a time zone value; otherwise the 
     *      time zone is unchanged and the return value is false.
     */
    public static boolean parseTZoffset(String text, GregorianCalendar cal, 
            ParsePosition initialWhere) 
    {
        final int noHours = -999;   // marker: no hour digits were found
        // work on a private cursor so that initialWhere only moves on success
        ParsePosition cursor = new ParsePosition(initialWhere.getIndex());

        // optional prefix: Z, GMT or UTC, possibly mixed with sign characters and spaces
        char sign = skipOptionals(text, cursor, "Z+- ");
        boolean sawUtcMarker = (sign == 'Z');
        if (!sawUtcMarker)
        {
            sawUtcMarker = skipString(text, "GMT", cursor);
        }
        if (!sawUtcMarker)
        {
            sawUtcMarker = skipString(text, "UTC", cursor);
        }
        if (sawUtcMarker)
        {
            // the sign, if any, follows the marker
            sign = skipOptionals(text, cursor, "+- ");
        }

        // h [': ]? m '?
        int hours = parseTimeField(text, cursor, 2, noHours);
        skipOptionals(text, cursor, "': ");
        int minutes = parseTimeField(text, cursor, 2, 0);
        skipOptionals(text, cursor, "' ");

        TimeZone zone;
        if (hours != noHours)
        {
            // a numeric offset was given
            int magnitude = hours * MILLIS_PER_HOUR + minutes * MILLIS_PER_MINUTE;
            int signedOffset = (sign == '-') ? -magnitude : magnitude;
            zone = new SimpleTimeZone(0, "GMT");
            zone.setRawOffset(restrainTZoffset(signedOffset));
            updateZoneId(zone);
        }
        else if (sawUtcMarker)
        {
            // a bare Z / GMT / UTC means offset zero
            zone = new SimpleTimeZone(0, "GMT");
        }
        else
        {
            // last resort: treat the remaining text as a time zone name
            String zoneName = text.substring(initialWhere.getIndex()).trim();
            zone = TimeZone.getTimeZone(zoneName);
            // getTimeZone answers "GMT" for ids it does not know
            if ("GMT".equals(zone.getID()))
            {
                // no time zone in text; cal and initialWhere are unchanged
                return false;
            }
            // a named zone consumes the rest of the text
            cursor.setIndex(text.length());
        }

        adjustTimeZoneNicely(cal, zone);
        initialWhere.setIndex(cursor.getIndex());
        return true;
    }
    
    /**
     * Update the zone ID based on the raw offset. The ID becomes GMT for a zero offset,
     * GMT+hh:mm for positive offsets of up to 12 hours, or GMT-hh:mm for negative
     * offsets of up to 14 hours.
     * Zones that don't fit in this schema are set to zone ID "unknown".
     *
     * @param tz the time zone to update.
     */
    private static void updateZoneId(TimeZone tz)
    {
        // https://garygregory.wordpress.com/2013/06/18/what-are-the-java-timezone-ids/
        final int rawOffset = tz.getRawOffset();
        if (rawOffset == 0)
        {
            tz.setID("GMT");
            return;
        }
        final boolean negative = rawOffset < 0;
        final int magnitude = negative ? -rawOffset : rawOffset;
        final int hours = magnitude / 3600000;
        final int minutes = magnitude % 3600000 / 60000;
        // hour limits of the ID schema: 14 on the negative side, 12 on the positive side
        final int maxHours = negative ? 14 : 12;
        String id = "unknown";
        if (hours <= maxHours)
        {
            String pattern = negative ? "GMT-%02d:%02d" : "GMT+%02d:%02d";
            id = String.format(Locale.US, pattern, hours, minutes);
        }
        tz.setID(id);
    }
    
    /**
     * Parses a big-endian date: year month day hour min sec.
     * The year must be four digits. Other fields may be adjacent 
     * and delimited by length or they may follow appropriate delimiters.
     *     year [ -/]* month [ -/]* dayofmonth [ T]* hour [:] min [:] sec [.secFraction]
     * If any numeric field is omitted, all following fields must also be omitted.
     * No time zone is processed.
     * 
     * Ambiguous dates can produce unexpected results. For example:
     *      1970 12 23:08 will parse as 1970 December 23 00:08:00 
     * 
     * @param text The string to parse.
     * 
     * @param initialWhere Where to begin the parse. On return the index
     *      is advanced to just beyond the last character processed.
     *      The error index is ignored and unchanged.
     * 
     * @return a GregorianCalendar representing the parsed date. 
     *      Or null if the text did not begin with at least four digits.
     */
    public static GregorianCalendar parseBigEndianDate(String text, 
            ParsePosition initialWhere) 
    {
        final int start = initialWhere.getIndex();
        // private cursor: initialWhere is only advanced once the date is accepted
        ParsePosition cursor = new ParsePosition(start);

        // the year is mandatory and must be exactly four digits
        int year = parseTimeField(text, cursor, 4, 0);
        if (cursor.getIndex() != start + 4) 
        {
            return null;
        }

        // each remaining field is optional and may be preceded by its delimiters
        skipOptionals(text, cursor, "/- ");
        int month = parseTimeField(text, cursor, 2, 1) - 1; // Calendar months are 0...11
        skipOptionals(text, cursor, "/- ");
        int day = parseTimeField(text, cursor, 2, 1);
        skipOptionals(text, cursor, " T");
        int hour = parseTimeField(text, cursor, 2, 0);
        skipOptionals(text, cursor, ": ");
        int minute = parseTimeField(text, cursor, 2, 0);
        skipOptionals(text, cursor, ": ");
        int second = parseTimeField(text, cursor, 2, 0);
        if (skipOptionals(text, cursor, ".") == '.')
        {
            // fractions of a second: pass over up to 19 digits, the value is not used
            parseTimeField(text, cursor, 19, 0);
        }

        GregorianCalendar parsed = newGreg();
        try 
        {
            parsed.set(year, month, day, hour, minute, second);
            // the calendar is not lenient: computing the time validates every field
            parsed.getTimeInMillis();
        }
        catch (IllegalArgumentException outOfRange) 
        {
            return null;
        }

        // accepted: publish the new position and step over trailing spaces
        initialWhere.setIndex(cursor.getIndex());
        skipOptionals(text, initialWhere, " ");
        return parsed;    // holds at least a year value
    }

    /**
     * See if text can be parsed as a date according to any of a list of 
     * formats. The time zone may be included as part of the format, or
     * omitted in favor of later testing for a trailing time zone.
     * 
     * @param text The text to be parsed.
     * 
     * @param fmts A list of formats to be tried. The syntax is that for 
     *      {@link java.text.SimpleDateFormat}
     * 
     * @param initialWhere At start this is the position to begin
     *      examining the text. Upon return it will have been
     *      incremented to refer to the next non-space character after the date.
     *      If no date was found, the value is unchanged.
     *      The error index is ignored and unchanged.
     * 
     * @return null for failure to find a date, or the GregorianCalendar
     *      for the date that was found. Unless a time zone was 
     *      part of the format, the time zone will be GMT+0
     */
    public static GregorianCalendar parseSimpleDate(String text, String[] fmts, 
            ParsePosition initialWhere) 
    {
        for (int i = 0; i < fmts.length; i++)
        {
            // every attempt starts from the caller's position with a fresh
            // formatter and a fresh calendar, so a failed pattern leaves no traces
            ParsePosition cursor = new ParsePosition(initialWhere.getIndex());
            SimpleDateFormat formatter = new SimpleDateFormat(fmts[i], Locale.ENGLISH);
            GregorianCalendar candidate = newGreg();
            formatter.setCalendar(candidate);
            if (formatter.parse(text, cursor) == null)
            {
                continue;   // this pattern does not match; try the next one
            }
            // first matching pattern wins: publish the position, skip trailing spaces
            initialWhere.setIndex(cursor.getIndex());
            skipOptionals(text, initialWhere, " ");
            return candidate;
        }
        return null;
    }

    
    /**
     * Parses a String to see if it begins with a date, and if so, 
     * returns that date. The date must be strictly correct--no 
     * field may exceed the appropriate limit.
     * (That is, the Calendar has setLenient(false).) 
     * Skips initial spaces, but does NOT check for "D:"
     * 
     * The scan first tries parseBigEndianDate and parseTZoffset
     * and then tries parseSimpleDate with appropriate formats, 
     * again followed by parseTZoffset. If at any stage the entire 
     * text is consumed, that date value is returned immediately. 
     * Otherwise the date that consumes the longest initial part
     * of the text is returned.
     * 
     * - PDF format dates are among those recognized by parseBigEndianDate.
     * - The formats tried are alphaStartFormats or digitStartFormat and
     * any listed in the value of moreFmts.
     * 
     * @param text The String that may begin with a date. Must not be null.
     *      Initial spaces and "D:" are skipped over.
     * @param moreFmts Additional formats to be tried after trying the
     *      built-in formats.
     * @param initialWhere Parsing begins at the index of initialWhere in text. If the
     *      parse succeeds, the index is advanced to point 
     *      to the first unrecognized character.
     *      The error index is ignored and unchanged.
     * @return A GregorianCalendar for the date. If no date is found, 
     *      returns null. The time zone will be GMT+0 unless parsing 
     *      succeeded with a format containing a time zone. (Only one
     *      builtin format contains a time zone.)
     * 
     */
    public static Calendar parseDate(String text, String[] moreFmts, 
            ParsePosition initialWhere) 
    {
        // Three parsers are tried in turn: the big-endian parser, the built-in
        // SimpleDateFormat patterns, and the caller-supplied patterns. A result
        // that consumes the whole text is returned at once; otherwise the
        // longest partial result of the first two stages is remembered.

        // place to remember the longest date string
        int longestLen = -999999;  // theorem: this value will never be used
                // proof: longestLen is only used if longestDate is not null
        GregorianCalendar longestDate = null; // null says no date found yet
        int whereLen;   // temp copy of where.getIndex()
        
        // private cursor; initialWhere is only moved when a result is accepted
        ParsePosition where = new ParsePosition(initialWhere.getIndex());
        // skip leading spaces (skipOptionals itself tolerates a null text;
        // a null text fails later, at the first text.length() call)
        skipOptionals(text, where, " ");
        int startPosition = where.getIndex();

        // try big-endian parse
        GregorianCalendar retCal = parseBigEndianDate(text, where);
        // check for success and a timezone
        if (retCal != null &&
                (where.getIndex() == text.length() 
                || parseTZoffset(text, retCal, where))) 
        {
            // if text is fully consumed, return the date
            // else remember it and its length
            whereLen = where.getIndex();
            if (whereLen == text.length()) 
            {
                initialWhere.setIndex(whereLen);
                return retCal;
            }
            longestLen = whereLen;
            longestDate = retCal;
        }
        
        // nothing but spaces from the start position onward: there is no date
        if (startPosition >= text.length())
        {
            return null;
        }

        // try one of the sets of standard formats,
        // chosen by whether the first character is a digit
        where.setIndex(startPosition);
        String [] formats 
                = Character.isDigit(text.charAt(startPosition))
                ? DIGIT_START_FORMATS
                : ALPHA_START_FORMATS;
        retCal = parseSimpleDate(text, formats, where);
        // check for success and a timezone
        if (retCal != null && 
                (where.getIndex() == text.length() 
                || parseTZoffset(text, retCal, where)))
        {
            // if text is fully consumed, return the date
            // else remember it and its length
            whereLen = where.getIndex();
            if (whereLen == text.length()) 
            {
                initialWhere.setIndex(whereLen);
                return retCal;
            }
            if (whereLen > longestLen) 
            {
                longestLen = whereLen;
                longestDate = retCal;
            }
        }
        
        // try the supplied formats
        if (moreFmts != null)
        {
            where.setIndex(startPosition);
            retCal = parseSimpleDate(text, moreFmts, where);
            if (retCal != null && 
                (where.getIndex() == text.length() 
                || parseTZoffset(text, retCal, where)))
            {
                whereLen = where.getIndex();
                // return this date if the text is fully consumed, or if it
                // beats the longest date remembered so far
                // NOTE(review): when no earlier date was remembered (longestDate == null)
                // a partial match here is not accepted by this test; it is still
                // returned by the final "return retCal" below, but without
                // initialWhere being advanced -- confirm this is intended.
                if (whereLen == text.length() || 
                        (longestDate != null && whereLen > longestLen)) 
                {
                    initialWhere.setIndex(whereLen);
                    return retCal;
                }
            }
        }
        // fall back to the longest partial date found by the first two stages
        if (longestDate != null) 
        {
            initialWhere.setIndex(longestLen);
            return longestDate;
        }
        // NOTE(review): retCal is the outcome of the last parseSimpleDate call; it may be
        // null, or a date whose trailing text was not recognized. initialWhere is
        // left unchanged on this path -- callers presumably check the index; verify.
        return retCal;
    }
       
    /**
     * Converts the text held by a COSString to a Calendar by handing
     * the value of text.getString() to {@link #toCalendar(String)}.
     *
     * The returned value will have 0 for DST_OFFSET.
     * 
     * @param text The COSString representation of a date.
     * @return The Calendar that the text string represents, 
     *      or null if text was null.
     * @throws IOException If the date string is not in the correct format.
     * @deprecated This method throws an IOException for failure. Replace
     *      calls to it with {@link #toCalendar(String, String[])}
     *      and test for failure with
     *          (value == null || value.get(Calendar.YEAR) == INVALID_YEAR)
     * @see #toCalendar(String)
     */
    public static Calendar toCalendar(COSString text) throws IOException
    {
        // a missing COSString yields no date; otherwise parse its string form
        return (text == null) ? null : toCalendar(text.getString());
    }
    
    /**
     * Converts a string date to a Calendar date value. This is equivalent to 
     * calling {@link #toCalendar(String, String[])} with null for the 
     * second parameter, except that a result marked with INVALID_YEAR 
     * is reported by throwing an IOException.
     * 
     * The returned value will have 0 for DST_OFFSET.
     * 
     * @param text The string representation of the calendar.
     * @return The Calendar that this string represents, 
     *      or null if the incoming text is null or the empty string.
     * @throws IOException If the date string is non-null, non-empty
     *      and not a parseable date.
     * @deprecated This method throws an IOException for failure. Replace
     *      calls to it with {@link #toCalendar(String, String[])}
     *      using null for the second parameter
     *      and test for failure with
     *          (value == null || value.get(Calendar.YEAR) == INVALID_YEAR)
     */
    public static Calendar toCalendar(String text) throws IOException
    {
        // no text at all means no date, which is not treated as an error
        if (text == null || text.length() == 0)
        {
            return null;
        }
        Calendar parsed = toCalendar(text, null);
        // the two-argument overload signals failure through the year field
        boolean unparseable = parsed != null 
                && parsed.get(Calendar.YEAR) == INVALID_YEAR;
        if (unparseable)
        {
            throw new IOException("Error converting date: '" + text + "'");
        }
        return parsed;
    }
    
    /**
     * Converts a string to a calendar. The entire string must be consumed.
     * The date must be strictly correct; that is, no field may exceed
     * the appropriate limit. Uses {@link #parseDate} to do the actual parsing.
     * 
     * The returned value will have 0 for DST_OFFSET.
     * 
     * @param text The text to parse. Initial spaces and "D:" are skipped over.
     *      May be null, in which case null is returned.
     * @param moreFmts An Array of formats (as Strings) to try 
     *      in addition to the standard list. May be null.
     * @return the Calendar value corresponding to the date text, 
     *      or null if text is null.
     *      If text is non-null but does not represent a valid date, 
     *      the value is January 1 on year INVALID_YEAR at 0:0:0 GMT.
     * 
     */
    public static Calendar toCalendar(String text, String[] moreFmts)
    {
        // A null text used to end in a NullPointerException further down.
        // Callers are told to test the result for null, so report it that way.
        if (text == null)
        {
            return null;
        }
        ParsePosition where = new ParsePosition(0);
        // step over leading spaces and the optional "D:" prefix
        skipOptionals(text, where, " ");
        skipString(text, "D:", where);
        Calendar retCal = parseDate(text, moreFmts, where);   // PARSE THE TEXT
        // a date is only accepted when it accounts for the whole of text
        if (retCal == null || where.getIndex() != text.length()) 
        {
            // the date string is invalid for all formats we tried,
            // so hand back the INVALID_YEAR marker value instead
            retCal = newGreg();
            retCal.set(INVALID_YEAR, 0, 1, 0, 0, 0);
        }
        return retCal;
    }
}