org.apache.pdfbox.util.DateConverter Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of pdfbox Show documentation
The Apache PDFBox library is an open source Java tool for working with PDF documents.
There is a newer version: 3.0.2
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.pdfbox.util;

import java.text.ParsePosition;
import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.Date;
import java.util.GregorianCalendar;
import java.util.Locale;
import java.util.SimpleTimeZone;
import java.util.TimeZone;

import org.apache.pdfbox.cos.COSString;

/*
 * Date format is described in PDF Reference 1.7 section 3.8.2
 * (www.adobe.com/devnet/acrobat/pdfs/pdf_reference_1-7.pdf)
 * and also in PDF 32000-1:2008
 * (http://www.adobe.com/devnet/acrobat/pdfs/PDF32000_2008.pdf))
 * although the latter inexplicably omits the trailing apostrophe.
 *
 * The interpretation of dates without timezones is unclear.
 * The code below assumes that such dates are in UTC+00 (aka GMT).
 * This is in keeping with the PDF Reference's assertion that:
 *      numerical fields default to zero values.
 * However, the Reference does go on to make the cryptic remark:
 *      If no UT information is specified, the relationship of the specified
 *      time to UT is considered to be unknown. Whether or not the time
 *      zone is known, the rest of the date should be specified in local time.
 * I understand this to refer to _creating_ a pdf date value. That is,
 * code that can get the wall clock time and cannot get the timezone
 * should write the wall clock time with a time zone of zero.
 * When _parsing_ a PDF date, the statement talks about "the rest of the date"
 * being local time, thus explicitly excluding the use of the local time
 * for the time zone.
*/

/**
 * Converts dates to strings and back using the PDF date standard
 * in section 3.8.2 of PDF Reference 1.7.
 *
 * @author Ben Litchfield
 * @author Fred Hansen
 * 
 * TODO Move members of this class elsewhere for shared use in pdfbox and xmpbox.
 */
public final class DateConverter
{
    private DateConverter()
    {
    }

    // milliseconds/1000 = seconds; seconds / 60 = minutes; minutes/60 = hours
    private static final int MINUTES_PER_HOUR = 60;
    private static final int SECONDS_PER_MINUTE = 60;
    private static final int MILLIS_PER_MINUTE = SECONDS_PER_MINUTE*1000;
    private static final int MILLIS_PER_HOUR = MINUTES_PER_HOUR * MILLIS_PER_MINUTE;
    private static final int HALF_DAY = 12 * MINUTES_PER_HOUR * MILLIS_PER_MINUTE, DAY = 2*HALF_DAY;

    /*
     * The Date format is supposed to be the PDF_DATE_FORMAT, but other
     * forms appear. These lists offer alternatives to be tried 
     * if parseBigEndianDate fails.  
     * 
     * The time zone offset generally trails the date string, so it is processed
     * separately with parseTZoffset. (This does not preclude having time
     * zones in the elements below; one does.)
     * 
     * Alas, SimpleDateFormat is badly non-reentrant -- it modifies its 
     * calendar field (PDFBox-402), so these lists are strings to create
     * SimpleDate format as needed.
     * 
     * Some past entries have been elided because they duplicate existing 
     * entries. See the API for SimpleDateFormat, which says 
     *      "For parsing, the number of pattern letters is ignored 
     *      unless it's needed to separate two adjacent fields."
     * 
     * toCalendar(String, String[]) tests to see that the entire input text
     * has been consumed. Therefore the ordering of formats is important. 
     * If one format begins with the entirety of another, the longer
     * must precede the other in the list.
     * 
     * HH is for 0-23 hours and hh for 1-12 hours; an "a" field must follow "hh"
     * Where year is yy, four digit years are accepted 
     * and two digit years are converted to four digits in the range
     *      [thisyear-79...thisyear+20]
     */
    private static final String[] ALPHA_START_FORMATS = 
    {
        "EEEE, dd MMM yy hh:mm:ss a",
        "EEEE, MMM dd, yy hh:mm:ss a",
        "EEEE, MMM dd, yy 'at' hh:mma", // Acrobat Net Distiller 1.0 for Windows
        "EEEE, MMM dd, yy", // Acrobat Distiller 1.0.2 for Macintosh  && PDFBOX-465
        "EEEE MMM dd, yy HH:mm:ss", // ECMP5
        "EEEE MMM dd HH:mm:ss z yy", // GNU Ghostscript 7.0.7
        "EEEE MMM dd HH:mm:ss yy", // GNU Ghostscript 7.0.7 variant
    };
    
    private static final String[] DIGIT_START_FORMATS = 
    {
        "dd MMM yy HH:mm:ss",  // for 26 May 2000 11:25:00
        "dd MMM yy HH:mm",  // for 26 May 2000 11:25
        "yyyy MMM d",   // ambiguity resolved only by omitting time
        "yyyymmddhh:mm:ss", // test case "200712172:2:3"
        "H:m M/d/yy", // test case "9:47 5/12/2008"
        "M/d/yy HH:mm:ss",
        "M/d/yy HH:mm",
        "M/d/yy",

        // proposed rule that is unreachable due to "dd MMM yy HH:mm:ss" 
        //     "yyyy MMM d HH:mm:ss", 

        // rules made unreachable by "M/d/yy HH:mm:ss" "M/d/yy HH:mm"  "M/d/yy",
        // (incoming digit strings do not mark themselves as y, m, or d!)
            // "d/MM/yyyy HH:mm:ss", // PDFBOX-164 and PDFBOX-170 
            // "M/dd/yyyy hh:mm:ss",
            // "MM/d/yyyy hh:mm:ss",
            // "M/d/yyyy HH:mm:ss",
            // "M/dd/yyyy",
            // "MM/d/yyyy",
            // "M/d/yyyy",
            // "M/d/yyyy HH:mm:ss",
            // "M/d/yy HH:mm:ss",
        // subsumed by big-endian parse
            // "yyyy-MM-dd'T'HH:mm:ss",
            // "yyyy-MM-dd'T'HH:mm:ss",
            // "yyyymmdd hh:mm:ss", 
            // "yyyymmdd", 
            // "yyyymmddX''00''",  // covers 24 cases 
            //    (orignally the above ended with '+00''00'''; 
            //      the first apostrophe quoted the plus, 
            //      '' mapped to a single ', and the ''' was invalid)
    };

    /**
     * Converts a Calendar to a string formatted as:
     *     D:yyyyMMddHHmmss#hh'mm'  where # is Z, +, or -.
     * 
     * @param cal The date to convert to a string. May be null.
     * The DST_OFFSET is included when computing the output time zone.
     *
     * @return The date as a String to be used in a PDF document, 
     *      or null if the cal value is null
     */
    public static String toString(Calendar cal)
    {
        if (cal == null) 
        {
            return null;
        }
        String offset = formatTZoffset(cal.get(Calendar.ZONE_OFFSET) +
                                       cal.get(Calendar.DST_OFFSET), "'");
        return String.format(Locale.US, "D:"
                + "%1$4tY%1$2tm%1$2td"   // yyyyMMdd 
                + "%1$2tH%1$2tM%1$2tS"   // HHmmss 
                + "%2$s"                 // time zone
                + "'",                   // trailing apostrophe
            cal, offset);
    }

    /**
     * Converts the date to ISO 8601 string format:
     *     yyyy-mm-ddThh:MM:ss#hh:mm    (where '#" is '+' or '-').
     *
     * @param cal The date to convert.  Must not be null.
     * The DST_OFFSET is included in the output value.
     * 
     * @return The date represented as an ISO 8601 string.
     */
    public static String toISO8601(Calendar cal)
    {
        String offset = formatTZoffset(cal.get(Calendar.ZONE_OFFSET) +
                                       cal.get(Calendar.DST_OFFSET), ":");
        return String.format(Locale.US, 
                "%1$4tY"                  // yyyy
                + "-%1$2tm"               // -mm  (%tm adds one to cal month value)
                + "-%1$2td"               // -dd  (%tm adds one to cal month value)
                + "T"                     // T
                + "%1$2tH:%1$2tM:%1$2tS"  // HHmmss
                + "%2$s",                 // time zone
            cal, offset);
    }
    
    /*
     * Constrain a timezone offset to the range [-14:00 thru +14:00].
     * by adding or subtracting multiples of a full day.
     */
    private static int restrainTZoffset(long proposedOffset)
    {
        if (proposedOffset <= 14 * MILLIS_PER_HOUR && proposedOffset >= -14 * MILLIS_PER_HOUR)
        {
            // https://www.w3.org/TR/xmlschema-2/#dateTime-timezones
            // Timezones between 14:00 and -14:00 are valid
            return (int) proposedOffset;
        }
        // Constrain a timezone offset to the range  [-11:59 thru +12:00].
        proposedOffset = ((proposedOffset + HALF_DAY) % DAY + DAY) % DAY;
        if (proposedOffset == 0)
        {
            return HALF_DAY;
        }
        // 0 <= proposedOffset < DAY
        proposedOffset = (proposedOffset - HALF_DAY) % HALF_DAY;
        // -HALF_DAY < proposedOffset < HALF_DAY
        return (int)proposedOffset;
    }
    
    /*
     * Formats a time zone offset as #hh^mm
     * where # is + or -, hh is hours, ^ is a separator, and mm is minutes.
     * Any separator may be specified by the second argument;
     * the usual values are ":" (ISO 8601), "" (RFC 822), and "'" (PDF).
     * The returned value is constrained to the range -11:59 ... 11:59.
     * For offset of 0 millis, the String returned is "+00^00", never "Z".
     * To get a "general" offset in form GMT#hh:mm, write
     *      "GMT"+DateConverter.formatTZoffset(offset, ":");
     *
     * Take thought in choosing the source for the millis value. 
     * It can come from calendarValue.getTimeZone() or from 
     * calendarValue.get(Calendar.ZONE_OFFSET).  If a TimeZone was created
     * from a valid time zone ID, then it may have a daylight savings rule.
     * (As of July 4, 2013, the data base at http://www.iana.org/time-zones 
     * recognized 629 time zone regions. But a TimeZone created as 
     *      new SimpleTimeZone(millisOffset, "ID"), 
     * will not have a daylight savings rule. (Not even if there is a
     * known time zone with the given ID. To get the TimeZone named "xDT"
     * with its DST rule, use an ID of EST5EDT, CST6CDT, MST7MDT, or PST8PDT.
     *
     * When parsing PDF dates, the incoming values DOES NOT have a TIMEZONE value.
     * At most it has an OFFSET value like -04'00'. It is generally impossible to 
     * determine what TIMEZONE corresponds to a given OFFSET. If the date is
     * in the summer when daylight savings is in effect, an offset of -0400
     * might correspond to any one of the 38 regions (of 53) with standard time 
     * offset -0400 and no daylight saving. Or it might correspond to 
     * any one of the 31 regions (out of 43) that observe daylight savings 
     * and have standard time offset of -0500.
     *
     * If a Calendar has not been assigned a TimeZone with setTimeZone(), 
     * it will have by default the local TIMEZONE, not just the OFFSET.  In the
     * USA, this TimeZone will have a daylight savings rule.
     *
     * The offset assigned with calVal.set(Calendar.ZONE_OFFSET) differs
     * from the offset in the TimeZone set by Calendar.setTimeZone(). Example:
     * Suppose my local TimeZone is America/New_York. It has an offset of -05'00'.
     * And suppose I set a GregorianCalendar's ZONE_OFFSET to -07'00'
     *     calVal = new GregorianCalendar();   // TimeZone is the local default
     *     calVal.set(Calendar.ZONE_OFFSET, -7* MILLIS_PER_HOUR);
     * Four different offsets can be computed from calVal:
     *     calVal.get(Calendar.ZONE_OFFSET)  =>  -07:00
     *     calVal.get(Calendar.ZONE_OFFSET) + calVal.get(Calendar.DST_OFFSET) => -06:00
     *     calVal.getTimeZone().getRawOffset()  =>  -05:00
     *     calVal.getTimeZone().getOffset(calVal.getTimeInMillis())  =>  -04:00
     *
     * Which is correct??? I dunno, though setTimeZone() does seem to affect
     * ZONE_OFFSET, and not vice versa.  One cannot even test whether TimeZone 
     * or ZONE_OFFSET has been set; both have been set by initialization code.
     * TimeZone is initialized to the local default time zone 
     * and ZONE_OFFSET is set from it.
     * 
     * My choice in this DateConverter class has been to set the 
     * initial TimeZone of a GregorianCalendar to GMT. Thereafter
     * the TimeZone is modified with {@link #adjustTimeZoneNicely}.
     *
     * package-private for testing
     */
    static String formatTZoffset(long millis, String sep)
    {
        SimpleDateFormat sdf = new SimpleDateFormat("Z"); // #hhmm
        sdf.setTimeZone(new SimpleTimeZone(restrainTZoffset(millis),"unknown"));
        String tz = sdf.format(new Date());
        return tz.substring(0,3) + sep + tz.substring(3);
    }

     /*
     * Parses an integer from a string, starting at and advancing a ParsePosition.
     * Returns The integer that was at the given parse position, or the remedy value
     * if no digits were found.
     *
     * The ParsePosition will be incremented by the number of digits found, but no
     * more than maxlen. That is, the ParsePosition will advance across at most
     * maxlen initial digits in text. The error index is ignored and unchanged.
     *
     * maxlen is the maximum length of the integer to parse, usually 2, but 4 for
     * year fields. If the field of length maxlen begins with a digit, but contains
     * a non-digit, no error is signaled and the integer value is returned.
     */
    private static int parseTimeField(String text, ParsePosition where, int maxlen, int remedy)
    {
        if (text == null) 
        {
            return remedy;
        }
        // it would seem that DecimalFormat.parse() would be simpler;
        // but that class blithely ignores setMaximumIntegerDigits
        int retval = 0;
        int index = where.getIndex();
        int limit = index + Math.min(maxlen, text.length()-index);
        for (; index < limit; index++)
        {
            // convert digit to integer
            int cval = text.charAt(index) - '0';
            // test to see if we got a digit
            if (cval < 0 || cval > 9)
            {
                // no digit at index
                break;
            }
            // append the digit to the return value
            retval = retval * 10 + cval;
        }   
        if (index == where.getIndex())
        {
            return remedy;
        }
        where.setIndex(index);
        return retval;
    }
 
    /*
     * Advances the ParsePosition past any and all the characters that match
     * those in the optionals list. In particular, a space will skip all spaces.
     *
     * The start value is incremented by the number of optionals found. The error
     * index is ignored and unchanged.
     *
     * Returns the last non-space character passed over (even if space is not in
     * the optionals list.)
     */
    private static char skipOptionals(String text, ParsePosition where, String optionals)
    {
        char retval = ' ', currch;
        while (text != null && where.getIndex() < text.length() &&
               optionals.indexOf((currch = text.charAt(where.getIndex()))) >= 0)
        {
            retval = (currch != ' ') ? currch : retval;
            where.setIndex(where.getIndex() + 1);
        }
        return retval;
    }
    
    /*
     * If the victim string is at the given position in the text, this method
     * advances the position past that string.
     *
     * `where` is the initial position to look at. After return, this will have
     * been incremented by the length of the victim if it was found. The error
     * index is ignored and unchanged.
     */
    private static boolean skipString(String text, String victim, ParsePosition where)
    {
        if (text.startsWith(victim, where.getIndex()))
        {
            where.setIndex(where.getIndex()+victim.length());
            return true;
        }
        return false;
    }

    /*
     * Construct a new GregorianCalendar and set defaults.
     * Locale is ENGLISH.
     * TimeZone is "UTC" (zero offset and no DST).
     * Parsing is NOT lenient. Milliseconds are zero.
     *
     * package-private for testing
     */
    static GregorianCalendar newGreg()
    {
        GregorianCalendar retCal = new GregorianCalendar(Locale.ENGLISH);
        retCal.setTimeZone(new SimpleTimeZone(0, "UTC"));
        retCal.setLenient(false);
        retCal.set(Calendar.MILLISECOND, 0);
        return retCal;
    }
    
    /*
     * Install a TimeZone on a GregorianCalendar without changing the 
     * hours value. A plain GregorianCalendat.setTimeZone() 
     * adjusts the Calendar.HOUR value to compensate. This is *BAD*
     * (not to say *EVIL*) when we have already set the time.
     */
    private static void adjustTimeZoneNicely(GregorianCalendar cal, TimeZone tz)
    {
        cal.setTimeZone(tz);
        int offset = (cal.get(Calendar.ZONE_OFFSET) + cal.get(Calendar.DST_OFFSET)) / 
                MILLIS_PER_MINUTE;
        cal.add(Calendar.MINUTE, -offset);
    }
    
    /*
     * Parses the end of a date string for a time zone and, if one is found,
     * sets the time zone of the GregorianCalendar. Otherwise the calendar 
     * time zone is unchanged.
     * 
     * The text is parsed as
     *      (Z|GMT|UTC)? [+- ]* h [': ]? m '?
     * where the leading String is optional, h is two digits by default, 
     * but may be a single digit if followed by one of space, apostrophe, 
     * colon, or the end of string. Similarly, m is one or two digits. 
     * This scheme accepts the format of PDF, RFC 822, and ISO8601. 
     * If none of these applies (as for a time zone name), we try
     * TimeZone.getTimeZone().
     *
     * Scanning begins at where.index. After success, the returned index
     * is that of the next character after the recognized string.
     *
     * package-private for testing
     */
    static boolean parseTZoffset(String text, GregorianCalendar cal,
                                        ParsePosition initialWhere)
    {
        ParsePosition where = new ParsePosition(initialWhere.getIndex());
        TimeZone tz = new SimpleTimeZone(0, "GMT");
        int tzHours, tzMin;
        char sign = skipOptionals(text, where, "Z+- ");
        boolean hadGMT = (sign == 'Z' || skipString(text, "GMT", where) ||
                         skipString(text, "UTC", where));
        sign = (!hadGMT) ? sign : skipOptionals(text, where, "+- ");
        
        tzHours = parseTimeField(text, where, 2, -999);
        skipOptionals(text, where, "\': ");
        tzMin = parseTimeField(text, where, 2, 0);
        skipOptionals(text, where, "\' "); 
        
        if (tzHours != -999) 
        {
            // we parsed a time zone in default format
            int hrSign = (sign == '-' ? -1 : 1);
            tz.setRawOffset(restrainTZoffset(hrSign * (tzHours * MILLIS_PER_HOUR + tzMin *
                                                       (long) MILLIS_PER_MINUTE)));
            updateZoneId(tz);
        }
        else if ( ! hadGMT)
        {
            // try to process as a name; "GMT" or "UTC" has already been processed
            String tzText = text.substring(initialWhere.getIndex()).trim();
            tz = TimeZone.getTimeZone(tzText);
            // getTimeZone returns "GMT" for unknown ids
            if ("GMT".equals(tz.getID()))  
            {
                // no timezone in text, cal amd initialWhere are unchanged
                return false;
            }
            else
            {
                // we got a tz by name; use it
                where.setIndex(text.length());
            }
        }
        adjustTimeZoneNicely(cal, tz);
        initialWhere.setIndex(where.getIndex());
        return true;
    }

    /**
     * Update the zone ID based on the raw offset. This is either GMT, GMT+hh:mm or GMT-hh:mm, where
     * n is between 1 and 14. The highest negative hour is -14, the highest positive hour is 12.
     * Zones that don't fit in this schema are set to zone ID "unknown".
     *
     * @param tz the time zone to update.
     */
    private static void updateZoneId(TimeZone tz)
    {
        int offset = tz.getRawOffset();
        char pm = '+';
        if (offset < 0)
        {
            pm = '-';
            offset = -offset;
        }
        int hh = offset / 3600000;
        int mm = offset % 3600000 / 60000;
        if (offset == 0)
        {
            tz.setID("GMT");
        }
        else if (pm == '+' && hh <= 12)
        {
            tz.setID(String.format(Locale.US, "GMT+%02d:%02d", hh, mm));
        }
        else if (pm == '-' && hh <= 14)
        {
            tz.setID(String.format(Locale.US, "GMT-%02d:%02d", hh, mm));
        }
        else
        {
            tz.setID("unknown");
        }
    }

    /*
     * Parses a big-endian date: year month day hour min sec.
     * The year must be four digits. Other fields may be adjacent
     * and delimited by length or they may follow appropriate delimiters.
     *     year [ -/]* month [ -/]* dayofmonth [ T]* hour [:] min [:] sec [.secFraction]
     * If any numeric field is omitted, all following fields must also be omitted.
     * No time zone is processed.
     * 
     * Ambiguous dates can produce unexpected results. For example:
     *      1970 12 23:08 will parse as 1970 December 23 00:08:00 
     * 
     * The parse begins at `where, on return the index
     * is advanced to just beyond the last character processed.
     * The error index is ignored and unchanged.
     */
    private static GregorianCalendar parseBigEndianDate(String text,
            ParsePosition initialWhere) 
    {
        ParsePosition where = new ParsePosition(initialWhere.getIndex());
        int year = parseTimeField(text, where, 4, 0);
        if (where.getIndex() != 4 + initialWhere.getIndex()) 
        {
            return null;
        }
        skipOptionals(text, where, "/- ");
        int month = parseTimeField(text, where, 2, 1) - 1; // Calendar months are 0...11
        skipOptionals(text, where, "/- ");
        int day = parseTimeField(text, where, 2, 1);
        skipOptionals(text, where, " T");
        int hour = parseTimeField(text, where, 2, 0);
        skipOptionals(text, where, ": ");
        int minute = parseTimeField(text, where, 2, 0);
        skipOptionals(text, where, ": ");
        int second = parseTimeField(text, where, 2, 0);
        char nextC = skipOptionals(text, where, ".");
        if (nextC == '.')
        {
            // fractions of a second: skip upto 19 digits
            parseTimeField(text, where, 19, 0);
        }

        GregorianCalendar dest = newGreg();
        try 
        {
            dest.set(year, month, day, hour, minute, second);
            // trigger limit tests
            dest.getTimeInMillis();
        }
        catch (IllegalArgumentException ill) 
        {
            return  null;
        }
        initialWhere.setIndex(where.getIndex());
        skipOptionals(text, initialWhere, " ");
        // dest has at least a year value
        return dest;
    }

    /*
     * See if text can be parsed as a date according to any of a list of
     * formats. The time zone may be included as part of the format, or
     * omitted in favor of later testing for a trailing time zone.
     * 
     * The parse starts at `where`, upon return it will have been
     * incremented to refer to the next non-space character after the date.
     * If no date was found, the value is unchanged.
     * The error index is ignored and unchanged.
     * 
     * If there is a failure to find a date, or the GregorianCalendar
     * for the date that was found. Unless a time zone was
     * part of the format, the time zone will be GMT+0
     */
    private static GregorianCalendar parseSimpleDate(String text, String[] fmts,
            ParsePosition initialWhere) 
    {
        for(String fmt : fmts)
        {
            ParsePosition where = new ParsePosition(initialWhere.getIndex());
            SimpleDateFormat sdf = new SimpleDateFormat(fmt, Locale.ENGLISH);
            GregorianCalendar retCal = newGreg();
            sdf.setCalendar(retCal);
            if (sdf.parse(text, where) != null)
            {
                initialWhere.setIndex(where.getIndex());
                skipOptionals(text, initialWhere, " ");
                return retCal;
            }
        }
        return null;
    }

    /*
     * Parses a String to see if it begins with a date, and if so, 
     * returns that date. The date must be strictly correct--no 
     * field may exceed the appropriate limit.
     * (That is, the Calendar has setLenient(false).) 
     * Skips initial spaces, but does NOT check for "D:"
     * 
     * The scan first tries parseBigEndianDate and parseTZoffset
     * and then tries parseSimpleDate with appropriate formats, 
     * again followed by parseTZoffset. If at any stage the entire 
     * text is consumed, that date value is returned immediately. 
     * Otherwise the date that consumes the longest initial part
     * of the text is returned.
     * 
     * - PDF format dates are among those recognized by parseBigEndianDate.
     * - The formats tried are alphaStartFormats or digitStartFormat and
     * any listed in the value of moreFmts.
     */
    private static Calendar parseDate(String text, ParsePosition initialWhere)
    {
        if (text == null || text.isEmpty())
        {
            return null;
        }

        // remember longestr date string
        int longestLen = -999999;
        // theorem: the above value will never be used
        // proof: longestLen is only used if longestDate is not null

        GregorianCalendar longestDate = null; // null says no date found yet
        int whereLen;   // tempcopy of where.getIndex()
        
        ParsePosition where = new ParsePosition(initialWhere.getIndex());
        // check for null (throws exception) and trim off surrounding spaces
        skipOptionals(text, where, " ");
        int startPosition = where.getIndex();

        // try big-endian parse
        GregorianCalendar retCal = parseBigEndianDate(text, where);
        // check for success and a timezone
        if (retCal != null && (where.getIndex() == text.length() ||
                               parseTZoffset(text, retCal, where)))
        {
            // if text is fully consumed, return the date else remember it and its length
            whereLen = where.getIndex();
            if (whereLen == text.length()) 
            {
                initialWhere.setIndex(whereLen);
                return retCal;
            }
            longestLen = whereLen;
            longestDate = retCal;
        }

        // try one of the sets of standard formats
        where.setIndex(startPosition);
        String [] formats 
                = Character.isDigit(text.charAt(startPosition))
                ? DIGIT_START_FORMATS
                : ALPHA_START_FORMATS;
        retCal = parseSimpleDate(text, formats, where);
        // check for success and a timezone
        if (retCal != null &&
                (where.getIndex() == text.length() ||
                 parseTZoffset(text, retCal, where)))
        {
            // if text is fully consumed, return the date else remember it and its length
            whereLen = where.getIndex();
            if (whereLen == text.length()) 
            {
                initialWhere.setIndex(whereLen);
                return retCal;
            }
            if (whereLen > longestLen) 
            {
                longestLen = whereLen;
                longestDate = retCal;
            }
        }

        if (longestDate != null) 
        {
            initialWhere.setIndex(longestLen);
            return longestDate;
        }
        return retCal;
    }
       
    /**
     * Returns the Calendar for a given COS string containing a date,
     * or {@code null} if it cannot be parsed.
     *
     * The returned value will have 0 for DST_OFFSET.
     * 
     * @param text A COS string containing a date.
     * @return The Calendar that the text string represents, or {@code null} if it cannot be parsed.
     */
    public static Calendar toCalendar(COSString text)
    {
        if (text == null)
        {
            return null;    
        }
        return toCalendar(text.getString());
    }

    /**
     * Returns the Calendar for a given string containing a date,
     * or {@code null} if it cannot be parsed.
     *
     * The returned value will have 0 for DST_OFFSET.
     *
     * @param text A COS string containing a date.
     * @return The Calendar that the text string represents, or {@code null} if it cannot be parsed.
     */
    public static Calendar toCalendar(String text)
    {
        if (text == null || text.trim().isEmpty())
        {
            return null;
        }

        ParsePosition where = new ParsePosition(0);
        skipOptionals(text, where, " ");
        skipString(text, "D:", where);
        Calendar calendar = parseDate(text, where);

        if (calendar == null || where.getIndex() != text.length())
        {
            // the date string is invalid
            return null;
        }
        return calendar;
    }
}