// org.apache.poi.hssf.record.SSTDeserializer Maven / Gradle / Ivy


/* ====================================================================
   Copyright 2002-2004   Apache Software Foundation

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
==================================================================== */
        

package org.apache.poi.hssf.record;

import org.apache.poi.util.BinaryTree;
import org.apache.poi.util.LittleEndian;
import org.apache.poi.util.LittleEndianConsts;

/**
 * Handles the task of deserializing a SST string.  The two main entry points are
 * {@link #manufactureStrings(byte[], int)} and {@link #processContinueRecord(byte[])}.
 *
 * @author Glen Stampoultzis (glens at apache.org)
 * @author Jason Height (jheight at apache.org)
 */
class SSTDeserializer
{

    /** Destination table for completed strings; each string is keyed by the table size at insertion time. */
    private final BinaryTree strings;
    /** Number of characters of the current string that have been read so far (i.e. prior to the continuation). */
    private int continuationReadChars;
    /** The string we were working on before hitting the end of the current record. This string is NOT finished. */
    private String unfinishedString;
    /** True if the current string uses wide (two byte) characters. */
    private boolean wideChar;
    /** True if the current string is a rich text string. */
    private boolean richText;
    /** True if the current string carries extended data (far east or some other unusual string). */
    private boolean extendedText;
    /**
     * Number of formatting runs in this rich text field.  Stored as an int because the
     * value in the record is an unsigned 16 bit quantity.
     */
    private int runCount;
    /** Number of characters in the current string, as declared by its header. */
    private int charCount;
    /** Size in bytes of the extended data that trails the current string (0 when not extended). */
    private int extensionLength;
    /**
     * Number of bytes to skip at the start of the next continue record.  Non-zero only when
     * every character of a string was read but its trailing data (formatting runs, extended
     * data) ran over the end of the record.
     */
    private int continueSkipBytes = 0;


    /**
     * Creates a deserializer that adds every completed string to the supplied table.
     *
     * @param strings the table that receives the deserialized strings
     */
    public SSTDeserializer( BinaryTree strings )
    {
        this.strings = strings;
        initVars();
    }

    /**
     * Resets the per-string parsing state.  Note that {@code charCount} and
     * {@code extensionLength} are deliberately left untouched; they are overwritten
     * when the next string header is read.
     */
    private void initVars()
    {
        runCount = 0;
        continuationReadChars = 0;
        unfinishedString = "";
        wideChar = false;
        richText = false;
        extendedText = false;
        continueSkipBytes = 0;
    }

    /**
     * This is the starting point where strings are constructed.  Note that
     * strings may span across multiple continuations. Read the SST record
     * carefully before beginning to hack.
     *
     * @param data          the record data holding the strings
     * @param initialOffset the offset within {@code data} of the first string
     * @throws RecordFormatException if fewer than two bytes remain, so the length of the
     *                               last string cannot be read
     */
    public void manufactureStrings( final byte[] data, final int initialOffset )
    {
        initVars();

        int offset = initialOffset;
        final int dataSize = data.length;
        while ( offset < dataSize )
        {
            // always > 0 inside the loop
            int remaining = dataSize - offset;

            if ( remaining < LittleEndianConsts.SHORT_SIZE )
            {
                throw new RecordFormatException( "Cannot get length of the last string in SSTRecord" );
            }
            if ( remaining == LittleEndianConsts.SHORT_SIZE )
            {
                // Only the two length bytes are left in this record.  The original author
                // (JMH) was unsure about this case; the length itself is not recorded here.
                setContinuationCharsRead( 0 );
                unfinishedString = "";
                break;
            }
            charCount = LittleEndian.getUShort( data, offset );
            int charsRead = charCount;
            readStringHeader( data, offset );
            boolean stringContinuesOverContinuation = remaining < totalStringSize();
            if ( stringContinuesOverContinuation )
            {
                int remainingBytes = dataSize - offset - stringHeaderOverhead();
                // Only read the size of the string or whatever is left before the
                // continuation
                charsRead = Math.min( charsRead, calculateCharCount( remainingBytes ) );
                setContinuationCharsRead( charsRead );
                if ( charsRead == charCount )
                {
                    // All of the characters will have been read, but the entire string
                    // (including formatting runs etc) has not.  Compute the number of
                    // bytes to skip when the continue record starts.
                    continueSkipBytes = offsetForContinuedRecord( 0 )
                            - ( remainingBytes - calculateByteCount( charsRead ) );
                }
            }
            processString( data, offset, charsRead );
            offset += totalStringSize();
            if ( stringContinuesOverContinuation )
            {
                break;
            }
        }
    }

    /**
     * Determines the option types for the string (ie, compressed or uncompressed unicode, rich text string or
     * plain string etc) and reads the formatting run count and extended data length that follow
     * the option byte when the corresponding flags are set.
     *
     * @param data  the record data
     * @param index the offset of the string's two byte character count
     */
    private void readStringHeader( final byte[] data, final int index )
    {
        byte optionFlag = data[index + LittleEndianConsts.SHORT_SIZE];

        wideChar = ( optionFlag & 1 ) == 1;
        extendedText = ( optionFlag & 4 ) == 4;
        richText = ( optionFlag & 8 ) == 8;

        runCount = 0;
        if ( richText )
        {
            // The run count is an unsigned 16 bit value; reading it as a signed short
            // would turn counts above 32767 negative and corrupt every size calculation.
            runCount = LittleEndian.getUShort( data, index + SSTRecord.STRING_MINIMAL_OVERHEAD );
        }

        extensionLength = 0;
        if ( extendedText )
        {
            // The extension length follows the run count when the string is also rich text.
            extensionLength = LittleEndian.getInt( data, index + SSTRecord.STRING_MINIMAL_OVERHEAD
                    + ( richText ? LittleEndianConsts.SHORT_SIZE : 0 ) );
        }
    }


    /**
     * Reads a string or the first part of a string.  A finished string is added to the
     * string table; an unfinished one is remembered in {@code unfinishedString}.
     *
     * @param data       the record data
     * @param dataIndex  the offset of the string's two byte character count
     * @param characters the number of characters to read from this record
     *
     * @return the number of bytes of character data read.
     */
    private int processString( final byte[] data, final int dataIndex, final int characters )
    {
        // length is the length we store it as.  not the length that is read.
        int length = SSTRecord.STRING_MINIMAL_OVERHEAD + calculateByteCount( characters );
        byte[] unicodeStringBuffer = new byte[length];

        int offset = 0;

        // Set the length in characters
        LittleEndian.putUShort( unicodeStringBuffer, offset, characters );
        offset += LittleEndianConsts.SHORT_SIZE;
        // Set the option flags
        unicodeStringBuffer[offset] = data[dataIndex + offset];
        // Copy in the string data, skipping the optional run count / extension length fields
        int bytesRead = unicodeStringBuffer.length - SSTRecord.STRING_MINIMAL_OVERHEAD;
        arraycopy( data, dataIndex + stringHeaderOverhead(), unicodeStringBuffer,
                SSTRecord.STRING_MINIMAL_OVERHEAD, bytesRead );
        // Create the unicode string
        UnicodeString string = new UnicodeString( UnicodeString.sid,
                (short) unicodeStringBuffer.length,
                unicodeStringBuffer );
        setContinuationCharsRead( calculateCharCount( bytesRead ) );

        if ( isStringFinished() )
        {
            Integer integer = new Integer( strings.size() );
            addToStringTable( strings, integer, string );
        }
        else
        {
            unfinishedString = string.getString();
        }

        return bytesRead;
    }

    /**
     * @return true if all characters declared for the current string have been read
     */
    private boolean isStringFinished()
    {
        return getContinuationCharsRead() == charCount;
    }

    /**
     * Okay, we are doing some major cheating here. Because we can't handle rich text strings properly
     * we end up getting duplicate strings.  To get around this I'm doing two things: 1. Converting rich
     * text to normal text and 2. If there's a duplicate I'm adding a space onto the end.  Sneaky perhaps
     * but it gets the job done until we can handle this a little better.
     *
     * @param strings the table to add the string to
     * @param integer the key under which the string is stored
     * @param string  the string to add; its rich text and extended flags are cleared, and spaces
     *                are appended to its text for as long as the table reports it as a duplicate
     * @throws IllegalArgumentException if the table already contains the key (appending spaces
     *                                  to the value could never resolve that conflict)
     */
    static public void addToStringTable( BinaryTree strings, Integer integer, UnicodeString string )
    {
        if ( strings.containsKey( integer ) )
        {
            throw new IllegalArgumentException( "String table already contains an entry for index " + integer );
        }

        if ( string.isRichText() )
        {
            string.setOptionFlags( (byte) ( string.getOptionFlags() & ( ~8 ) ) );
        }
        if ( string.isExtendedText() )
        {
            string.setOptionFlags( (byte) ( string.getOptionFlags() & ( ~4 ) ) );
        }

        boolean added = false;
        while ( !added )
        {
            try
            {
                strings.put( integer, string );
                added = true;
            }
            catch ( IllegalArgumentException duplicateValue )
            {
                // The table refuses duplicate values; make this one unique and retry.
                // Any other failure is allowed to propagate instead of looping forever.
                string.setString( string.getString() + " " );
            }
        }
    }


    /**
     * Converts a byte count into a character count using the current character width.
     */
    private int calculateCharCount( final int byte_count )
    {
        return byte_count / ( wideChar ? LittleEndianConsts.SHORT_SIZE : LittleEndianConsts.BYTE_SIZE );
    }

    /**
     * Process a Continue record. A Continue record for an SST record
     * contains the same kind of data that the SST record contains,
     * with the following exceptions:
     * <ul>
     * <li>The string counts at the beginning of the SST record are
     *     not in the Continue record</li>
     * <li>The first string in the Continue record might NOT begin
     *     with a size. If the last string in the previous record is
     *     continued in this record, the size is determined by that
     *     last string in the previous record; the first string will
     *     begin with a flag byte, followed by the remaining bytes (or
     *     words) of the last string from the previous
     *     record. Otherwise, the first string in the record will
     *     begin with a string length</li>
     * </ul>
     *
     * @param record the Continue record's byte data
     */
    public void processContinueRecord( final byte[] record )
    {
        if ( isStringFinished() )
        {
            // capture the skip count before initVars() clears it
            final int offset = continueSkipBytes;
            initVars();
            manufactureStrings( record, offset );
        }
        else
        {
            // reset the wide bit because that can change across a continuation. the fact that it's
            // actually rich text doesn't change across continuations even though the rich text
            // may no longer be set in the "new" option flag.  confusing huh?
            wideChar = ( record[0] & 1 ) == 1;

            if ( stringSpansContinuation( record.length - LittleEndianConsts.BYTE_SIZE ) )
            {
                processEntireContinuation( record );
            }
            else
            {
                readStringRemainder( record );
            }
        }
    }

    /**
     * Reads the remainder string and any subsequent strings from the continuation record.
     *
     * @param record  The entire continuation record data.
     */
    private void readStringRemainder( final byte[] record )
    {
        int remainingChars = charCount - getContinuationCharsRead();
        int stringRemainderSizeInBytes = calculateByteCount( remainingChars );
        byte[] unicodeStringData = new byte[SSTRecord.STRING_MINIMAL_OVERHEAD
                + stringRemainderSizeInBytes];

        // write the string length
        LittleEndian.putUShort( unicodeStringData, 0, remainingChars );

        // write the options flag
        unicodeStringData[LittleEndianConsts.SHORT_SIZE] = createOptionByte( wideChar, richText, extendedText );

        // copy the bytes/words making up the string; skipping
        // past the option byte that starts the continuation
        arraycopy( record, LittleEndianConsts.BYTE_SIZE, unicodeStringData,
                SSTRecord.STRING_MINIMAL_OVERHEAD,
                stringRemainderSizeInBytes );

        // use special constructor to create the final string
        UnicodeString string = new UnicodeString( UnicodeString.sid,
                (short) unicodeStringData.length, unicodeStringData,
                unfinishedString );
        Integer integer = new Integer( strings.size() );

        addToStringTable( strings, integer, string );

        // carry on with whatever strings follow the remainder in this record
        int newOffset = offsetForContinuedRecord( stringRemainderSizeInBytes );
        manufactureStrings( record, newOffset );
    }

    /**
     * Calculates the size of the string in bytes based on the character width
     */
    private int stringSizeInBytes()
    {
        return calculateByteCount( charCount );
    }

    /**
     * Calculates the size of the string in bytes.  This figure includes all the over
     * heads for the string: header, formatting runs and extended data.
     */
    private int totalStringSize()
    {
        return stringSizeInBytes()
                + stringHeaderOverhead()
                + LittleEndianConsts.INT_SIZE * runCount
                + extensionLength;
    }

    /**
     * Calculates the size in bytes of the current string's header: the minimal overhead
     * plus the optional run count and extension length fields.
     */
    private int stringHeaderOverhead()
    {
        return SSTRecord.STRING_MINIMAL_OVERHEAD
                + ( richText ? LittleEndianConsts.SHORT_SIZE : 0 )
                + ( extendedText ? LittleEndianConsts.INT_SIZE : 0 );
    }

    /**
     * Calculates the offset within a continuation record at which the next string starts.
     *
     * @param stringRemainderSizeInBytes the number of character bytes of the current string
     *                                   held by the continuation record
     */
    private int offsetForContinuedRecord( int stringRemainderSizeInBytes )
    {
        int offset = stringRemainderSizeInBytes + runCount * LittleEndianConsts.INT_SIZE + extensionLength;
        if ( stringRemainderSizeInBytes != 0 )
        {
            // If a portion of the string remains then the wideChar options byte is repeated,
            // so need to skip this.
            offset += LittleEndianConsts.BYTE_SIZE;
        }
        return offset;
    }

    /**
     * Builds an option byte from the individual flags (bit 0 wide, bit 2 far east/extended, bit 3 rich text).
     */
    private byte createOptionByte( boolean wideChar, boolean richText, boolean farEast )
    {
        return (byte) ( ( wideChar ? 1 : 0 ) + ( farEast ? 4 : 0 ) + ( richText ? 8 : 0 ) );
    }

    /**
     * If the continued record is so long it spans into the next continue then
     * simply suck the remaining string data into the existing unfinishedString.
     *
     * @param record    The data from the continuation record.
     */
    private void processEntireContinuation( final byte[] record )
    {
        // create artificial data to create a UnicodeString: a two byte character count
        // followed by the whole record (option byte + character data)
        int dataLengthInBytes = record.length - LittleEndianConsts.BYTE_SIZE;
        byte[] unicodeStringData = new byte[record.length + LittleEndianConsts.SHORT_SIZE];

        int charsRead = calculateCharCount( dataLengthInBytes );
        LittleEndian.putUShort( unicodeStringData, 0, charsRead );
        arraycopy( record, 0, unicodeStringData, LittleEndianConsts.SHORT_SIZE, record.length );
        UnicodeString ucs = new UnicodeString( UnicodeString.sid, (short) unicodeStringData.length,
                unicodeStringData, unfinishedString );

        unfinishedString = ucs.getString();
        setContinuationCharsRead( getContinuationCharsRead() + charsRead );
        if ( isStringFinished() )
        {
            Integer integer = new Integer( strings.size() );
            addToStringTable( strings, integer, ucs );
        }
    }

    /**
     * @return true if the unread part of the current string needs more bytes than the
     *         continuation record provides
     */
    private boolean stringSpansContinuation( int continuationSizeInBytes )
    {
        return calculateByteCount( charCount - getContinuationCharsRead() ) > continuationSizeInBytes;
    }

    /**
     * @return the number of characters of the current string that have been read so far
     */
    int getContinuationCharsRead()
    {
        return continuationReadChars;
    }

    private void setContinuationCharsRead( final int count )
    {
        continuationReadChars = count;
    }

    /**
     * Converts a character count into a byte count using the current character width.
     */
    private int calculateByteCount( final int character_count )
    {
        return character_count * ( wideChar ? LittleEndianConsts.SHORT_SIZE : LittleEndianConsts.BYTE_SIZE );
    }


    /**
     * Copies {@code length} bytes from {@code src}, starting at {@code src_position}, into
     * {@code dst}, starting at {@code dst_position}.  This is a thin wrapper around
     * {@link System#arraycopy(Object, int, Object, int, int)} and shares its contract.
     *
     * @param      src          the source array.
     * @param      src_position start position in the source array.
     * @param      dst          the destination array.
     * @param      dst_position start position in the destination data.
     * @param      length       the number of array elements to be copied.
     * @exception  IndexOutOfBoundsException  if copying would cause
     *               access of data outside array bounds.
     * @exception  NullPointerException if either src or
     *               dst is null.
     */
    private void arraycopy( byte[] src, int src_position,
                            byte[] dst, int dst_position,
                            int length )
    {
        System.arraycopy( src, src_position, dst, dst_position, length );
    }

    /**
     * @return the unfinished string
     */
    String getUnfinishedString()
    {
        return unfinishedString;
    }

    /**
     * @return true if current string uses wide characters
     */
    boolean isWideChar()
    {
        return wideChar;
    }


}