/* ====================================================================
   Licensed to the Apache Software Foundation (ASF) under one or more
   contributor license agreements.  See the NOTICE file distributed with
   this work for additional information regarding copyright ownership.
   The ASF licenses this file to You under the Apache License, Version 2.0
   (the "License"); you may not use this file except in compliance with
   the License.  You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
==================================================================== */

package org.apache.poi.hwpf.model;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.IdentityHashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;

import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.poi.hwpf.sprm.SprmBuffer;
import org.apache.poi.hwpf.sprm.SprmIterator;
import org.apache.poi.hwpf.sprm.SprmOperation;
import org.apache.poi.poifs.common.POIFSConstants;
import org.apache.poi.util.Internal;
import org.apache.poi.util.LittleEndian;

import static java.lang.System.currentTimeMillis;
import static org.apache.logging.log4j.util.Unbox.box;

/**
 * This class represents the bin table of a Word document, and it also serves
 * as a holder for all of the paragraphs of the document that have been loaded
 * into memory.
 */
@Internal
public class PAPBinTable
{
    private static final Logger LOG = LogManager.getLogger(PAPBinTable.class);

    protected final ArrayList<PAPX> _paragraphs = new ArrayList<>();

    public PAPBinTable()
    {
    }

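    /**
     * Reads the PAPX bin table from the table stream and loads every PAPX it
     * references from the formatted disk pages (FKPs) in the main document
     * stream.
     *
     * @param documentStream the main document stream containing the FKPs
     * @param tableStream the table stream containing the bin table
     * @param dataStream the data stream, passed through to the disk pages
     * @param offset offset of the bin table within the table stream
     * @param size size of the bin table in bytes
     * @param charIndexTranslator translates between byte and character indices
     */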
    public PAPBinTable( byte[] documentStream, byte[] tableStream,
            byte[] dataStream, int offset, int size,
            CharIndexTranslator charIndexTranslator )
    {
        long start = currentTimeMillis();

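        // Each bin table entry maps a byte range of document text to the
        // number of the 512-byte FKP page in the main stream that holds the
        // actual PAPX records for that range.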
        {
            PlexOfCps binTable = new PlexOfCps( tableStream, offset, size, 4 );

            int length = binTable.length();
            for ( int x = 0; x < length; x++ )
            {
                GenericPropertyNode node = binTable.getProperty( x );

                int pageNum = LittleEndian.getInt( node.getBytes() );
                int pageOffset = POIFSConstants.SMALLER_BIG_BLOCK_SIZE * pageNum;

                PAPFormattedDiskPage pfkp = new PAPFormattedDiskPage(
                        documentStream, dataStream, pageOffset,
                        charIndexTranslator );

                for ( PAPX papx : pfkp.getPAPXs() )
                {
                    if ( papx != null )
                        _paragraphs.add( papx );
                }
            }
        }

        LOG.atDebug().log("PAPX tables loaded in {} ms ({} elements)", box(currentTimeMillis() - start),box(_paragraphs.size()));

        if ( _paragraphs.isEmpty() )
        {
            LOG.atWarn().log("PAPX FKPs are empty");
            _paragraphs.add( new PAPX( 0, 0, new SprmBuffer( 2 ) ) );
        }
    }

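    /**
     * Rebuilds the PAPX list held by this table so that every paragraph of the
     * given document text is covered, merging in paragraph-level SPRMs from
     * the fast-saved grpprls of the complex file table if one is present.
     */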
    public void rebuild( final StringBuilder docText,
            ComplexFileTable complexFileTable )
    {
        rebuild( docText, complexFileTable, _paragraphs );
    }

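    /**
     * Implementation of {@link #rebuild(StringBuilder, ComplexFileTable)} that
     * operates on an explicit list of PAPX entries.
     */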
    static void rebuild( final StringBuilder docText,
            ComplexFileTable complexFileTable, List<PAPX> paragraphs )
    {
        long start = currentTimeMillis();

        if ( complexFileTable != null )
        {
            SprmBuffer[] sprmBuffers = complexFileTable.getGrpprls();

            // adding PAPX from fast-saved SPRMs
            for ( TextPiece textPiece : complexFileTable.getTextPieceTable()
                    .getTextPieces() )
            {
                PropertyModifier prm = textPiece.getPieceDescriptor().getPrm();
                if ( !prm.isComplex() )
                    continue;
                int igrpprl = prm.getIgrpprl();

                if ( igrpprl < 0 || igrpprl >= sprmBuffers.length )
                {
                    LOG.atWarn().log("{}'s PRM references to unknown grpprl", textPiece);
                    continue;
                }

                boolean hasPap = false;
                SprmBuffer sprmBuffer = sprmBuffers[igrpprl];
                for ( SprmIterator iterator = sprmBuffer.iterator(); iterator
                        .hasNext(); )
                {
                    SprmOperation sprmOperation = iterator.next();
                    if ( sprmOperation.getType() == SprmOperation.TYPE_PAP )
                    {
                        hasPap = true;
                        break;
                    }
                }

                if ( hasPap )
                {
                    SprmBuffer newSprmBuffer = new SprmBuffer( 2 );
                    newSprmBuffer.append( sprmBuffer.toByteArray() );

                    PAPX papx = new PAPX( textPiece.getStart(),
                            textPiece.getEnd(), newSprmBuffer );
                    paragraphs.add( papx );
                }
            }

            LOG.atDebug().log("Merged (?) with PAPX from complex file table in {} ms ({} elements in total)", box(currentTimeMillis() - start),box(paragraphs.size()));
            start = currentTimeMillis();
        }

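        // Sort a copy of the PAPX entries by end position so they can be
        // matched against paragraph boundaries in one forward pass, while an
        // identity map preserves the original file order, which is the order
        // used when merging overlapping PAPX below.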
        List<PAPX> oldPapxSortedByEndPos = new ArrayList<>(paragraphs);
        oldPapxSortedByEndPos.sort(PropertyNode.EndComparator);

        LOG.atDebug().log("PAPX sorted by end position in {} ms", box(currentTimeMillis() - start));
        start = currentTimeMillis();

        final Map<PAPX, Integer> papxToFileOrder = new IdentityHashMap<>();
        {
            int counter = 0;
            for ( PAPX papx : paragraphs )
            {
                papxToFileOrder.put( papx, Integer.valueOf( counter++ ) );
            }
        }
        final Comparator<PAPX> papxFileOrderComparator = new Comparator<PAPX>()
        {
            public int compare( PAPX o1, PAPX o2 )
            {
                Integer i1 = papxToFileOrder.get( o1 );
                Integer i2 = papxToFileOrder.get( o2 );
                return i1.compareTo( i2 );
            }
        };

        LOG.atDebug().log("PAPX's order map created in {} ms", box(currentTimeMillis() - start));
        start = currentTimeMillis();

        List<PAPX> newPapxs = new LinkedList<>();
        int lastParStart = 0;
        int lastPapxIndex = 0;
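        // Scan the document text: a carriage return (13), cell mark (7) or
        // page break / form feed (12) ends a paragraph. For each paragraph,
        // collect every PAPX run that ends inside it.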
        for ( int charIndex = 0; charIndex < docText.length(); charIndex++ )
        {
            final char c = docText.charAt( charIndex );
            if ( c != 13 && c != 7 && c != 12 )
                continue;

            final int startInclusive = lastParStart;
            final int endExclusive = charIndex + 1;

            boolean broken = false;
            List<PAPX> papxs = new LinkedList<>();
            for ( int papxIndex = lastPapxIndex; papxIndex < oldPapxSortedByEndPos
                    .size(); papxIndex++ )
            {
                broken = false;
                PAPX papx = oldPapxSortedByEndPos.get( papxIndex );

                assert startInclusive == 0
                        || papxIndex + 1 == oldPapxSortedByEndPos.size()
                        || papx.getEnd() > startInclusive;

                if ( papx.getEnd() - 1 > charIndex )
                {
                    lastPapxIndex = papxIndex;
                    broken = true;
                    break;
                }

                papxs.add( papx );
            }
            if ( !broken )
            {
                lastPapxIndex = oldPapxSortedByEndPos.size() - 1;
            }

            if ( papxs.isEmpty() )
            {
                LOG.atWarn().log("Paragraph [{}; {}) has no PAPX. Creating new one.", box(startInclusive),box(endExclusive));
                // create it manually
                PAPX papx = new PAPX( startInclusive, endExclusive,
                        new SprmBuffer( 2 ) );
                newPapxs.add( papx );

                lastParStart = endExclusive;
                continue;
            }

            if ( papxs.size() == 1 )
            {
                // can we reuse existing?
                PAPX existing = papxs.get( 0 );
                if ( existing.getStart() == startInclusive
                        && existing.getEnd() == endExclusive )
                {
                    newPapxs.add( existing );
                    lastParStart = endExclusive;
                    continue;
                }
            }

            // restore file order of PAPX
            papxs.sort(papxFileOrderComparator);

            SprmBuffer sprmBuffer = null;
            for ( PAPX papx : papxs )
            {
                if ( papx.getGrpprl() == null || papx.getGrpprl().length <= 2 )
                    continue;

                if ( sprmBuffer == null ) {
                    sprmBuffer = papx.getSprmBuf().copy();
                } else {
                    sprmBuffer.append( papx.getGrpprl(), 2 );
                }
            }
            PAPX newPapx = new PAPX( startInclusive, endExclusive, sprmBuffer );
            newPapxs.add( newPapx );

            lastParStart = endExclusive;
            continue;
        }
        paragraphs.clear();
        paragraphs.addAll( newPapxs );

        LOG.atDebug().log("PAPX rebuilded from document text in {} ms ({} elements)", box(currentTimeMillis() - start),box(paragraphs.size()));
    }

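    /**
     * Inserts a new zero-length PAPX with the given properties at character
     * position {@code cpStart}. If that position falls inside an existing
     * PAPX, the existing one is split and its properties are cloned onto the
     * part that follows the insertion point.
     */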
    public void insert(int listIndex, int cpStart, SprmBuffer buf)
    {

        PAPX forInsert = new PAPX(0, 0, buf);

        // Ensure character offsets are really characters
        forInsert.setStart(cpStart);
        forInsert.setEnd(cpStart);

        if (listIndex == _paragraphs.size())
        {
             _paragraphs.add(forInsert);
        }
        else
        {
            PAPX currentPap = _paragraphs.get(listIndex);
            if (currentPap != null && currentPap.getStart() < cpStart)
            {
                SprmBuffer clonedBuf = currentPap.getSprmBuf().copy();

                // Copy the properties of the one before to afterwards
                // Will go:
                //    Original, until insert at point
                //    New one
                //    Clone of original, on to the old end
                PAPX clone = new PAPX(0, 0, clonedBuf);
                // Again ensure contains character based offsets no matter what
                clone.setStart(cpStart);
                clone.setEnd(currentPap.getEnd());

                currentPap.setEnd(cpStart);

                _paragraphs.add(listIndex + 1, forInsert);
                _paragraphs.add(listIndex + 2, clone);
            }
            else
            {
                _paragraphs.add(listIndex, forInsert);
            }
        }

    }

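    /**
     * Adjusts PAPX boundaries after {@code length} characters have been
     * deleted at {@code offset}: entries inside the deleted region are
     * collapsed to the deletion point and all following entries are shifted
     * back by {@code length}.
     */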
    public void adjustForDelete(int listIndex, int offset, int length)
    {
        int size = _paragraphs.size();
        int endMark = offset + length;
        int endIndex = listIndex;

        PAPX papx = _paragraphs.get(endIndex);
        while (papx.getEnd() < endMark)
        {
            papx = _paragraphs.get(++endIndex);
        }
        if (listIndex == endIndex)
        {
            papx = _paragraphs.get(endIndex);
            papx.setEnd((papx.getEnd() - endMark) + offset);
        }
        else
        {
            papx = _paragraphs.get(listIndex);
            papx.setEnd(offset);
            for (int x = listIndex + 1; x < endIndex; x++)
            {
                papx = _paragraphs.get(x);
                papx.setStart(offset);
                papx.setEnd(offset);
            }
            papx = _paragraphs.get(endIndex);
            papx.setEnd((papx.getEnd() - endMark) + offset);
        }

        for (int x = endIndex + 1; x < size; x++)
        {
            papx = _paragraphs.get(x);
            papx.setStart(papx.getStart() - length);
            papx.setEnd(papx.getEnd() - length);
        }
    }


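    /**
     * Adjusts PAPX boundaries after {@code length} characters have been
     * inserted into the entry at {@code listIndex}: that entry is extended and
     * all following entries are shifted forward by {@code length}.
     */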
    public void adjustForInsert(int listIndex, int length)
    {
        int size = _paragraphs.size();
        PAPX papx = _paragraphs.get(listIndex);
        papx.setEnd(papx.getEnd() + length);

        for (int x = listIndex + 1; x < size; x++)
        {
            papx = _paragraphs.get(x);
            papx.setStart(papx.getStart() + length);
            papx.setEnd(papx.getEnd() + length);
        }
    }


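    /**
     * @return the live list of PAPX entries backing this table
     */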
    public ArrayList<PAPX> getParagraphs()
    {
        return _paragraphs;
    }

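    /**
     * Writes the PAPX entries out as 512-byte FKP pages appended to the main
     * document stream, and writes the bin table that indexes those pages to
     * the table stream.
     */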
    public void writeTo( ByteArrayOutputStream wordDocumentStream,
            ByteArrayOutputStream tableStream, CharIndexTranslator translator )
            throws IOException
    {

        PlexOfCps binTable = new PlexOfCps(4);

        // each FKP must start on a 512 byte page.
        int docOffset = wordDocumentStream.size();
        int mod = docOffset % POIFSConstants.SMALLER_BIG_BLOCK_SIZE;
        if (mod != 0)
        {
            byte[] padding = new byte[POIFSConstants.SMALLER_BIG_BLOCK_SIZE - mod];
            wordDocumentStream.write(padding);
        }

        // get the page number for the first fkp
        docOffset = wordDocumentStream.size();
        int pageNum = docOffset/POIFSConstants.SMALLER_BIG_BLOCK_SIZE;

        // get the ending fc
        // int endingFc = _paragraphs.get(_paragraphs.size() - 1).getEnd();
        // endingFc += fcMin;
        int endingFc = translator.getByteIndex( _paragraphs.get(_paragraphs.size() - 1 ).getEnd() );

        ArrayList<PAPX> overflow = _paragraphs;
        do
        {
            PAPX startingProp = overflow.get(0);

            // int start = startingProp.getStart() + fcMin;
            int start = translator.getByteIndex( startingProp.getStart() );

            PAPFormattedDiskPage pfkp = new PAPFormattedDiskPage();
            pfkp.fill(overflow);

            byte[] bufFkp = pfkp.toByteArray(tableStream, translator);
            wordDocumentStream.write(bufFkp);
            overflow = pfkp.getOverflow();

            int end = endingFc;
            if (overflow != null)
            {
                // end = overflow.get(0).getStart() + fcMin;
                end = translator.getByteIndex( overflow.get( 0 ).getStart() );
            }

            byte[] intHolder = new byte[4];
            LittleEndian.putInt(intHolder, 0, pageNum++);
            binTable.addProperty(new GenericPropertyNode(start, end, intHolder));

        }
        while (overflow != null);
        tableStream.write(binTable.toByteArray());
    }
}