org.apache.poi.hdf.model.HDFObjectFactory Maven / Gradle / Ivy
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.hdf.model;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import org.apache.poi.hdf.event.HDFLowLevelParsingListener;
import org.apache.poi.hdf.model.hdftypes.CHPFormattedDiskPage;
import org.apache.poi.hdf.model.hdftypes.ChpxNode;
import org.apache.poi.hdf.model.hdftypes.DocumentProperties;
import org.apache.poi.hdf.model.hdftypes.FileInformationBlock;
import org.apache.poi.hdf.model.hdftypes.FontTable;
import org.apache.poi.hdf.model.hdftypes.FormattedDiskPage;
import org.apache.poi.hdf.model.hdftypes.ListTables;
import org.apache.poi.hdf.model.hdftypes.PAPFormattedDiskPage;
import org.apache.poi.hdf.model.hdftypes.PapxNode;
import org.apache.poi.hdf.model.hdftypes.PlexOfCps;
import org.apache.poi.hdf.model.hdftypes.SepxNode;
import org.apache.poi.hdf.model.hdftypes.StyleSheet;
import org.apache.poi.hdf.model.hdftypes.TextPiece;
import org.apache.poi.hdf.model.util.ParsingState;
import org.apache.poi.poifs.filesystem.DocumentEntry;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.util.LittleEndian;
/**
* The Object Factory takes in a stream and creates the low level objects
* that represent the data.
* @author andy
*/
@Deprecated
public final class HDFObjectFactory {
/** OLE stuff*/
private POIFSFileSystem _filesystem;
/** The FIB*/
private FileInformationBlock _fib;
/** Used to set up the object model*/
private HDFLowLevelParsingListener _listener;
/** parsing state for characters */
private ParsingState _charParsingState;
/** parsing state for paragraphs */
private ParsingState _parParsingState;
/** main document stream buffer*/
byte[] _mainDocument;
/** table stream buffer*/
byte[] _tableBuffer;
@SuppressWarnings("unused")
public static void main(String args[])
{
try
{
HDFObjectFactory f = new HDFObjectFactory(new FileInputStream("c:\\test.doc"));
int k = 0;
}
catch(Exception t)
{
t.printStackTrace();
}
}
/** Creates a new instance of HDFObjectFactory
*
* @param istream The InputStream that is the Word document
*
*/
protected HDFObjectFactory(InputStream istream, HDFLowLevelParsingListener l) throws IOException
{
if (l == null)
{
_listener = new HDFObjectModel();
}
else
{
_listener = l;
}
//do Ole stuff
_filesystem = new POIFSFileSystem(istream);
DocumentEntry headerProps =
(DocumentEntry)_filesystem.getRoot().getEntry("WordDocument");
_mainDocument = new byte[headerProps.getSize()];
_filesystem.createDocumentInputStream("WordDocument").read(_mainDocument);
_fib = new FileInformationBlock(_mainDocument);
initTableStream();
initTextPieces();
initFormattingProperties();
}
/** Creates a new instance of HDFObjectFactory
*
* @param istream The InputStream that is the Word document
*
*/
public HDFObjectFactory(InputStream istream) throws IOException
{
this(istream, null);
}
public static List getTypes(InputStream istream) throws IOException
{
List results = new ArrayList(1);
//do Ole stuff
POIFSFileSystem filesystem = new POIFSFileSystem(istream);
DocumentEntry headerProps =
(DocumentEntry)filesystem.getRoot().getEntry("WordDocument");
byte[] mainDocument = new byte[headerProps.getSize()];
filesystem.createDocumentInputStream("WordDocument").read(mainDocument);
FileInformationBlock fib = new FileInformationBlock(mainDocument);
results.add(fib);
return results;
}
/**
* Initializes the table stream
*
* @throws IOException
*/
private void initTableStream() throws IOException
{
String tablename = null;
if(_fib.isFWhichTblStm())
{
tablename="1Table";
}
else
{
tablename="0Table";
}
DocumentEntry tableEntry = (DocumentEntry)_filesystem.getRoot().getEntry(tablename);
//load the table stream into a buffer
int size = tableEntry.getSize();
_tableBuffer = new byte[size];
_filesystem.createDocumentInputStream(tablename).read(_tableBuffer);
}
/**
* Initializes the text pieces. Text is divided into pieces because some
* "pieces" may only contain unicode characters.
*
* @throws IOException
*/
private void initTextPieces() throws IOException
{
int pos = _fib.getFcClx();
//skips through the prms before we reach the piece table. These contain data
//for actual fast saved files
while (_tableBuffer[pos] == 1)
{
pos++;
int skip = LittleEndian.getShort(_tableBuffer, pos);
pos += 2 + skip;
}
if(_tableBuffer[pos] != 2)
{
throw new IOException("The text piece table is corrupted");
}
//parse out the text pieces
int pieceTableSize = LittleEndian.getInt(_tableBuffer, ++pos);
pos += 4;
int pieces = (pieceTableSize - 4) / 12;
for (int x = 0; x < pieces; x++) {
int filePos = LittleEndian.getInt(_tableBuffer, pos + ((pieces + 1) * 4) + (x * 8) + 2);
boolean unicode = false;
if ((filePos & 0x40000000) == 0) {
unicode = true;
} else {
unicode = false;
filePos &= ~(0x40000000);//gives me FC in doc stream
filePos /= 2;
}
int totLength = LittleEndian.getInt(_tableBuffer, pos + (x + 1) * 4) -
LittleEndian.getInt(_tableBuffer, pos + (x * 4));
TextPiece piece = new TextPiece(filePos, totLength, unicode);
_listener.text(piece);
}
}
/**
* initializes all of the formatting properties for a Word Document
*/
private void initFormattingProperties()
{
createStyleSheet();
createListTables();
createFontTable();
initDocumentProperties();
initSectionProperties();
//initCharacterProperties();
//initParagraphProperties();
}
private void initCharacterProperties(int charOffset, PlexOfCps charPlcf, int start, int end)
{
//Initialize paragraph property stuff
//int currentCharPage = _charParsingState.getCurrentPage();
int charPlcfLen = charPlcf.length();
int currentPageIndex = _charParsingState.getCurrentPageIndex();
FormattedDiskPage fkp = _charParsingState.getFkp();
int currentChpxIndex = _charParsingState.getCurrentPropIndex();
int currentArraySize = fkp.size();
//get the character runs for this paragraph
int charStart = 0;
int charEnd = 0;
//add the character runs
do
{
if (currentChpxIndex < currentArraySize)
{
charStart = fkp.getStart(currentChpxIndex);
charEnd = fkp.getEnd(currentChpxIndex);
byte[] chpx = fkp.getGrpprl(currentChpxIndex);
_listener.characterRun(new ChpxNode(Math.max(charStart, start), Math.min(charEnd, end), chpx));
if (charEnd < end)
{
currentChpxIndex++;
}
else
{
_charParsingState.setState(currentPageIndex, fkp, currentChpxIndex);
break;
}
}
else
{
int currentCharPage = LittleEndian.getInt(_tableBuffer, charOffset + charPlcf.getStructOffset(++currentPageIndex));
byte[] byteFkp = new byte[512];
System.arraycopy(_mainDocument, (currentCharPage * 512), byteFkp, 0, 512);
fkp = new CHPFormattedDiskPage(byteFkp);
currentChpxIndex = 0;
currentArraySize = fkp.size();
}
}
while(currentPageIndex < charPlcfLen);
}
private void initParagraphProperties(int parOffset, PlexOfCps parPlcf, int charOffset, PlexOfCps charPlcf, int start, int end)
{
//Initialize paragraph property stuff
//int currentParPage = _parParsingState.getCurrentPage();
int parPlcfLen = parPlcf.length();
int currentPageIndex = _parParsingState.getCurrentPageIndex();
FormattedDiskPage fkp = _parParsingState.getFkp();
int currentPapxIndex = _parParsingState.getCurrentPropIndex();
int currentArraySize = fkp.size();
do
{
if (currentPapxIndex < currentArraySize)
{
int parStart = fkp.getStart(currentPapxIndex);
int parEnd = fkp.getEnd(currentPapxIndex);
byte[] papx = fkp.getGrpprl(currentPapxIndex);
_listener.paragraph(new PapxNode(Math.max(parStart, start), Math.min(parEnd, end), papx));
initCharacterProperties(charOffset, charPlcf, Math.max(start, parStart), Math.min(parEnd, end));
if (parEnd < end)
{
currentPapxIndex++;
}
else
{
//save the state
_parParsingState.setState(currentPageIndex, fkp, currentPapxIndex);
break;
}
}
else
{
int currentParPage = LittleEndian.getInt(_tableBuffer, parOffset + parPlcf.getStructOffset(++currentPageIndex));
byte byteFkp[] = new byte[512];
System.arraycopy(_mainDocument, (currentParPage * 512), byteFkp, 0, 512);
fkp = new PAPFormattedDiskPage(byteFkp);
currentPapxIndex = 0;
currentArraySize = fkp.size();
}
}
while(currentPageIndex < parPlcfLen);
}
/**
* initializes the CharacterProperties BTree
*/
/*private void initCharacterProperties()
{
int charOffset = _fib.getFcPlcfbteChpx();
int charPlcSize = _fib.getLcbPlcfbteChpx();
//int arraySize = (charPlcSize - 4)/8;
//first we must go through the bin table and find the fkps
for(int x = 0; x < arraySize; x++)
{
//get page number(has nothing to do with document page)
//containing the chpx for the paragraph
int PN = LittleEndian.getInt(_tableBuffer, charOffset + (4 * (arraySize + 1) + (4 * x)));
byte[] fkp = new byte[512];
System.arraycopy(_mainDocument, (PN * 512), fkp, 0, 512);
//take each fkp and get the chpxs
int crun = LittleEndian.getUnsignedByte(fkp, 511);
for(int y = 0; y < crun; y++)
{
//get the beginning fc of each paragraph text run
int fcStart = LittleEndian.getInt(fkp, y * 4);
int fcEnd = LittleEndian.getInt(fkp, (y+1) * 4);
//get the offset in fkp of the papx for this paragraph
int chpxOffset = 2 * LittleEndian.getUnsignedByte(fkp, ((crun + 1) * 4) + y);
//optimization if offset == 0 use "Normal" style
if(chpxOffset == 0)
{
_characterRuns.add(new ChpxNode(fcStart, fcEnd, new byte[0]));
continue;
}
int size = LittleEndian.getUnsignedByte(fkp, chpxOffset);
byte[] chpx = new byte[size];
System.arraycopy(fkp, ++chpxOffset, chpx, 0, size);
//_papTable.put(Integer.valueOf(fcStart), papx);
_characterRuns.add(new ChpxNode(fcStart, fcEnd, chpx));
}
}
}*/
/**
* intializes the Paragraph Properties BTree
*/
@SuppressWarnings("unused")
private void initParagraphProperties()
{
//paragraphs
int parOffset = _fib.getFcPlcfbtePapx();
int parPlcSize = _fib.getLcbPlcfbtePapx();
//characters
int charOffset = _fib.getFcPlcfbteChpx();
int charPlcSize = _fib.getLcbPlcfbteChpx();
PlexOfCps charPlcf = new PlexOfCps(charPlcSize, 4);
PlexOfCps parPlcf = new PlexOfCps(parPlcSize, 4);
//Initialize character property stuff
int currentCharPage = LittleEndian.getInt(_tableBuffer, charOffset + charPlcf.getStructOffset(0));
int charPlcfLen = charPlcf.length();
int currentPageIndex = 0;
byte[] fkp = new byte[512];
System.arraycopy(_mainDocument, (currentCharPage * 512), fkp, 0, 512);
CHPFormattedDiskPage cfkp = new CHPFormattedDiskPage(fkp);
int currentChpxIndex = 0;
int currentArraySize = cfkp.size();
int arraySize = parPlcf.length();
//first we must go through the bin table and find the fkps
for(int x = 0; x < arraySize; x++)
{
int PN = LittleEndian.getInt(_tableBuffer, parOffset + parPlcf.getStructOffset(x));
fkp = new byte[512];
System.arraycopy(_mainDocument, (PN * 512), fkp, 0, 512);
PAPFormattedDiskPage pfkp = new PAPFormattedDiskPage(fkp);
//take each fkp and get the paps
int crun = pfkp.size();
for(int y = 0; y < crun; y++)
{
//get the beginning fc of each paragraph text run
int fcStart = pfkp.getStart(y);
int fcEnd = pfkp.getEnd(y);
//get the papx for this paragraph
byte[] papx = pfkp.getGrpprl(y);
_listener.paragraph(new PapxNode(fcStart, fcEnd, papx));
//get the character runs for this paragraph
int charStart = 0;
int charEnd = 0;
//add the character runs
do
{
if (currentChpxIndex < currentArraySize)
{
charStart = cfkp.getStart(currentChpxIndex);
charEnd = cfkp.getEnd(currentChpxIndex);
byte[] chpx = cfkp.getGrpprl(currentChpxIndex);
_listener.characterRun(new ChpxNode(charStart, charEnd, chpx));
if (charEnd < fcEnd)
{
currentChpxIndex++;
}
else
{
break;
}
}
else
{
currentCharPage = LittleEndian.getInt(_tableBuffer, charOffset + charPlcf.getStructOffset(++currentPageIndex));
fkp = new byte[512];
System.arraycopy(_mainDocument, (currentCharPage * 512), fkp, 0, 512);
cfkp = new CHPFormattedDiskPage(fkp);
currentChpxIndex = 0;
currentArraySize = cfkp.size();
}
}
while(currentCharPage <= charPlcfLen + 1);
}
}
}
private void initParsingStates(int parOffset, PlexOfCps parPlcf, int charOffset, PlexOfCps charPlcf)
{
int currentCharPage = LittleEndian.getInt(_tableBuffer, charOffset + charPlcf.getStructOffset(0));
byte[] fkp = new byte[512];
System.arraycopy(_mainDocument, (currentCharPage * 512), fkp, 0, 512);
CHPFormattedDiskPage cfkp = new CHPFormattedDiskPage(fkp);
_charParsingState = new ParsingState(currentCharPage, cfkp);
int currentParPage = LittleEndian.getInt(_tableBuffer, parOffset + parPlcf.getStructOffset(0));
fkp = new byte[512];
System.arraycopy(_mainDocument, (currentParPage * 512), fkp, 0, 512);
PAPFormattedDiskPage pfkp = new PAPFormattedDiskPage(fkp);
_parParsingState = new ParsingState(currentParPage, pfkp);
}
/**
* initializes the SectionProperties BTree
*/
@SuppressWarnings("unused")
private void initSectionProperties()
{
int ccpText = _fib.getCcpText();
int ccpFtn = _fib.getCcpFtn();
//sections
int fcMin = _fib.getFcMin();
int plcfsedFC = _fib.getFcPlcfsed();
int plcfsedSize = _fib.getLcbPlcfsed();
//paragraphs
int parOffset = _fib.getFcPlcfbtePapx();
int parPlcSize = _fib.getLcbPlcfbtePapx();
//characters
int charOffset = _fib.getFcPlcfbteChpx();
int charPlcSize = _fib.getLcbPlcfbteChpx();
PlexOfCps charPlcf = new PlexOfCps(charPlcSize, 4);
PlexOfCps parPlcf = new PlexOfCps(parPlcSize, 4);
initParsingStates(parOffset, parPlcf, charOffset, charPlcf);
//byte[] plcfsed = new byte[plcfsedSize];
//System.arraycopy(_tableBuffer, plcfsedFC, plcfsed, 0, plcfsedSize);
PlexOfCps plcfsed = new PlexOfCps(plcfsedSize, 12);
int arraySize = plcfsed.length();
int start = fcMin;
int end = fcMin + ccpText;
int x = 0;
int sectionEnd = 0;
//do the main body sections
while (x < arraySize)
{
int sectionStart = LittleEndian.getInt(_tableBuffer, plcfsedFC + plcfsed.getIntOffset(x)) + fcMin;
sectionEnd = LittleEndian.getInt(_tableBuffer, plcfsedFC + plcfsed.getIntOffset(x + 1)) + fcMin;
int sepxStart = LittleEndian.getInt(_tableBuffer, plcfsedFC + plcfsed.getStructOffset(x) + 2);
int sepxSize = LittleEndian.getShort(_mainDocument, sepxStart);
byte[] sepx = new byte[sepxSize];
System.arraycopy(_mainDocument, sepxStart + 2, sepx, 0, sepxSize);
SepxNode node = new SepxNode(x + 1, sectionStart, sectionEnd, sepx);
_listener.bodySection(node);
initParagraphProperties(parOffset, parPlcf, charOffset, charPlcf, sectionStart, Math.min(end, sectionEnd));
if (sectionEnd > end)
{
break;
}
x++;
}
//do the header sections
for (; x < arraySize; x++)// && sectionEnd <= end; x++)
{
int sectionStart = LittleEndian.getInt(_tableBuffer, plcfsedFC + plcfsed.getIntOffset(x)) + fcMin;
sectionEnd = LittleEndian.getInt(_tableBuffer, plcfsedFC + plcfsed.getIntOffset(x + 1)) + fcMin;
int sepxStart = LittleEndian.getInt(_tableBuffer, plcfsedFC + plcfsed.getStructOffset(x) + 2);
int sepxSize = LittleEndian.getShort(_mainDocument, sepxStart);
byte[] sepx = new byte[sepxSize];
System.arraycopy(_mainDocument, sepxStart + 2, sepx, 0, sepxSize);
SepxNode node = new SepxNode(x + 1, sectionStart, sectionEnd, sepx);
_listener.hdrSection(node);
initParagraphProperties(parOffset, parPlcf, charOffset, charPlcf, Math.max(sectionStart, end), sectionEnd);
}
_listener.endSections();
}
/**
* Initializes the DocumentProperties object unique to this document.
*/
private void initDocumentProperties()
{
int pos = _fib.getFcDop();
int size = _fib.getLcbDop();
byte[] dopArray = new byte[size];
System.arraycopy(_tableBuffer, pos, dopArray, 0, size);
_listener.document(new DocumentProperties(dopArray));
}
/**
* Uncompresses the StyleSheet from file into memory.
*/
private void createStyleSheet()
{
int stshIndex = _fib.getFcStshf();
int stshSize = _fib.getLcbStshf();
byte[] stsh = new byte[stshSize];
System.arraycopy(_tableBuffer, stshIndex, stsh, 0, stshSize);
_listener.styleSheet(new StyleSheet(stsh));
}
/**
* Initializes the list tables for this document
*/
private void createListTables()
{
int lfoOffset = _fib.getFcPlfLfo();
int lfoSize = _fib.getLcbPlfLfo();
byte[] plflfo = new byte[lfoSize];
System.arraycopy(_tableBuffer, lfoOffset, plflfo, 0, lfoSize);
int lstOffset = _fib.getFcPlcfLst();
int lstSize = _fib.getLcbPlcfLst();
if (lstOffset > 0 && lstSize > 0)
{
// The lstSize returned by _fib.getLcbPlcfLst() doesn't appear
// to take into account any LVLs. Therefore, we recalculate
// lstSize based on where the LFO section begins (because the
// LFO section immediately follows the LST section).
lstSize = lfoOffset - lstOffset;
byte[] plcflst = new byte[lstSize];
System.arraycopy(_tableBuffer, lstOffset, plcflst, 0, lstSize);
_listener.lists(new ListTables(plcflst, plflfo));
}
}
/**
* Initializes this document's FontTable;
*/
private void createFontTable()
{
int fontTableIndex = _fib.getFcSttbfffn();
int fontTableSize = _fib.getLcbSttbfffn();
byte[] fontTable = new byte[fontTableSize];
System.arraycopy(_tableBuffer, fontTableIndex, fontTable, 0, fontTableSize);
_listener.fonts(new FontTable(fontTable));
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy