org.textmining.extraction.word.model.TextPieceTable Maven / Gradle / Ivy

Go to download

Show more of this group Show more artifacts with this name
Show all versions of tm-extractors Show documentation

Java-based library that can extract text from Microsoft Word for Windows binary documents, including Word 1.0/2.0/4.0/6.0/95/97/2000/XP/2003. It also extracts text from fast-saved files.

The newest version!

/*
 * Textmining.org text extractors
 * 
 * Copyright (C) 2008 Benryan Software Inc.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 * 
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 * 
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 *
 */
package org.textmining.extraction.word.model;

import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.List;

import org.textmining.extraction.word.model.GenericPropertyNode;
import org.textmining.extraction.word.model.PieceDescriptor;
import org.textmining.extraction.word.model.PlexOfCps;


/**
 * Holds the list of {@link TextPiece} objects built from a document's piece
 * table.
 *
 * <p>The piece table is read from the table stream as a {@link PlexOfCps}
 * whose entries are {@link PieceDescriptor}s. One {@link TextPiece} is created
 * per entry and kept in the order the entries were read.
 */
public class TextPieceTable
{
  /**
   * The text pieces, in the order they were read from the piece table.
   * Elements added by this class are {@link TextPiece} instances.
   */
  protected ArrayList _textPieces = new ArrayList();

  /**
   * Creates an empty table that contains no text pieces.
   */
  public TextPieceTable()
  {
  }

  /**
   * Builds the table from the piece descriptors stored in the table stream.
   *
   * @param documentStream the document stream; not read by this constructor,
   *        kept so existing callers continue to compile
   * @param tableStream the stream that contains the piece table
   * @param offset the offset of the piece table within {@code tableStream}
   * @param size the size of the piece table, as passed to {@link PlexOfCps}
   * @param fcMin not read by this constructor, kept so existing callers
   *        continue to compile
   * @throws UnsupportedEncodingException declared for backward compatibility
   *         with callers that catch it; nothing in this constructor body
   *         decodes text any more
   */
  public TextPieceTable(byte[] documentStream, byte[] tableStream, int offset,
      int size, int fcMin) throws UnsupportedEncodingException
  {
    // The piece table is stored as a plex of PieceDescriptors.
    PlexOfCps pieceTable = new PlexOfCps(tableStream, offset, size,
        PieceDescriptor.getSizeInBytes());

    int length = pieceTable.length();

    // Wrap the raw bytes of every plex entry in a PieceDescriptor and pair it
    // with the entry's start/end values. The text itself is not decoded here,
    // so no file offsets need to be computed.
    for (int x = 0; x < length; x++)
    {
      GenericPropertyNode node = pieceTable.getProperty(x);
      PieceDescriptor piece = new PieceDescriptor(node.getBytes(), 0);

      _textPieces.add(new TextPiece(node.getStart(), node.getEnd(), piece));
    }
  }

  /**
   * Returns the text pieces held by this table.
   *
   * @return the backing list itself (not a copy); changes made through the
   *         returned list are visible to this table
   */
  public List getTextPieces()
  {
    return _textPieces;
  }
}