org.textmining.extraction.excel.ExcelTextExtractor Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of tm-extractors Show documentation
Java-based library that can extract text from Microsoft Word for Windows binary documents, including Word 1.0/2.0/4.0/6.0/95/97/2000/XP/2003. It also extracts text from fast-saved files.
The newest version!
/*
 * Textmining.org text extractors
 * 
 * Copyright (C) 2008 Benryan Software Inc.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 * 
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 * 
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 *
 */
package org.textmining.extraction.excel;

import java.io.IOException;
import java.io.InputStream;
import java.io.StringWriter;
import java.io.Writer;
import org.apache.poi.poifs.filesystem.*;
import org.apache.poi.util.*;

import org.textmining.extraction.TextExtractor;

/**
 * Extracts text from a binary Excel workbook by reading the strings stored in
 * its shared string table (SST) record.
 *
 * The constructor copies the whole "Workbook" document of the OLE2 container
 * into memory; {@link #getText(Writer)} then walks the record stream, writing
 * every string found in an SST record followed by a single space. Scanning
 * stops at the first record of type 0x0A.
 *
 * NOTE(review): an SST that is large enough to be continued in follow-up
 * CONTINUE records is not handled here; the continuation headers would be
 * read as string data. Confirm against the file-format specification before
 * relying on this class for large workbooks.
 */
public class ExcelTextExtractor
  implements TextExtractor
{
  /** Raw bytes of the "Workbook" document. */
  byte[] _recordStream;
  /** Read cursor into {@link #_recordStream}; reset at the start of every extraction. */
  int _offset;
  
  /**
   * Loads the "Workbook" document from the given OLE2 stream into memory.
   *
   * @param in stream positioned at the start of the OLE2 (POIFS) container
   * @throws IOException if the container cannot be parsed or read
   */
  public ExcelTextExtractor(InputStream in) throws IOException
  {
    POIFSFileSystem poifs = new POIFSFileSystem(in);
    DocumentEntry headerProps =
      (DocumentEntry)poifs.getRoot().getEntry("Workbook");
    DocumentInputStream din = poifs.createDocumentInputStream("Workbook");
    try
    {
      _recordStream = new byte[headerProps.getSize()];
      // A single read() is not guaranteed to fill the buffer, so keep
      // reading until it is full or the stream runs dry.
      int filled = 0;
      while (filled < _recordStream.length)
      {
        int count = din.read(_recordStream, filled,
            _recordStream.length - filled);
        if (count < 0)
        {
          // Shorter than advertised: keep what was read, the rest stays zeroed.
          break;
        }
        filled += count;
      }
    }
    finally
    {
      // Release the document stream even when reading fails.
      din.close();
    }
  }

  /**
   * Returns all shared strings of the workbook, each followed by a space.
   *
   * @return the extracted text; empty if no SST record is found
   * @throws IOException if a string cannot be decoded or written
   */
  public String getText() throws IOException
  {
    StringWriter writer = new StringWriter();
    getText(writer);
    return writer.toString();
  }

  /**
   * Writes all shared strings of the workbook to the given writer, each
   * followed by a space. May be called more than once; every call starts
   * again from the beginning of the record stream.
   *
   * @param writer destination for the extracted text
   * @throws IOException if a string cannot be decoded or written
   */
  public void getText(Writer writer) throws IOException
  {
    // Always start from the top so repeated calls yield the same text.
    _offset = 0;
    while (_offset + LittleEndian.SHORT_SIZE <= _recordStream.length)
    {
      int type = LittleEndian.getShort(_recordStream, _offset);
      _offset += LittleEndian.SHORT_SIZE;
      if (type == 0xa)
      {
        // First record of type 0x0A ends the scan.
        break;
      }
      if (_offset + LittleEndian.SHORT_SIZE > _recordStream.length)
      {
        // Truncated record header: nothing more can be read safely.
        break;
      }
      int size = readUShort(_recordStream, _offset);
      _offset += LittleEndian.SHORT_SIZE;
      if (type == Record.SST_RECORD)
      {
        // The first int (total string references) is not needed; skip it.
        _offset += LittleEndian.INT_SIZE;
        int sharedStrings = LittleEndian.getInt(_recordStream, _offset);
        _offset += LittleEndian.INT_SIZE;
        for (int x = 0; x < sharedStrings; x++)
        {
          writeSharedString(writer);
        }
      }
      else 
      {
        _offset += size;
      }
    }
  }

  /**
   * Decodes the string at the current offset, writes it followed by a space,
   * and advances the offset past the string and its trailing data.
   */
  private void writeSharedString(Writer writer) throws IOException
  {
    // Character count is an unsigned 16-bit value.
    int strLength = readUShort(_recordStream, _offset);
    _offset += LittleEndian.SHORT_SIZE;

    int flags = _recordStream[_offset++];
    boolean compression = (flags & 0x1) == 0;
    boolean asian = (flags & 0x4) != 0;
    boolean richText = (flags & 0x8) != 0;
    int numRuns = 0;
    int sizeofAsian = 0;

    if (richText)
    {
      numRuns = readUShort(_recordStream, _offset);
      _offset += LittleEndian.SHORT_SIZE;
    }
    if (asian)
    {
      // This size field is four bytes wide, so step over all four.
      sizeofAsian = LittleEndian.getInt(_recordStream, _offset);
      _offset += LittleEndian.INT_SIZE;
    }

    int byteLength = compression ? strLength : strLength * 2;
    // Compressed strings store only the low byte of each UTF-16 unit,
    // which is exactly the ISO-8859-1 mapping.
    String string = new String(_recordStream, _offset, byteLength,
        compression ? "ISO-8859-1" : "UTF-16LE");
    writer.write(string + ' ');

    _offset += byteLength;
    // Formatting runs (4 bytes each) follow the characters...
    _offset += (numRuns * 4);
    // ...and then the extended data block, if any.
    if (sizeofAsian > 0)
    {
      _offset += sizeofAsian;
    }
  }

  /** Reads a little-endian 16-bit value as an unsigned int (0..65535). */
  private static int readUShort(byte[] data, int offset)
  {
    return LittleEndian.getShort(data, offset) & 0xFFFF;
  }
}