org.textmining.extraction.excel.ExcelTextExtractor Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of tm-extractors Show documentation
Show all versions of tm-extractors Show documentation
Java-based library that can extract text from Microsoft Word for Windows binary documents, including Word 1.0/2.0/4.0/6.0/95/97/2000/XP/2003. Extracts text from fast-saved files as well.
The newest version!
/*
* Textmining.org text extractors
*
* Copyright (C) 2008 Benryan Software Inc.
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
package org.textmining.extraction.excel;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringWriter;
import java.io.Writer;
import org.apache.poi.poifs.filesystem.*;
import org.apache.poi.util.*;
import org.textmining.extraction.TextExtractor;
/**
 * Extracts text from an Excel workbook by decoding the shared string table
 * (SST) found in the "Workbook" stream of an OLE2 container.
 *
 * <p>The whole stream is loaded into memory by the constructor. Only the
 * shared string table is decoded; every other record is skipped. Each string
 * is written followed by a single space.</p>
 *
 * <p>Instances keep their parse position in fields, so one instance must not
 * be used from several threads at the same time.</p>
 */
public class ExcelTextExtractor
  implements TextExtractor
{
  /** Record type that ends a substream; scanning stops at the first one. */
  private static final int EOF_RECORD = 0x0a;

  /** Record type holding the overflow of the record that precedes it. */
  private static final int CONTINUE_RECORD = 0x3c;

  /** A record header is a 16-bit type followed by a 16-bit data size. */
  private static final int RECORD_HEADER_SIZE = 2 * LittleEndian.SHORT_SIZE;

  /** String flag: characters are 16-bit (UTF-16LE) instead of 8-bit. */
  private static final int FLAG_WIDE_CHARS = 0x01;

  /** String flag: a 4-byte size and a block of extended data are present. */
  private static final int FLAG_EXTENDED = 0x04;

  /** String flag: a 2-byte run count and formatting runs are present. */
  private static final int FLAG_RICH_TEXT = 0x08;

  /** Size in bytes of one rich-text formatting run. */
  private static final int FORMAT_RUN_SIZE = 4;

  /** Raw content of the "Workbook" stream. */
  byte[] _recordStream;

  /** Current read position inside {@link #_recordStream}. */
  int _offset;

  /** End (exclusive) of the data of the record currently being decoded. */
  private int _recordEnd;

  /**
   * Reads the "Workbook" stream of the given OLE2 document into memory.
   *
   * @param in stream positioned at the start of an OLE2 (POIFS) document;
   *           it is not closed by this constructor
   * @throws IOException if the document cannot be read or has no
   *                     "Workbook" entry
   */
  public ExcelTextExtractor(InputStream in) throws IOException
  {
    POIFSFileSystem poifs = new POIFSFileSystem(in);
    DocumentEntry headerProps =
      (DocumentEntry)poifs.getRoot().getEntry("Workbook");
    DocumentInputStream din = poifs.createDocumentInputStream("Workbook");
    try
    {
      _recordStream = new byte[headerProps.getSize()];
      // A single read() is allowed to return fewer bytes than requested,
      // so keep reading until the buffer is full or the stream ends.
      int filled = 0;
      while (filled < _recordStream.length)
      {
        int count = din.read(_recordStream, filled,
                             _recordStream.length - filled);
        if (count <= 0)
        {
          break;
        }
        filled += count;
      }
    }
    finally
    {
      // Release the stream even when reading fails.
      din.close();
    }
  }

  /**
   * Returns the extracted text as a single string.
   *
   * @return every shared string, each followed by one space
   * @throws IOException if the string table is truncated or malformed
   */
  public String getText() throws IOException
  {
    StringWriter writer = new StringWriter();
    getText(writer);
    return writer.toString();
  }

  /**
   * Writes the extracted text to the given writer.
   *
   * <p>Records are scanned from the start of the stream up to the first EOF
   * record. The scan restarts from the beginning on every call, so the
   * method may be called repeatedly.</p>
   *
   * @param writer destination for the text; it is neither flushed nor closed
   * @throws IOException if writing fails or the string table is truncated
   *                     or malformed
   */
  public void getText(Writer writer) throws IOException
  {
    // Restart from the top so a second call yields the same text again.
    _offset = 0;
    while (_offset + RECORD_HEADER_SIZE <= _recordStream.length)
    {
      int type = ushortAt(_offset);
      int size = ushortAt(_offset + LittleEndian.SHORT_SIZE);
      _offset += RECORD_HEADER_SIZE;
      if (type == EOF_RECORD)
      {
        break;
      }
      if (type == Record.SST_RECORD)
      {
        _recordEnd = Math.min(_offset + size, _recordStream.length);
        writeSharedStrings(writer);
        // Resume after the last record consumed by the string table.
        _offset = _recordEnd;
      }
      else
      {
        _offset += size;
      }
    }
  }

  /**
   * Decodes the body of an SST record, following CONTINUE records, and
   * writes each string followed by a space.
   */
  private void writeSharedStrings(Writer writer) throws IOException
  {
    // The first field counts string references in the workbook; unused here.
    skip(LittleEndian.INT_SIZE);
    int uniqueStrings = readInt();
    for (int x = 0; x < uniqueStrings; x++)
    {
      if (!advanceToData())
      {
        // Fewer strings than announced: keep what was extracted so far.
        return;
      }
      int charCount = readUShort();
      int flags = readUByte();
      boolean wide = (flags & FLAG_WIDE_CHARS) != 0;
      int numRuns = (flags & FLAG_RICH_TEXT) != 0 ? readUShort() : 0;
      int extendedSize = (flags & FLAG_EXTENDED) != 0 ? readInt() : 0;

      writeCharacters(writer, charCount, wide);
      writer.write(' ');

      // Formatting runs and extended data trail the characters; both must
      // be stepped over to land on the next string.
      skip(numRuns * FORMAT_RUN_SIZE);
      skip(extendedSize);
    }
  }

  /**
   * Decodes {@code charCount} characters at the current position and writes
   * them. Character data continued in a CONTINUE record starts with a fresh
   * flag byte that may switch between 8-bit and 16-bit characters.
   */
  private void writeCharacters(Writer writer, int charCount, boolean wide)
    throws IOException
  {
    int remaining = charCount;
    while (remaining > 0)
    {
      if (_offset >= _recordEnd)
      {
        if (!advanceToData())
        {
          throw new IOException("Shared string table ends inside a string");
        }
        wide = (readUByte() & FLAG_WIDE_CHARS) != 0;
        continue;
      }
      int bytesPerChar = wide ? 2 : 1;
      int chars = Math.min(remaining, (_recordEnd - _offset) / bytesPerChar);
      if (chars == 0)
      {
        throw new IOException("Character split across a record boundary");
      }
      int byteLength = chars * bytesPerChar;
      // 8-bit strings hold the low byte of each UTF-16 unit, i.e. Latin-1.
      writer.write(new String(_recordStream, _offset, byteLength,
                              wide ? "UTF-16LE" : "ISO-8859-1"));
      _offset += byteLength;
      remaining -= chars;
    }
  }

  /**
   * Moves into following CONTINUE records until unread data is available.
   *
   * @return false if the current record is exhausted and is not followed
   *         by a CONTINUE record
   */
  private boolean advanceToData()
  {
    while (_offset >= _recordEnd)
    {
      int header = _recordEnd;
      if (header + RECORD_HEADER_SIZE > _recordStream.length
          || ushortAt(header) != CONTINUE_RECORD)
      {
        return false;
      }
      int size = ushortAt(header + LittleEndian.SHORT_SIZE);
      _offset = header + RECORD_HEADER_SIZE;
      _recordEnd = Math.min(_offset + size, _recordStream.length);
    }
    return true;
  }

  /** Ensures {@code byteCount} contiguous bytes are readable in the record. */
  private void require(int byteCount) throws IOException
  {
    if (!advanceToData())
    {
      throw new IOException("Unexpected end of shared string table");
    }
    if (_offset + byteCount > _recordEnd)
    {
      throw new IOException("Field split across a record boundary");
    }
  }

  /** Skips {@code byteCount} bytes of record data, crossing CONTINUE records. */
  private void skip(int byteCount) throws IOException
  {
    int remaining = byteCount;
    while (remaining > 0)
    {
      if (!advanceToData())
      {
        throw new IOException("Unexpected end of shared string table");
      }
      int step = Math.min(remaining, _recordEnd - _offset);
      _offset += step;
      remaining -= step;
    }
  }

  /** Reads one unsigned byte of record data. */
  private int readUByte() throws IOException
  {
    require(1);
    return _recordStream[_offset++] & 0xFF;
  }

  /** Reads one unsigned little-endian 16-bit value of record data. */
  private int readUShort() throws IOException
  {
    require(LittleEndian.SHORT_SIZE);
    int value = ushortAt(_offset);
    _offset += LittleEndian.SHORT_SIZE;
    return value;
  }

  /** Reads one little-endian 32-bit value of record data. */
  private int readInt() throws IOException
  {
    require(LittleEndian.INT_SIZE);
    int value = LittleEndian.getInt(_recordStream, _offset);
    _offset += LittleEndian.INT_SIZE;
    return value;
  }

  /** Returns the unsigned 16-bit value stored at {@code position}. */
  private int ushortAt(int position)
  {
    // getShort() is signed; mask so values above 0x7fff stay positive.
    return LittleEndian.getShort(_recordStream, position) & 0xFFFF;
  }
}