org.textmining.extraction.word.WordTextScrubber Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of tm-extractors Show documentation
Show all versions of tm-extractors Show documentation
Java based library that can extract text from Microsoft Word for Windows binary documents including Word 1.0/2.0/4.0/6.0/95/97/2000/xp/2003. Extracts text from fast-saved files as well.
The newest version!
/*
* Textmining.org text extractors
*
* Copyright (C) 2008 Benryan Software Inc.
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
package org.textmining.extraction.word;
import java.io.IOException;
import java.io.Writer;
/**
 * Filters text taken from a Word document while copying it to a
 * {@link Writer}. Characters are processed before they are written: carriage
 * returns are expanded to CR/LF and the instruction part of Word fields
 * (the text between a field-begin mark and its separator or end mark) is
 * dropped, so that only the visible field result reaches the output.
 * <p>
 * The field state is kept in the instance, so a field may start in one call
 * to {@link #append(Writer, String)} and finish in a later one.
 *
 * @author Ryan Ackley
 * @version 1.0
 */
public class WordTextScrubber
{
    /** Marks the start of a field; instruction text follows. */
    private static final char FIELD_BEGIN = 0x13;

    /** Separates a field's instruction text from its visible result. */
    private static final char FIELD_SEPARATOR = 0x14;

    /** Marks the end of a field. */
    private static final char FIELD_END = 0x15;

    /** True while ordinary characters are being suppressed. */
    boolean _hold;

    /**
     * Number of fields that were opened while output was already being held,
     * i.e. fields nested inside another field's instruction text. Their
     * separator and end marks must not release the hold.
     */
    private int _nestedHeld;

    /**
     * Creates a scrubber that is not inside any field.
     */
    public WordTextScrubber()
    {
        _hold = false;
        _nestedHeld = 0;
    }

    /**
     * Writes the visible part of {@code text} to {@code writer}.
     * <p>
     * Every carriage return is written as {@code "\r\n"}, including while
     * field instruction text is being held back. The field marks themselves
     * are never written.
     *
     * @param writer destination for the scrubbed text
     * @param text raw document text, possibly containing field marks
     * @throws IOException if the writer fails
     */
    public void append(Writer writer, String text)
        throws IOException
    {
        final int length = text.length();
        for (int i = 0; i < length; i++)
        {
            final char c = text.charAt(i);
            switch (c)
            {
                case '\r':
                    writer.write("\r\n");
                    break;
                case FIELD_BEGIN:
                    if (_hold)
                    {
                        // A field inside held instruction text: remember it so
                        // that its own marks do not end the outer hold.
                        _nestedHeld++;
                    }
                    else
                    {
                        _hold = true;
                        _nestedHeld = 0;
                    }
                    break;
                case FIELD_SEPARATOR:
                    // Only the separator of the outermost held field starts a
                    // visible result; a nested field's result is still part
                    // of the enclosing instruction text.
                    if (_nestedHeld == 0)
                    {
                        _hold = false;
                    }
                    break;
                case FIELD_END:
                    if (_nestedHeld > 0)
                    {
                        // Closing a nested field returns to the enclosing
                        // instruction text, which stays hidden.
                        _nestedHeld--;
                    }
                    else
                    {
                        _hold = false;
                    }
                    break;
                default:
                    if (!_hold)
                    {
                        writer.write(c);
                    }
                    break;
            }
        }
    }
}