
cc.mallet.pipe.CharSequenceRemoveUUEncodedBlocks Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of jcore-mallet-2.0.9 Show documentation
Show all versions of jcore-mallet-2.0.9 Show documentation
MALLET is a Java-based package for statistical natural language processing, document classification, clustering, topic modeling, information extraction, and other machine learning applications to text.
The newest version!
package cc.mallet.pipe;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import cc.mallet.pipe.Pipe;
import cc.mallet.types.Instance;
/**
 * Pipe that removes uuencoded lines from the character data of an instance.
 *
 * <p>A line is removed when it begins with {@code M} and is exactly 61 characters long.
 * Note that there are some uuencoded blocks that do not match this: some have been seen
 * that are 64 characters long and have no regular prefix character, but this filter gets
 * most of them in 20 Newsgroups.
 *
 * <p>Only the characters of a matching line are deleted; the line terminator that follows
 * it is not part of the match, so each removed line leaves an empty line behind.
 *
 * @author Andrew McCallum
 */
public class CharSequenceRemoveUUEncodedBlocks extends Pipe {

    /**
     * Matches one complete line that starts with {@code M} and is 61 characters long.
     *
     * <p>{@link Pattern#MULTILINE} is required: without it {@code ^} and {@code $} anchor
     * only at the start and end of the whole input, so the pattern could match only when
     * the entire data consisted of a single such line and nothing would be removed from a
     * multi-line document.
     */
    public static final Pattern UU_ENCODED_LINE =
            Pattern.compile("^M.{60}$", Pattern.MULTILINE);

    /** Creates the pipe; it has no configuration. */
    public CharSequenceRemoveUUEncodedBlocks() {
    }

    /**
     * Replaces the carrier's data with a copy from which every line matching
     * {@link #UU_ENCODED_LINE} has been blanked out.
     *
     * @param carrier instance whose data is a {@link CharSequence}
     * @return the same {@code carrier}, its data now a {@link String}
     * @throws ClassCastException if the carrier's data is not a {@code CharSequence}
     * @throws NullPointerException if the carrier's data is {@code null}
     */
    public Instance pipe(Instance carrier) {
        String text = ((CharSequence) carrier.getData()).toString();
        Matcher uuLines = UU_ENCODED_LINE.matcher(text);
        carrier.setData(uuLines.replaceAll(""));
        return carrier;
    }

    // Serialization

    private static final long serialVersionUID = 1;
    private static final int CURRENT_SERIAL_VERSION = 0;

    /** Writes only the serial-format version number; the pipe has no state of its own. */
    private void writeObject(ObjectOutputStream out) throws IOException {
        out.writeInt(CURRENT_SERIAL_VERSION);
    }

    /** Reads back the version number written by {@code writeObject}. */
    private void readObject(ObjectInputStream in) throws IOException, ClassNotFoundException {
        // The value is not used, but it must be consumed to keep the stream aligned.
        @SuppressWarnings("unused")
        int version = in.readInt();
    }
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy