cc.mallet.pipe.SelectiveSGML2TokenSequence Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of mallet Show documentation
Show all versions of mallet Show documentation
MALLET is a Java-based package for statistical natural language processing,
document classification, clustering, topic modeling, information extraction,
and other machine learning applications to text.
The newest version!
/* Copyright (C) 2002 Univ. of Massachusetts Amherst, Computer Science Dept.
This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
http://www.cs.umass.edu/~mccallum/mallet
This software is provided under the terms of the Common Public License,
version 1.0, as published by http://www.opensource.org. For further
information, see the file `LICENSE' included with this distribution. */
package cc.mallet.pipe;
import java.io.*;
import java.net.URI;
import java.util.regex.*;
import java.util.Set;
import cc.mallet.types.Instance;
import cc.mallet.types.Token;
import cc.mallet.types.TokenSequence;
import cc.mallet.util.CharSequenceLexer;
import cc.mallet.util.Lexer;
/**
Similar to {@link SGML2TokenSequence}, except that only the tags
listed in {@code allowedTags} are converted to {@link Label}s.
@author Aron Culotta
*/
public class SelectiveSGML2TokenSequence extends Pipe implements Serializable
{
	/** Matches one SGML tag, opening or closing: '<', an optional '/', the tag text, '>'. */
	Pattern sgmlPattern = Pattern.compile ("</?([^>]*)>");
	/** Tokenizer applied to the text found between tags. */
	CharSequenceLexer lexer;
	/** Tag assigned to tokens that are not inside any allowed tag. */
	String backgroundTag;
	/** Tag strings that are recognized; every other tag is skipped by the tag search. */
	Set allowedTags;

	/**
	   @param lexer to tokenize input
	   @param backgroundTag default tag when not in any other tag
	   @param allowed set of tags (Strings) that will be converted to
	   labels
	*/
	public SelectiveSGML2TokenSequence (CharSequenceLexer lexer, String backgroundTag, Set allowed)
	{
		this.lexer = lexer;
		this.backgroundTag = backgroundTag;
		this.allowedTags = allowed;
	}

	/**
	   Builds the lexer from a regular expression.
	   @param regex passed to the {@code CharSequenceLexer} constructor
	   @param backgroundTag default tag when not in any other tag
	   @param allowed set of tags (Strings) that will be converted to
	   labels
	*/
	public SelectiveSGML2TokenSequence (String regex, String backgroundTag,
	                                    Set allowed)
	{
		this (new CharSequenceLexer (regex), backgroundTag, allowed);
	}

	/**
	   Uses a default-constructed lexer and "O" as the background tag.
	   @param allowed set of tags (Strings) that will be converted to
	   labels
	*/
	public SelectiveSGML2TokenSequence (Set allowed)
	{
		this (new CharSequenceLexer(), "O", allowed);
	}

	/**
	   Uses the given lexer and "O" as the background tag.
	   @param lex to tokenize input
	   @param allowed set of tags (Strings) that will be converted to
	   labels
	*/
	public SelectiveSGML2TokenSequence (CharSequenceLexer lex, Set allowed)
	{
		this (lex, "O", allowed);
	}

	/**
	   Splits the carrier's character data at every allowed tag, tokenizes
	   the text between tags with the lexer, and pairs each token with the
	   tag that was in effect where it occurred.

	   An opening tag switches the current tag to the text between its
	   angle brackets; any allowed closing tag switches back to the
	   background tag (there is no tag stack, so nesting is not tracked).
	   Tags that are not in {@code allowedTags} are not treated as
	   boundaries, so their characters stay in the text given to the lexer.

	   On return the carrier's data and source are the token sequence and
	   its target is the parallel sequence of tag tokens.

	   NOTE(review): the shared lexer field is re-targeted on every call,
	   so concurrent calls on one instance presumably interfere - confirm
	   before sharing an instance across threads.

	   @param carrier instance whose data is a CharSequence
	   @return the same carrier, with data, target and source replaced
	   @throws ClassCastException if the carrier's data is null or not a
	   CharSequence
	*/
	public Instance pipe (Instance carrier)
	{
		Object data = carrier.getData();
		if (!(data instanceof CharSequence)) {
			// Report null explicitly rather than failing with a NullPointerException
			// while building the message.
			String actual = (data == null) ? "null" : data.getClass().getName();
			throw new ClassCastException ("carrier.data is a " + actual +
			                              " not a CharSequence");
		}
		CharSequence string = (CharSequence) data;
		TokenSequence dataTokens = new TokenSequence ();
		TokenSequence targetTokens = new TokenSequence ();
		Matcher m = sgmlPattern.matcher (string);
		String tag = backgroundTag;
		int textStart = 0;
		while (true) {
			boolean found = findNextValidMatch (m);
			// subSequence takes an exclusive end index, so the trailing text
			// runs to string.length(); length()-1 would drop the last character.
			int textEnd = found ? m.start() : string.length();
			if (textEnd > textStart) {
				lexer.setCharSequence (string.subSequence (textStart, textEnd));
				while (lexer.hasNext()) {
					dataTokens.add (new Token ((String) lexer.next()));
					targetTokens.add (new Token (tag));
				}
			}
			if (!found)
				break;
			String sgml = m.group();
			// A closing tag returns to the background tag; an opening tag
			// becomes the current tag (text between '<' and '>').
			tag = (sgml.charAt(1) == '/')
				? backgroundTag
				: sgml.substring (1, sgml.length() - 1);
			textStart = m.end();
		}
		carrier.setData(dataTokens);
		carrier.setTarget(targetTokens);
		carrier.setSource(dataTokens);
		return carrier;
	}

	/**
	   Advances the matcher to the next tag whose name is contained in
	   {@code allowedTags}, skipping every tag that is not. Iterates
	   rather than recursing so that long runs of disallowed tags cannot
	   exhaust the call stack.

	   @param m matcher positioned after the previously consumed tag
	   @return true if the matcher now rests on an allowed tag, false if
	   the input has no further allowed tag
	*/
	private boolean findNextValidMatch (Matcher m) {
		while (m.find ()) {
			String sgml = m.group();
			// Strip '<' (and the '/' of a closing tag) and the final '>'.
			int first = (sgml.charAt(1) == '/') ? 2 : 1;
			String name = sgml.substring (first, sgml.length() - 1);
			if (allowedTags.contains (name))
				return true;
		}
		return false;
	}

	/** Describes the tag pattern, lexer pattern, background tag and allowed tags, one per line. */
	public String toString () {
		String ret = "sgml pattern: " + sgmlPattern.toString();
		ret += "\nlexer: " + lexer.getPattern().toString();
		// Plain concatenation so a null background tag prints as "null" instead of throwing.
		ret += "\nbg tag: " + backgroundTag;
		ret += "\nallowedHash: " + allowedTags + "\n";
		return ret;
	}

	// Serialization
	private static final long serialVersionUID = 1;
	private static final int CURRENT_SERIAL_VERSION = 0;

	/** Writes the format version followed by the four fields, in the order readObject expects. */
	private void writeObject (ObjectOutputStream out) throws IOException {
		out.writeInt(CURRENT_SERIAL_VERSION);
		out.writeObject(sgmlPattern);
		out.writeObject(lexer);
		out.writeObject(backgroundTag);
		out.writeObject(allowedTags);
	}

	/** Reads the fields back in the order written by writeObject. */
	private void readObject (ObjectInputStream in) throws IOException, ClassNotFoundException {
		// The version must be consumed to keep the stream aligned; only one
		// format exists so far, so its value is not inspected.
		int version = in.readInt ();
		sgmlPattern = (Pattern) in.readObject();
		lexer = (CharSequenceLexer) in.readObject();
		backgroundTag = (String) in.readObject();
		allowedTags = (Set) in.readObject();
	}
}