
/* Copyright (C) 2003 Univ. of Massachusetts Amherst, Computer Science Dept.
This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
http://www.cs.umass.edu/~mccallum/mallet
This software is provided under the terms of the Common Public License,
version 1.0, as published by http://www.opensource.org. For further
information, see the file `LICENSE' included with this distribution. */
package cc.mallet.extract;
import java.io.*;
import java.util.ArrayList;
import java.util.Iterator;
import cc.mallet.fst.CRF;
import cc.mallet.pipe.Noop;
import cc.mallet.pipe.Pipe;
import cc.mallet.pipe.SerialPipes;
import cc.mallet.types.*;
/**
* Performs extraction by running a trained {@link CRF} over documents that have
* been passed through a tokenization pipe and a feature pipe.
*
* Created: Oct 12, 2004
*
* @author Charles Sutton
*/
public class CRFExtractor implements Extractor {
private CRF crf;
private Pipe tokenizationPipe;
private Pipe featurePipe;
private String backgroundTag;
private TokenizationFilter filter;
// ... (constructors elided) ...
public InstanceList pipeInstances (Iterator<Instance> source)
{
// I think that pipes should be associated neither with InstanceLists, nor
// with Instances. -cas
InstanceList toked = new InstanceList (tokenizationPipe);
toked.addThruPipe (source);
InstanceList piped = new InstanceList (getFeaturePipe ());
piped.addThruPipe (toked.iterator());
return piped;
}
/** Assumes Instance.source contains the Tokenization object. */
public Extraction extract (InstanceList ilist) {
Extraction extraction = new Extraction (this, getTargetAlphabet ());
for (int i = 0; i < ilist.size(); i++) {
Instance inst = ilist.get(i);
Tokenization tok = (Tokenization)inst.getSource();
String name = inst.getName().toString();
Sequence input = (Sequence)inst.getData ();
Sequence target = (Sequence)inst.getTarget ();
Sequence output = crf.transduce(input);
DocumentExtraction docseq =
new DocumentExtraction (name, getTargetAlphabet(), tok,
output, target, backgroundTag,
filter);
extraction.addDocumentExtraction (docseq);
}
return extraction;
}
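/**
* Extracts from raw (untokenized) instances: each instance is run through the
* tokenization pipe and then the feature pipe, and the CRF's output sequence
* for each document is collected into a single Extraction.
*/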
public Extraction extract (Iterator<Instance> source)
{
Extraction extraction = new Extraction (this, getTargetAlphabet ());
// Put all the instances through both pipes, then get viterbi path
InstanceList tokedList = new InstanceList (tokenizationPipe);
tokedList.addThruPipe (source);
InstanceList pipedList = new InstanceList (getFeaturePipe ());
pipedList.addThruPipe (tokedList.iterator());
Iterator<Instance> it1 = tokedList.iterator ();
Iterator<Instance> it2 = pipedList.iterator ();
while (it1.hasNext()) {
Instance toked = it1.next();
Instance piped = it2.next ();
Tokenization tok = (Tokenization) toked.getData();
String name = piped.getName().toString();
Sequence input = (Sequence) piped.getData ();
Sequence target = (Sequence) piped.getTarget ();
Sequence output = crf.transduce (input);
DocumentExtraction docseq = new DocumentExtraction (name, getTargetAlphabet (), tok,
output, target, backgroundTag,
filter);
extraction.addDocumentExtraction (docseq);
}
return extraction;
}
public TokenizationFilter getTokenizationFilter ()
{
return filter;
}
public String getBackgroundTag ()
{
return backgroundTag;
}
public Pipe getTokenizationPipe ()
{
return tokenizationPipe;
}
public void setTokenizationPipe (Pipe tokenizationPipe)
{
this.tokenizationPipe = tokenizationPipe;
}
public Pipe getFeaturePipe ()
{
return featurePipe;
}
//xxx This method is inherently dangerous! Should check that pipe.alphabet equals crf.alphabet.
public void setFeaturePipe (Pipe featurePipe)
{
this.featurePipe = featurePipe;
}
public Alphabet getInputAlphabet ()
{
return crf.getInputAlphabet ();
}
public LabelAlphabet getTargetAlphabet ()
{
return (LabelAlphabet) crf.getOutputAlphabet ();
}
public CRF getCrf ()
{
return crf;
}
/**
* Transfers the first <tt>num</tt> pipes from the feature pipe to the tokenization pipe.
* The feature pipe must be a SerialPipes. This will destructively modify the CRF object of the extractor.
* This is useful if you have a CRF that has been trained from a single pipe, which you need to split up
* into separate feature and tokenization pipes.
*/
public void slicePipes (int num)
{
Pipe fpipe = getFeaturePipe ();
if (!(fpipe instanceof SerialPipes))
throw new IllegalArgumentException ("slicePipes: FeaturePipe must be a SerialPipes.");
SerialPipes sp = (SerialPipes) fpipe;
ArrayList<Pipe> pipes = new ArrayList<Pipe> ();
for (int i = 0; i < num; i++) {
pipes.add (sp.getPipe (0));
//sp.removePipe (0); TODO Fix this
}
//setTokenizationPipe (sp); TODO Fix this
throw new UnsupportedOperationException ("Not yet implemented...");
}
// Java serialization nonsense
// Serial version 0: Initial version
// Serial version 1: Add featurePipe
// Serial version 2: Add filter
private static final int CURRENT_SERIAL_VERSION = 2;
private static final long serialVersionUID = 1;
private void readObject (ObjectInputStream in) throws IOException, ClassNotFoundException
{
in.defaultReadObject ();
int version = in.readInt ();
if ((version == 0) || (featurePipe == null)) {
featurePipe = (Pipe) crf.getInputPipe ();
}
if (version < 2) {
filter = new BIOTokenizationFilter ();
}
}
private void writeObject (ObjectOutputStream out) throws IOException
{
out.defaultWriteObject ();
out.writeInt (CURRENT_SERIAL_VERSION);
}
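/**
* Runs a single raw input object through the feature pipe and returns the
* resulting data as a Sequence of CRF input features.
*/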
public Sequence pipeInput (Object input)
{
InstanceList all = new InstanceList (getFeaturePipe ());
all.add (input, null, null, null);
return (Sequence) all.get (0).getData();
}
}
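/*
* Example usage (a minimal sketch, not part of the original file). It assumes a
* CRF trained elsewhere and a tokenization pipe compatible with the CRF's input
* pipe; the variable names "myCrf" and "myTokenizationPipe", the file name
* "docs.txt", and the use of the two-argument constructor (elided above) are
* assumptions for illustration.
*
*   import java.io.FileReader;
*   import java.util.regex.Pattern;
*   import cc.mallet.fst.CRF;
*   import cc.mallet.pipe.Pipe;
*   import cc.mallet.pipe.iterator.LineGroupIterator;
*
*   CRF myCrf = ...;                  // a trained CRF
*   Pipe myTokenizationPipe = ...;    // produces a Tokenization from raw text
*   CRFExtractor extractor = new CRFExtractor (myCrf, myTokenizationPipe);
*   // One instance per blank-line-separated group of lines in docs.txt.
*   Extraction extraction = extractor.extract (
*       new LineGroupIterator (new FileReader ("docs.txt"),
*                              Pattern.compile ("^\\s*$"), true));
*/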