
cc.mallet.extract.Extraction Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of jcore-mallet-2.0.9 Show documentation
Show all versions of jcore-mallet-2.0.9 Show documentation
MALLET is a Java-based package for statistical natural language processing, document classification, clustering, topic modeling, information extraction, and other machine learning applications to text.
The newest version!
/* Copyright (C) 2002 Univ. of Massachusetts Amherst, Computer Science Dept.
This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
http://www.cs.umass.edu/~mccallum/mallet
This software is provided under the terms of the Common Public License,
version 1.0, as published by http://www.opensource.org. For further
information, see the file `LICENSE' included with this distribution. */
/**
@author Andrew McCallum <a href="mailto:mccallum@cs.umass.edu">mccallum@cs.umass.edu</a>
*/
package cc.mallet.extract;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.io.PrintWriter;
import cc.mallet.types.LabelAlphabet;
import cc.mallet.types.Sequence;
/**
 * The results of doing information extraction.  This is designed to handle
 * field extraction from a single document, or relation extraction and
 * coreference from multiple documents.
 * <p>
 * Every {@link DocumentExtraction} added to this object contributes one
 * extracted {@link Record}; if it also carries target spans, it contributes
 * one true record as well.
 */
public class Extraction
{
  private Extractor extractor;
  private List byDocs = new ArrayList (); // List of DocumentExtractions
  private List records = new ArrayList (); // List of Records, one per entry of byDocs

  // If the DocumentExtractions contain true targets (i.e., they're labeled testing instances),
  // then these are the true records obtained from those.  Only documents that have target
  // spans add an entry here, so positions in this list are NOT document numbers.
  List trueRecords = new ArrayList ();

  // One entry per entry of byDocs: the true Record built for that document, or null when the
  // document had no target spans.  Lets getTargetRecord look up by document number even when
  // labeled and unlabeled documents are mixed.
  private List targetsByDoc = new ArrayList ();

  private LabelAlphabet dict;

  /**
   * Creates an empty Extraction object.  DocumentExtractions can be added later by
   * the addDocumentExtraction method.
   *
   * @param extractor the extractor associated with this extraction; returned by {@link #getExtractor()}
   * @param dict the label alphabet; returned by {@link #getLabelAlphabet()}
   */
  public Extraction (Extractor extractor, LabelAlphabet dict)
  {
    this.extractor = extractor;
    this.dict = dict;
  }

  /**
   * Creates an extraction given a sequence output by some kind of per-sequence labeler, like an
   * HMM or a CRF.  The extraction will contain a single document.
   * <p>
   * The arguments {@code name}, {@code dict}, {@code input}, {@code output} and
   * {@code background} are passed unchanged to the {@link DocumentExtraction} constructor.
   * Note that this constructor calls the public, overridable
   * {@link #addDocumentExtraction(DocumentExtraction)}.
   *
   * @param extractor the extractor associated with this extraction
   * @param dict the label alphabet
   * @param name name given to the single document
   * @param input tokenization of the document
   * @param output label sequence produced for the document
   * @param background passed through to the DocumentExtraction
   */
  public Extraction (Extractor extractor, LabelAlphabet dict, String name, Tokenization input, Sequence output, String background)
  {
    this.extractor = extractor;
    this.dict = dict;
    DocumentExtraction docseq = new DocumentExtraction (name, dict, input, output, background);
    addDocumentExtraction (docseq);
  }

  /**
   * Appends a document to this extraction.  A Record named after the document is built from
   * its extracted spans.  If the document's target spans are non-null, a second Record named
   * {@code "TRUE:" + name} is built from them and remembered as that document's true record.
   *
   * @param docseq the document extraction to add
   */
  public void addDocumentExtraction (DocumentExtraction docseq)
  {
    byDocs.add (docseq);
    records.add (new Record (docseq.getName (), docseq.getExtractedSpans ()));

    Record target = null;
    if (docseq.getTargetSpans () != null) {
      target = new Record ("TRUE:"+docseq.getName (), docseq.getTargetSpans ());
      trueRecords.add (target);
    }
    // Always add an entry (possibly null) so this list stays aligned with byDocs.
    targetsByDoc.add (target);
  }

  /** Returns the extracted record at position {@code idx}, in the order documents were added. */
  public Record getRecord (int idx) { return (Record) records.get (idx); }

  /** Returns the number of extracted records. */
  public int getNumRecords () { return records.size(); }

  /** Returns the document extraction at position {@code idx}, in the order documents were added. */
  public DocumentExtraction getDocumentExtraction(int idx) { return (DocumentExtraction) byDocs.get (idx); }

  /** Returns the number of documents that have been added. */
  public int getNumDocuments () { return byDocs.size(); }

  /** Returns the extractor supplied at construction time. */
  public Extractor getExtractor ()
  {
    return extractor;
  }

  /**
   * Returns the true record of the document at position {@code docnum}.
   * <p>
   * The lookup is by document number, so it stays correct when some of the added documents
   * had no target spans.
   *
   * @param docnum position of the document, in the order documents were added
   * @return the true record built from that document's target spans, or {@code null} if the
   *         document had no target spans
   * @throws IndexOutOfBoundsException if {@code docnum} is not a valid document position
   */
  public Record getTargetRecord (int docnum)
  {
    return (Record) targetsByDoc.get (docnum);
  }

  /** Returns the label alphabet supplied at construction time. */
  public LabelAlphabet getLabelAlphabet () { return dict; }

  /**
   * Passes {@code cleaner} to {@code cleanField} on every field of every extracted record,
   * and then on every field of every true record.
   *
   * @param cleaner the cleaner handed to each field
   */
  public void cleanFields (FieldCleaner cleaner)
  {
    Iterator it = records.iterator ();
    while (it.hasNext ()) {
      cleanRecord ((Record) it.next (), cleaner);
    }
    it = trueRecords.iterator ();
    while (it.hasNext ()) {
      cleanRecord ((Record) it.next (), cleaner);
    }
  }

  // Hands the cleaner to each field of a single record.
  private void cleanRecord (Record record, FieldCleaner cleaner)
  {
    Iterator it = record.fieldsIterator ();
    while (it.hasNext ()) {
      Field field = (Field) it.next ();
      field.cleanField (cleaner);
    }
  }

  /**
   * Writes the extracted records (not the true records) to {@code writer} as text.
   * <p>
   * The output starts with {@code ***EXTRACTION***} and ends with
   * {@code ***END EXTRACTION***}.  Each record is introduced by a {@code **RECORD name}
   * line; each field prints its name on one line, then one {@code " ==> value"} line per
   * value, then a blank line.  Newlines inside a value are replaced by spaces so that every
   * value stays on one line.  The writer is neither flushed nor closed here.
   *
   * @param writer destination of the report
   */
  public void print (PrintWriter writer)
  {
    Iterator it = records.iterator ();
    writer.println ("***EXTRACTION***");
    while (it.hasNext ()) {
      Record record = (Record) it.next ();
      writer.println ("**RECORD "+record.getName ());
      Iterator fit = record.fieldsIterator ();
      while (fit.hasNext ()) {
        Field field = (Field) fit.next ();
        writer.println (field.getName ());
        for (int fidx = 0; fidx < field.numValues (); fidx++) {
          String val = field.value (fidx).replaceAll ("\n", " ");
          writer.print (" ==> "+val+"\n");
        }
        writer.println ();
      }
    }
    writer.println ("***END EXTRACTION***");
  }
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy