
cc.mallet.extract.DocumentExtraction Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of jcore-mallet-2.0.9 Show documentation
Show all versions of jcore-mallet-2.0.9 Show documentation
MALLET is a Java-based package for statistical natural language processing, document classification, clustering, topic modeling, information extraction, and other machine learning applications to text.
The newest version!
/* Copyright (C) 2003 Univ. of Massachusetts Amherst, Computer Science Dept.
This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
http://www.cs.umass.edu/~mccallum/mallet
This software is provided under the terms of the Common Public License,
version 1.0, as published by http://www.opensource.org. For further
information, see the file `LICENSE' included with this distribution. */
package cc.mallet.extract;
import org.jdom.Element;
import org.jdom.Document;
import org.jdom.Namespace;
import org.jdom.Text;
import org.jdom.output.XMLOutputter;
import cc.mallet.types.*;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import gnu.trove.THashMap;
/**
* Created: Oct 12, 2004
*
* @author = 0; j--) {
LabeledSpan parent = (LabeledSpan) orderedByStart.get (j);
if (parent.isSubspan (child)) {
List childList = (List) children.get (parent);
if (childList == null) {
childList = new ArrayList ();
children.put (parent, childList);
}
roots.remove (child);
childList.add (child);
break;
}
}
}
CharSequence doc = (CharSequence) document;
Span wholeDoc = new StringSpan (doc, 0, doc.length ());
return new Document (generateElement (rootEltName, wholeDoc, roots, children));
}
/**
 * Recursively builds the JDOM element for one span.
 *
 * <p>If the span has no child spans, the element contains the span's text
 * as a single text node.  Otherwise the element's content is the sequence
 * of its children, with any uncovered text between, before, or after them
 * emitted as plain text nodes.  A child whose label is the background tag
 * is emitted as plain text instead of as a nested element.
 *
 * <p>NOTE(review): the gap-filling logic presumably expects
 * <tt>childSpans</tt> to be ordered by start index and non-overlapping;
 * confirm against the caller.
 *
 * @param parentName  Name of the element to create.
 * @param span        Span whose text the element covers.
 * @param childSpans  List of LabeledSpan directly contained in
 *                    <tt>span</tt>; may be null or empty.
 * @param tree        Map from a LabeledSpan to the List of its child spans.
 * @return The newly created element.
 */
private Element generateElement (String parentName, Span span, List childSpans, THashMap tree)
{
  Element parentElt = new Element (parentName);

  // Leaf: nothing nested inside, so the element is just the span's text.
  if (childSpans == null || childSpans.isEmpty ()) {
    parentElt.setContent (new Text (span.getText ()));
    return parentElt;
  }

  String spanText = span.getText ();
  List childElts = new ArrayList (childSpans.size());
  int start = span.getStartIdx ();
  // All offsets below are relative to the start of this span, because they
  // are used to index into spanText.
  int spanLength = span.getEndIdx () - start;
  int current = 0;

  for (int i = 0; i < childSpans.size(); i++) {
    LabeledSpan childSpan = (LabeledSpan) childSpans.get (i);
    Label childLabel = childSpan.getLabel();
    int childStart = childSpan.getStartIdx () - start;

    // Text lying between the previous child and this one.
    if (childStart > current) {
      childElts.add (new Text (spanText.substring (current, childStart)));
    }

    if (childLabel == backgroundTag) {
      childElts.add (new Text (childSpan.getText()));
    } else {
      String name = childLabel.getEntry ().toString();
      List grandchildren = (List) tree.get (childSpan);
      childElts.add (generateElement (name, childSpan, grandchildren, tree));
    }

    current = childSpan.getEndIdx () - start;
  }

  // Trailing text after the last child.  Compare against the relative span
  // length: comparing the relative offset with the absolute end index was
  // always true for spans not starting at 0 and appended empty text nodes.
  if (current < spanLength) {
    childElts.add (new Text (spanText.substring (current)));
  }

  parentElt.addContent (childElts);
  return parentElt;
}
/**
 * Renders this extraction as XML text.
 *
 * <p>Builds the JDOM document via {@link #toXmlDocument()} and serializes
 * it with a default-configured {@link XMLOutputter}.
 *
 * @return The XML document as a string.
 */
public String toXmlString ()
{
  final Document xmlDoc = toXmlDocument ();
  return new XMLOutputter ().outputString (xmlDoc);
}
/**
 * Reports how many entries <tt>extractedSpans</tt> currently holds.
 *
 * @return The size of <tt>extractedSpans</tt>.
 */
public int size ()
{
  final int spanCount = extractedSpans.size ();
  return spanCount;
}
// Serialization garbage
// Stream class version identifier used by Java serialization.
private static final long serialVersionUID = 1L;
// Format version number that writeObject appends after the default fields.
private static final int CURRENT_SERIAL_VERSION = 1;
/**
 * Serialization hook: writes the default serialized fields, then appends
 * the current format version as an int.
 *
 * @param out  Stream being written to.
 * @throws IOException if the underlying stream fails.
 */
private void writeObject (final ObjectOutputStream out) throws IOException
{
  out.defaultWriteObject ();
  out.writeInt (CURRENT_SERIAL_VERSION);
}
/**
 * Deserialization hook: restores the default serialized fields, then
 * consumes the int version tag that follows them.  The tag's value is
 * not inspected.
 *
 * @param in  Stream being read from.
 * @throws IOException if the underlying stream fails.
 * @throws ClassNotFoundException if a serialized class cannot be resolved.
 */
private void readObject (final ObjectInputStream in)
  throws IOException, ClassNotFoundException
{
  in.defaultReadObject ();
  // Read the version tag and discard it.
  in.readInt ();
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy