All downloads are free. The search and download functionality uses the official Maven repository.

cc.mallet.extract.DocumentExtraction Maven / Gradle / Ivy

Go to download

MALLET is a Java-based package for statistical natural language processing, document classification, clustering, topic modeling, information extraction, and other machine learning applications to text.

The newest version!
/* Copyright (C) 2003 Univ. of Massachusetts Amherst, Computer Science Dept.
   This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
   http://www.cs.umass.edu/~mccallum/mallet
   This software is provided under the terms of the Common Public License,
   version 1.0, as published by http://www.opensource.org.  For further
   information, see the file `LICENSE' included with this distribution. */
package cc.mallet.extract;

import org.jdom.Element;
import org.jdom.Document;
import org.jdom.Namespace;
import org.jdom.Text;
import org.jdom.output.XMLOutputter;

import cc.mallet.types.*;

import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;

import gnu.trove.THashMap;

/**
 * Created: Oct 12, 2004
 *
 * @author = 0; j--) {
         LabeledSpan parent = (LabeledSpan) orderedByStart.get (j);
         if (parent.isSubspan (child)) {
           List childList = (List) children.get (parent);
           if (childList == null) {
             childList = new ArrayList ();
             children.put (parent, childList);
           }
           roots.remove (child);
           childList.add (child);
           break;
         }
       }
     }

     CharSequence doc = (CharSequence) document;
     Span wholeDoc = new StringSpan (doc, 0, doc.length ());
     return new Document (generateElement (rootEltName, wholeDoc, roots, children));
   }


  /**
   * Recursively builds the JDOM element for <tt>span</tt>, interleaving the
   * span's uncovered text with the content generated for its child spans.
   *
   * @param parentName name of the element created for <tt>span</tt>
   * @param span span whose text the new element covers
   * @param childSpans the {@link LabeledSpan}s directly beneath <tt>span</tt>;
   *   if null or empty, the element contains just the span's text.
   *   NOTE(review): the gap-filling below assumes these are ordered by start
   *   index and do not overlap -- confirm against the caller.
   * @param tree maps a span to the list of its child spans; consulted when
   *   recursing into a non-background child
   * @return the newly created element
   */
  private Element generateElement (String parentName, Span span, List childSpans, THashMap tree)
  {
    Element parentElt = new Element (parentName);
    if (childSpans == null || childSpans.isEmpty ()) {
      parentElt.setContent (new Text (span.getText ()));
      return parentElt;
    }

    // Fetch the text once; every offset used below is relative to it, i.e.
    // child indices are shifted by <start> before they are used.
    String spanText = span.getText ();
    int start = span.getStartIdx ();
    int current = 0;

    List childElts = new ArrayList (childSpans.size ());
    for (int i = 0; i < childSpans.size (); i++) {
      LabeledSpan childSpan = (LabeledSpan) childSpans.get (i);
      Label childLabel = childSpan.getLabel ();

      // Emit any text lying between the previous child and this one.
      int childStart = childSpan.getStartIdx () - start;
      if (childStart > current) {
        childElts.add (new Text (spanText.substring (current, childStart)));
      }

      if (childLabel == backgroundTag) {
        // Background spans get no element of their own, only their text.
        childElts.add (new Text (childSpan.getText ()));
      } else {
        String name = childLabel.getEntry ().toString ();
        List grandchildren = (List) tree.get (childSpan);
        childElts.add (generateElement (name, childSpan, grandchildren, tree));
      }

      current = childSpan.getEndIdx () - start;
    }

    // Trailing text after the last child.  <current> is relative to the span
    // text, so it is compared with the text length; comparing it with the
    // absolute end index appended an empty Text node whenever start > 0.
    if (current < spanText.length ()) {
      childElts.add (new Text (spanText.substring (current)));
    }

    parentElt.addContent (childElts);
    return parentElt;
  }


  /**
   * Renders the document returned by <tt>toXmlDocument()</tt> as a string,
   * using an <tt>XMLOutputter</tt> created with its no-argument constructor.
   *
   * @return the XML text produced by the outputter
   */
  public String toXmlString ()
  {
    // Build the document first, then hand it to a fresh outputter.
    final Document xml = toXmlDocument ();
    return new XMLOutputter ().outputString (xml);
  }

  /**
   * @return the size reported by <tt>extractedSpans</tt>
   */
  public int size ()
  {
    return this.extractedSpans.size ();
  }
  
	// Serialization support

	// Class version identifier used by Java serialization.
	private static final long serialVersionUID = 1L;

	// Format version that writeObject appends after the default fields;
	// readObject reads it back but does not currently inspect its value.
	private static final int CURRENT_SERIAL_VERSION = 1;

	/**
	 * Serialization hook: writes the default fields and then appends
	 * CURRENT_SERIAL_VERSION as an int.
	 *
	 * @param stream the stream being written to
	 * @throws IOException if writing to the stream fails
	 */
	private void writeObject(ObjectOutputStream stream) throws IOException {
		stream.defaultWriteObject();
		// The version number trails the default fields.
		stream.writeInt(CURRENT_SERIAL_VERSION);
	}

	/**
	 * Serialization hook: reads the default fields and then consumes the
	 * trailing version int, whose value is discarded.
	 *
	 * @param stream the stream being read from
	 * @throws IOException if reading from the stream fails
	 * @throws ClassNotFoundException if a class needed by the default read cannot be found
	 */
	private void readObject(ObjectInputStream stream)
			throws IOException, ClassNotFoundException {
		stream.defaultReadObject();
		// Consume the version int that writeObject appended; it is not checked.
		stream.readInt();
	}

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy