All Downloads are FREE. Search and download functionalities are using the official Maven repository.

cc.mallet.grmm.learning.GenericAcrfData2TokenSequence Maven / Gradle / Ivy

Go to download

MALLET is a Java-based package for statistical natural language processing, document classification, clustering, topic modeling, information extraction, and other machine learning applications to text.

The newest version!
/* Copyright (C) 2003 Univ. of Massachusetts Amherst, Computer Science Dept.
   This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
   http://www.cs.umass.edu/~mccallum/mallet
   This software is provided under the terms of the Common Public License,
   version 1.0, as published by http://www.opensource.org.  For further
   information, see the file `LICENSE' included with this distribution. */
package cc.mallet.grmm.learning;


import java.util.ArrayList;
import java.util.regex.*;
import java.io.IOException;
import java.io.ObjectOutputStream;
import java.io.ObjectInputStream;

import cc.mallet.extract.StringSpan;
import cc.mallet.extract.StringTokenization;
import cc.mallet.grmm.util.LabelsAssignment;
import cc.mallet.pipe.Pipe;
import cc.mallet.types.*;

/**
 * Generic pipe that takes a linegroup of the form:
 * <pre>
 *  LABEL1 LABEL2 ... LABELk word feature1 feature2 ... featuren
 * </pre>
 * and converts it into an input FeatureVectorSequence and target LabelsSequence.
 * <p>
 * If the number of labels at each sequence position could vary, then use this format instead:

 * <pre>
 *  LABEL1 LABEL2 ... LABELk ---- word feature1 feature2 ... featuren
 * </pre>
 * The four dashes ---- must be there to separate the features from the labels.
 * Whitespace is ignored.
 * The difference between this pipe and {@link edu.umass.cs.iesl.casutton.experiments.dcrf.GenericDcrfPipe} is that this pipe
 * allows for a different number of labels at each sequence position.

* Explicitly specifying which word is the token allows the use of the HTML output from * the extract package. * * Created: Aug 22, 2005 * * @author 0) { // if fixed numLabels, just return whether we have enough. return j >= numLabels; } else { // otherwise, use the dynamic labels separator return toks[j].equals ("----"); } } // Serialization garbage // version 1.0 == returned a feature vector sequence private static final long serialVersionUID = 1; private static final int CURRENT_SERIAL_VERSION = 2; private void writeObject (ObjectOutputStream out) throws IOException { out.defaultWriteObject (); out.writeInt (CURRENT_SERIAL_VERSION); } private void readObject (ObjectInputStream in) throws IOException, ClassNotFoundException { in.defaultReadObject (); int version = in.readInt (); if (version <= 1) { featuresIncludeToken = true; } } public boolean isLabelsAtEnd () { return labelsAtEnd; } public void setLabelsAtEnd (boolean labelsAtEnd) { this.labelsAtEnd = labelsAtEnd; } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy