cc.mallet.grmm.learning.GenericAcrfData2TokenSequence Maven / Gradle / Ivy

Go to download

Show more of this group Show more artifacts with this name
Show all versions of jcore-mallet-2.0.9 Show documentation

MALLET is a Java-based package for statistical natural language processing, document classification, clustering, topic modeling, information extraction, and other machine learning applications to text.

The newest version!

/* Copyright (C) 2003 Univ. of Massachusetts Amherst, Computer Science Dept.
   This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
   http://www.cs.umass.edu/~mccallum/mallet
   This software is provided under the terms of the Common Public License,
   version 1.0, as published by http://www.opensource.org.  For further
   information, see the file `LICENSE' included with this distribution. */
package cc.mallet.grmm.learning;


import java.util.ArrayList;
import java.util.regex.*;
import java.io.IOException;
import java.io.ObjectOutputStream;
import java.io.ObjectInputStream;

import cc.mallet.extract.StringSpan;
import cc.mallet.extract.StringTokenization;
import cc.mallet.grmm.util.LabelsAssignment;
import cc.mallet.pipe.Pipe;
import cc.mallet.types.*;

/**
 * Generic pipe that takes a linegroup of the form:
 * <pre>
 *  LABEL1 LABEL2 ... LABELk word feature1 feature2 ... featuren
 * </pre>
 * and converts it into an input FeatureVectorSequence and target LabelsSequence.
 * 
 * If the number of labels at each sequence position could vary, then use this format instead:
 * <pre>
 *  LABEL1 LABEL2 ... LABELk ---- word feature1 feature2 ... featuren
 * </pre>
 * The four dashes ---- must be there to separate the features from the labels.
 * Whitespace is ignored.
 * The difference between this pipe and {@link edu.umass.cs.iesl.casutton.experiments.dcrf.GenericDcrfPipe} is that this pipe
 *  allows for a different number of labels at each sequence position.
 * 
 * Explicitly specifying which word is the token allows the use of the HTML output from
 *  the extract package.
 *
 * Created: Aug 22, 2005
 *
 * @author  0) {
      // if fixed numLabels, just return whether we have enough.
      return j >= numLabels;
    } else {
      // otherwise, use the dynamic labels separator
      return toks[j].equals ("----");
    }
  }

  // Serialization garbage

  // version 1.0 == returned a feature vector sequence
  // Class identifier used by Java serialization for stream compatibility checks.
  private static final long serialVersionUID = 1;
  // Format version tag: writeObject appends this int after the default fields, and
  // readObject reads it back to recognise streams written with version 1 or lower.
  private static final int CURRENT_SERIAL_VERSION = 2;

  /**
   * Serialization hook: emits the default field state, then appends the
   * format version tag ({@code CURRENT_SERIAL_VERSION}) as a trailing int.
   *
   * @param out stream this object is being serialized to
   * @throws IOException if writing to the stream fails
   */
  private void writeObject (ObjectOutputStream out) throws IOException
  {
    // Default fields go first; the version tag follows them in the stream.
    out.defaultWriteObject ();
    final int versionTag = CURRENT_SERIAL_VERSION;
    out.writeInt (versionTag);
  }


  /**
   * Deserialization hook: restores the default field state, then reads the
   * trailing int version tag from the stream.
   * When that tag is 1 or lower, {@code featuresIncludeToken} is forced to
   * {@code true}.
   * NOTE(review): presumably streams that old predate the flag -- confirm.
   *
   * @param in stream this object is being restored from
   * @throws IOException if reading from the stream fails
   * @throws ClassNotFoundException if a class needed by the default read cannot be found
   */
  private void readObject (ObjectInputStream in) throws IOException, ClassNotFoundException
  {
    in.defaultReadObject ();
    final int storedVersion = in.readInt ();
    final boolean legacyStream = storedVersion <= 1;
    if (legacyStream) {
      featuresIncludeToken = true;
    }
  }

  /**
   * Reports the current value of the {@code labelsAtEnd} setting.
   * NOTE(review): presumably this means labels follow the features on each
   * input line instead of preceding them -- confirm against the parsing code.
   *
   * @return the stored {@code labelsAtEnd} flag
   */
  public boolean isLabelsAtEnd ()
  {
    return this.labelsAtEnd;
  }

  /**
   * Stores a new value for the {@code labelsAtEnd} setting.
   *
   * @param labelsAtEnd value to store in the {@code labelsAtEnd} field
   */
  public void setLabelsAtEnd (final boolean labelsAtEnd)
  {
    this.labelsAtEnd = labelsAtEnd;
  }
}