gate.plugin.learningframework.data.CorpusRepresentationMalletLDA Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of learningframework Show documentation
A GATE plugin that provides many different machine learning algorithms for a wide range of NLP-related machine learning tasks like text classification, tagging, or chunking.
There is a newer version: 4.2
Show newest version
/*
 * Copyright (c) 2015-2016 The University Of Sheffield.
 *
 * This file is part of gateplugin-LearningFramework 
 * (see https://github.com/GateNLP/gateplugin-LearningFramework).
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, either version 2.1 of the License, or
 * (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this software. If not, see <http://www.gnu.org/licenses/>.
 */
package gate.plugin.learningframework.data;

import gate.Annotation;
import gate.AnnotationSet;
import java.util.ArrayList;
import java.util.List;
import cc.mallet.pipe.Noop;
import cc.mallet.pipe.Pipe;
import cc.mallet.types.Alphabet;
import cc.mallet.types.FeatureSequence;
import cc.mallet.types.Instance;
import cc.mallet.types.TokenSequence;
import gate.Document;
import gate.plugin.learningframework.ScalingMethod;
import gate.plugin.learningframework.features.FeatureInfo;
import gate.plugin.learningframework.features.TargetType;
import gate.plugin.learningframework.mallet.LFPipe;
import gate.util.GateRuntimeException;
import org.apache.log4j.Logger;
import gate.plugin.learningframework.features.SeqEncoder;
import java.io.InputStream;
import java.io.ObjectInputStream;
import java.net.URL;
import static gate.plugin.learningframework.LFUtils.newURL;
import gate.plugin.learningframework.mallet.LFAlphabet;
import gate.plugin.learningframework.mallet.LFInstanceList;
import java.io.IOException;

public class CorpusRepresentationMalletLDA extends CorpusRepresentationMallet {

  static final Logger LOGGER = Logger.getLogger("CorpusRepresentationMalletLDA");

  public CorpusRepresentationMalletLDA(FeatureInfo fi) {
    featureInfo = fi;  // always null
    // since we always pass a null feature info, the scaling method is hard-wired to be NONE
    scalingMethod = ScalingMethod.NONE;

    // TODO: we really do not need any of this, figure out if we can simplify,
    // but keeping this should not really do any harm!
    Pipe innerPipe = new Noop(new LFAlphabet(), null);
    List pipes = new ArrayList<>();
    pipes.add(innerPipe);
    pipe = new LFPipe(pipes);
    pipe.setFeatureInfo(fi);
    instances = new LFInstanceList(pipe);
    targetType = TargetType.NONE;
  }

  /**
   * Non-public constructor for use when creating from a serialized pipe.
   *
   * @param fi
   */
  CorpusRepresentationMalletLDA(LFPipe pipe) {
    this.pipe = pipe;
    this.featureInfo = pipe.getFeatureInfo();
    this.scalingMethod = ScalingMethod.NONE;
    this.instances = new LFInstanceList(pipe);
    targetType = TargetType.NONE;
  }

  /**
   * Create a new instance based on the pipe stored in directory.
   *
   * @param directory directory with the saved model
   * @return corpus representation instance
   */
  public static CorpusRepresentationMalletLDA load(URL directory) {
    // load the pipe from a Java object serialization representation
    URL inFile = newURL(directory, "pipe.pipe");
    LFPipe lfpipe = null;
    try (InputStream bom = inFile.openStream();
         ObjectInputStream ois = new ObjectInputStream(bom)) {
      lfpipe = (LFPipe) ois.readObject();      
    } catch (ClassNotFoundException | IOException ex) { 
      throw new GateRuntimeException("Could not read pipe from "+inFile,ex);
    }
    CorpusRepresentationMalletLDA crms = new CorpusRepresentationMalletLDA(lfpipe);
    return crms;
  }


  /**
   * Add instances. 
   * 
   * For LDA, the instanceAS is either null or a set of annotations where each 
   * annotation represents a "document". The inputAS is the set containing all
   * token annotations for the GATE document. 
   * 
   * @param instancesAS instance/document annotation set or null
   * @param sequenceAS not used
   * @param inputAS set of token annotations
   * @param classAS not sued
   * @param tokenFeatureName the feature to use for the token string or empty to use the document text
   * @param targetType not used, expected to be Target.NONE
   * @param instanceWeightFeature ignored
   * @param nameFeatureName not used
   * @param seqEncoder not used
   */
  @Override
  public void add(AnnotationSet instancesAS, AnnotationSet sequenceAS, AnnotationSet inputAS, AnnotationSet classAS, String tokenFeatureName, TargetType targetType, String instanceWeightFeature, String nameFeatureName, SeqEncoder seqEncoder) {
    if (instancesAS == null) {
      // create one mallet instance for the whole document
      Document doc = inputAS.getDocument();
      instances.add(getInstanceFor(0L, doc.getContent().size(), inputAS, tokenFeatureName));
    } else {
      // create one mallet instance for each instance annotation
      for (Annotation instanceAnnotation : instancesAS.inDocumentOrder()) {
        instances.add(getInstanceFor(gate.Utils.start(instanceAnnotation),gate.Utils.end(instanceAnnotation),
                inputAS, tokenFeatureName));
      }
    }
  }

  @Override
  public void finishAdding() {  
    if(scalingMethod != ScalingMethod.NONE) {
      throw new GateRuntimeException("Scaling not allowed/not yet implemented for sequence tagging representation");
    }
  }
  
  /**
   * Get a Mallet FeatureSequence Instance for the tokens in the span.
   * The span is what is covered by the original instance annotation.
   * @param from start offset
   * @param to end offset 
   * @param tokenAS  annotation set containing the token-like annotations
   * @param tokenFeatureName feature in the token-like annotations to use or empty for document text
   * @return  mallet instance containing a feature sequence 
   */
  public Instance getInstanceFor(
          long from,
          long to,
          AnnotationSet tokenAS,
          String tokenFeatureName) {

    
    if(tokenFeatureName == null) {
      tokenFeatureName = "";
    }
    Document doc = tokenAS.getDocument();
    List tokenAnnotations = tokenAS.get(from, to).inDocumentOrder();
    // System.err.println("DEBUG: getInstanceFor from="+from+", to="+to+", tokenanns="+tokenAnnotations.size());
    List tokenList = new ArrayList<>();
    String str;
    for(Annotation tokenAnnotation : tokenAnnotations) {
      if(tokenFeatureName.isEmpty()) {
        str = gate.Utils.cleanStringFor(doc, tokenAnnotation);
      } else {
        str = (String)tokenAnnotation.getFeatures().get(tokenFeatureName);
      }
      if(str != null && !str.isEmpty()) {
        tokenList.add(str);
      }
    }
    TokenSequence tokenSeq = new TokenSequence(tokenList.toArray());
    //System.err.println("DEBUG: tokensequence="+tokenSeq);
    //System.err.println("DEBUG: alphabet growStopped()="+instances.getAlphabet().growthStopped());
    
    
    // NOTE: the following will create a feature sequence that contains -1 entries
    // for tokens which are not in the alphabet, if alphabet growth has been stopped
    // FeatureSequence featSeq = tokenSeq.toFeatureSequence(instances.getAlphabet());
    
    // Instead we create the FeatureSequence ourselves
    FeatureSequence featSeq = new FeatureSequence(instances.getAlphabet(), tokenSeq.size());
    Alphabet alph = instances.getAlphabet();
    for(int i=0; i -1) {
        featSeq.add(idx);
      }
    }
    /*
    System.err.println("DEBUG: fseq size="+featSeq.size());
    System.err.println("DEBUG: fseq length="+featSeq.getLength());
    System.err.println("DEBUG: fseq feats="+Arrays.toString(featSeq.getFeatures()));
    System.err.println("DEBUG: fseq feats="+Arrays.toString(featSeq.getFeatures()));
    System.err.println("DEBUG: fseq featIndexSequence="+Arrays.toString(featSeq.toFeatureIndexSequence()));
    */
    // append the start offset to the document name, using a pipe character
    return new Instance(featSeq, null, doc.getName()+"|"+from, null);

  }

  @Override
  public int nrInstances() {
    if(instances == null) {
      return 0;
    } else {
      return instances.size();
    }
  }

  
}