gate.plugin.learningframework.data.CorpusRepresentationVolatileDense2JsonStream Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of learningframework Show documentation
A GATE plugin that provides many different machine learning algorithms for a wide range of NLP-related machine learning tasks like text classification, tagging, or chunking.
There is a newer version: 4.2
Show newest version
/*
 * Copyright (c) 2015-2016 The University Of Sheffield.
 *
 * This file is part of gateplugin-LearningFramework 
 * (see https://github.com/GateNLP/gateplugin-LearningFramework).
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, either version 2.1 of the License, or
 * (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this software. If not, see .
 */
package gate.plugin.learningframework.data;

import gate.Annotation;
import gate.AnnotationSet;
import gate.plugin.learningframework.LFUtils;
import gate.plugin.learningframework.features.FeatureExtractionDense;
import gate.plugin.learningframework.features.FeatureInfo;
import gate.plugin.learningframework.features.FeatureSpecAttribute;
import gate.plugin.learningframework.features.SeqEncoder;
import gate.plugin.learningframework.features.TargetType;
import gate.util.GateRuntimeException;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.List;
import org.apache.log4j.Logger;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import java.util.ArrayList;
import static gate.plugin.learningframework.features.FeatureExtractionBase.*;
import gate.plugin.learningframework.stats.Stats;
import gate.plugin.learningframework.stats.StatsForFeatures;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.HashMap;
import java.util.Map;
import org.apache.commons.math3.stat.descriptive.SummaryStatistics;

/**
 * Common base class for non Mallet volatile representations.
 *
 * This is for representations which are "volatile" i.e. whenever something is
 * added it is not kept in memory. Data could get immediately written to a file
 * or database or immediately passed on to an online training algorithm.
 * 
 * This tries to handle both sequence and non-sequence corpora.
 *
 * @author Johann Petrak
 */
public class CorpusRepresentationVolatileDense2JsonStream extends CorpusRepresentationVolatileBase {

  public static final String DATA_FILE_NAME = "crvd.data.json";
  public static final String META_FILE_NAME = "crvd.meta.json";

  private Logger LOGGER = org.apache.log4j.Logger.getLogger(CorpusRepresentationVolatileDense2JsonStream.class);

  private FileOutputStream outStream;
  private File outDir;
  private File outDataFile;
  private File outMetaFile;
  private FeatureInfo featureInfo; // the feature info from the feauture specification
  private List fnames;
  private StatsForFeatures stats = new StatsForFeatures();
  private SummaryStatistics seqLenStats = new SummaryStatistics(); // if we have sequences, stats about the lengths

  public List getTargetLabels() {
    Stats statsForTarget = stats.getStatistics(StatsForFeatures.KEY_FOR_TARGET);
    if (statsForTarget != null && statsForTarget.isString()) {
      return statsForTarget.stringValues();
    } else {
      return new ArrayList<>();
    }
  }

  public int getNrFeatures() {
    return fnames.size();
  }

  // The following flag is either unset (null) or indicates that the corpus representation
  // has received sequence representations. Once a sequence or non-sequence has been added,
  // any attempt to add the other type will lead to an exception.
  protected Boolean isSequence = null;

  public File getDataFile() {
    return outDataFile;
  }

  public File getMetaFile() {
    if (outMetaFile == null) {
      outMetaFile = new File(outDir, META_FILE_NAME);
    }
    return outMetaFile;
  }

  private final Object LOCKING_OBJECT = new Object();

  // some statistics we update while writing the corpus to the file and those
  // get included in the metadata written as well
  private int linesWritten = 0;

  @Override
  public int nrInstances() {
    return linesWritten;
  }

  /**
   * The constructor needs to specify the file where to save the instances to.
   * 
   * Note: if several threads use this instance, they should all share the just
   * this one instance, and their calls to the add method will automatically get
   * synchronized.
   *
   * @param outDir directory where to save the instances
   * @param featureInfo FeatureInfo instance
   */
  public CorpusRepresentationVolatileDense2JsonStream(File outDir, FeatureInfo featureInfo) {
    this.outDir = outDir;
    this.featureInfo = featureInfo;
    this.fnames = featureSpecAttributes2FeatureNames(featureInfo.getAttributes());
    // NOTE: the actual opening of the output file only happens when we initialise 
    // 
  }

  /**
   * Prevent the addition of new features or feature values when instances are
   * added.
   * For this representation, this is currently a no-operation.
   */
  @Override
  public void stopGrowth() {
    // TODO: this may be useful for sparse volatile representations, not used yet
  }

  /**
   * Enable the addition of new features or feature values when instances are
   * added. 
   * For this representation, this is currently a no-operation.
   */
  @Override
  public void startGrowth() {
    // TODO: this may be useful for sparse volatile representations, not used yet
  }

  /**
   * Add instances from the document.
   *
   * This adds any instances to the corpus, i.e. convert annotations first to
   * the internal dense instance representation, then converts the dense
   * instances to JSON and writes them to the stream which must have been opened
   * when this corpus representation was created.
   * 

   * NOTE: this method is automatically synchronized and should be save to use
   * from multiple threads
   * 

   * !!!TODO: explain which methods are used by this to convert to dense
   * internal instance representation and then to the final output format.
   *
   * @param instancesAS instance annotation set
   * @param sequenceAS sequence annotation set
   * @param inputAS input annotation set
   * @param classAS class annotation set
   * @param targetFeatureName target feature name
   * @param targetType type of target
   * @param instanceWeightFeature weight feature name
   * @param nameFeatureName name feature name
   * @param seqEncoder sequence encoder instance
   */
  @Override
  public void add(
          AnnotationSet instancesAS,
          AnnotationSet sequenceAS,
          AnnotationSet inputAS,
          AnnotationSet classAS,
          String targetFeatureName,
          TargetType targetType,
          String instanceWeightFeature,
          String nameFeatureName,
          SeqEncoder seqEncoder) {
    // first of all, distinguish between processing for sequences and for non-sequences
    // if the sequenceAS parameter is non-null we process sequences of instances, otherwise we process plain instances
    setTargetType(targetType);
    String json;
    if (sequenceAS == null) {
      if (isSequence == null) {
        isSequence = false;
      } else if (isSequence) {
        throw new GateRuntimeException("Trying to add non-sequence after sequence has already been added");
      }
      // processing plain instances
      // For each instance, do this:
      List instanceAnnotations = instancesAS.inDocumentOrder();

      for (Annotation instanceAnnotation : instanceAnnotations) {

        InstanceRepresentation inst
                = labeledAnnotation2Instance(instanceAnnotation, inputAS, classAS,
                        targetFeatureName, targetType, instanceWeightFeature, seqEncoder);
        // now that we have the internal instance representation, send it off
        // by first converting to a json string and then sending the string to the output
        // file
        json = internal2Json(inst,false);
        writeData(json);
      }
    } else {
      if (isSequence == null) {
        isSequence = true;
      } else if (!isSequence) {
        throw new GateRuntimeException("Trying to add sequence after non-sequence has already been added");
      }
      // processing sequences
      for (Annotation sequenceAnnotation : sequenceAS.inDocumentOrder()) {
        List insts4seq
                = instancesForSequence(instancesAS, sequenceAnnotation, inputAS, classAS, targetFeatureName, targetType, seqEncoder);
        seqLenStats.addValue(insts4seq.size());
        json = internal2Json(insts4seq,false);
        writeData(json);
      }

    }
  }

  public void writeData(String json) {
    try {
      synchronized (LOCKING_OBJECT) {
        outStream.write(json.getBytes("UTF-8"));
        outStream.write("\n".getBytes("UTF-8"));
        linesWritten += 1;
      }
    } catch (IOException ex) {
      throw new GateRuntimeException("Could not write generated JSON", ex);
    }
  }

  public List instancesForSequence(
          AnnotationSet instancesAS, Annotation sequenceAnnotation,
          AnnotationSet inputAS, AnnotationSet classAS,
          String targetFeatureName, TargetType targetType, SeqEncoder seqEncoder
  ) {

    // get all the instances from within the sequence in order
    List instanceAnnotations = gate.Utils.getContainedAnnotations(instancesAS, sequenceAnnotation).inDocumentOrder();
    List insts4seq = new ArrayList<>(instanceAnnotations.size());
    // for each instance annotation, get the instance representation and add it to the list
    for (Annotation instanceAnnotation : instanceAnnotations) {
      InstanceRepresentation inst
              = labeledAnnotation2Instance(instanceAnnotation, inputAS, classAS,
                      targetFeatureName, targetType, null, seqEncoder);
      insts4seq.add(inst);
    }

    return insts4seq;
  }

  public List unlabeledInstancesForSequence(
          AnnotationSet instancesAS, Annotation sequenceAnnotation,
          AnnotationSet inputAS
  ) {

    // get all the instances from within the sequence in order
    List instanceAnnotations = gate.Utils.getContainedAnnotations(instancesAS, sequenceAnnotation).inDocumentOrder();
    List insts4seq = new ArrayList<>(instanceAnnotations.size());
    // for each instance annotation, get the instance representation and add it to the list
    for (Annotation instanceAnnotation : instanceAnnotations) {
      InstanceRepresentation inst
              = unlabeledAnnotation2Instance(instanceAnnotation, inputAS, null);
      insts4seq.add(inst);
    }

    return insts4seq;
  }

  /**
   * Convert a labeled instance annotation to an instance representation.
   * 
   * @param instanceAnnotation instance annotation
   * @param inputAS input annotation set
   * @param classAS class annotation set
   * @param targetFeatureName name of target feature
   * @param targetType type of target
   * @param instanceWeightFeature instance weight feature, currently unused
   * @param seqEncoder sequence encoder instance
   * @return InstanceRepresentation
   */
  public InstanceRepresentation labeledAnnotation2Instance(Annotation instanceAnnotation,
          AnnotationSet inputAS, AnnotationSet classAS,
          String targetFeatureName, TargetType targetType,
          String instanceWeightFeature, SeqEncoder seqEncoder) {
    // create a new dense instance representation
    InstanceRepresentation inst = unlabeledAnnotation2Instance(
            instanceAnnotation, inputAS, instanceWeightFeature);
    // add the stats for all the features
    // TODO: maybe this is too slow and eventually we need to just limit this to 
    // adding stats for any list-like features (so the consumer of the data can
    // decide beforehand how to represent those lists).
    addToStatsForFeatures(inst);

    // now add the apropriate target information to the instance, depending on if we
    // do sequence tagging, classification, or regression
    if (classAS != null) {
      // extract the target as required for sequence tagging
      inst = FeatureExtractionDense.extractClassForSeqTagging(inst, classAS, instanceAnnotation, seqEncoder);
      // ok, for this we should have a nominal target value
      stats.addValue(StatsForFeatures.KEY_FOR_TARGET, inst.getTargetValue());
    } else {
      if (targetType == TargetType.NOMINAL) {
        inst = FeatureExtractionDense.extractClassTarget(inst, targetFeatureName, instanceAnnotation, inputAS);
        stats.addValue(StatsForFeatures.KEY_FOR_TARGET, inst.getTargetValue());
      } else if (targetType == TargetType.NUMERIC) {
        inst = FeatureExtractionDense.extractNumericTarget(inst, targetFeatureName, instanceAnnotation, inputAS);
        stats.addValue(StatsForFeatures.KEY_FOR_TARGET, inst.getTargetValue());
      }
    }
    return inst;
  }

  /**
   * Convert an unlabeled instance annotation to an InstanceRepresentation.
   * 
   * @param instanceAnnotation instance annotation 
   * @param inputAS input annotation set
   * @param instanceWeightFeature instance weight feature, currently unused
   * @return InstanceRepresentation
   */
  public InstanceRepresentation unlabeledAnnotation2Instance(Annotation instanceAnnotation,
          AnnotationSet inputAS,
          String instanceWeightFeature) {
    // create a new dense instance representation
    InstanceRepresentation inst = new InstanceRepresentationDenseVolatile();
    // first extract the independent features and add them to the instance representation
    for (FeatureSpecAttribute attr : featureInfo.getAttributes()) {
      inst = FeatureExtractionDense.extractFeature(inst, attr, inputAS, instanceAnnotation);
    }
    if (instanceWeightFeature != null && !instanceWeightFeature.isEmpty()) {
      // If the instanceWeightFeature is not specified we do not set any weight, but if it is 
      // specified then we either try to convert the value to double or use 1.0.
      double score = LFUtils.anyToDoubleOrElse(instanceAnnotation.getFeatures().get(instanceWeightFeature), 1.0);
      inst.setInstanceWeight(score);
    }
    return inst;
  }

  /**
   * Update the feature stastistics from the instance.
   * 
   * @param inst instance from which to update
   */
  public void addToStatsForFeatures(InstanceRepresentation inst) {
    // System.err.println("DEBUG: addToStatsForFeatures for "+inst);
    for (String fname : fnames) {
      stats.addValue(fname, inst.getFeature(fname));
    }
  }

  /**
   * Convert the instance to json.
   *
   * Note: this is influenced by the feature info set in the corpus
   * representation!
   *
   * @param inst instance to convert
   * @param noTarget - if true, does not include  the target(s) and does not use outermost list for 
   * indep / target pair
   * @return JSON string
   */
  public String internal2Json(InstanceRepresentation inst, boolean noTarget) {
    // can this be shared between multiple threads?
    ObjectMapper mapper = new ObjectMapper();
    List