gate.plugin.learningframework.AbstractDocumentProcessor Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of learningframework Show documentation
Show all versions of learningframework Show documentation
A GATE plugin that provides many different machine learning
algorithms for a wide range of NLP-related machine learning tasks like
text classification, tagging, or chunking.
/*
* Copyright (c) 2015-2016 The University Of Sheffield.
*
* This file is part of gateplugin-LearningFramework
* (see https://github.com/GateNLP/gateplugin-LearningFramework).
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 2.1 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this software. If not, see <http://www.gnu.org/licenses/>.
*/
package gate.plugin.learningframework;
import gate.Controller;
import gate.Document;
import gate.Factory;
import gate.Resource;
import gate.creole.AbstractLanguageAnalyser;
import gate.creole.ControllerAwarePR;
import gate.creole.CustomDuplication;
import gate.creole.ExecutionException;
import gate.creole.ResourceInstantiationException;
import gate.creole.metadata.Sharable;
import gate.util.Benchmark;
import gate.util.Benchmarkable;
import gate.util.GateRuntimeException;
import java.io.IOException;
import java.io.InputStream;
import java.util.Properties;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicInteger;
import org.apache.log4j.Logger;
import static gate.Factory.defaultDuplicate;
/**
 * Abstract base class for all the PRs in this plugin.
 * <p>
 * The class implements the GATE callbacks {@code init()}, {@code duplicate()},
 * {@code execute()} and the three controller callbacks, and forwards them to a
 * smaller API that subclasses implement instead:
 * {@link #process(Document)}, {@link #controllerStarted(Controller)} and
 * {@link #controllerFinished(Controller, Throwable)}.
 * <p>
 * It also maintains a set of fields whose setters are marked {@code @Sharable}
 * (document counter, duplicate counters, a lock object and a map for arbitrary
 * shared data) plus a per-instance duplicate id.
 */
// The inheriting class should define a serialVersionUID
@SuppressWarnings("serial")
public abstract class AbstractDocumentProcessor
        extends AbstractLanguageAnalyser
        implements ControllerAwarePR,
        CustomDuplication,
        Benchmarkable {

  /**
   * Logger for this class. Static so that it is neither created per instance
   * nor part of the instance state.
   */
  private static final Logger LOGGER =
          Logger.getLogger(AbstractDocumentProcessor.class.getCanonicalName());

  // This will be shared between all duplicates
  protected AtomicInteger seenDocuments = null;

  /**
   * Set the counter of documents for which processing has been started.
   *
   * @param n the counter instance to use
   */
  @Sharable
  public void setSeenDocuments(AtomicInteger n) {
    seenDocuments = n;
  }

  /**
   * @return the counter of documents for which processing has been started
   */
  public AtomicInteger getSeenDocuments() {
    return seenDocuments;
  }

  /**
   * Number of documents seen by this duplicate only. Created in
   * {@link #init()} and reset to 0 in
   * {@link #controllerExecutionStarted(Controller)}.
   */
  public AtomicInteger seenDocumentsThisDuplicate;

  /** The controller passed to the most recent controller callback. */
  protected Controller controller;

  /** A JVM-wide lock object; not used by this class itself. */
  protected static final Object SYNC_OBJECT = new Object();

  // because the setter for this is marked @Sharable, all duplicates will hold
  // the same reference after initialisation. This is updated in init() and remains
  // forever. This is the actual number of duplicates (1-based, not 0-based)
  protected AtomicInteger nDuplicates = null;

  /**
   * Set the counter holding the number of duplicates.
   *
   * @param n the counter instance to use
   */
  @Sharable
  public void setNDuplicates(AtomicInteger n) {
    nDuplicates = n;
  }

  /**
   * @return the counter holding the number of duplicates, or null before the
   * first instance has been initialised
   */
  public AtomicInteger getNDuplicates() {
    return nDuplicates;
  }

  // the following shared counter is used when processing starts to find out which invocation
  // of the controller started method is the last one, and when processing finishes to figure out which
  // invocation of controller finished/aborted is the last one. The counter gets incremented
  // for each controller started and decremented for each finished/aborted.
  // During execution the counter should hold the actual number of running duplicates and should
  // be equal to nDuplicates
  protected AtomicInteger remainingDuplicates = null;

  /**
   * Set the counter of duplicates for which a controller is currently running.
   *
   * @param n the counter instance to use
   */
  @Sharable
  public void setRemainingDuplicates(AtomicInteger n) {
    remainingDuplicates = n;
  }

  /**
   * @return the counter of duplicates for which a controller is currently running
   */
  public AtomicInteger getRemainingDuplicates() {
    return remainingDuplicates;
  }

  // NOTE(review): the setter replaces only this instance's reference, so a
  // later setLastError() call on one duplicate is presumably not visible to
  // the other duplicates - confirm whether that is intended.
  protected Throwable lastError = null;

  /**
   * Set the last error reported by an aborted controller execution.
   *
   * @param x the error, or null to clear it
   */
  @Sharable
  public void setLastError(Throwable x) {
    lastError = x;
  }

  /**
   * @return the last error stored by
   * {@link #controllerExecutionAborted(Controller, Throwable)}, or null
   */
  public Throwable getLastError() {
    return lastError;
  }

  // Raw type kept on purpose: parameterising it would change the signatures
  // that subclasses compile against.
  protected ConcurrentHashMap sharedData = null;

  /**
   * Set the map that contains any other shared data.
   *
   * @param v the map instance to use
   */
  @Sharable
  public void setSharedData(ConcurrentHashMap v) {
    sharedData = v;
  }

  /**
   * @return the map that contains any other shared data
   */
  public ConcurrentHashMap getSharedData() {
    return sharedData;
  }

  protected Object syncObject = null;

  /**
   * Set the object used for synchronizing between threads that run duplicates.
   *
   * @param val the lock object to use
   */
  @Sharable
  public void setSyncObject(Object val) {
    syncObject = val;
  }

  /**
   * @return the object used for synchronizing between threads that run duplicates
   */
  public Object getSyncObject() {
    return syncObject;
  }

  // Each duplicate holds its own duplicate id after initialisation.
  // The duplicate id is 0-based, not 1-based, so the first duplicate has id 0 and
  // the last nDuplicates-1
  protected int duplicateId = 0;

  /**
   * @return the 0-based id of this duplicate
   */
  public int getDuplicateId() {
    return duplicateId;
  }

  //===============================================================================
  // Implementation of the relevant API methods for DocumentProcessors. These
  // get inherited by the implementing class. This also defines abstract methods
  // that make it easier to handle the control flow:
  // void process(Document doc) - replaces void execute(): process the Document
  // void controllerStarted(Controller) - replaces controllerExecutionStarted(Controller)
  // void controllerFinished(Controller, Throwable) - replaces
  //   controllerExecutionFinished and controllerExecutionAborted
  // int getSeenDocuments().get() - returns the current number of documents
  //   for which processing has been started
  // int seenDocumentsThisDuplicate.get() - the documents seen by only this duplicate
  // int getDuplicateId() - returns the duplicate number for the current duplicate.
  //   this returns 0 for the instance for which init() was invoked first,
  //   usually the template other duplicates were cloned from.
  // int getNDuplicates().get() - returns the current number of duplicates
  //   that exist.
  //================================================================================

  /** Set once the version information has been handled by any instance. */
  static boolean versionInfoShown = false;

  /**
   * Initialise this instance.
   * <p>
   * On the first call in the JVM the build version information is read from
   * {@code git.properties} and printed for snapshot builds. The per-duplicate
   * document counter is always created. If {@link #getNDuplicates()} is still
   * null, this instance is treated as the first one and all shared fields are
   * created.
   *
   * @return this instance
   * @throws ResourceInstantiationException declared by the overridden method
   */
  @Override
  public Resource init() throws ResourceInstantiationException {
    // we always provide the following shared fields to all PRs which are used for duplicated PRs:
    // nDuplicates is an AtomicInt which gets incremented whenever a resource
    // gets duplicated. seenDocuments is an AtomicInt that contains the number
    // of documents for which processing was started already.
    // syncObject is an Object used for synchronizing between threads
    // that run duplicates.
    // sharedData is a ConcurrentHashMap that contains any
    // other shared data.
    // NOTE: this piece of code does not need to get synchronized since we
    // always expect duplication to happen in a single thread, one after the
    // other. Usually, all duplicates will get created from the same first
    // created instance, but we do not rely on that.
    if (!versionInfoShown) {
      showVersionInfo();
      versionInfoShown = true;
    }
    seenDocumentsThisDuplicate = new AtomicInteger(0);
    // The very first instance of this (the "template" which is used to duplicate
    // all others) will have getNDuplicates() null, this is then used to set up
    // all the shared variables
    if (getNDuplicates() == null) {
      LOGGER.debug("DEBUG: creating first instance of PR "+this.getName());
      setNDuplicates(new AtomicInteger(1));
      duplicateId = 0;
      setSharedData(new ConcurrentHashMap<>());
      setSeenDocuments(new AtomicInteger(0));
      setRemainingDuplicates(new AtomicInteger(0));
      setSyncObject(new Object());
      LOGGER.debug("DEBUG: "+this.getName()+" created duplicate "+duplicateId);
    }
    return this;
  }

  /**
   * Print version and commit id for snapshot builds, read from the
   * {@code git.properties} class path resource. A missing resource, a missing
   * version property or a read error is reported or ignored, but never
   * prevents initialisation.
   */
  private void showVersionInfo() {
    // getResourceAsStream returns null if the resource does not exist;
    // try-with-resources tolerates a null resource and closes a non-null one.
    try (InputStream in = getClass().getClassLoader().getResourceAsStream("git.properties")) {
      if (in == null) {
        System.err.println("Could not obtain version info: git.properties not found");
        return;
      }
      Properties properties = new Properties();
      properties.load(in);
      String buildVersion = properties.getProperty("gitInfo.build.version");
      if (buildVersion != null && buildVersion.endsWith("-SNAPSHOT")) {
        System.out.println("LearningFramework version=" + buildVersion
                + " commit=" + properties.getProperty("gitInfo.commit.id.abbrev"));
      }
    } catch (IOException ex) {
      System.err.println("Could not obtain version info: " + ex.getMessage());
    }
  }

  /**
   * Create a duplicate of this instance and assign it the next duplicate id.
   *
   * @param ctx the duplication context passed on to the default duplication
   * @return the newly created duplicate
   * @throws ResourceInstantiationException if the default duplication fails
   * @throws GateRuntimeException if the duplicate counter has not been set up,
   * i.e. {@link #init()} has not run on the template
   */
  @Override
  public Resource duplicate(Factory.DuplicationContext ctx) throws ResourceInstantiationException {
    // NOTE: this piece of code does not need to get synchronized since we
    // always expect duplication to happen in a single thread, one after the
    // other. Usually, all duplicates will get created from the same first
    // created instance, but we do not rely on that.
    // This should never happen since we should have dealt with getNDuplicates() being
    // null in the init() of the template before any duplication occurs!
    if (getNDuplicates() == null || getNDuplicates().get() == 0) {
      throw new GateRuntimeException("This should not happen!");
    }
    // create a new instance of whatever we are and cast to what we need to handle it
    AbstractDocumentProcessor newRes = (AbstractDocumentProcessor) defaultDuplicate(this, ctx);
    // the counter value before the increment is the 0-based id of the new duplicate
    newRes.duplicateId = getNDuplicates().getAndIncrement();
    LOGGER.debug("DEBUG: created duplicate "+newRes.duplicateId+" of PR "+this.getName());
    return newRes;
  }

  /**
   * Count the current document and hand it to {@link #process(Document)}.
   *
   * @throws ExecutionException declared by the overridden method
   */
  @Override
  public void execute() throws ExecutionException {
    // The document counting happens in this synchronized code block.
    synchronized (getSyncObject()) {
      seenDocumentsThisDuplicate.incrementAndGet();
      getSeenDocuments().incrementAndGet();
    }
    // actual processing happens in parallel if there are duplicates
    process(getDocument());
  }

  /**
   * Handle the controller execution aborted callback.
   *
   * This does very much the same as the controller execution finished callback
   * but also stores the last Throwable so it can be inspected by the PR.
   * @param arg0 controller invoking the callback
   * @param arg1 throwable representing the error that was encountered
   * @throws ExecutionException can be thrown
   */
  @Override
  public void controllerExecutionAborted(Controller arg0, Throwable arg1)
          throws ExecutionException {
    controller = arg0;
    setLastError(arg1);
    // pass the throwable as well so the stack trace ends up in the log
    LOGGER.error("Controller ended with error "+arg1.getMessage(), arg1);
    int tmp = getRemainingDuplicates().decrementAndGet();
    LOGGER.debug("DEBUG "+this.getName()+" controllerExecutionAborted invocation "+tmp+" for duplicate "+duplicateId);
    controllerFinished(arg0, arg1);
  }

  /**
   * Handle the controller execution finished callback: remember the
   * controller, decrement the running-duplicates counter and invoke
   * {@link #controllerFinished(Controller, Throwable)} with a null Throwable.
   *
   * @param arg0 controller invoking the callback
   * @throws ExecutionException can be thrown
   */
  @Override
  public void controllerExecutionFinished(Controller arg0)
          throws ExecutionException {
    controller = arg0;
    int tmp = getRemainingDuplicates().decrementAndGet();
    LOGGER.debug(this.getName()+": controllerExecutionFinished invocation "+tmp+" for duplicate "+duplicateId);
    controllerFinished(arg0, null);
  }

  /**
   * Handle the controller execution started callback: remember the
   * controller, reset the per-duplicate document counter, increment the
   * running-duplicates counter and invoke
   * {@link #controllerStarted(Controller)}.
   *
   * @param arg0 controller invoking the callback
   * @throws ExecutionException can be thrown
   */
  @Override
  public void controllerExecutionStarted(Controller arg0)
          throws ExecutionException {
    controller = arg0;
    seenDocumentsThisDuplicate.set(0);
    // we count up to the number of duplicates we have. The first invocation of this is also
    // responsible for resetting the document counter (it needs to be the first because
    // at any later time, another duplicate could already have their execute method invoked)
    int tmp = getRemainingDuplicates().getAndIncrement();
    if (tmp == 0) {
      LOGGER.debug(this.getName()+": First controllerExecutionStarted invocation, resetting error and doc count in duplicate "+duplicateId);
      setLastError(null);
      getSeenDocuments().set(0);
    } else {
      LOGGER.debug(this.getName()+": controllerExecutionStarted invocation number "+tmp+" in duplicate "+duplicateId);
    }
    // The assumption (not checked here) is that invocation happens in the
    // order the duplicates were originally created in GCP, i.e. tmp == duplicateId.
    controllerStarted(arg0);
  }

  //=====================================================================
  // New simplified API for the child classes
  //=====================================================================
  // NOTE: not sure which of these should be abstract (and thus force
  // the programmer to implement them even if empty) and which should be
  // pre-implemented to do nothing.

  /**
   * The new method to implement by PRs which derive from this class.
   *
   * @param document the document to get processed
   *
   */
  protected abstract void process(Document document);

  /**
   * Callback for when each controller gets started on a corpus.
   * This method gets called once when processing starts for a controller.
   * This replaces the controllerExecutionStarted callback which must not be
   * overridden!
   * Note that controllerStarted gets invoked for each duplicate
   * in sequence, without any concurrency and that (in the case of GCP at least)
   * the order of invocation should agree with the order of creation, so it
   * should match the duplicateId assigned to each instance.
   *
   * @param ctrl the controller instance
   */
  public abstract void controllerStarted(Controller ctrl);

  /**
   * Callback for when each controller finishes on a corpus.
   * This method gets called once when processing finishes for a controller.
   * This replaces the controllerExecutionFinished and controllerExecutionAborted
   * callbacks which must not be overridden!
   * If the execution of a controller had an error, the Throwable is non-null,
   * otherwise a null Throwable indicates normal completion.
   * Note that controllerFinished gets invoked for each duplicate
   * in sequence, without any concurrency and that (in the case of GCP at least)
   * the order of invocation should agree with the order of creation, so it
   * should match the duplicateId assigned to each instance.
   *
   * @param ctrl the controller instance
   * @param thrw the Throwable indicating the error or null for normal completion
   */
  public abstract void controllerFinished(Controller ctrl, Throwable thrw);

  /**
   * Record a benchmark checkpoint if benchmarking is enabled; otherwise do
   * nothing. The reported duration is {@code Benchmark.startPoint() - startTime}.
   *
   * @param startTime the start time of the measured section (presumably a
   * value obtained earlier from {@code Benchmark.startPoint()} - confirm with callers)
   * @param name the name of the checkpoint, combined with this PR's benchmark id
   */
  protected void benchmarkCheckpoint(long startTime, String name) {
    if (Benchmark.isBenchmarkingEnabled()) {
      Benchmark.checkPointWithDuration(
              Benchmark.startPoint() - startTime,
              Benchmark.createBenchmarkId(name, this.getBenchmarkId()),
              this, null);
    }
  }

  /**
   * @return the benchmark id that was set explicitly, or the name of this PR
   * if none has been set
   */
  @Override
  public String getBenchmarkId() {
    // A field initialiser cannot capture the name because it runs during
    // construction, so the fallback is resolved at call time instead.
    return (benchmarkId == null) ? getName() : benchmarkId;
  }

  /**
   * @param string the benchmark id to use for this PR
   */
  @Override
  public void setBenchmarkId(String string) {
    benchmarkId = string;
  }

  /** Explicitly set benchmark id; null means "use the PR name". */
  private String benchmarkId;
}