gate.cloud.batch.Batch Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of gcp-api Show documentation
The "public API" of GCP, including the input and output handler interfaces and some abstract implementation and utility classes. A GATE plugin that wants to include input or output handler implementations should declare a "provided" dependency on this library.
The newest version!
/*
 *  Batch.java
 *  Copyright (c) 2007-2011, The University of Sheffield.
 *
 *  This file is part of GCP (see http://gate.ac.uk/), and is free
 *  software, licenced under the GNU Affero General Public License,
 *  Version 3, November 2007.
 *
 *
 *  $Id: Batch.java 18202 2014-07-20 18:55:23Z ian_roberts $ 
 */
package gate.cloud.batch;

import gate.CorpusController;
import gate.cloud.io.InputHandler;
import gate.cloud.io.OutputHandler;
import gate.cloud.util.Tools;
import gate.util.GateException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;

import javax.xml.XMLConstants;
import javax.xml.namespace.QName;
import javax.xml.stream.XMLEventReader;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLOutputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamWriter;
import javax.xml.stream.events.XMLEvent;

/**
 * Class representing a cloud batch job.
 */
public class Batch {
  
  /**
   * Constructor, which sets no values. Use the accessor methods to set the 
   * needed values, then call {@link #init()}. 
   */
  public Batch() {
    
  }
  
  
  /**
   * Log4J logger.
   */
  private static Logger logger = LoggerFactory.getLogger(Batch.class);

  private static XMLOutputFactory staxOutputFactory = 
      XMLOutputFactory.newInstance();
  
  private static XMLInputFactory staxInputFactory =
    XMLInputFactory.newInstance();
  
  /**
   * Prepares this batch for execution:
   * 
   * check that all the required values have been set
   * opens the {@link XMLStreamWriter} for the report file
   * updates the documentIDs value if the batch is being restarted after a
   * partial execution
   * 
   * If any problems are found, a {@link GateException} is thrown.
   */
  public void init() throws GateException {
    if(reportFile == null) throw new GateException("No report file set!");
    // restarting logic
    boolean restarting = false;
    // make sure the parent dir exists
    if(!reportFile.getParentFile().exists()
            && !reportFile.getParentFile().mkdirs()) { throw new GateException(
            "Could not create directories for " + reportFile.getAbsolutePath()); }
    File backupFile = new File(reportFile.getAbsolutePath() + ".bak");
    if(reportFile.exists()) {
      // restarting
      restarting = true;
      logger.info("Existing report file found at \""
              + reportFile.getAbsolutePath() + "\", attempting to restart");
      if(!reportFile.renameTo(backupFile)) {
        // try copying
        try {
          byte[] buff = new byte[32 * 1024];
          InputStream in =
                  new BufferedInputStream(new FileInputStream(reportFile));
          try {
            OutputStream out =
                    new BufferedOutputStream(new FileOutputStream(backupFile));
            try {
              int read = in.read(buff);
              while(read != -1) {
                out.write(buff, 0, read);
                read = in.read(buff);
              }
            } finally {
              out.close();
            }
          } finally {
            in.close();
          }
        } catch(IOException e) {
          throw new GateException("Could not restart batch", e);
        }
      }
    }
    // at this point we have the batch file moved/copied to the backup file,
    // so we can overwrite the report file.
    // prepare the report writer
    try {
      reportWriter =
              staxOutputFactory
                      .createXMLStreamWriter(new BufferedOutputStream(
                              new FileOutputStream(reportFile)));
      reportWriter.writeStartDocument();
      reportWriter.writeCharacters("\n");
      reportWriter.setDefaultNamespace(Tools.REPORT_NAMESPACE);
      reportWriter.writeStartElement(Tools.REPORT_NAMESPACE, "cloudReport");
      reportWriter.writeDefaultNamespace(Tools.REPORT_NAMESPACE);
      reportWriter.writeCharacters("\n");
      reportWriter.writeStartElement(Tools.REPORT_NAMESPACE, "documents");
   
    } catch(XMLStreamException e) {
      throw new GateException("Cannot write to the report file!", e);
    } catch(IOException e) {
      throw new GateException("Cannot write to the report file!", e);
    }
    if(restarting) {
      try {
        // in the report XML, the document IDs are only represented as Strings
        logger.debug("Processing existing report file");
        InputStream bakIn =
                new BufferedInputStream(new FileInputStream(backupFile));
        XMLEventReader xer = staxInputFactory.createXMLEventReader(bakIn);
        try {
          // skip until we find the starting documents tag
          XMLEvent event;
          while(xer.hasNext()) {
            event = xer.nextEvent();
            if(event.isStartElement()
                    && event.asStartElement().getName().getLocalPart()
                            .equals("documents")) {
              break;
            }
          }
          // read events from the input reader, write them to the report
          // writer, and remove any completed docIDs from the to-do list
          List events = new LinkedList();
          String currentReturnCode = null;
          String currentDocid = null;
          while(xer.hasNext()) {
            event = xer.nextEvent();
            events.add(event);
            // if this is the start of a processResult, store the return code
            // and ID for later
            if(event.isStartElement()
                    && event.asStartElement().getName().getLocalPart()
                            .equals("processResult")) {
              currentReturnCode =
                      event.asStartElement().getAttributeByName(
                              new QName(XMLConstants.NULL_NS_URI,
                                      "returnCode")).getValue();
              currentDocid =
                      event.asStartElement().getAttributeByName(
                              new QName(XMLConstants.NULL_NS_URI, "id"))
                              .getValue();
            }
            // if we have reached the end of a complete processResult
            // element, write it to the output stream writer
            if(event.isEndElement()
                    && event.asEndElement().getName().getLocalPart().equals(
                            "processResult")) {
              if(currentReturnCode.equals("SUCCESS") && currentDocid != null) {
                completedDocuments.add(currentDocid);
                for(XMLEvent evt : events) {
                  Tools.writeStaxEvent(evt, reportWriter);
                }
              }
              events.clear();
              currentReturnCode = null;
              currentDocid = null;
            }
            // stop if we reach the end of the  element
            if(event.isEndElement()
                    && event.asEndElement().getName().getLocalPart().equals(
                            "documents")) {
              break;
            }
          }
        } catch(Exception e) {
          // ignore, it probably just means end of file, as the XML we
          // are parsing will almost certainly be malformed
          logger.debug("Exception while parsing old report file - probably "
                  + "reached the end of old report", e);
        } finally {
          xer.close();
          bakIn.close();
          backupFile.delete();
        }
        // filter the documents already processed, if the full list is known up front
        if(documentIDs != null) {
          List unprocessedDocs = new ArrayList();
          for(DocumentID docId : documentIDs) {
            if(!completedDocuments.contains(docId.getIdText())) {
              unprocessedDocs.add(docId);
            }
          }
          unprocessedDocumentIDs = unprocessedDocs.toArray(
                  new DocumentID[unprocessedDocs.size()]);
        }
      } catch(XMLStreamException e) {
        throw new GateException("Cannot write to the report file!", e);
      } catch(IOException e) {
        throw new GateException("Cannot write to the report file!", e);
      }
    } else {
      // fresh start
      unprocessedDocumentIDs = documentIDs;
    }
  }

  private String batchId;

  private DocumentID[] documentIDs;

  private DocumentID[] unprocessedDocumentIDs;
  
  private Set completedDocuments = new HashSet();

  private CorpusController gateApplication;

  private File reportFile;

  private InputHandler inputHandler;

  private List outputHandlers;

  private XMLStreamWriter reportWriter;

  /**
   * Gets the ID of the this batch.
   * @return a {@link String} value.
   */
  public String getBatchId() {
    return batchId;
  }
  
  /**
   * Sets the ID of the this batch.
   */
  public void setBatchId(String batchId) {
    this.batchId = batchId;
  }

  /**
   * Gets the {@link File} object denoting the file where the processing report
   * will be written.
   * @return the report file.
   */
  public File getReportFile() {
    return reportFile;
  }
  
  /**
   * Sets the {@link File} object denoting the file where the processing report
   * will be written.
   *
   * @param reportFile the file to which the report should be written.
   */
  public void setReportFile(File reportFile) {
    this.reportFile = reportFile;
  }

  /**
   * Gets the list of output handlers for this batch.
   * @return the batch output handlers.
   */
  public List getOutputHandlers() {
    return outputHandlers;
  }

  /**
   * Sets the list of output handlers for this batch.
   *
   * @param outputHandlers output handlers for this batch.
   */  
  public void setOutputHandlers(List outputHandlers) {
    this.outputHandlers = outputHandlers;
  }

  /**
   * Gets the list of input document IDs in this batch.  May be null
   * for streaming batches.
   * 
   * @return an array of {@link String}s.
   */
  public DocumentID[] getDocumentIDs() {
    return documentIDs;
  }
  
  /**
   * Sets the list of input document IDs in this batch.
   */
  public void setDocumentIDs(DocumentID[] documentIDs) {
    this.documentIDs = documentIDs;
  }

  
  /**
   * This can be used to obtain a file object pointing to the saved version of 
   * the GATE application that should be used for processing.
   * @return a {@link File} object.
   */
  public CorpusController getGateApplication() {
    return gateApplication;
  }

  /**
   * Sets the initial GATE application that this batch will run. The app will
   * be duplicated by the document processor. 
   *
   * @param app the template application for this batch.
   */
  public void setGateApplication(CorpusController app) {
    this.gateApplication = app;
  }
  
  /**
   * Gets the input handler used by this batch.
   * @return a {@link InputHandler} value.
   */
  public InputHandler getInputHandler() {
    return inputHandler;
  }

  /**
   * Sets the input handler used by this batch.
   *
   * @param inputHandler the handler that provides documents for this batch.
   */
  public void setInputHandler(InputHandler inputHandler) {
    this.inputHandler = inputHandler;
  }

  /**
   * Gets the list of output handlers.
   * @return a {@link List} of {@link OutputHandler} objects.
   */
  public List getOutputs() {
    return outputHandlers;
  }

  /**
   * This gets an {@link XMLStreamWriter} that writes to the
   * report file for this batch.
   * @return a writer for the report file, positioned ready to
   * write the next entry for a completed or failed processing
   * job.
   * @throws IOException if an I/O error occurs while creating
   * the writer
   * @throws XMLStreamException if a StAX error occurs while
   * creating the writer.
   */
  public XMLStreamWriter getReportWriter() throws IOException,
          XMLStreamException {
    return reportWriter;
  }

  /**
   * This gets the list of all the documents from this batch that
   * are still to be processed.  For a clean batch this would be the
   * same as {@link #getDocumentIDs()} but for a batch that has
   * been interrupted and restarted the values may be different.
   * May be null for streaming batches, where the full list of
   * document IDs is not known up-front.
   */
  public DocumentID[] getUnprocessedDocumentIDs() {
    return unprocessedDocumentIDs;
  }
  
  /**
   * This gets the set of all document IDs from this batch that
   * have been successfully processed previously.  For a clean
   * batch this would be empty, but for a batch that has been
   * interrupted and restarted the set will contain document
   * IDs that are marked as SUCCEEDED in the partial report
   * file from the previous run.
   */
  public Set getCompletedDocuments() {
    return completedDocuments;
  }

  public String toString() {
    return "Batch ID:         "
            + batchId
            + "\nInput handler:    "
            + inputHandler.toString()
            + "\nOutputs:          "
            + outputHandlers
            + "\nGATE Application: "
            + (gateApplication == null ?
                    "not set" : gateApplication.getName())
            + "\nReport file:      "
            + reportFile
            + "\nInput documents:        "
            + (documentIDs == null ? 0 : documentIDs.length)
            + "\nUnprocessed documents:  "
            + (unprocessedDocumentIDs == null
                    ? 0
                    : unprocessedDocumentIDs.length);
  }
  
}