All downloads are free. Search and download functionality uses the official Maven repository.

gate.cloud.batch.Batch Maven / Gradle / Ivy

Go to download

The "public API" of GCP, including the input and output handler interfaces and some abstract implementation and utility classes. A GATE plugin that wants to include input or output handler implementations should declare a "provided" dependency on this library.

The newest version!
/*
 *  Batch.java
 *  Copyright (c) 2007-2011, The University of Sheffield.
 *
 *  This file is part of GCP (see http://gate.ac.uk/), and is free
 *  software, licenced under the GNU Affero General Public License,
 *  Version 3, November 2007.
 *
 *
 *  $Id: Batch.java 18202 2014-07-20 18:55:23Z ian_roberts $ 
 */
package gate.cloud.batch;

import gate.CorpusController;
import gate.cloud.io.InputHandler;
import gate.cloud.io.OutputHandler;
import gate.cloud.util.Tools;
import gate.util.GateException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;

import javax.xml.XMLConstants;
import javax.xml.namespace.QName;
import javax.xml.stream.XMLEventReader;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLOutputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamWriter;
import javax.xml.stream.events.XMLEvent;

/**
 * Class representing a cloud batch job.
 */
public class Batch {
  
  /**
   * Constructor, which sets no values. Use the accessor methods to set the 
   * needed values, then call {@link #init()}. 
   */
  public Batch() {
    
  }
  
  
  /**
   * Log4J logger.
   */
  private static Logger logger = LoggerFactory.getLogger(Batch.class);

  private static XMLOutputFactory staxOutputFactory = 
      XMLOutputFactory.newInstance();
  
  private static XMLInputFactory staxInputFactory =
    XMLInputFactory.newInstance();
  
  /**
   * Prepares this batch for execution:
   * 
    *
  • check that all the required values have been set
  • *
  • opens the {@link XMLStreamWriter} for the report file
  • *
  • updates the documentIDs value if the batch is being restarted after a * partial execution
  • *
* If any problems are found, a {@link GateException} is thrown. */ public void init() throws GateException { if(reportFile == null) throw new GateException("No report file set!"); // restarting logic boolean restarting = false; // make sure the parent dir exists if(!reportFile.getParentFile().exists() && !reportFile.getParentFile().mkdirs()) { throw new GateException( "Could not create directories for " + reportFile.getAbsolutePath()); } File backupFile = new File(reportFile.getAbsolutePath() + ".bak"); if(reportFile.exists()) { // restarting restarting = true; logger.info("Existing report file found at \"" + reportFile.getAbsolutePath() + "\", attempting to restart"); if(!reportFile.renameTo(backupFile)) { // try copying try { byte[] buff = new byte[32 * 1024]; InputStream in = new BufferedInputStream(new FileInputStream(reportFile)); try { OutputStream out = new BufferedOutputStream(new FileOutputStream(backupFile)); try { int read = in.read(buff); while(read != -1) { out.write(buff, 0, read); read = in.read(buff); } } finally { out.close(); } } finally { in.close(); } } catch(IOException e) { throw new GateException("Could not restart batch", e); } } } // at this point we have the batch file moved/copied to the backup file, // so we can overwrite the report file. 
// prepare the report writer try { reportWriter = staxOutputFactory .createXMLStreamWriter(new BufferedOutputStream( new FileOutputStream(reportFile))); reportWriter.writeStartDocument(); reportWriter.writeCharacters("\n"); reportWriter.setDefaultNamespace(Tools.REPORT_NAMESPACE); reportWriter.writeStartElement(Tools.REPORT_NAMESPACE, "cloudReport"); reportWriter.writeDefaultNamespace(Tools.REPORT_NAMESPACE); reportWriter.writeCharacters("\n"); reportWriter.writeStartElement(Tools.REPORT_NAMESPACE, "documents"); } catch(XMLStreamException e) { throw new GateException("Cannot write to the report file!", e); } catch(IOException e) { throw new GateException("Cannot write to the report file!", e); } if(restarting) { try { // in the report XML, the document IDs are only represented as Strings logger.debug("Processing existing report file"); InputStream bakIn = new BufferedInputStream(new FileInputStream(backupFile)); XMLEventReader xer = staxInputFactory.createXMLEventReader(bakIn); try { // skip until we find the starting documents tag XMLEvent event; while(xer.hasNext()) { event = xer.nextEvent(); if(event.isStartElement() && event.asStartElement().getName().getLocalPart() .equals("documents")) { break; } } // read events from the input reader, write them to the report // writer, and remove any completed docIDs from the to-do list List events = new LinkedList(); String currentReturnCode = null; String currentDocid = null; while(xer.hasNext()) { event = xer.nextEvent(); events.add(event); // if this is the start of a processResult, store the return code // and ID for later if(event.isStartElement() && event.asStartElement().getName().getLocalPart() .equals("processResult")) { currentReturnCode = event.asStartElement().getAttributeByName( new QName(XMLConstants.NULL_NS_URI, "returnCode")).getValue(); currentDocid = event.asStartElement().getAttributeByName( new QName(XMLConstants.NULL_NS_URI, "id")) .getValue(); } // if we have reached the end of a complete 
processResult // element, write it to the output stream writer if(event.isEndElement() && event.asEndElement().getName().getLocalPart().equals( "processResult")) { if(currentReturnCode.equals("SUCCESS") && currentDocid != null) { completedDocuments.add(currentDocid); for(XMLEvent evt : events) { Tools.writeStaxEvent(evt, reportWriter); } } events.clear(); currentReturnCode = null; currentDocid = null; } // stop if we reach the end of the element if(event.isEndElement() && event.asEndElement().getName().getLocalPart().equals( "documents")) { break; } } } catch(Exception e) { // ignore, it probably just means end of file, as the XML we // are parsing will almost certainly be malformed logger.debug("Exception while parsing old report file - probably " + "reached the end of old report", e); } finally { xer.close(); bakIn.close(); backupFile.delete(); } // filter the documents already processed, if the full list is known up front if(documentIDs != null) { List unprocessedDocs = new ArrayList(); for(DocumentID docId : documentIDs) { if(!completedDocuments.contains(docId.getIdText())) { unprocessedDocs.add(docId); } } unprocessedDocumentIDs = unprocessedDocs.toArray( new DocumentID[unprocessedDocs.size()]); } } catch(XMLStreamException e) { throw new GateException("Cannot write to the report file!", e); } catch(IOException e) { throw new GateException("Cannot write to the report file!", e); } } else { // fresh start unprocessedDocumentIDs = documentIDs; } } private String batchId; private DocumentID[] documentIDs; private DocumentID[] unprocessedDocumentIDs; private Set completedDocuments = new HashSet(); private CorpusController gateApplication; private File reportFile; private InputHandler inputHandler; private List outputHandlers; private XMLStreamWriter reportWriter; /** * Gets the ID of the this batch. * @return a {@link String} value. */ public String getBatchId() { return batchId; } /** * Sets the ID of the this batch. 
*/ public void setBatchId(String batchId) { this.batchId = batchId; } /** * Gets the {@link File} object denoting the file where the processing report * will be written. * @return the report file. */ public File getReportFile() { return reportFile; } /** * Sets the {@link File} object denoting the file where the processing report * will be written. * * @param reportFile the file to which the report should be written. */ public void setReportFile(File reportFile) { this.reportFile = reportFile; } /** * Gets the list of output handlers for this batch. * @return the batch output handlers. */ public List getOutputHandlers() { return outputHandlers; } /** * Sets the list of output handlers for this batch. * * @param outputHandlers output handlers for this batch. */ public void setOutputHandlers(List outputHandlers) { this.outputHandlers = outputHandlers; } /** * Gets the list of input document IDs in this batch. May be null * for streaming batches. * * @return an array of {@link String}s. */ public DocumentID[] getDocumentIDs() { return documentIDs; } /** * Sets the list of input document IDs in this batch. */ public void setDocumentIDs(DocumentID[] documentIDs) { this.documentIDs = documentIDs; } /** * This can be used to obtain a file object pointing to the saved version of * the GATE application that should be used for processing. * @return a {@link File} object. */ public CorpusController getGateApplication() { return gateApplication; } /** * Sets the initial GATE application that this batch will run. The app will * be duplicated by the document processor. * * @param app the template application for this batch. */ public void setGateApplication(CorpusController app) { this.gateApplication = app; } /** * Gets the input handler used by this batch. * @return a {@link InputHandler} value. */ public InputHandler getInputHandler() { return inputHandler; } /** * Sets the input handler used by this batch. 
* * @param inputHandler the handler that provides documents for this batch. */ public void setInputHandler(InputHandler inputHandler) { this.inputHandler = inputHandler; } /** * Gets the list of output handlers. * @return a {@link List} of {@link OutputHandler} objects. */ public List getOutputs() { return outputHandlers; } /** * This gets an {@link XMLStreamWriter} that writes to the * report file for this batch. * @return a writer for the report file, positioned ready to * write the next entry for a completed or failed processing * job. * @throws IOException if an I/O error occurs while creating * the writer * @throws XMLStreamException if a StAX error occurs while * creating the writer. */ public XMLStreamWriter getReportWriter() throws IOException, XMLStreamException { return reportWriter; } /** * This gets the list of all the documents from this batch that * are still to be processed. For a clean batch this would be the * same as {@link #getDocumentIDs()} but for a batch that has * been interrupted and restarted the values may be different. * May be null for streaming batches, where the full list of * document IDs is not known up-front. */ public DocumentID[] getUnprocessedDocumentIDs() { return unprocessedDocumentIDs; } /** * This gets the set of all document IDs from this batch that * have been successfully processed previously. For a clean * batch this would be empty, but for a batch that has been * interrupted and restarted the set will contain document * IDs that are marked as SUCCEEDED in the partial report * file from the previous run. */ public Set getCompletedDocuments() { return completedDocuments; } public String toString() { return "Batch ID: " + batchId + "\nInput handler: " + inputHandler.toString() + "\nOutputs: " + outputHandlers + "\nGATE Application: " + (gateApplication == null ? "not set" : gateApplication.getName()) + "\nReport file: " + reportFile + "\nInput documents: " + (documentIDs == null ? 
0 : documentIDs.length) + "\nUnprocessed documents: " + (unprocessedDocumentIDs == null ? 0 : unprocessedDocumentIDs.length); } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy