All Downloads are FREE. The search and download functionality uses the official Maven repository.

gate.cloud.io.file.StreamingFileOutputHelper Maven / Gradle / Ivy

Go to download

The "public API" of GCP, including the input and output handler interfaces and some abstract implementation and utility classes. A GATE plugin that wants to include input or output handler implementations should declare a "provided" dependency on this library.

The newest version!
/*
 *  StreamingFileOutputHelper.java
 *  Copyright (c) 2007-2018, The University of Sheffield.
 *
 *  This file is part of GCP (see http://gate.ac.uk/), and is free
 *  software, licenced under the GNU Affero General Public License,
 *  Version 3, November 2007.
 */
package gate.cloud.io.file;

import static gate.cloud.io.IOConstants.PARAM_BATCH_FILE_LOCATION;
import static gate.cloud.io.IOConstants.PARAM_CHUNK_SIZE;
import static gate.cloud.io.IOConstants.PARAM_COMPRESSION;
import static gate.cloud.io.IOConstants.PARAM_NAMING_STRATEGY;
import static gate.cloud.io.IOConstants.PARAM_PATTERN;
import static gate.cloud.io.IOConstants.VALUE_COMPRESSION_GZIP;
import static gate.cloud.io.IOConstants.VALUE_COMPRESSION_NONE;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.lang.ProcessBuilder.Redirect;
import java.util.Map;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.function.ToIntFunction;
import java.util.zip.GZIPOutputStream;

import gate.Gate;
import gate.cloud.batch.DocumentID;
import gate.cloud.io.IOConstants;
import gate.util.GateException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Helper for streaming output handlers that want to write to a series of chunk
 * files.
 * <p>
 * Items handed to {@link #sendItem} are placed on a bounded queue and written
 * by a single background thread started from {@link #init()}. The current
 * chunk file is closed, and a new one opened for the next item, once its
 * length on disk exceeds the configured chunk size. The length is only checked
 * after roughly 1 MiB of items (as estimated by the item size function) has
 * been written since the last check, so chunk sizes are approximate.
 *
 * @param <TItem> the type of the items that will be streamed to this helper.
 * @param <TWriter> the type of the object that will be used to write items to
 *          the files. It is closed when a chunk is finished.
 */
public class StreamingFileOutputHelper<TItem, TWriter extends AutoCloseable> {

  /**
   * Operation that writes a single item using a writer.
   *
   * @param <W> the writer type
   * @param <I> the item type
   */
  @FunctionalInterface
  public static interface WriteOperation<W, I> {
    public void writeItem(W writer, I item) throws Exception;
  }

  /**
   * Factory that wraps a raw output stream in a writer.
   *
   * @param <W> the writer type
   */
  @FunctionalInterface
  public static interface WriterCreator<W> {
    public W create(OutputStream stream) throws Exception;
  }

  private static final Logger logger =
      LoggerFactory.getLogger(StreamingFileOutputHelper.class);

  /** File name pattern used when none is configured. */
  private static final String DEFAULT_PATTERN = "part-%03d";

  /** Chunk size in bytes used when none (or an invalid one) is configured. */
  private static final long DEFAULT_CHUNK_SIZE = 99000000L;

  /**
   * Estimated number of bytes that must be written before the length of the
   * current chunk file is checked again.
   */
  private static final int CHECK_INTERVAL_BYTES = 1024 * 1024;

  /**
   * {@link String#format} pattern that is given the chunk number to produce
   * each chunk's name.
   */
  protected String pattern;

  /** File length, in bytes, above which the current chunk is closed. */
  protected long chunkSize = -1L;

  /**
   * Compression setting: the GZIP constant, the "none" constant, or any other
   * value, which is treated as an external command line.
   */
  protected String compression;

  /**
   * Parent directory of the batch file, if one was configured. It is set by
   * {@link #config} but not otherwise read by this class.
   */
  protected File batchDir;

  /** Strategy that maps a chunk name to the file it is written to. */
  protected NamingStrategy namingStrategy;

  /** Bounded hand-off queue between producers and the writer thread. */
  protected BlockingQueue<TItem> results = new ArrayBlockingQueue<>(100);

  /** Executor used to wait for external compression processes to exit. */
  protected ExecutorService processWaiter = Executors.newCachedThreadPool();

  /** Marker object, compared by reference, that ends the stream. */
  protected TItem endOfData;

  protected WriterCreator<TWriter> openWriter;

  protected WriteOperation<TWriter, TItem> writeItem;

  protected ToIntFunction<TItem> itemSize;

  /**
   * Construct a streaming output helper.
   * 
   * @param endOfData
   *          flag object that will be used to signal the end of the data
   *          stream. This will be compared by reference and should be an object
   *          that will not otherwise be used in normal processing.
   * @param openWriter
   *          operation that takes an output stream and creates an appropriate
   *          writer object for the item type.
   * @param writeItem operation that writes the given item to the given writer.
   * @param itemSize function that computes an approximate size in bytes of the
   *          given item, used to determine when to check for chunk roll-over.
   */
  public StreamingFileOutputHelper(TItem endOfData,
      WriterCreator<TWriter> openWriter,
      WriteOperation<TWriter, TItem> writeItem, ToIntFunction<TItem> itemSize) {
    super();
    this.endOfData = endOfData;
    this.openWriter = openWriter;
    this.writeItem = writeItem;
    this.itemSize = itemSize;
  }

  /**
   * Read the helper's settings from the given configuration map: the batch
   * file location, naming strategy class, file name pattern, chunk size and
   * compression. Missing values fall back to defaults.
   *
   * @param configData the configuration parameters
   * @throws IOException declared for callers; not thrown directly by this
   *           method
   * @throws GateException if the naming strategy cannot be loaded,
   *           instantiated or configured
   */
  public void config(Map<String, String> configData)
      throws IOException, GateException {
    String batchFileStr = configData.get(PARAM_BATCH_FILE_LOCATION);
    if(batchFileStr != null) {
      batchDir = new File(batchFileStr).getParentFile();
    }
    // naming strategy
    String namingStrategyClassName = configData.get(PARAM_NAMING_STRATEGY);
    if(namingStrategyClassName == null || namingStrategyClassName.isEmpty()) {
      namingStrategyClassName = SimpleNamingStrategy.class.getName();
    }
    try {
      Class<? extends NamingStrategy> namingStrategyClass =
          Class.forName(namingStrategyClassName, true, Gate.getClassLoader())
              .asSubclass(NamingStrategy.class);
      // Class.newInstance() is deprecated; go through the no-arg constructor
      namingStrategy =
          namingStrategyClass.getDeclaredConstructor().newInstance();
      namingStrategy.config(true, configData);
    } catch(Exception e) {
      throw new GateException("Could not instantiate specified naming strategy",
          e);
    }
    pattern = configData.get(PARAM_PATTERN);
    if(pattern == null) {
      pattern = DEFAULT_PATTERN;
    }
    String chunkSizeStr = configData.get(PARAM_CHUNK_SIZE);
    chunkSize = DEFAULT_CHUNK_SIZE;
    if(chunkSizeStr == null) {
      logger.info("Using default chunk size");
    } else {
      try {
        chunkSize = Long.parseLong(chunkSizeStr.trim());
      } catch(NumberFormatException e) {
        // a malformed value is a configuration mistake, so make it visible
        logger.warn("Invalid chunk size \"{}\", using default chunk size",
            chunkSizeStr);
      }
    }
    // get the compression value
    compression = configData.get(PARAM_COMPRESSION);
    if(compression == null) {
      // default
      compression = IOConstants.VALUE_COMPRESSION_NONE;
    }
  }

  /**
   * Start the background thread that takes items off the queue and writes
   * them to the chunk files.
   *
   * @throws IOException declared for callers; not thrown directly by this
   *           method
   * @throws GateException declared for callers; not thrown directly by this
   *           method
   */
  public void init() throws IOException, GateException {
    Thread outputThread =
        new Thread(new StreamOutputter(), "StreamingFileOutputHelper-writer");
    outputThread.start();
  }

  /**
   * Queue an item for writing, blocking while the queue is full. If the
   * calling thread is interrupted while waiting, the item is not queued and
   * the thread's interrupt flag is restored.
   *
   * @param item the item to write
   */
  public void sendItem(TItem item) {
    try {
      results.put(item);
    } catch(InterruptedException e) {
      logger.warn("Interrupted while queueing item for output, item dropped");
      Thread.currentThread().interrupt();
    }
  }

  /**
   * Signal the end of the data stream by queueing the end-of-data marker.
   * This method only enqueues the marker and returns; it does not wait for
   * the writer thread to finish writing.
   *
   * @throws IOException declared for callers; not thrown directly by this
   *           method
   * @throws GateException declared for callers; not thrown directly by this
   *           method
   */
  public void close() throws IOException, GateException {
    try {
      results.put(endOfData);
    } catch(InterruptedException e) {
      logger.warn("Interrupted while queueing end-of-data marker");
      Thread.currentThread().interrupt();
    }
  }

  /**
   * Task that drains the queue and writes each item to the current chunk
   * file, rolling over to a new chunk when the current one grows too large.
   */
  protected class StreamOutputter implements Runnable {
    private File currentFile;

    private int currentChunk = -1;

    private TWriter currentOutput;

    private Process currentProcess;

    /**
     * Write items until the end-of-data marker is taken from the queue or the
     * thread is interrupted, then close the open chunk and shut down the
     * process-waiting executor.
     */
    @Override
    public void run() {
      try {
        try {
          int bytesSinceLastCheck = 0;
          TItem item;
          while((item = results.take()) != endOfData) {
            if(currentOutput == null) {
              try {
                openNextChunk();
              } catch(Exception e) {
                logger.error("Failed to open output file " + currentFile, e);
                // there is no writer to hand the item to, so skip it; the
                // next item will trigger another attempt to open a chunk
                continue;
              }
            }
            try {
              writeItem.writeItem(currentOutput, item);
            } catch(Exception e) {
              logger.warn("Error writing to file " + currentFile, e);
            }
            bytesSinceLastCheck += itemSize.applyAsInt(item);
            if(bytesSinceLastCheck > CHECK_INTERVAL_BYTES) {
              if(currentFile.length() > chunkSize) {
                closeChunk();
              }
              bytesSinceLastCheck = 0;
            }
          }
        } finally {
          closeChunk();
          processWaiter.shutdown();
        }
      } catch(InterruptedException e) {
        Thread.currentThread().interrupt();
      }
    }

    /**
     * Close the current writer, if any, and hand the associated compression
     * process (if any) to the executor so its exit is awaited off this thread.
     */
    private void closeChunk() {
      if(currentOutput == null) {
        return;
      }
      try {
        currentOutput.close();
      } catch(Exception e) {
        logger.warn("Error closing file " + currentFile.getAbsolutePath(), e);
      }
      if(currentProcess != null) {
        final Process p = currentProcess;
        processWaiter.execute(() -> {
          try {
            p.waitFor();
          } catch(InterruptedException e) {
            logger.warn("Interrupted while waiting for process", e);
            Thread.currentThread().interrupt();
          }
        });
        currentProcess = null;
      }
      currentOutput = null;
      currentFile = null;
    }

    /**
     * Pick the next chunk file that does not yet exist and open a writer on
     * it, directly, through GZIP, or through an external command depending on
     * the compression setting. On failure any stream or process opened so far
     * is released and no writer is left set.
     *
     * @throws Exception if the file, the process or the writer cannot be
     *           created
     */
    private void openNextChunk() throws Exception {
      // if we're restarting we might have to skip some batches
      String previousName = null;
      do {
        String newFileName = String.format(pattern, ++currentChunk);
        if(newFileName.equals(previousName)) {
          // the pattern ignores the chunk number, so the loop would otherwise
          // test the same existing file forever
          throw new IOException("Output file pattern \"" + pattern
              + "\" does not vary with the chunk number and " + currentFile
              + " already exists");
        }
        previousName = newFileName;
        currentFile = namingStrategy.toFile(new DocumentID(newFileName));
      } while(currentFile.exists());

      OutputStream newStream = null;
      Process newProcess = null;
      try {
        if(VALUE_COMPRESSION_GZIP.equals(compression)) {
          // keep hold of the file stream so it can be closed if the GZIP
          // wrapper fails to initialise
          newStream = new FileOutputStream(currentFile);
          newStream = new GZIPOutputStream(newStream);
        } else if(compression == null
            || VALUE_COMPRESSION_NONE.equals(compression)) {
          newStream = new FileOutputStream(currentFile);
        } else {
          // treat compression value as a command line
          ProcessBuilder pb =
              new ProcessBuilder(compression.trim().split("\\s+"));
          // NOTE(review): the working directory is left at its default. The
          // previous code called pb.directory() without an argument, which
          // only reads the setting; batchDir may have been intended - confirm
          // before changing.
          pb.redirectInput(Redirect.PIPE);
          pb.redirectOutput(currentFile);
          pb.redirectError(Redirect.INHERIT);
          newProcess = pb.start();
          newStream = newProcess.getOutputStream();
        }
        currentOutput = openWriter.create(newStream);
        // only publish the process once the writer exists, so closeChunk()
        // never sees a process without a writer
        currentProcess = newProcess;
      } catch(Exception e) {
        if(newStream != null) {
          try {
            newStream.close();
          } catch(IOException closeError) {
            e.addSuppressed(closeError);
          }
        }
        if(newProcess != null) {
          newProcess.destroy();
        }
        throw e;
      }
    }
  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy