// Copyright 2015-2022 by Carnegie Mellon University
// See license information in LICENSE.txt

package org.cert.netsa.mothra.tools

import org.cert.netsa.io.ipfix.InfoModel
import org.cert.netsa.mothra.packer.{
  CorePacker, DirMapping, PackFileJob, PackerConfig,
  PackerThreadFactory, PackingLogic, PartitionerConfigurator,
  PartitionerPackLogic, RunTimeCodeLoader, Version, WorkFile}

import com.typesafe.scalalogging.StrictLogging
import java.io.{PrintWriter, StringWriter}
import java.lang.management.ManagementFactory
import java.nio.file.{Paths => JPaths}
import java.util.concurrent.{Executors, LinkedBlockingQueue,
  ThreadPoolExecutor, TimeUnit}
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{Path => HPath}
import org.apache.hadoop.io.compress.CompressionCodecFactory
import org.apache.hadoop.util.ShutdownHookManager
import scala.collection.immutable.Queue
import scala.util.control.NonFatal
import scala.util.{Failure, Success, Try}


/**
  * Object to implement the Packer application
  *
  * Typical usage in a Spark environment:
  *
  * `spark-submit --class org.cert.netsa.mothra.tools.PackerMain mothra-tools.jar [--one-shot] <srcDir> <destDir> <workDir> <partitioner>`
  *
  * where:
  *
  * srcDir:       Source (incoming) directory as a Hadoop URI
  * destDir:      Destination directory as a Hadoop URI
  * workDir:      Working directory on the local disk (not file://)
  * partitioner:  Partitioning file as a Hadoop URI
  *
  * Packer scans the source directory (`srcDir`) for IPFIX files.  It splits
  * the IPFIX records in each file into output file(s) in a time-based
  * directory structure based on the partitioning rules in the partitioning
  * file (`partitioner`).  The output files are initially created in the
  * working directory (`workDir`), and when they meet size and/or age
  * thresholds, they are moved to the destination directory (`destDir`).
  *
  * If "--one-shot" is included on the command line, the `srcDir` is only
  * scanned one time.  Once all files in `srcDir` have been packed (or they
  * fail to be packed after some number of attempts), the packer exits.
  *
  * The Java property values that are used by Packer are:
  *
  * `mothra.packer.compression` -- The compression to use for files written to
  * HDFS.  Values typically supported by Hadoop include `bzip2`, `gzip`,
  * `lz4`, `lzo`, `lzop`, `snappy`, and `default`.  The empty string indicates
  * no compression.  The default is no compression.
  *
  * `mothra.packer.maxPackJobs` -- The size of the thread pool that determines
  * the maximum number of input files that may be processed simultaneously.  A
  * larger value provides more throughput.  The default is 1.
  *
  * `mothra.packer.hoursPerFile` -- The number of hours covered by each file
  * in the repository.  The valid range is 1 (a file for each hour) to 24 (one
  * file per day).  The default is 1.
  *
  * `mothra.packer.pollingInterval` -- How long the main thread sleeps (in
  * seconds) between scans (polls) of the source directory checking for IPFIX
  * files to process.  The default is 30.
  *
  * `mothra.packer.workDir.checkInterval` -- The value for how often, in
  * seconds, to check the sizes and ages of the files in the working
  * directory.  The default is 60.  When the `checkInterval` is reached,
  * the sizes and ages of the files in the working directory are checked.
  * Files that meet ONE of the following criteria are closed and moved into
  * the data repository.  The criteria are:
  *
  * --- Files that were created more than `maximumAge` seconds ago.  Since
  * files are only checked at this interval, a file could potentially be one
  * interval older than the `maximumAge`.
  *
  * --- Files whose size exceeds `maximumSize`.  Since a file's size is not
  * continuously monitored, a file could be larger than this size, and the
  * user should set this value appropriately.
  *
  * --- Files whose size is at least `minimumSize` AND that were created at
  * least `minimumAge` seconds ago.
  *
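  * As a sketch (hypothetical helper; the actual check is implemented by the
  * `CheckWorkdir` class in this object), the combined eligibility test is:
  *
  * {{{
  * // sketch: `maximumAge` etc. are the four thresholds described above
  * def shouldMove(ageSeconds: Long, sizeOctets: Long): Boolean =
  *   ageSeconds >= maximumAge ||
  *   sizeOctets >= maximumSize ||
  *   (sizeOctets >= minimumSize && ageSeconds >= minimumAge)
  * }}}
  *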
  * `mothra.packer.workDir.maximumAge` -- Files in the working directory that
  * were created over this number of seconds ago are always moved into the
  * repository, regardless of their size.  The default value is 1800 seconds
  * (30 minutes).
  *
  * `mothra.packer.workDir.maximumSize` -- Files in the working directory
  * whose size, in octets, is greater than this value are always moved into
  * the repository, regardless of their age.  The default value is 104857600
  * octets (100 MiB).
  *
  * `mothra.packer.workDir.minimumAge` -- Files in the working directory are
  * NOT eligible to be moved into the repository if they are younger than this
  * age (were created less than this number of seconds ago) unless their size
  * exceeds `maximumSize`.  The default is 300 seconds (5 minutes).
  *
  * `mothra.packer.workDir.minimumSize` -- Files in the working directory are
  * NOT eligible to be moved into the repository if they are smaller than this
  * size (in octets) unless their age exceeds `maximumAge`.  The default is
  * 67108864 octets (64 MiB).
  *
  * `mothra.packer.numMoveThreads` -- The size of the thread pool that closes
  * the work files and moves them to the destination directory.  A task is
  * potentially created every `workdirCheckInterval` seconds if files are
  * determined to have met the limits.  The default is 4.
  *
  * `mothra.packer.archiveDirectory` -- The root directory into which working
  * files are moved after the packer copies their content to the repository,
  * as a Hadoop URI.  If not specified, the working files are deleted.
  *
  * `mothra.packer.packAttempts` -- The number of times the packer attempts to
  * process a file found in the srcDir.  After this number of failed attempts,
  * the file is ignored by this invocation of the packer.  The default is 3.
  *
  * `mothra.packer.fileCacheSize` -- The maximum size of the open file cache.
  * This is the maximum number of open files maintained by the file cache for
  * writing to files in the work directory.  The packer does not limit the
  * number of files in the work directory; this only limits the number of open
  * files.  Once the cache reaches this number of open files and the packer
  * needs to (re-)open a file, the packer closes the least-recently-used file.
  * This value does not include the file handles required when reading
  * incoming files or when copying files from the work directory to the data
  * directory.  The default is 2000; the minimum permitted is 128.
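  *
  * For example (illustrative paths and property values), the properties may
  * be passed to the driver JVM on the `spark-submit` command line:
  *
  * {{{
  * spark-submit \
  *   --driver-java-options '-Dmothra.packer.compression=snappy -Dmothra.packer.maxPackJobs=4' \
  *   --class org.cert.netsa.mothra.tools.PackerMain mothra-tools.jar \
  *   hdfs:///data/incoming hdfs:///data/repository /var/mothra/work \
  *   hdfs:///data/conf/partitioner.scala
  * }}}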
  */
object PackerMain extends App with StrictLogging {

  def usage(full: Boolean = false): Unit = {
    print(s"""
Usage: spark-submit --class org.cert.netsa.mothra.tools.PackerMain mothra-tools.jar [--one-shot] <srcDir> <destDir> <workDir> <partitioner>

where:

srcDir:       Source (incoming) directory as a Hadoop URI
destDir:      Destination directory as a Hadoop URI
workDir:      Working directory on the local disk (not file://)
partitioner:  Partitioning file as a Hadoop URI
""")
    if (full) {
      print(s"""
Packer scans the source directory (`srcDir`) for IPFIX files.  It splits
the IPFIX records in each file into output file(s) in a time-based
directory structure based on the partitioning rules in the partitioning
file (`partitioner`).  The output files are initially created in the
working directory (`workDir`), and when they meet size and/or age
thresholds, they are moved to the destination directory (`destDir`).

If "--one-shot" is included on the command line, the `srcDir` is only
scanned one time.  Once all files in `srcDir` have been packed (or they
fail to be packed after some number of attempts), the packer exits.

The Java property values that are used by the packer are:

`mothra.packer.compression` -- The compression to use for files written to
HDFS.  Values typically supported by Hadoop include `bzip2`, `gzip`,
`lz4`, `lzo`, `lzop`, `snappy`, and `default`.  The empty string indicates
no compression.  The default is `${CorePacker.DEFAULT_COMPRESSION}`.

mothra.packer.maxPackJobs -- The size of the thread pool that determines the
maximum number of input files that may be processed simultaneously.  A larger
value provides more throughput.  The default is ${DEFAULT_MAX_PACK_JOBS}.

mothra.packer.hoursPerFile -- The number of hours covered by each file in the
repository.  The valid range is 1 (a file for each hour) to 24 (one file per
day).  The default is ${CorePacker.DEFAULT_HOURS_PER_FILE}.

mothra.packer.pollingInterval -- How long the main thread sleeps (in seconds)
between scans (polls) of the source directory checking for IPFIX files to
process.  The default is ${DEFAULT_POLL_INTERVAL}.

mothra.packer.workDir.checkInterval -- The value for how often, in seconds, to
check the sizes and ages of the files in the working directory.  The default
is ${DEFAULT_WORKDIR_CHECK_INTERVAL}.
When the checkInterval is reached, the sizes and ages of the files in
the working directory are checked.  Files that meet ONE of the following
criteria are closed and moved into the data repository.  The criteria are:

* Files that were created more than `maximumAge` seconds ago.  Since files are
only checked at this interval, a file could potentially be one interval older
than the `maximumAge`.

* Files whose size exceeds `maximumSize`.  Since a file's size is not
continuously monitored, a file could be larger than this size, and the user
should set this value appropriately.

* Files whose size is at least `minimumSize` AND that were created at least
`minimumAge` seconds ago.

mothra.packer.workDir.maximumAge -- Files in the working directory that were
created over this number of seconds ago are always moved into the repository,
regardless of their size.  The default value is ${DEFAULT_WORKDIR_MAXIMUM_AGE} seconds.

mothra.packer.workDir.maximumSize -- Files in the working directory whose
size, in octets, is greater than this value are always moved into the
repository, regardless of their age.  The default value is
${DEFAULT_WORKDIR_MAXIMUM_SIZE} octets.

mothra.packer.workDir.minimumAge -- Files in the working directory are only
moved into the repository once they reach this age (were created over this
number of seconds ago) unless their size exceeds maximumSize.  The default is
${DEFAULT_WORKDIR_MINIMUM_AGE} seconds.

mothra.packer.workDir.minimumSize -- Files in the working directory are only
moved into the repository once they reach this size (in octets) unless their
age exceeds maximumAge.  The default is ${DEFAULT_WORKDIR_MINIMUM_SIZE} octets.

`mothra.packer.numMoveThreads` -- The size of the thread pool that closes
the work files and moves them to the destination directory.  A task is
potentially created every `workdirCheckInterval` seconds if files are
determined to have met the limits.  The default is ${CorePacker.DEFAULT_NUM_MOVE_THREADS}.

mothra.packer.archiveDirectory -- The root directory into which working files
are moved after the packer copies their content to the repository, as a Hadoop
URI.  If not specified, the working files are deleted.

mothra.packer.packAttempts -- The number of times the packer attempts to
process a file found in the srcDir.  After this number of failed attempts,
the file is ignored by this invocation of the packer.  The default is ${DEFAULT_PACK_ATTEMPTS}.

mothra.packer.fileCacheSize -- The maximum size of the open file cache.
This is the maximum number of open files maintained by the file cache for
writing to files in the work directory.  The packer does not limit the
number of files in the work directory; this only limits the number of open
files.  Once the cache reaches this number of open files and the packer
needs to (re-)open a file, the packer closes the least-recently-used file.
This value does not include the file handles required when reading
incoming files or when copying files from the work directory to the data
directory.  The default is ${CorePacker.DEFAULT_FILE_CACHE_SIZE}; the minimum
permitted is ${CorePacker.MINIMUM_FILE_CACHE_SIZE}.
""")
    }
    System.exit(if (full) { 0 } else { 1 })
  }

  def version(): Unit = {
    println(Version.get())
    System.exit(0)
  }

  //  <<<<<  DEFAULT VALUES FOR PROPERTY SETTINGS  >>>>>  //

  /**
    * Priority at which Cleanup.run() is invoked during shutdown.  For
    * reference, the Hadoop file system shutdown priority is 10.
    */
  private val SHUTDOWN_PRIORITY = 50

  /**
    * Default value for the size of the thread pool that determines
    * the maximum number of input files that may be processed
    * simultaneously.  A larger value provides more throughput.  This
    * value may be specified at run-time by specifying the following
    * property: mothra.packer.maxPackJobs
    */
  val DEFAULT_MAX_PACK_JOBS = 1

  /**
    * Default value for the number of times the packer attempts to process an
    * incoming file.  After this number of failed attempts, the file is left
    * in the incoming directory but ignored by the Packer.
    */
  val DEFAULT_PACK_ATTEMPTS = 3

  /**
    * Default value for how long the main thread sleeps (in seconds)
    * between scans (polls) of the source directory checking for IPFIX
    * files to process.  This value may be specified at run-time by
    * specifying the following property: mothra.packer.pollingInterval
    */
  val DEFAULT_POLL_INTERVAL = 30

  /**
    * Default value for how often, in seconds, to check the sizes and ages of
    * the files in the working directory.
    *
    * When this interval is reached, the sizes and ages of the files in the
    * working directory are checked.  Files that meet ONE of the following
    * criteria are closed and moved into the data repository.  The criteria
    * are:
    *
    * Files that were created more than `maximumAge` seconds ago.  Since files
    * are only checked at this interval, a file could potentially be one
    * interval older than the `maximumAge`.
    *
    * Files whose size exceeds `maximumSize`.  Since a file's size is not
    * continuously monitored, a file could be larger than this size, and the
    * user should set this value appropriately.
    *
    * Files whose size is at least `minimumSize` AND that were created at
    * least `minimumAge` seconds ago.
    *
    * This value may be specified at run-time by specifying the following
    * property: mothra.packer.workDir.checkInterval
    */
  val DEFAULT_WORKDIR_CHECK_INTERVAL = 60

  /**
    * Default value for the "maximum" age (`maximumAge`) of a file in the
    * working directory, as explained in the documentation of
    * `DEFAULT_WORKDIR_CHECK_INTERVAL`.  This value may be specified at
    * run-time by specifying the following property:
    * mothra.packer.workDir.maximumAge
    */
  val DEFAULT_WORKDIR_MAXIMUM_AGE = 60 * 30

  /**
    * Default value for the "maximum" size (`maximumSize`) of a file in the
    * working directory, as explained in the documentation of
    * `DEFAULT_WORKDIR_CHECK_INTERVAL`.  This value may be specified at
    * run-time by specifying the following property:
    * mothra.packer.workDir.maximumSize
    */
  val DEFAULT_WORKDIR_MAXIMUM_SIZE = 100 * 0x100000

  /**
    * Default value for the "minimum" age (`minimumAge`) of a file in the
    * working directory, as explained in the documentation of
    * `DEFAULT_WORKDIR_CHECK_INTERVAL`.  This value may be specified at
    * run-time by specifying the following property:
    * mothra.packer.workDir.minimumAge
    */
  val DEFAULT_WORKDIR_MINIMUM_AGE = 60 * 5

  /**
    * Default value for the "minimum" size (`minimumSize`) of a file in the
    * working directory, as explained in the documentation of
    * `DEFAULT_WORKDIR_CHECK_INTERVAL`.  This value may be specified at
    * run-time by specifying the following property:
    * mothra.packer.workDir.minimumSize
    */
  val DEFAULT_WORKDIR_MINIMUM_SIZE = 64 * 0x100000


  //  <<<<<  PROCESS THE COMMAND LINE ARGUMENTS  >>>>>  //

  // treat anything starting with "-" as a switch so that both short ("-V")
  // and long ("--version") options reach the collect block below
  val (switches, positionalArgs) = args.partition { _.startsWith("-") }

  var oneShot = false

  switches.collect {
    case "-V" | "--version" => version()
    case "-h" | "--help" => usage(true)
    case "--one-shot" | "--oneshot" => oneShot = true
    case unknown: String =>
      println(s"Unknown argument '${unknown}'")
      usage()
  }


  logger.info("\n=============================" +
    " Packer is starting =============================\n")
  logger.info(s"This is packer ${Version.get()}")

  val reqArgs = 4
  if ( positionalArgs.length != reqArgs ) {
    println(s"Called with ${positionalArgs.length} args;" +
      s" exactly ${reqArgs} required")
    logger.error(
      s"Called with ${positionalArgs.length} args; exactly ${reqArgs} required")
    logger.debug(s"Args were ${positionalArgs}")
    usage()
  }

  /** The source (incoming) directory for ipfix files to be processed. */
  val incomingDir = new HPath(positionalArgs(0))

  /** The data repository (output directory). */
  val rootDir = new HPath(positionalArgs(1))

  /** A local directory used to first create the output files; this directory
    * must allow files to be closed and re-opened. */
  val workDir = JPaths.get(positionalArgs(2))

  /** The packing logic (a .scala source file). */
  val packLogicPath = new HPath(positionalArgs(3))


  //  <<<<<  USE THE PROPERTY SETTINGS OR DEFAULT VALUES  >>>>>  //

  // whether the program is running or is shutting down
  @volatile
  private var running = true

  // Hadoop configuration
  implicit val conf: Configuration = new Configuration()

  /** The information model */
  val infoModel = InfoModel.getCERTStandardInfoModel()

  /** The root of an additional directory to which working files are copied
    * after being copied into the `rootDir`.  This value is determined by the
    * "mothra.packer.archiveDirectory" property.  When the property is not
    * set, the files are removed.
    */
  val archiveDir = sys.props.get("mothra.packer.archiveDirectory").
    map { x: String => new HPath(x) }

  /** The compression codec used for files written to HDFS.  This value is
    * determined by the "mothra.packer.compression" property, or
    * CorePacker.DEFAULT_COMPRESSION when that property is not set.
    */
  val compressCodec = {
    val compressName =
      sys.props.get("mothra.packer.compression").getOrElse(
        CorePacker.DEFAULT_COMPRESSION)
    if ( compressName == "" ) {
      None
    } else {
      Try {
        //logger.trace(s"have a name ${compressName}")
        val factory = new CompressionCodecFactory(conf)
        //logger.trace(s"have a factory ${factory}")
        val codec = factory.getCodecByName(compressName)
        //logger.trace(s"have a codec ${codec}")
        // Make sure we can create a compressor, not using it here.
        codec.createCompressor()
        //logger.trace(s"have a compressor ${compressor}")
        codec
      } match {
        case Success(ok) =>
          Option(ok)
        case Failure(e) =>
          logger.error("Unable to initialize compressor" +
            s" '${compressName}': ${e.toString}")
          val sw = new StringWriter
          e.printStackTrace(new PrintWriter(sw))
          logger.info("Unable to initialize compressor" +
            s" '${compressName}': ${sw.toString}")
          logger.warn("Using no compression for IPFIX files")
          None
      }
    }
  }
  //logger.trace(s"compressCodec is ${compressCodec}")

  /** The number of hours covered by each file in the repository.  This value is
    * determined by the "mothra.packer.hoursPerFile" property, or
    * CorePacker.DEFAULT_HOURS_PER_FILE when that property is not set.
    */
  val hoursPerFile: Int = sys.props.get("mothra.packer.hoursPerFile").
    map { _.toInt }.getOrElse(CorePacker.DEFAULT_HOURS_PER_FILE)
  require( hoursPerFile >= 1 && hoursPerFile <= 24 )

  /** The maximum number of [[PackFileJob]] instances to run simultaneously.
    * This value is determined by the "mothra.packer.maxPackJobs" property, or
    * DEFAULT_MAX_PACK_JOBS when that property is not set.
    */
  val maxPackJobs: Int = sys.props.get("mothra.packer.maxPackJobs").
    map { _.toInt }.getOrElse(DEFAULT_MAX_PACK_JOBS)
  require( maxPackJobs >= 1 )

  /** The number of times the packer attempts to process an incoming file.
    * After this number of failed attempts, the file is ignored by this
    * invocation of the packer.  This value is determined by the
    * "mothra.packer.packAttempts" property, or DEFAULT_PACK_ATTEMPTS when
    * that property is not set.
    */
  val packAttempts: Int = sys.props.get("mothra.packer.packAttempts").
    map { _.toInt }.getOrElse(DEFAULT_PACK_ATTEMPTS)
  require( packAttempts >= 1 )

  /** How long to wait between polls of the incoming (source) directory,
    * in seconds.  This value is determined by the
    * "mothra.packer.pollingInterval" property, or DEFAULT_POLL_INTERVAL when
    * that property is not set.
    */
  val pollingInterval: Int = sys.props.get("mothra.packer.pollingInterval").
    map { _.toInt }.getOrElse(DEFAULT_POLL_INTERVAL)
  require( pollingInterval >= 1 )

  /** How often to check whether the size and age of files in the working
    * directory meet limits, in seconds.  This value is determined by the
    * "mothra.packer.workDir.checkInterval" property, or
    * DEFAULT_WORKDIR_CHECK_INTERVAL when that property is not set.
    */
  val workdirCheckInterval: Int =
    sys.props.get("mothra.packer.workDir.checkInterval").
      map { _.toInt }.getOrElse(DEFAULT_WORKDIR_CHECK_INTERVAL)
  require( workdirCheckInterval >= 1 )

  /** Minimum size a file in the working directory must have before it is moved
    * to the repository, in octets, unless its age exceeds
    * `workdirMaximumAge`.  This value is determined by the
    * "mothra.packer.workDir.minimumSize" property, or
    * DEFAULT_WORKDIR_MINIMUM_SIZE when that property is not set.
    */
  val workdirMinimumSize: Long =
    sys.props.get("mothra.packer.workDir.minimumSize").
      map { _.toLong }.getOrElse(DEFAULT_WORKDIR_MINIMUM_SIZE)
  require( workdirMinimumSize >= 1 )

  /** Approximate maximum size for a file in the working directory.  Files
    * larger than this are moved regardless of their age.  This value is
    * determined by the "mothra.packer.workDir.maximumSize" property, or
    * DEFAULT_WORKDIR_MAXIMUM_SIZE when that property is not set.
    */
  val workdirMaximumSize: Long =
    sys.props.get("mothra.packer.workDir.maximumSize").
      map { _.toLong }.getOrElse(DEFAULT_WORKDIR_MAXIMUM_SIZE)
  require( workdirMaximumSize >= 1 )

  /** Minimum age a file in the working directory must have before it is moved
    * to the repository, in seconds, unless its size exceeds
    * `workdirMaximumSize`.  This value is determined by the
    * "mothra.packer.workDir.minimumAge" property, or
    * DEFAULT_WORKDIR_MINIMUM_AGE when that property is not set.
    */
  val workdirMinimumAge: Int =
    sys.props.get("mothra.packer.workDir.minimumAge").
      map { _.toInt }.getOrElse(DEFAULT_WORKDIR_MINIMUM_AGE)
  require( workdirMinimumAge >= 1 )

  /** Approximate maximum age for a file in the working directory.  Files older
    * than this are moved regardless of their size.  This value is determined
    * by the "mothra.packer.workDir.maximumAge" property, or
    * DEFAULT_WORKDIR_MAXIMUM_AGE when that property is not set.
    */
  val workdirMaximumAge: Int =
    sys.props.get("mothra.packer.workDir.maximumAge").
      map { _.toInt }.getOrElse(DEFAULT_WORKDIR_MAXIMUM_AGE)
  require( workdirMaximumAge >= 1 )

  /** The maximum number of open files maintained by the file cache.  This value
    * is determined by the `mothra.packer.fileCacheSize` Java property, or by
    * `CorePacker.DEFAULT_FILE_CACHE_SIZE` when the property is not set.  This
    * value must be no less than `CorePacker.MINIMUM_FILE_CACHE_SIZE`.
    * @see CorePacker.DEFAULT_FILE_CACHE_SIZE for a full description of this
    * value. */
  val fileCacheSize: Int = sys.props.get("mothra.packer.fileCacheSize").
      map { _.toInt }.getOrElse(CorePacker.DEFAULT_FILE_CACHE_SIZE)
  require( fileCacheSize >= CorePacker.MINIMUM_FILE_CACHE_SIZE )

  /** The size of the thread pool that closes the work files and moves them to
    * the destination directory.  A task is potentially created every
    * `workdirCheckInterval` seconds if files are determined to have met the
    * limits.  This value is determined by the "mothra.packer.numMoveThreads"
    * property, or CorePacker.DEFAULT_NUM_MOVE_THREADS when that property is not
    * set.
    */
  val numMoveThreads: Int = sys.props.get("mothra.packer.numMoveThreads").
    map { _.toInt }.getOrElse(CorePacker.DEFAULT_NUM_MOVE_THREADS)
  require( numMoveThreads >= 1 )

  val packConf = PackerConfig(rootDir, workDir, archiveDir, compressCodec)


  //  <<<<<  DEFINE SOME HELPER CLASSES  >>>>>  //

  /** Class for flushing and closing open files at shutdown.  Its run()
    * method is invoked by the ShutdownHookManager. */
  private class Cleanup extends Runnable {
    def run(): Unit = {
      running = false
      logger.info("Shutting down...")
      packer.flushAllWorkFiles()
      packer.closeAllWorkFiles(true)
      logger.debug("Closed files")
      logger.info("Shutting down...done.")
    }
  }

  /**
    * Checks the files in the workdir to see whether any should be moved to
    * the repository because of age or size.  This runnable is periodically
    * started by the `checkWorkdirPool` thread pool.
    */
  private class CheckWorkdir extends Runnable {
    /** Determine whether `workfile` should be moved to the repository.  The
      * `workfile` is moved if its size or age meets one of these thresholds:
      * its size is larger than `workdirMaximumSize`, its age (in seconds) is
      * older than `workdirMaximumAge`, or its size AND age are at least
      * `workdirMinimumSize` and `workdirMinimumAge`.
      */
    private[this] def checkWorkFile(workfile: WorkFile): Boolean = {
      val age = workfile.age()
      if ( !running ) {
        false
      } else if ( age >= workdirMaximumAge ) {
        logger.debug("Found file that exceeds" +
          s" maximum age ${age} >= ${workdirMaximumAge}" +
          s": '${workfile.key}'")
        true
      } else {
        workfile.flush()
        val size = workfile.size()
        if ( size >= workdirMaximumSize ) {
          logger.debug("Found file that exceeds" +
            s" maximum size ${size} >= ${workdirMaximumSize}" +
            s": '${workfile.key}'")
          true
        } else if ( size >= workdirMinimumSize && age >= workdirMinimumAge ) {
          logger.debug("Found file that exceeds" +
            s" minimum size ${size} >= ${workdirMinimumSize}" +
            s" and minimum age ${age} >= ${workdirMinimumAge}" +
            s": '${workfile.key}'")
          true
        } else {
          false
        }
      }
    }

    override def run(): Unit = {
      if ( running ) {
        try {
          packer.checkWorkingFiles(checkWorkFile)
        } catch {
          case NonFatal(e) =>
            logger.warn(s"Error checking work files': ${e.toString}")
        }
      }
    }
  }


  //  <<<<<  LOG THE SETTINGS  >>>>>  //

  logger.info("Packer settings::")
  logger.info(s"Output compression: ${compressCodec.getOrElse("none")}")
  logger.info(s"Hours covered by each file: ${hoursPerFile}")
  logger.info(s"Maximum simultaneous pack jobs: ${maxPackJobs}")
  if ( oneShot ) {
    logger.info("Will shut down after a single scan of the incoming directory")
    logger.info(s"Ignoring polling interval value (${pollingInterval})")
  } else {
    logger.info(s"Polling interval of the incoming directory: ${pollingInterval}")
  }
  logger.info(s"Interval for checking size & age of working files: ${workdirCheckInterval}")
  logger.info(s"Approximate size range for closing working files: ${workdirMinimumSize} to ${workdirMaximumSize}")
  logger.info(s"Approximate age range for closing working files: ${workdirMinimumAge} to ${workdirMaximumAge}")
  logger.info(s"Maximum number of attempts to process a file: ${packAttempts}")
  logger.info(s"Maximum number of open files in the workDir: ${fileCacheSize}")
  logger.info(s"Number of threads that move files from workDir to destDir: ${numMoveThreads}")
  logger.info(archiveDir match {
    case Some(dir) => s"Archive location for expired working files: ${dir}"
    case None => "Do not archive expired working files"
  })
  logger.info(s"""JVM Parameters: ${ManagementFactory.getRuntimeMXBean.getInputArguments.toArray.mkString(",")}""")

  //  <<<<<  CREATE PACKER AND DIRECTORY POLLER  >>>>>  //

  // open, load, parse, compile, and run the packing logic file
  private[this] val packLogic = {
    val stream = packLogicPath.getFileSystem(conf).open(packLogicPath)
    val loader = RunTimeCodeLoader(stream)
    val result = loader.load()
    result match {
      case pl: PackingLogic => pl
      case pc: PartitionerConfigurator => PartitionerPackLogic(pc.partitioners)
      case _ => throw new Exception(
        s"Unexpected type returned from compiled code: result.getClass")
    }
  }

  /** The object that categorizes the records and writes them. */
  val packer = CorePacker(packLogic, packConf, infoModel,
    hoursPerFile, fileCacheSize, numMoveThreads)

  /** The object that watches for incoming files. */
  val watcher = DirMapping(incomingDir, packer)


  //  <<<<<  START THE THREADS  >>>>>  //

  logger.info("Packer threads are starting::")

  ShutdownHookManager.get().addShutdownHook(new Cleanup(), SHUTDOWN_PRIORITY)

  // the thread pool for the PackFileJobs
  private[this] val packerPool: ThreadPoolExecutor =
    new ThreadPoolExecutor(
      maxPackJobs, maxPackJobs, 0L, TimeUnit.SECONDS,
      new LinkedBlockingQueue[Runnable](),
      new PackerThreadFactory("PackFileThread-"))
  packerPool.prestartAllCoreThreads()

  /**
    * Writes a log message giving the number of tasks.
    */
  private[this] def logPackerTaskCount(): Unit = {
    val active = packerPool.getActiveCount()
    val completed = packerPool.getCompletedTaskCount()
    val total = packerPool.getTaskCount()
    logger.info(s"Packing task count: Completed: ${completed}," +
      s" Active: ${active}, Queued: ${total - completed - active}")
  }

  // PackFileJobs that have been submitted to the packerPool for execution
  private[this] var tasks = Queue.empty[PackFileJob]

  // move any files from a previous run from the workDir to the rootDir
  packer.initializeWorkDir()

  // a task that periodically spawns a `CheckWorkdir` thread to check which
  // files in the workDir need to be moved into the repository
  private[this] val checkWorkdirResult =
    Executors.newScheduledThreadPool(1,
      new PackerThreadFactory("CheckWorkFiles-")).scheduleAtFixedRate(
      new CheckWorkdir(),
        workdirCheckInterval, workdirCheckInterval, TimeUnit.SECONDS)

  /**
    * How often to print log messages regarding the number of tasks and number
    * of files waiting to be moved, in seconds.
    */
  val logTaskCountInterval = 5

  // print packer task count and mover task count every 5 seconds
  private[this] val logTaskCountThread =
    Executors.newScheduledThreadPool(1,
      new PackerThreadFactory("LogTaskCounts-")).scheduleAtFixedRate(
      new Runnable {
        override def run(): Unit = {
          logPackerTaskCount()
          packer.logMoverTaskCount()
        }
      },
        logTaskCountInterval, logTaskCountInterval, TimeUnit.SECONDS)

  // initial scan of incoming directory; submit each PackFileJob to the
  // packerPool
  tasks = tasks ++ (for {
    job <- watcher.jobs
    // Workaround scala/bug#11175 -Ywarn-unused:params false positive
    _ = job
    if running
  } yield job.submitTo(packerPool))

  // Main run loop
  while ( running && ( !oneShot || tasks.nonEmpty ) ) {
    // check the status of the PackFileJobs
    if ( running && tasks.nonEmpty ) {
      // group tasks into (0)running/queued, (1)successful, (2)failed but have
      // not reached `packAttempts`, and (3)failed `packAttempts` times.
      //
      // Note: a group entry is present only if it is non-empty.
      val groups = tasks.groupBy(job => {
        if ( !job.result.isDone ) { 0 }
        else if ( job.result.get ) { 1 }
        else if ( job.runCount < packAttempts ) { 2 }
        else { 3 }
      })
      for ( (k, v) <- groups ) {
        logger.trace(s"groups(${k}) contains ${v.size} tasks")
      }

      // set `tasks` to those that are currently running or are still queued
      tasks = groups.getOrElse(0, Queue.empty[PackFileJob])
      // requeue failed jobs that are allowed more attempts unless the source
      // file no longer exists
      for ( requeue <- groups.get(2) ) {
        tasks = tasks ++ (for {
          job <- requeue
          // Workaround scala/bug#11175 -Ywarn-unused:params false positive
          _ = job
          if watcher.fileExists(job.sourcePath)
          if running
        } yield job.submitTo(packerPool))
      }
      // log about files that failed `packAttempts` times
      for (
        failed <- groups.get(3);
        job <- failed
      ) {
        logger.warn(
          s"Maximum packing attempts reached for '${job.sourcePath}'")
      }
    }

    if ( oneShot ) {
      if ( tasks.nonEmpty && running ) {
        // sleep for 5 seconds
        Thread.sleep( 5000 )
      }
    } else if ( running ) {
      Thread.sleep( pollingInterval * 1000 )
      if ( running ) {
        // rescan the incoming directory
        tasks = tasks ++ (for {
          job <- watcher.jobs
          // Workaround scala/bug#11175 -Ywarn-unused:params false positive
          _ = job
          if running
         } yield job.submitTo(packerPool))
      }
    }

  }

  if ( running ) {
    // we should only get here when oneShot is active.  Close and move the
    // work files then call exit().  Do not rely on the code in Cleanup to
    // close the files, since the ShutdownHookManager may kill the process
    // before all the files are moved.
    logger.info("All tasks have completed and one-shot is active." +
      " Closing and moving work files...")
    checkWorkdirResult.cancel(false)
    packer.closeAllWorkFiles(true)
    logTaskCountThread.cancel(false)
    running = false
    System.exit(0)
  }
}

// @LICENSE_FOOTER@
//
// Copyright 2015-2022 Carnegie Mellon University. All Rights Reserved.
//
// This material is based upon work funded and supported by the
// Department of Defense and Department of Homeland Security under
// Contract No. FA8702-15-D-0002 with Carnegie Mellon University for the
// operation of the Software Engineering Institute, a federally funded
// research and development center sponsored by the United States
// Department of Defense. The U.S. Government has license rights in this
// software pursuant to DFARS 252.227.7014.
//
// NO WARRANTY. THIS CARNEGIE MELLON UNIVERSITY AND SOFTWARE ENGINEERING
// INSTITUTE MATERIAL IS FURNISHED ON AN "AS-IS" BASIS. CARNEGIE MELLON
// UNIVERSITY MAKES NO WARRANTIES OF ANY KIND, EITHER EXPRESSED OR
// IMPLIED, AS TO ANY MATTER INCLUDING, BUT NOT LIMITED TO, WARRANTY OF
// FITNESS FOR PURPOSE OR MERCHANTABILITY, EXCLUSIVITY, OR RESULTS
// OBTAINED FROM USE OF THE MATERIAL. CARNEGIE MELLON UNIVERSITY DOES NOT
// MAKE ANY WARRANTY OF ANY KIND WITH RESPECT TO FREEDOM FROM PATENT,
// TRADEMARK, OR COPYRIGHT INFRINGEMENT.
//
// Released under a GNU GPL 2.0-style license, please see LICENSE.txt or
// contact [email protected] for full terms.
//
// [DISTRIBUTION STATEMENT A] This material has been approved for public
// release and unlimited distribution. Please see Copyright notice for
// non-US Government use and distribution.
//
// Carnegie Mellon(R) and CERT(R) are registered in the U.S. Patent and
// Trademark Office by Carnegie Mellon University.
//
// This software includes and/or makes use of third party software each
// subject to its own license as detailed in LICENSE-thirdparty.txt
//
// DM20-1143