
// Copyright 2015-2022 by Carnegie Mellon University
// See license information in LICENSE.txt

package org.cert.netsa.mothra.tools

import org.cert.netsa.io.ipfix.InfoModel
import org.cert.netsa.mothra.packer.{
  PackerThreadFactory, Reader, Version, Writer}

import com.typesafe.scalalogging.StrictLogging
import java.io.{PrintWriter, StringWriter}
import java.lang.management.ManagementFactory
import java.util.concurrent.{Executors, LinkedBlockingQueue,
  ThreadPoolExecutor, TimeUnit}
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{LocatedFileStatus, RemoteIterator}
import org.apache.hadoop.fs.{Path => HPath}
import org.apache.hadoop.io.compress.CompressionCodecFactory
import resource.managed // see http://jsuereth.com/scala-arm/index.html
import scala.collection.mutable.{Map => MutMap, Set}
import scala.util.control.NonFatal
import scala.util.matching.Regex
import scala.util.{Failure, Success, Try}

/**
  * Object to implement the RollupDay application.
  *
  * Typical Usage in a Spark environment:
  *
  * `spark-submit --class org.cert.netsa.mothra.tools.RollupDayMain mothra-tools.jar <s1> [<s2> ...]`
  *
  * where:
  *
  * s1..sn:         Directories to process, as Hadoop URIs
  *
  *
  * RollupDay reduces the number of data files in a Mothra repository.  It
  * may also be used to modify the files' compression.
  *
  * RollupDay runs as a batch process, not as a daemon.
  *
  * RollupDay makes a single recursive scan of the source directories <s1>,
  * <s2>, ... for files whose names match the pattern "YYYYMMDD.HH." or
  * "YYYYMMDD.HH-PTddH." (It looks for files matching the regular expression
  * `^\d{8}\.\d{2}(?:-PT\d\d?H)?\.`) Files whose names match that pattern and
  * reside in the same directory are processed by RollupDay to create a single
  * new file (see next paragraph) in the same directory containing the records
  * in all files in that directory.
  *
  * RollupDay joins the files in a directory into a single file by default.
  * The `mothra.rollupday.maximumSize` Java property may be used to limit the
  * maximum file size.  The size is for the compressed file if compression is
  * active.  The value is approximate since it is only checked after the data
  * appears on disk, which occurs in large blocks because of buffering by the
  * Java stream code and the compression algorithm.
  *
  * There is always a single thread that recursively scans the directories.
  * The number of threads that join the files may be set by specifying the
  * `mothra.rollupday.maxThreads` Java property.  If not specified, the
  * default is 6.
  *
  * By default, RollupDay does not compress the files it writes.
  * (NOTE: It should support writing the output using the same compression as
  * the input.)  To specify the compression codec that it should use, specify
  * the `mothra.rollupday.compression` Java property.  Values typically
  * supported by Hadoop include `bzip2`, `gzip`, `lz4`, `lzo`, `lzop`,
  * `snappy`, and `default`.  The empty string indicates no compression.
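  *
  * For example, one way to pass these properties to the JVM that runs
  * RollupDay is via `spark-submit`'s `--driver-java-options` flag.  The
  * property values and the repository path below are illustrative only:
  *
  * {{{
  * spark-submit \
  *   --class org.cert.netsa.mothra.tools.RollupDayMain \
  *   --driver-java-options \
  *   "-Dmothra.rollupday.maxThreads=12 -Dmothra.rollupday.compression=snappy" \
  *   mothra-tools.jar hdfs:///mothra/repo
  * }}}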
  *
  */
object RollupDayMain extends App with StrictLogging {

  def usage(full: Boolean = false): Unit = {
    print("""
Usage: spark-submit --class org.cert.netsa.mothra.tools.RollupDayMain mothra-tools.jar <s1> [<s2> ...]

where:

s1..sn:         Directories to process, as Hadoop URIs
""")
    if ( full ) {
      print(s"""
RollupDay reduces the number of data files in a Mothra repository.  It
may also be used to modify the files' compression.

RollupDay runs as a batch process, not as a daemon.

RollupDay makes a single recursive scan of the source directories <s1>, <s2>,
... for files whose names match the pattern "YYYYMMDD.HH." or
"YYYYMMDD.HH-PTddH." (It looks for files matching the regular expression
`^\\d{8}\\.\\d{2}(?:-PT\\d\\d?H)?\\.`) Files whose names match that pattern and
reside in the same directory are processed by RollupDay to create a single
new file (see next paragraph) in the same directory containing the records
in all files in that directory.

RollupDay joins the files in a directory into a single file by default.
The `mothra.rollupday.maximumSize` Java property may be used to limit the
maximum file size.  The size is for the compressed file if compression is
active.  The value is approximate since it is only checked after the data
appears on disk, which occurs in large blocks because of buffering by the
Java stream code and the compression algorithm.

There is always a single thread that recursively scans the directories.
The number of threads that join the files may be set by specifying the
`mothra.rollupday.maxThreads` Java property.  If not specified, the
default is ${DEFAULT_MAX_THREADS}.

By default, RollupDay does not compress the files it writes.
(NOTE: It should support writing the output using the same compression as
the input.)  To specify the compression codec that it should use, specify
the `mothra.rollupday.compression` Java property.  Values typically
supported by Hadoop include `bzip2`, `gzip`, `lz4`, `lzo`, `lzop`,
`snappy`, and `default`.  The empty string indicates no compression.
""")
    }
    System.exit(if (full) { 0 } else { 1 })
  }

  def version(): Unit = {
    println("RollupDay " + Version.get())
    System.exit(0)
  }

  //  <<<<<  DEFAULT VALUES FOR PROPERTY SETTINGS  >>>>>  //

  /**
    * The default compression codec to use for files written to HDFS.  This
    * may be modified by specifying the following property:
    * mothra.rollupday.compression.
    *
    * Values typically supported by Hadoop include `bzip2`, `gzip`, `lz4`,
    * `lzo`, `lzop`, `snappy`, and `default`.  The empty string indicates no
    * compression.
    */
  val DEFAULT_COMPRESSION = ""

  /**
    * The default number of threads to run for joining files when the
    * `mothra.rollupday.maxThreads` Java property is not set. (The scanning
    * task always runs in its own thread.)
    */
  val DEFAULT_MAX_THREADS = 6

  //  <<<<<  PROCESS THE COMMAND LINE ARGUMENTS  >>>>>  //

  val (switches, positionalArgs) = args.partition { _.startsWith("-") }

  switches.collect {
    case "-V" | "--version" => version()
    case "-h" | "--help" => usage(true)
    case unknown: String =>
      println(s"Unknown argument '${unknown}'")
      usage()
  }

  logger.info("\n=============================" +
    " RollupDay is starting =============================\n")
  logger.info(s"This is RollupDay ${Version.get()}")

  if ( positionalArgs.length == 0 ) {
    logger.error(s"Called with no args; at least 1 required")
    usage()
  }

  /**
    * Joins the files specified in `files` that all exist in `dir` and
    * all of whose names begin with the same `basename` which is of the form
    * "YYYYMMDD."
    */
  private[this] def joinFilesBasename(
    dir: HPath, basename: String, files: Set[HPath]): Unit =
  {
    // the output file
    val dayfile = s"${basename}00-PT24H."

    // file currently being written to
    var writer: Writer = null

    // list of newly created files
    var newPaths = List.empty[HPath]

    // list of files that were successfully processed
    var removeList = List.empty[HPath]

    logger.debug(s"Joining ${files.size} files into '${dayfile}*' in ${dir}/")
    val t0 = System.currentTimeMillis()

    Try {
      writer = Writer(dir, dayfile, compressCodec, maximumSize)
      val originalPerm = writer.originalPermission
      if ( maximumSize.isEmpty ) {
        // process all input files
        for ( f <- files ) {
          // process all records in the input
          for ( reader <- managed(Reader(f, codecFactory)) ) {
            for ( record <- reader ) {
              writer.add(record)
            }
            removeList = f +: removeList
          }
        }
      } else {
        // process all input files
        for ( f <- files ) {
          // process all records in the input
          for ( reader <- managed(Reader(f, codecFactory)) ) {
            for ( record <- reader ) {
              if ( writer.reachedMaxSize ) {
                logger.trace(s"Closing file '${writer.getName}'")
                writer.close()
                newPaths = writer.exportFile +: newPaths
                writer = null
                logger.trace(
                  s"Creating additional writer for '${dayfile}' in ${dir}")
                writer = Writer(dir, dayfile, compressCodec, maximumSize)
              }
              writer.add(record)
            }
            removeList = f +: removeList
          }
        }
      }
      writer.close()
      newPaths = writer.exportFile +: newPaths
      writer = null
      // restore the original permission bits on the new files
      for ( perm <- originalPerm ; f <- newPaths ) {
        fileSystem.setPermission(f, perm)
      }
      //logger.trace(s"Removing old '${basename}*' files from ${dir}/")
      for ( f <- removeList ) {
        Try {fileSystem.delete(f, false)} match {
          case Failure(e) =>
            logger.warn(
              s"Failed to remove old file '${f}': ${e.toString}")
          case _ =>
        }
      }
    } match {
      case Success(_) =>
        logger.debug("Finished joining" +
          s" ${files.size} files into ${newPaths.size} '${dayfile}' files" +
          s" in ${dir}/ in " +
          f"${(System.currentTimeMillis() - t0).toDouble/1000.0}%.3f" +
          " seconds")
      case Failure(ex) =>
        logger.error(
          s"Failed to join ${files.size} files into '$dayfile' in $dir/", ex)
        for ( w <- Option(writer) ) { newPaths = w.exportFile +: newPaths }
        for ( f <- newPaths ) {
          Try { fileSystem.delete(f, false) } match {
            case Success(_) =>
            case Failure(ex) =>
              logger.error(
                s"Failed to remove new file '${f.getName}' in $dir/", ex)
          }
        }
    }
  }


  //  <<<<<  DEFINE SOME HELPER CLASSES  >>>>>  //

  /**
    * A Runnable that splits the files in `files` by unique basename and, for
    * each basename, joins those files into a new file.  It assumes all
    * entries in `files` live in `dir`.
    */
  private[this] case class DirectoryJob(
    dir: HPath, files: MutMap[String, Set[HPath]])
      extends Runnable
  {
    def run(): Unit = {
      if ( files.size > 1 ) {
        val sb = new StringBuilder()
        files.keys.addString(sb, ", ")
        logger.debug(
          s"Files covering multiple days (${sb.mkString}) found in ${dir}")
      }

      // Join the files for each unique basename in this directory; these
      // calls run sequentially within this task's thread
      for ( (basename, set) <- files ) {
        joinFilesBasename(dir, basename, set)
      }
      signalQueue.add(0)
      ()
    }
  }


  // /////  Constants & Values Determined from Properties  /////


  /** The information model */
  implicit val infoModel = InfoModel.getCERTStandardInfoModel()

  /** The Hadoop configuration */
  implicit val hadoopConf = new Configuration()

  /** A Compression Codec Factory */
  private[this] val codecFactory = new CompressionCodecFactory(hadoopConf)

  /**
    * The compression codec used for files written to HDFS.  This may be set
    * by setting the "mothra.rollupday.compression" property.  If that
    * property is not set, DEFAULT_COMPRESSION is used.
    */
  val compressCodec = {
    val compressName = sys.props.get("mothra.rollupday.compression").
      getOrElse(DEFAULT_COMPRESSION)
    if ( compressName == "" ) {
      //logger.info("Using no compression for IPFIX files")
      None
    } else {
      Try {
        //logger.trace(s"have a name ${compressName}")
        val codec = codecFactory.getCodecByName(compressName)
        //logger.trace(s"have a codec ${codec}")
        // Make sure we can create a compressor, not using it here.
        codec.createCompressor()
        //logger.trace(s"have a compressor ${compressor}")
        codec
      } match {
        case Success(ok) =>
          //logger.info(s"Using ${compressName} compressor for IPFIX files")
          Option(ok)
        case Failure(e) =>
          logger.error("Unable to initialize compressor" +
            s" '${compressName}': ${e.toString}")
          val sw = new StringWriter
          e.printStackTrace(new PrintWriter(sw))
          logger.info("Unable to initialize compressor" +
            s" '${compressName}': ${sw.toString}")
          logger.warn("Using no compression for IPFIX files")
          None
      }
    }
  }
  //logger.trace(s"compressCodec is ${compressCodec}")

  /**
    * The maximum number of file-joining threads to start.  It defaults to the
    * value `DEFAULT_MAX_THREADS`.
    *
    * This run-time behavior may be modified by setting the
    * mothra.rollupday.maxThreads property.
    */
  val maxThreads = (sys.props.get("mothra.rollupday.maxThreads").
    map { _.toInt }).getOrElse(DEFAULT_MAX_THREADS)
  require(maxThreads >= 1)

  /**
    * The (approximate) maximum size file to create.  The default is no
    * maximum.  When a file's size exceeds this value, the file is closed and
    * a new file is started.  Typically a file's size will not exceed this
    * value by more than the maximum size of an IPFIX message, 64k.
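    *
    * For example, `-Dmothra.rollupday.maximumSize=1073741824` (an
    * illustrative value, assuming the size is given in bytes) would cap each
    * output file at roughly 1 GiB.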
    */
  val maximumSize = (sys.props.get("mothra.rollupday.maximumSize").
    map { _.toLong })


  // /////  RollupDay procedural code begins here  /////

  // the positional arguments are the directories to scan
  private[this] var dirList = positionalArgs.toList.map { new HPath(_) }

  // ensure all source directories use the same file system
  val fileSystem = dirList.head.getFileSystem(hadoopConf)
  if ( dirList.drop(1).exists{_.getFileSystem(hadoopConf) != fileSystem} )
  {
    logger.error("source directories use different file systems")
    throw new Exception("source directories use different file systems")
  }

  // log our settings
  logger.info("RollupDay settings::")
  logger.info(s"Number of top-level directories to scan: ${dirList.size}")
  logger.info(s"Maximum number of file joining threads: ${maxThreads}")
  logger.info("Approximate maximum output file size: " +
    maximumSize.map{ _.toString }.getOrElse("unlimited"))
  logger.info(s"Output file compression: ${compressCodec.getOrElse("none")}")
  logger.info(s"""JVM Parameters: ${ManagementFactory.getRuntimeMXBean.getInputArguments.toArray.mkString(",")}""")


  /** Object used by sub-threads to signal to the main thread that they have
    * completed. */
  private[this] val signalQueue = new LinkedBlockingQueue[Int]()

  private[this] val pool: ThreadPoolExecutor =
    new ThreadPoolExecutor(
      maxThreads, maxThreads, 0L, TimeUnit.SECONDS,
      new LinkedBlockingQueue[Runnable](),
      new PackerThreadFactory("RollupDayThread-"))
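  // With corePoolSize == maximumPoolSize == maxThreads this is a fixed-size
  // pool; the unbounded LinkedBlockingQueue holds any additional
  // DirectoryJob tasks until a worker thread is free.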

  /**
    * How often to print log messages regarding the number of tasks, in
    * seconds.
    */
  val logTaskCountInterval = 5

  // print task count every 5 seconds
  private[this] val logTaskCountThread = Executors.newScheduledThreadPool(1,
    new PackerThreadFactory("LogTaskCounts-"))
  logTaskCountThread.scheduleAtFixedRate(
    new Thread() {
      override def run(): Unit = {
        val active = pool.getActiveCount()
        val completed = pool.getCompletedTaskCount()
        val total = pool.getTaskCount()
        logger.info(s"Directories to scan: ${dirList.size}," +
          s" Total tasks: ${total}," +
          s" Completed tasks: ${completed}," +
          s" Active tasks: ${active}," +
          s" Queued tasks: ${total - active - completed}")
      }
    },
    logTaskCountInterval, logTaskCountInterval, TimeUnit.SECONDS)

  /** Regular expression that matches expected repo file names */
  private[this] val repoFileRegex =
    new Regex("""\A(\d{8}\.)\d{2}(?:-PT\d\d?H)?\..*\Z""")
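  // Example (hypothetical names): "20220301.08.med0" and
  // "20220301.00-PT24H.med0" both match, with group(1) capturing the
  // "20220301." day prefix used to group files for joining.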

  logger.info(s"Starting recursive scan of ${dirList.size} director" +
    (if ( 1 == dirList.size ) { "y" } else { "ies" }))

  // Recursively process all directories
  while ( dirList.nonEmpty ) {
    val dir = dirList.head
    dirList = dirList.tail

    logger.trace(s"Scanning directory '${dir}/'")
    val fileMap: MutMap[String, Set[HPath]] = MutMap.empty
    val iter = try {
      fileSystem.listLocatedStatus(dir)
    } catch {
      case NonFatal(e) =>
        // return an empty iterator
        logger.warn(s"Unable to get status of '${dir}/': ${e.getMessage}")
        new RemoteIterator[LocatedFileStatus](){
          def hasNext: Boolean = false
          def next(): LocatedFileStatus = throw new NoSuchElementException()
        }
    }
    while (
      Try {
        if ( !iter.hasNext ) {
          // finished with this directory
          false
        } else {
          // found an entry
          val entry = iter.next()
          if ( entry.isDirectory ) {
            dirList = entry.getPath +: dirList
          } else if ( entry.isFile ) {
            // it's a file; check whether it matches the regex
            for ( m <- repoFileRegex.findFirstMatchIn(entry.getPath.getName) ) {
              fileMap.getOrElseUpdate(m.group(1), Set.empty) += entry.getPath
            }
          }
          true
        }
      } match {
        case Success(ok) => ok
        case Failure(e) =>
          // ignore errors stat-ing files
          logger.debug(s"Unable to read directory entry: ${e.toString}")
          true
      }
    ) { /*empty-body*/ }

    // Queue a pool task to process the matching files in this directory
    pool.execute(DirectoryJob(dir, fileMap))
  }

  // Finished scanning directories.  Wait for joining threads to finish.
  logger.info("Completed recursive directory scan")
  logger.info(
    s"Waiting for ${pool.getTaskCount() - pool.getCompletedTaskCount()}" +
      s" of ${pool.getTaskCount()} tasks to complete...")

  // all tasks are queued; shutdown the thread pool and allow the
  // running/queued tasks to complete
  pool.shutdown()

  // clear the signalQueue of previously completed tasks then wait for the
  // thread pool to terminate
  signalQueue.clear()
  while ( !pool.isTerminated() ) {
    signalQueue.poll(5, TimeUnit.SECONDS)
    //signalQueue.clear()
  }
  logger.debug("All tasks have completed")
  logTaskCountThread.shutdown()
  logTaskCountThread.awaitTermination(1, TimeUnit.SECONDS)

  logger.info("RollupDay is done")

}

// @LICENSE_FOOTER@
//
// Copyright 2015-2022 Carnegie Mellon University. All Rights Reserved.
//
// This material is based upon work funded and supported by the
// Department of Defense and Department of Homeland Security under
// Contract No. FA8702-15-D-0002 with Carnegie Mellon University for the
// operation of the Software Engineering Institute, a federally funded
// research and development center sponsored by the United States
// Department of Defense. The U.S. Government has license rights in this
// software pursuant to DFARS 252.227.7014.
//
// NO WARRANTY. THIS CARNEGIE MELLON UNIVERSITY AND SOFTWARE ENGINEERING
// INSTITUTE MATERIAL IS FURNISHED ON AN "AS-IS" BASIS. CARNEGIE MELLON
// UNIVERSITY MAKES NO WARRANTIES OF ANY KIND, EITHER EXPRESSED OR
// IMPLIED, AS TO ANY MATTER INCLUDING, BUT NOT LIMITED TO, WARRANTY OF
// FITNESS FOR PURPOSE OR MERCHANTABILITY, EXCLUSIVITY, OR RESULTS
// OBTAINED FROM USE OF THE MATERIAL. CARNEGIE MELLON UNIVERSITY DOES NOT
// MAKE ANY WARRANTY OF ANY KIND WITH RESPECT TO FREEDOM FROM PATENT,
// TRADEMARK, OR COPYRIGHT INFRINGEMENT.
//
// Released under a GNU GPL 2.0-style license, please see LICENSE.txt or
// contact permission@sei.cmu.edu for full terms.
//
// [DISTRIBUTION STATEMENT A] This material has been approved for public
// release and unlimited distribution. Please see Copyright notice for
// non-US Government use and distribution.
//
// Carnegie Mellon(R) and CERT(R) are registered in the U.S. Patent and
// Trademark Office by Carnegie Mellon University.
//
// This software includes and/or makes use of third party software each
// subject to its own license as detailed in LICENSE-thirdparty.txt
//
// DM20-1143
