// Copyright 2015-2022 by Carnegie Mellon University
// See license information in LICENSE.txt
package org.cert.netsa.mothra.tools
import org.cert.netsa.io.ipfix.InfoModel
import org.cert.netsa.mothra.packer.{
PackerThreadFactory, Reader, Version, Writer}
import com.typesafe.scalalogging.StrictLogging
import java.io.{PrintWriter, StringWriter}
import java.lang.management.ManagementFactory
import java.util.concurrent.{Executors, LinkedBlockingQueue,
ThreadPoolExecutor, TimeUnit}
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{LocatedFileStatus, RemoteIterator}
import org.apache.hadoop.fs.{Path => HPath}
import org.apache.hadoop.io.compress.CompressionCodecFactory
import resource.managed // see http://jsuereth.com/scala-arm/index.html
import scala.collection.mutable.{Map => MutMap, Set}
import scala.util.control.NonFatal
import scala.util.matching.Regex
import scala.util.{Failure, Success, Try}
/**
* Object to implement the RollupDay application.
*
* Typical Usage in a Spark environment:
*
* `spark-submit --class org.cert.netsa.mothra.tools.RollupDayMain mothra-tools.jar s1 [s2 ...]`
*
* where:
*
* s1..sn: Directories to process, as Hadoop URIs
*
*
* RollupDay reduces the number of data files in a Mothra repository. It
* may also be used to modify the files' compression.
*
* RollupDay runs as a batch process, not as a daemon.
*
* RollupDay makes a single recursive scan of the source directories s1,
* s2, ... for files whose names match the pattern "YYYYMMDD.HH." or
* "YYYYMMDD.HH-PTddH." (It looks for files matching the regular expression
* `^\d{8}\.\d{2}(?:-PT\d\d?H)?\.`) Files whose names match that pattern and
* reside in the same directory are processed by RollupDay to create a single
* new file (see next paragraph) in the same directory containing the records
* in all files in that directory.
*
* RollupDay joins the files in a directory into a single file by default.
* The `mothra.rollupday.maximumSize` Java property may be used to limit the
* maximum file size. The size is for the compressed file if compression is
* active. The value is approximate since it is only checked after the data
* appears on disk, which occurs in large blocks because of buffering by the
* Java stream code and the compression algorithm.
*
* There is always a single thread that recursively scans the directories.
* The number of threads that join the files may be set by specifying the
* `mothra.rollupday.maxThreads` Java property. If not specified, the
* default is 6.
*
* By default, RollupDay does not compress the files it writes.
* (NOTE: It should support writing the output using the same compression as
* the input.) To specify the compression codec that it should use, specify
* the `mothra.rollupday.compression` Java property. Values typically
* supported by Hadoop include `bzip2`, `gzip`, `lz4`, `lzo`, `lzop`,
* `snappy`, and `default`. The empty string indicates no compression.
*
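* For example, these properties may be supplied as driver JVM options on
* the `spark-submit` command line. The sketch below is illustrative only:
* the repository URI and property values are placeholders, and exactly how
* driver options are passed depends on the Spark deployment.
*
* {{{
* spark-submit --class org.cert.netsa.mothra.tools.RollupDayMain \
*   --driver-java-options "-Dmothra.rollupday.maxThreads=8 -Dmothra.rollupday.compression=gzip" \
*   mothra-tools.jar hdfs:///data/mothra/repo
* }}}
*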
*/
object RollupDayMain extends App with StrictLogging {
def usage(full: Boolean = false): Unit = {
print("""
Usage: spark-submit --class org.cert.netsa.mothra.tools.RollupDayMain mothra-tools.jar s1 [s2 ...]
where:
s1..sn: Directories to process, as Hadoop URIs
""")
if ( full ) {
print(s"""
RollupDay reduces the number of data files in a Mothra repository. It
may also be used to modify the files' compression.
RollupDay runs as a batch process, not as a daemon.
RollupDay makes a single recursive scan of the source directories s1, s2,
... for files whose names match the pattern "YYYYMMDD.HH." or
"YYYYMMDD.HH-PTddH." (It looks for files matching the regular expression
`^\\d{8}\\.\\d{2}(?:-PT\\d\\d?H)?\\.`) Files whose names match that pattern and
reside in the same directory are processed by RollupDay to create a single
new file (see next paragraph) in the same directory containing the records
in all files in that directory.
RollupDay joins the files in a directory into a single file by default.
The `mothra.rollupday.maximumSize` Java property may be used to limit the
maximum file size. The size is for the compressed file if compression is
active. The value is approximate since it is only checked after the data
appears on disk, which occurs in large blocks because of buffering by the
Java stream code and the compression algorithm.
There is always a single thread that recursively scans the directories.
The number of threads that join the files may be set by specifying the
`mothra.rollupday.maxThreads` Java property. If not specified, the
default is ${DEFAULT_MAX_THREADS}.
By default, RollupDay does not compress the files it writes.
(NOTE: It should support writing the output using the same compression as
the input.) To specify the compression codec that it should use, specify
the `mothra.rollupday.compression` Java property. Values typically
supported by Hadoop include `bzip2`, `gzip`, `lz4`, `lzo`, `lzop`,
`snappy`, and `default`. The empty string indicates no compression.
""")
}
System.exit(if (full) { 0 } else { 1 })
}
def version(): Unit = {
println("RollupDay " + Version.get())
System.exit(0)
}
// <<<<< DEFAULT VALUES FOR PROPERTY SETTINGS >>>>> //
/**
* The default compression codec to use for files written to HDFS. This
* may be modified by specifying the following property:
* mothra.rollupday.compression.
*
* Values typically supported by Hadoop include `bzip2`, `gzip`, `lz4`,
* `lzo`, `lzop`, `snappy`, and `default`. The empty string indicates no
* compression.
*/
val DEFAULT_COMPRESSION = ""
/**
* The default number of threads to run for joining files when the
* `mothra.rollupday.maxThreads` Java property is not set. (The scanning
* task always runs in its own thread.)
*/
val DEFAULT_MAX_THREADS = 6
// <<<<< PROCESS THE COMMAND LINE ARGUMENTS >>>>> //
val (switches, positionalArgs) = args.partition { _.startsWith("-") }
switches.collect {
case "-V" | "--version" => version()
case "-h" | "--help" => usage(true)
case unknown: String =>
println(s"Unknown argument '${unknown}'")
usage()
}
logger.info("\n=============================" +
" RollupDay is starting =============================\n")
logger.info(s"This is RollupDay ${Version.get()}")
if ( positionalArgs.length == 0 ) {
logger.error(s"Called with no args; at least 1 required")
usage()
}
/**
* Joins the files specified in `files` that all exist in `dir` and
* all of whose names begin with the same `basename`, which is of the form
* "YYYYMMDD."
*/
private[this] def joinFilesBasename(
dir: HPath, basename: String, files: Set[HPath]): Unit =
{
// the output file
val dayfile = s"${basename}00-PT24H."
// file currently being written to
var writer: Writer = null
// list of newly created files
var newPaths = List.empty[HPath]
// list of files that were successfully processed
var removeList = List.empty[HPath]
logger.debug(s"Joining ${files.size} files into '${dayfile}*' in ${dir}/")
val t0 = System.currentTimeMillis()
Try {
writer = Writer(dir, dayfile, compressCodec, maximumSize)
val originalPerm = writer.originalPermission
if ( maximumSize.isEmpty ) {
// process all input files
for ( f <- files ) {
// process all records in the input
for ( reader <- managed(Reader(f, codecFactory)) ) {
for ( record <- reader ) {
writer.add(record)
}
removeList = f +: removeList
}
}
} else {
// process all input files
for ( f <- files ) {
// process all records in the input
for ( reader <- managed(Reader(f, codecFactory)) ) {
for ( record <- reader ) {
if ( writer.reachedMaxSize ) {
logger.trace(s"Closing file '${writer.getName}'")
writer.close()
newPaths = writer.exportFile +: newPaths
writer = null
logger.trace(
s"Creating additional writer for '${dayfile}' in ${dir}")
writer = Writer(dir, dayfile, compressCodec, maximumSize)
}
writer.add(record)
}
removeList = f +: removeList
}
}
}
writer.close()
newPaths = writer.exportFile +: newPaths
writer = null
// restore the original permission bits on the new files
for ( perm <- originalPerm ; f <- newPaths ) {
fileSystem.setPermission(f, perm)
}
//logger.trace(s"Removing old '${basename}*' files from ${dir}/")
for ( f <- removeList ) {
Try {fileSystem.delete(f, false)} match {
case Failure(e) =>
logger.warn(
s"Failed to remove old file '${f}': ${e.toString}")
case _ =>
}
}
} match {
case Success(_) =>
logger.debug("Finished joining" +
s" ${files.size} files into ${newPaths.size} '${dayfile}' files" +
s" in ${dir}/ in " +
f"${(System.currentTimeMillis() - t0).toDouble/1000.0}%.3f" +
" seconds")
case Failure(ex) =>
logger.error(
s"Failed to join ${files.size} files into '$dayfile' in $dir/", ex)
for ( w <- Option(writer) ) { newPaths = w.exportFile +: newPaths }
for ( f <- newPaths ) {
Try { fileSystem.delete(f, false) } match {
case Success(_) =>
case Failure(ex) =>
logger.error(
s"Failed to remove new file '${f.getName}' in $dir/", ex)
}
}
}
}
// <<<<< DEFINE SOME HELPER CLASSES >>>>> //
/**
* A Runnable that splits the files in `files` by unique basename and, for
* each basename, joins those files into a new file. It assumes all
* entries in `files` live in `dir`.
*/
private[this] case class DirectoryJob(
dir: HPath, files: MutMap[String, Set[HPath]])
extends Runnable
{
def run(): Unit = {
if ( files.size > 1 ) {
val sb = new StringBuilder()
files.keys.addString(sb, ", ")
logger.debug(
s"Files covering multiple days (${sb.mkString}) found in ${dir}")
}
// Join the files for each unique basename found in this directory
for ( (basename, set) <- files ) {
joinFilesBasename(dir, basename, set)
}
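// signal the main thread that this directory's job has completed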
signalQueue.add(0)
()
}
}
// ///// Constants & Values Determined from Properties /////
/** The information model */
implicit val infoModel = InfoModel.getCERTStandardInfoModel()
/** The Hadoop configuration */
implicit val hadoopConf = new Configuration()
/** A Compression Codec Factory */
private[this] val codecFactory = new CompressionCodecFactory(hadoopConf)
/**
* The compression codec used for files written to HDFS. This may be set
* by setting the "mothra.rollupday.compression" property. If that
* property is not set, DEFAULT_COMPRESSION is used.
*/
val compressCodec = {
val compressName = sys.props.get("mothra.rollupday.compression").
getOrElse(DEFAULT_COMPRESSION)
if ( compressName == "" ) {
//logger.info("Using no compression for IPFIX files")
None
} else {
Try {
//logger.trace(s"have a name ${compressName}")
val codec = codecFactory.getCodecByName(compressName)
//logger.trace(s"have a codec ${codec}")
// Make sure we can create a compressor, not using it here.
codec.createCompressor()
//logger.trace(s"have a compressor ${compressor}")
codec
} match {
case Success(ok) =>
//logger.info(s"Using ${compressName} compressor for IPFIX files")
Option(ok)
case Failure(e) =>
logger.error("Unable to initialize compressor" +
s" '${compressName}': ${e.toString}")
val sw = new StringWriter
e.printStackTrace(new PrintWriter(sw))
logger.info("Unable to initialize compressor" +
s" '${compressName}': ${sw.toString}")
logger.warn("Using no compression for IPFIX files")
None
}
}
}
//logger.trace(s"compressCodec is ${compressCodec}")
/**
* The maximum number of filejoiner threads to start. It defaults to the
* value `DEFAULT_MAX_THREADS`.
*
* This run-time behavior may be modified by setting the
* mothra.rollupday.maxThreads property.
*/
val maxThreads = (sys.props.get("mothra.rollupday.maxThreads").
map { _.toInt }).getOrElse(DEFAULT_MAX_THREADS)
require(maxThreads >= 1)
/**
* The (approximate) maximum size file to create. The default is no
* maximum. When a file's size exceeds this value, the file is closed and
* a new file is started. Typically a file's size will not exceed this
* value by more than the maximum size of an IPFIX message, 64k.
*/
val maximumSize = (sys.props.get("mothra.rollupday.maximumSize").
map { _.toLong })
// ///// RollupDay procedural code begins here /////
// the arguments are the directories to scan
private[this] var dirList = positionalArgs.toList.map { new HPath(_) }
// ensure all source directories use the same file system
val fileSystem = dirList.head.getFileSystem(hadoopConf)
if ( dirList.drop(1).exists{_.getFileSystem(hadoopConf) != fileSystem} )
{
logger.error("source directories use different file systems")
throw new Exception("source directories use different file systems")
}
// log our settings
logger.info("RollupDay settings::")
logger.info(s"Number of top-level directories to scan: ${dirList.size}")
logger.info(s"Maximum number of file joining threads: ${maxThreads}")
logger.info("Approximate maximum output file size: " +
maximumSize.map{ _.toString }.getOrElse("unlimited"))
logger.info(s"Output file compression: ${compressCodec.getOrElse("none")}")
logger.info(s"""JVM Parameters: ${ManagementFactory.getRuntimeMXBean.getInputArguments.toArray.mkString(",")}""")
/** Object used by sub-threads to signal to the main thread that they have
* completed. */
private[this] val signalQueue = new LinkedBlockingQueue[Int]()
private[this] val pool: ThreadPoolExecutor =
new ThreadPoolExecutor(
maxThreads, maxThreads, 0L, TimeUnit.SECONDS,
new LinkedBlockingQueue[Runnable](),
new PackerThreadFactory("RollupDayThread-"))
/**
* How often to print log messages regarding the number of tasks, in
* seconds.
*/
val logTaskCountInterval = 5
// print task count every 5 seconds
private[this] val logTaskCountThread = Executors.newScheduledThreadPool(1,
new PackerThreadFactory("LogTaskCounts-"))
logTaskCountThread.scheduleAtFixedRate(
new Runnable() {
override def run(): Unit = {
val active = pool.getActiveCount()
val completed = pool.getCompletedTaskCount()
val total = pool.getTaskCount()
logger.info(s"Directories to scan: ${dirList.size}," +
s" Total tasks: ${total}," +
s" Completed tasks: ${completed}," +
s" Active tasks: ${active}," +
s" Queued tasks: ${total - active - completed}")
}
},
logTaskCountInterval, logTaskCountInterval, TimeUnit.SECONDS)
/** Regular expression that matches expected repo file names */
private[this] val repoFileRegex =
new Regex("""\A(\d{8}\.)\d{2}(?:-PT\d\d?H)?\..*\Z""")
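// Illustrative (hypothetical) matches: "20140901.13.<suffix>" and
// "20140901.00-PT24H.<suffix>". Group 1 captures the "YYYYMMDD." prefix,
// which becomes the basename passed to joinFilesBasename.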
logger.info(s"Starting recursive scan of ${dirList.size} director" +
(if ( 1 == dirList.size ) { "y" } else { "ies" }))
// Recursively process all directories
while ( dirList.nonEmpty ) {
val dir = dirList.head
dirList = dirList.tail
logger.trace(s"Scanning directory '${dir}/'")
val fileMap: MutMap[String, Set[HPath]] = MutMap.empty
val iter = try {
fileSystem.listLocatedStatus(dir)
} catch {
case NonFatal(e) =>
// return an empty iterator
logger.warn(s"Unable to get status of '${dir}/': ${e.getMessage}")
new RemoteIterator[LocatedFileStatus](){
def hasNext: Boolean = false
def next(): LocatedFileStatus = throw new NoSuchElementException()
}
}
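// Iterate over the directory listing; each step is wrapped in Try so that
// a failure to read one entry does not abort the scan of this directory.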
while (
Try {
if ( !iter.hasNext ) {
// finished with this directory
false
} else {
// found an entry
val entry = iter.next()
if ( entry.isDirectory ) {
dirList = entry.getPath +: dirList
} else if ( entry.isFile ) {
// it's a file; check whether its name matches the regex
for ( m <- repoFileRegex.findFirstMatchIn(entry.getPath.getName) ) {
fileMap.getOrElseUpdate(m.group(1), Set.empty) += entry.getPath
}
}
true
}
} match {
case Success(ok) => ok
case Failure(e) =>
// ignore errors stat-ing files
logger.debug(s"Unable to read directory entry: ${e.toString}")
true
}
) { /*empty-body*/ }
// Submit a job to the pool to process this directory's matching files
pool.execute(DirectoryJob(dir, fileMap))
}
// Finished scanning directories. Wait for joining threads to finish.
logger.info("Completed recursive directory scan")
logger.info(
s"Waiting for ${pool.getTaskCount() - pool.getCompletedTaskCount()}" +
s" of ${pool.getTaskCount()} tasks to complete...")
// all tasks are queued; shutdown the thread pool and allow the
// running/queued tasks to complete
pool.shutdown()
// clear the signalQueue of previously completed tasks then wait for the
// thread pool to terminate
signalQueue.clear()
while ( !pool.isTerminated() ) {
signalQueue.poll(5, TimeUnit.SECONDS)
//signalQueue.clear()
}
logger.debug("All tasks have completed")
logTaskCountThread.shutdown()
logTaskCountThread.awaitTermination(1, TimeUnit.SECONDS)
logger.info("RollupDay is done")
}
// @LICENSE_FOOTER@
//
// Copyright 2015-2022 Carnegie Mellon University. All Rights Reserved.
//
// This material is based upon work funded and supported by the
// Department of Defense and Department of Homeland Security under
// Contract No. FA8702-15-D-0002 with Carnegie Mellon University for the
// operation of the Software Engineering Institute, a federally funded
// research and development center sponsored by the United States
// Department of Defense. The U.S. Government has license rights in this
// software pursuant to DFARS 252.227.7014.
//
// NO WARRANTY. THIS CARNEGIE MELLON UNIVERSITY AND SOFTWARE ENGINEERING
// INSTITUTE MATERIAL IS FURNISHED ON AN "AS-IS" BASIS. CARNEGIE MELLON
// UNIVERSITY MAKES NO WARRANTIES OF ANY KIND, EITHER EXPRESSED OR
// IMPLIED, AS TO ANY MATTER INCLUDING, BUT NOT LIMITED TO, WARRANTY OF
// FITNESS FOR PURPOSE OR MERCHANTABILITY, EXCLUSIVITY, OR RESULTS
// OBTAINED FROM USE OF THE MATERIAL. CARNEGIE MELLON UNIVERSITY DOES NOT
// MAKE ANY WARRANTY OF ANY KIND WITH RESPECT TO FREEDOM FROM PATENT,
// TRADEMARK, OR COPYRIGHT INFRINGEMENT.
//
// Released under a GNU GPL 2.0-style license, please see LICENSE.txt or
// contact [email protected] for full terms.
//
// [DISTRIBUTION STATEMENT A] This material has been approved for public
// release and unlimited distribution. Please see Copyright notice for
// non-US Government use and distribution.
//
// Carnegie Mellon(R) and CERT(R) are registered in the U.S. Patent and
// Trademark Office by Carnegie Mellon University.
//
// This software includes and/or makes use of third party software each
// subject to its own license as detailed in LICENSE-thirdparty.txt
//
// DM20-1143