// silk_appender.DirMapping.scala Maven / Gradle / Ivy
// Copyright 2015-2022 by Carnegie Mellon University
// See license information in LICENSE.txt
package org.cert.netsa.mothra.tools.silk_appender
import org.cert.netsa.io.silk.SilkConfig
import com.typesafe.scalalogging.StrictLogging
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{Path, PathFilter}
import scala.collection.mutable.Set
import scala.util.Try
/**
 * A class to periodically check for packed SiLK Flow files in the
 * directory `incomingDir`. For each source file, the file's
 * contents are written to a target file in a directory hierarchy
 * rooted at the directory `rootDir`. The location of the target
 * file is determined by the given SilkConfig.
 *
 * Filenames in `incomingDir` that begin with a dot are ignored.
 *
 * The `jobs` method returns an Iterable of [[SilkFileJob]]
 * instances where each instance describes the processing of a
 * single source file.
 *
 * @param incomingDir The directory to check for incoming files.
 * @param rootDir The root of the directory where new files are created.
 * @param silkConfig The SiLK configuration used to map the name of a
 *     source file to the relative path of its target file.
 */
case class DirMapping(incomingDir: Path, rootDir: Path, silkConfig: SilkConfig)
    extends StrictLogging
{
  /** The number of times to attempt a job (must be at least 1). */
  val JOB_ATTEMPTS = 3

  // keep a handle to existing jobs so they can be tried again (until
  // the job count reaches JOB_ATTEMPTS)
  private[this] var knownJobs = Seq.empty[SilkFileJob]

  // The KnownFiles object is to prevent creating multiple jobs for
  // the same file or creating jobs for files that fail after
  // JOB_ATTEMPTS attempts.
  //
  // FIXME: Since we only compare path names, if we process file 'foo'
  // and a new file named 'foo' arrives within the interval, it will
  // be ignored.
  private[this] object KnownFiles {
    // Paths recorded during the scan that is currently in progress.
    private[this] var current = Set.empty[Path]
    // Paths recorded during the most recently completed scan.
    private[this] var previous = Set.empty[Path]

    /** Record the presence of `path` and return `true` if `path` was
      * seen during the previous scan; `false` otherwise. */
    def isKnown(path: Path): Boolean = {
      current += path
      previous.contains(path)
    }

    /** Make the paths recorded during this scan the "previous" set and
      * start a fresh, empty set for the next scan. */
    def saveList(): Unit = {
      previous = current
      current = Set.empty[Path]
    }
  }

  /** Scans `incomingDir` for files. A [[SilkFileJob]] is created for
    * each regular file whose name does not begin with a dot, whose
    * length is non-zero, that was not present during the previous
    * scan, and whose name `silkConfig` is able to map to a target
    * path. Jobs returned by the previous call that have failed and
    * have run fewer than `JOB_ATTEMPTS` times are appended to the
    * result so they may be attempted again.
    *
    * @param conf The Hadoop configuration used to get the file system
    *     of `incomingDir`.
    * @return The jobs to run; empty if reading `incomingDir` fails.
    */
  def jobs(implicit conf: Configuration): Iterable[SilkFileJob] = {
    logger.debug(s"Scanning '${incomingDir}'")
    try {
      val sFs = incomingDir.getFileSystem(conf)
      val newJobs: Seq[SilkFileJob] = for {
        sourceStatus <- sFs.listStatus(
          incomingDir, DirMapping.excludeDotFiles).toSeq
        // Use a Try because isFile can throw an exception
        if Try(sourceStatus.isFile()).getOrElse(false)
        if sourceStatus.getLen() > 0
        sourcePath = sourceStatus.getPath()
        // isKnown also records the path, so it must only be consulted
        // for entries that passed the checks above
        if !KnownFiles.isKnown(sourcePath)
        globInfo <- silkConfig.filenameToGlobInfo(sourcePath.toString)
        relativePath = silkConfig.fileInfoToPath(globInfo)
        destPath: Path = new Path(rootDir, relativePath)
      } yield SilkFileJob(sourcePath, destPath)
      KnownFiles.saveList()
      // previously returned jobs that failed and whose runCount is
      // still below the maximum get another attempt
      val retryJobs = knownJobs.filter { j =>
        j.runFailed && j.runCount < JOB_ATTEMPTS
      }
      val allJobs = newJobs ++ retryJobs
      knownJobs = allJobs
      // result on success
      allJobs
    } catch {
      case ex: Exception =>
        logger.error(s"Failure reading source directory '${incomingDir}': $ex")
        // result if reading the source dir fails; knownJobs is left
        // untouched so failed jobs remain eligible on the next scan
        Iterable.empty[SilkFileJob]
    }
  }
}
object DirMapping {
  /** A path filter that accepts a path only when the final component
    * of its name does not start with a dot. */
  private object excludeDotFiles extends PathFilter {
    override def accept(p: Path): Boolean = {
      val name = p.getName()
      !name.startsWith(".")
    }
  }
}
// @LICENSE_FOOTER@
//
// Copyright 2015-2022 Carnegie Mellon University. All Rights Reserved.
//
// This material is based upon work funded and supported by the
// Department of Defense and Department of Homeland Security under
// Contract No. FA8702-15-D-0002 with Carnegie Mellon University for the
// operation of the Software Engineering Institute, a federally funded
// research and development center sponsored by the United States
// Department of Defense. The U.S. Government has license rights in this
// software pursuant to DFARS 252.227.7014.
//
// NO WARRANTY. THIS CARNEGIE MELLON UNIVERSITY AND SOFTWARE ENGINEERING
// INSTITUTE MATERIAL IS FURNISHED ON AN "AS-IS" BASIS. CARNEGIE MELLON
// UNIVERSITY MAKES NO WARRANTIES OF ANY KIND, EITHER EXPRESSED OR
// IMPLIED, AS TO ANY MATTER INCLUDING, BUT NOT LIMITED TO, WARRANTY OF
// FITNESS FOR PURPOSE OR MERCHANTABILITY, EXCLUSIVITY, OR RESULTS
// OBTAINED FROM USE OF THE MATERIAL. CARNEGIE MELLON UNIVERSITY DOES NOT
// MAKE ANY WARRANTY OF ANY KIND WITH RESPECT TO FREEDOM FROM PATENT,
// TRADEMARK, OR COPYRIGHT INFRINGEMENT.
//
// Released under a GNU GPL 2.0-style license, please see LICENSE.txt or
// contact permission@sei.cmu.edu for full terms.
//
// [DISTRIBUTION STATEMENT A] This material has been approved for public
// release and unlimited distribution. Please see Copyright notice for
// non-US Government use and distribution.
//
// Carnegie Mellon(R) and CERT(R) are registered in the U.S. Patent and
// Trademark Office by Carnegie Mellon University.
//
// This software includes and/or makes use of third party software each
// subject to its own license as detailed in LICENSE-thirdparty.txt.
//
// DM20-1143