// packer.DirMapping.scala
// Copyright 2015-2022 by Carnegie Mellon University
// See license information in LICENSE.txt
package org.cert.netsa.mothra.packer
import com.typesafe.scalalogging.StrictLogging
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{Path => HPath, PathFilter}
import scala.collection.mutable.Set
import scala.util.Try
/**
  * Supports repeated scans of the directory `incomingDir`, remembering the
  * files found by one scan so that the next scan can skip them.  The `jobs`
  * method creates a [[PackFileJob]] for each file it finds unless the file
  * was also seen by the immediately preceding scan of the directory.
  *
  * Filenames in `incomingDir` that begin with a dot are ignored.  Files
  * whose size is 0 are also ignored.
  *
  * @param incomingDir The directory to check for incoming files.
  * @param packer A parameter passed to the [[PackFileJob]] constructor.
  * @param conf The Hadoop configuration used to resolve the file system of
  *     `incomingDir`.
  */
private[mothra] case class DirMapping(
  incomingDir: HPath,
  packer: CorePacker)
  (implicit conf: Configuration)
    extends StrictLogging
{
  /** The file system that holds `incomingDir`; resolved on first use with
    * the constructor's `conf`. */
  private[this] lazy val sFs = incomingDir.getFileSystem(conf)

  /**
    * Prevents the creation of multiple jobs for the same file.  (The
    * original author's note adds that this also covers files that have
    * already used up their packing attempts; the retry limit itself is
    * not defined in this class.)
    *
    * The object maintains two sets of file paths: those seen during the
    * scan in progress and those seen during the previous scan.  Calling
    * `checkPath(path)` adds `path` to the current set and returns `true`
    * if `path` was seen during the previous scan or `false` otherwise.
    * Calling `freeze()` indicates that a scan has completed: the current
    * set becomes the previous set and a new, empty current set is
    * created.
    *
    * FIXME: Since we only compare path names, if we process file 'foo'
    * and a new file named 'foo' arrives within the interval, the new
    * 'foo' will be treated as the same file and be ignored.
    */
  private[this] object KnownFiles {
    /** Paths recorded by the scan in progress. */
    private[this] var current = Set.empty[HPath]
    /** Paths recorded by the most recently completed scan. */
    private[this] var previous = Set.empty[HPath]

    /** Records the presence of `path` in the current scan and returns
      * `true` if `path` was also recorded by the previous scan; `false`
      * otherwise. */
    def checkPath(path: HPath): Boolean = {
      current += path
      previous.contains(path)
    }

    /** Freezes the set of known files: the current set becomes the
      * previous set, and the next call to `checkPath()` starts filling a
      * new, empty set. */
    def freeze(): Unit = {
      previous = current
      current = Set.empty[HPath]
    }
  }

  /** Returns `true` if `path` exists and is a file; `false` otherwise
    * (including when querying the file system throws).  `path` is
    * expected to be a complete pathname on the same FileSystem as the
    * incomingDir. */
  private[mothra] def fileExists(path: HPath): Boolean =
    // Use a Try because getFileStatus/isFile can throw an exception
    Try(sFs.getFileStatus(path).isFile()).getOrElse(false)

  /**
    * Scans `incomingDir` for files and returns a [[PackFileJob]] for each
    * file whose name does not begin with a dot, whose length is non-zero,
    * and which was not seen by the previous scan.
    *
    * @param conf Accepted for interface compatibility; the body does not
    *     reference it.  The file system is resolved from the `conf` given
    *     to the constructor.
    * @return An iterator over the newly found jobs, or an empty iterator
    *     when reading the directory fails (the failure is logged).
    */
  def jobs(implicit conf: Configuration): Iterator[PackFileJob] = {
    logger.debug(s"Scanning '${incomingDir}'")
    try {
      val newJobs: Seq[PackFileJob] = for {
        sourceStatus <- sFs.listStatus(
          incomingDir, DirMapping.excludeDotFiles).toSeq
        if Try(sourceStatus.isFile()).getOrElse(false)
        if sourceStatus.getLen() > 0
        sourcePath = sourceStatus.getPath()
        // checkPath records the path as a side effect, so it must stay
        // the last guard: only non-empty regular files are remembered.
        if !KnownFiles.checkPath(sourcePath)
      } yield PackFileJob(sourcePath, packer)
      // The Seq above is fully built at this point, so every path of this
      // scan has been recorded before the sets are rotated.
      KnownFiles.freeze()
      for (j <- newJobs) {
        logger.trace(s"Found new source file '${j.sourcePath}'")
      }
      logger.info(
        s"Scanned ${incomingDir}; queuing ${newJobs.size} new packing tasks")
      // result on success
      newJobs.iterator
    } catch {
      case ex: Exception =>
        // Pass the exception as the cause so the stack trace is logged,
        // not only its toString.
        logger.error(
          s"Failure reading source directory '${incomingDir}': $ex", ex)
        Iterator.empty // result if reading the source dir fails
    }
  }
}
private[mothra] object DirMapping {
  /** A Hadoop `PathFilter` that accepts exactly those paths whose final
    * name component does not start with a dot. */
  private object excludeDotFiles extends PathFilter {
    override def accept(p: HPath): Boolean = {
      val name = p.getName()
      !name.startsWith(".")
    }
  }
}
// @LICENSE_FOOTER@
//
// Copyright 2015-2022 Carnegie Mellon University. All Rights Reserved.
//
// This material is based upon work funded and supported by the
// Department of Defense and Department of Homeland Security under
// Contract No. FA8702-15-D-0002 with Carnegie Mellon University for the
// operation of the Software Engineering Institute, a federally funded
// research and development center sponsored by the United States
// Department of Defense. The U.S. Government has license rights in this
// software pursuant to DFARS 252.227.7014.
//
// NO WARRANTY. THIS CARNEGIE MELLON UNIVERSITY AND SOFTWARE ENGINEERING
// INSTITUTE MATERIAL IS FURNISHED ON AN "AS-IS" BASIS. CARNEGIE MELLON
// UNIVERSITY MAKES NO WARRANTIES OF ANY KIND, EITHER EXPRESSED OR
// IMPLIED, AS TO ANY MATTER INCLUDING, BUT NOT LIMITED TO, WARRANTY OF
// FITNESS FOR PURPOSE OR MERCHANTABILITY, EXCLUSIVITY, OR RESULTS
// OBTAINED FROM USE OF THE MATERIAL. CARNEGIE MELLON UNIVERSITY DOES NOT
// MAKE ANY WARRANTY OF ANY KIND WITH RESPECT TO FREEDOM FROM PATENT,
// TRADEMARK, OR COPYRIGHT INFRINGEMENT.
//
// Released under a GNU GPL 2.0-style license, please see LICENSE.txt or
// contact permission@sei.cmu.edu for full terms.
//
// [DISTRIBUTION STATEMENT A] This material has been approved for public
// release and unlimited distribution. Please see Copyright notice for
// non-US Government use and distribution.
//
// Carnegie Mellon(R) and CERT(R) are registered in the U.S. Patent and
// Trademark Office by Carnegie Mellon University.
//
// This software includes and/or makes use of third party software each
// subject to its own license as detailed in LICENSE-thirdparty.txt
//
// DM20-1143