// All Downloads are FREE. Search and download functionalities are using the official Maven repository.

// packer.DirMapping.scala Maven / Gradle / Ivy

// Copyright 2015-2022 by Carnegie Mellon University
// See license information in LICENSE.txt

package org.cert.netsa.mothra.packer

import com.typesafe.scalalogging.StrictLogging
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{Path => HPath, PathFilter}
import scala.collection.mutable.Set
import scala.util.Try

/**
  * A class to support iterating over the files in the directory `incomingDir`
  * and remembering the list of files between scans of the directory.  The
  * `jobs` method creates a [[PackFileJob]] for each file it finds unless the
  * file was seen on a previous check of the directory.
  *
  * Filenames in `incomingDir` that begin with a dot are ignored.  Files whose
  * size is 0 are also ignored.
  *
  * @param incomingDir The directory to check for incoming files.
  * @param packer A parameter passed to the [[PackFileJob]] constructor.
  * @param conf The Hadoop configuration.
  */
private[mothra] case class DirMapping(
  incomingDir: HPath,
  packer: CorePacker)
  (implicit conf: Configuration)
    extends StrictLogging
{
  /** The FileSystem that owns `incomingDir`, obtained on first use from the
    * class-level Hadoop configuration. */
  private[this] lazy val sFs = incomingDir.getFileSystem(conf)

  /**
    * Bookkeeping that keeps `jobs` from emitting a second job for a
    * path that was already offered on the preceding scan.
    *
    * Two sets of paths are kept: the paths offered to `checkPath`
    * since the last `freeze()` (the scan in progress) and the paths
    * collected by the scan that the last `freeze()` closed.
    * `checkPath(path)` always records `path` in the in-progress set
    * and answers whether the closed set contains it.  `freeze()`
    * promotes the in-progress set to be the closed set and starts a
    * new, empty in-progress set.
    *
    * Because `checkPath` records the path even when it answers
    * `true`, a path that keeps appearing on consecutive scans stays
    * suppressed for as long as it keeps appearing.
    *
    * FIXME: Only path names are compared.  If file 'foo' is handled
    * and a different file also named 'foo' shows up by the next scan,
    * the newcomer is taken for the old file and skipped.
    */
  private[this] object KnownFiles {
    /** Paths offered since the most recent `freeze()`. */
    private[this] var current = Set.empty[HPath]

    /** Paths gathered by the scan that the most recent `freeze()` closed. */
    private[this] var previous = Set.empty[HPath]

    /** Notes `path` as present in the scan in progress.  Returns `true`
      * when the preceding scan also recorded `path`, `false` otherwise. */
    def checkPath(path: HPath): Boolean = {
      // `current` and `previous` are never the same set object, so looking
      // up `previous` before recording into `current` yields the same answer.
      val seenLastScan = previous.contains(path)
      current += path
      seenLastScan
    }

    /** Closes the scan in progress: its paths become the "previous" set
      * and later `checkPath` calls accumulate into a fresh empty set. */
    def freeze(): Unit = {
      val finished = current
      current = Set.empty[HPath]
      previous = finished
    }
  }

  /** Tells whether `path` names an existing regular file.  `path` should
    * be a full pathname on the FileSystem that holds `incomingDir`.
    * Any non-fatal failure while querying the status yields `false`. */
  private[mothra] def fileExists(path: HPath): Boolean =
    // Both the status lookup and the isFile() call stay inside the Try, so
    // an exception from either one is reported as `false`.
    Try { sFs.getFileStatus(path) }.map(_.isFile()).getOrElse(false)


  /** Lists `incomingDir` and builds a [[PackFileJob]] for every entry
    * that passes `DirMapping.excludeDotFiles`, is a regular file, has a
    * non-zero length, and was not recorded by the preceding scan.
    *
    * The implicit `conf` parameter is not read in this method body; the
    * FileSystem comes from the configuration given to the constructor.
    *
    * @return An iterator over the newly created jobs, or an empty
    *         iterator when an `Exception` is raised while scanning (the
    *         failure is logged at error level).
    */
  def jobs(implicit conf: Configuration): Iterator[PackFileJob] = {
    logger.debug(s"Scanning '${incomingDir}'")
    try {
      // Strict filter followed by strict map: every checkPath() call has
      // completed before any job is built and before freeze() runs.
      val fresh: Seq[PackFileJob] = sFs
        .listStatus(incomingDir, DirMapping.excludeDotFiles)
        .toSeq
        .filter { status =>
          Try(status.isFile()).getOrElse(false) &&
            status.getLen() > 0 &&
            !KnownFiles.checkPath(status.getPath())
        }
        .map(status => PackFileJob(status.getPath(), packer))

      KnownFiles.freeze()
      fresh.foreach { job =>
        logger.trace(s"Found new source file '${job.sourcePath}'")
      }
      logger.info(
        s"Scanned ${incomingDir}; queuing ${fresh.size} new packing tasks")

      // value of the method when the scan succeeds
      fresh.iterator
    } catch {
      case ex: Exception =>
        logger.error(s"Failure reading source directory '${incomingDir}': $ex")
        // value of the method when the directory could not be read
        Iterator.empty
    }
  }
}


private[mothra] object DirMapping {
  /** A path filter that accepts a path only when its name (as given by
    * `getName()`) does not start with a dot. */
  private object excludeDotFiles extends PathFilter {
    def accept(p: HPath): Boolean = {
      val isDotFile = p.getName().startsWith(".")
      !isDotFile
    }
  }
}

// @LICENSE_FOOTER@
//
// Copyright 2015-2022 Carnegie Mellon University. All Rights Reserved.
//
// This material is based upon work funded and supported by the
// Department of Defense and Department of Homeland Security under
// Contract No. FA8702-15-D-0002 with Carnegie Mellon University for the
// operation of the Software Engineering Institute, a federally funded
// research and development center sponsored by the United States
// Department of Defense. The U.S. Government has license rights in this
// software pursuant to DFARS 252.227.7014.
//
// NO WARRANTY. THIS CARNEGIE MELLON UNIVERSITY AND SOFTWARE ENGINEERING
// INSTITUTE MATERIAL IS FURNISHED ON AN "AS-IS" BASIS. CARNEGIE MELLON
// UNIVERSITY MAKES NO WARRANTIES OF ANY KIND, EITHER EXPRESSED OR
// IMPLIED, AS TO ANY MATTER INCLUDING, BUT NOT LIMITED TO, WARRANTY OF
// FITNESS FOR PURPOSE OR MERCHANTABILITY, EXCLUSIVITY, OR RESULTS
// OBTAINED FROM USE OF THE MATERIAL. CARNEGIE MELLON UNIVERSITY DOES NOT
// MAKE ANY WARRANTY OF ANY KIND WITH RESPECT TO FREEDOM FROM PATENT,
// TRADEMARK, OR COPYRIGHT INFRINGEMENT.
//
// Released under a GNU GPL 2.0-style license, please see LICENSE.txt or
// contact permission@sei.cmu.edu for full terms.
//
// [DISTRIBUTION STATEMENT A] This material has been approved for public
// release and unlimited distribution. Please see Copyright notice for
// non-US Government use and distribution.
//
// Carnegie Mellon(R) and CERT(R) are registered in the U.S. Patent and
// Trademark Office by Carnegie Mellon University.
//
// This software includes and/or makes use of third party software each
// subject to its own license as detailed in LICENSE-thirdparty.txt
//
// DM20-1143




// © 2015 - 2024 Weber Informatics LLC | Privacy Policy