All Downloads are FREE. Search and download functionalities are using the official Maven repository.

silk_appender.DirMapping.scala Maven / Gradle / Ivy

// Copyright 2015-2022 by Carnegie Mellon University
// See license information in LICENSE.txt

package org.cert.netsa.mothra.tools.silk_appender

import org.cert.netsa.io.silk.SilkConfig

import com.typesafe.scalalogging.StrictLogging
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{Path, PathFilter}
import scala.collection.mutable.Set
import scala.util.Try

/**
  * A class to periodically check for packed SiLK Flow files in the
  * directory `incomingDir`.  For each source file, the file's
  * contents are written to a target file in a directory hierarchy
  * rooted at the directory `rootDir`.  The location of the target
  * file is determined by the given SilkConfig.
  *
  * Filenames in `incomingDir` that begin with a dot are ignored.
  *
  * The `jobs` method returns an Iterable of [[SilkFileJob]]
  * instances, one for each newly seen file plus any previously
  * returned job that failed and has not yet used up its attempts.
  *
  * @param incomingDir The directory to check for incoming files.
  * @param rootDir The root of the directory where new files are created.
  * @param silkConfig The SiLK configuration used to derive, from a
  *     source file's name, the relative path of its target file
  *     beneath `rootDir`.
  */
case class DirMapping(incomingDir: Path, rootDir: Path, silkConfig: SilkConfig)
    extends StrictLogging
{
  /** The number of times to attempt a job (must be at least 1). */
  val JOB_ATTEMPTS = 3

  // Jobs handed out by the most recent successful scan.  They are kept
  // so that failed jobs can be handed out again (until the job's
  // runCount reaches JOB_ATTEMPTS).
  private[this] var knownJobs = Seq.empty[SilkFileJob]

  // The KnownFiles object is to prevent creating multiple jobs for
  // the same file or creating jobs for files that fail after
  // JOB_ATTEMPTS attempts.
  //
  // FIXME: Since we only compare path names, if we process file 'foo'
  // and a new file named 'foo' arrives within the interval, it will
  // be ignored.
  private[this] object KnownFiles {
    // Paths seen so far during the scan that is in progress.
    private[this] var current = Set.empty[Path]

    // Paths seen during the last scan that ran to completion.
    private[this] var previous = Set.empty[Path]

    /** Record the presence of `path` in the current scan and return
      * `true` if `path` was seen during the previous scan; `false`
      * otherwise. */
    def isKnown(path: Path): Boolean = {
      current += path
      previous.contains(path)
    }

    /** Make the paths recorded during the current scan the "previous"
      * set and start a fresh, empty set for the next scan. */
    def saveList(): Unit = {
      previous = current
      current = Set.empty[Path]
    }
  }

  /** Scans `incomingDir` for files.  Files whose name does not begin
    * with a dot, whose length is non-zero, and which were not present
    * during the previous scan each yield a [[SilkFileJob]], provided
    * `silkConfig` can map the file's name to a target location.
    * Jobs returned by the previous scan that have failed and whose
    * run count is below `JOB_ATTEMPTS` are appended to the result so
    * they are tried again.
    *
    * @param conf The Hadoop configuration used to get the file system
    *     of `incomingDir`.
    * @return The jobs to run; empty if scanning `incomingDir` throws
    *     an Exception (the failure is logged, not propagated).
    */
  def jobs(implicit conf: Configuration): Iterable[SilkFileJob] = {
    logger.debug(s"Scanning '${incomingDir}'")
    try {
      val sFs = incomingDir.getFileSystem(conf)
      val newJobs: Seq[SilkFileJob] = for {
        sourceStatus <- sFs.listStatus(
          incomingDir, DirMapping.excludeDotFiles).toSeq
        // Use a Try because isFile can throw an exception D:
        if Try(sourceStatus.isFile()).getOrElse(false)
        if sourceStatus.getLen() > 0
        // isKnown also records the path, so it must come after the
        // other guards: only non-empty regular files are remembered.
        if !KnownFiles.isKnown(sourceStatus.getPath())
        sourcePath = sourceStatus.getPath()
        globInfo <- silkConfig.filenameToGlobInfo(sourcePath.toString)
        relativePath = silkConfig.fileInfoToPath(globInfo)
        destPath: Path = new Path(rootDir, relativePath)
      } yield SilkFileJob(sourcePath, destPath)

      KnownFiles.saveList()

      // previously returned jobs that failed and still have attempts
      // remaining
      val retryJobs = knownJobs.filter { j =>
        j.runFailed && j.runCount < JOB_ATTEMPTS
      }

      val allJobs = newJobs ++ retryJobs
      knownJobs = allJobs

      // result on success
      allJobs
    } catch {
      case ex: Exception =>
        // Pass the exception as the cause so the stack trace is logged.
        logger.error(
          s"Failure reading source directory '${incomingDir}': $ex", ex)
        Iterable() // result if reading the source dir fails
    }
  }

}


object DirMapping {
  /** A Hadoop [[org.apache.hadoop.fs.PathFilter]] that accepts a path
    * only when the final component of the path does not start with
    * a dot. */
  private object excludeDotFiles extends PathFilter {
    override def accept(candidate: Path): Boolean = {
      val fileName = candidate.getName()
      !fileName.startsWith(".")
    }
  }
}

// @LICENSE_FOOTER@
//
// Copyright 2015-2022 Carnegie Mellon University. All Rights Reserved.
//
// This material is based upon work funded and supported by the
// Department of Defense and Department of Homeland Security under
// Contract No. FA8702-15-D-0002 with Carnegie Mellon University for the
// operation of the Software Engineering Institute, a federally funded
// research and development center sponsored by the United States
// Department of Defense. The U.S. Government has license rights in this
// software pursuant to DFARS 252.227.7014.
//
// NO WARRANTY. THIS CARNEGIE MELLON UNIVERSITY AND SOFTWARE ENGINEERING
// INSTITUTE MATERIAL IS FURNISHED ON AN "AS-IS" BASIS. CARNEGIE MELLON
// UNIVERSITY MAKES NO WARRANTIES OF ANY KIND, EITHER EXPRESSED OR
// IMPLIED, AS TO ANY MATTER INCLUDING, BUT NOT LIMITED TO, WARRANTY OF
// FITNESS FOR PURPOSE OR MERCHANTABILITY, EXCLUSIVITY, OR RESULTS
// OBTAINED FROM USE OF THE MATERIAL. CARNEGIE MELLON UNIVERSITY DOES NOT
// MAKE ANY WARRANTY OF ANY KIND WITH RESPECT TO FREEDOM FROM PATENT,
// TRADEMARK, OR COPYRIGHT INFRINGEMENT.
//
// Released under a GNU GPL 2.0-style license, please see LICENSE.txt or
// contact permission@sei.cmu.edu for full terms.
//
// [DISTRIBUTION STATEMENT A] This material has been approved for public
// release and unlimited distribution. Please see Copyright notice for
// non-US Government use and distribution.
//
// Carnegie Mellon(R) and CERT(R) are registered in the U.S. Patent and
// Trademark Office by Carnegie Mellon University.
//
// This software includes and/or makes use of third party software each
// subject to its own license as detailed in LICENSE-thirdparty.txt
//
// DM20-1143




© 2015 - 2024 Weber Informatics LLC | Privacy Policy