All Downloads are FREE. Search and download functionalities are using the official Maven repository.

packer.DirWatcher.scala Maven / Gradle / Ivy

// Copyright 2015-2022 by Carnegie Mellon University
// See license information in LICENSE.txt

package org.cert.netsa.mothra.packer

import com.typesafe.scalalogging.StrictLogging
import java.util.concurrent.{
  LinkedBlockingDeque, ScheduledThreadPoolExecutor, TimeUnit}
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileStatus, Path => HPath, PathFilter}
import scala.collection.immutable.Set
import scala.collection.mutable.ArrayBuffer
import scala.util.{Failure, Success, Try}

/**
  * An abstract class to support iterating over the files in the directory
  * `incomingDir` and remembering the list of files between scans of the
  * directory.  When a file is first seen, its path and status are given to
  * the `handleFile` abstract method; the `DirWatcherResult` returned by
  * that method determines whether the scan moves on to the next file.
  *
  * Filenames in `incomingDir` that begin with a dot are ignored.  Files whose
  * size is 0 are also ignored.
  *
  * Scans are executed on `pool`, a scheduled executor created with a single
  * core thread, either one time via `runOnce()` or repeatedly via
  * `runPeriodically()`.
  *
  * @param incomingDir The directory to check for incoming files.
  * @param conf The Hadoop configuration.
  */
private[mothra] abstract class AbstractDirWatcher(incomingDir: HPath)
  (implicit conf: Configuration)
    extends StrictLogging
{
  /** The file system that holds `incomingDir`. */
  private[this] val sFs = incomingDir.getFileSystem(conf)

  /**
    * The KnownFiles object is to prevent the creation of multiple jobs
    * for the same file or creating jobs for files that fail after
    * `packAttemps` attempts.
    *
    * The object maintains two sets of file paths: Those seen during
    * this scan and those seen during the previous scan.  Calling
    * `checkPath(path)` adds `path` to the current set of files and
    * returns `true` if `path` was seen the previous scan or `false`
    * otherwise.  Calling `freeze()` indicates that a scan has
    * completed, and it makes the current set be the previous set and
    * creates a new empty set for current.
    *
    * FIXME: Since we only compare path names, if we process file
    * 'foo' and a new file named 'foo' arrives within the interval,
    * the new 'foo' will be treated as the same file and be ignored.
    */
  private[this] object KnownFiles {
    /** Paths recorded by `checkPath` since the last `freeze()`. */
    private[this] val current = ArrayBuffer.empty[HPath]

    /** Paths that were recorded when `freeze()` was last called. */
    private[this] var previous = Set.empty[HPath]

    /** Record the presence of `path` and return `true` if `path` has been
      * seen before; `false` otherwise. */
    def checkPath(path: HPath): Boolean = {
      current += path
      previous.contains(path)
    }
    /** Freeze the set of known files.  The next call to checkPath() will
      * open a new set of known files. */
    def freeze(): Unit = {
      previous = current.toSet
      current.clear()
    }
  }

  /**
    * The task given to `pool`.  Each run performs at most one scan of
    * `incomingDir`, bracketed by the `beforeScan()` and `afterScan()` hooks.
    */
  private[this] class Scanner() extends Runnable {
    def run(): Unit = {
      // NOTE(review): beforeScan() is invoked outside the try block below,
      // so an exception thrown by it escapes run() -- confirm that is
      // intended for the periodic case.
      beforeScan() match {
        case DirWatcherResult.TERMINATE =>
          shutdown()
        case DirWatcherResult.SKIP =>
          // do nothing
        case DirWatcherResult.CONTINUE =>
          logger.debug(s"Scanning '${incomingDir}'")
          var visitResult: DirWatcherResult =
            DirWatcherResult.CONTINUE
          try {
            for {
              sourceStatus <- sFs.listStatus(
                incomingDir, AbstractDirWatcher.excludeDotFiles)
              // Workaround scala/bug#11175 -Ywarn-unused:params false positive
              _ = sourceStatus
              // Once handleFile() returns anything other than CONTINUE, the
              // remaining directory entries are skipped.  The order of the
              // guards matters: checkPath() records the path as a side
              // effect, so it must only run for non-empty regular files
              // that are actually being visited.
              //
              // NOTE(review): entries skipped by the first guard are never
              // given to checkPath(), so if freeze() runs afterwards they
              // are treated as new on the next scan even if they had been
              // seen before -- confirm this is the intended behavior.
              if DirWatcherResult.CONTINUE == visitResult
              if Try(sourceStatus.isFile()).getOrElse(false)
              if sourceStatus.getLen() > 0
              if !KnownFiles.checkPath(sourceStatus.getPath())
              sourcePath = sourceStatus.getPath()
            } {
              logger.trace(s"Found new file '${sourcePath}'")
              // A non-fatal failure while handling one file is logged and
              // does not stop the scan.
              visitResult = Try {
                handleFile(sourcePath, sourceStatus)
              } match {
                case Success(x) => x
                case Failure(e) =>
                  logger.debug(s"Error handling incoming file: ${e.toString}")
                  DirWatcherResult.CONTINUE
              }
            }
            // Unless the scan was terminated, remember the files seen during
            // this scan and let afterScan() decide whether to keep going.
            if ( DirWatcherResult.TERMINATE != visitResult ) {
              KnownFiles.freeze()
              visitResult = afterScan()
            }
          } catch {
            case ex: Exception =>
              logger.error(
                s"Failure reading source directory '${incomingDir}': $ex")
          }
          if ( DirWatcherResult.TERMINATE == visitResult ) {
            shutdown()
          }
      }
    }
  }


  private[this] val factory = new PackerThreadFactory("DirWatcher-")

  /** The executor that runs the scans; created with one core thread. */
  protected val pool = new ScheduledThreadPoolExecutor(1, factory)

  // After shutdown() is called on the pool, do not continue periodic scans
  // or run pending delayed tasks, and remove cancelled tasks from the work
  // queue at the time of cancellation.
  pool.setContinueExistingPeriodicTasksAfterShutdownPolicy(false)
  pool.setExecuteExistingDelayedTasksAfterShutdownPolicy(false)
  pool.setRemoveOnCancelPolicy(true)

  /**
    * Called before performing a scan.  The scan is skipped if this method
    * returns `SKIP`, the watcher is shut down if it returns `TERMINATE`,
    * and the scan is performed if it returns `CONTINUE`.
    */
  protected def beforeScan(): DirWatcherResult

  /**
    * Called after performing a scan.  This is not invoked if beforeScan()
    * returned `SKIP` or `TERMINATE`, nor if handleFile() returned
    * `TERMINATE` during the scan.  The watcher is shut down if this method
    * returns `TERMINATE`.
    */
  protected def afterScan(): DirWatcherResult

  /**
    * Called at shutdown, before the thread pool is shut down.  The default
    * implementation does nothing.
    */
  protected def atShutdown(): Unit = {}

  /**
    * Called on each non-empty regular file in the directory that was not
    * seen during the previous scan.  Returning anything other than
    * `CONTINUE` causes the remaining files in this scan to be skipped;
    * returning `TERMINATE` also shuts the watcher down.
    *
    * @param file The path of the file.
    * @param status The file's status as returned by the directory listing.
    */
  protected def handleFile(file: HPath, status: FileStatus): DirWatcherResult

  /**
    * Scans the directory one time.  The scan is submitted to `pool` and
    * this method blocks until that scan has finished.
    */
  final def runOnce(): Unit = {
    pool.submit(new Scanner()).get()
    ()
  }

  /**
    * Schedules a scan to occur every `interval` seconds.  The first scan is
    * scheduled with no initial delay.
    *
    * Note that the executor suppresses all subsequent executions of a
    * fixed-rate task once one execution throws an exception.
    *
    * @param interval The number of seconds between the starts of scans.
    */
  final def runPeriodically(interval: Int): Unit = {
    pool.scheduleAtFixedRate(new Scanner(), 0, interval, TimeUnit.SECONDS)
    ()
  }

  /**
    * Stops the periodic scanning of the directory.  Invokes `atShutdown()`
    * and then shuts down the thread pool.  A non-fatal failure in
    * `atShutdown()` is logged and does not prevent the pool from being
    * shut down.
    */
  final def shutdown(): Unit = {
    Try { atShutdown() } match {
      case Failure(e) =>
        // Shutdown is best-effort: report the failure instead of silently
        // discarding it, then continue shutting down the pool.
        logger.warn(
          s"Error running shutdown hook for '${incomingDir}': ${e.toString}")
      case Success(_) =>
    }
    pool.shutdown()
  }

}


private[mothra] object AbstractDirWatcher {
  /**
    * A path filter that accepts a path only when its name (as given by
    * `getName()`) does not begin with a dot.
    */
  private object excludeDotFiles extends PathFilter {
    def accept(p: HPath): Boolean = {
      val leaf = p.getName()
      // An empty name cannot start with a dot, so it is accepted.
      leaf.isEmpty || leaf.charAt(0) != '.'
    }
  }
}



/**
  * A directory watcher that puts the path of each newly seen file in
  * `incomingDir` onto `deque`, and logs the number of new files found by
  * each scan.
  *
  * @param incomingDir The directory to check for incoming files.
  * @param deque Where to store the paths of the new files.
  * @param conf The Hadoop configuration.
  */
private[mothra] final case class DirWatcher(
  incomingDir: HPath,
  deque: LinkedBlockingDeque[HPath])
  (implicit conf: Configuration)
    extends AbstractDirWatcher(incomingDir)(conf)
{
  /**
    * The number of files added to `deque` since the most recent call to
    * `beforeScan()`.
    */
  var newfiles = 0

  /** Resets the new-file counter and always allows the scan to proceed. */
  def beforeScan(): DirWatcherResult = {
    newfiles = 0
    logger.info(s"Scanning ${incomingDir}...")
    DirWatcherResult.CONTINUE
  }

  /** Logs how many new files the scan found; never stops the watcher. */
  def afterScan(): DirWatcherResult = {
    logger.info(s"Scanned ${incomingDir}; found ${newfiles} new files")
    DirWatcherResult.CONTINUE
  }

  /**
    * Adds `file` to `deque`, waiting if necessary for space to become
    * available, and counts it as a new file for this scan.
    *
    * @param file The path of the new file.
    * @param status The file's status; not used by this implementation.
    */
  def handleFile(file: HPath, status: FileStatus): DirWatcherResult = {
    deque.put(file)
    // Count the file only after it has been queued so that the total
    // reported by afterScan() reflects the files actually handed off.
    newfiles += 1
    DirWatcherResult.CONTINUE
  }

}

// @LICENSE_FOOTER@
//
// Copyright 2015-2022 Carnegie Mellon University. All Rights Reserved.
//
// This material is based upon work funded and supported by the
// Department of Defense and Department of Homeland Security under
// Contract No. FA8702-15-D-0002 with Carnegie Mellon University for the
// operation of the Software Engineering Institute, a federally funded
// research and development center sponsored by the United States
// Department of Defense. The U.S. Government has license rights in this
// software pursuant to DFARS 252.227.7014.
//
// NO WARRANTY. THIS CARNEGIE MELLON UNIVERSITY AND SOFTWARE ENGINEERING
// INSTITUTE MATERIAL IS FURNISHED ON AN "AS-IS" BASIS. CARNEGIE MELLON
// UNIVERSITY MAKES NO WARRANTIES OF ANY KIND, EITHER EXPRESSED OR
// IMPLIED, AS TO ANY MATTER INCLUDING, BUT NOT LIMITED TO, WARRANTY OF
// FITNESS FOR PURPOSE OR MERCHANTABILITY, EXCLUSIVITY, OR RESULTS
// OBTAINED FROM USE OF THE MATERIAL. CARNEGIE MELLON UNIVERSITY DOES NOT
// MAKE ANY WARRANTY OF ANY KIND WITH RESPECT TO FREEDOM FROM PATENT,
// TRADEMARK, OR COPYRIGHT INFRINGEMENT.
//
// Released under a GNU GPL 2.0-style license, please see LICENSE.txt or
// contact permission@sei.cmu.edu for full terms.
//
// [DISTRIBUTION STATEMENT A] This material has been approved for public
// release and unlimited distribution. Please see Copyright notice for
// non-US Government use and distribution.
//
// Carnegie Mellon(R) and CERT(R) are registered in the U.S. Patent and
// Trademark Office by Carnegie Mellon University.
//
// This software includes and/or makes use of third party software each
// subject to its own license as detailed in LICENSE-thirdparty.txt
//
// DM20-1143




© 2015 - 2024 Weber Informatics LLC | Privacy Policy