// All Downloads are FREE. Search and download functionalities are using the official Maven repository.

// packer.Partitioner.scala Maven / Gradle / Ivy

// Copyright 2015-2022 by Carnegie Mellon University
// See license information in LICENSE.txt

package org.cert.netsa.mothra.packer

import org.cert.netsa.io.ipfix.{Record, DeepFieldExtractor}
//import org.cert.netsa.data.net.{IPv4Address => NetIPv4, IPv6Address => NetIPv6,
//  IPv4Block, IPv6Block}

import com.typesafe.scalalogging.StrictLogging
import java.net.URLEncoder
import scala.collection.immutable.{Vector, VectorBuilder}
import scala.util.matching.Regex
import scala.util.{Failure, Success, Try}

/**
  * The rule a Mothra packer uses, on one level of a partitioning
  * hierarchy, to decide which single partition (if any) a
  * [[org.cert.netsa.io.ipfix.Record Record]] is stored in.
  * [[Partitioner]]s partition (split) records into various directories
  * in the long-term data storage location.
  *
  * A single [[Partitioner]] typically splits on one attribute of a
  * [[org.cert.netsa.io.ipfix.Record Record]], such as its source IP
  * address, IP protocol, or destination port.
  *
  * [[PartitionerConfigurator]] uses a sequence of multiple
  * [[Partitioner]]s to partition on multiple attributes in order.
  *
  * @see [[Partitioner$ the companion object]] for numerous ways to
  *     define simple [[Partitioner]]s on various data types. These
  *     allow you to quickly specify ranges or specific values for
  *     different partitions.
 */
trait Partitioner {
  /** The delimiter that separates the operator, the field name, and
    * the argument(s) in the directory name generated by this
    * partitioner. */
  val delim: String = "="
  // Should be one of the characters in
  // org.cert.netsa.mothra.datasources.ipfix.IPFIXSource.CONSTRAINT_DELIMS_V2

  /** The maximum length of a path component that the file system
    * supports. */
  val max_path_component: Int = 255

  /**
    * Computes the path component this Partitioner assigns to the
    * Record `rec`.
    *
    * @param rec The record to place
    * @return The path component wrapped in an
    *     [[scala.Option Option]], or `None` when either the Record
    *     lacks the necessary information for the Partitioner or the
    *     Partitioner does not designate a specific path component for
    *     the Record.
    */
  def pathForRecord(rec: Record): Option[String]

  /**
    * Computes the path component from a String representing a
    * filename created by super_mediator running in invariant mode.
    *
    * @param filename The file name to examine
    * @return The path component wrapped in an
    *     [[scala.Option Option]], or `None` when either the String
    *     does not include the field or the Partitioner does not
    *     designate a specific path component.
    */
  def pathForFilename(filename: String): Option[String]

}


/**
  * An object to hold classes that extend the [[Partitioner]] trait.
  */
object Partitioner {

  /**
    * NumericPartitioner supports partitioning based on the numeric
    * field named `key` in a [[org.cert.netsa.io.ipfix.Record Record]],
    * where each partition represents records where that field either
    * has a specific (single) value or falls within a range of values.
    *
    * NumericPartitioner is the parent class of other classes for
    * specific numeric types (such as 16-bit integers and 32-bit
    * floating point numbers).  These subclasses must specify a numeric
    * type and the allowable range of values for that type.
    *
    * @param key The path of the Information Element to be partitioned on
    * @param min The minimum value supported for this numeric type
    * @param max The maximum value supported for this numeric type
    * @tparam T The type of the number
    */
  sealed abstract class NumericPartitioner[T : Ordering] (
    key: String,
    min: T,
    max: T) extends Partitioner with StrictLogging
  {
    import scala.math.Ordering.Implicits._

    /** Whether a record that lacks the field gets an "is_null=FIELD"
      * directory component instead of no component at all. */
    var useNullComponent: Boolean = false

    /** Set by `addNotIn`.  When defined, it is the path component
      * used for values of `key` that do NOT map to any Interval. */
    private[this] var notInString: Option[String] = None

    /** Becomes `true` once `coverRanges` has been called. */
    private[this] var coverRangesUsed: Boolean = false

    /** The key, URL encoded as UTF-8, as it appears in path
      * components. */
    private[this] val encodedKey: String = URLEncoder.encode(key, "UTF-8")

    /** Regex to parse the value of `key` from a super_mediator
      * invariant filename: the literal text `-KEY-` followed by a run
      * of digits (captured as group 1) and one non-digit character.
      *
      * The key is quoted with `Regex.quote` so that any regular
      * expression metacharacters it contains are matched literally
      * rather than interpreted as pattern syntax. */
    private[this] val regex =
      new Regex(s"-${Regex.quote(key)}-" + """(\d+)\D""")

    /**
      * One [[Interval]] of values within this Partitioner, together
      * with the directory name used for values that fall in it.
      *
      * @param beg The begin value of the [[Interval]]
      * @param begIncluded Whether the [[Interval]] includes `beg` ("<="
      *     vs "<")
      * @param end The end value of the [[Interval]]
      * @param endIncluded Whether the [[Interval]] includes `end` ("<="
      *     vs "<")
      * @param pathPart The directory name for
      *     [[org.cert.netsa.io.ipfix.Record Record]]s that are in this
      *     [[Interval]]
      * @param contains A function to determine whether a value is in
      *     this [[Interval]]
      */
    private[this] case class Interval(
      beg: T, begIncluded: Boolean,
      end: T, endIncluded: Boolean,
      pathPart: String,
      contains: (T) => Boolean)
    {
      /** Renders the bounds as, for example, `1 <= _ < 5`; `_` stands
        * for the value being tested. */
      override def toString: String = {
        val lowerOp = if (begIncluded) "<=" else "<"
        val upperOp = if (endIncluded) "<=" else "<"
        s"${beg} ${lowerOp} _ ${upperOp} ${end}"
      }
    }

    /** Companion of the Interval class; builds Intervals whose path
      * component and membership test are derived from the bounds. */
    private[this] object Interval {
      /**
        * Creates an interval running from `beg` to `end`, where either
        * bound may be excluded.  When the bounds are equal the result
        * is a single-value ("eq") interval and both bounds must be
        * included.
        */
      def apply(beg: T, begIncluded: Boolean, end: T, endIncluded: Boolean):
          Interval =
      {
        if ( beg == end ) {
          // a point: only meaningful when both ends are included
          assert(begIncluded && endIncluded)
          new Interval(beg, true, beg, true,
            s"eq${delim}${encodedKey}${delim}${beg}",
            (x: T) => x == beg)
        } else {
          // choose the operator name and the membership test together
          val (path, test): (String, T => Boolean) =
            (begIncluded, endIncluded) match {
              case (true, true) =>
                (s"ge_le${delim}${encodedKey}${delim}${beg}${delim}${end}",
                  (x: T) => x >= beg && x <= end)
              case (true, false) =>
                (s"ge_lt${delim}${encodedKey}${delim}${beg}${delim}${end}",
                  (x: T) => x >= beg && x < end)
              case (false, true) =>
                (s"gt_le${delim}${encodedKey}${delim}${beg}${delim}${end}",
                  (x: T) => x > beg && x <= end)
              case (false, false) =>
                (s"gt_lt${delim}${encodedKey}${delim}${beg}${delim}${end}",
                  (x: T) => x > beg && x < end)
            }
          new Interval(beg, begIncluded, end, endIncluded, path, test)
        }
      }
    }

    /** The Intervals that make up this partitioner, kept sorted by
      * the beginning of each interval.
      */
    private[this] var parts: Vector[Interval] = Vector.empty

    /** Extracts the field named by `key` from a Record. */
    private[this] val extractor = DeepFieldExtractor[T](key)

    /**
      * Looks up the interval that holds `value` and returns its
      * pathPart as an Option; the result is None when no interval
      * holds the value.
      */
    private[this] def findInterval(value: T): Option[String] = {
      // Binary search over `parts`, which is sorted by interval begin.
      @scala.annotation.tailrec
      def search(lo: Int, hi: Int): Option[String] = {
        if (lo > hi) {
          None
        } else {
          val mid = (lo + hi) >> 1
          val candidate = parts(mid)
          if (candidate.contains(value)) {
            Option(candidate.pathPart)
          } else if (value <= candidate.beg) {
            // value precedes this interval: continue in the lower half
            search(lo, mid - 1)
          } else {
            search(mid + 1, hi)
          }
        }
      }
      search(0, parts.length - 1)
    }

    /**
      * Returns the path component for the
      * [[org.cert.netsa.io.ipfix.Record Record]] `rec` based on this
      * partition as an Option.
      *
      * When the field `key` is present, the result is the path of the
      * interval holding its value, falling back to the "not_in" path
      * when `addNotIn` has been called, and otherwise None.  When the
      * field is absent, the result depends on `useNullComponent`: None
      * when it is `false`, and an "is_null" component naming the
      * encoded key when it is `true`.
      */
    def pathForRecord(rec: Record): Option[String] =
      rec.apply[T](extractor) match {
        case None if useNullComponent =>
          Option(s"is_null${delim}${encodedKey}")
        case None =>
          None
        case Some(value) =>
          findInterval(value).orElse(notInString)
      }

    /**
      * Returns the path component for the file named 'filename' based
      * on this partition as an Option.
      *
      * When the filename contains the field, its digits are parsed
      * and looked up like a record value; a value that cannot be
      * parsed is logged at trace level and yields the "not_in" path
      * (if any).  When the filename does not contain the field, the
      * result depends on `useNullComponent`: None when it is `false`,
      * and an "is_null" component naming the encoded key when it is
      * `true`.
      */
    def pathForFilename(filename: String): Option[String] =
      regex.findFirstMatchIn(filename) match {
        case None =>
          // the field does not appear in the filename
          if (useNullComponent) {
            Option(s"is_null${delim}${encodedKey}")
          } else {
            None
          }
        case Some(m) =>
          // the field appears; turn the captured digits into a T
          val lookup = Try {
            val digits = m.group(1)
            // The value gets boxed, so pick the conversion that yields
            // the right runtime type, using `min` as the witness of T
            val value: T = min match {
              case _: Long => digits.toLong.asInstanceOf[T]
              case _: Int => digits.toInt.asInstanceOf[T]
              case _: Short => digits.toShort.asInstanceOf[T]
              case _: Byte => digits.toByte.asInstanceOf[T]
              case _ =>
                throw new UnsupportedOperationException(
                  "Unsupported partition type")
            }
            findInterval(value).orElse(notInString)
          }
          lookup match {
            case Failure(e) =>
              logger.trace(
                s"Error parsing value for ${key} in '${filename}': ${e}")
              notInString
            case Success(found) => found
          }
      }


    /**
      * Verifies that the value 'v' lies within 'min' to 'max'
      * inclusive.  Helper used when adding a partition.
      *
      * @throws java.lang.IllegalArgumentException when `v` is below
      *     `min` or above `max`.
      */
    private[this] def checkRange(v: T): Unit = {
      if (v < min) {
        throw new IllegalArgumentException(
          s"Value ${v} is less than the allowed minimum of ${min}")
      } else if (v > max) {
        throw new IllegalArgumentException(
          s"Value ${v} is greater than the allowed maximum of ${max}")
      }
    }

    /**
      * Rejects an attempt to add a partition after the caller has
      * invoked one of the functions that covers all remaining values
      * (`coverRanges` or `addNotIn`).
      *
      * @throws java.lang.IllegalArgumentException when either of
      *     those functions has already been called.
      */
    private[this] def checkCompletelyCovered(): Unit = {
      val reason: Option[String] =
        if ( coverRangesUsed ) {
          Some("May not add a partition once coverRanges() has been called.")
        } else if ( notInString.nonEmpty ) {
          Some("May not add a partition once addNotIn() has been called.")
        } else {
          None
        }
      reason.foreach { msg => throw new IllegalArgumentException(msg) }
    }

    /**
      * Builds a new vector from the global `parts` Vector with the
      * interval 'v' placed at position 'pos'.  A 'pos' of -1 appends
      * `v`.  `parts` itself is not modified.
      */
    private[this] def insertIntervalAt(v: Interval, pos: Int):
        Vector[Interval] =
      pos match {
        case -1 =>
          // no insertion point: goes on the end
          parts :+ v
        case 0 =>
          // goes on the front
          v +: parts
        case _ =>
          // goes in the middle: cut at `pos` and rejoin around `v`
          val (front, back) = parts.splitAt(pos)
          (front :+ v) ++ back
      }

    /**
      * Adds the Interval `v` to the global Vector `parts`, keeping it
      * sorted.
      *
      * @throws java.lang.IllegalArgumentException when `v` overlaps
      *     the interval that would precede or follow it.
      */
    private[this] def addInterval(v: Interval): Unit = {
      // Index of the first interval 'p' that would follow 'v': either
      // p.beg is greater than v.beg, or p.beg equals v.beg and
      // v.begIncluded is true.  -1 when 'v' belongs at the end.
      val pos = parts.indexWhere { p =>
        if ( v.begIncluded ) p.beg >= v.beg else p.beg > v.beg
      }

      // the would-be neighbours of 'v'
      val (before, after): (Option[Interval], Option[Interval]) =
        if ( pos == -1 ) {
          (parts.lastOption, None)
        } else if ( pos > 0 ) {
          (Option(parts(pos - 1)), Option(parts(pos)))
        } else {
          (None, Option(parts(pos)))
        }

      // does the end of the preceding interval reach into 'v'?
      val clashBefore = before.filter { p =>
        v.beg < p.end || (v.beg == p.end && v.begIncluded && p.endIncluded)
      }
      // does 'v' reach into the beginning of the following interval?
      val clashAfter = after.filter { p =>
        p.beg < v.end || (p.beg == v.end && p.begIncluded && v.endIncluded)
      }
      for (p <- clashBefore.orElse(clashAfter)) {
        throw new IllegalArgumentException(
          s"New interval (${v}) overlaps with existing interval (${p})")
      }

      parts = insertIntervalAt(v, pos)
    }

    /** Adds a partition holding Records whose value `x` == `v`.
      *
      * @throws java.lang.IllegalArgumentException when an existing
      *     interval includes `v` or when `v` is outside the range
      *     `min`–`max`.
      */
    def addPartitionEquals(v: T): Unit = {
      checkCompletelyCovered()
      checkRange(v)
      val path = s"eq${delim}${encodedKey}${delim}${v}"
      val isMember = (x: T) => x == v
      addInterval(Interval(v, true, v, true, path, isMember))
    }

    /** Old name of [[addMultiplePartitionsEqualsAny]]; forwards to
      * it. */
    @deprecated("Replace with addMultiplePartitionsEqualsAny", "1.2.2")
    def addPartitionEqualsAny(s: Seq[T]): Unit = {
      addMultiplePartitionsEqualsAny(s)
    }

    /** Adds one partition per value in `s`, each holding Records
      * whose value `x` equals that value.  This is a convenience
      * function that invokes [[addPartitionEquals]] for each value in
      * `s`, in order.
      *
      * @throws java.lang.IllegalArgumentException when an existing
      *     interval includes any value in `s` or when any value in
      *     `s` is outside the range `min`–`max`.
      *
      * @see [[addSinglePartitionEqualsAny]] for a method that puts
      *     multiple values in a single partition
      */
    def addMultiplePartitionsEqualsAny(s: Seq[T]): Unit = {
      checkCompletelyCovered()
      s.foreach(addPartitionEquals)
    }

    /** Add a single partition for when Record value `x` equals any
      * value in the list.
      *
      * The operation is all-or-nothing: if any value is rejected
      * (for example because `s` contains a duplicate or a value that
      * is already covered), the intervals added for earlier values of
      * `s` are removed again before the exception propagates, so the
      * Partitioner is left as it was before the call.
      *
      * @throws java.lang.IllegalArgumentException when an existing
      *     interval includes any value in `s`, when `s` contains the
      *     same value more than once, or when any value in `s` is
      *     outside the range `min`–`max`.
      *
      * @see [[addMultiplePartitionsEqualsAny]] for a method that
      *     creates multiple partitions: one for each value in a
      *     sequence
      */
    def addSinglePartitionEqualsAny(s: Seq[T]): Unit = {
      checkCompletelyCovered()
      s.foreach(checkRange)

      // one path component naming every value; all of the intervals
      // created below share it
      val path = s.map(v => s"${v}").
        mkString(s"in${delim}${encodedKey}${delim}", delim, "")

      // `parts` is immutable, so keeping the old reference is enough
      // to undo a partially applied update
      val snapshot = parts
      try {
        for (v <- s) {
          addInterval(Interval(v, true, v, true, path, {(x: T) => (x == v)}))
        }
      } catch {
        case e: IllegalArgumentException =>
          // do not leave only some of the values registered under a
          // path that claims to hold all of them
          parts = snapshot
          throw e
      }
    }

    /**
      * Adds a partition holding Records whose value `x` >= `v`; the
      * interval runs from `v` up to and including `max`.  Since
      * partitions may not overlap, this method and
      * [[addPartitionGreaterThan]] may only be called one time for a
      * Partitioner.
      *
      * @throws java.lang.IllegalArgumentException when an existing
      *     interval includes any value greater than or equal to `v`
      *     or when `v` is outside the range `min`–`max`.
      */
    def addPartitionGreaterEquals(v: T): Unit = {
      checkCompletelyCovered()
      checkRange(v)
      val path = s"ge${delim}${encodedKey}${delim}${v}"
      val isMember = (x: T) => x >= v
      addInterval(Interval(v, true, max, true, path, isMember))
    }

    /**
      * Adds a partition holding Records whose value `x` > `v`; the
      * interval runs from just above `v` up to and including `max`.
      * Since partitions may not overlap, this method and
      * [[addPartitionGreaterEquals]] may only be called one time for a
      * Partitioner.
      *
      * @throws java.lang.IllegalArgumentException when an existing
      *     interval includes any value greater than `v`, when `v` is
      *     outside the range `min`–`max`, or when `v` equals `max`.
      */
    def addPartitionGreaterThan(v: T): Unit = {
      checkCompletelyCovered()
      checkRange(v)
      if (v == max) {
        // nothing can be greater than the maximum
        throw new IllegalArgumentException(
          "GreaterThan partition will always be empty when" +
            s" value equals the allowed maximum of ${max}")
      }
      val path = s"gt${delim}${encodedKey}${delim}${v}"
      val isMember = (x: T) => x > v
      addInterval(Interval(v, false, max, true, path, isMember))
    }

    /** Adds a partition holding Records whose value `x` <= `v`; the
      * interval runs from `min` up to and including `v`.  Since
      * partitions may not overlap, this method and
      * [[addPartitionLessThan]] may only be called one time for a
      * Partitioner.
      *
      * @throws java.lang.IllegalArgumentException when an existing
      *     interval includes any value less than or equal to `v` or
      *     when `v` is outside the range `min`–`max`.
      */
    def addPartitionLessEquals(v: T): Unit = {
      checkCompletelyCovered()
      checkRange(v)
      val path = s"le${delim}${encodedKey}${delim}${v}"
      val isMember = (x: T) => x <= v
      addInterval(Interval(min, true, v, true, path, isMember))
    }

    /** Adds a partition holding Records whose value `x` < `v`; the
      * interval runs from `min` up to but not including `v`.  Since
      * partitions may not overlap, this method and
      * [[addPartitionLessEquals]] may only be called one time for a
      * Partitioner.
      *
      * @throws java.lang.IllegalArgumentException when an existing
      *     interval includes any value less than `v`, when `v` is
      *     outside the range `min`–`max`, or when `v` equals `min`.
      */
    def addPartitionLessThan(v: T): Unit = {
      checkCompletelyCovered()
      checkRange(v)
      if (v == min) {
        // nothing can be less than the minimum
        throw new IllegalArgumentException(
          "LessThan partition will always be empty when" +
            s" value equals the allowed minimum of ${min}")
      }
      val path = s"lt${delim}${encodedKey}${delim}${v}"
      val isMember = (x: T) => x < v
      addInterval(Interval(min, true, v, false, path, isMember))
    }

    /**
      * Adds a partition holding Records whose value `x` satisfies
      * `lower` <= `x` <= `upper`; both bounds are included.
      *
      * @throws java.lang.IllegalArgumentException when an existing
      *     interval includes any value between `lower` and `upper`
      *     inclusive or when `lower` or `upper` are outside the range
      *     `min`–`max`.
      */
    def addPartitionRange(lower: T, upper: T): Unit = {
      addPartitionRange(lower, true, upper, true)
    }

    /**
      * Adds a partition holding Records whose value `x` is within
      * the range `lower` to `upper`, where `lower` and/or `upper` may
      * be excluded from the range by specifying the `lowerIncluded`
      * and/or `upperIncluded` parameters as `false`.
      *
      * @throws java.lang.IllegalArgumentException when an existing
      *     interval includes any value that would also be included in
      *     this interval, when `lower` or `upper` are outside the range
      *     `min`–`max`, when `lower` is greater than `upper`, or when
      *     either Boolean parameter is `false` and `lower` equals
      *     `upper`.
      */
    def addPartitionRange(
      lower: T, lowerIncluded: Boolean,
      upper: T, upperIncluded: Boolean): Unit =
    {
      checkCompletelyCovered()

      // the first violated constraint, if any, in checking order
      val problem: Option[String] =
        if (lower < min) {
          Some(
            s"Lower bound ${lower} is less than the allowed minimum of ${min}")
        } else if (upper > max) {
          Some(
            s"Upper bound ${upper} is greater than the allowed maximum of ${max}")
        } else if (lower > upper) {
          Some(
            s"Lower bound ${lower} is greater than the upper bound ${upper}")
        } else if (lower == upper && !(lowerIncluded && upperIncluded)) {
          Some(
            s"When the lower bound and upper bound are equal ${lower} both" +
              " lowerIncluded and upperIncluded must be true")
        } else {
          None
        }
      problem.foreach { msg => throw new IllegalArgumentException(msg) }

      addInterval(Interval(lower, lowerIncluded, upper, upperIncluded))
    }

    /**
      * Adds multiple range rules to this Partitioner for all values
      * that are not currently covered by an existing partition so that
      * every possible value between `min` and `max` is covered. In
      * addition, sets `useNullComponent` so that the Partitioner
      * creates an "is_null=KEY" directory for Records that do not
      * contain the field `key`.
      *
      * If called on an empty Partitioner, creates a Partition from
      * `min` to `max` inclusive.
      *
      * After this call no further partitions may be added.
      *
      * @throws java.lang.IllegalArgumentException when `coverRanges`
      *     or `addNotIn` has already been called.
      * @throws java.lang.RuntimeException when the existing Intervals
      *     are found to overlap.
      *
      * @see [[addNotIn]] for an alternative way to handle remaining
      *     values
      */
    def coverRanges(): Unit = {
      checkCompletelyCovered()

      // Nothing defined yet: a single Interval spanning min..max covers
      // everything.
      if ( parts.isEmpty ) {
        val i = Interval(min, true, max, true)
        addInterval(i)
        coverRangesUsed = true
        useNullComponent = true
        return
      }
      // new vector to build; at most one filler Interval is added per
      // existing Interval, plus one after the last
      val newParts = new VectorBuilder[Interval]
      newParts.sizeHint(2 + 2 * parts.length)

      // handle the first Interval and any unhandled values before it
      val p = parts(0)
      if ( p.beg != min ) {
        // gap between min and the first Interval; whether the filler
        // includes p.beg is the opposite of whether p does
        val v = p.beg
        val i = if ( p.begIncluded ) {
          Interval(min, true, v, false,
            s"lt${delim}${encodedKey}${delim}${v}", {(x: T) => (x < v)})
        } else {
          Interval(min, true, v, true,
            s"le${delim}${encodedKey}${delim}${v}", {(x: T) => (x <= v)})
        }
        newParts ++= Seq(i, p)
      } else if ( p.begIncluded ) {
        // first Interval already starts at min and includes it
        newParts += p
      } else {
        // first Interval starts at min but excludes it: cover the
        // single value min
        val i = Interval(min, true, min, true)
        newParts ++= Seq(i, p)
      }

      // the end of the values covered so far, and whether that end
      // value itself is covered
      var cur = p.end
      var curIncluded = p.endIncluded

      // loop over the remaining existing Intervals
      for (p <- parts.tail) {
        // ensure the current set of Intervals is sane
        if (p.beg < cur || (p.beg == cur && curIncluded && p.begIncluded)) {
          // something is wrong
          val upTo = if (curIncluded) { "<=" } else { "<" }
          throw new RuntimeException(s"Interval ${p} overlaps with previously" +
            s" processed values ${min} <= _ ${upTo} ${cur}")
        }
        if (p.beg > cur || (p.beg == cur && !curIncluded && !p.begIncluded)) {
          // add interval from cur to p.beg; each bound is included in
          // the filler exactly when its neighbor excludes it
          val i = Interval(cur, !curIncluded, p.beg, !p.begIncluded)
          newParts ++= Seq(i, p)
        } else {
          // p starts exactly where the covered values stop
          newParts += p
        }
        cur = p.end
        curIncluded = p.endIncluded
      }
      // handle any unhandled values after the final Interval
      if ( cur < max ) {
        val i = if ( curIncluded ) {
          Interval(cur, false, max, true,
            s"gt${delim}${encodedKey}${delim}${cur}", {(x: T) => (x > cur)})
        } else {
          Interval(cur, true, max, true,
            s"ge${delim}${encodedKey}${delim}${cur}", {(x: T) => (x >= cur)})
        }
        newParts += i
      } else if ( !curIncluded ) {
        // the last Interval ends at max but excludes it: cover the
        // single value max
        val i = Interval(max, true, max, true)
        newParts += i
      }

      parts = newParts.result()

      coverRangesUsed = true
      useNullComponent = true
    }

    /**
      * Adds a single additional rule to this Partitioner that covers
      * all values that are not currently covered by an existing
      * partition.  This method may only be used when the Partitioner
      * does not contain any ranges; that is, when only
      * [[addPartitionEquals]], [[addMultiplePartitionsEqualsAny]], and
      * [[addSinglePartitionEqualsAny]] have been used.
      *
      * Also sets `useNullComponent`, and no further partitions may be
      * added afterwards.
      *
      * @throws java.lang.RuntimeException when the resulting path
      *     component length would be larger than max_path_component,
      *     when the Partitioner contains ranges, or when the
      *     Partitioner is empty.
      *
      * @see [[coverRanges]] for an alternative way to handle remaining
      *     values
      */
    def addNotIn(): Unit = {
      checkCompletelyCovered()

      if ( parts.isEmpty ) {
        throw new RuntimeException(
          "May not use addNotIn() when no Partitions exist")
      }

      // every existing Interval must be a single point, not a range
      parts.foreach { p =>
        if ( p.beg != p.end ) {
          throw new RuntimeException(
            "May not use addNotIn() when Partitioner includes ranges")
        }
        assert(p.begIncluded)
        assert(p.endIncluded)
      }

      // the path component lists the value of every existing Interval
      val path = parts.map(p => s"${p.beg}").
        mkString(s"not_in${delim}${encodedKey}${delim}", delim, "")

      if (path.length > max_path_component) {
        throw new RuntimeException("Resulting path component is too long")
      }

      // Design note: an alternative, similar to coverRanges(), would be
      // to create an Interval that uses `path` for every uncovered
      // stretch of values (before the first point, between neighboring
      // points, and after the last point up to `max`).  Instead, the
      // string is stored and used whenever the binary search of the
      // Intervals returns None.
      notInString = Option(path)

      useNullComponent = true
    }

    /**
      * Verifies that the ranges of this Partitioner do not overlap.
      * The ranges in `parts` are visited in order while remembering
      * where the previously visited range ended.  A range overlaps
      * when it begins before that point, or begins exactly at that
      * point while both ranges include the shared value.
      *
      * @param verbose When `true`, print each range that is accepted,
      * every hole found between `min` and `max`, and every range that
      * overlaps the values seen before it.
      * @return `true` when no overlap was found and `false` otherwise.
      * Holes in the coverage are only reported; they do not change the
      * result.
      */
    def checkSanity(verbose: Boolean): Boolean = {
      // `edge` is the upper bound of the values examined so far and
      // `edgeCovered` tells whether `edge` itself is among them.
      // Nothing has been examined before the first range, so `min`
      // starts out as not covered.
      var edge = min
      var edgeCovered = false
      var sane = true

      parts.foreach { part =>
        val overlaps =
          part.beg < edge ||
            (part.beg == edge && edgeCovered && part.begIncluded)

        if (overlaps) {
          // something is wrong
          sane = false
          if (verbose) {
            val relation = if (edgeCovered) "<=" else "<"
            println(s"Overlap ${part} with previous values ${relation} ${edge}")
          }
        } else if (verbose) {
          // a hole exists when the range starts beyond `edge`, or starts
          // at `edge` while neither side includes that value
          val hasGap =
            part.beg > edge ||
              (part.beg == edge && !edgeCovered && !part.begIncluded)
          if (hasGap) {
            // the interval from `edge` to `part.beg` is missing
            val gap = Interval(edge, !edgeCovered, part.beg, !part.begIncluded)
            println(s"Missing ${gap}")
          }
          println(s"Covered ${part}")
        }

        edge = part.end
        edgeCovered = part.endIncluded
      }

      // report anything left uncovered after the final range
      if (verbose && (edge < max || !edgeCovered)) {
        val tail = Interval(edge, !edgeCovered, max, true)
        println(s"Missing ${tail}")
      }
      sane
    }

  }

  /** A [[NumericPartitioner]] over the `key` field of a
    * [[org.cert.netsa.io.ipfix.Record Record]] when that field holds an
    * unsigned 8-bit value.  Values are handled as `Short`; the bounds
    * given to the [[NumericPartitioner]] are 0 and 255 (2^8 - 1). */
  final case class UInt8(key: String)
      extends NumericPartitioner[Short](
        key, 0.toShort, ((1 << 8) - 1).toShort)

  /** A [[NumericPartitioner]] over the `key` field of a
    * [[org.cert.netsa.io.ipfix.Record Record]] when that field holds an
    * unsigned 16-bit value.  Values are handled as `Int`; the bounds
    * given to the [[NumericPartitioner]] are 0 and 65535 (2^16 - 1). */
  final case class UInt16(key: String)
      extends NumericPartitioner[Int](
        key, 0, (1 << 16) - 1)

  /** A [[NumericPartitioner]] over the `key` field of a
    * [[org.cert.netsa.io.ipfix.Record Record]] when that field holds an
    * unsigned 32-bit value.  Values are handled as `Long`; the bounds
    * given to the [[NumericPartitioner]] are 0 and 4294967295
    * (2^32 - 1). */
  final case class UInt32(key: String)
      extends NumericPartitioner[Long](
        key, 0L, (1L << 32) - 1L)

  /** A [[NumericPartitioner]] over the `key` field of a
    * [[org.cert.netsa.io.ipfix.Record Record]] when that field holds an
    * unsigned 64-bit value.  Values are handled as `Long`. */
  // FIXME: This upper bound is wrong.  It is the largest signed `Long`
  // (2^63 - 1); the largest unsigned 64-bit value (2^64 - 1) does not
  // fit in a `Long`.
  final case class UInt64(key: String)
      extends NumericPartitioner[Long](
        key, 0L, Long.MaxValue)

  /** A [[NumericPartitioner]] over the `key` field of a
    * [[org.cert.netsa.io.ipfix.Record Record]] when that field holds a
    * signed 8-bit value.  Values are handled as `Short`; the bounds
    * given to the [[NumericPartitioner]] are -128 and 127. */
  final case class Int8(key: String)
      extends NumericPartitioner[Short](
        key, Byte.MinValue.toShort, Byte.MaxValue.toShort)

  /** A [[NumericPartitioner]] over the `key` field of a
    * [[org.cert.netsa.io.ipfix.Record Record]] when that field holds a
    * signed 16-bit value.  Values are handled as `Int`; the bounds
    * given to the [[NumericPartitioner]] are -32768 and 32767. */
  final case class Int16(key: String)
      extends NumericPartitioner[Int](
        key, Short.MinValue.toInt, Short.MaxValue.toInt)

  /** A [[NumericPartitioner]] over the `key` field of a
    * [[org.cert.netsa.io.ipfix.Record Record]] when that field holds a
    * signed 32-bit value.  Values are handled as `Long`; the bounds
    * given to the [[NumericPartitioner]] are -2^31 and 2^31 - 1. */
  final case class Int32(key: String)
      extends NumericPartitioner[Long](
        key, Int.MinValue.toLong, Int.MaxValue.toLong)

  /** A [[NumericPartitioner]] over the `key` field of a
    * [[org.cert.netsa.io.ipfix.Record Record]] when that field holds a
    * signed 64-bit value.  Values are handled as `Long`; the bounds
    * given to the [[NumericPartitioner]] are the smallest and largest
    * `Long` values. */
  final case class Int64(key: String)
      extends NumericPartitioner[Long](
        key, Long.MinValue, Long.MaxValue)

  /** A [[NumericPartitioner]] over the `key` field of a
    * [[org.cert.netsa.io.ipfix.Record Record]] when that field holds a
    * 32-bit floating point value.  The bounds given to the
    * [[NumericPartitioner]] are the most negative and the most positive
    * finite `Float` values. */
  final case class Float32(key: String)
      extends NumericPartitioner[Float](
        key, -Float.MaxValue, Float.MaxValue)

  /** A [[NumericPartitioner]] over the `key` field of a
    * [[org.cert.netsa.io.ipfix.Record Record]] when that field holds a
    * 64-bit floating point value.  The bounds given to the
    * [[NumericPartitioner]] are the most negative and the most positive
    * finite `Double` values. */
  final case class Float64(key: String)
      extends NumericPartitioner[Double](
        key, -Double.MaxValue, Double.MaxValue)


  /*
   * ************************************************************************
   * These two classes allow partitioning based on IP addresses.
   *
   * Their logic is the same as that for the numeric types earlier in
   * this file, meaning that both classes treat the value as a number.
   * This does not work so well in Spark-land where IP addresses are
   * represented as strings.
   *
   * In addition, the IPv6Address class assumes ":" is a valid character
   * in a file system path.  These addresses may need to be URL encoded
   * instead.
   *
   * ************************************************************************
   */
/*

  /** Partition a [[org.cert.netsa.io.ipfix.Record Record]] on its `key`
    * field which contains an IPv4 Address. */
  final case class IPv4Address(key: String)
      extends NumericPartitioner[NetIPv4](key,
        NetIPv4("0.0.0.0"), NetIPv4("255.255.255.255"))
  {
    /**
      * Add a partition for when Record value `x` is contained in the
      * IPv4 Block `cidr`.
      */
    def addPartitionRange(cidr: IPv4Block): Unit =
      addPartitionRange(cidr.min, true, cidr.max, true)
  }

  /** Partition a [[org.cert.netsa.io.ipfix.Record Record]] on its `key`
    * field which contains an IPv6 Address. */
  final case class IPv6Address(key: String)
      extends NumericPartitioner[NetIPv6](key,
        NetIPv6("::"), NetIPv6("ffff:ffff:ffff:ffff:ffff:ffff:ffff:ffff")):
      Unit =
  {
    /**
      * Add a partition for when Record value `x` is contained in the
      * IPv6 Block `cidr`.
      */
    def addPartitionRange(cidr: IPv6Block): Unit =
      addPartitionRange(cidr.min, true, cidr.max, true)
  }

 */



  /**
    * Partition a [[org.cert.netsa.io.ipfix.Record Record]] on its `key`
    * field, creating a separate partition for each unique value seen
    * for that field.
    *
    * This Partitioner should be used for fields that have only a few
    * unique values, such as vlanId or observationDomain.  Other fields
    * should use a subclass of the [[NumericPartitioner]].  The
    * [[NumericPartitioner]] ensures that unexpected values go into a
    * relatively small number of files.  The [[NumericPartitioner]] is
    * also good for floating point values because of the issues
    * checking equality of floating point values.
    *
    * The type `T` should be an integer type.  It is passed to the
    * [[org.cert.netsa.io.ipfix.DeepFieldExtractor]].
    *
    * @param key The name of the Information Element to be partitioned on
    * @tparam T The type of the field which is expected to be an
    * integer type (`Short`, `Int`, or `Long`).
    */
  case class UniqueValue[T](key: String)
      extends Partitioner
  {
    /** Whether a missing value in the record is recorded by an
      * "is_null=FIELD" directory component. */
    final val useNullComponent = true

    /** The object used to extract the field 'key' from a Record. */
    private[this] val extractor = DeepFieldExtractor[T](key)

    /** URL encoded version of the key */
    private[this] val encodedKey = URLEncoder.encode(key, "UTF-8")

    /** Leading part of the path component used when a value is
      * present; the value itself is appended to it. */
    private[this] val partitionEqual = s"eq${delim}${encodedKey}${delim}"

    /** Path component used when no value is present and
      * `useNullComponent` is `true`. */
    private[this] val partitionNull = Option(s"is_null${delim}${encodedKey}")

    /** Regex to parse key from a super_mediator invariant filename.
      * It matches "-KEY-", one or more digits (captured), and one
      * non-digit character.  The key is quoted so that any regular
      * expression metacharacters it contains are matched literally. */
    private[this] val regex =
      new Regex("-" + Regex.quote(key) + "-" + """(\d+)\D""")

    /** The result to return when no value is available: the
      * "is_null" component when `useNullComponent` is `true` and
      * None otherwise. */
    private[this] def missingPath: Option[String] = {
      if (useNullComponent) {
        partitionNull
      } else {
        None
      }
    }

    /**
      * Returns the path component for the Record `rec` based on this
      * partition as an Option.  When the field `key` is present in the
      * Record, the component is "eq", the encoded key, and the field's
      * value, separated by `delim`.  The return value when the field
      * `key` is not present in the Record depends on the value of
      * `useNullComponent`: returns None when it is `false` and
      * {{{Option("is_null\${delim}\${encodedKey}")}}} when it is `true`.
      *
      * @param rec The Record from which the field `key` is extracted.
      */
    final def pathForRecord(rec: Record): Option[String] = {
      rec.apply[T](extractor) match {
        case Some(value) =>
          Option(s"${partitionEqual}${value}")
        case None =>
          missingPath
      }
    }

    /**
      * Returns the path component from the file named 'filename' based
      * on this partition as an Option.  The value is taken from the
      * first place in `filename` where "-KEY-" is followed by one or
      * more digits and then a non-digit character; those digits are
      * used as the value.  The return value when `filename` contains
      * no such text depends on the value of `useNullComponent`:
      * returns None when it is `false` and
      * {{{Option("is_null\${delim}\${encodedKey}")}}} when it is `true`.
      *
      * @param filename The name of the file to examine.
      */
    def pathForFilename(filename: String): Option[String] = {
      // WARNING: This grabs whatever numbers follow the key, even if the
      // value is outside the range of the key ("protocolIdentifier-777-")
      regex.findFirstMatchIn(filename) match {
        case Some(m) =>
          Option(s"${partitionEqual}${m.group(1)}")
        case None =>
          missingPath
      }
    }

  }


}

// @LICENSE_FOOTER@
//
// Copyright 2015-2022 Carnegie Mellon University. All Rights Reserved.
//
// This material is based upon work funded and supported by the
// Department of Defense and Department of Homeland Security under
// Contract No. FA8702-15-D-0002 with Carnegie Mellon University for the
// operation of the Software Engineering Institute, a federally funded
// research and development center sponsored by the United States
// Department of Defense. The U.S. Government has license rights in this
// software pursuant to DFARS 252.227.7014.
//
// NO WARRANTY. THIS CARNEGIE MELLON UNIVERSITY AND SOFTWARE ENGINEERING
// INSTITUTE MATERIAL IS FURNISHED ON AN "AS-IS" BASIS. CARNEGIE MELLON
// UNIVERSITY MAKES NO WARRANTIES OF ANY KIND, EITHER EXPRESSED OR
// IMPLIED, AS TO ANY MATTER INCLUDING, BUT NOT LIMITED TO, WARRANTY OF
// FITNESS FOR PURPOSE OR MERCHANTABILITY, EXCLUSIVITY, OR RESULTS
// OBTAINED FROM USE OF THE MATERIAL. CARNEGIE MELLON UNIVERSITY DOES NOT
// MAKE ANY WARRANTY OF ANY KIND WITH RESPECT TO FREEDOM FROM PATENT,
// TRADEMARK, OR COPYRIGHT INFRINGEMENT.
//
// Released under a GNU GPL 2.0-style license, please see LICENSE.txt or
// contact [email protected] for full terms.
//
// [DISTRIBUTION STATEMENT A] This material has been approved for public
// release and unlimited distribution. Please see Copyright notice for
// non-US Government use and distribution.
//
// Carnegie Mellon(R) and CERT(R) are registered in the U.S. Patent and
// Trademark Office by Carnegie Mellon University.
//
// This software includes and/or makes use of third party software each
// subject to its own license as detailed in LICENSE-thirdparty.tx
//
// DM20-1143




© 2015 - 2024 Weber Informatics LLC | Privacy Policy