com.github.sadikovi.spark.netflow.DefaultSource.scala
/*
 * Copyright 2016 sadikovi
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.github.sadikovi.spark.netflow

import java.io.IOException

import scala.collection.JavaConverters._
import scala.util.{Failure, Success, Try}
import scala.util.control.NonFatal

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileStatus, FileSystem, FSDataInputStream, Path}
import org.apache.hadoop.mapreduce.Job

import org.apache.spark.TaskContext
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.execution.datasources.{FileFormat, OutputWriterFactory, PartitionedFile}
import org.apache.spark.sql.sources.{DataSourceRegister, Filter}
import org.apache.spark.sql.types.StructType

import org.slf4j.LoggerFactory

import com.github.sadikovi.netflowlib.NetFlowReader
import com.github.sadikovi.netflowlib.predicate.Operators.FilterPredicate
import com.github.sadikovi.spark.netflow.sources._
import com.github.sadikovi.spark.util.{CloseableIterator, SerializableConfiguration}

class DefaultSource extends FileFormat with DataSourceRegister {
  @transient private val log = LoggerFactory.getLogger(classOf[DefaultSource])
  // Resolved interface for NetFlow version schema
  private var interface: ResolvedInterface = _
  // Resolved datasource options
  private var opts: NetFlowOptions = _

  override def shortName(): String = "netflow"
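
  // Example usage (a minimal sketch; the "version" option is the one handled in
  // `resolveInterface` below, the load path is illustrative):
  //   spark.read
  //     .format("netflow")          // matches shortName()
  //     .option("version", "5")
  //     .load("/path/to/netflow/files")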

  /** Resolve interface based on datasource options and list of files to read */
  private[netflow] def resolveInterface(
      spark: SparkSession,
      options: Map[String, String],
      paths: Seq[Path]): ResolvedInterface = {
    // Create interface name to resolve SQL schema
    val className = options.get("version") match {
      case Some(version) => Try(version.toInt) match {
        case Success(flowVersion) => s"${DefaultSource.INTERNAL_PARTIAL_CLASSNAME}$flowVersion"
        case Failure(error) => version
      }
      case None =>
        // infer schema from files
        val flowVersion = DefaultSource.
          inferVersion(spark.sparkContext.hadoopConfiguration, paths).
          getOrElse(DefaultSource.VERSION_5)
        s"${DefaultSource.INTERNAL_PARTIAL_CLASSNAME}$flowVersion"
    }

    NetFlowRegistry.createInterface(className)
  }

  override def inferSchema(
      spark: SparkSession,
      options: Map[String, String],
      files: Seq[FileStatus]): Option[StructType] = {
    // Resolve interface based on provided options and files to determine the SQL schema
    if (interface == null) {
      interface = resolveInterface(spark, options, files.map(_.getPath))
    }
    log.info(s"Resolved interface as $interface")

    if (opts == null) {
      opts = new NetFlowOptions(options)
    }
    log.info(s"Resolved options as $opts")

    Some(interface.getSQLSchema(opts.applyConversion))
  }

  override def buildReader(
      spark: SparkSession,
      dataSchema: StructType,
      partitionSchema: StructType,
      requiredSchema: StructType,
      filters: Seq[Filter],
      options: Map[String, String],
      hadoopConf: Configuration): (PartitionedFile) => Iterator[InternalRow] = {
    // Check that interface and options are resolved
    assert(interface != null)
    assert(opts != null)

    // Convert SQL columns to internal mapped columns; also check if statistics are enabled, so
    // that the schema is adjusted to include statistics columns
    val resolvedColumns: Array[MappedColumn] = if (requiredSchema.isEmpty) {
      Array.empty
    } else {
      requiredSchema.fieldNames.map { col => interface.getColumn(col) }
    }

    // Resolve filters into ones we support, and reduce them to return a single Filter value
    val reducedFilter: Option[Filter] = NetFlowFilters.reduceFilter(filters)
    log.info(s"Reduced filter: $reducedFilter")

    // Convert filters into NetFlow filters; `usePredicatePushdown` allows disabling predicate
    // pushdown (normally this is done for benchmarks), but it can also serve as a short-term fix
    // when the library filters records incorrectly
    val resolvedFilter: Option[FilterPredicate] = reducedFilter match {
      case Some(filter) if opts.usePredicatePushdown =>
        Option(NetFlowFilters.convertFilter(filter, interface))
      case Some(filter) if !opts.usePredicatePushdown =>
        log.warn("NetFlow library-level predicate pushdown is disabled")
        None
      case other =>
        None
    }
    log.info(s"Resolved NetFlow filter: $resolvedFilter")

    prepareRead(spark, hadoopConf, interface, opts, resolvedColumns, resolvedFilter)
  }

  /** Return partial function to read partitioned files */
  private def prepareRead(
      spark: SparkSession,
      hadoopConf: Configuration,
      interface: ResolvedInterface,
      opts: NetFlowOptions,
      resolvedColumns: Array[MappedColumn],
      resolvedFilter: Option[FilterPredicate]): (PartitionedFile) => Iterator[InternalRow] = {
    // Required version of NetFlow files
    val flowVersion = interface.version()
    // Array of internal columns for NetFlow library
    val internalColumns = resolvedColumns.map(_.internalColumn)
    // Number of columns to process
    val numColumns = resolvedColumns.length
    // When true, ignore corrupt files, i.e. files with a wrong header or a corrupt data block
    val ignoreCorruptFiles =
      spark.conf.getOption("spark.files.ignoreCorruptFiles").map(_.toBoolean).getOrElse(false)
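    // For example, a user can enable this behaviour via
    //   spark.conf.set("spark.files.ignoreCorruptFiles", "true")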
    val confBroadcast = spark.sparkContext.broadcast(new SerializableConfiguration(hadoopConf))
    val optsBroadcast = spark.sparkContext.broadcast(opts)

    (file: PartitionedFile) => {
      // Use a separate logger for each partition; do not use the global logger of the default source
      val log = LoggerFactory.getLogger(classOf[DefaultSource])
      val opts = optsBroadcast.value
      val conf = confBroadcast.value.value

      val path = new Path(file.filePath)
      val fs = path.getFileSystem(conf)
      val fileLength = file.length

      // Prepare file stream
      var stm: FSDataInputStream = fs.open(path)
      // If reader initialization fails, we either reset the reader to null (when corrupt files
      // are ignored) or rethrow the error; the check below ensures that we log the corrupt file
      val reader = try {
        NetFlowReader.prepareReader(stm, opts.bufferSize, ignoreCorruptFiles)
      } catch {
        case NonFatal(err) if ignoreCorruptFiles => null
        case NonFatal(err) => throw new RuntimeException(s"${err.getMessage}, file=$path", err)
      }
      // This check only matters when ignoreCorruptFiles = true; otherwise initialization throws
      // an exception if the file is corrupt
      if (reader == null || !reader.isValid()) {
        log.warn(s"Failed to read file $path, ignoreCorruptFiles=$ignoreCorruptFiles")
        Iterator.empty
      } else {
        val header = reader.getHeader()
        // Actual version of the file
        val actualVersion = header.getFlowVersion()

        log.debug(s"""
            > NetFlow: {
            >   File: $path,
            >   File length: $fileLength bytes,
            >   Flow version: $actualVersion,
            >   Compression: ${header.isCompressed()},
            >   Buffer size: ${opts.bufferSize} bytes,
            >   Start capture: ${header.getStartCapture()},
            >   End capture: ${header.getEndCapture()},
            >   Hostname: ${header.getHostname()},
            >   Comments: ${header.getComments()}
            > }
          """.stripMargin('>'))

        // Currently we cannot resolve the version on the fly and proceed with parsing; the file
        // version must match the pre-resolved version
        require(actualVersion == flowVersion,
          s"Expected version $flowVersion, got $actualVersion for file $path. Scanning files " +
            "with different (but compatible) versions, e.g. 5, 6, and 7, is currently not supported")
        // Build record buffer based on the resolved filter; if the filter is not defined, use the
        // default scan with a trivial predicate
        val recordBuffer = resolvedFilter match {
          case Some(filter) => reader.prepareRecordBuffer(internalColumns, filter)
          case None => reader.prepareRecordBuffer(internalColumns)
        }

        val rawIterator = new CloseableIterator[Array[Object]] {
          private var delegate = recordBuffer.iterator().asScala
          // keep the file path to report in any error message
          private val filepath = path

          override def getNext(): Array[Object] = {
            // If the delegate has traversed all elements, mark the iterator as finished
            // so that the stream can be closed
            if (delegate.hasNext) {
              try {
                delegate.next
              } catch {
                case NonFatal(err) =>
                  throw new RuntimeException(s"${err.getMessage}, file=$filepath", err)
              }
            } else {
              finished = true
              null
            }
          }

          override def close(): Unit = {
            // Close the stream if possible or fail silently;
            // at this point the exception does not really matter
            try {
              if (stm != null) {
                stm.close()
                stm = null
              }
            } catch {
              case err: Exception => // do nothing
            }
          }
        }

        // Ensure that the reader is closed even if the task fails or doesn't consume the entire
        // iterator of records.
        Option(TaskContext.get()).foreach { taskContext =>
          taskContext.addTaskCompletionListener { _ =>
            rawIterator.closeIfNeeded
          }
        }

        // Conversion iterator, applies defined modifications for convertible fields
        val withConversionsIterator = if (opts.applyConversion) {
          // For each field we check if a conversion is available. If it is, we apply the direct
          // conversion, otherwise return the value unchanged. Note that this should be kept in
          // sync with `applyConversion` and the updated schema from `ResolvedInterface`.
          // TODO: improve performance of conversion
          rawIterator.map { arr =>
            for (i <- 0 until numColumns) {
              resolvedColumns(i).convertFunction match {
                case Some(func) => arr(i) = func.directCatalyst(arr(i))
                case None => // do nothing
              }
            }
            arr
          }
        } else {
          rawIterator
        }

        new Iterator[InternalRow] {
          override def next(): InternalRow = InternalRow.fromSeq(withConversionsIterator.next())
          override def hasNext: Boolean = withConversionsIterator.hasNext
        }
      }
    }
  }

  override def prepareWrite(
      spark: SparkSession,
      job: Job,
      options: Map[String, String],
      dataSchema: StructType): OutputWriterFactory = {
    throw new UnsupportedOperationException("Write is not supported in this version of the package")
  }

  override def equals(other: Any): Boolean = other match {
    case _: DefaultSource => true
    case _ => false
  }

  /** Get resolved interface, for testing */
  private[netflow] def getInterface(): ResolvedInterface = interface

  /** Get NetFlow options, for testing */
  private[netflow] def getOptions(): NetFlowOptions = opts
}

/** Companion object to maintain internal constants */
object DefaultSource {
  val INTERNAL_PARTIAL_CLASSNAME = "com.github.sadikovi.spark.netflow.version"
  val VERSION_5 = 5
  val VERSION_7 = 7
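  // For example, version 5 maps to INTERNAL_PARTIAL_CLASSNAME + 5, i.e.
  // "com.github.sadikovi.spark.netflow.version5" (see `resolveInterface` in DefaultSource)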

  /**
   * Infer NetFlow version given paths to the files.
   * Currently we look at only the first file and check its header version, since we only support
   * reading files that all have the same version.
   */
  def inferVersion(conf: Configuration, paths: Seq[Path]): Option[Int] = {
    if (paths.isEmpty) {
      None
    } else {
      // take first path and read header
      val path = paths.head
      val fs = path.getFileSystem(conf)
      var stream: FSDataInputStream = null
      try {
        // load file with default buffer size and extract header
        stream = fs.open(path)
        val reader = NetFlowReader.prepareReader(stream)
        val header = reader.getHeader()
        Some(header.getFlowVersion())
      } catch {
        case ioe: IOException =>
          throw new IOException(
            s"Failed to infer version for provided NetFlow files using path '$path', " +
            s"reason: $ioe. Try specifying version manually using 'version' option", ioe)
      } finally {
        if (stream != null) {
          stream.close()
          stream = null
        }
      }
    }
  }
}



