// com.github.sadikovi.netflowlib.NetFlowReader (Maven / Gradle / Ivy)
// The newest version!
/*
* Copyright 2016 sadikovi
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.github.sadikovi.netflowlib;
import java.io.DataInputStream;
import java.io.IOException;
import java.nio.ByteOrder;
import java.util.HashMap;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.github.sadikovi.netflowlib.ScanPlanner;
import com.github.sadikovi.netflowlib.Strategies.ScanStrategy;
import com.github.sadikovi.netflowlib.Buffers.RecordBuffer;
import com.github.sadikovi.netflowlib.Buffers.EmptyRecordBuffer;
import com.github.sadikovi.netflowlib.Buffers.FilterRecordBuffer;
import com.github.sadikovi.netflowlib.Buffers.ScanRecordBuffer;
import com.github.sadikovi.netflowlib.predicate.Columns.Column;
import com.github.sadikovi.netflowlib.predicate.Operators.FilterPredicate;
import com.github.sadikovi.netflowlib.statistics.Statistics;
import com.github.sadikovi.netflowlib.statistics.StatisticsTypes.LongStatistics;
import com.github.sadikovi.netflowlib.version.NetFlow;
import com.github.sadikovi.netflowlib.version.NetFlowV5;
import com.github.sadikovi.netflowlib.version.NetFlowV7;
import com.github.sadikovi.netflowlib.util.WrappedByteBuf;
/**
* [[NetFlowReader]] is a main entry to process input stream of NetFlow file either from local
* file system or HDFS. Provides API to retrieve header and other metadata before scanning records.
* Uses statistics and planning based on [[ScanPlanner]] to decide whether or not the file needs
* to be scanned.
* [[FilterPredicate]] support is introduced to filter data on row basis.
*/
public final class NetFlowReader {
  private static final Logger log = LoggerFactory.getLogger(NetFlowReader.class);

  // Internal byte offsets:
  // metadata block = 2 magic bytes + 1 byte-order byte + 1 stream-version byte
  private static final short METADATA_LENGTH = 4;
  // stream version 3 encodes total header size in 4 bytes following the metadata
  private static final short HEADER_OFFSET_LENGTH = 4;
  // Header check flags ("cisco flow" magic numbers)
  private static final short HEADER_MAGIC1 = 0xCF;
  private static final short HEADER_MAGIC2 = 0x10;
  // Byte order encoding
  private static final short HEADER_LITTLE_ENDIAN = 1;
  private static final short HEADER_BIG_ENDIAN = 2;

  /**
   * Initialize reader with input stream and provided buffer size, see
   * com.github.sadikovi.netflowlib.Buffers for more information on buffer size constants.
   * @param inputStream input stream, can be Hadoop FSDataInputStream
   * @param buffer buffer size in bytes
   * @param ignoreCorruptFile if true, ignores corrupt file, either when reading header or data
   * @return reader
   * @throws IOException if the stream cannot be read or is not a NetFlow file
   */
  public static NetFlowReader prepareReader(
      DataInputStream inputStream,
      int buffer,
      boolean ignoreCorruptFile) throws IOException {
    return new NetFlowReader(inputStream, buffer, ignoreCorruptFile);
  }

  /**
   * Initialize reader with input stream and buffer size. By default, fails if
   * file is corrupt, e.g. file is not NetFlow file, or has corrupt data block.
   * @param inputStream input stream, can be Hadoop FSDataInputStream
   * @param buffer buffer size in bytes
   * @return reader
   * @throws IOException if the stream cannot be read or is not a NetFlow file
   */
  public static NetFlowReader prepareReader(
      DataInputStream inputStream,
      int buffer) throws IOException {
    return prepareReader(inputStream, buffer, false);
  }

  /**
   * Initialize reader with input stream and default buffer size ~1Mb. By default, fails if
   * file is corrupt, e.g. file is not NetFlow file, or has corrupt data block.
   * @param inputStream input stream, can be Hadoop FSDataInputStream
   * @return reader
   * @throws IOException if the stream cannot be read or is not a NetFlow file
   */
  public static NetFlowReader prepareReader(DataInputStream inputStream) throws IOException {
    return prepareReader(inputStream, RecordBuffer.BUFFER_LENGTH_2, false);
  }

  /**
   * [[NetFlowReader]] provides interface to get parsed header and record buffer with chosen
   * strategy based on columns, predicate and statistics. Metadata, header are parsed as part of
   * initialization.
   */
  private NetFlowReader(
      DataInputStream inputStream,
      int buffer,
      boolean ignoreCorruptFile) throws IOException {
    in = inputStream;
    bufferLength = buffer;
    ignoreCorrupt = ignoreCorruptFile;
    try {
      byte[] metadata = new byte[METADATA_LENGTH];
      // Use readFully instead of read: read() may return fewer bytes than requested without
      // reaching EOF, which previously went unnoticed (return value was ignored) and would parse
      // garbage metadata. readFully throws EOFException (an IOException) on truncation, so the
      // ignoreCorruptFile handling below applies as documented.
      in.readFully(metadata, 0, METADATA_LENGTH);
      // Parse metadata, byte order does not really matter, so we go for big endian. Metadata
      // contains magic numbers to verify consistency of the NetFlow file, byte order encoded as
      // either 1 or 2, and stream version which affects header parsing (currently only 1 and 3
      // are supported).
      WrappedByteBuf buf = WrappedByteBuf.init(metadata, ByteOrder.BIG_ENDIAN);
      short magic1 = buf.getUnsignedByte(0);
      short magic2 = buf.getUnsignedByte(1);
      short order = buf.getUnsignedByte(2);
      short stream = buf.getUnsignedByte(3);
      // Verify consistency of NetFlow file, also this ensures that we are at the beginning of
      // the input stream
      if (magic1 != HEADER_MAGIC1 || magic2 != HEADER_MAGIC2) {
        throw new IOException("Corrupt NetFlow file. Wrong magic number");
      }
      // Resolve byte order, last case corresponds to incorrect reading from buffer
      if (order == HEADER_BIG_ENDIAN) {
        byteOrder = ByteOrder.BIG_ENDIAN;
      } else if (order == HEADER_LITTLE_ENDIAN) {
        byteOrder = ByteOrder.LITTLE_ENDIAN;
      } else {
        throw new IOException("Could not recognize byte order " + order);
      }
      streamVersion = stream;
      // Check stream version
      ensureStreamVersion();
      // Read header
      header = getHeader();
    } catch (IOException err) {
      if (ignoreCorrupt) {
        // we subsume exception and log warning. Set header to corrupt placeholder
        log.warn("Failed to initialize reader, ignoreCorruptFile={}, error={}",
          ignoreCorrupt, err);
        header = new CorruptNetFlowHeader();
      } else {
        throw err;
      }
    }
  }

  /**
   * Ensure that stream version is either version 1 or version 3.
   * @throws UnsupportedOperationException for any other stream version
   */
  private void ensureStreamVersion() throws UnsupportedOperationException {
    if (streamVersion != 1 && streamVersion != 3) {
      throw new UnsupportedOperationException("Unsupported stream version " + streamVersion);
    }
  }

  /**
   * Prepare header using provided input stream. Assumes that the stream is positioned right
   * after the 4 metadata bytes consumed by the constructor.
   * @return parsed header for the current stream version
   * @throws IOException on truncated stream (EOFException) or other read failure
   */
  private NetFlowHeader prepareHeader() throws IOException {
    NetFlowHeader internalHeader;
    int lenRead;
    WrappedByteBuf buf;
    byte[] headerArray;
    // Read header depending on stream version (different from flow version)
    if (streamVersion == 1) {
      // Version 1 has static header
      // TODO: verify header size for stream version 1
      lenRead = NetFlowHeader.S1_HEADER_SIZE - METADATA_LENGTH;
      internalHeader = new NetFlowHeader(streamVersion, byteOrder);
    } else {
      // Version 3 with dynamic header size: next 4 bytes encode total header size
      headerArray = new byte[HEADER_OFFSET_LENGTH];
      // readFully handles short reads correctly; on truncation it throws EOFException, which is
      // an IOException and therefore honours the ignoreCorruptFile option in the constructor
      // (previously an UnsupportedOperationException escaped that handling).
      in.readFully(headerArray, 0, HEADER_OFFSET_LENGTH);
      buf = WrappedByteBuf.init(headerArray, byteOrder);
      int headerSize = (int) buf.getUnsignedInt(0);
      if (headerSize <= 0) {
        throw new UnsupportedOperationException("Failed to load header of size " + headerSize);
      }
      // Actual header length, determine how many bytes to read
      lenRead = headerSize - METADATA_LENGTH - HEADER_OFFSET_LENGTH;
      // Guard against a declared size smaller than the bytes already consumed, which would
      // otherwise surface as a NegativeArraySizeException below
      if (lenRead < 0) {
        throw new UnsupportedOperationException("Failed to load header of size " + headerSize);
      }
      internalHeader = new NetFlowHeader(streamVersion, byteOrder, headerSize);
    }
    // allocate buffer for length to read
    headerArray = new byte[lenRead];
    in.readFully(headerArray, 0, lenRead);
    // build buffer
    buf = WrappedByteBuf.init(headerArray, byteOrder);
    // resolve stream version (either 1 or 3)
    if (streamVersion == 1) {
      // Version 1 header is a fixed layout of scalar fields followed by two fixed-size strings
      internalHeader.setFlowVersion((short) buf.getUnsignedShort(0));
      internalHeader.setStartCapture(buf.getUnsignedInt(2));
      internalHeader.setEndCapture(buf.getUnsignedInt(6));
      internalHeader.setHeaderFlags(buf.getUnsignedInt(10));
      internalHeader.setRotation(buf.getUnsignedInt(14));
      internalHeader.setNumFlows(buf.getUnsignedInt(18));
      internalHeader.setNumDropped(buf.getUnsignedInt(22));
      internalHeader.setNumMisordered(buf.getUnsignedInt(26));
      // Read hostname fixed bytes
      // NOTE(review): new String(bytes) uses the platform default charset; presumably header
      // strings are ASCII — confirm before switching to an explicit charset
      byte[] hostnameBytes = new byte[NetFlowHeader.S1_HEADER_HN_LEN];
      buf.getBytes(30, hostnameBytes, 0, hostnameBytes.length);
      internalHeader.setHostname(new String(hostnameBytes));
      // Read comments fixed bytes
      byte[] commentsBytes = new byte[NetFlowHeader.S1_HEADER_CMNT_LEN];
      buf.getBytes(30 + hostnameBytes.length, commentsBytes, 0, commentsBytes.length);
      internalHeader.setComments(new String(commentsBytes));
    } else {
      // Resolve TLV (type-length value)
      // Set decode pointer to first tlv
      int dp = 0;
      int left = lenRead;
      // Smallest TLV is 2+2+0 (null TLV)
      // tlv_t - TLV type, tlv_l - TLV length, tlv_v - offset of TLV value within the buffer
      int tlv_t = 0;
      int tlv_l = 0;
      int tlv_v = 0;
      // Byte array for holding Strings
      byte[] pr;
      while (left >= 4) {
        // Parse type, store in host byte order
        tlv_t = buf.getUnsignedShort(dp);
        dp += 2;
        left -= 2;
        // Parse len, store in host byte order
        tlv_l = buf.getUnsignedShort(dp);
        dp += 2;
        left -= 2;
        // Parse val
        tlv_v = dp;
        // Point decode buffer at next tlv
        dp += tlv_l;
        left -= tlv_l;
        // TLV length check: declared value length runs past the header buffer, stop decoding
        if (left < 0) {
          break;
        }
        switch (tlv_t) {
          // FT_TLV_VENDOR
          case 0x1:
            internalHeader.setVendor(buf.getUnsignedShort(tlv_v));
            break;
          // FT_TLV_EX_VER
          case 0x2:
            internalHeader.setFlowVersion((short) buf.getUnsignedShort(tlv_v));
            break;
          // FT_TLV_AGG_VER
          case 0x3:
            internalHeader.setAggVersion(buf.getUnsignedByte(tlv_v));
            break;
          // FT_TLV_AGG_METHOD
          case 0x4:
            internalHeader.setAggMethod(buf.getUnsignedByte(tlv_v));
            break;
          // FT_TLV_EXPORTER_IP
          case 0x5:
            internalHeader.setExporterIP(buf.getUnsignedInt(tlv_v));
            break;
          // FT_TLV_CAP_START
          case 0x6:
            internalHeader.setStartCapture(buf.getUnsignedInt(tlv_v));
            break;
          // FT_TLV_CAP_END
          case 0x7:
            internalHeader.setEndCapture(buf.getUnsignedInt(tlv_v));
            break;
          // FT_TLV_HEADER_FLAGS
          case 0x8:
            internalHeader.setHeaderFlags(buf.getUnsignedInt(tlv_v));
            break;
          // FT_TLV_ROT_SCHEDULE
          case 0x9:
            internalHeader.setRotation(buf.getUnsignedInt(tlv_v));
            break;
          // FT_TLV_FLOW_COUNT
          case 0xA:
            internalHeader.setNumFlows(buf.getUnsignedInt(tlv_v));
            break;
          // FT_TLV_FLOW_LOST
          case 0xB:
            internalHeader.setNumDropped(buf.getUnsignedInt(tlv_v));
            break;
          // FT_TLV_FLOW_MISORDERED
          case 0xC:
            internalHeader.setNumMisordered(buf.getUnsignedInt(tlv_v));
            break;
          // FT_TLV_PKT_CORRUPT
          case 0xD:
            internalHeader.setNumCorrupt(buf.getUnsignedInt(tlv_v));
            break;
          // FT_TLV_SEQ_RESET
          case 0xE:
            internalHeader.setSeqReset(buf.getUnsignedInt(tlv_v));
            break;
          // FT_TLV_CAP_HOSTNAME
          case 0xF:
            pr = new byte[tlv_l];
            buf.getBytes(tlv_v, pr, 0, pr.length);
            // Expected null-terminated string
            if (pr[pr.length - 1] != 0) {
              throw new UnsupportedOperationException("Char sequence is not null-terminated");
            }
            internalHeader.setHostname(new String(pr, 0, pr.length - 1));
            break;
          // FT_TLV_COMMENTS
          case 0x10:
            pr = new byte[tlv_l];
            buf.getBytes(tlv_v, pr, 0, pr.length);
            // Expected null-terminated string
            if (pr[pr.length - 1] != 0) {
              throw new UnsupportedOperationException("Char sequence is not null-terminated");
            }
            internalHeader.setComments(new String(pr, 0, pr.length - 1));
            break;
          // FT_TLV_IF_NAME
          case 0x11:
            // uint32_t, uint16_t, string:
            // - IP address of device
            // - ifIndex of interface
            // - interface name
            long ip = buf.getUnsignedInt(tlv_v);
            int ifIndex = buf.getUnsignedShort(tlv_v + 4);
            pr = new byte[tlv_l - 4 - 2];
            buf.getBytes(tlv_v + 4 + 2, pr, 0, pr.length);
            if (pr[pr.length - 1] != 0) {
              throw new UnsupportedOperationException("Char sequence is not null-terminated");
            }
            internalHeader.setInterfaceName(ip, ifIndex, new String(pr, 0, pr.length - 1));
            break;
          // FT_TLV_IF_ALIAS
          case 0x12:
            // uint32_t, uint16_t, uint16_t, string:
            // - IP address of device
            // - ifIndex count
            // - ifIndex of interface (count times)
            // - alias name
            long aliasIP = buf.getUnsignedInt(tlv_v);
            int aliasIfIndexCnt = buf.getUnsignedShort(tlv_v + 4);
            int aliasIfIndex = buf.getUnsignedShort(tlv_v + 4 + 2);
            pr = new byte[tlv_l - 4 - 2 - 2];
            buf.getBytes(tlv_v + 4 + 2 + 2, pr, 0, pr.length);
            if (pr[pr.length - 1] != 0) {
              throw new UnsupportedOperationException("Char sequence is not null-terminated");
            }
            internalHeader.setInterfaceAlias(aliasIP, aliasIfIndexCnt, aliasIfIndex,
              new String(pr, 0, pr.length - 1));
            break;
          // Case 0x0 (null TLV) and unknown types are skipped
          default:
            break;
        }
      }
    }
    return internalHeader;
  }

  /**
   * Return NetFlow header for current input stream. Header is parsed lazily on first access and
   * cached afterwards (the constructor triggers parsing unless the file was corrupt and
   * ignoreCorruptFile was set).
   * @return parsed header
   * @throws IOException if header cannot be read from the stream
   */
  public NetFlowHeader getHeader() throws IOException {
    if (header == null) {
      header = prepareHeader();
    }
    return header;
  }

  /** Prepare record buffer for full scan */
  public RecordBuffer prepareRecordBuffer(Column[] columns) {
    return prepareRecordBuffer(columns, null);
  }

  /** Prepare record buffer with default statistics on time */
  public RecordBuffer prepareRecordBuffer(Column[] columns, FilterPredicate predicate) {
    return prepareRecordBuffer(columns, predicate, null);
  }

  /**
   * Prepare record buffer based on input stream.
   * @param columns columns to materialize
   * @param predicate optional row-level filter, may be null
   * @param stats optional statistics map (raw type kept for source compatibility), may be null
   * @return record buffer chosen by scan strategy
   * @throws UnsupportedOperationException for flow versions other than 5 and 7
   */
  public RecordBuffer prepareRecordBuffer(
      Column[] columns,
      FilterPredicate predicate,
      HashMap stats) {
    // Since we are using statistics on a field, we have to make sure that it is initialized
    // properly
    if (stats == null) {
      stats = new HashMap();
    }
    // Find out appropriate strategy for set of columns and predicate. We also update statistics
    // with start and end capture time of the file.
    NetFlow flowInterface;
    if (header.getFlowVersion() == 5) {
      flowInterface = new NetFlowV5();
      stats.put(NetFlowV5.FIELD_UNIX_SECS,
        new LongStatistics(header.getStartCapture(), header.getEndCapture()));
    } else if (header.getFlowVersion() == 7) {
      flowInterface = new NetFlowV7();
      stats.put(NetFlowV7.FIELD_UNIX_SECS,
        new LongStatistics(header.getStartCapture(), header.getEndCapture()));
    } else {
      throw new UnsupportedOperationException("Version " + header.getFlowVersion() +
        " is not supported");
    }
    ScanStrategy strategy = ScanPlanner.buildStrategy(columns, predicate, stats);
    return prepareRecordBuffer(strategy, flowInterface);
  }

  // Prepare record buffer based on strategy and flow interface. Method is currently private, so
  // there is no option to pass custom scan strategy.
  private RecordBuffer prepareRecordBuffer(ScanStrategy strategy, NetFlow flowInterface) {
    if (strategy == null) {
      throw new IllegalArgumentException("Expected ScanStrategy instance, got null");
    }
    if (flowInterface == null) {
      throw new IllegalArgumentException("Expected NetFlow instance, got null");
    }
    // Depending on different strategy we either skip file directly, return full buffer of
    // records, or return filtering buffer, if there is a FilterScan.
    boolean isCompressed = header.isCompressed();
    int recordSize = flowInterface.recordSize();
    if (strategy.skipScan()) {
      log.info("Skip scan based on strategy {}", strategy);
      return new EmptyRecordBuffer();
    } else if (strategy.fullScan()) {
      log.info("Full scan based on strategy {}, ignoreCorrupt={}", strategy, ignoreCorrupt);
      // wrap into closeable iterator
      return new ScanRecordBuffer(in, strategy.getRecordMaterializer(), recordSize, byteOrder,
        isCompressed, bufferLength, ignoreCorrupt);
    } else {
      log.info("Filter scan based on strategy {}, ignoreCorrupt={}", strategy, ignoreCorrupt);
      return new FilterRecordBuffer(in, strategy.getRecordMaterializer(), recordSize, byteOrder,
        isCompressed, bufferLength, ignoreCorrupt);
    }
  }

  @Override
  public String toString() {
    return getClass().getSimpleName() + "[byte order: " + byteOrder + ", stream version: " +
      streamVersion + ", buffer length: " + bufferLength + "]";
  }

  /** Return size of buffer in bytes that is currently used by reader */
  public int getBufferLength() {
    return this.bufferLength;
  }

  /**
   * Whether or not reader is valid, currently is based on validity of header, assuming that file
   * is of correct format, but might still have corrupt data blocks. See buffers implementation
   * for usage of `ignoreCorrupt`.
   */
  public boolean isValid() {
    return header.isValid();
  }

  // Stream of the NetFlow file
  private final DataInputStream in;
  // Byte order of the file
  private ByteOrder byteOrder;
  // Stream version of the file
  private short streamVersion;
  // Buffer size for record buffer
  private final int bufferLength;
  // NetFlow header, set to CorruptNetFlowHeader when parsing failed and corruption is ignored
  private NetFlowHeader header = null;
  // Whether or not to ignore corrupt file stream
  private final boolean ignoreCorrupt;
}