
package water.util;

import water.H2O;
import water.MRTask;
import water.fvec.Chunk;
import water.fvec.EnumWrappedVec;
import water.fvec.Vec;
/**
 * Simple summary of how many chunks of each type are in a Frame.
 */
public class ChunkSummary extends MRTask<ChunkSummary> {
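  // Usage sketch (assumes a populated Frame `fr` already in the DKV):
  //   ChunkSummary cs = new ChunkSummary().doAll(fr);
  //   System.out.println(cs); // toString() renders the compression and distribution tables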
  // static list of chunk types for which statistics are to be gathered
  final transient static String[] chunkTypes = new String[]{
    "C0L",
    "C0D",
    "CBS",
    "CX0",  // Sparse bit vector; popular, so kept near the head of the list
    "CXI",  // Sparse integers
    "C1",
    "C1N",
    "C1S",
    "C2",
    "C2S",
    "C4",
    "C4S",
    "C4F",
    "C8",
    "C16",  // UUID
    "CStr", // Strings
    "CXD",  // Sparse doubles
    "CUD",  // Few unique doubles
    "C8D",  // leave this as last -> no compression
  };
  final transient static String[] chunkNames = new String[]{
    "Constant Integers",
    "Constant Reals",
    "Bits",
    "Sparse Bits",
    "Sparse Integers",
    "1-Byte Integers",
    "1-Byte Integers (w/o NAs)",
    "1-Byte Fractions",
    "2-Byte Integers",
    "2-Byte Fractions",
    "4-Byte Integers",
    "4-Byte Fractions",
    "32-bit Reals",
    "64-bit Integers",
    "128-bit UUID",
    "String",
    "Sparse Reals",
    "Unique Reals",
    "64-bit Reals",
  };
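  // NOTE: chunkNames is index-aligned with chunkTypes above; keep the two lists in sync.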
  // OUTPUT
  private long[] chunk_counts;
  private long total_chunk_count;
  private long[] chunk_byte_sizes;
  private long total_chunk_byte_size;
  private long[] byte_size_per_node; // total compressed bytes held by each node
  private double byte_size_per_node_mean;
  private double byte_size_per_node_min;
  private double byte_size_per_node_max;
  private double byte_size_per_node_stddev;
  private long total_row_count;
  private long[] row_count_per_node;
  private double row_count_per_node_mean;
  private double row_count_per_node_min;
  private double row_count_per_node_max;
  private double row_count_per_node_stddev;
  private long total_chunk_count_per_col;
  private long[] chunk_count_per_col_per_node;
  private double chunk_count_per_col_per_node_mean;
  private double chunk_count_per_col_per_node_min;
  private double chunk_count_per_col_per_node_max;
  private double chunk_count_per_col_per_node_stddev;
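  // The *_per_node arrays above are indexed by cloud-node index: each map() call
  // accumulates only into H2O.SELF.index()'s slot, and reduce() sums the arrays
  // element-wise, so after the task finishes slot i holds node i's local totals.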
  @Override
  public void map(Chunk[] cs) {
    chunk_counts = new long[chunkTypes.length];
    chunk_byte_sizes = new long[chunkTypes.length];
    byte_size_per_node = new long[H2O.CLOUD.size()];
    row_count_per_node = new long[H2O.CLOUD.size()];
    chunk_count_per_col_per_node = new long[H2O.CLOUD.size()];
    for( Chunk c : cs ) { // Can be a big loop, for high column counts
      // Pull out the class name; trim a trailing "Chunk"
      String cname = c.getClass().getSimpleName();
      int nlen = cname.length();
      assert nlen > 5 && cname.charAt(nlen-5)=='C' && cname.charAt(nlen-1)=='k';
      String sname = cname.substring(0,nlen-5);
      // For wrapped categoricals, classify by the underlying chunk's type
      if (sname.equals("EnumWrapped")) {
        Chunk ec = ((EnumWrappedVec.EnumWrappedChunk)c)._c;
        cname = ec.getClass().getSimpleName();
        nlen = cname.length();
        assert nlen > 5 && cname.charAt(nlen-5)=='C' && cname.charAt(nlen-1)=='k';
        sname = cname.substring(0,nlen-5);
      }
      // Table lookup, roughly sorted by frequency
      int j;
      for( j = 0; j < chunkTypes.length; ++j )
        if( sname.equals(chunkTypes[j]) )
          break;
      if( j==chunkTypes.length ) throw H2O.fail("Unknown Chunk Type: " + sname);
      chunk_counts[j]++;
      chunk_byte_sizes[j] += c.byteSize();
      byte_size_per_node[H2O.SELF.index()] += c.byteSize();
    }
    // cs holds one chunk per column at the same chunk index, so count the rows
    // (and the per-column chunk) exactly once per map() call
    row_count_per_node[H2O.SELF.index()] += cs[0].len();
    total_row_count += cs[0].len();
    chunk_count_per_col_per_node[H2O.SELF.index()]++;
    total_chunk_count_per_col++;
  }
  @Override
  public void reduce(ChunkSummary mrt) {
    // Element-wise, in-place accumulation of the other task's tallies
    ArrayUtils.add(chunk_counts, mrt.chunk_counts);
    ArrayUtils.add(chunk_byte_sizes, mrt.chunk_byte_sizes);
    ArrayUtils.add(byte_size_per_node, mrt.byte_size_per_node);
    ArrayUtils.add(row_count_per_node, mrt.row_count_per_node);
    ArrayUtils.add(chunk_count_per_col_per_node, mrt.chunk_count_per_col_per_node);
    total_row_count += mrt.total_row_count;
    total_chunk_count_per_col += mrt.total_chunk_count_per_col;
  }
  @Override
  protected void postGlobal() {
    if (chunk_counts == null || chunk_byte_sizes == null || byte_size_per_node == null) return;
    assert(total_row_count == _fr.numRows());
    // compute counts and sizes
    total_chunk_byte_size = 0;
    total_chunk_count = 0;
    for (int j = 0; j < chunkTypes.length; ++j) {
      total_chunk_byte_size += chunk_byte_sizes[j];
      total_chunk_count += chunk_counts[j];
    }
    long check = 0;
    for (Vec v : _fr.vecs())
      check += v.nChunks();
    assert(total_chunk_count == check);
    // This doesn't always hold: FileVecs have a file-based byte size, while Vecs have a chunk-based byte size.
    // assert(total_chunk_byte_size == _fr.byteSize());
    double[] res = MathUtils.min_max_mean_stddev(byte_size_per_node);
    byte_size_per_node_min = res[0];
    byte_size_per_node_max = res[1];
    byte_size_per_node_mean = res[2];
    byte_size_per_node_stddev = res[3];
    res = MathUtils.min_max_mean_stddev(row_count_per_node);
    row_count_per_node_min = res[0];
    row_count_per_node_max = res[1];
    row_count_per_node_mean = res[2];
    row_count_per_node_stddev = res[3];
    res = MathUtils.min_max_mean_stddev(chunk_count_per_col_per_node);
    chunk_count_per_col_per_node_min = res[0];
    chunk_count_per_col_per_node_max = res[1];
    chunk_count_per_col_per_node_mean = res[2];
    chunk_count_per_col_per_node_stddev = res[3];
  }
  String display(long val) { return String.format("%10s", val == 0 ? " 0 B" : PrettyPrint.bytes(val)); }
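  // e.g. display(0) -> " 0 B" right-justified in a 10-character field; non-zero
  // sizes are humanized by PrettyPrint.bytes (exact text such as "1.5 KB" depends
  // on that helper's formatting).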
  public TwoDimTable toTwoDimTableChunkTypes() {
    final String tableHeader = "Chunk compression summary";
    int rows = 0;
    for (int j = 0; j < chunkTypes.length; ++j) if (chunk_counts != null && chunk_counts[j] > 0) rows++;
    final String[] rowHeaders = new String[rows];
    final String[] colHeaders = new String[]{"Chunk Type", "Chunk Name", "Count", "Count Percentage", "Size", "Size Percentage"};
    final String[] colTypes = new String[]{"string", "string", "int", "float", "string", "float"};
    final String[] colFormats = new String[]{"%8s", "%s", "%10d", "%10.3f %%", "%10s", "%10.3f %%"};
    final String colHeaderForRowHeaders = null;
    TwoDimTable table = new TwoDimTable(tableHeader, null, rowHeaders, colHeaders, colTypes, colFormats, colHeaderForRowHeaders);
    int row = 0;
    for (int j = 0; j < chunkTypes.length; ++j) {
      if (chunk_counts != null && chunk_counts[j] > 0) {
        table.set(row, 0, chunkTypes[j]);
        table.set(row, 1, chunkNames[j]);
        table.set(row, 2, chunk_counts[j]);
        table.set(row, 3, (float) chunk_counts[j] / total_chunk_count * 100.);
        table.set(row, 4, display(chunk_byte_sizes[j]));
        table.set(row, 5, (float) chunk_byte_sizes[j] / total_chunk_byte_size * 100.);
        row++;
      }
    }
    return table;
  }
  public TwoDimTable toTwoDimTableDistribution() {
    final String tableHeader = "Frame distribution summary";
    int rows = H2O.CLOUD.size() + 5;
    final String[] rowHeaders = new String[rows];
    int row;
    // One row per cloud node, then five summary rows
    for (row = 0; row < H2O.CLOUD.size(); ++row)
      rowHeaders[row] = H2O.CLOUD._memary[row].getIpPortString();
    rowHeaders[row++] = "mean";
    rowHeaders[row++] = "min";
    rowHeaders[row++] = "max";
    rowHeaders[row++] = "stddev";
    rowHeaders[row  ] = "total";
    final String[] colHeaders = new String[]{"Size", "Number of Rows", "Number of Chunks per Column", "Number of Chunks"};
    final String[] colTypes = new String[]{"string", "float", "float", "float"};
    final String[] colFormats = new String[]{"%s", "%f", "%f", "%f"};
    TwoDimTable table = new TwoDimTable(tableHeader, null, rowHeaders, colHeaders, colTypes, colFormats, "");
    for (row = 0; row < H2O.CLOUD.size(); ++row) {
      table.set(row, 0, display(byte_size_per_node[row]));
      table.set(row, 1, row_count_per_node[row]);
      table.set(row, 2, chunk_count_per_col_per_node[row]);
      table.set(row, 3, chunk_count_per_col_per_node[row] * _fr.numCols());
    }
    table.set(row,   0, display((long) byte_size_per_node_mean));
    table.set(row,   1, row_count_per_node_mean);
    table.set(row,   2, chunk_count_per_col_per_node_mean);
    table.set(row++, 3, chunk_count_per_col_per_node_mean * _fr.numCols());
    table.set(row,   0, display((long) byte_size_per_node_min));
    table.set(row,   1, row_count_per_node_min);
    table.set(row,   2, chunk_count_per_col_per_node_min);
    table.set(row++, 3, chunk_count_per_col_per_node_min * _fr.numCols());
    table.set(row,   0, display((long) byte_size_per_node_max));
    table.set(row,   1, row_count_per_node_max);
    table.set(row,   2, chunk_count_per_col_per_node_max);
    table.set(row++, 3, chunk_count_per_col_per_node_max * _fr.numCols());
    table.set(row,   0, display((long) byte_size_per_node_stddev));
    table.set(row,   1, row_count_per_node_stddev);
    table.set(row,   2, chunk_count_per_col_per_node_stddev);
    table.set(row++, 3, chunk_count_per_col_per_node_stddev * _fr.numCols());
    table.set(row,   0, display(total_chunk_byte_size));
    table.set(row,   1, total_row_count);
    table.set(row,   2, total_chunk_count_per_col);
    table.set(row,   3, total_chunk_count);
    return table;
  }

  @Override
  public String toString() {
    StringBuilder sb = new StringBuilder();
    sb.append(toTwoDimTableChunkTypes().toString());
    sb.append(toTwoDimTableDistribution().toString());
    if (H2O.CLOUD.size() > 1 && byte_size_per_node_stddev > 0.2 * byte_size_per_node_mean) {
      sb.append("** Note: Dataset is not well distributed, consider rebalancing **\n");
    }
    return sb.toString();
  }
}