water.fvec.AppendableVec Maven / Gradle / Ivy

Go to download
package water.fvec;

import water.*;
import water.parser.ParseTime;
import water.util.ArrayUtils;

import java.util.Arrays;

/**
 * A NEW single distributed vector column.
 *
 * The NEW vector has no data, and takes no space.  It supports distributed
 * parallel writes to it, via calls to append2.  Such writes happen in parallel
 * and all writes are ordered.  Writes *will* be local to the node doing them,
 * specifically to allow control over locality.  By default, writes will go to
 * local-homed chunks with no compression; there is a final 'close' to the NEW
 * vector which may do compression; the final 'close' will return some other
 * Vec type.  NEW Vectors do NOT support reads!
 */
public class AppendableVec extends Vec {

  // Per-chunk row counts: closeChunk stores chk._len at index (chk._cidx - _chunkOff).
  // Kept on this object (null is handed to the Vec constructor) so the array can be resized.
  public long _tmp_espc[];
  // Chunk type codes stored per chunk in _chunkTypes.  close() also uses these
  // values as indices into its chunk-type histogram, which is sized STRING+1.
  public static final byte NA     = 1;
  public static final byte ENUM   = 2;
  public static final byte NUMBER = 3;
  public static final byte TIME   = 4;
  public static final byte UUID   = 5;
  public static final byte STRING = 6;

  // Type code of each chunk, indexed the same way as _tmp_espc.
  private byte[] _chunkTypes;

  /**
   * Replaces the per-chunk type array.  The given array is stored by
   * reference; no copy is made.
   *
   * @param ts the new per-chunk type codes
   */
  public void setTypes(byte [] ts) {
    this._chunkTypes = ts;
  }

  /**
   * This is a hack to fix SVMLight parsing.  AppendableVec.close() checks the
   * type of each chunk; this method stamps the given type onto every chunk
   * slot before {@code cidx}, so that chunks that were never created are
   * counted with that type rather than being left unset.
   *
   * @param cidx exclusive upper bound of the chunk slots to overwrite
   * @param type type code written into slots {@code 0 .. cidx-1}
   */
  public void setPrecedingChunkTypes(int cidx, byte type) {
    // Make room first: the type array is grown to twice cidx when it is too short.
    if (cidx > _chunkTypes.length) {
      _chunkTypes = Arrays.copyOf(_chunkTypes, cidx << 1);
    }
    // Nothing to stamp for a non-positive bound.
    if (cidx > 0) {
      Arrays.fill(_chunkTypes, 0, cidx, type);
    }
  }

  // Roll-up counters.  closeChunk adds each closed chunk's figures to them;
  // reduce and setSubRange add the counters of another AppendableVec.
  long _naCnt;      // sum of chk.naCnt()
  long _enumCnt;    // sum of chk.enumCnt()
  long _strCnt;     // sum of chk.strCnt()
  long _timCnt;     // sum of chk._timCnt
  long _totalCnt;   // sum of chk._len

  // Subtracted from a chunk's _cidx to get its index into the per-chunk arrays.
  public int _chunkOff;         // Public so the parser can find it


  /**
   * Creates an appendable vector with a fresh four-entry per-chunk row-count
   * array and a chunk offset of 0.
   *
   * @param key key handed on to the three-argument constructor
   */
  public AppendableVec( Key key ) {
    this( key, new long[4], 0 );
  }

  /**
   * Creates an appendable vector over a caller-supplied per-chunk row-count array.
   *
   * @param key      key passed to the Vec constructor
   * @param espc     per-chunk row-count array; stored by reference, not copied
   * @param chunkOff offset subtracted from chunk indices when indexing the per-chunk arrays
   */
  public AppendableVec( Key key, long [] espc, int chunkOff) {
    // The superclass gets null for espc; the array is held here instead so its size can change.
    super(key, null);
    this._chunkOff = chunkOff;
    this._chunkTypes = new byte[4];
    this._tmp_espc = espc;
  }
  // A NewVector chunk was "closed" - completed.  Add its info to the roll-up.
  // This call is made in parallel across all node-local created chunks, but is
  // not called distributed.
  //
  // Records the chunk's length and type at index (chk._cidx - _chunkOff),
  // growing both per-chunk arrays by doubling as needed, and adds the chunk's
  // NA/enum/string/time/row counts to the running totals.
  synchronized void closeChunk( NewChunk chk ) {
    final int cidx = chk._cidx - _chunkOff;
    // A zero-length array cannot grow by doubling (0<<1 == 0), which would spin
    // forever; start such an array at length 1.  Non-empty arrays double as before.
    while( cidx >= _chunkTypes.length )
      _chunkTypes = Arrays.copyOf(_chunkTypes, _chunkTypes.length == 0 ? 1 : _chunkTypes.length<<1);
    while( cidx >= _tmp_espc.length ) // should not happen if espcs are preallocated and shared!
      _tmp_espc = Arrays.copyOf(_tmp_espc, _tmp_espc.length == 0 ? 1 : _tmp_espc.length<<1);
    _tmp_espc[cidx] = chk._len;
    _chunkTypes[cidx] = chk.type();
    _naCnt += chk.naCnt();
    _enumCnt += chk.enumCnt();
    _strCnt += chk.strCnt();
    _timCnt += chk._timCnt;
    _totalCnt += chk._len;
  }

  /**
   * Closes every vector in {@code avs} and blocks until all pending work
   * collected while closing them has completed.
   *
   * @param avs the appendable vectors to close
   * @return the closed vectors, in the same order as {@code avs}
   */
  public static Vec[] closeAll(AppendableVec [] avs) {
    final Futures pending = new Futures();
    final Vec[] closed = closeAll(avs, pending);
    pending.blockForPending();
    return closed;
  }

  /**
   * Closes every vector in {@code avs}, passing {@code fs} to each close call.
   * This method does not block on {@code fs} itself.
   *
   * @param avs the appendable vectors to close
   * @param fs  futures object handed to each {@code close}
   * @return the closed vectors, in the same order as {@code avs}
   */
  public static Vec[] closeAll(AppendableVec [] avs, Futures fs) {
    final Vec[] closed = new Vec[avs.length];
    int slot = 0;
    for( AppendableVec av : avs ) {
      closed[slot++] = av.close(fs);
    }
    return closed;
  }

  /**
   * Add AV build over sub-range of this vec (used e.g. by multifile parse where
   * each file produces its own AV which represents sub-range of the resulting vec).
   *
   * Copies {@code av}'s per-chunk row counts and type codes into this vec's
   * arrays starting at {@code av._chunkOff}, then adds {@code av}'s counters
   * to this vec's counters.
   *
   * @param av vector built over a sub-range; must have the same key as this vec
   */
  public void setSubRange(AppendableVec av) {
    assert _key.equals(av._key):"mismatched keys " + _key + ", " + av._key;
    final int n = av._tmp_espc.length;
    final int end = av._chunkOff + n;
    // Grow the destination arrays when the sub-range reaches past their end;
    // System.arraycopy would otherwise throw.
    if( _tmp_espc.length < end ) _tmp_espc = Arrays.copyOf(_tmp_espc, end);
    if( _chunkTypes.length < end ) _chunkTypes = Arrays.copyOf(_chunkTypes, end);
    System.arraycopy(av._tmp_espc, 0, _tmp_espc, av._chunkOff, n);
    // The espc length decides how many chunks are copied.  av's type array may be
    // longer than that, or shorter when few of its chunks were closed, so never
    // read past its end.
    System.arraycopy(av._chunkTypes, 0, _chunkTypes, av._chunkOff, Math.min(n, av._chunkTypes.length));
    _strCnt += av._strCnt;
    _naCnt += av._naCnt;
    _enumCnt += av._enumCnt;
    _timCnt += av._timCnt;
    _totalCnt += av._totalCnt;
  }

  /**
   * We declare a column to be string/enum if it has more enums than numbers.
   * The number count is taken to be the total count minus the string, NA and
   * enum counts.
   *
   * @return true when the enum count exceeds that number count
   */
  public boolean shouldBeEnum() {
    final long nonNumeric = _strCnt + _naCnt + _enumCnt;
    return _enumCnt > _totalCnt - nonNumeric;
  }

  // Class 'reduce' call on new vectors; to combine the roll-up info.
  // Called single-threaded from the M/R framework.
  // Folds nv's per-chunk arrays and counters into this vec.
  public void reduce( AppendableVec nv ) {
    if( this == nv ) return;    // Trivially done
    // Combine arrays of elements-per-chunk
    if(_tmp_espc != nv._tmp_espc) {   // Skipped when both vecs hold the very same array
      long e1[] = nv._tmp_espc;       // Shorter array of longs?
      if (e1.length > _tmp_espc.length) { // should not happen for shared espcs!
        e1 = _tmp_espc;               // Keep the shorter one in e1
        _tmp_espc = nv._tmp_espc;         // Keep longer in the object
      }
      // NOTE(review): the next line is truncated in this copy of the file.  The
      // rest of the loop over e1, the end of this if-block and the declaration
      // of t1 appear to be missing; restore them from the upstream source.
      for( int i=0; i _chunkTypes.length) {
      t1 = _chunkTypes;
      _chunkTypes = nv._chunkTypes;
    }
    // OR every entry of t1 into the type array kept on this object.
    for( int i =0 ; i < t1.length; ++i)
      _chunkTypes[i] |= t1[i];
    // Add nv's counters to this vec's counters.
    _naCnt += nv._naCnt;
    _enumCnt += nv._enumCnt;
    _strCnt += nv._strCnt;
    _timCnt += nv._timCnt;
    _totalCnt += nv._totalCnt;
  }


  // "Close" out a NEW vector - rewrite it to a plain Vec that supports random
  // reads, plus computes rows-per-chunk, min/max/mean, etc.
  public Vec close(Futures fs) {
    // Compute #chunks
    int nchunk = _tmp_espc.length;
    DKV.remove(chunkKey(nchunk),fs); // remove potential trailing key
    while( nchunk > 1 && _tmp_espc[nchunk-1] == 0 ) {
      nchunk--;
      DKV.remove(chunkKey(nchunk),fs); // remove potential trailing key
    }

    // Histogram chunk types
    int[] ctypes = new int[STRING+1];
    for( int i = 0; i < nchunk; ++i )
      ctypes[_chunkTypes[i]]++;

    // Odd case: new enum columns are usually made as new numeric columns,
    // with a domain pasted on afterwards.  All chunks look like
    // numbers.... but they are all valid enum numbers.
    boolean genEnumCol = false;
    if( ctypes[ENUM]==0 && ctypes[TIME] == 0 && ctypes[UUID] == 0 && ctypes[STRING] == 0 ) genEnumCol = true;
    // Make sure enums are consistent.  If we have a domain, and it is
    // dominating assume ENUM type.
    if( domain() != null && (genEnumCol || ctypes[ENUM]>ctypes[NUMBER]) ) {
      ctypes[ENUM] += ctypes[NUMBER]; ctypes[NUMBER]=0;
      ctypes[ENUM] += ctypes[NA]; ctypes[NA] = 0;        // All NA case
      if (nchunk == 0) ctypes[ENUM]++;                   // No rows in vec
    }

    // Find the dominant non-NA Chunk type.
    int idx = 0;
    for( int i=0; i ctypes[idx] )
        idx = i;

    if( idx!=ENUM ) setDomain(null);

    // Make Chunks other than the dominant type fail out to NAs.  This includes
    // converting numeric chunks to NAs in Enum columns - we cannot reverse
    // print the numbers to get the original text for the Enum back.
    for(int i = 0; i < nchunk; ++i)
      if(_chunkTypes[i] != idx && 
         !(idx==ENUM && _chunkTypes[i]==NUMBER && genEnumCol)) // Odd case: numeric chunks being forced/treated as a boolean enum
        DKV.put(chunkKey(i), new C0DChunk(Double.NaN, (int) _tmp_espc[i]),fs);

    byte type;
    switch( idx ) {
    case ENUM  : type = T_ENUM; break;
    case NUMBER: type = T_NUM ; break;
    case TIME  : type = T_TIME; break;
    case UUID  : type = T_UUID; break;
    case STRING: type = T_STR ; break;
    default    : type = T_BAD ; break;
    }

    // Compute elems-per-chunk.
    // Roll-up elem counts, so espc[i] is the starting element# of chunk i.
    long espc[] = new long[nchunk+1]; // Shorter array
    long x=0;                   // Total row count so far
    for( int i=0; i