water.Value Maven / Gradle / Ivy

Go to download
package water;

import jsr166y.ForkJoinPool;
import water.fvec.Chunk;
import water.fvec.Frame;
import water.fvec.Vec;
import water.util.Log;

import java.io.IOException;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicReferenceFieldUpdater;

/** The core Value stored in the distributed K/V store, used to cache Plain Old
 *  Java Objects, and maintain coherency around the cluster.  It contains an
 *  underlying byte[] which may be spilled to disk and freed by the {@link
 *  MemoryManager}, which is the {@link Iced} serialized version of the POJO,
 *  and a cached copy of the POJO itself.
 *  
 *  Requests to extract the POJO from the Value object first try to return the
 *  cached POJO.  If that is missing, then they will re-inflate the POJO from
 *  the {@link Iced} byte[].  If that is missing it is only because the byte[]
 *  was swapped to disk by the {@link Cleaner}.  It will be reloaded from disk
 *  and then inflated as normal.
 *  

 *  The H2O {@link DKV} supports the full Java Memory Model coherency
 *  but only with Gets and Puts.  Normal Java updates to the cached POJO are
 *  local-node visible (due to X86 & Java coherency rules) but NOT cluster-wide
 *  visible until a Put completes after the update.
 *  

 *  By the same token, updates of the POJO are not reflected in the serialized
 *  form nor the disk-spill copy unless a Put is triggered.  As long as a local
 *  thread keeps a pointer to the POJO, they can update it at will.  If they
 *  wish to recover the POJO from the DKV at a later time with all updates
 *  intact, they must do a final Put after all updates.
 *  

 *  Value objects maintain the needed coherency state, as well as any cached
 *  copies, plus a bunch of utility & convenience functions.
 */
public final class Value extends Iced implements ForkJoinPool.ManagedBlocker {

  /** The Key part of a Key/Value store.  Transient, because the Value is
   *  typically found via its Key, and so the Key is available before we get
   *  the Value and does not need to be passed around the wire.  Not final,
   *  because Keys are interned slowly (for faster compares) and periodically a
   *  Value's Key will be updated to an interned but equivalent Key.
   *  <p>
   *  Should not be set by any user code.  */
  public transient Key _key;

  // ---
  // Type-id of the serialized object; see TypeMap for the list.
  // Might be a primitive array type, or an Iced POJO.
  private short _type;
  /** @return the type-id of the serialized object, widened from short to int */
  public int type() { return _type; }
  /** Class name of the embedded POJO, without needing an actual POJO.
   *  @return whatever {@code TypeMap.className} reports for this Value's type-id */
  public String className() { return TypeMap.className(_type); }

  // Max size, in bytes of the serialized form, of a Value before we start
  // asserting; the constructors and read_impl compare _max against it.
  // Sizes around this big, or larger, are probably true errors.
  // In any case, they will cause issues with both GC (giant pause times on
  // many collectors) and I/O (long term blocking of TCP I/O channels to
  // service a single request, causing starvation of other requests).
  public static final int MAX = 256*1024*1024;

  /** Size of the serialized wad of bits.  Values are wads of bits; known small
   *  enough to 'chunk' politely on disk, or fit in a Java heap (larger Vecs
   *  are built via Chunks) but (much) larger than a UDP packet.  Values can
   *  point to either the disk or ram version or both.  There's no compression
   *  smarts (done by the big data Chunks) nor de-dup smarts (done by the
   *  nature of a K/V).  This is just a local placeholder for some user bits
   *  being held at this local Node.
   *  <p>
   *  Set by the constructors and by {@code read_impl}, either from an explicit
   *  size argument or from the length of the serialized byte[]. */
  public int _max;

  // ---
  // A byte array of this Value when cached in DRAM, or NULL if not cached.  The
  // contents of _mem are immutable (Key/Value mappings can be changed by an
  // explicit PUT action).  Cleared to null asynchronously by the memory
  // manager (but only if persisted to some disk or in a POJO).  Can be filled
  // in by reloading from disk, or by serializing a POJO.
  private volatile byte[] _mem;
  // Raw peek at the cached bytes: returns the field as-is (possibly null) and
  // never rebuilds or reloads it.  Use memOrLoad() when the bytes are required.
  final byte[] rawMem() { return _mem; }

  // ---
  // A POJO version of the _mem array, or null if the _mem has not been
  // deserialized or if _mem is primitive data and not a POJO.  Cleared to null
  // asynchronously by the memory manager (but only if persisted to some disk,
  // or in the _mem array).  Can be filled in by deserializing the _mem array.

  // NOTE THAT IF YOU MODIFY any fields of a POJO that is part of a Value,
  // - this is NOT the recommended programming style,
  // - those changes are visible to all CPUs on the writing node,
  // - but not to other nodes, and
  // - the POJO might be dropped by the MemoryManager and reconstituted from
  //   disk and/or the byte array back to its original form, losing your changes.
  private volatile Freezable _pojo;
  // Raw peek at the cached POJO: returns the field as-is (possibly null) and
  // never inflates it from the byte[].
  Freezable rawPOJO() { return _pojo; }

  /** Invalidate byte[] cache.  Only used to eagerly free memory, for data
   *  which is expected to be read-once.  Under {@code -ea} this asserts that
   *  the Value is persisted, or still has a cached POJO, or is keyed by a
   *  chunk key, before the byte[] reference is dropped. */
  public final void freeMem() {
    assert isPersisted() || _pojo != null || _key.isChunkKey();
    _mem = null;
  }
  /** Invalidate POJO cache.  Only used to eagerly free memory, for data
   *  which is expected to be read-once.  Under {@code -ea} this asserts that
   *  the Value is persisted or still has its byte[] cached, so the POJO can
   *  be rebuilt later. */
  public final void freePOJO() {
    assert isPersisted() || _mem != null;
    _pojo = null;
  }

  /** Fast-path fetch of the serialized bytes - final method for speed.
   *  Returns the cached byte[] when present; otherwise rebuilds it from the
   *  cached POJO, or uses an empty array for a zero-sized Value, or reloads it
   *  from the persistence layer.  The rebuilt array is cached back into
   *  {@code _mem}.  Never returns NULL.
   *  @return byte[] holding the serialized POJO  */
  public final byte[] memOrLoad() {
    final byte[] cached = _mem;        // Single read of the volatile field
    if( cached != null ) return cached;

    final Freezable obj = _pojo;       // Single read of the volatile field
    byte[] bytes;
    if( obj instanceof Chunk ) {
      // Chunks have a custom serializer that skips all steps; just the chunk itself
      bytes = ((Chunk)obj).getBytes();
    } else if( obj != null ) {
      // Any other cached POJO: serialize it into a fresh buffer
      bytes = obj.write(new AutoBuffer()).buf();
    } else if( _max == 0 ) {
      // Nothing cached and nothing to load: a zero-length Value
      bytes = new byte[0];
    } else {
      // No in-memory copy at all; go back to the persistence layer
      bytes = loadPersist();
    }
    _mem = bytes;                      // Re-cache for the next caller
    return bytes;
  }
  // An empty shell of a Value: it claims a non-zero size, yet has no cached
  // bytes, no cached POJO and nothing persisted locally.  The Value is "real",
  // but any attempt to look at it will require a remote fetch.
  final boolean isEmpty() {
    if( _max <= 0 ) return false;                      // Zero-sized: nothing to fetch
    if( _mem != null || _pojo != null ) return false;  // Some local copy is cached
    return !isPersisted();                             // Empty only if not on local disk either
  }

  /** The FAST path get-POJO as an {@link Iced} subclass - final method for
   *  speed.  Returns the cached POJO when present; otherwise makes a new
   *  instance for this Value's type-id, reads it from the serialized bytes
   *  (see {@link #memOrLoad}) and caches it.  Never returns NULL.
   *  <p>
   *  The cast to {@code T} is unchecked: the caller picks {@code T} and a
   *  mismatch only shows up as a ClassCastException at the use site.
   *  @param <T> the expected {@link Iced} subtype of the POJO
   *  @return The POJO, probably the cached instance.  */
  @SuppressWarnings("unchecked") // T is chosen by the caller; cannot be checked here
  public final <T extends Iced> T get() {
    touch();                    // Record the access time
    Iced pojo = (Iced)_pojo;    // Read once!
    if( pojo != null ) return (T)pojo;
    pojo = TypeMap.newInstance(_type);
    pojo.read(new AutoBuffer(memOrLoad()));
    return (T)(_pojo = pojo);   // Cache the inflated POJO and hand it back
  }
  /** The FAST path get-POJO as a {@link Freezable} - final method for speed.
   *  Will (re)build the POJO from the _mem array.  Never returns NULL.  This
   *  version has more type-checking: with assertions enabled, the POJO's
   *  runtime class is checked against {@code fc}.
   *  @param <T> the expected {@link Freezable} subtype of the POJO
   *  @param fc class the POJO is expected to be an instance of
   *  @return The POJO, probably the cached instance.  */
  public final <T extends Freezable> T get(Class<T> fc) {
    T pojo = getFreezable();
    assert fc.isAssignableFrom(pojo.getClass());
    return pojo;
  }
  /** The FAST path get-POJO as a {@link Freezable} - final method for speed.
   *  Returns the cached POJO when present; otherwise makes a new Freezable for
   *  this Value's type-id, reads it from the serialized bytes (see
   *  {@link #memOrLoad}) and caches it.  Never returns NULL.
   *  <p>
   *  The cast to {@code T} is unchecked: the caller picks {@code T}.
   *  @param <T> the expected {@link Freezable} subtype of the POJO
   *  @return The POJO, probably the cached instance.  */
  @SuppressWarnings("unchecked") // T is chosen by the caller; cannot be checked here
  public final <T extends Freezable> T getFreezable() {
    touch();                    // Record the access time
    Freezable pojo = _pojo;     // Read once!
    if( pojo != null ) return (T)pojo;
    pojo = TypeMap.newFreezable(_type);
    pojo.read(new AutoBuffer(memOrLoad()));
    return (T)(_pojo = pojo);   // Cache the inflated POJO and hand it back
  }

  // ---
  // Time of last access to this value, in System.currentTimeMillis() units.
  // Initialized at construction and refreshed by touch().
  transient long _lastAccessedTime = System.currentTimeMillis();
  // Stamp "now" as the last access time.
  private void touch() {_lastAccessedTime = System.currentTimeMillis();}
  // Exposed and used for testing only; used to trigger premature cleaning/disk-swapping
  void touchAt(long time) {_lastAccessedTime = time;}

  // ---
  // Backend persistence info.  3 bits are reserved for 8 different flavors of
  // backend storage.  1 bit for whether or not the latest _mem field is
  // entirely persisted on the backend storage, or not.  Note that with only 1
  // bit here there is an unclosable datarace: one thread could be trying to
  // change _mem (e.g. to null for deletion) while another is trying to write
  // the existing _mem to disk (for persistence).  This datarace only happens
  // if we have racing deletes of an existing key, along with racing persist
  // attempts.  There are other races that are stopped higher up the stack: we
  // do not attempt to write to disk, unless we have *all* of a Value, so
  // extending _mem (from a remote read) should not conflict with writing _mem
  // to disk.
  //
  // The low 3 bits are final.
  // The on/off disk bit is strictly cleared by the higher layers (e.g. Value.java)
  // and strictly set by the persistence layers (e.g. PersistIce.java).
  private volatile byte _persist; // 3 bits of backend flavor; 1 bit of disk/notdisk
  // Backend flavors, stored in the low 3 bits of _persist (see BACKEND_MASK).
  public  final static byte ICE = 1<<0; // ICE: distributed local disks
  public  final static byte HDFS= 2<<0; // HDFS: backed by Hadoop cluster
  public  final static byte S3  = 3<<0; // Amazon S3
  public  final static byte NFS = 4<<0; // NFS: Standard file system
  public  final static byte TCP = 7<<0; // TCP: For profile purposes, not a storage system
  private final static byte BACKEND_MASK = (8-1); // 0b111: selects the backend flavor bits
  // Bit 3 is the on-disk flag.
  private final static byte NOTdsk = 0<<3; // latest _mem is persisted or not
  private final static byte ON_dsk = 1<<3;
  // Clear the on-disk flag.  Read-modify-write of a volatile field, so a racing
  // setdsk()/clrdsk() can be lost.
  private void clrdsk() { _persist &= ~ON_dsk; } // note: not atomic
  /** Used by the persistence subclass to mark this Value as saved-on-disk. */
  public final void setdsk() { _persist |=  ON_dsk; } // note: not atomic
  /** Check if the backing byte[] has been saved-to-disk
   *  @return true if the on-disk bit of the persist byte is set */
  public final boolean isPersisted() { return (_persist&ON_dsk)!=0; }
  // Backend flavor only (ICE, HDFS, ...), with the on-disk bit masked off.
  final byte backend() { return (byte)(_persist&BACKEND_MASK); }

  // ---
  // Backend-flavor predicates for the persistence layer(s): each compares the
  // masked backend bits against one flavor constant.
  boolean onICE() { return backend() == ICE; }
  private boolean onHDFS() { return backend() == HDFS; }
  private boolean onNFS() { return backend() == NFS; }
  private boolean onS3() { return backend() == S3; }

  /** Store a complete Value to its backend, unless it is already flagged as
   *  persisted.  With assertions enabled, checks that the persistence layer
   *  set the on-disk flag.
   *  @throws IOException if the persistence layer's store fails */
  void storePersist() throws IOException {
    if( !isPersisted() ) {
      H2O.getPM().store(backend(), this);
      assert isPersisted();
    }
  }

  /** Remove a dead Value from disk.  Does something only for Values that are
   *  flagged on-disk and live on the ICE backend: the on-disk flag is cleared
   *  first, then the persistence layer is asked to delete. */
  void removePersist() {
    // Deliberately leaves _mem alone: a racing get may still be holding on to it.
    if( !isPersisted() ) return;  // Never hit disk?
    if( !onICE() ) return;        // Only ICE-backed data is deleted here
    clrdsk();                     // Not persisted now
    H2O.getPM().delete(backend(), this);
  }
  /** Load the bytes of a completely persisted Value from its backend.
   *  An IOException from the persistence layer is passed to
   *  {@code Log.throwErr} and the result is thrown.
   *  @return the bytes handed back by the persistence layer */
  byte[] loadPersist() {
    assert isPersisted();
    byte[] bytes;
    try {
      bytes = H2O.getPM().load(backend(), this);
    } catch( IOException ex ) {
      throw Log.throwErr(ex);
    }
    return bytes;
  }

  /** @return Short name of this Value's own backend; see {@link #nameOfPersist(int)} */
  String nameOfPersist() { return nameOfPersist(backend()); }
  /** One of ICE, HDFS, S3, NFS or TCP, according to where this Value is persisted.
   *  @param x backend flavor code, e.g. the result of {@code backend()}
   *  @return Short String of the persistence name, or null for an unknown code */
  public static String nameOfPersist(int x) {
    if( x == ICE  ) return "ICE";
    if( x == HDFS ) return "HDFS";
    if( x == S3   ) return "S3";
    if( x == NFS  ) return "NFS";
    if( x == TCP  ) return "TCP";
    return null;                // Not a known backend flavor
  }

  /** Set persistence to HDFS from ICE.  Pins the bytes in a local, deletes
   *  the ICE copy, flips the backend bits to HDFS with the on-disk flag clear,
   *  stores through the persistence layer, and finally restores {@code _mem}
   *  from the pinned local.  The statement order matters; see inline notes.
   *  @throws IOException if storing to the new backend fails */
  private void setHdfs() throws IOException {
    assert onICE();
    byte[] mem = memOrLoad();   // Get into stable memory
    removePersist();            // Remove from ICE disk
    // Overwrite the whole persist byte: backend=HDFS, on-disk flag clear, so
    // the storePersist() below does not short-circuit.
    _persist = Value.HDFS|Value.NOTdsk;
    storePersist();
    assert onHDFS();       // Flipped to HDFS
    _mem = mem; // Close a race with the H2O cleaner zapping _mem while removing from ice
  }

  /** Check whether the class registered under a type-id is a subtype of the
   *  given class.  Uses the TypeMap's Freezable for that id, so no Value or
   *  POJO is required.
   *  @param type type-id to look up in the TypeMap
   *  @param clz candidate supertype
   *  @return True if the type-id's class is assignable to {@code clz}. */
  public static boolean isSubclassOf(int type, Class clz) {
    final Class<?> pojoClass = TypeMap.theFreezable(type).getClass();
    return clz.isAssignableFrom(pojoClass);
  }

  /** Type test against {@link Key}; does not require the POJO.
   *  @return True if the Value's POJO is a {@link Key} subtype. */
  public boolean isKey()      { return pojoIsA(Key.class); }
  /** Type test against {@link Frame}; does not require the POJO.
   *  @return True if the Value's POJO is a {@link Frame} subtype. */
  public boolean isFrame()    { return pojoIsA(Frame.class); }
  /** Type test against {@link water.fvec.Vec.VectorGroup}, done by comparing
   *  the type-id directly; does not require the POJO.
   *  @return True if the Value's type-id is the VectorGroup type-id. */
  public boolean isVecGroup() { return TypeMap.VECGROUP == _type; }
  /** Type test against {@link Lockable}; does not require the POJO.
   *  @return True if the Value's POJO is a {@link Lockable} subtype. */
  public boolean isLockable() { return pojoIsA(Lockable.class); }
  /** Type test against {@link Vec}; does not require the POJO.
   *  @return True if the Value's POJO is a {@link Vec} subtype. */
  public boolean isVec()      { return pojoIsA(Vec.class); }
  /** Type test against {@link hex.Model}; does not require the POJO.
   *  @return True if the Value's POJO is a {@link hex.Model} subtype. */
  public boolean isModel()    { return pojoIsA(hex.Model.class); }
  /** Type test against {@link Job}; does not require the POJO.
   *  @return True if the Value's POJO is a {@link Job} subtype. */
  public boolean isJob()      { return pojoIsA(Job.class); }

  // Shared test behind the is*() predicates: false for the PRIM_B type-id,
  // otherwise an instance check on the TypeMap's Freezable for this type-id.
  private boolean pojoIsA(Class<?> clz) {
    return _type != TypeMap.PRIM_B && clz.isInstance(TypeMap.theFreezable(_type));
  }

  /** Runtime class of the TypeMap's Freezable for this Value's type-id.
   *  Does not require the POJO.
   *  @return the class of {@code TypeMap.theFreezable(_type)} */
  public Class theFreezableClass() { return TypeMap.theFreezable(this._type).getClass(); }

  // --------------------------------------------------------------------------

  /** Construct a Value from all parts; not needed for most uses.  This special
   *  constructor is used by {@link water.fvec} to build Value objects over
   *  already-existing Files, so that the File contents will be lazily
   *  swapped-in as the Values are first used.
   *  @param k    the Key this Value is mapped under
   *  @param max  size of the serialized bytes; asserted to be below {@link #MAX}
   *  @param mem  the serialized bytes, or null if not in memory; when non-null
   *              its length is asserted to equal {@code max}
   *  @param type TypeMap type-id of the serialized object
   *  @param be   persist byte: backend flavor plus, for non-ICE backends, the
   *              on-disk flag */
  public Value(Key k, int max, byte[] mem, short type, byte be ) {
    assert mem==null || mem.length==max;
    assert max < MAX : "Value size=0x"+Integer.toHexString(max);
    _key = k;
    _max = max;
    _mem = mem;
    _type = type;
    _pojo = null;
    // For the ICE backend, assume new values are not-yet-written: keep only
    // the backend bits, which drops the on-disk flag.
    // For HDFS & NFS backends, assume we come from global data and preserve
    // the passed-in persist bits.
    byte p = (byte)(be&BACKEND_MASK);
    _persist = (p==ICE) ? p : be;
    _rwlock = new AtomicInteger(0); // Starts unlocked
    _replicas = null;               // Replica list is allocated lazily
  }
  // Wrap raw bytes: the type-id is PRIM_B, the size is mem.length, the backend is ICE.
  Value(Key k, byte[] mem ) { this(k, mem.length, mem, TypeMap.PRIM_B, ICE); }
  // Wrap a String as raw bytes.
  // NOTE(review): getBytes() with no argument encodes with the JVM default
  // charset - confirm that readers decode the same way on every node.
  Value(Key k, String s ) { this(k, s.getBytes()); }
  // Wrap an Iced POJO on the default ICE backend.
  Value(Key k, Iced pojo ) { this(k,pojo,ICE); }
  // Build a Value from an Iced POJO and a backend.  The POJO is cached as-is
  // and also serialized eagerly into _mem; _max is the serialized length.
  Value(Key k, Iced pojo, byte be ) {
    _key = k;
    _pojo = pojo;
    _type = (short)pojo.frozenType();
    // Chunks hand over their bytes directly; everything else is serialized
    // through a fresh AutoBuffer.
    _mem = (pojo instanceof Chunk)?((Chunk)pojo).getBytes():pojo.write(new AutoBuffer()).buf();
    _max = _mem.length;
    assert _max < MAX : "Value size = " + _max + " (0x"+Integer.toHexString(_max) + ") >= (MAX=" + MAX + ").";
    // For the ICE backend, assume new values are not-yet-written: keep only
    // the backend bits, which drops the on-disk flag.
    // For HDFS & NFS backends, assume we come from global data and preserve
    // the passed-in persist bits.
    byte p = (byte)(be&BACKEND_MASK);
    _persist = (p==ICE) ? p : be;
    _rwlock = new AtomicInteger(0); // Starts unlocked
    _replicas = null;               // Replica list is allocated lazily
  }
  /** Standard constructor to build a Value from a POJO and a Key.  */
  public Value(Key k, Freezable pojo) { this(k,pojo,ICE); }
  Value(Key k, Freezable pojo, byte be) {
    _key = k;
    _pojo = pojo;
    _type = (short)pojo.frozenType();
    _mem = pojo.write(new AutoBuffer()).buf();
    _max = _mem.length;
    byte p = (byte)(be&BACKEND_MASK);
    _persist = (p==ICE) ? p : be;
    _rwlock = new AtomicInteger(0);
    _replicas = null;
  }

  // Custom serializers: the _mem field is racily cleared by the MemoryManager
  // and the normal serializer then might ship over a null instead of the
  // intended byte[].  Also, the value is NOT on the deserialize'd machine's disk.
  // Wire format, in order: 1 byte of persist info, 2 bytes of type-id, then
  // the byte array from memOrLoad() (which rebuilds or reloads it if needed).
  // read_impl consumes the same three items in the same order.
  @Override public AutoBuffer write_impl( AutoBuffer ab ) {
    return ab.put1(_persist).put2(_type).putA1(memOrLoad());
  }
  // Custom deserializer, mirroring write_impl: reads the persist byte, the
  // type-id and the byte array, then sets _max from the _mem length and resets
  // the local-only state (POJO cache, rwlock, replicas, access time).
  @Override public Value read_impl(AutoBuffer bb) {
    assert _key == null;        // Not set yet
    _persist = bb.get1();       // Set persistence backend but...
    if( onICE() ) clrdsk();     // ... the on-disk flag is local, just deserialized thus not on MY disk
    _type = (short) bb.get2();
    _mem = bb.getA1();
    _max = _mem.length;
    assert _max < MAX : "Value size=0x"+Integer.toHexString(_max)+" during read is larger than "+Integer.toHexString(MAX)+", type: "+TypeMap.className(_type);
    _pojo = null;               // No inflated POJO yet; built lazily on first get
    // On remote nodes _rwlock is initialized to 0 (signaling a remote PUT is
    // in progress) flips to -1 when the remote PUT is done, or +1 if a notify
    // needs to happen.  A freshly deserialized Value starts out at -1.
    _rwlock = new AtomicInteger(-1); // Set as 'remote put is done'
    _replicas = null;
    touch();                    // Deserializing counts as an access
    return this;
  }

  // ---------------------
  // Ordering of K/V's!  This field tracks a bunch of things used in ordering
  // updates to the same Key.  Ordering Rules:
  // - Program Order.  You see your own writes.  All writes in a single thread
  //   strongly ordered (writes never roll back).  In particular can:
  //   PUT(v1), GET, PUT(null) and The Right Thing happens.
  // - Unrelated writes can race (unless fencing).
  // - Writes are not atomic: some people can see a write ahead of others.
  // - Last-write-wins: if we do a zillion writes to the same Key then wait "a
  //   long time", then do reads all reads will see the same last value.
  // - Blocking on a PUT stalls until the PUT is cloud-wide visible
  //
  // For comparison to H2O get/put MM
  // IA Memory Ordering,  8 principles from Rich Hudson, Intel
  // 1. Loads are not reordered with other loads
  // 2. Stores are not reordered with other stores
  // 3. Stores are not reordered with older loads
  // 4. Loads may be reordered with older stores to different locations but not
  //    with older stores to the same location
  // 5. In a multiprocessor system, memory ordering obeys causality (memory
  //    ordering respects transitive visibility).
  // 6. In a multiprocessor system, stores to the same location have a total order
  // 7. In a multiprocessor system, locked instructions have a total order
  // 8. Loads and stores are not reordered with locked instructions.
  //
  // My (KN, CNC) interpretation of H2O MM from today:
  // 1. Gets are not reordered with other Gets
  // 2  Puts may be reordered with Puts to different Keys.
  // 3. Puts may be reordered with older Gets to different Keys, but not with
  //    older Gets to the same Key.
  // 4. Gets may be reordered with older Puts to different Keys but not with
  //    older Puts to the same Key.
  // 5. Get/Put amongst threads doesn't obey causality
  // 6. Puts to the same Key have a total order.
  // 7. no such thing. although RMW operation exists with Put-like constraints.
  // 8. Gets and Puts may be reordered with RMW operations
  // 9. A write barrier exists that creates Sequential Consistency.  Same-key
  //    ordering (3-4) can't be used to create the effect.
  //
  // A Reader/Writer lock for the home node to control racing Gets and Puts.
  // - 0 for unlocked
  // - +N for locked by N concurrent GETs-in-flight
  // - -1 for write-locked
  //
  // An ACKACK from the client GET lowers the reader lock count.
  //
  // Home node PUTs alter which Value is mapped to a Key, then they block until
  // there are no active GETs, then atomically set the write-lock, then send
  // out invalidates to all the replicas.  PUTs return when all invalidates
  // have reported back.
  //
  // An initial remote PUT will default the value to 0.  A 2nd PUT attempt will
  // block until the 1st one completes (multiple writes to the same Key from
  // the same JVM block, so there is at most 1 outstanding write to the same
  // Key from the same JVM).  The 2nd PUT will CAS the value to 1, indicating
  // the need for the finishing 1st PUT to call notify().
  //
  // Note that this sequence involves a lot of blocking on repeated writes with
  // cached readers, but not the readers - i.e., writes are slow to complete.
  private transient AtomicInteger _rwlock;
  // Every transition of the lock word in this file goes through this
  // compare-and-set.  'msg' names the transition and is only useful for debug
  // tracing, e.g. printing _key, old, nnn and msg after a successful CAS.
  private boolean RW_CAS( int old, int nnn, String msg ) {
    return _rwlock.compareAndSet(old,nnn);
  }
  // List of who is replicated where: one byte per node, indexed by the node's
  // _unique_idx.  Null until the first replica is recorded.
  private volatile byte[] _replicas;
  // Typed updater used to install the replica array exactly once via CAS.
  private static final AtomicReferenceFieldUpdater<Value,byte[]> REPLICAS_UPDATER =
    AtomicReferenceFieldUpdater.newUpdater(Value.class,byte[].class, "_replicas");
  // Fills in the _replicas field atomically, on first set of a replica.
  // Returns the installed array, whether this thread or a racing one won.
  private byte[] replicas( ) {
    byte[] r = _replicas;
    if( r != null ) return r;
    byte[] nr = new byte[H2O.CLOUD.size()+1/*1-based numbering*/+10/*limit of 10 clients*/];
    if( REPLICAS_UPDATER.compareAndSet(this,null,nr) ) return nr;
    r = _replicas/*read again, since CAS failed must be set now*/;
    assert r!= null;
    return r;
  }

  /** Atomically insert h2o into the replica list; reports false if the Value
   *  flagged against future replication with a -1.  Also bumps the active
   *  Get count, which remains until the Get completes (we receive an ACKACK).
   *  @param h2o the remote node receiving a copy; asserted not to be SELF
   *  @return true if the read-lock was taken and the replica recorded; false
   *          if the Value is write-locked */
  boolean setReplica( H2ONode h2o ) {
    assert _key.home(); // Only the HOME node for a key tracks replicas
    assert h2o != H2O.SELF;     // Do not track self as a replica
    while( true ) {     // Repeat, in case racing GETs are bumping the counter
      int old = _rwlock.get();
      if( old == -1 ) return false; // Write-locked; no new replications.  Read fails to read *this* value
      assert old >= 0;              // Not negative
      if( RW_CAS(old,old+1,"rlock+") ) break; // Reader count bumped; a failed CAS retries
    }
    // Narrow non-race here.  Here is a time window where the rwlock count went
    // up, but the replica list does not account for the new replica.  However,
    // the rwlock cannot go down until an ACKACK is received, and the ACK
    // (hence ACKACK) doesn't go out until after this function returns.
    replicas()[h2o._unique_idx] = 1;
    // Both rwlock taken, and replica count is up now.
    return true;
  }

  /** Atomically lower active GET count.  When the count reaches zero, wakes
   *  every thread waiting on this Value's monitor.
   *  @param h2o the remote node whose GET has completed; asserted not to be
   *             SELF and to already be marked in the replica list */
  void lowerActiveGetCount( H2ONode h2o ) {
    assert _key.home();    // Only the HOME node for a key tracks replicas
    assert h2o != H2O.SELF;// Do not track self as a replica
    while( true ) {        // Repeat, in case racing GETs are bumping the counter
      int old = _rwlock.get(); // Read the lock-word
      assert old > 0;      // Since lowering, must be at least 1
      assert old != -1;    // Not write-locked, because we are an active reader
      assert _replicas!=null && _replicas[h2o._unique_idx]==1; // This node's replica byte is set
      if( RW_CAS(old,old-1,"rlock-") ) {
        if( old-1 == 0 )   // GET count fell to zero?
          synchronized( this ) { notifyAll(); } // Notify any pending blocked PUTs
        return;            // Count lowered, done; a failed CAS loops and retries
      }
    }
  }

  /** This value was atomically extracted from the local STORE by a successful
   *  TaskPutKey attempt (only 1 thread can ever extract and thus call here).
   *  No future lookups will find this Value, but there may be existing uses.
   *  Atomically set the rwlock count to -1 locking it from further GETs and
   *  ship out invalidates to caching replicas.  May need to block on active
   *  GETs.  Updates a set of Future invalidates that can be blocked against. */
  Futures lockAndInvalidate( H2ONode sender, Futures fs ) {
    assert _key.home(); // Only the HOME node for a key tracks replicas
    // Write-Lock against further GETs
    while( true ) {      // Repeat, in case racing GETs are bumping the counter
      int old = _rwlock.get();
      assert old >= 0 : _key+", rwlock="+old;  // Count does not go negative
      assert old != -1; // Only the thread doing a PUT ever locks
      if( old !=0 ) { // has readers?
        // Active readers: need to block until the GETs (of this very Value!)
        // all complete, before we can invalidate this Value - lest a racing
        // Invalidate bypass a GET.
        try { ForkJoinPool.managedBlock(this); } catch( InterruptedException ignore ) { }
      } else if( RW_CAS(0,-1,"wlock") )
        break;                  // Got the write-lock!
    }
    // We have the set of Nodes with replicas now.  Ship out invalidates.
    byte[] r = _replicas;
    if( r==null ) return fs; // No replicas, nothing to invalidate
    int max = r.length;
    for( int i=0; i