All Downloads are FREE. Search and download functionalities are using the official Maven repository.

water.HeartBeatThread Maven / Gradle / Ivy

There is a newer version: 3.8.2.9
Show newest version
package water;

import java.lang.management.ManagementFactory;
import javax.management.*;
import water.util.LinuxProcFileReader;
import water.util.Log;
import water.init.*;

/**
 * Starts a thread publishing multicast HeartBeats to the local subnet: the
 * Leader of this Cloud.
 *
 * @author 
 * @version 1.0
 */
public class HeartBeatThread extends Thread {
  public HeartBeatThread() {
    super("Heartbeat");
    setDaemon(true);
  }

  // Time between heartbeats.  Strictly several iterations less than the
  // timeout.
  static final int SLEEP = 1000;

  // Timeout in msec before we decide to not include a Node in the next round
  // of Paxos Cloud Membership voting.
  static public final int TIMEOUT = 60000;

  // Timeout in msec before we decide a Node is suspect, and call for a vote
  // to remove him.  This must be strictly greater than the TIMEOUT.
  static final int SUSPECT = TIMEOUT+500;

  // uniquely number heartbeats for better timelines
  static private char HB_VERSION;

  // Timeout in msec for all kinds of I/O operations on unresponsive clients.
  // Endlessly retry until this timeout, and then declare the client "dead", 
  // and toss out all in-flight client ops
  static public final int CLIENT_TIMEOUT=1000;

  // The Run Method.
  // Started by main() on a single thread, this code publishes Cloud membership
  // to the Cloud once a second (across all members).  If anybody disagrees
  // with the membership Heartbeat, they will start a round of Paxos group
  // discovery.
  public void run() {
    MBeanServer mbs = ManagementFactory.getPlatformMBeanServer();
    ObjectName os;
    try {
      os = new ObjectName("java.lang:type=OperatingSystem");
    } catch( MalformedObjectNameException e ) {
      throw Log.throwErr(e);
    }
    Thread.currentThread().setPriority(Thread.MAX_PRIORITY);
    int counter = 0;
    //noinspection InfiniteLoopStatement
    while( true ) {
      // Update the interesting health self-info for publication also
      H2O cloud = H2O.CLOUD;
      HeartBeat hb = H2O.SELF._heartbeat;
      hb._hb_version = HB_VERSION++;
      hb._jvm_boot_msec= TimeLine.JVM_BOOT_MSEC;

      // Run mini-benchmark every 5 mins.  However, on startup - do not have
      // all JVMs immediately launch a all-core benchmark - they will fight
      // with each other.  Stagger them using the hashcode.
      // Run this benchmark *before* testing the heap or GC, so the GC numbers
      // are current as of the send time.
      if( (counter+Math.abs(H2O.SELF.hashCode()*0xDECAF /*spread wider than 1 apart*/)) % (300/(Float.isNaN(hb._gflops)?10:1)) == 0) {
        hb._gflops   = (float)Linpack.run(hb._cpus_allowed);
        hb._membw    = (float)MemoryBandwidth.run(hb._cpus_allowed);
      }

      // Memory utilization as of last FullGC
      long kv_gc = Cleaner.KV_USED_AT_LAST_GC;
      long heap_gc = Cleaner.HEAP_USED_AT_LAST_GC;
      long pojo_gc = Math.max(heap_gc - kv_gc,0);
      long kv_mem = Cleaner.Histo.cached(); // More current than last FullGC numbers; can skyrocket
      // Since last FullGC, assuming POJO remains constant and KV changed: new free memory
      long free_mem = Math.max(MemoryManager.MEM_MAX-kv_mem-pojo_gc,0);
      long pojo_mem = MemoryManager.MEM_MAX-kv_mem-free_mem;
      hb.set_kv_mem(kv_mem);
      hb.set_pojo_mem(pojo_mem);
      hb.set_free_mem(free_mem);
      hb.set_swap_mem(Cleaner.Histo.swapped());
      hb._keys = H2O.STORE.size();

      try {
        hb._system_load_average = ((Double)mbs.getAttribute(os, "SystemLoadAverage")).floatValue();
        if( hb._system_load_average == -1 )  // SystemLoadAverage not available on windows
          hb._system_load_average = ((Double)mbs.getAttribute(os, "SystemCpuLoad")).floatValue();
      } catch( Exception e ) {/*Ignore, data probably not available on this VM*/ }

      int rpcs = 0;
      for( H2ONode h2o : cloud._memary )
        rpcs += h2o.taskSize();
      hb._rpcs       = (char)rpcs;
      // Scrape F/J pool counts
      hb._fjthrds = new short[H2O.MAX_PRIORITY+1];
      hb._fjqueue = new short[H2O.MAX_PRIORITY+1];
      for( int i=0; i SUSPECT ) {// We suspect this Node has taken a dirt nap
          if( !h2o._announcedLostContact ) {
            Paxos.print("hart: announce suspect node",cloud._memary,h2o.toString());
            h2o._announcedLostContact = true;
          }
        } else if( h2o._announcedLostContact ) {
          Paxos.print("hart: regained contact with node",cloud._memary,h2o.toString());
          h2o._announcedLostContact = false;
        }
      }
      counter++;

      // Once per second, for the entire cloud a Node will multi-cast publish
      // itself, so other unrelated Clouds discover each other and form up.
      try { Thread.sleep(SLEEP); } // Only once-sec per entire Cloud
      catch( InterruptedException ignore ) { }
    }
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy