/*
* Copyright (c) 2019-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package ai.rapids.cudf;
import java.io.File;
import java.util.concurrent.TimeUnit;
/**
* This is the binding class for the RMM (RAPIDS Memory Manager) library.
*/
public class Rmm {
private static volatile RmmTrackingResourceAdaptor tracker = null;
private static volatile RmmDeviceMemoryResource deviceResource = null;
private static volatile boolean initialized = false;
private static volatile long poolSize = -1;
private static volatile boolean poolingEnabled = false;
static {
NativeDepsLoader.loadNativeDeps();
}
enum LogLoc {
NONE(0),
FILE(1),
STDOUT(2),
STDERR(3);
final int internalId;
LogLoc(int internalId) {
this.internalId = internalId;
}
}
/**
* Where to send RMM alloc and free logs.
*/
public static class LogConf {
final File file;
final LogLoc loc;
private LogConf(File file, LogLoc loc) {
this.file = file;
this.loc = loc;
}
}
/**
* Create a config that will write alloc/free logs to a file.
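* For example, a minimal sketch (the log file path is only illustrative):
* <pre>{@code
*   LogConf conf = Rmm.logTo(new File("/tmp/rmm_log.txt"));
* }</pre>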
*/
public static LogConf logTo(File location) {
return new LogConf(location, LogLoc.FILE);
}
/**
* Create a config that will write alloc/free logs to stdout.
*/
public static LogConf logToStdout() {
return new LogConf(null, LogLoc.STDOUT);
}
/**
* Create a config that will write alloc/free logs to stderr.
*/
public static LogConf logToStderr() {
return new LogConf(null, LogLoc.STDERR);
}
/**
* Get the RmmDeviceMemoryResource that was last set through the Java APIs. This will
* not return the correct value if the resource was set outside of the Java APIs, and it
* will return null if the resource was never set through the Java APIs.
*/
public static synchronized RmmDeviceMemoryResource getCurrentDeviceResource() {
return deviceResource;
}
/**
* Get the currently set RmmTrackingResourceAdaptor. This might return null if
* RMM has not been initialized.
*/
public static synchronized RmmTrackingResourceAdaptor getTracker() {
return tracker;
}
/**
* Set the current device resource that RMM should use for all allocations and de-allocations.
* This should only be done if you are confident that the current device resource has no
* pending allocations. Note that the caller of this is responsible for closing the current
* RmmDeviceMemoryResource that is returned by this, assuming that it was not used to create
* the newResource. Please use the `shutdown` API to clear the resource, as it does a best
* effort clean up before shutting it down. If `newResource` is not null this will initialize
* the CUDA context for the calling thread if it is not already set. The caller is responsible
* for setting the desired CUDA device prior to this call if a specific device is already set.
* NOTE: All cudf methods will set the chosen CUDA device in the CUDA context of the calling
* thread after this returns, provided `newResource` was not null.
*
* If `newResource` is null this will unset the default CUDA device and mark RMM as not
* initialized.
*
* Be aware that for many of these APIs to work the RmmDeviceMemoryResource will need an
* `RmmTrackingResourceAdaptor`. If one is not found and `newResource` is not null, one will
* be added on top of `newResource`.
*
* Also be very careful with how you set this up. It is possible to set up an
* RmmDeviceMemoryResource that is just bad, like multiple pools, or a pool on top of an
* RmmCudaAsyncMemoryResource, which does pooling already. Unless you know what you are doing
* it is best to just use the `initialize` API instead.
*
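* For example, a minimal sketch of wrapping the current resource with a limiting adaptor
* (the limit and alignment values are only illustrative):
* <pre>{@code
*   RmmDeviceMemoryResource current = Rmm.getCurrentDeviceResource();
*   RmmDeviceMemoryResource limited =
*       new RmmLimitingResourceAdaptor<>(current, 4L * 1024 * 1024 * 1024, 256);
*   // `current` was used to build `limited`, so do not close the resource returned here.
*   Rmm.setCurrentDeviceResource(limited, current, false);
* }</pre>
*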
* @param newResource the new resource to set. If it is null an RmmCudaMemoryResource will be
* used, and RMM will be set as not initialized.
* @param expectedResource the resource that we expect to be set. This is to let us avoid race
* conditions with multiple things trying to set this at once. It should
* never happen, but just to be careful.
* @param forceChange if true then the expectedResource check is not done.
*/
public static synchronized RmmDeviceMemoryResource setCurrentDeviceResource(
RmmDeviceMemoryResource newResource,
RmmDeviceMemoryResource expectedResource,
boolean forceChange) {
boolean shouldInit = false;
boolean shouldDeinit = false;
RmmDeviceMemoryResource newResourceToSet = newResource;
if (newResourceToSet == null) {
// We always want it to be set to something or else it can cause problems...
newResourceToSet = new RmmCudaMemoryResource();
if (initialized) {
shouldDeinit = true;
}
} else if (!initialized) {
shouldInit = true;
}
RmmDeviceMemoryResource oldResource = deviceResource;
if (!forceChange && expectedResource != null && deviceResource != null) {
long expectedOldHandle = expectedResource.getHandle();
long oldHandle = deviceResource.getHandle();
if (oldHandle != expectedOldHandle) {
throw new RmmException("The expected device resource is not correct " +
Long.toHexString(oldHandle) + " != " + Long.toHexString(expectedOldHandle));
}
}
poolSize = -1;
poolingEnabled = false;
setGlobalValsFromResource(newResourceToSet);
if (newResource != null && tracker == null) {
// No tracker was set, but we need one
tracker = new RmmTrackingResourceAdaptor<>(newResourceToSet, 256);
newResourceToSet = tracker;
}
long newHandle = newResourceToSet.getHandle();
setCurrentDeviceResourceInternal(newHandle);
deviceResource = newResource;
if (shouldInit) {
initDefaultCudaDevice();
MemoryCleaner.setDefaultGpu(Cuda.getDevice());
initialized = true;
}
if (shouldDeinit) {
cleanupDefaultCudaDevice();
initialized = false;
}
return oldResource;
}
private static void setGlobalValsFromResource(RmmDeviceMemoryResource resource) {
if (resource instanceof RmmTrackingResourceAdaptor) {
Rmm.tracker = (RmmTrackingResourceAdaptor) resource;
} else if (resource instanceof RmmPoolMemoryResource) {
Rmm.poolSize = Math.max(((RmmPoolMemoryResource)resource).getMaxSize(), Rmm.poolSize);
Rmm.poolingEnabled = true;
} else if (resource instanceof RmmArenaMemoryResource) {
Rmm.poolSize = Math.max(((RmmArenaMemoryResource)resource).getSize(), Rmm.poolSize);
Rmm.poolingEnabled = true;
} else if (resource instanceof RmmCudaAsyncMemoryResource) {
Rmm.poolSize = Math.max(((RmmCudaAsyncMemoryResource)resource).getSize(), Rmm.poolSize);
Rmm.poolingEnabled = true;
}
// Recurse as needed
if (resource instanceof RmmWrappingDeviceMemoryResource) {
setGlobalValsFromResource(((RmmWrappingDeviceMemoryResource)resource).getWrapped());
}
}
/**
* Initialize memory manager state and storage. This will always initialize
* the CUDA context for the calling thread if it is not already set. The
* caller is responsible for setting the desired CUDA device prior to this
* call if a specific device is already set.
* NOTE: All cudf methods will set the chosen CUDA device in the CUDA
* context of the calling thread after this returns.
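* For example, a minimal sketch of initializing a pooled allocator with logging to stderr
* (the pool size is only illustrative):
* <pre>{@code
*   Rmm.initialize(RmmAllocationMode.POOL, Rmm.logToStderr(), 512L * 1024 * 1024);
* }</pre>
*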
* @param allocationMode Allocation strategy to use. Bit set using
* {@link RmmAllocationMode#CUDA_DEFAULT},
* {@link RmmAllocationMode#POOL},
* {@link RmmAllocationMode#ARENA},
* {@link RmmAllocationMode#CUDA_ASYNC} and
* {@link RmmAllocationMode#CUDA_MANAGED_MEMORY}
* @param logConf How to do logging, or null if you do not want logging
* @param poolSize The initial pool size in bytes
* @throws IllegalStateException if RMM has already been initialized
*/
public static synchronized void initialize(int allocationMode, LogConf logConf, long poolSize)
throws RmmException {
if (initialized) {
throw new IllegalStateException("RMM is already initialized");
}
boolean isPool = (allocationMode & RmmAllocationMode.POOL) != 0;
boolean isArena = (allocationMode & RmmAllocationMode.ARENA) != 0;
boolean isAsync = (allocationMode & RmmAllocationMode.CUDA_ASYNC) != 0;
boolean isManaged = (allocationMode & RmmAllocationMode.CUDA_MANAGED_MEMORY) != 0;
if (isAsync && isManaged) {
throw new IllegalArgumentException(
"CUDA Unified Memory is not supported in CUDA_ASYNC allocation mode");
}
RmmDeviceMemoryResource resource = null;
boolean succeeded = false;
try {
if (isPool) {
if (isManaged) {
resource = new RmmPoolMemoryResource<>(new RmmManagedMemoryResource(), poolSize, poolSize);
} else {
resource = new RmmPoolMemoryResource<>(new RmmCudaMemoryResource(), poolSize, poolSize);
}
} else if (isArena) {
if (isManaged) {
resource = new RmmArenaMemoryResource<>(new RmmManagedMemoryResource(), poolSize, false);
} else {
resource = new RmmArenaMemoryResource<>(new RmmCudaMemoryResource(), poolSize, false);
}
} else if (isAsync) {
resource = new RmmLimitingResourceAdaptor<>(
new RmmCudaAsyncMemoryResource(poolSize, poolSize), poolSize, 512);
} else if (isManaged) {
resource = new RmmManagedMemoryResource();
} else {
resource = new RmmCudaMemoryResource();
}
if (logConf != null && logConf.loc != LogLoc.NONE) {
resource = new RmmLoggingResourceAdaptor<>(resource, logConf, true);
}
resource = new RmmTrackingResourceAdaptor<>(resource, 256);
setCurrentDeviceResource(resource, null, false);
succeeded = true;
} finally {
if (!succeeded && resource != null) {
resource.close();
}
}
}
/**
* Sets the size of the cuDF default pinned pool.
*
* @note This has to be called before cuDF functions are executed.
*
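* For example, a minimal sketch of configuring a 256 MiB pinned pool (the size is only
* illustrative):
* <pre>{@code
*   boolean configured = Rmm.configureDefaultCudfPinnedPoolSize(256L * 1024 * 1024);
* }</pre>
*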
* @param size initial and maximum size for the cuDF default pinned pool.
* Pass size=0 to disable the default pool.
*
* @return true if we were able to set up the default resource, false if there was
* a resource already set.
*/
public static synchronized native boolean configureDefaultCudfPinnedPoolSize(long size);
/**
* Get the most recently set pool size or -1 if RMM has not been initialized or pooling is
* not enabled.
*/
public static synchronized long getPoolSize() {
return poolSize;
}
/**
* Return true if RMM is initialized and pooling has been enabled, else false.
*/
public static synchronized boolean isPoolingEnabled() {
return poolingEnabled;
}
/**
* Check if RMM has been initialized already or not.
*/
public static boolean isInitialized() throws RmmException {
return initialized;
}
/**
* Return the amount of RMM memory allocated in bytes. Note that the result
* may be less than the actual amount of allocated memory if the underlying RMM
* allocator decides to return more memory than what was requested. However,
* the result will always be a lower bound on the amount allocated.
*/
public static synchronized long getTotalBytesAllocated() {
if (tracker == null) {
return 0;
} else {
return tracker.getTotalBytesAllocated();
}
}
/**
* Returns the maximum amount of RMM memory (Bytes) outstanding during the
* lifetime of the process.
*/
public static synchronized long getMaximumTotalBytesAllocated() {
if (tracker == null) {
return 0;
} else {
return tracker.getMaxTotalBytesAllocated();
}
}
/**
* Resets a scoped maximum counter of RMM memory used to keep track of usage between
* code sections while debugging.
*
* @param initialValue an initial value (in Bytes) to use for this scoped counter
*/
public static synchronized void resetScopedMaximumBytesAllocated(long initialValue) {
if (tracker != null) {
tracker.resetScopedMaxTotalBytesAllocated(initialValue);
}
}
/**
* Resets a scoped maximum counter of RMM memory used to keep track of usage between
* code sections while debugging.
*
* This resets the counter to 0 Bytes.
*/
public static synchronized void resetScopedMaximumBytesAllocated() {
if (tracker != null) {
tracker.resetScopedMaxTotalBytesAllocated(0L);
}
}
/**
* Returns the maximum amount of RMM memory (Bytes) outstanding since the last
* `resetScopedMaximumBytesAllocated` call was issued (it is "scoped" because it's the
* maximum amount seen since the last reset).
*
* If the memory used is net negative (for example if only frees happened since the
* reset, and we reset to 0), then the result will be 0.
*
* If `resetScopedMaximumBytesAllocated` is never called, the scope is the whole
* program and is equivalent to `getMaximumTotalBytesAllocated`.
*
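* For example, a minimal sketch of measuring peak usage across a section of code (the
* allocation size and the work done with the buffer are only illustrative):
* <pre>{@code
*   Rmm.resetScopedMaximumBytesAllocated();
*   try (DeviceMemoryBuffer buf = Rmm.alloc(1024)) {
*     // ... do GPU work that allocates and frees device memory ...
*   }
*   long peakBytes = Rmm.getScopedMaximumBytesAllocated();
* }</pre>
*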
* @return the scoped maximum bytes allocated
*/
public static synchronized long getScopedMaximumBytesAllocated() {
if (tracker == null) {
return 0L;
} else {
return tracker.getScopedMaxTotalBytesAllocated();
}
}
/**
* Sets the event handler to be called on RMM events (e.g.: allocation failure).
* @param handler event handler to invoke on RMM events or null to clear an existing handler
* @throws RmmException if an active handler is already set
*/
public static void setEventHandler(RmmEventHandler handler) throws RmmException {
setEventHandler(handler, false);
}
/**
* Sets the event handler to be called on RMM events (e.g.: allocation failure) and
* optionally enable debug mode (callbacks on every allocate and deallocate)
*
* NOTE: Only enable debug mode when necessary, as code will run much slower!
*
* @param handler event handler to invoke on RMM events or null to clear an existing handler
* @param enableDebug if true enable debug callbacks in RmmEventHandler
* (onAllocated, onDeallocated)
* @throws RmmException if an active handler is already set
*/
public static synchronized void setEventHandler(RmmEventHandler handler,
boolean enableDebug) throws RmmException {
if (!initialized) {
throw new RmmException("RMM has not been initialized");
}
if (deviceResource instanceof RmmEventHandlerResourceAdaptor) {
throw new RmmException("Another event handler is already set");
}
if (tracker == null) {
// This is just to be safe it should always be true if this is initialized.
throw new RmmException("A tracker must be set for the event handler to work");
}
RmmEventHandlerResourceAdaptor newResource =
new RmmEventHandlerResourceAdaptor<>(deviceResource, tracker, handler, enableDebug);
boolean success = false;
try {
setCurrentDeviceResource(newResource, deviceResource, false);
success = true;
} finally {
if (!success) {
newResource.releaseWrapped();
}
}
}
/** Clears the active RMM event handler if one is set. */
public static synchronized void clearEventHandler() throws RmmException {
if (deviceResource != null && deviceResource instanceof RmmEventHandlerResourceAdaptor) {
RmmEventHandlerResourceAdaptor orig =
(RmmEventHandlerResourceAdaptor)deviceResource;
boolean success = false;
try {
setCurrentDeviceResource(orig.wrapped, orig, false);
success = true;
} finally {
if (success) {
orig.releaseWrapped();
}
}
}
}
public static native void initDefaultCudaDevice();
public static native void cleanupDefaultCudaDevice();
/**
* Shut down any initialized RMM instance. This should be used very rarely. It does not need to
* be used when shutting down your process because CUDA will handle releasing all of the
* resources when your process exits. This really should only be used if you want to turn off the
* memory pool for some reason. As such we make an effort to be sure no resources have been
* leaked before shutting down. This may involve forcing a JVM GC to collect any leaked java
* objects that still point to CUDA memory. By default this will do a gc every 2 seconds and
* wait for up to 4 seconds before throwing an RmmException if not all of the resources are freed.
* @throws RmmException on any error. This includes if there are outstanding allocations that
* could not be collected.
*/
public static void shutdown() throws RmmException {
shutdown(2, 4, TimeUnit.SECONDS);
}
/**
* Shut down any initialized RMM instance. This should be used very rarely. It does not need to
* be used when shutting down your process because CUDA will handle releasing all of the
* resources when your process exits. This really should only be used if you want to turn off the
* memory pool for some reason. As such we make an effort to be sure no resources have been
* leaked before shutting down. This may involve forcing a JVM GC to collect any leaked java
* objects that still point to CUDA memory.
*
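* For example, a minimal sketch that forces a GC every second and waits up to ten seconds
* for leaked objects to be collected (the timings are only illustrative):
* <pre>{@code
*   Rmm.shutdown(1, 10, TimeUnit.SECONDS);
* }</pre>
*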
* @param forceGCInterval how frequently should we force a JVM GC. This is just a recommendation
* to the JVM to do a gc.
* @param maxWaitTime the maximum amount of time to wait for all objects to be collected before
* throwing an exception.
* @param units the units for forceGcInterval and maxWaitTime.
* @throws RmmException on any error. This includes if there are outstanding allocations that
* could not be collected before maxWaitTime.
*/
public static synchronized void shutdown(long forceGCInterval, long maxWaitTime, TimeUnit units)
throws RmmException {
long now = System.currentTimeMillis();
final long endTime = now + units.toMillis(maxWaitTime);
long nextGcTime = now;
try {
if (MemoryCleaner.bestEffortHasRmmBlockers()) {
do {
if (nextGcTime <= now) {
System.gc();
nextGcTime = nextGcTime + units.toMillis(forceGCInterval);
}
// Check if everything is ready about every 10 ms
Thread.sleep(10);
now = System.currentTimeMillis();
} while (endTime > now && MemoryCleaner.bestEffortHasRmmBlockers());
}
} catch (InterruptedException e) {
// Ignored
}
if (MemoryCleaner.bestEffortHasRmmBlockers()) {
throw new RmmException("Could not shut down RMM there appear to be outstanding allocations");
}
if (initialized) {
if (deviceResource != null) {
setCurrentDeviceResource(null, deviceResource, true).close();
}
}
}
/**
* Allocate device memory and return a pointer to device memory, using stream 0.
* @param size The size in bytes of the allocated memory region
* @return Returned pointer to the allocated memory
*/
public static DeviceMemoryBuffer alloc(long size) {
return alloc(size, null);
}
/**
* Allocate device memory and return a pointer to device memory.
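* For example, a minimal sketch (the allocation size is only illustrative, and
* Cuda.DEFAULT_STREAM is assumed to be the default stream exposed by this package's Cuda
* class; the buffer is freed when the try-with-resources block closes):
* <pre>{@code
*   try (DeviceMemoryBuffer buf = Rmm.alloc(1024, Cuda.DEFAULT_STREAM)) {
*     // ... use buf on the same stream ...
*   }
* }</pre>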
* @param size The size in bytes of the allocated memory region
* @param stream The stream in which to synchronize this command.
* @return Returned pointer to the allocated memory
*/
public static DeviceMemoryBuffer alloc(long size, Cuda.Stream stream) {
long s = stream == null ? 0 : stream.getStream();
return new DeviceMemoryBuffer(allocInternal(size, s), size, stream);
}
private static native long allocInternal(long size, long stream) throws RmmException;
static native void free(long ptr, long length, long stream) throws RmmException;
/**
* Delete an rmm::device_buffer.
*/
static native void freeDeviceBuffer(long rmmBufferAddress) throws RmmException;
/**
* Allocate device memory using `cudaMalloc` and return a pointer to device memory.
* @param size The size in bytes of the allocated memory region
* @param stream The stream in which to synchronize this command.
* @return Returned pointer to the allocated memory
*/
public static CudaMemoryBuffer allocCuda(long size, Cuda.Stream stream) {
long s = stream == null ? 0 : stream.getStream();
return new CudaMemoryBuffer(allocCudaInternal(size, s), size, stream);
}
private static native long allocCudaInternal(long size, long stream) throws RmmException;
static native void freeCuda(long ptr, long length, long stream) throws RmmException;
static native long newCudaMemoryResource() throws RmmException;
static native void releaseCudaMemoryResource(long handle);
static native long newManagedMemoryResource() throws RmmException;
static native void releaseManagedMemoryResource(long handle);
static native long newPoolMemoryResource(long childHandle,
long initSize, long maxSize) throws RmmException;
static native void releasePoolMemoryResource(long handle);
static native long newArenaMemoryResource(long childHandle,
long size, boolean dumpOnOOM) throws RmmException;
static native void releaseArenaMemoryResource(long handle);
static native long newCudaAsyncMemoryResource(long size, long release) throws RmmException;
static native void releaseCudaAsyncMemoryResource(long handle);
static native long newLimitingResourceAdaptor(long handle, long limit, long align) throws RmmException;
static native void releaseLimitingResourceAdaptor(long handle);
static native long newLoggingResourceAdaptor(long handle, int type, String path,
boolean autoFlush) throws RmmException;
static native void releaseLoggingResourceAdaptor(long handle);
static native long newTrackingResourceAdaptor(long handle, long alignment) throws RmmException;
static native void releaseTrackingResourceAdaptor(long handle);
static native long nativeGetTotalBytesAllocated(long handle);
static native long nativeGetMaxTotalBytesAllocated(long handle);
static native void nativeResetScopedMaxTotalBytesAllocated(long handle, long initValue);
static native long nativeGetScopedMaxTotalBytesAllocated(long handle);
static native long newEventHandlerResourceAdaptor(long handle, long trackerHandle,
RmmEventHandler handler, long[] allocThresholds, long[] deallocThresholds, boolean debug);
static native long releaseEventHandlerResourceAdaptor(long handle, boolean debug);
private static native void setCurrentDeviceResourceInternal(long newHandle);
public static native long newPinnedPoolMemoryResource(long initSize, long maxSize);
public static native long setCudfPinnedPoolMemoryResource(long poolPtr);
public static native void releasePinnedPoolMemoryResource(long poolPtr);
public static native long allocFromPinnedPool(long poolPtr, long size);
public static native void freeFromPinnedPool(long poolPtr, long ptr, long size);
// only for tests
public static native long allocFromFallbackPinnedPool(long size);
// only for tests
public static native void freeFromFallbackPinnedPool(long ptr, long size);
}