/*
* Copyright (c) 2019-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package ai.rapids.cudf;
import java.io.File;
import java.util.concurrent.TimeUnit;
/**
* This is the binding class for the RMM (RAPIDS Memory Manager) library.
*/
public class Rmm {
private static volatile RmmTrackingResourceAdaptor tracker = null;
private static volatile RmmDeviceMemoryResource deviceResource = null;
private static volatile boolean initialized = false;
private static volatile long poolSize = -1;
private static volatile boolean poolingEnabled = false;
static {
NativeDepsLoader.loadNativeDeps();
}
enum LogLoc {
NONE(0),
FILE(1),
STDOUT(2),
STDERR(3);
final int internalId;
LogLoc(int internalId) {
this.internalId = internalId;
}
}
/**
* Where to send RMM alloc and free logs.
*/
public static class LogConf {
final File file;
final LogLoc loc;
private LogConf(File file, LogLoc loc) {
this.file = file;
this.loc = loc;
}
}
/**
* Create a config that will write alloc/free logs to a file.
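* For example, a minimal sketch (the log file path is only illustrative):
* <pre>{@code
*   LogConf conf = Rmm.logTo(new File("/tmp/rmm_log.txt"));
* }</pre>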
*/
public static LogConf logTo(File location) {
return new LogConf(location, LogLoc.FILE);
}
/**
* Create a config that will write alloc/free logs to stdout.
*/
public static LogConf logToStdout() {
return new LogConf(null, LogLoc.STDOUT);
}
/**
* Create a config that will write alloc/free logs to stderr.
*/
public static LogConf logToStderr() {
return new LogConf(null, LogLoc.STDERR);
}
/**
* Get the RmmDeviceMemoryResource that was last set through the Java APIs. This will
* not return the correct value if the resource was set outside of the Java APIs, and it
* will return null if the resource was never set through the Java APIs.
*/
public static synchronized RmmDeviceMemoryResource getCurrentDeviceResource() {
return deviceResource;
}
/**
* Get the currently set RmmTrackingResourceAdaptor. This might return null if
* RMM has not been initialized.
*/
public static synchronized RmmTrackingResourceAdaptor getTracker() {
return tracker;
}
/**
* Set the current device resource that RMM should use for all allocations and de-allocations.
* This should only be done if you are confident that the current device resource has no
* pending allocations. Note that the caller of this is responsible for closing the current
* RmmDeviceMemoryResource that is returned by this, assuming that it was not used to create
* the newResource. Please use the `shutdown` API to clear the resource, as it does a best
* effort clean up before shutting it down. If `newResource` is not null this will initialize
* the CUDA context for the calling thread if it is not already set. The caller is responsible
* for setting the desired CUDA device prior to this call if a specific device is already set.
* NOTE: All cudf methods will set the chosen CUDA device in the CUDA context of the calling
* thread after this returns, provided `newResource` was not null.
*
* If `newResource` is null this will unset the default CUDA device and mark RMM as not
* initialized.
*
* Be aware that for many of these APIs to work the RmmDeviceMemoryResource will need an
* `RmmTrackingResourceAdaptor`. If one is not found and `newResource` is not null, one will
* be added on top of `newResource`.
*
* Also be very careful with how you set this up. It is possible to set up an
* RmmDeviceMemoryResource that is just bad, like multiple pools, or a pool on top of an
* RmmCudaAsyncMemoryResource, which does pooling already. Unless you know what you are doing
* it is best to just use the `initialize` API instead.
*
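* For example, a minimal sketch of wrapping the current resource with a limiting adaptor
* (the limit and alignment values are only illustrative):
* <pre>{@code
*   RmmDeviceMemoryResource current = Rmm.getCurrentDeviceResource();
*   RmmDeviceMemoryResource limited =
*       new RmmLimitingResourceAdaptor<>(current, 4L * 1024 * 1024 * 1024, 256);
*   // `current` was used to build `limited`, so do not close the resource returned here.
*   Rmm.setCurrentDeviceResource(limited, current, false);
* }</pre>
*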
* @param newResource the new resource to set. If it is null an RmmCudaMemoryResource will be
* used, and RMM will be set as not initialized.
* @param expectedResource the resource that we expect to be set. This is to let us avoid race
* conditions with multiple things trying to set this at once. It should
* never happen, but just to be careful.
* @param forceChange if true then the expectedResource check is not done.
*/
public static synchronized RmmDeviceMemoryResource setCurrentDeviceResource(
RmmDeviceMemoryResource newResource,
RmmDeviceMemoryResource expectedResource,
boolean forceChange) {
boolean shouldInit = false;
boolean shouldDeinit = false;
RmmDeviceMemoryResource newResourceToSet = newResource;
if (newResourceToSet == null) {
// We always want it to be set to something or else it can cause problems...
newResourceToSet = new RmmCudaMemoryResource();
if (initialized) {
shouldDeinit = true;
}
} else if (!initialized) {
shouldInit = true;
}
RmmDeviceMemoryResource oldResource = deviceResource;
if (!forceChange && expectedResource != null && deviceResource != null) {
long expectedOldHandle = expectedResource.getHandle();
long oldHandle = deviceResource.getHandle();
if (oldHandle != expectedOldHandle) {
throw new RmmException("The expected device resource is not correct " +
Long.toHexString(oldHandle) + " != " + Long.toHexString(expectedOldHandle));
}
}
poolSize = -1;
poolingEnabled = false;
setGlobalValsFromResource(newResourceToSet);
if (newResource != null && tracker == null) {
// No tracker was set, but we need one
tracker = new RmmTrackingResourceAdaptor<>(newResourceToSet, 256);
newResourceToSet = tracker;
}
long newHandle = newResourceToSet.getHandle();
setCurrentDeviceResourceInternal(newHandle);
deviceResource = newResource;
if (shouldInit) {
initDefaultCudaDevice();
MemoryCleaner.setDefaultGpu(Cuda.getDevice());
initialized = true;
}
if (shouldDeinit) {
cleanupDefaultCudaDevice();
initialized = false;
}
return oldResource;
}
private static void setGlobalValsFromResource(RmmDeviceMemoryResource resource) {
if (resource instanceof RmmTrackingResourceAdaptor) {
Rmm.tracker = (RmmTrackingResourceAdaptor) resource;
} else if (resource instanceof RmmPoolMemoryResource) {
Rmm.poolSize = Math.max(((RmmPoolMemoryResource)resource).getMaxSize(), Rmm.poolSize);
Rmm.poolingEnabled = true;
} else if (resource instanceof RmmArenaMemoryResource) {
Rmm.poolSize = Math.max(((RmmArenaMemoryResource)resource).getSize(), Rmm.poolSize);
Rmm.poolingEnabled = true;
} else if (resource instanceof RmmCudaAsyncMemoryResource) {
Rmm.poolSize = Math.max(((RmmCudaAsyncMemoryResource)resource).getSize(), Rmm.poolSize);
Rmm.poolingEnabled = true;
}
// Recurse as needed
if (resource instanceof RmmWrappingDeviceMemoryResource) {
setGlobalValsFromResource(((RmmWrappingDeviceMemoryResource)resource).getWrapped());
}
}
/**
* Initialize memory manager state and storage. This will always initialize
* the CUDA context for the calling thread if it is not already set. The
* caller is responsible for setting the desired CUDA device prior to this
* call if a specific device is already set.
* NOTE: All cudf methods will set the chosen CUDA device in the CUDA
* context of the calling thread after this returns.
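* For example, a minimal sketch of initializing a pooled allocator with logging to stderr
* (the pool size is only illustrative):
* <pre>{@code
*   Rmm.initialize(RmmAllocationMode.POOL, Rmm.logToStderr(), 512L * 1024 * 1024);
* }</pre>
*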
* @param allocationMode Allocation strategy to use. Bit set using
* {@link RmmAllocationMode#CUDA_DEFAULT},
* {@link RmmAllocationMode#POOL},
* {@link RmmAllocationMode#ARENA},
* {@link RmmAllocationMode#CUDA_ASYNC} and
* {@link RmmAllocationMode#CUDA_MANAGED_MEMORY}
* @param logConf How to do logging, or null if you do not want logging
* @param poolSize The initial pool size in bytes
* @throws IllegalStateException if RMM has already been initialized
*/
public static synchronized void initialize(int allocationMode, LogConf logConf, long poolSize)
throws RmmException {
if (initialized) {
throw new IllegalStateException("RMM is already initialized");
}
boolean isPool = (allocationMode & RmmAllocationMode.POOL) != 0;
boolean isArena = (allocationMode & RmmAllocationMode.ARENA) != 0;
boolean isAsync = (allocationMode & RmmAllocationMode.CUDA_ASYNC) != 0;
boolean isManaged = (allocationMode & RmmAllocationMode.CUDA_MANAGED_MEMORY) != 0;
if (isAsync && isManaged) {
throw new IllegalArgumentException(
"CUDA Unified Memory is not supported in CUDA_ASYNC allocation mode");
}
RmmDeviceMemoryResource resource = null;
boolean succeeded = false;
try {
if (isPool) {
if (isManaged) {
resource = new RmmPoolMemoryResource<>(new RmmManagedMemoryResource(), poolSize, poolSize);
} else {
resource = new RmmPoolMemoryResource<>(new RmmCudaMemoryResource(), poolSize, poolSize);
}
} else if (isArena) {
if (isManaged) {
resource = new RmmArenaMemoryResource<>(new RmmManagedMemoryResource(), poolSize, false);
} else {
resource = new RmmArenaMemoryResource<>(new RmmCudaMemoryResource(), poolSize, false);
}
} else if (isAsync) {
resource = new RmmLimitingResourceAdaptor<>(
new RmmCudaAsyncMemoryResource(poolSize, poolSize), poolSize, 512);
} else if (isManaged) {
resource = new RmmManagedMemoryResource();
} else {
resource = new RmmCudaMemoryResource();
}
if (logConf != null && logConf.loc != LogLoc.NONE) {
resource = new RmmLoggingResourceAdaptor<>(resource, logConf, true);
}
resource = new RmmTrackingResourceAdaptor<>(resource, 256);
setCurrentDeviceResource(resource, null, false);
succeeded = true;
} finally {
if (!succeeded && resource != null) {
resource.close();
}
}
}
/**
* Sets the size of the cuDF default pinned pool.
*
* @note This has to be called before cuDF functions are executed.
*
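* For example, a minimal sketch of configuring a 256 MiB pinned pool (the size is only
* illustrative):
* <pre>{@code
*   boolean configured = Rmm.configureDefaultCudfPinnedPoolSize(256L * 1024 * 1024);
* }</pre>
*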
* @param size initial and maximum size for the cuDF default pinned pool.
* Pass size=0 to disable the default pool.
*
* @return true if we were able to set up the default resource, false if there was
* a resource already set.
*/
public static synchronized native boolean configureDefaultCudfPinnedPoolSize(long size);
/**
* Get the most recently set pool size or -1 if RMM has not been initialized or pooling is
* not enabled.
*/
public static synchronized long getPoolSize() {
return poolSize;
}
/**
* Return true if RMM is initialized and pooling has been enabled, else false.
*/
public static synchronized boolean isPoolingEnabled() {
return poolingEnabled;
}
/**
* Check if RMM has been initialized already or not.
*/
public static boolean isInitialized() throws RmmException {
return initialized;
}
/**
* Return the amount of RMM memory allocated in bytes. Note that the result
* may be less than the actual amount of allocated memory if the underlying RMM
* allocator decides to return more memory than what was requested. However,
* the result will always be a lower bound on the amount allocated.
*/
public static synchronized long getTotalBytesAllocated() {
if (tracker == null) {
return 0;
} else {
return tracker.getTotalBytesAllocated();
}
}
/**
* Returns the maximum amount of RMM memory (Bytes) outstanding during the
* lifetime of the process.
*/
public static synchronized long getMaximumTotalBytesAllocated() {
if (tracker == null) {
return 0;
} else {
return tracker.getMaxTotalBytesAllocated();
}
}
/**
* Resets a scoped maximum counter of RMM memory used to keep track of usage between
* code sections while debugging.
*
* @param initialValue an initial value (in Bytes) to use for this scoped counter
*/
public static synchronized void resetScopedMaximumBytesAllocated(long initialValue) {
if (tracker != null) {
tracker.resetScopedMaxTotalBytesAllocated(initialValue);
}
}
/**
* Resets a scoped maximum counter of RMM memory used to keep track of usage between
* code sections while debugging.
*
* This resets the counter to 0 Bytes.
*/
public static synchronized void resetScopedMaximumBytesAllocated() {
if (tracker != null) {
tracker.resetScopedMaxTotalBytesAllocated(0L);
}
}
/**
* Returns the maximum amount of RMM memory (Bytes) outstanding since the last
* `resetScopedMaximumBytesAllocated` call was issued (it is "scoped" because it's the
* maximum amount seen since the last reset).
*
* If the memory used is net negative (for example if only frees happened since the
* reset, and we reset to 0), then the result will be 0.
*
* If `resetScopedMaximumBytesAllocated` is never called, the scope is the whole
* program and is equivalent to `getMaximumTotalBytesAllocated`.
*
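* For example, a minimal sketch of measuring peak usage across a section of code (the
* allocation size and the work done with the buffer are only illustrative):
* <pre>{@code
*   Rmm.resetScopedMaximumBytesAllocated();
*   try (DeviceMemoryBuffer buf = Rmm.alloc(1024)) {
*     // ... do GPU work that allocates and frees device memory ...
*   }
*   long peakBytes = Rmm.getScopedMaximumBytesAllocated();
* }</pre>
*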
* @return the scoped maximum bytes allocated
*/
public static synchronized long getScopedMaximumBytesAllocated() {
if (tracker == null) {
return 0L;
} else {
return tracker.getScopedMaxTotalBytesAllocated();
}
}
/**
* Sets the event handler to be called on RMM events (e.g.: allocation failure).
* @param handler event handler to invoke on RMM events or null to clear an existing handler
* @throws RmmException if an active handler is already set
*/
public static void setEventHandler(RmmEventHandler handler) throws RmmException {
setEventHandler(handler, false);
}
/**
* Sets the event handler to be called on RMM events (e.g.: allocation failure) and
* optionally enable debug mode (callbacks on every allocate and deallocate)
*
* NOTE: Only enable debug mode when necessary, as code will run much slower!
*
* @param handler event handler to invoke on RMM events or null to clear an existing handler
* @param enableDebug if true enable debug callbacks in RmmEventHandler
* (onAllocated, onDeallocated)
* @throws RmmException if an active handler is already set
*/
public static synchronized void setEventHandler(RmmEventHandler handler,
boolean enableDebug) throws RmmException {
if (!initialized) {
throw new RmmException("RMM has not been initialized");
}
if (deviceResource instanceof RmmEventHandlerResourceAdaptor) {
throw new RmmException("Another event handler is already set");
}
if (tracker == null) {
// This is just to be safe it should always be true if this is initialized.
throw new RmmException("A tracker must be set for the event handler to work");
}
RmmEventHandlerResourceAdaptor newResource =
new RmmEventHandlerResourceAdaptor<>(deviceResource, tracker, handler, enableDebug);
boolean success = false;
try {
setCurrentDeviceResource(newResource, deviceResource, false);
success = true;
} finally {
if (!success) {
newResource.releaseWrapped();
}
}
}
/** Clears the active RMM event handler if one is set. */
public static synchronized void clearEventHandler() throws RmmException {
if (deviceResource != null && deviceResource instanceof RmmEventHandlerResourceAdaptor) {
RmmEventHandlerResourceAdaptor orig =
(RmmEventHandlerResourceAdaptor)deviceResource;
boolean success = false;
try {
setCurrentDeviceResource(orig.wrapped, orig, false);
success = true;
} finally {
if (success) {
orig.releaseWrapped();
}
}
}
}
public static native void initDefaultCudaDevice();
public static native void cleanupDefaultCudaDevice();
/**
* Shut down any initialized RMM instance. This should be used very rarely. It does not need to
* be used when shutting down your process because CUDA will handle releasing all of the
* resources when your process exits. This really should only be used if you want to turn off the
* memory pool for some reason. As such we make an effort to be sure no resources have been
* leaked before shutting down. This may involve forcing a JVM GC to collect any leaked java
* objects that still point to CUDA memory. By default this will do a gc every 2 seconds and
* wait for up to 4 seconds before throwing an RmmException if not all of the resources are freed.
* @throws RmmException on any error. This includes if there are outstanding allocations that
* could not be collected.
*/
public static void shutdown() throws RmmException {
shutdown(2, 4, TimeUnit.SECONDS);
}
/**
* Shut down any initialized RMM instance. This should be used very rarely. It does not need to
* be used when shutting down your process because CUDA will handle releasing all of the
* resources when your process exits. This really should only be used if you want to turn off the
* memory pool for some reason. As such we make an effort to be sure no resources have been
* leaked before shutting down. This may involve forcing a JVM GC to collect any leaked java
* objects that still point to CUDA memory.
*
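* For example, a minimal sketch that forces a GC every second and waits up to ten seconds
* for leaked objects to be collected (the timings are only illustrative):
* <pre>{@code
*   Rmm.shutdown(1, 10, TimeUnit.SECONDS);
* }</pre>
*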
* @param forceGCInterval how frequently should we force a JVM GC. This is just a recommendation
* to the JVM to do a gc.
* @param maxWaitTime the maximum amount of time to wait for all objects to be collected before
* throwing an exception.
* @param units the units for forceGcInterval and maxWaitTime.
* @throws RmmException on any error. This includes if there are outstanding allocations that
* could not be collected before maxWaitTime.
*/
public static synchronized void shutdown(long forceGCInterval, long maxWaitTime, TimeUnit units)
throws RmmException {
long now = System.currentTimeMillis();
final long endTime = now + units.toMillis(maxWaitTime);
long nextGcTime = now;
try {
if (MemoryCleaner.bestEffortHasRmmBlockers()) {
do {
if (nextGcTime <= now) {
System.gc();
nextGcTime = nextGcTime + units.toMillis(forceGCInterval);
}
// Check if everything is ready about every 10 ms
Thread.sleep(10);
now = System.currentTimeMillis();
} while (endTime > now && MemoryCleaner.bestEffortHasRmmBlockers());
}
} catch (InterruptedException e) {
// Ignored
}
if (MemoryCleaner.bestEffortHasRmmBlockers()) {
throw new RmmException("Could not shut down RMM there appear to be outstanding allocations");
}
if (initialized) {
if (deviceResource != null) {
setCurrentDeviceResource(null, deviceResource, true).close();
}
}
}
/**
* Allocate device memory and return a pointer to device memory, using stream 0.
* @param size The size in bytes of the allocated memory region
* @return Returned pointer to the allocated memory
*/
public static DeviceMemoryBuffer alloc(long size) {
return alloc(size, null);
}
/**
* Allocate device memory and return a pointer to device memory.
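* For example, a minimal sketch (the allocation size is only illustrative, and
* Cuda.DEFAULT_STREAM is assumed to be the default stream exposed by this package's Cuda
* class; the buffer is freed when the try-with-resources block closes):
* <pre>{@code
*   try (DeviceMemoryBuffer buf = Rmm.alloc(1024, Cuda.DEFAULT_STREAM)) {
*     // ... use buf on the same stream ...
*   }
* }</pre>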
* @param size The size in bytes of the allocated memory region
* @param stream The stream in which to synchronize this command.
* @return Returned pointer to the allocated memory
*/
public static DeviceMemoryBuffer alloc(long size, Cuda.Stream stream) {
long s = stream == null ? 0 : stream.getStream();
return new DeviceMemoryBuffer(allocInternal(size, s), size, stream);
}
private static native long allocInternal(long size, long stream) throws RmmException;
static native void free(long ptr, long length, long stream) throws RmmException;
/**
* Delete an rmm::device_buffer.
*/
static native void freeDeviceBuffer(long rmmBufferAddress) throws RmmException;
/**
* Allocate device memory using `cudaMalloc` and return a pointer to device memory.
* @param size The size in bytes of the allocated memory region
* @param stream The stream in which to synchronize this command.
* @return Returned pointer to the allocated memory
*/
public static CudaMemoryBuffer allocCuda(long size, Cuda.Stream stream) {
long s = stream == null ? 0 : stream.getStream();
return new CudaMemoryBuffer(allocCudaInternal(size, s), size, stream);
}
private static native long allocCudaInternal(long size, long stream) throws RmmException;
static native void freeCuda(long ptr, long length, long stream) throws RmmException;
static native long newCudaMemoryResource() throws RmmException;
static native void releaseCudaMemoryResource(long handle);
static native long newManagedMemoryResource() throws RmmException;
static native void releaseManagedMemoryResource(long handle);
static native long newPoolMemoryResource(long childHandle,
long initSize, long maxSize) throws RmmException;
static native void releasePoolMemoryResource(long handle);
static native long newArenaMemoryResource(long childHandle,
long size, boolean dumpOnOOM) throws RmmException;
static native void releaseArenaMemoryResource(long handle);
static native long newCudaAsyncMemoryResource(long size, long release) throws RmmException;
static native void releaseCudaAsyncMemoryResource(long handle);
static native long newLimitingResourceAdaptor(long handle, long limit, long align) throws RmmException;
static native void releaseLimitingResourceAdaptor(long handle);
static native long newLoggingResourceAdaptor(long handle, int type, String path,
boolean autoFlush) throws RmmException;
static native void releaseLoggingResourceAdaptor(long handle);
static native long newTrackingResourceAdaptor(long handle, long alignment) throws RmmException;
static native void releaseTrackingResourceAdaptor(long handle);
static native long nativeGetTotalBytesAllocated(long handle);
static native long nativeGetMaxTotalBytesAllocated(long handle);
static native void nativeResetScopedMaxTotalBytesAllocated(long handle, long initValue);
static native long nativeGetScopedMaxTotalBytesAllocated(long handle);
static native long newEventHandlerResourceAdaptor(long handle, long trackerHandle,
RmmEventHandler handler, long[] allocThresholds, long[] deallocThresholds, boolean debug);
static native long releaseEventHandlerResourceAdaptor(long handle, boolean debug);
private static native void setCurrentDeviceResourceInternal(long newHandle);
public static native long newPinnedPoolMemoryResource(long initSize, long maxSize);
public static native long setCudfPinnedPoolMemoryResource(long poolPtr);
public static native void releasePinnedPoolMemoryResource(long poolPtr);
public static native long allocFromPinnedPool(long poolPtr, long size);
public static native void freeFromPinnedPool(long poolPtr, long ptr, long size);
// only for tests
public static native long allocFromFallbackPinnedPool(long size);
// only for tests
public static native void freeFromFallbackPinnedPool(long ptr, long size);
}