// org.nd4j.linalg.jcublas.gpumetrics.GpuMetrics Maven / Gradle / Ivy
package org.nd4j.linalg.jcublas.gpumetrics;
import jcuda.Sizeof;
import jcuda.driver.CUoccupancyB2DSize;
import jcuda.driver.JCudaDriver;
import jcuda.runtime.cudaDeviceProp;
import jcuda.utils.KernelLauncher;
import lombok.AllArgsConstructor;
import lombok.Data;
import org.nd4j.linalg.jcublas.context.ContextHolder;
import org.nd4j.linalg.jcublas.kernel.KernelFunctionLoader;
import static jcuda.runtime.JCuda.*;
import org.nd4j.linalg.jcublas.util.PointerUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Launch configuration for a CUDA kernel: grid size (number of blocks),
 * block size (threads per block) and the amount of shared memory in bytes.
 * <p>
 * Instances are usually obtained from {@link #blockAndThreads(String, int)}
 * or {@link #blocksAndThreadsOccupancy(String, String, int)}.
 * <p>
 * See:
 * http://devblogs.nvidia.com/parallelforall/cuda-pro-tip-occupancy-api-simplifies-launch-configuration/
 *
 * @author Adam Gibson
 */
@Data
@AllArgsConstructor
public class GpuMetrics {

    private static final Logger log = LoggerFactory.getLogger(GpuMetrics.class);

    /** Default upper bound on threads per block used by {@link #blockAndThreads(String, int)}. */
    public final static int MAX_THREADS = 256;
    /** Default upper bound on blocks per grid used by {@link #blockAndThreads(String, int)}. */
    public final static int MAX_BLOCKS = 64;

    // NOTE: the declaration order of these fields defines the parameter order
    // of the Lombok generated all-args constructor: (gridSize, blockSize, sharedMemory).
    private int gridSize;
    private int blockSize;
    private int sharedMemory;

    /** Occupancy callback: one double worth of shared memory per thread of the block. */
    private static final CUoccupancyB2DSize DOUBLE = new CUoccupancyB2DSize() {
        @Override
        public long call(int blockSize) {
            return (long) blockSize * Sizeof.DOUBLE;
        }
    };

    /** Occupancy callback: one float worth of shared memory per thread of the block. */
    private static final CUoccupancyB2DSize FLOAT = new CUoccupancyB2DSize() {
        @Override
        public long call(int blockSize) {
            return (long) blockSize * Sizeof.FLOAT;
        }
    };

    /**
     * Creates an empty configuration (all values zero).
     */
    public GpuMetrics() {
    }

    /**
     * Outputs the expected gpu information
     * to send to the gpu for cuda
     * kernel metadata.
     * The first entry is the block size
     * The second entry is the grid size
     * The third entry is the shared memory
     * The fourth entry is the maximum shared memory per block
     * reported for the current gpu
     * @return a 4 length array
     * representing the gpu information
     */
    public int[] getGpuDefinitionInfo() {
        return new int[] {
                getBlockSize(),
                getGridSize(),
                getSharedMemory(),
                deviceMaxSharedMemoryPerBlock()
        };
    }

    /**
     * @return the grid size (number of blocks)
     */
    public int getGridSize() {
        return gridSize;
    }

    /**
     * @return the block size (threads per block)
     */
    public int getBlockSize() {
        return blockSize;
    }

    /**
     * @return the shared memory size
     */
    public int getSharedMemory() {
        return sharedMemory;
    }

    /**
     * Computes the number of threads and blocks for processing n elements,
     * sized so that every block covers {@code 2 * threads} elements.
     *
     * @param n the number of elements to process
     * @param maxThreads the max number of threads
     * @param maxBlocks the max number of blocks
     * @return an array with the number of threads as
     * the first entry and number of blocks
     * as the second entry
     * @throws IllegalStateException if the resulting launch would exceed
     * what the current device reports as possible
     */
    public static int[] getThreadsAndBlocks(int n, int maxThreads, int maxBlocks) {
        // get device capability, to avoid block/grid size exceeding the upper bound
        cudaDeviceProp prop = new cudaDeviceProp();
        int[] devicePointer = new int[1];
        cudaGetDevice(devicePointer);
        cudaGetDeviceProperties(prop, devicePointer[0]);

        int threads = (n < maxThreads * 2L) ? PointerUtil.nextPow2((n + 1) / 2) : maxThreads;
        // a thread count of zero (or less) would divide by zero below
        threads = Math.max(1, threads);

        // ceiling division in long arithmetic so that large n cannot overflow
        long elementsPerBlock = threads * 2L;
        int blocks = (int) ((n + elementsPerBlock - 1) / elementsPerBlock);

        // exact integer comparison; the products can exceed the int range
        if ((long) threads * blocks > (long) prop.maxGridSize[0] * prop.maxThreadsPerBlock) {
            throw new IllegalStateException("n is too large, please choose a smaller number!\n");
        }

        if (blocks > prop.maxGridSize[0]) {
            // SLF4J substitutes {} placeholders, printf style %d is never expanded
            log.warn("Grid size <{}> exceeds the device capability <{}>, set block size as {} (original {})",
                    blocks, prop.maxGridSize[0], threads * 2, threads);
            blocks /= 2;
            threads *= 2;
        }

        blocks = Math.min(maxBlocks, blocks);
        return new int[] {threads, blocks};
    }

    /**
     * Get the blocks and threads
     * used for a kernel launch
     * @param dataType the data type ("double" selects double sized elements,
     * anything else float sized elements)
     * @param n the number of elements
     * @return the information used
     * for launching a kernel
     */
    public static GpuMetrics blockAndThreads(String dataType, int n) {
        // launch order is <<< gridSize, blockSize >>>
        int size = "double".equals(dataType) ? Sizeof.DOUBLE : Sizeof.FLOAT;
        int[] threadsAndBlocks = getThreadsAndBlocks(n, MAX_THREADS, MAX_BLOCKS);
        int threads = threadsAndBlocks[0];
        int blocks = threadsAndBlocks[1];
        // NOTE(review): blocks of at most 32 threads get twice the shared memory;
        // presumably this mirrors the CUDA reduction sample - confirm
        int sharedMemSize = (threads <= 32) ? 2 * threads * size : threads * size;
        // the constructor takes (gridSize, blockSize, sharedMemory):
        // blocks form the grid, threads form the block
        return new GpuMetrics(blocks, threads, sharedMemSize);
    }

    /**
     * Derives a launch configuration from the occupancy api
     * and clamps it to the limits reported for the current gpu.
     *
     * @param functionName the name of the kernel function to look up
     * @param dataType the data type ("float" selects float sized elements,
     * anything else double sized elements)
     * @param n the number of elements to process
     * @return the launch configuration for the given kernel
     */
    public static GpuMetrics blocksAndThreadsOccupancy(String functionName, String dataType, int n) {
        int[] gridSize = new int[1];
        int[] blockSize = new int[1];
        boolean isFloat = "float".equals(dataType);
        KernelLauncher launcher = KernelFunctionLoader.launcher(functionName, dataType);
        CUoccupancyB2DSize size = isFloat ? FLOAT : DOUBLE;
        JCudaDriver.cuOccupancyMaxPotentialBlockSize(gridSize, blockSize, launcher.getFunction(), size, 0, 0);

        // for smaller problems, ensure no index out of bounds,
        // and never exceed the per block thread limit
        int blockSizeRet = Math.min(blockSize[0], n);
        blockSizeRet = Math.min(blockSizeRet, deviceMaxThreadsPerBlock());

        // the grid is derived from the block size that is actually returned,
        // so that gridSize * blockSize still covers n after clamping;
        // a non positive block size yields an empty grid instead of a division by zero
        int gridSizeRet = 0;
        if (blockSizeRet > 0) {
            long neededBlocks = ((long) n + blockSizeRet - 1) / blockSizeRet;
            gridSizeRet = (int) Math.min(neededBlocks, (long) deviceMaxGridDimX());
        }

        int sharedMemSize = blockSizeRet * (isFloat ? Sizeof.FLOAT : Sizeof.DOUBLE);
        sharedMemSize = Math.min(sharedMemSize, deviceMaxSharedMemoryPerBlock());

        return new GpuMetrics(gridSizeRet, blockSizeRet, sharedMemSize);
    }

    /**
     * Validates the current configuration
     * against the gpu's hardware constraints.
     *
     * @throws IllegalArgumentException
     * if any of the values surpass the GPU's
     * built in hardware constraints
     */
    public void validate() {
        // the grid is bounded by the grid dimension, not by the threads per block
        int maxGrid = deviceMaxGridDimX();
        int maxBlock = deviceMaxBlockDimX();
        int maxShared = deviceMaxSharedMemoryPerBlock();
        if (gridSize > maxGrid) {
            throw new IllegalArgumentException("Maximum grid size is " + maxGrid + " but was specified as " + gridSize);
        }
        if (blockSize > maxBlock) {
            throw new IllegalArgumentException("Maximum block size is " + maxBlock + " but was specified as " + blockSize);
        }
        if (sharedMemory > maxShared) {
            throw new IllegalArgumentException("Maximum shared memory size per block is " + maxShared + " but was specified as " + sharedMemory);
        }
    }

    /**
     * Special setter that queries
     * the maximum amount of shared memory per block allowed
     * and never sets more than that.
     * @param sharedMemory the shared memory size to attempt to set
     */
    public void setSharedMemoryNotOverMax(int sharedMemory) {
        setSharedMemory(Math.min(sharedMemory, deviceMaxSharedMemoryPerBlock()));
    }

    /**
     * Special setter that queries
     * the maximum grid size allowed
     * and never sets more than that.
     * @param gridSize the grid size to attempt to set
     */
    public void setGridSizeNotOverMax(int gridSize) {
        setGridSize(Math.min(gridSize, deviceMaxGridDimX()));
    }

    /**
     * Special setter
     * that queries the block size
     * to ensure not over the max possible
     * block size is specified.
     * @param blockSize the block size to attempt to set
     */
    public void setBlockSizeNotOverMax(int blockSize) {
        setBlockSize(Math.min(blockSize, deviceMaxBlockDimX()));
    }

    /** Max threads per block reported for the current gpu. */
    private static int deviceMaxThreadsPerBlock() {
        return ContextHolder.getInstance().getCurrentGpuInformation().getMaxThreadsPerBlock();
    }

    /** Max grid size along x reported for the current gpu. */
    private static int deviceMaxGridDimX() {
        return ContextHolder.getInstance().getCurrentGpuInformation().getMaxGrimDimX();
    }

    /** Max block size along x reported for the current gpu. */
    private static int deviceMaxBlockDimX() {
        return ContextHolder.getInstance().getCurrentGpuInformation().getMaxBlockDimx();
    }

    /** Max shared memory per block reported for the current gpu. */
    private static int deviceMaxSharedMemoryPerBlock() {
        return ContextHolder.getInstance().getCurrentGpuInformation().getMaxSharedMemoryPerBlock();
    }
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy