/*
* Copyright (c) 2019 by Andrew Charneski.
*
* The author licenses this file to you under the
* Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance
* with the License. You may obtain a copy
* of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package com.simiacryptus.mindseye.lang.cudnn;
import com.google.common.util.concurrent.MoreExecutors;
import com.google.common.util.concurrent.ThreadFactoryBuilder;
import com.simiacryptus.lang.ResourcePool;
import com.simiacryptus.lang.StaticResourcePool;
import com.simiacryptus.lang.TimedResult;
import com.simiacryptus.mindseye.lang.CoreSettings;
import com.simiacryptus.mindseye.lang.Result;
import com.simiacryptus.mindseye.lang.TensorList;
import com.simiacryptus.util.Util;
import com.simiacryptus.util.data.DoubleStatistics;
import jcuda.jcudnn.*;
import jcuda.runtime.JCuda;
import jcuda.runtime.cudaDeviceProp;
import jcuda.runtime.cudaStream_t;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.annotation.Nonnull;
import javax.annotation.Nullable;
import java.io.PrintStream;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.Executor;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.function.Consumer;
import java.util.function.Function;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
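/**
 * Static front-end for the JCuda / JCudnn bindings used by MindsEye. The wrappers in this class
 * time every native call into a {@link DoubleStatistics}, optionally mirror the call to the
 * registered {@link #apiLog} consumers, translate failure status codes into {@link CudaError},
 * and manage a per-device pool of {@link CudnnHandle} instances for executing GPU work.
 */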
public class CudaSystem {
public static final HashSet<Consumer<String>> apiLog = new HashSet<>();
protected static final Logger logger = LoggerFactory.getLogger(CudaSystem.class);
protected static final Map<Integer, cudaDeviceProp> propertyCache = new ConcurrentHashMap<>();
protected static final ThreadLocal<Integer> currentDeviceId = new ThreadLocal<>();
protected static final ExecutorService logThread = Executors.newSingleThreadExecutor(new ThreadFactoryBuilder().setDaemon(true).build());
protected static final long start = System.nanoTime();
protected static final DoubleStatistics createPoolingDescriptor_execution = new DoubleStatistics();
protected static final DoubleStatistics createLRNDescriptor_execution = new DoubleStatistics();
protected static final DoubleStatistics cudaDeviceReset_execution = new DoubleStatistics();
protected static final DoubleStatistics cudaFree_execution = new DoubleStatistics();
protected static final DoubleStatistics cudaMalloc_execution = new DoubleStatistics();
protected static final DoubleStatistics cudaDeviceSynchronize_execution = new DoubleStatistics();
protected static final DoubleStatistics cudaSetDeviceFlags_execution = new DoubleStatistics();
protected static final DoubleStatistics cudaMallocManaged_execution = new DoubleStatistics();
protected static final DoubleStatistics cudaHostAlloc_execution = new DoubleStatistics();
protected static final DoubleStatistics cudaFreeHost_execution = new DoubleStatistics();
protected static final DoubleStatistics cudaDeviceGetLimit_execution = new DoubleStatistics();
protected static final DoubleStatistics cudaDeviceSetLimit_execution = new DoubleStatistics();
protected static final DoubleStatistics cudaMemcpyAsync_execution = new DoubleStatistics();
protected static final DoubleStatistics cudaMemcpy_execution = new DoubleStatistics();
protected static final DoubleStatistics cudaMemset_execution = new DoubleStatistics();
protected static final DoubleStatistics cudnnSoftmaxForward_execution = new DoubleStatistics();
protected static final DoubleStatistics cudnnSoftmaxBackward_execution = new DoubleStatistics();
protected static final DoubleStatistics cudnnCreateReduceTensorDescriptor_execution = new DoubleStatistics();
protected static final DoubleStatistics cudnnSetReduceTensorDescriptor_execution = new DoubleStatistics();
protected static final DoubleStatistics cudnnActivationBackward_execution = new DoubleStatistics();
protected static final DoubleStatistics cudnnActivationForward_execution = new DoubleStatistics();
protected static final DoubleStatistics cudnnAddTensor_execution = new DoubleStatistics();
protected static final DoubleStatistics cudnnConvolutionBackwardBias_execution = new DoubleStatistics();
protected static final DoubleStatistics cudnnConvolutionBackwardData_execution = new DoubleStatistics();
protected static final DoubleStatistics cudnnConvolutionBackwardFilter_execution = new DoubleStatistics();
protected static final DoubleStatistics cudnnConvolutionForward_execution = new DoubleStatistics();
protected static final DoubleStatistics cudnnConvolutionBiasActivationForward_execution = new DoubleStatistics();
protected static final DoubleStatistics cudnnDestroyActivationDescriptor_execution = new DoubleStatistics();
protected static final DoubleStatistics cudnnDestroyConvolutionDescriptor_execution = new DoubleStatistics();
protected static final DoubleStatistics cudnnDestroyFilterDescriptor_execution = new DoubleStatistics();
protected static final DoubleStatistics cudnnDestroyOpTensorDescriptor_execution = new DoubleStatistics();
protected static final DoubleStatistics cudnnDestroyPoolingDescriptor_execution = new DoubleStatistics();
protected static final DoubleStatistics cudnnDestroyTensorDescriptor_execution = new DoubleStatistics();
protected static final DoubleStatistics cudnnGetPoolingNdForwardOutputDim_execution = new DoubleStatistics();
protected static final DoubleStatistics cudnnOpTensor_execution = new DoubleStatistics();
protected static final DoubleStatistics cudnnReduceTensor_execution = new DoubleStatistics();
protected static final DoubleStatistics cudnnPoolingBackward_execution = new DoubleStatistics();
protected static final DoubleStatistics cudnnPoolingForward_execution = new DoubleStatistics();
protected static final DoubleStatistics cudnnSetLRNDescriptor_execution = new DoubleStatistics();
protected static final DoubleStatistics cudnnCreateLRNDescriptor_execution = new DoubleStatistics();
protected static final DoubleStatistics cudnnDestroyLRNDescriptor_execution = new DoubleStatistics();
protected static final DoubleStatistics cudnnLRNCrossChannelForward_execution = new DoubleStatistics();
protected static final DoubleStatistics cudnnLRNCrossChannelBackward_execution = new DoubleStatistics();
protected static final DoubleStatistics cudnnTransformTensor_execution = new DoubleStatistics();
protected static final DoubleStatistics cudnnSetTensor_execution = new DoubleStatistics();
protected static final DoubleStatistics deviceCount_execution = new DoubleStatistics();
protected static final DoubleStatistics setDevice_execution = new DoubleStatistics();
protected static final DoubleStatistics getDeviceProperties_execution = new DoubleStatistics();
protected static final DoubleStatistics getOutputDims_execution = new DoubleStatistics();
protected static final DoubleStatistics newActivationDescriptor_execution = new DoubleStatistics();
protected static final DoubleStatistics newConvolutionNdDescriptor_execution = new DoubleStatistics();
protected static final DoubleStatistics newConvolutions2dDescriptor_execution = new DoubleStatistics();
protected static final DoubleStatistics newFilterDescriptor_execution = new DoubleStatistics();
protected static final DoubleStatistics newOpDescriptor_execution = new DoubleStatistics();
protected static final DoubleStatistics newTensorDescriptor_execution = new DoubleStatistics();
protected static final DoubleStatistics allocateBackwardDataWorkspace_execution = new DoubleStatistics();
protected static final DoubleStatistics allocateBackwardFilterWorkspace_execution = new DoubleStatistics();
protected static final DoubleStatistics allocateForwardWorkspace_execution = new DoubleStatistics();
protected static final DoubleStatistics getBackwardDataAlgorithm_execution = new DoubleStatistics();
protected static final DoubleStatistics getBackwardFilterAlgorithm_execution = new DoubleStatistics();
protected static final DoubleStatistics cudaStreamCreate_execution = new DoubleStatistics();
protected static final DoubleStatistics cudaStreamDestroy_execution = new DoubleStatistics();
protected static final DoubleStatistics cudaStreamSynchronize_execution = new DoubleStatistics();
protected static final DoubleStatistics getForwardAlgorithm_execution = new DoubleStatistics();
protected static final Object syncLock = new Object();
protected static final HashMap<Integer, ResourcePool<CudnnHandle>> handlePools = new HashMap<>();
private static final Map<Integer, Long> syncTimes = new HashMap<>();
private static final Executor garbageTruck = MoreExecutors.directExecutor();
private static final HashMap<Integer, Object> deviceLocks = new HashMap<>();
private static final long COPY_BLOCK_SIZE = Long.MAX_VALUE;
// private final List dirty = new ArrayList<>();
@Nonnull
public static AtomicInteger gpuGeneration = new AtomicInteger(0);
//Executors.newCachedThreadPool(new ThreadFactoryBuilder().setNameFormat("gpu-free-%d").setDaemon(true).getNetwork());
private static volatile Integer cachedDeviceCount = init();
private static volatile StaticResourcePool<CudnnHandle> pool;
protected final ExecutorService executionThread = CoreSettings.INSTANCE().isSingleThreaded() ?
MoreExecutors.newDirectExecutorService() :
Executors.newSingleThreadExecutor(new ThreadFactoryBuilder().setNameFormat(toString()).build());
protected CudaSystem() {
}
public static void logHeader() {
logger.info(getHeader());
}
public static String getHeader() {
return Util.toString(CudaSystem::printHeader);
}
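/**
 * Prints a diagnostic header to the given stream: CUDA driver, runtime and JCuda versions,
 * free/total device memory, the properties of each visible device, and the system properties
 * describing versions, OS and architecture.
 */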
public static void printHeader(@Nonnull PrintStream out) {
@Nonnull int[] runtimeVersion = {0};
@Nonnull int[] driverVersion = {0};
JCuda.cudaRuntimeGetVersion(runtimeVersion);
JCuda.cudaDriverGetVersion(driverVersion);
@Nonnull CharSequence jCudaVersion = JCuda.getJCudaVersion();
out.printf("Time: %s; Driver %s; Runtime %s; Lib %s%n", new Date(), driverVersion[0], runtimeVersion[0], jCudaVersion);
@Nonnull long[] free = {0};
@Nonnull long[] total = {0};
JCuda.cudaMemGetInfo(free, total);
out.printf("Cuda Memory: %.1f freeRef, %.1f total%n", free[0] * 1.0 / (1024 * 1024), total[0] * 1.0 / (1024 * 1024));
@Nonnull final int[] deviceCount = new int[1];
JCuda.cudaGetDeviceCount(deviceCount);
IntStream.range(0, deviceCount[0]).forEach(device -> {
@Nonnull final cudaDeviceProp deviceProp = new cudaDeviceProp();
JCuda.cudaGetDeviceProperties(deviceProp, device);
out.printf("Device %d = %s%n", device, deviceProp, free[0], total[0]);
});
System.getProperties().forEach((k, v) -> {
boolean display = false;
if (k.toString().endsWith(".version")) display = true;
if (k.toString().startsWith("os.")) display = true;
if (k.toString().contains("arch")) display = true;
if (display) out.printf("%s = %s%n", k, v);
});
}
@Nonnull
protected static Map<CharSequence, CharSequence> toMap(@Nonnull DoubleStatistics obj) {
@Nonnull HashMap<CharSequence, CharSequence> map = new HashMap<>();
if (0 < obj.getCount()) {
map.put("stddev", Double.toString(obj.getStandardDeviation()));
map.put("mean", Double.toString(obj.getAverage()));
map.put("total", Double.toString(obj.getSum()));
map.put("max", Double.toString(obj.getMax()));
map.put("count", Double.toString(obj.getCount()));
}
return map;
}
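/**
 * Returns a snapshot of the per-API timing statistics gathered by the wrappers in this class,
 * keyed by call name; entries with no recorded calls are removed before returning. For example,
 * the map can be dumped with:
 * <pre>{@code
 * CudaSystem.getExecutionStatistics().forEach((api, stats) ->
 *     System.out.println(api + " -> " + stats));
 * }</pre>
 */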
@Nonnull
public static Map<CharSequence, Map<CharSequence, CharSequence>> getExecutionStatistics() {
@Nonnull HashMap<CharSequence, Map<CharSequence, CharSequence>> map = new HashMap<>();
map.put("createPoolingDescriptor", toMap(createPoolingDescriptor_execution));
map.put("cudaDeviceReset", toMap(cudaDeviceReset_execution));
map.put("cudaFree", toMap(cudaFree_execution));
map.put("cudaMalloc", toMap(cudaMalloc_execution));
map.put("cudaMallocManaged", toMap(cudaMallocManaged_execution));
map.put("cudaHostAlloc", toMap(cudaHostAlloc_execution));
map.put("cudaFreeHost", toMap(cudaFreeHost_execution));
map.put("cudaDeviceGetLimit", toMap(cudaDeviceGetLimit_execution));
map.put("cudaDeviceSetLimit", toMap(cudaDeviceSetLimit_execution));
map.put("cudaMemcpy", toMap(cudaMemcpy_execution));
map.put("cudaMemset", toMap(cudaMemset_execution));
map.put("cudnnActivationBackward", toMap(cudnnActivationBackward_execution));
map.put("cudnnActivationForward", toMap(cudnnActivationForward_execution));
map.put("cudnnAddTensor", toMap(cudnnAddTensor_execution));
map.put("cudnnConvolutionBackwardBias", toMap(cudnnConvolutionBackwardBias_execution));
map.put("cudnnConvolutionBackwardData", toMap(cudnnConvolutionBackwardData_execution));
map.put("cudnnConvolutionBackwardFilter", toMap(cudnnConvolutionBackwardFilter_execution));
map.put("cudnnConvolutionForward", toMap(cudnnConvolutionForward_execution));
map.put("cudnnDestroyActivationDescriptor", toMap(cudnnDestroyActivationDescriptor_execution));
map.put("cudnnDestroyConvolutionDescriptor", toMap(cudnnDestroyConvolutionDescriptor_execution));
map.put("cudnnDestroyFilterDescriptor", toMap(cudnnDestroyFilterDescriptor_execution));
map.put("cudnnDestroyOpTensorDescriptor", toMap(cudnnDestroyOpTensorDescriptor_execution));
map.put("cudnnDestroyPoolingDescriptor", toMap(cudnnDestroyPoolingDescriptor_execution));
map.put("cudnnDestroyTensorDescriptor", toMap(cudnnDestroyTensorDescriptor_execution));
map.put("cudnnGetPoolingNdForwardOutputDim", toMap(cudnnGetPoolingNdForwardOutputDim_execution));
map.put("cudnnOpTensor", toMap(cudnnOpTensor_execution));
map.put("cudnnPoolingBackward", toMap(cudnnPoolingBackward_execution));
map.put("cudnnPoolingForward", toMap(cudnnPoolingForward_execution));
map.put("cudnnTransformTensor", toMap(cudnnTransformTensor_execution));
map.put("cachedDeviceCount", toMap(deviceCount_execution));
map.put("setDevice", toMap(setDevice_execution));
map.put("getDeviceProperties", toMap(getDeviceProperties_execution));
map.put("getOutputDims", toMap(getOutputDims_execution));
map.put("newActivationDescriptor", toMap(newActivationDescriptor_execution));
map.put("newConvolutionNdDescriptor", toMap(newConvolutionNdDescriptor_execution));
map.put("newConvolutions2dDescriptor", toMap(newConvolutions2dDescriptor_execution));
map.put("newFilterDescriptor", toMap(newFilterDescriptor_execution));
map.put("newOpDescriptor", toMap(newOpDescriptor_execution));
map.put("newTensorDescriptor", toMap(newTensorDescriptor_execution));
map.put("allocateBackwardDataWorkspace", toMap(allocateBackwardDataWorkspace_execution));
map.put("allocateBackwardFilterWorkspace", toMap(allocateBackwardFilterWorkspace_execution));
map.put("allocateForwardWorkspace", toMap(allocateForwardWorkspace_execution));
map.put("getBackwardDataAlgorithm", toMap(getBackwardDataAlgorithm_execution));
map.put("getBackwardFilterAlgorithm", toMap(getBackwardFilterAlgorithm_execution));
map.put("getForwardAlgorithm", toMap(getForwardAlgorithm_execution));
map.put("cudaDeviceSynchronize", toMap(cudaDeviceSynchronize_execution));
map.put("cudaStreamCreate", toMap(cudaStreamCreate_execution));
map.put("cudaStreamDestroy", toMap(cudaStreamDestroy_execution));
map.put("cudaStreamSynchronize", toMap(cudaStreamSynchronize_execution));
map.put("cudaMemcpyAsync", toMap(cudaMemcpyAsync_execution));
map.put("cudaSetDeviceFlags", toMap(cudaSetDeviceFlags_execution));
for (CharSequence entry : map.entrySet().stream().filter(x -> x.getValue().isEmpty()).map(x -> x.getKey()).collect(Collectors.toList())) {
map.remove(entry);
}
return map;
}
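/*
 * Each wrapper below follows the same pattern: invoke the native JCuda/JCudnn call, record the
 * elapsed wall-clock seconds in the matching DoubleStatistics, emit an apiLog trace via
 * log(...), and (for most calls) convert a failure status into a CudaError via handle(int).
 */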
public static int cudaDeviceReset() {
long startTime = System.nanoTime();
final int result = JCuda.cudaDeviceReset();
log("cudaDeviceReset", result, new Object[]{});
cudaDeviceReset_execution.accept((System.nanoTime() - startTime) / 1e9);
handle(result);
return result;
}
public static int cudaMalloc(final CudaPointer devPtr, final long size) {
long startTime = System.nanoTime();
final int result = JCuda.cudaMalloc(devPtr, size);
log("cudaMalloc", result, new Object[]{devPtr, size});
cudaMalloc_execution.accept((System.nanoTime() - startTime) / 1e9);
handle(result);
return result;
}
public static int cudaMallocManaged(final CudaPointer devPtr, final long size, int flags) {
long startTime = System.nanoTime();
final int result = JCuda.cudaMallocManaged(devPtr, size, flags);
log("cudaMallocManaged", result, new Object[]{devPtr, size, flags});
cudaMallocManaged_execution.accept((System.nanoTime() - startTime) / 1e9);
handle(result);
return result;
}
public static Integer getThreadDeviceId() {
return CudaSystem.currentDeviceId.get();
}
public static int cudaSetDeviceFlags(int flags) {
long startTime = System.nanoTime();
final int result = JCuda.cudaSetDeviceFlags(flags);
log("cudaSetDeviceFlags", result, new Object[]{flags});
cudaSetDeviceFlags_execution.accept((System.nanoTime() - startTime) / 1e9);
handle(result);
return result;
}
public static int cudaHostAlloc(final CudaPointer devPtr, final long size, int flags) {
long startTime = System.nanoTime();
final int result = JCuda.cudaHostAlloc(devPtr, size, flags);
cudaHostAlloc_execution.accept((System.nanoTime() - startTime) / 1e9);
log("cudaHostAlloc", result, new Object[]{devPtr, size, flags});
handle(result);
return result;
}
public static int cudaFreeHost(final CudaPointer devPtr) {
long startTime = System.nanoTime();
final int result = JCuda.cudaFreeHost(devPtr);
cudaFreeHost_execution.accept((System.nanoTime() - startTime) / 1e9);
log("cudaFreeHost", result, new Object[]{devPtr});
handle(result);
return result;
}
public static long cudaDeviceGetLimit(final int limit) {
long startTime = System.nanoTime();
@Nonnull long[] pValue = new long[1];
final int result = JCuda.cudaDeviceGetLimit(pValue, limit);
cudaDeviceGetLimit_execution.accept((System.nanoTime() - startTime) / 1e9);
log("cudaDeviceGetLimit(", result, new Object[]{pValue, limit});
return pValue[0];
}
public static void cudaDeviceSetLimit(final int limit, long value) {
long startTime = System.nanoTime();
final int result = JCuda.cudaDeviceSetLimit(limit, value);
cudaDeviceSetLimit_execution.accept((System.nanoTime() - startTime) / 1e9);
log("cudaDeviceSetLimit(", result, new Object[]{limit, value});
handle(result);
}
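/**
 * Copies {@code count} bytes from {@code src} to {@code dst}. Copies larger than
 * {@code COPY_BLOCK_SIZE} are split into two recursive chunks; with the current block size of
 * {@code Long.MAX_VALUE} that branch is effectively disabled.
 */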
public static void cudaMemcpy(final CudaPointer dst, final CudaPointer src, final long count, final int cudaMemcpyKind_kind) {
if (count > COPY_BLOCK_SIZE) {
cudaMemcpy(dst, src, COPY_BLOCK_SIZE, cudaMemcpyKind_kind);
cudaMemcpy(dst.withByteOffset(COPY_BLOCK_SIZE), src.withByteOffset(COPY_BLOCK_SIZE), count - COPY_BLOCK_SIZE, cudaMemcpyKind_kind);
return;
}
long startTime = System.nanoTime();
final int result = JCuda.cudaMemcpy(dst, src, count, cudaMemcpyKind_kind);
cudaMemcpy_execution.accept((System.nanoTime() - startTime) / 1e9);
log("cudaMemcpy", result, new Object[]{dst, src, count, cudaMemcpyKind_kind});
handle(result);
}
public static void cudaMemcpyAsync(final CudaPointer dst, final CudaPointer src, final long count, final int cudaMemcpyKind_kind, cudaStream_t stream) {
long startTime = System.nanoTime();
final int result = JCuda.cudaMemcpyAsync(dst, src, count, cudaMemcpyKind_kind, stream);
cudaMemcpyAsync_execution.accept((System.nanoTime() - startTime) / 1e9);
log("cudaMemcpyAsync", result, new Object[]{dst, src, count, cudaMemcpyKind_kind, stream});
handle(result);
}
public static CudaResource<cudaStream_t> cudaStreamCreate() {
long startTime = System.nanoTime();
@Nonnull cudaStream_t stream = new cudaStream_t();
int result = JCuda.cudaStreamCreate(stream);
cudaStreamCreate_execution.accept((System.nanoTime() - startTime) / 1e9);
log("cudaStreamCreate", result, new Object[]{stream});
handle(result);
return new CudaStream(stream);
}
public static int cudaStreamDestroy(cudaStream_t stream) {
long startTime = System.nanoTime();
int result = JCuda.cudaStreamDestroy(stream);
cudaStreamDestroy_execution.accept((System.nanoTime() - startTime) / 1e9);
log("cudaStreamDestroy", result, new Object[]{stream});
handle(result);
return result;
}
public static void cudaStreamSynchronize(cudaStream_t stream) {
long startTime = System.nanoTime();
int result = JCuda.cudaStreamSynchronize(stream);
cudaStreamSynchronize_execution.accept((System.nanoTime() - startTime) / 1e9);
log("cudaStreamSynchronize", result, new Object[]{stream});
handle(result);
}
public static void cudaMemset(final CudaPointer mem, final int c, final long count) {
long startTime = System.nanoTime();
final int result = JCuda.cudaMemset(mem, c, count);
//cudaDeviceSynchronize();
cudaMemset_execution.accept((System.nanoTime() - startTime) / 1e9);
log("cudaMemset", result, new Object[]{mem, c, count});
handle(result);
}
public static int cudnnDestroyActivationDescriptor(final cudnnActivationDescriptor activationDesc) {
long startTime = System.nanoTime();
final int result = JCudnn.cudnnDestroyActivationDescriptor(activationDesc);
cudnnDestroyActivationDescriptor_execution.accept((System.nanoTime() - startTime) / 1e9);
log("cudnnDestroyActivationDescriptor", result, new Object[]{activationDesc});
return result;
}
public static int cudnnDestroyConvolutionDescriptor(final cudnnConvolutionDescriptor convDesc) {
long startTime = System.nanoTime();
final int result = JCudnn.cudnnDestroyConvolutionDescriptor(convDesc);
cudnnDestroyConvolutionDescriptor_execution.accept((System.nanoTime() - startTime) / 1e9);
log("cudnnDestroyConvolutionDescriptor", result, new Object[]{convDesc});
return result;
}
public static int cudnnDestroyFilterDescriptor(final cudnnFilterDescriptor filterDesc) {
long startTime = System.nanoTime();
final int result = JCudnn.cudnnDestroyFilterDescriptor(filterDesc);
cudnnDestroyFilterDescriptor_execution.accept((System.nanoTime() - startTime) / 1e9);
log("cudnnDestroyFilterDescriptor", result, new Object[]{filterDesc});
return result;
}
public static int cudnnDestroyOpTensorDescriptor(final cudnnOpTensorDescriptor opTensorDesc) {
long startTime = System.nanoTime();
final int result = JCudnn.cudnnDestroyOpTensorDescriptor(opTensorDesc);
cudnnDestroyOpTensorDescriptor_execution.accept((System.nanoTime() - startTime) / 1e9);
log("cudnnDestroyOpTensorDescriptor", result, new Object[]{opTensorDesc});
return result;
}
public static int cudnnDestroyPoolingDescriptor(final cudnnPoolingDescriptor poolingDesc) {
long startTime = System.nanoTime();
final int result = JCudnn.cudnnDestroyPoolingDescriptor(poolingDesc);
cudnnDestroyPoolingDescriptor_execution.accept((System.nanoTime() - startTime) / 1e9);
log("cudnnDestroyPoolingDescriptor", result, new Object[]{poolingDesc});
return result;
}
public static int cudnnDestroyTensorDescriptor(final cudnnTensorDescriptor tensorDesc) {
long startTime = System.nanoTime();
final int result = JCudnn.cudnnDestroyTensorDescriptor(tensorDesc);
cudnnDestroyTensorDescriptor_execution.accept((System.nanoTime() - startTime) / 1e9);
log("cudnnDestroyTensorDescriptor", result, new Object[]{tensorDesc});
return result;
}
public static int cudnnGetPoolingNdForwardOutputDim(
final cudnnPoolingDescriptor poolingDesc,
final cudnnTensorDescriptor inputTensorDesc,
final int nbDims,
final int[] outputTensorDimA) {
long startTime = System.nanoTime();
final int result = JCudnn.cudnnGetPoolingNdForwardOutputDim(poolingDesc, inputTensorDesc, nbDims, outputTensorDimA);
cudnnGetPoolingNdForwardOutputDim_execution.accept((System.nanoTime() - startTime) / 1e9);
log("cudnnGetPoolingNdForwardOutputDim", result, new Object[]{poolingDesc, inputTensorDesc, nbDims, outputTensorDimA});
return result;
}
public static int deviceCount() {
long startTime = System.nanoTime();
@Nonnull final int[] deviceCount = new int[1];
final int returnCode = JCuda.cudaGetDeviceCount(deviceCount);
log("cudaGetDeviceCount", returnCode, new Object[]{deviceCount});
deviceCount_execution.accept((System.nanoTime() - startTime) / 1e9);
CudaSystem.handle(returnCode);
return deviceCount[0];
}
public static boolean isOom(final Throwable t) {
if (t instanceof OutOfMemoryError) return true;
//if (t instanceof com.simiacryptus.mindseye.lang.cudnn.CudaError) return true;
if (null != t.getCause() && t != t.getCause()) return isOom(t.getCause());
return false;
}
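/**
 * Computes row-major strides (in elements) for the given dimensions: each entry is the product
 * of all dimensions to its right, with the last stride equal to 1. For example,
 * {@code getStride(new int[]{4, 3, 2})} returns {@code [6, 2, 1]}.
 */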
public static int[] getStride(@Nonnull final int[] array) {
return IntStream.range(0, array.length).map(i -> IntStream.range(i + 1, array.length).map(ii -> array[ii]).reduce((a, b) -> a * b).orElse(1)).toArray();
}
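/**
 * Converts a failing CUDA/cuDNN status code into a {@link CudaError}: the error is logged as a
 * warning and then thrown. A {@code CUDNN_STATUS_SUCCESS} (0) return code is a no-op.
 */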
public static void handle(final int returnCode) {
if (returnCode != cudnnStatus.CUDNN_STATUS_SUCCESS) {
CudaError cudaError = new CudaError("returnCode = " + cudnnStatus.stringFor(returnCode));
logger.warn("Cuda Error", cudaError);
throw cudaError;
}
}
protected static CharSequence renderToLog(final Object obj) {
if (obj instanceof int[]) {
if (((int[]) obj).length < 10) {
return Arrays.toString((int[]) obj);
}
}
if (obj instanceof double[]) {
if (((double[]) obj).length < 10) {
return Arrays.toString((double[]) obj);
}
}
if (obj instanceof float[]) {
if (((float[]) obj).length < 10) {
return Arrays.toString((float[]) obj);
}
}
if (obj instanceof long[]) {
if (((long[]) obj).length < 10) {
return Arrays.toString((long[]) obj);
}
}
return obj.toString();
}
@Nonnull
public static int[] getOutputDims(final cudnnTensorDescriptor srcTensorDesc, final cudnnFilterDescriptor filterDesc, final cudnnConvolutionDescriptor convDesc) {
long startTime = System.nanoTime();
@Nonnull final int[] tensorOutputDims = new int[4];
final int result = JCudnn.cudnnGetConvolutionNdForwardOutputDim(convDesc, srcTensorDesc, filterDesc, tensorOutputDims.length, tensorOutputDims);
getOutputDims_execution.accept((System.nanoTime() - startTime) / 1e9);
log("cudnnGetConvolutionNdForwardOutputDim", result, new Object[]{convDesc, srcTensorDesc, filterDesc, tensorOutputDims.length, tensorOutputDims});
CudaSystem.handle(result);
return tensorOutputDims;
}
// /**
// * With device.
// *
// * @param deviceId the n
// * @param action the action
// */
// public static void withDevice(int deviceId, @Nonnull Consumer action) {
// assert deviceId >= 0;
// final int prevDevice = getThreadDeviceId();
// try {
// CudaDevice.setDevice(deviceId);
// action.accept(new CudaDevice(deviceId));
// } finally {
// if (prevDevice >= 0) CudaDevice.setDevice(prevDevice);
// else CudaSystem.currentDeviceId.remove();
// }
// }
public static boolean removeLog(Consumer<String> apiLog) {
return CudaSystem.apiLog.remove(apiLog);
}
public static void addLog(@Nonnull PrintStream log) {
printHeader(log);
apiLog.add(s -> log.println(s));
}
public static void addLog(@Nonnull Consumer<String> log) {
apiLog.add(log);
}
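/**
 * Builds a single API-trace line containing the elapsed seconds since startup, the calling
 * thread and device, the method name, the rendered arguments, the result, and (when
 * {@code CudaSettings.isLogStack()} is set) the MindsEye portion of the call stack, then
 * dispatches it asynchronously to every registered {@link #apiLog} consumer on {@code logThread}.
 */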
public static void log(final CharSequence method, final Object result, @Nullable final Object[] args) {
CharSequence callstack = !CudaSettings.INSTANCE().isLogStack() ? "" : Util.toString(Arrays.stream(Thread.currentThread().getStackTrace())
.filter(x -> true
&& x.getClassName().startsWith("com.simiacryptus.mindseye.")
//&& !x.getClassName().startsWith("com.simiacryptus.mindseye.lang.")
//&& !x.getClassName().startsWith("com.simiacryptus.mindseye.test.")
)
//.limit(10)
.toArray(i -> new StackTraceElement[i]), ", ");
@Nonnull final CharSequence paramString = null == args ? "" : Arrays.stream(args).map(CudaSystem::renderToLog).reduce((a, b) -> a + ", " + b).orElse("");
final String message = String.format("%.6f @ %s(%d): %s(%s) = %s via [%s]", (System.nanoTime() - CudaSystem.start) / 1e9, Thread.currentThread().getName(), getThreadDeviceId(), method, paramString, result, callstack);
try {
CudaSystem.apiLog.forEach(apiLog -> CudaSystem.logThread.submit(() -> apiLog.accept(message)));
} catch (ConcurrentModificationException e) {
}
}
public static boolean isThreadDeviceId(int deviceId) {
Integer integer = getThreadDeviceId();
return integer != null && (deviceId == integer);
}
public static boolean isEnabled() {
return 0 < getCachedDeviceCount();
}
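/**
 * Runs {@code fn} with a {@link CudnnHandle} bound to {@code deviceId}: the thread-local handle
 * is reused when it already targets that device, otherwise a handle is borrowed from the
 * device's pool. The previous thread-local handle and device binding are restored afterwards.
 * A minimal usage sketch, assuming device 0 is present:
 * <pre>{@code
 * CudaSystem.withDevice(0, gpu -> {
 *   // gpu is a CudnnHandle bound to device 0 for the duration of this lambda
 *   int deviceId = gpu.getDeviceId();
 * });
 * }</pre>
 */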
public static void withDevice(int deviceId, @Nonnull final Consumer<CudnnHandle> fn) {
CudnnHandle threadlocal = CudnnHandle.threadContext.get();
final Integer incumbantDevice = getThreadDeviceId();
try {
if (threadlocal != null && threadlocal.getDeviceId() == deviceId) {
assert CudaSystem.isThreadDeviceId(threadlocal.getDeviceId());
fn.accept(threadlocal);
} else {
getPool(deviceId).apply(gpu -> {
gpu.wrap(() -> {
fn.accept(gpu);
return null;
}).get();
});
}
} finally {
if (null == threadlocal) CudnnHandle.threadContext.remove();
else CudnnHandle.threadContext.set(threadlocal);
if (null != incumbantDevice) CudaDevice.setDevice(incumbantDevice);
}
}
public static <T> T withDevice(int deviceId, @Nonnull Function<CudnnHandle, T> action) {
CudnnHandle threadlocal = CudnnHandle.threadContext.get();
final Integer incumbantDevice = getThreadDeviceId();
try {
if (threadlocal != null && threadlocal.getDeviceId() == deviceId) {
return action.apply(threadlocal);
} else {
return getPool(deviceId).apply(gpu -> {
return gpu.wrap(() -> action.apply(gpu)).get();
});
}
} finally {
if (null == threadlocal) CudnnHandle.threadContext.remove();
else CudnnHandle.threadContext.set(threadlocal);
if (null != incumbantDevice) CudaDevice.setDevice(incumbantDevice);
}
}
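/**
 * Like {@link #withDevice(int, Consumer)}, but without a caller-chosen device: a thread-local
 * {@link CudnnHandle} is reused when present, otherwise {@link #chooseDevice(Object[])} picks a
 * device from the supplied hints and a pooled handle is borrowed for the duration of {@code fn}.
 */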
public static void run(@Nonnull final Consumer<CudnnHandle> fn, Object... hints) {
CudnnHandle threadlocal = CudnnHandle.threadContext.get();
final Integer incumbantDevice = getThreadDeviceId();
try {
if (threadlocal != null) {
assert isThreadDeviceId(threadlocal.getDeviceId());
fn.accept(threadlocal);
} else {
int device = chooseDevice(hints);
getPool(device).apply(gpu -> {
return gpu.wrap(() -> {
fn.accept(gpu);
return null;
}).get();
});
}
} finally {
if (null == threadlocal) CudnnHandle.threadContext.remove();
else CudnnHandle.threadContext.set(threadlocal);
if (null != incumbantDevice) CudaDevice.setDevice(incumbantDevice);
}
}
public static CudnnHandle getThreadHandle() {
return CudnnHandle.threadContext.get();
}
public static <T> T run(@Nonnull final Function<CudnnHandle, T> fn, Object... hints) {
CudnnHandle threadlocal = CudnnHandle.threadContext.get();
final Integer incumbantDevice = getThreadDeviceId();
try {
if (threadlocal != null) {
assert CudaDevice.isThreadDeviceId(threadlocal.getDeviceId());
T result = fn.apply(threadlocal);
return result;
} else {
int device = chooseDevice(hints);
assert device >= 0;
return getPool(device).apply(gpu -> {
return gpu.wrap(() -> fn.apply(gpu)).get();
});
}
} finally {
if (null == threadlocal) CudnnHandle.threadContext.remove();
else CudnnHandle.threadContext.set(threadlocal);
if (null != incumbantDevice) CudaDevice.setDevice(incumbantDevice);
}
}
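/**
 * Resolves a device id from the supplied hints: {@link Result} inputs backed by a
 * {@code CudaTensorList}, {@link CudaDeviceResource} instances, and explicit {@link Integer}
 * ids each contribute their device. When no hint resolves, a device is drawn from
 * {@code CudaSettings.defaultDevices} if configured, or picked at random from the visible
 * devices otherwise.
 */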
public static int chooseDevice(final Object[] hints) {
Set<Integer> devices = Arrays.stream(hints).map(hint -> {
if (hint instanceof Result) {
TensorList data = ((Result) hint).getData();
if (data instanceof CudaTensorList) {
int deviceId = ((CudaTensorList) data).getDeviceId();
assert deviceId >= 0;
return deviceId;
}
} else if (hint instanceof CudaDeviceResource) {
int deviceId = ((CudaDeviceResource) hint).getDeviceId();
//assert deviceId >= 0 : String.format("%s/%d", hint.getClass(), deviceId);
if (deviceId >= 0) return deviceId;
} else if (hint instanceof Integer) {
Integer deviceId = (Integer) hint;
assert deviceId >= 0;
return deviceId;
}
return null;
}).filter(x -> x != null).collect(Collectors.toSet());
if (devices.isEmpty()) {
List<String> candidates = Arrays.stream(CudaSettings.INSTANCE().defaultDevices.split(",")).map(x -> x.trim()).filter(x -> !x.isEmpty()).collect(Collectors.toList());
if (candidates.isEmpty()) {
int deviceId = (int) Math.floor(Math.random() * getCachedDeviceCount());
assert deviceId >= 0;
return deviceId;
} else {
return Integer.parseInt(candidates.get((int) (Math.random() * candidates.size())));
}
} else {
Integer deviceId = devices.stream().findAny().get();
assert deviceId >= 0;
return deviceId;
}
}
static int init() {
if (CudaSettings.INSTANCE().isDisable()) {
CudaDevice.logger.warn("Disabled CudaSystem");
}
final int deviceCount;
deviceCount = getDeviceCount();
for (int d = 0; d < deviceCount; d++) {
initDevice(d);
}
return deviceCount;
}
private static void initDevice(final int deviceNumber) {
CudaDevice.setDevice(deviceNumber);
CudaDevice.logger.info(String.format("Device %s - %s", deviceNumber, CudaDevice.getDeviceName(deviceNumber)));
try {
//CudaSystem.handle(CudaSystem.cudaSetDeviceFlags(JCuda.cudaDeviceScheduleBlockingSync));
} catch (Throwable e) {
CudaDevice.logger.warn("Error initializing GPU", e);
throw new RuntimeException(e);
}
for (@Nonnull DeviceLimits limit : DeviceLimits.values()) {
CudaDevice.logger.info(String.format("Default Limit %s = %s", limit, limit.get()));
}
DeviceLimits.HeapSize.set(16L * 1024 * 1024 * 1024); // 16 GiB; long arithmetic avoids silent int overflow
DeviceLimits.FifoSize.set(8 * 1024 * 1024);
for (@Nonnull DeviceLimits limit : DeviceLimits.values()) {
CudaDevice.logger.info(String.format("Configured Limit %s = %s", limit, limit.get()));
}
}
private static int getDeviceCount() {
final int deviceCount;
if (CudaSettings.INSTANCE().isForceSingleGpu()) {
CudaDevice.logger.warn("Forcing Single-GPU Mode");
deviceCount = 1;
} else {
deviceCount = CudaSystem.deviceCount();
}
CudaDevice.logger.info(String.format("Found %s devices", deviceCount));
return deviceCount;
}
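/**
 * Issues a device-wide {@link #cudaDeviceSynchronize()} on {@code device}, but only if the last
 * recorded synchronization for that device started before {@code time}; the {@code syncTimes}
 * bookkeeping makes repeated requests for an already-synchronized point cheap no-ops.
 */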
public static void synchronize(long time, int device) {
long startTime = System.nanoTime();
Long val = syncTimes.get(device);
if (null == val) val = 0L;
if (null == val || val < time) {
final Long finalVal = val;
CharSequence caller = !CudaSettings.INSTANCE().isProfileMemoryIO() ? "" : Util.getCaller();
withDevice(device, gpu -> {
if (null == finalVal || finalVal < time) {
TimedResult<Long> timedResult = TimedResult.time(() -> cudaDeviceSynchronize());
CudaTensorList.logger.debug(String.format("Synchronized %d in %.4f (%.6f -> %.6f -> %.6f) via %s", getThreadDeviceId(), timedResult.seconds(), (finalVal - startTime) / 1e9, (time - startTime) / 1e9, (timedResult.result - startTime) / 1e9, caller));
// synchronized (deviceLocks.computeIfAbsent(device, d -> new Object())) {
// if (null == finalVal || finalVal < time) {
// }
// }
}
});
}
}
public static long cudaDeviceSynchronize() {
long startTime = System.nanoTime();
final int result = JCuda.cudaDeviceSynchronize();
log("cudaDeviceSynchronize", result, new Object[]{});
cudaDeviceSynchronize_execution.accept((System.nanoTime() - startTime) / 1e9);
handle(result);
syncTimes.put(getThreadDeviceId(), startTime);
return startTime;
}
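/**
 * Returns the lazily created {@link CudnnHandle} pool for the given device; the pool size is
 * taken from {@code CudaSettings.getHandlesPerDevice()}.
 */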
public static ResourcePool<CudnnHandle> getPool(final int deviceId) {
assert deviceId >= 0;
return handlePools.computeIfAbsent(deviceId, d -> {
return new ResourcePool<CudnnHandle>(CudaSettings.INSTANCE().getHandlesPerDevice()) {
@Override
public CudnnHandle create() {
return new CudnnHandle(deviceId);
}
};
});
}
public static int getCachedDeviceCount() {
if (null == cachedDeviceCount) {
synchronized (CudaSystem.class) {
if (null == cachedDeviceCount) {
cachedDeviceCount = init();
}
}
}
return cachedDeviceCount;
}
protected void cleanup() {
CudnnHandle.threadContext.remove();
}
public interface CudaDeviceResource {
int getDeviceId();
}
}