// Targeted by JavaCPP version 1.4.1: DO NOT EDIT THIS FILE

package org.bytedeco.javacpp;

import java.nio.*;
import org.bytedeco.javacpp.*;
import org.bytedeco.javacpp.annotation.*;

public class cuda extends org.bytedeco.javacpp.presets.cuda {
    static { Loader.load(); }

// Parsed from <cuda.h>

/*
 * Copyright 1993-2014 NVIDIA Corporation.  All rights reserved.
 *
 * NOTICE TO LICENSEE:
 *
 * This source code and/or documentation ("Licensed Deliverables") are
 * subject to NVIDIA intellectual property rights under U.S. and
 * international Copyright laws.
 *
 * These Licensed Deliverables contained herein is PROPRIETARY and
 * CONFIDENTIAL to NVIDIA and is being provided under the terms and
 * conditions of a form of NVIDIA software license agreement by and
 * between NVIDIA and Licensee ("License Agreement") or electronically
 * accepted by Licensee.  Notwithstanding any terms or conditions to
 * the contrary in the License Agreement, reproduction or disclosure
 * of the Licensed Deliverables to any third party without the express
 * written consent of NVIDIA is prohibited.
 *
 * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
 * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
 * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
 * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
 * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
 * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
 * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
 * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
 * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
 * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
 * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
 * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
 * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
 * OF THESE LICENSED DELIVERABLES.
 *
 * U.S. Government End Users.  These Licensed Deliverables are a
 * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
 * 1995), consisting of "commercial computer software" and "commercial
 * computer software documentation" as such terms are used in 48
 * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
 * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
 * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
 * U.S. Government End Users acquire the Licensed Deliverables with
 * only those rights set forth herein.
 *
 * Any use of the Licensed Deliverables in individual and commercial
 * software must include, in the user documentation and internal
 * comments to the code, the above Disclaimer and U.S. Government End
 * Users Notice.
 */

// #ifndef __cuda_cuda_h__
// #define __cuda_cuda_h__

// #include 
// #ifdef _MSC_VER
// #else
// #include 
// #endif

/**
 * CUDA API versioning support
 */
// #if defined(CUDA_FORCE_API_VERSION)
// #else
    public static final int __CUDA_API_VERSION = 8000;
// #endif /* CUDA_FORCE_API_VERSION */

// #if defined(__CUDA_API_VERSION_INTERNAL) || defined(CUDA_API_PER_THREAD_DEFAULT_STREAM)
//     #define __CUDA_API_PER_THREAD_DEFAULT_STREAM
//     #define __CUDA_API_PTDS(api) api ## _ptds
//     #define __CUDA_API_PTSZ(api) api ## _ptsz
// #else
//     #define __CUDA_API_PTDS(api) api
//     #define __CUDA_API_PTSZ(api) api
// #endif

// #if defined(__CUDA_API_VERSION_INTERNAL) || __CUDA_API_VERSION >= 3020
// #endif /* __CUDA_API_VERSION_INTERNAL || __CUDA_API_VERSION >= 3020 */
// #if defined(__CUDA_API_VERSION_INTERNAL) || __CUDA_API_VERSION >= 4000
// #endif /* __CUDA_API_VERSION_INTERNAL || __CUDA_API_VERSION >= 4000 */
// #if defined(__CUDA_API_VERSION_INTERNAL) || __CUDA_API_VERSION >= 4010
// #endif /* __CUDA_API_VERSION_INTERNAL || __CUDA_API_VERSION >= 4010 */
// #if defined(__CUDA_API_VERSION_INTERNAL) || __CUDA_API_VERSION >= 6050
// #endif /* __CUDA_API_VERSION_INTERNAL || __CUDA_API_VERSION >= 6050 */
// #if defined(__CUDA_API_VERSION_INTERNAL) || __CUDA_API_VERSION >= 6050
// #endif /* __CUDA_API_VERSION_INTERNAL || __CUDA_API_VERSION >= 6050 */

// #if !defined(__CUDA_API_VERSION_INTERNAL)
// #if defined(__CUDA_API_VERSION) && __CUDA_API_VERSION >= 3020 && __CUDA_API_VERSION < 4010
// #endif /* __CUDA_API_VERSION && __CUDA_API_VERSION >= 3020 && __CUDA_API_VERSION < 4010 */
// #endif /* __CUDA_API_VERSION_INTERNAL */

// #if defined(__CUDA_API_PER_THREAD_DEFAULT_STREAM)
// #endif

/**
 * \file cuda.h
 * \brief Header file for the CUDA Toolkit application programming interface.
 *
 * \file cudaGL.h
 * \brief Header file for the OpenGL interoperability functions of the
 * low-level CUDA driver application programming interface.
 *
 * \file cudaD3D9.h
 * \brief Header file for the Direct3D 9 interoperability functions of the
 * low-level CUDA driver application programming interface.
 */

/**
 * \defgroup CUDA_TYPES Data types used by CUDA driver
 * \{
 */

/**
 * CUDA API version number
 */
public static final int CUDA_VERSION = 8000;

// #ifdef __cplusplus
// #endif

/**
 * CUDA device pointer
 * CUdeviceptr is defined as an unsigned integer type whose size matches the size of a pointer on the target platform.
 */ 
// #if __CUDA_API_VERSION >= 3020

// #if defined(_WIN64) || defined(__LP64__)
// #else
// #endif

// #endif /* __CUDA_API_VERSION >= 3020 */

/** CUDA device */
/** CUDA context */
@Opaque public static class CUctx_st extends Pointer {
    /** Empty constructor. Calls {@code super((Pointer)null)}. */
    public CUctx_st() { super((Pointer)null); }
    /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */
    public CUctx_st(Pointer p) { super(p); }
}
/** CUDA module */
@Opaque public static class CUmod_st extends Pointer {
    /** Empty constructor. Calls {@code super((Pointer)null)}. */
    public CUmod_st() { super((Pointer)null); }
    /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */
    public CUmod_st(Pointer p) { super(p); }
}
/** CUDA function */
@Opaque public static class CUfunc_st extends Pointer {
    /** Empty constructor. Calls {@code super((Pointer)null)}. */
    public CUfunc_st() { super((Pointer)null); }
    /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */
    public CUfunc_st(Pointer p) { super(p); }
}
/** CUDA array */
@Opaque public static class CUarray_st extends Pointer {
    /** Empty constructor. Calls {@code super((Pointer)null)}. */
    public CUarray_st() { super((Pointer)null); }
    /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */
    public CUarray_st(Pointer p) { super(p); }
}
/** CUDA mipmapped array */
@Opaque public static class CUmipmappedArray_st extends Pointer {
    /** Empty constructor. Calls {@code super((Pointer)null)}. */
    public CUmipmappedArray_st() { super((Pointer)null); }
    /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */
    public CUmipmappedArray_st(Pointer p) { super(p); }
}
/** CUDA texture reference */
@Opaque public static class CUtexref_st extends Pointer {
    /** Empty constructor. Calls {@code super((Pointer)null)}. */
    public CUtexref_st() { super((Pointer)null); }
    /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */
    public CUtexref_st(Pointer p) { super(p); }
}
/** CUDA surface reference */
@Opaque public static class CUsurfref_st extends Pointer {
    /** Empty constructor. Calls {@code super((Pointer)null)}. */
    public CUsurfref_st() { super((Pointer)null); }
    /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */
    public CUsurfref_st(Pointer p) { super(p); }
}
/** CUDA event */
@Opaque public static class CUevent_st extends Pointer {
    /** Empty constructor. Calls {@code super((Pointer)null)}. */
    public CUevent_st() { super((Pointer)null); }
    /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */
    public CUevent_st(Pointer p) { super(p); }
}
/** CUDA stream */
@Opaque public static class CUstream_st extends Pointer {
    /** Empty constructor. Calls {@code super((Pointer)null)}. */
    public CUstream_st() { super((Pointer)null); }
    /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */
    public CUstream_st(Pointer p) { super(p); }
}
/** CUDA graphics interop resource */
@Opaque public static class CUgraphicsResource_st extends Pointer {
    /** Empty constructor. Calls {@code super((Pointer)null)}. */
    public CUgraphicsResource_st() { super((Pointer)null); }
    /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */
    public CUgraphicsResource_st(Pointer p) { super(p); }
}
/** An opaque value that represents a CUDA texture object */
/** An opaque value that represents a CUDA surface object */

public static class CUuuid extends Pointer {
    static { Loader.load(); }
    /** Default native constructor. */
    public CUuuid() { super((Pointer)null); allocate(); }
    /** Native array allocator. Access with {@link Pointer#position(long)}. */
    public CUuuid(long size) { super((Pointer)null); allocateArray(size); }
    /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */
    public CUuuid(Pointer p) { super(p); }
    private native void allocate();
    private native void allocateArray(long size);
    @Override public CUuuid position(long position) {
        return (CUuuid)super.position(position);
    }

    public native @Cast("char") byte bytes(int i); public native CUuuid bytes(int i, byte bytes);
    @MemberGetter public native @Cast("char*") BytePointer bytes();
}


// #if __CUDA_API_VERSION >= 4010

/**
 * CUDA IPC handle size 
 */
public static final int CU_IPC_HANDLE_SIZE = 64;

/**
 * CUDA IPC event handle
 */
public static class CUipcEventHandle extends Pointer {
    static { Loader.load(); }
    /** Default native constructor. */
    public CUipcEventHandle() { super((Pointer)null); allocate(); }
    /** Native array allocator. Access with {@link Pointer#position(long)}. */
    public CUipcEventHandle(long size) { super((Pointer)null); allocateArray(size); }
    /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */
    public CUipcEventHandle(Pointer p) { super(p); }
    private native void allocate();
    private native void allocateArray(long size);
    @Override public CUipcEventHandle position(long position) {
        return (CUipcEventHandle)super.position(position);
    }

    public native @Cast("char") byte reserved(int i); public native CUipcEventHandle reserved(int i, byte reserved);
    @MemberGetter public native @Cast("char*") BytePointer reserved();
}

/**
 * CUDA IPC mem handle
 */
public static class CUipcMemHandle extends Pointer {
    static { Loader.load(); }
    /** Default native constructor. */
    public CUipcMemHandle() { super((Pointer)null); allocate(); }
    /** Native array allocator. Access with {@link Pointer#position(long)}. */
    public CUipcMemHandle(long size) { super((Pointer)null); allocateArray(size); }
    /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */
    public CUipcMemHandle(Pointer p) { super(p); }
    private native void allocate();
    private native void allocateArray(long size);
    @Override public CUipcMemHandle position(long position) {
        return (CUipcMemHandle)super.position(position);
    }

    public native @Cast("char") byte reserved(int i); public native CUipcMemHandle reserved(int i, byte reserved);
    @MemberGetter public native @Cast("char*") BytePointer reserved();
}

/**
 * CUDA Ipc Mem Flags
 */
/** enum CUipcMem_flags_enum */
public static final int
    /** Automatically enable peer access between remote devices as needed */
    CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS = 0x1;
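
// Illustrative sketch (not part of the generated bindings): sharing a device
// allocation between processes with the IPC handles above. The cuMemAlloc,
// cuIpcGetMemHandle and cuIpcOpenMemHandle wrappers are assumed to be declared
// further down in this class with the usual JavaCPP overloads (CUdeviceptr
// mapping to long / long[]); error checking is omitted for brevity.
public static CUipcMemHandle exportAllocationSketch(long bytes) {
    long[] devPtr = new long[1];
    cuMemAlloc(devPtr, bytes);                       // allocate in the exporting process
    CUipcMemHandle handle = new CUipcMemHandle();
    cuIpcGetMemHandle(handle, devPtr[0]);            // handle can be sent to another process
    return handle;
}
public static long importAllocationSketch(CUipcMemHandle handle) {
    long[] devPtr = new long[1];
    // the importing process maps the same allocation into its own address space
    cuIpcOpenMemHandle(devPtr, handle, CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS);
    return devPtr[0];
}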

// #endif

/**
 * CUDA Mem Attach Flags
 */
/** enum CUmemAttach_flags_enum */
public static final int
    /** Memory can be accessed by any stream on any device */
    CU_MEM_ATTACH_GLOBAL = 0x1,
    /** Memory cannot be accessed by any stream on any device */
    CU_MEM_ATTACH_HOST   = 0x2,
    /** Memory can only be accessed by a single stream on the associated device */
    CU_MEM_ATTACH_SINGLE = 0x4;
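
// Illustrative sketch (not part of the generated bindings): allocating managed
// (unified) memory that is visible to every stream and device, using the attach
// flag above. Assumes the cuMemAllocManaged wrapper declared later in this class,
// with CUdeviceptr* mapped to long[] as JavaCPP usually does.
public static long allocManagedSketch(long bytes) {
    long[] devPtr = new long[1];
    int err = cuMemAllocManaged(devPtr, bytes, CU_MEM_ATTACH_GLOBAL);
    return err == CUDA_SUCCESS ? devPtr[0] : 0L;
}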

/**
 * Context creation flags
 */
/** enum CUctx_flags_enum */
public static final int
    /** Automatic scheduling */
    CU_CTX_SCHED_AUTO          = 0x00,
    /** Set spin as default scheduling */
    CU_CTX_SCHED_SPIN          = 0x01,
    /** Set yield as default scheduling */
    CU_CTX_SCHED_YIELD         = 0x02,
    /** Set blocking synchronization as default scheduling */
    CU_CTX_SCHED_BLOCKING_SYNC = 0x04,
    /** Set blocking synchronization as default scheduling
                                         *  @deprecated This flag was deprecated as of CUDA 4.0
                                         *  and was replaced with ::CU_CTX_SCHED_BLOCKING_SYNC. */
    CU_CTX_BLOCKING_SYNC       = 0x04,
    CU_CTX_SCHED_MASK          = 0x07, 
    /** Support mapped pinned allocations */
    CU_CTX_MAP_HOST            = 0x08,
    /** Keep local memory allocation after launch */
    CU_CTX_LMEM_RESIZE_TO_MAX  = 0x10,
    CU_CTX_FLAGS_MASK          = 0x1f;
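
// Illustrative sketch (not part of the generated bindings): creating a context on
// device 0 that uses blocking synchronization and allows mapped pinned host
// allocations. cuInit, cuDeviceGet and cuCtxCreate are assumed to be the wrappers
// declared later in this class.
public static CUctx_st createBlockingContextSketch() {
    cuInit(0);
    int[] device = new int[1];
    cuDeviceGet(device, 0);
    CUctx_st context = new CUctx_st();
    cuCtxCreate(context, CU_CTX_SCHED_BLOCKING_SYNC | CU_CTX_MAP_HOST, device[0]);
    return context;
}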

/**
 * Stream creation flags
 */
/** enum CUstream_flags_enum */
public static final int
    /** Default stream flag */
    CU_STREAM_DEFAULT      = 0x0,
    /** Stream does not synchronize with stream 0 (the NULL stream) */
    CU_STREAM_NON_BLOCKING = 0x1;

/**
 * Legacy stream handle
 *
 * Stream handle that can be passed as a CUstream to use an implicit stream
 * with legacy synchronization behavior.
 *
 * See details of the \link_sync_behavior
 */
public static native @MemberGetter CUstream_st CU_STREAM_LEGACY();
public static final CUstream_st CU_STREAM_LEGACY = CU_STREAM_LEGACY();

/**
 * Per-thread stream handle
 *
 * Stream handle that can be passed as a CUstream to use an implicit stream
 * with per-thread synchronization behavior.
 *
 * See details of the \link_sync_behavior
 */
public static native @MemberGetter CUstream_st CU_STREAM_PER_THREAD();
public static final CUstream_st CU_STREAM_PER_THREAD = CU_STREAM_PER_THREAD();
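
// Illustrative sketch (not part of the generated bindings): the special handles
// above can be passed anywhere a CUstream is expected, e.g. to make per-thread
// default-stream work wait on an explicitly created non-blocking stream via an
// event. cuStreamCreate, cuEventCreate, cuEventRecord and cuStreamWaitEvent are
// assumed to be the wrappers declared later in this class.
public static void perThreadStreamSketch() {
    CUstream_st worker = new CUstream_st();
    cuStreamCreate(worker, CU_STREAM_NON_BLOCKING);   // does not synchronize with stream 0
    CUevent_st done = new CUevent_st();
    cuEventCreate(done, CU_EVENT_DISABLE_TIMING);
    cuEventRecord(done, worker);
    // the implicit per-thread default stream waits for the worker stream
    cuStreamWaitEvent(CU_STREAM_PER_THREAD, done, 0);
}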

/**
 * Event creation flags
 */
/** enum CUevent_flags_enum */
public static final int
    /** Default event flag */
    CU_EVENT_DEFAULT        = 0x0,
    /** Event uses blocking synchronization */
    CU_EVENT_BLOCKING_SYNC  = 0x1,
    /** Event will not record timing data */
    CU_EVENT_DISABLE_TIMING = 0x2,
    /** Event is suitable for interprocess use. CU_EVENT_DISABLE_TIMING must be set */
    CU_EVENT_INTERPROCESS   = 0x4;

// #if __CUDA_API_VERSION >= 8000
/**
 * Flags for ::cuStreamWaitValue32
 */
/** enum CUstreamWaitValue_flags_enum */
public static final int
    /** Wait until (int32_t)(*addr - value) >= 0. Note this is a
                                             cyclic comparison which ignores wraparound. (Default behavior.) */
    CU_STREAM_WAIT_VALUE_GEQ   = 0x0,
    /** Wait until *addr == value. */
    CU_STREAM_WAIT_VALUE_EQ    = 0x1,
    /** Wait until (*addr & value) != 0. */
    CU_STREAM_WAIT_VALUE_AND   = 0x2,
    /** Follow the wait operation with a flush of outstanding remote writes. This
                                             means that, if a remote write operation is guaranteed to have reached the
                                             device before the wait can be satisfied, that write is guaranteed to be
                                             visible to downstream device work. The device is permitted to reorder
                                             remote writes internally. For example, this flag would be required if
                                             two remote writes arrive in a defined order, the wait is satisfied by the
                                             second write, and downstream work needs to observe the first write. */
    CU_STREAM_WAIT_VALUE_FLUSH = 1<<30;
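
// Illustrative sketch (not part of the generated bindings): queueing a wait until a
// 32-bit word in device-accessible memory reaches a value, flushing remote writes
// before any dependent work. cuStreamWaitValue32 is assumed to be the wrapper
// declared later in this class (CUdeviceptr and cuuint32_t map to long and int).
public static void waitForFlagSketch(CUstream_st stream, long flagDevPtr) {
    cuStreamWaitValue32(stream, flagDevPtr, 1,
            CU_STREAM_WAIT_VALUE_GEQ | CU_STREAM_WAIT_VALUE_FLUSH);
    // work launched on `stream` after this point only runs once *flagDevPtr >= 1
}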

/**
 * Flags for ::cuStreamWriteValue32
 */
/** enum CUstreamWriteValue_flags_enum */
public static final int
    /** Default behavior */
    CU_STREAM_WRITE_VALUE_DEFAULT           = 0x0,
    /** Permits the write to be reordered with writes which were issued
                                                        before it, as a performance optimization. Normally,
                                                        ::cuStreamWriteValue32 will provide a memory fence before the
                                                        write, which has similar semantics to
                                                        __threadfence_system() but is scoped to the stream
                                                        rather than a CUDA thread. */
    CU_STREAM_WRITE_VALUE_NO_MEMORY_BARRIER = 0x1;

/**
 * Operations for ::cuStreamBatchMemOp
 */
/** enum CUstreamBatchMemOpType_enum */
public static final int
    /** Represents a ::cuStreamWaitValue32 operation */
    CU_STREAM_MEM_OP_WAIT_VALUE_32  = 1,
    /** Represents a ::cuStreamWriteValue32 operation */
    CU_STREAM_MEM_OP_WRITE_VALUE_32 = 2,
    /** This has the same effect as ::CU_STREAM_WAIT_VALUE_FLUSH, but as a
                                                  standalone operation. */
    CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES = 3;

/**
 * Per-operation parameters for ::cuStreamBatchMemOp
 */
public static class CUstreamBatchMemOpParams extends Pointer {
    static { Loader.load(); }
    /** Default native constructor. */
    public CUstreamBatchMemOpParams() { super((Pointer)null); allocate(); }
    /** Native array allocator. Access with {@link Pointer#position(long)}. */
    public CUstreamBatchMemOpParams(long size) { super((Pointer)null); allocateArray(size); }
    /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */
    public CUstreamBatchMemOpParams(Pointer p) { super(p); }
    private native void allocate();
    private native void allocateArray(long size);
    @Override public CUstreamBatchMemOpParams position(long position) {
        return (CUstreamBatchMemOpParams)super.position(position);
    }

    public native @Cast("CUstreamBatchMemOpType") int operation(); public native CUstreamBatchMemOpParams operation(int operation);
        @Name("waitValue.operation") public native @Cast("CUstreamBatchMemOpType") int waitValue_operation(); public native CUstreamBatchMemOpParams waitValue_operation(int waitValue_operation);
        @Name("waitValue.address") public native @Cast("CUdeviceptr") long waitValue_address(); public native CUstreamBatchMemOpParams waitValue_address(long waitValue_address);
            @Name("waitValue.value") public native @Cast("cuuint32_t") int waitValue_value(); public native CUstreamBatchMemOpParams waitValue_value(int waitValue_value);
            @Name("waitValue.pad") public native @Cast("cuuint64_t") int waitValue_pad(); public native CUstreamBatchMemOpParams waitValue_pad(int waitValue_pad);
        @Name("waitValue.flags") public native @Cast("unsigned int") int waitValue_flags(); public native CUstreamBatchMemOpParams waitValue_flags(int waitValue_flags);
        /** For driver internal use. Initial value is unimportant. */
        @Name("waitValue.alias") public native @Cast("CUdeviceptr") long waitValue_alias(); public native CUstreamBatchMemOpParams waitValue_alias(long waitValue_alias);
        @Name("writeValue.operation") public native @Cast("CUstreamBatchMemOpType") int writeValue_operation(); public native CUstreamBatchMemOpParams writeValue_operation(int writeValue_operation);
        @Name("writeValue.address") public native @Cast("CUdeviceptr") long writeValue_address(); public native CUstreamBatchMemOpParams writeValue_address(long writeValue_address);
            @Name("writeValue.value") public native @Cast("cuuint32_t") int writeValue_value(); public native CUstreamBatchMemOpParams writeValue_value(int writeValue_value);
            @Name("writeValue.pad") public native @Cast("cuuint64_t") int writeValue_pad(); public native CUstreamBatchMemOpParams writeValue_pad(int writeValue_pad);
        @Name("writeValue.flags") public native @Cast("unsigned int") int writeValue_flags(); public native CUstreamBatchMemOpParams writeValue_flags(int writeValue_flags);
        /** For driver internal use. Initial value is unimportant. */
        @Name("writeValue.alias") public native @Cast("CUdeviceptr") long writeValue_alias(); public native CUstreamBatchMemOpParams writeValue_alias(long writeValue_alias);
        @Name("flushRemoteWrites.operation") public native @Cast("CUstreamBatchMemOpType") int flushRemoteWrites_operation(); public native CUstreamBatchMemOpParams flushRemoteWrites_operation(int flushRemoteWrites_operation);
        @Name("flushRemoteWrites.flags") public native @Cast("unsigned int") int flushRemoteWrites_flags(); public native CUstreamBatchMemOpParams flushRemoteWrites_flags(int flushRemoteWrites_flags);
    public native @Cast("cuuint64_t") int pad(int i); public native CUstreamBatchMemOpParams pad(int i, int pad);
    @MemberGetter public native @Cast("cuuint64_t*") IntPointer pad();
}
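
// Illustrative sketch (not part of the generated bindings): filling the union above
// for one wait and one write operation and submitting both in a single batch.
// cuStreamBatchMemOp is assumed to be the wrapper declared later in this class, and
// the two-element array is addressed through Pointer.position(long).
public static void batchMemOpSketch(CUstream_st stream, long flagDevPtr, long doneDevPtr) {
    CUstreamBatchMemOpParams ops = new CUstreamBatchMemOpParams(2);
    ops.position(0).operation(CU_STREAM_MEM_OP_WAIT_VALUE_32)
       .waitValue_address(flagDevPtr).waitValue_value(1)
       .waitValue_flags(CU_STREAM_WAIT_VALUE_GEQ);
    ops.position(1).operation(CU_STREAM_MEM_OP_WRITE_VALUE_32)
       .writeValue_address(doneDevPtr).writeValue_value(1)
       .writeValue_flags(CU_STREAM_WRITE_VALUE_DEFAULT);
    cuStreamBatchMemOp(stream, 2, ops.position(0), 0);
}
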
// #endif /* __CUDA_API_VERSION >= 8000 */

/**
 * Occupancy calculator flag
 */
/** enum CUoccupancy_flags_enum */
public static final int
    /** Default behavior */
    CU_OCCUPANCY_DEFAULT                  = 0x0,
    /** Assume global caching is enabled and cannot be automatically turned off */
    CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE = 0x1;

/**
 * Array formats
 */
/** enum CUarray_format_enum */
public static final int
    /** Unsigned 8-bit integers */
    CU_AD_FORMAT_UNSIGNED_INT8  = 0x01,
    /** Unsigned 16-bit integers */
    CU_AD_FORMAT_UNSIGNED_INT16 = 0x02,
    /** Unsigned 32-bit integers */
    CU_AD_FORMAT_UNSIGNED_INT32 = 0x03,
    /** Signed 8-bit integers */
    CU_AD_FORMAT_SIGNED_INT8    = 0x08,
    /** Signed 16-bit integers */
    CU_AD_FORMAT_SIGNED_INT16   = 0x09,
    /** Signed 32-bit integers */
    CU_AD_FORMAT_SIGNED_INT32   = 0x0a,
    /** 16-bit floating point */
    CU_AD_FORMAT_HALF           = 0x10,
    /** 32-bit floating point */
    CU_AD_FORMAT_FLOAT          = 0x20;

/**
 * Texture reference addressing modes
 */
/** enum CUaddress_mode_enum */
public static final int
    /** Wrapping address mode */
    CU_TR_ADDRESS_MODE_WRAP   = 0,
    /** Clamp to edge address mode */
    CU_TR_ADDRESS_MODE_CLAMP  = 1,
    /** Mirror address mode */
    CU_TR_ADDRESS_MODE_MIRROR = 2,
    /** Border address mode */
    CU_TR_ADDRESS_MODE_BORDER = 3;

/**
 * Texture reference filtering modes
 */
/** enum CUfilter_mode_enum */
public static final int
    /** Point filter mode */
    CU_TR_FILTER_MODE_POINT  = 0,
    /** Linear filter mode */
    CU_TR_FILTER_MODE_LINEAR = 1;
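
// Illustrative sketch (not part of the generated bindings): configuring a texture
// reference with the format, addressing and filtering enums above. The cuTexRefSet*
// wrappers are assumed to be declared later in this class.
public static void configureTexRefSketch(CUtexref_st texRef) {
    cuTexRefSetFormat(texRef, CU_AD_FORMAT_FLOAT, 1);            // one float per texel
    cuTexRefSetAddressMode(texRef, 0, CU_TR_ADDRESS_MODE_CLAMP); // dimension 0 (x)
    cuTexRefSetFilterMode(texRef, CU_TR_FILTER_MODE_LINEAR);
}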

/**
 * Device properties
 */
/** enum CUdevice_attribute_enum */
public static final int
    /** Maximum number of threads per block */
    CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 1,
    /** Maximum block dimension X */
    CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X = 2,
    /** Maximum block dimension Y */
    CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y = 3,
    /** Maximum block dimension Z */
    CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z = 4,
    /** Maximum grid dimension X */
    CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X = 5,
    /** Maximum grid dimension Y */
    CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y = 6,
    /** Maximum grid dimension Z */
    CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z = 7,
    /** Maximum shared memory available per block in bytes */
    CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK = 8,
    /** Deprecated, use CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK */
    CU_DEVICE_ATTRIBUTE_SHARED_MEMORY_PER_BLOCK = 8,
    /** Memory available on device for __constant__ variables in a CUDA C kernel in bytes */
    CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY = 9,
    /** Warp size in threads */
    CU_DEVICE_ATTRIBUTE_WARP_SIZE = 10,
    /** Maximum pitch in bytes allowed by memory copies */
    CU_DEVICE_ATTRIBUTE_MAX_PITCH = 11,
    /** Maximum number of 32-bit registers available per block */
    CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK = 12,
    /** Deprecated, use CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK */
    CU_DEVICE_ATTRIBUTE_REGISTERS_PER_BLOCK = 12,
    /** Typical clock frequency in kilohertz */
    CU_DEVICE_ATTRIBUTE_CLOCK_RATE = 13,
    /** Alignment requirement for textures */
    CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT = 14,
    /** Device can possibly copy memory and execute a kernel concurrently. Deprecated. Use instead CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT. */
    CU_DEVICE_ATTRIBUTE_GPU_OVERLAP = 15,
    /** Number of multiprocessors on device */
    CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT = 16,
    /** Specifies whether there is a run time limit on kernels */
    CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT = 17,
    /** Device is integrated with host memory */
    CU_DEVICE_ATTRIBUTE_INTEGRATED = 18,
    /** Device can map host memory into CUDA address space */
    CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY = 19,
    /** Compute mode (See ::CUcomputemode for details) */
    CU_DEVICE_ATTRIBUTE_COMPUTE_MODE = 20,
    /** Maximum 1D texture width */
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH = 21,
    /** Maximum 2D texture width */
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH = 22,
    /** Maximum 2D texture height */
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT = 23,
    /** Maximum 3D texture width */
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH = 24,
    /** Maximum 3D texture height */
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT = 25,
    /** Maximum 3D texture depth */
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH = 26,
    /** Maximum 2D layered texture width */
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH = 27,
    /** Maximum 2D layered texture height */
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT = 28,
    /** Maximum layers in a 2D layered texture */
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS = 29,
    /** Deprecated, use CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH */
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_WIDTH = 27,
    /** Deprecated, use CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT */
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_HEIGHT = 28,
    /** Deprecated, use CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS */
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_NUMSLICES = 29,
    /** Alignment requirement for surfaces */
    CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT = 30,
    /** Device can possibly execute multiple kernels concurrently */
    CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS = 31,
    /** Device has ECC support enabled */
    CU_DEVICE_ATTRIBUTE_ECC_ENABLED = 32,
    /** PCI bus ID of the device */
    CU_DEVICE_ATTRIBUTE_PCI_BUS_ID = 33,
    /** PCI device ID of the device */
    CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID = 34,
    /** Device is using TCC driver model */
    CU_DEVICE_ATTRIBUTE_TCC_DRIVER = 35,
    /** Peak memory clock frequency in kilohertz */
    CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE = 36,
    /** Global memory bus width in bits */
    CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH = 37,
    /** Size of L2 cache in bytes */
    CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE = 38,
    /** Maximum resident threads per multiprocessor */
    CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR = 39,
    /** Number of asynchronous engines */
    CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT = 40,
    /** Device shares a unified address space with the host */
    CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING = 41,    
    /** Maximum 1D layered texture width */
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH = 42,
    /** Maximum layers in a 1D layered texture */
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS = 43,
    /** Deprecated, do not use. */
    CU_DEVICE_ATTRIBUTE_CAN_TEX2D_GATHER = 44,
    /** Maximum 2D texture width if CUDA_ARRAY3D_TEXTURE_GATHER is set */
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_WIDTH = 45,
    /** Maximum 2D texture height if CUDA_ARRAY3D_TEXTURE_GATHER is set */
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_HEIGHT = 46,
    /** Alternate maximum 3D texture width */
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE = 47,
    /** Alternate maximum 3D texture height */
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE = 48,
    /** Alternate maximum 3D texture depth */
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE = 49,
    /** PCI domain ID of the device */
    CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID = 50,
    /** Pitch alignment requirement for textures */
    CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT = 51,
    /** Maximum cubemap texture width/height */
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_WIDTH = 52,
    /** Maximum cubemap layered texture width/height */
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH = 53,
    /** Maximum layers in a cubemap layered texture */
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS = 54,
    /** Maximum 1D surface width */
    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH = 55,
    /** Maximum 2D surface width */
    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH = 56,
    /** Maximum 2D surface height */
    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT = 57,
    /** Maximum 3D surface width */
    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH = 58,
    /** Maximum 3D surface height */
    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT = 59,
    /** Maximum 3D surface depth */
    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH = 60,
    /** Maximum 1D layered surface width */
    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_WIDTH = 61,
    /** Maximum layers in a 1D layered surface */
    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_LAYERS = 62,
    /** Maximum 2D layered surface width */
    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_WIDTH = 63,
    /** Maximum 2D layered surface height */
    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_HEIGHT = 64,
    /** Maximum layers in a 2D layered surface */
    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_LAYERS = 65,
    /** Maximum cubemap surface width */
    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_WIDTH = 66,
    /** Maximum cubemap layered surface width */
    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH = 67,
    /** Maximum layers in a cubemap layered surface */
    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS = 68,
    /** Maximum 1D linear texture width */
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH = 69,
    /** Maximum 2D linear texture width */
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH = 70,
    /** Maximum 2D linear texture height */
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT = 71,
    /** Maximum 2D linear texture pitch in bytes */
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH = 72,
    /** Maximum mipmapped 2D texture width */
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_WIDTH = 73,
    /** Maximum mipmapped 2D texture height */
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_HEIGHT = 74,
    /** Major compute capability version number */
    CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR = 75,     
    /** Minor compute capability version number */
    CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR = 76,
    /** Maximum mipmapped 1D texture width */
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH = 77,
    /** Device supports stream priorities */
    CU_DEVICE_ATTRIBUTE_STREAM_PRIORITIES_SUPPORTED = 78,
    /** Device supports caching globals in L1 */
    CU_DEVICE_ATTRIBUTE_GLOBAL_L1_CACHE_SUPPORTED = 79,
    /** Device supports caching locals in L1 */
    CU_DEVICE_ATTRIBUTE_LOCAL_L1_CACHE_SUPPORTED = 80,
    /** Maximum shared memory available per multiprocessor in bytes */
    CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR = 81,
    /** Maximum number of 32-bit registers available per multiprocessor */
    CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR = 82,
    /** Device can allocate managed memory on this system */
    CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY = 83,
    /** Device is on a multi-GPU board */
    CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD = 84, 
    /** Unique id for a group of devices on the same multi-GPU board */
    CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD_GROUP_ID = 85,
    /** Link between the device and the host supports native atomic operations (this is a placeholder attribute, and is not supported on any current hardware)*/
    CU_DEVICE_ATTRIBUTE_HOST_NATIVE_ATOMIC_SUPPORTED = 86,
    /** Ratio of single precision performance (in floating-point operations per second) to double precision performance */
    CU_DEVICE_ATTRIBUTE_SINGLE_TO_DOUBLE_PRECISION_PERF_RATIO = 87,
    /** Device supports coherently accessing pageable memory without calling cudaHostRegister on it */
    CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS = 88,
    /** Device can coherently access managed memory concurrently with the CPU */
    CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS = 89,
    /** Device supports compute preemption. */
    CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED = 90,
    /** Device can access host registered memory at the same virtual address as the CPU */
    CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM = 91,
    /** Sentinel; one greater than the largest valid attribute value */
    CU_DEVICE_ATTRIBUTE_MAX = 92;
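
// Illustrative sketch (not part of the generated bindings): querying one of the
// attributes above for device 0. cuInit, cuDeviceGet and cuDeviceGetAttribute are
// assumed to be the wrappers declared later in this class, with int* parameters
// exposed through int[] overloads as JavaCPP usually generates.
public static int maxThreadsPerBlockSketch() {
    cuInit(0);
    int[] device = new int[1];
    cuDeviceGet(device, 0);
    int[] value = new int[1];
    cuDeviceGetAttribute(value, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, device[0]);
    return value[0];
}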

/**
 * Legacy device properties
 */
public static class CUdevprop extends Pointer {
    static { Loader.load(); }
    /** Default native constructor. */
    public CUdevprop() { super((Pointer)null); allocate(); }
    /** Native array allocator. Access with {@link Pointer#position(long)}. */
    public CUdevprop(long size) { super((Pointer)null); allocateArray(size); }
    /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */
    public CUdevprop(Pointer p) { super(p); }
    private native void allocate();
    private native void allocateArray(long size);
    @Override public CUdevprop position(long position) {
        return (CUdevprop)super.position(position);
    }

    /** Maximum number of threads per block */
    public native int maxThreadsPerBlock(); public native CUdevprop maxThreadsPerBlock(int maxThreadsPerBlock);
    /** Maximum size of each dimension of a block */
    public native int maxThreadsDim(int i); public native CUdevprop maxThreadsDim(int i, int maxThreadsDim);
    @MemberGetter public native IntPointer maxThreadsDim();
    /** Maximum size of each dimension of a grid */
    public native int maxGridSize(int i); public native CUdevprop maxGridSize(int i, int maxGridSize);
    @MemberGetter public native IntPointer maxGridSize();
    /** Shared memory available per block in bytes */
    public native int sharedMemPerBlock(); public native CUdevprop sharedMemPerBlock(int sharedMemPerBlock);
    /** Constant memory available on device in bytes */
    public native int totalConstantMemory(); public native CUdevprop totalConstantMemory(int totalConstantMemory);
    /** Warp size in threads */
    public native int SIMDWidth(); public native CUdevprop SIMDWidth(int SIMDWidth);
    /** Maximum pitch in bytes allowed by memory copies */
    public native int memPitch(); public native CUdevprop memPitch(int memPitch);
    /** 32-bit registers available per block */
    public native int regsPerBlock(); public native CUdevprop regsPerBlock(int regsPerBlock);
    /** Clock frequency in kilohertz */
    public native int clockRate(); public native CUdevprop clockRate(int clockRate);
    /** Alignment requirement for textures */
    public native int textureAlign(); public native CUdevprop textureAlign(int textureAlign);
}
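
// Illustrative sketch (not part of the generated bindings): reading the legacy
// property struct above. cuDeviceGetProperties (deprecated in favor of
// cuDeviceGetAttribute) is assumed to be the wrapper declared later in this class.
public static String describeDeviceSketch(int device) {
    CUdevprop prop = new CUdevprop();
    cuDeviceGetProperties(prop, device);
    return "maxThreadsPerBlock=" + prop.maxThreadsPerBlock()
         + " sharedMemPerBlock=" + prop.sharedMemPerBlock()
         + " warpSize=" + prop.SIMDWidth();
}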

/**
 * Pointer information
 */
/** enum CUpointer_attribute_enum */
public static final int
    /** The ::CUcontext on which a pointer was allocated or registered */
    CU_POINTER_ATTRIBUTE_CONTEXT = 1,
    /** The ::CUmemorytype describing the physical location of a pointer */
    CU_POINTER_ATTRIBUTE_MEMORY_TYPE = 2,
    /** The address at which a pointer's memory may be accessed on the device */
    CU_POINTER_ATTRIBUTE_DEVICE_POINTER = 3,
    /** The address at which a pointer's memory may be accessed on the host */
    CU_POINTER_ATTRIBUTE_HOST_POINTER = 4,
    /** A pair of tokens for use with the nv-p2p.h Linux kernel interface */
    CU_POINTER_ATTRIBUTE_P2P_TOKENS = 5,
    /** Synchronize every synchronous memory operation initiated on this region */
    CU_POINTER_ATTRIBUTE_SYNC_MEMOPS = 6,
    /** A process-wide unique ID for an allocated memory region*/
    CU_POINTER_ATTRIBUTE_BUFFER_ID = 7,
    /** Indicates if the pointer points to managed memory */
    CU_POINTER_ATTRIBUTE_IS_MANAGED = 8;

/**
 * Function properties
 */
/** enum CUfunction_attribute_enum */
public static final int
    /**
     * The maximum number of threads per block, beyond which a launch of the
     * function would fail. This number depends on both the function and the
     * device on which the function is currently loaded.
     */
    CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 0,

    /**
     * The size in bytes of statically-allocated shared memory required by
     * this function. This does not include dynamically-allocated shared
     * memory requested by the user at runtime.
     */
    CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES = 1,

    /**
     * The size in bytes of user-allocated constant memory required by this
     * function.
     */
    CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES = 2,

    /**
     * The size in bytes of local memory used by each thread of this function.
     */
    CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES = 3,

    /**
     * The number of registers used by each thread of this function.
     */
    CU_FUNC_ATTRIBUTE_NUM_REGS = 4,

    /**
     * The PTX virtual architecture version for which the function was
     * compiled. This value is the major PTX version * 10 + the minor PTX
     * version, so a PTX version 1.3 function would return the value 13.
     * Note that this may return the undefined value of 0 for cubins
     * compiled prior to CUDA 3.0.
     */
    CU_FUNC_ATTRIBUTE_PTX_VERSION = 5,

    /**
     * The binary architecture version for which the function was compiled.
     * This value is the major binary version * 10 + the minor binary version,
     * so a binary version 1.3 function would return the value 13. Note that
     * this will return a value of 10 for legacy cubins that do not have a
     * properly-encoded binary architecture version.
     */
    CU_FUNC_ATTRIBUTE_BINARY_VERSION = 6,

    /**
     * The attribute to indicate whether the function has been compiled with
     * the user-specified option "-Xptxas --dlcm=ca" set.
     */
    CU_FUNC_ATTRIBUTE_CACHE_MODE_CA = 7,

    CU_FUNC_ATTRIBUTE_MAX = 8;
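
// Illustrative sketch (not part of the generated bindings): inspecting a loaded
// kernel with the attributes above. cuFuncGetAttribute is assumed to be the wrapper
// declared later in this class, with the int* result exposed as an int[] overload.
public static int registersPerThreadSketch(CUfunc_st kernel) {
    int[] value = new int[1];
    cuFuncGetAttribute(value, CU_FUNC_ATTRIBUTE_NUM_REGS, kernel);
    return value[0];
}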

/**
 * Function cache configurations
 */
/** enum CUfunc_cache_enum */
public static final int
    /** no preference for shared memory or L1 (default) */
    CU_FUNC_CACHE_PREFER_NONE    = 0x00,
    /** prefer larger shared memory and smaller L1 cache */
    CU_FUNC_CACHE_PREFER_SHARED  = 0x01,
    /** prefer larger L1 cache and smaller shared memory */
    CU_FUNC_CACHE_PREFER_L1      = 0x02,
    /** prefer equal sized L1 cache and shared memory */
    CU_FUNC_CACHE_PREFER_EQUAL   = 0x03;

/**
 * Shared memory configurations
 */
/** enum CUsharedconfig_enum */
public static final int
    /** set default shared memory bank size */
    CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE    = 0x00,
    /** set shared memory bank width to four bytes */
    CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE  = 0x01,
    /** set shared memory bank width to eight bytes */
    CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE = 0x02;

/**
 * Memory types
 */
/** enum CUmemorytype_enum */
public static final int
    /** Host memory */
    CU_MEMORYTYPE_HOST    = 0x01,
    /** Device memory */
    CU_MEMORYTYPE_DEVICE  = 0x02,
    /** Array memory */
    CU_MEMORYTYPE_ARRAY   = 0x03,
    /** Unified device or host memory */
    CU_MEMORYTYPE_UNIFIED = 0x04;

/**
 * Compute Modes
 */
/** enum CUcomputemode_enum */
public static final int
    /** Default compute mode (Multiple contexts allowed per device) */
    CU_COMPUTEMODE_DEFAULT           = 0,
    /** Compute-prohibited mode (No contexts can be created on this device at this time) */
    CU_COMPUTEMODE_PROHIBITED        = 2,
    /** Compute-exclusive-process mode (Only one context used by a single process can be present on this device at a time) */
    CU_COMPUTEMODE_EXCLUSIVE_PROCESS = 3;

/**
 * Memory advise values
 */
/** enum CUmem_advise_enum */
public static final int
    /** Data will mostly be read and only occasionally be written to */
    CU_MEM_ADVISE_SET_READ_MOSTLY          = 1,
    /** Undo the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY */
    CU_MEM_ADVISE_UNSET_READ_MOSTLY        = 2,
    /** Set the preferred location for the data as the specified device */
    CU_MEM_ADVISE_SET_PREFERRED_LOCATION   = 3,
    /** Clear the preferred location for the data */
    CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION = 4,
    /** Data will be accessed by the specified device, so prevent page faults as much as possible */
    CU_MEM_ADVISE_SET_ACCESSED_BY          = 5,
    /** Let the Unified Memory subsystem decide on the page faulting policy for the specified device */
    CU_MEM_ADVISE_UNSET_ACCESSED_BY        = 6;

/** enum CUmem_range_attribute_enum */
public static final int
    /** Whether the range will mostly be read and only occasionally be written to */
    CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY            = 1,
    /** The preferred location of the range */
    CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION     = 2,
    /** Memory range has ::CU_MEM_ADVISE_SET_ACCESSED_BY set for specified device */
    CU_MEM_RANGE_ATTRIBUTE_ACCESSED_BY            = 3,
    /** The last location to which the range was prefetched */
    CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION = 4;
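
// Illustrative sketch (not part of the generated bindings): advising the unified
// memory system that a managed range is mostly read, then prefetching it to
// device 0. cuMemAdvise and cuMemPrefetchAsync are assumed to be the wrappers
// declared later in this class (CUdeviceptr and size_t map to long).
public static void adviseReadMostlySketch(long managedPtr, long bytes, CUstream_st stream) {
    cuMemAdvise(managedPtr, bytes, CU_MEM_ADVISE_SET_READ_MOSTLY, 0);
    cuMemPrefetchAsync(managedPtr, bytes, 0, stream);  // device 0 as the destination
}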

/**
 * Online compiler and linker options
 */
/** enum CUjit_option_enum */
public static final int
    /**
     * Max number of registers that a thread may use.\n
     * Option type: unsigned int\n
     * Applies to: compiler only
     */
    CU_JIT_MAX_REGISTERS = 0,

    /**
     * IN: Specifies minimum number of threads per block to target compilation
     * for\n
     * OUT: Returns the number of threads the compiler actually targeted.
     * This restricts the resource utilization of the compiler (e.g. max
     * registers) such that a block with the given number of threads should be
     * able to launch based on register limitations. Note that this option does not
     * currently take into account any other resource limitations, such as
     * shared memory utilization.\n
     * Cannot be combined with ::CU_JIT_TARGET.\n
     * Option type: unsigned int\n
     * Applies to: compiler only
     */
    CU_JIT_THREADS_PER_BLOCK = 1,

    /**
     * Overwrites the option value with the total wall clock time, in
     * milliseconds, spent in the compiler and linker\n
     * Option type: float\n
     * Applies to: compiler and linker
     */
    CU_JIT_WALL_TIME = 2,

    /**
     * Pointer to a buffer in which to print any log messages
     * that are informational in nature (the buffer size is specified via
     * option ::CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES)\n
     * Option type: char *\n
     * Applies to: compiler and linker
     */
    CU_JIT_INFO_LOG_BUFFER = 3,

    /**
     * IN: Log buffer size in bytes.  Log messages will be capped at this size
     * (including null terminator)\n
     * OUT: Amount of log buffer filled with messages\n
     * Option type: unsigned int\n
     * Applies to: compiler and linker
     */
    CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES = 4,

    /**
     * Pointer to a buffer in which to print any log messages that
     * reflect errors (the buffer size is specified via option
     * ::CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES)\n
     * Option type: char *\n
     * Applies to: compiler and linker
     */
    CU_JIT_ERROR_LOG_BUFFER = 5,

    /**
     * IN: Log buffer size in bytes.  Log messages will be capped at this size
     * (including null terminator)\n
     * OUT: Amount of log buffer filled with messages\n
     * Option type: unsigned int\n
     * Applies to: compiler and linker
     */
    CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES = 6,

    /**
     * Level of optimizations to apply to generated code (0 - 4), with 4
     * being the default and highest level of optimizations.\n
     * Option type: unsigned int\n
     * Applies to: compiler only
     */
    CU_JIT_OPTIMIZATION_LEVEL = 7,

    /**
     * No option value required. Determines the target based on the current
     * attached context (default)\n
     * Option type: No option value needed\n
     * Applies to: compiler and linker
     */
    CU_JIT_TARGET_FROM_CUCONTEXT = 8,

    /**
     * Target is chosen based on supplied ::CUjit_target.  Cannot be
     * combined with ::CU_JIT_THREADS_PER_BLOCK.\n
     * Option type: unsigned int for enumerated type ::CUjit_target\n
     * Applies to: compiler and linker
     */
    CU_JIT_TARGET = 9,

    /**
     * Specifies choice of fallback strategy if matching cubin is not found.
     * Choice is based on supplied ::CUjit_fallback.  This option cannot be
     * used with cuLink* APIs as the linker requires exact matches.\n
     * Option type: unsigned int for enumerated type ::CUjit_fallback\n
     * Applies to: compiler only
     */
    CU_JIT_FALLBACK_STRATEGY = 10,

    /**
     * Specifies whether to create debug information in output (-g)
     * (0: false, default)\n
     * Option type: int\n
     * Applies to: compiler and linker
     */
    CU_JIT_GENERATE_DEBUG_INFO = 11,

    /**
     * Generate verbose log messages (0: false, default)\n
     * Option type: int\n
     * Applies to: compiler and linker
     */
    CU_JIT_LOG_VERBOSE = 12,

    /**
     * Generate line number information (-lineinfo) (0: false, default)\n
     * Option type: int\n
     * Applies to: compiler only
     */
    CU_JIT_GENERATE_LINE_INFO = 13,

    /**
     * Specifies whether to enable caching explicitly (-dlcm) \n
     * Choice is based on supplied ::CUjit_cacheMode_enum.\n
     * Option type: unsigned int for enumerated type ::CUjit_cacheMode_enum\n
     * Applies to: compiler only
     */
    CU_JIT_CACHE_MODE = 14,

    /**
     * The JIT options below are used for internal purposes only in this version of CUDA
     */
    CU_JIT_NEW_SM3X_OPT = 15,
    CU_JIT_FAST_COMPILE = 16,

    CU_JIT_NUM_OPTIONS = 17;
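
// Illustrative sketch (not part of the generated bindings): JIT-loading a PTX image
// with an info log buffer, mirroring the C pattern
//   options[] = { CU_JIT_INFO_LOG_BUFFER, CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES }
//   values[]  = { logBuf, (void*)(size_t)logBufSize }.
// The cuModuleLoadDataEx wrapper (IntPointer options, PointerPointer optionValues)
// is assumed to be declared later in this class with those JavaCPP mappings.
public static String jitWithInfoLogSketch(Pointer ptxImage) {
    BytePointer infoLog = new BytePointer(4096);
    // CUDA expects the buffer size as the option value itself, reinterpreted as a
    // pointer; this class lives in org.bytedeco.javacpp, so the protected
    // Pointer.address field can stand in for that (void*)(size_t) cast.
    Pointer sizeAsValue = new Pointer();
    sizeAsValue.address = infoLog.capacity();
    IntPointer options = new IntPointer(CU_JIT_INFO_LOG_BUFFER, CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES);
    PointerPointer values = new PointerPointer(infoLog, sizeAsValue);
    CUmod_st module = new CUmod_st();
    int err = cuModuleLoadDataEx(module, ptxImage, 2, options, values);
    return err == CUDA_SUCCESS ? infoLog.getString() : "JIT failed with CUresult " + err;
}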

/**
 * Online compilation targets
 */
/** enum CUjit_target_enum */
public static final int
    /** Compute device class 1.0 */
    CU_TARGET_COMPUTE_10 = 10,
    /** Compute device class 1.1 */
    CU_TARGET_COMPUTE_11 = 11,
    /** Compute device class 1.2 */
    CU_TARGET_COMPUTE_12 = 12,
    /** Compute device class 1.3 */
    CU_TARGET_COMPUTE_13 = 13,
    /** Compute device class 2.0 */
    CU_TARGET_COMPUTE_20 = 20,
    /** Compute device class 2.1 */
    CU_TARGET_COMPUTE_21 = 21,
    /** Compute device class 3.0 */
    CU_TARGET_COMPUTE_30 = 30,
    /** Compute device class 3.2 */
    CU_TARGET_COMPUTE_32 = 32,
    /** Compute device class 3.5 */
    CU_TARGET_COMPUTE_35 = 35,
    /** Compute device class 3.7 */
    CU_TARGET_COMPUTE_37 = 37,
    /** Compute device class 5.0 */
    CU_TARGET_COMPUTE_50 = 50,
    /** Compute device class 5.2 */
    CU_TARGET_COMPUTE_52 = 52,
    /** Compute device class 5.3 */
    CU_TARGET_COMPUTE_53 = 53,
    /** Compute device class 6.0. This must be removed for CUDA 7.0 toolkit. See bug 1518217. */
    CU_TARGET_COMPUTE_60 = 60,
    /** Compute device class 6.1. This must be removed for CUDA 7.0 toolkit.*/
    CU_TARGET_COMPUTE_61 = 61,
    /** Compute device class 6.2. This must be removed for CUDA 7.0 toolkit.*/
    CU_TARGET_COMPUTE_62 = 62;

/**
 * Cubin matching fallback strategies
 */
/** enum CUjit_fallback_enum */
public static final int
    /** Prefer to compile ptx if exact binary match not found */
    CU_PREFER_PTX = 0,

    /** Prefer to fall back to compatible binary code if exact match not found */
    CU_PREFER_BINARY = 1;

/**
 * Caching modes for dlcm 
 */
/** enum CUjit_cacheMode_enum */
public static final int
    /** Compile with no -dlcm flag specified */
    CU_JIT_CACHE_OPTION_NONE = 0,
    /** Compile with L1 cache disabled */
    CU_JIT_CACHE_OPTION_CG = 1,
    /** Compile with L1 cache enabled */
    CU_JIT_CACHE_OPTION_CA = 2;

/**
 * Device code formats
 */
/** enum CUjitInputType_enum */
public static final int
    /**
     * Compiled device-class-specific device code\n
     * Applicable options: none
     */
    CU_JIT_INPUT_CUBIN = 0,

    /**
     * PTX source code\n
     * Applicable options: PTX compiler options
     */
    CU_JIT_INPUT_PTX = 1,

    /**
     * Bundle of multiple cubins and/or PTX of some device code\n
     * Applicable options: PTX compiler options, ::CU_JIT_FALLBACK_STRATEGY
     */
    CU_JIT_INPUT_FATBINARY = 2,

    /**
     * Host object with embedded device code\n
     * Applicable options: PTX compiler options, ::CU_JIT_FALLBACK_STRATEGY
     */
    CU_JIT_INPUT_OBJECT = 3,

    /**
     * Archive of host objects with embedded device code\n
     * Applicable options: PTX compiler options, ::CU_JIT_FALLBACK_STRATEGY
     */
    CU_JIT_INPUT_LIBRARY = 4,

    CU_JIT_NUM_INPUT_TYPES = 5;

// #if __CUDA_API_VERSION >= 5050
@Opaque public static class CUlinkState_st extends Pointer {
    /** Empty constructor. Calls {@code super((Pointer)null)}. */
    public CUlinkState_st() { super((Pointer)null); }
    /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */
    public CUlinkState_st(Pointer p) { super(p); }
}
// #endif /* __CUDA_API_VERSION >= 5050 */

/**
 * Flags to register a graphics resource
 */
/** enum CUgraphicsRegisterFlags_enum */
public static final int
    CU_GRAPHICS_REGISTER_FLAGS_NONE           = 0x00,
    CU_GRAPHICS_REGISTER_FLAGS_READ_ONLY      = 0x01,
    CU_GRAPHICS_REGISTER_FLAGS_WRITE_DISCARD  = 0x02,
    CU_GRAPHICS_REGISTER_FLAGS_SURFACE_LDST   = 0x04,
    CU_GRAPHICS_REGISTER_FLAGS_TEXTURE_GATHER = 0x08;

/**
 * Flags for mapping and unmapping interop resources
 */
/** enum CUgraphicsMapResourceFlags_enum */
public static final int
    CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE          = 0x00,
    CU_GRAPHICS_MAP_RESOURCE_FLAGS_READ_ONLY     = 0x01,
    CU_GRAPHICS_MAP_RESOURCE_FLAGS_WRITE_DISCARD = 0x02;

/**
 * Array indices for cube faces
 */
/** enum CUarray_cubemap_face_enum */
public static final int
    /** Positive X face of cubemap */
    CU_CUBEMAP_FACE_POSITIVE_X  = 0x00,
    /** Negative X face of cubemap */
    CU_CUBEMAP_FACE_NEGATIVE_X  = 0x01,
    /** Positive Y face of cubemap */
    CU_CUBEMAP_FACE_POSITIVE_Y  = 0x02,
    /** Negative Y face of cubemap */
    CU_CUBEMAP_FACE_NEGATIVE_Y  = 0x03,
    /** Positive Z face of cubemap */
    CU_CUBEMAP_FACE_POSITIVE_Z  = 0x04,
    /** Negative Z face of cubemap */
    CU_CUBEMAP_FACE_NEGATIVE_Z  = 0x05;

/**
 * Limits
 */
/** enum CUlimit_enum */
public static final int
    /** GPU thread stack size */
    CU_LIMIT_STACK_SIZE                       = 0x00,
    /** GPU printf FIFO size */
    CU_LIMIT_PRINTF_FIFO_SIZE                 = 0x01,
    /** GPU malloc heap size */
    CU_LIMIT_MALLOC_HEAP_SIZE                 = 0x02,
    /** GPU device runtime launch synchronize depth */
    CU_LIMIT_DEV_RUNTIME_SYNC_DEPTH           = 0x03,
    /** GPU device runtime pending launch count */
    CU_LIMIT_DEV_RUNTIME_PENDING_LAUNCH_COUNT = 0x04,
    CU_LIMIT_MAX = 0x04 + 1;
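
// Illustrative sketch (not part of the generated bindings): raising the device-side
// printf FIFO and reading the resulting limit back. cuCtxSetLimit and cuCtxGetLimit
// are assumed to be the wrappers declared later in this class, with size_t mapped to
// long and size_t* to SizeTPointer as JavaCPP usually does.
public static long growPrintfFifoSketch(long bytes) {
    cuCtxSetLimit(CU_LIMIT_PRINTF_FIFO_SIZE, bytes);
    SizeTPointer actual = new SizeTPointer(1);
    cuCtxGetLimit(actual, CU_LIMIT_PRINTF_FIFO_SIZE);
    return actual.get();
}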

/**
 * Resource types
 */
/** enum CUresourcetype_enum */
public static final int
    /** Array resource */
    CU_RESOURCE_TYPE_ARRAY           = 0x00,
    /** Mipmapped array resource */
    CU_RESOURCE_TYPE_MIPMAPPED_ARRAY = 0x01,
    /** Linear resource */
    CU_RESOURCE_TYPE_LINEAR          = 0x02,
    /** Pitch 2D resource */
    CU_RESOURCE_TYPE_PITCH2D         = 0x03;

/**
 * Error codes
 */
/** enum cudaError_enum */
public static final int
    /**
     * The API call returned with no errors. In the case of query calls, this
     * can also mean that the operation being queried is complete (see
     * ::cuEventQuery() and ::cuStreamQuery()).
     */
    CUDA_SUCCESS                              = 0,

    /**
     * This indicates that one or more of the parameters passed to the API call
     * is not within an acceptable range of values.
     */
    CUDA_ERROR_INVALID_VALUE                  = 1,

    /**
     * The API call failed because it was unable to allocate enough memory to
     * perform the requested operation.
     */
    CUDA_ERROR_OUT_OF_MEMORY                  = 2,

    /**
     * This indicates that the CUDA driver has not been initialized with
     * ::cuInit() or that initialization has failed.
     */
    CUDA_ERROR_NOT_INITIALIZED                = 3,

    /**
     * This indicates that the CUDA driver is in the process of shutting down.
     */
    CUDA_ERROR_DEINITIALIZED                  = 4,

    /**
     * This indicates profiler is not initialized for this run. This can
     * happen when the application is running with external profiling tools
     * like visual profiler.
     */
    CUDA_ERROR_PROFILER_DISABLED              = 5,

    /**
     * @deprecated
     * This error return is deprecated as of CUDA 5.0. It is no longer an error
     * to attempt to enable/disable the profiling via ::cuProfilerStart or
     * ::cuProfilerStop without initialization.
     */
    CUDA_ERROR_PROFILER_NOT_INITIALIZED       = 6,

    /**
     * @deprecated
     * This error return is deprecated as of CUDA 5.0. It is no longer an error
     * to call cuProfilerStart() when profiling is already enabled.
     */
    CUDA_ERROR_PROFILER_ALREADY_STARTED       = 7,

    /**
     * @deprecated
     * This error return is deprecated as of CUDA 5.0. It is no longer an error
     * to call cuProfilerStop() when profiling is already disabled.
     */
    CUDA_ERROR_PROFILER_ALREADY_STOPPED       = 8,

    /**
     * This indicates that no CUDA-capable devices were detected by the installed
     * CUDA driver.
     */
    CUDA_ERROR_NO_DEVICE                      = 100,

    /**
     * This indicates that the device ordinal supplied by the user does not
     * correspond to a valid CUDA device.
     */
    CUDA_ERROR_INVALID_DEVICE                 = 101,


    /**
     * This indicates that the device kernel image is invalid. This can also
     * indicate an invalid CUDA module.
     */
    CUDA_ERROR_INVALID_IMAGE                  = 200,

    /**
     * This most frequently indicates that there is no context bound to the
     * current thread. This can also be returned if the context passed to an
     * API call is not a valid handle (such as a context that has had
     * ::cuCtxDestroy() invoked on it). This can also be returned if a user
     * mixes different API versions (i.e. 3010 context with 3020 API calls).
     * See ::cuCtxGetApiVersion() for more details.
     */
    CUDA_ERROR_INVALID_CONTEXT                = 201,

    /**
     * This indicated that the context being supplied as a parameter to the
     * API call was already the active context.
     * @deprecated
     * This error return is deprecated as of CUDA 3.2. It is no longer an
     * error to attempt to push the active context via ::cuCtxPushCurrent().
     */
    CUDA_ERROR_CONTEXT_ALREADY_CURRENT        = 202,

    /**
     * This indicates that a map or register operation has failed.
     */
    CUDA_ERROR_MAP_FAILED                     = 205,

    /**
     * This indicates that an unmap or unregister operation has failed.
     */
    CUDA_ERROR_UNMAP_FAILED                   = 206,

    /**
     * This indicates that the specified array is currently mapped and thus
     * cannot be destroyed.
     */
    CUDA_ERROR_ARRAY_IS_MAPPED                = 207,

    /**
     * This indicates that the resource is already mapped.
     */
    CUDA_ERROR_ALREADY_MAPPED                 = 208,

    /**
     * This indicates that there is no kernel image available that is suitable
     * for the device. This can occur when a user specifies code generation
     * options for a particular CUDA source file that do not include the
     * corresponding device configuration.
     */
    CUDA_ERROR_NO_BINARY_FOR_GPU              = 209,

    /**
     * This indicates that a resource has already been acquired.
     */
    CUDA_ERROR_ALREADY_ACQUIRED               = 210,

    /**
     * This indicates that a resource is not mapped.
     */
    CUDA_ERROR_NOT_MAPPED                     = 211,

    /**
     * This indicates that a mapped resource is not available for access as an
     * array.
     */
    CUDA_ERROR_NOT_MAPPED_AS_ARRAY            = 212,

    /**
     * This indicates that a mapped resource is not available for access as a
     * pointer.
     */
    CUDA_ERROR_NOT_MAPPED_AS_POINTER          = 213,

    /**
     * This indicates that an uncorrectable ECC error was detected during
     * execution.
     */
    CUDA_ERROR_ECC_UNCORRECTABLE              = 214,

    /**
     * This indicates that the ::CUlimit passed to the API call is not
     * supported by the active device.
     */
    CUDA_ERROR_UNSUPPORTED_LIMIT              = 215,

    /**
     * This indicates that the ::CUcontext passed to the API call can
     * only be bound to a single CPU thread at a time but is already 
     * bound to a CPU thread.
     */
    CUDA_ERROR_CONTEXT_ALREADY_IN_USE         = 216,

    /**
     * This indicates that peer access is not supported across the given
     * devices.
     */
    CUDA_ERROR_PEER_ACCESS_UNSUPPORTED        = 217,

    /**
     * This indicates that a PTX JIT compilation failed.
     */
    CUDA_ERROR_INVALID_PTX                    = 218,

    /**
     * This indicates an error with OpenGL or DirectX context.
     */
    CUDA_ERROR_INVALID_GRAPHICS_CONTEXT       = 219,

    /**
     * This indicates that an uncorrectable NVLink error was detected during
     * execution.
     */
    CUDA_ERROR_NVLINK_UNCORRECTABLE           = 220,

    /**
     * This indicates that the device kernel source is invalid.
     */
    CUDA_ERROR_INVALID_SOURCE                 = 300,

    /**
     * This indicates that the file specified was not found.
     */
    CUDA_ERROR_FILE_NOT_FOUND                 = 301,

    /**
     * This indicates that a link to a shared object failed to resolve.
     */
    CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND = 302,

    /**
     * This indicates that initialization of a shared object failed.
     */
    CUDA_ERROR_SHARED_OBJECT_INIT_FAILED      = 303,

    /**
     * This indicates that an OS call failed.
     */
    CUDA_ERROR_OPERATING_SYSTEM               = 304,

    /**
     * This indicates that a resource handle passed to the API call was not
     * valid. Resource handles are opaque types like ::CUstream and ::CUevent.
     */
    CUDA_ERROR_INVALID_HANDLE                 = 400,

    /**
     * This indicates that a named symbol was not found. Examples of symbols
     * are global/constant variable names, texture names, and surface names.
     */
    CUDA_ERROR_NOT_FOUND                      = 500,

    /**
     * This indicates that asynchronous operations issued previously have not
     * completed yet. This result is not actually an error, but must be indicated
     * differently than ::CUDA_SUCCESS (which indicates completion). Calls that
     * may return this value include ::cuEventQuery() and ::cuStreamQuery().
     */
    CUDA_ERROR_NOT_READY                      = 600,

    /**
     * While executing a kernel, the device encountered a
     * load or store instruction on an invalid memory address.
     * This leaves the process in an inconsistent state and any further CUDA work
     * will return the same error. To continue using CUDA, the process must be terminated
     * and relaunched.
     */
    CUDA_ERROR_ILLEGAL_ADDRESS                = 700,

    /**
     * This indicates that a launch did not occur because it did not have
     * appropriate resources. This error usually indicates that the user has
     * attempted to pass too many arguments to the device kernel, or the
     * kernel launch specifies too many threads for the kernel's register
     * count. Passing arguments of the wrong size (i.e. a 64-bit pointer
     * when a 32-bit int is expected) is equivalent to passing too many
     * arguments and can also result in this error.
     */
    CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES        = 701,

    /**
     * This indicates that the device kernel took too long to execute. This can
     * only occur if timeouts are enabled - see the device attribute
     * ::CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT for more information.
     * This leaves the process in an inconsistent state and any further CUDA work
     * will return the same error. To continue using CUDA, the process must be terminated
     * and relaunched.
     */
    CUDA_ERROR_LAUNCH_TIMEOUT                 = 702,

    /**
     * This error indicates a kernel launch that uses an incompatible texturing
     * mode.
     */
    CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING  = 703,
    
    /**
     * This error indicates that a call to ::cuCtxEnablePeerAccess() is
     * trying to re-enable peer access to a context which has already
     * had peer access to it enabled.
     */
    CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED    = 704,

    /**
     * This error indicates that ::cuCtxDisablePeerAccess() is 
     * trying to disable peer access which has not been enabled yet 
     * via ::cuCtxEnablePeerAccess(). 
     */
    CUDA_ERROR_PEER_ACCESS_NOT_ENABLED        = 705,

    /**
     * This error indicates that the primary context for the specified device
     * has already been initialized.
     */
    CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE         = 708,

    /**
     * This error indicates that the context current to the calling thread
     * has been destroyed using ::cuCtxDestroy, or is a primary context which
     * has not yet been initialized.
     */
    CUDA_ERROR_CONTEXT_IS_DESTROYED           = 709,

    /**
     * A device-side assert triggered during kernel execution. The context
     * cannot be used anymore, and must be destroyed. All existing device 
     * memory allocations from this context are invalid and must be 
     * reconstructed if the program is to continue using CUDA.
     */
    CUDA_ERROR_ASSERT                         = 710,

    /**
     * This error indicates that the hardware resources required to enable
     * peer access have been exhausted for one or more of the devices 
     * passed to ::cuCtxEnablePeerAccess().
     */
    CUDA_ERROR_TOO_MANY_PEERS                 = 711,

    /**
     * This error indicates that the memory range passed to ::cuMemHostRegister()
     * has already been registered.
     */
    CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED = 712,

    /**
     * This error indicates that the pointer passed to ::cuMemHostUnregister()
     * does not correspond to any currently registered memory region.
     */
    CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED     = 713,

    /**
     * While executing a kernel, the device encountered a stack error.
     * This can be due to stack corruption or exceeding the stack size limit.
     * This leaves the process in an inconsistent state and any further CUDA work
     * will return the same error. To continue using CUDA, the process must be terminated
     * and relaunched.
     */
    CUDA_ERROR_HARDWARE_STACK_ERROR           = 714,

    /**
     * While executing a kernel, the device encountered an illegal instruction.
     * This leaves the process in an inconsistent state and any further CUDA work
     * will return the same error. To continue using CUDA, the process must be terminated
     * and relaunched.
     */
    CUDA_ERROR_ILLEGAL_INSTRUCTION            = 715,

    /**
     * While executing a kernel, the device encountered a load or store instruction
     * on a memory address which is not aligned.
     * This leaves the process in an inconsistent state and any further CUDA work
     * will return the same error. To continue using CUDA, the process must be terminated
     * and relaunched.
     */
    CUDA_ERROR_MISALIGNED_ADDRESS             = 716,

    /**
     * While executing a kernel, the device encountered an instruction
     * which can only operate on memory locations in certain address spaces
     * (global, shared, or local), but was supplied a memory address not
     * belonging to an allowed address space.
     * This leaves the process in an inconsistent state and any further CUDA work
     * will return the same error. To continue using CUDA, the process must be terminated
     * and relaunched.
     */
    CUDA_ERROR_INVALID_ADDRESS_SPACE          = 717,

    /**
     * While executing a kernel, the device program counter wrapped its address space.
     * This leaves the process in an inconsistent state and any further CUDA work
     * will return the same error. To continue using CUDA, the process must be terminated
     * and relaunched.
     */
    CUDA_ERROR_INVALID_PC                     = 718,

    /**
     * An exception occurred on the device while executing a kernel. Common
     * causes include dereferencing an invalid device pointer and accessing
     * out of bounds shared memory.
     * This leaves the process in an inconsistent state and any further CUDA work
     * will return the same error. To continue using CUDA, the process must be terminated
     * and relaunched.
     */
    CUDA_ERROR_LAUNCH_FAILED                  = 719,


    /**
     * This error indicates that the attempted operation is not permitted.
     */
    CUDA_ERROR_NOT_PERMITTED                  = 800,

    /**
     * This error indicates that the attempted operation is not supported
     * on the current system or device.
     */
    CUDA_ERROR_NOT_SUPPORTED                  = 801,

    /**
     * This indicates that an unknown internal error has occurred.
     */
    CUDA_ERROR_UNKNOWN                        = 999;
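
// Usage sketch (not generated from cuda.h): ::CUDA_ERROR_NOT_READY is a polling
// status rather than a failure, so callers typically loop on it. This assumes the
// cuStreamQuery(CUstream_st) declaration that appears later in this file; the
// examplePollStream name is illustrative only.
public static int examplePollStream(CUstream_st hStream) {
    int status;
    while ((status = cuStreamQuery(hStream)) == CUDA_ERROR_NOT_READY) {
        Thread.yield(); // work is still pending on the stream; poll again
    }
    return status; // CUDA_SUCCESS, or a persistent error on the stream
}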

/**
 * P2P Attributes
 */
/** enum CUdevice_P2PAttribute_enum */
public static final int
    /** A relative value indicating the performance of the link between two devices */
    CU_DEVICE_P2P_ATTRIBUTE_PERFORMANCE_RANK        = 0x01,
    /** P2P access is enabled */
    CU_DEVICE_P2P_ATTRIBUTE_ACCESS_SUPPORTED        = 0x02,
    /** Atomic operations over the link are supported */
    CU_DEVICE_P2P_ATTRIBUTE_NATIVE_ATOMIC_SUPPORTED = 0x03;
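
// Usage sketch (not generated from cuda.h): querying a P2P attribute between two
// device ordinals. Assumes the cuDeviceGetP2PAttribute(IntPointer, int, int, int)
// declaration found elsewhere in these bindings; exampleP2pAccessSupported is an
// illustrative name only.
public static boolean exampleP2pAccessSupported(int srcDevice, int dstDevice) {
    IntPointer value = new IntPointer(1);
    int status = cuDeviceGetP2PAttribute(value,
            CU_DEVICE_P2P_ATTRIBUTE_ACCESS_SUPPORTED, srcDevice, dstDevice);
    return status == CUDA_SUCCESS && value.get() != 0;
}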

// #ifdef _WIN32
// #define CUDA_CB __stdcall
// #else
// #define CUDA_CB
// #endif

/**
 * CUDA stream callback
 * @param hStream The stream the callback was added to, as passed to ::cuStreamAddCallback.  May be NULL.
 * @param status ::CUDA_SUCCESS or any persistent error on the stream.
 * @param userData User parameter provided at registration.
 */
@Convention("CUDA_CB") public static class CUstreamCallback extends FunctionPointer {
    static { Loader.load(); }
    /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */
    public    CUstreamCallback(Pointer p) { super(p); }
    protected CUstreamCallback() { allocate(); }
    private native void allocate();
    public native void call(CUstream_st hStream, @Cast("CUresult") int status, Pointer userData);
}
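
// Usage sketch (not generated from cuda.h): a callback is implemented by subclassing
// CUstreamCallback and overriding call(). Registration assumes the
// cuStreamAddCallback(CUstream_st, CUstreamCallback, Pointer, int) declaration found
// elsewhere in these bindings; the class and method names below are illustrative only.
public static class ExampleLoggingCallback extends CUstreamCallback {
    @Override public void call(CUstream_st hStream, int status, Pointer userData) {
        // runs on a driver thread once all preceding work on hStream has completed
        System.out.println("stream callback fired, status = " + status);
    }
}
public static int exampleAddCallback(CUstream_st hStream, CUstreamCallback callback) {
    // the caller must keep a reference to callback until it has fired
    return cuStreamAddCallback(hStream, callback, null, 0); // flags must be 0
}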

/**
 * Block size to per-block dynamic shared memory mapping for a certain
 * kernel.
 *
 * @param blockSize Block size of the kernel.
 *
 * @return The dynamic shared memory needed by a block.
 */
@Convention("CUDA_CB") public static class CUoccupancyB2DSize extends FunctionPointer {
    static { Loader.load(); }
    /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */
    public    CUoccupancyB2DSize(Pointer p) { super(p); }
    protected CUoccupancyB2DSize() { allocate(); }
    private native void allocate();
    public native @Cast("size_t") long call(int blockSize);
}
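
// Usage sketch (not generated from cuda.h): a block-size-to-shared-memory mapping for
// a kernel that needs one float of dynamic shared memory per thread, as consumed by
// the occupancy calculator. The cuOccupancyMaxPotentialBlockSize(...) call assumes the
// declaration found elsewhere in these bindings; the names below are illustrative only.
public static class ExampleFloatPerThreadB2D extends CUoccupancyB2DSize {
    @Override public long call(int blockSize) {
        return 4L * blockSize; // bytes of dynamic shared memory needed by a block
    }
}
public static int exampleSuggestBlockSize(CUfunc_st kernel) {
    IntPointer minGridSize = new IntPointer(1), blockSize = new IntPointer(1);
    cuOccupancyMaxPotentialBlockSize(minGridSize, blockSize, kernel,
            new ExampleFloatPerThreadB2D(), 0, 0); // no fixed smem, no block size limit
    return blockSize.get();
}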

/**
 * If set, host memory is portable between CUDA contexts.
 * Flag for ::cuMemHostAlloc()
 */
public static final int CU_MEMHOSTALLOC_PORTABLE =        0x01;

/**
 * If set, host memory is mapped into CUDA address space and
 * ::cuMemHostGetDevicePointer() may be called on the host pointer.
 * Flag for ::cuMemHostAlloc()
 */
public static final int CU_MEMHOSTALLOC_DEVICEMAP =       0x02;

/**
 * If set, host memory is allocated as write-combined - fast to write,
 * faster to DMA, slow to read except via SSE4 streaming load instruction
 * (MOVNTDQA).
 * Flag for ::cuMemHostAlloc()
 */
public static final int CU_MEMHOSTALLOC_WRITECOMBINED =   0x04;
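
// Usage sketch (not generated from cuda.h): combining the ::cuMemHostAlloc() flags
// above. Assumes the cuMemHostAlloc(PointerPointer, long, int) declaration found
// elsewhere in these bindings; examplePinnedAlloc is an illustrative name only.
public static Pointer examplePinnedAlloc(long bytes) {
    PointerPointer pp = new PointerPointer(1);
    int status = cuMemHostAlloc(pp, bytes,
            CU_MEMHOSTALLOC_PORTABLE | CU_MEMHOSTALLOC_DEVICEMAP);
    if (status != CUDA_SUCCESS) {
        throw new RuntimeException("cuMemHostAlloc failed: " + status);
    }
    return pp.get(0).capacity(bytes); // pinned, device-mappable host memory
}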

/**
 * If set, host memory is portable between CUDA contexts.
 * Flag for ::cuMemHostRegister()
 */
public static final int CU_MEMHOSTREGISTER_PORTABLE =     0x01;

/**
 * If set, host memory is mapped into CUDA address space and
 * ::cuMemHostGetDevicePointer() may be called on the host pointer.
 * Flag for ::cuMemHostRegister()
 */
public static final int CU_MEMHOSTREGISTER_DEVICEMAP =    0x02;

/**
 * If set, the passed memory pointer is treated as pointing to some
 * memory-mapped I/O space, e.g. belonging to a third-party PCIe device.
 * On Windows the flag is a no-op.
 * On Linux that memory is marked as non cache-coherent for the GPU and
 * is expected to be physically contiguous. It may return
 * CUDA_ERROR_NOT_PERMITTED if run as an unprivileged user,
 * CUDA_ERROR_NOT_SUPPORTED on older Linux kernel versions.
 * On all other platforms, it is not supported and CUDA_ERROR_NOT_SUPPORTED
 * is returned.
 * Flag for ::cuMemHostRegister()
 */
public static final int CU_MEMHOSTREGISTER_IOMEMORY =     0x04;
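
// Usage sketch (not generated from cuda.h): page-locking an existing host buffer with
// the ::cuMemHostRegister() flags above. Assumes the cuMemHostRegister(Pointer, long,
// int) declaration found elsewhere in these bindings; the method name is illustrative.
public static int examplePinExistingBuffer(Pointer hostBuffer, long bytes) {
    // DEVICEMAP allows ::cuMemHostGetDevicePointer() to be used on hostBuffer afterwards
    return cuMemHostRegister(hostBuffer, bytes,
            CU_MEMHOSTREGISTER_PORTABLE | CU_MEMHOSTREGISTER_DEVICEMAP);
}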

// #if __CUDA_API_VERSION >= 3020

/**
 * 2D memory copy parameters
 */
public static class CUDA_MEMCPY2D extends Pointer {
    static { Loader.load(); }
    /** Default native constructor. */
    public CUDA_MEMCPY2D() { super((Pointer)null); allocate(); }
    /** Native array allocator. Access with {@link Pointer#position(long)}. */
    public CUDA_MEMCPY2D(long size) { super((Pointer)null); allocateArray(size); }
    /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */
    public CUDA_MEMCPY2D(Pointer p) { super(p); }
    private native void allocate();
    private native void allocateArray(long size);
    @Override public CUDA_MEMCPY2D position(long position) {
        return (CUDA_MEMCPY2D)super.position(position);
    }

    /** Source X in bytes */
    public native @Cast("size_t") long srcXInBytes(); public native CUDA_MEMCPY2D srcXInBytes(long srcXInBytes);
    /** Source Y */
    public native @Cast("size_t") long srcY(); public native CUDA_MEMCPY2D srcY(long srcY);

    /** Source memory type (host, device, array) */
    public native @Cast("CUmemorytype") int srcMemoryType(); public native CUDA_MEMCPY2D srcMemoryType(int srcMemoryType);
    /** Source host pointer */
    @MemberGetter public native @Const Pointer srcHost();
    /** Source device pointer */
    public native @Cast("CUdeviceptr") long srcDevice(); public native CUDA_MEMCPY2D srcDevice(long srcDevice);
    /** Source array reference */
    public native CUarray_st srcArray(); public native CUDA_MEMCPY2D srcArray(CUarray_st srcArray);
    /** Source pitch (ignored when src is array) */
    public native @Cast("size_t") long srcPitch(); public native CUDA_MEMCPY2D srcPitch(long srcPitch);

    /** Destination X in bytes */
    public native @Cast("size_t") long dstXInBytes(); public native CUDA_MEMCPY2D dstXInBytes(long dstXInBytes);
    /** Destination Y */
    public native @Cast("size_t") long dstY(); public native CUDA_MEMCPY2D dstY(long dstY);

    /** Destination memory type (host, device, array) */
    public native @Cast("CUmemorytype") int dstMemoryType(); public native CUDA_MEMCPY2D dstMemoryType(int dstMemoryType);
    /** Destination host pointer */
    public native Pointer dstHost(); public native CUDA_MEMCPY2D dstHost(Pointer dstHost);
    /** Destination device pointer */
    public native @Cast("CUdeviceptr") long dstDevice(); public native CUDA_MEMCPY2D dstDevice(long dstDevice);
    /** Destination array reference */
    public native CUarray_st dstArray(); public native CUDA_MEMCPY2D dstArray(CUarray_st dstArray);
    /** Destination pitch (ignored when dst is array) */
    public native @Cast("size_t") long dstPitch(); public native CUDA_MEMCPY2D dstPitch(long dstPitch);

    /** Width of 2D memory copy in bytes */
    public native @Cast("size_t") long WidthInBytes(); public native CUDA_MEMCPY2D WidthInBytes(long WidthInBytes);
    /** Height of 2D memory copy */
    public native @Cast("size_t") long Height(); public native CUDA_MEMCPY2D Height(long Height);
}
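
// Usage sketch (not generated from cuda.h): a pitched device-to-host 2D copy described
// with the struct above. CU_MEMORYTYPE_DEVICE and CU_MEMORYTYPE_HOST are defined
// earlier in this file; the cuMemcpy2D(CUDA_MEMCPY2D) call assumes the declaration
// found elsewhere in these bindings. The method name is illustrative only.
public static int exampleCopy2DToHost(long srcDevice, long srcPitch,
                                      Pointer dstHost, long widthInBytes, long height) {
    CUDA_MEMCPY2D copy = new CUDA_MEMCPY2D();
    copy.srcMemoryType(CU_MEMORYTYPE_DEVICE).srcDevice(srcDevice).srcPitch(srcPitch);
    copy.srcXInBytes(0).srcY(0);                 // no offset into the source
    copy.dstMemoryType(CU_MEMORYTYPE_HOST).dstHost(dstHost).dstPitch(widthInBytes);
    copy.dstXInBytes(0).dstY(0);                 // no offset into the destination
    copy.WidthInBytes(widthInBytes).Height(height);
    return cuMemcpy2D(copy);
}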

/**
 * 3D memory copy parameters
 */
public static class CUDA_MEMCPY3D extends Pointer {
    static { Loader.load(); }
    /** Default native constructor. */
    public CUDA_MEMCPY3D() { super((Pointer)null); allocate(); }
    /** Native array allocator. Access with {@link Pointer#position(long)}. */
    public CUDA_MEMCPY3D(long size) { super((Pointer)null); allocateArray(size); }
    /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */
    public CUDA_MEMCPY3D(Pointer p) { super(p); }
    private native void allocate();
    private native void allocateArray(long size);
    @Override public CUDA_MEMCPY3D position(long position) {
        return (CUDA_MEMCPY3D)super.position(position);
    }

    /** Source X in bytes */
    public native @Cast("size_t") long srcXInBytes(); public native CUDA_MEMCPY3D srcXInBytes(long srcXInBytes);
    /** Source Y */
    public native @Cast("size_t") long srcY(); public native CUDA_MEMCPY3D srcY(long srcY);
    /** Source Z */
    public native @Cast("size_t") long srcZ(); public native CUDA_MEMCPY3D srcZ(long srcZ);
    /** Source LOD */
    public native @Cast("size_t") long srcLOD(); public native CUDA_MEMCPY3D srcLOD(long srcLOD);
    /** Source memory type (host, device, array) */
    public native @Cast("CUmemorytype") int srcMemoryType(); public native CUDA_MEMCPY3D srcMemoryType(int srcMemoryType);
    /** Source host pointer */
    @MemberGetter public native @Const Pointer srcHost();
    /** Source device pointer */
    public native @Cast("CUdeviceptr") long srcDevice(); public native CUDA_MEMCPY3D srcDevice(long srcDevice);
    /** Source array reference */
    public native CUarray_st srcArray(); public native CUDA_MEMCPY3D srcArray(CUarray_st srcArray);
    /** Must be NULL */
    public native Pointer reserved0(); public native CUDA_MEMCPY3D reserved0(Pointer reserved0);
    /** Source pitch (ignored when src is array) */
    public native @Cast("size_t") long srcPitch(); public native CUDA_MEMCPY3D srcPitch(long srcPitch);
    /** Source height (ignored when src is array; may be 0 if Depth==1) */
    public native @Cast("size_t") long srcHeight(); public native CUDA_MEMCPY3D srcHeight(long srcHeight);

    /** Destination X in bytes */
    public native @Cast("size_t") long dstXInBytes(); public native CUDA_MEMCPY3D dstXInBytes(long dstXInBytes);
    /** Destination Y */
    public native @Cast("size_t") long dstY(); public native CUDA_MEMCPY3D dstY(long dstY);
    /** Destination Z */
    public native @Cast("size_t") long dstZ(); public native CUDA_MEMCPY3D dstZ(long dstZ);
    /** Destination LOD */
    public native @Cast("size_t") long dstLOD(); public native CUDA_MEMCPY3D dstLOD(long dstLOD);
    /** Destination memory type (host, device, array) */
    public native @Cast("CUmemorytype") int dstMemoryType(); public native CUDA_MEMCPY3D dstMemoryType(int dstMemoryType);
    /** Destination host pointer */
    public native Pointer dstHost(); public native CUDA_MEMCPY3D dstHost(Pointer dstHost);
    /** Destination device pointer */
    public native @Cast("CUdeviceptr") long dstDevice(); public native CUDA_MEMCPY3D dstDevice(long dstDevice);
    /** Destination array reference */
    public native CUarray_st dstArray(); public native CUDA_MEMCPY3D dstArray(CUarray_st dstArray);
    /** Must be NULL */
    public native Pointer reserved1(); public native CUDA_MEMCPY3D reserved1(Pointer reserved1);
    /** Destination pitch (ignored when dst is array) */
    public native @Cast("size_t") long dstPitch(); public native CUDA_MEMCPY3D dstPitch(long dstPitch);
    /** Destination height (ignored when dst is array; may be 0 if Depth==1) */
    public native @Cast("size_t") long dstHeight(); public native CUDA_MEMCPY3D dstHeight(long dstHeight);

    /** Width of 3D memory copy in bytes */
    public native @Cast("size_t") long WidthInBytes(); public native CUDA_MEMCPY3D WidthInBytes(long WidthInBytes);
    /** Height of 3D memory copy */
    public native @Cast("size_t") long Height(); public native CUDA_MEMCPY3D Height(long Height);
    /** Depth of 3D memory copy */
    public native @Cast("size_t") long Depth(); public native CUDA_MEMCPY3D Depth(long Depth);
}

/**
 * 3D memory cross-context copy parameters
 */
public static class CUDA_MEMCPY3D_PEER extends Pointer {
    static { Loader.load(); }
    /** Default native constructor. */
    public CUDA_MEMCPY3D_PEER() { super((Pointer)null); allocate(); }
    /** Native array allocator. Access with {@link Pointer#position(long)}. */
    public CUDA_MEMCPY3D_PEER(long size) { super((Pointer)null); allocateArray(size); }
    /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */
    public CUDA_MEMCPY3D_PEER(Pointer p) { super(p); }
    private native void allocate();
    private native void allocateArray(long size);
    @Override public CUDA_MEMCPY3D_PEER position(long position) {
        return (CUDA_MEMCPY3D_PEER)super.position(position);
    }

    /** Source X in bytes */
    public native @Cast("size_t") long srcXInBytes(); public native CUDA_MEMCPY3D_PEER srcXInBytes(long srcXInBytes);
    /** Source Y */
    public native @Cast("size_t") long srcY(); public native CUDA_MEMCPY3D_PEER srcY(long srcY);
    /** Source Z */
    public native @Cast("size_t") long srcZ(); public native CUDA_MEMCPY3D_PEER srcZ(long srcZ);
    /** Source LOD */
    public native @Cast("size_t") long srcLOD(); public native CUDA_MEMCPY3D_PEER srcLOD(long srcLOD);
    /** Source memory type (host, device, array) */
    public native @Cast("CUmemorytype") int srcMemoryType(); public native CUDA_MEMCPY3D_PEER srcMemoryType(int srcMemoryType);
    /** Source host pointer */
    @MemberGetter public native @Const Pointer srcHost();
    /** Source device pointer */
    public native @Cast("CUdeviceptr") long srcDevice(); public native CUDA_MEMCPY3D_PEER srcDevice(long srcDevice);
    /** Source array reference */
    public native CUarray_st srcArray(); public native CUDA_MEMCPY3D_PEER srcArray(CUarray_st srcArray);
    /** Source context (ignored when srcMemoryType is ::CU_MEMORYTYPE_ARRAY) */
    public native CUctx_st srcContext(); public native CUDA_MEMCPY3D_PEER srcContext(CUctx_st srcContext);
    /** Source pitch (ignored when src is array) */
    public native @Cast("size_t") long srcPitch(); public native CUDA_MEMCPY3D_PEER srcPitch(long srcPitch);
    /** Source height (ignored when src is array; may be 0 if Depth==1) */
    public native @Cast("size_t") long srcHeight(); public native CUDA_MEMCPY3D_PEER srcHeight(long srcHeight);

    /** Destination X in bytes */
    public native @Cast("size_t") long dstXInBytes(); public native CUDA_MEMCPY3D_PEER dstXInBytes(long dstXInBytes);
    /** Destination Y */
    public native @Cast("size_t") long dstY(); public native CUDA_MEMCPY3D_PEER dstY(long dstY);
    /** Destination Z */
    public native @Cast("size_t") long dstZ(); public native CUDA_MEMCPY3D_PEER dstZ(long dstZ);
    /** Destination LOD */
    public native @Cast("size_t") long dstLOD(); public native CUDA_MEMCPY3D_PEER dstLOD(long dstLOD);
    /** Destination memory type (host, device, array) */
    public native @Cast("CUmemorytype") int dstMemoryType(); public native CUDA_MEMCPY3D_PEER dstMemoryType(int dstMemoryType);
    /** Destination host pointer */
    public native Pointer dstHost(); public native CUDA_MEMCPY3D_PEER dstHost(Pointer dstHost);
    /** Destination device pointer */
    public native @Cast("CUdeviceptr") long dstDevice(); public native CUDA_MEMCPY3D_PEER dstDevice(long dstDevice);
    /** Destination array reference */
    public native CUarray_st dstArray(); public native CUDA_MEMCPY3D_PEER dstArray(CUarray_st dstArray);
    /** Destination context (ignored when dstMemoryType is ::CU_MEMORYTYPE_ARRAY) */
    public native CUctx_st dstContext(); public native CUDA_MEMCPY3D_PEER dstContext(CUctx_st dstContext);
    /** Destination pitch (ignored when dst is array) */
    public native @Cast("size_t") long dstPitch(); public native CUDA_MEMCPY3D_PEER dstPitch(long dstPitch);
    /** Destination height (ignored when dst is array; may be 0 if Depth==1) */
    public native @Cast("size_t") long dstHeight(); public native CUDA_MEMCPY3D_PEER dstHeight(long dstHeight);

    /** Width of 3D memory copy in bytes */
    public native @Cast("size_t") long WidthInBytes(); public native CUDA_MEMCPY3D_PEER WidthInBytes(long WidthInBytes);
    /** Height of 3D memory copy */
    public native @Cast("size_t") long Height(); public native CUDA_MEMCPY3D_PEER Height(long Height);
    /** Depth of 3D memory copy */
    public native @Cast("size_t") long Depth(); public native CUDA_MEMCPY3D_PEER Depth(long Depth);
}

/**
 * Array descriptor
 */
public static class CUDA_ARRAY_DESCRIPTOR extends Pointer {
    static { Loader.load(); }
    /** Default native constructor. */
    public CUDA_ARRAY_DESCRIPTOR() { super((Pointer)null); allocate(); }
    /** Native array allocator. Access with {@link Pointer#position(long)}. */
    public CUDA_ARRAY_DESCRIPTOR(long size) { super((Pointer)null); allocateArray(size); }
    /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */
    public CUDA_ARRAY_DESCRIPTOR(Pointer p) { super(p); }
    private native void allocate();
    private native void allocateArray(long size);
    @Override public CUDA_ARRAY_DESCRIPTOR position(long position) {
        return (CUDA_ARRAY_DESCRIPTOR)super.position(position);
    }

    /** Width of array */
    public native @Cast("size_t") long Width(); public native CUDA_ARRAY_DESCRIPTOR Width(long Width);
    /** Height of array */
    public native @Cast("size_t") long Height(); public native CUDA_ARRAY_DESCRIPTOR Height(long Height);

    /** Array format */
    public native @Cast("CUarray_format") int Format(); public native CUDA_ARRAY_DESCRIPTOR Format(int Format);
    /** Channels per array element */
    public native @Cast("unsigned int") int NumChannels(); public native CUDA_ARRAY_DESCRIPTOR NumChannels(int NumChannels);
}
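
// Usage sketch (not generated from cuda.h): describing and creating a 2D array of
// single-channel floats. CU_AD_FORMAT_FLOAT is defined earlier in this file; the
// cuArrayCreate(CUarray_st, CUDA_ARRAY_DESCRIPTOR) overload is assumed from the
// declarations found elsewhere in these bindings. The method name is illustrative only.
public static CUarray_st exampleCreateFloatArray(long width, long height) {
    CUDA_ARRAY_DESCRIPTOR desc = new CUDA_ARRAY_DESCRIPTOR();
    desc.Width(width).Height(height).Format(CU_AD_FORMAT_FLOAT).NumChannels(1);
    CUarray_st array = new CUarray_st();
    int status = cuArrayCreate(array, desc);
    if (status != CUDA_SUCCESS) {
        throw new RuntimeException("cuArrayCreate failed: " + status);
    }
    return array;
}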

/**
 * 3D array descriptor
 */
public static class CUDA_ARRAY3D_DESCRIPTOR extends Pointer {
    static { Loader.load(); }
    /** Default native constructor. */
    public CUDA_ARRAY3D_DESCRIPTOR() { super((Pointer)null); allocate(); }
    /** Native array allocator. Access with {@link Pointer#position(long)}. */
    public CUDA_ARRAY3D_DESCRIPTOR(long size) { super((Pointer)null); allocateArray(size); }
    /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */
    public CUDA_ARRAY3D_DESCRIPTOR(Pointer p) { super(p); }
    private native void allocate();
    private native void allocateArray(long size);
    @Override public CUDA_ARRAY3D_DESCRIPTOR position(long position) {
        return (CUDA_ARRAY3D_DESCRIPTOR)super.position(position);
    }

    /** Width of 3D array */
    public native @Cast("size_t") long Width(); public native CUDA_ARRAY3D_DESCRIPTOR Width(long Width);
    /** Height of 3D array */
    public native @Cast("size_t") long Height(); public native CUDA_ARRAY3D_DESCRIPTOR Height(long Height);
    /** Depth of 3D array */
    public native @Cast("size_t") long Depth(); public native CUDA_ARRAY3D_DESCRIPTOR Depth(long Depth);

    /** Array format */
    public native @Cast("CUarray_format") int Format(); public native CUDA_ARRAY3D_DESCRIPTOR Format(int Format);
    /** Channels per array element */
    public native @Cast("unsigned int") int NumChannels(); public native CUDA_ARRAY3D_DESCRIPTOR NumChannels(int NumChannels);
    /** Flags */
    public native @Cast("unsigned int") int Flags(); public native CUDA_ARRAY3D_DESCRIPTOR Flags(int Flags);
}

// #endif /* __CUDA_API_VERSION >= 3020 */

// #if __CUDA_API_VERSION >= 5000

/**
 * CUDA Resource descriptor
 */
public static class CUDA_RESOURCE_DESC extends Pointer {
    static { Loader.load(); }
    /** Default native constructor. */
    public CUDA_RESOURCE_DESC() { super((Pointer)null); allocate(); }
    /** Native array allocator. Access with {@link Pointer#position(long)}. */
    public CUDA_RESOURCE_DESC(long size) { super((Pointer)null); allocateArray(size); }
    /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */
    public CUDA_RESOURCE_DESC(Pointer p) { super(p); }
    private native void allocate();
    private native void allocateArray(long size);
    @Override public CUDA_RESOURCE_DESC position(long position) {
        return (CUDA_RESOURCE_DESC)super.position(position);
    }

    /** Resource type */
    public native @Cast("CUresourcetype") int resType(); public native CUDA_RESOURCE_DESC resType(int resType);

            /** CUDA array */
            @Name("res.array.hArray") public native CUarray_st res_array_hArray(); public native CUDA_RESOURCE_DESC res_array_hArray(CUarray_st res_array_hArray);
            /** CUDA mipmapped array */
            @Name("res.mipmap.hMipmappedArray") public native CUmipmappedArray_st res_mipmap_hMipmappedArray(); public native CUDA_RESOURCE_DESC res_mipmap_hMipmappedArray(CUmipmappedArray_st res_mipmap_hMipmappedArray);
            /** Device pointer */
            @Name("res.linear.devPtr") public native @Cast("CUdeviceptr") long res_linear_devPtr(); public native CUDA_RESOURCE_DESC res_linear_devPtr(long res_linear_devPtr);
            /** Array format */
            @Name("res.linear.format") public native @Cast("CUarray_format") int res_linear_format(); public native CUDA_RESOURCE_DESC res_linear_format(int res_linear_format);
            /** Channels per array element */
            @Name("res.linear.numChannels") public native @Cast("unsigned int") int res_linear_numChannels(); public native CUDA_RESOURCE_DESC res_linear_numChannels(int res_linear_numChannels);
            /** Size in bytes */
            @Name("res.linear.sizeInBytes") public native @Cast("size_t") long res_linear_sizeInBytes(); public native CUDA_RESOURCE_DESC res_linear_sizeInBytes(long res_linear_sizeInBytes);
            /** Device pointer */
            @Name("res.pitch2D.devPtr") public native @Cast("CUdeviceptr") long res_pitch2D_devPtr(); public native CUDA_RESOURCE_DESC res_pitch2D_devPtr(long res_pitch2D_devPtr);
            /** Array format */
            @Name("res.pitch2D.format") public native @Cast("CUarray_format") int res_pitch2D_format(); public native CUDA_RESOURCE_DESC res_pitch2D_format(int res_pitch2D_format);
            /** Channels per array element */
            @Name("res.pitch2D.numChannels") public native @Cast("unsigned int") int res_pitch2D_numChannels(); public native CUDA_RESOURCE_DESC res_pitch2D_numChannels(int res_pitch2D_numChannels);
            /** Width of the array in elements */
            @Name("res.pitch2D.width") public native @Cast("size_t") long res_pitch2D_width(); public native CUDA_RESOURCE_DESC res_pitch2D_width(long res_pitch2D_width);
            /** Height of the array in elements */
            @Name("res.pitch2D.height") public native @Cast("size_t") long res_pitch2D_height(); public native CUDA_RESOURCE_DESC res_pitch2D_height(long res_pitch2D_height);
            /** Pitch between two rows in bytes */
            @Name("res.pitch2D.pitchInBytes") public native @Cast("size_t") long res_pitch2D_pitchInBytes(); public native CUDA_RESOURCE_DESC res_pitch2D_pitchInBytes(long res_pitch2D_pitchInBytes);
            @Name("res.reserved.reserved") public native int res_reserved_reserved(int i); public native CUDA_RESOURCE_DESC res_reserved_reserved(int i, int res_reserved_reserved);
            @Name("res.reserved.reserved") @MemberGetter public native IntPointer res_reserved_reserved();

    /** Flags (must be zero) */
    public native @Cast("unsigned int") int flags(); public native CUDA_RESOURCE_DESC flags(int flags);
}

/**
 * Texture descriptor
 */
public static class CUDA_TEXTURE_DESC extends Pointer {
    static { Loader.load(); }
    /** Default native constructor. */
    public CUDA_TEXTURE_DESC() { super((Pointer)null); allocate(); }
    /** Native array allocator. Access with {@link Pointer#position(long)}. */
    public CUDA_TEXTURE_DESC(long size) { super((Pointer)null); allocateArray(size); }
    /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */
    public CUDA_TEXTURE_DESC(Pointer p) { super(p); }
    private native void allocate();
    private native void allocateArray(long size);
    @Override public CUDA_TEXTURE_DESC position(long position) {
        return (CUDA_TEXTURE_DESC)super.position(position);
    }

    /** Address modes */
    public native @Cast("CUaddress_mode") int addressMode(int i); public native CUDA_TEXTURE_DESC addressMode(int i, int addressMode);
    @MemberGetter public native @Cast("CUaddress_mode*") IntPointer addressMode();
    /** Filter mode */
    public native @Cast("CUfilter_mode") int filterMode(); public native CUDA_TEXTURE_DESC filterMode(int filterMode);
    /** Flags */
    public native @Cast("unsigned int") int flags(); public native CUDA_TEXTURE_DESC flags(int flags);
    /** Maximum anisotropy ratio */
    public native @Cast("unsigned int") int maxAnisotropy(); public native CUDA_TEXTURE_DESC maxAnisotropy(int maxAnisotropy);
    /** Mipmap filter mode */
    public native @Cast("CUfilter_mode") int mipmapFilterMode(); public native CUDA_TEXTURE_DESC mipmapFilterMode(int mipmapFilterMode);
    /** Mipmap level bias */
    public native float mipmapLevelBias(); public native CUDA_TEXTURE_DESC mipmapLevelBias(float mipmapLevelBias);
    /** Mipmap minimum level clamp */
    public native float minMipmapLevelClamp(); public native CUDA_TEXTURE_DESC minMipmapLevelClamp(float minMipmapLevelClamp);
    /** Mipmap maximum level clamp */
    public native float maxMipmapLevelClamp(); public native CUDA_TEXTURE_DESC maxMipmapLevelClamp(float maxMipmapLevelClamp); 
    /** Border Color */
    public native float borderColor(int i); public native CUDA_TEXTURE_DESC borderColor(int i, float borderColor);
    @MemberGetter public native FloatPointer borderColor();
    public native int reserved(int i); public native CUDA_TEXTURE_DESC reserved(int i, int reserved);
    @MemberGetter public native IntPointer reserved();
}

/**
 * Resource view format
 */
/** enum CUresourceViewFormat_enum */
public static final int
    /** No resource view format (use underlying resource format) */
    CU_RES_VIEW_FORMAT_NONE          = 0x00,
    /** 1 channel unsigned 8-bit integers */
    CU_RES_VIEW_FORMAT_UINT_1X8      = 0x01,
    /** 2 channel unsigned 8-bit integers */
    CU_RES_VIEW_FORMAT_UINT_2X8      = 0x02,
    /** 4 channel unsigned 8-bit integers */
    CU_RES_VIEW_FORMAT_UINT_4X8      = 0x03,
    /** 1 channel signed 8-bit integers */
    CU_RES_VIEW_FORMAT_SINT_1X8      = 0x04,
    /** 2 channel signed 8-bit integers */
    CU_RES_VIEW_FORMAT_SINT_2X8      = 0x05,
    /** 4 channel signed 8-bit integers */
    CU_RES_VIEW_FORMAT_SINT_4X8      = 0x06,
    /** 1 channel unsigned 16-bit integers */
    CU_RES_VIEW_FORMAT_UINT_1X16     = 0x07,
    /** 2 channel unsigned 16-bit integers */
    CU_RES_VIEW_FORMAT_UINT_2X16     = 0x08,
    /** 4 channel unsigned 16-bit integers */
    CU_RES_VIEW_FORMAT_UINT_4X16     = 0x09,
    /** 1 channel signed 16-bit integers */
    CU_RES_VIEW_FORMAT_SINT_1X16     = 0x0a,
    /** 2 channel signed 16-bit integers */
    CU_RES_VIEW_FORMAT_SINT_2X16     = 0x0b,
    /** 4 channel signed 16-bit integers */
    CU_RES_VIEW_FORMAT_SINT_4X16     = 0x0c,
    /** 1 channel unsigned 32-bit integers */
    CU_RES_VIEW_FORMAT_UINT_1X32     = 0x0d,
    /** 2 channel unsigned 32-bit integers */
    CU_RES_VIEW_FORMAT_UINT_2X32     = 0x0e,
    /** 4 channel unsigned 32-bit integers */
    CU_RES_VIEW_FORMAT_UINT_4X32     = 0x0f,
    /** 1 channel signed 32-bit integers */
    CU_RES_VIEW_FORMAT_SINT_1X32     = 0x10,
    /** 2 channel signed 32-bit integers */
    CU_RES_VIEW_FORMAT_SINT_2X32     = 0x11,
    /** 4 channel signed 32-bit integers */
    CU_RES_VIEW_FORMAT_SINT_4X32     = 0x12,
    /** 1 channel 16-bit floating point */
    CU_RES_VIEW_FORMAT_FLOAT_1X16    = 0x13,
    /** 2 channel 16-bit floating point */
    CU_RES_VIEW_FORMAT_FLOAT_2X16    = 0x14,
    /** 4 channel 16-bit floating point */
    CU_RES_VIEW_FORMAT_FLOAT_4X16    = 0x15,
    /** 1 channel 32-bit floating point */
    CU_RES_VIEW_FORMAT_FLOAT_1X32    = 0x16,
    /** 2 channel 32-bit floating point */
    CU_RES_VIEW_FORMAT_FLOAT_2X32    = 0x17,
    /** 4 channel 32-bit floating point */
    CU_RES_VIEW_FORMAT_FLOAT_4X32    = 0x18,
    /** Block compressed 1 */
    CU_RES_VIEW_FORMAT_UNSIGNED_BC1  = 0x19,
    /** Block compressed 2 */
    CU_RES_VIEW_FORMAT_UNSIGNED_BC2  = 0x1a,
    /** Block compressed 3 */
    CU_RES_VIEW_FORMAT_UNSIGNED_BC3  = 0x1b,
    /** Block compressed 4 unsigned */
    CU_RES_VIEW_FORMAT_UNSIGNED_BC4  = 0x1c,
    /** Block compressed 4 signed */
    CU_RES_VIEW_FORMAT_SIGNED_BC4    = 0x1d,
    /** Block compressed 5 unsigned */
    CU_RES_VIEW_FORMAT_UNSIGNED_BC5  = 0x1e,
    /** Block compressed 5 signed */
    CU_RES_VIEW_FORMAT_SIGNED_BC5    = 0x1f,
    /** Block compressed 6 unsigned half-float */
    CU_RES_VIEW_FORMAT_UNSIGNED_BC6H = 0x20,
    /** Block compressed 6 signed half-float */
    CU_RES_VIEW_FORMAT_SIGNED_BC6H   = 0x21,
    /** Block compressed 7 */
    CU_RES_VIEW_FORMAT_UNSIGNED_BC7  = 0x22;

/**
 * Resource view descriptor
 */
public static class CUDA_RESOURCE_VIEW_DESC extends Pointer {
    static { Loader.load(); }
    /** Default native constructor. */
    public CUDA_RESOURCE_VIEW_DESC() { super((Pointer)null); allocate(); }
    /** Native array allocator. Access with {@link Pointer#position(long)}. */
    public CUDA_RESOURCE_VIEW_DESC(long size) { super((Pointer)null); allocateArray(size); }
    /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */
    public CUDA_RESOURCE_VIEW_DESC(Pointer p) { super(p); }
    private native void allocate();
    private native void allocateArray(long size);
    @Override public CUDA_RESOURCE_VIEW_DESC position(long position) {
        return (CUDA_RESOURCE_VIEW_DESC)super.position(position);
    }

    /** Resource view format */
    public native @Cast("CUresourceViewFormat") int format(); public native CUDA_RESOURCE_VIEW_DESC format(int format);
    /** Width of the resource view */
    public native @Cast("size_t") long width(); public native CUDA_RESOURCE_VIEW_DESC width(long width);
    /** Height of the resource view */
    public native @Cast("size_t") long height(); public native CUDA_RESOURCE_VIEW_DESC height(long height);
    /** Depth of the resource view */
    public native @Cast("size_t") long depth(); public native CUDA_RESOURCE_VIEW_DESC depth(long depth);
    /** First defined mipmap level */
    public native @Cast("unsigned int") int firstMipmapLevel(); public native CUDA_RESOURCE_VIEW_DESC firstMipmapLevel(int firstMipmapLevel);
    /** Last defined mipmap level */
    public native @Cast("unsigned int") int lastMipmapLevel(); public native CUDA_RESOURCE_VIEW_DESC lastMipmapLevel(int lastMipmapLevel);
    /** First layer index */
    public native @Cast("unsigned int") int firstLayer(); public native CUDA_RESOURCE_VIEW_DESC firstLayer(int firstLayer);
    /** Last layer index */
    public native @Cast("unsigned int") int lastLayer(); public native CUDA_RESOURCE_VIEW_DESC lastLayer(int lastLayer);
    public native @Cast("unsigned int") int reserved(int i); public native CUDA_RESOURCE_VIEW_DESC reserved(int i, int reserved);
    @MemberGetter public native @Cast("unsigned int*") IntPointer reserved();
}
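
// Usage sketch (not generated from cuda.h): creating a texture object from a CUDA
// array with the three descriptors above. CU_RESOURCE_TYPE_ARRAY,
// CU_TR_ADDRESS_MODE_CLAMP and CU_TR_FILTER_MODE_LINEAR are defined earlier in this
// file; the cuTexObjectCreate(LongPointer, ...) call assumes the declaration found
// elsewhere in these bindings. The method name is illustrative only.
public static long exampleCreateTexObject(CUarray_st array) {
    CUDA_RESOURCE_DESC resDesc = new CUDA_RESOURCE_DESC();
    resDesc.resType(CU_RESOURCE_TYPE_ARRAY).res_array_hArray(array).flags(0);

    CUDA_TEXTURE_DESC texDesc = new CUDA_TEXTURE_DESC();
    texDesc.addressMode(0, CU_TR_ADDRESS_MODE_CLAMP)
           .addressMode(1, CU_TR_ADDRESS_MODE_CLAMP)
           .filterMode(CU_TR_FILTER_MODE_LINEAR)
           .flags(CU_TRSF_NORMALIZED_COORDINATES);

    LongPointer texObject = new LongPointer(1);
    int status = cuTexObjectCreate(texObject, resDesc, texDesc,
            (CUDA_RESOURCE_VIEW_DESC)null); // no resource view
    if (status != CUDA_SUCCESS) {
        throw new RuntimeException("cuTexObjectCreate failed: " + status);
    }
    return texObject.get(); // the CUtexObject handle
}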

/**
 * GPU Direct v3 tokens
 */
public static class CUDA_POINTER_ATTRIBUTE_P2P_TOKENS extends Pointer {
    static { Loader.load(); }
    /** Default native constructor. */
    public CUDA_POINTER_ATTRIBUTE_P2P_TOKENS() { super((Pointer)null); allocate(); }
    /** Native array allocator. Access with {@link Pointer#position(long)}. */
    public CUDA_POINTER_ATTRIBUTE_P2P_TOKENS(long size) { super((Pointer)null); allocateArray(size); }
    /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */
    public CUDA_POINTER_ATTRIBUTE_P2P_TOKENS(Pointer p) { super(p); }
    private native void allocate();
    private native void allocateArray(long size);
    @Override public CUDA_POINTER_ATTRIBUTE_P2P_TOKENS position(long position) {
        return (CUDA_POINTER_ATTRIBUTE_P2P_TOKENS)super.position(position);
    }

    public native @Cast("unsigned long long") long p2pToken(); public native CUDA_POINTER_ATTRIBUTE_P2P_TOKENS p2pToken(long p2pToken);
    public native @Cast("unsigned int") int vaSpaceToken(); public native CUDA_POINTER_ATTRIBUTE_P2P_TOKENS vaSpaceToken(int vaSpaceToken);
}

// #endif /* __CUDA_API_VERSION >= 5000 */

/**
 * If set, the CUDA array is a collection of layers, where each layer is either a 1D
 * or a 2D array and the Depth member of CUDA_ARRAY3D_DESCRIPTOR specifies the number 
 * of layers, not the depth of a 3D array.
 */
public static final int CUDA_ARRAY3D_LAYERED =        0x01;

/**
 * Deprecated, use CUDA_ARRAY3D_LAYERED
 */
public static final int CUDA_ARRAY3D_2DARRAY =        0x01;

/**
 * This flag must be set in order to bind a surface reference
 * to the CUDA array
 */
public static final int CUDA_ARRAY3D_SURFACE_LDST =   0x02;

/**
 * If set, the CUDA array is a collection of six 2D arrays, representing faces of a cube. The
 * width of such a CUDA array must be equal to its height, and Depth must be six.
 * If ::CUDA_ARRAY3D_LAYERED flag is also set, then the CUDA array is a collection of cubemaps
 * and Depth must be a multiple of six.
 */
public static final int CUDA_ARRAY3D_CUBEMAP =        0x04;

/**
 * This flag must be set in order to perform texture gather operations
 * on a CUDA array.
 */
public static final int CUDA_ARRAY3D_TEXTURE_GATHER = 0x08;

/**
 * If set, this flag indicates that the CUDA
 * array is a DEPTH_TEXTURE.
 */
public static final int CUDA_ARRAY3D_DEPTH_TEXTURE = 0x10;
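
// Usage sketch (not generated from cuda.h): the array flags above are OR'ed into the
// Flags field of ::CUDA_ARRAY3D_DESCRIPTOR. Here, a layered array of 2D float layers
// that can also be bound to a surface reference. Assumes the
// cuArray3DCreate(CUarray_st, CUDA_ARRAY3D_DESCRIPTOR) overload declared elsewhere in
// these bindings; the method name is illustrative only.
public static CUarray_st exampleCreateLayeredArray(long width, long height, long layers) {
    CUDA_ARRAY3D_DESCRIPTOR desc = new CUDA_ARRAY3D_DESCRIPTOR();
    desc.Width(width).Height(height).Depth(layers)   // Depth counts layers here
        .Format(CU_AD_FORMAT_FLOAT).NumChannels(1)
        .Flags(CUDA_ARRAY3D_LAYERED | CUDA_ARRAY3D_SURFACE_LDST);
    CUarray_st array = new CUarray_st();
    int status = cuArray3DCreate(array, desc);
    if (status != CUDA_SUCCESS) {
        throw new RuntimeException("cuArray3DCreate failed: " + status);
    }
    return array;
}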

/**
 * Override the texref format with a format inferred from the array.
 * Flag for ::cuTexRefSetArray()
 */
public static final int CU_TRSA_OVERRIDE_FORMAT = 0x01;

/**
 * Read the texture as integers rather than promoting the values to floats
 * in the range [0,1].
 * Flag for ::cuTexRefSetFlags()
 */
public static final int CU_TRSF_READ_AS_INTEGER =         0x01;

/**
 * Use normalized texture coordinates in the range [0,1) instead of [0,dim).
 * Flag for ::cuTexRefSetFlags()
 */
public static final int CU_TRSF_NORMALIZED_COORDINATES =  0x02;

/**
 * Perform sRGB->linear conversion during texture read.
 * Flag for ::cuTexRefSetFlags()
 */
public static final int CU_TRSF_SRGB =  0x10;

/**
 * End of array terminator for the \p extra parameter to
 * ::cuLaunchKernel
 */
public static native @MemberGetter Pointer CU_LAUNCH_PARAM_END();
public static final Pointer CU_LAUNCH_PARAM_END = CU_LAUNCH_PARAM_END();

/**
 * Indicator that the next value in the \p extra parameter to
 * ::cuLaunchKernel will be a pointer to a buffer containing all kernel
 * parameters used for launching kernel \p f.  This buffer needs to
 * honor all alignment/padding requirements of the individual parameters.
 * If ::CU_LAUNCH_PARAM_BUFFER_SIZE is not also specified in the
 * \p extra array, then ::CU_LAUNCH_PARAM_BUFFER_POINTER will have no
 * effect.
 */
public static native @MemberGetter Pointer CU_LAUNCH_PARAM_BUFFER_POINTER();
public static final Pointer CU_LAUNCH_PARAM_BUFFER_POINTER = CU_LAUNCH_PARAM_BUFFER_POINTER();

/**
 * Indicator that the next value in the \p extra parameter to
 * ::cuLaunchKernel will be a pointer to a size_t which contains the
 * size of the buffer specified with ::CU_LAUNCH_PARAM_BUFFER_POINTER.
 * It is required that ::CU_LAUNCH_PARAM_BUFFER_POINTER also be specified
 * in the \p extra array if the value associated with
 * ::CU_LAUNCH_PARAM_BUFFER_SIZE is not zero.
 */
public static native @MemberGetter Pointer CU_LAUNCH_PARAM_BUFFER_SIZE();
public static final Pointer CU_LAUNCH_PARAM_BUFFER_SIZE = CU_LAUNCH_PARAM_BUFFER_SIZE();
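
// Usage sketch (not generated from cuda.h): passing packed kernel arguments through
// the \p extra parameter of ::cuLaunchKernel using the three markers above. Assumes
// the cuLaunchKernel(CUfunc_st, int..., CUstream_st, PointerPointer, PointerPointer)
// declaration found elsewhere in these bindings; the method name is illustrative only.
public static int exampleLaunchWithExtra(CUfunc_st kernel, Pointer argBuffer,
                                         long argBufferSize, int gridDim, int blockDim) {
    SizeTPointer sizePtr = new SizeTPointer(1).put(0, argBufferSize);
    PointerPointer extra = new PointerPointer(
            CU_LAUNCH_PARAM_BUFFER_POINTER, argBuffer,
            CU_LAUNCH_PARAM_BUFFER_SIZE, sizePtr,
            CU_LAUNCH_PARAM_END);
    return cuLaunchKernel(kernel, gridDim, 1, 1, blockDim, 1, 1,
            0 /* sharedMemBytes */, (CUstream_st)null /* default stream */,
            (PointerPointer)null /* kernelParams */, extra);
}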

/**
 * For texture references loaded into the module, use default texunit from
 * texture reference.
 */
public static final int CU_PARAM_TR_DEFAULT = -1;

/**
 * Device that represents the CPU
 */
public static native @MemberGetter int CU_DEVICE_CPU();
public static final int CU_DEVICE_CPU = CU_DEVICE_CPU();

/**
 * Device that represents an invalid device
 */
public static native @MemberGetter int CU_DEVICE_INVALID();
public static final int CU_DEVICE_INVALID = CU_DEVICE_INVALID();

/** \} */ /* END CUDA_TYPES */

// #ifdef _WIN32
// #define CUDAAPI __stdcall
// #else
// #define CUDAAPI
// #endif

/**
 * \defgroup CUDA_ERROR Error Handling
 *
 * ___MANBRIEF___ error handling functions of the low-level CUDA driver API
 * (___CURRENT_FILE___) ___ENDMANBRIEF___
 *
 * This section describes the error handling functions of the low-level CUDA
 * driver application programming interface.
 *
 * \{
 */

/**
 * \brief Gets the string description of an error code
 *
 * Sets \p *pStr to the address of a NULL-terminated string description
 * of the error code \p error.
 * If the error code is not recognized, ::CUDA_ERROR_INVALID_VALUE
 * will be returned and \p *pStr will be set to the NULL address.
 *
 * @param error - Error code to convert to string
 * @param pStr - Address of the string pointer.
 *
 * @return
 * ::CUDA_SUCCESS,
 * ::CUDA_ERROR_INVALID_VALUE
 *
 * \sa ::CUresult
 */
public static native @Cast("CUresult") int cuGetErrorString(@Cast("CUresult") int error, @Cast("const char**") PointerPointer pStr);
public static native @Cast("CUresult") int cuGetErrorString(@Cast("CUresult") int error, @Cast("const char**") @ByPtrPtr BytePointer pStr);
public static native @Cast("CUresult") int cuGetErrorString(@Cast("CUresult") int error, @Cast("const char**") @ByPtrPtr ByteBuffer pStr);
public static native @Cast("CUresult") int cuGetErrorString(@Cast("CUresult") int error, @Cast("const char**") @ByPtrPtr byte[] pStr);

/**
 * \brief Gets the string representation of an error code enum name
 *
 * Sets \p *pStr to the address of a NULL-terminated string representation
 * of the name of the enum error code \p error.
 * If the error code is not recognized, ::CUDA_ERROR_INVALID_VALUE
 * will be returned and \p *pStr will be set to the NULL address.
 *
 * @param error - Error code to convert to string
 * @param pStr - Address of the string pointer.
 *
 * @return
 * ::CUDA_SUCCESS,
 * ::CUDA_ERROR_INVALID_VALUE
 *
 * \sa ::CUresult
 */
public static native @Cast("CUresult") int cuGetErrorName(@Cast("CUresult") int error, @Cast("const char**") PointerPointer pStr);
public static native @Cast("CUresult") int cuGetErrorName(@Cast("CUresult") int error, @Cast("const char**") @ByPtrPtr BytePointer pStr);
public static native @Cast("CUresult") int cuGetErrorName(@Cast("CUresult") int error, @Cast("const char**") @ByPtrPtr ByteBuffer pStr);
public static native @Cast("CUresult") int cuGetErrorName(@Cast("CUresult") int error, @Cast("const char**") @ByPtrPtr byte[] pStr);
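
// Usage sketch (not generated from cuda.h): turning a CUresult into a readable
// message with the two functions above; exampleDescribe is an illustrative name only.
public static String exampleDescribe(int error) {
    PointerPointer namePtr = new PointerPointer(1), descPtr = new PointerPointer(1);
    if (cuGetErrorName(error, namePtr) != CUDA_SUCCESS
            || cuGetErrorString(error, descPtr) != CUDA_SUCCESS) {
        return "unrecognized CUresult " + error;
    }
    return new BytePointer(namePtr.get(0)).getString()
            + ": " + new BytePointer(descPtr.get(0)).getString();
}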

/** \} */ /* END CUDA_ERROR */

/**
 * \defgroup CUDA_INITIALIZE Initialization
 *
 * ___MANBRIEF___ initialization functions of the low-level CUDA driver API
 * (___CURRENT_FILE___) ___ENDMANBRIEF___
 *
 * This section describes the initialization functions of the low-level CUDA
 * driver application programming interface.
 *
 * \{
 */

/**
 * \brief Initialize the CUDA driver API
 *
 * Initializes the driver API and must be called before any other function from
 * the driver API. Currently, the \p Flags parameter must be 0. If ::cuInit()
 * has not been called, any function from the driver API will return
 * ::CUDA_ERROR_NOT_INITIALIZED.
 *
 * @param Flags - Initialization flag for CUDA.
 *
 * @return
 * ::CUDA_SUCCESS,
 * ::CUDA_ERROR_INVALID_VALUE,
 * ::CUDA_ERROR_INVALID_DEVICE
 * \notefnerr
 */
public static native @Cast("CUresult") int cuInit(@Cast("unsigned int") int Flags);

/** \} */ /* END CUDA_INITIALIZE */

/**
 * \defgroup CUDA_VERSION Version Management
 *
 * ___MANBRIEF___ version management functions of the low-level CUDA driver
 * API (___CURRENT_FILE___) ___ENDMANBRIEF___
 *
 * This section describes the version management functions of the low-level
 * CUDA driver application programming interface.
 *
 * \{
 */

/**
 * \brief Returns the CUDA driver version
 *
 * Returns in \p *driverVersion the version number of the installed CUDA
 * driver. This function automatically returns ::CUDA_ERROR_INVALID_VALUE if
 * the \p driverVersion argument is NULL.
 *
 * @param driverVersion - Returns the CUDA driver version
 *
 * @return
 * ::CUDA_SUCCESS,
 * ::CUDA_ERROR_INVALID_VALUE
 * \notefnerr
 */
public static native @Cast("CUresult") int cuDriverGetVersion(IntPointer driverVersion);
public static native @Cast("CUresult") int cuDriverGetVersion(IntBuffer driverVersion);
public static native @Cast("CUresult") int cuDriverGetVersion(int[] driverVersion);
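
// Usage sketch (not generated from cuda.h): the typical start of a driver-API program,
// using the two declarations above; exampleInitAndGetVersion is an illustrative name.
public static int exampleInitAndGetVersion() {
    int status = cuInit(0); // Flags must currently be 0
    if (status != CUDA_SUCCESS) {
        throw new RuntimeException("cuInit failed: " + status);
    }
    int[] version = new int[1];
    cuDriverGetVersion(version);
    return version[0]; // e.g. 8000 for CUDA 8.0
}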

/** \} */ /* END CUDA_VERSION */

/**
 * \defgroup CUDA_DEVICE Device Management
 *
 * ___MANBRIEF___ device management functions of the low-level CUDA driver API
 * (___CURRENT_FILE___) ___ENDMANBRIEF___
 *
 * This section describes the device management functions of the low-level
 * CUDA driver application programming interface.
 *
 * \{
 */

/**
 * \brief Returns a handle to a compute device
 *
 * Returns in \p *device a device handle given an ordinal in the range [0,
 * ::cuDeviceGetCount()-1].
 *
 * @param device  - Returned device handle
 * @param ordinal - Device number to get handle for
 *
 * @return
 * ::CUDA_SUCCESS,
 * ::CUDA_ERROR_DEINITIALIZED,
 * ::CUDA_ERROR_NOT_INITIALIZED,
 * ::CUDA_ERROR_INVALID_CONTEXT,
 * ::CUDA_ERROR_INVALID_VALUE,
 * ::CUDA_ERROR_INVALID_DEVICE
 * \notefnerr
 *
 * \sa
 * ::cuDeviceGetAttribute,
 * ::cuDeviceGetCount,
 * ::cuDeviceGetName,
 * ::cuDeviceTotalMem
 */
public static native @Cast("CUresult") int cuDeviceGet(@Cast("CUdevice*") IntPointer device, int ordinal);
public static native @Cast("CUresult") int cuDeviceGet(@Cast("CUdevice*") IntBuffer device, int ordinal);
public static native @Cast("CUresult") int cuDeviceGet(@Cast("CUdevice*") int[] device, int ordinal);

/**
 * \brief Returns the number of compute-capable devices
 *
 * Returns in \p *count the number of devices with compute capability greater
 * than or equal to 1.0 that are available for execution. If there is no such
 * device, ::cuDeviceGetCount() returns 0.
 *
 * @param count - Returned number of compute-capable devices
 *
 * @return
 * ::CUDA_SUCCESS,
 * ::CUDA_ERROR_DEINITIALIZED,
 * ::CUDA_ERROR_NOT_INITIALIZED,
 * ::CUDA_ERROR_INVALID_CONTEXT,
 * ::CUDA_ERROR_INVALID_VALUE
 * \notefnerr
 *
 * \sa
 * ::cuDeviceGetAttribute,
 * ::cuDeviceGetName,
 * ::cuDeviceGet,
 * ::cuDeviceTotalMem
 */
public static native @Cast("CUresult") int cuDeviceGetCount(IntPointer count);
public static native @Cast("CUresult") int cuDeviceGetCount(IntBuffer count);
public static native @Cast("CUresult") int cuDeviceGetCount(int[] count);

/**
 * \brief Returns an identifier string for the device
 *
 * Returns an ASCII string identifying the device \p dev in the NULL-terminated
 * string pointed to by \p name. \p len specifies the maximum length of the
 * string that may be returned.
 *
 * @param name - Returned identifier string for the device
 * @param len  - Maximum length of string to store in \p name
 * @param dev  - Device to get identifier string for
 *
 * @return
 * ::CUDA_SUCCESS,
 * ::CUDA_ERROR_DEINITIALIZED,
 * ::CUDA_ERROR_NOT_INITIALIZED,
 * ::CUDA_ERROR_INVALID_CONTEXT,
 * ::CUDA_ERROR_INVALID_VALUE,
 * ::CUDA_ERROR_INVALID_DEVICE
 * \notefnerr
 *
 * \sa
 * ::cuDeviceGetAttribute,
 * ::cuDeviceGetCount,
 * ::cuDeviceGet,
 * ::cuDeviceTotalMem
 */
public static native @Cast("CUresult") int cuDeviceGetName(@Cast("char*") BytePointer name, int len, @Cast("CUdevice") int dev);
public static native @Cast("CUresult") int cuDeviceGetName(@Cast("char*") ByteBuffer name, int len, @Cast("CUdevice") int dev);
public static native @Cast("CUresult") int cuDeviceGetName(@Cast("char*") byte[] name, int len, @Cast("CUdevice") int dev);

// #if __CUDA_API_VERSION >= 3020
/**
 * \brief Returns the total amount of memory on the device
 *
 * Returns in \p *bytes the total amount of memory available on the device
 * \p dev in bytes.
 *
 * @param bytes - Returned memory available on device in bytes
 * @param dev   - Device handle
 *
 * @return
 * ::CUDA_SUCCESS,
 * ::CUDA_ERROR_DEINITIALIZED,
 * ::CUDA_ERROR_NOT_INITIALIZED,
 * ::CUDA_ERROR_INVALID_CONTEXT,
 * ::CUDA_ERROR_INVALID_VALUE,
 * ::CUDA_ERROR_INVALID_DEVICE
 * \notefnerr
 *
 * \sa
 * ::cuDeviceGetAttribute,
 * ::cuDeviceGetCount,
 * ::cuDeviceGetName,
 * ::cuDeviceGet,
 */
public static native @Cast("CUresult") int cuDeviceTotalMem(@Cast("size_t*") SizeTPointer bytes, @Cast("CUdevice") int dev);
// #endif /* __CUDA_API_VERSION >= 3020 */
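
// Usage sketch (not generated from cuda.h): enumerating devices with the declarations
// above (cuDeviceGetCount, cuDeviceGet, cuDeviceGetName, cuDeviceTotalMem); the method
// name is illustrative only. ::cuInit() must have been called first.
public static void exampleListDevices() {
    int[] count = new int[1];
    cuDeviceGetCount(count);
    for (int ordinal = 0; ordinal < count[0]; ordinal++) {
        int[] device = new int[1];
        cuDeviceGet(device, ordinal);
        BytePointer name = new BytePointer(256);
        cuDeviceGetName(name, 256, device[0]);
        SizeTPointer bytes = new SizeTPointer(1);
        cuDeviceTotalMem(bytes, device[0]);
        System.out.println(ordinal + ": " + name.getString()
                + " (" + bytes.get() + " bytes)");
    }
}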

/**
 * \brief Returns information about the device
 *
 * Returns in \p *pi the integer value of the attribute \p attrib on device
 * \p dev. The supported attributes are:
 * - ::CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK: Maximum number of threads per
 *   block;
 * - ::CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X: Maximum x-dimension of a block;
 * - ::CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y: Maximum y-dimension of a block;
 * - ::CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z: Maximum z-dimension of a block;
 * - ::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X: Maximum x-dimension of a grid;
 * - ::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y: Maximum y-dimension of a grid;
 * - ::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z: Maximum z-dimension of a grid;
 * - ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK: Maximum amount of
 *   shared memory available to a thread block in bytes;
 * - ::CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY: Memory available on device for
 *   __constant__ variables in a CUDA C kernel in bytes;
 * - ::CU_DEVICE_ATTRIBUTE_WARP_SIZE: Warp size in threads;
 * - ::CU_DEVICE_ATTRIBUTE_MAX_PITCH: Maximum pitch in bytes allowed by the
 *   memory copy functions that involve memory regions allocated through
 *   ::cuMemAllocPitch();
 * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH: Maximum 1D 
 *  texture width;
 * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH: Maximum width
 *  for a 1D texture bound to linear memory;
 * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH: Maximum 
 *  mipmapped 1D texture width;
 * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH: Maximum 2D 
 *  texture width;
 * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT: Maximum 2D 
 *  texture height;
 * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH: Maximum width
 *  for a 2D texture bound to linear memory;
 * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT: Maximum height
 *  for a 2D texture bound to linear memory;
 * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH: Maximum pitch
 *  in bytes for a 2D texture bound to linear memory;
 * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_WIDTH: Maximum 
 *  mipmapped 2D texture width;
 * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_HEIGHT: Maximum
 *  mipmapped 2D texture height;
 * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH: Maximum 3D 
 *  texture width;
 * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT: Maximum 3D 
 *  texture height;
 * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH: Maximum 3D 
 *  texture depth;
 * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE: 
 *  Alternate maximum 3D texture width, 0 if no alternate
 *  maximum 3D texture size is supported;
 * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE: 
 *  Alternate maximum 3D texture height, 0 if no alternate
 *  maximum 3D texture size is supported;
 * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE: 
 *  Alternate maximum 3D texture depth, 0 if no alternate
 *  maximum 3D texture size is supported;
 * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_WIDTH:
 *  Maximum cubemap texture width or height;
 * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH: 
 *  Maximum 1D layered texture width;
 * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS: 
 *   Maximum layers in a 1D layered texture;
 * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH: 
 *  Maximum 2D layered texture width;
 * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT: 
 *   Maximum 2D layered texture height;
 * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS: 
 *   Maximum layers in a 2D layered texture;
 * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH: 
 *   Maximum cubemap layered texture width or height;
 * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS: 
 *   Maximum layers in a cubemap layered texture;
 * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH:
 *   Maximum 1D surface width;
 * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH:
 *   Maximum 2D surface width;
 * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT:
 *   Maximum 2D surface height;
 * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH:
 *   Maximum 3D surface width;
 * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT:
 *   Maximum 3D surface height;
 * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH:
 *   Maximum 3D surface depth;
 * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_WIDTH:
 *   Maximum 1D layered surface width;
 * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_LAYERS:
 *   Maximum layers in a 1D layered surface;
 * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_WIDTH:
 *   Maximum 2D layered surface width;
 * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_HEIGHT:
 *   Maximum 2D layered surface height;
 * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_LAYERS:
 *   Maximum layers in a 2D layered surface;
 * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_WIDTH:
 *   Maximum cubemap surface width;
 * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH:
 *   Maximum cubemap layered surface width;
 * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS:
 *   Maximum layers in a cubemap layered surface;
 * - ::CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK: Maximum number of 32-bit
 *   registers available to a thread block;
 * - ::CU_DEVICE_ATTRIBUTE_CLOCK_RATE: Typical clock frequency in kilohertz;
 * - ::CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT: Alignment requirement; texture
 *   base addresses aligned to ::textureAlign bytes do not need an offset
 *   applied to texture fetches;
 * - ::CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT: Pitch alignment requirement
 *   for 2D texture references bound to pitched memory;
 * - ::CU_DEVICE_ATTRIBUTE_GPU_OVERLAP: 1 if the device can concurrently copy
 *   memory between host and device while executing a kernel, or 0 if not;
 * - ::CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT: Number of multiprocessors on
 *   the device;
 * - ::CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT: 1 if there is a run time limit
 *   for kernels executed on the device, or 0 if not;
 * - ::CU_DEVICE_ATTRIBUTE_INTEGRATED: 1 if the device is integrated with the
 *   memory subsystem, or 0 if not;
 * - ::CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY: 1 if the device can map host
 *   memory into the CUDA address space, or 0 if not;
 * - ::CU_DEVICE_ATTRIBUTE_COMPUTE_MODE: Compute mode that device is currently
 *   in. Available modes are as follows:
 *   - ::CU_COMPUTEMODE_DEFAULT: Default mode - Device is not restricted and
 *     can have multiple CUDA contexts present at a single time.
 *   - ::CU_COMPUTEMODE_PROHIBITED: Compute-prohibited mode - Device is
 *     prohibited from creating new CUDA contexts.
 *   - ::CU_COMPUTEMODE_EXCLUSIVE_PROCESS:  Compute-exclusive-process mode - Device
 *     can have only one context used by a single process at a time.
 * - ::CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS: 1 if the device supports
 *   executing multiple kernels within the same context simultaneously, or 0 if
 *   not. It is not guaranteed that multiple kernels will be resident
 *   on the device concurrently so this feature should not be relied upon for
 *   correctness;
 * - ::CU_DEVICE_ATTRIBUTE_ECC_ENABLED: 1 if error correction is enabled on the
 *    device, 0 if error correction is disabled or not supported by the device;
 * - ::CU_DEVICE_ATTRIBUTE_PCI_BUS_ID: PCI bus identifier of the device;
 * - ::CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID: PCI device (also known as slot) identifier
 *   of the device;
 * - ::CU_DEVICE_ATTRIBUTE_TCC_DRIVER: 1 if the device is using a TCC driver. TCC
 *    is only available on Tesla hardware running Windows Vista or later;
 * - ::CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE: Peak memory clock frequency in kilohertz;
 * - ::CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH: Global memory bus width in bits;
 * - ::CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE: Size of L2 cache in bytes. 0 if the device doesn't have L2 cache;
 * - ::CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR: Maximum resident threads per multiprocessor;
 * - ::CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING: 1 if the device shares a unified address space with 
 *   the host, or 0 if not;
 * - ::CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR: Major compute capability version number;
 * - ::CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR: Minor compute capability version number;
 * - ::CU_DEVICE_ATTRIBUTE_GLOBAL_L1_CACHE_SUPPORTED: 1 if device supports caching globals 
 *    in L1 cache, 0 if caching globals in L1 cache is not supported by the device;
 * - ::CU_DEVICE_ATTRIBUTE_LOCAL_L1_CACHE_SUPPORTED: 1 if device supports caching locals 
 *    in L1 cache, 0 if caching locals in L1 cache is not supported by the device;
 * - ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR: Maximum amount of
 *   shared memory available to a multiprocessor in bytes; this amount is shared
 *   by all thread blocks simultaneously resident on a multiprocessor;
 * - ::CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR: Maximum number of 32-bit
 *   registers available to a multiprocessor; this number is shared by all thread
 *   blocks simultaneously resident on a multiprocessor;
 * - ::CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY: 1 if device supports allocating managed memory
 *   on this system, 0 if allocating managed memory is not supported by the device on this system.
 * - ::CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD: 1 if device is on a multi-GPU board, 0 if not.
 * - ::CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD_GROUP_ID: Unique identifier for a group of devices
 *   associated with the same board. Devices on the same multi-GPU board will share the same identifier.
 * - ::CU_DEVICE_ATTRIBUTE_HOST_NATIVE_ATOMIC_SUPPORTED: 1 if the link between the device and the host
 *   supports native atomic operations.
 * - ::CU_DEVICE_ATTRIBUTE_SINGLE_TO_DOUBLE_PRECISION_PERF_RATIO: Ratio of single precision performance
 *   (in floating-point operations per second) to double precision performance.
 * - ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS: Device supports coherently accessing
 *   pageable memory without calling cudaHostRegister on it.
 * - ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS: Device can coherently access managed memory
 *   concurrently with the CPU.
 * - ::CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED: Device supports Compute Preemption.
 * - ::CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM: Device can access host registered
 *   memory at the same virtual address as the CPU.
 *
 * @param pi     - Returned device attribute value
 * @param attrib - Device attribute to query
 * @param dev    - Device handle
 *
 * @return
 * ::CUDA_SUCCESS,
 * ::CUDA_ERROR_DEINITIALIZED,
 * ::CUDA_ERROR_NOT_INITIALIZED,
 * ::CUDA_ERROR_INVALID_CONTEXT,
 * ::CUDA_ERROR_INVALID_VALUE,
 * ::CUDA_ERROR_INVALID_DEVICE
 * \notefnerr
 *
 * \sa
 * ::cuDeviceGetCount,
 * ::cuDeviceGetName,
 * ::cuDeviceGet,
 * ::cuDeviceTotalMem
 */
public static native @Cast("CUresult") int cuDeviceGetAttribute(IntPointer pi, @Cast("CUdevice_attribute") int attrib, @Cast("CUdevice") int dev);
public static native @Cast("CUresult") int cuDeviceGetAttribute(IntBuffer pi, @Cast("CUdevice_attribute") int attrib, @Cast("CUdevice") int dev);
public static native @Cast("CUresult") int cuDeviceGetAttribute(int[] pi, @Cast("CUdevice_attribute") int attrib, @Cast("CUdevice") int dev);

/** \} */ /* END CUDA_DEVICE */

/**
 * \defgroup CUDA_DEVICE_DEPRECATED Device Management [DEPRECATED]
 *
 * ___MANBRIEF___ deprecated device management functions of the low-level CUDA
 * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___
 *
 * This section describes the device management functions of the low-level
 * CUDA driver application programming interface.
 *
 * \{
 */

/**
 * \brief Returns properties for a selected device
 *
 * @deprecated
 *
 * This function was deprecated as of CUDA 5.0 and replaced by ::cuDeviceGetAttribute().
 *
 * Returns in \p *prop the properties of device \p dev. The ::CUdevprop
 * structure is defined as:
 *
 * 
{@code
     typedef struct CUdevprop_st {
         int maxThreadsPerBlock;
         int maxThreadsDim[3];
         int maxGridSize[3];
         int sharedMemPerBlock;
         int totalConstantMemory;
         int SIMDWidth;
         int memPitch;
         int regsPerBlock;
         int clockRate;
         int textureAlign;
     } CUdevprop;
 * }
 * where:
 *
 * - ::maxThreadsPerBlock is the maximum number of threads per block;
 * - ::maxThreadsDim[3] is the maximum sizes of each dimension of a block;
 * - ::maxGridSize[3] is the maximum sizes of each dimension of a grid;
 * - ::sharedMemPerBlock is the total amount of shared memory available per
 *   block in bytes;
 * - ::totalConstantMemory is the total amount of constant memory available on
 *   the device in bytes;
 * - ::SIMDWidth is the warp size;
 * - ::memPitch is the maximum pitch allowed by the memory copy functions that
 *   involve memory regions allocated through ::cuMemAllocPitch();
 * - ::regsPerBlock is the total number of registers available per block;
 * - ::clockRate is the clock frequency in kilohertz;
 * - ::textureAlign is the alignment requirement; texture base addresses that
 *   are aligned to ::textureAlign bytes do not need an offset applied to
 *   texture fetches.
 *
 * @param prop - Returned properties of device
 * @param dev  - Device to get properties for
 *
 * @return
 * ::CUDA_SUCCESS,
 * ::CUDA_ERROR_DEINITIALIZED,
 * ::CUDA_ERROR_NOT_INITIALIZED,
 * ::CUDA_ERROR_INVALID_CONTEXT,
 * ::CUDA_ERROR_INVALID_VALUE,
 * ::CUDA_ERROR_INVALID_DEVICE
 * \notefnerr
 *
 * \sa
 * ::cuDeviceGetAttribute,
 * ::cuDeviceGetCount,
 * ::cuDeviceGetName,
 * ::cuDeviceGet,
 * ::cuDeviceTotalMem
 */
public static native @Cast("CUresult") int cuDeviceGetProperties(CUdevprop prop, @Cast("CUdevice") int dev);

/**
 * \brief Returns the compute capability of the device
 *
 * @deprecated
 *
 * This function was deprecated as of CUDA 5.0 and its functionality superseded
 * by ::cuDeviceGetAttribute().
 *
 * Returns in \p *major and \p *minor the major and minor revision numbers that
 * define the compute capability of the device \p dev.
 *
 * @param major - Major revision number
 * @param minor - Minor revision number
 * @param dev   - Device handle
 *
 * @return
 * ::CUDA_SUCCESS,
 * ::CUDA_ERROR_DEINITIALIZED,
 * ::CUDA_ERROR_NOT_INITIALIZED,
 * ::CUDA_ERROR_INVALID_CONTEXT,
 * ::CUDA_ERROR_INVALID_VALUE,
 * ::CUDA_ERROR_INVALID_DEVICE
 * \notefnerr
 *
 * \sa
 * ::cuDeviceGetAttribute,
 * ::cuDeviceGetCount,
 * ::cuDeviceGetName,
 * ::cuDeviceGet,
 * ::cuDeviceTotalMem
 */
public static native @Cast("CUresult") int cuDeviceComputeCapability(IntPointer major, IntPointer minor, @Cast("CUdevice") int dev);
public static native @Cast("CUresult") int cuDeviceComputeCapability(IntBuffer major, IntBuffer minor, @Cast("CUdevice") int dev);
public static native @Cast("CUresult") int cuDeviceComputeCapability(int[] major, int[] minor, @Cast("CUdevice") int dev);

/** \} */ /* END CUDA_DEVICE_DEPRECATED */

/**
 * \defgroup CUDA_PRIMARY_CTX Primary Context Management
 *
 * ___MANBRIEF___ primary context management functions of the low-level CUDA driver
 * API (___CURRENT_FILE___) ___ENDMANBRIEF___
 *
 * This section describes the primary context management functions of the low-level
 * CUDA driver application programming interface.
 *
 * The primary context is unique per device and is shared with the CUDA runtime API.
 * These functions allow seamless integration with other libraries using CUDA.
 *
 * \{
 */

// #if __CUDA_API_VERSION >= 7000

/**
 * \brief Retain the primary context on the GPU
 *
 * Retains the primary context on the device, creating it if necessary,
 * increasing its usage count. The caller must call
 * ::cuDevicePrimaryCtxRelease() when done using the context.
 * Unlike ::cuCtxCreate() the newly created context is not pushed onto the stack.
 *
 * Context creation will fail with ::CUDA_ERROR_UNKNOWN if the compute mode of
 * the device is ::CU_COMPUTEMODE_PROHIBITED.
 * The function ::cuDeviceGetAttribute()
 * can be used with ::CU_DEVICE_ATTRIBUTE_COMPUTE_MODE to determine the compute mode
 * of the device.
 * The nvidia-smi tool can be used to set the compute mode for devices.
 * Documentation for nvidia-smi can be obtained by passing a -h option to it.
 *
 * Please note that the primary context always supports pinned allocations. Other
 * flags can be specified by ::cuDevicePrimaryCtxSetFlags().
 *
 * @param pctx - Returned context handle of the new context
 * @param dev  - Device for which primary context is requested
 *
 * @return
 * ::CUDA_SUCCESS,
 * ::CUDA_ERROR_DEINITIALIZED,
 * ::CUDA_ERROR_NOT_INITIALIZED,
 * ::CUDA_ERROR_INVALID_CONTEXT,
 * ::CUDA_ERROR_INVALID_DEVICE,
 * ::CUDA_ERROR_INVALID_VALUE,
 * ::CUDA_ERROR_OUT_OF_MEMORY,
 * ::CUDA_ERROR_UNKNOWN
 * \notefnerr
 *
 * \sa ::cuDevicePrimaryCtxRelease,
 * ::cuDevicePrimaryCtxSetFlags,
 * ::cuCtxCreate,
 * ::cuCtxGetApiVersion,
 * ::cuCtxGetCacheConfig,
 * ::cuCtxGetDevice,
 * ::cuCtxGetFlags,
 * ::cuCtxGetLimit,
 * ::cuCtxPopCurrent,
 * ::cuCtxPushCurrent,
 * ::cuCtxSetCacheConfig,
 * ::cuCtxSetLimit,
 * ::cuCtxSynchronize
 */
public static native @Cast("CUresult") int cuDevicePrimaryCtxRetain(@ByPtrPtr CUctx_st pctx, @Cast("CUdevice") int dev);

/**
 * \brief Release the primary context on the GPU
 *
 * Releases the primary context interop on the device by decreasing the usage
 * count by 1. If the usage drops to 0, the primary context of device \p dev
 * will be destroyed regardless of how many threads it is current to.
 *
 * Please note that unlike ::cuCtxDestroy() this method does not pop the context
 * from the stack in any circumstances.
 *
 * @param dev - Device whose primary context is released
 *
 * @return
 * ::CUDA_SUCCESS,
 * ::CUDA_ERROR_DEINITIALIZED,
 * ::CUDA_ERROR_NOT_INITIALIZED,
 * ::CUDA_ERROR_INVALID_DEVICE
 * \notefnerr
 *
 * \sa ::cuDevicePrimaryCtxRetain,
 * ::cuCtxDestroy,
 * ::cuCtxGetApiVersion,
 * ::cuCtxGetCacheConfig,
 * ::cuCtxGetDevice,
 * ::cuCtxGetFlags,
 * ::cuCtxGetLimit,
 * ::cuCtxPopCurrent,
 * ::cuCtxPushCurrent,
 * ::cuCtxSetCacheConfig,
 * ::cuCtxSetLimit,
 * ::cuCtxSynchronize
 */
public static native @Cast("CUresult") int cuDevicePrimaryCtxRelease(@Cast("CUdevice") int dev);

/**
 * \brief Set flags for the primary context
 *
 * Sets the flags for the primary context on the device, overwriting previously
 * set ones. If the primary context is already created,
 * ::CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE is returned.
 *
 * The three LSBs of the \p flags parameter can be used to control how the OS
 * thread, which owns the CUDA context at the time of an API call, interacts
 * with the OS scheduler when waiting for results from the GPU. Only one of
 * the scheduling flags can be set when creating a context.
 *
 * - ::CU_CTX_SCHED_SPIN: Instruct CUDA to actively spin when waiting for
 *   results from the GPU. This can decrease latency when waiting for the GPU,
 *   but may lower the performance of CPU threads if they are performing work in
 *   parallel with the CUDA thread.
 *
 * - ::CU_CTX_SCHED_YIELD: Instruct CUDA to yield its thread when waiting for
 *   results from the GPU. This can increase latency when waiting for the GPU,
 *   but can increase the performance of CPU threads performing work in parallel
 *   with the GPU.
 *
 * - ::CU_CTX_SCHED_BLOCKING_SYNC: Instruct CUDA to block the CPU thread on a
 *   synchronization primitive when waiting for the GPU to finish work.
 *
 * - ::CU_CTX_BLOCKING_SYNC: Instruct CUDA to block the CPU thread on a
 *   synchronization primitive when waiting for the GPU to finish work.
* Deprecated: This flag was deprecated as of CUDA 4.0 and was * replaced with ::CU_CTX_SCHED_BLOCKING_SYNC. * * - ::CU_CTX_SCHED_AUTO: The default value if the \p flags parameter is zero, * uses a heuristic based on the number of active CUDA contexts in the * process \e C and the number of logical processors in the system \e P. If * \e C > \e P, then CUDA will yield to other OS threads when waiting for * the GPU (::CU_CTX_SCHED_YIELD), otherwise CUDA will not yield while * waiting for results and actively spin on the processor (::CU_CTX_SCHED_SPIN). * However, on low power devices like Tegra, it always defaults to * ::CU_CTX_SCHED_BLOCKING_SYNC. * * - ::CU_CTX_LMEM_RESIZE_TO_MAX: Instruct CUDA to not reduce local memory * after resizing local memory for a kernel. This can prevent thrashing by * local memory allocations when launching many kernels with high local * memory usage at the cost of potentially increased memory usage. * * @param dev - Device for which the primary context flags are set * @param flags - New flags for the device * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_DEVICE, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE * \notefnerr * * \sa ::cuDevicePrimaryCtxRetain, * ::cuDevicePrimaryCtxGetState, * ::cuCtxCreate, * ::cuCtxGetFlags */ public static native @Cast("CUresult") int cuDevicePrimaryCtxSetFlags(@Cast("CUdevice") int dev, @Cast("unsigned int") int flags); /** * \brief Get the state of the primary context * * Returns in \p *flags the flags for the primary context of \p dev, and in * \p *active whether it is active. See ::cuDevicePrimaryCtxSetFlags for flag * values. * * @param dev - Device to get primary context flags for * @param flags - Pointer to store flags * @param active - Pointer to store context state; 0 = inactive, 1 = active * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_DEVICE, * ::CUDA_ERROR_INVALID_VALUE, * \notefnerr * * \sa ::cuDevicePrimaryCtxSetFlags, * ::cuCtxGetFlags */ public static native @Cast("CUresult") int cuDevicePrimaryCtxGetState(@Cast("CUdevice") int dev, @Cast("unsigned int*") IntPointer flags, IntPointer active); public static native @Cast("CUresult") int cuDevicePrimaryCtxGetState(@Cast("CUdevice") int dev, @Cast("unsigned int*") IntBuffer flags, IntBuffer active); public static native @Cast("CUresult") int cuDevicePrimaryCtxGetState(@Cast("CUdevice") int dev, @Cast("unsigned int*") int[] flags, int[] active); /** * \brief Destroy all allocations and reset all state on the primary context * * Explicitly destroys and cleans up all resources associated with the current * device in the current process. * * Note that it is responsibility of the calling function to ensure that no * other module in the process is using the device any more. For that reason * it is recommended to use ::cuDevicePrimaryCtxRelease() in most cases. * However it is safe for other modules to call ::cuDevicePrimaryCtxRelease() * even after resetting the device. 
* * @param dev - Device for which primary context is destroyed * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_DEVICE, * ::CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE * \notefnerr * * \sa ::cuDevicePrimaryCtxRetain, * ::cuDevicePrimaryCtxRelease, * ::cuCtxGetApiVersion, * ::cuCtxGetCacheConfig, * ::cuCtxGetDevice, * ::cuCtxGetFlags, * ::cuCtxGetLimit, * ::cuCtxPopCurrent, * ::cuCtxPushCurrent, * ::cuCtxSetCacheConfig, * ::cuCtxSetLimit, * ::cuCtxSynchronize * */ public static native @Cast("CUresult") int cuDevicePrimaryCtxReset(@Cast("CUdevice") int dev); // #endif /* __CUDA_API_VERSION >= 7000 */ /** \} */ /* END CUDA_PRIMARY_CTX */ /** * \defgroup CUDA_CTX Context Management * * ___MANBRIEF___ context management functions of the low-level CUDA driver * API (___CURRENT_FILE___) ___ENDMANBRIEF___ * * This section describes the context management functions of the low-level * CUDA driver application programming interface. * * \{ */ // #if __CUDA_API_VERSION >= 3020 /** * \brief Create a CUDA context * * Creates a new CUDA context and associates it with the calling thread. The * \p flags parameter is described below. The context is created with a usage * count of 1 and the caller of ::cuCtxCreate() must call ::cuCtxDestroy() or * when done using the context. If a context is already current to the thread, * it is supplanted by the newly created context and may be restored by a subsequent * call to ::cuCtxPopCurrent(). * * The three LSBs of the \p flags parameter can be used to control how the OS * thread, which owns the CUDA context at the time of an API call, interacts * with the OS scheduler when waiting for results from the GPU. Only one of * the scheduling flags can be set when creating a context. * * - ::CU_CTX_SCHED_SPIN: Instruct CUDA to actively spin when waiting for * results from the GPU. This can decrease latency when waiting for the GPU, * but may lower the performance of CPU threads if they are performing work in * parallel with the CUDA thread. * * - ::CU_CTX_SCHED_YIELD: Instruct CUDA to yield its thread when waiting for * results from the GPU. This can increase latency when waiting for the GPU, * but can increase the performance of CPU threads performing work in parallel * with the GPU. * * - ::CU_CTX_SCHED_BLOCKING_SYNC: Instruct CUDA to block the CPU thread on a * synchronization primitive when waiting for the GPU to finish work. * * - ::CU_CTX_BLOCKING_SYNC: Instruct CUDA to block the CPU thread on a * synchronization primitive when waiting for the GPU to finish work.
* Deprecated: This flag was deprecated as of CUDA 4.0 and was * replaced with ::CU_CTX_SCHED_BLOCKING_SYNC. * * - ::CU_CTX_SCHED_AUTO: The default value if the \p flags parameter is zero, * uses a heuristic based on the number of active CUDA contexts in the * process \e C and the number of logical processors in the system \e P. If * \e C > \e P, then CUDA will yield to other OS threads when waiting for * the GPU (::CU_CTX_SCHED_YIELD), otherwise CUDA will not yield while * waiting for results and actively spin on the processor (::CU_CTX_SCHED_SPIN). * However, on low power devices like Tegra, it always defaults to * ::CU_CTX_SCHED_BLOCKING_SYNC. * * - ::CU_CTX_MAP_HOST: Instruct CUDA to support mapped pinned allocations. * This flag must be set in order to allocate pinned host memory that is * accessible to the GPU. * * - ::CU_CTX_LMEM_RESIZE_TO_MAX: Instruct CUDA to not reduce local memory * after resizing local memory for a kernel. This can prevent thrashing by * local memory allocations when launching many kernels with high local * memory usage at the cost of potentially increased memory usage. * * Context creation will fail with ::CUDA_ERROR_UNKNOWN if the compute mode of * the device is ::CU_COMPUTEMODE_PROHIBITED. The function ::cuDeviceGetAttribute() * can be used with ::CU_DEVICE_ATTRIBUTE_COMPUTE_MODE to determine the * compute mode of the device. The nvidia-smi tool can be used to set * the compute mode for * devices. * Documentation for nvidia-smi can be obtained by passing a * -h option to it. * * @param pctx - Returned context handle of the new context * @param flags - Context creation flags * @param dev - Device to create context on * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_DEVICE, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_OUT_OF_MEMORY, * ::CUDA_ERROR_UNKNOWN * \notefnerr * * \sa ::cuCtxDestroy, * ::cuCtxGetApiVersion, * ::cuCtxGetCacheConfig, * ::cuCtxGetDevice, * ::cuCtxGetFlags, * ::cuCtxGetLimit, * ::cuCtxPopCurrent, * ::cuCtxPushCurrent, * ::cuCtxSetCacheConfig, * ::cuCtxSetLimit, * ::cuCtxSynchronize */ public static native @Cast("CUresult") int cuCtxCreate(@ByPtrPtr CUctx_st pctx, @Cast("unsigned int") int flags, @Cast("CUdevice") int dev); // #endif /* __CUDA_API_VERSION >= 3020 */ // #if __CUDA_API_VERSION >= 4000 /** * \brief Destroy a CUDA context * * Destroys the CUDA context specified by \p ctx. The context \p ctx will be * destroyed regardless of how many threads it is current to. * It is the responsibility of the calling function to ensure that no API * call issues using \p ctx while ::cuCtxDestroy() is executing. * * If \p ctx is current to the calling thread then \p ctx will also be * popped from the current thread's context stack (as though ::cuCtxPopCurrent() * were called). If \p ctx is current to other threads, then \p ctx will * remain current to those threads, and attempting to access \p ctx from * those threads will result in the error ::CUDA_ERROR_CONTEXT_IS_DESTROYED. 
* * @param ctx - Context to destroy * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * \notefnerr * * \sa ::cuCtxCreate, * ::cuCtxGetApiVersion, * ::cuCtxGetCacheConfig, * ::cuCtxGetDevice, * ::cuCtxGetFlags, * ::cuCtxGetLimit, * ::cuCtxPopCurrent, * ::cuCtxPushCurrent, * ::cuCtxSetCacheConfig, * ::cuCtxSetLimit, * ::cuCtxSynchronize */ public static native @Cast("CUresult") int cuCtxDestroy(CUctx_st ctx); // #endif /* __CUDA_API_VERSION >= 4000 */ // #if __CUDA_API_VERSION >= 4000 /** * \brief Pushes a context on the current CPU thread * * Pushes the given context \p ctx onto the CPU thread's stack of current * contexts. The specified context becomes the CPU thread's current context, so * all CUDA functions that operate on the current context are affected. * * The previous current context may be made current again by calling * ::cuCtxDestroy() or ::cuCtxPopCurrent(). * * @param ctx - Context to push * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * \notefnerr * * \sa ::cuCtxCreate, * ::cuCtxDestroy, * ::cuCtxGetApiVersion, * ::cuCtxGetCacheConfig, * ::cuCtxGetDevice, * ::cuCtxGetFlags, * ::cuCtxGetLimit, * ::cuCtxPopCurrent, * ::cuCtxSetCacheConfig, * ::cuCtxSetLimit, * ::cuCtxSynchronize */ public static native @Cast("CUresult") int cuCtxPushCurrent(CUctx_st ctx); /** * \brief Pops the current CUDA context from the current CPU thread. * * Pops the current CUDA context from the CPU thread and passes back the * old context handle in \p *pctx. That context may then be made current * to a different CPU thread by calling ::cuCtxPushCurrent(). * * If a context was current to the CPU thread before ::cuCtxCreate() or * ::cuCtxPushCurrent() was called, this function makes that context current to * the CPU thread again. * * @param pctx - Returned new context handle * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT * \notefnerr * * \sa ::cuCtxCreate, * ::cuCtxDestroy, * ::cuCtxGetApiVersion, * ::cuCtxGetCacheConfig, * ::cuCtxGetDevice, * ::cuCtxGetFlags, * ::cuCtxGetLimit, * ::cuCtxPushCurrent, * ::cuCtxSetCacheConfig, * ::cuCtxSetLimit, * ::cuCtxSynchronize */ public static native @Cast("CUresult") int cuCtxPopCurrent(@ByPtrPtr CUctx_st pctx); /** * \brief Binds the specified CUDA context to the calling CPU thread * * Binds the specified CUDA context to the calling CPU thread. * If \p ctx is NULL then the CUDA context previously bound to the * calling CPU thread is unbound and ::CUDA_SUCCESS is returned. * * If there exists a CUDA context stack on the calling CPU thread, this * will replace the top of that stack with \p ctx. * If \p ctx is NULL then this will be equivalent to popping the top * of the calling CPU thread's CUDA context stack (or a no-op if the * calling CPU thread's CUDA context stack is empty). * * @param ctx - Context to bind to the calling CPU thread * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT * \notefnerr * * \sa ::cuCtxGetCurrent, ::cuCtxCreate, ::cuCtxDestroy */ public static native @Cast("CUresult") int cuCtxSetCurrent(CUctx_st ctx); /** * \brief Returns the CUDA context bound to the calling CPU thread. * * Returns in \p *pctx the CUDA context bound to the calling CPU thread. 
* If no context is bound to the calling CPU thread then \p *pctx is * set to NULL and ::CUDA_SUCCESS is returned. * * @param pctx - Returned context handle * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * \notefnerr * * \sa ::cuCtxSetCurrent, ::cuCtxCreate, ::cuCtxDestroy */ public static native @Cast("CUresult") int cuCtxGetCurrent(@ByPtrPtr CUctx_st pctx); // #endif /* __CUDA_API_VERSION >= 4000 */ /** * \brief Returns the device ID for the current context * * Returns in \p *device the ordinal of the current context's device. * * @param device - Returned device ID for the current context * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE, * \notefnerr * * \sa ::cuCtxCreate, * ::cuCtxDestroy, * ::cuCtxGetApiVersion, * ::cuCtxGetCacheConfig, * ::cuCtxGetFlags, * ::cuCtxGetLimit, * ::cuCtxPopCurrent, * ::cuCtxPushCurrent, * ::cuCtxSetCacheConfig, * ::cuCtxSetLimit, * ::cuCtxSynchronize */ public static native @Cast("CUresult") int cuCtxGetDevice(@Cast("CUdevice*") IntPointer device); public static native @Cast("CUresult") int cuCtxGetDevice(@Cast("CUdevice*") IntBuffer device); public static native @Cast("CUresult") int cuCtxGetDevice(@Cast("CUdevice*") int[] device); // #if __CUDA_API_VERSION >= 7000 /** * \brief Returns the flags for the current context * * Returns in \p *flags the flags of the current context. See ::cuCtxCreate * for flag values. * * @param flags - Pointer to store flags of current context * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE, * \notefnerr * * \sa ::cuCtxCreate, * ::cuCtxGetApiVersion, * ::cuCtxGetCacheConfig, * ::cuCtxGetCurrent, * ::cuCtxGetDevice * ::cuCtxGetLimit, * ::cuCtxGetSharedMemConfig, * ::cuCtxGetStreamPriorityRange */ public static native @Cast("CUresult") int cuCtxGetFlags(@Cast("unsigned int*") IntPointer flags); public static native @Cast("CUresult") int cuCtxGetFlags(@Cast("unsigned int*") IntBuffer flags); public static native @Cast("CUresult") int cuCtxGetFlags(@Cast("unsigned int*") int[] flags); // #endif /* __CUDA_API_VERSION >= 7000 */ /** * \brief Block for a context's tasks to complete * * Blocks until the device has completed all preceding requested tasks. * ::cuCtxSynchronize() returns an error if one of the preceding tasks failed. * If the context was created with the ::CU_CTX_SCHED_BLOCKING_SYNC flag, the * CPU thread will block until the GPU context has finished its work. * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT * \notefnerr * * \sa ::cuCtxCreate, * ::cuCtxDestroy, * ::cuCtxGetApiVersion, * ::cuCtxGetCacheConfig, * ::cuCtxGetDevice, * ::cuCtxGetFlags, * ::cuCtxGetLimit, * ::cuCtxPopCurrent, * ::cuCtxPushCurrent, * ::cuCtxSetCacheConfig, * ::cuCtxSetLimit */ public static native @Cast("CUresult") int cuCtxSynchronize(); /** * \brief Set resource limits * * Setting \p limit to \p value is a request by the application to update * the current limit maintained by the context. The driver is free to * modify the requested value to meet h/w requirements (this could be * clamping to minimum or maximum values, rounding up to nearest element * size, etc). The application can use ::cuCtxGetLimit() to find out exactly * what the limit has been set to. 
* * Setting each ::CUlimit has its own specific restrictions, so each is * discussed here. * * - ::CU_LIMIT_STACK_SIZE controls the stack size in bytes of each GPU thread. * This limit is only applicable to devices of compute capability 2.0 and * higher. Attempting to set this limit on devices of compute capability * less than 2.0 will result in the error ::CUDA_ERROR_UNSUPPORTED_LIMIT * being returned. * * - ::CU_LIMIT_PRINTF_FIFO_SIZE controls the size in bytes of the FIFO used * by the ::printf() device system call. Setting ::CU_LIMIT_PRINTF_FIFO_SIZE * must be performed before launching any kernel that uses the ::printf() * device system call, otherwise ::CUDA_ERROR_INVALID_VALUE will be returned. * This limit is only applicable to devices of compute capability 2.0 and * higher. Attempting to set this limit on devices of compute capability * less than 2.0 will result in the error ::CUDA_ERROR_UNSUPPORTED_LIMIT * being returned. * * - ::CU_LIMIT_MALLOC_HEAP_SIZE controls the size in bytes of the heap used * by the ::malloc() and ::free() device system calls. Setting * ::CU_LIMIT_MALLOC_HEAP_SIZE must be performed before launching any kernel * that uses the ::malloc() or ::free() device system calls, otherwise * ::CUDA_ERROR_INVALID_VALUE will be returned. This limit is only applicable * to devices of compute capability 2.0 and higher. Attempting to set this * limit on devices of compute capability less than 2.0 will result in the * error ::CUDA_ERROR_UNSUPPORTED_LIMIT being returned. * * - ::CU_LIMIT_DEV_RUNTIME_SYNC_DEPTH controls the maximum nesting depth of * a grid at which a thread can safely call ::cudaDeviceSynchronize(). Setting * this limit must be performed before any launch of a kernel that uses the * device runtime and calls ::cudaDeviceSynchronize() above the default sync * depth, two levels of grids. Calls to ::cudaDeviceSynchronize() will fail * with error code ::cudaErrorSyncDepthExceeded if the limitation is * violated. This limit can be set smaller than the default or up the maximum * launch depth of 24. When setting this limit, keep in mind that additional * levels of sync depth require the driver to reserve large amounts of device * memory which can no longer be used for user allocations. If these * reservations of device memory fail, ::cuCtxSetLimit will return * ::CUDA_ERROR_OUT_OF_MEMORY, and the limit can be reset to a lower value. * This limit is only applicable to devices of compute capability 3.5 and * higher. Attempting to set this limit on devices of compute capability less * than 3.5 will result in the error ::CUDA_ERROR_UNSUPPORTED_LIMIT being * returned. * * - ::CU_LIMIT_DEV_RUNTIME_PENDING_LAUNCH_COUNT controls the maximum number of * outstanding device runtime launches that can be made from the current * context. A grid is outstanding from the point of launch up until the grid * is known to have been completed. Device runtime launches which violate * this limitation fail and return ::cudaErrorLaunchPendingCountExceeded when * ::cudaGetLastError() is called after launch. If more pending launches than * the default (2048 launches) are needed for a module using the device * runtime, this limit can be increased. Keep in mind that being able to * sustain additional pending launches will require the driver to reserve * larger amounts of device memory upfront which can no longer be used for * allocations. If these reservations fail, ::cuCtxSetLimit will return * ::CUDA_ERROR_OUT_OF_MEMORY, and the limit can be reset to a lower value. 
* This limit is only applicable to devices of compute capability 3.5 and * higher. Attempting to set this limit on devices of compute capability less * than 3.5 will result in the error ::CUDA_ERROR_UNSUPPORTED_LIMIT being * returned. * * @param limit - Limit to set * @param value - Size of limit * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_UNSUPPORTED_LIMIT, * ::CUDA_ERROR_OUT_OF_MEMORY * \notefnerr * * \sa ::cuCtxCreate, * ::cuCtxDestroy, * ::cuCtxGetApiVersion, * ::cuCtxGetCacheConfig, * ::cuCtxGetDevice, * ::cuCtxGetFlags, * ::cuCtxGetLimit, * ::cuCtxPopCurrent, * ::cuCtxPushCurrent, * ::cuCtxSetCacheConfig, * ::cuCtxSynchronize */ public static native @Cast("CUresult") int cuCtxSetLimit(@Cast("CUlimit") int limit, @Cast("size_t") long value); /** * \brief Returns resource limits * * Returns in \p *pvalue the current size of \p limit. The supported * ::CUlimit values are: * - ::CU_LIMIT_STACK_SIZE: stack size in bytes of each GPU thread. * - ::CU_LIMIT_PRINTF_FIFO_SIZE: size in bytes of the FIFO used by the * ::printf() device system call. * - ::CU_LIMIT_MALLOC_HEAP_SIZE: size in bytes of the heap used by the * ::malloc() and ::free() device system calls. * - ::CU_LIMIT_DEV_RUNTIME_SYNC_DEPTH: maximum grid depth at which a thread * can issue the device runtime call ::cudaDeviceSynchronize() to wait on * child grid launches to complete. * - ::CU_LIMIT_DEV_RUNTIME_PENDING_LAUNCH_COUNT: maximum number of outstanding * device runtime launches that can be made from this context. * * @param limit - Limit to query * @param pvalue - Returned size of limit * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_UNSUPPORTED_LIMIT * \notefnerr * * \sa ::cuCtxCreate, * ::cuCtxDestroy, * ::cuCtxGetApiVersion, * ::cuCtxGetCacheConfig, * ::cuCtxGetDevice, * ::cuCtxGetFlags, * ::cuCtxPopCurrent, * ::cuCtxPushCurrent, * ::cuCtxSetCacheConfig, * ::cuCtxSetLimit, * ::cuCtxSynchronize */ public static native @Cast("CUresult") int cuCtxGetLimit(@Cast("size_t*") SizeTPointer pvalue, @Cast("CUlimit") int limit); /** * \brief Returns the preferred cache configuration for the current context. * * On devices where the L1 cache and shared memory use the same hardware * resources, this function returns through \p pconfig the preferred cache configuration * for the current context. This is only a preference. The driver will use * the requested configuration if possible, but it is free to choose a different * configuration if required to execute functions. * * This will return a \p pconfig of ::CU_FUNC_CACHE_PREFER_NONE on devices * where the size of the L1 cache and shared memory are fixed. 
* * The supported cache configurations are: * - ::CU_FUNC_CACHE_PREFER_NONE: no preference for shared memory or L1 (default) * - ::CU_FUNC_CACHE_PREFER_SHARED: prefer larger shared memory and smaller L1 cache * - ::CU_FUNC_CACHE_PREFER_L1: prefer larger L1 cache and smaller shared memory * - ::CU_FUNC_CACHE_PREFER_EQUAL: prefer equal sized L1 cache and shared memory * * @param pconfig - Returned cache configuration * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * \notefnerr * * \sa ::cuCtxCreate, * ::cuCtxDestroy, * ::cuCtxGetApiVersion, * ::cuCtxGetDevice, * ::cuCtxGetFlags, * ::cuCtxGetLimit, * ::cuCtxPopCurrent, * ::cuCtxPushCurrent, * ::cuCtxSetCacheConfig, * ::cuCtxSetLimit, * ::cuCtxSynchronize, * ::cuFuncSetCacheConfig */ public static native @Cast("CUresult") int cuCtxGetCacheConfig(@Cast("CUfunc_cache*") IntPointer pconfig); public static native @Cast("CUresult") int cuCtxGetCacheConfig(@Cast("CUfunc_cache*") IntBuffer pconfig); public static native @Cast("CUresult") int cuCtxGetCacheConfig(@Cast("CUfunc_cache*") int[] pconfig); /** * \brief Sets the preferred cache configuration for the current context. * * On devices where the L1 cache and shared memory use the same hardware * resources, this sets through \p config the preferred cache configuration for * the current context. This is only a preference. The driver will use * the requested configuration if possible, but it is free to choose a different * configuration if required to execute the function. Any function preference * set via ::cuFuncSetCacheConfig() will be preferred over this context-wide * setting. Setting the context-wide cache configuration to * ::CU_FUNC_CACHE_PREFER_NONE will cause subsequent kernel launches to prefer * to not change the cache configuration unless required to launch the kernel. * * This setting does nothing on devices where the size of the L1 cache and * shared memory are fixed. * * Launching a kernel with a different preference than the most recent * preference setting may insert a device-side synchronization point. * * The supported cache configurations are: * - ::CU_FUNC_CACHE_PREFER_NONE: no preference for shared memory or L1 (default) * - ::CU_FUNC_CACHE_PREFER_SHARED: prefer larger shared memory and smaller L1 cache * - ::CU_FUNC_CACHE_PREFER_L1: prefer larger L1 cache and smaller shared memory * - ::CU_FUNC_CACHE_PREFER_EQUAL: prefer equal sized L1 cache and shared memory * * @param config - Requested cache configuration * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * \notefnerr * * \sa ::cuCtxCreate, * ::cuCtxDestroy, * ::cuCtxGetApiVersion, * ::cuCtxGetCacheConfig, * ::cuCtxGetDevice, * ::cuCtxGetFlags, * ::cuCtxGetLimit, * ::cuCtxPopCurrent, * ::cuCtxPushCurrent, * ::cuCtxSetLimit, * ::cuCtxSynchronize, * ::cuFuncSetCacheConfig */ public static native @Cast("CUresult") int cuCtxSetCacheConfig(@Cast("CUfunc_cache") int config); // #if __CUDA_API_VERSION >= 4020 /** * \brief Returns the current shared memory configuration for the current context. * * This function will return in \p pConfig the current size of shared memory banks * in the current context. On devices with configurable shared memory banks, * ::cuCtxSetSharedMemConfig can be used to change this setting, so that all * subsequent kernel launches will by default use the new bank size. 
When * ::cuCtxGetSharedMemConfig is called on devices without configurable shared * memory, it will return the fixed bank size of the hardware. * * The returned bank configurations can be either: * - ::CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE: shared memory bank width is * four bytes. * - ::CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE: shared memory bank width will * eight bytes. * * @param pConfig - returned shared memory configuration * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * \notefnerr * * \sa ::cuCtxCreate, * ::cuCtxDestroy, * ::cuCtxGetApiVersion, * ::cuCtxGetCacheConfig, * ::cuCtxGetDevice, * ::cuCtxGetFlags, * ::cuCtxGetLimit, * ::cuCtxPopCurrent, * ::cuCtxPushCurrent, * ::cuCtxSetLimit, * ::cuCtxSynchronize, * ::cuCtxGetSharedMemConfig, * ::cuFuncSetCacheConfig, */ public static native @Cast("CUresult") int cuCtxGetSharedMemConfig(@Cast("CUsharedconfig*") IntPointer pConfig); public static native @Cast("CUresult") int cuCtxGetSharedMemConfig(@Cast("CUsharedconfig*") IntBuffer pConfig); public static native @Cast("CUresult") int cuCtxGetSharedMemConfig(@Cast("CUsharedconfig*") int[] pConfig); /** * \brief Sets the shared memory configuration for the current context. * * On devices with configurable shared memory banks, this function will set * the context's shared memory bank size which is used for subsequent kernel * launches. * * Changed the shared memory configuration between launches may insert a device * side synchronization point between those launches. * * Changing the shared memory bank size will not increase shared memory usage * or affect occupancy of kernels, but may have major effects on performance. * Larger bank sizes will allow for greater potential bandwidth to shared memory, * but will change what kinds of accesses to shared memory will result in bank * conflicts. * * This function will do nothing on devices with fixed shared memory bank size. * * The supported bank configurations are: * - ::CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE: set bank width to the default initial * setting (currently, four bytes). * - ::CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE: set shared memory bank width to * be natively four bytes. * - ::CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE: set shared memory bank width to * be natively eight bytes. * * @param config - requested shared memory configuration * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * \notefnerr * * \sa ::cuCtxCreate, * ::cuCtxDestroy, * ::cuCtxGetApiVersion, * ::cuCtxGetCacheConfig, * ::cuCtxGetDevice, * ::cuCtxGetFlags, * ::cuCtxGetLimit, * ::cuCtxPopCurrent, * ::cuCtxPushCurrent, * ::cuCtxSetLimit, * ::cuCtxSynchronize, * ::cuCtxGetSharedMemConfig, * ::cuFuncSetCacheConfig, */ public static native @Cast("CUresult") int cuCtxSetSharedMemConfig(@Cast("CUsharedconfig") int config); // #endif /** * \brief Gets the context's API version. * * Returns a version number in \p version corresponding to the capabilities of * the context (e.g. 3010 or 3020), which library developers can use to direct * callers to a specific API version. If \p ctx is NULL, returns the API version * used to create the currently bound context. * * Note that new API versions are only introduced when context capabilities are * changed that break binary compatibility, so the API version and driver version * may be different. 
For example, it is valid for the API version to be 3020 while * the driver version is 4020. * * @param ctx - Context to check * @param version - Pointer to version * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_UNKNOWN * \notefnerr * * \sa ::cuCtxCreate, * ::cuCtxDestroy, * ::cuCtxGetDevice, * ::cuCtxGetFlags, * ::cuCtxGetLimit, * ::cuCtxPopCurrent, * ::cuCtxPushCurrent, * ::cuCtxSetCacheConfig, * ::cuCtxSetLimit, * ::cuCtxSynchronize */ public static native @Cast("CUresult") int cuCtxGetApiVersion(CUctx_st ctx, @Cast("unsigned int*") IntPointer version); public static native @Cast("CUresult") int cuCtxGetApiVersion(CUctx_st ctx, @Cast("unsigned int*") IntBuffer version); public static native @Cast("CUresult") int cuCtxGetApiVersion(CUctx_st ctx, @Cast("unsigned int*") int[] version); /** * \brief Returns numerical values that correspond to the least and * greatest stream priorities. * * Returns in \p *leastPriority and \p *greatestPriority the numerical values that correspond * to the least and greatest stream priorities respectively. Stream priorities * follow a convention where lower numbers imply greater priorities. The range of * meaningful stream priorities is given by [\p *greatestPriority, \p *leastPriority]. * If the user attempts to create a stream with a priority value that is * outside the meaningful range as specified by this API, the priority is * automatically clamped down or up to either \p *leastPriority or \p *greatestPriority * respectively. See ::cuStreamCreateWithPriority for details on creating a * priority stream. * A NULL may be passed in for \p *leastPriority or \p *greatestPriority if the value * is not desired. * * This function will return '0' in both \p *leastPriority and \p *greatestPriority if * the current context's device does not support stream priorities * (see ::cuDeviceGetAttribute). * * @param leastPriority - Pointer to an int in which the numerical value for least * stream priority is returned * @param greatestPriority - Pointer to an int in which the numerical value for greatest * stream priority is returned * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_INVALID_VALUE, * \notefnerr * * \sa ::cuStreamCreateWithPriority, * ::cuStreamGetPriority, * ::cuCtxGetDevice, * ::cuCtxGetFlags, * ::cuCtxSetLimit, * ::cuCtxSynchronize */ public static native @Cast("CUresult") int cuCtxGetStreamPriorityRange(IntPointer leastPriority, IntPointer greatestPriority); public static native @Cast("CUresult") int cuCtxGetStreamPriorityRange(IntBuffer leastPriority, IntBuffer greatestPriority); public static native @Cast("CUresult") int cuCtxGetStreamPriorityRange(int[] leastPriority, int[] greatestPriority); /** \} */ /* END CUDA_CTX */ /** * \defgroup CUDA_CTX_DEPRECATED Context Management [DEPRECATED] * * ___MANBRIEF___ deprecated context management functions of the low-level CUDA * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___ * * This section describes the deprecated context management functions of the low-level * CUDA driver application programming interface. * * \{ */ /** * \brief Increment a context's usage-count * * @deprecated * * Note that this function is deprecated and should not be used. * * Increments the usage count of the context and passes back a context handle * in \p *pctx that must be passed to ::cuCtxDetach() when the application is * done with the context. ::cuCtxAttach() fails if there is no context current * to the thread. 
* * Currently, the \p flags parameter must be 0. * * @param pctx - Returned context handle of the current context * @param flags - Context attach flags (must be 0) * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * \notefnerr * * \sa ::cuCtxCreate, * ::cuCtxDestroy, * ::cuCtxDetach, * ::cuCtxGetApiVersion, * ::cuCtxGetCacheConfig, * ::cuCtxGetDevice, * ::cuCtxGetFlags, * ::cuCtxGetLimit, * ::cuCtxPopCurrent, * ::cuCtxPushCurrent, * ::cuCtxSetCacheConfig, * ::cuCtxSetLimit, * ::cuCtxSynchronize */ public static native @Cast("CUresult") int cuCtxAttach(@ByPtrPtr CUctx_st pctx, @Cast("unsigned int") int flags); /** * \brief Decrement a context's usage-count * * @deprecated * * Note that this function is deprecated and should not be used. * * Decrements the usage count of the context \p ctx, and destroys the context * if the usage count goes to 0. The context must be a handle that was passed * back by ::cuCtxCreate() or ::cuCtxAttach(), and must be current to the * calling thread. * * @param ctx - Context to destroy * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT * \notefnerr * * \sa ::cuCtxCreate, * ::cuCtxDestroy, * ::cuCtxGetApiVersion, * ::cuCtxGetCacheConfig, * ::cuCtxGetDevice, * ::cuCtxGetFlags, * ::cuCtxGetLimit, * ::cuCtxPopCurrent, * ::cuCtxPushCurrent, * ::cuCtxSetCacheConfig, * ::cuCtxSetLimit, * ::cuCtxSynchronize */ public static native @Cast("CUresult") int cuCtxDetach(CUctx_st ctx); /** \} */ /* END CUDA_CTX_DEPRECATED */ /** * \defgroup CUDA_MODULE Module Management * * ___MANBRIEF___ module management functions of the low-level CUDA driver API * (___CURRENT_FILE___) ___ENDMANBRIEF___ * * This section describes the module management functions of the low-level CUDA * driver application programming interface. * * \{ */ /** * \brief Loads a compute module * * Takes a filename \p fname and loads the corresponding module \p module into * the current context. The CUDA driver API does not attempt to lazily * allocate the resources needed by a module; if the memory for functions and * data (constant and global) needed by the module cannot be allocated, * ::cuModuleLoad() fails. The file should be a \e cubin file as output by * \b nvcc, or a \e PTX file either as output by \b nvcc or handwritten, or * a \e fatbin file as output by \b nvcc from toolchain 4.0 or later. * * @param module - Returned module * @param fname - Filename of module to load * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_INVALID_PTX, * ::CUDA_ERROR_NOT_FOUND, * ::CUDA_ERROR_OUT_OF_MEMORY, * ::CUDA_ERROR_FILE_NOT_FOUND, * ::CUDA_ERROR_NO_BINARY_FOR_GPU, * ::CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND, * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED * \notefnerr * * \sa ::cuModuleGetFunction, * ::cuModuleGetGlobal, * ::cuModuleGetTexRef, * ::cuModuleLoadData, * ::cuModuleLoadDataEx, * ::cuModuleLoadFatBinary, * ::cuModuleUnload */ public static native @Cast("CUresult") int cuModuleLoad(@ByPtrPtr CUmod_st module, @Cast("const char*") BytePointer fname); public static native @Cast("CUresult") int cuModuleLoad(@ByPtrPtr CUmod_st module, String fname); /** * \brief Load a module's data * * Takes a pointer \p image and loads the corresponding module \p module into * the current context. 
The pointer may be obtained by mapping a \e cubin or * \e PTX or \e fatbin file, passing a \e cubin or \e PTX or \e fatbin file * as a NULL-terminated text string, or incorporating a \e cubin or \e fatbin * object into the executable resources and using operating system calls such * as Windows \c FindResource() to obtain the pointer. * * @param module - Returned module * @param image - Module data to load * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_INVALID_PTX, * ::CUDA_ERROR_OUT_OF_MEMORY, * ::CUDA_ERROR_NO_BINARY_FOR_GPU, * ::CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND, * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED * \notefnerr * * \sa ::cuModuleGetFunction, * ::cuModuleGetGlobal, * ::cuModuleGetTexRef, * ::cuModuleLoad, * ::cuModuleLoadDataEx, * ::cuModuleLoadFatBinary, * ::cuModuleUnload */ public static native @Cast("CUresult") int cuModuleLoadData(@ByPtrPtr CUmod_st module, @Const Pointer image); /** * \brief Load a module's data with options * * Takes a pointer \p image and loads the corresponding module \p module into * the current context. The pointer may be obtained by mapping a \e cubin or * \e PTX or \e fatbin file, passing a \e cubin or \e PTX or \e fatbin file * as a NULL-terminated text string, or incorporating a \e cubin or \e fatbin * object into the executable resources and using operating system calls such * as Windows \c FindResource() to obtain the pointer. Options are passed as * an array via \p options and any corresponding parameters are passed in * \p optionValues. The number of total options is supplied via \p numOptions. * Any outputs will be returned via \p optionValues. * * @param module - Returned module * @param image - Module data to load * @param numOptions - Number of options * @param options - Options for JIT * @param optionValues - Option values for JIT * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_INVALID_PTX, * ::CUDA_ERROR_OUT_OF_MEMORY, * ::CUDA_ERROR_NO_BINARY_FOR_GPU, * ::CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND, * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED * \notefnerr * * \sa ::cuModuleGetFunction, * ::cuModuleGetGlobal, * ::cuModuleGetTexRef, * ::cuModuleLoad, * ::cuModuleLoadData, * ::cuModuleLoadFatBinary, * ::cuModuleUnload */ public static native @Cast("CUresult") int cuModuleLoadDataEx(@ByPtrPtr CUmod_st module, @Const Pointer image, @Cast("unsigned int") int numOptions, @Cast("CUjit_option*") IntPointer options, @Cast("void**") PointerPointer optionValues); public static native @Cast("CUresult") int cuModuleLoadDataEx(@ByPtrPtr CUmod_st module, @Const Pointer image, @Cast("unsigned int") int numOptions, @Cast("CUjit_option*") IntPointer options, @Cast("void**") @ByPtrPtr Pointer optionValues); public static native @Cast("CUresult") int cuModuleLoadDataEx(@ByPtrPtr CUmod_st module, @Const Pointer image, @Cast("unsigned int") int numOptions, @Cast("CUjit_option*") IntBuffer options, @Cast("void**") @ByPtrPtr Pointer optionValues); public static native @Cast("CUresult") int cuModuleLoadDataEx(@ByPtrPtr CUmod_st module, @Const Pointer image, @Cast("unsigned int") int numOptions, @Cast("CUjit_option*") int[] options, @Cast("void**") @ByPtrPtr Pointer optionValues); /** * \brief Load a module's data * * Takes a pointer \p fatCubin and loads the corresponding module \p module * into the current context. 
The pointer represents a fat binary object, * which is a collection of different \e cubin and/or \e PTX files, all * representing the same device code, but compiled and optimized for different * architectures. * * Prior to CUDA 4.0, there was no documented API for constructing and using * fat binary objects by programmers. Starting with CUDA 4.0, fat binary * objects can be constructed by providing the -fatbin option to \b nvcc. * More information can be found in the \b nvcc document. * * @param module - Returned module * @param fatCubin - Fat binary to load * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_INVALID_PTX, * ::CUDA_ERROR_NOT_FOUND, * ::CUDA_ERROR_OUT_OF_MEMORY, * ::CUDA_ERROR_NO_BINARY_FOR_GPU, * ::CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND, * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED * \notefnerr * * \sa ::cuModuleGetFunction, * ::cuModuleGetGlobal, * ::cuModuleGetTexRef, * ::cuModuleLoad, * ::cuModuleLoadData, * ::cuModuleLoadDataEx, * ::cuModuleUnload */ public static native @Cast("CUresult") int cuModuleLoadFatBinary(@ByPtrPtr CUmod_st module, @Const Pointer fatCubin); /** * \brief Unloads a module * * Unloads a module \p hmod from the current context. * * @param hmod - Module to unload * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * \notefnerr * * \sa ::cuModuleGetFunction, * ::cuModuleGetGlobal, * ::cuModuleGetTexRef, * ::cuModuleLoad, * ::cuModuleLoadData, * ::cuModuleLoadDataEx, * ::cuModuleLoadFatBinary */ public static native @Cast("CUresult") int cuModuleUnload(CUmod_st hmod); /** * \brief Returns a function handle * * Returns in \p *hfunc the handle of the function of name \p name located in * module \p hmod. If no function of that name exists, ::cuModuleGetFunction() * returns ::CUDA_ERROR_NOT_FOUND. * * @param hfunc - Returned function handle * @param hmod - Module to retrieve function from * @param name - Name of function to retrieve * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_NOT_FOUND * \notefnerr * * \sa ::cuModuleGetGlobal, * ::cuModuleGetTexRef, * ::cuModuleLoad, * ::cuModuleLoadData, * ::cuModuleLoadDataEx, * ::cuModuleLoadFatBinary, * ::cuModuleUnload */ public static native @Cast("CUresult") int cuModuleGetFunction(@ByPtrPtr CUfunc_st hfunc, CUmod_st hmod, @Cast("const char*") BytePointer name); public static native @Cast("CUresult") int cuModuleGetFunction(@ByPtrPtr CUfunc_st hfunc, CUmod_st hmod, String name); // #if __CUDA_API_VERSION >= 3020 /** * \brief Returns a global pointer from a module * * Returns in \p *dptr and \p *bytes the base pointer and size of the * global of name \p name located in module \p hmod. If no variable of that name * exists, ::cuModuleGetGlobal() returns ::CUDA_ERROR_NOT_FOUND. Both * parameters \p dptr and \p bytes are optional. If one of them is * NULL, it is ignored. 
* * @param dptr - Returned global device pointer * @param bytes - Returned global size in bytes * @param hmod - Module to retrieve global from * @param name - Name of global to retrieve * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_NOT_FOUND * \notefnerr * * \sa ::cuModuleGetFunction, * ::cuModuleGetTexRef, * ::cuModuleLoad, * ::cuModuleLoadData, * ::cuModuleLoadDataEx, * ::cuModuleLoadFatBinary, * ::cuModuleUnload */ public static native @Cast("CUresult") int cuModuleGetGlobal(@Cast("CUdeviceptr*") LongPointer dptr, @Cast("size_t*") SizeTPointer bytes, CUmod_st hmod, @Cast("const char*") BytePointer name); public static native @Cast("CUresult") int cuModuleGetGlobal(@Cast("CUdeviceptr*") LongBuffer dptr, @Cast("size_t*") SizeTPointer bytes, CUmod_st hmod, String name); public static native @Cast("CUresult") int cuModuleGetGlobal(@Cast("CUdeviceptr*") long[] dptr, @Cast("size_t*") SizeTPointer bytes, CUmod_st hmod, @Cast("const char*") BytePointer name); public static native @Cast("CUresult") int cuModuleGetGlobal(@Cast("CUdeviceptr*") LongPointer dptr, @Cast("size_t*") SizeTPointer bytes, CUmod_st hmod, String name); public static native @Cast("CUresult") int cuModuleGetGlobal(@Cast("CUdeviceptr*") LongBuffer dptr, @Cast("size_t*") SizeTPointer bytes, CUmod_st hmod, @Cast("const char*") BytePointer name); public static native @Cast("CUresult") int cuModuleGetGlobal(@Cast("CUdeviceptr*") long[] dptr, @Cast("size_t*") SizeTPointer bytes, CUmod_st hmod, String name); // #endif /* __CUDA_API_VERSION >= 3020 */ /** * \brief Returns a handle to a texture reference * * Returns in \p *pTexRef the handle of the texture reference of name \p name * in the module \p hmod. If no texture reference of that name exists, * ::cuModuleGetTexRef() returns ::CUDA_ERROR_NOT_FOUND. This texture reference * handle should not be destroyed, since it will be destroyed when the module * is unloaded. * * @param pTexRef - Returned texture reference * @param hmod - Module to retrieve texture reference from * @param name - Name of texture reference to retrieve * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_NOT_FOUND * \notefnerr * * \sa ::cuModuleGetFunction, * ::cuModuleGetGlobal, * ::cuModuleGetSurfRef, * ::cuModuleLoad, * ::cuModuleLoadData, * ::cuModuleLoadDataEx, * ::cuModuleLoadFatBinary, * ::cuModuleUnload */ public static native @Cast("CUresult") int cuModuleGetTexRef(@ByPtrPtr CUtexref_st pTexRef, CUmod_st hmod, @Cast("const char*") BytePointer name); public static native @Cast("CUresult") int cuModuleGetTexRef(@ByPtrPtr CUtexref_st pTexRef, CUmod_st hmod, String name); /** * \brief Returns a handle to a surface reference * * Returns in \p *pSurfRef the handle of the surface reference of name \p name * in the module \p hmod. If no surface reference of that name exists, * ::cuModuleGetSurfRef() returns ::CUDA_ERROR_NOT_FOUND. 
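 *
 * For illustration only (not part of the original documentation), a surface reference assumed to
 * be named {@code "mySurf"} could be retrieved from a previously loaded module like this:
 *
{@code
    CUsurfref_st surfRef = new CUsurfref_st();
    if (cuModuleGetSurfRef(surfRef, module, "mySurf") == CUDA_SUCCESS) {
        // surfRef is owned by the module and is destroyed when the module is unloaded
    }
 * }
 *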
* * @param pSurfRef - Returned surface reference * @param hmod - Module to retrieve surface reference from * @param name - Name of surface reference to retrieve * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_NOT_FOUND * \notefnerr * * \sa ::cuModuleGetFunction, * ::cuModuleGetGlobal, * ::cuModuleGetTexRef, * ::cuModuleLoad, * ::cuModuleLoadData, * ::cuModuleLoadDataEx, * ::cuModuleLoadFatBinary, * ::cuModuleUnload */ public static native @Cast("CUresult") int cuModuleGetSurfRef(@ByPtrPtr CUsurfref_st pSurfRef, CUmod_st hmod, @Cast("const char*") BytePointer name); public static native @Cast("CUresult") int cuModuleGetSurfRef(@ByPtrPtr CUsurfref_st pSurfRef, CUmod_st hmod, String name); // #if __CUDA_API_VERSION >= 5050 /** * \brief Creates a pending JIT linker invocation. * * If the call is successful, the caller owns the returned CUlinkState, which * should eventually be destroyed with ::cuLinkDestroy. The * device code machine size (32 or 64 bit) will match the calling application. * * Both linker and compiler options may be specified. Compiler options will * be applied to inputs to this linker action which must be compiled from PTX. * The options ::CU_JIT_WALL_TIME, * ::CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES, and ::CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES * will accumulate data until the CUlinkState is destroyed. * * \p optionValues must remain valid for the life of the CUlinkState if output * options are used. No other references to inputs are maintained after this * call returns. * * @param numOptions Size of options arrays * @param options Array of linker and compiler options * @param optionValues Array of option values, each cast to void * * @param stateOut On success, this will contain a CUlinkState to specify * and complete this action * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_OUT_OF_MEMORY * \notefnerr * * \sa ::cuLinkAddData, * ::cuLinkAddFile, * ::cuLinkComplete, * ::cuLinkDestroy */ public static native @Cast("CUresult") int cuLinkCreate(@Cast("unsigned int") int numOptions, @Cast("CUjit_option*") IntPointer options, @Cast("void**") PointerPointer optionValues, @ByPtrPtr CUlinkState_st stateOut); public static native @Cast("CUresult") int cuLinkCreate(@Cast("unsigned int") int numOptions, @Cast("CUjit_option*") IntPointer options, @Cast("void**") @ByPtrPtr Pointer optionValues, @ByPtrPtr CUlinkState_st stateOut); public static native @Cast("CUresult") int cuLinkCreate(@Cast("unsigned int") int numOptions, @Cast("CUjit_option*") IntBuffer options, @Cast("void**") @ByPtrPtr Pointer optionValues, @ByPtrPtr CUlinkState_st stateOut); public static native @Cast("CUresult") int cuLinkCreate(@Cast("unsigned int") int numOptions, @Cast("CUjit_option*") int[] options, @Cast("void**") @ByPtrPtr Pointer optionValues, @ByPtrPtr CUlinkState_st stateOut); /** * \brief Add an input to a pending linker invocation * * Ownership of \p data is retained by the caller. No reference is retained to any * inputs after this call returns. * * This method accepts only compiler options, which are used if the data must * be compiled from PTX, and does not accept any of * ::CU_JIT_WALL_TIME, ::CU_JIT_INFO_LOG_BUFFER, ::CU_JIT_ERROR_LOG_BUFFER, * ::CU_JIT_TARGET_FROM_CUCONTEXT, or ::CU_JIT_TARGET. * * @param state A pending linker action. 
* @param type The type of the input data. * @param data The input data. PTX must be NULL-terminated. * @param size The length of the input data. * @param name An optional name for this input in log messages. * @param numOptions Size of options. * @param options Options to be applied only for this input (overrides options from ::cuLinkCreate). * @param optionValues Array of option values, each cast to void *. * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_INVALID_HANDLE, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_INVALID_IMAGE, * ::CUDA_ERROR_INVALID_PTX, * ::CUDA_ERROR_OUT_OF_MEMORY, * ::CUDA_ERROR_NO_BINARY_FOR_GPU * * \sa ::cuLinkCreate, * ::cuLinkAddFile, * ::cuLinkComplete, * ::cuLinkDestroy */ public static native @Cast("CUresult") int cuLinkAddData(CUlinkState_st state, @Cast("CUjitInputType") int type, Pointer data, @Cast("size_t") long size, @Cast("const char*") BytePointer name, @Cast("unsigned int") int numOptions, @Cast("CUjit_option*") IntPointer options, @Cast("void**") PointerPointer optionValues); public static native @Cast("CUresult") int cuLinkAddData(CUlinkState_st state, @Cast("CUjitInputType") int type, Pointer data, @Cast("size_t") long size, @Cast("const char*") BytePointer name, @Cast("unsigned int") int numOptions, @Cast("CUjit_option*") IntPointer options, @Cast("void**") @ByPtrPtr Pointer optionValues); public static native @Cast("CUresult") int cuLinkAddData(CUlinkState_st state, @Cast("CUjitInputType") int type, Pointer data, @Cast("size_t") long size, String name, @Cast("unsigned int") int numOptions, @Cast("CUjit_option*") IntBuffer options, @Cast("void**") @ByPtrPtr Pointer optionValues); public static native @Cast("CUresult") int cuLinkAddData(CUlinkState_st state, @Cast("CUjitInputType") int type, Pointer data, @Cast("size_t") long size, @Cast("const char*") BytePointer name, @Cast("unsigned int") int numOptions, @Cast("CUjit_option*") int[] options, @Cast("void**") @ByPtrPtr Pointer optionValues); public static native @Cast("CUresult") int cuLinkAddData(CUlinkState_st state, @Cast("CUjitInputType") int type, Pointer data, @Cast("size_t") long size, String name, @Cast("unsigned int") int numOptions, @Cast("CUjit_option*") IntPointer options, @Cast("void**") @ByPtrPtr Pointer optionValues); public static native @Cast("CUresult") int cuLinkAddData(CUlinkState_st state, @Cast("CUjitInputType") int type, Pointer data, @Cast("size_t") long size, @Cast("const char*") BytePointer name, @Cast("unsigned int") int numOptions, @Cast("CUjit_option*") IntBuffer options, @Cast("void**") @ByPtrPtr Pointer optionValues); public static native @Cast("CUresult") int cuLinkAddData(CUlinkState_st state, @Cast("CUjitInputType") int type, Pointer data, @Cast("size_t") long size, String name, @Cast("unsigned int") int numOptions, @Cast("CUjit_option*") int[] options, @Cast("void**") @ByPtrPtr Pointer optionValues); /** * \brief Add a file input to a pending linker invocation * * No reference is retained to any inputs after this call returns. * * This method accepts only compiler options, which are used if the input * must be compiled from PTX, and does not accept any of * ::CU_JIT_WALL_TIME, ::CU_JIT_INFO_LOG_BUFFER, ::CU_JIT_ERROR_LOG_BUFFER, * ::CU_JIT_TARGET_FROM_CUCONTEXT, or ::CU_JIT_TARGET. * * This method is equivalent to invoking ::cuLinkAddData on the contents * of the file. 
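 *
 * An illustrative call (editor-added, not from the CUDA header): adding a PTX file, assumed to
 * be at {@code "kernels.ptx"}, to a link state created with ::cuLinkCreate(); per-input JIT
 * options are omitted and the return code should be checked in real code.
 *
{@code
    cuLinkAddFile(linkState, CU_JIT_INPUT_PTX, "kernels.ptx",
                  0, (IntPointer)null, (Pointer)null);
 * }
 *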
* * @param state A pending linker action * @param type The type of the input data * @param path Path to the input file * @param numOptions Size of options * @param options Options to be applied only for this input (overrides options from ::cuLinkCreate) * @param optionValues Array of option values, each cast to void * * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_FILE_NOT_FOUND * ::CUDA_ERROR_INVALID_HANDLE, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_INVALID_IMAGE, * ::CUDA_ERROR_INVALID_PTX, * ::CUDA_ERROR_OUT_OF_MEMORY, * ::CUDA_ERROR_NO_BINARY_FOR_GPU * * \sa ::cuLinkCreate, * ::cuLinkAddData, * ::cuLinkComplete, * ::cuLinkDestroy */ public static native @Cast("CUresult") int cuLinkAddFile(CUlinkState_st state, @Cast("CUjitInputType") int type, @Cast("const char*") BytePointer path, @Cast("unsigned int") int numOptions, @Cast("CUjit_option*") IntPointer options, @Cast("void**") PointerPointer optionValues); public static native @Cast("CUresult") int cuLinkAddFile(CUlinkState_st state, @Cast("CUjitInputType") int type, @Cast("const char*") BytePointer path, @Cast("unsigned int") int numOptions, @Cast("CUjit_option*") IntPointer options, @Cast("void**") @ByPtrPtr Pointer optionValues); public static native @Cast("CUresult") int cuLinkAddFile(CUlinkState_st state, @Cast("CUjitInputType") int type, String path, @Cast("unsigned int") int numOptions, @Cast("CUjit_option*") IntBuffer options, @Cast("void**") @ByPtrPtr Pointer optionValues); public static native @Cast("CUresult") int cuLinkAddFile(CUlinkState_st state, @Cast("CUjitInputType") int type, @Cast("const char*") BytePointer path, @Cast("unsigned int") int numOptions, @Cast("CUjit_option*") int[] options, @Cast("void**") @ByPtrPtr Pointer optionValues); public static native @Cast("CUresult") int cuLinkAddFile(CUlinkState_st state, @Cast("CUjitInputType") int type, String path, @Cast("unsigned int") int numOptions, @Cast("CUjit_option*") IntPointer options, @Cast("void**") @ByPtrPtr Pointer optionValues); public static native @Cast("CUresult") int cuLinkAddFile(CUlinkState_st state, @Cast("CUjitInputType") int type, @Cast("const char*") BytePointer path, @Cast("unsigned int") int numOptions, @Cast("CUjit_option*") IntBuffer options, @Cast("void**") @ByPtrPtr Pointer optionValues); public static native @Cast("CUresult") int cuLinkAddFile(CUlinkState_st state, @Cast("CUjitInputType") int type, String path, @Cast("unsigned int") int numOptions, @Cast("CUjit_option*") int[] options, @Cast("void**") @ByPtrPtr Pointer optionValues); /** * \brief Complete a pending linker invocation * * Completes the pending linker action and returns the cubin image for the linked * device code, which can be used with ::cuModuleLoadData. The cubin is owned by * \p state, so it should be loaded before \p state is destroyed via ::cuLinkDestroy. * This call does not destroy \p state. 
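 *
 * A minimal end-to-end sketch (illustrative only, not part of the original documentation):
 * JIT-linking a single PTX source held in a hypothetical {@code ptxSource} string and loading
 * the resulting cubin; return codes should be checked in real code.
 *
{@code
    CUlinkState_st linkState = new CUlinkState_st();
    cuLinkCreate(0, (IntPointer)null, (PointerPointer)null, linkState);
    BytePointer ptx = new BytePointer(ptxSource);               // NULL-terminated PTX text
    cuLinkAddData(linkState, CU_JIT_INPUT_PTX, ptx, ptxSource.length() + 1,
                  "myKernels", 0, (IntPointer)null, (Pointer)null);  // +1 for the terminator (ASCII assumed)
    PointerPointer cubinOut = new PointerPointer(1L);
    SizeTPointer cubinSize = new SizeTPointer(1);
    cuLinkComplete(linkState, cubinOut, cubinSize);             // cubin remains owned by linkState
    CUmod_st module = new CUmod_st();
    cuModuleLoadData(module, cubinOut.get(0));                  // load before destroying linkState
    cuLinkDestroy(linkState);
 * }
 *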
* * @param state A pending linker invocation * @param cubinOut On success, this will point to the output image * @param sizeOut Optional parameter to receive the size of the generated image * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_INVALID_HANDLE, * ::CUDA_ERROR_OUT_OF_MEMORY * * \sa ::cuLinkCreate, * ::cuLinkAddData, * ::cuLinkAddFile, * ::cuLinkDestroy, * ::cuModuleLoadData */ public static native @Cast("CUresult") int cuLinkComplete(CUlinkState_st state, @Cast("void**") PointerPointer cubinOut, @Cast("size_t*") SizeTPointer sizeOut); public static native @Cast("CUresult") int cuLinkComplete(CUlinkState_st state, @Cast("void**") @ByPtrPtr Pointer cubinOut, @Cast("size_t*") SizeTPointer sizeOut); /** * \brief Destroys state for a JIT linker invocation. * * @param state State object for the linker invocation * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_INVALID_HANDLE * * \sa ::cuLinkCreate */ public static native @Cast("CUresult") int cuLinkDestroy(CUlinkState_st state); // #endif /* __CUDA_API_VERSION >= 5050 */ /** \} */ /* END CUDA_MODULE */ /** * \defgroup CUDA_MEM Memory Management * * ___MANBRIEF___ memory management functions of the low-level CUDA driver API * (___CURRENT_FILE___) ___ENDMANBRIEF___ * * This section describes the memory management functions of the low-level CUDA * driver application programming interface. * * \{ */ // #if __CUDA_API_VERSION >= 3020 /** * \brief Gets free and total memory * * Returns in \p *free and \p *total respectively, the free and total amount of * memory available for allocation by the CUDA context, in bytes. * * @param free - Returned free memory in bytes * @param total - Returned total memory in bytes * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * \notefnerr * * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, * ::cuMemGetAddressRange, ::cuMemHostAlloc, * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32 */ public static native @Cast("CUresult") int cuMemGetInfo(@Cast("size_t*") SizeTPointer free, @Cast("size_t*") SizeTPointer total); /** * \brief Allocates device memory * * Allocates \p bytesize bytes of linear memory on the device and returns in * \p *dptr a pointer to the allocated memory. The allocated memory is suitably * aligned for any kind of variable. The memory is not cleared. If \p bytesize * is 0, ::cuMemAlloc() returns ::CUDA_ERROR_INVALID_VALUE. 
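 *
 * A brief usage sketch (illustrative, not part of the original documentation): allocating room
 * for {@code n} floats, copying data to the device and back with ::cuMemcpyHtoD() /
 * ::cuMemcpyDtoH(), and releasing the allocation with ::cuMemFree(); error checking is omitted.
 *
{@code
    int n = 1024;                                   // assumed element count
    FloatPointer hostData = new FloatPointer(n);    // host staging buffer
    LongPointer dptr = new LongPointer(1);
    cuMemAlloc(dptr, n * 4L);                       // 4 bytes per float
    long devPtr = dptr.get(0);
    cuMemcpyHtoD(devPtr, hostData, n * 4L);
    cuMemcpyDtoH(hostData, devPtr, n * 4L);
    cuMemFree(devPtr);
 * }
 *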
* * @param dptr - Returned device pointer * @param bytesize - Requested allocation size in bytes * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_OUT_OF_MEMORY * \notefnerr * * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAllocHost, * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32 */ public static native @Cast("CUresult") int cuMemAlloc(@Cast("CUdeviceptr*") LongPointer dptr, @Cast("size_t") long bytesize); public static native @Cast("CUresult") int cuMemAlloc(@Cast("CUdeviceptr*") LongBuffer dptr, @Cast("size_t") long bytesize); public static native @Cast("CUresult") int cuMemAlloc(@Cast("CUdeviceptr*") long[] dptr, @Cast("size_t") long bytesize); /** * \brief Allocates pitched device memory * * Allocates at least \p WidthInBytes * \p Height bytes of linear memory on * the device and returns in \p *dptr a pointer to the allocated memory. The * function may pad the allocation to ensure that corresponding pointers in * any given row will continue to meet the alignment requirements for * coalescing as the address is updated from row to row. \p ElementSizeBytes * specifies the size of the largest reads and writes that will be performed * on the memory range. \p ElementSizeBytes may be 4, 8 or 16 (since coalesced * memory transactions are not possible on other data sizes). If * \p ElementSizeBytes is smaller than the actual read/write size of a kernel, * the kernel will run correctly, but possibly at reduced speed. The pitch * returned in \p *pPitch by ::cuMemAllocPitch() is the width in bytes of the * allocation. The intended usage of pitch is as a separate parameter of the * allocation, used to compute addresses within the 2D array. Given the row * and column of an array element of type \b T, the address is computed as: *
{@code
   T* pElement = (T*)((char*)BaseAddress + Row * Pitch) + Column;
 * }
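 *
 * A call sketch added for illustration (not part of the original documentation): allocating a
 * pitched buffer for a hypothetical {@code width x height} array of 4-byte elements and reading
 * back the pitch chosen by the driver.
 *
{@code
    LongPointer dptr = new LongPointer(1);
    SizeTPointer pitch = new SizeTPointer(1);
    cuMemAllocPitch(dptr, pitch, width * 4L, height, 4);    // width and height assumed defined
    long rowPitchInBytes = pitch.get(0);                    // step between rows; not width * 4
 * }
 *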
* * The pitch returned by ::cuMemAllocPitch() is guaranteed to work with * ::cuMemcpy2D() under all circumstances. For allocations of 2D arrays, it is * recommended that programmers consider performing pitch allocations using * ::cuMemAllocPitch(). Due to alignment restrictions in the hardware, this is * especially true if the application will be performing 2D memory copies * between different regions of device memory (whether linear memory or CUDA * arrays). * * The byte alignment of the pitch returned by ::cuMemAllocPitch() is guaranteed * to match or exceed the alignment requirement for texture binding with * ::cuTexRefSetAddress2D(). * * @param dptr - Returned device pointer * @param pPitch - Returned pitch of allocation in bytes * @param WidthInBytes - Requested allocation width in bytes * @param Height - Requested allocation height in rows * @param ElementSizeBytes - Size of largest reads/writes for range * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_OUT_OF_MEMORY * \notefnerr * * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, * ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32 */ public static native @Cast("CUresult") int cuMemAllocPitch(@Cast("CUdeviceptr*") LongPointer dptr, @Cast("size_t*") SizeTPointer pPitch, @Cast("size_t") long WidthInBytes, @Cast("size_t") long Height, @Cast("unsigned int") int ElementSizeBytes); public static native @Cast("CUresult") int cuMemAllocPitch(@Cast("CUdeviceptr*") LongBuffer dptr, @Cast("size_t*") SizeTPointer pPitch, @Cast("size_t") long WidthInBytes, @Cast("size_t") long Height, @Cast("unsigned int") int ElementSizeBytes); public static native @Cast("CUresult") int cuMemAllocPitch(@Cast("CUdeviceptr*") long[] dptr, @Cast("size_t*") SizeTPointer pPitch, @Cast("size_t") long WidthInBytes, @Cast("size_t") long Height, @Cast("unsigned int") int ElementSizeBytes); /** * \brief Frees device memory * * Frees the memory space pointed to by \p dptr, which must have been returned * by a previous call to ::cuMemAlloc() or ::cuMemAllocPitch(). 
* * @param dptr - Pointer to memory to free * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * \notefnerr * * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFreeHost, * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32 */ public static native @Cast("CUresult") int cuMemFree(@Cast("CUdeviceptr") long dptr); /** * \brief Get information on memory allocations * * Returns the base address in \p *pbase and size in \p *psize of the * allocation by ::cuMemAlloc() or ::cuMemAllocPitch() that contains the input * pointer \p dptr. Both parameters \p pbase and \p psize are optional. If one * of them is NULL, it is ignored. * * @param pbase - Returned base address * @param psize - Returned size of device memory allocation * @param dptr - Device pointer to query * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * \notefnerr * * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, * ::cuMemGetInfo, ::cuMemHostAlloc, * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32 */ public static native @Cast("CUresult") int cuMemGetAddressRange(@Cast("CUdeviceptr*") LongPointer pbase, @Cast("size_t*") SizeTPointer psize, @Cast("CUdeviceptr") long dptr); public static native @Cast("CUresult") int cuMemGetAddressRange(@Cast("CUdeviceptr*") LongBuffer pbase, @Cast("size_t*") SizeTPointer psize, @Cast("CUdeviceptr") long dptr); public static native @Cast("CUresult") int cuMemGetAddressRange(@Cast("CUdeviceptr*") long[] pbase, @Cast("size_t*") SizeTPointer psize, @Cast("CUdeviceptr") long dptr); /** * \brief Allocates page-locked host memory * * Allocates \p bytesize bytes of host memory that is page-locked and * accessible to the device. The driver tracks the virtual memory ranges * allocated with this function and automatically accelerates calls to * functions such as ::cuMemcpy(). Since the memory can be accessed directly by * the device, it can be read or written with much higher bandwidth than * pageable memory obtained with functions such as ::malloc(). Allocating * excessive amounts of memory with ::cuMemAllocHost() may degrade system * performance, since it reduces the amount of memory available to the system * for paging. 
As a result, this function is best used sparingly to allocate * staging areas for data exchange between host and device. * * Note all host memory allocated using ::cuMemHostAlloc() will automatically * be immediately accessible to all contexts on all devices which support unified * addressing (as may be queried using ::CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING). * The device pointer that may be used to access this host memory from those * contexts is always equal to the returned host pointer \p *pp. * See \ref CUDA_UNIFIED for additional details. * * @param pp - Returned host pointer to page-locked memory * @param bytesize - Requested allocation size in bytes * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_OUT_OF_MEMORY * \notefnerr * * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32 */ public static native @Cast("CUresult") int cuMemAllocHost(@Cast("void**") PointerPointer pp, @Cast("size_t") long bytesize); public static native @Cast("CUresult") int cuMemAllocHost(@Cast("void**") @ByPtrPtr Pointer pp, @Cast("size_t") long bytesize); // #endif /* __CUDA_API_VERSION >= 3020 */ /** * \brief Frees page-locked host memory * * Frees the memory space pointed to by \p p, which must have been returned by * a previous call to ::cuMemAllocHost(). * * @param p - Pointer to memory to free * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * \notefnerr * * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32 */ public static native @Cast("CUresult") int cuMemFreeHost(Pointer p); /** * \brief Allocates page-locked host memory * * Allocates \p bytesize bytes of host memory that is page-locked and accessible * to the device. The driver tracks the virtual memory ranges allocated with * this function and automatically accelerates calls to functions such as * ::cuMemcpyHtoD(). Since the memory can be accessed directly by the device, * it can be read or written with much higher bandwidth than pageable memory * obtained with functions such as ::malloc(). 
Allocating excessive amounts of * pinned memory may degrade system performance, since it reduces the amount * of memory available to the system for paging. As a result, this function is * best used sparingly to allocate staging areas for data exchange between * host and device. * * The \p Flags parameter enables different options to be specified that * affect the allocation, as follows. * * - ::CU_MEMHOSTALLOC_PORTABLE: The memory returned by this call will be * considered as pinned memory by all CUDA contexts, not just the one that * performed the allocation. * * - ::CU_MEMHOSTALLOC_DEVICEMAP: Maps the allocation into the CUDA address * space. The device pointer to the memory may be obtained by calling * ::cuMemHostGetDevicePointer(). This feature is available only on GPUs * with compute capability greater than or equal to 1.1. * * - ::CU_MEMHOSTALLOC_WRITECOMBINED: Allocates the memory as write-combined * (WC). WC memory can be transferred across the PCI Express bus more * quickly on some system configurations, but cannot be read efficiently by * most CPUs. WC memory is a good option for buffers that will be written by * the CPU and read by the GPU via mapped pinned memory or host->device * transfers. * * All of these flags are orthogonal to one another: a developer may allocate * memory that is portable, mapped and/or write-combined with no restrictions. * * The CUDA context must have been created with the ::CU_CTX_MAP_HOST flag in * order for the ::CU_MEMHOSTALLOC_DEVICEMAP flag to have any effect. * * The ::CU_MEMHOSTALLOC_DEVICEMAP flag may be specified on CUDA contexts for * devices that do not support mapped pinned memory. The failure is deferred * to ::cuMemHostGetDevicePointer() because the memory may be mapped into * other CUDA contexts via the ::CU_MEMHOSTALLOC_PORTABLE flag. * * The memory allocated by this function must be freed with ::cuMemFreeHost(). * * Note all host memory allocated using ::cuMemHostAlloc() will automatically * be immediately accessible to all contexts on all devices which support unified * addressing (as may be queried using ::CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING). * Unless the flag ::CU_MEMHOSTALLOC_WRITECOMBINED is specified, the device pointer * that may be used to access this host memory from those contexts is always equal * to the returned host pointer \p *pp. If the flag ::CU_MEMHOSTALLOC_WRITECOMBINED * is specified, then the function ::cuMemHostGetDevicePointer() must be used * to query the device pointer, even if the context supports unified addressing. * See \ref CUDA_UNIFIED for additional details. 
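 *
 * A usage sketch (editor illustration, not from the CUDA header): allocating portable, mapped
 * pinned memory and querying the corresponding device pointer; assumes the context was created
 * with ::CU_CTX_MAP_HOST and that return codes are checked in real code.
 *
{@code
    PointerPointer pp = new PointerPointer(1L);
    cuMemHostAlloc(pp, 1L << 20, CU_MEMHOSTALLOC_PORTABLE | CU_MEMHOSTALLOC_DEVICEMAP);
    Pointer hostPtr = pp.get(0);
    LongPointer pdptr = new LongPointer(1);
    cuMemHostGetDevicePointer(pdptr, hostPtr, 0);    // Flags must be 0
    long devPtr = pdptr.get(0);
    cuMemFreeHost(hostPtr);                          // must be freed with cuMemFreeHost
 * }
 *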
* * @param pp - Returned host pointer to page-locked memory * @param bytesize - Requested allocation size in bytes * @param Flags - Flags for allocation request * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_OUT_OF_MEMORY * \notefnerr * * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, * ::cuMemGetAddressRange, ::cuMemGetInfo, * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32 */ public static native @Cast("CUresult") int cuMemHostAlloc(@Cast("void**") PointerPointer pp, @Cast("size_t") long bytesize, @Cast("unsigned int") int Flags); public static native @Cast("CUresult") int cuMemHostAlloc(@Cast("void**") @ByPtrPtr Pointer pp, @Cast("size_t") long bytesize, @Cast("unsigned int") int Flags); // #if __CUDA_API_VERSION >= 3020 /** * \brief Passes back device pointer of mapped pinned memory * * Passes back the device pointer \p pdptr corresponding to the mapped, pinned * host buffer \p p allocated by ::cuMemHostAlloc. * * ::cuMemHostGetDevicePointer() will fail if the ::CU_MEMHOSTALLOC_DEVICEMAP * flag was not specified at the time the memory was allocated, or if the * function is called on a GPU that does not support mapped pinned memory. * * For devices that have a non-zero value for the device attribute * ::CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM, the memory * can also be accessed from the device using the host pointer \p p. * The device pointer returned by ::cuMemHostGetDevicePointer() may or may not * match the original host pointer \p p and depends on the devices visible to the * application. If all devices visible to the application have a non-zero value for the * device attribute, the device pointer returned by ::cuMemHostGetDevicePointer() * will match the original pointer \p p. If any device visible to the application * has a zero value for the device attribute, the device pointer returned by * ::cuMemHostGetDevicePointer() will not match the original host pointer \p p, * but it will be suitable for use on all devices provided Unified Virtual Addressing * is enabled. In such systems, it is valid to access the memory using either pointer * on devices that have a non-zero value for the device attribute. Note however that * such devices should access the memory using only of the two pointers and not both. * * \p Flags provides for future releases. For now, it must be set to 0. 
* * @param pdptr - Returned device pointer * @param p - Host pointer * @param Flags - Options (must be 0) * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * \notefnerr * * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, * ::cuMemsetD2D8, ::cuMemsetD2D16, * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32 */ public static native @Cast("CUresult") int cuMemHostGetDevicePointer(@Cast("CUdeviceptr*") LongPointer pdptr, Pointer p, @Cast("unsigned int") int Flags); public static native @Cast("CUresult") int cuMemHostGetDevicePointer(@Cast("CUdeviceptr*") LongBuffer pdptr, Pointer p, @Cast("unsigned int") int Flags); public static native @Cast("CUresult") int cuMemHostGetDevicePointer(@Cast("CUdeviceptr*") long[] pdptr, Pointer p, @Cast("unsigned int") int Flags); // #endif /* __CUDA_API_VERSION >= 3020 */ /** * \brief Passes back flags that were used for a pinned allocation * * Passes back the flags \p pFlags that were specified when allocating * the pinned host buffer \p p allocated by ::cuMemHostAlloc. * * ::cuMemHostGetFlags() will fail if the pointer does not reside in * an allocation performed by ::cuMemAllocHost() or ::cuMemHostAlloc(). * * @param pFlags - Returned flags word * @param p - Host pointer * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * \notefnerr * * \sa ::cuMemAllocHost, ::cuMemHostAlloc */ public static native @Cast("CUresult") int cuMemHostGetFlags(@Cast("unsigned int*") IntPointer pFlags, Pointer p); public static native @Cast("CUresult") int cuMemHostGetFlags(@Cast("unsigned int*") IntBuffer pFlags, Pointer p); public static native @Cast("CUresult") int cuMemHostGetFlags(@Cast("unsigned int*") int[] pFlags, Pointer p); // #if __CUDA_API_VERSION >= 6000 /** * \brief Allocates memory that will be automatically managed by the Unified Memory system * * Allocates \p bytesize bytes of managed memory on the device and returns in * \p *dptr a pointer to the allocated memory. If the device doesn't support * allocating managed memory, ::CUDA_ERROR_NOT_SUPPORTED is returned. Support * for managed memory can be queried using the device attribute * ::CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY. The allocated memory is suitably * aligned for any kind of variable. The memory is not cleared. If \p bytesize * is 0, ::cuMemAllocManaged returns ::CUDA_ERROR_INVALID_VALUE. The pointer * is valid on the CPU and on all GPUs in the system that support managed memory. * All accesses to this pointer must obey the Unified Memory programming model. * * \p flags specifies the default stream association for this allocation. * \p flags must be one of ::CU_MEM_ATTACH_GLOBAL or ::CU_MEM_ATTACH_HOST. If * ::CU_MEM_ATTACH_GLOBAL is specified, then this memory is accessible from * any stream on any device. 
If ::CU_MEM_ATTACH_HOST is specified, then the * allocation should not be accessed from devices that have a zero value for the * device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS; an explicit call to * ::cuStreamAttachMemAsync will be required to enable access on such devices. * * If the association is later changed via ::cuStreamAttachMemAsync to * a single stream, the default association as specifed during ::cuMemAllocManaged * is restored when that stream is destroyed. For __managed__ variables, the * default association is always ::CU_MEM_ATTACH_GLOBAL. Note that destroying a * stream is an asynchronous operation, and as a result, the change to default * association won't happen until all work in the stream has completed. * * Memory allocated with ::cuMemAllocManaged should be released with ::cuMemFree. * * Device memory oversubscription is possible for GPUs that have a non-zero value for the * device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. Managed memory on * such GPUs may be evicted from device memory to host memory at any time by the Unified * Memory driver in order to make room for other allocations. * * In a multi-GPU system where all GPUs have a non-zero value for the device attribute * ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, managed memory may not be populated when this * API returns and instead may be populated on access. In such systems, managed memory can * migrate to any processor's memory at any time. The Unified Memory driver will employ heuristics to * maintain data locality and prevent excessive page faults to the extent possible. The application * can also guide the driver about memory usage patterns via ::cuMemAdvise. The application * can also explicitly migrate memory to a desired processor's memory via * ::cuMemPrefetchAsync. * * In a multi-GPU system where all of the GPUs have a zero value for the device attribute * ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS and all the GPUs have peer-to-peer support * with each other, the physical storage for managed memory is created on the GPU which is active * at the time ::cuMemAllocManaged is called. All other GPUs will reference the data at reduced * bandwidth via peer mappings over the PCIe bus. The Unified Memory driver does not migrate * memory among such GPUs. * * In a multi-GPU system where not all GPUs have peer-to-peer support with each other and * where the value of the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS * is zero for at least one of those GPUs, the location chosen for physical storage of managed * memory is system-dependent. * - On Linux, the location chosen will be device memory as long as the current set of active * contexts are on devices that either have peer-to-peer support with each other or have a * non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. * If there is an active context on a GPU that does not have a non-zero value for that device * attribute and it does not have peer-to-peer support with the other devices that have active * contexts on them, then the location for physical storage will be 'zero-copy' or host memory. * Note that this means that managed memory that is located in device memory is migrated to * host memory if a new context is created on a GPU that doesn't have a non-zero value for * the device attribute and does not support peer-to-peer with at least one of the other devices * that has an active context. 
This in turn implies that context creation may fail if there is * insufficient host memory to migrate all managed allocations. * - On Windows, the physical storage is always created in 'zero-copy' or host memory. * All GPUs will reference the data at reduced bandwidth over the PCIe bus. In these * circumstances, use of the environment variable CUDA_VISIBLE_DEVICES is recommended to * restrict CUDA to only use those GPUs that have peer-to-peer support. * Alternatively, users can also set CUDA_MANAGED_FORCE_DEVICE_ALLOC to a * non-zero value to force the driver to always use device memory for physical storage. * When this environment variable is set to a non-zero value, all contexts created in * that process on devices that support managed memory have to be peer-to-peer compatible * with each other. Context creation will fail if a context is created on a device that * supports managed memory and is not peer-to-peer compatible with any of the other * managed memory supporting devices on which contexts were previously created, even if * those contexts have been destroyed. These environment variables are described * in the CUDA programming guide under the "CUDA environment variables" section. * * @param dptr - Returned device pointer * @param bytesize - Requested allocation size in bytes * @param flags - Must be one of ::CU_MEM_ATTACH_GLOBAL or ::CU_MEM_ATTACH_HOST * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_NOT_SUPPORTED, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_OUT_OF_MEMORY * \notefnerr * * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAllocHost, * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, * ::cuDeviceGetAttribute, ::cuStreamAttachMemAsync */ public static native @Cast("CUresult") int cuMemAllocManaged(@Cast("CUdeviceptr*") LongPointer dptr, @Cast("size_t") long bytesize, @Cast("unsigned int") int flags); public static native @Cast("CUresult") int cuMemAllocManaged(@Cast("CUdeviceptr*") LongBuffer dptr, @Cast("size_t") long bytesize, @Cast("unsigned int") int flags); public static native @Cast("CUresult") int cuMemAllocManaged(@Cast("CUdeviceptr*") long[] dptr, @Cast("size_t") long bytesize, @Cast("unsigned int") int flags); // #endif /* __CUDA_API_VERSION >= 6000 */ // #if __CUDA_API_VERSION >= 4010 /** * \brief Returns a handle to a compute device * * Returns in \p *device a device handle given a PCI bus ID string. 
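 *
 * For illustration (not part of the original documentation): looking up a device by a bus ID
 * string, here the hypothetical {@code "0000:01:00.0"}, and reading back the canonical form
 * with ::cuDeviceGetPCIBusId().
 *
{@code
    IntPointer dev = new IntPointer(1);
    cuDeviceGetByPCIBusId(dev, "0000:01:00.0");
    BytePointer busId = new BytePointer(13);         // 13 bytes including the NULL terminator
    cuDeviceGetPCIBusId(busId, 13, dev.get(0));
    String canonical = busId.getString();
 * }
 *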
* * @param dev - Returned device handle * * @param pciBusId - String in one of the following forms: * [domain]:[bus]:[device].[function] * [domain]:[bus]:[device] * [bus]:[device].[function] * where \p domain, \p bus, \p device, and \p function are all hexadecimal values * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_INVALID_DEVICE * \notefnerr * * \sa ::cuDeviceGet, ::cuDeviceGetAttribute, ::cuDeviceGetPCIBusId */ public static native @Cast("CUresult") int cuDeviceGetByPCIBusId(@Cast("CUdevice*") IntPointer dev, @Cast("const char*") BytePointer pciBusId); public static native @Cast("CUresult") int cuDeviceGetByPCIBusId(@Cast("CUdevice*") IntBuffer dev, String pciBusId); public static native @Cast("CUresult") int cuDeviceGetByPCIBusId(@Cast("CUdevice*") int[] dev, @Cast("const char*") BytePointer pciBusId); public static native @Cast("CUresult") int cuDeviceGetByPCIBusId(@Cast("CUdevice*") IntPointer dev, String pciBusId); public static native @Cast("CUresult") int cuDeviceGetByPCIBusId(@Cast("CUdevice*") IntBuffer dev, @Cast("const char*") BytePointer pciBusId); public static native @Cast("CUresult") int cuDeviceGetByPCIBusId(@Cast("CUdevice*") int[] dev, String pciBusId); /** * \brief Returns a PCI Bus Id string for the device * * Returns an ASCII string identifying the device \p dev in the NULL-terminated * string pointed to by \p pciBusId. \p len specifies the maximum length of the * string that may be returned. * * @param pciBusId - Returned identifier string for the device in the following format * [domain]:[bus]:[device].[function] * where \p domain, \p bus, \p device, and \p function are all hexadecimal values. * pciBusId should be large enough to store 13 characters including the NULL-terminator. * * @param len - Maximum length of string to store in \p name * * @param dev - Device to get identifier string for * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_INVALID_DEVICE * \notefnerr * * \sa ::cuDeviceGet, ::cuDeviceGetAttribute, ::cuDeviceGetByPCIBusId */ public static native @Cast("CUresult") int cuDeviceGetPCIBusId(@Cast("char*") BytePointer pciBusId, int len, @Cast("CUdevice") int dev); public static native @Cast("CUresult") int cuDeviceGetPCIBusId(@Cast("char*") ByteBuffer pciBusId, int len, @Cast("CUdevice") int dev); public static native @Cast("CUresult") int cuDeviceGetPCIBusId(@Cast("char*") byte[] pciBusId, int len, @Cast("CUdevice") int dev); /** * \brief Gets an interprocess handle for a previously allocated event * * Takes as input a previously allocated event. This event must have been * created with the ::CU_EVENT_INTERPROCESS and ::CU_EVENT_DISABLE_TIMING * flags set. This opaque handle may be copied into other processes and * opened with ::cuIpcOpenEventHandle to allow efficient hardware * synchronization between GPU work in different processes. * * After the event has been opened in the importing process, * ::cuEventRecord, ::cuEventSynchronize, ::cuStreamWaitEvent and * ::cuEventQuery may be used in either process. Performing operations * on the imported event after the exported event has been freed * with ::cuEventDestroy will result in undefined behavior. * * IPC functionality is restricted to devices with support for unified * addressing on Linux operating systems. 
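 *
 * A sketch of the exporting side (illustrative only, not from the CUDA header): creating an
 * event with the required flags and obtaining its IPC handle; ::cuEventCreate is declared
 * elsewhere in these bindings, and how the handle bytes reach the other process (pipe, shared
 * memory, ...) is left to the application.
 *
{@code
    CUevent_st event = new CUevent_st();
    cuEventCreate(event, CU_EVENT_INTERPROCESS | CU_EVENT_DISABLE_TIMING);
    CUipcEventHandle handle = new CUipcEventHandle();
    cuIpcGetEventHandle(handle, event);
    // ship the handle to the importing process, which calls cuIpcOpenEventHandle()
 * }
 *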
* * @param pHandle - Pointer to a user allocated CUipcEventHandle * in which to return the opaque event handle * @param event - Event allocated with ::CU_EVENT_INTERPROCESS and * ::CU_EVENT_DISABLE_TIMING flags. * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_INVALID_HANDLE, * ::CUDA_ERROR_OUT_OF_MEMORY, * ::CUDA_ERROR_MAP_FAILED * * \sa * ::cuEventCreate, * ::cuEventDestroy, * ::cuEventSynchronize, * ::cuEventQuery, * ::cuStreamWaitEvent, * ::cuIpcOpenEventHandle, * ::cuIpcGetMemHandle, * ::cuIpcOpenMemHandle, * ::cuIpcCloseMemHandle */ public static native @Cast("CUresult") int cuIpcGetEventHandle(CUipcEventHandle pHandle, CUevent_st event); /** * \brief Opens an interprocess event handle for use in the current process * * Opens an interprocess event handle exported from another process with * ::cuIpcGetEventHandle. This function returns a ::CUevent that behaves like * a locally created event with the ::CU_EVENT_DISABLE_TIMING flag specified. * This event must be freed with ::cuEventDestroy. * * Performing operations on the imported event after the exported event has * been freed with ::cuEventDestroy will result in undefined behavior. * * IPC functionality is restricted to devices with support for unified * addressing on Linux operating systems. * * @param phEvent - Returns the imported event * @param handle - Interprocess handle to open * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_MAP_FAILED, * ::CUDA_ERROR_PEER_ACCESS_UNSUPPORTED, * ::CUDA_ERROR_INVALID_HANDLE * * \sa * ::cuEventCreate, * ::cuEventDestroy, * ::cuEventSynchronize, * ::cuEventQuery, * ::cuStreamWaitEvent, * ::cuIpcGetEventHandle, * ::cuIpcGetMemHandle, * ::cuIpcOpenMemHandle, * ::cuIpcCloseMemHandle */ public static native @Cast("CUresult") int cuIpcOpenEventHandle(@ByPtrPtr CUevent_st phEvent, @ByVal CUipcEventHandle handle); /** * \brief Gets an interprocess memory handle for an existing device memory * allocation * * Takes a pointer to the base of an existing device memory allocation created * with ::cuMemAlloc and exports it for use in another process. This is a * lightweight operation and may be called multiple times on an allocation * without adverse effects. * * If a region of memory is freed with ::cuMemFree and a subsequent call * to ::cuMemAlloc returns memory with the same device address, * ::cuIpcGetMemHandle will return a unique handle for the * new memory. * * IPC functionality is restricted to devices with support for unified * addressing on Linux operating systems. * * @param pHandle - Pointer to user allocated ::CUipcMemHandle to return * the handle in. * @param dptr - Base pointer to previously allocated device memory * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_INVALID_HANDLE, * ::CUDA_ERROR_OUT_OF_MEMORY, * ::CUDA_ERROR_MAP_FAILED, * * \sa * ::cuMemAlloc, * ::cuMemFree, * ::cuIpcGetEventHandle, * ::cuIpcOpenEventHandle, * ::cuIpcOpenMemHandle, * ::cuIpcCloseMemHandle */ public static native @Cast("CUresult") int cuIpcGetMemHandle(CUipcMemHandle pHandle, @Cast("CUdeviceptr") long dptr); /** * \brief Opens an interprocess memory handle exported from another process * and returns a device pointer usable in the local process. * * Maps memory exported from another process with ::cuIpcGetMemHandle into * the current device address space. For contexts on different devices * ::cuIpcOpenMemHandle can attempt to enable peer access between the * devices as if the user called ::cuCtxEnablePeerAccess. This behavior is * controlled by the ::CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS flag. 
* ::cuDeviceCanAccessPeer can determine if a mapping is possible. * * Contexts that may open ::CUipcMemHandles are restricted in the following way. * ::CUipcMemHandles from each ::CUdevice in a given process may only be opened * by one ::CUcontext per ::CUdevice per other process. * * Memory returned from ::cuIpcOpenMemHandle must be freed with * ::cuIpcCloseMemHandle. * * Calling ::cuMemFree on an exported memory region before calling * ::cuIpcCloseMemHandle in the importing context will result in undefined * behavior. * * IPC functionality is restricted to devices with support for unified * addressing on Linux operating systems. * * @param pdptr - Returned device pointer * @param handle - ::CUipcMemHandle to open * @param Flags - Flags for this operation. Must be specified as ::CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_MAP_FAILED, * ::CUDA_ERROR_INVALID_HANDLE, * ::CUDA_ERROR_TOO_MANY_PEERS * * \note No guarantees are made about the address returned in \p *pdptr. * In particular, multiple processes may not receive the same address for the same \p handle. * * \sa * ::cuMemAlloc, * ::cuMemFree, * ::cuIpcGetEventHandle, * ::cuIpcOpenEventHandle, * ::cuIpcGetMemHandle, * ::cuIpcCloseMemHandle, * ::cuCtxEnablePeerAccess, * ::cuDeviceCanAccessPeer, */ public static native @Cast("CUresult") int cuIpcOpenMemHandle(@Cast("CUdeviceptr*") LongPointer pdptr, @ByVal CUipcMemHandle handle, @Cast("unsigned int") int Flags); public static native @Cast("CUresult") int cuIpcOpenMemHandle(@Cast("CUdeviceptr*") LongBuffer pdptr, @ByVal CUipcMemHandle handle, @Cast("unsigned int") int Flags); public static native @Cast("CUresult") int cuIpcOpenMemHandle(@Cast("CUdeviceptr*") long[] pdptr, @ByVal CUipcMemHandle handle, @Cast("unsigned int") int Flags); /** * \brief Close memory mapped with ::cuIpcOpenMemHandle * * Unmaps memory returnd by ::cuIpcOpenMemHandle. The original allocation * in the exporting process as well as imported mappings in other processes * will be unaffected. * * Any resources used to enable peer access will be freed if this is the * last mapping using them. * * IPC functionality is restricted to devices with support for unified * addressing on Linux operating systems. * * @param dptr - Device pointer returned by ::cuIpcOpenMemHandle * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_MAP_FAILED, * ::CUDA_ERROR_INVALID_HANDLE, * * \sa * ::cuMemAlloc, * ::cuMemFree, * ::cuIpcGetEventHandle, * ::cuIpcOpenEventHandle, * ::cuIpcGetMemHandle, * ::cuIpcOpenMemHandle, */ public static native @Cast("CUresult") int cuIpcCloseMemHandle(@Cast("CUdeviceptr") long dptr); // #endif /* __CUDA_API_VERSION >= 4010 */ // #if __CUDA_API_VERSION >= 4000 /** * \brief Registers an existing host memory range for use by CUDA * * Page-locks the memory range specified by \p p and \p bytesize and maps it * for the device(s) as specified by \p Flags. This memory range also is added * to the same tracking mechanism as ::cuMemHostAlloc to automatically accelerate * calls to functions such as ::cuMemcpyHtoD(). Since the memory can be accessed * directly by the device, it can be read or written with much higher bandwidth * than pageable memory that has not been registered. Page-locking excessive * amounts of memory may degrade system performance, since it reduces the amount * of memory available to the system for paging. 
As a result, this function is * best used sparingly to register staging areas for data exchange between * host and device. * * This function has limited support on Mac OS X. OS 10.7 or higher is required. * * The \p Flags parameter enables different options to be specified that * affect the allocation, as follows. * * - ::CU_MEMHOSTREGISTER_PORTABLE: The memory returned by this call will be * considered as pinned memory by all CUDA contexts, not just the one that * performed the allocation. * * - ::CU_MEMHOSTREGISTER_DEVICEMAP: Maps the allocation into the CUDA address * space. The device pointer to the memory may be obtained by calling * ::cuMemHostGetDevicePointer(). This feature is available only on GPUs * with compute capability greater than or equal to 1.1. * * - ::CU_MEMHOSTREGISTER_IOMEMORY: The pointer is treated as pointing to some * I/O memory space, e.g. the PCI Express resource of a 3rd party device. * * All of these flags are orthogonal to one another: a developer may page-lock * memory that is portable or mapped with no restrictions. * * The CUDA context must have been created with the ::CU_CTX_MAP_HOST flag in * order for the ::CU_MEMHOSTREGISTER_DEVICEMAP flag to have any effect. * * The ::CU_MEMHOSTREGISTER_DEVICEMAP flag may be specified on CUDA contexts for * devices that do not support mapped pinned memory. The failure is deferred * to ::cuMemHostGetDevicePointer() because the memory may be mapped into * other CUDA contexts via the ::CU_MEMHOSTREGISTER_PORTABLE flag. * * For devices that have a non-zero value for the device attribute * ::CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM, the memory * can also be accessed from the device using the host pointer \p p. * The device pointer returned by ::cuMemHostGetDevicePointer() may or may not * match the original host pointer \p ptr and depends on the devices visible to the * application. If all devices visible to the application have a non-zero value for the * device attribute, the device pointer returned by ::cuMemHostGetDevicePointer() * will match the original pointer \p ptr. If any device visible to the application * has a zero value for the device attribute, the device pointer returned by * ::cuMemHostGetDevicePointer() will not match the original host pointer \p ptr, * but it will be suitable for use on all devices provided Unified Virtual Addressing * is enabled. In such systems, it is valid to access the memory using either pointer * on devices that have a non-zero value for the device attribute. Note however that * such devices should access the memory using only of the two pointers and not both. * * The memory page-locked by this function must be unregistered with * ::cuMemHostUnregister(). * * @param p - Host pointer to memory to page-lock * @param bytesize - Size in bytes of the address range to page-lock * @param Flags - Flags for allocation request * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_OUT_OF_MEMORY, * ::CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED, * ::CUDA_ERROR_NOT_PERMITTED, * ::CUDA_ERROR_NOT_SUPPORTED * \notefnerr * * \sa ::cuMemHostUnregister, ::cuMemHostGetFlags, ::cuMemHostGetDevicePointer */ public static native @Cast("CUresult") int cuMemHostRegister(Pointer p, @Cast("size_t") long bytesize, @Cast("unsigned int") int Flags); /** * \brief Unregisters a memory range that was registered with cuMemHostRegister. 
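 *
 * A typical register/unregister pairing, sketched here for illustration (not part of the
 * original documentation); {@code buffer} stands for any existing host allocation:
 *
{@code
    FloatPointer buffer = new FloatPointer(1 << 20);
    cuMemHostRegister(buffer, buffer.capacity() * 4L, CU_MEMHOSTREGISTER_DEVICEMAP);
    // ... use the page-locked range, e.g. with cuMemcpyHtoDAsync() ...
    cuMemHostUnregister(buffer);
 * }
 *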
* * Unmaps the memory range whose base address is specified by \p p, and makes * it pageable again. * * The base address must be the same one specified to ::cuMemHostRegister(). * * @param p - Host pointer to memory to unregister * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_OUT_OF_MEMORY, * ::CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED, * \notefnerr * * \sa ::cuMemHostRegister */ public static native @Cast("CUresult") int cuMemHostUnregister(Pointer p); /** * \brief Copies memory * * Copies data between two pointers. * \p dst and \p src are base pointers of the destination and source, respectively. * \p ByteCount specifies the number of bytes to copy. * Note that this function infers the type of the transfer (host to host, host to * device, device to device, or device to host) from the pointer values. This * function is only allowed in contexts which support unified addressing. * * @param dst - Destination unified virtual address space pointer * @param src - Source unified virtual address space pointer * @param ByteCount - Size of memory copy in bytes * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * \notefnerr * \note_sync * * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32 */ public static native @Cast("CUresult") int cuMemcpy(@Cast("CUdeviceptr") long dst, @Cast("CUdeviceptr") long src, @Cast("size_t") long ByteCount); /** * \brief Copies device memory between two contexts * * Copies from device memory in one context to device memory in another * context. \p dstDevice is the base device pointer of the destination memory * and \p dstContext is the destination context. \p srcDevice is the base * device pointer of the source memory and \p srcContext is the source context. * \p ByteCount specifies the number of bytes to copy. * * @param dstDevice - Destination device pointer * @param dstContext - Destination context * @param srcDevice - Source device pointer * @param srcContext - Source context * @param ByteCount - Size of memory copy in bytes * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * \notefnerr * \note_sync * * \sa ::cuMemcpyDtoD, ::cuMemcpy3DPeer, ::cuMemcpyDtoDAsync, ::cuMemcpyPeerAsync, * ::cuMemcpy3DPeerAsync */ public static native @Cast("CUresult") int cuMemcpyPeer(@Cast("CUdeviceptr") long dstDevice, CUctx_st dstContext, @Cast("CUdeviceptr") long srcDevice, CUctx_st srcContext, @Cast("size_t") long ByteCount); // #endif /* __CUDA_API_VERSION >= 4000 */ // #if __CUDA_API_VERSION >= 3020 /** * \brief Copies memory from Host to Device * * Copies from host memory to device memory.
\p dstDevice and \p srcHost are * the base addresses of the destination and source, respectively. \p ByteCount * specifies the number of bytes to copy. * * @param dstDevice - Destination device pointer * @param srcHost - Source host pointer * @param ByteCount - Size of memory copy in bytes * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * \notefnerr * \note_sync * * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, * ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32 */ public static native @Cast("CUresult") int cuMemcpyHtoD(@Cast("CUdeviceptr") long dstDevice, @Const Pointer srcHost, @Cast("size_t") long ByteCount); /** * \brief Copies memory from Device to Host * * Copies from device to host memory. \p dstHost and \p srcDevice specify the * base pointers of the destination and source, respectively. \p ByteCount * specifies the number of bytes to copy. * * @param dstHost - Destination host pointer * @param srcDevice - Source device pointer * @param ByteCount - Size of memory copy in bytes * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * \notefnerr * \note_sync * * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, * ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32 */ public static native @Cast("CUresult") int cuMemcpyDtoH(Pointer dstHost, @Cast("CUdeviceptr") long srcDevice, @Cast("size_t") long ByteCount); /** * \brief Copies memory from Device to Device * * Copies from device memory to device memory. \p dstDevice and \p srcDevice * are the base pointers of the destination and source, respectively. * \p ByteCount specifies the number of bytes to copy. 
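 *
 * The following is a minimal usage sketch of the synchronous copy family from Java, assuming
 * a CUDA context is already current and that the ::cuMemAlloc / ::cuMemFree bindings elsewhere
 * in this class follow the same CUdeviceptr-as-long convention used throughout this file:
 *
 * {@code
    float[] data = new float[1024];
    FloatPointer srcHost = new FloatPointer(data);       // host source buffer
    long bytes = data.length * 4L;

    LongPointer d1 = new LongPointer(1), d2 = new LongPointer(1);
    cuMemAlloc(d1, bytes);                                // assumed binding: CUdeviceptr* maps to LongPointer
    cuMemAlloc(d2, bytes);

    cuMemcpyHtoD(d1.get(), srcHost, bytes);               // host   -> device
    cuMemcpyDtoD(d2.get(), d1.get(), bytes);              // device -> device
    cuMemcpyDtoH(srcHost, d2.get(), bytes);               // device -> host

    cuMemFree(d1.get());
    cuMemFree(d2.get());
 * }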
* * @param dstDevice - Destination device pointer * @param srcDevice - Source device pointer * @param ByteCount - Size of memory copy in bytes * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * \notefnerr * \note_sync * * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32 */ public static native @Cast("CUresult") int cuMemcpyDtoD(@Cast("CUdeviceptr") long dstDevice, @Cast("CUdeviceptr") long srcDevice, @Cast("size_t") long ByteCount); /** * \brief Copies memory from Device to Array * * Copies from device memory to a 1D CUDA array. \p dstArray and \p dstOffset * specify the CUDA array handle and starting index of the destination data. * \p srcDevice specifies the base pointer of the source. \p ByteCount * specifies the number of bytes to copy. * * @param dstArray - Destination array * @param dstOffset - Offset in bytes of destination array * @param srcDevice - Source device pointer * @param ByteCount - Size of memory copy in bytes * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * \notefnerr * \note_sync * * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32 */ public static native @Cast("CUresult") int cuMemcpyDtoA(CUarray_st dstArray, @Cast("size_t") long dstOffset, @Cast("CUdeviceptr") long srcDevice, @Cast("size_t") long ByteCount); /** * \brief Copies memory from Array to Device * * Copies from one 1D CUDA array to device memory. \p dstDevice specifies the * base pointer of the destination and must be naturally aligned with the CUDA * array elements. \p srcArray and \p srcOffset specify the CUDA array handle * and the offset in bytes into the array where the copy is to begin. * \p ByteCount specifies the number of bytes to copy and must be evenly * divisible by the array element size. 
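 *
 * An illustrative sketch of staging a linear device buffer through a 1D CUDA array; the
 * array handle and the device allocation are assumed to have been created elsewhere (for
 * example with ::cuArrayCreate and ::cuMemAlloc) and to hold at least {@code bytes} bytes:
 *
 * {@code
    // round-trips a device buffer through an existing 1D CUDA array
    static int roundTripThroughArray(CUarray_st array, long devPtr, long bytes) {
        int err = cuMemcpyDtoA(array, 0, devPtr, bytes);   // device -> array, starting at offset 0
        if (err != CUDA_SUCCESS) {
            return err;
        }
        return cuMemcpyAtoD(devPtr, array, 0, bytes);      // array -> device
    }
 * }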
* * @param dstDevice - Destination device pointer * @param srcArray - Source array * @param srcOffset - Offset in bytes of source array * @param ByteCount - Size of memory copy in bytes * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * \notefnerr * \note_sync * * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32 */ public static native @Cast("CUresult") int cuMemcpyAtoD(@Cast("CUdeviceptr") long dstDevice, CUarray_st srcArray, @Cast("size_t") long srcOffset, @Cast("size_t") long ByteCount); /** * \brief Copies memory from Host to Array * * Copies from host memory to a 1D CUDA array. \p dstArray and \p dstOffset * specify the CUDA array handle and starting offset in bytes of the destination * data. \p srcHost specifies the base address of the source. \p ByteCount specifies * the number of bytes to copy. * * @param dstArray - Destination array * @param dstOffset - Offset in bytes of destination array * @param srcHost - Source host pointer * @param ByteCount - Size of memory copy in bytes * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * \notefnerr * \note_sync * * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoAAsync, * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32 */ public static native @Cast("CUresult") int cuMemcpyHtoA(CUarray_st dstArray, @Cast("size_t") long dstOffset, @Const Pointer srcHost, @Cast("size_t") long ByteCount); /** * \brief Copies memory from Array to Host * * Copies from one 1D CUDA array to host memory. \p dstHost specifies the base * pointer of the destination. \p srcArray and \p srcOffset specify the CUDA * array handle and starting offset in bytes of the source data. * \p ByteCount specifies the number of bytes to copy.
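 *
 * A small sketch of a host / array round trip using the bindings declared here; the 1D CUDA
 * array is assumed to exist already and to hold at least 4096 bytes:
 *
 * {@code
    static void hostArrayRoundTrip(CUarray_st array) {
        byte[] data = new byte[4096];
        BytePointer host = new BytePointer(data);            // pageable host buffer
        cuMemcpyHtoA(array, 0, host, data.length);           // host  -> array
        cuMemcpyAtoH(host, array, 0, data.length);           // array -> host
        host.get(data);                                      // copy the native buffer back into the Java array
    }
 * }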
* * @param dstHost - Destination device pointer * @param srcArray - Source array * @param srcOffset - Offset in bytes of source array * @param ByteCount - Size of memory copy in bytes * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * \notefnerr * \note_sync * * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, * ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32 */ public static native @Cast("CUresult") int cuMemcpyAtoH(Pointer dstHost, CUarray_st srcArray, @Cast("size_t") long srcOffset, @Cast("size_t") long ByteCount); /** * \brief Copies memory from Array to Array * * Copies from one 1D CUDA array to another. \p dstArray and \p srcArray * specify the handles of the destination and source CUDA arrays for the copy, * respectively. \p dstOffset and \p srcOffset specify the destination and * source offsets in bytes into the CUDA arrays. \p ByteCount is the number of * bytes to be copied. The size of the elements in the CUDA arrays need not be * the same format, but the elements must be the same size; and count must be * evenly divisible by that size. * * @param dstArray - Destination array * @param dstOffset - Offset in bytes of destination array * @param srcArray - Source array * @param srcOffset - Offset in bytes of source array * @param ByteCount - Size of memory copy in bytes * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * \notefnerr * \note_sync * * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoD, * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32 */ public static native @Cast("CUresult") int cuMemcpyAtoA(CUarray_st dstArray, @Cast("size_t") long dstOffset, CUarray_st srcArray, @Cast("size_t") long srcOffset, @Cast("size_t") long ByteCount); /** * \brief Copies memory for 2D arrays * * Perform a 2D memory copy according to the parameters specified in \p pCopy. * The ::CUDA_MEMCPY2D structure is defined as: * *
{@code
   typedef struct CUDA_MEMCPY2D_st {
      unsigned int srcXInBytes, srcY;
      CUmemorytype srcMemoryType;
          const void *srcHost;
          CUdeviceptr srcDevice;
          CUarray srcArray;
          unsigned int srcPitch;

      unsigned int dstXInBytes, dstY;
      CUmemorytype dstMemoryType;
          void *dstHost;
          CUdeviceptr dstDevice;
          CUarray dstArray;
          unsigned int dstPitch;

      unsigned int WidthInBytes;
      unsigned int Height;
   } CUDA_MEMCPY2D;
 * }
* where: * - ::srcMemoryType and ::dstMemoryType specify the type of memory of the * source and destination, respectively; ::CUmemorytype_enum is defined as: * *
{@code
   typedef enum CUmemorytype_enum {
      CU_MEMORYTYPE_HOST = 0x01,
      CU_MEMORYTYPE_DEVICE = 0x02,
      CU_MEMORYTYPE_ARRAY = 0x03,
      CU_MEMORYTYPE_UNIFIED = 0x04
   } CUmemorytype;
 * }
* * \par * If ::srcMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::srcDevice and ::srcPitch * specify the (unified virtual address space) base address of the source data * and the bytes per row to apply. ::srcArray is ignored. * This value may be used only if unified addressing is supported in the calling * context. * * \par * If ::srcMemoryType is ::CU_MEMORYTYPE_HOST, ::srcHost and ::srcPitch * specify the (host) base address of the source data and the bytes per row to * apply. ::srcArray is ignored. * * \par * If ::srcMemoryType is ::CU_MEMORYTYPE_DEVICE, ::srcDevice and ::srcPitch * specify the (device) base address of the source data and the bytes per row * to apply. ::srcArray is ignored. * * \par * If ::srcMemoryType is ::CU_MEMORYTYPE_ARRAY, ::srcArray specifies the * handle of the source data. ::srcHost, ::srcDevice and ::srcPitch are * ignored. * * \par * If ::dstMemoryType is ::CU_MEMORYTYPE_HOST, ::dstHost and ::dstPitch * specify the (host) base address of the destination data and the bytes per * row to apply. ::dstArray is ignored. * * \par * If ::dstMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::dstDevice and ::dstPitch * specify the (unified virtual address space) base address of the source data * and the bytes per row to apply. ::dstArray is ignored. * This value may be used only if unified addressing is supported in the calling * context. * * \par * If ::dstMemoryType is ::CU_MEMORYTYPE_DEVICE, ::dstDevice and ::dstPitch * specify the (device) base address of the destination data and the bytes per * row to apply. ::dstArray is ignored. * * \par * If ::dstMemoryType is ::CU_MEMORYTYPE_ARRAY, ::dstArray specifies the * handle of the destination data. ::dstHost, ::dstDevice and ::dstPitch are * ignored. * * - ::srcXInBytes and ::srcY specify the base address of the source data for * the copy. * * \par * For host pointers, the starting address is *
{@code
  void* Start = (void*)((char*)srcHost+srcY*srcPitch + srcXInBytes);
 * }
* * \par * For device pointers, the starting address is *
{@code
  CUdeviceptr Start = srcDevice+srcY*srcPitch+srcXInBytes;
 * }
* * \par * For CUDA arrays, ::srcXInBytes must be evenly divisible by the array * element size. * * - ::dstXInBytes and ::dstY specify the base address of the destination data * for the copy. * * \par * For host pointers, the base address is *
{@code
  void* dstStart = (void*)((char*)dstHost+dstY*dstPitch + dstXInBytes);
 * }
* * \par * For device pointers, the starting address is *
{@code
  CUdeviceptr dstStart = dstDevice+dstY*dstPitch+dstXInBytes;
 * }
* * \par * For CUDA arrays, ::dstXInBytes must be evenly divisible by the array * element size. * * - ::WidthInBytes and ::Height specify the width (in bytes) and height of * the 2D copy being performed. * - If specified, ::srcPitch must be greater than or equal to ::WidthInBytes + * ::srcXInBytes, and ::dstPitch must be greater than or equal to * ::WidthInBytes + dstXInBytes. * * \par * ::cuMemcpy2D() returns an error if any pitch is greater than the maximum * allowed (::CU_DEVICE_ATTRIBUTE_MAX_PITCH). ::cuMemAllocPitch() passes back * pitches that always work with ::cuMemcpy2D(). On intra-device memory copies * (device to device, CUDA array to device, CUDA array to CUDA array), * ::cuMemcpy2D() may fail for pitches not computed by ::cuMemAllocPitch(). * ::cuMemcpy2DUnaligned() does not have this restriction, but may run * significantly slower in the cases where ::cuMemcpy2D() would have returned * an error code. * * @param pCopy - Parameters for the memory copy * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * \notefnerr * \note_sync * * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, * ::cuMemAllocPitch, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32 */ public static native @Cast("CUresult") int cuMemcpy2D(@Const CUDA_MEMCPY2D pCopy); /** * \brief Copies memory for 2D arrays * * Perform a 2D memory copy according to the parameters specified in \p pCopy. * The ::CUDA_MEMCPY2D structure is defined as: * *
{@code
   typedef struct CUDA_MEMCPY2D_st {
      unsigned int srcXInBytes, srcY;
      CUmemorytype srcMemoryType;
      const void *srcHost;
      CUdeviceptr srcDevice;
      CUarray srcArray;
      unsigned int srcPitch;
      unsigned int dstXInBytes, dstY;
      CUmemorytype dstMemoryType;
      void *dstHost;
      CUdeviceptr dstDevice;
      CUarray dstArray;
      unsigned int dstPitch;
      unsigned int WidthInBytes;
      unsigned int Height;
   } CUDA_MEMCPY2D;
 * }
* where: * - ::srcMemoryType and ::dstMemoryType specify the type of memory of the * source and destination, respectively; ::CUmemorytype_enum is defined as: * *
{@code
   typedef enum CUmemorytype_enum {
      CU_MEMORYTYPE_HOST = 0x01,
      CU_MEMORYTYPE_DEVICE = 0x02,
      CU_MEMORYTYPE_ARRAY = 0x03,
      CU_MEMORYTYPE_UNIFIED = 0x04
   } CUmemorytype;
 * }
* * \par * If ::srcMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::srcDevice and ::srcPitch * specify the (unified virtual address space) base address of the source data * and the bytes per row to apply. ::srcArray is ignored. * This value may be used only if unified addressing is supported in the calling * context. * * \par * If ::srcMemoryType is ::CU_MEMORYTYPE_HOST, ::srcHost and ::srcPitch * specify the (host) base address of the source data and the bytes per row to * apply. ::srcArray is ignored. * * \par * If ::srcMemoryType is ::CU_MEMORYTYPE_DEVICE, ::srcDevice and ::srcPitch * specify the (device) base address of the source data and the bytes per row * to apply. ::srcArray is ignored. * * \par * If ::srcMemoryType is ::CU_MEMORYTYPE_ARRAY, ::srcArray specifies the * handle of the source data. ::srcHost, ::srcDevice and ::srcPitch are * ignored. * * \par * If ::dstMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::dstDevice and ::dstPitch * specify the (unified virtual address space) base address of the source data * and the bytes per row to apply. ::dstArray is ignored. * This value may be used only if unified addressing is supported in the calling * context. * * \par * If ::dstMemoryType is ::CU_MEMORYTYPE_HOST, ::dstHost and ::dstPitch * specify the (host) base address of the destination data and the bytes per * row to apply. ::dstArray is ignored. * * \par * If ::dstMemoryType is ::CU_MEMORYTYPE_DEVICE, ::dstDevice and ::dstPitch * specify the (device) base address of the destination data and the bytes per * row to apply. ::dstArray is ignored. * * \par * If ::dstMemoryType is ::CU_MEMORYTYPE_ARRAY, ::dstArray specifies the * handle of the destination data. ::dstHost, ::dstDevice and ::dstPitch are * ignored. * * - ::srcXInBytes and ::srcY specify the base address of the source data for * the copy. * * \par * For host pointers, the starting address is *
{@code
  void* Start = (void*)((char*)srcHost+srcY*srcPitch + srcXInBytes);
 * }
* * \par * For device pointers, the starting address is *
{@code
  CUdeviceptr Start = srcDevice+srcY*srcPitch+srcXInBytes;
 * }
* * \par * For CUDA arrays, ::srcXInBytes must be evenly divisible by the array * element size. * * - ::dstXInBytes and ::dstY specify the base address of the destination data * for the copy. * * \par * For host pointers, the base address is *
{@code
  void* dstStart = (void*)((char*)dstHost+dstY*dstPitch + dstXInBytes);
 * }
* * \par * For device pointers, the starting address is *
{@code
  CUdeviceptr dstStart = dstDevice+dstY*dstPitch+dstXInBytes;
 * }
* * \par * For CUDA arrays, ::dstXInBytes must be evenly divisible by the array * element size. * * - ::WidthInBytes and ::Height specify the width (in bytes) and height of * the 2D copy being performed. * - If specified, ::srcPitch must be greater than or equal to ::WidthInBytes + * ::srcXInBytes, and ::dstPitch must be greater than or equal to * ::WidthInBytes + dstXInBytes. * * \par * ::cuMemcpy2D() returns an error if any pitch is greater than the maximum * allowed (::CU_DEVICE_ATTRIBUTE_MAX_PITCH). ::cuMemAllocPitch() passes back * pitches that always work with ::cuMemcpy2D(). On intra-device memory copies * (device to device, CUDA array to device, CUDA array to CUDA array), * ::cuMemcpy2D() may fail for pitches not computed by ::cuMemAllocPitch(). * ::cuMemcpy2DUnaligned() does not have this restriction, but may run * significantly slower in the cases where ::cuMemcpy2D() would have returned * an error code. * * @param pCopy - Parameters for the memory copy * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * \notefnerr * \note_sync * * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32 */ public static native @Cast("CUresult") int cuMemcpy2DUnaligned(@Const CUDA_MEMCPY2D pCopy); /** * \brief Copies memory for 3D arrays * * Perform a 3D memory copy according to the parameters specified in * \p pCopy. The ::CUDA_MEMCPY3D structure is defined as: * *
{@code
        typedef struct CUDA_MEMCPY3D_st {

            unsigned int srcXInBytes, srcY, srcZ;
            unsigned int srcLOD;
            CUmemorytype srcMemoryType;
                const void *srcHost;
                CUdeviceptr srcDevice;
                CUarray srcArray;
                unsigned int srcPitch;  // ignored when src is array
                unsigned int srcHeight; // ignored when src is array; may be 0 if Depth==1

            unsigned int dstXInBytes, dstY, dstZ;
            unsigned int dstLOD;
            CUmemorytype dstMemoryType;
                void *dstHost;
                CUdeviceptr dstDevice;
                CUarray dstArray;
                unsigned int dstPitch;  // ignored when dst is array
                unsigned int dstHeight; // ignored when dst is array; may be 0 if Depth==1

            unsigned int WidthInBytes;
            unsigned int Height;
            unsigned int Depth;
        } CUDA_MEMCPY3D;
 * }
* where: * - ::srcMemoryType and ::dstMemoryType specify the type of memory of the * source and destination, respectively; ::CUmemorytype_enum is defined as: * *
{@code
   typedef enum CUmemorytype_enum {
      CU_MEMORYTYPE_HOST = 0x01,
      CU_MEMORYTYPE_DEVICE = 0x02,
      CU_MEMORYTYPE_ARRAY = 0x03,
      CU_MEMORYTYPE_UNIFIED = 0x04
   } CUmemorytype;
 * }
* * \par * If ::srcMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::srcDevice and ::srcPitch * specify the (unified virtual address space) base address of the source data * and the bytes per row to apply. ::srcArray is ignored. * This value may be used only if unified addressing is supported in the calling * context. * * \par * If ::srcMemoryType is ::CU_MEMORYTYPE_HOST, ::srcHost, ::srcPitch and * ::srcHeight specify the (host) base address of the source data, the bytes * per row, and the height of each 2D slice of the 3D array. ::srcArray is * ignored. * * \par * If ::srcMemoryType is ::CU_MEMORYTYPE_DEVICE, ::srcDevice, ::srcPitch and * ::srcHeight specify the (device) base address of the source data, the bytes * per row, and the height of each 2D slice of the 3D array. ::srcArray is * ignored. * * \par * If ::srcMemoryType is ::CU_MEMORYTYPE_ARRAY, ::srcArray specifies the * handle of the source data. ::srcHost, ::srcDevice, ::srcPitch and * ::srcHeight are ignored. * * \par * If ::dstMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::dstDevice and ::dstPitch * specify the (unified virtual address space) base address of the source data * and the bytes per row to apply. ::dstArray is ignored. * This value may be used only if unified addressing is supported in the calling * context. * * \par * If ::dstMemoryType is ::CU_MEMORYTYPE_HOST, ::dstHost and ::dstPitch * specify the (host) base address of the destination data, the bytes per row, * and the height of each 2D slice of the 3D array. ::dstArray is ignored. * * \par * If ::dstMemoryType is ::CU_MEMORYTYPE_DEVICE, ::dstDevice and ::dstPitch * specify the (device) base address of the destination data, the bytes per * row, and the height of each 2D slice of the 3D array. ::dstArray is ignored. * * \par * If ::dstMemoryType is ::CU_MEMORYTYPE_ARRAY, ::dstArray specifies the * handle of the destination data. ::dstHost, ::dstDevice, ::dstPitch and * ::dstHeight are ignored. * * - ::srcXInBytes, ::srcY and ::srcZ specify the base address of the source * data for the copy. * * \par * For host pointers, the starting address is *
{@code
  void* Start = (void*)((char*)srcHost+(srcZ*srcHeight+srcY)*srcPitch + srcXInBytes);
 * }
* * \par * For device pointers, the starting address is *
{@code
  CUdeviceptr Start = srcDevice+(srcZ*srcHeight+srcY)*srcPitch+srcXInBytes;
 * }
* * \par * For CUDA arrays, ::srcXInBytes must be evenly divisible by the array * element size. * * - dstXInBytes, ::dstY and ::dstZ specify the base address of the * destination data for the copy. * * \par * For host pointers, the base address is *
{@code
  void* dstStart = (void*)((char*)dstHost+(dstZ*dstHeight+dstY)*dstPitch + dstXInBytes);
 * }
* * \par * For device pointers, the starting address is *
{@code
  CUdeviceptr dstStart = dstDevice+(dstZ*dstHeight+dstY)*dstPitch+dstXInBytes;
 * }
* * \par * For CUDA arrays, ::dstXInBytes must be evenly divisible by the array * element size. * * - ::WidthInBytes, ::Height and ::Depth specify the width (in bytes), height * and depth of the 3D copy being performed. * - If specified, ::srcPitch must be greater than or equal to ::WidthInBytes + * ::srcXInBytes, and ::dstPitch must be greater than or equal to * ::WidthInBytes + dstXInBytes. * - If specified, ::srcHeight must be greater than or equal to ::Height + * ::srcY, and ::dstHeight must be greater than or equal to ::Height + ::dstY. * * \par * ::cuMemcpy3D() returns an error if any pitch is greater than the maximum * allowed (::CU_DEVICE_ATTRIBUTE_MAX_PITCH). * * The ::srcLOD and ::dstLOD members of the ::CUDA_MEMCPY3D structure must be * set to 0. * * @param pCopy - Parameters for the memory copy * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * \notefnerr * \note_sync * * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, * ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32 */ public static native @Cast("CUresult") int cuMemcpy3D(@Const CUDA_MEMCPY3D pCopy); // #endif /* __CUDA_API_VERSION >= 3020 */ // #if __CUDA_API_VERSION >= 4000 /** * \brief Copies memory between contexts * * Perform a 3D memory copy according to the parameters specified in * \p pCopy. See the definition of the ::CUDA_MEMCPY3D_PEER structure * for documentation of its parameters. * * @param pCopy - Parameters for the memory copy * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * \notefnerr * \note_sync * * \sa ::cuMemcpyDtoD, ::cuMemcpyPeer, ::cuMemcpyDtoDAsync, ::cuMemcpyPeerAsync, * ::cuMemcpy3DPeerAsync */ public static native @Cast("CUresult") int cuMemcpy3DPeer(@Const CUDA_MEMCPY3D_PEER pCopy); /** * \brief Copies memory asynchronously * * Copies data between two pointers. * \p dst and \p src are base pointers of the destination and source, respectively. * \p ByteCount specifies the number of bytes to copy. * Note that this function infers the type of the transfer (host to host, host to * device, device to device, or device to host) from the pointer values. This * function is only allowed in contexts which support unified addressing. 
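 *
 * A minimal sketch of an asynchronous copy issued on the legacy default (null) stream; the two
 * device allocations are assumed to exist, the current context is assumed to support unified
 * addressing, and ::cuCtxSynchronize is assumed to be bound elsewhere in this class:
 *
 * {@code
    static int copyAsyncOnDefaultStream(long dstDev, long srcDev, long bytes) {
        int err = cuMemcpyAsync(dstDev, srcDev, bytes, null);  // null selects the legacy default stream
        if (err != CUDA_SUCCESS) {
            return err;
        }
        return cuCtxSynchronize();                             // wait for the copy before reusing the buffers
    }
 * }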
* * @param dst - Destination unified virtual address space pointer * @param src - Source unified virtual address space pointer * @param ByteCount - Size of memory copy in bytes * @param hStream - Stream identifier * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * \notefnerr * \note_async * \note_null_stream * * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, * ::cuMemsetD32, ::cuMemsetD32Async */ public static native @Cast("CUresult") int cuMemcpyAsync(@Cast("CUdeviceptr") long dst, @Cast("CUdeviceptr") long src, @Cast("size_t") long ByteCount, CUstream_st hStream); /** * \brief Copies device memory between two contexts asynchronously. * * Copies from device memory in one context to device memory in another * context. \p dstDevice is the base device pointer of the destination memory * and \p dstContext is the destination context. \p srcDevice is the base * device pointer of the source memory and \p srcContext is the source context. * \p ByteCount specifies the number of bytes to copy. * * @param dstDevice - Destination device pointer * @param dstContext - Destination context * @param srcDevice - Source device pointer * @param srcContext - Source context * @param ByteCount - Size of memory copy in bytes * @param hStream - Stream identifier * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * \notefnerr * \note_async * \note_null_stream * * \sa ::cuMemcpyDtoD, ::cuMemcpyPeer, ::cuMemcpy3DPeer, ::cuMemcpyDtoDAsync, * ::cuMemcpy3DPeerAsync */ public static native @Cast("CUresult") int cuMemcpyPeerAsync(@Cast("CUdeviceptr") long dstDevice, CUctx_st dstContext, @Cast("CUdeviceptr") long srcDevice, CUctx_st srcContext, @Cast("size_t") long ByteCount, CUstream_st hStream); // #endif /* __CUDA_API_VERSION >= 4000 */ // #if __CUDA_API_VERSION >= 3020 /** * \brief Copies memory from Host to Device * * Copies from host memory to device memory. \p dstDevice and \p srcHost are * the base addresses of the destination and source, respectively. \p ByteCount * specifies the number of bytes to copy.
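 *
 * A sketch of the usual pinned-buffer pattern: the host range is page-locked with
 * ::cuMemHostRegister so the asynchronous copies can overlap with other work. The device
 * allocation and the stream are assumed to exist, and ::cuStreamSynchronize is assumed to be
 * bound elsewhere in this class:
 *
 * {@code
    static void pinnedRoundTrip(long devPtr, CUstream_st hStream, long bytes) {
        BytePointer host = new BytePointer(bytes);           // ordinary host memory
        cuMemHostRegister(host, bytes, 0);                    // page-lock it for fast, overlapped transfers
        cuMemcpyHtoDAsync(devPtr, host, bytes, hStream);      // host -> device, ordered within hStream
        cuMemcpyDtoHAsync(host, devPtr, bytes, hStream);      // device -> host, runs after the upload
        cuStreamSynchronize(hStream);                         // wait before reading or unregistering 'host'
        cuMemHostUnregister(host);
    }
 * }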
* * @param dstDevice - Destination device pointer * @param srcHost - Source host pointer * @param ByteCount - Size of memory copy in bytes * @param hStream - Stream identifier * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * \notefnerr * \note_async * \note_null_stream * * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, * ::cuMemcpyHtoD, ::cuMemFree, ::cuMemFreeHost, * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, * ::cuMemsetD32, ::cuMemsetD32Async */ public static native @Cast("CUresult") int cuMemcpyHtoDAsync(@Cast("CUdeviceptr") long dstDevice, @Const Pointer srcHost, @Cast("size_t") long ByteCount, CUstream_st hStream); /** * \brief Copies memory from Device to Host * * Copies from device to host memory. \p dstHost and \p srcDevice specify the * base pointers of the destination and source, respectively. \p ByteCount * specifies the number of bytes to copy. * * @param dstHost - Destination host pointer * @param srcDevice - Source device pointer * @param ByteCount - Size of memory copy in bytes * @param hStream - Stream identifier * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * \notefnerr * \note_async * \note_null_stream * * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, * ::cuMemcpyDtoH, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, * ::cuMemsetD32, ::cuMemsetD32Async */ public static native @Cast("CUresult") int cuMemcpyDtoHAsync(Pointer dstHost, @Cast("CUdeviceptr") long srcDevice, @Cast("size_t") long ByteCount, CUstream_st hStream); /** * \brief Copies memory from Device to Device * * Copies from device memory to device memory. \p dstDevice and \p srcDevice * are the base pointers of the destination and source, respectively. * \p ByteCount specifies the number of bytes to copy. 
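 *
 * Copies queued on the same stream execute in issue order, so a device-to-device copy enqueued
 * after an upload sees the uploaded data without extra synchronization. A short sketch, assuming
 * the buffers and the stream already exist:
 *
 * {@code
    static void stagedUpload(long staging, long dst, Pointer srcHost, long bytes, CUstream_st hStream) {
        cuMemcpyHtoDAsync(staging, srcHost, bytes, hStream);  // 1) host -> staging buffer
        cuMemcpyDtoDAsync(dst, staging, bytes, hStream);      // 2) staging -> destination, after 1) completes
    }
 * }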
* * @param dstDevice - Destination device pointer * @param srcDevice - Source device pointer * @param ByteCount - Size of memory copy in bytes * @param hStream - Stream identifier * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * \notefnerr * \note_async * \note_null_stream * * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, * ::cuMemsetD32, ::cuMemsetD32Async */ public static native @Cast("CUresult") int cuMemcpyDtoDAsync(@Cast("CUdeviceptr") long dstDevice, @Cast("CUdeviceptr") long srcDevice, @Cast("size_t") long ByteCount, CUstream_st hStream); /** * \brief Copies memory from Host to Array * * Copies from host memory to a 1D CUDA array. \p dstArray and \p dstOffset * specify the CUDA array handle and starting offset in bytes of the * destination data. \p srcHost specifies the base address of the source. * \p ByteCount specifies the number of bytes to copy. * * @param dstArray - Destination array * @param dstOffset - Offset in bytes of destination array * @param srcHost - Source host pointer * @param ByteCount - Size of memory copy in bytes * @param hStream - Stream identifier * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * \notefnerr * \note_async * \note_null_stream * * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, * ::cuMemsetD32, ::cuMemsetD32Async */ public static native @Cast("CUresult") int cuMemcpyHtoAAsync(CUarray_st dstArray, @Cast("size_t") long dstOffset, @Const Pointer srcHost, @Cast("size_t") long ByteCount, CUstream_st hStream); /** * \brief Copies memory from Array to Host * * Copies from one 1D CUDA array to host memory. \p dstHost specifies the base * pointer of the destination. \p srcArray and \p srcOffset specify the CUDA * array handle and starting offset in bytes of the source data. * \p ByteCount specifies the number of bytes to copy. 
* * @param dstHost - Destination pointer * @param srcArray - Source array * @param srcOffset - Offset in bytes of source array * @param ByteCount - Size of memory copy in bytes * @param hStream - Stream identifier * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * \notefnerr * \note_async * \note_null_stream * * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, * ::cuMemcpyAtoH, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, * ::cuMemsetD32, ::cuMemsetD32Async */ public static native @Cast("CUresult") int cuMemcpyAtoHAsync(Pointer dstHost, CUarray_st srcArray, @Cast("size_t") long srcOffset, @Cast("size_t") long ByteCount, CUstream_st hStream); /** * \brief Copies memory for 2D arrays * * Perform a 2D memory copy according to the parameters specified in \p pCopy. * The ::CUDA_MEMCPY2D structure is defined as: * *
{@code
   typedef struct CUDA_MEMCPY2D_st {
      unsigned int srcXInBytes, srcY;
      CUmemorytype srcMemoryType;
      const void *srcHost;
      CUdeviceptr srcDevice;
      CUarray srcArray;
      unsigned int srcPitch;
      unsigned int dstXInBytes, dstY;
      CUmemorytype dstMemoryType;
      void *dstHost;
      CUdeviceptr dstDevice;
      CUarray dstArray;
      unsigned int dstPitch;
      unsigned int WidthInBytes;
      unsigned int Height;
   } CUDA_MEMCPY2D;
 * }
* where: * - ::srcMemoryType and ::dstMemoryType specify the type of memory of the * source and destination, respectively; ::CUmemorytype_enum is defined as: * *
{@code
   typedef enum CUmemorytype_enum {
      CU_MEMORYTYPE_HOST = 0x01,
      CU_MEMORYTYPE_DEVICE = 0x02,
      CU_MEMORYTYPE_ARRAY = 0x03,
      CU_MEMORYTYPE_UNIFIED = 0x04
   } CUmemorytype;
 * }
* * \par * If ::srcMemoryType is ::CU_MEMORYTYPE_HOST, ::srcHost and ::srcPitch * specify the (host) base address of the source data and the bytes per row to * apply. ::srcArray is ignored. * * \par * If ::srcMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::srcDevice and ::srcPitch * specify the (unified virtual address space) base address of the source data * and the bytes per row to apply. ::srcArray is ignored. * This value may be used only if unified addressing is supported in the calling * context. * * \par * If ::srcMemoryType is ::CU_MEMORYTYPE_DEVICE, ::srcDevice and ::srcPitch * specify the (device) base address of the source data and the bytes per row * to apply. ::srcArray is ignored. * * \par * If ::srcMemoryType is ::CU_MEMORYTYPE_ARRAY, ::srcArray specifies the * handle of the source data. ::srcHost, ::srcDevice and ::srcPitch are * ignored. * * \par * If ::dstMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::dstDevice and ::dstPitch * specify the (unified virtual address space) base address of the source data * and the bytes per row to apply. ::dstArray is ignored. * This value may be used only if unified addressing is supported in the calling * context. * * \par * If ::dstMemoryType is ::CU_MEMORYTYPE_HOST, ::dstHost and ::dstPitch * specify the (host) base address of the destination data and the bytes per * row to apply. ::dstArray is ignored. * * \par * If ::dstMemoryType is ::CU_MEMORYTYPE_DEVICE, ::dstDevice and ::dstPitch * specify the (device) base address of the destination data and the bytes per * row to apply. ::dstArray is ignored. * * \par * If ::dstMemoryType is ::CU_MEMORYTYPE_ARRAY, ::dstArray specifies the * handle of the destination data. ::dstHost, ::dstDevice and ::dstPitch are * ignored. * * - ::srcXInBytes and ::srcY specify the base address of the source data for * the copy. * * \par * For host pointers, the starting address is *
{@code
  void* Start = (void*)((char*)srcHost+srcY*srcPitch + srcXInBytes);
 * }
* * \par * For device pointers, the starting address is *
{@code
  CUdeviceptr Start = srcDevice+srcY*srcPitch+srcXInBytes;
 * }
* * \par * For CUDA arrays, ::srcXInBytes must be evenly divisible by the array * element size. * * - ::dstXInBytes and ::dstY specify the base address of the destination data * for the copy. * * \par * For host pointers, the base address is *
{@code
  void* dstStart = (void*)((char*)dstHost+dstY*dstPitch + dstXInBytes);
 * }
* * \par * For device pointers, the starting address is *
{@code
  CUdeviceptr dstStart = dstDevice+dstY*dstPitch+dstXInBytes;
 * }
* * \par * For CUDA arrays, ::dstXInBytes must be evenly divisible by the array * element size. * * - ::WidthInBytes and ::Height specify the width (in bytes) and height of * the 2D copy being performed. * - If specified, ::srcPitch must be greater than or equal to ::WidthInBytes + * ::srcXInBytes, and ::dstPitch must be greater than or equal to * ::WidthInBytes + dstXInBytes. * - If specified, ::srcPitch must be greater than or equal to ::WidthInBytes + * ::srcXInBytes, and ::dstPitch must be greater than or equal to * ::WidthInBytes + dstXInBytes. * - If specified, ::srcHeight must be greater than or equal to ::Height + * ::srcY, and ::dstHeight must be greater than or equal to ::Height + ::dstY. * * \par * ::cuMemcpy2DAsync() returns an error if any pitch is greater than the maximum * allowed (::CU_DEVICE_ATTRIBUTE_MAX_PITCH). ::cuMemAllocPitch() passes back * pitches that always work with ::cuMemcpy2D(). On intra-device memory copies * (device to device, CUDA array to device, CUDA array to CUDA array), * ::cuMemcpy2DAsync() may fail for pitches not computed by ::cuMemAllocPitch(). * * @param pCopy - Parameters for the memory copy * @param hStream - Stream identifier * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * \notefnerr * \note_async * \note_null_stream * * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DUnaligned, * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, * ::cuMemsetD32, ::cuMemsetD32Async */ public static native @Cast("CUresult") int cuMemcpy2DAsync(@Const CUDA_MEMCPY2D pCopy, CUstream_st hStream); /** * \brief Copies memory for 3D arrays * * Perform a 3D memory copy according to the parameters specified in * \p pCopy. The ::CUDA_MEMCPY3D structure is defined as: * *
{@code
        typedef struct CUDA_MEMCPY3D_st {

            unsigned int srcXInBytes, srcY, srcZ;
            unsigned int srcLOD;
            CUmemorytype srcMemoryType;
                const void *srcHost;
                CUdeviceptr srcDevice;
                CUarray srcArray;
                unsigned int srcPitch;  // ignored when src is array
                unsigned int srcHeight; // ignored when src is array; may be 0 if Depth==1

            unsigned int dstXInBytes, dstY, dstZ;
            unsigned int dstLOD;
            CUmemorytype dstMemoryType;
                void *dstHost;
                CUdeviceptr dstDevice;
                CUarray dstArray;
                unsigned int dstPitch;  // ignored when dst is array
                unsigned int dstHeight; // ignored when dst is array; may be 0 if Depth==1

            unsigned int WidthInBytes;
            unsigned int Height;
            unsigned int Depth;
        } CUDA_MEMCPY3D;
 * }
* where: * - ::srcMemoryType and ::dstMemoryType specify the type of memory of the * source and destination, respectively; ::CUmemorytype_enum is defined as: * *
{@code
   typedef enum CUmemorytype_enum {
      CU_MEMORYTYPE_HOST = 0x01,
      CU_MEMORYTYPE_DEVICE = 0x02,
      CU_MEMORYTYPE_ARRAY = 0x03,
      CU_MEMORYTYPE_UNIFIED = 0x04
   } CUmemorytype;
 * }
* * \par * If ::srcMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::srcDevice and ::srcPitch * specify the (unified virtual address space) base address of the source data * and the bytes per row to apply. ::srcArray is ignored. * This value may be used only if unified addressing is supported in the calling * context. * * \par * If ::srcMemoryType is ::CU_MEMORYTYPE_HOST, ::srcHost, ::srcPitch and * ::srcHeight specify the (host) base address of the source data, the bytes * per row, and the height of each 2D slice of the 3D array. ::srcArray is * ignored. * * \par * If ::srcMemoryType is ::CU_MEMORYTYPE_DEVICE, ::srcDevice, ::srcPitch and * ::srcHeight specify the (device) base address of the source data, the bytes * per row, and the height of each 2D slice of the 3D array. ::srcArray is * ignored. * * \par * If ::srcMemoryType is ::CU_MEMORYTYPE_ARRAY, ::srcArray specifies the * handle of the source data. ::srcHost, ::srcDevice, ::srcPitch and * ::srcHeight are ignored. * * \par * If ::dstMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::dstDevice and ::dstPitch * specify the (unified virtual address space) base address of the source data * and the bytes per row to apply. ::dstArray is ignored. * This value may be used only if unified addressing is supported in the calling * context. * * \par * If ::dstMemoryType is ::CU_MEMORYTYPE_HOST, ::dstHost and ::dstPitch * specify the (host) base address of the destination data, the bytes per row, * and the height of each 2D slice of the 3D array. ::dstArray is ignored. * * \par * If ::dstMemoryType is ::CU_MEMORYTYPE_DEVICE, ::dstDevice and ::dstPitch * specify the (device) base address of the destination data, the bytes per * row, and the height of each 2D slice of the 3D array. ::dstArray is ignored. * * \par * If ::dstMemoryType is ::CU_MEMORYTYPE_ARRAY, ::dstArray specifies the * handle of the destination data. ::dstHost, ::dstDevice, ::dstPitch and * ::dstHeight are ignored. * * - ::srcXInBytes, ::srcY and ::srcZ specify the base address of the source * data for the copy. * * \par * For host pointers, the starting address is *
{@code
  void* Start = (void*)((char*)srcHost+(srcZ*srcHeight+srcY)*srcPitch + srcXInBytes);
 * }
* * \par * For device pointers, the starting address is *
{@code
  CUdeviceptr Start = srcDevice+(srcZ*srcHeight+srcY)*srcPitch+srcXInBytes;
 * }
* * \par * For CUDA arrays, ::srcXInBytes must be evenly divisible by the array * element size. * * - dstXInBytes, ::dstY and ::dstZ specify the base address of the * destination data for the copy. * * \par * For host pointers, the base address is *
{@code
  void* dstStart = (void*)((char*)dstHost+(dstZ*dstHeight+dstY)*dstPitch + dstXInBytes);
 * }
* * \par * For device pointers, the starting address is *
{@code
  CUdeviceptr dstStart = dstDevice+(dstZ*dstHeight+dstY)*dstPitch+dstXInBytes;
 * }
* * \par * For CUDA arrays, ::dstXInBytes must be evenly divisible by the array * element size. * * - ::WidthInBytes, ::Height and ::Depth specify the width (in bytes), height * and depth of the 3D copy being performed. * - If specified, ::srcPitch must be greater than or equal to ::WidthInBytes + * ::srcXInBytes, and ::dstPitch must be greater than or equal to * ::WidthInBytes + dstXInBytes. * - If specified, ::srcHeight must be greater than or equal to ::Height + * ::srcY, and ::dstHeight must be greater than or equal to ::Height + ::dstY. * * \par * ::cuMemcpy3DAsync() returns an error if any pitch is greater than the maximum * allowed (::CU_DEVICE_ATTRIBUTE_MAX_PITCH). * * The ::srcLOD and ::dstLOD members of the ::CUDA_MEMCPY3D structure must be * set to 0. * * @param pCopy - Parameters for the memory copy * @param hStream - Stream identifier * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * \notefnerr * \note_async * \note_null_stream * * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, * ::cuMemcpy3D, ::cuMemcpyAtoA, ::cuMemcpyAtoD, * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, * ::cuMemsetD32, ::cuMemsetD32Async */ public static native @Cast("CUresult") int cuMemcpy3DAsync(@Const CUDA_MEMCPY3D pCopy, CUstream_st hStream); // #endif /* __CUDA_API_VERSION >= 3020 */ // #if __CUDA_API_VERSION >= 4000 /** * \brief Copies memory between contexts asynchronously. * * Perform a 3D memory copy according to the parameters specified in * \p pCopy. See the definition of the ::CUDA_MEMCPY3D_PEER structure * for documentation of its parameters. * * @param pCopy - Parameters for the memory copy * @param hStream - Stream identifier * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * \notefnerr * \note_async * \note_null_stream * * \sa ::cuMemcpyDtoD, ::cuMemcpyPeer, ::cuMemcpyDtoDAsync, ::cuMemcpyPeerAsync, * ::cuMemcpy3DPeerAsync */ public static native @Cast("CUresult") int cuMemcpy3DPeerAsync(@Const CUDA_MEMCPY3D_PEER pCopy, CUstream_st hStream); // #endif /* __CUDA_API_VERSION >= 4000 */ // #if __CUDA_API_VERSION >= 3020 /** * \brief Initializes device memory * * Sets the memory range of \p N 8-bit values to the specified value * \p uc. 
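 *
 * A minimal sketch that zero-fills a fresh device allocation (::cuMemAlloc is assumed to be
 * bound elsewhere in this class with a LongPointer out-parameter, as other CUdeviceptr*
 * parameters are in this file):
 *
 * {@code
    LongPointer dptr = new LongPointer(1);
    cuMemAlloc(dptr, 4096);                                   // assumed binding
    cuMemsetD8(dptr.get(), (byte) 0, 4096);                   // set 4096 8-bit values to zero
 * }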
* * @param dstDevice - Destination device pointer * @param uc - Value to set * @param N - Number of elements * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * \notefnerr * \note_memset * * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, * ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, * ::cuMemsetD32, ::cuMemsetD32Async */ public static native @Cast("CUresult") int cuMemsetD8(@Cast("CUdeviceptr") long dstDevice, @Cast("unsigned char") byte uc, @Cast("size_t") long N); /** * \brief Initializes device memory * * Sets the memory range of \p N 16-bit values to the specified value * \p us. The \p dstDevice pointer must be two byte aligned. * * @param dstDevice - Destination device pointer * @param us - Value to set * @param N - Number of elements * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * \notefnerr * \note_memset * * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16Async, * ::cuMemsetD32, ::cuMemsetD32Async */ public static native @Cast("CUresult") int cuMemsetD16(@Cast("CUdeviceptr") long dstDevice, @Cast("unsigned short") short us, @Cast("size_t") long N); /** * \brief Initializes device memory * * Sets the memory range of \p N 32-bit values to the specified value * \p ui. The \p dstDevice pointer must be four byte aligned. 
* * @param dstDevice - Destination device pointer * @param ui - Value to set * @param N - Number of elements * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * \notefnerr * \note_memset * * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, * ::cuMemsetD32Async */ public static native @Cast("CUresult") int cuMemsetD32(@Cast("CUdeviceptr") long dstDevice, @Cast("unsigned int") int ui, @Cast("size_t") long N); /** * \brief Initializes device memory * * Sets the 2D memory range of \p Width 8-bit values to the specified value * \p uc. \p Height specifies the number of rows to set, and \p dstPitch * specifies the number of bytes between each row. This function performs * fastest when the pitch is one that has been passed back by * ::cuMemAllocPitch(). * * @param dstDevice - Destination device pointer * @param dstPitch - Pitch of destination device pointer * @param uc - Value to set * @param Width - Width of row * @param Height - Number of rows * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * \notefnerr * \note_memset * * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8Async, * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, * ::cuMemsetD32, ::cuMemsetD32Async */ public static native @Cast("CUresult") int cuMemsetD2D8(@Cast("CUdeviceptr") long dstDevice, @Cast("size_t") long dstPitch, @Cast("unsigned char") byte uc, @Cast("size_t") long Width, @Cast("size_t") long Height); /** * \brief Initializes device memory * * Sets the 2D memory range of \p Width 16-bit values to the specified value * \p us. \p Height specifies the number of rows to set, and \p dstPitch * specifies the number of bytes between each row. The \p dstDevice pointer * and \p dstPitch offset must be two byte aligned. This function performs * fastest when the pitch is one that has been passed back by * ::cuMemAllocPitch(). 
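* * \par * A minimal JavaCPP-side sketch (an illustration added here, not part of the original CUDA header); it assumes a static import of this class and that {@code dptr} and {@code pitch} describe a pitched allocation obtained earlier, for example from ::cuMemAllocPitch: *
 {@code
   // Hypothetical example: set a Width x Height region of 16-bit values to 0x7FFF.
   long width = 512, height = 256;   // width in 16-bit elements, height in rows
   int status = cuMemsetD2D16(dptr, pitch, (short) 0x7FFF, width, height);
 * }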
* * @param dstDevice - Destination device pointer * @param dstPitch - Pitch of destination device pointer * @param us - Value to set * @param Width - Width of row * @param Height - Number of rows * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * \notefnerr * \note_memset * * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, * ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, * ::cuMemsetD32, ::cuMemsetD32Async */ public static native @Cast("CUresult") int cuMemsetD2D16(@Cast("CUdeviceptr") long dstDevice, @Cast("size_t") long dstPitch, @Cast("unsigned short") short us, @Cast("size_t") long Width, @Cast("size_t") long Height); /** * \brief Initializes device memory * * Sets the 2D memory range of \p Width 32-bit values to the specified value * \p ui. \p Height specifies the number of rows to set, and \p dstPitch * specifies the number of bytes between each row. The \p dstDevice pointer * and \p dstPitch offset must be four byte aligned. This function performs * fastest when the pitch is one that has been passed back by * ::cuMemAllocPitch(). * * @param dstDevice - Destination device pointer * @param dstPitch - Pitch of destination device pointer * @param ui - Value to set * @param Width - Width of row * @param Height - Number of rows * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * \notefnerr * \note_memset * * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32Async, * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, * ::cuMemsetD32, ::cuMemsetD32Async */ public static native @Cast("CUresult") int cuMemsetD2D32(@Cast("CUdeviceptr") long dstDevice, @Cast("size_t") long dstPitch, @Cast("unsigned int") int ui, @Cast("size_t") long Width, @Cast("size_t") long Height); /** * \brief Sets device memory * * Sets the memory range of \p N 8-bit values to the specified value * \p uc. 
* * @param dstDevice - Destination device pointer * @param uc - Value to set * @param N - Number of elements * @param hStream - Stream identifier * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * \notefnerr * \note_memset * \note_null_stream * * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, * ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD16Async, * ::cuMemsetD32, ::cuMemsetD32Async */ public static native @Cast("CUresult") int cuMemsetD8Async(@Cast("CUdeviceptr") long dstDevice, @Cast("unsigned char") byte uc, @Cast("size_t") long N, CUstream_st hStream); /** * \brief Sets device memory * * Sets the memory range of \p N 16-bit values to the specified value * \p us. The \p dstDevice pointer must be two byte aligned. * * @param dstDevice - Destination device pointer * @param us - Value to set * @param N - Number of elements * @param hStream - Stream identifier * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * \notefnerr * \note_memset * \note_null_stream * * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, * ::cuMemsetD32, ::cuMemsetD32Async */ public static native @Cast("CUresult") int cuMemsetD16Async(@Cast("CUdeviceptr") long dstDevice, @Cast("unsigned short") short us, @Cast("size_t") long N, CUstream_st hStream); /** * \brief Sets device memory * * Sets the memory range of \p N 32-bit values to the specified value * \p ui. The \p dstDevice pointer must be four byte aligned. 
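* * \par * A minimal JavaCPP-side sketch (an illustration added here, not part of the original CUDA header); it assumes a static import of this class and that {@code dptr} is a valid, four byte aligned device pointer. Passing {@code null} for the stream selects the NULL stream: *
 {@code
   // Hypothetical example: asynchronously fill 1024 32-bit words with a bit pattern.
   int status = cuMemsetD32Async(dptr, 0xDEADBEEF, 1024, null);
 * }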
* * @param dstDevice - Destination device pointer * @param ui - Value to set * @param N - Number of elements * @param hStream - Stream identifier * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * \notefnerr * \note_memset * \note_null_stream * * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, ::cuMemsetD32 */ public static native @Cast("CUresult") int cuMemsetD32Async(@Cast("CUdeviceptr") long dstDevice, @Cast("unsigned int") int ui, @Cast("size_t") long N, CUstream_st hStream); /** * \brief Sets device memory * * Sets the 2D memory range of \p Width 8-bit values to the specified value * \p uc. \p Height specifies the number of rows to set, and \p dstPitch * specifies the number of bytes between each row. This function performs * fastest when the pitch is one that has been passed back by * ::cuMemAllocPitch(). * * @param dstDevice - Destination device pointer * @param dstPitch - Pitch of destination device pointer * @param uc - Value to set * @param Width - Width of row * @param Height - Number of rows * @param hStream - Stream identifier * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * \notefnerr * \note_memset * \note_null_stream * * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, * ::cuMemsetD32, ::cuMemsetD32Async */ public static native @Cast("CUresult") int cuMemsetD2D8Async(@Cast("CUdeviceptr") long dstDevice, @Cast("size_t") long dstPitch, @Cast("unsigned char") byte uc, @Cast("size_t") long Width, @Cast("size_t") long Height, CUstream_st hStream); /** * \brief Sets device memory * * Sets the 2D memory range of \p Width 16-bit values to the specified value * \p us. \p Height specifies the number of rows to set, and \p dstPitch * specifies the number of bytes between each row. The \p dstDevice pointer * and \p dstPitch offset must be two byte aligned. 
This function performs * fastest when the pitch is one that has been passed back by * ::cuMemAllocPitch(). * * @param dstDevice - Destination device pointer * @param dstPitch - Pitch of destination device pointer * @param us - Value to set * @param Width - Width of row * @param Height - Number of rows * @param hStream - Stream identifier * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * \notefnerr * \note_memset * \note_null_stream * * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, * ::cuMemsetD2D16, ::cuMemsetD2D32, ::cuMemsetD2D32Async, * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, * ::cuMemsetD32, ::cuMemsetD32Async */ public static native @Cast("CUresult") int cuMemsetD2D16Async(@Cast("CUdeviceptr") long dstDevice, @Cast("size_t") long dstPitch, @Cast("unsigned short") short us, @Cast("size_t") long Width, @Cast("size_t") long Height, CUstream_st hStream); /** * \brief Sets device memory * * Sets the 2D memory range of \p Width 32-bit values to the specified value * \p ui. \p Height specifies the number of rows to set, and \p dstPitch * specifies the number of bytes between each row. The \p dstDevice pointer * and \p dstPitch offset must be four byte aligned. This function performs * fastest when the pitch is one that has been passed back by * ::cuMemAllocPitch(). 
* * @param dstDevice - Destination device pointer * @param dstPitch - Pitch of destination device pointer * @param ui - Value to set * @param Width - Width of row * @param Height - Number of rows * @param hStream - Stream identifier * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * \notefnerr * \note_memset * \note_null_stream * * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, * ::cuMemsetD32, ::cuMemsetD32Async */ public static native @Cast("CUresult") int cuMemsetD2D32Async(@Cast("CUdeviceptr") long dstDevice, @Cast("size_t") long dstPitch, @Cast("unsigned int") int ui, @Cast("size_t") long Width, @Cast("size_t") long Height, CUstream_st hStream); /** * \brief Creates a 1D or 2D CUDA array * * Creates a CUDA array according to the ::CUDA_ARRAY_DESCRIPTOR structure * \p pAllocateArray and returns a handle to the new CUDA array in \p *pHandle. * The ::CUDA_ARRAY_DESCRIPTOR is defined as: * *
{@code
    typedef struct {
        unsigned int Width;
        unsigned int Height;
        CUarray_format Format;
        unsigned int NumChannels;
    } CUDA_ARRAY_DESCRIPTOR;
 * }
* where: * * - \p Width and \p Height are the width and height of the CUDA array (in * elements); the CUDA array is one-dimensional if height is 0, two-dimensional * otherwise; * - ::Format specifies the format of the elements; ::CUarray_format is * defined as: *
{@code
    typedef enum CUarray_format_enum {
        CU_AD_FORMAT_UNSIGNED_INT8 = 0x01,
        CU_AD_FORMAT_UNSIGNED_INT16 = 0x02,
        CU_AD_FORMAT_UNSIGNED_INT32 = 0x03,
        CU_AD_FORMAT_SIGNED_INT8 = 0x08,
        CU_AD_FORMAT_SIGNED_INT16 = 0x09,
        CU_AD_FORMAT_SIGNED_INT32 = 0x0a,
        CU_AD_FORMAT_HALF = 0x10,
        CU_AD_FORMAT_FLOAT = 0x20
    } CUarray_format;
 *  }
* - \p NumChannels specifies the number of packed components per CUDA array * element; it may be 1, 2, or 4; * * Here are examples of CUDA array descriptions: * * Description for a CUDA array of 2048 floats: *
{@code
    CUDA_ARRAY_DESCRIPTOR desc;
    desc.Format = CU_AD_FORMAT_FLOAT;
    desc.NumChannels = 1;
    desc.Width = 2048;
    desc.Height = 1;
 * }
* * Description for a 64 x 64 CUDA array of floats: *
{@code
    CUDA_ARRAY_DESCRIPTOR desc;
    desc.Format = CU_AD_FORMAT_FLOAT;
    desc.NumChannels = 1;
    desc.Width = 64;
    desc.Height = 64;
 * }
* * Description for a \p width x \p height CUDA array of 64-bit, 4x16-bit * float16's: *
{@code
    CUDA_ARRAY_DESCRIPTOR desc;
    desc.Format = CU_AD_FORMAT_HALF;
    desc.NumChannels = 4;
    desc.Width = width;
    desc.Height = height;
 * }
* * Description for a \p width x \p height CUDA array of 16-bit elements, each * of which is two 8-bit unsigned chars: *
{@code
    CUDA_ARRAY_DESCRIPTOR desc;
    desc.Format = CU_AD_FORMAT_UNSIGNED_INT8;
    desc.NumChannels = 2;
    desc.Width = width;
    desc.Height = height;
 * }
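* * \par * A JavaCPP-side sketch of the 2048-float descriptor above (an illustration added here, not part of the original CUDA header); it assumes a static import of this class, the usual JavaCPP field-named accessor methods on ::CUDA_ARRAY_DESCRIPTOR, and that the {@code @ByPtrPtr} parameter writes the created handle back into the passed object: *
 {@code
   CUDA_ARRAY_DESCRIPTOR desc = new CUDA_ARRAY_DESCRIPTOR();
   desc.Format(CU_AD_FORMAT_FLOAT);     // element format
   desc.NumChannels(1);                 // one packed component per element
   desc.Width(2048);                    // 2048 elements wide
   desc.Height(1);                      // one-dimensional array
   CUarray_st array = new CUarray_st(); // receives the created handle
   int status = cuArrayCreate(array, desc);
 * }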
* * @param pHandle - Returned array * @param pAllocateArray - Array descriptor * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_OUT_OF_MEMORY, * ::CUDA_ERROR_UNKNOWN * \notefnerr * * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32 */ public static native @Cast("CUresult") int cuArrayCreate(@ByPtrPtr CUarray_st pHandle, @Const CUDA_ARRAY_DESCRIPTOR pAllocateArray); /** * \brief Get a 1D or 2D CUDA array descriptor * * Returns in \p *pArrayDescriptor a descriptor containing information on the * format and dimensions of the CUDA array \p hArray. It is useful for * subroutines that have been passed a CUDA array, but need to know the CUDA * array parameters for validation or other purposes. * * @param pArrayDescriptor - Returned array descriptor * @param hArray - Array to get descriptor of * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_INVALID_HANDLE * \notefnerr * * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, * ::cuArrayDestroy, ::cuMemAlloc, ::cuMemAllocHost, * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32 */ public static native @Cast("CUresult") int cuArrayGetDescriptor(CUDA_ARRAY_DESCRIPTOR pArrayDescriptor, CUarray_st hArray); // #endif /* __CUDA_API_VERSION >= 3020 */ /** * \brief Destroys a CUDA array * * Destroys the CUDA array \p hArray. 
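* * \par * A minimal JavaCPP-side sketch (an illustration added here, not part of the original CUDA header); it assumes a static import of this class and that {@code array} is a handle previously returned by ::cuArrayCreate: *
 {@code
   // Hypothetical example: query the array's descriptor, then release the array.
   CUDA_ARRAY_DESCRIPTOR desc = new CUDA_ARRAY_DESCRIPTOR();
   cuArrayGetDescriptor(desc, array);   // e.g. desc.Width(), desc.Height()
   cuArrayDestroy(array);               // fails with CUDA_ERROR_ARRAY_IS_MAPPED if still mapped
 * }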
* * @param hArray - Array to destroy * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_HANDLE, * ::CUDA_ERROR_ARRAY_IS_MAPPED * \notefnerr * * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, * ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32 */ public static native @Cast("CUresult") int cuArrayDestroy(CUarray_st hArray); // #if __CUDA_API_VERSION >= 3020 /** * \brief Creates a 3D CUDA array * * Creates a CUDA array according to the ::CUDA_ARRAY3D_DESCRIPTOR structure * \p pAllocateArray and returns a handle to the new CUDA array in \p *pHandle. * The ::CUDA_ARRAY3D_DESCRIPTOR is defined as: * *
{@code
    typedef struct {
        unsigned int Width;
        unsigned int Height;
        unsigned int Depth;
        CUarray_format Format;
        unsigned int NumChannels;
        unsigned int Flags;
    } CUDA_ARRAY3D_DESCRIPTOR;
 * }
* where: * * - \p Width, \p Height, and \p Depth are the width, height, and depth of the * CUDA array (in elements); the following types of CUDA arrays can be allocated: * - A 1D array is allocated if \p Height and \p Depth extents are both zero. * - A 2D array is allocated if only \p Depth extent is zero. * - A 3D array is allocated if all three extents are non-zero. * - A 1D layered CUDA array is allocated if only \p Height is zero and the * ::CUDA_ARRAY3D_LAYERED flag is set. Each layer is a 1D array. The number * of layers is determined by the depth extent. * - A 2D layered CUDA array is allocated if all three extents are non-zero and * the ::CUDA_ARRAY3D_LAYERED flag is set. Each layer is a 2D array. The number * of layers is determined by the depth extent. * - A cubemap CUDA array is allocated if all three extents are non-zero and the * ::CUDA_ARRAY3D_CUBEMAP flag is set. \p Width must be equal to \p Height, and * \p Depth must be six. A cubemap is a special type of 2D layered CUDA array, * where the six layers represent the six faces of a cube. The order of the six * layers in memory is the same as that listed in ::CUarray_cubemap_face. * - A cubemap layered CUDA array is allocated if all three extents are non-zero, * and both, ::CUDA_ARRAY3D_CUBEMAP and ::CUDA_ARRAY3D_LAYERED flags are set. * \p Width must be equal to \p Height, and \p Depth must be a multiple of six. * A cubemap layered CUDA array is a special type of 2D layered CUDA array that * consists of a collection of cubemaps. The first six layers represent the first * cubemap, the next six layers form the second cubemap, and so on. * * - ::Format specifies the format of the elements; ::CUarray_format is * defined as: *
{@code
    typedef enum CUarray_format_enum {
        CU_AD_FORMAT_UNSIGNED_INT8 = 0x01,
        CU_AD_FORMAT_UNSIGNED_INT16 = 0x02,
        CU_AD_FORMAT_UNSIGNED_INT32 = 0x03,
        CU_AD_FORMAT_SIGNED_INT8 = 0x08,
        CU_AD_FORMAT_SIGNED_INT16 = 0x09,
        CU_AD_FORMAT_SIGNED_INT32 = 0x0a,
        CU_AD_FORMAT_HALF = 0x10,
        CU_AD_FORMAT_FLOAT = 0x20
    } CUarray_format;
 *  }
* * - \p NumChannels specifies the number of packed components per CUDA array * element; it may be 1, 2, or 4; * * - ::Flags may be set to * - ::CUDA_ARRAY3D_LAYERED to enable creation of layered CUDA arrays. If this flag is set, * \p Depth specifies the number of layers, not the depth of a 3D array. * - ::CUDA_ARRAY3D_SURFACE_LDST to enable surface references to be bound to the CUDA array. * If this flag is not set, ::cuSurfRefSetArray will fail when attempting to bind the CUDA array * to a surface reference. * - ::CUDA_ARRAY3D_CUBEMAP to enable creation of cubemaps. If this flag is set, \p Width must be * equal to \p Height, and \p Depth must be six. If the ::CUDA_ARRAY3D_LAYERED flag is also set, * then \p Depth must be a multiple of six. * - ::CUDA_ARRAY3D_TEXTURE_GATHER to indicate that the CUDA array will be used for texture gather. * Texture gather can only be performed on 2D CUDA arrays. * * \p Width, \p Height and \p Depth must meet certain size requirements as listed in the following table. * All values are specified in elements. Note that for brevity's sake, the full name of the device attribute * is not specified. For ex., TEXTURE1D_WIDTH refers to the device attribute * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH. * * Note that 2D CUDA arrays have different size requirements if the ::CUDA_ARRAY3D_TEXTURE_GATHER flag * is set. \p Width and \p Height must not be greater than ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_WIDTH * and ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_HEIGHT respectively, in that case. * * * * * * * * * * * * * * * * * * * * * * * * * * *
 CUDA array type | Valid extents that must always be met {(width range in elements), (height range), (depth range)} | Valid extents with CUDA_ARRAY3D_SURFACE_LDST set {(width range in elements), (height range), (depth range)}
 1D | { (1,TEXTURE1D_WIDTH), 0, 0 } | { (1,SURFACE1D_WIDTH), 0, 0 }
 2D | { (1,TEXTURE2D_WIDTH), (1,TEXTURE2D_HEIGHT), 0 } | { (1,SURFACE2D_WIDTH), (1,SURFACE2D_HEIGHT), 0 }
 3D | { (1,TEXTURE3D_WIDTH), (1,TEXTURE3D_HEIGHT), (1,TEXTURE3D_DEPTH) } OR { (1,TEXTURE3D_WIDTH_ALTERNATE), (1,TEXTURE3D_HEIGHT_ALTERNATE), (1,TEXTURE3D_DEPTH_ALTERNATE) } | { (1,SURFACE3D_WIDTH), (1,SURFACE3D_HEIGHT), (1,SURFACE3D_DEPTH) }
 1D Layered | { (1,TEXTURE1D_LAYERED_WIDTH), 0, (1,TEXTURE1D_LAYERED_LAYERS) } | { (1,SURFACE1D_LAYERED_WIDTH), 0, (1,SURFACE1D_LAYERED_LAYERS) }
 2D Layered | { (1,TEXTURE2D_LAYERED_WIDTH), (1,TEXTURE2D_LAYERED_HEIGHT), (1,TEXTURE2D_LAYERED_LAYERS) } | { (1,SURFACE2D_LAYERED_WIDTH), (1,SURFACE2D_LAYERED_HEIGHT), (1,SURFACE2D_LAYERED_LAYERS) }
 Cubemap | { (1,TEXTURECUBEMAP_WIDTH), (1,TEXTURECUBEMAP_WIDTH), 6 } | { (1,SURFACECUBEMAP_WIDTH), (1,SURFACECUBEMAP_WIDTH), 6 }
 Cubemap Layered | { (1,TEXTURECUBEMAP_LAYERED_WIDTH), (1,TEXTURECUBEMAP_LAYERED_WIDTH), (1,TEXTURECUBEMAP_LAYERED_LAYERS) } | { (1,SURFACECUBEMAP_LAYERED_WIDTH), (1,SURFACECUBEMAP_LAYERED_WIDTH), (1,SURFACECUBEMAP_LAYERED_LAYERS) }
* * Here are examples of CUDA array descriptions: * * Description for a CUDA array of 2048 floats: *
{@code
    CUDA_ARRAY3D_DESCRIPTOR desc;
    desc.Format = CU_AD_FORMAT_FLOAT;
    desc.NumChannels = 1;
    desc.Width = 2048;
    desc.Height = 0;
    desc.Depth = 0;
 * }
* * Description for a 64 x 64 CUDA array of floats: *
{@code
    CUDA_ARRAY3D_DESCRIPTOR desc;
    desc.Format = CU_AD_FORMAT_FLOAT;
    desc.NumChannels = 1;
    desc.Width = 64;
    desc.Height = 64;
    desc.Depth = 0;
 * }
* * Description for a \p width x \p height x \p depth CUDA array of 64-bit, * 4x16-bit float16's: *
{@code
    CUDA_ARRAY3D_DESCRIPTOR desc;
    desc.Format = CU_AD_FORMAT_HALF;
    desc.NumChannels = 4;
    desc.Width = width;
    desc.Height = height;
    desc.Depth = depth;
 * }
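* * \par * A JavaCPP-side sketch of a 64 x 64 x 16 float volume (an illustration added here, not part of the original CUDA header); it assumes a static import of this class, the usual JavaCPP field-named accessor methods on ::CUDA_ARRAY3D_DESCRIPTOR, and that the {@code @ByPtrPtr} parameter writes the created handle back into the passed object: *
 {@code
   CUDA_ARRAY3D_DESCRIPTOR desc = new CUDA_ARRAY3D_DESCRIPTOR();
   desc.Format(CU_AD_FORMAT_FLOAT);
   desc.NumChannels(1);
   desc.Width(64);
   desc.Height(64);
   desc.Depth(16);                       // non-zero depth selects a 3D array
   desc.Flags(0);
   CUarray_st volume = new CUarray_st(); // receives the created handle
   int status = cuArray3DCreate(volume, desc);
 * }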
* * @param pHandle - Returned array * @param pAllocateArray - 3D array descriptor * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_OUT_OF_MEMORY, * ::CUDA_ERROR_UNKNOWN * \notefnerr * * \sa ::cuArray3DGetDescriptor, ::cuArrayCreate, * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32 */ public static native @Cast("CUresult") int cuArray3DCreate(@ByPtrPtr CUarray_st pHandle, @Const CUDA_ARRAY3D_DESCRIPTOR pAllocateArray); /** * \brief Get a 3D CUDA array descriptor * * Returns in \p *pArrayDescriptor a descriptor containing information on the * format and dimensions of the CUDA array \p hArray. It is useful for * subroutines that have been passed a CUDA array, but need to know the CUDA * array parameters for validation or other purposes. * * This function may be called on 1D and 2D arrays, in which case the \p Height * and/or \p Depth members of the descriptor struct will be set to 0. * * @param pArrayDescriptor - Returned 3D array descriptor * @param hArray - 3D array to get descriptor of * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_INVALID_HANDLE * \notefnerr * * \sa ::cuArray3DCreate, ::cuArrayCreate, * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32 */ public static native @Cast("CUresult") int cuArray3DGetDescriptor(CUDA_ARRAY3D_DESCRIPTOR pArrayDescriptor, CUarray_st hArray); // #endif /* __CUDA_API_VERSION >= 3020 */ // #if __CUDA_API_VERSION >= 5000 /** * \brief Creates a CUDA mipmapped array * * Creates a CUDA mipmapped array according to the ::CUDA_ARRAY3D_DESCRIPTOR structure * \p pMipmappedArrayDesc and returns a handle to the new CUDA mipmapped array in \p *pHandle. * \p numMipmapLevels specifies the number of mipmap levels to be allocated. This value is * clamped to the range [1, 1 + floor(log2(max(width, height, depth)))]. * * The ::CUDA_ARRAY3D_DESCRIPTOR is defined as: * *
{@code
    typedef struct {
        unsigned int Width;
        unsigned int Height;
        unsigned int Depth;
        CUarray_format Format;
        unsigned int NumChannels;
        unsigned int Flags;
    } CUDA_ARRAY3D_DESCRIPTOR;
 * }
* where: * * - \p Width, \p Height, and \p Depth are the width, height, and depth of the * CUDA array (in elements); the following types of CUDA arrays can be allocated: * - A 1D mipmapped array is allocated if \p Height and \p Depth extents are both zero. * - A 2D mipmapped array is allocated if only \p Depth extent is zero. * - A 3D mipmapped array is allocated if all three extents are non-zero. * - A 1D layered CUDA mipmapped array is allocated if only \p Height is zero and the * ::CUDA_ARRAY3D_LAYERED flag is set. Each layer is a 1D array. The number * of layers is determined by the depth extent. * - A 2D layered CUDA mipmapped array is allocated if all three extents are non-zero and * the ::CUDA_ARRAY3D_LAYERED flag is set. Each layer is a 2D array. The number * of layers is determined by the depth extent. * - A cubemap CUDA mipmapped array is allocated if all three extents are non-zero and the * ::CUDA_ARRAY3D_CUBEMAP flag is set. \p Width must be equal to \p Height, and * \p Depth must be six. A cubemap is a special type of 2D layered CUDA array, * where the six layers represent the six faces of a cube. The order of the six * layers in memory is the same as that listed in ::CUarray_cubemap_face. * - A cubemap layered CUDA mipmapped array is allocated if all three extents are non-zero, * and both, ::CUDA_ARRAY3D_CUBEMAP and ::CUDA_ARRAY3D_LAYERED flags are set. * \p Width must be equal to \p Height, and \p Depth must be a multiple of six. * A cubemap layered CUDA array is a special type of 2D layered CUDA array that * consists of a collection of cubemaps. The first six layers represent the first * cubemap, the next six layers form the second cubemap, and so on. * * - ::Format specifies the format of the elements; ::CUarray_format is * defined as: *
{@code
    typedef enum CUarray_format_enum {
        CU_AD_FORMAT_UNSIGNED_INT8 = 0x01,
        CU_AD_FORMAT_UNSIGNED_INT16 = 0x02,
        CU_AD_FORMAT_UNSIGNED_INT32 = 0x03,
        CU_AD_FORMAT_SIGNED_INT8 = 0x08,
        CU_AD_FORMAT_SIGNED_INT16 = 0x09,
        CU_AD_FORMAT_SIGNED_INT32 = 0x0a,
        CU_AD_FORMAT_HALF = 0x10,
        CU_AD_FORMAT_FLOAT = 0x20
    } CUarray_format;
 *  }
* * - \p NumChannels specifies the number of packed components per CUDA array * element; it may be 1, 2, or 4; * * - ::Flags may be set to * - ::CUDA_ARRAY3D_LAYERED to enable creation of layered CUDA mipmapped arrays. If this flag is set, * \p Depth specifies the number of layers, not the depth of a 3D array. * - ::CUDA_ARRAY3D_SURFACE_LDST to enable surface references to be bound to individual mipmap levels of * the CUDA mipmapped array. If this flag is not set, ::cuSurfRefSetArray will fail when attempting to * bind a mipmap level of the CUDA mipmapped array to a surface reference. * - ::CUDA_ARRAY3D_CUBEMAP to enable creation of mipmapped cubemaps. If this flag is set, \p Width must be * equal to \p Height, and \p Depth must be six. If the ::CUDA_ARRAY3D_LAYERED flag is also set, * then \p Depth must be a multiple of six. * - ::CUDA_ARRAY3D_TEXTURE_GATHER to indicate that the CUDA mipmapped array will be used for texture gather. * Texture gather can only be performed on 2D CUDA mipmapped arrays. * * \p Width, \p Height and \p Depth must meet certain size requirements as listed in the following table. * All values are specified in elements. Note that for brevity's sake, the full name of the device attribute * is not specified. For ex., TEXTURE1D_MIPMAPPED_WIDTH refers to the device attribute * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH. * * * * * * * * * * * * * * * * * * *
 CUDA array type | Valid extents that must always be met {(width range in elements), (height range), (depth range)}
 1D | { (1,TEXTURE1D_MIPMAPPED_WIDTH), 0, 0 }
 2D | { (1,TEXTURE2D_MIPMAPPED_WIDTH), (1,TEXTURE2D_MIPMAPPED_HEIGHT), 0 }
 3D | { (1,TEXTURE3D_WIDTH), (1,TEXTURE3D_HEIGHT), (1,TEXTURE3D_DEPTH) } OR { (1,TEXTURE3D_WIDTH_ALTERNATE), (1,TEXTURE3D_HEIGHT_ALTERNATE), (1,TEXTURE3D_DEPTH_ALTERNATE) }
 1D Layered | { (1,TEXTURE1D_LAYERED_WIDTH), 0, (1,TEXTURE1D_LAYERED_LAYERS) }
 2D Layered | { (1,TEXTURE2D_LAYERED_WIDTH), (1,TEXTURE2D_LAYERED_HEIGHT), (1,TEXTURE2D_LAYERED_LAYERS) }
 Cubemap | { (1,TEXTURECUBEMAP_WIDTH), (1,TEXTURECUBEMAP_WIDTH), 6 }
 Cubemap Layered | { (1,TEXTURECUBEMAP_LAYERED_WIDTH), (1,TEXTURECUBEMAP_LAYERED_WIDTH), (1,TEXTURECUBEMAP_LAYERED_LAYERS) }
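* * \par * A minimal JavaCPP-side sketch (an illustration added here, not part of the original CUDA header); it assumes a static import of this class and a previously filled ::CUDA_ARRAY3D_DESCRIPTOR {@code desc}, for example the 64 x 64 float descriptor shown for ::cuArray3DCreate: *
 {@code
   CUmipmappedArray_st mipmap = new CUmipmappedArray_st();  // receives the handle
   int status = cuMipmappedArrayCreate(mipmap, desc, 7);    // level count is clamped as documented
   CUarray_st level0 = new CUarray_st();
   status = cuMipmappedArrayGetLevel(level0, mipmap, 0);    // finest mipmap level
   // work with level0 like any other CUDA array, then clean up
   cuMipmappedArrayDestroy(mipmap);
 * }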
* * * @param pHandle - Returned mipmapped array * @param pMipmappedArrayDesc - mipmapped array descriptor * @param numMipmapLevels - Number of mipmap levels * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_OUT_OF_MEMORY, * ::CUDA_ERROR_UNKNOWN * \notefnerr * * \sa ::cuMipmappedArrayDestroy, ::cuMipmappedArrayGetLevel, ::cuArrayCreate, */ public static native @Cast("CUresult") int cuMipmappedArrayCreate(@ByPtrPtr CUmipmappedArray_st pHandle, @Const CUDA_ARRAY3D_DESCRIPTOR pMipmappedArrayDesc, @Cast("unsigned int") int numMipmapLevels); /** * \brief Gets a mipmap level of a CUDA mipmapped array * * Returns in \p *pLevelArray a CUDA array that represents a single mipmap level * of the CUDA mipmapped array \p hMipmappedArray. * * If \p level is greater than the maximum number of levels in this mipmapped array, * ::CUDA_ERROR_INVALID_VALUE is returned. * * @param pLevelArray - Returned mipmap level CUDA array * @param hMipmappedArray - CUDA mipmapped array * @param level - Mipmap level * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_INVALID_HANDLE * \notefnerr * * \sa ::cuMipmappedArrayCreate, ::cuMipmappedArrayDestroy, ::cuArrayCreate, */ public static native @Cast("CUresult") int cuMipmappedArrayGetLevel(@ByPtrPtr CUarray_st pLevelArray, CUmipmappedArray_st hMipmappedArray, @Cast("unsigned int") int level); /** * \brief Destroys a CUDA mipmapped array * * Destroys the CUDA mipmapped array \p hMipmappedArray. * * @param hMipmappedArray - Mipmapped array to destroy * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_HANDLE, * ::CUDA_ERROR_ARRAY_IS_MAPPED * \notefnerr * * \sa ::cuMipmappedArrayCreate, ::cuMipmappedArrayGetLevel, ::cuArrayCreate, */ public static native @Cast("CUresult") int cuMipmappedArrayDestroy(CUmipmappedArray_st hMipmappedArray); // #endif /* __CUDA_API_VERSION >= 5000 */ /** \} */ /* END CUDA_MEM */ /** * \defgroup CUDA_UNIFIED Unified Addressing * * ___MANBRIEF___ unified addressing functions of the low-level CUDA driver * API (___CURRENT_FILE___) ___ENDMANBRIEF___ * * This section describes the unified addressing functions of the * low-level CUDA driver application programming interface. * * \{ * * \section CUDA_UNIFIED_overview Overview * * CUDA devices can share a unified address space with the host. * For these devices there is no distinction between a device * pointer and a host pointer -- the same pointer value may be * used to access memory from the host program and from a kernel * running on the device (with exceptions enumerated below). * * \section CUDA_UNIFIED_support Supported Platforms * * Whether or not a device supports unified addressing may be * queried by calling ::cuDeviceGetAttribute() with the device * attribute ::CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING. * * Unified addressing is automatically enabled in 64-bit processes * on devices with compute capability greater than or equal to 2.0. * * \section CUDA_UNIFIED_lookup Looking Up Information from Pointer Values * * It is possible to look up information about the memory which backs a * pointer value. For instance, one may want to know if a pointer points * to host or device memory. 
As another example, in the case of device * memory, one may want to know on which CUDA device the memory * resides. These properties may be queried using the function * ::cuPointerGetAttribute() * * Since pointers are unique, it is not necessary to specify information * about the pointers specified to the various copy functions in the * CUDA API. The function ::cuMemcpy() may be used to perform a copy * between two pointers, ignoring whether they point to host or device * memory (making ::cuMemcpyHtoD(), ::cuMemcpyDtoD(), and ::cuMemcpyDtoH() * unnecessary for devices supporting unified addressing). For * multidimensional copies, the memory type ::CU_MEMORYTYPE_UNIFIED may be * used to specify that the CUDA driver should infer the location of the * pointer from its value. * * \section CUDA_UNIFIED_automaphost Automatic Mapping of Host Allocated Host Memory * * All host memory allocated in all contexts using ::cuMemAllocHost() and * ::cuMemHostAlloc() is always directly accessible from all contexts on * all devices that support unified addressing. This is the case regardless * of whether or not the flags ::CU_MEMHOSTALLOC_PORTABLE and * ::CU_MEMHOSTALLOC_DEVICEMAP are specified. * * The pointer value through which allocated host memory may be accessed * in kernels on all devices that support unified addressing is the same * as the pointer value through which that memory is accessed on the host, * so it is not necessary to call ::cuMemHostGetDevicePointer() to get the device * pointer for these allocations. * * Note that this is not the case for memory allocated using the flag * ::CU_MEMHOSTALLOC_WRITECOMBINED, as discussed below. * * \section CUDA_UNIFIED_autopeerregister Automatic Registration of Peer Memory * * Upon enabling direct access from a context that supports unified addressing * to another peer context that supports unified addressing using * ::cuCtxEnablePeerAccess() all memory allocated in the peer context using * ::cuMemAlloc() and ::cuMemAllocPitch() will immediately be accessible * by the current context. The device pointer value through * which any peer memory may be accessed in the current context * is the same pointer value through which that memory may be * accessed in the peer context. * * \section CUDA_UNIFIED_exceptions Exceptions, Disjoint Addressing * * Not all memory may be accessed on devices through the same pointer * value through which they are accessed on the host. These exceptions * are host memory registered using ::cuMemHostRegister() and host memory * allocated using the flag ::CU_MEMHOSTALLOC_WRITECOMBINED. For these * exceptions, there exists a distinct host and device address for the * memory. The device address is guaranteed to not overlap any valid host * pointer range and is guaranteed to have the same value across all * contexts that support unified addressing. * * This device address may be queried using ::cuMemHostGetDevicePointer() * when a context using unified addressing is current. Either the host * or the unified device pointer value may be used to refer to this memory * through ::cuMemcpy() and similar functions using the * ::CU_MEMORYTYPE_UNIFIED memory type. * */ // #if __CUDA_API_VERSION >= 4000 /** * \brief Returns information about a pointer * * The supported attributes are: * * - ::CU_POINTER_ATTRIBUTE_CONTEXT: * * Returns in \p *data the ::CUcontext in which \p ptr was allocated or * registered. * The type of \p data must be ::CUcontext *. 
* * If \p ptr was not allocated by, mapped by, or registered with * a ::CUcontext which uses unified virtual addressing then * ::CUDA_ERROR_INVALID_VALUE is returned. * * - ::CU_POINTER_ATTRIBUTE_MEMORY_TYPE: * * Returns in \p *data the physical memory type of the memory that * \p ptr addresses as a ::CUmemorytype enumerated value. * The type of \p data must be unsigned int. * * If \p ptr addresses device memory then \p *data is set to * ::CU_MEMORYTYPE_DEVICE. The particular ::CUdevice on which the * memory resides is the ::CUdevice of the ::CUcontext returned by the * ::CU_POINTER_ATTRIBUTE_CONTEXT attribute of \p ptr. * * If \p ptr addresses host memory then \p *data is set to * ::CU_MEMORYTYPE_HOST. * * If \p ptr was not allocated by, mapped by, or registered with * a ::CUcontext which uses unified virtual addressing then * ::CUDA_ERROR_INVALID_VALUE is returned. * * If the current ::CUcontext does not support unified virtual * addressing then ::CUDA_ERROR_INVALID_CONTEXT is returned. * * - ::CU_POINTER_ATTRIBUTE_DEVICE_POINTER: * * Returns in \p *data the device pointer value through which * \p ptr may be accessed by kernels running in the current * ::CUcontext. * The type of \p data must be CUdeviceptr *. * * If there exists no device pointer value through which * kernels running in the current ::CUcontext may access * \p ptr then ::CUDA_ERROR_INVALID_VALUE is returned. * * If there is no current ::CUcontext then * ::CUDA_ERROR_INVALID_CONTEXT is returned. * * Except in the exceptional disjoint addressing cases discussed * below, the value returned in \p *data will equal the input * value \p ptr. * * - ::CU_POINTER_ATTRIBUTE_HOST_POINTER: * * Returns in \p *data the host pointer value through which * \p ptr may be accessed by by the host program. * The type of \p data must be void **. * If there exists no host pointer value through which * the host program may directly access \p ptr then * ::CUDA_ERROR_INVALID_VALUE is returned. * * Except in the exceptional disjoint addressing cases discussed * below, the value returned in \p *data will equal the input * value \p ptr. * * - ::CU_POINTER_ATTRIBUTE_P2P_TOKENS: * * Returns in \p *data two tokens for use with the nv-p2p.h Linux * kernel interface. \p data must be a struct of type * CUDA_POINTER_ATTRIBUTE_P2P_TOKENS. * * \p ptr must be a pointer to memory obtained from :cuMemAlloc(). * Note that p2pToken and vaSpaceToken are only valid for the * lifetime of the source allocation. A subsequent allocation at * the same address may return completely different tokens. * Querying this attribute has a side effect of setting the attribute * ::CU_POINTER_ATTRIBUTE_SYNC_MEMOPS for the region of memory that * \p ptr points to. * * - ::CU_POINTER_ATTRIBUTE_SYNC_MEMOPS: * * A boolean attribute which when set, ensures that synchronous memory operations * initiated on the region of memory that \p ptr points to will always synchronize. * See further documentation in the section titled "API synchronization behavior" * to learn more about cases when synchronous memory operations can * exhibit asynchronous behavior. * * - ::CU_POINTER_ATTRIBUTE_BUFFER_ID: * * Returns in \p *data a buffer ID which is guaranteed to be unique within the process. * \p data must point to an unsigned long long. * * \p ptr must be a pointer to memory obtained from a CUDA memory allocation API. * Every memory allocation from any of the CUDA memory allocation APIs will * have a unique ID over a process lifetime. 
Subsequent allocations do not reuse IDs * from previous freed allocations. IDs are only unique within a single process. * * * - ::CU_POINTER_ATTRIBUTE_IS_MANAGED: * * Returns in \p *data a boolean that indicates whether the pointer points to * managed memory or not. * * \par * * Note that for most allocations in the unified virtual address space * the host and device pointer for accessing the allocation will be the * same. The exceptions to this are * - user memory registered using ::cuMemHostRegister * - host memory allocated using ::cuMemHostAlloc with the * ::CU_MEMHOSTALLOC_WRITECOMBINED flag * For these types of allocation there will exist separate, disjoint host * and device addresses for accessing the allocation. In particular * - The host address will correspond to an invalid unmapped device address * (which will result in an exception if accessed from the device) * - The device address will correspond to an invalid unmapped host address * (which will result in an exception if accessed from the host). * For these types of allocations, querying ::CU_POINTER_ATTRIBUTE_HOST_POINTER * and ::CU_POINTER_ATTRIBUTE_DEVICE_POINTER may be used to retrieve the host * and device addresses from either address. * * @param data - Returned pointer attribute value * @param attribute - Pointer attribute to query * @param ptr - Pointer * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_INVALID_DEVICE * \notefnerr * * \sa cuPointerSetAttribute, * ::cuMemAlloc, * ::cuMemFree, * ::cuMemAllocHost, * ::cuMemFreeHost, * ::cuMemHostAlloc, * ::cuMemHostRegister, * ::cuMemHostUnregister */ public static native @Cast("CUresult") int cuPointerGetAttribute(Pointer data, @Cast("CUpointer_attribute") int attribute, @Cast("CUdeviceptr") long ptr); // #endif /* __CUDA_API_VERSION >= 4000 */ // #if __CUDA_API_VERSION >= 8000 /** * \brief Prefetches memory to the specified destination device * * Prefetches memory to the specified destination device. \p devPtr is the * base device pointer of the memory to be prefetched and \p dstDevice is the * destination device. \p count specifies the number of bytes to copy. \p hStream * is the stream in which the operation is enqueued. The memory range must refer * to managed memory allocated via ::cuMemAllocManaged or declared via __managed__ variables. * * Passing in CU_DEVICE_CPU for \p dstDevice will prefetch the data to host memory. If * \p dstDevice is a GPU, then the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS * must be non-zero. Additionally, \p hStream must be associated with a device that has a * non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. * * The start address and end address of the memory range will be rounded down and rounded up * respectively to be aligned to CPU page size before the prefetch operation is enqueued * in the stream. * * If no physical memory has been allocated for this region, then this memory region * will be populated and mapped on the destination device. If there's insufficient * memory to prefetch the desired region, the Unified Memory driver may evict pages from other * ::cuMemAllocManaged allocations to host memory in order to make room. Device memory * allocated using ::cuMemAlloc or ::cuArrayCreate will not be evicted. 
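* * \par * A minimal JavaCPP-side sketch (an illustration added here, not part of the original CUDA header); it assumes a static import of this class, that {@code managed} and {@code numBytes} describe a managed allocation (for example from ::cuMemAllocManaged), and that {@code device} is the ordinal of a GPU supporting concurrent managed access: *
 {@code
   // Hypothetical example: migrate the managed range onto the GPU ahead of use,
   // enqueued on the NULL stream.
   int status = cuMemPrefetchAsync(managed, numBytes, device, null);
 * }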
* * By default, any mappings to the previous location of the migrated pages are removed and * mappings for the new location are only setup on \p dstDevice. The exact behavior however * also depends on the settings applied to this memory range via ::cuMemAdvise as described * below: * * If ::CU_MEM_ADVISE_SET_READ_MOSTLY was set on any subset of this memory range, * then that subset will create a read-only copy of the pages on \p dstDevice. * * If ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION was called on any subset of this memory * range, then the pages will be migrated to \p dstDevice even if \p dstDevice is not the * preferred location of any pages in the memory range. * * If ::CU_MEM_ADVISE_SET_ACCESSED_BY was called on any subset of this memory range, * then mappings to those pages from all the appropriate processors are updated to * refer to the new location if establishing such a mapping is possible. Otherwise, * those mappings are cleared. * * Note that this API is not required for functionality and only serves to improve performance * by allowing the application to migrate data to a suitable location before it is accessed. * Memory accesses to this range are always coherent and are allowed even when the data is * actively being migrated. * * Note that this function is asynchronous with respect to the host and all work * on other devices. * * @param devPtr - Pointer to be prefetched * @param count - Size in bytes * @param dstDevice - Destination device to prefetch to * @param hStream - Stream to enqueue prefetch operation * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_INVALID_DEVICE * \notefnerr * \note_async * \note_null_stream * * \sa ::cuMemcpy, ::cuMemcpyPeer, ::cuMemcpyAsync, * ::cuMemcpy3DPeerAsync, ::cuMemAdvise */ public static native @Cast("CUresult") int cuMemPrefetchAsync(@Cast("CUdeviceptr") long devPtr, @Cast("size_t") long count, @Cast("CUdevice") int dstDevice, CUstream_st hStream); /** * \brief Advise about the usage of a given memory range * * Advise the Unified Memory subsystem about the usage pattern for the memory range * starting at \p devPtr with a size of \p count bytes. The start address and end address of the memory * range will be rounded down and rounded up respectively to be aligned to CPU page size before the * advice is applied. The memory range must refer to managed memory allocated via ::cuMemAllocManaged * or declared via __managed__ variables. * * The \p advice parameter can take the following values: * - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read * from and only occasionally written to. Any read accesses from any processor to this region will create a * read-only copy of at least the accessed pages in that processor's memory. Additionally, if ::cuMemPrefetchAsync * is called on this region, it will create a read-only copy of the data on the destination processor. * If any processor writes to this region, all copies of the corresponding page will be invalidated * except for the one where the write occurred. The \p device argument is ignored for this advice. * Note that for a page to be read-duplicated, the accessing processor must either be the CPU or a GPU * that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. * Also, if a context is created on a device that does not have the device attribute * ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS set, then read-duplication will not occur until * all such contexts are destroyed. 
* - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY and also prevents the * Unified Memory driver from attempting heuristic read-duplication on the memory range. Any read-duplicated * copies of the data will be collapsed into a single copy. The location for the collapsed * copy will be the preferred location if the page has a preferred location and one of the read-duplicated * copies was resident at that location. Otherwise, the location chosen is arbitrary. * - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the * data to be the memory belonging to \p device. Passing in CU_DEVICE_CPU for \p device sets the * preferred location as host memory. If \p device is a GPU, then it must have a non-zero value for the * device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. Setting the preferred location * does not cause data to migrate to that location immediately. Instead, it guides the migration policy * when a fault occurs on that memory region. If the data is already in its preferred location and the * faulting processor can establish a mapping without requiring the data to be migrated, then * data migration will be avoided. On the other hand, if the data is not in its preferred location * or if a direct mapping cannot be established, then it will be migrated to the processor accessing * it. It is important to note that setting the preferred location does not prevent data prefetching * done using ::cuMemPrefetchAsync. * Having a preferred location can override the page thrash detection and resolution logic in the Unified * Memory driver. Normally, if a page is detected to be constantly thrashing between for example host and device * memory, the page may eventually be pinned to host memory by the Unified Memory driver. But * if the preferred location is set as device memory, then the page will continue to thrash indefinitely. * If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the * policies associated with that advice will override the policies of this advice. * - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION * and changes the preferred location to none. * - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by \p device. * Passing in ::CU_DEVICE_CPU for \p device will set the advice for the CPU. If \p device is a GPU, then * the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS must be non-zero. * This advice does not cause data migration and has no impact on the location of the data per se. Instead, * it causes the data to always be mapped in the specified processor's page tables, as long as the * location of the data permits a mapping to be established. If the data gets migrated for any reason, * the mappings are updated accordingly. * This advice is recommended in scenarios where data locality is not important, but avoiding faults is. * Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the * data located on one GPU is occasionally accessed by peer GPUs. In such scenarios, migrating data * over to the other GPUs is not as important because the accesses are infrequent and the overhead of * migration may be too high. But preventing faults can still help improve performance, and so having * a mapping set up in advance is useful. 
Note that on CPU access of this data, the data may be migrated * to host memory because the CPU typically cannot access device memory directly. Any GPU that had the * ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the * page in host memory. * If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the * policies associated with that advice will override the policies of this advice. Additionally, if the * preferred location of this memory region or any subset of it is also \p device, then the policies * associated with ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION will override the policies of this advice. * - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of ::CU_MEM_ADVISE_SET_ACCESSED_BY. Any mappings to * the data from \p device may be removed at any time causing accesses to result in non-fatal page faults. * * @param devPtr - Pointer to memory to set the advice for * @param count - Size in bytes of the memory range * @param advice - Advice to be applied for the specified memory range * @param device - Device to apply the advice for * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_INVALID_DEVICE * \notefnerr * \note_async * \note_null_stream * * \sa ::cuMemcpy, ::cuMemcpyPeer, ::cuMemcpyAsync, * ::cuMemcpy3DPeerAsync, ::cuMemPrefetchAsync */ public static native @Cast("CUresult") int cuMemAdvise(@Cast("CUdeviceptr") long devPtr, @Cast("size_t") long count, @Cast("CUmem_advise") int advice, @Cast("CUdevice") int device); /** * \brief Query an attribute of a given memory range * * Query an attribute about the memory range starting at \p devPtr with a size of \p count bytes. The * memory range must refer to managed memory allocated via ::cuMemAllocManaged or declared via * __managed__ variables. * * The \p attribute parameter can take the following values: * - ::CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY: If this attribute is specified, \p data will be interpreted * as a 32-bit integer, and \p dataSize must be 4. The result returned will be 1 if all pages in the given * memory range have read-duplication enabled, or 0 otherwise. * - ::CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION: If this attribute is specified, \p data will be * interpreted as a 32-bit integer, and \p dataSize must be 4. The result returned will be a GPU device * id if all pages in the memory range have that GPU as their preferred location, or it will be CU_DEVICE_CPU * if all pages in the memory range have the CPU as their preferred location, or it will be CU_DEVICE_INVALID * if either all the pages don't have the same preferred location or some of the pages don't have a * preferred location at all. Note that the actual location of the pages in the memory range at the time of * the query may be different from the preferred location. * - ::CU_MEM_RANGE_ATTRIBUTE_ACCESSED_BY: If this attribute is specified, \p data will be interpreted * as an array of 32-bit integers, and \p dataSize must be a non-zero multiple of 4. The result returned * will be a list of device ids that had ::CU_MEM_ADVISE_SET_ACCESSED_BY set for that entire memory range. * If any device does not have that advice set for the entire memory range, that device will not be included. * If \p data is larger than the number of devices that have that advice set for that memory range, * CU_DEVICE_INVALID will be returned in all the extra space provided. For ex., if \p dataSize is 12 * (i.e. 
\p data has 3 elements) and only device 0 has the advice set, then the result returned will be * { 0, CU_DEVICE_INVALID, CU_DEVICE_INVALID }. If \p data is smaller than the number of devices that have * that advice set, then only as many devices will be returned as can fit in the array. There is no * guarantee on which specific devices will be returned, however. * - ::CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION: If this attribute is specified, \p data will be * interpreted as a 32-bit integer, and \p dataSize must be 4. The result returned will be the last location * to which all pages in the memory range were prefetched explicitly via ::cuMemPrefetchAsync. This will either be * a GPU id or CU_DEVICE_CPU depending on whether the last location for prefetch was a GPU or the CPU * respectively. If any page in the memory range was never explicitly prefetched or if all pages were not * prefetched to the same location, CU_DEVICE_INVALID will be returned. Note that this simply returns the * last location that the applicaton requested to prefetch the memory range to. It gives no indication as to * whether the prefetch operation to that location has completed or even begun. * * @param data - A pointers to a memory location where the result * of each attribute query will be written to. * @param dataSize - Array containing the size of data * @param attribute - The attribute to query * @param devPtr - Start of the range to query * @param count - Size of the range to query * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_INVALID_DEVICE * \notefnerr * \note_async * \note_null_stream * * \sa ::cuMemRangeGetAttributes, ::cuMemPrefetchAsync, * ::cuMemAdvise */ /** * \brief Query attributes of a given memory range. * * Query attributes of the memory range starting at \p devPtr with a size of \p count bytes. The * memory range must refer to managed memory allocated via ::cuMemAllocManaged or declared via * __managed__ variables. The \p attributes array will be interpreted to have \p numAttributes * entries. The \p dataSizes array will also be interpreted to have \p numAttributes entries. * The results of the query will be stored in \p data. * * The list of supported attributes are given below. Please refer to ::cuMemRangeGetAttribute for * attribute descriptions and restrictions. * * - ::CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY * - ::CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION * - ::CU_MEM_RANGE_ATTRIBUTE_ACCESSED_BY * - ::CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION * * @param data - A two-dimensional array containing pointers to memory * locations where the result of each attribute query will be written to. * @param dataSizes - Array containing the sizes of each result * @param attributes - An array of attributes to query * (numAttributes and the number of attributes in this array should match) * @param numAttributes - Number of attributes to query * @param devPtr - Start of the range to query * @param count - Size of the range to query * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_INVALID_DEVICE * \notefnerr * * \sa ::cuMemRangeGetAttribute, ::cuMemAdvise * ::cuMemPrefetchAsync */ // #endif /* __CUDA_API_VERSION >= 8000 */ // #if __CUDA_API_VERSION >= 6000 /** * \brief Set attributes on a previously allocated memory region * * The supported attributes are: * * - ::CU_POINTER_ATTRIBUTE_SYNC_MEMOPS: * * A boolean attribute that can either be set (1) or unset (0). 
When set, * the region of memory that \p ptr points to is guaranteed to always synchronize * memory operations that are synchronous. If there are some previously initiated * synchronous memory operations that are pending when this attribute is set, the * function does not return until those memory operations are complete. * See further documentation in the section titled "API synchronization behavior" * to learn more about cases when synchronous memory operations can * exhibit asynchronous behavior. * \p value will be considered as a pointer to an unsigned integer to which this attribute is to be set. * * @param value - Pointer to memory containing the value to be set * @param attribute - Pointer attribute to set * @param ptr - Pointer to a memory region allocated using CUDA memory allocation APIs * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_INVALID_DEVICE * \notefnerr * * \sa ::cuPointerGetAttribute, * ::cuPointerGetAttributes, * ::cuMemAlloc, * ::cuMemFree, * ::cuMemAllocHost, * ::cuMemFreeHost, * ::cuMemHostAlloc, * ::cuMemHostRegister, * ::cuMemHostUnregister */ public static native @Cast("CUresult") int cuPointerSetAttribute(@Const Pointer value, @Cast("CUpointer_attribute") int attribute, @Cast("CUdeviceptr") long ptr); // #endif /* __CUDA_API_VERSION >= 6000 */ // #if __CUDA_API_VERSION >= 7000 /** * \brief Returns information about a pointer. * * The supported attributes are (refer to ::cuPointerGetAttribute for attribute descriptions and restrictions): * * - ::CU_POINTER_ATTRIBUTE_CONTEXT * - ::CU_POINTER_ATTRIBUTE_MEMORY_TYPE * - ::CU_POINTER_ATTRIBUTE_DEVICE_POINTER * - ::CU_POINTER_ATTRIBUTE_HOST_POINTER * - ::CU_POINTER_ATTRIBUTE_SYNC_MEMOPS * - ::CU_POINTER_ATTRIBUTE_BUFFER_ID * - ::CU_POINTER_ATTRIBUTE_IS_MANAGED * * @param numAttributes - Number of attributes to query * @param attributes - An array of attributes to query * (numAttributes and the number of attributes in this array should match) * @param data - A two-dimensional array containing pointers to memory * locations where the result of each attribute query will be written to. * @param ptr - Pointer to query * * Unlike ::cuPointerGetAttribute, this function will not return an error when the \p ptr * encountered is not a valid CUDA pointer. Instead, the attributes are assigned default NULL values * and CUDA_SUCCESS is returned. * * If \p ptr was not allocated by, mapped by, or registered with a ::CUcontext which uses UVA * (Unified Virtual Addressing), ::CUDA_ERROR_INVALID_CONTEXT is returned. 
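 *
 * A minimal sketch of querying several attributes at once through these bindings (not part of the
 * original header text), assuming {@code ptr} is a pointer value obtained earlier (for example from
 * ::cuMemAlloc) and that each queried attribute's result fits in a 32-bit integer here:
 * {@code
 *     IntPointer attrs = new IntPointer(CU_POINTER_ATTRIBUTE_MEMORY_TYPE,
 *                                       CU_POINTER_ATTRIBUTE_IS_MANAGED);
 *     IntPointer memoryType = new IntPointer(1);   // receives a CUmemorytype value
 *     IntPointer isManaged  = new IntPointer(1);   // receives 0 or 1
 *     PointerPointer results = new PointerPointer(memoryType, isManaged);
 *     int status = cuPointerGetAttributes(2, attrs, results, ptr);
 *     if (status == CUDA_SUCCESS) {
 *         System.out.println("managed allocation: " + (isManaged.get(0) != 0));
 *     }
 * }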
* * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_INVALID_DEVICE * \notefnerr * * \sa ::cuPointerGetAttribute, * ::cuPointerSetAttribute */ public static native @Cast("CUresult") int cuPointerGetAttributes(@Cast("unsigned int") int numAttributes, @Cast("CUpointer_attribute*") IntPointer attributes, @Cast("void**") PointerPointer data, @Cast("CUdeviceptr") long ptr); public static native @Cast("CUresult") int cuPointerGetAttributes(@Cast("unsigned int") int numAttributes, @Cast("CUpointer_attribute*") IntPointer attributes, @Cast("void**") @ByPtrPtr Pointer data, @Cast("CUdeviceptr") long ptr); public static native @Cast("CUresult") int cuPointerGetAttributes(@Cast("unsigned int") int numAttributes, @Cast("CUpointer_attribute*") IntBuffer attributes, @Cast("void**") @ByPtrPtr Pointer data, @Cast("CUdeviceptr") long ptr); public static native @Cast("CUresult") int cuPointerGetAttributes(@Cast("unsigned int") int numAttributes, @Cast("CUpointer_attribute*") int[] attributes, @Cast("void**") @ByPtrPtr Pointer data, @Cast("CUdeviceptr") long ptr); // #endif /* __CUDA_API_VERSION >= 7000 */ /** \} */ /* END CUDA_UNIFIED */ /** * \defgroup CUDA_STREAM Stream Management * * ___MANBRIEF___ stream management functions of the low-level CUDA driver API * (___CURRENT_FILE___) ___ENDMANBRIEF___ * * This section describes the stream management functions of the low-level CUDA * driver application programming interface. * * \{ */ /** * \brief Create a stream * * Creates a stream and returns a handle in \p phStream. The \p Flags argument * determines behaviors of the stream. Valid values for \p Flags are: * - ::CU_STREAM_DEFAULT: Default stream creation flag. * - ::CU_STREAM_NON_BLOCKING: Specifies that work running in the created * stream may run concurrently with work in stream 0 (the NULL stream), and that * the created stream should perform no implicit synchronization with stream 0. * * @param phStream - Returned newly created stream * @param Flags - Parameters for stream creation * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_OUT_OF_MEMORY * \notefnerr * * \sa ::cuStreamDestroy, * ::cuStreamCreateWithPriority, * ::cuStreamGetPriority, * ::cuStreamGetFlags, * ::cuStreamWaitEvent, * ::cuStreamQuery, * ::cuStreamSynchronize, * ::cuStreamAddCallback */ public static native @Cast("CUresult") int cuStreamCreate(@ByPtrPtr CUstream_st phStream, @Cast("unsigned int") int Flags); /** * \brief Create a stream with the given priority * * Creates a stream with the specified priority and returns a handle in \p phStream. * This API alters the scheduler priority of work in the stream. Work in a higher * priority stream may preempt work already executing in a low priority stream. * * \p priority follows a convention where lower numbers represent higher priorities. * '0' represents default priority. The range of meaningful numerical priorities can * be queried using ::cuCtxGetStreamPriorityRange. If the specified priority is * outside the numerical range returned by ::cuCtxGetStreamPriorityRange, * it will automatically be clamped to the lowest or the highest number in the range. * * @param phStream - Returned newly created stream * @param flags - Flags for stream creation. See ::cuStreamCreate for a list of * valid flags * @param priority - Stream priority. Lower numbers represent higher priorities. 
* See ::cuCtxGetStreamPriorityRange for more information about * meaningful stream priorities that can be passed. * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_OUT_OF_MEMORY * \notefnerr * * \note Stream priorities are supported only on Quadro and Tesla GPUs * with compute capability 3.5 or higher. * * \note In the current implementation, only compute kernels launched in * priority streams are affected by the stream's priority. Stream priorities have * no effect on host-to-device and device-to-host memory operations. * * \sa ::cuStreamDestroy, * ::cuStreamCreate, * ::cuStreamGetPriority, * ::cuCtxGetStreamPriorityRange, * ::cuStreamGetFlags, * ::cuStreamWaitEvent, * ::cuStreamQuery, * ::cuStreamSynchronize, * ::cuStreamAddCallback */ public static native @Cast("CUresult") int cuStreamCreateWithPriority(@ByPtrPtr CUstream_st phStream, @Cast("unsigned int") int flags, int priority); /** * \brief Query the priority of a given stream * * Query the priority of a stream created using ::cuStreamCreate or ::cuStreamCreateWithPriority * and return the priority in \p priority. Note that if the stream was created with a * priority outside the numerical range returned by ::cuCtxGetStreamPriorityRange, * this function returns the clamped priority. * See ::cuStreamCreateWithPriority for details about priority clamping. * * @param hStream - Handle to the stream to be queried * @param priority - Pointer to a signed integer in which the stream's priority is returned * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_INVALID_HANDLE, * ::CUDA_ERROR_OUT_OF_MEMORY * \notefnerr * * \sa ::cuStreamDestroy, * ::cuStreamCreate, * ::cuStreamCreateWithPriority, * ::cuCtxGetStreamPriorityRange, * ::cuStreamGetFlags */ public static native @Cast("CUresult") int cuStreamGetPriority(CUstream_st hStream, IntPointer priority); public static native @Cast("CUresult") int cuStreamGetPriority(CUstream_st hStream, IntBuffer priority); public static native @Cast("CUresult") int cuStreamGetPriority(CUstream_st hStream, int[] priority); /** * \brief Query the flags of a given stream * * Query the flags of a stream created using ::cuStreamCreate or ::cuStreamCreateWithPriority * and return the flags in \p flags. * * @param hStream - Handle to the stream to be queried * @param flags - Pointer to an unsigned integer in which the stream's flags are returned * The value returned in \p flags is a logical 'OR' of all flags that * were used while creating this stream. 
See ::cuStreamCreate for the list * of valid flags * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_INVALID_HANDLE, * ::CUDA_ERROR_OUT_OF_MEMORY * \notefnerr * * \sa ::cuStreamDestroy, * ::cuStreamCreate, * ::cuStreamGetPriority */ public static native @Cast("CUresult") int cuStreamGetFlags(CUstream_st hStream, @Cast("unsigned int*") IntPointer flags); public static native @Cast("CUresult") int cuStreamGetFlags(CUstream_st hStream, @Cast("unsigned int*") IntBuffer flags); public static native @Cast("CUresult") int cuStreamGetFlags(CUstream_st hStream, @Cast("unsigned int*") int[] flags); /** * \brief Make a compute stream wait on an event * * Makes all future work submitted to \p hStream wait until \p hEvent * reports completion before beginning execution. This synchronization * will be performed efficiently on the device. The event \p hEvent may * be from a different context than \p hStream, in which case this function * will perform cross-device synchronization. * * The stream \p hStream will wait only for the completion of the most recent * host call to ::cuEventRecord() on \p hEvent. Once this call has returned, * any functions (including ::cuEventRecord() and ::cuEventDestroy()) may be * called on \p hEvent again, and subsequent calls will not have any * effect on \p hStream. * * If ::cuEventRecord() has not been called on \p hEvent, this call acts as if * the record has already completed, and so is a functional no-op. * * @param hStream - Stream to wait * @param hEvent - Event to wait on (may not be NULL) * @param Flags - Parameters for the operation (must be 0) * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_HANDLE, * \note_null_stream * \notefnerr * * \sa ::cuStreamCreate, * ::cuEventRecord, * ::cuStreamQuery, * ::cuStreamSynchronize, * ::cuStreamAddCallback, * ::cuStreamDestroy */ public static native @Cast("CUresult") int cuStreamWaitEvent(CUstream_st hStream, CUevent_st hEvent, @Cast("unsigned int") int Flags); /** * \brief Add a callback to a compute stream * * Adds a callback to be called on the host after all currently enqueued * items in the stream have completed. For each * cuStreamAddCallback call, the callback will be executed exactly once. * The callback will block later work in the stream until it is finished. * * The callback may be passed ::CUDA_SUCCESS or an error code. In the event * of a device error, all subsequently executed callbacks will receive an * appropriate ::CUresult. * * Callbacks must not make any CUDA API calls. Attempting to use a CUDA API * will result in ::CUDA_ERROR_NOT_PERMITTED. Callbacks must not perform any * synchronization that may depend on outstanding device work or other callbacks * that are not mandated to run earlier. Callbacks without a mandated order * (in independent streams) execute in undefined order and may be serialized. * * This API requires compute capability 1.1 or greater. See * ::cuDeviceGetAttribute or ::cuDeviceGetProperties to query compute * capability. Attempting to use this API with earlier compute versions will * return ::CUDA_ERROR_NOT_SUPPORTED. * * For the purposes of Unified Memory, callback execution makes a number of * guarantees: *
    *
 * - The callback stream is considered idle for the duration of the
 *   callback. Thus, for example, a callback may always use memory attached
 *   to the callback stream.
 * - The start of execution of a callback has the same effect as
 *   synchronizing an event recorded in the same stream immediately prior to
 *   the callback. It thus synchronizes streams which have been "joined"
 *   prior to the callback.
 * - Adding device work to any stream does not have the effect of making
 *   the stream active until all preceding callbacks have executed. Thus, for
 *   example, a callback might use global attached memory even if work has
 *   been added to another stream, if it has been properly ordered with an
 *   event.
 * - Completion of a callback does not cause a stream to become
 *   active except as described above. The callback stream will remain idle
 *   if no device work follows the callback, and will remain idle across
 *   consecutive callbacks without device work in between. Thus, for example,
 *   stream synchronization can be done by signaling from a callback at the
 *   end of the stream.
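 *
 * A minimal sketch (not part of the original header text) of registering a host callback through
 * these bindings, assuming {@code hStream} is an existing stream; the anonymous subclass of the
 * {@code CUstreamCallback} function pointer declared elsewhere in this file follows the usual
 * JavaCPP callback idiom, and its {@code call} signature is an assumption here:
 * {@code
 *     CUstreamCallback callback = new CUstreamCallback() {
 *         public void call(CUstream_st stream, int status, Pointer userData) {
 *             // Runs on a driver thread once all previously enqueued work in `stream` is done.
 *             // Must not call back into the CUDA API (see the restrictions above).
 *             System.out.println("stream finished with status " + status);
 *         }
 *     };
 *     int status = cuStreamAddCallback(hStream, callback, null, 0);   // keep `callback` reachable until it runs
 * }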
* * @param hStream - Stream to add callback to * @param callback - The function to call once preceding stream operations are complete * @param userData - User specified data to be passed to the callback function * @param flags - Reserved for future use, must be 0 * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_HANDLE, * ::CUDA_ERROR_NOT_SUPPORTED * \note_null_stream * \notefnerr * * \sa ::cuStreamCreate, * ::cuStreamQuery, * ::cuStreamSynchronize, * ::cuStreamWaitEvent, * ::cuStreamDestroy, * ::cuMemAllocManaged, * ::cuStreamAttachMemAsync */ public static native @Cast("CUresult") int cuStreamAddCallback(CUstream_st hStream, CUstreamCallback callback, Pointer userData, @Cast("unsigned int") int flags); // #if __CUDA_API_VERSION >= 6000 /** * \brief Attach memory to a stream asynchronously * * Enqueues an operation in \p hStream to specify stream association of * \p length bytes of memory starting from \p dptr. This function is a * stream-ordered operation, meaning that it is dependent on, and will * only take effect when, previous work in stream has completed. Any * previous association is automatically replaced. * * \p dptr must point to an address within managed memory space declared * using the __managed__ keyword or allocated with ::cuMemAllocManaged. * * \p length must be zero, to indicate that the entire allocation's * stream association is being changed. Currently, it's not possible * to change stream association for a portion of an allocation. * * The stream association is specified using \p flags which must be * one of ::CUmemAttach_flags. * If the ::CU_MEM_ATTACH_GLOBAL flag is specified, the memory can be accessed * by any stream on any device. * If the ::CU_MEM_ATTACH_HOST flag is specified, the program makes a guarantee * that it won't access the memory on the device from any stream on a device that * has a zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. * If the ::CU_MEM_ATTACH_SINGLE flag is specified and \p hStream is associated with * a device that has a zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, * the program makes a guarantee that it will only access the memory on the device * from \p hStream. It is illegal to attach singly to the NULL stream, because the * NULL stream is a virtual global stream and not a specific stream. An error will * be returned in this case. * * When memory is associated with a single stream, the Unified Memory system will * allow CPU access to this memory region so long as all operations in \p hStream * have completed, regardless of whether other streams are active. In effect, * this constrains exclusive ownership of the managed memory region by * an active GPU to per-stream activity instead of whole-GPU activity. * * Accessing memory on the device from streams that are not associated with * it will produce undefined results. No error checking is performed by the * Unified Memory system to ensure that kernels launched into other streams * do not access this region. * * It is a program's responsibility to order calls to ::cuStreamAttachMemAsync * via events, synchronization or other means to ensure legal access to memory * at all times. Data visibility and coherency will be changed appropriately * for all kernels which follow a stream-association change. 
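 *
 * A minimal sketch (not part of the original header text), assuming {@code managedPtr} and
 * {@code numBytes} describe an earlier ::cuMemAllocManaged allocation and {@code hStream} is a
 * non-NULL stream:
 * {@code
 *     // Restrict the managed allocation to hStream, then make it safe for the CPU to touch
 *     // once the stream's pending work has drained.
 *     int status = cuStreamAttachMemAsync(hStream, managedPtr, 0, CU_MEM_ATTACH_SINGLE);
 *     if (status == CUDA_SUCCESS) {
 *         status = cuStreamSynchronize(hStream);
 *     }
 * }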
* * If \p hStream is destroyed while data is associated with it, the association is * removed and the association reverts to the default visibility of the allocation * as specified at ::cuMemAllocManaged. For __managed__ variables, the default * association is always ::CU_MEM_ATTACH_GLOBAL. Note that destroying a stream is an * asynchronous operation, and as a result, the change to default association won't * happen until all work in the stream has completed. * * @param hStream - Stream in which to enqueue the attach operation * @param dptr - Pointer to memory (must be a pointer to managed memory) * @param length - Length of memory (must be zero) * @param flags - Must be one of ::CUmemAttach_flags * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_HANDLE, * ::CUDA_ERROR_NOT_SUPPORTED * \note_null_stream * \notefnerr * * \sa ::cuStreamCreate, * ::cuStreamQuery, * ::cuStreamSynchronize, * ::cuStreamWaitEvent, * ::cuStreamDestroy, * ::cuMemAllocManaged */ public static native @Cast("CUresult") int cuStreamAttachMemAsync(CUstream_st hStream, @Cast("CUdeviceptr") long dptr, @Cast("size_t") long length, @Cast("unsigned int") int flags); // #endif /* __CUDA_API_VERSION >= 6000 */ /** * \brief Determine status of a compute stream * * Returns ::CUDA_SUCCESS if all operations in the stream specified by * \p hStream have completed, or ::CUDA_ERROR_NOT_READY if not. * * For the purposes of Unified Memory, a return value of ::CUDA_SUCCESS * is equivalent to having called ::cuStreamSynchronize(). * * @param hStream - Stream to query status of * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_HANDLE, * ::CUDA_ERROR_NOT_READY * \note_null_stream * \notefnerr * * \sa ::cuStreamCreate, * ::cuStreamWaitEvent, * ::cuStreamDestroy, * ::cuStreamSynchronize, * ::cuStreamAddCallback */ public static native @Cast("CUresult") int cuStreamQuery(CUstream_st hStream); /** * \brief Wait until a stream's tasks are completed * * Waits until the device has completed all operations in the stream specified * by \p hStream. If the context was created with the * ::CU_CTX_SCHED_BLOCKING_SYNC flag, the CPU thread will block until the * stream is finished with all of its tasks. * * @param hStream - Stream to wait for * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_HANDLE * \note_null_stream * \notefnerr * * \sa ::cuStreamCreate, * ::cuStreamDestroy, * ::cuStreamWaitEvent, * ::cuStreamQuery, * ::cuStreamAddCallback */ public static native @Cast("CUresult") int cuStreamSynchronize(CUstream_st hStream); // #if __CUDA_API_VERSION >= 4000 /** * \brief Destroys a stream * * Destroys the stream specified by \p hStream. * * In case the device is still doing work in the stream \p hStream * when ::cuStreamDestroy() is called, the function will return immediately * and the resources associated with \p hStream will be released automatically * once the device has completed all work in \p hStream. 
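 *
 * A minimal stream lifecycle sketch through these bindings (not part of the original header text);
 * passing a freshly constructed {@code CUstream_st} to ::cuStreamCreate and letting the driver fill
 * it in is the usual JavaCPP pattern for this out-parameter, and is an assumption here:
 * {@code
 *     CUstream_st stream = new CUstream_st();
 *     int status = cuStreamCreate(stream, CU_STREAM_NON_BLOCKING);
 *     // ... enqueue copies and kernel launches on `stream` ...
 *     cuStreamSynchronize(stream);   // optional: cuStreamDestroy also defers cleanup as described above
 *     cuStreamDestroy(stream);
 * }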
* * @param hStream - Stream to destroy * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * \notefnerr * * \sa ::cuStreamCreate, * ::cuStreamWaitEvent, * ::cuStreamQuery, * ::cuStreamSynchronize, * ::cuStreamAddCallback */ public static native @Cast("CUresult") int cuStreamDestroy(CUstream_st hStream); // #endif /* __CUDA_API_VERSION >= 4000 */ /** \} */ /* END CUDA_STREAM */ /** * \defgroup CUDA_EVENT Event Management * * ___MANBRIEF___ event management functions of the low-level CUDA driver API * (___CURRENT_FILE___) ___ENDMANBRIEF___ * * This section describes the event management functions of the low-level CUDA * driver application programming interface. * * \{ */ /** * \brief Creates an event * * Creates an event *phEvent with the flags specified via \p Flags. Valid flags * include: * - ::CU_EVENT_DEFAULT: Default event creation flag. * - ::CU_EVENT_BLOCKING_SYNC: Specifies that the created event should use blocking * synchronization. A CPU thread that uses ::cuEventSynchronize() to wait on * an event created with this flag will block until the event has actually * been recorded. * - ::CU_EVENT_DISABLE_TIMING: Specifies that the created event does not need * to record timing data. Events created with this flag specified and * the ::CU_EVENT_BLOCKING_SYNC flag not specified will provide the best * performance when used with ::cuStreamWaitEvent() and ::cuEventQuery(). * - ::CU_EVENT_INTERPROCESS: Specifies that the created event may be used as an * interprocess event by ::cuIpcGetEventHandle(). ::CU_EVENT_INTERPROCESS must * be specified along with ::CU_EVENT_DISABLE_TIMING. * * @param phEvent - Returns newly created event * @param Flags - Event creation flags * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_OUT_OF_MEMORY * \notefnerr * * \sa * ::cuEventRecord, * ::cuEventQuery, * ::cuEventSynchronize, * ::cuEventDestroy, * ::cuEventElapsedTime */ public static native @Cast("CUresult") int cuEventCreate(@ByPtrPtr CUevent_st phEvent, @Cast("unsigned int") int Flags); /** * \brief Records an event * * Records an event. See note on NULL stream behavior. Since operation is * asynchronous, ::cuEventQuery or ::cuEventSynchronize() must be used * to determine when the event has actually been recorded. * * If ::cuEventRecord() has previously been called on \p hEvent, then this * call will overwrite any existing state in \p hEvent. Any subsequent calls * which examine the status of \p hEvent will only examine the completion of * this most recent call to ::cuEventRecord(). * * It is necessary that \p hEvent and \p hStream be created on the same context. 
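 *
 * A minimal cross-stream ordering sketch (not part of the original header text), assuming
 * {@code producer} and {@code consumer} are existing streams created in the same context and using
 * the event-flag constants defined earlier in this class:
 * {@code
 *     CUevent_st event = new CUevent_st();
 *     cuEventCreate(event, CU_EVENT_DISABLE_TIMING);
 *     // ... enqueue work on `producer` ...
 *     cuEventRecord(event, producer);
 *     cuStreamWaitEvent(consumer, event, 0);   // later work on `consumer` waits for the recorded point
 *     cuEventDestroy(event);                   // safe once cuStreamWaitEvent has returned
 * }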
* * @param hEvent - Event to record * @param hStream - Stream to record event for * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_HANDLE, * ::CUDA_ERROR_INVALID_VALUE * \note_null_stream * \notefnerr * * \sa ::cuEventCreate, * ::cuEventQuery, * ::cuEventSynchronize, * ::cuStreamWaitEvent, * ::cuEventDestroy, * ::cuEventElapsedTime */ public static native @Cast("CUresult") int cuEventRecord(CUevent_st hEvent, CUstream_st hStream); /** * \brief Queries an event's status * * Query the status of all device work preceding the most recent * call to ::cuEventRecord() (in the appropriate compute streams, * as specified by the arguments to ::cuEventRecord()). * * If this work has successfully been completed by the device, or if * ::cuEventRecord() has not been called on \p hEvent, then ::CUDA_SUCCESS is * returned. If this work has not yet been completed by the device then * ::CUDA_ERROR_NOT_READY is returned. * * For the purposes of Unified Memory, a return value of ::CUDA_SUCCESS * is equivalent to having called ::cuEventSynchronize(). * * @param hEvent - Event to query * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_HANDLE, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_NOT_READY * \notefnerr * * \sa ::cuEventCreate, * ::cuEventRecord, * ::cuEventSynchronize, * ::cuEventDestroy, * ::cuEventElapsedTime */ public static native @Cast("CUresult") int cuEventQuery(CUevent_st hEvent); /** * \brief Waits for an event to complete * * Wait until the completion of all device work preceding the most recent * call to ::cuEventRecord() (in the appropriate compute streams, as specified * by the arguments to ::cuEventRecord()). * * If ::cuEventRecord() has not been called on \p hEvent, ::CUDA_SUCCESS is * returned immediately. * * Waiting for an event that was created with the ::CU_EVENT_BLOCKING_SYNC * flag will cause the calling CPU thread to block until the event has * been completed by the device. If the ::CU_EVENT_BLOCKING_SYNC flag has * not been set, then the CPU thread will busy-wait until the event has * been completed by the device. * * @param hEvent - Event to wait for * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_HANDLE * \notefnerr * * \sa ::cuEventCreate, * ::cuEventRecord, * ::cuEventQuery, * ::cuEventDestroy, * ::cuEventElapsedTime */ public static native @Cast("CUresult") int cuEventSynchronize(CUevent_st hEvent); // #if __CUDA_API_VERSION >= 4000 /** * \brief Destroys an event * * Destroys the event specified by \p hEvent. * * In case \p hEvent has been recorded but has not yet been completed * when ::cuEventDestroy() is called, the function will return immediately and * the resources associated with \p hEvent will be released automatically once * the device has completed \p hEvent. 
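 *
 * A minimal timing sketch using the event functions in this section (not part of the original
 * header text), assuming {@code hStream} is the stream whose enqueued work is being measured:
 * {@code
 *     CUevent_st start = new CUevent_st(), stop = new CUevent_st();
 *     cuEventCreate(start, CU_EVENT_DEFAULT);
 *     cuEventCreate(stop, CU_EVENT_DEFAULT);
 *     cuEventRecord(start, hStream);
 *     // ... enqueue the work to be timed on hStream ...
 *     cuEventRecord(stop, hStream);
 *     cuEventSynchronize(stop);
 *     float[] ms = new float[1];
 *     cuEventElapsedTime(ms, start, stop);     // elapsed time in milliseconds
 *     cuEventDestroy(start);
 *     cuEventDestroy(stop);
 * }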
* * @param hEvent - Event to destroy * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_HANDLE * \notefnerr * * \sa ::cuEventCreate, * ::cuEventRecord, * ::cuEventQuery, * ::cuEventSynchronize, * ::cuEventElapsedTime */ public static native @Cast("CUresult") int cuEventDestroy(CUevent_st hEvent); // #endif /* __CUDA_API_VERSION >= 4000 */ /** * \brief Computes the elapsed time between two events * * Computes the elapsed time between two events (in milliseconds with a * resolution of around 0.5 microseconds). * * If either event was last recorded in a non-NULL stream, the resulting time * may be greater than expected (even if both used the same stream handle). This * happens because the ::cuEventRecord() operation takes place asynchronously * and there is no guarantee that the measured latency is actually just between * the two events. Any number of other different stream operations could execute * in between the two measured events, thus altering the timing in a significant * way. * * If ::cuEventRecord() has not been called on either event then * ::CUDA_ERROR_INVALID_HANDLE is returned. If ::cuEventRecord() has been called * on both events but one or both of them has not yet been completed (that is, * ::cuEventQuery() would return ::CUDA_ERROR_NOT_READY on at least one of the * events), ::CUDA_ERROR_NOT_READY is returned. If either event was created with * the ::CU_EVENT_DISABLE_TIMING flag, then this function will return * ::CUDA_ERROR_INVALID_HANDLE. * * @param pMilliseconds - Time between \p hStart and \p hEnd in ms * @param hStart - Starting event * @param hEnd - Ending event * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_HANDLE, * ::CUDA_ERROR_NOT_READY * \notefnerr * * \sa ::cuEventCreate, * ::cuEventRecord, * ::cuEventQuery, * ::cuEventSynchronize, * ::cuEventDestroy */ public static native @Cast("CUresult") int cuEventElapsedTime(FloatPointer pMilliseconds, CUevent_st hStart, CUevent_st hEnd); public static native @Cast("CUresult") int cuEventElapsedTime(FloatBuffer pMilliseconds, CUevent_st hStart, CUevent_st hEnd); public static native @Cast("CUresult") int cuEventElapsedTime(float[] pMilliseconds, CUevent_st hStart, CUevent_st hEnd); // #if __CUDA_API_VERSION >= 8000 /** * \brief Wait on a memory location * * Enqueues a synchronization of the stream on the given memory location. Work * ordered after the operation will block until the given condition on the * memory is satisfied. By default, the condition is to wait for * (int32_t)(*addr - value) >= 0, a cyclic greater-or-equal. * Other condition types can be specified via \p flags. * * If the memory was registered via ::cuMemHostRegister(), the device pointer * should be obtained with ::cuMemHostGetDevicePointer(). This function cannot * be used with managed memory (::cuMemAllocManaged). * * On Windows, the device must be using TCC, or the operation is not supported. * See ::cuDeviceGetAttributes(). * * @param stream The stream to synchronize on the memory location. * @param addr The memory location to wait on. * @param value The value to compare with the memory location. * @param flags See ::CUstreamWaitValue_flags. 
* * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_NOT_SUPPORTED * \notefnerr * * \sa ::cuStreamWriteValue32, * ::cuStreamBatchMemOp, * ::cuMemHostRegister, * ::cuStreamWaitEvent */ public static native @Cast("CUresult") int cuStreamWaitValue32(CUstream_st stream, @Cast("CUdeviceptr") long addr, @Cast("cuuint32_t") int value, @Cast("unsigned int") int flags); /** * \brief Write a value to memory * * Write a value to memory. Unless the ::CU_STREAM_WRITE_VALUE_NO_MEMORY_BARRIER * flag is passed, the write is preceded by a system-wide memory fence, * equivalent to a __threadfence_system() but scoped to the stream * rather than a CUDA thread. * * If the memory was registered via ::cuMemHostRegister(), the device pointer * should be obtained with ::cuMemHostGetDevicePointer(). This function cannot * be used with managed memory (::cuMemAllocManaged). * * On Windows, the device must be using TCC, or the operation is not supported. * See ::cuDeviceGetAttribute(). * * @param stream The stream to do the write in. * @param addr The device address to write to. * @param value The value to write. * @param flags See ::CUstreamWriteValue_flags. * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_NOT_SUPPORTED * \notefnerr * * \sa ::cuStreamWaitValue32, * ::cuStreamBatchMemOp, * ::cuMemHostRegister, * ::cuEventRecord */ public static native @Cast("CUresult") int cuStreamWriteValue32(CUstream_st stream, @Cast("CUdeviceptr") long addr, @Cast("cuuint32_t") int value, @Cast("unsigned int") int flags); /** * \brief Batch operations to synchronize the stream via memory operations * * This is a batch version of ::cuStreamWaitValue32() and ::cuStreamWriteValue32(). * Batching operations may avoid some performance overhead in both the API call * and the device execution versus adding them to the stream in separate API * calls. The operations are enqueued in the order they appear in the array. * * See ::CUstreamBatchMemOpType for the full set of supported operations, and * ::cuStreamWaitValue32() and ::cuStreamWriteValue32() for details of specific * operations. * * On Windows, the device must be using TCC, or this call is not supported. See * ::cuDeviceGetAttribute(). * * @param stream The stream to enqueue the operations in. * @param count The number of operations in the array. Must be less than 256. * @param paramArray The types and parameters of the individual operations. * @param flags Reserved for future expansion; must be 0. * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_NOT_SUPPORTED * \notefnerr * * \sa ::cuStreamWaitValue32, * ::cuStreamWriteValue32, * ::cuMemHostRegister */ public static native @Cast("CUresult") int cuStreamBatchMemOp(CUstream_st stream, @Cast("unsigned int") int count, CUstreamBatchMemOpParams paramArray, @Cast("unsigned int") int flags); // #endif /* __CUDA_API_VERSION >= 8000 */ /** \} */ /* END CUDA_EVENT */ /** * \defgroup CUDA_EXEC Execution Control * * ___MANBRIEF___ execution control functions of the low-level CUDA driver API * (___CURRENT_FILE___) ___ENDMANBRIEF___ * * This section describes the execution control functions of the low-level CUDA * driver application programming interface. * * \{ */ /** * \brief Returns information about a function * * Returns in \p *pi the integer value of the attribute \p attrib on the kernel * given by \p hfunc. 
The supported attributes are: * - ::CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK: The maximum number of threads * per block, beyond which a launch of the function would fail. This number * depends on both the function and the device on which the function is * currently loaded. * - ::CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES: The size in bytes of * statically-allocated shared memory per block required by this function. * This does not include dynamically-allocated shared memory requested by * the user at runtime. * - ::CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES: The size in bytes of user-allocated * constant memory required by this function. * - ::CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES: The size in bytes of local memory * used by each thread of this function. * - ::CU_FUNC_ATTRIBUTE_NUM_REGS: The number of registers used by each thread * of this function. * - ::CU_FUNC_ATTRIBUTE_PTX_VERSION: The PTX virtual architecture version for * which the function was compiled. This value is the major PTX version * 10 * + the minor PTX version, so a PTX version 1.3 function would return the * value 13. Note that this may return the undefined value of 0 for cubins * compiled prior to CUDA 3.0. * - ::CU_FUNC_ATTRIBUTE_BINARY_VERSION: The binary architecture version for * which the function was compiled. This value is the major binary * version * 10 + the minor binary version, so a binary version 1.3 function * would return the value 13. Note that this will return a value of 10 for * legacy cubins that do not have a properly-encoded binary architecture * version. * - ::CU_FUNC_CACHE_MODE_CA: The attribute to indicate whether the function has * been compiled with user specified option "-Xptxas --dlcm=ca" set . * * @param pi - Returned attribute value * @param attrib - Attribute requested * @param hfunc - Function to query attribute of * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_HANDLE, * ::CUDA_ERROR_INVALID_VALUE * \notefnerr * * \sa ::cuCtxGetCacheConfig, * ::cuCtxSetCacheConfig, * ::cuFuncSetCacheConfig, * ::cuLaunchKernel */ public static native @Cast("CUresult") int cuFuncGetAttribute(IntPointer pi, @Cast("CUfunction_attribute") int attrib, CUfunc_st hfunc); public static native @Cast("CUresult") int cuFuncGetAttribute(IntBuffer pi, @Cast("CUfunction_attribute") int attrib, CUfunc_st hfunc); public static native @Cast("CUresult") int cuFuncGetAttribute(int[] pi, @Cast("CUfunction_attribute") int attrib, CUfunc_st hfunc); /** * \brief Sets the preferred cache configuration for a device function * * On devices where the L1 cache and shared memory use the same hardware * resources, this sets through \p config the preferred cache configuration for * the device function \p hfunc. This is only a preference. The driver will use * the requested configuration if possible, but it is free to choose a different * configuration if required to execute \p hfunc. Any context-wide preference * set via ::cuCtxSetCacheConfig() will be overridden by this per-function * setting unless the per-function setting is ::CU_FUNC_CACHE_PREFER_NONE. In * that case, the current context-wide setting will be used. * * This setting does nothing on devices where the size of the L1 cache and * shared memory are fixed. * * Launching a kernel with a different preference than the most recent * preference setting may insert a device-side synchronization point. 
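 *
 * A minimal sketch (not part of the original header text) combining ::cuFuncGetAttribute with this
 * call, assuming {@code hfunc} was obtained earlier (for example via ::cuModuleGetFunction) and
 * using one of the cache configurations listed below:
 * {@code
 *     int[] maxThreadsPerBlock = new int[1];
 *     cuFuncGetAttribute(maxThreadsPerBlock, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, hfunc);
 *     cuFuncSetCacheConfig(hfunc, CU_FUNC_CACHE_PREFER_SHARED);   // prefer shared memory over L1
 * }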
* * * The supported cache configurations are: * - ::CU_FUNC_CACHE_PREFER_NONE: no preference for shared memory or L1 (default) * - ::CU_FUNC_CACHE_PREFER_SHARED: prefer larger shared memory and smaller L1 cache * - ::CU_FUNC_CACHE_PREFER_L1: prefer larger L1 cache and smaller shared memory * - ::CU_FUNC_CACHE_PREFER_EQUAL: prefer equal sized L1 cache and shared memory * * @param hfunc - Kernel to configure cache for * @param config - Requested cache configuration * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT * \notefnerr * * \sa ::cuCtxGetCacheConfig, * ::cuCtxSetCacheConfig, * ::cuFuncGetAttribute, * ::cuLaunchKernel */ public static native @Cast("CUresult") int cuFuncSetCacheConfig(CUfunc_st hfunc, @Cast("CUfunc_cache") int config); // #if __CUDA_API_VERSION >= 4020 /** * \brief Sets the shared memory configuration for a device function. * * On devices with configurable shared memory banks, this function will * force all subsequent launches of the specified device function to have * the given shared memory bank size configuration. On any given launch of the * function, the shared memory configuration of the device will be temporarily * changed if needed to suit the function's preferred configuration. Changes in * shared memory configuration between subsequent launches of functions, * may introduce a device side synchronization point. * * Any per-function setting of shared memory bank size set via * ::cuFuncSetSharedMemConfig will override the context wide setting set with * ::cuCtxSetSharedMemConfig. * * Changing the shared memory bank size will not increase shared memory usage * or affect occupancy of kernels, but may have major effects on performance. * Larger bank sizes will allow for greater potential bandwidth to shared memory, * but will change what kinds of accesses to shared memory will result in bank * conflicts. * * This function will do nothing on devices with fixed shared memory bank size. * * The supported bank configurations are: * - ::CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE: use the context's shared memory * configuration when launching this function. * - ::CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE: set shared memory bank width to * be natively four bytes when launching this function. * - ::CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE: set shared memory bank width to * be natively eight bytes when launching this function. * * @param hfunc - kernel to be given a shared memory config * @param config - requested shared memory configuration * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT * \notefnerr * * \sa ::cuCtxGetCacheConfig, * ::cuCtxSetCacheConfig, * ::cuCtxGetSharedMemConfig, * ::cuCtxSetSharedMemConfig, * ::cuFuncGetAttribute, * ::cuLaunchKernel */ public static native @Cast("CUresult") int cuFuncSetSharedMemConfig(CUfunc_st hfunc, @Cast("CUsharedconfig") int config); // #endif // #if __CUDA_API_VERSION >= 4000 /** * \brief Launches a CUDA function * * Invokes the kernel \p f on a \p gridDimX x \p gridDimY x \p gridDimZ * grid of blocks. Each block contains \p blockDimX x \p blockDimY x * \p blockDimZ threads. * * \p sharedMemBytes sets the amount of dynamic shared memory that will be * available to each thread block. * * Kernel parameters to \p f can be specified in one of two ways: * * 1) Kernel parameters can be specified via \p kernelParams. 
If \p f * has N parameters, then \p kernelParams needs to be an array of N * pointers. Each of \p kernelParams[0] through \p kernelParams[N-1] * must point to a region of memory from which the actual kernel * parameter will be copied. The number of kernel parameters and their * offsets and sizes do not need to be specified as that information is * retrieved directly from the kernel's image. * * 2) Kernel parameters can also be packaged by the application into * a single buffer that is passed in via the \p extra parameter. * This places the burden on the application of knowing each kernel * parameter's size and alignment/padding within the buffer. Here is * an example of using the \p extra parameter in this manner: *
{@code
    size_t argBufferSize;
    char argBuffer[256];

    // populate argBuffer and argBufferSize

    void *config[] = {
        CU_LAUNCH_PARAM_BUFFER_POINTER, argBuffer,
        CU_LAUNCH_PARAM_BUFFER_SIZE,    &argBufferSize,
        CU_LAUNCH_PARAM_END
    };
    status = cuLaunchKernel(f, gx, gy, gz, bx, by, bz, sh, s, NULL, config);
 * }
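 *
 * For comparison, a minimal sketch (not part of the original header text) of passing arguments via
 * \p kernelParams from these Java bindings, assuming the kernel takes a device pointer and an int
 * count, {@code f} came from ::cuModuleGetFunction, {@code devPtr} from ::cuMemAlloc, and
 * {@code n}, {@code gridDimX}, {@code blockDimX}, and {@code hStream} are defined by the caller:
 * {@code
 *     LongPointer arg0 = new LongPointer(1).put(devPtr);   // CUdeviceptr argument
 *     IntPointer  arg1 = new IntPointer(1).put(n);         // int argument
 *     PointerPointer kernelParams = new PointerPointer(arg0, arg1);
 *     int status = cuLaunchKernel(f, gridDimX, 1, 1, blockDimX, 1, 1,
 *                                 0, hStream, kernelParams, null);
 * }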
* * The \p extra parameter exists to allow ::cuLaunchKernel to take * additional less commonly used arguments. \p extra specifies a list of * names of extra settings and their corresponding values. Each extra * setting name is immediately followed by the corresponding value. The * list must be terminated with either NULL or ::CU_LAUNCH_PARAM_END. * * - ::CU_LAUNCH_PARAM_END, which indicates the end of the \p extra * array; * - ::CU_LAUNCH_PARAM_BUFFER_POINTER, which specifies that the next * value in \p extra will be a pointer to a buffer containing all * the kernel parameters for launching kernel \p f; * - ::CU_LAUNCH_PARAM_BUFFER_SIZE, which specifies that the next * value in \p extra will be a pointer to a size_t containing the * size of the buffer specified with ::CU_LAUNCH_PARAM_BUFFER_POINTER; * * The error ::CUDA_ERROR_INVALID_VALUE will be returned if kernel * parameters are specified with both \p kernelParams and \p extra * (i.e. both \p kernelParams and \p extra are non-NULL). * * Calling ::cuLaunchKernel() sets persistent function state that is * the same as function state set through the following deprecated APIs: * ::cuFuncSetBlockShape(), * ::cuFuncSetSharedSize(), * ::cuParamSetSize(), * ::cuParamSeti(), * ::cuParamSetf(), * ::cuParamSetv(). * * When the kernel \p f is launched via ::cuLaunchKernel(), the previous * block shape, shared size and parameter info associated with \p f * is overwritten. * * Note that to use ::cuLaunchKernel(), the kernel \p f must either have * been compiled with toolchain version 3.2 or later so that it will * contain kernel parameter information, or have no kernel parameters. * If either of these conditions is not met, then ::cuLaunchKernel() will * return ::CUDA_ERROR_INVALID_IMAGE. * * @param f - Kernel to launch * @param gridDimX - Width of grid in blocks * @param gridDimY - Height of grid in blocks * @param gridDimZ - Depth of grid in blocks * @param blockDimX - X dimension of each thread block * @param blockDimY - Y dimension of each thread block * @param blockDimZ - Z dimension of each thread block * @param sharedMemBytes - Dynamic shared-memory size per thread block in bytes * @param hStream - Stream identifier * @param kernelParams - Array of pointers to kernel parameters * @param extra - Extra options * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_HANDLE, * ::CUDA_ERROR_INVALID_IMAGE, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_LAUNCH_FAILED, * ::CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES, * ::CUDA_ERROR_LAUNCH_TIMEOUT, * ::CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING, * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED * \note_null_stream * \notefnerr * * \sa ::cuCtxGetCacheConfig, * ::cuCtxSetCacheConfig, * ::cuFuncSetCacheConfig, * ::cuFuncGetAttribute */ public static native @Cast("CUresult") int cuLaunchKernel(CUfunc_st f, @Cast("unsigned int") int gridDimX, @Cast("unsigned int") int gridDimY, @Cast("unsigned int") int gridDimZ, @Cast("unsigned int") int blockDimX, @Cast("unsigned int") int blockDimY, @Cast("unsigned int") int blockDimZ, @Cast("unsigned int") int sharedMemBytes, CUstream_st hStream, @Cast("void**") PointerPointer kernelParams, @Cast("void**") PointerPointer extra); public static native @Cast("CUresult") int cuLaunchKernel(CUfunc_st f, @Cast("unsigned int") int gridDimX, @Cast("unsigned int") int gridDimY, @Cast("unsigned int") int gridDimZ, @Cast("unsigned int") int blockDimX, @Cast("unsigned int") int blockDimY, @Cast("unsigned int") int 
blockDimZ, @Cast("unsigned int") int sharedMemBytes, CUstream_st hStream, @Cast("void**") @ByPtrPtr Pointer kernelParams, @Cast("void**") @ByPtrPtr Pointer extra); // #endif /* __CUDA_API_VERSION >= 4000 */ /** \} */ /* END CUDA_EXEC */ /** * \defgroup CUDA_EXEC_DEPRECATED Execution Control [DEPRECATED] * * ___MANBRIEF___ deprecated execution control functions of the low-level CUDA * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___ * * This section describes the deprecated execution control functions of the * low-level CUDA driver application programming interface. * * \{ */ /** * \brief Sets the block-dimensions for the function * * @deprecated * * Specifies the \p x, \p y, and \p z dimensions of the thread blocks that are * created when the kernel given by \p hfunc is launched. * * @param hfunc - Kernel to specify dimensions of * @param x - X dimension * @param y - Y dimension * @param z - Z dimension * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_HANDLE, * ::CUDA_ERROR_INVALID_VALUE * \notefnerr * * \sa ::cuFuncSetSharedSize, * ::cuFuncSetCacheConfig, * ::cuFuncGetAttribute, * ::cuParamSetSize, * ::cuParamSeti, * ::cuParamSetf, * ::cuParamSetv, * ::cuLaunch, * ::cuLaunchGrid, * ::cuLaunchGridAsync, * ::cuLaunchKernel */ public static native @Cast("CUresult") int cuFuncSetBlockShape(CUfunc_st hfunc, int x, int y, int z); /** * \brief Sets the dynamic shared-memory size for the function * * @deprecated * * Sets through \p bytes the amount of dynamic shared memory that will be * available to each thread block when the kernel given by \p hfunc is launched. * * @param hfunc - Kernel to specify dynamic shared-memory size for * @param bytes - Dynamic shared-memory size per thread in bytes * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_HANDLE, * ::CUDA_ERROR_INVALID_VALUE * \notefnerr * * \sa ::cuFuncSetBlockShape, * ::cuFuncSetCacheConfig, * ::cuFuncGetAttribute, * ::cuParamSetSize, * ::cuParamSeti, * ::cuParamSetf, * ::cuParamSetv, * ::cuLaunch, * ::cuLaunchGrid, * ::cuLaunchGridAsync, * ::cuLaunchKernel */ public static native @Cast("CUresult") int cuFuncSetSharedSize(CUfunc_st hfunc, @Cast("unsigned int") int bytes); /** * \brief Sets the parameter size for the function * * @deprecated * * Sets through \p numbytes the total size in bytes needed by the function * parameters of the kernel corresponding to \p hfunc. * * @param hfunc - Kernel to set parameter size for * @param numbytes - Size of parameter list in bytes * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * \notefnerr * * \sa ::cuFuncSetBlockShape, * ::cuFuncSetSharedSize, * ::cuFuncGetAttribute, * ::cuParamSetf, * ::cuParamSeti, * ::cuParamSetv, * ::cuLaunch, * ::cuLaunchGrid, * ::cuLaunchGridAsync, * ::cuLaunchKernel */ public static native @Cast("CUresult") int cuParamSetSize(CUfunc_st hfunc, @Cast("unsigned int") int numbytes); /** * \brief Adds an integer parameter to the function's argument list * * @deprecated * * Sets an integer parameter that will be specified the next time the * kernel corresponding to \p hfunc will be invoked. \p offset is a byte offset. 
* * @param hfunc - Kernel to add parameter to * @param offset - Offset to add parameter to argument list * @param value - Value of parameter * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * \notefnerr * * \sa ::cuFuncSetBlockShape, * ::cuFuncSetSharedSize, * ::cuFuncGetAttribute, * ::cuParamSetSize, * ::cuParamSetf, * ::cuParamSetv, * ::cuLaunch, * ::cuLaunchGrid, * ::cuLaunchGridAsync, * ::cuLaunchKernel */ public static native @Cast("CUresult") int cuParamSeti(CUfunc_st hfunc, int offset, @Cast("unsigned int") int value); /** * \brief Adds a floating-point parameter to the function's argument list * * @deprecated * * Sets a floating-point parameter that will be specified the next time the * kernel corresponding to \p hfunc will be invoked. \p offset is a byte offset. * * @param hfunc - Kernel to add parameter to * @param offset - Offset to add parameter to argument list * @param value - Value of parameter * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * \notefnerr * * \sa ::cuFuncSetBlockShape, * ::cuFuncSetSharedSize, * ::cuFuncGetAttribute, * ::cuParamSetSize, * ::cuParamSeti, * ::cuParamSetv, * ::cuLaunch, * ::cuLaunchGrid, * ::cuLaunchGridAsync, * ::cuLaunchKernel */ public static native @Cast("CUresult") int cuParamSetf(CUfunc_st hfunc, int offset, float value); /** * \brief Adds arbitrary data to the function's argument list * * @deprecated * * Copies an arbitrary amount of data (specified in \p numbytes) from \p ptr * into the parameter space of the kernel corresponding to \p hfunc. \p offset * is a byte offset. * * @param hfunc - Kernel to add data to * @param offset - Offset to add data to argument list * @param ptr - Pointer to arbitrary data * @param numbytes - Size of data to copy in bytes * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * \notefnerr * * \sa ::cuFuncSetBlockShape, * ::cuFuncSetSharedSize, * ::cuFuncGetAttribute, * ::cuParamSetSize, * ::cuParamSetf, * ::cuParamSeti, * ::cuLaunch, * ::cuLaunchGrid, * ::cuLaunchGridAsync, * ::cuLaunchKernel */ public static native @Cast("CUresult") int cuParamSetv(CUfunc_st hfunc, int offset, Pointer ptr, @Cast("unsigned int") int numbytes); /** * \brief Launches a CUDA function * * @deprecated * * Invokes the kernel \p f on a 1 x 1 x 1 grid of blocks. The block * contains the number of threads specified by a previous call to * ::cuFuncSetBlockShape(). * * @param f - Kernel to launch * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_LAUNCH_FAILED, * ::CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES, * ::CUDA_ERROR_LAUNCH_TIMEOUT, * ::CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING, * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED * \notefnerr * * \sa ::cuFuncSetBlockShape, * ::cuFuncSetSharedSize, * ::cuFuncGetAttribute, * ::cuParamSetSize, * ::cuParamSetf, * ::cuParamSeti, * ::cuParamSetv, * ::cuLaunchGrid, * ::cuLaunchGridAsync, * ::cuLaunchKernel */ public static native @Cast("CUresult") int cuLaunch(CUfunc_st f); /** * \brief Launches a CUDA function * * @deprecated * * Invokes the kernel \p f on a \p grid_width x \p grid_height grid of * blocks. 
Each block contains the number of threads specified by a previous * call to ::cuFuncSetBlockShape(). * * @param f - Kernel to launch * @param grid_width - Width of grid in blocks * @param grid_height - Height of grid in blocks * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_LAUNCH_FAILED, * ::CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES, * ::CUDA_ERROR_LAUNCH_TIMEOUT, * ::CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING, * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED * \notefnerr * * \sa ::cuFuncSetBlockShape, * ::cuFuncSetSharedSize, * ::cuFuncGetAttribute, * ::cuParamSetSize, * ::cuParamSetf, * ::cuParamSeti, * ::cuParamSetv, * ::cuLaunch, * ::cuLaunchGridAsync, * ::cuLaunchKernel */ public static native @Cast("CUresult") int cuLaunchGrid(CUfunc_st f, int grid_width, int grid_height); /** * \brief Launches a CUDA function * * @deprecated * * Invokes the kernel \p f on a \p grid_width x \p grid_height grid of * blocks. Each block contains the number of threads specified by a previous * call to ::cuFuncSetBlockShape(). * * @param f - Kernel to launch * @param grid_width - Width of grid in blocks * @param grid_height - Height of grid in blocks * @param hStream - Stream identifier * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_HANDLE, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_LAUNCH_FAILED, * ::CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES, * ::CUDA_ERROR_LAUNCH_TIMEOUT, * ::CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING, * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED * * \note In certain cases where cubins are created with no ABI (i.e., using \p ptxas \p --abi-compile \p no), * this function may serialize kernel launches. In order to force the CUDA driver to retain * asynchronous behavior, set the ::CU_CTX_LMEM_RESIZE_TO_MAX flag during context creation (see ::cuCtxCreate). * * \note_null_stream * \notefnerr * * \sa ::cuFuncSetBlockShape, * ::cuFuncSetSharedSize, * ::cuFuncGetAttribute, * ::cuParamSetSize, * ::cuParamSetf, * ::cuParamSeti, * ::cuParamSetv, * ::cuLaunch, * ::cuLaunchGrid, * ::cuLaunchKernel */ public static native @Cast("CUresult") int cuLaunchGridAsync(CUfunc_st f, int grid_width, int grid_height, CUstream_st hStream); /** * \brief Adds a texture-reference to the function's argument list * * @deprecated * * Makes the CUDA array or linear memory bound to the texture reference * \p hTexRef available to a device program as a texture. In this version of * CUDA, the texture-reference must be obtained via ::cuModuleGetTexRef() and * the \p texunit parameter must be set to ::CU_PARAM_TR_DEFAULT. 
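 *
 * As a usage sketch only (not part of the original header documentation), the deprecated
 * parameter-setting path described in this section is typically driven as follows. The
 * handle \p hfunc and the variable \p numElements are hypothetical and assumed to have
 * been set up earlier; the offsets follow the natural alignment of each argument:
 *
 * {@code
 *    // Legacy launch path: explicit parameter offsets, block shape set separately.
 *    cuFuncSetBlockShape(hfunc, 256, 1, 1);
 *    cuParamSeti(hfunc, 0, numElements);               // int argument at offset 0
 *    cuParamSetf(hfunc, 4, 1.5f);                      // float argument at offset 4
 *    cuParamSetSize(hfunc, 8);                         // total parameter size in bytes
 *    cuLaunchGrid(hfunc, (numElements + 255) / 256, 1);
 * }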
 * * @param hfunc - Kernel to add texture-reference to * @param texunit - Texture unit (must be ::CU_PARAM_TR_DEFAULT) * @param hTexRef - Texture-reference to add to argument list * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * \notefnerr */ public static native @Cast("CUresult") int cuParamSetTexRef(CUfunc_st hfunc, int texunit, CUtexref_st hTexRef); /** \} */ /* END CUDA_EXEC_DEPRECATED */ // #if __CUDA_API_VERSION >= 6050 /** * \defgroup CUDA_OCCUPANCY Occupancy * * ___MANBRIEF___ occupancy calculation functions of the low-level CUDA driver * API (___CURRENT_FILE___) ___ENDMANBRIEF___ * * This section describes the occupancy calculation functions of the low-level CUDA * driver application programming interface. * * \{ */ /** * \brief Returns occupancy of a function * * Returns in \p *numBlocks the number of the maximum active blocks per * streaming multiprocessor. * * @param numBlocks - Returned occupancy * @param func - Kernel for which occupancy is calculated * @param blockSize - Block size the kernel is intended to be launched with * @param dynamicSMemSize - Per-block dynamic shared memory usage intended, in bytes * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_UNKNOWN * \notefnerr * */ public static native @Cast("CUresult") int cuOccupancyMaxActiveBlocksPerMultiprocessor(IntPointer numBlocks, CUfunc_st func, int blockSize, @Cast("size_t") long dynamicSMemSize); public static native @Cast("CUresult") int cuOccupancyMaxActiveBlocksPerMultiprocessor(IntBuffer numBlocks, CUfunc_st func, int blockSize, @Cast("size_t") long dynamicSMemSize); public static native @Cast("CUresult") int cuOccupancyMaxActiveBlocksPerMultiprocessor(int[] numBlocks, CUfunc_st func, int blockSize, @Cast("size_t") long dynamicSMemSize); /** * \brief Returns occupancy of a function * * Returns in \p *numBlocks the number of the maximum active blocks per * streaming multiprocessor. * * The \p Flags parameter controls how special cases are handled. The * valid flags are: * * - ::CU_OCCUPANCY_DEFAULT, which maintains the default behavior as * ::cuOccupancyMaxActiveBlocksPerMultiprocessor; * * - ::CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE, which suppresses the * default behavior on platforms where global caching affects * occupancy. On such platforms, if caching is enabled, but * per-block SM resource usage would result in zero occupancy, the * occupancy calculator will calculate the occupancy as if caching * is disabled. Setting ::CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE makes * the occupancy calculator return 0 in such cases. More information * can be found about this feature in the "Unified L1/Texture Cache" * section of the Maxwell tuning guide. 
* * @param numBlocks - Returned occupancy * @param func - Kernel for which occupancy is calculated * @param blockSize - Block size the kernel is intended to be launched with * @param dynamicSMemSize - Per-block dynamic shared memory usage intended, in bytes * @param flags - Requested behavior for the occupancy calculator * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_UNKNOWN * \notefnerr * */ public static native @Cast("CUresult") int cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(IntPointer numBlocks, CUfunc_st func, int blockSize, @Cast("size_t") long dynamicSMemSize, @Cast("unsigned int") int flags); public static native @Cast("CUresult") int cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(IntBuffer numBlocks, CUfunc_st func, int blockSize, @Cast("size_t") long dynamicSMemSize, @Cast("unsigned int") int flags); public static native @Cast("CUresult") int cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int[] numBlocks, CUfunc_st func, int blockSize, @Cast("size_t") long dynamicSMemSize, @Cast("unsigned int") int flags); /** * \brief Suggest a launch configuration with reasonable occupancy * * Returns in \p *blockSize a reasonable block size that can achieve * the maximum occupancy (or, the maximum number of active warps with * the fewest blocks per multiprocessor), and in \p *minGridSize the * minimum grid size to achieve the maximum occupancy. * * If \p blockSizeLimit is 0, the configurator will use the maximum * block size permitted by the device / function instead. * * If per-block dynamic shared memory allocation is not needed, the * user should leave both \p blockSizeToDynamicSMemSize and \p * dynamicSMemSize as 0. * * If per-block dynamic shared memory allocation is needed, then if * the dynamic shared memory size is constant regardless of block * size, the size should be passed through \p dynamicSMemSize, and \p * blockSizeToDynamicSMemSize should be NULL. * * Otherwise, if the per-block dynamic shared memory size varies with * different block sizes, the user needs to provide a unary function * through \p blockSizeToDynamicSMemSize that computes the dynamic * shared memory needed by \p func for any given block size. \p * dynamicSMemSize is ignored. An example signature is: * *
{@code
 *    // Take block size, returns dynamic shared memory needed
 *    size_t blockToSmem(int blockSize);
 * }
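 *
 * As a usage sketch only (not part of the original header documentation), the following
 * shows the configurator driving ::cuLaunchKernel through these bindings. The kernel
 * handle and device pointer are assumed to have been obtained earlier (for example via
 * ::cuModuleGetFunction() and ::cuMemAlloc()), and the helper method name is hypothetical.
 * Dynamic shared memory is constant here, so the size callback is left null; a
 * block-size-dependent requirement could instead be supplied through a
 * CUoccupancyB2DSize callback:
 *
 * {@code
 *    static void launchWithSuggestedConfig(CUfunc_st kernel, long dData, int n) {
 *        int[] minGridSize = new int[1], blockSize = new int[1];
 *        int err = cuOccupancyMaxPotentialBlockSize(minGridSize, blockSize, kernel,
 *                                                   null, 0, 0);
 *        if (err != CUDA_SUCCESS) throw new RuntimeException("occupancy: " + err);
 *
 *        // Each entry of kernelParams points at one argument value.
 *        LongPointer argPtr = new LongPointer(1).put(dData);   // CUdeviceptr argument
 *        IntPointer  argN   = new IntPointer(1).put(n);        // int argument
 *        PointerPointer kernelParams = new PointerPointer(argPtr, argN);
 *
 *        int gridSize = (n + blockSize[0] - 1) / blockSize[0];
 *        err = cuLaunchKernel(kernel, gridSize, 1, 1, blockSize[0], 1, 1,
 *                             0, null, kernelParams, null);    // default stream, no extra
 *        if (err != CUDA_SUCCESS) throw new RuntimeException("launch: " + err);
 *    }
 * }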
 * * @param minGridSize - Returned minimum grid size needed to achieve the maximum occupancy * @param blockSize - Returned maximum block size that can achieve the maximum occupancy * @param func - Kernel for which launch configuration is calculated * @param blockSizeToDynamicSMemSize - A function that calculates how much per-block dynamic shared memory \p func uses based on the block size * @param dynamicSMemSize - Dynamic shared memory usage intended, in bytes * @param blockSizeLimit - The maximum block size \p func is designed to handle * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_UNKNOWN * \notefnerr * */ public static native @Cast("CUresult") int cuOccupancyMaxPotentialBlockSize(IntPointer minGridSize, IntPointer blockSize, CUfunc_st func, CUoccupancyB2DSize blockSizeToDynamicSMemSize, @Cast("size_t") long dynamicSMemSize, int blockSizeLimit); public static native @Cast("CUresult") int cuOccupancyMaxPotentialBlockSize(IntBuffer minGridSize, IntBuffer blockSize, CUfunc_st func, CUoccupancyB2DSize blockSizeToDynamicSMemSize, @Cast("size_t") long dynamicSMemSize, int blockSizeLimit); public static native @Cast("CUresult") int cuOccupancyMaxPotentialBlockSize(int[] minGridSize, int[] blockSize, CUfunc_st func, CUoccupancyB2DSize blockSizeToDynamicSMemSize, @Cast("size_t") long dynamicSMemSize, int blockSizeLimit); /** * \brief Suggest a launch configuration with reasonable occupancy * * An extended version of ::cuOccupancyMaxPotentialBlockSize. In * addition to arguments passed to ::cuOccupancyMaxPotentialBlockSize, * ::cuOccupancyMaxPotentialBlockSizeWithFlags also takes a \p Flags * parameter. * * The \p Flags parameter controls how special cases are handled. The * valid flags are: * * - ::CU_OCCUPANCY_DEFAULT, which maintains the default behavior as * ::cuOccupancyMaxPotentialBlockSize; * * - ::CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE, which suppresses the * default behavior on platforms where global caching affects * occupancy. On such platforms, the launch configurations that * produce maximal occupancy might not support global * caching. Setting ::CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE * guarantees that the produced launch configuration is global * caching compatible at a potential cost of occupancy. More information * can be found about this feature in the "Unified L1/Texture Cache" * section of the Maxwell tuning guide. 
* * @param minGridSize - Returned minimum grid size needed to achieve the maximum occupancy * @param blockSize - Returned maximum block size that can achieve the maximum occupancy * @param func - Kernel for which launch configuration is calculated * @param blockSizeToDynamicSMemSize - A function that calculates how much per-block dynamic shared memory \p func uses based on the block size * @param dynamicSMemSize - Dynamic shared memory usage intended, in bytes * @param blockSizeLimit - The maximum block size \p func is designed to handle * @param flags - Options * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_UNKNOWN * \notefnerr * */ public static native @Cast("CUresult") int cuOccupancyMaxPotentialBlockSizeWithFlags(IntPointer minGridSize, IntPointer blockSize, CUfunc_st func, CUoccupancyB2DSize blockSizeToDynamicSMemSize, @Cast("size_t") long dynamicSMemSize, int blockSizeLimit, @Cast("unsigned int") int flags); public static native @Cast("CUresult") int cuOccupancyMaxPotentialBlockSizeWithFlags(IntBuffer minGridSize, IntBuffer blockSize, CUfunc_st func, CUoccupancyB2DSize blockSizeToDynamicSMemSize, @Cast("size_t") long dynamicSMemSize, int blockSizeLimit, @Cast("unsigned int") int flags); public static native @Cast("CUresult") int cuOccupancyMaxPotentialBlockSizeWithFlags(int[] minGridSize, int[] blockSize, CUfunc_st func, CUoccupancyB2DSize blockSizeToDynamicSMemSize, @Cast("size_t") long dynamicSMemSize, int blockSizeLimit, @Cast("unsigned int") int flags); /** \} */ /* END CUDA_OCCUPANCY */ // #endif /* __CUDA_API_VERSION >= 6050 */ /** * \defgroup CUDA_TEXREF Texture Reference Management * * ___MANBRIEF___ texture reference management functions of the low-level CUDA * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___ * * This section describes the texture reference management functions of the * low-level CUDA driver application programming interface. * * \{ */ /** * \brief Binds an array as a texture reference * * Binds the CUDA array \p hArray to the texture reference \p hTexRef. Any * previous address or CUDA array state associated with the texture reference * is superseded by this function. \p Flags must be set to * ::CU_TRSA_OVERRIDE_FORMAT. Any CUDA array previously bound to \p hTexRef is * unbound. * * @param hTexRef - Texture reference to bind * @param hArray - Array to bind * @param Flags - Options (must be ::CU_TRSA_OVERRIDE_FORMAT) * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * * \sa ::cuTexRefSetAddress, * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat */ public static native @Cast("CUresult") int cuTexRefSetArray(CUtexref_st hTexRef, CUarray_st hArray, @Cast("unsigned int") int Flags); /** * \brief Binds a mipmapped array to a texture reference * * Binds the CUDA mipmapped array \p hMipmappedArray to the texture reference \p hTexRef. * Any previous address or CUDA array state associated with the texture reference * is superseded by this function. \p Flags must be set to ::CU_TRSA_OVERRIDE_FORMAT. * Any CUDA array previously bound to \p hTexRef is unbound. 
* * @param hTexRef - Texture reference to bind * @param hMipmappedArray - Mipmapped array to bind * @param Flags - Options (must be ::CU_TRSA_OVERRIDE_FORMAT) * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * * \sa ::cuTexRefSetAddress, * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat */ public static native @Cast("CUresult") int cuTexRefSetMipmappedArray(CUtexref_st hTexRef, CUmipmappedArray_st hMipmappedArray, @Cast("unsigned int") int Flags); // #if __CUDA_API_VERSION >= 3020 /** * \brief Binds an address as a texture reference * * Binds a linear address range to the texture reference \p hTexRef. Any * previous address or CUDA array state associated with the texture reference * is superseded by this function. Any memory previously bound to \p hTexRef * is unbound. * * Since the hardware enforces an alignment requirement on texture base * addresses, ::cuTexRefSetAddress() passes back a byte offset in * \p *ByteOffset that must be applied to texture fetches in order to read from * the desired memory. This offset must be divided by the texel size and * passed to kernels that read from the texture so they can be applied to the * ::tex1Dfetch() function. * * If the device memory pointer was returned from ::cuMemAlloc(), the offset * is guaranteed to be 0 and NULL may be passed as the \p ByteOffset parameter. * * The total number of elements (or texels) in the linear address range * cannot exceed ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH. * The number of elements is computed as (\p bytes / bytesPerElement), * where bytesPerElement is determined from the data format and number of * components set using ::cuTexRefSetFormat(). * * @param ByteOffset - Returned byte offset * @param hTexRef - Texture reference to bind * @param dptr - Device pointer to bind * @param bytes - Size of memory to bind in bytes * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * * \sa ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat */ public static native @Cast("CUresult") int cuTexRefSetAddress(@Cast("size_t*") SizeTPointer ByteOffset, CUtexref_st hTexRef, @Cast("CUdeviceptr") long dptr, @Cast("size_t") long bytes); /** * \brief Binds an address as a 2D texture reference * * Binds a linear address range to the texture reference \p hTexRef. Any * previous address or CUDA array state associated with the texture reference * is superseded by this function. Any memory previously bound to \p hTexRef * is unbound. * * Using a ::tex2D() function inside a kernel requires a call to either * ::cuTexRefSetArray() to bind the corresponding texture reference to an * array, or ::cuTexRefSetAddress2D() to bind the texture reference to linear * memory. * * Function calls to ::cuTexRefSetFormat() cannot follow calls to * ::cuTexRefSetAddress2D() for the same texture reference. * * It is required that \p dptr be aligned to the appropriate hardware-specific * texture alignment. 
You can query this value using the device attribute * ::CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT. If an unaligned \p dptr is * supplied, ::CUDA_ERROR_INVALID_VALUE is returned. * * \p Pitch has to be aligned to the hardware-specific texture pitch alignment. * This value can be queried using the device attribute * ::CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT. If an unaligned \p Pitch is * supplied, ::CUDA_ERROR_INVALID_VALUE is returned. * * Width and Height, which are specified in elements (or texels), cannot exceed * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH and * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT respectively. * \p Pitch, which is specified in bytes, cannot exceed * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH. * * @param hTexRef - Texture reference to bind * @param desc - Descriptor of CUDA array * @param dptr - Device pointer to bind * @param Pitch - Line pitch in bytes * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * * \sa ::cuTexRefSetAddress, * ::cuTexRefSetAddressMode, ::cuTexRefSetArray, * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat */ public static native @Cast("CUresult") int cuTexRefSetAddress2D(CUtexref_st hTexRef, @Const CUDA_ARRAY_DESCRIPTOR desc, @Cast("CUdeviceptr") long dptr, @Cast("size_t") long Pitch); // #endif /* __CUDA_API_VERSION >= 3020 */ /** * \brief Sets the format for a texture reference * * Specifies the format of the data to be read by the texture reference * \p hTexRef. \p fmt and \p NumPackedComponents are exactly analogous to the * ::Format and ::NumChannels members of the ::CUDA_ARRAY_DESCRIPTOR structure: * They specify the format of each component and the number of components per * array element. * * @param hTexRef - Texture reference * @param fmt - Format to set * @param NumPackedComponents - Number of components per array element * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * * \sa ::cuTexRefSetAddress, * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat */ public static native @Cast("CUresult") int cuTexRefSetFormat(CUtexref_st hTexRef, @Cast("CUarray_format") int fmt, int NumPackedComponents); /** * \brief Sets the addressing mode for a texture reference * * Specifies the addressing mode \p am for the given dimension \p dim of the * texture reference \p hTexRef. If \p dim is zero, the addressing mode is * applied to the first parameter of the functions used to fetch from the * texture; if \p dim is 1, the second, and so on. ::CUaddress_mode is defined * as: *
{@code
   typedef enum CUaddress_mode_enum {
      CU_TR_ADDRESS_MODE_WRAP = 0,
      CU_TR_ADDRESS_MODE_CLAMP = 1,
      CU_TR_ADDRESS_MODE_MIRROR = 2,
      CU_TR_ADDRESS_MODE_BORDER = 3
   } CUaddress_mode;
 * }
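 *
 * For example (a usage sketch, not part of the original header documentation; \p texRef
 * stands for a texture reference obtained earlier, e.g. from ::cuModuleGetTexRef(), and
 * the CU_TR_* constants are the enum values generated in this class):
 *
 * {@code
 *    // Clamp out-of-range coordinates in both the x (dim 0) and y (dim 1) dimensions.
 *    cuTexRefSetAddressMode(texRef, 0, CU_TR_ADDRESS_MODE_CLAMP);
 *    cuTexRefSetAddressMode(texRef, 1, CU_TR_ADDRESS_MODE_CLAMP);
 * }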
* * Note that this call has no effect if \p hTexRef is bound to linear memory. * Also, if the flag, ::CU_TRSF_NORMALIZED_COORDINATES, is not set, the only * supported address mode is ::CU_TR_ADDRESS_MODE_CLAMP. * * @param hTexRef - Texture reference * @param dim - Dimension * @param am - Addressing mode to set * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * * \sa ::cuTexRefSetAddress, * ::cuTexRefSetAddress2D, ::cuTexRefSetArray, * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat */ public static native @Cast("CUresult") int cuTexRefSetAddressMode(CUtexref_st hTexRef, int dim, @Cast("CUaddress_mode") int am); /** * \brief Sets the filtering mode for a texture reference * * Specifies the filtering mode \p fm to be used when reading memory through * the texture reference \p hTexRef. ::CUfilter_mode_enum is defined as: * *
{@code
   typedef enum CUfilter_mode_enum {
      CU_TR_FILTER_MODE_POINT = 0,
      CU_TR_FILTER_MODE_LINEAR = 1
   } CUfilter_mode;
 * }
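 *
 * For example (a usage sketch, not part of the original header documentation; \p texRef
 * and \p hArray are handles obtained earlier):
 *
 * {@code
 *    cuTexRefSetArray(texRef, hArray, CU_TRSA_OVERRIDE_FORMAT);   // bind the CUDA array
 *    cuTexRefSetFilterMode(texRef, CU_TR_FILTER_MODE_LINEAR);     // bilinear fetches
 * }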
* * Note that this call has no effect if \p hTexRef is bound to linear memory. * * @param hTexRef - Texture reference * @param fm - Filtering mode to set * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * * \sa ::cuTexRefSetAddress, * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, * ::cuTexRefSetFlags, ::cuTexRefSetFormat, * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat */ public static native @Cast("CUresult") int cuTexRefSetFilterMode(CUtexref_st hTexRef, @Cast("CUfilter_mode") int fm); /** * \brief Sets the mipmap filtering mode for a texture reference * * Specifies the mipmap filtering mode \p fm to be used when reading memory through * the texture reference \p hTexRef. ::CUfilter_mode_enum is defined as: * *
{@code
   typedef enum CUfilter_mode_enum {
      CU_TR_FILTER_MODE_POINT = 0,
      CU_TR_FILTER_MODE_LINEAR = 1
   } CUfilter_mode;
 * }
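 *
 * A usage sketch (not part of the original header documentation; \p texRef and
 * \p hMipmappedArray are handles obtained earlier):
 *
 * {@code
 *    cuTexRefSetMipmappedArray(texRef, hMipmappedArray, CU_TRSA_OVERRIDE_FORMAT);
 *    cuTexRefSetMipmapFilterMode(texRef, CU_TR_FILTER_MODE_LINEAR);   // filter between levels
 *    cuTexRefSetMipmapLevelClamp(texRef, 0.0f, 8.0f);                 // restrict to levels 0..8
 * }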
* * Note that this call has no effect if \p hTexRef is not bound to a mipmapped array. * * @param hTexRef - Texture reference * @param fm - Filtering mode to set * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * * \sa ::cuTexRefSetAddress, * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, * ::cuTexRefSetFlags, ::cuTexRefSetFormat, * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat */ public static native @Cast("CUresult") int cuTexRefSetMipmapFilterMode(CUtexref_st hTexRef, @Cast("CUfilter_mode") int fm); /** * \brief Sets the mipmap level bias for a texture reference * * Specifies the mipmap level bias \p bias to be added to the specified mipmap level when * reading memory through the texture reference \p hTexRef. * * Note that this call has no effect if \p hTexRef is not bound to a mipmapped array. * * @param hTexRef - Texture reference * @param bias - Mipmap level bias * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * * \sa ::cuTexRefSetAddress, * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, * ::cuTexRefSetFlags, ::cuTexRefSetFormat, * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat */ public static native @Cast("CUresult") int cuTexRefSetMipmapLevelBias(CUtexref_st hTexRef, float bias); /** * \brief Sets the mipmap min/max mipmap level clamps for a texture reference * * Specifies the min/max mipmap level clamps, \p minMipmapLevelClamp and \p maxMipmapLevelClamp * respectively, to be used when reading memory through the texture reference * \p hTexRef. * * Note that this call has no effect if \p hTexRef is not bound to a mipmapped array. * * @param hTexRef - Texture reference * @param minMipmapLevelClamp - Mipmap min level clamp * @param maxMipmapLevelClamp - Mipmap max level clamp * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * * \sa ::cuTexRefSetAddress, * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, * ::cuTexRefSetFlags, ::cuTexRefSetFormat, * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat */ public static native @Cast("CUresult") int cuTexRefSetMipmapLevelClamp(CUtexref_st hTexRef, float minMipmapLevelClamp, float maxMipmapLevelClamp); /** * \brief Sets the maximum anisotropy for a texture reference * * Specifies the maximum anisotropy \p maxAniso to be used when reading memory through * the texture reference \p hTexRef. * * Note that this call has no effect if \p hTexRef is bound to linear memory. 
* * @param hTexRef - Texture reference * @param maxAniso - Maximum anisotropy * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * * \sa ::cuTexRefSetAddress, * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, * ::cuTexRefSetFlags, ::cuTexRefSetFormat, * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat */ public static native @Cast("CUresult") int cuTexRefSetMaxAnisotropy(CUtexref_st hTexRef, @Cast("unsigned int") int maxAniso); /** * \brief Sets the border color for a texture reference * * Specifies the value of the RGBA color via the \p pBorderColor to the texture reference * \p hTexRef. The color value supports only float type and holds color components in * the following sequence: * pBorderColor[0] holds 'R' component * pBorderColor[1] holds 'G' component * pBorderColor[2] holds 'B' component * pBorderColor[3] holds 'A' component * * Note that the color values can be set only when the Address mode is set to * CU_TR_ADDRESS_MODE_BORDER using ::cuTexRefSetAddressMode. * Applications using integer border color values have to "reinterpret_cast" their values to float. * * @param hTexRef - Texture reference * @param pBorderColor - RGBA color * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * * \sa ::cuTexRefSetAddressMode, * ::cuTexRefGetAddressMode, ::cuTexRefGetBorderColor */ public static native @Cast("CUresult") int cuTexRefSetBorderColor(CUtexref_st hTexRef, FloatPointer pBorderColor); public static native @Cast("CUresult") int cuTexRefSetBorderColor(CUtexref_st hTexRef, FloatBuffer pBorderColor); public static native @Cast("CUresult") int cuTexRefSetBorderColor(CUtexref_st hTexRef, float[] pBorderColor); /** * \brief Sets the flags for a texture reference * * Specifies optional flags via \p Flags to specify the behavior of data * returned through the texture reference \p hTexRef. The valid flags are: * * - ::CU_TRSF_READ_AS_INTEGER, which suppresses the default behavior of * having the texture promote integer data to floating point data in the * range [0, 1]. Note that texture with 32-bit integer format * would not be promoted, regardless of whether or not this * flag is specified; * - ::CU_TRSF_NORMALIZED_COORDINATES, which suppresses the * default behavior of having the texture coordinates range * from [0, Dim) where Dim is the width or height of the CUDA * array. 
Instead, the texture coordinates [0, 1.0) reference * the entire breadth of the array dimension; * * @param hTexRef - Texture reference * @param Flags - Optional flags to set * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * * \sa ::cuTexRefSetAddress, * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, * ::cuTexRefSetFilterMode, ::cuTexRefSetFormat, * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat */ public static native @Cast("CUresult") int cuTexRefSetFlags(CUtexref_st hTexRef, @Cast("unsigned int") int Flags); // #if __CUDA_API_VERSION >= 3020 /** * \brief Gets the address associated with a texture reference * * Returns in \p *pdptr the base address bound to the texture reference * \p hTexRef, or returns ::CUDA_ERROR_INVALID_VALUE if the texture reference * is not bound to any device memory range. * * @param pdptr - Returned device address * @param hTexRef - Texture reference * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * * \sa ::cuTexRefSetAddress, * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, * ::cuTexRefGetAddressMode, ::cuTexRefGetArray, * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat */ public static native @Cast("CUresult") int cuTexRefGetAddress(@Cast("CUdeviceptr*") LongPointer pdptr, CUtexref_st hTexRef); public static native @Cast("CUresult") int cuTexRefGetAddress(@Cast("CUdeviceptr*") LongBuffer pdptr, CUtexref_st hTexRef); public static native @Cast("CUresult") int cuTexRefGetAddress(@Cast("CUdeviceptr*") long[] pdptr, CUtexref_st hTexRef); // #endif /* __CUDA_API_VERSION >= 3020 */ /** * \brief Gets the array bound to a texture reference * * Returns in \p *phArray the CUDA array bound to the texture reference * \p hTexRef, or returns ::CUDA_ERROR_INVALID_VALUE if the texture reference * is not bound to any CUDA array. * * @param phArray - Returned array * @param hTexRef - Texture reference * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * * \sa ::cuTexRefSetAddress, * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat */ public static native @Cast("CUresult") int cuTexRefGetArray(@ByPtrPtr CUarray_st phArray, CUtexref_st hTexRef); /** * \brief Gets the mipmapped array bound to a texture reference * * Returns in \p *phMipmappedArray the CUDA mipmapped array bound to the texture * reference \p hTexRef, or returns ::CUDA_ERROR_INVALID_VALUE if the texture reference * is not bound to any CUDA mipmapped array. 
* * @param phMipmappedArray - Returned mipmapped array * @param hTexRef - Texture reference * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * * \sa ::cuTexRefSetAddress, * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat */ public static native @Cast("CUresult") int cuTexRefGetMipmappedArray(@ByPtrPtr CUmipmappedArray_st phMipmappedArray, CUtexref_st hTexRef); /** * \brief Gets the addressing mode used by a texture reference * * Returns in \p *pam the addressing mode corresponding to the * dimension \p dim of the texture reference \p hTexRef. Currently, the only * valid value for \p dim are 0 and 1. * * @param pam - Returned addressing mode * @param hTexRef - Texture reference * @param dim - Dimension * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * * \sa ::cuTexRefSetAddress, * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, * ::cuTexRefGetAddress, ::cuTexRefGetArray, * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat */ public static native @Cast("CUresult") int cuTexRefGetAddressMode(@Cast("CUaddress_mode*") IntPointer pam, CUtexref_st hTexRef, int dim); public static native @Cast("CUresult") int cuTexRefGetAddressMode(@Cast("CUaddress_mode*") IntBuffer pam, CUtexref_st hTexRef, int dim); public static native @Cast("CUresult") int cuTexRefGetAddressMode(@Cast("CUaddress_mode*") int[] pam, CUtexref_st hTexRef, int dim); /** * \brief Gets the filter-mode used by a texture reference * * Returns in \p *pfm the filtering mode of the texture reference * \p hTexRef. * * @param pfm - Returned filtering mode * @param hTexRef - Texture reference * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * * \sa ::cuTexRefSetAddress, * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, * ::cuTexRefGetFlags, ::cuTexRefGetFormat */ public static native @Cast("CUresult") int cuTexRefGetFilterMode(@Cast("CUfilter_mode*") IntPointer pfm, CUtexref_st hTexRef); public static native @Cast("CUresult") int cuTexRefGetFilterMode(@Cast("CUfilter_mode*") IntBuffer pfm, CUtexref_st hTexRef); public static native @Cast("CUresult") int cuTexRefGetFilterMode(@Cast("CUfilter_mode*") int[] pfm, CUtexref_st hTexRef); /** * \brief Gets the format used by a texture reference * * Returns in \p *pFormat and \p *pNumChannels the format and number * of components of the CUDA array bound to the texture reference \p hTexRef. * If \p pFormat or \p pNumChannels is NULL, it will be ignored. 
* * @param pFormat - Returned format * @param pNumChannels - Returned number of components * @param hTexRef - Texture reference * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * * \sa ::cuTexRefSetAddress, * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags */ public static native @Cast("CUresult") int cuTexRefGetFormat(@Cast("CUarray_format*") IntPointer pFormat, IntPointer pNumChannels, CUtexref_st hTexRef); public static native @Cast("CUresult") int cuTexRefGetFormat(@Cast("CUarray_format*") IntBuffer pFormat, IntBuffer pNumChannels, CUtexref_st hTexRef); public static native @Cast("CUresult") int cuTexRefGetFormat(@Cast("CUarray_format*") int[] pFormat, int[] pNumChannels, CUtexref_st hTexRef); /** * \brief Gets the mipmap filtering mode for a texture reference * * Returns the mipmap filtering mode in \p pfm that's used when reading memory through * the texture reference \p hTexRef. * * @param pfm - Returned mipmap filtering mode * @param hTexRef - Texture reference * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * * \sa ::cuTexRefSetAddress, * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, * ::cuTexRefSetFlags, ::cuTexRefSetFormat, * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat */ public static native @Cast("CUresult") int cuTexRefGetMipmapFilterMode(@Cast("CUfilter_mode*") IntPointer pfm, CUtexref_st hTexRef); public static native @Cast("CUresult") int cuTexRefGetMipmapFilterMode(@Cast("CUfilter_mode*") IntBuffer pfm, CUtexref_st hTexRef); public static native @Cast("CUresult") int cuTexRefGetMipmapFilterMode(@Cast("CUfilter_mode*") int[] pfm, CUtexref_st hTexRef); /** * \brief Gets the mipmap level bias for a texture reference * * Returns the mipmap level bias in \p pBias that's added to the specified mipmap * level when reading memory through the texture reference \p hTexRef. * * @param pbias - Returned mipmap level bias * @param hTexRef - Texture reference * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * * \sa ::cuTexRefSetAddress, * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, * ::cuTexRefSetFlags, ::cuTexRefSetFormat, * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat */ public static native @Cast("CUresult") int cuTexRefGetMipmapLevelBias(FloatPointer pbias, CUtexref_st hTexRef); public static native @Cast("CUresult") int cuTexRefGetMipmapLevelBias(FloatBuffer pbias, CUtexref_st hTexRef); public static native @Cast("CUresult") int cuTexRefGetMipmapLevelBias(float[] pbias, CUtexref_st hTexRef); /** * \brief Gets the min/max mipmap level clamps for a texture reference * * Returns the min/max mipmap level clamps in \p pminMipmapLevelClamp and \p pmaxMipmapLevelClamp * that's used when reading memory through the texture reference \p hTexRef. 
* * @param pminMipmapLevelClamp - Returned mipmap min level clamp * @param pmaxMipmapLevelClamp - Returned mipmap max level clamp * @param hTexRef - Texture reference * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * * \sa ::cuTexRefSetAddress, * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, * ::cuTexRefSetFlags, ::cuTexRefSetFormat, * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat */ public static native @Cast("CUresult") int cuTexRefGetMipmapLevelClamp(FloatPointer pminMipmapLevelClamp, FloatPointer pmaxMipmapLevelClamp, CUtexref_st hTexRef); public static native @Cast("CUresult") int cuTexRefGetMipmapLevelClamp(FloatBuffer pminMipmapLevelClamp, FloatBuffer pmaxMipmapLevelClamp, CUtexref_st hTexRef); public static native @Cast("CUresult") int cuTexRefGetMipmapLevelClamp(float[] pminMipmapLevelClamp, float[] pmaxMipmapLevelClamp, CUtexref_st hTexRef); /** * \brief Gets the maximum anisotropy for a texture reference * * Returns the maximum anisotropy in \p pmaxAniso that's used when reading memory through * the texture reference \p hTexRef. * * @param pmaxAniso - Returned maximum anisotropy * @param hTexRef - Texture reference * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * * \sa ::cuTexRefSetAddress, * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, * ::cuTexRefSetFlags, ::cuTexRefSetFormat, * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat */ public static native @Cast("CUresult") int cuTexRefGetMaxAnisotropy(IntPointer pmaxAniso, CUtexref_st hTexRef); public static native @Cast("CUresult") int cuTexRefGetMaxAnisotropy(IntBuffer pmaxAniso, CUtexref_st hTexRef); public static native @Cast("CUresult") int cuTexRefGetMaxAnisotropy(int[] pmaxAniso, CUtexref_st hTexRef); /** * \brief Gets the border color used by a texture reference * * Returns in \p pBorderColor, values of the RGBA color used by * the texture reference \p hTexRef. * The color value is of type float and holds color components in * the following sequence: * pBorderColor[0] holds 'R' component * pBorderColor[1] holds 'G' component * pBorderColor[2] holds 'B' component * pBorderColor[3] holds 'A' component * * @param hTexRef - Texture reference * @param pBorderColor - Returned Type and Value of RGBA color * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * * \sa ::cuTexRefSetAddressMode, * ::cuTexRefSetAddressMode, ::cuTexRefSetBorderColor */ public static native @Cast("CUresult") int cuTexRefGetBorderColor(FloatPointer pBorderColor, CUtexref_st hTexRef); public static native @Cast("CUresult") int cuTexRefGetBorderColor(FloatBuffer pBorderColor, CUtexref_st hTexRef); public static native @Cast("CUresult") int cuTexRefGetBorderColor(float[] pBorderColor, CUtexref_st hTexRef); /** * \brief Gets the flags used by a texture reference * * Returns in \p *pFlags the flags of the texture reference \p hTexRef. 
* * @param pFlags - Returned flags * @param hTexRef - Texture reference * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * * \sa ::cuTexRefSetAddress, * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, * ::cuTexRefGetFilterMode, ::cuTexRefGetFormat */ public static native @Cast("CUresult") int cuTexRefGetFlags(@Cast("unsigned int*") IntPointer pFlags, CUtexref_st hTexRef); public static native @Cast("CUresult") int cuTexRefGetFlags(@Cast("unsigned int*") IntBuffer pFlags, CUtexref_st hTexRef); public static native @Cast("CUresult") int cuTexRefGetFlags(@Cast("unsigned int*") int[] pFlags, CUtexref_st hTexRef); /** \} */ /* END CUDA_TEXREF */ /** * \defgroup CUDA_TEXREF_DEPRECATED Texture Reference Management [DEPRECATED] * * ___MANBRIEF___ deprecated texture reference management functions of the * low-level CUDA driver API (___CURRENT_FILE___) ___ENDMANBRIEF___ * * This section describes the deprecated texture reference management * functions of the low-level CUDA driver application programming interface. * * \{ */ /** * \brief Creates a texture reference * * @deprecated * * Creates a texture reference and returns its handle in \p *pTexRef. Once * created, the application must call ::cuTexRefSetArray() or * ::cuTexRefSetAddress() to associate the reference with allocated memory. * Other texture reference functions are used to specify the format and * interpretation (addressing, filtering, etc.) to be used when the memory is * read through this texture reference. * * @param pTexRef - Returned texture reference * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * * \sa ::cuTexRefDestroy */ public static native @Cast("CUresult") int cuTexRefCreate(@ByPtrPtr CUtexref_st pTexRef); /** * \brief Destroys a texture reference * * @deprecated * * Destroys the texture reference specified by \p hTexRef. * * @param hTexRef - Texture reference to destroy * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * * \sa ::cuTexRefCreate */ public static native @Cast("CUresult") int cuTexRefDestroy(CUtexref_st hTexRef); /** \} */ /* END CUDA_TEXREF_DEPRECATED */ /** * \defgroup CUDA_SURFREF Surface Reference Management * * ___MANBRIEF___ surface reference management functions of the low-level CUDA * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___ * * This section describes the surface reference management functions of the * low-level CUDA driver application programming interface. * * \{ */ /** * \brief Sets the CUDA array for a surface reference. * * Sets the CUDA array \p hArray to be read and written by the surface reference * \p hSurfRef. Any previous CUDA array state associated with the surface * reference is superseded by this function. \p Flags must be set to 0. * The ::CUDA_ARRAY3D_SURFACE_LDST flag must have been set for the CUDA array. * Any CUDA array previously bound to \p hSurfRef is unbound.
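 *
 * A usage sketch (not part of the original header documentation; \p hSurfRef comes from
 * ::cuModuleGetSurfRef() and \p hArray was created with the ::CUDA_ARRAY3D_SURFACE_LDST flag):
 *
 * {@code
 *    int err = cuSurfRefSetArray(hSurfRef, hArray, 0);   // Flags must be 0
 *    if (err != CUDA_SUCCESS) throw new RuntimeException("cuSurfRefSetArray: " + err);
 * }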

* @param hSurfRef - Surface reference handle * @param hArray - CUDA array handle * @param Flags - set to 0 * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * * \sa ::cuModuleGetSurfRef, ::cuSurfRefGetArray */ public static native @Cast("CUresult") int cuSurfRefSetArray(CUsurfref_st hSurfRef, CUarray_st hArray, @Cast("unsigned int") int Flags); /** * \brief Passes back the CUDA array bound to a surface reference. * * Returns in \p *phArray the CUDA array bound to the surface reference * \p hSurfRef, or returns ::CUDA_ERROR_INVALID_VALUE if the surface reference * is not bound to any CUDA array.

* @param phArray - Surface reference handle * @param hSurfRef - Surface reference handle * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * * \sa ::cuModuleGetSurfRef, ::cuSurfRefSetArray */ public static native @Cast("CUresult") int cuSurfRefGetArray(@ByPtrPtr CUarray_st phArray, CUsurfref_st hSurfRef); /** \} */ /* END CUDA_SURFREF */ // #if __CUDA_API_VERSION >= 5000 /** * \defgroup CUDA_TEXOBJECT Texture Object Management * * ___MANBRIEF___ texture object management functions of the low-level CUDA * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___ * * This section describes the texture object management functions of the * low-level CUDA driver application programming interface. The texture * object API is only supported on devices of compute capability 3.0 or higher. * * \{ */ /** * \brief Creates a texture object * * Creates a texture object and returns it in \p pTexObject. \p pResDesc describes * the data to texture from. \p pTexDesc describes how the data should be sampled. * \p pResViewDesc is an optional argument that specifies an alternate format for * the data described by \p pResDesc, and also describes the subresource region * to restrict access to when texturing. \p pResViewDesc can only be specified if * the type of resource is a CUDA array or a CUDA mipmapped array. * * Texture objects are only supported on devices of compute capability 3.0 or higher. * Additionally, a texture object is an opaque value, and, as such, should only be * accessed through CUDA API calls. * * The ::CUDA_RESOURCE_DESC structure is defined as: *

{@code
        typedef struct CUDA_RESOURCE_DESC_st
        {
            CUresourcetype resType;

            union {
                struct {
                    CUarray hArray;
                } array;
                struct {
                    CUmipmappedArray hMipmappedArray;
                } mipmap;
                struct {
                    CUdeviceptr devPtr;
                    CUarray_format format;
                    unsigned int numChannels;
                    size_t sizeInBytes;
                } linear;
                struct {
                    CUdeviceptr devPtr;
                    CUarray_format format;
                    unsigned int numChannels;
                    size_t width;
                    size_t height;
                    size_t pitchInBytes;
                } pitch2D;
            } res;

            unsigned int flags;
        } CUDA_RESOURCE_DESC;

 * }
* where: * - ::CUDA_RESOURCE_DESC::resType specifies the type of resource to texture from. * CUresourceType is defined as: *
{@code
        typedef enum CUresourcetype_enum {
            CU_RESOURCE_TYPE_ARRAY           = 0x00,
            CU_RESOURCE_TYPE_MIPMAPPED_ARRAY = 0x01,
            CU_RESOURCE_TYPE_LINEAR          = 0x02,
            CU_RESOURCE_TYPE_PITCH2D         = 0x03
        } CUresourcetype;
 * }
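 *
 * A sketch of filling the generated Java mirror of this structure for the
 * ::CU_RESOURCE_TYPE_ARRAY case described below (not part of the original header
 * documentation; the flattened accessor names for the anonymous union, such as
 * res_array_hArray(), are an assumption about how JavaCPP maps it, so check the
 * generated CUDA_RESOURCE_DESC class in this file):
 *
 * {@code
 *    CUDA_RESOURCE_DESC resDesc = new CUDA_RESOURCE_DESC();
 *    resDesc.zero();                              // clear all fields first
 *    resDesc.resType(CU_RESOURCE_TYPE_ARRAY);
 *    resDesc.res_array_hArray(hArray);            // hypothetical CUarray_st handle
 *    resDesc.flags(0);                            // must be zero
 * }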
* * \par * If ::CUDA_RESOURCE_DESC::resType is set to ::CU_RESOURCE_TYPE_ARRAY, ::CUDA_RESOURCE_DESC::res::array::hArray * must be set to a valid CUDA array handle. * * \par * If ::CUDA_RESOURCE_DESC::resType is set to ::CU_RESOURCE_TYPE_MIPMAPPED_ARRAY, ::CUDA_RESOURCE_DESC::res::mipmap::hMipmappedArray * must be set to a valid CUDA mipmapped array handle. * * \par * If ::CUDA_RESOURCE_DESC::resType is set to ::CU_RESOURCE_TYPE_LINEAR, ::CUDA_RESOURCE_DESC::res::linear::devPtr * must be set to a valid device pointer, that is aligned to ::CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT. * ::CUDA_RESOURCE_DESC::res::linear::format and ::CUDA_RESOURCE_DESC::res::linear::numChannels * describe the format of each component and the number of components per array element. ::CUDA_RESOURCE_DESC::res::linear::sizeInBytes * specifies the size of the array in bytes. The total number of elements in the linear address range cannot exceed * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH. The number of elements is computed as (sizeInBytes / (sizeof(format) * numChannels)). * * \par * If ::CUDA_RESOURCE_DESC::resType is set to ::CU_RESOURCE_TYPE_PITCH2D, ::CUDA_RESOURCE_DESC::res::pitch2D::devPtr * must be set to a valid device pointer, that is aligned to ::CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT. * ::CUDA_RESOURCE_DESC::res::pitch2D::format and ::CUDA_RESOURCE_DESC::res::pitch2D::numChannels * describe the format of each component and the number of components per array element. ::CUDA_RESOURCE_DESC::res::pitch2D::width * and ::CUDA_RESOURCE_DESC::res::pitch2D::height specify the width and height of the array in elements, and cannot exceed * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH and ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT respectively. * ::CUDA_RESOURCE_DESC::res::pitch2D::pitchInBytes specifies the pitch between two rows in bytes and has to be aligned to * ::CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT. Pitch cannot exceed ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH. * * - ::flags must be set to zero. * * * The ::CUDA_TEXTURE_DESC struct is defined as *
{@code
        typedef struct CUDA_TEXTURE_DESC_st {
            CUaddress_mode addressMode[3];
            CUfilter_mode filterMode;
            unsigned int flags;
            unsigned int maxAnisotropy;
            CUfilter_mode mipmapFilterMode;
            float mipmapLevelBias;
            float minMipmapLevelClamp;
            float maxMipmapLevelClamp;
        } CUDA_TEXTURE_DESC;
 * }
* where * - ::CUDA_TEXTURE_DESC::addressMode specifies the addressing mode for each dimension of the texture data. ::CUaddress_mode is defined as: *
{@code
        typedef enum CUaddress_mode_enum {
            CU_TR_ADDRESS_MODE_WRAP = 0,
            CU_TR_ADDRESS_MODE_CLAMP = 1,
            CU_TR_ADDRESS_MODE_MIRROR = 2,
            CU_TR_ADDRESS_MODE_BORDER = 3
        } CUaddress_mode;
 *   }
* This is ignored if ::CUDA_RESOURCE_DESC::resType is ::CU_RESOURCE_TYPE_LINEAR. Also, if the flag, ::CU_TRSF_NORMALIZED_COORDINATES * is not set, the only supported address mode is ::CU_TR_ADDRESS_MODE_CLAMP. * * - ::CUDA_TEXTURE_DESC::filterMode specifies the filtering mode to be used when fetching from the texture. CUfilter_mode is defined as: *
{@code
        typedef enum CUfilter_mode_enum {
            CU_TR_FILTER_MODE_POINT = 0,
            CU_TR_FILTER_MODE_LINEAR = 1
        } CUfilter_mode;
 *   }
* This is ignored if ::CUDA_RESOURCE_DESC::resType is ::CU_RESOURCE_TYPE_LINEAR. * * - ::CUDA_TEXTURE_DESC::flags can be any combination of the following: * - ::CU_TRSF_READ_AS_INTEGER, which suppresses the default behavior of having the texture promote integer data to floating point data in the * range [0, 1]. Note that texture with 32-bit integer format would not be promoted, regardless of whether or not this flag is specified. * - ::CU_TRSF_NORMALIZED_COORDINATES, which suppresses the default behavior of having the texture coordinates range from [0, Dim) where Dim is * the width or height of the CUDA array. Instead, the texture coordinates [0, 1.0) reference the entire breadth of the array dimension; Note * that for CUDA mipmapped arrays, this flag has to be set. * * - ::CUDA_TEXTURE_DESC::maxAnisotropy specifies the maximum anisotropy ratio to be used when doing anisotropic filtering. This value will be * clamped to the range [1,16]. * * - ::CUDA_TEXTURE_DESC::mipmapFilterMode specifies the filter mode when the calculated mipmap level lies between two defined mipmap levels. * * - ::CUDA_TEXTURE_DESC::mipmapLevelBias specifies the offset to be applied to the calculated mipmap level. * * - ::CUDA_TEXTURE_DESC::minMipmapLevelClamp specifies the lower end of the mipmap level range to clamp access to. * * - ::CUDA_TEXTURE_DESC::maxMipmapLevelClamp specifies the upper end of the mipmap level range to clamp access to. * * * The ::CUDA_RESOURCE_VIEW_DESC struct is defined as *
{@code
        typedef struct CUDA_RESOURCE_VIEW_DESC_st
        {
            CUresourceViewFormat format;
            size_t width;
            size_t height;
            size_t depth;
            unsigned int firstMipmapLevel;
            unsigned int lastMipmapLevel;
            unsigned int firstLayer;
            unsigned int lastLayer;
        } CUDA_RESOURCE_VIEW_DESC;
 * }
* where: * - ::CUDA_RESOURCE_VIEW_DESC::format specifies how the data contained in the CUDA array or CUDA mipmapped array should * be interpreted. Note that this can incur a change in size of the texture data. If the resource view format is a block * compressed format, then the underlying CUDA array or CUDA mipmapped array has to have a base of format ::CU_AD_FORMAT_UNSIGNED_INT32. * with 2 or 4 channels, depending on the block compressed format. For ex., BC1 and BC4 require the underlying CUDA array to have * a format of ::CU_AD_FORMAT_UNSIGNED_INT32 with 2 channels. The other BC formats require the underlying resource to have the same base * format but with 4 channels. * * - ::CUDA_RESOURCE_VIEW_DESC::width specifies the new width of the texture data. If the resource view format is a block * compressed format, this value has to be 4 times the original width of the resource. For non block compressed formats, * this value has to be equal to that of the original resource. * * - ::CUDA_RESOURCE_VIEW_DESC::height specifies the new height of the texture data. If the resource view format is a block * compressed format, this value has to be 4 times the original height of the resource. For non block compressed formats, * this value has to be equal to that of the original resource. * * - ::CUDA_RESOURCE_VIEW_DESC::depth specifies the new depth of the texture data. This value has to be equal to that of the * original resource. * * - ::CUDA_RESOURCE_VIEW_DESC::firstMipmapLevel specifies the most detailed mipmap level. This will be the new mipmap level zero. * For non-mipmapped resources, this value has to be zero.::CUDA_TEXTURE_DESC::minMipmapLevelClamp and ::CUDA_TEXTURE_DESC::maxMipmapLevelClamp * will be relative to this value. For ex., if the firstMipmapLevel is set to 2, and a minMipmapLevelClamp of 1.2 is specified, * then the actual minimum mipmap level clamp will be 3.2. * * - ::CUDA_RESOURCE_VIEW_DESC::lastMipmapLevel specifies the least detailed mipmap level. For non-mipmapped resources, this value * has to be zero. * * - ::CUDA_RESOURCE_VIEW_DESC::firstLayer specifies the first layer index for layered textures. This will be the new layer zero. * For non-layered resources, this value has to be zero. * * - ::CUDA_RESOURCE_VIEW_DESC::lastLayer specifies the last layer index for layered textures. For non-layered resources, * this value has to be zero. * * * @param pTexObject - Texture object to create * @param pResDesc - Resource descriptor * @param pTexDesc - Texture descriptor * @param pResViewDesc - Resource view descriptor * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * * \sa ::cuTexObjectDestroy */ public static native @Cast("CUresult") int cuTexObjectCreate(@Cast("CUtexObject*") LongPointer pTexObject, @Const CUDA_RESOURCE_DESC pResDesc, @Const CUDA_TEXTURE_DESC pTexDesc, @Const CUDA_RESOURCE_VIEW_DESC pResViewDesc); public static native @Cast("CUresult") int cuTexObjectCreate(@Cast("CUtexObject*") LongBuffer pTexObject, @Const CUDA_RESOURCE_DESC pResDesc, @Const CUDA_TEXTURE_DESC pTexDesc, @Const CUDA_RESOURCE_VIEW_DESC pResViewDesc); public static native @Cast("CUresult") int cuTexObjectCreate(@Cast("CUtexObject*") long[] pTexObject, @Const CUDA_RESOURCE_DESC pResDesc, @Const CUDA_TEXTURE_DESC pTexDesc, @Const CUDA_RESOURCE_VIEW_DESC pResViewDesc); /** * \brief Destroys a texture object * * Destroys the texture object specified by \p texObject. 
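As a concrete illustration of how the descriptors above fit together, here is a minimal sketch that creates a texture object over a one-dimensional linear buffer with the bindings in this class, and later destroys it. The driver setup calls (cuInit, cuDeviceGet, cuCtxCreate) follow the usual JavaCPP pattern, and the flattened struct accessors such as res_linear_devPtr() are assumptions about how JavaCPP maps the C union members, not names confirmed by this file.
{@code
    import static org.bytedeco.javacpp.cuda.*;
    import org.bytedeco.javacpp.IntPointer;
    import org.bytedeco.javacpp.LongPointer;

    public class TexObjectSketch {
        static void check(int result) {
            if (result != CUDA_SUCCESS) throw new RuntimeException("CUDA error " + result);
        }

        public static void main(String[] args) {
            check(cuInit(0));
            IntPointer device = new IntPointer(1);
            check(cuDeviceGet(device, 0));
            CUctx_st context = new CUctx_st();
            check(cuCtxCreate(context, 0, device.get()));

            int n = 1024;                                  // number of float elements
            LongPointer devPtr = new LongPointer(1);
            check(cuMemAlloc(devPtr, n * 4L));             // linear buffer backing the texture

            // Resource: 1D linear memory holding n single-channel floats.
            CUDA_RESOURCE_DESC resDesc = new CUDA_RESOURCE_DESC();
            resDesc.resType(CU_RESOURCE_TYPE_LINEAR);
            resDesc.res_linear_devPtr(devPtr.get());           // assumed accessor name
            resDesc.res_linear_format(CU_AD_FORMAT_FLOAT);     // assumed accessor name
            resDesc.res_linear_numChannels(1);                 // assumed accessor name
            resDesc.res_linear_sizeInBytes(n * 4L);            // assumed accessor name
            resDesc.flags(0);                                  // must be zero

            // Sampling behavior: unnormalized coordinates, so only CLAMP is allowed.
            CUDA_TEXTURE_DESC texDesc = new CUDA_TEXTURE_DESC();
            texDesc.addressMode(0, CU_TR_ADDRESS_MODE_CLAMP);
            texDesc.filterMode(CU_TR_FILTER_MODE_POINT);
            texDesc.flags(0);

            LongPointer texObject = new LongPointer(1);
            // No resource view is needed for linear memory, so pass null.
            check(cuTexObjectCreate(texObject, resDesc, texDesc, null));

            // ... launch kernels that sample through texObject.get() ...

            check(cuTexObjectDestroy(texObject.get()));
            check(cuMemFree(devPtr.get()));
            check(cuCtxDestroy(context));
        }
    }
}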
* * @param texObject - Texture object to destroy * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * * \sa ::cuTexObjectCreate */ public static native @Cast("CUresult") int cuTexObjectDestroy(@Cast("CUtexObject") long texObject); /** * \brief Returns a texture object's resource descriptor * * Returns the resource descriptor for the texture object specified by \p texObject. * * @param pResDesc - Resource descriptor * @param texObject - Texture object * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * * \sa ::cuTexObjectCreate */ public static native @Cast("CUresult") int cuTexObjectGetResourceDesc(CUDA_RESOURCE_DESC pResDesc, @Cast("CUtexObject") long texObject); /** * \brief Returns a texture object's texture descriptor * * Returns the texture descriptor for the texture object specified by \p texObject. * * @param pTexDesc - Texture descriptor * @param texObject - Texture object * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * * \sa ::cuTexObjectCreate */ public static native @Cast("CUresult") int cuTexObjectGetTextureDesc(CUDA_TEXTURE_DESC pTexDesc, @Cast("CUtexObject") long texObject); /** * \brief Returns a texture object's resource view descriptor * * Returns the resource view descriptor for the texture object specified by \p texObject. * If no resource view was set for \p texObject, the ::CUDA_ERROR_INVALID_VALUE is returned. * * @param pResViewDesc - Resource view descriptor * @param texObject - Texture object * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * * \sa ::cuTexObjectCreate */ public static native @Cast("CUresult") int cuTexObjectGetResourceViewDesc(CUDA_RESOURCE_VIEW_DESC pResViewDesc, @Cast("CUtexObject") long texObject); /** \} */ /* END CUDA_TEXOBJECT */ /** * \defgroup CUDA_SURFOBJECT Surface Object Management * * ___MANBRIEF___ surface object management functions of the low-level CUDA * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___ * * This section describes the surface object management functions of the * low-level CUDA driver application programming interface. The surface * object API is only supported on devices of compute capability 3.0 or higher. * * \{ */ /** * \brief Creates a surface object * * Creates a surface object and returns it in \p pSurfObject. \p pResDesc describes * the data to perform surface load/stores on. ::CUDA_RESOURCE_DESC::resType must be * ::CU_RESOURCE_TYPE_ARRAY and ::CUDA_RESOURCE_DESC::res::array::hArray * must be set to a valid CUDA array handle. ::CUDA_RESOURCE_DESC::flags must be set to zero. * * Surface objects are only supported on devices of compute capability 3.0 or higher. * Additionally, a surface object is an opaque value, and, as such, should only be * accessed through CUDA API calls. 
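Along the same lines, a minimal surface-object sketch: it allocates a 2D CUDA array with the surface load/store flag and wraps it in a surface object. A current context is assumed to exist already (see the texture sketch above); the CUDA_ARRAY3D_DESCRIPTOR field setters and the res_array_hArray() accessor follow JavaCPP's usual mapping of the C fields and should be read as assumptions.
{@code
    import static org.bytedeco.javacpp.cuda.*;
    import org.bytedeco.javacpp.LongPointer;

    // Assumes a current CUDA context has already been created.
    public class SurfObjectSketch {
        static long createSurface() {
            CUDA_ARRAY3D_DESCRIPTOR arrayDesc = new CUDA_ARRAY3D_DESCRIPTOR();
            arrayDesc.Width(256);
            arrayDesc.Height(256);
            arrayDesc.Depth(0);                            // depth 0 -> 2D array
            arrayDesc.Format(CU_AD_FORMAT_FLOAT);
            arrayDesc.NumChannels(1);
            arrayDesc.Flags(CUDA_ARRAY3D_SURFACE_LDST);    // required for surface load/store

            CUarray_st array = new CUarray_st();
            if (cuArray3DCreate(array, arrayDesc) != CUDA_SUCCESS) {
                throw new RuntimeException("cuArray3DCreate failed");
            }

            CUDA_RESOURCE_DESC resDesc = new CUDA_RESOURCE_DESC();
            resDesc.resType(CU_RESOURCE_TYPE_ARRAY);
            resDesc.res_array_hArray(array);               // assumed accessor for res.array.hArray
            resDesc.flags(0);                              // must be zero

            LongPointer surfObject = new LongPointer(1);
            if (cuSurfObjectCreate(surfObject, resDesc) != CUDA_SUCCESS) {
                throw new RuntimeException("cuSurfObjectCreate failed");
            }
            return surfObject.get();                       // handle passed to kernels
        }
    }
}
The returned handle would later be released with cuSurfObjectDestroy(), and the backing array with cuArrayDestroy().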
* * @param pSurfObject - Surface object to create * @param pResDesc - Resource descriptor * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * * \sa ::cuSurfObjectDestroy */ public static native @Cast("CUresult") int cuSurfObjectCreate(@Cast("CUsurfObject*") LongPointer pSurfObject, @Const CUDA_RESOURCE_DESC pResDesc); public static native @Cast("CUresult") int cuSurfObjectCreate(@Cast("CUsurfObject*") LongBuffer pSurfObject, @Const CUDA_RESOURCE_DESC pResDesc); public static native @Cast("CUresult") int cuSurfObjectCreate(@Cast("CUsurfObject*") long[] pSurfObject, @Const CUDA_RESOURCE_DESC pResDesc); /** * \brief Destroys a surface object * * Destroys the surface object specified by \p surfObject. * * @param surfObject - Surface object to destroy * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * * \sa ::cuSurfObjectCreate */ public static native @Cast("CUresult") int cuSurfObjectDestroy(@Cast("CUsurfObject") long surfObject); /** * \brief Returns a surface object's resource descriptor * * Returns the resource descriptor for the surface object specified by \p surfObject. * * @param pResDesc - Resource descriptor * @param surfObject - Surface object * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE * * \sa ::cuSurfObjectCreate */ public static native @Cast("CUresult") int cuSurfObjectGetResourceDesc(CUDA_RESOURCE_DESC pResDesc, @Cast("CUsurfObject") long surfObject); /** \} */ /* END CUDA_SURFOBJECT */ // #endif /* __CUDA_API_VERSION >= 5000 */ // #if __CUDA_API_VERSION >= 4000 /** * \defgroup CUDA_PEER_ACCESS Peer Context Memory Access * * ___MANBRIEF___ direct peer context memory access functions of the low-level * CUDA driver API (___CURRENT_FILE___) ___ENDMANBRIEF___ * * This section describes the direct peer context memory access functions * of the low-level CUDA driver application programming interface. * * \{ */ /** * \brief Queries if a device may directly access a peer device's memory. * * Returns in \p *canAccessPeer a value of 1 if contexts on \p dev are capable of * directly accessing memory from contexts on \p peerDev and 0 otherwise. * If direct access of \p peerDev from \p dev is possible, then access may be * enabled on two specific contexts by calling ::cuCtxEnablePeerAccess(). * * @param canAccessPeer - Returned access capability * @param dev - Device from which allocations on \p peerDev are to * be directly accessed. * @param peerDev - Device on which the allocations to be directly accessed * by \p dev reside. * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_DEVICE * \notefnerr * * \sa ::cuCtxEnablePeerAccess, * ::cuCtxDisablePeerAccess */ public static native @Cast("CUresult") int cuDeviceCanAccessPeer(IntPointer canAccessPeer, @Cast("CUdevice") int dev, @Cast("CUdevice") int peerDev); public static native @Cast("CUresult") int cuDeviceCanAccessPeer(IntBuffer canAccessPeer, @Cast("CUdevice") int dev, @Cast("CUdevice") int peerDev); public static native @Cast("CUresult") int cuDeviceCanAccessPeer(int[] canAccessPeer, @Cast("CUdevice") int dev, @Cast("CUdevice") int peerDev); /** * \brief Queries attributes of the link between two devices. 
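Before enabling peer access, an application typically checks whether a link exists at all. The sketch below is a minimal capability query with cuDeviceCanAccessPeer() between the first two devices; cuDeviceGetP2PAttribute(), documented next, can then characterize that link further. Device ordinals 0 and 1 are assumed to exist.
{@code
    import static org.bytedeco.javacpp.cuda.*;
    import org.bytedeco.javacpp.IntPointer;

    public class PeerQuerySketch {
        public static void main(String[] args) {
            cuInit(0);
            IntPointer count = new IntPointer(1);
            cuDeviceGetCount(count);
            if (count.get() < 2) {
                System.out.println("Fewer than two CUDA devices; no peer link to query.");
                return;
            }
            IntPointer canAccess = new IntPointer(1);
            // May contexts on device 0 directly reach memory allocated on device 1?
            cuDeviceCanAccessPeer(canAccess, 0, 1);
            System.out.println("Peer access 0 -> 1 possible: " + (canAccess.get() != 0));
        }
    }
}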
* * Returns in \p *value the value of the requested attribute \p attrib of the * link between \p srcDevice and \p dstDevice. The supported attributes are: * - ::CU_DEVICE_P2P_ATTRIBUTE_PERFORMANCE_RANK: A relative value indicating the * performance of the link between two devices. * - ::CU_DEVICE_P2P_ATTRIBUTE_ACCESS_SUPPORTED: 1 if P2P access is enabled. * - ::CU_DEVICE_P2P_ATTRIBUTE_NATIVE_ATOMIC_SUPPORTED: 1 if atomic operations over * the link are supported. * * Returns ::CUDA_ERROR_INVALID_DEVICE if \p srcDevice or \p dstDevice are not valid * or if they represent the same device. * * Returns ::CUDA_ERROR_INVALID_VALUE if \p attrib is not valid or if \p value is * a null pointer. * * @param value - Returned value of the requested attribute * @param attrib - The requested attribute of the link between \p srcDevice and \p dstDevice. * @param srcDevice - The source device of the target link. * @param dstDevice - The destination device of the target link. * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_DEVICE, * ::CUDA_ERROR_INVALID_VALUE * \notefnerr * * \sa ::cuCtxEnablePeerAccess, * ::cuCtxDisablePeerAccess, * ::cuDeviceCanAccessPeer */ public static native @Cast("CUresult") int cuDeviceGetP2PAttribute(IntPointer value, @Cast("CUdevice_P2PAttribute") int attrib, @Cast("CUdevice") int srcDevice, @Cast("CUdevice") int dstDevice); public static native @Cast("CUresult") int cuDeviceGetP2PAttribute(IntBuffer value, @Cast("CUdevice_P2PAttribute") int attrib, @Cast("CUdevice") int srcDevice, @Cast("CUdevice") int dstDevice); public static native @Cast("CUresult") int cuDeviceGetP2PAttribute(int[] value, @Cast("CUdevice_P2PAttribute") int attrib, @Cast("CUdevice") int srcDevice, @Cast("CUdevice") int dstDevice); /** * \brief Enables direct access to memory allocations in a peer context. * * If both the current context and \p peerContext are on devices which support unified * addressing (as may be queried using ::CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING) and the same * major compute capability, then on success all allocations from \p peerContext will * immediately be accessible by the current context. See \ref CUDA_UNIFIED for additional * details. * * Note that access granted by this call is unidirectional and that in order to access * memory from the current context in \p peerContext, a separate symmetric call * to ::cuCtxEnablePeerAccess() is required. * * There is a system-wide maximum of eight peer connections per device. * * Returns ::CUDA_ERROR_PEER_ACCESS_UNSUPPORTED if ::cuDeviceCanAccessPeer() indicates * that the ::CUdevice of the current context cannot directly access memory * from the ::CUdevice of \p peerContext. * * Returns ::CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED if direct access of * \p peerContext from the current context has already been enabled. * * Returns ::CUDA_ERROR_TOO_MANY_PEERS if direct peer access is not possible * because hardware resources required for peer access have been exhausted. * * Returns ::CUDA_ERROR_INVALID_CONTEXT if there is no current context, \p peerContext * is not a valid context, or if the current context is \p peerContext. * * Returns ::CUDA_ERROR_INVALID_VALUE if \p Flags is not 0. * * @param peerContext - Peer context to enable direct access to from the current context * @param Flags - Reserved for future use and must be set to 0 * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED, * ::CUDA_ERROR_TOO_MANY_PEERS, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_PEER_ACCESS_UNSUPPORTED, * ::CUDA_ERROR_INVALID_VALUE * \notefnerr * * \sa ::cuDeviceCanAccessPeer, * ::cuCtxDisablePeerAccess */ public static native @Cast("CUresult") int cuCtxEnablePeerAccess(CUctx_st peerContext, @Cast("unsigned int") int Flags); /** * \brief Disables direct access to memory allocations in a peer context and * unregisters any registered allocations. * * Returns ::CUDA_ERROR_PEER_ACCESS_NOT_ENABLED if direct peer access has * not yet been enabled from \p peerContext to the current context. 
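Putting the enable and disable calls together, the following sketch grants a context on device 0 access to allocations made in a context on device 1 and then tears that access down again. It assumes the capability query above returned 1 and that cuInit(0) has already been called; the context-handling calls follow the usual JavaCPP driver-API pattern.
{@code
    import static org.bytedeco.javacpp.cuda.*;

    // Assumes cuInit(0) has been called and devices 0 and 1 can access each other.
    public class PeerAccessSketch {
        static void demo() {
            CUctx_st ctx0 = new CUctx_st();
            CUctx_st ctx1 = new CUctx_st();
            cuCtxCreate(ctx0, 0, 0);           // context on device 0
            cuCtxCreate(ctx1, 0, 1);           // context on device 1 (now current)
            cuCtxSetCurrent(ctx0);             // make device 0's context current again

            int r = cuCtxEnablePeerAccess(ctx1, 0);    // Flags must be 0
            if (r == CUDA_SUCCESS) {
                // Work issued in ctx0 may now use pointers allocated in ctx1.
                // Access is one-way; ctx1 would need its own cuCtxEnablePeerAccess(ctx0, 0).
                cuCtxDisablePeerAccess(ctx1);
            }
            cuCtxDestroy(ctx1);
            cuCtxDestroy(ctx0);
        }
    }
}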
* * Returns ::CUDA_ERROR_INVALID_CONTEXT if there is no current context, or if * \p peerContext is not a valid context. * * @param peerContext - Peer context to disable direct access to * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_PEER_ACCESS_NOT_ENABLED, * ::CUDA_ERROR_INVALID_CONTEXT, * \notefnerr * * \sa ::cuDeviceCanAccessPeer, * ::cuCtxEnablePeerAccess */ public static native @Cast("CUresult") int cuCtxDisablePeerAccess(CUctx_st peerContext); /** \} */ /* END CUDA_PEER_ACCESS */ // #endif /* __CUDA_API_VERSION >= 4000 */ /** * \defgroup CUDA_GRAPHICS Graphics Interoperability * * ___MANBRIEF___ graphics interoperability functions of the low-level CUDA * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___ * * This section describes the graphics interoperability functions of the * low-level CUDA driver application programming interface. * * \{ */ /** * \brief Unregisters a graphics resource for access by CUDA * * Unregisters the graphics resource \p resource so it is not accessible by * CUDA unless registered again. * * If \p resource is invalid then ::CUDA_ERROR_INVALID_HANDLE is * returned. * * @param resource - Resource to unregister * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_HANDLE, * ::CUDA_ERROR_UNKNOWN * \notefnerr * * \sa * ::cuGraphicsD3D9RegisterResource, * ::cuGraphicsD3D10RegisterResource, * ::cuGraphicsD3D11RegisterResource, * ::cuGraphicsGLRegisterBuffer, * ::cuGraphicsGLRegisterImage */ public static native @Cast("CUresult") int cuGraphicsUnregisterResource(CUgraphicsResource_st resource); /** * \brief Get an array through which to access a subresource of a mapped graphics resource. * * Returns in \p *pArray an array through which the subresource of the mapped * graphics resource \p resource which corresponds to array index \p arrayIndex * and mipmap level \p mipLevel may be accessed. The value set in \p *pArray may * change every time that \p resource is mapped. * * If \p resource is not a texture then it cannot be accessed via an array and * ::CUDA_ERROR_NOT_MAPPED_AS_ARRAY is returned. * If \p arrayIndex is not a valid array index for \p resource then * ::CUDA_ERROR_INVALID_VALUE is returned. * If \p mipLevel is not a valid mipmap level for \p resource then * ::CUDA_ERROR_INVALID_VALUE is returned. * If \p resource is not mapped then ::CUDA_ERROR_NOT_MAPPED is returned. * * @param pArray - Returned array through which a subresource of \p resource may be accessed * @param resource - Mapped resource to access * @param arrayIndex - Array index for array textures or cubemap face * index as defined by ::CUarray_cubemap_face for * cubemap textures for the subresource to access * @param mipLevel - Mipmap level for the subresource to access * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_INVALID_HANDLE, * ::CUDA_ERROR_NOT_MAPPED, * ::CUDA_ERROR_NOT_MAPPED_AS_ARRAY * \notefnerr * * \sa ::cuGraphicsResourceGetMappedPointer */ public static native @Cast("CUresult") int cuGraphicsSubResourceGetMappedArray(@ByPtrPtr CUarray_st pArray, CUgraphicsResource_st resource, @Cast("unsigned int") int arrayIndex, @Cast("unsigned int") int mipLevel); // #if __CUDA_API_VERSION >= 5000 /** * \brief Get a mipmapped array through which to access a mapped graphics resource. 
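For array-backed (image) resources, the typical pattern after mapping is to request the CUDA array for a particular sub-resource with cuGraphicsSubResourceGetMappedArray(), declared above. A small sketch, assuming resource was registered through one of the graphics-interop register calls (for example cuGraphicsGLRegisterImage) and is currently mapped:
{@code
    import static org.bytedeco.javacpp.cuda.*;

    public class SubResourceSketch {
        // Returns the CUDA array for array index 0, mipmap level 0 of a mapped resource.
        static CUarray_st level0Array(CUgraphicsResource_st resource) {
            CUarray_st array = new CUarray_st();
            int r = cuGraphicsSubResourceGetMappedArray(array, resource, 0, 0);
            if (r != CUDA_SUCCESS) {
                throw new RuntimeException("cuGraphicsSubResourceGetMappedArray failed: " + r);
            }
            return array;   // only valid while the resource remains mapped
        }
    }
}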
* * Returns in \p *pMipmappedArray a mipmapped array through which the mapped graphics * resource \p resource may be accessed. The value set in \p *pMipmappedArray may change every time * that \p resource is mapped. * * If \p resource is not a texture then it cannot be accessed via a mipmapped array and * ::CUDA_ERROR_NOT_MAPPED_AS_ARRAY is returned. * If \p resource is not mapped then ::CUDA_ERROR_NOT_MAPPED is returned. * * @param pMipmappedArray - Returned mipmapped array through which \p resource may be accessed * @param resource - Mapped resource to access * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_INVALID_HANDLE, * ::CUDA_ERROR_NOT_MAPPED, * ::CUDA_ERROR_NOT_MAPPED_AS_ARRAY * \notefnerr * * \sa ::cuGraphicsResourceGetMappedPointer */ public static native @Cast("CUresult") int cuGraphicsResourceGetMappedMipmappedArray(@ByPtrPtr CUmipmappedArray_st pMipmappedArray, CUgraphicsResource_st resource); // #endif /* __CUDA_API_VERSION >= 5000 */ // #if __CUDA_API_VERSION >= 3020 /** * \brief Get a device pointer through which to access a mapped graphics resource. * * Returns in \p *pDevPtr a pointer through which the mapped graphics resource * \p resource may be accessed. * Returns in \p pSize the size of the memory in bytes which may be accessed from that pointer. * The value set in \p *pDevPtr may change every time that \p resource is mapped. * * If \p resource is not a buffer then it cannot be accessed via a pointer and * ::CUDA_ERROR_NOT_MAPPED_AS_POINTER is returned. * If \p resource is not mapped then ::CUDA_ERROR_NOT_MAPPED is returned. * * @param pDevPtr - Returned pointer through which \p resource may be accessed * @param pSize - Returned size of the buffer accessible starting at \p *pDevPtr * @param resource - Mapped resource to access * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_INVALID_HANDLE, * ::CUDA_ERROR_NOT_MAPPED, * ::CUDA_ERROR_NOT_MAPPED_AS_POINTER * \notefnerr * * \sa * ::cuGraphicsMapResources, * ::cuGraphicsSubResourceGetMappedArray */ public static native @Cast("CUresult") int cuGraphicsResourceGetMappedPointer(@Cast("CUdeviceptr*") LongPointer pDevPtr, @Cast("size_t*") SizeTPointer pSize, CUgraphicsResource_st resource); public static native @Cast("CUresult") int cuGraphicsResourceGetMappedPointer(@Cast("CUdeviceptr*") LongBuffer pDevPtr, @Cast("size_t*") SizeTPointer pSize, CUgraphicsResource_st resource); public static native @Cast("CUresult") int cuGraphicsResourceGetMappedPointer(@Cast("CUdeviceptr*") long[] pDevPtr, @Cast("size_t*") SizeTPointer pSize, CUgraphicsResource_st resource); // #endif /* __CUDA_API_VERSION >= 3020 */ /** * \brief Set usage flags for mapping a graphics resource * * Set \p flags for mapping the graphics resource \p resource. * * Changes to \p flags will take effect the next time \p resource is mapped. * The \p flags argument may be any of the following:

* - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE: Specifies no hints about how this * resource will be used. It is therefore assumed that this resource will be * read from and written to by CUDA kernels. This is the default value. * - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_READONLY: Specifies that CUDA kernels which * access this resource will not write to this resource. * - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_WRITEDISCARD: Specifies that CUDA kernels * which access this resource will not read from this resource and will * write over the entire contents of the resource, so none of the data * previously stored in the resource will be preserved. * * If \p resource is presently mapped for access by CUDA then * ::CUDA_ERROR_ALREADY_MAPPED is returned. * If \p flags is not one of the above values then ::CUDA_ERROR_INVALID_VALUE is returned. * * @param resource - Registered resource to set flags for * @param flags - Parameters for resource mapping * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_INVALID_HANDLE, * ::CUDA_ERROR_ALREADY_MAPPED * \notefnerr * * \sa * ::cuGraphicsMapResources */ public static native @Cast("CUresult") int cuGraphicsResourceSetMapFlags(CUgraphicsResource_st resource, @Cast("unsigned int") int flags); /** * \brief Map graphics resources for access by CUDA * * Maps the \p count graphics resources in \p resources for access by CUDA. * * The resources in \p resources may be accessed by CUDA until they * are unmapped. The graphics API from which \p resources were registered * should not access any resources while they are mapped by CUDA. If an * application does so, the results are undefined. * * This function provides the synchronization guarantee that any graphics calls * issued before ::cuGraphicsMapResources() will complete before any subsequent CUDA * work issued in \p stream begins. * * If \p resources includes any duplicate entries then ::CUDA_ERROR_INVALID_HANDLE is returned. * If any of \p resources are presently mapped for access by CUDA then ::CUDA_ERROR_ALREADY_MAPPED is returned. * * @param count - Number of resources to map * @param resources - Resources to map for CUDA usage * @param hStream - Stream with which to synchronize * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_HANDLE, * ::CUDA_ERROR_ALREADY_MAPPED, * ::CUDA_ERROR_UNKNOWN * \note_null_stream * \notefnerr * * \sa * ::cuGraphicsResourceGetMappedPointer, * ::cuGraphicsSubResourceGetMappedArray, * ::cuGraphicsUnmapResources */ public static native @Cast("CUresult") int cuGraphicsMapResources(@Cast("unsigned int") int count, @ByPtrPtr CUgraphicsResource_st resources, CUstream_st hStream); /** * \brief Unmap graphics resources. * * Unmaps the \p count graphics resources in \p resources. * * Once unmapped, the resources in \p resources may not be accessed by CUDA * until they are mapped again. * * This function provides the synchronization guarantee that any CUDA work issued * in \p stream before ::cuGraphicsUnmapResources() will complete before any * subsequently issued graphics work begins. * * * If \p resources includes any duplicate entries then ::CUDA_ERROR_INVALID_HANDLE is returned. * If any of \p resources are not presently mapped for access by CUDA then ::CUDA_ERROR_NOT_MAPPED is returned. 
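The map / access / unmap lifecycle described above looks like this with these bindings. The sketch assumes resource is a buffer-type resource registered beforehand (for example with cuGraphicsGLRegisterBuffer); passing the single resource object for a count of one is how JavaCPP exposes the pointer-to-pointer parameter here, and a null stream selects the default stream.
{@code
    import static org.bytedeco.javacpp.cuda.*;
    import org.bytedeco.javacpp.LongPointer;
    import org.bytedeco.javacpp.SizeTPointer;

    public class GraphicsMapSketch {
        static void useMappedBuffer(CUgraphicsResource_st resource, CUstream_st stream) {
            // cuGraphicsResourceSetMapFlags(resource, ...) could be called here to hint
            // read-only or write-discard use before the resource is mapped.

            cuGraphicsMapResources(1, resource, stream);

            LongPointer devPtr = new LongPointer(1);
            SizeTPointer size = new SizeTPointer(1);
            cuGraphicsResourceGetMappedPointer(devPtr, size, resource);
            // ... issue CUDA work in 'stream' that reads devPtr.get(), up to size.get() bytes ...

            cuGraphicsUnmapResources(1, resource, stream);
            // After unmapping, the originating graphics API may use the buffer again.
        }
    }
}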
* * @param count - Number of resources to unmap * @param resources - Resources to unmap * @param hStream - Stream with which to synchronize * * @return * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_HANDLE, * ::CUDA_ERROR_NOT_MAPPED, * ::CUDA_ERROR_UNKNOWN * \note_null_stream * \notefnerr * * \sa * ::cuGraphicsMapResources */ public static native @Cast("CUresult") int cuGraphicsUnmapResources(@Cast("unsigned int") int count, @ByPtrPtr CUgraphicsResource_st resources, CUstream_st hStream); /** \} */ /* END CUDA_GRAPHICS */ public static native @Cast("CUresult") int cuGetExportTable(@Cast("const void**") PointerPointer ppExportTable, @Const CUuuid pExportTableId); public static native @Cast("CUresult") int cuGetExportTable(@Cast("const void**") @ByPtrPtr Pointer ppExportTable, @Const CUuuid pExportTableId); /** * CUDA API versioning support */ // #if defined(__CUDA_API_VERSION_INTERNAL) // #endif /* __CUDA_API_VERSION_INTERNAL */ // #if defined(__CUDA_API_VERSION_INTERNAL) || (__CUDA_API_VERSION >= 4000 && __CUDA_API_VERSION < 6050) // #endif /* defined(__CUDA_API_VERSION_INTERNAL) || (__CUDA_API_VERSION >= 4000 && __CUDA_API_VERSION < 6050) */ // #if defined(__CUDA_API_VERSION_INTERNAL) || __CUDA_API_VERSION < 6050 // #endif /* defined(__CUDA_API_VERSION_INTERNAL) || __CUDA_API_VERSION < 6050 */ // #if defined(__CUDA_API_VERSION_INTERNAL) || (__CUDA_API_VERSION >= 5050 && __CUDA_API_VERSION < 6050) // #endif /* __CUDA_API_VERSION_INTERNAL || (__CUDA_API_VERSION >= 5050 && __CUDA_API_VERSION < 6050) */ // #if defined(__CUDA_API_VERSION_INTERNAL) || (__CUDA_API_VERSION >= 3020 && __CUDA_API_VERSION < 4010) // #endif /* __CUDA_API_VERSION_INTERNAL || (__CUDA_API_VERSION >= 3020 && __CUDA_API_VERSION < 4010) */ /** * CUDA API made obselete at API version 3020 */ // #if defined(__CUDA_API_VERSION_INTERNAL) // #endif /* CUDA_FORCE_LEGACY32_INTERNAL */ // #if defined(__CUDA_API_VERSION_INTERNAL) || __CUDA_API_VERSION < 3020 // #endif /* __CUDA_API_VERSION_INTERNAL || __CUDA_API_VERSION < 3020 */ // #if defined(__CUDA_API_VERSION_INTERNAL) || __CUDA_API_VERSION < 4000 // #endif /* __CUDA_API_VERSION_INTERNAL || __CUDA_API_VERSION < 4000 */ // #if defined(__CUDA_API_VERSION_INTERNAL) // #endif /* __CUDA_API_VERSION_INTERNAL */ // #if defined(__CUDA_API_VERSION_INTERNAL) // #endif // #ifdef __cplusplus // #endif // #undef __CUDA_API_VERSION // #endif /* __cuda_cuda_h__ */ // Parsed from /* * Copyright 1993-2014 NVIDIA Corporation. All rights reserved. * * NOTICE TO LICENSEE: * * This source code and/or documentation ("Licensed Deliverables") are * subject to NVIDIA intellectual property rights under U.S. and * international Copyright laws. * * These Licensed Deliverables contained herein is PROPRIETARY and * CONFIDENTIAL to NVIDIA and is being provided under the terms and * conditions of a form of NVIDIA software license agreement by and * between NVIDIA and Licensee ("License Agreement") or electronically * accepted by Licensee. Notwithstanding any terms or conditions to * the contrary in the License Agreement, reproduction or disclosure * of the Licensed Deliverables to any third party without the express * written consent of NVIDIA is prohibited. * * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. 
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE * OF THESE LICENSED DELIVERABLES. * * U.S. Government End Users. These Licensed Deliverables are a * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT * 1995), consisting of "commercial computer software" and "commercial * computer software documentation" as such terms are used in 48 * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government * only as a commercial end item. Consistent with 48 C.F.R.12.212 and * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all * U.S. Government End Users acquire the Licensed Deliverables with * only those rights set forth herein. * * Any use of the Licensed Deliverables in individual and commercial * software must include, in the user documentation and internal * comments to the code, the above Disclaimer and U.S. Government End * Users Notice. */ // #if !defined(__HOST_DEFINES_H__) // #define __HOST_DEFINES_H__ /* CUDA JIT mode (__CUDACC_RTC__) also uses GNU style attributes */ // #if defined(__GNUC__) || defined(__CUDA_LIBDEVICE__) || defined(__CUDACC_RTC__) // #if defined(__CUDACC_RTC__) // #define __volatile__ volatile // #endif /* __CUDACC_RTC__ */ // #define __no_return__ // __attribute__((noreturn)) // #if defined(__CUDACC__) || defined(__CUDA_ARCH__) /* gcc allows users to define attributes with underscores, e.g., __attribute__((__noinline__)). Consider a non-CUDA source file (e.g. .cpp) that has the above attribute specification, and includes this header file. In that case, defining __noinline__ as below would cause a gcc compilation error. Hence, only define __noinline__ when the code is being processed by a CUDA compiler component. */ // #define __noinline__ // __attribute__((noinline)) // #endif /* __CUDACC__ || __CUDA_ARCH__ */ // #define __forceinline__ // __inline__ __attribute__((always_inline)) // #define __align__(n) // __attribute__((aligned(n))) // #define __thread__ // __thread // #define __import__ // #define __export__ // #define __cdecl // #define __annotate__(a) // __attribute__((a)) // #define __location__(a) // __annotate__(a) // #define CUDARTAPI // #elif defined(_MSC_VER) // #if _MSC_VER >= 1400 // #define __restrict__ // __restrict // #else /* _MSC_VER >= 1400 */ // #define __restrict__ // #endif /* _MSC_VER >= 1400 */ // #define __inline__ // __inline // #define __no_return__ // __declspec(noreturn) // #define __noinline__ // __declspec(noinline) // #define __forceinline__ // __forceinline // #define __align__(n) // __declspec(align(n)) // #define __thread__ // __declspec(thread) // #define __import__ // __declspec(dllimport) // #define __export__ // __declspec(dllexport) // #define __annotate__(a) // __declspec(a) // #define __location__(a) // __annotate__(__##a##__) // #define CUDARTAPI // __stdcall // #else /* __GNUC__ || __CUDA_LIBDEVICE__ || __CUDACC_RTC__ */ // #define __inline__ // #if !defined(__align__) // #error --- !!! UNKNOWN COMPILER: please provide a CUDA compatible definition for '__align__' !!! 
--- // #endif /* !__align__ */ // #if !defined(CUDARTAPI) // #error --- !!! UNKNOWN COMPILER: please provide a CUDA compatible definition for 'CUDARTAPI' !!! --- // #endif /* !CUDARTAPI */ // #endif /* __GNUC__ || __CUDA_LIBDEVICE__ || __CUDACC_RTC__ */ // #if (defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 3 && !defined(__clang__)))) || // (defined(_MSC_VER) && _MSC_VER < 1900) || // (!defined(__GNUC__) && !defined(_MSC_VER)) // #define __specialization_static // static // #else /* (__GNUC__ && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 3 && !__clang__))) || // (_MSC_VER && _MSC_VER < 1900) || // (!__GNUC__ && !_MSC_VER) */ // #define __specialization_static // #endif /* (__GNUC__ && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 3 && !__clang__))) || // (_MSC_VER && _MSC_VER < 1900) || // (!__GNUC__ && !_MSC_VER) */ // #if !defined(__CUDACC__) && !defined(__CUDABE__) // #undef __annotate__ // #define __annotate__(a) // #else /* !__CUDACC__ && !__CUDABE__ */ // #define __launch_bounds__(...) // __annotate__(launch_bounds(__VA_ARGS__)) // #endif /* !__CUDACC__ && !__CUDABE__ */ // #if defined(__CUDACC__) || defined(__CUDABE__) || // defined(__GNUC__) || defined(_WIN64) // #define __builtin_align__(a) // __align__(a) // #else /* __CUDACC__ || __CUDABE__ || __GNUC__ || _WIN64 */ // #define __builtin_align__(a) // #endif /* __CUDACC__ || __CUDABE__ || __GNUC__ || _WIN64 */ // #define __host__ // __location__(host) // #define __device__ // __location__(device) // #define __global__ // __location__(global) // #define __shared__ // __location__(shared) // #define __constant__ // __location__(constant) // #define __managed__ // __location__(managed) // #if (defined(__CUDABE__) && !defined(__CUDACC_INTEGRATED__)) || !defined(__CUDACC__) // #define __device_builtin__ // #define __device_builtin_texture_type__ // #define __device_builtin_surface_type__ // #define __cudart_builtin__ // #else /* (defined(__CUDABE__) && !defined(__CUDACC_INTEGRATED__)) || !__CUDACC__ */ // #define __device_builtin__ // __location__(device_builtin) // #define __device_builtin_texture_type__ // __location__(device_builtin_texture_type) // #define __device_builtin_surface_type__ // __location__(device_builtin_surface_type) // #define __cudart_builtin__ // __location__(cudart_builtin) // #endif /* (defined(__CUDABE__) && !defined(__CUDACC_INTEGRATED__)) || !__CUDACC__ */ // #if defined(__CUDACC__) && defined(__clang__) // #if !defined(__has_feature) // #error --- !!! The Clang version does not support __has_feature !!! --- // #endif /* !__has_feature */ // #if defined(__cplusplus) && defined(__CUDACC__) // #if (__has_feature(cxx_noexcept)) // #define NV_CLANG_ATOMIC_NOEXCEPT noexcept // #define NV_CLANG_ATOMIC_NOEXCEPT_(x) noexcept(x) // #else /* !__has_feature(cxx_noexcept) */ // #define NV_CLANG_ATOMIC_NOEXCEPT throw() // #define NV_CLANG_ATOMIC_NOEXCEPT_(x) // #endif /* __has_feature(cxx_noexcept) */ // #define _Atomic(X) __nv_clang_atomic_t // #endif /* defined(__cplusplus) && defined(__CUDACC__) */ // #endif /* __CUDACC__ && __clang__ */ // #endif /* !__HOST_DEFINES_H__ */ // Parsed from /* * Copyright 1993-2012 NVIDIA Corporation. All rights reserved. * * NOTICE TO LICENSEE: * * This source code and/or documentation ("Licensed Deliverables") are * subject to NVIDIA intellectual property rights under U.S. and * international Copyright laws. 
* * These Licensed Deliverables contained herein is PROPRIETARY and * CONFIDENTIAL to NVIDIA and is being provided under the terms and * conditions of a form of NVIDIA software license agreement by and * between NVIDIA and Licensee ("License Agreement") or electronically * accepted by Licensee. Notwithstanding any terms or conditions to * the contrary in the License Agreement, reproduction or disclosure * of the Licensed Deliverables to any third party without the express * written consent of NVIDIA is prohibited. * * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE * OF THESE LICENSED DELIVERABLES. * * U.S. Government End Users. These Licensed Deliverables are a * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT * 1995), consisting of "commercial computer software" and "commercial * computer software documentation" as such terms are used in 48 * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government * only as a commercial end item. Consistent with 48 C.F.R.12.212 and * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all * U.S. Government End Users acquire the Licensed Deliverables with * only those rights set forth herein. * * Any use of the Licensed Deliverables in individual and commercial * software must include, in the user documentation and internal * comments to the code, the above Disclaimer and U.S. Government End * Users Notice. */ // #if !defined(__DEVICE_TYPES_H__) // #define __DEVICE_TYPES_H__ // #include "host_defines.h" /******************************************************************************* * * * * * * *******************************************************************************/ /** enum cudaRoundMode */ public static final int cudaRoundNearest = 0, cudaRoundZero = 1, cudaRoundPosInf = 2, cudaRoundMinInf = 3; // #endif /* !__DEVICE_TYPES_H__ */ // Parsed from /* * Copyright 1993-2014 NVIDIA Corporation. All rights reserved. * * NOTICE TO LICENSEE: * * This source code and/or documentation ("Licensed Deliverables") are * subject to NVIDIA intellectual property rights under U.S. and * international Copyright laws. * * These Licensed Deliverables contained herein is PROPRIETARY and * CONFIDENTIAL to NVIDIA and is being provided under the terms and * conditions of a form of NVIDIA software license agreement by and * between NVIDIA and Licensee ("License Agreement") or electronically * accepted by Licensee. Notwithstanding any terms or conditions to * the contrary in the License Agreement, reproduction or disclosure * of the Licensed Deliverables to any third party without the express * written consent of NVIDIA is prohibited. 
* * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE * OF THESE LICENSED DELIVERABLES. * * U.S. Government End Users. These Licensed Deliverables are a * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT * 1995), consisting of "commercial computer software" and "commercial * computer software documentation" as such terms are used in 48 * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government * only as a commercial end item. Consistent with 48 C.F.R.12.212 and * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all * U.S. Government End Users acquire the Licensed Deliverables with * only those rights set forth herein. * * Any use of the Licensed Deliverables in individual and commercial * software must include, in the user documentation and internal * comments to the code, the above Disclaimer and U.S. Government End * Users Notice. */ // #if !defined(__DRIVER_TYPES_H__) // #define __DRIVER_TYPES_H__ // #include "host_defines.h" /** * \defgroup CUDART_TYPES Data types used by CUDA Runtime * \ingroup CUDART * * \{ */ /******************************************************************************* * * * TYPE DEFINITIONS USED BY RUNTIME API * * * *******************************************************************************/ // #if !defined(__CUDA_INTERNAL_COMPILATION__) // #if !defined(__CUDACC_RTC__) // #include // #include // #endif /* !defined(__CUDACC_RTC__) */ /** Default page-locked allocation flag */ public static final int cudaHostAllocDefault = 0x00; /** Pinned memory accessible by all CUDA contexts */ public static final int cudaHostAllocPortable = 0x01; /** Map allocation into device space */ public static final int cudaHostAllocMapped = 0x02; /** Write-combined memory */ public static final int cudaHostAllocWriteCombined = 0x04; /** Default host memory registration flag */ public static final int cudaHostRegisterDefault = 0x00; /** Pinned memory accessible by all CUDA contexts */ public static final int cudaHostRegisterPortable = 0x01; /** Map registered memory into device space */ public static final int cudaHostRegisterMapped = 0x02; /** Memory-mapped I/O space */ public static final int cudaHostRegisterIoMemory = 0x04; /** Default peer addressing enable flag */ public static final int cudaPeerAccessDefault = 0x00; /** Default stream flag */ public static final int cudaStreamDefault = 0x00; /** Stream does not synchronize with stream 0 (the NULL stream) */ public static final int cudaStreamNonBlocking = 0x01; /** * Legacy stream handle * * Stream handle that can be passed as a cudaStream_t to use an implicit stream * with legacy synchronization behavior. 
* * See details of the \link_sync_behavior */ public static native @MemberGetter CUstream_st cudaStreamLegacy(); public static final CUstream_st cudaStreamLegacy = cudaStreamLegacy(); /** * Per-thread stream handle * * Stream handle that can be passed as a cudaStream_t to use an implicit stream * with per-thread synchronization behavior. * * See details of the \link_sync_behavior */ public static native @MemberGetter CUstream_st cudaStreamPerThread(); public static final CUstream_st cudaStreamPerThread = cudaStreamPerThread(); /** Default event flag */ public static final int cudaEventDefault = 0x00; /** Event uses blocking synchronization */ public static final int cudaEventBlockingSync = 0x01; /** Event will not record timing data */ public static final int cudaEventDisableTiming = 0x02; /** Event is suitable for interprocess use. cudaEventDisableTiming must be set */ public static final int cudaEventInterprocess = 0x04; /** Device flag - Automatic scheduling */ public static final int cudaDeviceScheduleAuto = 0x00; /** Device flag - Spin default scheduling */ public static final int cudaDeviceScheduleSpin = 0x01; /** Device flag - Yield default scheduling */ public static final int cudaDeviceScheduleYield = 0x02; /** Device flag - Use blocking synchronization */ public static final int cudaDeviceScheduleBlockingSync = 0x04; /** Device flag - Use blocking synchronization * @deprecated This flag was deprecated as of CUDA 4.0 and * replaced with ::cudaDeviceScheduleBlockingSync. */ public static final int cudaDeviceBlockingSync = 0x04; /** Device schedule flags mask */ public static final int cudaDeviceScheduleMask = 0x07; /** Device flag - Support mapped pinned allocations */ public static final int cudaDeviceMapHost = 0x08; /** Device flag - Keep local memory allocation after launch */ public static final int cudaDeviceLmemResizeToMax = 0x10; /** Device flags mask */ public static final int cudaDeviceMask = 0x1f; /** Default CUDA array allocation flag */ public static final int cudaArrayDefault = 0x00; /** Must be set in cudaMalloc3DArray to create a layered CUDA array */ public static final int cudaArrayLayered = 0x01; /** Must be set in cudaMallocArray or cudaMalloc3DArray in order to bind surfaces to the CUDA array */ public static final int cudaArraySurfaceLoadStore = 0x02; /** Must be set in cudaMalloc3DArray to create a cubemap CUDA array */ public static final int cudaArrayCubemap = 0x04; /** Must be set in cudaMallocArray or cudaMalloc3DArray in order to perform texture gather operations on the CUDA array */ public static final int cudaArrayTextureGather = 0x08; /** Automatically enable peer access between remote devices as needed */ public static final int cudaIpcMemLazyEnablePeerAccess = 0x01; /** Memory can be accessed by any stream on any device*/ public static final int cudaMemAttachGlobal = 0x01; /** Memory cannot be accessed by any stream on any device */ public static final int cudaMemAttachHost = 0x02; /** Memory can only be accessed by a single stream on the associated device */ public static final int cudaMemAttachSingle = 0x04; /** Default behavior */ public static final int cudaOccupancyDefault = 0x00; /** Assume global caching is enabled and cannot be automatically turned off */ public static final int cudaOccupancyDisableCachingOverride = 0x01; /** Device id that represents the CPU */ public static final int cudaCpuDeviceId = ((int)-1); /** Device id that represents an invalid device */ public static final int cudaInvalidDeviceId = ((int)-2); // #endif /* 
!__CUDA_INTERNAL_COMPILATION__ */ /******************************************************************************* * * * * * * *******************************************************************************/ /** * CUDA error types */ /** enum cudaError */ public static final int /** * The API call returned with no errors. In the case of query calls, this * can also mean that the operation being queried is complete (see * ::cudaEventQuery() and ::cudaStreamQuery()). */ cudaSuccess = 0, /** * The device function being invoked (usually via ::cudaLaunchKernel()) was not * previously configured via the ::cudaConfigureCall() function. */ cudaErrorMissingConfiguration = 1, /** * The API call failed because it was unable to allocate enough memory to * perform the requested operation. */ cudaErrorMemoryAllocation = 2, /** * The API call failed because the CUDA driver and runtime could not be * initialized. */ cudaErrorInitializationError = 3, /** * An exception occurred on the device while executing a kernel. Common * causes include dereferencing an invalid device pointer and accessing * out of bounds shared memory. The device cannot be used until * ::cudaThreadExit() is called. All existing device memory allocations * are invalid and must be reconstructed if the program is to continue * using CUDA. */ cudaErrorLaunchFailure = 4, /** * This indicated that a previous kernel launch failed. This was previously * used for device emulation of kernel launches. * @deprecated * This error return is deprecated as of CUDA 3.1. Device emulation mode was * removed with the CUDA 3.1 release. */ cudaErrorPriorLaunchFailure = 5, /** * This indicates that the device kernel took too long to execute. This can * only occur if timeouts are enabled - see the device property * \ref ::cudaDeviceProp::kernelExecTimeoutEnabled "kernelExecTimeoutEnabled" * for more information. * This leaves the process in an inconsistent state and any further CUDA work * will return the same error. To continue using CUDA, the process must be terminated * and relaunched. */ cudaErrorLaunchTimeout = 6, /** * This indicates that a launch did not occur because it did not have * appropriate resources. Although this error is similar to * ::cudaErrorInvalidConfiguration, this error usually indicates that the * user has attempted to pass too many arguments to the device kernel, or the * kernel launch specifies too many threads for the kernel's register count. */ cudaErrorLaunchOutOfResources = 7, /** * The requested device function does not exist or is not compiled for the * proper device architecture. */ cudaErrorInvalidDeviceFunction = 8, /** * This indicates that a kernel launch is requesting resources that can * never be satisfied by the current device. Requesting more shared memory * per block than the device supports will trigger this error, as will * requesting too many threads or blocks. See ::cudaDeviceProp for more * device limitations. */ cudaErrorInvalidConfiguration = 9, /** * This indicates that the device ordinal supplied by the user does not * correspond to a valid CUDA device. */ cudaErrorInvalidDevice = 10, /** * This indicates that one or more of the parameters passed to the API call * is not within an acceptable range of values. */ cudaErrorInvalidValue = 11, /** * This indicates that one or more of the pitch-related parameters passed * to the API call is not within the acceptable range for pitch. 
*/ cudaErrorInvalidPitchValue = 12, /** * This indicates that the symbol name/identifier passed to the API call * is not a valid name or identifier. */ cudaErrorInvalidSymbol = 13, /** * This indicates that the buffer object could not be mapped. */ cudaErrorMapBufferObjectFailed = 14, /** * This indicates that the buffer object could not be unmapped. */ cudaErrorUnmapBufferObjectFailed = 15, /** * This indicates that at least one host pointer passed to the API call is * not a valid host pointer. */ cudaErrorInvalidHostPointer = 16, /** * This indicates that at least one device pointer passed to the API call is * not a valid device pointer. */ cudaErrorInvalidDevicePointer = 17, /** * This indicates that the texture passed to the API call is not a valid * texture. */ cudaErrorInvalidTexture = 18, /** * This indicates that the texture binding is not valid. This occurs if you * call ::cudaGetTextureAlignmentOffset() with an unbound texture. */ cudaErrorInvalidTextureBinding = 19, /** * This indicates that the channel descriptor passed to the API call is not * valid. This occurs if the format is not one of the formats specified by * ::cudaChannelFormatKind, or if one of the dimensions is invalid. */ cudaErrorInvalidChannelDescriptor = 20, /** * This indicates that the direction of the memcpy passed to the API call is * not one of the types specified by ::cudaMemcpyKind. */ cudaErrorInvalidMemcpyDirection = 21, /** * This indicated that the user has taken the address of a constant variable, * which was forbidden up until the CUDA 3.1 release. * @deprecated * This error return is deprecated as of CUDA 3.1. Variables in constant * memory may now have their address taken by the runtime via * ::cudaGetSymbolAddress(). */ cudaErrorAddressOfConstant = 22, /** * This indicated that a texture fetch was not able to be performed. * This was previously used for device emulation of texture operations. * @deprecated * This error return is deprecated as of CUDA 3.1. Device emulation mode was * removed with the CUDA 3.1 release. */ cudaErrorTextureFetchFailed = 23, /** * This indicated that a texture was not bound for access. * This was previously used for device emulation of texture operations. * @deprecated * This error return is deprecated as of CUDA 3.1. Device emulation mode was * removed with the CUDA 3.1 release. */ cudaErrorTextureNotBound = 24, /** * This indicated that a synchronization operation had failed. * This was previously used for some device emulation functions. * @deprecated * This error return is deprecated as of CUDA 3.1. Device emulation mode was * removed with the CUDA 3.1 release. */ cudaErrorSynchronizationError = 25, /** * This indicates that a non-float texture was being accessed with linear * filtering. This is not supported by CUDA. */ cudaErrorInvalidFilterSetting = 26, /** * This indicates that an attempt was made to read a non-float texture as a * normalized float. This is not supported by CUDA. */ cudaErrorInvalidNormSetting = 27, /** * Mixing of device and device emulation code was not allowed. * @deprecated * This error return is deprecated as of CUDA 3.1. Device emulation mode was * removed with the CUDA 3.1 release. */ cudaErrorMixedDeviceExecution = 28, /** * This indicates that a CUDA Runtime API call cannot be executed because * it is being called during process shut down, at a point in time after * CUDA driver has been unloaded. */ cudaErrorCudartUnloading = 29, /** * This indicates that an unknown internal error has occurred. 
*/ cudaErrorUnknown = 30, /** * This indicates that the API call is not yet implemented. Production * releases of CUDA will never return this error. * @deprecated * This error return is deprecated as of CUDA 4.1. */ cudaErrorNotYetImplemented = 31, /** * This indicated that an emulated device pointer exceeded the 32-bit address * range. * @deprecated * This error return is deprecated as of CUDA 3.1. Device emulation mode was * removed with the CUDA 3.1 release. */ cudaErrorMemoryValueTooLarge = 32, /** * This indicates that a resource handle passed to the API call was not * valid. Resource handles are opaque types like ::cudaStream_t and * ::cudaEvent_t. */ cudaErrorInvalidResourceHandle = 33, /** * This indicates that asynchronous operations issued previously have not * completed yet. This result is not actually an error, but must be indicated * differently than ::cudaSuccess (which indicates completion). Calls that * may return this value include ::cudaEventQuery() and ::cudaStreamQuery(). */ cudaErrorNotReady = 34, /** * This indicates that the installed NVIDIA CUDA driver is older than the * CUDA runtime library. This is not a supported configuration. Users should * install an updated NVIDIA display driver to allow the application to run. */ cudaErrorInsufficientDriver = 35, /** * This indicates that the user has called ::cudaSetValidDevices(), * ::cudaSetDeviceFlags(), ::cudaD3D9SetDirect3DDevice(), * ::cudaD3D10SetDirect3DDevice, ::cudaD3D11SetDirect3DDevice(), or * ::cudaVDPAUSetVDPAUDevice() after initializing the CUDA runtime by * calling non-device management operations (allocating memory and * launching kernels are examples of non-device management operations). * This error can also be returned if using runtime/driver * interoperability and there is an existing ::CUcontext active on the * host thread. */ cudaErrorSetOnActiveProcess = 36, /** * This indicates that the surface passed to the API call is not a valid * surface. */ cudaErrorInvalidSurface = 37, /** * This indicates that no CUDA-capable devices were detected by the installed * CUDA driver. */ cudaErrorNoDevice = 38, /** * This indicates that an uncorrectable ECC error was detected during * execution. */ cudaErrorECCUncorrectable = 39, /** * This indicates that a link to a shared object failed to resolve. */ cudaErrorSharedObjectSymbolNotFound = 40, /** * This indicates that initialization of a shared object failed. */ cudaErrorSharedObjectInitFailed = 41, /** * This indicates that the ::cudaLimit passed to the API call is not * supported by the active device. */ cudaErrorUnsupportedLimit = 42, /** * This indicates that multiple global or constant variables (across separate * CUDA source files in the application) share the same string name. */ cudaErrorDuplicateVariableName = 43, /** * This indicates that multiple textures (across separate CUDA source * files in the application) share the same string name. */ cudaErrorDuplicateTextureName = 44, /** * This indicates that multiple surfaces (across separate CUDA source * files in the application) share the same string name. */ cudaErrorDuplicateSurfaceName = 45, /** * This indicates that all CUDA devices are busy or unavailable at the current * time. Devices are often busy/unavailable due to use of * ::cudaComputeModeExclusive, ::cudaComputeModeProhibited or when long * running CUDA kernels have filled up the GPU and are blocking new work * from starting. They can also be unavailable due to memory constraints * on a device that already has active CUDA work being performed. 
*/ cudaErrorDevicesUnavailable = 46, /** * This indicates that the device kernel image is invalid. */ cudaErrorInvalidKernelImage = 47, /** * This indicates that there is no kernel image available that is suitable * for the device. This can occur when a user specifies code generation * options for a particular CUDA source file that do not include the * corresponding device configuration. */ cudaErrorNoKernelImageForDevice = 48, /** * This indicates that the current context is not compatible with this * the CUDA Runtime. This can only occur if you are using CUDA * Runtime/Driver interoperability and have created an existing Driver * context using the driver API. The Driver context may be incompatible * either because the Driver context was created using an older version * of the API, because the Runtime API call expects a primary driver * context and the Driver context is not primary, or because the Driver * context has been destroyed. Please see \ref CUDART_DRIVER "Interactions * with the CUDA Driver API" for more information. */ cudaErrorIncompatibleDriverContext = 49, /** * This error indicates that a call to ::cudaDeviceEnablePeerAccess() is * trying to re-enable peer addressing on from a context which has already * had peer addressing enabled. */ cudaErrorPeerAccessAlreadyEnabled = 50, /** * This error indicates that ::cudaDeviceDisablePeerAccess() is trying to * disable peer addressing which has not been enabled yet via * ::cudaDeviceEnablePeerAccess(). */ cudaErrorPeerAccessNotEnabled = 51, /** * This indicates that a call tried to access an exclusive-thread device that * is already in use by a different thread. */ cudaErrorDeviceAlreadyInUse = 54, /** * This indicates profiler is not initialized for this run. This can * happen when the application is running with external profiling tools * like visual profiler. */ cudaErrorProfilerDisabled = 55, /** * @deprecated * This error return is deprecated as of CUDA 5.0. It is no longer an error * to attempt to enable/disable the profiling via ::cudaProfilerStart or * ::cudaProfilerStop without initialization. */ cudaErrorProfilerNotInitialized = 56, /** * @deprecated * This error return is deprecated as of CUDA 5.0. It is no longer an error * to call cudaProfilerStart() when profiling is already enabled. */ cudaErrorProfilerAlreadyStarted = 57, /** * @deprecated * This error return is deprecated as of CUDA 5.0. It is no longer an error * to call cudaProfilerStop() when profiling is already disabled. */ cudaErrorProfilerAlreadyStopped = 58, /** * An assert triggered in device code during kernel execution. The device * cannot be used again until ::cudaThreadExit() is called. All existing * allocations are invalid and must be reconstructed if the program is to * continue using CUDA. */ cudaErrorAssert = 59, /** * This error indicates that the hardware resources required to enable * peer access have been exhausted for one or more of the devices * passed to ::cudaEnablePeerAccess(). */ cudaErrorTooManyPeers = 60, /** * This error indicates that the memory range passed to ::cudaHostRegister() * has already been registered. */ cudaErrorHostMemoryAlreadyRegistered = 61, /** * This error indicates that the pointer passed to ::cudaHostUnregister() * does not correspond to any currently registered memory region. */ cudaErrorHostMemoryNotRegistered = 62, /** * This error indicates that an OS call failed. */ cudaErrorOperatingSystem = 63, /** * This error indicates that P2P access is not supported across the given * devices. 
*/ cudaErrorPeerAccessUnsupported = 64, /** * This error indicates that a device runtime grid launch did not occur * because the depth of the child grid would exceed the maximum supported * number of nested grid launches. */ cudaErrorLaunchMaxDepthExceeded = 65, /** * This error indicates that a grid launch did not occur because the kernel * uses file-scoped textures which are unsupported by the device runtime. * Kernels launched via the device runtime only support textures created with * the Texture Object API's. */ cudaErrorLaunchFileScopedTex = 66, /** * This error indicates that a grid launch did not occur because the kernel * uses file-scoped surfaces which are unsupported by the device runtime. * Kernels launched via the device runtime only support surfaces created with * the Surface Object API's. */ cudaErrorLaunchFileScopedSurf = 67, /** * This error indicates that a call to ::cudaDeviceSynchronize made from * the device runtime failed because the call was made at grid depth greater * than than either the default (2 levels of grids) or user specified device * limit ::cudaLimitDevRuntimeSyncDepth. To be able to synchronize on * launched grids at a greater depth successfully, the maximum nested * depth at which ::cudaDeviceSynchronize will be called must be specified * with the ::cudaLimitDevRuntimeSyncDepth limit to the ::cudaDeviceSetLimit * api before the host-side launch of a kernel using the device runtime. * Keep in mind that additional levels of sync depth require the runtime * to reserve large amounts of device memory that cannot be used for * user allocations. */ cudaErrorSyncDepthExceeded = 68, /** * This error indicates that a device runtime grid launch failed because * the launch would exceed the limit ::cudaLimitDevRuntimePendingLaunchCount. * For this launch to proceed successfully, ::cudaDeviceSetLimit must be * called to set the ::cudaLimitDevRuntimePendingLaunchCount to be higher * than the upper bound of outstanding launches that can be issued to the * device runtime. Keep in mind that raising the limit of pending device * runtime launches will require the runtime to reserve device memory that * cannot be used for user allocations. */ cudaErrorLaunchPendingCountExceeded = 69, /** * This error indicates the attempted operation is not permitted. */ cudaErrorNotPermitted = 70, /** * This error indicates the attempted operation is not supported * on the current system or device. */ cudaErrorNotSupported = 71, /** * Device encountered an error in the call stack during kernel execution, * possibly due to stack corruption or exceeding the stack size limit. * This leaves the process in an inconsistent state and any further CUDA work * will return the same error. To continue using CUDA, the process must be terminated * and relaunched. */ cudaErrorHardwareStackError = 72, /** * The device encountered an illegal instruction during kernel execution * This leaves the process in an inconsistent state and any further CUDA work * will return the same error. To continue using CUDA, the process must be terminated * and relaunched. */ cudaErrorIllegalInstruction = 73, /** * The device encountered a load or store instruction * on a memory address which is not aligned. * This leaves the process in an inconsistent state and any further CUDA work * will return the same error. To continue using CUDA, the process must be terminated * and relaunched. 
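 * <p>
 * A hedged example (not part of the original header): errors of this fatal, "sticky" kind
 * typically surface when the runtime is checked after a kernel launch, e.g. with
 * {@code cudaGetLastError} and {@code cudaDeviceSynchronize}, both assumed to be bound in
 * this class as in the C API.
 * <pre>{@code
 * int launchStatus = cuda.cudaGetLastError();      // error recorded at launch time
 * int execStatus   = cuda.cudaDeviceSynchronize(); // errors raised while the kernel ran
 * if (launchStatus != cuda.cudaSuccess || execStatus != cuda.cudaSuccess) {
 *     // for the errors described here, the process must be terminated and relaunched
 * }
 * }</pre>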
*/ cudaErrorMisalignedAddress = 74, /** * While executing a kernel, the device encountered an instruction * which can only operate on memory locations in certain address spaces * (global, shared, or local), but was supplied a memory address not * belonging to an allowed address space. * This leaves the process in an inconsistent state and any further CUDA work * will return the same error. To continue using CUDA, the process must be terminated * and relaunched. */ cudaErrorInvalidAddressSpace = 75, /** * The device encountered an invalid program counter. * This leaves the process in an inconsistent state and any further CUDA work * will return the same error. To continue using CUDA, the process must be terminated * and relaunched. */ cudaErrorInvalidPc = 76, /** * The device encountered a load or store instruction on an invalid memory address. * This leaves the process in an inconsistent state and any further CUDA work * will return the same error. To continue using CUDA, the process must be terminated * and relaunched. */ cudaErrorIllegalAddress = 77, /** * A PTX compilation failed. The runtime may fall back to compiling PTX if * an application does not contain a suitable binary for the current device. */ cudaErrorInvalidPtx = 78, /** * This indicates an error with the OpenGL or DirectX context. */ cudaErrorInvalidGraphicsContext = 79, /** * This indicates that an uncorrectable NVLink error was detected during the * execution. */ cudaErrorNvlinkUncorrectable = 80, /** * This indicates an internal startup failure in the CUDA runtime. */ cudaErrorStartupFailure = 0x7f, /** * Any unhandled CUDA driver error is added to this value and returned via * the runtime. Production releases of CUDA should not return such errors. * @deprecated * This error return is deprecated as of CUDA 4.1. */ cudaErrorApiFailureBase = 10000; /** * Channel format kind */ /** enum cudaChannelFormatKind */ public static final int /** Signed channel format */ cudaChannelFormatKindSigned = 0, /** Unsigned channel format */ cudaChannelFormatKindUnsigned = 1, /** Float channel format */ cudaChannelFormatKindFloat = 2, /** No channel format */ cudaChannelFormatKindNone = 3; /** * CUDA Channel format descriptor */ public static class cudaChannelFormatDesc extends Pointer { static { Loader.load(); } /** Default native constructor. */ public cudaChannelFormatDesc() { super((Pointer)null); allocate(); } /** Native array allocator. Access with {@link Pointer#position(long)}. */ public cudaChannelFormatDesc(long size) { super((Pointer)null); allocateArray(size); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public cudaChannelFormatDesc(Pointer p) { super(p); } private native void allocate(); private native void allocateArray(long size); @Override public cudaChannelFormatDesc position(long position) { return (cudaChannelFormatDesc)super.position(position); } /** x */ public native int x(); public native cudaChannelFormatDesc x(int x); /** y */ public native int y(); public native cudaChannelFormatDesc y(int y); /** z */ public native int z(); public native cudaChannelFormatDesc z(int z); /** w */ public native int w(); public native cudaChannelFormatDesc w(int w); /** Channel format kind */ public native @Cast("cudaChannelFormatKind") int f(); public native cudaChannelFormatDesc f(int f); } /** * CUDA array */ @Opaque public static class cudaArray extends Pointer { /** Empty constructor. Calls {@code super((Pointer)null)}. */ public cudaArray() { super((Pointer)null); } /** Pointer cast constructor. 
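 * <p>
 * An illustrative sketch (added for these bindings, not from the CUDA header): a
 * {@code cudaChannelFormatDesc} describing a single 32-bit float channel, built with the
 * fluent setters declared above. Such a descriptor would then be handed to an
 * array-allocation call such as {@code cudaMallocArray} (not shown here).
 * <pre>{@code
 * cudaChannelFormatDesc desc = new cudaChannelFormatDesc()
 *         .x(32).y(0).z(0).w(0)                    // bits per channel
 *         .f(cuda.cudaChannelFormatKindFloat);     // channel format kind
 * }</pre>
 * <p>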
Invokes {@link Pointer#Pointer(Pointer)}. */ public cudaArray(Pointer p) { super(p); } } /** * CUDA array (as source copy argument) */ /** * CUDA mipmapped array */ @Opaque public static class cudaMipmappedArray extends Pointer { /** Empty constructor. Calls {@code super((Pointer)null)}. */ public cudaMipmappedArray() { super((Pointer)null); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public cudaMipmappedArray(Pointer p) { super(p); } } /** * CUDA mipmapped array (as source argument) */ /** * CUDA memory types */ /** enum cudaMemoryType */ public static final int /** Host memory */ cudaMemoryTypeHost = 1, /** Device memory */ cudaMemoryTypeDevice = 2; /** * CUDA memory copy types */ /** enum cudaMemcpyKind */ public static final int /** Host -> Host */ cudaMemcpyHostToHost = 0, /** Host -> Device */ cudaMemcpyHostToDevice = 1, /** Device -> Host */ cudaMemcpyDeviceToHost = 2, /** Device -> Device */ cudaMemcpyDeviceToDevice = 3, /** Direction of the transfer is inferred from the pointer values. Requires unified virtual addressing */ cudaMemcpyDefault = 4; /** * CUDA Pitched memory pointer * * \sa ::make_cudaPitchedPtr */ public static class cudaPitchedPtr extends Pointer { static { Loader.load(); } /** Default native constructor. */ public cudaPitchedPtr() { super((Pointer)null); allocate(); } /** Native array allocator. Access with {@link Pointer#position(long)}. */ public cudaPitchedPtr(long size) { super((Pointer)null); allocateArray(size); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public cudaPitchedPtr(Pointer p) { super(p); } private native void allocate(); private native void allocateArray(long size); @Override public cudaPitchedPtr position(long position) { return (cudaPitchedPtr)super.position(position); } /** Pointer to allocated memory */ public native Pointer ptr(); public native cudaPitchedPtr ptr(Pointer ptr); /** Pitch of allocated memory in bytes */ public native @Cast("size_t") long pitch(); public native cudaPitchedPtr pitch(long pitch); /** Logical width of allocation in elements */ public native @Cast("size_t") long xsize(); public native cudaPitchedPtr xsize(long xsize); /** Logical height of allocation in elements */ public native @Cast("size_t") long ysize(); public native cudaPitchedPtr ysize(long ysize); } /** * CUDA extent * * \sa ::make_cudaExtent */ public static class cudaExtent extends Pointer { static { Loader.load(); } /** Default native constructor. */ public cudaExtent() { super((Pointer)null); allocate(); } /** Native array allocator. Access with {@link Pointer#position(long)}. */ public cudaExtent(long size) { super((Pointer)null); allocateArray(size); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
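 * <p>
 * A usage sketch, not part of the original header: describing a 64 x 64 x 8 volume of
 * floats with {@code cudaExtent} and allocating it with {@code cudaMalloc3D}, which fills
 * in a {@code cudaPitchedPtr}. The binding of {@code cudaMalloc3D} (pitched pointer as
 * output struct, extent passed by value) is assumed to match the C API.
 * <pre>{@code
 * cudaExtent extent = new cudaExtent()
 *         .width(64 * 4)   // width is in bytes for linear memory: 64 floats per row
 *         .height(64)
 *         .depth(8);
 * cudaPitchedPtr devPitched = new cudaPitchedPtr();
 * int status = cuda.cudaMalloc3D(devPitched, extent);
 * long pitch = devPitched.pitch();                 // row pitch chosen by the driver, in bytes
 * }</pre>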
*/ public cudaExtent(Pointer p) { super(p); } private native void allocate(); private native void allocateArray(long size); @Override public cudaExtent position(long position) { return (cudaExtent)super.position(position); } /** Width in elements when referring to array memory, in bytes when referring to linear memory */ public native @Cast("size_t") long width(); public native cudaExtent width(long width); /** Height in elements */ public native @Cast("size_t") long height(); public native cudaExtent height(long height); /** Depth in elements */ public native @Cast("size_t") long depth(); public native cudaExtent depth(long depth); } /** * CUDA 3D position * * \sa ::make_cudaPos */ public static class cudaPos extends Pointer { static { Loader.load(); } /** Default native constructor. */ public cudaPos() { super((Pointer)null); allocate(); } /** Native array allocator. Access with {@link Pointer#position(long)}. */ public cudaPos(long size) { super((Pointer)null); allocateArray(size); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public cudaPos(Pointer p) { super(p); } private native void allocate(); private native void allocateArray(long size); @Override public cudaPos position(long position) { return (cudaPos)super.position(position); } /** x */ public native @Cast("size_t") long x(); public native cudaPos x(long x); /** y */ public native @Cast("size_t") long y(); public native cudaPos y(long y); /** z */ public native @Cast("size_t") long z(); public native cudaPos z(long z); } /** * CUDA 3D memory copying parameters */ public static class cudaMemcpy3DParms extends Pointer { static { Loader.load(); } /** Default native constructor. */ public cudaMemcpy3DParms() { super((Pointer)null); allocate(); } /** Native array allocator. Access with {@link Pointer#position(long)}. */ public cudaMemcpy3DParms(long size) { super((Pointer)null); allocateArray(size); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public cudaMemcpy3DParms(Pointer p) { super(p); } private native void allocate(); private native void allocateArray(long size); @Override public cudaMemcpy3DParms position(long position) { return (cudaMemcpy3DParms)super.position(position); } /** Source memory address */ public native cudaArray srcArray(); public native cudaMemcpy3DParms srcArray(cudaArray srcArray); /** Source position offset */ public native @ByRef cudaPos srcPos(); public native cudaMemcpy3DParms srcPos(cudaPos srcPos); /** Pitched source memory address */ public native @ByRef cudaPitchedPtr srcPtr(); public native cudaMemcpy3DParms srcPtr(cudaPitchedPtr srcPtr); /** Destination memory address */ public native cudaArray dstArray(); public native cudaMemcpy3DParms dstArray(cudaArray dstArray); /** Destination position offset */ public native @ByRef cudaPos dstPos(); public native cudaMemcpy3DParms dstPos(cudaPos dstPos); /** Pitched destination memory address */ public native @ByRef cudaPitchedPtr dstPtr(); public native cudaMemcpy3DParms dstPtr(cudaPitchedPtr dstPtr); /** Requested memory copy size */ public native @ByRef cudaExtent extent(); public native cudaMemcpy3DParms extent(cudaExtent extent); /** Type of transfer */ public native @Cast("cudaMemcpyKind") int kind(); public native cudaMemcpy3DParms kind(int kind); } /** * CUDA 3D cross-device memory copying parameters */ public static class cudaMemcpy3DPeerParms extends Pointer { static { Loader.load(); } /** Default native constructor. 
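 * <p>
 * An illustrative example (not from the original header): filling the
 * {@code cudaMemcpy3DParms} structure declared above for a host-to-device 3D copy and
 * issuing it with {@code cudaMemcpy3D}, whose binding (a single struct argument) is
 * assumed to follow the C API. {@code hostPitched}, {@code devPitched} and {@code extent}
 * are hypothetical variables prepared beforehand.
 * <pre>{@code
 * cudaMemcpy3DParms parms = new cudaMemcpy3DParms()
 *         .srcPtr(hostPitched)                     // pitched description of the source
 *         .dstPtr(devPitched)                      // pitched description of the destination
 *         .extent(extent)                          // size of the region to copy
 *         .kind(cuda.cudaMemcpyHostToDevice);      // direction of the transfer
 * int status = cuda.cudaMemcpy3D(parms);
 * }</pre>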
*/ public cudaMemcpy3DPeerParms() { super((Pointer)null); allocate(); } /** Native array allocator. Access with {@link Pointer#position(long)}. */ public cudaMemcpy3DPeerParms(long size) { super((Pointer)null); allocateArray(size); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public cudaMemcpy3DPeerParms(Pointer p) { super(p); } private native void allocate(); private native void allocateArray(long size); @Override public cudaMemcpy3DPeerParms position(long position) { return (cudaMemcpy3DPeerParms)super.position(position); } /** Source memory address */ public native cudaArray srcArray(); public native cudaMemcpy3DPeerParms srcArray(cudaArray srcArray); /** Source position offset */ public native @ByRef cudaPos srcPos(); public native cudaMemcpy3DPeerParms srcPos(cudaPos srcPos); /** Pitched source memory address */ public native @ByRef cudaPitchedPtr srcPtr(); public native cudaMemcpy3DPeerParms srcPtr(cudaPitchedPtr srcPtr); /** Source device */ public native int srcDevice(); public native cudaMemcpy3DPeerParms srcDevice(int srcDevice); /** Destination memory address */ public native cudaArray dstArray(); public native cudaMemcpy3DPeerParms dstArray(cudaArray dstArray); /** Destination position offset */ public native @ByRef cudaPos dstPos(); public native cudaMemcpy3DPeerParms dstPos(cudaPos dstPos); /** Pitched destination memory address */ public native @ByRef cudaPitchedPtr dstPtr(); public native cudaMemcpy3DPeerParms dstPtr(cudaPitchedPtr dstPtr); /** Destination device */ public native int dstDevice(); public native cudaMemcpy3DPeerParms dstDevice(int dstDevice); /** Requested memory copy size */ public native @ByRef cudaExtent extent(); public native cudaMemcpy3DPeerParms extent(cudaExtent extent); } /** * CUDA graphics interop resource */ @Opaque public static class cudaGraphicsResource extends Pointer { /** Empty constructor. Calls {@code super((Pointer)null)}. */ public cudaGraphicsResource() { super((Pointer)null); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
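 * <p>
 * A sketch of the peer variant declared above (not part of the original header): it adds
 * source and destination device ordinals and is issued with {@code cudaMemcpy3DPeer},
 * whose binding (a single struct argument) is assumed to follow the C API.
 * {@code srcPitched}, {@code dstPitched} and {@code extent} are hypothetical variables
 * prepared beforehand.
 * <pre>{@code
 * cudaMemcpy3DPeerParms peer = new cudaMemcpy3DPeerParms()
 *         .srcDevice(0).srcPtr(srcPitched)         // where the data currently lives
 *         .dstDevice(1).dstPtr(dstPitched)         // where it should end up
 *         .extent(extent);
 * int status = cuda.cudaMemcpy3DPeer(peer);
 * }</pre>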
*/ public cudaGraphicsResource(Pointer p) { super(p); } } /** * CUDA graphics interop register flags */ /** enum cudaGraphicsRegisterFlags */ public static final int /** Default */ cudaGraphicsRegisterFlagsNone = 0, /** CUDA will not write to this resource */ cudaGraphicsRegisterFlagsReadOnly = 1, /** CUDA will only write to and will not read from this resource */ cudaGraphicsRegisterFlagsWriteDiscard = 2, /** CUDA will bind this resource to a surface reference */ cudaGraphicsRegisterFlagsSurfaceLoadStore = 4, /** CUDA will perform texture gather operations on this resource */ cudaGraphicsRegisterFlagsTextureGather = 8; /** * CUDA graphics interop map flags */ /** enum cudaGraphicsMapFlags */ public static final int /** Default; Assume resource can be read/written */ cudaGraphicsMapFlagsNone = 0, /** CUDA will not write to this resource */ cudaGraphicsMapFlagsReadOnly = 1, /** CUDA will only write to and will not read from this resource */ cudaGraphicsMapFlagsWriteDiscard = 2; /** * CUDA graphics interop array indices for cube maps */ /** enum cudaGraphicsCubeFace */ public static final int /** Positive X face of cubemap */ cudaGraphicsCubeFacePositiveX = 0x00, /** Negative X face of cubemap */ cudaGraphicsCubeFaceNegativeX = 0x01, /** Positive Y face of cubemap */ cudaGraphicsCubeFacePositiveY = 0x02, /** Negative Y face of cubemap */ cudaGraphicsCubeFaceNegativeY = 0x03, /** Positive Z face of cubemap */ cudaGraphicsCubeFacePositiveZ = 0x04, /** Negative Z face of cubemap */ cudaGraphicsCubeFaceNegativeZ = 0x05; /** * CUDA resource types */ /** enum cudaResourceType */ public static final int /** Array resource */ cudaResourceTypeArray = 0x00, /** Mipmapped array resource */ cudaResourceTypeMipmappedArray = 0x01, /** Linear resource */ cudaResourceTypeLinear = 0x02, /** Pitch 2D resource */ cudaResourceTypePitch2D = 0x03; /** * CUDA texture resource view formats */ /** enum cudaResourceViewFormat */ public static final int /** No resource view format (use underlying resource format) */ cudaResViewFormatNone = 0x00, /** 1 channel unsigned 8-bit integers */ cudaResViewFormatUnsignedChar1 = 0x01, /** 2 channel unsigned 8-bit integers */ cudaResViewFormatUnsignedChar2 = 0x02, /** 4 channel unsigned 8-bit integers */ cudaResViewFormatUnsignedChar4 = 0x03, /** 1 channel signed 8-bit integers */ cudaResViewFormatSignedChar1 = 0x04, /** 2 channel signed 8-bit integers */ cudaResViewFormatSignedChar2 = 0x05, /** 4 channel signed 8-bit integers */ cudaResViewFormatSignedChar4 = 0x06, /** 1 channel unsigned 16-bit integers */ cudaResViewFormatUnsignedShort1 = 0x07, /** 2 channel unsigned 16-bit integers */ cudaResViewFormatUnsignedShort2 = 0x08, /** 4 channel unsigned 16-bit integers */ cudaResViewFormatUnsignedShort4 = 0x09, /** 1 channel signed 16-bit integers */ cudaResViewFormatSignedShort1 = 0x0a, /** 2 channel signed 16-bit integers */ cudaResViewFormatSignedShort2 = 0x0b, /** 4 channel signed 16-bit integers */ cudaResViewFormatSignedShort4 = 0x0c, /** 1 channel unsigned 32-bit integers */ cudaResViewFormatUnsignedInt1 = 0x0d, /** 2 channel unsigned 32-bit integers */ cudaResViewFormatUnsignedInt2 = 0x0e, /** 4 channel unsigned 32-bit integers */ cudaResViewFormatUnsignedInt4 = 0x0f, /** 1 channel signed 32-bit integers */ cudaResViewFormatSignedInt1 = 0x10, /** 2 channel signed 32-bit integers */ cudaResViewFormatSignedInt2 = 0x11, /** 4 channel signed 32-bit integers */ cudaResViewFormatSignedInt4 = 0x12, /** 1 channel 16-bit floating point */ cudaResViewFormatHalf1 = 0x13, /** 2 channel 
16-bit floating point */ cudaResViewFormatHalf2 = 0x14, /** 4 channel 16-bit floating point */ cudaResViewFormatHalf4 = 0x15, /** 1 channel 32-bit floating point */ cudaResViewFormatFloat1 = 0x16, /** 2 channel 32-bit floating point */ cudaResViewFormatFloat2 = 0x17, /** 4 channel 32-bit floating point */ cudaResViewFormatFloat4 = 0x18, /** Block compressed 1 */ cudaResViewFormatUnsignedBlockCompressed1 = 0x19, /** Block compressed 2 */ cudaResViewFormatUnsignedBlockCompressed2 = 0x1a, /** Block compressed 3 */ cudaResViewFormatUnsignedBlockCompressed3 = 0x1b, /** Block compressed 4 unsigned */ cudaResViewFormatUnsignedBlockCompressed4 = 0x1c, /** Block compressed 4 signed */ cudaResViewFormatSignedBlockCompressed4 = 0x1d, /** Block compressed 5 unsigned */ cudaResViewFormatUnsignedBlockCompressed5 = 0x1e, /** Block compressed 5 signed */ cudaResViewFormatSignedBlockCompressed5 = 0x1f, /** Block compressed 6 unsigned half-float */ cudaResViewFormatUnsignedBlockCompressed6H = 0x20, /** Block compressed 6 signed half-float */ cudaResViewFormatSignedBlockCompressed6H = 0x21, /** Block compressed 7 */ cudaResViewFormatUnsignedBlockCompressed7 = 0x22; /** * CUDA resource descriptor */ public static class cudaResourceDesc extends Pointer { static { Loader.load(); } /** Default native constructor. */ public cudaResourceDesc() { super((Pointer)null); allocate(); } /** Native array allocator. Access with {@link Pointer#position(long)}. */ public cudaResourceDesc(long size) { super((Pointer)null); allocateArray(size); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public cudaResourceDesc(Pointer p) { super(p); } private native void allocate(); private native void allocateArray(long size); @Override public cudaResourceDesc position(long position) { return (cudaResourceDesc)super.position(position); } /** Resource type */ public native @Cast("cudaResourceType") int resType(); public native cudaResourceDesc resType(int resType); /** CUDA array */ @Name("res.array.array") public native cudaArray res_array_array(); public native cudaResourceDesc res_array_array(cudaArray res_array_array); /** CUDA mipmapped array */ @Name("res.mipmap.mipmap") public native cudaMipmappedArray res_mipmap_mipmap(); public native cudaResourceDesc res_mipmap_mipmap(cudaMipmappedArray res_mipmap_mipmap); /** Device pointer */ @Name("res.linear.devPtr") public native Pointer res_linear_devPtr(); public native cudaResourceDesc res_linear_devPtr(Pointer res_linear_devPtr); /** Channel descriptor */ @Name("res.linear.desc") public native @ByRef cudaChannelFormatDesc res_linear_desc(); public native cudaResourceDesc res_linear_desc(cudaChannelFormatDesc res_linear_desc); /** Size in bytes */ @Name("res.linear.sizeInBytes") public native @Cast("size_t") long res_linear_sizeInBytes(); public native cudaResourceDesc res_linear_sizeInBytes(long res_linear_sizeInBytes); /** Device pointer */ @Name("res.pitch2D.devPtr") public native Pointer res_pitch2D_devPtr(); public native cudaResourceDesc res_pitch2D_devPtr(Pointer res_pitch2D_devPtr); /** Channel descriptor */ @Name("res.pitch2D.desc") public native @ByRef cudaChannelFormatDesc res_pitch2D_desc(); public native cudaResourceDesc res_pitch2D_desc(cudaChannelFormatDesc res_pitch2D_desc); /** Width of the array in elements */ @Name("res.pitch2D.width") public native @Cast("size_t") long res_pitch2D_width(); public native cudaResourceDesc res_pitch2D_width(long res_pitch2D_width); /** Height of the array in elements */ @Name("res.pitch2D.height") public 
native @Cast("size_t") long res_pitch2D_height(); public native cudaResourceDesc res_pitch2D_height(long res_pitch2D_height); /** Pitch between two rows in bytes */ @Name("res.pitch2D.pitchInBytes") public native @Cast("size_t") long res_pitch2D_pitchInBytes(); public native cudaResourceDesc res_pitch2D_pitchInBytes(long res_pitch2D_pitchInBytes); } /** * CUDA resource view descriptor */ public static class cudaResourceViewDesc extends Pointer { static { Loader.load(); } /** Default native constructor. */ public cudaResourceViewDesc() { super((Pointer)null); allocate(); } /** Native array allocator. Access with {@link Pointer#position(long)}. */ public cudaResourceViewDesc(long size) { super((Pointer)null); allocateArray(size); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public cudaResourceViewDesc(Pointer p) { super(p); } private native void allocate(); private native void allocateArray(long size); @Override public cudaResourceViewDesc position(long position) { return (cudaResourceViewDesc)super.position(position); } /** Resource view format */ public native @Cast("cudaResourceViewFormat") int format(); public native cudaResourceViewDesc format(int format); /** Width of the resource view */ public native @Cast("size_t") long width(); public native cudaResourceViewDesc width(long width); /** Height of the resource view */ public native @Cast("size_t") long height(); public native cudaResourceViewDesc height(long height); /** Depth of the resource view */ public native @Cast("size_t") long depth(); public native cudaResourceViewDesc depth(long depth); /** First defined mipmap level */ public native @Cast("unsigned int") int firstMipmapLevel(); public native cudaResourceViewDesc firstMipmapLevel(int firstMipmapLevel); /** Last defined mipmap level */ public native @Cast("unsigned int") int lastMipmapLevel(); public native cudaResourceViewDesc lastMipmapLevel(int lastMipmapLevel); /** First layer index */ public native @Cast("unsigned int") int firstLayer(); public native cudaResourceViewDesc firstLayer(int firstLayer); /** Last layer index */ public native @Cast("unsigned int") int lastLayer(); public native cudaResourceViewDesc lastLayer(int lastLayer); } /** * CUDA pointer attributes */ public static class cudaPointerAttributes extends Pointer { static { Loader.load(); } /** Default native constructor. */ public cudaPointerAttributes() { super((Pointer)null); allocate(); } /** Native array allocator. Access with {@link Pointer#position(long)}. */ public cudaPointerAttributes(long size) { super((Pointer)null); allocateArray(size); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public cudaPointerAttributes(Pointer p) { super(p); } private native void allocate(); private native void allocateArray(long size); @Override public cudaPointerAttributes position(long position) { return (cudaPointerAttributes)super.position(position); } /** * The physical location of the memory, ::cudaMemoryTypeHost or * ::cudaMemoryTypeDevice. */ public native @Cast("cudaMemoryType") int memoryType(); public native cudaPointerAttributes memoryType(int memoryType); /** * The device against which the memory was allocated or registered. * If the memory type is ::cudaMemoryTypeDevice then this identifies * the device on which the memory referred physically resides. 
If * the memory type is ::cudaMemoryTypeHost then this identifies the * device which was current when the memory was allocated or registered * (and if that device is deinitialized then this allocation will vanish * with that device's state). */ public native int device(); public native cudaPointerAttributes device(int device); /** * The address which may be dereferenced on the current device to access * the memory or NULL if no such address exists. */ public native Pointer devicePointer(); public native cudaPointerAttributes devicePointer(Pointer devicePointer); /** * The address which may be dereferenced on the host to access the * memory or NULL if no such address exists. */ public native Pointer hostPointer(); public native cudaPointerAttributes hostPointer(Pointer hostPointer); /** * Indicates if this pointer points to managed memory */ public native int isManaged(); public native cudaPointerAttributes isManaged(int isManaged); } /** * CUDA function attributes */ public static class cudaFuncAttributes extends Pointer { static { Loader.load(); } /** Default native constructor. */ public cudaFuncAttributes() { super((Pointer)null); allocate(); } /** Native array allocator. Access with {@link Pointer#position(long)}. */ public cudaFuncAttributes(long size) { super((Pointer)null); allocateArray(size); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public cudaFuncAttributes(Pointer p) { super(p); } private native void allocate(); private native void allocateArray(long size); @Override public cudaFuncAttributes position(long position) { return (cudaFuncAttributes)super.position(position); } /** * The size in bytes of statically-allocated shared memory per block * required by this function. This does not include dynamically-allocated * shared memory requested by the user at runtime. */ public native @Cast("size_t") long sharedSizeBytes(); public native cudaFuncAttributes sharedSizeBytes(long sharedSizeBytes); /** * The size in bytes of user-allocated constant memory required by this * function. */ public native @Cast("size_t") long constSizeBytes(); public native cudaFuncAttributes constSizeBytes(long constSizeBytes); /** * The size in bytes of local memory used by each thread of this function. */ public native @Cast("size_t") long localSizeBytes(); public native cudaFuncAttributes localSizeBytes(long localSizeBytes); /** * The maximum number of threads per block, beyond which a launch of the * function would fail. This number depends on both the function and the * device on which the function is currently loaded. */ public native int maxThreadsPerBlock(); public native cudaFuncAttributes maxThreadsPerBlock(int maxThreadsPerBlock); /** * The number of registers used by each thread of this function. */ public native int numRegs(); public native cudaFuncAttributes numRegs(int numRegs); /** * The PTX virtual architecture version for which the function was * compiled. This value is the major PTX version * 10 + the minor PTX * version, so a PTX version 1.3 function would return the value 13. */ public native int ptxVersion(); public native cudaFuncAttributes ptxVersion(int ptxVersion); /** * The binary architecture version for which the function was compiled. * This value is the major binary version * 10 + the minor binary version, * so a binary version 1.3 function would return the value 13. 
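 * <p>
 * A small worked example (not from the original header): decoding the encoded value, here
 * from a hypothetical {@code cudaFuncAttributes} instance {@code attr} assumed to have
 * been filled in by {@code cudaFuncGetAttributes}.
 * <pre>{@code
 * int encoded = attr.binaryVersion();              // e.g. 13 for binary version 1.3
 * int major = encoded / 10;
 * int minor = encoded % 10;
 * }</pre>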
*/ public native int binaryVersion(); public native cudaFuncAttributes binaryVersion(int binaryVersion); /** * The attribute to indicate whether the function has been compiled with * user specified option "-Xptxas --dlcm=ca" set. */ public native int cacheModeCA(); public native cudaFuncAttributes cacheModeCA(int cacheModeCA); } /** * CUDA function cache configurations */ /** enum cudaFuncCache */ public static final int /** Default function cache configuration, no preference */ cudaFuncCachePreferNone = 0, /** Prefer larger shared memory and smaller L1 cache */ cudaFuncCachePreferShared = 1, /** Prefer larger L1 cache and smaller shared memory */ cudaFuncCachePreferL1 = 2, /** Prefer equal size L1 cache and shared memory */ cudaFuncCachePreferEqual = 3; /** * CUDA shared memory configuration */ /** enum cudaSharedMemConfig */ public static final int cudaSharedMemBankSizeDefault = 0, cudaSharedMemBankSizeFourByte = 1, cudaSharedMemBankSizeEightByte = 2; /** * CUDA device compute modes */ /** enum cudaComputeMode */ public static final int /** Default compute mode (Multiple threads can use ::cudaSetDevice() with this device) */ cudaComputeModeDefault = 0, /** Compute-exclusive-thread mode (Only one thread in one process will be able to use ::cudaSetDevice() with this device) */ cudaComputeModeExclusive = 1, /** Compute-prohibited mode (No threads can use ::cudaSetDevice() with this device) */ cudaComputeModeProhibited = 2, /** Compute-exclusive-process mode (Many threads in one process will be able to use ::cudaSetDevice() with this device) */ cudaComputeModeExclusiveProcess = 3; /** * CUDA Limits */ /** enum cudaLimit */ public static final int /** GPU thread stack size */ cudaLimitStackSize = 0x00, /** GPU printf/fprintf FIFO size */ cudaLimitPrintfFifoSize = 0x01, /** GPU malloc heap size */ cudaLimitMallocHeapSize = 0x02, /** GPU device runtime synchronize depth */ cudaLimitDevRuntimeSyncDepth = 0x03, /** GPU device runtime pending launch count */ cudaLimitDevRuntimePendingLaunchCount = 0x04; /** * CUDA Memory Advise values */ /** enum cudaMemoryAdvise */ public static final int /** Data will mostly be read and only occassionally be written to */ cudaMemAdviseSetReadMostly = 1, /** Undo the effect of ::cudaMemAdviseSetReadMostly */ cudaMemAdviseUnsetReadMostly = 2, /** Set the preferred location for the data as the specified device */ cudaMemAdviseSetPreferredLocation = 3, /** Clear the preferred location for the data */ cudaMemAdviseUnsetPreferredLocation = 4, /** Data will be accessed by the specified device, so prevent page faults as much as possible */ cudaMemAdviseSetAccessedBy = 5, /** Let the Unified Memory subsystem decide on the page faulting policy for the specified device */ cudaMemAdviseUnsetAccessedBy = 6; /** * CUDA range attributes */ /** enum cudaMemRangeAttribute */ public static final int /** Whether the range will mostly be read and only occassionally be written to */ cudaMemRangeAttributeReadMostly = 1, /** The preferred location of the range */ cudaMemRangeAttributePreferredLocation = 2, /** Memory range has ::cudaMemAdviseSetAccessedBy set for specified device */ cudaMemRangeAttributeAccessedBy = 3, /** The last location to which the range was prefetched */ cudaMemRangeAttributeLastPrefetchLocation = 4; /** * CUDA Profiler Output modes */ /** enum cudaOutputMode */ public static final int /** Output mode Key-Value pair format. */ cudaKeyValuePair = 0x00, /** Output mode Comma separated values format. 
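 * <p>
 * A usage note (not part of the original header): the enum constants in this block are
 * passed as plain {@code int} values to the corresponding runtime calls, for example
 * {@code cudaDeviceSetLimit} and {@code cudaDeviceSetCacheConfig}, both assumed to be
 * bound in this class as in the C API.
 * <pre>{@code
 * // raise the device-runtime synchronization depth (see cudaLimitDevRuntimeSyncDepth above)
 * cuda.cudaDeviceSetLimit(cuda.cudaLimitDevRuntimeSyncDepth, 4);
 * // prefer a larger L1 cache over shared memory for subsequently launched kernels
 * cuda.cudaDeviceSetCacheConfig(cuda.cudaFuncCachePreferL1);
 * }</pre>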
*/ cudaCSV = 0x01; /** * CUDA device attributes */ /** enum cudaDeviceAttr */ public static final int /** Maximum number of threads per block */ cudaDevAttrMaxThreadsPerBlock = 1, /** Maximum block dimension X */ cudaDevAttrMaxBlockDimX = 2, /** Maximum block dimension Y */ cudaDevAttrMaxBlockDimY = 3, /** Maximum block dimension Z */ cudaDevAttrMaxBlockDimZ = 4, /** Maximum grid dimension X */ cudaDevAttrMaxGridDimX = 5, /** Maximum grid dimension Y */ cudaDevAttrMaxGridDimY = 6, /** Maximum grid dimension Z */ cudaDevAttrMaxGridDimZ = 7, /** Maximum shared memory available per block in bytes */ cudaDevAttrMaxSharedMemoryPerBlock = 8, /** Memory available on device for __constant__ variables in a CUDA C kernel in bytes */ cudaDevAttrTotalConstantMemory = 9, /** Warp size in threads */ cudaDevAttrWarpSize = 10, /** Maximum pitch in bytes allowed by memory copies */ cudaDevAttrMaxPitch = 11, /** Maximum number of 32-bit registers available per block */ cudaDevAttrMaxRegistersPerBlock = 12, /** Peak clock frequency in kilohertz */ cudaDevAttrClockRate = 13, /** Alignment requirement for textures */ cudaDevAttrTextureAlignment = 14, /** Device can possibly copy memory and execute a kernel concurrently */ cudaDevAttrGpuOverlap = 15, /** Number of multiprocessors on device */ cudaDevAttrMultiProcessorCount = 16, /** Specifies whether there is a run time limit on kernels */ cudaDevAttrKernelExecTimeout = 17, /** Device is integrated with host memory */ cudaDevAttrIntegrated = 18, /** Device can map host memory into CUDA address space */ cudaDevAttrCanMapHostMemory = 19, /** Compute mode (See ::cudaComputeMode for details) */ cudaDevAttrComputeMode = 20, /** Maximum 1D texture width */ cudaDevAttrMaxTexture1DWidth = 21, /** Maximum 2D texture width */ cudaDevAttrMaxTexture2DWidth = 22, /** Maximum 2D texture height */ cudaDevAttrMaxTexture2DHeight = 23, /** Maximum 3D texture width */ cudaDevAttrMaxTexture3DWidth = 24, /** Maximum 3D texture height */ cudaDevAttrMaxTexture3DHeight = 25, /** Maximum 3D texture depth */ cudaDevAttrMaxTexture3DDepth = 26, /** Maximum 2D layered texture width */ cudaDevAttrMaxTexture2DLayeredWidth = 27, /** Maximum 2D layered texture height */ cudaDevAttrMaxTexture2DLayeredHeight = 28, /** Maximum layers in a 2D layered texture */ cudaDevAttrMaxTexture2DLayeredLayers = 29, /** Alignment requirement for surfaces */ cudaDevAttrSurfaceAlignment = 30, /** Device can possibly execute multiple kernels concurrently */ cudaDevAttrConcurrentKernels = 31, /** Device has ECC support enabled */ cudaDevAttrEccEnabled = 32, /** PCI bus ID of the device */ cudaDevAttrPciBusId = 33, /** PCI device ID of the device */ cudaDevAttrPciDeviceId = 34, /** Device is using TCC driver model */ cudaDevAttrTccDriver = 35, /** Peak memory clock frequency in kilohertz */ cudaDevAttrMemoryClockRate = 36, /** Global memory bus width in bits */ cudaDevAttrGlobalMemoryBusWidth = 37, /** Size of L2 cache in bytes */ cudaDevAttrL2CacheSize = 38, /** Maximum resident threads per multiprocessor */ cudaDevAttrMaxThreadsPerMultiProcessor = 39, /** Number of asynchronous engines */ cudaDevAttrAsyncEngineCount = 40, /** Device shares a unified address space with the host */ cudaDevAttrUnifiedAddressing = 41, /** Maximum 1D layered texture width */ cudaDevAttrMaxTexture1DLayeredWidth = 42, /** Maximum layers in a 1D layered texture */ cudaDevAttrMaxTexture1DLayeredLayers = 43, /** Maximum 2D texture width if cudaArrayTextureGather is set */ cudaDevAttrMaxTexture2DGatherWidth = 45, /** Maximum 2D texture 
height if cudaArrayTextureGather is set */ cudaDevAttrMaxTexture2DGatherHeight = 46, /** Alternate maximum 3D texture width */ cudaDevAttrMaxTexture3DWidthAlt = 47, /** Alternate maximum 3D texture height */ cudaDevAttrMaxTexture3DHeightAlt = 48, /** Alternate maximum 3D texture depth */ cudaDevAttrMaxTexture3DDepthAlt = 49, /** PCI domain ID of the device */ cudaDevAttrPciDomainId = 50, /** Pitch alignment requirement for textures */ cudaDevAttrTexturePitchAlignment = 51, /** Maximum cubemap texture width/height */ cudaDevAttrMaxTextureCubemapWidth = 52, /** Maximum cubemap layered texture width/height */ cudaDevAttrMaxTextureCubemapLayeredWidth = 53, /** Maximum layers in a cubemap layered texture */ cudaDevAttrMaxTextureCubemapLayeredLayers = 54, /** Maximum 1D surface width */ cudaDevAttrMaxSurface1DWidth = 55, /** Maximum 2D surface width */ cudaDevAttrMaxSurface2DWidth = 56, /** Maximum 2D surface height */ cudaDevAttrMaxSurface2DHeight = 57, /** Maximum 3D surface width */ cudaDevAttrMaxSurface3DWidth = 58, /** Maximum 3D surface height */ cudaDevAttrMaxSurface3DHeight = 59, /** Maximum 3D surface depth */ cudaDevAttrMaxSurface3DDepth = 60, /** Maximum 1D layered surface width */ cudaDevAttrMaxSurface1DLayeredWidth = 61, /** Maximum layers in a 1D layered surface */ cudaDevAttrMaxSurface1DLayeredLayers = 62, /** Maximum 2D layered surface width */ cudaDevAttrMaxSurface2DLayeredWidth = 63, /** Maximum 2D layered surface height */ cudaDevAttrMaxSurface2DLayeredHeight = 64, /** Maximum layers in a 2D layered surface */ cudaDevAttrMaxSurface2DLayeredLayers = 65, /** Maximum cubemap surface width */ cudaDevAttrMaxSurfaceCubemapWidth = 66, /** Maximum cubemap layered surface width */ cudaDevAttrMaxSurfaceCubemapLayeredWidth = 67, /** Maximum layers in a cubemap layered surface */ cudaDevAttrMaxSurfaceCubemapLayeredLayers = 68, /** Maximum 1D linear texture width */ cudaDevAttrMaxTexture1DLinearWidth = 69, /** Maximum 2D linear texture width */ cudaDevAttrMaxTexture2DLinearWidth = 70, /** Maximum 2D linear texture height */ cudaDevAttrMaxTexture2DLinearHeight = 71, /** Maximum 2D linear texture pitch in bytes */ cudaDevAttrMaxTexture2DLinearPitch = 72, /** Maximum mipmapped 2D texture width */ cudaDevAttrMaxTexture2DMipmappedWidth = 73, /** Maximum mipmapped 2D texture height */ cudaDevAttrMaxTexture2DMipmappedHeight = 74, /** Major compute capability version number */ cudaDevAttrComputeCapabilityMajor = 75, /** Minor compute capability version number */ cudaDevAttrComputeCapabilityMinor = 76, /** Maximum mipmapped 1D texture width */ cudaDevAttrMaxTexture1DMipmappedWidth = 77, /** Device supports stream priorities */ cudaDevAttrStreamPrioritiesSupported = 78, /** Device supports caching globals in L1 */ cudaDevAttrGlobalL1CacheSupported = 79, /** Device supports caching locals in L1 */ cudaDevAttrLocalL1CacheSupported = 80, /** Maximum shared memory available per multiprocessor in bytes */ cudaDevAttrMaxSharedMemoryPerMultiprocessor = 81, /** Maximum number of 32-bit registers available per multiprocessor */ cudaDevAttrMaxRegistersPerMultiprocessor = 82, /** Device can allocate managed memory on this system */ cudaDevAttrManagedMemory = 83, /** Device is on a multi-GPU board */ cudaDevAttrIsMultiGpuBoard = 84, /** Unique identifier for a group of devices on the same multi-GPU board */ cudaDevAttrMultiGpuBoardGroupID = 85, /** Link between the device and the host supports native atomic operations */ cudaDevAttrHostNativeAtomicSupported = 86, /** Ratio of single precision performance (in 
floating-point operations per second) to double precision performance */ cudaDevAttrSingleToDoublePrecisionPerfRatio = 87, /** Device supports coherently accessing pageable memory without calling cudaHostRegister on it */ cudaDevAttrPageableMemoryAccess = 88, /** Device can coherently access managed memory concurrently with the CPU */ cudaDevAttrConcurrentManagedAccess = 89, /** Device supports Compute Preemption */ cudaDevAttrComputePreemptionSupported = 90, /** Device can access host registered memory at the same virtual address as the CPU */ cudaDevAttrCanUseHostPointerForRegisteredMem = 91; /** * CUDA device P2P attributes */ /** enum cudaDeviceP2PAttr */ public static final int /** A relative value indicating the performance of the link between two devices */ cudaDevP2PAttrPerformanceRank = 1, /** Peer access is enabled */ cudaDevP2PAttrAccessSupported = 2, /** Native atomic operation over the link supported */ cudaDevP2PAttrNativeAtomicSupported = 3; /** * CUDA device properties */ public static class cudaDeviceProp extends Pointer { static { Loader.load(); } /** Default native constructor. */ public cudaDeviceProp() { super((Pointer)null); allocate(); } /** Native array allocator. Access with {@link Pointer#position(long)}. */ public cudaDeviceProp(long size) { super((Pointer)null); allocateArray(size); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public cudaDeviceProp(Pointer p) { super(p); } private native void allocate(); private native void allocateArray(long size); @Override public cudaDeviceProp position(long position) { return (cudaDeviceProp)super.position(position); } /** ASCII string identifying device */ public native @Cast("char") byte name(int i); public native cudaDeviceProp name(int i, byte name); @MemberGetter public native @Cast("char*") BytePointer name(); /** Global memory available on device in bytes */ public native @Cast("size_t") long totalGlobalMem(); public native cudaDeviceProp totalGlobalMem(long totalGlobalMem); /** Shared memory available per block in bytes */ public native @Cast("size_t") long sharedMemPerBlock(); public native cudaDeviceProp sharedMemPerBlock(long sharedMemPerBlock); /** 32-bit registers available per block */ public native int regsPerBlock(); public native cudaDeviceProp regsPerBlock(int regsPerBlock); /** Warp size in threads */ public native int warpSize(); public native cudaDeviceProp warpSize(int warpSize); /** Maximum pitch in bytes allowed by memory copies */ public native @Cast("size_t") long memPitch(); public native cudaDeviceProp memPitch(long memPitch); /** Maximum number of threads per block */ public native int maxThreadsPerBlock(); public native cudaDeviceProp maxThreadsPerBlock(int maxThreadsPerBlock); /** Maximum size of each dimension of a block */ public native int maxThreadsDim(int i); public native cudaDeviceProp maxThreadsDim(int i, int maxThreadsDim); @MemberGetter public native IntPointer maxThreadsDim(); /** Maximum size of each dimension of a grid */ public native int maxGridSize(int i); public native cudaDeviceProp maxGridSize(int i, int maxGridSize); @MemberGetter public native IntPointer maxGridSize(); /** Clock frequency in kilohertz */ public native int clockRate(); public native cudaDeviceProp clockRate(int clockRate); /** Constant memory available on device in bytes */ public native @Cast("size_t") long totalConstMem(); public native cudaDeviceProp totalConstMem(long totalConstMem); /** Major compute capability */ public native int major(); public native cudaDeviceProp 
major(int major); /** Minor compute capability */ public native int minor(); public native cudaDeviceProp minor(int minor); /** Alignment requirement for textures */ public native @Cast("size_t") long textureAlignment(); public native cudaDeviceProp textureAlignment(long textureAlignment); /** Pitch alignment requirement for texture references bound to pitched memory */ public native @Cast("size_t") long texturePitchAlignment(); public native cudaDeviceProp texturePitchAlignment(long texturePitchAlignment); /** Device can concurrently copy memory and execute a kernel. Deprecated. Use instead asyncEngineCount. */ public native int deviceOverlap(); public native cudaDeviceProp deviceOverlap(int deviceOverlap); /** Number of multiprocessors on device */ public native int multiProcessorCount(); public native cudaDeviceProp multiProcessorCount(int multiProcessorCount); /** Specified whether there is a run time limit on kernels */ public native int kernelExecTimeoutEnabled(); public native cudaDeviceProp kernelExecTimeoutEnabled(int kernelExecTimeoutEnabled); /** Device is integrated as opposed to discrete */ public native int integrated(); public native cudaDeviceProp integrated(int integrated); /** Device can map host memory with cudaHostAlloc/cudaHostGetDevicePointer */ public native int canMapHostMemory(); public native cudaDeviceProp canMapHostMemory(int canMapHostMemory); /** Compute mode (See ::cudaComputeMode) */ public native int computeMode(); public native cudaDeviceProp computeMode(int computeMode); /** Maximum 1D texture size */ public native int maxTexture1D(); public native cudaDeviceProp maxTexture1D(int maxTexture1D); /** Maximum 1D mipmapped texture size */ public native int maxTexture1DMipmap(); public native cudaDeviceProp maxTexture1DMipmap(int maxTexture1DMipmap); /** Maximum size for 1D textures bound to linear memory */ public native int maxTexture1DLinear(); public native cudaDeviceProp maxTexture1DLinear(int maxTexture1DLinear); /** Maximum 2D texture dimensions */ public native int maxTexture2D(int i); public native cudaDeviceProp maxTexture2D(int i, int maxTexture2D); @MemberGetter public native IntPointer maxTexture2D(); /** Maximum 2D mipmapped texture dimensions */ public native int maxTexture2DMipmap(int i); public native cudaDeviceProp maxTexture2DMipmap(int i, int maxTexture2DMipmap); @MemberGetter public native IntPointer maxTexture2DMipmap(); /** Maximum dimensions (width, height, pitch) for 2D textures bound to pitched memory */ public native int maxTexture2DLinear(int i); public native cudaDeviceProp maxTexture2DLinear(int i, int maxTexture2DLinear); @MemberGetter public native IntPointer maxTexture2DLinear(); /** Maximum 2D texture dimensions if texture gather operations have to be performed */ public native int maxTexture2DGather(int i); public native cudaDeviceProp maxTexture2DGather(int i, int maxTexture2DGather); @MemberGetter public native IntPointer maxTexture2DGather(); /** Maximum 3D texture dimensions */ public native int maxTexture3D(int i); public native cudaDeviceProp maxTexture3D(int i, int maxTexture3D); @MemberGetter public native IntPointer maxTexture3D(); /** Maximum alternate 3D texture dimensions */ public native int maxTexture3DAlt(int i); public native cudaDeviceProp maxTexture3DAlt(int i, int maxTexture3DAlt); @MemberGetter public native IntPointer maxTexture3DAlt(); /** Maximum Cubemap texture dimensions */ public native int maxTextureCubemap(); public native cudaDeviceProp maxTextureCubemap(int maxTextureCubemap); /** Maximum 1D 
layered texture dimensions */ public native int maxTexture1DLayered(int i); public native cudaDeviceProp maxTexture1DLayered(int i, int maxTexture1DLayered); @MemberGetter public native IntPointer maxTexture1DLayered(); /** Maximum 2D layered texture dimensions */ public native int maxTexture2DLayered(int i); public native cudaDeviceProp maxTexture2DLayered(int i, int maxTexture2DLayered); @MemberGetter public native IntPointer maxTexture2DLayered(); /** Maximum Cubemap layered texture dimensions */ public native int maxTextureCubemapLayered(int i); public native cudaDeviceProp maxTextureCubemapLayered(int i, int maxTextureCubemapLayered); @MemberGetter public native IntPointer maxTextureCubemapLayered(); /** Maximum 1D surface size */ public native int maxSurface1D(); public native cudaDeviceProp maxSurface1D(int maxSurface1D); /** Maximum 2D surface dimensions */ public native int maxSurface2D(int i); public native cudaDeviceProp maxSurface2D(int i, int maxSurface2D); @MemberGetter public native IntPointer maxSurface2D(); /** Maximum 3D surface dimensions */ public native int maxSurface3D(int i); public native cudaDeviceProp maxSurface3D(int i, int maxSurface3D); @MemberGetter public native IntPointer maxSurface3D(); /** Maximum 1D layered surface dimensions */ public native int maxSurface1DLayered(int i); public native cudaDeviceProp maxSurface1DLayered(int i, int maxSurface1DLayered); @MemberGetter public native IntPointer maxSurface1DLayered(); /** Maximum 2D layered surface dimensions */ public native int maxSurface2DLayered(int i); public native cudaDeviceProp maxSurface2DLayered(int i, int maxSurface2DLayered); @MemberGetter public native IntPointer maxSurface2DLayered(); /** Maximum Cubemap surface dimensions */ public native int maxSurfaceCubemap(); public native cudaDeviceProp maxSurfaceCubemap(int maxSurfaceCubemap); /** Maximum Cubemap layered surface dimensions */ public native int maxSurfaceCubemapLayered(int i); public native cudaDeviceProp maxSurfaceCubemapLayered(int i, int maxSurfaceCubemapLayered); @MemberGetter public native IntPointer maxSurfaceCubemapLayered(); /** Alignment requirements for surfaces */ public native @Cast("size_t") long surfaceAlignment(); public native cudaDeviceProp surfaceAlignment(long surfaceAlignment); /** Device can possibly execute multiple kernels concurrently */ public native int concurrentKernels(); public native cudaDeviceProp concurrentKernels(int concurrentKernels); /** Device has ECC support enabled */ public native int ECCEnabled(); public native cudaDeviceProp ECCEnabled(int ECCEnabled); /** PCI bus ID of the device */ public native int pciBusID(); public native cudaDeviceProp pciBusID(int pciBusID); /** PCI device ID of the device */ public native int pciDeviceID(); public native cudaDeviceProp pciDeviceID(int pciDeviceID); /** PCI domain ID of the device */ public native int pciDomainID(); public native cudaDeviceProp pciDomainID(int pciDomainID); /** 1 if device is a Tesla device using TCC driver, 0 otherwise */ public native int tccDriver(); public native cudaDeviceProp tccDriver(int tccDriver); /** Number of asynchronous engines */ public native int asyncEngineCount(); public native cudaDeviceProp asyncEngineCount(int asyncEngineCount); /** Device shares a unified address space with the host */ public native int unifiedAddressing(); public native cudaDeviceProp unifiedAddressing(int unifiedAddressing); /** Peak memory clock frequency in kilohertz */ public native int memoryClockRate(); public native cudaDeviceProp 
memoryClockRate(int memoryClockRate); /** Global memory bus width in bits */ public native int memoryBusWidth(); public native cudaDeviceProp memoryBusWidth(int memoryBusWidth); /** Size of L2 cache in bytes */ public native int l2CacheSize(); public native cudaDeviceProp l2CacheSize(int l2CacheSize); /** Maximum resident threads per multiprocessor */ public native int maxThreadsPerMultiProcessor(); public native cudaDeviceProp maxThreadsPerMultiProcessor(int maxThreadsPerMultiProcessor); /** Device supports stream priorities */ public native int streamPrioritiesSupported(); public native cudaDeviceProp streamPrioritiesSupported(int streamPrioritiesSupported); /** Device supports caching globals in L1 */ public native int globalL1CacheSupported(); public native cudaDeviceProp globalL1CacheSupported(int globalL1CacheSupported); /** Device supports caching locals in L1 */ public native int localL1CacheSupported(); public native cudaDeviceProp localL1CacheSupported(int localL1CacheSupported); /** Shared memory available per multiprocessor in bytes */ public native @Cast("size_t") long sharedMemPerMultiprocessor(); public native cudaDeviceProp sharedMemPerMultiprocessor(long sharedMemPerMultiprocessor); /** 32-bit registers available per multiprocessor */ public native int regsPerMultiprocessor(); public native cudaDeviceProp regsPerMultiprocessor(int regsPerMultiprocessor); /** Device supports allocating managed memory on this system */ public native int managedMemory(); public native cudaDeviceProp managedMemory(int managedMemory); /** Device is on a multi-GPU board */ public native int isMultiGpuBoard(); public native cudaDeviceProp isMultiGpuBoard(int isMultiGpuBoard); /** Unique identifier for a group of devices on the same multi-GPU board */ public native int multiGpuBoardGroupID(); public native cudaDeviceProp multiGpuBoardGroupID(int multiGpuBoardGroupID); /** Link between the device and the host supports native atomic operations */ public native int hostNativeAtomicSupported(); public native cudaDeviceProp hostNativeAtomicSupported(int hostNativeAtomicSupported); /** Ratio of single precision performance (in floating-point operations per second) to double precision performance */ public native int singleToDoublePrecisionPerfRatio(); public native cudaDeviceProp singleToDoublePrecisionPerfRatio(int singleToDoublePrecisionPerfRatio); /** Device supports coherently accessing pageable memory without calling cudaHostRegister on it */ public native int pageableMemoryAccess(); public native cudaDeviceProp pageableMemoryAccess(int pageableMemoryAccess); /** Device can coherently access managed memory concurrently with the CPU */ public native int concurrentManagedAccess(); public native cudaDeviceProp concurrentManagedAccess(int concurrentManagedAccess); } /** Empty device properties */ // #define cudaDevicePropDontCare // { // {'\0'}, /* char name[256]; */ // 0, /* size_t totalGlobalMem; */ // 0, /* size_t sharedMemPerBlock; */ // 0, /* int regsPerBlock; */ // 0, /* int warpSize; */ // 0, /* size_t memPitch; */ // 0, /* int maxThreadsPerBlock; */ // {0, 0, 0}, /* int maxThreadsDim[3]; */ // {0, 0, 0}, /* int maxGridSize[3]; */ // 0, /* int clockRate; */ // 0, /* size_t totalConstMem; */ // -1, /* int major; */ // -1, /* int minor; */ // 0, /* size_t textureAlignment; */ // 0, /* size_t texturePitchAlignment */ // -1, /* int deviceOverlap; */ // 0, /* int multiProcessorCount; */ // 0, /* int kernelExecTimeoutEnabled */ // 0, /* int integrated */ // 0, /* int canMapHostMemory */ // 0, /* int 
computeMode */ // 0, /* int maxTexture1D */ // 0, /* int maxTexture1DMipmap */ // 0, /* int maxTexture1DLinear */ // {0, 0}, /* int maxTexture2D[2] */ // {0, 0}, /* int maxTexture2DMipmap[2] */ // {0, 0, 0}, /* int maxTexture2DLinear[3] */ // {0, 0}, /* int maxTexture2DGather[2] */ // {0, 0, 0}, /* int maxTexture3D[3] */ // {0, 0, 0}, /* int maxTexture3DAlt[3] */ // 0, /* int maxTextureCubemap */ // {0, 0}, /* int maxTexture1DLayered[2] */ // {0, 0, 0}, /* int maxTexture2DLayered[3] */ // {0, 0}, /* int maxTextureCubemapLayered[2] */ // 0, /* int maxSurface1D */ // {0, 0}, /* int maxSurface2D[2] */ // {0, 0, 0}, /* int maxSurface3D[3] */ // {0, 0}, /* int maxSurface1DLayered[2] */ // {0, 0, 0}, /* int maxSurface2DLayered[3] */ // 0, /* int maxSurfaceCubemap */ // {0, 0}, /* int maxSurfaceCubemapLayered[2] */ // 0, /* size_t surfaceAlignment */ // 0, /* int concurrentKernels */ // 0, /* int ECCEnabled */ // 0, /* int pciBusID */ // 0, /* int pciDeviceID */ // 0, /* int pciDomainID */ // 0, /* int tccDriver */ // 0, /* int asyncEngineCount */ // 0, /* int unifiedAddressing */ // 0, /* int memoryClockRate */ // 0, /* int memoryBusWidth */ // 0, /* int l2CacheSize */ // 0, /* int maxThreadsPerMultiProcessor */ // 0, /* int streamPrioritiesSupported */ // 0, /* int globalL1CacheSupported */ // 0, /* int localL1CacheSupported */ // 0, /* size_t sharedMemPerMultiprocessor; */ // 0, /* int regsPerMultiprocessor; */ // 0, /* int managedMemory */ // 0, /* int isMultiGpuBoard */ // 0, /* int multiGpuBoardGroupID */ // 0, /* int hostNativeAtomicSupported */ // 0, /* int singleToDoublePrecisionPerfRatio */ // 0, /* int pageableMemoryAccess */ // 0, /* int concurrentManagedAccess */ // } /** * CUDA IPC Handle Size */ public static final int CUDA_IPC_HANDLE_SIZE = 64; /** * CUDA IPC event handle */ public static class cudaIpcEventHandle_t extends Pointer { static { Loader.load(); } /** Default native constructor. */ public cudaIpcEventHandle_t() { super((Pointer)null); allocate(); } /** Native array allocator. Access with {@link Pointer#position(long)}. */ public cudaIpcEventHandle_t(long size) { super((Pointer)null); allocateArray(size); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public cudaIpcEventHandle_t(Pointer p) { super(p); } private native void allocate(); private native void allocateArray(long size); @Override public cudaIpcEventHandle_t position(long position) { return (cudaIpcEventHandle_t)super.position(position); } public native @Cast("char") byte reserved(int i); public native cudaIpcEventHandle_t reserved(int i, byte reserved); @MemberGetter public native @Cast("char*") BytePointer reserved(); } /** * CUDA IPC memory handle */ public static class cudaIpcMemHandle_t extends Pointer { static { Loader.load(); } /** Default native constructor. */ public cudaIpcMemHandle_t() { super((Pointer)null); allocate(); } /** Native array allocator. Access with {@link Pointer#position(long)}. */ public cudaIpcMemHandle_t(long size) { super((Pointer)null); allocateArray(size); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
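 * <p>
 * An illustrative sketch (not from the original header): exporting a device allocation
 * from one process and opening it in another. The bindings of {@code cudaIpcGetMemHandle}
 * and {@code cudaIpcOpenMemHandle} (handle struct, {@code PointerPointer} output, flag
 * constant {@code cudaIpcMemLazyEnablePeerAccess}) are assumptions modeled on the C API;
 * {@code devPtr} is a hypothetical device pointer obtained from {@code cudaMalloc}.
 * <pre>{@code
 * // producer process: fill the 64-byte opaque handle and ship it over any IPC channel
 * cudaIpcMemHandle_t handle = new cudaIpcMemHandle_t();
 * cuda.cudaIpcGetMemHandle(handle, devPtr);
 *
 * // consumer process: map the same allocation into this process's address space
 * PointerPointer mapped = new PointerPointer(1);
 * cuda.cudaIpcOpenMemHandle(mapped, handle, cuda.cudaIpcMemLazyEnablePeerAccess);
 * }</pre>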
*/ public cudaIpcMemHandle_t(Pointer p) { super(p); } private native void allocate(); private native void allocateArray(long size); @Override public cudaIpcMemHandle_t position(long position) { return (cudaIpcMemHandle_t)super.position(position); } public native @Cast("char") byte reserved(int i); public native cudaIpcMemHandle_t reserved(int i, byte reserved); @MemberGetter public native @Cast("char*") BytePointer reserved(); } /******************************************************************************* * * * SHORTHAND TYPE DEFINITION USED BY RUNTIME API * * * *******************************************************************************/ /** * CUDA Error types */ /** * CUDA stream */ /** * CUDA event types */ /** * CUDA graphics resource types */ /** * CUDA UUID types */ @Opaque public static class cudaUUID_t extends Pointer { /** Empty constructor. Calls {@code super((Pointer)null)}. */ public cudaUUID_t() { super((Pointer)null); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public cudaUUID_t(Pointer p) { super(p); } } /** * CUDA output file modes */ /** \} */ /** \} */ /* END CUDART_TYPES */ // #endif /* !__DRIVER_TYPES_H__ */ // Parsed from /* * Copyright 1993-2012 NVIDIA Corporation. All rights reserved. * * NOTICE TO LICENSEE: * * This source code and/or documentation ("Licensed Deliverables") are * subject to NVIDIA intellectual property rights under U.S. and * international Copyright laws. * * These Licensed Deliverables contained herein is PROPRIETARY and * CONFIDENTIAL to NVIDIA and is being provided under the terms and * conditions of a form of NVIDIA software license agreement by and * between NVIDIA and Licensee ("License Agreement") or electronically * accepted by Licensee. Notwithstanding any terms or conditions to * the contrary in the License Agreement, reproduction or disclosure * of the Licensed Deliverables to any third party without the express * written consent of NVIDIA is prohibited. * * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE * OF THESE LICENSED DELIVERABLES. * * U.S. Government End Users. These Licensed Deliverables are a * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT * 1995), consisting of "commercial computer software" and "commercial * computer software documentation" as such terms are used in 48 * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government * only as a commercial end item. Consistent with 48 C.F.R.12.212 and * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all * U.S. Government End Users acquire the Licensed Deliverables with * only those rights set forth herein. 
* * Any use of the Licensed Deliverables in individual and commercial * software must include, in the user documentation and internal * comments to the code, the above Disclaimer and U.S. Government End * Users Notice. */ // #if !defined(__SURFACE_TYPES_H__) // #define __SURFACE_TYPES_H__ /******************************************************************************* * * * * * * *******************************************************************************/ // #include "driver_types.h" /** * \addtogroup CUDART_TYPES * * \{ */ /******************************************************************************* * * * * * * *******************************************************************************/ public static final int cudaSurfaceType1D = 0x01; public static final int cudaSurfaceType2D = 0x02; public static final int cudaSurfaceType3D = 0x03; public static final int cudaSurfaceTypeCubemap = 0x0C; public static final int cudaSurfaceType1DLayered = 0xF1; public static final int cudaSurfaceType2DLayered = 0xF2; public static final int cudaSurfaceTypeCubemapLayered = 0xFC; /** * CUDA Surface boundary modes */ /** enum cudaSurfaceBoundaryMode */ public static final int /** Zero boundary mode */ cudaBoundaryModeZero = 0, /** Clamp boundary mode */ cudaBoundaryModeClamp = 1, /** Trap boundary mode */ cudaBoundaryModeTrap = 2; /** * CUDA Surface format modes */ /** enum cudaSurfaceFormatMode */ public static final int /** Forced format mode */ cudaFormatModeForced = 0, /** Auto format mode */ cudaFormatModeAuto = 1; /** * CUDA Surface reference */ public static class surfaceReference extends Pointer { static { Loader.load(); } /** Default native constructor. */ public surfaceReference() { super((Pointer)null); allocate(); } /** Native array allocator. Access with {@link Pointer#position(long)}. */ public surfaceReference(long size) { super((Pointer)null); allocateArray(size); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public surfaceReference(Pointer p) { super(p); } private native void allocate(); private native void allocateArray(long size); @Override public surfaceReference position(long position) { return (surfaceReference)super.position(position); } /** * Channel descriptor for surface reference */ public native @ByRef cudaChannelFormatDesc channelDesc(); public native surfaceReference channelDesc(cudaChannelFormatDesc channelDesc); } /** * An opaque value that represents a CUDA Surface object */ /** \} */ /** \} */ /* END CUDART_TYPES */ // #endif /* !__SURFACE_TYPES_H__ */ // Parsed from /* * Copyright 1993-2012 NVIDIA Corporation. All rights reserved. * * NOTICE TO LICENSEE: * * This source code and/or documentation ("Licensed Deliverables") are * subject to NVIDIA intellectual property rights under U.S. and * international Copyright laws. * * These Licensed Deliverables contained herein is PROPRIETARY and * CONFIDENTIAL to NVIDIA and is being provided under the terms and * conditions of a form of NVIDIA software license agreement by and * between NVIDIA and Licensee ("License Agreement") or electronically * accepted by Licensee. Notwithstanding any terms or conditions to * the contrary in the License Agreement, reproduction or disclosure * of the Licensed Deliverables to any third party without the express * written consent of NVIDIA is prohibited. * * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. 
IT IS * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE * OF THESE LICENSED DELIVERABLES. * * U.S. Government End Users. These Licensed Deliverables are a * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT * 1995), consisting of "commercial computer software" and "commercial * computer software documentation" as such terms are used in 48 * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government * only as a commercial end item. Consistent with 48 C.F.R.12.212 and * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all * U.S. Government End Users acquire the Licensed Deliverables with * only those rights set forth herein. * * Any use of the Licensed Deliverables in individual and commercial * software must include, in the user documentation and internal * comments to the code, the above Disclaimer and U.S. Government End * Users Notice. */ // #if !defined(__TEXTURE_TYPES_H__) // #define __TEXTURE_TYPES_H__ /******************************************************************************* * * * * * * *******************************************************************************/ // #include "driver_types.h" /** * \addtogroup CUDART_TYPES * * \{ */ /******************************************************************************* * * * * * * *******************************************************************************/ public static final int cudaTextureType1D = 0x01; public static final int cudaTextureType2D = 0x02; public static final int cudaTextureType3D = 0x03; public static final int cudaTextureTypeCubemap = 0x0C; public static final int cudaTextureType1DLayered = 0xF1; public static final int cudaTextureType2DLayered = 0xF2; public static final int cudaTextureTypeCubemapLayered = 0xFC; /** * CUDA texture address modes */ /** enum cudaTextureAddressMode */ public static final int /** Wrapping address mode */ cudaAddressModeWrap = 0, /** Clamp to edge address mode */ cudaAddressModeClamp = 1, /** Mirror address mode */ cudaAddressModeMirror = 2, /** Border address mode */ cudaAddressModeBorder = 3; /** * CUDA texture filter modes */ /** enum cudaTextureFilterMode */ public static final int /** Point filter mode */ cudaFilterModePoint = 0, /** Linear filter mode */ cudaFilterModeLinear = 1; /** * CUDA texture read modes */ /** enum cudaTextureReadMode */ public static final int /** Read texture as specified element type */ cudaReadModeElementType = 0, /** Read texture as normalized float */ cudaReadModeNormalizedFloat = 1; /** * CUDA texture reference */ public static class textureReference extends Pointer { static { Loader.load(); } /** Default native constructor. */ public textureReference() { super((Pointer)null); allocate(); } /** Native array allocator. Access with {@link Pointer#position(long)}. */ public textureReference(long size) { super((Pointer)null); allocateArray(size); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
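 * <p>Illustrative sketch (editor-added, not generated): the accessors below expose the
 * fields of a legacy texture reference, and the mode values come from the
 * {@code cudaTextureAddressMode}/{@code cudaTextureFilterMode} constants defined above.
 * How the reference itself is obtained (typically from device code) is outside the
 * scope of this snippet.</p>
 * <pre>{@code
 * // Configure normalized, clamped, linearly filtered reads on an existing reference.
 * static void configure(textureReference texRef) {
 *     texRef.normalized(1);                                   // coordinates in [0,1)
 *     texRef.filterMode(cuda.cudaFilterModeLinear);           // bilinear interpolation
 *     for (int dim = 0; dim < 3; dim++) {
 *         texRef.addressMode(dim, cuda.cudaAddressModeClamp); // clamp out-of-range coords
 *     }
 *     texRef.maxAnisotropy(1);                                // no anisotropic filtering
 * }
 * }</pre>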
*/ public textureReference(Pointer p) { super(p); } private native void allocate(); private native void allocateArray(long size); @Override public textureReference position(long position) { return (textureReference)super.position(position); } /** * Indicates whether texture reads are normalized or not */ public native int normalized(); public native textureReference normalized(int normalized); /** * Texture filter mode */ public native @Cast("cudaTextureFilterMode") int filterMode(); public native textureReference filterMode(int filterMode); /** * Texture address mode for up to 3 dimensions */ public native @Cast("cudaTextureAddressMode") int addressMode(int i); public native textureReference addressMode(int i, int addressMode); @MemberGetter public native @Cast("cudaTextureAddressMode*") IntPointer addressMode(); /** * Channel descriptor for the texture reference */ public native @ByRef cudaChannelFormatDesc channelDesc(); public native textureReference channelDesc(cudaChannelFormatDesc channelDesc); /** * Perform sRGB->linear conversion during texture read */ public native int sRGB(); public native textureReference sRGB(int sRGB); /** * Limit to the anisotropy ratio */ public native @Cast("unsigned int") int maxAnisotropy(); public native textureReference maxAnisotropy(int maxAnisotropy); /** * Mipmap filter mode */ public native @Cast("cudaTextureFilterMode") int mipmapFilterMode(); public native textureReference mipmapFilterMode(int mipmapFilterMode); /** * Offset applied to the supplied mipmap level */ public native float mipmapLevelBias(); public native textureReference mipmapLevelBias(float mipmapLevelBias); /** * Lower end of the mipmap level range to clamp access to */ public native float minMipmapLevelClamp(); public native textureReference minMipmapLevelClamp(float minMipmapLevelClamp); /** * Upper end of the mipmap level range to clamp access to */ public native float maxMipmapLevelClamp(); public native textureReference maxMipmapLevelClamp(float maxMipmapLevelClamp); public native int __cudaReserved(int i); public native textureReference __cudaReserved(int i, int __cudaReserved); @MemberGetter public native IntPointer __cudaReserved(); } /** * CUDA texture descriptor */ public static class cudaTextureDesc extends Pointer { static { Loader.load(); } /** Default native constructor. */ public cudaTextureDesc() { super((Pointer)null); allocate(); } /** Native array allocator. Access with {@link Pointer#position(long)}. */ public cudaTextureDesc(long size) { super((Pointer)null); allocateArray(size); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
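 * <p>Illustrative sketch (editor-added, not generated): a texture descriptor is usually
 * populated with the address, filter and read modes defined above before being passed
 * to the texture-object creation call declared later in this class. Note that the
 * default constructor only allocates the native struct, so set every field you rely on.</p>
 * <pre>{@code
 * // Build a descriptor for normalized-float reads with wrap addressing and linear filtering.
 * static cudaTextureDesc makeTexDesc() {
 *     cudaTextureDesc desc = new cudaTextureDesc();
 *     for (int dim = 0; dim < 3; dim++) {
 *         desc.addressMode(dim, cuda.cudaAddressModeWrap);    // wrap coordinates per dimension
 *     }
 *     desc.filterMode(cuda.cudaFilterModeLinear);
 *     desc.readMode(cuda.cudaReadModeNormalizedFloat);
 *     desc.normalizedCoords(1);                               // coordinates in [0,1)
 *     return desc;
 * }
 * }</pre>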
*/ public cudaTextureDesc(Pointer p) { super(p); } private native void allocate(); private native void allocateArray(long size); @Override public cudaTextureDesc position(long position) { return (cudaTextureDesc)super.position(position); } /** * Texture address mode for up to 3 dimensions */ public native @Cast("cudaTextureAddressMode") int addressMode(int i); public native cudaTextureDesc addressMode(int i, int addressMode); @MemberGetter public native @Cast("cudaTextureAddressMode*") IntPointer addressMode(); /** * Texture filter mode */ public native @Cast("cudaTextureFilterMode") int filterMode(); public native cudaTextureDesc filterMode(int filterMode); /** * Texture read mode */ public native @Cast("cudaTextureReadMode") int readMode(); public native cudaTextureDesc readMode(int readMode); /** * Perform sRGB->linear conversion during texture read */ public native int sRGB(); public native cudaTextureDesc sRGB(int sRGB); /** * Texture Border Color */ public native float borderColor(int i); public native cudaTextureDesc borderColor(int i, float borderColor); @MemberGetter public native FloatPointer borderColor(); /** * Indicates whether texture reads are normalized or not */ public native int normalizedCoords(); public native cudaTextureDesc normalizedCoords(int normalizedCoords); /** * Limit to the anisotropy ratio */ public native @Cast("unsigned int") int maxAnisotropy(); public native cudaTextureDesc maxAnisotropy(int maxAnisotropy); /** * Mipmap filter mode */ public native @Cast("cudaTextureFilterMode") int mipmapFilterMode(); public native cudaTextureDesc mipmapFilterMode(int mipmapFilterMode); /** * Offset applied to the supplied mipmap level */ public native float mipmapLevelBias(); public native cudaTextureDesc mipmapLevelBias(float mipmapLevelBias); /** * Lower end of the mipmap level range to clamp access to */ public native float minMipmapLevelClamp(); public native cudaTextureDesc minMipmapLevelClamp(float minMipmapLevelClamp); /** * Upper end of the mipmap level range to clamp access to */ public native float maxMipmapLevelClamp(); public native cudaTextureDesc maxMipmapLevelClamp(float maxMipmapLevelClamp); } /** * An opaque value that represents a CUDA texture object */ /** \} */ /** \} */ /* END CUDART_TYPES */ // #endif /* !__TEXTURE_TYPES_H__ */ // Parsed from /* * Copyright 1993-2014 NVIDIA Corporation. All rights reserved. * * NOTICE TO LICENSEE: * * This source code and/or documentation ("Licensed Deliverables") are * subject to NVIDIA intellectual property rights under U.S. and * international Copyright laws. * * These Licensed Deliverables contained herein is PROPRIETARY and * CONFIDENTIAL to NVIDIA and is being provided under the terms and * conditions of a form of NVIDIA software license agreement by and * between NVIDIA and Licensee ("License Agreement") or electronically * accepted by Licensee. Notwithstanding any terms or conditions to * the contrary in the License Agreement, reproduction or disclosure * of the Licensed Deliverables to any third party without the express * written consent of NVIDIA is prohibited. * * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. 
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE * OF THESE LICENSED DELIVERABLES. * * U.S. Government End Users. These Licensed Deliverables are a * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT * 1995), consisting of "commercial computer software" and "commercial * computer software documentation" as such terms are used in 48 * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government * only as a commercial end item. Consistent with 48 C.F.R.12.212 and * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all * U.S. Government End Users acquire the Licensed Deliverables with * only those rights set forth herein. * * Any use of the Licensed Deliverables in individual and commercial * software must include, in the user documentation and internal * comments to the code, the above Disclaimer and U.S. Government End * Users Notice. */ // #if !defined(__VECTOR_TYPES_H__) // #define __VECTOR_TYPES_H__ /******************************************************************************* * * * * * * *******************************************************************************/ // #if !defined(__CUDA_LIBDEVICE__) && !defined(__CUDACC_RTC__) // #define EXCLUDE_FROM_RTC // #include "builtin_types.h" // #undef EXCLUDE_FROM_RTC // #endif /* !__CUDA_LIBDEVICE__ && !__CUDACC_RTC__ */ // #include "host_defines.h" /******************************************************************************* * * * * * * *******************************************************************************/ // #if !defined(__CUDACC__) && !defined(__CUDACC_RTC__) && !defined(__CUDABE__) && // defined(_WIN32) && !defined(_WIN64) // #else /* !__CUDACC__ && !__CUDACC_RTC__ && !__CUDABE__ && _WIN32 && !_WIN64 */ // #define __cuda_builtin_vector_align8(tag, members) // struct __device_builtin__ __align__(8) tag // { // members // } // #endif /* !__CUDACC__ && !__CUDACC_RTC__ && !__CUDABE__ && _WIN32 && !_WIN64 */ public static class char1 extends Pointer { static { Loader.load(); } /** Default native constructor. */ public char1() { super((Pointer)null); allocate(); } /** Native array allocator. Access with {@link Pointer#position(long)}. */ public char1(long size) { super((Pointer)null); allocateArray(size); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public char1(Pointer p) { super(p); } private native void allocate(); private native void allocateArray(long size); @Override public char1 position(long position) { return (char1)super.position(position); } public native byte x(); public native char1 x(byte x); } public static class uchar1 extends Pointer { static { Loader.load(); } /** Default native constructor. */ public uchar1() { super((Pointer)null); allocate(); } /** Native array allocator. Access with {@link Pointer#position(long)}. */ public uchar1(long size) { super((Pointer)null); allocateArray(size); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
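 * <p>Illustrative sketch (editor-added, not generated): the native field is an
 * {@code unsigned char}, but Java has no unsigned byte, so the getter returns a signed
 * {@code byte}; mask with {@code 0xFF} to recover the 0..255 value.</p>
 * <pre>{@code
 * uchar1 u = new uchar1();
 * u.x((byte) 200);               // stored as the signed Java byte -56
 * int unsigned = u.x() & 0xFF;   // reads back as 200
 * }</pre>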
*/ public uchar1(Pointer p) { super(p); } private native void allocate(); private native void allocateArray(long size); @Override public uchar1 position(long position) { return (uchar1)super.position(position); } public native @Cast("unsigned char") byte x(); public native uchar1 x(byte x); } public static class char2 extends Pointer { static { Loader.load(); } /** Default native constructor. */ public char2() { super((Pointer)null); allocate(); } /** Native array allocator. Access with {@link Pointer#position(long)}. */ public char2(long size) { super((Pointer)null); allocateArray(size); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public char2(Pointer p) { super(p); } private native void allocate(); private native void allocateArray(long size); @Override public char2 position(long position) { return (char2)super.position(position); } public native byte x(); public native char2 x(byte x); public native byte y(); public native char2 y(byte y); } public static class uchar2 extends Pointer { static { Loader.load(); } /** Default native constructor. */ public uchar2() { super((Pointer)null); allocate(); } /** Native array allocator. Access with {@link Pointer#position(long)}. */ public uchar2(long size) { super((Pointer)null); allocateArray(size); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public uchar2(Pointer p) { super(p); } private native void allocate(); private native void allocateArray(long size); @Override public uchar2 position(long position) { return (uchar2)super.position(position); } public native @Cast("unsigned char") byte x(); public native uchar2 x(byte x); public native @Cast("unsigned char") byte y(); public native uchar2 y(byte y); } public static class char3 extends Pointer { static { Loader.load(); } /** Default native constructor. */ public char3() { super((Pointer)null); allocate(); } /** Native array allocator. Access with {@link Pointer#position(long)}. */ public char3(long size) { super((Pointer)null); allocateArray(size); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public char3(Pointer p) { super(p); } private native void allocate(); private native void allocateArray(long size); @Override public char3 position(long position) { return (char3)super.position(position); } public native byte x(); public native char3 x(byte x); public native byte y(); public native char3 y(byte y); public native byte z(); public native char3 z(byte z); } public static class uchar3 extends Pointer { static { Loader.load(); } /** Default native constructor. */ public uchar3() { super((Pointer)null); allocate(); } /** Native array allocator. Access with {@link Pointer#position(long)}. */ public uchar3(long size) { super((Pointer)null); allocateArray(size); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public uchar3(Pointer p) { super(p); } private native void allocate(); private native void allocateArray(long size); @Override public uchar3 position(long position) { return (uchar3)super.position(position); } public native @Cast("unsigned char") byte x(); public native uchar3 x(byte x); public native @Cast("unsigned char") byte y(); public native uchar3 y(byte y); public native @Cast("unsigned char") byte z(); public native uchar3 z(byte z); } public static class char4 extends Pointer { static { Loader.load(); } /** Default native constructor. */ public char4() { super((Pointer)null); allocate(); } /** Native array allocator. 
Access with {@link Pointer#position(long)}. */ public char4(long size) { super((Pointer)null); allocateArray(size); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public char4(Pointer p) { super(p); } private native void allocate(); private native void allocateArray(long size); @Override public char4 position(long position) { return (char4)super.position(position); } public native byte x(); public native char4 x(byte x); public native byte y(); public native char4 y(byte y); public native byte z(); public native char4 z(byte z); public native byte w(); public native char4 w(byte w); } public static class uchar4 extends Pointer { static { Loader.load(); } /** Default native constructor. */ public uchar4() { super((Pointer)null); allocate(); } /** Native array allocator. Access with {@link Pointer#position(long)}. */ public uchar4(long size) { super((Pointer)null); allocateArray(size); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public uchar4(Pointer p) { super(p); } private native void allocate(); private native void allocateArray(long size); @Override public uchar4 position(long position) { return (uchar4)super.position(position); } public native @Cast("unsigned char") byte x(); public native uchar4 x(byte x); public native @Cast("unsigned char") byte y(); public native uchar4 y(byte y); public native @Cast("unsigned char") byte z(); public native uchar4 z(byte z); public native @Cast("unsigned char") byte w(); public native uchar4 w(byte w); } public static class short1 extends Pointer { static { Loader.load(); } /** Default native constructor. */ public short1() { super((Pointer)null); allocate(); } /** Native array allocator. Access with {@link Pointer#position(long)}. */ public short1(long size) { super((Pointer)null); allocateArray(size); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public short1(Pointer p) { super(p); } private native void allocate(); private native void allocateArray(long size); @Override public short1 position(long position) { return (short1)super.position(position); } public native short x(); public native short1 x(short x); } public static class ushort1 extends Pointer { static { Loader.load(); } /** Default native constructor. */ public ushort1() { super((Pointer)null); allocate(); } /** Native array allocator. Access with {@link Pointer#position(long)}. */ public ushort1(long size) { super((Pointer)null); allocateArray(size); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public ushort1(Pointer p) { super(p); } private native void allocate(); private native void allocateArray(long size); @Override public ushort1 position(long position) { return (ushort1)super.position(position); } public native @Cast("unsigned short") short x(); public native ushort1 x(short x); } public static class short2 extends Pointer { static { Loader.load(); } /** Default native constructor. */ public short2() { super((Pointer)null); allocate(); } /** Native array allocator. Access with {@link Pointer#position(long)}. */ public short2(long size) { super((Pointer)null); allocateArray(size); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public short2(Pointer p) { super(p); } private native void allocate(); private native void allocateArray(long size); @Override public short2 position(long position) { return (short2)super.position(position); } public native short x(); public native short2 x(short x); public native short y(); public native short2 y(short y); } public static class ushort2 extends Pointer { static { Loader.load(); } /** Default native constructor. */ public ushort2() { super((Pointer)null); allocate(); } /** Native array allocator. Access with {@link Pointer#position(long)}. */ public ushort2(long size) { super((Pointer)null); allocateArray(size); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public ushort2(Pointer p) { super(p); } private native void allocate(); private native void allocateArray(long size); @Override public ushort2 position(long position) { return (ushort2)super.position(position); } public native @Cast("unsigned short") short x(); public native ushort2 x(short x); public native @Cast("unsigned short") short y(); public native ushort2 y(short y); } public static class short3 extends Pointer { static { Loader.load(); } /** Default native constructor. */ public short3() { super((Pointer)null); allocate(); } /** Native array allocator. Access with {@link Pointer#position(long)}. */ public short3(long size) { super((Pointer)null); allocateArray(size); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public short3(Pointer p) { super(p); } private native void allocate(); private native void allocateArray(long size); @Override public short3 position(long position) { return (short3)super.position(position); } public native short x(); public native short3 x(short x); public native short y(); public native short3 y(short y); public native short z(); public native short3 z(short z); } public static class ushort3 extends Pointer { static { Loader.load(); } /** Default native constructor. */ public ushort3() { super((Pointer)null); allocate(); } /** Native array allocator. Access with {@link Pointer#position(long)}. */ public ushort3(long size) { super((Pointer)null); allocateArray(size); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public ushort3(Pointer p) { super(p); } private native void allocate(); private native void allocateArray(long size); @Override public ushort3 position(long position) { return (ushort3)super.position(position); } public native @Cast("unsigned short") short x(); public native ushort3 x(short x); public native @Cast("unsigned short") short y(); public native ushort3 y(short y); public native @Cast("unsigned short") short z(); public native ushort3 z(short z); } public static class short4 extends Pointer { static { Loader.load(); } /** Default native constructor. */ public short4() { super((Pointer)null); allocate(); } /** Native array allocator. Access with {@link Pointer#position(long)}. */ public short4(long size) { super((Pointer)null); allocateArray(size); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public short4(Pointer p) { super(p); } private native void allocate(); private native void allocateArray(long size); @Override public short4 position(long position) { return (short4)super.position(position); } public native short x(); public native short4 x(short x); public native short y(); public native short4 y(short y); public native short z(); public native short4 z(short z); public native short w(); public native short4 w(short w); } public static class ushort4 extends Pointer { static { Loader.load(); } /** Default native constructor. */ public ushort4() { super((Pointer)null); allocate(); } /** Native array allocator. Access with {@link Pointer#position(long)}. */ public ushort4(long size) { super((Pointer)null); allocateArray(size); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public ushort4(Pointer p) { super(p); } private native void allocate(); private native void allocateArray(long size); @Override public ushort4 position(long position) { return (ushort4)super.position(position); } public native @Cast("unsigned short") short x(); public native ushort4 x(short x); public native @Cast("unsigned short") short y(); public native ushort4 y(short y); public native @Cast("unsigned short") short z(); public native ushort4 z(short z); public native @Cast("unsigned short") short w(); public native ushort4 w(short w); } public static class int1 extends Pointer { static { Loader.load(); } /** Default native constructor. */ public int1() { super((Pointer)null); allocate(); } /** Native array allocator. Access with {@link Pointer#position(long)}. */ public int1(long size) { super((Pointer)null); allocateArray(size); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public int1(Pointer p) { super(p); } private native void allocate(); private native void allocateArray(long size); @Override public int1 position(long position) { return (int1)super.position(position); } public native int x(); public native int1 x(int x); } public static class uint1 extends Pointer { static { Loader.load(); } /** Default native constructor. */ public uint1() { super((Pointer)null); allocate(); } /** Native array allocator. Access with {@link Pointer#position(long)}. */ public uint1(long size) { super((Pointer)null); allocateArray(size); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public uint1(Pointer p) { super(p); } private native void allocate(); private native void allocateArray(long size); @Override public uint1 position(long position) { return (uint1)super.position(position); } public native @Cast("unsigned int") int x(); public native uint1 x(int x); } public static class int2 extends Pointer { static { Loader.load(); } /** Default native constructor. */ public int2() { super((Pointer)null); allocate(); } /** Native array allocator. Access with {@link Pointer#position(long)}. */ public int2(long size) { super((Pointer)null); allocateArray(size); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public int2(Pointer p) { super(p); } private native void allocate(); private native void allocateArray(long size); @Override public int2 position(long position) { return (int2)super.position(position); } public native int x(); public native int2 x(int x); public native int y(); public native int2 y(int y); } public static class uint2 extends Pointer { static { Loader.load(); } /** Default native constructor. */ public uint2() { super((Pointer)null); allocate(); } /** Native array allocator. 
Access with {@link Pointer#position(long)}. */ public uint2(long size) { super((Pointer)null); allocateArray(size); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public uint2(Pointer p) { super(p); } private native void allocate(); private native void allocateArray(long size); @Override public uint2 position(long position) { return (uint2)super.position(position); } public native @Cast("unsigned int") int x(); public native uint2 x(int x); public native @Cast("unsigned int") int y(); public native uint2 y(int y); } public static class int3 extends Pointer { static { Loader.load(); } /** Default native constructor. */ public int3() { super((Pointer)null); allocate(); } /** Native array allocator. Access with {@link Pointer#position(long)}. */ public int3(long size) { super((Pointer)null); allocateArray(size); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public int3(Pointer p) { super(p); } private native void allocate(); private native void allocateArray(long size); @Override public int3 position(long position) { return (int3)super.position(position); } public native int x(); public native int3 x(int x); public native int y(); public native int3 y(int y); public native int z(); public native int3 z(int z); } public static class uint3 extends Pointer { static { Loader.load(); } /** Default native constructor. */ public uint3() { super((Pointer)null); allocate(); } /** Native array allocator. Access with {@link Pointer#position(long)}. */ public uint3(long size) { super((Pointer)null); allocateArray(size); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public uint3(Pointer p) { super(p); } private native void allocate(); private native void allocateArray(long size); @Override public uint3 position(long position) { return (uint3)super.position(position); } public native @Cast("unsigned int") int x(); public native uint3 x(int x); public native @Cast("unsigned int") int y(); public native uint3 y(int y); public native @Cast("unsigned int") int z(); public native uint3 z(int z); } public static class int4 extends Pointer { static { Loader.load(); } /** Default native constructor. */ public int4() { super((Pointer)null); allocate(); } /** Native array allocator. Access with {@link Pointer#position(long)}. */ public int4(long size) { super((Pointer)null); allocateArray(size); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public int4(Pointer p) { super(p); } private native void allocate(); private native void allocateArray(long size); @Override public int4 position(long position) { return (int4)super.position(position); } public native int x(); public native int4 x(int x); public native int y(); public native int4 y(int y); public native int z(); public native int4 z(int z); public native int w(); public native int4 w(int w); } public static class uint4 extends Pointer { static { Loader.load(); } /** Default native constructor. */ public uint4() { super((Pointer)null); allocate(); } /** Native array allocator. Access with {@link Pointer#position(long)}. */ public uint4(long size) { super((Pointer)null); allocateArray(size); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
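 * <p>Illustrative sketch (editor-added, not generated): like the other vector types in
 * this file, {@code uint4} can be allocated as a contiguous native array and indexed
 * with {@link Pointer#position(long)}; the setters return the struct, so calls chain.</p>
 * <pre>{@code
 * int n = 8;
 * uint4 vecs = new uint4(n);                // native array of n uint4 structs
 * for (int i = 0; i < n; i++) {
 *     vecs.position(i)                      // point at element i
 *         .x(i).y(i * 2).z(i * 3).w(0);
 * }
 * int y3 = vecs.position(3).y();            // reads back 6
 * }</pre>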
*/ public uint4(Pointer p) { super(p); } private native void allocate(); private native void allocateArray(long size); @Override public uint4 position(long position) { return (uint4)super.position(position); } public native @Cast("unsigned int") int x(); public native uint4 x(int x); public native @Cast("unsigned int") int y(); public native uint4 y(int y); public native @Cast("unsigned int") int z(); public native uint4 z(int z); public native @Cast("unsigned int") int w(); public native uint4 w(int w); } public static class long1 extends Pointer { static { Loader.load(); } /** Default native constructor. */ public long1() { super((Pointer)null); allocate(); } /** Native array allocator. Access with {@link Pointer#position(long)}. */ public long1(long size) { super((Pointer)null); allocateArray(size); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public long1(Pointer p) { super(p); } private native void allocate(); private native void allocateArray(long size); @Override public long1 position(long position) { return (long1)super.position(position); } public native long x(); public native long1 x(long x); } public static class ulong1 extends Pointer { static { Loader.load(); } /** Default native constructor. */ public ulong1() { super((Pointer)null); allocate(); } /** Native array allocator. Access with {@link Pointer#position(long)}. */ public ulong1(long size) { super((Pointer)null); allocateArray(size); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public ulong1(Pointer p) { super(p); } private native void allocate(); private native void allocateArray(long size); @Override public ulong1 position(long position) { return (ulong1)super.position(position); } public native @Cast("unsigned long") long x(); public native ulong1 x(long x); } // #if defined(_WIN32) public static class long2 extends Pointer { static { Loader.load(); } /** Default native constructor. */ public long2() { super((Pointer)null); allocate(); } /** Native array allocator. Access with {@link Pointer#position(long)}. */ public long2(long size) { super((Pointer)null); allocateArray(size); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public long2(Pointer p) { super(p); } private native void allocate(); private native void allocateArray(long size); @Override public long2 position(long position) { return (long2)super.position(position); } public native long x(); public native long2 x(long x); public native long y(); public native long2 y(long y); } public static class ulong2 extends Pointer { static { Loader.load(); } /** Default native constructor. */ public ulong2() { super((Pointer)null); allocate(); } /** Native array allocator. Access with {@link Pointer#position(long)}. */ public ulong2(long size) { super((Pointer)null); allocateArray(size); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public ulong2(Pointer p) { super(p); } private native void allocate(); private native void allocateArray(long size); @Override public ulong2 position(long position) { return (ulong2)super.position(position); } public native @Cast("unsigned long int") long x(); public native ulong2 x(long x); public native @Cast("unsigned long int") long y(); public native ulong2 y(long y); } // #else /* !_WIN32 */ // #endif /* _WIN32 */ public static class long3 extends Pointer { static { Loader.load(); } /** Default native constructor. */ public long3() { super((Pointer)null); allocate(); } /** Native array allocator. 
Access with {@link Pointer#position(long)}. */ public long3(long size) { super((Pointer)null); allocateArray(size); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public long3(Pointer p) { super(p); } private native void allocate(); private native void allocateArray(long size); @Override public long3 position(long position) { return (long3)super.position(position); } public native long x(); public native long3 x(long x); public native long y(); public native long3 y(long y); public native long z(); public native long3 z(long z); } public static class ulong3 extends Pointer { static { Loader.load(); } /** Default native constructor. */ public ulong3() { super((Pointer)null); allocate(); } /** Native array allocator. Access with {@link Pointer#position(long)}. */ public ulong3(long size) { super((Pointer)null); allocateArray(size); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public ulong3(Pointer p) { super(p); } private native void allocate(); private native void allocateArray(long size); @Override public ulong3 position(long position) { return (ulong3)super.position(position); } public native @Cast("unsigned long int") long x(); public native ulong3 x(long x); public native @Cast("unsigned long int") long y(); public native ulong3 y(long y); public native @Cast("unsigned long int") long z(); public native ulong3 z(long z); } public static class long4 extends Pointer { static { Loader.load(); } /** Default native constructor. */ public long4() { super((Pointer)null); allocate(); } /** Native array allocator. Access with {@link Pointer#position(long)}. */ public long4(long size) { super((Pointer)null); allocateArray(size); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public long4(Pointer p) { super(p); } private native void allocate(); private native void allocateArray(long size); @Override public long4 position(long position) { return (long4)super.position(position); } public native long x(); public native long4 x(long x); public native long y(); public native long4 y(long y); public native long z(); public native long4 z(long z); public native long w(); public native long4 w(long w); } public static class ulong4 extends Pointer { static { Loader.load(); } /** Default native constructor. */ public ulong4() { super((Pointer)null); allocate(); } /** Native array allocator. Access with {@link Pointer#position(long)}. */ public ulong4(long size) { super((Pointer)null); allocateArray(size); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public ulong4(Pointer p) { super(p); } private native void allocate(); private native void allocateArray(long size); @Override public ulong4 position(long position) { return (ulong4)super.position(position); } public native @Cast("unsigned long int") long x(); public native ulong4 x(long x); public native @Cast("unsigned long int") long y(); public native ulong4 y(long y); public native @Cast("unsigned long int") long z(); public native ulong4 z(long z); public native @Cast("unsigned long int") long w(); public native ulong4 w(long w); } public static class float1 extends Pointer { static { Loader.load(); } /** Default native constructor. */ public float1() { super((Pointer)null); allocate(); } /** Native array allocator. Access with {@link Pointer#position(long)}. */ public float1(long size) { super((Pointer)null); allocateArray(size); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public float1(Pointer p) { super(p); } private native void allocate(); private native void allocateArray(long size); @Override public float1 position(long position) { return (float1)super.position(position); } public native float x(); public native float1 x(float x); } // #if !defined(__CUDACC__) && !defined(__CUDABE__) && defined(__arm__) && // defined(__ARM_PCS_VFP) && __GNUC__ == 4 && __GNUC_MINOR__ == 6 // #else /* !__CUDACC__ && !__CUDABE__ && __arm__ && __ARM_PCS_VFP && // __GNUC__ == 4&& __GNUC_MINOR__ == 6 */ public static class float2 extends Pointer { static { Loader.load(); } /** Default native constructor. */ public float2() { super((Pointer)null); allocate(); } /** Native array allocator. Access with {@link Pointer#position(long)}. */ public float2(long size) { super((Pointer)null); allocateArray(size); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public float2(Pointer p) { super(p); } private native void allocate(); private native void allocateArray(long size); @Override public float2 position(long position) { return (float2)super.position(position); } public native float x(); public native float2 x(float x); public native float y(); public native float2 y(float y); } // #endif /* !__CUDACC__ && !__CUDABE__ && __arm__ && __ARM_PCS_VFP && // __GNUC__ == 4&& __GNUC_MINOR__ == 6 */ public static class float3 extends Pointer { static { Loader.load(); } /** Default native constructor. */ public float3() { super((Pointer)null); allocate(); } /** Native array allocator. Access with {@link Pointer#position(long)}. */ public float3(long size) { super((Pointer)null); allocateArray(size); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public float3(Pointer p) { super(p); } private native void allocate(); private native void allocateArray(long size); @Override public float3 position(long position) { return (float3)super.position(position); } public native float x(); public native float3 x(float x); public native float y(); public native float3 y(float y); public native float z(); public native float3 z(float z); } public static class float4 extends Pointer { static { Loader.load(); } /** Default native constructor. */ public float4() { super((Pointer)null); allocate(); } /** Native array allocator. Access with {@link Pointer#position(long)}. */ public float4(long size) { super((Pointer)null); allocateArray(size); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public float4(Pointer p) { super(p); } private native void allocate(); private native void allocateArray(long size); @Override public float4 position(long position) { return (float4)super.position(position); } public native float x(); public native float4 x(float x); public native float y(); public native float4 y(float y); public native float z(); public native float4 z(float z); public native float w(); public native float4 w(float w); } public static class longlong1 extends Pointer { static { Loader.load(); } /** Default native constructor. */ public longlong1() { super((Pointer)null); allocate(); } /** Native array allocator. Access with {@link Pointer#position(long)}. */ public longlong1(long size) { super((Pointer)null); allocateArray(size); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public longlong1(Pointer p) { super(p); } private native void allocate(); private native void allocateArray(long size); @Override public longlong1 position(long position) { return (longlong1)super.position(position); } public native long x(); public native longlong1 x(long x); } public static class ulonglong1 extends Pointer { static { Loader.load(); } /** Default native constructor. */ public ulonglong1() { super((Pointer)null); allocate(); } /** Native array allocator. Access with {@link Pointer#position(long)}. */ public ulonglong1(long size) { super((Pointer)null); allocateArray(size); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public ulonglong1(Pointer p) { super(p); } private native void allocate(); private native void allocateArray(long size); @Override public ulonglong1 position(long position) { return (ulonglong1)super.position(position); } public native @Cast("unsigned long long int") long x(); public native ulonglong1 x(long x); } public static class longlong2 extends Pointer { static { Loader.load(); } /** Default native constructor. */ public longlong2() { super((Pointer)null); allocate(); } /** Native array allocator. Access with {@link Pointer#position(long)}. */ public longlong2(long size) { super((Pointer)null); allocateArray(size); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public longlong2(Pointer p) { super(p); } private native void allocate(); private native void allocateArray(long size); @Override public longlong2 position(long position) { return (longlong2)super.position(position); } public native long x(); public native longlong2 x(long x); public native long y(); public native longlong2 y(long y); } public static class ulonglong2 extends Pointer { static { Loader.load(); } /** Default native constructor. */ public ulonglong2() { super((Pointer)null); allocate(); } /** Native array allocator. Access with {@link Pointer#position(long)}. */ public ulonglong2(long size) { super((Pointer)null); allocateArray(size); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public ulonglong2(Pointer p) { super(p); } private native void allocate(); private native void allocateArray(long size); @Override public ulonglong2 position(long position) { return (ulonglong2)super.position(position); } public native @Cast("unsigned long long int") long x(); public native ulonglong2 x(long x); public native @Cast("unsigned long long int") long y(); public native ulonglong2 y(long y); } public static class longlong3 extends Pointer { static { Loader.load(); } /** Default native constructor. */ public longlong3() { super((Pointer)null); allocate(); } /** Native array allocator. Access with {@link Pointer#position(long)}. */ public longlong3(long size) { super((Pointer)null); allocateArray(size); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public longlong3(Pointer p) { super(p); } private native void allocate(); private native void allocateArray(long size); @Override public longlong3 position(long position) { return (longlong3)super.position(position); } public native long x(); public native longlong3 x(long x); public native long y(); public native longlong3 y(long y); public native long z(); public native longlong3 z(long z); } public static class ulonglong3 extends Pointer { static { Loader.load(); } /** Default native constructor. */ public ulonglong3() { super((Pointer)null); allocate(); } /** Native array allocator. Access with {@link Pointer#position(long)}. 
*/ public ulonglong3(long size) { super((Pointer)null); allocateArray(size); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public ulonglong3(Pointer p) { super(p); } private native void allocate(); private native void allocateArray(long size); @Override public ulonglong3 position(long position) { return (ulonglong3)super.position(position); } public native @Cast("unsigned long long int") long x(); public native ulonglong3 x(long x); public native @Cast("unsigned long long int") long y(); public native ulonglong3 y(long y); public native @Cast("unsigned long long int") long z(); public native ulonglong3 z(long z); } public static class longlong4 extends Pointer { static { Loader.load(); } /** Default native constructor. */ public longlong4() { super((Pointer)null); allocate(); } /** Native array allocator. Access with {@link Pointer#position(long)}. */ public longlong4(long size) { super((Pointer)null); allocateArray(size); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public longlong4(Pointer p) { super(p); } private native void allocate(); private native void allocateArray(long size); @Override public longlong4 position(long position) { return (longlong4)super.position(position); } public native long x(); public native longlong4 x(long x); public native long y(); public native longlong4 y(long y); public native long z(); public native longlong4 z(long z); public native long w(); public native longlong4 w(long w); } public static class ulonglong4 extends Pointer { static { Loader.load(); } /** Default native constructor. */ public ulonglong4() { super((Pointer)null); allocate(); } /** Native array allocator. Access with {@link Pointer#position(long)}. */ public ulonglong4(long size) { super((Pointer)null); allocateArray(size); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public ulonglong4(Pointer p) { super(p); } private native void allocate(); private native void allocateArray(long size); @Override public ulonglong4 position(long position) { return (ulonglong4)super.position(position); } public native @Cast("unsigned long long int") long x(); public native ulonglong4 x(long x); public native @Cast("unsigned long long int") long y(); public native ulonglong4 y(long y); public native @Cast("unsigned long long int") long z(); public native ulonglong4 z(long z); public native @Cast("unsigned long long int") long w(); public native ulonglong4 w(long w); } public static class double1 extends Pointer { static { Loader.load(); } /** Default native constructor. */ public double1() { super((Pointer)null); allocate(); } /** Native array allocator. Access with {@link Pointer#position(long)}. */ public double1(long size) { super((Pointer)null); allocateArray(size); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public double1(Pointer p) { super(p); } private native void allocate(); private native void allocateArray(long size); @Override public double1 position(long position) { return (double1)super.position(position); } public native double x(); public native double1 x(double x); } public static class double2 extends Pointer { static { Loader.load(); } /** Default native constructor. */ public double2() { super((Pointer)null); allocate(); } /** Native array allocator. Access with {@link Pointer#position(long)}. */ public double2(long size) { super((Pointer)null); allocateArray(size); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public double2(Pointer p) { super(p); } private native void allocate(); private native void allocateArray(long size); @Override public double2 position(long position) { return (double2)super.position(position); } public native double x(); public native double2 x(double x); public native double y(); public native double2 y(double y); } public static class double3 extends Pointer { static { Loader.load(); } /** Default native constructor. */ public double3() { super((Pointer)null); allocate(); } /** Native array allocator. Access with {@link Pointer#position(long)}. */ public double3(long size) { super((Pointer)null); allocateArray(size); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public double3(Pointer p) { super(p); } private native void allocate(); private native void allocateArray(long size); @Override public double3 position(long position) { return (double3)super.position(position); } public native double x(); public native double3 x(double x); public native double y(); public native double3 y(double y); public native double z(); public native double3 z(double z); } public static class double4 extends Pointer { static { Loader.load(); } /** Default native constructor. */ public double4() { super((Pointer)null); allocate(); } /** Native array allocator. Access with {@link Pointer#position(long)}. */ public double4(long size) { super((Pointer)null); allocateArray(size); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public double4(Pointer p) { super(p); } private native void allocate(); private native void allocateArray(long size); @Override public double4 position(long position) { return (double4)super.position(position); } public native double x(); public native double4 x(double x); public native double y(); public native double4 y(double y); public native double z(); public native double4 z(double z); public native double w(); public native double4 w(double w); } // #if !defined(__CUDACC__) && !defined(__CUDABE__) && // defined(_WIN32) && !defined(_WIN64) // #endif /* !__CUDACC__ && !__CUDABE__ && _WIN32 && !_WIN64 */ /******************************************************************************* * * * * * * *******************************************************************************/ /******************************************************************************* * * * * * * *******************************************************************************/ @NoOffset public static class dim3 extends Pointer { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public dim3(Pointer p) { super(p); } /** Native array allocator. Access with {@link Pointer#position(long)}. 
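 * <p>Illustrative sketch (editor-added, not generated): {@code dim3} mirrors the CUDA
 * launch-configuration type, and {@link #asUint3()} exposes the same values as a plain
 * {@code uint3}. Actually launching a kernel with it goes through the runtime/driver
 * launch calls declared elsewhere in this class.</p>
 * <pre>{@code
 * dim3 block = new dim3(256, 1, 1);          // 256 threads per block
 * dim3 grid  = new dim3(4096, 1, 1);         // 4096 blocks
 * uint3 asVec = block.asUint3();             // same components as a uint3
 * int threadsPerBlock = asVec.x();           // 256
 * }</pre>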
*/ public dim3(long size) { super((Pointer)null); allocateArray(size); } private native void allocateArray(long size); @Override public dim3 position(long position) { return (dim3)super.position(position); } public native @Cast("unsigned int") int x(); public native dim3 x(int x); public native @Cast("unsigned int") int y(); public native dim3 y(int y); public native @Cast("unsigned int") int z(); public native dim3 z(int z); // #if defined(__cplusplus) public dim3(@Cast("unsigned int") int vx/*=1*/, @Cast("unsigned int") int vy/*=1*/, @Cast("unsigned int") int vz/*=1*/) { super((Pointer)null); allocate(vx, vy, vz); } private native void allocate(@Cast("unsigned int") int vx/*=1*/, @Cast("unsigned int") int vy/*=1*/, @Cast("unsigned int") int vz/*=1*/); public dim3() { super((Pointer)null); allocate(); } private native void allocate(); public dim3(@ByVal uint3 v) { super((Pointer)null); allocate(v); } private native void allocate(@ByVal uint3 v); public native @ByVal @Name("operator uint3") uint3 asUint3(); // #endif /* __cplusplus */ } // #undef __cuda_builtin_vector_align8 // #endif /* !__VECTOR_TYPES_H__ */ // Parsed from /* * Copyright 1993-2014 NVIDIA Corporation. All rights reserved. * * NOTICE TO LICENSEE: * * This source code and/or documentation ("Licensed Deliverables") are * subject to NVIDIA intellectual property rights under U.S. and * international Copyright laws. * * These Licensed Deliverables contained herein is PROPRIETARY and * CONFIDENTIAL to NVIDIA and is being provided under the terms and * conditions of a form of NVIDIA software license agreement by and * between NVIDIA and Licensee ("License Agreement") or electronically * accepted by Licensee. Notwithstanding any terms or conditions to * the contrary in the License Agreement, reproduction or disclosure * of the Licensed Deliverables to any third party without the express * written consent of NVIDIA is prohibited. * * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE * OF THESE LICENSED DELIVERABLES. * * U.S. Government End Users. These Licensed Deliverables are a * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT * 1995), consisting of "commercial computer software" and "commercial * computer software documentation" as such terms are used in 48 * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government * only as a commercial end item. Consistent with 48 C.F.R.12.212 and * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all * U.S. Government End Users acquire the Licensed Deliverables with * only those rights set forth herein. 
* * Any use of the Licensed Deliverables in individual and commercial * software must include, in the user documentation and internal * comments to the code, the above Disclaimer and U.S. Government End * Users Notice. */ /******************************************************************************* * * * * * * *******************************************************************************/ // #include "device_types.h" // #if !defined(__CUDACC_RTC__) // #define EXCLUDE_FROM_RTC // #include "driver_types.h" // #undef EXCLUDE_FROM_RTC // #endif /* !__CUDACC_RTC__ */ // #include "surface_types.h" // #include "texture_types.h" // #include "vector_types.h" // Parsed from /* * Copyright 1993-2012 NVIDIA Corporation. All rights reserved. * * NOTICE TO LICENSEE: * * This source code and/or documentation ("Licensed Deliverables") are * subject to NVIDIA intellectual property rights under U.S. and * international Copyright laws. * * These Licensed Deliverables contained herein is PROPRIETARY and * CONFIDENTIAL to NVIDIA and is being provided under the terms and * conditions of a form of NVIDIA software license agreement by and * between NVIDIA and Licensee ("License Agreement") or electronically * accepted by Licensee. Notwithstanding any terms or conditions to * the contrary in the License Agreement, reproduction or disclosure * of the Licensed Deliverables to any third party without the express * written consent of NVIDIA is prohibited. * * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE * OF THESE LICENSED DELIVERABLES. * * U.S. Government End Users. These Licensed Deliverables are a * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT * 1995), consisting of "commercial computer software" and "commercial * computer software documentation" as such terms are used in 48 * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government * only as a commercial end item. Consistent with 48 C.F.R.12.212 and * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all * U.S. Government End Users acquire the Licensed Deliverables with * only those rights set forth herein. * * Any use of the Licensed Deliverables in individual and commercial * software must include, in the user documentation and internal * comments to the code, the above Disclaimer and U.S. Government End * Users Notice. */ // #if !defined(__CUDA_RUNTIME_API_H__) // #define __CUDA_RUNTIME_API_H__ /** * \latexonly * \page sync_async API synchronization behavior * * \section memcpy_sync_async_behavior Memcpy * The API provides memcpy/memset functions in both synchronous and asynchronous forms, * the latter having an \e "Async" suffix. 
This is a misnomer as each function
 * may exhibit synchronous or asynchronous behavior depending on the arguments
 * passed to the function. In the reference documentation, each memcpy function is
 * categorized as \e synchronous or \e asynchronous, corresponding to the definitions
 * below.
 *
 * \subsection MemcpySynchronousBehavior Synchronous
 *
 * 1. For transfers from pageable host memory to device memory, a stream sync is performed
 *    before the copy is initiated. The function will return once the pageable
 *    buffer has been copied to the staging memory for DMA transfer to device memory,
 *    but the DMA to final destination may not have completed.
 *
 * 2. For transfers from pinned host memory to device memory, the function is synchronous
 *    with respect to the host.
 *
 * 3. For transfers from device to either pageable or pinned host memory, the function returns
 *    only once the copy has completed.
 *
 * 4. For transfers from device memory to device memory, no host-side synchronization is
 *    performed.
 *
 * 5. For transfers from any host memory to any host memory, the function is fully
 *    synchronous with respect to the host.
 *
 * \subsection MemcpyAsynchronousBehavior Asynchronous
 *
 * 1. For transfers from device memory to pageable host memory, the function
 *    will return only once the copy has completed.
 *
 * 2. For transfers from any host memory to any host memory, the function is fully
 *    synchronous with respect to the host.
 *
 * 3. For all other transfers, the function is fully asynchronous. If pageable
 *    memory must first be staged to pinned memory, this will be handled
 *    asynchronously with a worker thread.
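 *
 * As a rough illustration of these categories with the JavaCPP bindings in this class
 * (a minimal sketch, not a complete program: it assumes the cudaMalloc, cudaMemcpy,
 * cudaMemcpyAsync, cudaStreamCreate, cudaStreamSynchronize and cudaFree bindings, the
 * CUstream_st stream type and the cudaMemcpyHostToDevice constant declared elsewhere in
 * this generated class, and it omits error checking):
 *
 * {@code
 *     // import static org.bytedeco.javacpp.cuda.*;
 *     FloatPointer host = new FloatPointer(1024);      // pageable host memory
 *     Pointer dev = new Pointer();
 *     cudaMalloc(dev, 1024 * 4);                       // device memory
 *
 *     // Synchronous form: returns once the pageable buffer has been staged for DMA.
 *     cudaMemcpy(dev, host, 1024 * 4, cudaMemcpyHostToDevice);
 *
 *     // Asynchronous form: queued on a stream, the host thread is not blocked.
 *     CUstream_st stream = new CUstream_st();
 *     cudaStreamCreate(stream);
 *     cudaMemcpyAsync(dev, host, 1024 * 4, cudaMemcpyHostToDevice, stream);
 *     cudaStreamSynchronize(stream);                   // wait before reusing the buffer
 *     cudaFree(dev);
 * }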
* * \section memset_sync_async_behavior Memset * The cudaMemset functions are asynchronous with respect to the host * except when the target memory is pinned host memory. The \e Async * versions are always asynchronous with respect to the host. * * \section kernel_launch_details Kernel Launches * Kernel launches are asynchronous with respect to the host. Details of * concurrent kernel execution and data transfers can be found in the CUDA * Programmers Guide. * * \endlatexonly */ /** * There are two levels for the runtime API. * * The C API (cuda_runtime_api.h) is * a C-style interface that does not require compiling with \p nvcc. * * The \ref CUDART_HIGHLEVEL "C++ API" (cuda_runtime.h) is a * C++-style interface built on top of the C API. It wraps some of the * C API routines, using overloading, references and default arguments. * These wrappers can be used from C++ code and can be compiled with any C++ * compiler. The C++ API also has some CUDA-specific wrappers that wrap * C API routines that deal with symbols, textures, and device functions. * These wrappers require the use of \p nvcc because they depend on code being * generated by the compiler. For example, the execution configuration syntax * to invoke kernels is only available in source code compiled with \p nvcc. */ /** CUDA Runtime API Version */ public static final int CUDART_VERSION = 8000; // #include "host_defines.h" // #include "builtin_types.h" // #if !defined(__CUDACC_INTEGRATED__) // #include "cuda_device_runtime_api.h" // #endif /* !defined(__CUDACC_INTEGRATED__) */ // #if defined(CUDA_API_PER_THREAD_DEFAULT_STREAM) || defined(__CUDA_API_VERSION_INTERNAL) // #define __CUDART_API_PER_THREAD_DEFAULT_STREAM // #define __CUDART_API_PTDS(api) api ## _ptds // #define __CUDART_API_PTSZ(api) api ## _ptsz // #else // #define __CUDART_API_PTDS(api) api // #define __CUDART_API_PTSZ(api) api // #endif // #if defined(__CUDART_API_PER_THREAD_DEFAULT_STREAM) // #endif /** \cond impl_private */ // #if !defined(__dv) // #if defined(__cplusplus) // #define __dv(v) // = v // #else /* __cplusplus */ // #define __dv(v) // #endif /* __cplusplus */ // #endif /* !__dv */ /** \endcond impl_private */ // #if !defined(__CUDACC_INTEGRATED__) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 350)) /** Visible to SM>=3.5 and "__host__ __device__" only **/ // #define CUDART_DEVICE __device__ // #else // #define CUDART_DEVICE // #endif /** CUDART_DEVICE */ // #if defined(__cplusplus) // #endif /* __cplusplus */ /** * \defgroup CUDART_DEVICE Device Management * * ___MANBRIEF___ device management functions of the CUDA runtime API * (___CURRENT_FILE___) ___ENDMANBRIEF___ * * This section describes the device management functions of the CUDA runtime * application programming interface. * * \{ */ /** * \brief Destroy all allocations and reset all state on the current device * in the current process. * * Explicitly destroys and cleans up all resources associated with the current * device in the current process. Any subsequent API call to this device will * reinitialize the device. * * Note that this function will reset the device immediately. It is the caller's * responsibility to ensure that the device is not being accessed by any * other host threads from the process when this function is called. * * @return * ::cudaSuccess * \notefnerr * * \sa ::cudaDeviceSynchronize */ public static native @Cast("cudaError_t") int cudaDeviceReset(); /** * \brief Wait for compute device to finish * * Blocks until the device has completed all preceding requested tasks. 
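 *
 * A minimal host-side sketch with the static bindings in this class (the device work
 * itself, kernels or asynchronous copies, is assumed to have been queued by other code;
 * the cudaSuccess constant is defined elsewhere in this generated class):
 *
 * {@code
 *     // ... queue kernels and asynchronous copies here ...
 *     int err = cudaDeviceSynchronize();   // block until everything queued so far is done
 *     if (err != cudaSuccess) {
 *         System.err.println(cudaGetErrorString(err).getString());
 *     }
 *     cudaDeviceReset();                   // then tear down all state for this device/process
 * }
 *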
* ::cudaDeviceSynchronize() returns an error if one of the preceding tasks * has failed. If the ::cudaDeviceScheduleBlockingSync flag was set for * this device, the host thread will block until the device has finished * its work. * * @return * ::cudaSuccess * \notefnerr * * \sa ::cudaDeviceReset */ public static native @Cast("cudaError_t") int cudaDeviceSynchronize(); /** * \brief Set resource limits * * Setting \p limit to \p value is a request by the application to update * the current limit maintained by the device. The driver is free to * modify the requested value to meet h/w requirements (this could be * clamping to minimum or maximum values, rounding up to nearest element * size, etc). The application can use ::cudaDeviceGetLimit() to find out * exactly what the limit has been set to. * * Setting each ::cudaLimit has its own specific restrictions, so each is * discussed here. * * - ::cudaLimitStackSize controls the stack size in bytes of each GPU thread. * * - ::cudaLimitPrintfFifoSize controls the size in bytes of the shared FIFO * used by the ::printf() and ::fprintf() device system calls. Setting * ::cudaLimitPrintfFifoSize must not be performed after launching any kernel * that uses the ::printf() or ::fprintf() device system calls - in such case * ::cudaErrorInvalidValue will be returned. * * - ::cudaLimitMallocHeapSize controls the size in bytes of the heap used by * the ::malloc() and ::free() device system calls. Setting * ::cudaLimitMallocHeapSize must not be performed after launching any kernel * that uses the ::malloc() or ::free() device system calls - in such case * ::cudaErrorInvalidValue will be returned. * * - ::cudaLimitDevRuntimeSyncDepth controls the maximum nesting depth of a * grid at which a thread can safely call ::cudaDeviceSynchronize(). Setting * this limit must be performed before any launch of a kernel that uses the * device runtime and calls ::cudaDeviceSynchronize() above the default sync * depth, two levels of grids. Calls to ::cudaDeviceSynchronize() will fail * with error code ::cudaErrorSyncDepthExceeded if the limitation is * violated. This limit can be set smaller than the default or up the maximum * launch depth of 24. When setting this limit, keep in mind that additional * levels of sync depth require the runtime to reserve large amounts of * device memory which can no longer be used for user allocations. If these * reservations of device memory fail, ::cudaDeviceSetLimit will return * ::cudaErrorMemoryAllocation, and the limit can be reset to a lower value. * This limit is only applicable to devices of compute capability 3.5 and * higher. Attempting to set this limit on devices of compute capability less * than 3.5 will result in the error ::cudaErrorUnsupportedLimit being * returned. * * - ::cudaLimitDevRuntimePendingLaunchCount controls the maximum number of * outstanding device runtime launches that can be made from the current * device. A grid is outstanding from the point of launch up until the grid * is known to have been completed. Device runtime launches which violate * this limitation fail and return ::cudaErrorLaunchPendingCountExceeded when * ::cudaGetLastError() is called after launch. If more pending launches than * the default (2048 launches) are needed for a module using the device * runtime, this limit can be increased. Keep in mind that being able to * sustain additional pending launches will require the runtime to reserve * larger amounts of device memory upfront which can no longer be used for * allocations. 
If these reservations fail, ::cudaDeviceSetLimit will return * ::cudaErrorMemoryAllocation, and the limit can be reset to a lower value. * This limit is only applicable to devices of compute capability 3.5 and * higher. Attempting to set this limit on devices of compute capability less * than 3.5 will result in the error ::cudaErrorUnsupportedLimit being * returned. * * @param limit - Limit to set * @param value - Size of limit * * @return * ::cudaSuccess, * ::cudaErrorUnsupportedLimit, * ::cudaErrorInvalidValue, * ::cudaErrorMemoryAllocation * \notefnerr * * \sa ::cudaDeviceGetLimit */ public static native @Cast("cudaError_t") int cudaDeviceSetLimit(@Cast("cudaLimit") int limit, @Cast("size_t") long value); /** * \brief Returns resource limits * * Returns in \p *pValue the current size of \p limit. The supported * ::cudaLimit values are: * - ::cudaLimitStackSize: stack size in bytes of each GPU thread; * - ::cudaLimitPrintfFifoSize: size in bytes of the shared FIFO used by the * ::printf() and ::fprintf() device system calls. * - ::cudaLimitMallocHeapSize: size in bytes of the heap used by the * ::malloc() and ::free() device system calls; * - ::cudaLimitDevRuntimeSyncDepth: maximum grid depth at which a * thread can isssue the device runtime call ::cudaDeviceSynchronize() * to wait on child grid launches to complete. * - ::cudaLimitDevRuntimePendingLaunchCount: maximum number of outstanding * device runtime launches. * * @param limit - Limit to query * @param pValue - Returned size of the limit * * @return * ::cudaSuccess, * ::cudaErrorUnsupportedLimit, * ::cudaErrorInvalidValue * \notefnerr * * \sa ::cudaDeviceSetLimit */ public static native @Cast("cudaError_t") int cudaDeviceGetLimit(@Cast("size_t*") SizeTPointer pValue, @Cast("cudaLimit") int limit); /** * \brief Returns the preferred cache configuration for the current device. * * On devices where the L1 cache and shared memory use the same hardware * resources, this returns through \p pCacheConfig the preferred cache * configuration for the current device. This is only a preference. The * runtime will use the requested configuration if possible, but it is free to * choose a different configuration if required to execute functions. * * This will return a \p pCacheConfig of ::cudaFuncCachePreferNone on devices * where the size of the L1 cache and shared memory are fixed. * * The supported cache configurations are: * - ::cudaFuncCachePreferNone: no preference for shared memory or L1 (default) * - ::cudaFuncCachePreferShared: prefer larger shared memory and smaller L1 cache * - ::cudaFuncCachePreferL1: prefer larger L1 cache and smaller shared memory * - ::cudaFuncCachePreferEqual: prefer equal size L1 cache and shared memory * * @param pCacheConfig - Returned cache configuration * * @return * ::cudaSuccess, * ::cudaErrorInitializationError * \notefnerr * * \sa cudaDeviceSetCacheConfig, * \ref ::cudaFuncSetCacheConfig(const void*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C API)", * \ref ::cudaFuncSetCacheConfig(T*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C++ API)" */ public static native @Cast("cudaError_t") int cudaDeviceGetCacheConfig(@Cast("cudaFuncCache*") IntPointer pCacheConfig); public static native @Cast("cudaError_t") int cudaDeviceGetCacheConfig(@Cast("cudaFuncCache*") IntBuffer pCacheConfig); public static native @Cast("cudaError_t") int cudaDeviceGetCacheConfig(@Cast("cudaFuncCache*") int[] pCacheConfig); /** * \brief Returns numerical values that correspond to the least and * greatest stream priorities. 
* * Returns in \p *leastPriority and \p *greatestPriority the numerical values that correspond * to the least and greatest stream priorities respectively. Stream priorities * follow a convention where lower numbers imply greater priorities. The range of * meaningful stream priorities is given by [\p *greatestPriority, \p *leastPriority]. * If the user attempts to create a stream with a priority value that is * outside the the meaningful range as specified by this API, the priority is * automatically clamped down or up to either \p *leastPriority or \p *greatestPriority * respectively. See ::cudaStreamCreateWithPriority for details on creating a * priority stream. * A NULL may be passed in for \p *leastPriority or \p *greatestPriority if the value * is not desired. * * This function will return '0' in both \p *leastPriority and \p *greatestPriority if * the current context's device does not support stream priorities * (see ::cudaDeviceGetAttribute). * * @param leastPriority - Pointer to an int in which the numerical value for least * stream priority is returned * @param greatestPriority - Pointer to an int in which the numerical value for greatest * stream priority is returned * * @return * ::cudaSuccess, * ::cudaErrorInvalidValue * \notefnerr * * \sa ::cudaStreamCreateWithPriority, * ::cudaStreamGetPriority */ public static native @Cast("cudaError_t") int cudaDeviceGetStreamPriorityRange(IntPointer leastPriority, IntPointer greatestPriority); public static native @Cast("cudaError_t") int cudaDeviceGetStreamPriorityRange(IntBuffer leastPriority, IntBuffer greatestPriority); public static native @Cast("cudaError_t") int cudaDeviceGetStreamPriorityRange(int[] leastPriority, int[] greatestPriority); /** * \brief Sets the preferred cache configuration for the current device. * * On devices where the L1 cache and shared memory use the same hardware * resources, this sets through \p cacheConfig the preferred cache * configuration for the current device. This is only a preference. The * runtime will use the requested configuration if possible, but it is free to * choose a different configuration if required to execute the function. Any * function preference set via * \ref ::cudaFuncSetCacheConfig(const void*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C API)" * or * \ref ::cudaFuncSetCacheConfig(T*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C++ API)" * will be preferred over this device-wide setting. Setting the device-wide * cache configuration to ::cudaFuncCachePreferNone will cause subsequent * kernel launches to prefer to not change the cache configuration unless * required to launch the kernel. * * This setting does nothing on devices where the size of the L1 cache and * shared memory are fixed. * * Launching a kernel with a different preference than the most recent * preference setting may insert a device-side synchronization point. 
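 *
 * A short sketch of querying and then changing the device-wide preference through the
 * bindings in this class (it assumes the cudaFuncCachePreferShared constant defined
 * elsewhere in this generated class and ignores the returned error codes):
 *
 * {@code
 *     int[] current = new int[1];
 *     cudaDeviceGetCacheConfig(current);                    // current device-wide preference
 *     cudaDeviceSetCacheConfig(cudaFuncCachePreferShared);  // favor shared memory over L1
 * }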
* * The supported cache configurations are: * - ::cudaFuncCachePreferNone: no preference for shared memory or L1 (default) * - ::cudaFuncCachePreferShared: prefer larger shared memory and smaller L1 cache * - ::cudaFuncCachePreferL1: prefer larger L1 cache and smaller shared memory * - ::cudaFuncCachePreferEqual: prefer equal size L1 cache and shared memory * * @param cacheConfig - Requested cache configuration * * @return * ::cudaSuccess, * ::cudaErrorInitializationError * \notefnerr * * \sa ::cudaDeviceGetCacheConfig, * \ref ::cudaFuncSetCacheConfig(const void*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C API)", * \ref ::cudaFuncSetCacheConfig(T*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C++ API)" */ public static native @Cast("cudaError_t") int cudaDeviceSetCacheConfig(@Cast("cudaFuncCache") int cacheConfig); /** * \brief Returns the shared memory configuration for the current device. * * This function will return in \p pConfig the current size of shared memory banks * on the current device. On devices with configurable shared memory banks, * ::cudaDeviceSetSharedMemConfig can be used to change this setting, so that all * subsequent kernel launches will by default use the new bank size. When * ::cudaDeviceGetSharedMemConfig is called on devices without configurable shared * memory, it will return the fixed bank size of the hardware. * * The returned bank configurations can be either: * - ::cudaSharedMemBankSizeFourByte - shared memory bank width is four bytes. * - ::cudaSharedMemBankSizeEightByte - shared memory bank width is eight bytes. * * @param pConfig - Returned cache configuration * * @return * ::cudaSuccess, * ::cudaErrorInvalidValue, * ::cudaErrorInitializationError * \notefnerr * * \sa ::cudaDeviceSetCacheConfig, * ::cudaDeviceGetCacheConfig, * ::cudaDeviceSetSharedMemConfig, * ::cudaFuncSetCacheConfig */ public static native @Cast("cudaError_t") int cudaDeviceGetSharedMemConfig(@Cast("cudaSharedMemConfig*") IntPointer pConfig); public static native @Cast("cudaError_t") int cudaDeviceGetSharedMemConfig(@Cast("cudaSharedMemConfig*") IntBuffer pConfig); public static native @Cast("cudaError_t") int cudaDeviceGetSharedMemConfig(@Cast("cudaSharedMemConfig*") int[] pConfig); /** * \brief Sets the shared memory configuration for the current device. * * On devices with configurable shared memory banks, this function will set * the shared memory bank size which is used for all subsequent kernel launches. * Any per-function setting of shared memory set via ::cudaFuncSetSharedMemConfig * will override the device wide setting. * * Changing the shared memory configuration between launches may introduce * a device side synchronization point. * * Changing the shared memory bank size will not increase shared memory usage * or affect occupancy of kernels, but may have major effects on performance. * Larger bank sizes will allow for greater potential bandwidth to shared memory, * but will change what kinds of accesses to shared memory will result in bank * conflicts. * * This function will do nothing on devices with fixed shared memory bank size. * * The supported bank configurations are: * - ::cudaSharedMemBankSizeDefault: set bank width the device default (currently, * four bytes) * - ::cudaSharedMemBankSizeFourByte: set shared memory bank width to be four bytes * natively. * - ::cudaSharedMemBankSizeEightByte: set shared memory bank width to be eight * bytes natively. 
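 *
 * For example (a hedged sketch using the bindings in this class; the
 * cudaSharedMemBankSizeEightByte constant is defined elsewhere in this generated class
 * and error codes are ignored):
 *
 * {@code
 *     int[] bankSize = new int[1];
 *     cudaDeviceGetSharedMemConfig(bankSize);                        // current bank width
 *     cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte);  // favor 64-bit accesses
 * }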
* * @param config - Requested cache configuration * * @return * ::cudaSuccess, * ::cudaErrorInvalidValue, * ::cudaErrorInitializationError * \notefnerr * * \sa ::cudaDeviceSetCacheConfig, * ::cudaDeviceGetCacheConfig, * ::cudaDeviceGetSharedMemConfig, * ::cudaFuncSetCacheConfig */ public static native @Cast("cudaError_t") int cudaDeviceSetSharedMemConfig(@Cast("cudaSharedMemConfig") int config); /** * \brief Returns a handle to a compute device * * Returns in \p *device a device ordinal given a PCI bus ID string. * * @param device - Returned device ordinal * * @param pciBusId - String in one of the following forms: * [domain]:[bus]:[device].[function] * [domain]:[bus]:[device] * [bus]:[device].[function] * where \p domain, \p bus, \p device, and \p function are all hexadecimal values * * @return * ::cudaSuccess, * ::cudaErrorInvalidValue, * ::cudaErrorInvalidDevice * \notefnerr * * \sa ::cudaDeviceGetPCIBusId */ public static native @Cast("cudaError_t") int cudaDeviceGetByPCIBusId(IntPointer device, @Cast("const char*") BytePointer pciBusId); public static native @Cast("cudaError_t") int cudaDeviceGetByPCIBusId(IntBuffer device, String pciBusId); public static native @Cast("cudaError_t") int cudaDeviceGetByPCIBusId(int[] device, @Cast("const char*") BytePointer pciBusId); public static native @Cast("cudaError_t") int cudaDeviceGetByPCIBusId(IntPointer device, String pciBusId); public static native @Cast("cudaError_t") int cudaDeviceGetByPCIBusId(IntBuffer device, @Cast("const char*") BytePointer pciBusId); public static native @Cast("cudaError_t") int cudaDeviceGetByPCIBusId(int[] device, String pciBusId); /** * \brief Returns a PCI Bus Id string for the device * * Returns an ASCII string identifying the device \p dev in the NULL-terminated * string pointed to by \p pciBusId. \p len specifies the maximum length of the * string that may be returned. * * @param pciBusId - Returned identifier string for the device in the following format * [domain]:[bus]:[device].[function] * where \p domain, \p bus, \p device, and \p function are all hexadecimal values. * pciBusId should be large enough to store 13 characters including the NULL-terminator. * * @param len - Maximum length of string to store in \p name * * @param device - Device to get identifier string for * * @return * ::cudaSuccess, * ::cudaErrorInvalidValue, * ::cudaErrorInvalidDevice * \notefnerr * * \sa ::cudaDeviceGetByPCIBusId
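 *
 * A small sketch of the round trip between a device ordinal and its PCI bus ID string,
 * using the byte[] and int[] overloads declared in this class (error codes ignored):
 *
 * {@code
 *     byte[] busId = new byte[13];                    // 13 bytes, including the NUL terminator
 *     cudaDeviceGetPCIBusId(busId, busId.length, 0);  // bus ID of device 0
 *     String id = new String(busId).trim();
 *     int[] device = new int[1];
 *     cudaDeviceGetByPCIBusId(device, id);            // back to the device ordinal
 * }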

*/ public static native @Cast("cudaError_t") int cudaDeviceGetPCIBusId(@Cast("char*") BytePointer pciBusId, int len, int device); public static native @Cast("cudaError_t") int cudaDeviceGetPCIBusId(@Cast("char*") ByteBuffer pciBusId, int len, int device); public static native @Cast("cudaError_t") int cudaDeviceGetPCIBusId(@Cast("char*") byte[] pciBusId, int len, int device); /** * \brief Gets an interprocess handle for a previously allocated event * * Takes as input a previously allocated event. This event must have been * created with the ::cudaEventInterprocess and ::cudaEventDisableTiming * flags set. This opaque handle may be copied into other processes and * opened with ::cudaIpcOpenEventHandle to allow efficient hardware * synchronization between GPU work in different processes. * * After the event has been been opened in the importing process, * ::cudaEventRecord, ::cudaEventSynchronize, ::cudaStreamWaitEvent and * ::cudaEventQuery may be used in either process. Performing operations * on the imported event after the exported event has been freed * with ::cudaEventDestroy will result in undefined behavior. * * IPC functionality is restricted to devices with support for unified * addressing on Linux operating systems. * * @param handle - Pointer to a user allocated cudaIpcEventHandle * in which to return the opaque event handle * @param event - Event allocated with ::cudaEventInterprocess and * ::cudaEventDisableTiming flags. * * @return * ::cudaSuccess, * ::cudaErrorInvalidResourceHandle, * ::cudaErrorMemoryAllocation, * ::cudaErrorMapBufferObjectFailed * * \sa * ::cudaEventCreate, * ::cudaEventDestroy, * ::cudaEventSynchronize, * ::cudaEventQuery, * ::cudaStreamWaitEvent, * ::cudaIpcOpenEventHandle, * ::cudaIpcGetMemHandle, * ::cudaIpcOpenMemHandle, * ::cudaIpcCloseMemHandle */ public static native @Cast("cudaError_t") int cudaIpcGetEventHandle(cudaIpcEventHandle_t handle, CUevent_st event); /** * \brief Opens an interprocess event handle for use in the current process * * Opens an interprocess event handle exported from another process with * ::cudaIpcGetEventHandle. This function returns a ::cudaEvent_t that behaves like * a locally created event with the ::cudaEventDisableTiming flag specified. * This event must be freed with ::cudaEventDestroy. * * Performing operations on the imported event after the exported event has * been freed with ::cudaEventDestroy will result in undefined behavior. * * IPC functionality is restricted to devices with support for unified * addressing on Linux operating systems. * * @param event - Returns the imported event * @param handle - Interprocess handle to open * * @return * ::cudaSuccess, * ::cudaErrorMapBufferObjectFailed, * ::cudaErrorInvalidResourceHandle * * \sa * ::cudaEventCreate, * ::cudaEventDestroy, * ::cudaEventSynchronize, * ::cudaEventQuery, * ::cudaStreamWaitEvent, * ::cudaIpcGetEventHandle, * ::cudaIpcGetMemHandle, * ::cudaIpcOpenMemHandle, * ::cudaIpcCloseMemHandle */ public static native @Cast("cudaError_t") int cudaIpcOpenEventHandle(@ByPtrPtr CUevent_st event, @ByVal cudaIpcEventHandle_t handle); /** * \brief Gets an interprocess memory handle for an existing device memory * allocation * * Takes a pointer to the base of an existing device memory allocation created * with ::cudaMalloc and exports it for use in another process. This is a * lightweight operation and may be called multiple times on an allocation * without adverse effects. 
* * If a region of memory is freed with ::cudaFree and a subsequent call * to ::cudaMalloc returns memory with the same device address, * ::cudaIpcGetMemHandle will return a unique handle for the * new memory. * * IPC functionality is restricted to devices with support for unified * addressing on Linux operating systems. * * @param handle - Pointer to user allocated ::cudaIpcMemHandle to return * the handle in. * @param devPtr - Base pointer to previously allocated device memory * * @return * ::cudaSuccess, * ::cudaErrorInvalidResourceHandle, * ::cudaErrorMemoryAllocation, * ::cudaErrorMapBufferObjectFailed, * * \sa * ::cudaMalloc, * ::cudaFree, * ::cudaIpcGetEventHandle, * ::cudaIpcOpenEventHandle, * ::cudaIpcOpenMemHandle, * ::cudaIpcCloseMemHandle */ public static native @Cast("cudaError_t") int cudaIpcGetMemHandle(cudaIpcMemHandle_t handle, Pointer devPtr); /** * \brief Opens an interprocess memory handle exported from another process * and returns a device pointer usable in the local process. * * Maps memory exported from another process with ::cudaIpcGetMemHandle into * the current device address space. For contexts on different devices * ::cudaIpcOpenMemHandle can attempt to enable peer access between the * devices as if the user called ::cudaDeviceEnablePeerAccess. This behavior is * controlled by the ::cudaIpcMemLazyEnablePeerAccess flag. * ::cudaDeviceCanAccessPeer can determine if a mapping is possible. * * Contexts that may open ::cudaIpcMemHandles are restricted in the following way. * ::cudaIpcMemHandles from each device in a given process may only be opened * by one context per device per other process. * * Memory returned from ::cudaIpcOpenMemHandle must be freed with * ::cudaIpcCloseMemHandle. * * Calling ::cudaFree on an exported memory region before calling * ::cudaIpcCloseMemHandle in the importing context will result in undefined * behavior. * * IPC functionality is restricted to devices with support for unified * addressing on Linux operating systems. * * @param devPtr - Returned device pointer * @param handle - ::cudaIpcMemHandle to open * @param flags - Flags for this operation. Must be specified as ::cudaIpcMemLazyEnablePeerAccess * * @return * ::cudaSuccess, * ::cudaErrorMapBufferObjectFailed, * ::cudaErrorInvalidResourceHandle, * ::cudaErrorTooManyPeers * * \note No guarantees are made about the address returned in \p *devPtr. * In particular, multiple processes may not receive the same address for the same \p handle. * * \sa * ::cudaMalloc, * ::cudaFree, * ::cudaIpcGetEventHandle, * ::cudaIpcOpenEventHandle, * ::cudaIpcGetMemHandle, * ::cudaIpcCloseMemHandle, * ::cudaDeviceEnablePeerAccess, * ::cudaDeviceCanAccessPeer, */ public static native @Cast("cudaError_t") int cudaIpcOpenMemHandle(@Cast("void**") PointerPointer devPtr, @ByVal cudaIpcMemHandle_t handle, @Cast("unsigned int") int flags); public static native @Cast("cudaError_t") int cudaIpcOpenMemHandle(@Cast("void**") @ByPtrPtr Pointer devPtr, @ByVal cudaIpcMemHandle_t handle, @Cast("unsigned int") int flags); /** * \brief Close memory mapped with cudaIpcOpenMemHandle * * Unmaps memory returnd by ::cudaIpcOpenMemHandle. The original allocation * in the exporting process as well as imported mappings in other processes * will be unaffected. * * Any resources used to enable peer access will be freed if this is the * last mapping using them. * * IPC functionality is restricted to devices with support for unified * addressing on Linux operating systems. 
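 *
 * A hedged sketch of the exporting side of the handle round trip (it assumes the
 * cudaMalloc binding declared elsewhere in this generated class; the importing process
 * would call ::cudaIpcOpenMemHandle with the received handle and ::cudaIpcCloseMemHandle
 * once it is done with the mapping):
 *
 * {@code
 *     Pointer devPtr = new Pointer();
 *     cudaMalloc(devPtr, 1 << 20);                     // allocation to be shared
 *     cudaIpcMemHandle_t handle = new cudaIpcMemHandle_t();
 *     cudaIpcGetMemHandle(handle, devPtr);             // opaque handle to pass to the other process
 * }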
* * @param devPtr - Device pointer returned by ::cudaIpcOpenMemHandle * * @return * ::cudaSuccess, * ::cudaErrorMapBufferObjectFailed, * ::cudaErrorInvalidResourceHandle, * * \sa * ::cudaMalloc, * ::cudaFree, * ::cudaIpcGetEventHandle, * ::cudaIpcOpenEventHandle, * ::cudaIpcGetMemHandle, * ::cudaIpcOpenMemHandle, */ public static native @Cast("cudaError_t") int cudaIpcCloseMemHandle(Pointer devPtr); /** \} */ /* END CUDART_DEVICE */ /** * \defgroup CUDART_THREAD_DEPRECATED Thread Management [DEPRECATED] * * ___MANBRIEF___ deprecated thread management functions of the CUDA runtime * API (___CURRENT_FILE___) ___ENDMANBRIEF___ * * This section describes deprecated thread management functions of the CUDA runtime * application programming interface. * * \{ */ /** * \brief Exit and clean up from CUDA launches * * @deprecated * * Note that this function is deprecated because its name does not * reflect its behavior. Its functionality is identical to the * non-deprecated function ::cudaDeviceReset(), which should be used * instead. * * Explicitly destroys all cleans up all resources associated with the current * device in the current process. Any subsequent API call to this device will * reinitialize the device. * * Note that this function will reset the device immediately. It is the caller's * responsibility to ensure that the device is not being accessed by any * other host threads from the process when this function is called. * * @return * ::cudaSuccess * \notefnerr * * \sa ::cudaDeviceReset */ public static native @Cast("cudaError_t") int cudaThreadExit(); /** * \brief Wait for compute device to finish * * @deprecated * * Note that this function is deprecated because its name does not * reflect its behavior. Its functionality is similar to the * non-deprecated function ::cudaDeviceSynchronize(), which should be used * instead. * * Blocks until the device has completed all preceding requested tasks. * ::cudaThreadSynchronize() returns an error if one of the preceding tasks * has failed. If the ::cudaDeviceScheduleBlockingSync flag was set for * this device, the host thread will block until the device has finished * its work. * * @return * ::cudaSuccess * \notefnerr * * \sa ::cudaDeviceSynchronize */ public static native @Cast("cudaError_t") int cudaThreadSynchronize(); /** * \brief Set resource limits * * @deprecated * * Note that this function is deprecated because its name does not * reflect its behavior. Its functionality is identical to the * non-deprecated function ::cudaDeviceSetLimit(), which should be used * instead. * * Setting \p limit to \p value is a request by the application to update * the current limit maintained by the device. The driver is free to * modify the requested value to meet h/w requirements (this could be * clamping to minimum or maximum values, rounding up to nearest element * size, etc). The application can use ::cudaThreadGetLimit() to find out * exactly what the limit has been set to. * * Setting each ::cudaLimit has its own specific restrictions, so each is * discussed here. * * - ::cudaLimitStackSize controls the stack size of each GPU thread. * * - ::cudaLimitPrintfFifoSize controls the size of the shared FIFO * used by the ::printf() and ::fprintf() device system calls. * Setting ::cudaLimitPrintfFifoSize must be performed before * launching any kernel that uses the ::printf() or ::fprintf() device * system calls, otherwise ::cudaErrorInvalidValue will be returned. 
* * - ::cudaLimitMallocHeapSize controls the size of the heap used * by the ::malloc() and ::free() device system calls. Setting * ::cudaLimitMallocHeapSize must be performed before launching * any kernel that uses the ::malloc() or ::free() device system calls, * otherwise ::cudaErrorInvalidValue will be returned. * * @param limit - Limit to set * @param value - Size in bytes of limit * * @return * ::cudaSuccess, * ::cudaErrorUnsupportedLimit, * ::cudaErrorInvalidValue * \notefnerr * * \sa ::cudaDeviceSetLimit */ public static native @Cast("cudaError_t") int cudaThreadSetLimit(@Cast("cudaLimit") int limit, @Cast("size_t") long value); /** * \brief Returns resource limits * * @deprecated * * Note that this function is deprecated because its name does not * reflect its behavior. Its functionality is identical to the * non-deprecated function ::cudaDeviceGetLimit(), which should be used * instead. * * Returns in \p *pValue the current size of \p limit. The supported * ::cudaLimit values are: * - ::cudaLimitStackSize: stack size of each GPU thread; * - ::cudaLimitPrintfFifoSize: size of the shared FIFO used by the * ::printf() and ::fprintf() device system calls. * - ::cudaLimitMallocHeapSize: size of the heap used by the * ::malloc() and ::free() device system calls; * * @param limit - Limit to query * @param pValue - Returned size in bytes of limit * * @return * ::cudaSuccess, * ::cudaErrorUnsupportedLimit, * ::cudaErrorInvalidValue * \notefnerr * * \sa ::cudaDeviceGetLimit */ public static native @Cast("cudaError_t") int cudaThreadGetLimit(@Cast("size_t*") SizeTPointer pValue, @Cast("cudaLimit") int limit); /** * \brief Returns the preferred cache configuration for the current device. * * @deprecated * * Note that this function is deprecated because its name does not * reflect its behavior. Its functionality is identical to the * non-deprecated function ::cudaDeviceGetCacheConfig(), which should be * used instead. * * On devices where the L1 cache and shared memory use the same hardware * resources, this returns through \p pCacheConfig the preferred cache * configuration for the current device. This is only a preference. The * runtime will use the requested configuration if possible, but it is free to * choose a different configuration if required to execute functions. * * This will return a \p pCacheConfig of ::cudaFuncCachePreferNone on devices * where the size of the L1 cache and shared memory are fixed. * * The supported cache configurations are: * - ::cudaFuncCachePreferNone: no preference for shared memory or L1 (default) * - ::cudaFuncCachePreferShared: prefer larger shared memory and smaller L1 cache * - ::cudaFuncCachePreferL1: prefer larger L1 cache and smaller shared memory * * @param pCacheConfig - Returned cache configuration * * @return * ::cudaSuccess, * ::cudaErrorInitializationError * \notefnerr * * \sa cudaDeviceGetCacheConfig */ public static native @Cast("cudaError_t") int cudaThreadGetCacheConfig(@Cast("cudaFuncCache*") IntPointer pCacheConfig); public static native @Cast("cudaError_t") int cudaThreadGetCacheConfig(@Cast("cudaFuncCache*") IntBuffer pCacheConfig); public static native @Cast("cudaError_t") int cudaThreadGetCacheConfig(@Cast("cudaFuncCache*") int[] pCacheConfig); /** * \brief Sets the preferred cache configuration for the current device. * * @deprecated * * Note that this function is deprecated because its name does not * reflect its behavior. 
Its functionality is identical to the * non-deprecated function ::cudaDeviceSetCacheConfig(), which should be * used instead. * * On devices where the L1 cache and shared memory use the same hardware * resources, this sets through \p cacheConfig the preferred cache * configuration for the current device. This is only a preference. The * runtime will use the requested configuration if possible, but it is free to * choose a different configuration if required to execute the function. Any * function preference set via * \ref ::cudaFuncSetCacheConfig(const void*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C API)" * or * \ref ::cudaFuncSetCacheConfig(T*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C++ API)" * will be preferred over this device-wide setting. Setting the device-wide * cache configuration to ::cudaFuncCachePreferNone will cause subsequent * kernel launches to prefer to not change the cache configuration unless * required to launch the kernel. * * This setting does nothing on devices where the size of the L1 cache and * shared memory are fixed. * * Launching a kernel with a different preference than the most recent * preference setting may insert a device-side synchronization point. * * The supported cache configurations are: * - ::cudaFuncCachePreferNone: no preference for shared memory or L1 (default) * - ::cudaFuncCachePreferShared: prefer larger shared memory and smaller L1 cache * - ::cudaFuncCachePreferL1: prefer larger L1 cache and smaller shared memory * * @param cacheConfig - Requested cache configuration * * @return * ::cudaSuccess, * ::cudaErrorInitializationError * \notefnerr * * \sa ::cudaDeviceSetCacheConfig */ public static native @Cast("cudaError_t") int cudaThreadSetCacheConfig(@Cast("cudaFuncCache") int cacheConfig); /** \} */ /* END CUDART_THREAD_DEPRECATED */ /** * \defgroup CUDART_ERROR Error Handling * * ___MANBRIEF___ error handling functions of the CUDA runtime API * (___CURRENT_FILE___) ___ENDMANBRIEF___ * * This section describes the error handling functions of the CUDA runtime * application programming interface. * * \{ */ /** * \brief Returns the last error from a runtime call * * Returns the last error that has been produced by any of the runtime calls * in the same host thread and resets it to ::cudaSuccess. * * @return * ::cudaSuccess, * ::cudaErrorMissingConfiguration, * ::cudaErrorMemoryAllocation, * ::cudaErrorInitializationError, * ::cudaErrorLaunchFailure, * ::cudaErrorLaunchTimeout, * ::cudaErrorLaunchOutOfResources, * ::cudaErrorInvalidDeviceFunction, * ::cudaErrorInvalidConfiguration, * ::cudaErrorInvalidDevice, * ::cudaErrorInvalidValue, * ::cudaErrorInvalidPitchValue, * ::cudaErrorInvalidSymbol, * ::cudaErrorUnmapBufferObjectFailed, * ::cudaErrorInvalidHostPointer, * ::cudaErrorInvalidDevicePointer, * ::cudaErrorInvalidTexture, * ::cudaErrorInvalidTextureBinding, * ::cudaErrorInvalidChannelDescriptor, * ::cudaErrorInvalidMemcpyDirection, * ::cudaErrorInvalidFilterSetting, * ::cudaErrorInvalidNormSetting, * ::cudaErrorUnknown, * ::cudaErrorInvalidResourceHandle, * ::cudaErrorInsufficientDriver, * ::cudaErrorSetOnActiveProcess, * ::cudaErrorStartupFailure, * \notefnerr * * \sa ::cudaPeekAtLastError, ::cudaGetErrorName, ::cudaGetErrorString, ::cudaError */ public static native @Cast("cudaError_t") int cudaGetLastError(); /** * \brief Returns the last error from a runtime call * * Returns the last error that has been produced by any of the runtime calls * in the same host thread. 
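 *
 * A typical check-after-call pattern with these bindings might look like the following
 * sketch (the cudaSuccess constant is defined elsewhere in this generated class; the
 * error-handling policy itself is up to the caller):
 *
 * {@code
 *     int err = cudaPeekAtLastError();     // inspect without clearing the sticky error
 *     if (err != cudaSuccess) {
 *         System.err.println(cudaGetErrorName(err).getString() + ": "
 *                 + cudaGetErrorString(err).getString());
 *         cudaGetLastError();              // clear it once it has been reported
 *     }
 * }
 *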
Note that this call does not reset the error to * ::cudaSuccess like ::cudaGetLastError(). * * @return * ::cudaSuccess, * ::cudaErrorMissingConfiguration, * ::cudaErrorMemoryAllocation, * ::cudaErrorInitializationError, * ::cudaErrorLaunchFailure, * ::cudaErrorLaunchTimeout, * ::cudaErrorLaunchOutOfResources, * ::cudaErrorInvalidDeviceFunction, * ::cudaErrorInvalidConfiguration, * ::cudaErrorInvalidDevice, * ::cudaErrorInvalidValue, * ::cudaErrorInvalidPitchValue, * ::cudaErrorInvalidSymbol, * ::cudaErrorUnmapBufferObjectFailed, * ::cudaErrorInvalidHostPointer, * ::cudaErrorInvalidDevicePointer, * ::cudaErrorInvalidTexture, * ::cudaErrorInvalidTextureBinding, * ::cudaErrorInvalidChannelDescriptor, * ::cudaErrorInvalidMemcpyDirection, * ::cudaErrorInvalidFilterSetting, * ::cudaErrorInvalidNormSetting, * ::cudaErrorUnknown, * ::cudaErrorInvalidResourceHandle, * ::cudaErrorInsufficientDriver, * ::cudaErrorSetOnActiveProcess, * ::cudaErrorStartupFailure, * \notefnerr * * \sa ::cudaGetLastError, ::cudaGetErrorName, ::cudaGetErrorString, ::cudaError */ public static native @Cast("cudaError_t") int cudaPeekAtLastError(); /** * \brief Returns the string representation of an error code enum name * * Returns a string containing the name of an error code in the enum. If the error * code is not recognized, "unrecognized error code" is returned. * * @param error - Error code to convert to string * * @return * \p char* pointer to a NULL-terminated string * * \sa ::cudaGetErrorString, ::cudaGetLastError, ::cudaPeekAtLastError, ::cudaError */ public static native @Cast("const char*") BytePointer cudaGetErrorName(@Cast("cudaError_t") int error); /** * \brief Returns the description string for an error code * * Returns the description string for an error code. If the error * code is not recognized, "unrecognized error code" is returned. * * @param error - Error code to convert to string * * @return * \p char* pointer to a NULL-terminated string * * \sa ::cudaGetErrorName, ::cudaGetLastError, ::cudaPeekAtLastError, ::cudaError */ public static native @Cast("const char*") BytePointer cudaGetErrorString(@Cast("cudaError_t") int error); /** \} */ /* END CUDART_ERROR */ /** * \addtogroup CUDART_DEVICE * * \{ */ /** * \brief Returns the number of compute-capable devices * * Returns in \p *count the number of devices with compute capability greater * or equal to 2.0 that are available for execution. If there is no such * device then ::cudaGetDeviceCount() will return ::cudaErrorNoDevice. * If no driver can be loaded to determine if any such devices exist then * ::cudaGetDeviceCount() will return ::cudaErrorInsufficientDriver. * * @param count - Returns the number of devices with compute capability * greater or equal to 2.0 * * @return * ::cudaSuccess, * ::cudaErrorNoDevice, * ::cudaErrorInsufficientDriver * \notefnerr * * \sa ::cudaGetDevice, ::cudaSetDevice, ::cudaGetDeviceProperties, * ::cudaChooseDevice */ public static native @Cast("cudaError_t") int cudaGetDeviceCount(IntPointer count); public static native @Cast("cudaError_t") int cudaGetDeviceCount(IntBuffer count); public static native @Cast("cudaError_t") int cudaGetDeviceCount(int[] count); /** * \brief Returns information about the compute-device * * Returns in \p *prop the properties of device \p dev. The ::cudaDeviceProp * structure is defined as: *

{@code
    struct cudaDeviceProp {
        char name[256];
        size_t totalGlobalMem;
        size_t sharedMemPerBlock;
        int regsPerBlock;
        int warpSize;
        size_t memPitch;
        int maxThreadsPerBlock;
        int maxThreadsDim[3];
        int maxGridSize[3];
        int clockRate;
        size_t totalConstMem;
        int major;
        int minor;
        size_t textureAlignment;
        size_t texturePitchAlignment;
        int deviceOverlap;
        int multiProcessorCount;
        int kernelExecTimeoutEnabled;
        int integrated;
        int canMapHostMemory;
        int computeMode;
        int maxTexture1D;
        int maxTexture1DMipmap;
        int maxTexture1DLinear;
        int maxTexture2D[2];
        int maxTexture2DMipmap[2];
        int maxTexture2DLinear[3];
        int maxTexture2DGather[2];
        int maxTexture3D[3];
        int maxTexture3DAlt[3];
        int maxTextureCubemap;
        int maxTexture1DLayered[2];
        int maxTexture2DLayered[3];
        int maxTextureCubemapLayered[2];
        int maxSurface1D;
        int maxSurface2D[2];
        int maxSurface3D[3];
        int maxSurface1DLayered[2];
        int maxSurface2DLayered[3];
        int maxSurfaceCubemap;
        int maxSurfaceCubemapLayered[2];
        size_t surfaceAlignment;
        int concurrentKernels;
        int ECCEnabled;
        int pciBusID;
        int pciDeviceID;
        int pciDomainID;
        int tccDriver;
        int asyncEngineCount;
        int unifiedAddressing;
        int memoryClockRate;
        int memoryBusWidth;
        int l2CacheSize;
        int maxThreadsPerMultiProcessor;
        int streamPrioritiesSupported;
        int globalL1CacheSupported;
        int localL1CacheSupported;
        size_t sharedMemPerMultiprocessor;
        int regsPerMultiprocessor;
        int managedMemory;
        int isMultiGpuBoard;
        int multiGpuBoardGroupID;
        int singleToDoublePrecisionPerfRatio;
        int pageableMemoryAccess;
        int concurrentManagedAccess;
    }
 }
* where: * - \ref ::cudaDeviceProp::name "name[256]" is an ASCII string identifying * the device; * - \ref ::cudaDeviceProp::totalGlobalMem "totalGlobalMem" is the total * amount of global memory available on the device in bytes; * - \ref ::cudaDeviceProp::sharedMemPerBlock "sharedMemPerBlock" is the * maximum amount of shared memory available to a thread block in bytes; * - \ref ::cudaDeviceProp::regsPerBlock "regsPerBlock" is the maximum number * of 32-bit registers available to a thread block; * - \ref ::cudaDeviceProp::warpSize "warpSize" is the warp size in threads; * - \ref ::cudaDeviceProp::memPitch "memPitch" is the maximum pitch in * bytes allowed by the memory copy functions that involve memory regions * allocated through ::cudaMallocPitch(); * - \ref ::cudaDeviceProp::maxThreadsPerBlock "maxThreadsPerBlock" is the * maximum number of threads per block; * - \ref ::cudaDeviceProp::maxThreadsDim "maxThreadsDim[3]" contains the * maximum size of each dimension of a block; * - \ref ::cudaDeviceProp::maxGridSize "maxGridSize[3]" contains the * maximum size of each dimension of a grid; * - \ref ::cudaDeviceProp::clockRate "clockRate" is the clock frequency in * kilohertz; * - \ref ::cudaDeviceProp::totalConstMem "totalConstMem" is the total amount * of constant memory available on the device in bytes; * - \ref ::cudaDeviceProp::major "major", * \ref ::cudaDeviceProp::minor "minor" are the major and minor revision * numbers defining the device's compute capability; * - \ref ::cudaDeviceProp::textureAlignment "textureAlignment" is the * alignment requirement; texture base addresses that are aligned to * \ref ::cudaDeviceProp::textureAlignment "textureAlignment" bytes do not * need an offset applied to texture fetches; * - \ref ::cudaDeviceProp::texturePitchAlignment "texturePitchAlignment" is the * pitch alignment requirement for 2D texture references that are bound to * pitched memory; * - \ref ::cudaDeviceProp::deviceOverlap "deviceOverlap" is 1 if the device * can concurrently copy memory between host and device while executing a * kernel, or 0 if not. Deprecated, use instead asyncEngineCount. * - \ref ::cudaDeviceProp::multiProcessorCount "multiProcessorCount" is the * number of multiprocessors on the device; * - \ref ::cudaDeviceProp::kernelExecTimeoutEnabled "kernelExecTimeoutEnabled" * is 1 if there is a run time limit for kernels executed on the device, or * 0 if not. * - \ref ::cudaDeviceProp::integrated "integrated" is 1 if the device is an * integrated (motherboard) GPU and 0 if it is a discrete (card) component. * - \ref ::cudaDeviceProp::canMapHostMemory "canMapHostMemory" is 1 if the * device can map host memory into the CUDA address space for use with * ::cudaHostAlloc()/::cudaHostGetDevicePointer(), or 0 if not; * - \ref ::cudaDeviceProp::computeMode "computeMode" is the compute mode * that the device is currently in. Available modes are as follows: * - cudaComputeModeDefault: Default mode - Device is not restricted and * multiple threads can use ::cudaSetDevice() with this device. * - cudaComputeModeExclusive: Compute-exclusive mode - Only one thread will * be able to use ::cudaSetDevice() with this device. * - cudaComputeModeProhibited: Compute-prohibited mode - No threads can use * ::cudaSetDevice() with this device. * - cudaComputeModeExclusiveProcess: Compute-exclusive-process mode - Many * threads in one process will be able to use ::cudaSetDevice() with this device. *
If ::cudaSetDevice() is called on an already occupied \p device with * computeMode ::cudaComputeModeExclusive, ::cudaErrorDeviceAlreadyInUse * will be immediately returned indicating the device cannot be used. * When an occupied exclusive mode device is chosen with ::cudaSetDevice, * all subsequent non-device management runtime functions will return * ::cudaErrorDevicesUnavailable. * - \ref ::cudaDeviceProp::maxTexture1D "maxTexture1D" is the maximum 1D * texture size. * - \ref ::cudaDeviceProp::maxTexture1DMipmap "maxTexture1DMipmap" is the maximum * 1D mipmapped texture texture size. * - \ref ::cudaDeviceProp::maxTexture1DLinear "maxTexture1DLinear" is the maximum * 1D texture size for textures bound to linear memory. * - \ref ::cudaDeviceProp::maxTexture2D "maxTexture2D[2]" contains the maximum * 2D texture dimensions. * - \ref ::cudaDeviceProp::maxTexture2DMipmap "maxTexture2DMipmap[2]" contains the * maximum 2D mipmapped texture dimensions. * - \ref ::cudaDeviceProp::maxTexture2DLinear "maxTexture2DLinear[3]" contains the * maximum 2D texture dimensions for 2D textures bound to pitch linear memory. * - \ref ::cudaDeviceProp::maxTexture2DGather "maxTexture2DGather[2]" contains the * maximum 2D texture dimensions if texture gather operations have to be performed. * - \ref ::cudaDeviceProp::maxTexture3D "maxTexture3D[3]" contains the maximum * 3D texture dimensions. * - \ref ::cudaDeviceProp::maxTexture3DAlt "maxTexture3DAlt[3]" * contains the maximum alternate 3D texture dimensions. * - \ref ::cudaDeviceProp::maxTextureCubemap "maxTextureCubemap" is the * maximum cubemap texture width or height. * - \ref ::cudaDeviceProp::maxTexture1DLayered "maxTexture1DLayered[2]" contains * the maximum 1D layered texture dimensions. * - \ref ::cudaDeviceProp::maxTexture2DLayered "maxTexture2DLayered[3]" contains * the maximum 2D layered texture dimensions. * - \ref ::cudaDeviceProp::maxTextureCubemapLayered "maxTextureCubemapLayered[2]" * contains the maximum cubemap layered texture dimensions. * - \ref ::cudaDeviceProp::maxSurface1D "maxSurface1D" is the maximum 1D * surface size. * - \ref ::cudaDeviceProp::maxSurface2D "maxSurface2D[2]" contains the maximum * 2D surface dimensions. * - \ref ::cudaDeviceProp::maxSurface3D "maxSurface3D[3]" contains the maximum * 3D surface dimensions. * - \ref ::cudaDeviceProp::maxSurface1DLayered "maxSurface1DLayered[2]" contains * the maximum 1D layered surface dimensions. * - \ref ::cudaDeviceProp::maxSurface2DLayered "maxSurface2DLayered[3]" contains * the maximum 2D layered surface dimensions. * - \ref ::cudaDeviceProp::maxSurfaceCubemap "maxSurfaceCubemap" is the maximum * cubemap surface width or height. * - \ref ::cudaDeviceProp::maxSurfaceCubemapLayered "maxSurfaceCubemapLayered[2]" * contains the maximum cubemap layered surface dimensions. * - \ref ::cudaDeviceProp::surfaceAlignment "surfaceAlignment" specifies the * alignment requirements for surfaces. * - \ref ::cudaDeviceProp::concurrentKernels "concurrentKernels" is 1 if the * device supports executing multiple kernels within the same context * simultaneously, or 0 if not. It is not guaranteed that multiple kernels * will be resident on the device concurrently so this feature should not be * relied upon for correctness; * - \ref ::cudaDeviceProp::ECCEnabled "ECCEnabled" is 1 if the device has ECC * support turned on, or 0 if not. * - \ref ::cudaDeviceProp::pciBusID "pciBusID" is the PCI bus identifier of * the device. 
* - \ref ::cudaDeviceProp::pciDeviceID "pciDeviceID" is the PCI device * (sometimes called slot) identifier of the device. * - \ref ::cudaDeviceProp::pciDomainID "pciDomainID" is the PCI domain identifier * of the device. * - \ref ::cudaDeviceProp::tccDriver "tccDriver" is 1 if the device is using a * TCC driver or 0 if not. * - \ref ::cudaDeviceProp::asyncEngineCount "asyncEngineCount" is 1 when the * device can concurrently copy memory between host and device while executing * a kernel. It is 2 when the device can concurrently copy memory between host * and device in both directions and execute a kernel at the same time. It is * 0 if neither of these is supported. * - \ref ::cudaDeviceProp::unifiedAddressing "unifiedAddressing" is 1 if the device * shares a unified address space with the host and 0 otherwise. * - \ref ::cudaDeviceProp::memoryClockRate "memoryClockRate" is the peak memory * clock frequency in kilohertz. * - \ref ::cudaDeviceProp::memoryBusWidth "memoryBusWidth" is the memory bus width * in bits. * - \ref ::cudaDeviceProp::l2CacheSize "l2CacheSize" is L2 cache size in bytes. * - \ref ::cudaDeviceProp::maxThreadsPerMultiProcessor "maxThreadsPerMultiProcessor" * is the number of maximum resident threads per multiprocessor. * - \ref ::cudaDeviceProp::streamPrioritiesSupported "streamPrioritiesSupported" * is 1 if the device supports stream priorities, or 0 if it is not supported. * - \ref ::cudaDeviceProp::globalL1CacheSupported "globalL1CacheSupported" * is 1 if the device supports caching of globals in L1 cache, or 0 if it is not supported. * - \ref ::cudaDeviceProp::localL1CacheSupported "localL1CacheSupported" * is 1 if the device supports caching of locals in L1 cache, or 0 if it is not supported. * - \ref ::cudaDeviceProp::sharedMemPerMultiprocessor "sharedMemPerMultiprocessor" is the * maximum amount of shared memory available to a multiprocessor in bytes; this amount is * shared by all thread blocks simultaneously resident on a multiprocessor; * - \ref ::cudaDeviceProp::regsPerMultiprocessor "regsPerMultiprocessor" is the maximum number * of 32-bit registers available to a multiprocessor; this number is shared * by all thread blocks simultaneously resident on a multiprocessor; * - \ref ::cudaDeviceProp::managedMemory "managedMemory" * is 1 if the device supports allocating managed memory on this system, or 0 if it is not supported. * - \ref ::cudaDeviceProp::isMultiGpuBoard "isMultiGpuBoard" * is 1 if the device is on a multi-GPU board (e.g. Gemini cards), and 0 if not; * - \ref ::cudaDeviceProp::multiGpuBoardGroupID "multiGpuBoardGroupID" is a unique identifier * for a group of devices associated with the same board. * Devices on the same multi-GPU board will share the same identifier; * - \ref ::cudaDeviceProp::singleToDoublePrecisionPerfRatio "singleToDoublePrecisionPerfRatio" * is the ratio of single precision performance (in floating-point operations per second) * to double precision performance. * - \ref ::cudaDeviceProp::pageableMemoryAccess "pageableMemoryAccess" is 1 if the device supports * coherently accessing pageable memory without calling cudaHostRegister on it, and 0 otherwise. * - \ref ::cudaDeviceProp::concurrentManagedAccess "concurrentManagedAccess" is 1 if the device can * coherently access managed memory concurrently with the CPU, and 0 otherwise. 
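 *
 * A minimal usage sketch (illustrative only, not part of the generated
 * bindings): it queries device 0 through these JavaCPP bindings and prints a
 * few of the fields listed above, assuming the CUDA runtime and at least one
 * device are available.
 * <pre>
 * import static org.bytedeco.javacpp.cuda.*;
 *
 * public class DeviceInfo {
 *     public static void main(String[] args) {
 *         cudaDeviceProp prop = new cudaDeviceProp();
 *         int err = cudaGetDeviceProperties(prop, 0);        // query device 0
 *         if (err != cudaSuccess) {
 *             System.err.println("cudaGetDeviceProperties failed: " + err);
 *             return;
 *         }
 *         System.out.println("compute capability : " + prop.major() + "." + prop.minor());
 *         System.out.println("global memory (B)  : " + prop.totalGlobalMem());
 *         System.out.println("multiprocessors    : " + prop.multiProcessorCount());
 *     }
 * }
 * </pre>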
* * @param prop - Properties for the specified device * @param device - Device number to get properties for * * @return * ::cudaSuccess, * ::cudaErrorInvalidDevice * * \sa ::cudaGetDeviceCount, ::cudaGetDevice, ::cudaSetDevice, ::cudaChooseDevice, * ::cudaDeviceGetAttribute */ public static native @Cast("cudaError_t") int cudaGetDeviceProperties(cudaDeviceProp prop, int device); /** * \brief Returns information about the device * * Returns in \p *value the integer value of the attribute \p attr on device * \p device. The supported attributes are: * - ::cudaDevAttrMaxThreadsPerBlock: Maximum number of threads per block; * - ::cudaDevAttrMaxBlockDimX: Maximum x-dimension of a block; * - ::cudaDevAttrMaxBlockDimY: Maximum y-dimension of a block; * - ::cudaDevAttrMaxBlockDimZ: Maximum z-dimension of a block; * - ::cudaDevAttrMaxGridDimX: Maximum x-dimension of a grid; * - ::cudaDevAttrMaxGridDimY: Maximum y-dimension of a grid; * - ::cudaDevAttrMaxGridDimZ: Maximum z-dimension of a grid; * - ::cudaDevAttrMaxSharedMemoryPerBlock: Maximum amount of shared memory * available to a thread block in bytes; * - ::cudaDevAttrTotalConstantMemory: Memory available on device for * __constant__ variables in a CUDA C kernel in bytes; * - ::cudaDevAttrWarpSize: Warp size in threads; * - ::cudaDevAttrMaxPitch: Maximum pitch in bytes allowed by the memory copy * functions that involve memory regions allocated through ::cudaMallocPitch(); * - ::cudaDevAttrMaxTexture1DWidth: Maximum 1D texture width; * - ::cudaDevAttrMaxTexture1DLinearWidth: Maximum width for a 1D texture bound * to linear memory; * - ::cudaDevAttrMaxTexture1DMipmappedWidth: Maximum mipmapped 1D texture width; * - ::cudaDevAttrMaxTexture2DWidth: Maximum 2D texture width; * - ::cudaDevAttrMaxTexture2DHeight: Maximum 2D texture height; * - ::cudaDevAttrMaxTexture2DLinearWidth: Maximum width for a 2D texture * bound to linear memory; * - ::cudaDevAttrMaxTexture2DLinearHeight: Maximum height for a 2D texture * bound to linear memory; * - ::cudaDevAttrMaxTexture2DLinearPitch: Maximum pitch in bytes for a 2D * texture bound to linear memory; * - ::cudaDevAttrMaxTexture2DMipmappedWidth: Maximum mipmapped 2D texture * width; * - ::cudaDevAttrMaxTexture2DMipmappedHeight: Maximum mipmapped 2D texture * height; * - ::cudaDevAttrMaxTexture3DWidth: Maximum 3D texture width; * - ::cudaDevAttrMaxTexture3DHeight: Maximum 3D texture height; * - ::cudaDevAttrMaxTexture3DDepth: Maximum 3D texture depth; * - ::cudaDevAttrMaxTexture3DWidthAlt: Alternate maximum 3D texture width, * 0 if no alternate maximum 3D texture size is supported; * - ::cudaDevAttrMaxTexture3DHeightAlt: Alternate maximum 3D texture height, * 0 if no alternate maximum 3D texture size is supported; * - ::cudaDevAttrMaxTexture3DDepthAlt: Alternate maximum 3D texture depth, * 0 if no alternate maximum 3D texture size is supported; * - ::cudaDevAttrMaxTextureCubemapWidth: Maximum cubemap texture width or * height; * - ::cudaDevAttrMaxTexture1DLayeredWidth: Maximum 1D layered texture width; * - ::cudaDevAttrMaxTexture1DLayeredLayers: Maximum layers in a 1D layered * texture; * - ::cudaDevAttrMaxTexture2DLayeredWidth: Maximum 2D layered texture width; * - ::cudaDevAttrMaxTexture2DLayeredHeight: Maximum 2D layered texture height; * - ::cudaDevAttrMaxTexture2DLayeredLayers: Maximum layers in a 2D layered * texture; * - ::cudaDevAttrMaxTextureCubemapLayeredWidth: Maximum cubemap layered * texture width or height; * - ::cudaDevAttrMaxTextureCubemapLayeredLayers: Maximum layers in a cubemap * layered 
texture; * - ::cudaDevAttrMaxSurface1DWidth: Maximum 1D surface width; * - ::cudaDevAttrMaxSurface2DWidth: Maximum 2D surface width; * - ::cudaDevAttrMaxSurface2DHeight: Maximum 2D surface height; * - ::cudaDevAttrMaxSurface3DWidth: Maximum 3D surface width; * - ::cudaDevAttrMaxSurface3DHeight: Maximum 3D surface height; * - ::cudaDevAttrMaxSurface3DDepth: Maximum 3D surface depth; * - ::cudaDevAttrMaxSurface1DLayeredWidth: Maximum 1D layered surface width; * - ::cudaDevAttrMaxSurface1DLayeredLayers: Maximum layers in a 1D layered * surface; * - ::cudaDevAttrMaxSurface2DLayeredWidth: Maximum 2D layered surface width; * - ::cudaDevAttrMaxSurface2DLayeredHeight: Maximum 2D layered surface height; * - ::cudaDevAttrMaxSurface2DLayeredLayers: Maximum layers in a 2D layered * surface; * - ::cudaDevAttrMaxSurfaceCubemapWidth: Maximum cubemap surface width; * - ::cudaDevAttrMaxSurfaceCubemapLayeredWidth: Maximum cubemap layered * surface width; * - ::cudaDevAttrMaxSurfaceCubemapLayeredLayers: Maximum layers in a cubemap * layered surface; * - ::cudaDevAttrMaxRegistersPerBlock: Maximum number of 32-bit registers * available to a thread block; * - ::cudaDevAttrClockRate: Peak clock frequency in kilohertz; * - ::cudaDevAttrTextureAlignment: Alignment requirement; texture base * addresses aligned to ::textureAlign bytes do not need an offset applied * to texture fetches; * - ::cudaDevAttrTexturePitchAlignment: Pitch alignment requirement for 2D * texture references bound to pitched memory; * - ::cudaDevAttrGpuOverlap: 1 if the device can concurrently copy memory * between host and device while executing a kernel, or 0 if not; * - ::cudaDevAttrMultiProcessorCount: Number of multiprocessors on the device; * - ::cudaDevAttrKernelExecTimeout: 1 if there is a run time limit for kernels * executed on the device, or 0 if not; * - ::cudaDevAttrIntegrated: 1 if the device is integrated with the memory * subsystem, or 0 if not; * - ::cudaDevAttrCanMapHostMemory: 1 if the device can map host memory into * the CUDA address space, or 0 if not; * - ::cudaDevAttrComputeMode: Compute mode is the compute mode that the device * is currently in. Available modes are as follows: * - ::cudaComputeModeDefault: Default mode - Device is not restricted and * multiple threads can use ::cudaSetDevice() with this device. * - ::cudaComputeModeExclusive: Compute-exclusive mode - Only one thread will * be able to use ::cudaSetDevice() with this device. * - ::cudaComputeModeProhibited: Compute-prohibited mode - No threads can use * ::cudaSetDevice() with this device. * - ::cudaComputeModeExclusiveProcess: Compute-exclusive-process mode - Many * threads in one process will be able to use ::cudaSetDevice() with this * device. * - ::cudaDevAttrConcurrentKernels: 1 if the device supports executing * multiple kernels within the same context simultaneously, or 0 if * not. It is not guaranteed that multiple kernels will be resident on the * device concurrently so this feature should not be relied upon for * correctness; * - ::cudaDevAttrEccEnabled: 1 if error correction is enabled on the device, * 0 if error correction is disabled or not supported by the device; * - ::cudaDevAttrPciBusId: PCI bus identifier of the device; * - ::cudaDevAttrPciDeviceId: PCI device (also known as slot) identifier of * the device; * - ::cudaDevAttrTccDriver: 1 if the device is using a TCC driver. 
TCC is only * available on Tesla hardware running Windows Vista or later; * - ::cudaDevAttrMemoryClockRate: Peak memory clock frequency in kilohertz; * - ::cudaDevAttrGlobalMemoryBusWidth: Global memory bus width in bits; * - ::cudaDevAttrL2CacheSize: Size of L2 cache in bytes. 0 if the device * doesn't have L2 cache; * - ::cudaDevAttrMaxThreadsPerMultiProcessor: Maximum resident threads per * multiprocessor; * - ::cudaDevAttrUnifiedAddressing: 1 if the device shares a unified address * space with the host, or 0 if not; * - ::cudaDevAttrComputeCapabilityMajor: Major compute capability version * number; * - ::cudaDevAttrComputeCapabilityMinor: Minor compute capability version * number; * - ::cudaDevAttrStreamPrioritiesSupported: 1 if the device supports stream * priorities, or 0 if not; * - ::cudaDevAttrGlobalL1CacheSupported: 1 if device supports caching globals * in L1 cache, 0 if not; * - ::cudaDevAttrLocalL1CacheSupported: 1 if device supports caching locals * in L1 cache, 0 if not; * - ::cudaDevAttrMaxSharedMemoryPerMultiprocessor: Maximum amount of shared memory * available to a multiprocessor in bytes; this amount is shared by all * thread blocks simultaneously resident on a multiprocessor; * - ::cudaDevAttrMaxRegistersPerMultiprocessor: Maximum number of 32-bit registers * available to a multiprocessor; this number is shared by all thread blocks * simultaneously resident on a multiprocessor; * - ::cudaDevAttrManagedMemSupported: 1 if device supports allocating * managed memory, 0 if not; * - ::cudaDevAttrIsMultiGpuBoard: 1 if device is on a multi-GPU board, 0 if not; * - ::cudaDevAttrMultiGpuBoardGroupID: Unique identifier for a group of devices on the * same multi-GPU board; * - ::cudaDevAttrHostNativeAtomicSupported: 1 if the link between the device and the * host supports native atomic operations; * - ::cudaDevAttrSingleToDoublePrecisionPerfRatio: Ratio of single precision performance * (in floating-point operations per second) to double precision performance; * - ::cudaDevAttrPageableMemoryAccess: 1 if the device supports coherently accessing * pageable memory without calling cudaHostRegister on it, and 0 otherwise. * - ::cudaDevAttrConcurrentManagedAccess: 1 if the device can coherently access managed * memory concurrently with the CPU, and 0 otherwise. * - ::cudaDevAttrComputePreemptionSupported: 1 if the device supports * Compute Preemption, 0 if not. * - ::cudaDevAttrCanUseHostPointerForRegisteredMem: 1 if the device can access host * registered memory at the same virtual address as the CPU, and 0 otherwise. * * @param value - Returned device attribute value * @param attr - Device attribute to query * @param device - Device number to query * * @return * ::cudaSuccess, * ::cudaErrorInvalidDevice, * ::cudaErrorInvalidValue * \notefnerr * * \sa ::cudaGetDeviceCount, ::cudaGetDevice, ::cudaSetDevice, ::cudaChooseDevice, * ::cudaGetDeviceProperties */ public static native @Cast("cudaError_t") int cudaDeviceGetAttribute(IntPointer value, @Cast("cudaDeviceAttr") int attr, int device); public static native @Cast("cudaError_t") int cudaDeviceGetAttribute(IntBuffer value, @Cast("cudaDeviceAttr") int attr, int device); public static native @Cast("cudaError_t") int cudaDeviceGetAttribute(int[] value, @Cast("cudaDeviceAttr") int attr, int device); /** * \brief Queries attributes of the link between two devices. * * Returns in \p *value the value of the requested attribute \p attrib of the * link between \p srcDevice and \p dstDevice.
The supported attributes are: * - ::CudaDevP2PAttrPerformanceRank: A relative value indicating the * performance of the link between two devices. Lower value means better * performance (0 being the value used for most performant link). * - ::CudaDevP2PAttrAccessSupported: 1 if peer access is enabled. * - ::CudaDevP2PAttrNativeAtomicSupported: 1 if native atomic operations over * the link are supported. * * Returns ::cudaErrorInvalidDevice if \p srcDevice or \p dstDevice are not valid * or if they represent the same device. * * Returns ::cudaErrorInvalidValue if \p attrib is not valid or if \p value is * a null pointer. * * @param value - Returned value of the requested attribute * @param attrib - The requested attribute of the link between \p srcDevice and \p dstDevice. * @param srcDevice - The source device of the target link. * @param dstDevice - The destination device of the target link. * * @return * ::cudaSuccess, * ::cudaErrorInvalidDevice, * ::cudaErrorInvalidValue * \notefnerr * * \sa ::cudaCtxEnablePeerAccess, * ::cudaCtxDisablePeerAccess, * ::cudaCtxCanAccessPeer */ public static native @Cast("cudaError_t") int cudaDeviceGetP2PAttribute(IntPointer value, @Cast("cudaDeviceP2PAttr") int attr, int srcDevice, int dstDevice); public static native @Cast("cudaError_t") int cudaDeviceGetP2PAttribute(IntBuffer value, @Cast("cudaDeviceP2PAttr") int attr, int srcDevice, int dstDevice); public static native @Cast("cudaError_t") int cudaDeviceGetP2PAttribute(int[] value, @Cast("cudaDeviceP2PAttr") int attr, int srcDevice, int dstDevice); /** * \brief Select compute-device which best matches criteria * * Returns in \p *device the device which has properties that best match * \p *prop. * * @param device - Device with best match * @param prop - Desired device properties * * @return * ::cudaSuccess, * ::cudaErrorInvalidValue * \notefnerr * * \sa ::cudaGetDeviceCount, ::cudaGetDevice, ::cudaSetDevice, * ::cudaGetDeviceProperties */ public static native @Cast("cudaError_t") int cudaChooseDevice(IntPointer device, @Const cudaDeviceProp prop); public static native @Cast("cudaError_t") int cudaChooseDevice(IntBuffer device, @Const cudaDeviceProp prop); public static native @Cast("cudaError_t") int cudaChooseDevice(int[] device, @Const cudaDeviceProp prop); /** * \brief Set device to be used for GPU executions * * Sets \p device as the current device for the calling host thread. * Valid device id's are 0 to (::cudaGetDeviceCount() - 1). * * Any device memory subsequently allocated from this host thread * using ::cudaMalloc(), ::cudaMallocPitch() or ::cudaMallocArray() * will be physically resident on \p device. Any host memory allocated * from this host thread using ::cudaMallocHost() or ::cudaHostAlloc() * or ::cudaHostRegister() will have its lifetime associated with * \p device. Any streams or events created from this host thread will * be associated with \p device. Any kernels launched from this host * thread using the <<<>>> operator or ::cudaLaunchKernel() will be executed * on \p device. * * This call may be made from any host thread, to any device, and at * any time. This function will do no synchronization with the previous * or new device, and should be considered a very low overhead call. * * @param device - Device on which the active host thread should execute the * device code. 
* * @return * ::cudaSuccess, * ::cudaErrorInvalidDevice, * ::cudaErrorDeviceAlreadyInUse * \notefnerr * * \sa ::cudaGetDeviceCount, ::cudaGetDevice, ::cudaGetDeviceProperties, * ::cudaChooseDevice */ public static native @Cast("cudaError_t") int cudaSetDevice(int device); /** * \brief Returns which device is currently being used * * Returns in \p *device the current device for the calling host thread. * * @param device - Returns the device on which the active host thread * executes the device code. * * @return * ::cudaSuccess * \notefnerr * * \sa ::cudaGetDeviceCount, ::cudaSetDevice, ::cudaGetDeviceProperties, * ::cudaChooseDevice */ public static native @Cast("cudaError_t") int cudaGetDevice(IntPointer device); public static native @Cast("cudaError_t") int cudaGetDevice(IntBuffer device); public static native @Cast("cudaError_t") int cudaGetDevice(int[] device); /** * \brief Set a list of devices that can be used for CUDA * * Sets a list of devices for CUDA execution in priority order using * \p device_arr. The parameter \p len specifies the number of elements in the * list. CUDA will try devices from the list sequentially until it finds one * that works. If this function is not called, or if it is called with a \p len * of 0, then CUDA will go back to its default behavior of trying devices * sequentially from a default list containing all of the available CUDA * devices in the system. If a specified device ID in the list does not exist, * this function will return ::cudaErrorInvalidDevice. If \p len is not 0 and * \p device_arr is NULL or if \p len exceeds the number of devices in * the system, then ::cudaErrorInvalidValue is returned. * * @param device_arr - List of devices to try * @param len - Number of devices in specified list * * @return * ::cudaSuccess, * ::cudaErrorInvalidValue, * ::cudaErrorInvalidDevice * \notefnerr * * \sa ::cudaGetDeviceCount, ::cudaSetDevice, ::cudaGetDeviceProperties, * ::cudaSetDeviceFlags, * ::cudaChooseDevice */ public static native @Cast("cudaError_t") int cudaSetValidDevices(IntPointer device_arr, int len); public static native @Cast("cudaError_t") int cudaSetValidDevices(IntBuffer device_arr, int len); public static native @Cast("cudaError_t") int cudaSetValidDevices(int[] device_arr, int len); /** * \brief Sets flags to be used for device executions * * Records \p flags as the flags to use when initializing the current * device. If no device has been made current to the calling thread, * then \p flags will be applied to the initialization of any device * initialized by the calling host thread, unless that device has had * its initialization flags set explicitly by this or any host thread. * * If the current device has been set and that device has already been * initialized then this call will fail with the error * ::cudaErrorSetOnActiveProcess. In this case it is necessary * to reset \p device using ::cudaDeviceReset() before the device's * initialization flags may be set. * * The two LSBs of the \p flags parameter can be used to control how the CPU * thread interacts with the OS scheduler when waiting for results from the * device. * * - ::cudaDeviceScheduleAuto: The default value if the \p flags parameter is * zero, uses a heuristic based on the number of active CUDA contexts in the * process \p C and the number of logical processors in the system \p P. If * \p C \> \p P, then CUDA will yield to other OS threads when waiting for the * device, otherwise CUDA will not yield while waiting for results and * actively spin on the processor. 
* - ::cudaDeviceScheduleSpin: Instruct CUDA to actively spin when waiting for * results from the device. This can decrease latency when waiting for the * device, but may lower the performance of CPU threads if they are performing * work in parallel with the CUDA thread. * - ::cudaDeviceScheduleYield: Instruct CUDA to yield its thread when waiting * for results from the device. This can increase latency when waiting for the * device, but can increase the performance of CPU threads performing work in * parallel with the device. * - ::cudaDeviceScheduleBlockingSync: Instruct CUDA to block the CPU thread * on a synchronization primitive when waiting for the device to finish work. * - ::cudaDeviceBlockingSync: Instruct CUDA to block the CPU thread on a * synchronization primitive when waiting for the device to finish work.
* \ref deprecated "Deprecated:" This flag was deprecated as of CUDA 4.0 and * replaced with ::cudaDeviceScheduleBlockingSync. * - ::cudaDeviceMapHost: This flag enables allocating pinned * host memory that is accessible to the device. It is implicit for the * runtime but may be absent if a context is created using the driver API. * If this flag is not set, ::cudaHostGetDevicePointer() will always return * a failure code. * - ::cudaDeviceLmemResizeToMax: Instruct CUDA to not reduce local memory * after resizing local memory for a kernel. This can prevent thrashing by * local memory allocations when launching many kernels with high local * memory usage at the cost of potentially increased memory usage. * * @param flags - Parameters for device operation * * @return * ::cudaSuccess, * ::cudaErrorInvalidDevice, * ::cudaErrorSetOnActiveProcess * * \sa ::cudaGetDeviceFlags, ::cudaGetDeviceCount, ::cudaGetDevice, ::cudaGetDeviceProperties, * ::cudaSetDevice, ::cudaSetValidDevices, * ::cudaChooseDevice */ public static native @Cast("cudaError_t") int cudaSetDeviceFlags( @Cast("unsigned int") int flags ); /** * \brief Gets the flags for the current device * * Returns in \p flags the flags for the current device. If there is a * current device for the calling thread, and the device has been initialized * or flags have been set on that device specifically, the flags for the * device are returned. If there is no current device, but flags have been * set for the thread with ::cudaSetDeviceFlags, the thread flags are returned. * Finally, if there is no current device and no thread flags, the flags for * the first device are returned, which may be the default flags. Compare * to the behavior of ::cudaSetDeviceFlags. * * Typically, the flags returned should match the behavior that will be seen * if the calling thread uses a device after this call, without any change to * the flags or current device inbetween by this or another thread. Note that * if the device is not initialized, it is possible for another thread to * change the flags for the current device before it is initialized. * Additionally, when using exclusive mode, if this thread has not requested a * specific device, it may use a device other than the first device, contrary * to the assumption made by this function. * * If a context has been created via the driver API and is current to the * calling thread, the flags for that context are always returned. * * Flags returned by this function may specifically include ::cudaDeviceMapHost * even though it is not accepted by ::cudaSetDeviceFlags because it is * implicit in runtime API flags. The reason for this is that the current * context may have been created via the driver API in which case the flag is * not implicit and may be unset. 
* * @param flags - Pointer to store the device flags * * @return * ::cudaSuccess, * ::cudaErrorInvalidDevice * * \sa ::cudaGetDevice, ::cudaGetDeviceProperties, * ::cudaSetDevice, ::cudaSetDeviceFlags */ public static native @Cast("cudaError_t") int cudaGetDeviceFlags( @Cast("unsigned int*") IntPointer flags ); public static native @Cast("cudaError_t") int cudaGetDeviceFlags( @Cast("unsigned int*") IntBuffer flags ); public static native @Cast("cudaError_t") int cudaGetDeviceFlags( @Cast("unsigned int*") int[] flags ); /** \} */ /* END CUDART_DEVICE */ /** * \defgroup CUDART_STREAM Stream Management * * ___MANBRIEF___ stream management functions of the CUDA runtime API * (___CURRENT_FILE___) ___ENDMANBRIEF___ * * This section describes the stream management functions of the CUDA runtime * application programming interface. * * \{ */ /** * \brief Create an asynchronous stream * * Creates a new asynchronous stream. * * @param pStream - Pointer to new stream identifier * * @return * ::cudaSuccess, * ::cudaErrorInvalidValue * \notefnerr * * \sa ::cudaStreamCreateWithPriority, * ::cudaStreamCreateWithFlags, * ::cudaStreamGetPriority, * ::cudaStreamGetFlags, * ::cudaStreamQuery, * ::cudaStreamSynchronize, * ::cudaStreamWaitEvent, * ::cudaStreamAddCallback, * ::cudaStreamDestroy */ public static native @Cast("cudaError_t") int cudaStreamCreate(@ByPtrPtr CUstream_st pStream); /** * \brief Create an asynchronous stream * * Creates a new asynchronous stream. The \p flags argument determines the * behaviors of the stream. Valid values for \p flags are * - ::cudaStreamDefault: Default stream creation flag. * - ::cudaStreamNonBlocking: Specifies that work running in the created * stream may run concurrently with work in stream 0 (the NULL stream), and that * the created stream should perform no implicit synchronization with stream 0. * * @param pStream - Pointer to new stream identifier * @param flags - Parameters for stream creation * * @return * ::cudaSuccess, * ::cudaErrorInvalidValue * \notefnerr * * \sa ::cudaStreamCreate, * ::cudaStreamCreateWithPriority, * ::cudaStreamGetFlags, * ::cudaStreamQuery, * ::cudaStreamSynchronize, * ::cudaStreamWaitEvent, * ::cudaStreamAddCallback, * ::cudaStreamDestroy */ public static native @Cast("cudaError_t") int cudaStreamCreateWithFlags(@ByPtrPtr CUstream_st pStream, @Cast("unsigned int") int flags); /** * \brief Create an asynchronous stream with the specified priority * * Creates a stream with the specified priority and returns a handle in \p pStream. * This API alters the scheduler priority of work in the stream. Work in a higher * priority stream may preempt work already executing in a low priority stream. * * \p priority follows a convention where lower numbers represent higher priorities. * '0' represents default priority. The range of meaningful numerical priorities can * be queried using ::cudaDeviceGetStreamPriorityRange. If the specified priority is * outside the numerical range returned by ::cudaDeviceGetStreamPriorityRange, * it will automatically be clamped to the lowest or the highest number in the range. * * @param pStream - Pointer to new stream identifier * @param flags - Flags for stream creation. See ::cudaStreamCreateWithFlags for a list of valid flags that can be passed * @param priority - Priority of the stream. Lower numbers represent higher priorities. * See ::cudaDeviceGetStreamPriorityRange for more information about * the meaningful stream priorities that can be passed. 
* * @return * ::cudaSuccess, * ::cudaErrorInvalidValue * \notefnerr * * \note Stream priorities are supported only on GPUs * with compute capability 3.5 or higher. * * \note In the current implementation, only compute kernels launched in * priority streams are affected by the stream's priority. Stream priorities have * no effect on host-to-device and device-to-host memory operations. * * \sa ::cudaStreamCreate, * ::cudaStreamCreateWithFlags, * ::cudaDeviceGetStreamPriorityRange, * ::cudaStreamGetPriority, * ::cudaStreamQuery, * ::cudaStreamWaitEvent, * ::cudaStreamAddCallback, * ::cudaStreamSynchronize, * ::cudaStreamDestroy */ public static native @Cast("cudaError_t") int cudaStreamCreateWithPriority(@ByPtrPtr CUstream_st pStream, @Cast("unsigned int") int flags, int priority); /** * \brief Query the priority of a stream * * Query the priority of a stream. The priority is returned in in \p priority. * Note that if the stream was created with a priority outside the meaningful * numerical range returned by ::cudaDeviceGetStreamPriorityRange, * this function returns the clamped priority. * See ::cudaStreamCreateWithPriority for details about priority clamping. * * @param hStream - Handle to the stream to be queried * @param priority - Pointer to a signed integer in which the stream's priority is returned * * @return * ::cudaSuccess, * ::cudaErrorInvalidValue, * ::cudaErrorInvalidResourceHandle * \notefnerr * * \sa ::cudaStreamCreateWithPriority, * ::cudaDeviceGetStreamPriorityRange, * ::cudaStreamGetFlags */ public static native @Cast("cudaError_t") int cudaStreamGetPriority(CUstream_st hStream, IntPointer priority); public static native @Cast("cudaError_t") int cudaStreamGetPriority(CUstream_st hStream, IntBuffer priority); public static native @Cast("cudaError_t") int cudaStreamGetPriority(CUstream_st hStream, int[] priority); /** * \brief Query the flags of a stream * * Query the flags of a stream. The flags are returned in \p flags. * See ::cudaStreamCreateWithFlags for a list of valid flags. * * @param hStream - Handle to the stream to be queried * @param flags - Pointer to an unsigned integer in which the stream's flags are returned * * @return * ::cudaSuccess, * ::cudaErrorInvalidValue, * ::cudaErrorInvalidResourceHandle * \notefnerr * * \sa ::cudaStreamCreateWithPriority, * ::cudaStreamCreateWithFlags, * ::cudaStreamGetPriority */ public static native @Cast("cudaError_t") int cudaStreamGetFlags(CUstream_st hStream, @Cast("unsigned int*") IntPointer flags); public static native @Cast("cudaError_t") int cudaStreamGetFlags(CUstream_st hStream, @Cast("unsigned int*") IntBuffer flags); public static native @Cast("cudaError_t") int cudaStreamGetFlags(CUstream_st hStream, @Cast("unsigned int*") int[] flags); /** * \brief Destroys and cleans up an asynchronous stream * * Destroys and cleans up the asynchronous stream specified by \p stream. * * In case the device is still doing work in the stream \p stream * when ::cudaStreamDestroy() is called, the function will return immediately * and the resources associated with \p stream will be released automatically * once the device has completed all work in \p stream. 
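 *
 * A minimal stream lifecycle sketch (illustrative only, assuming a device
 * that supports stream priorities): the stream is created with
 * ::cudaStreamCreateWithPriority, drained, and then destroyed.
 * <pre>
 * import static org.bytedeco.javacpp.cuda.*;
 *
 * public class StreamLifecycle {
 *     public static void main(String[] args) {
 *         int[] least = new int[1], greatest = new int[1];
 *         cudaDeviceGetStreamPriorityRange(least, greatest);
 *
 *         CUstream_st stream = new CUstream_st();
 *         // Non-blocking stream at the highest meaningful priority.
 *         cudaStreamCreateWithPriority(stream, cudaStreamNonBlocking, greatest[0]);
 *
 *         // ... enqueue kernels or asynchronous copies on the stream here ...
 *
 *         cudaStreamSynchronize(stream);     // wait for outstanding work
 *         cudaStreamDestroy(stream);         // then release the stream
 *     }
 * }
 * </pre>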
* * @param stream - Stream identifier * * @return * ::cudaSuccess, * ::cudaErrorInvalidResourceHandle * \notefnerr * * \sa ::cudaStreamCreate, ::cudaStreamCreateWithFlags, ::cudaStreamQuery, ::cudaStreamWaitEvent, ::cudaStreamSynchronize, ::cudaStreamAddCallback */ public static native @Cast("cudaError_t") int cudaStreamDestroy(CUstream_st stream); /** * \brief Make a compute stream wait on an event * * Makes all future work submitted to \p stream wait until \p event reports * completion before beginning execution. This synchronization will be * performed efficiently on the device. The event \p event may * be from a different context than \p stream, in which case this function * will perform cross-device synchronization. * * The stream \p stream will wait only for the completion of the most recent * host call to ::cudaEventRecord() on \p event. Once this call has returned, * any functions (including ::cudaEventRecord() and ::cudaEventDestroy()) may be * called on \p event again, and the subsequent calls will not have any effect * on \p stream. * * If ::cudaEventRecord() has not been called on \p event, this call acts as if * the record has already completed, and so is a functional no-op. * * @param stream - Stream to wait * @param event - Event to wait on * @param flags - Parameters for the operation (must be 0) * * @return * ::cudaSuccess, * ::cudaErrorInvalidResourceHandle * \note_null_stream * \notefnerr * * \sa ::cudaStreamCreate, ::cudaStreamCreateWithFlags, ::cudaStreamQuery, ::cudaStreamSynchronize, ::cudaStreamAddCallback, ::cudaStreamDestroy */ public static native @Cast("cudaError_t") int cudaStreamWaitEvent(CUstream_st stream, CUevent_st event, @Cast("unsigned int") int flags); // #ifdef _WIN32 // #define CUDART_CB __stdcall // #else // #define CUDART_CB // #endif /** * Type of stream callback functions. * @param stream The stream as passed to ::cudaStreamAddCallback, may be NULL. * @param status ::cudaSuccess or any persistent error on the stream. * @param userData User parameter provided at registration. */ @Convention("CUDART_CB") public static class cudaStreamCallback_t extends FunctionPointer { static { Loader.load(); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public cudaStreamCallback_t(Pointer p) { super(p); } protected cudaStreamCallback_t() { allocate(); } private native void allocate(); public native void call(CUstream_st stream, @Cast("cudaError_t") int status, Pointer userData); } /** * \brief Add a callback to a compute stream * * Adds a callback to be called on the host after all currently enqueued * items in the stream have completed. For each * cudaStreamAddCallback call, a callback will be executed exactly once. * The callback will block later work in the stream until it is finished. * * The callback may be passed ::cudaSuccess or an error code. In the event * of a device error, all subsequently executed callbacks will receive an * appropriate ::cudaError_t. * * Callbacks must not make any CUDA API calls. Attempting to use CUDA APIs * will result in ::cudaErrorNotPermitted. Callbacks must not perform any * synchronization that may depend on outstanding device work or other callbacks * that are not mandated to run earlier. Callbacks without a mandated order * (in independent streams) execute in undefined order and may be serialized. * * For the purposes of Unified Memory, callback execution makes a number of * guarantees: *
 * - The callback stream is considered idle for the duration of the
 *   callback. Thus, for example, a callback may always use memory attached
 *   to the callback stream.
 * - The start of execution of a callback has the same effect as
 *   synchronizing an event recorded in the same stream immediately prior to
 *   the callback. It thus synchronizes streams which have been "joined"
 *   prior to the callback.
 * - Adding device work to any stream does not have the effect of making
 *   the stream active until all preceding callbacks have executed. Thus, for
 *   example, a callback might use global attached memory even if work has
 *   been added to another stream, if it has been properly ordered with an
 *   event.
 * - Completion of a callback does not cause a stream to become active
 *   except as described above. The callback stream will remain idle if no
 *   device work follows the callback, and will remain idle across
 *   consecutive callbacks without device work in between. Thus, for example,
 *   stream synchronization can be done by signaling from a callback at the
 *   end of the stream.
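 *
 * A minimal callback sketch (illustrative only): it subclasses the generated
 * ::cudaStreamCallback_t function pointer, registers it on a stream, and lets
 * ::cudaStreamSynchronize drain the stream. It assumes the usual JavaCPP
 * pattern of overriding call() in a Java subclass, and the callback itself
 * makes no CUDA API calls, as required above.
 * <pre>
 * import org.bytedeco.javacpp.Pointer;
 * import static org.bytedeco.javacpp.cuda.*;
 *
 * public class CallbackDemo {
 *     // Keep a strong reference so the callback cannot be garbage collected
 *     // while the stream may still invoke it.
 *     static final cudaStreamCallback_t DONE = new cudaStreamCallback_t() {
 *         public void call(CUstream_st stream, int status, Pointer userData) {
 *             System.out.println("stream work finished, status = " + status);
 *         }
 *     };
 *
 *     public static void main(String[] args) {
 *         CUstream_st stream = new CUstream_st();
 *         cudaStreamCreate(stream);
 *         // ... enqueue kernels or asynchronous copies on the stream here ...
 *         cudaStreamAddCallback(stream, DONE, null, 0);
 *         cudaStreamSynchronize(stream);     // the callback runs before this returns
 *         cudaStreamDestroy(stream);
 *     }
 * }
 * </pre>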
* * @param stream - Stream to add callback to * @param callback - The function to call once preceding stream operations are complete * @param userData - User specified data to be passed to the callback function * @param flags - Reserved for future use, must be 0 * * @return * ::cudaSuccess, * ::cudaErrorInvalidResourceHandle, * ::cudaErrorNotSupported * \note_null_stream * \notefnerr * * \sa ::cudaStreamCreate, ::cudaStreamCreateWithFlags, ::cudaStreamQuery, ::cudaStreamSynchronize, ::cudaStreamWaitEvent, ::cudaStreamDestroy, ::cudaMallocManaged, ::cudaStreamAttachMemAsync */ public static native @Cast("cudaError_t") int cudaStreamAddCallback(CUstream_st stream, cudaStreamCallback_t callback, Pointer userData, @Cast("unsigned int") int flags); /** * \brief Waits for stream tasks to complete * * Blocks until \p stream has completed all operations. If the * ::cudaDeviceScheduleBlockingSync flag was set for this device, * the host thread will block until the stream is finished with * all of its tasks. * * @param stream - Stream identifier * * @return * ::cudaSuccess, * ::cudaErrorInvalidResourceHandle * \notefnerr * * \sa ::cudaStreamCreate, ::cudaStreamCreateWithFlags, ::cudaStreamQuery, ::cudaStreamWaitEvent, ::cudaStreamAddCallback, ::cudaStreamDestroy */ public static native @Cast("cudaError_t") int cudaStreamSynchronize(CUstream_st stream); /** * \brief Queries an asynchronous stream for completion status * * Returns ::cudaSuccess if all operations in \p stream have * completed, or ::cudaErrorNotReady if not. * * For the purposes of Unified Memory, a return value of ::cudaSuccess * is equivalent to having called ::cudaStreamSynchronize(). * * @param stream - Stream identifier * * @return * ::cudaSuccess, * ::cudaErrorNotReady, * ::cudaErrorInvalidResourceHandle * \notefnerr * * \sa ::cudaStreamCreate, ::cudaStreamCreateWithFlags, ::cudaStreamWaitEvent, ::cudaStreamSynchronize, ::cudaStreamAddCallback, ::cudaStreamDestroy */ public static native @Cast("cudaError_t") int cudaStreamQuery(CUstream_st stream); /** * \brief Attach memory to a stream asynchronously * * Enqueues an operation in \p stream to specify stream association of * \p length bytes of memory starting from \p devPtr. This function is a * stream-ordered operation, meaning that it is dependent on, and will * only take effect when, previous work in stream has completed. Any * previous association is automatically replaced. * * \p devPtr must point to an address within managed memory space declared * using the __managed__ keyword or allocated with ::cudaMallocManaged. * * \p length must be zero, to indicate that the entire allocation's * stream association is being changed. Currently, it's not possible * to change stream association for a portion of an allocation. The default * value for \p length is zero. * * The stream association is specified using \p flags which must be * one of ::cudaMemAttachGlobal, ::cudaMemAttachHost or ::cudaMemAttachSingle. * The default value for \p flags is ::cudaMemAttachSingle * If the ::cudaMemAttachGlobal flag is specified, the memory can be accessed * by any stream on any device. * If the ::cudaMemAttachHost flag is specified, the program makes a guarantee * that it won't access the memory on the device from any stream on a device that * has a zero value for the device attribute ::cudaDevAttrConcurrentManagedAccess. 
* If the ::cudaMemAttachSingle flag is specified and \p stream is associated with * a device that has a zero value for the device attribute ::cudaDevAttrConcurrentManagedAccess, * the program makes a guarantee that it will only access the memory on the device * from \p stream. It is illegal to attach singly to the NULL stream, because the * NULL stream is a virtual global stream and not a specific stream. An error will * be returned in this case. * * When memory is associated with a single stream, the Unified Memory system will * allow CPU access to this memory region so long as all operations in \p stream * have completed, regardless of whether other streams are active. In effect, * this constrains exclusive ownership of the managed memory region by * an active GPU to per-stream activity instead of whole-GPU activity. * * Accessing memory on the device from streams that are not associated with * it will produce undefined results. No error checking is performed by the * Unified Memory system to ensure that kernels launched into other streams * do not access this region. * * It is a program's responsibility to order calls to ::cudaStreamAttachMemAsync * via events, synchronization or other means to ensure legal access to memory * at all times. Data visibility and coherency will be changed appropriately * for all kernels which follow a stream-association change. * * If \p stream is destroyed while data is associated with it, the association is * removed and the association reverts to the default visibility of the allocation * as specified at ::cudaMallocManaged. For __managed__ variables, the default * association is always ::cudaMemAttachGlobal. Note that destroying a stream is an * asynchronous operation, and as a result, the change to default association won't * happen until all work in the stream has completed. * * @param stream - Stream in which to enqueue the attach operation * @param devPtr - Pointer to memory (must be a pointer to managed memory) * @param length - Length of memory (must be zero, defaults to zero) * @param flags - Must be one of ::cudaMemAttachGlobal, ::cudaMemAttachHost or ::cudaMemAttachSingle (defaults to ::cudaMemAttachSingle) * * @return * ::cudaSuccess, * ::cudaErrorNotReady, * ::cudaErrorInvalidValue * ::cudaErrorInvalidResourceHandle * \notefnerr * * \sa ::cudaStreamCreate, ::cudaStreamCreateWithFlags, ::cudaStreamWaitEvent, ::cudaStreamSynchronize, ::cudaStreamAddCallback, ::cudaStreamDestroy, ::cudaMallocManaged */ public static native @Cast("cudaError_t") int cudaStreamAttachMemAsync(CUstream_st stream, Pointer devPtr, @Cast("size_t") long length/*=0*/, @Cast("unsigned int") int flags/*=cudaMemAttachSingle*/); public static native @Cast("cudaError_t") int cudaStreamAttachMemAsync(CUstream_st stream, Pointer devPtr); /** \} */ /* END CUDART_STREAM */ /** * \defgroup CUDART_EVENT Event Management * * ___MANBRIEF___ event management functions of the CUDA runtime API * (___CURRENT_FILE___) ___ENDMANBRIEF___ * * This section describes the event management functions of the CUDA runtime * application programming interface. * * \{ */ /** * \brief Creates an event object * * Creates an event object using ::cudaEventDefault. 
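 *
 * A minimal timing sketch (illustrative only): two events are created,
 * recorded around the work to be measured in the NULL stream, and the
 * elapsed time is read back with ::cudaEventElapsedTime.
 * <pre>
 * import static org.bytedeco.javacpp.cuda.*;
 *
 * public class EventTiming {
 *     public static void main(String[] args) {
 *         CUevent_st start = new CUevent_st(), stop = new CUevent_st();
 *         cudaEventCreate(start);
 *         cudaEventCreate(stop);
 *
 *         cudaEventRecord(start);            // record in the NULL stream
 *         // ... launch the kernels or copies to be timed here ...
 *         cudaEventRecord(stop);
 *         cudaEventSynchronize(stop);        // wait until 'stop' has completed
 *
 *         float[] ms = new float[1];
 *         cudaEventElapsedTime(ms, start, stop);
 *         System.out.println("elapsed: " + ms[0] + " ms");
 *
 *         cudaEventDestroy(start);
 *         cudaEventDestroy(stop);
 *     }
 * }
 * </pre>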
* * @param event - Newly created event * * @return * ::cudaSuccess, * ::cudaErrorInitializationError, * ::cudaErrorInvalidValue, * ::cudaErrorLaunchFailure, * ::cudaErrorMemoryAllocation * \notefnerr * * \sa \ref ::cudaEventCreate(cudaEvent_t*, unsigned int) "cudaEventCreate (C++ API)", * ::cudaEventCreateWithFlags, ::cudaEventRecord, ::cudaEventQuery, * ::cudaEventSynchronize, ::cudaEventDestroy, ::cudaEventElapsedTime, * ::cudaStreamWaitEvent */ public static native @Cast("cudaError_t") int cudaEventCreate(@ByPtrPtr CUevent_st event); /** * \brief Creates an event object with the specified flags * * Creates an event object with the specified flags. Valid flags include: * - ::cudaEventDefault: Default event creation flag. * - ::cudaEventBlockingSync: Specifies that event should use blocking * synchronization. A host thread that uses ::cudaEventSynchronize() to wait * on an event created with this flag will block until the event actually * completes. * - ::cudaEventDisableTiming: Specifies that the created event does not need * to record timing data. Events created with this flag specified and * the ::cudaEventBlockingSync flag not specified will provide the best * performance when used with ::cudaStreamWaitEvent() and ::cudaEventQuery(). * - ::cudaEventInterprocess: Specifies that the created event may be used as an * interprocess event by ::cudaIpcGetEventHandle(). ::cudaEventInterprocess must * be specified along with ::cudaEventDisableTiming. * * @param event - Newly created event * @param flags - Flags for new event * * @return * ::cudaSuccess, * ::cudaErrorInitializationError, * ::cudaErrorInvalidValue, * ::cudaErrorLaunchFailure, * ::cudaErrorMemoryAllocation * \notefnerr * * \sa \ref ::cudaEventCreate(cudaEvent_t*) "cudaEventCreate (C API)", * ::cudaEventSynchronize, ::cudaEventDestroy, ::cudaEventElapsedTime, * ::cudaStreamWaitEvent */ public static native @Cast("cudaError_t") int cudaEventCreateWithFlags(@ByPtrPtr CUevent_st event, @Cast("unsigned int") int flags); /** * \brief Records an event * * Records an event. See note about NULL stream behavior. Since operation * is asynchronous, ::cudaEventQuery() or ::cudaEventSynchronize() must * be used to determine when the event has actually been recorded. * * If ::cudaEventRecord() has previously been called on \p event, then this * call will overwrite any existing state in \p event. Any subsequent calls * which examine the status of \p event will only examine the completion of * this most recent call to ::cudaEventRecord(). * * @param event - Event to record * @param stream - Stream in which to record event * * @return * ::cudaSuccess, * ::cudaErrorInvalidValue, * ::cudaErrorInitializationError, * ::cudaErrorInvalidResourceHandle, * ::cudaErrorLaunchFailure * \note_null_stream * \notefnerr * * \sa \ref ::cudaEventCreate(cudaEvent_t*) "cudaEventCreate (C API)", * ::cudaEventCreateWithFlags, ::cudaEventQuery, * ::cudaEventSynchronize, ::cudaEventDestroy, ::cudaEventElapsedTime, * ::cudaStreamWaitEvent */ public static native @Cast("cudaError_t") int cudaEventRecord(CUevent_st event, CUstream_st stream/*=0*/); public static native @Cast("cudaError_t") int cudaEventRecord(CUevent_st event); /** * \brief Queries an event's status * * Query the status of all device work preceding the most recent call to * ::cudaEventRecord() (in the appropriate compute streams, as specified by the * arguments to ::cudaEventRecord()). 
* * If this work has successfully been completed by the device, or if * ::cudaEventRecord() has not been called on \p event, then ::cudaSuccess is * returned. If this work has not yet been completed by the device then * ::cudaErrorNotReady is returned. * * For the purposes of Unified Memory, a return value of ::cudaSuccess * is equivalent to having called ::cudaEventSynchronize(). * * @param event - Event to query * * @return * ::cudaSuccess, * ::cudaErrorNotReady, * ::cudaErrorInitializationError, * ::cudaErrorInvalidValue, * ::cudaErrorInvalidResourceHandle, * ::cudaErrorLaunchFailure * \notefnerr * * \sa \ref ::cudaEventCreate(cudaEvent_t*) "cudaEventCreate (C API)", * ::cudaEventCreateWithFlags, ::cudaEventRecord, * ::cudaEventSynchronize, ::cudaEventDestroy, ::cudaEventElapsedTime */ public static native @Cast("cudaError_t") int cudaEventQuery(CUevent_st event); /** * \brief Waits for an event to complete * * Wait until the completion of all device work preceding the most recent * call to ::cudaEventRecord() (in the appropriate compute streams, as specified * by the arguments to ::cudaEventRecord()). * * If ::cudaEventRecord() has not been called on \p event, ::cudaSuccess is * returned immediately. * * Waiting for an event that was created with the ::cudaEventBlockingSync * flag will cause the calling CPU thread to block until the event has * been completed by the device. If the ::cudaEventBlockingSync flag has * not been set, then the CPU thread will busy-wait until the event has * been completed by the device. * * @param event - Event to wait for * * @return * ::cudaSuccess, * ::cudaErrorInitializationError, * ::cudaErrorInvalidValue, * ::cudaErrorInvalidResourceHandle, * ::cudaErrorLaunchFailure * \notefnerr * * \sa \ref ::cudaEventCreate(cudaEvent_t*) "cudaEventCreate (C API)", * ::cudaEventCreateWithFlags, ::cudaEventRecord, * ::cudaEventQuery, ::cudaEventDestroy, ::cudaEventElapsedTime */ public static native @Cast("cudaError_t") int cudaEventSynchronize(CUevent_st event); /** * \brief Destroys an event object * * Destroys the event specified by \p event. * * In case \p event has been recorded but has not yet been completed * when ::cudaEventDestroy() is called, the function will return immediately and * the resources associated with \p event will be released automatically once * the device has completed \p event. * * @param event - Event to destroy * * @return * ::cudaSuccess, * ::cudaErrorInitializationError, * ::cudaErrorInvalidValue, * ::cudaErrorLaunchFailure * \notefnerr * * \sa \ref ::cudaEventCreate(cudaEvent_t*) "cudaEventCreate (C API)", * ::cudaEventCreateWithFlags, ::cudaEventQuery, * ::cudaEventSynchronize, ::cudaEventRecord, ::cudaEventElapsedTime */ public static native @Cast("cudaError_t") int cudaEventDestroy(CUevent_st event); /** * \brief Computes the elapsed time between events * * Computes the elapsed time between two events (in milliseconds with a * resolution of around 0.5 microseconds). * * If either event was last recorded in a non-NULL stream, the resulting time * may be greater than expected (even if both used the same stream handle). This * happens because the ::cudaEventRecord() operation takes place asynchronously * and there is no guarantee that the measured latency is actually just between * the two events. Any number of other different stream operations could execute * in between the two measured events, thus altering the timing in a significant * way. 
* * If ::cudaEventRecord() has not been called on either event, then * ::cudaErrorInvalidResourceHandle is returned. If ::cudaEventRecord() has been * called on both events but one or both of them has not yet been completed * (that is, ::cudaEventQuery() would return ::cudaErrorNotReady on at least one * of the events), ::cudaErrorNotReady is returned. If either event was created * with the ::cudaEventDisableTiming flag, then this function will return * ::cudaErrorInvalidResourceHandle. * * @param ms - Time between \p start and \p end in ms * @param start - Starting event * @param end - Ending event * * @return * ::cudaSuccess, * ::cudaErrorNotReady, * ::cudaErrorInvalidValue, * ::cudaErrorInitializationError, * ::cudaErrorInvalidResourceHandle, * ::cudaErrorLaunchFailure * \notefnerr * * \sa \ref ::cudaEventCreate(cudaEvent_t*) "cudaEventCreate (C API)", * ::cudaEventCreateWithFlags, ::cudaEventQuery, * ::cudaEventSynchronize, ::cudaEventDestroy, ::cudaEventRecord */ public static native @Cast("cudaError_t") int cudaEventElapsedTime(FloatPointer ms, CUevent_st start, CUevent_st end); public static native @Cast("cudaError_t") int cudaEventElapsedTime(FloatBuffer ms, CUevent_st start, CUevent_st end); public static native @Cast("cudaError_t") int cudaEventElapsedTime(float[] ms, CUevent_st start, CUevent_st end); /** \} */ /* END CUDART_EVENT */ /** * \defgroup CUDART_EXECUTION Execution Control * * ___MANBRIEF___ execution control functions of the CUDA runtime API * (___CURRENT_FILE___) ___ENDMANBRIEF___ * * This section describes the execution control functions of the CUDA runtime * application programming interface. * * Some functions have overloaded C++ API template versions documented separately in the * \ref CUDART_HIGHLEVEL "C++ API Routines" module. * * \{ */ // #if CUDART_VERSION >= 7000 /** * \brief Launches a device function * * The function invokes kernel \p func on \p gridDim (\p gridDim.x × \p gridDim.y * × \p gridDim.z) grid of blocks. Each block contains \p blockDim (\p blockDim.x × * \p blockDim.y × \p blockDim.z) threads. * * If the kernel has N parameters the \p args should point to array of N pointers. * Each pointer, from args[0] to args[N - 1], point to the region * of memory from which the actual parameter will be copied. * * For templated functions, pass the function symbol as follows: * func_name * * \p sharedMem sets the amount of dynamic shared memory that will be available to * each thread block. * * \p stream specifies a stream the invocation is associated to. 
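 *
 * A sketch of the argument marshalling described above (illustrative only):
 * each entry of \p args holds the host address of one argument's storage.
 * The kernel symbol \p func is assumed to be supplied by the caller (for
 * example from native code); obtaining it is outside this sketch, and the
 * kernel signature (float* data, int n) is likewise an assumption.
 * <pre>
 * import org.bytedeco.javacpp.*;
 * import static org.bytedeco.javacpp.cuda.*;
 *
 * public class LaunchSketch {
 *     static int launch(Pointer func, FloatPointer devData, int n, CUstream_st stream) {
 *         // args[0] -> a host slot containing the device pointer value,
 *         // args[1] -> a host slot containing the int argument.
 *         PointerPointer dataArg = new PointerPointer(1).put(0, devData);
 *         IntPointer     nArg    = new IntPointer(1).put(0, n);
 *         PointerPointer args    = new PointerPointer(dataArg, nArg);
 *
 *         dim3 grid  = new dim3().x((n + 255) / 256).y(1).z(1);
 *         dim3 block = new dim3().x(256).y(1).z(1);
 *         return cudaLaunchKernel(func, grid, block, args, 0, stream);
 *     }
 * }
 * </pre>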
* * @param func - Device function symbol * @param gridDim - Grid dimensions * @param blockDim - Block dimensions * @param args - Arguments * @param sharedMem - Shared memory * @param stream - Stream identifier * * @return * ::cudaSuccess, * ::cudaErrorInvalidDeviceFunction, * ::cudaErrorInvalidConfiguration, * ::cudaErrorLaunchFailure, * ::cudaErrorLaunchTimeout, * ::cudaErrorLaunchOutOfResources, * ::cudaErrorSharedObjectInitFailed * \note_null_stream * \notefnerr * * \ref ::cudaLaunchKernel(const T *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) "cudaLaunchKernel (C++ API)" */ public static native @Cast("cudaError_t") int cudaLaunchKernel(@Const Pointer func, @ByVal dim3 gridDim, @ByVal dim3 blockDim, @Cast("void**") PointerPointer args, @Cast("size_t") long sharedMem, CUstream_st stream); public static native @Cast("cudaError_t") int cudaLaunchKernel(@Const Pointer func, @ByVal dim3 gridDim, @ByVal dim3 blockDim, @Cast("void**") @ByPtrPtr Pointer args, @Cast("size_t") long sharedMem, CUstream_st stream); // #endif /* CUDART_VERSION >= 7000 */ /** * \brief Sets the preferred cache configuration for a device function * * On devices where the L1 cache and shared memory use the same hardware * resources, this sets through \p cacheConfig the preferred cache configuration * for the function specified via \p func. This is only a preference. The * runtime will use the requested configuration if possible, but it is free to * choose a different configuration if required to execute \p func. * * \p func is a device function symbol and must be declared as a * \c __global__ function. If the specified function does not exist, * then ::cudaErrorInvalidDeviceFunction is returned. For templated functions, * pass the function symbol as follows: func_name * * This setting does nothing on devices where the size of the L1 cache and * shared memory are fixed. * * Launching a kernel with a different preference than the most recent * preference setting may insert a device-side synchronization point.
* * The supported cache configurations are: * - ::cudaFuncCachePreferNone: no preference for shared memory or L1 (default) * - ::cudaFuncCachePreferShared: prefer larger shared memory and smaller L1 cache * - ::cudaFuncCachePreferL1: prefer larger L1 cache and smaller shared memory * - ::cudaFuncCachePreferEqual: prefer equal size L1 cache and shared memory * * @param func - Device function symbol * @param cacheConfig - Requested cache configuration * * @return * ::cudaSuccess, * ::cudaErrorInitializationError, * ::cudaErrorInvalidDeviceFunction * \notefnerr * \note_string_api_deprecation2 * * \sa ::cudaConfigureCall, * \ref ::cudaFuncSetCacheConfig(T*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C++ API)", * \ref ::cudaFuncGetAttributes(struct cudaFuncAttributes*, const void*) "cudaFuncGetAttributes (C API)", * \ref ::cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) "cudaLaunchKernel (C API)", * ::cudaSetDoubleForDevice, * ::cudaSetDoubleForHost, * \ref ::cudaSetupArgument(const void*, size_t, size_t) "cudaSetupArgument (C API)", * ::cudaThreadGetCacheConfig, * ::cudaThreadSetCacheConfig */ public static native @Cast("cudaError_t") int cudaFuncSetCacheConfig(@Const Pointer func, @Cast("cudaFuncCache") int cacheConfig); /** * \brief Sets the shared memory configuration for a device function * * On devices with configurable shared memory banks, this function will * force all subsequent launches of the specified device function to have * the given shared memory bank size configuration. On any given launch of the * function, the shared memory configuration of the device will be temporarily * changed if needed to suit the function's preferred configuration. Changes in * shared memory configuration between subsequent launches of functions, * may introduce a device side synchronization point. * * Any per-function setting of shared memory bank size set via * ::cudaFuncSetSharedMemConfig will override the device wide setting set by * ::cudaDeviceSetSharedMemConfig. * * Changing the shared memory bank size will not increase shared memory usage * or affect occupancy of kernels, but may have major effects on performance. * Larger bank sizes will allow for greater potential bandwidth to shared memory, * but will change what kinds of accesses to shared memory will result in bank * conflicts. * * This function will do nothing on devices with fixed shared memory bank size. * * For templated functions, pass the function symbol as follows: * func_name * * The supported bank configurations are: * - ::cudaSharedMemBankSizeDefault: use the device's shared memory configuration * when launching this function. * - ::cudaSharedMemBankSizeFourByte: set shared memory bank width to be * four bytes natively when launching this function. * - ::cudaSharedMemBankSizeEightByte: set shared memory bank width to be eight * bytes natively when launching this function. 
* * @param func - Device function symbol * @param config - Requested shared memory configuration * * @return * ::cudaSuccess, * ::cudaErrorInitializationError, * ::cudaErrorInvalidDeviceFunction, * ::cudaErrorInvalidValue, * \notefnerr * \note_string_api_deprecation2 * * \sa ::cudaConfigureCall, * ::cudaDeviceSetSharedMemConfig, * ::cudaDeviceGetSharedMemConfig, * ::cudaDeviceSetCacheConfig, * ::cudaDeviceGetCacheConfig, * ::cudaFuncSetCacheConfig */ public static native @Cast("cudaError_t") int cudaFuncSetSharedMemConfig(@Const Pointer func, @Cast("cudaSharedMemConfig") int config); /** * \brief Find out attributes for a given function * * This function obtains the attributes of a function specified via \p func. * \p func is a device function symbol and must be declared as a * \c __global__ function. The fetched attributes are placed in \p attr. * If the specified function does not exist, then * ::cudaErrorInvalidDeviceFunction is returned. For templated functions, pass * the function symbol as follows: func_name * * Note that some function attributes such as * \ref ::cudaFuncAttributes::maxThreadsPerBlock "maxThreadsPerBlock" * may vary based on the device that is currently being used. * * @param attr - Return pointer to function's attributes * @param func - Device function symbol * * @return * ::cudaSuccess, * ::cudaErrorInitializationError, * ::cudaErrorInvalidDeviceFunction * \notefnerr * \note_string_api_deprecation2 * * \sa ::cudaConfigureCall, * \ref ::cudaFuncSetCacheConfig(const void*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C API)", * \ref ::cudaFuncGetAttributes(struct cudaFuncAttributes*, T*) "cudaFuncGetAttributes (C++ API)", * \ref ::cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) "cudaLaunchKernel (C API)", * ::cudaSetDoubleForDevice, * ::cudaSetDoubleForHost, * \ref ::cudaSetupArgument(const void*, size_t, size_t) "cudaSetupArgument (C API)" */ public static native @Cast("cudaError_t") int cudaFuncGetAttributes(cudaFuncAttributes attr, @Const Pointer func); /** * \brief Converts a double argument to be executed on a device * * @param d - Double to convert * * @deprecated This function is deprecated as of CUDA 7.5 * * Converts the double value of \p d to an internal float representation if * the device does not support double arithmetic. If the device does natively * support doubles, then this function does nothing. * * @return * ::cudaSuccess * \notefnerr * * \ref ::cudaLaunch(const void*) "cudaLaunch (C API)", * \ref ::cudaFuncSetCacheConfig(const void*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C API)", * \ref ::cudaFuncGetAttributes(struct cudaFuncAttributes*, const void*) "cudaFuncGetAttributes (C API)", * ::cudaSetDoubleForHost, * \ref ::cudaSetupArgument(const void*, size_t, size_t) "cudaSetupArgument (C API)" */ public static native @Cast("cudaError_t") int cudaSetDoubleForDevice(DoublePointer d); public static native @Cast("cudaError_t") int cudaSetDoubleForDevice(DoubleBuffer d); public static native @Cast("cudaError_t") int cudaSetDoubleForDevice(double[] d); /** * \brief Converts a double argument after execution on a device * * @deprecated This function is deprecated as of CUDA 7.5 * * Converts the double value of \p d from a potentially internal float * representation if the device does not support double arithmetic. If the * device does natively support doubles, then this function does nothing. 
* * @param d - Double to convert * * @return * ::cudaSuccess * \notefnerr * * \ref ::cudaLaunch(const void*) "cudaLaunch (C API)", * \ref ::cudaFuncSetCacheConfig(const void*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C API)", * \ref ::cudaFuncGetAttributes(struct cudaFuncAttributes*, const void*) "cudaFuncGetAttributes (C API)", * ::cudaSetDoubleForDevice, * \ref ::cudaSetupArgument(const void*, size_t, size_t) "cudaSetupArgument (C API)" */ public static native @Cast("cudaError_t") int cudaSetDoubleForHost(DoublePointer d); public static native @Cast("cudaError_t") int cudaSetDoubleForHost(DoubleBuffer d); public static native @Cast("cudaError_t") int cudaSetDoubleForHost(double[] d); /** \} */ /* END CUDART_EXECUTION */ // #if CUDART_VERSION >= 6050 /** * \defgroup CUDART_OCCUPANCY Occupancy * * ___MANBRIEF___ occupancy calculation functions of the CUDA runtime API * (___CURRENT_FILE___) ___ENDMANBRIEF___ * * This section describes the occupancy calculation functions of the CUDA runtime * application programming interface. * * Besides the occupancy calculator functions * (\ref ::cudaOccupancyMaxActiveBlocksPerMultiprocessor and \ref ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags), * there are also C++ only occupancy-based launch configuration functions documented in * \ref CUDART_HIGHLEVEL "C++ API Routines" module. * * See * \ref ::cudaOccupancyMaxPotentialBlockSize(int*, int*, T, size_t, int) "cudaOccupancyMaxPotentialBlockSize (C++ API)", * \ref ::cudaOccupancyMaxPotentialBlockSizeWithFlags(int*, int*, T, size_t, int, unsigned int) "cudaOccupancyMaxPotentialBlockSize (C++ API)", * \ref ::cudaOccupancyMaxPotentialBlockSizeVariableSMem(int*, int*, T, UnaryFunction, int) "cudaOccupancyMaxPotentialBlockSizeVariableSMem (C++ API)", * \ref ::cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags(int*, int*, T, UnaryFunction, int, unsigned int) "cudaOccupancyMaxPotentialBlockSizeVariableSMem (C++ API)", * * \{ */ /** * \brief Returns occupancy for a device function * * Returns in \p *numBlocks the maximum number of active blocks per * streaming multiprocessor for the device function. 
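* * As an added illustration (not from the original header), querying this value from Java through the bindings in this class might look as follows; the kernel symbol is again assumed to come from natively compiled code: *
 {@code
 // Hypothetical sketch (assumes: import static org.bytedeco.javacpp.cuda.*; import org.bytedeco.javacpp.Pointer;)
 static int maxBlocksPerSM(Pointer kernelSymbol) {
     int[] numBlocks = new int[1];
     // 256 threads per block, no dynamic shared memory
     int err = cudaOccupancyMaxActiveBlocksPerMultiprocessor(numBlocks, kernelSymbol, 256, 0);
     if (err != cudaSuccess) {
         throw new RuntimeException("CUDA error " + err);
     }
     return numBlocks[0];
 }
 }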
* * @param numBlocks - Returned occupancy * @param func - Kernel function for which occupancy is calculated * @param blockSize - Block size the kernel is intended to be launched with * @param dynamicSMemSize - Per-block dynamic shared memory usage intended, in bytes * * @return * ::cudaSuccess, * ::cudaErrorCudartUnloading, * ::cudaErrorInitializationError, * ::cudaErrorInvalidDevice, * ::cudaErrorInvalidDeviceFunction, * ::cudaErrorInvalidValue, * ::cudaErrorUnknown, * \notefnerr * * \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags, * \ref ::cudaOccupancyMaxPotentialBlockSize(int*, int*, T, size_t, int) "cudaOccupancyMaxPotentialBlockSize (C++ API)", * \ref ::cudaOccupancyMaxPotentialBlockSizeWithFlags(int*, int*, T, size_t, int, unsigned int) "cudaOccupancyMaxPotentialBlockSizeWithFlags (C++ API)", * \ref ::cudaOccupancyMaxPotentialBlockSizeVariableSMem(int*, int*, T, UnaryFunction, int) "cudaOccupancyMaxPotentialBlockSizeVariableSMem (C++ API)" * \ref ::cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags(int*, int*, T, UnaryFunction, int, unsigned int) "cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags (C++ API)" */ public static native @Cast("cudaError_t") int cudaOccupancyMaxActiveBlocksPerMultiprocessor(IntPointer numBlocks, @Const Pointer func, int blockSize, @Cast("size_t") long dynamicSMemSize); public static native @Cast("cudaError_t") int cudaOccupancyMaxActiveBlocksPerMultiprocessor(IntBuffer numBlocks, @Const Pointer func, int blockSize, @Cast("size_t") long dynamicSMemSize); public static native @Cast("cudaError_t") int cudaOccupancyMaxActiveBlocksPerMultiprocessor(int[] numBlocks, @Const Pointer func, int blockSize, @Cast("size_t") long dynamicSMemSize); // #if CUDART_VERSION >= 7000 /** * \brief Returns occupancy for a device function with the specified flags * * Returns in \p *numBlocks the maximum number of active blocks per * streaming multiprocessor for the device function. * * The \p flags parameter controls how special cases are handled. Valid flags include: * * - ::cudaOccupancyDefault: keeps the default behavior as * ::cudaOccupancyMaxActiveBlocksPerMultiprocessor * * - ::cudaOccupancyDisableCachingOverride: This flag suppresses the default behavior * on platforms where global caching affects occupancy. On such platforms, if caching * is enabled, but per-block SM resource usage would result in zero occupancy, the * occupancy calculator will calculate the occupancy as if caching is disabled. * Setting this flag makes the occupancy calculator return 0 in such cases. * More information about this feature can be found in the "Unified L1/Texture Cache" * section of the Maxwell tuning guide. 
* * @param numBlocks - Returned occupancy * @param func - Kernel function for which occupancy is calculated * @param blockSize - Block size the kernel is intended to be launched with * @param dynamicSMemSize - Per-block dynamic shared memory usage intended, in bytes * @param flags - Requested behavior for the occupancy calculator * * @return * ::cudaSuccess, * ::cudaErrorCudartUnloading, * ::cudaErrorInitializationError, * ::cudaErrorInvalidDevice, * ::cudaErrorInvalidDeviceFunction, * ::cudaErrorInvalidValue, * ::cudaErrorUnknown, * \notefnerr * * \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessor, * \ref ::cudaOccupancyMaxPotentialBlockSize(int*, int*, T, size_t, int) "cudaOccupancyMaxPotentialBlockSize (C++ API)", * \ref ::cudaOccupancyMaxPotentialBlockSizeWithFlags(int*, int*, T, size_t, int, unsigned int) "cudaOccupancyMaxPotentialBlockSizeWithFlags (C++ API)", * \ref ::cudaOccupancyMaxPotentialBlockSizeVariableSMem(int*, int*, T, UnaryFunction, int) "cudaOccupancyMaxPotentialBlockSizeVariableSMem (C++ API)" * \ref ::cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags(int*, int*, T, UnaryFunction, int, unsigned int) "cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags (C++ API)" */ public static native @Cast("cudaError_t") int cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(IntPointer numBlocks, @Const Pointer func, int blockSize, @Cast("size_t") long dynamicSMemSize, @Cast("unsigned int") int flags); public static native @Cast("cudaError_t") int cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(IntBuffer numBlocks, @Const Pointer func, int blockSize, @Cast("size_t") long dynamicSMemSize, @Cast("unsigned int") int flags); public static native @Cast("cudaError_t") int cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int[] numBlocks, @Const Pointer func, int blockSize, @Cast("size_t") long dynamicSMemSize, @Cast("unsigned int") int flags); /** \} */ /* END CUDA_OCCUPANCY */ // #endif /* CUDART_VERSION >= 7000 */ // #endif /* CUDART_VERSION >= 6050 */ /** * \defgroup CUDART_EXECUTION_DEPRECATED Execution Control [DEPRECATED] * * ___MANBRIEF___ deprecated execution control functions of the CUDA runtime API * (___CURRENT_FILE___) ___ENDMANBRIEF___ * * This section describes the deprecated execution control functions of the CUDA runtime * application programming interface. * * Some functions have overloaded C++ API template versions documented separately in the * \ref CUDART_HIGHLEVEL "C++ API Routines" module. * * \{ */ /** * \brief Configure a device-launch * * @deprecated This function is deprecated as of CUDA 7.0 * * Specifies the grid and block dimensions for the device call to be executed * similar to the execution configuration syntax. ::cudaConfigureCall() is * stack based. Each call pushes data on top of an execution stack. This data * contains the dimension for the grid and thread blocks, together with any * arguments for the call. 
* * @param gridDim - Grid dimensions * @param blockDim - Block dimensions * @param sharedMem - Shared memory * @param stream - Stream identifier * * @return * ::cudaSuccess, * ::cudaErrorInvalidConfiguration * \note_null_stream * \notefnerr * * \ref ::cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) "cudaLaunchKernel (C API)", * \ref ::cudaFuncSetCacheConfig(const void*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C API)", * \ref ::cudaFuncGetAttributes(struct cudaFuncAttributes*, const void*) "cudaFuncGetAttributes (C API)", * \ref ::cudaLaunch(const void*) "cudaLaunch (C API)", * ::cudaSetDoubleForDevice, * ::cudaSetDoubleForHost, * \ref ::cudaSetupArgument(const void*, size_t, size_t) "cudaSetupArgument (C API)", */ public static native @Cast("cudaError_t") int cudaConfigureCall(@ByVal dim3 gridDim, @ByVal dim3 blockDim, @Cast("size_t") long sharedMem/*=0*/, CUstream_st stream/*=0*/); public static native @Cast("cudaError_t") int cudaConfigureCall(@ByVal dim3 gridDim, @ByVal dim3 blockDim); /** * \brief Configure a device launch * * @deprecated This function is deprecated as of CUDA 7.0 * * Pushes \p size bytes of the argument pointed to by \p arg at \p offset * bytes from the start of the parameter passing area, which starts at * offset 0. The arguments are stored in the top of the execution stack. * \ref ::cudaSetupArgument(const void*, size_t, size_t) "cudaSetupArgument()" * must be preceded by a call to ::cudaConfigureCall(). * * @param arg - Argument to push for a kernel launch * @param size - Size of argument * @param offset - Offset in argument stack to push new arg * * @return * ::cudaSuccess * \notefnerr * * \ref ::cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) "cudaLaunchKernel (C API)", * \ref ::cudaFuncSetCacheConfig(const void*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C API)", * \ref ::cudaFuncGetAttributes(struct cudaFuncAttributes*, const void*) "cudaFuncGetAttributes (C API)", * \ref ::cudaLaunch(const void*) "cudaLaunch (C API)", * ::cudaSetDoubleForDevice, * ::cudaSetDoubleForHost, * \ref ::cudaSetupArgument(T, size_t) "cudaSetupArgument (C++ API)", */ public static native @Cast("cudaError_t") int cudaSetupArgument(@Const Pointer arg, @Cast("size_t") long size, @Cast("size_t") long offset); /** * \brief Launches a device function * * @deprecated This function is deprecated as of CUDA 7.0 * * Launches the function \p func on the device. The parameter \p func must * be a device function symbol. The parameter specified by \p func must be * declared as a \p __global__ function. For templated functions, pass the * function symbol as follows: func_name * \ref ::cudaLaunch(const void*) "cudaLaunch()" must be preceded by a call to * ::cudaConfigureCall() since it pops the data that was pushed by * ::cudaConfigureCall() from the execution stack. 
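* * As an added illustration (not from the original header), the deprecated configure-and-launch sequence could be driven from Java roughly as sketched below; the dim3 accessors and the kernel symbol Pointer are assumptions, and ::cudaLaunchKernel is the non-deprecated replacement: *
 {@code
 // Hypothetical sketch (assumes: import static org.bytedeco.javacpp.cuda.*; import org.bytedeco.javacpp.Pointer;)
 static void launchNoArgKernel(Pointer kernelSymbol) {
     dim3 grid  = new dim3(); grid.x(64);   grid.y(1);  grid.z(1);
     dim3 block = new dim3(); block.x(256); block.y(1); block.z(1);
     int err = cudaConfigureCall(grid, block);   // push the launch configuration
     if (err == cudaSuccess) {
         err = cudaLaunch(kernelSymbol);         // pop it and launch the kernel
     }
     if (err != cudaSuccess) {
         throw new RuntimeException("CUDA error " + err);
     }
 }
 }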
* * @param func - Device function symbol * * @return * ::cudaSuccess, * ::cudaErrorInvalidDeviceFunction, * ::cudaErrorInvalidConfiguration, * ::cudaErrorLaunchFailure, * ::cudaErrorLaunchTimeout, * ::cudaErrorLaunchOutOfResources, * ::cudaErrorSharedObjectInitFailed * \notefnerr * \note_string_api_deprecation_50 * * \ref ::cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) "cudaLaunchKernel (C API)", * \ref ::cudaFuncSetCacheConfig(const void*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C API)", * \ref ::cudaFuncGetAttributes(struct cudaFuncAttributes*, const void*) "cudaFuncGetAttributes (C API)", * \ref ::cudaLaunch(T*) "cudaLaunch (C++ API)", * ::cudaSetDoubleForDevice, * ::cudaSetDoubleForHost, * \ref ::cudaSetupArgument(const void*, size_t, size_t) "cudaSetupArgument (C API)", * ::cudaThreadGetCacheConfig, * ::cudaThreadSetCacheConfig */ public static native @Cast("cudaError_t") int cudaLaunch(@Const Pointer func); /** \} */ /* END CUDART_EXECUTION_DEPRECATED */ /** * \defgroup CUDART_MEMORY Memory Management * * ___MANBRIEF___ memory management functions of the CUDA runtime API * (___CURRENT_FILE___) ___ENDMANBRIEF___ * * This section describes the memory management functions of the CUDA runtime * application programming interface. * * Some functions have overloaded C++ API template versions documented separately in the * \ref CUDART_HIGHLEVEL "C++ API Routines" module. * * \{ */ /** * \brief Allocates memory that will be automatically managed by the Unified Memory system * * Allocates \p size bytes of managed memory on the device and returns in * \p *devPtr a pointer to the allocated memory. If the device doesn't support * allocating managed memory, ::cudaErrorNotSupported is returned. Support * for managed memory can be queried using the device attribute * ::cudaDevAttrManagedMemory. The allocated memory is suitably * aligned for any kind of variable. The memory is not cleared. If \p size * is 0, ::cudaMallocManaged returns ::cudaErrorInvalidValue. The pointer * is valid on the CPU and on all GPUs in the system that support managed memory. * All accesses to this pointer must obey the Unified Memory programming model. * * \p flags specifies the default stream association for this allocation. * \p flags must be one of ::cudaMemAttachGlobal or ::cudaMemAttachHost. The * default value for \p flags is ::cudaMemAttachGlobal. * If ::cudaMemAttachGlobal is specified, then this memory is accessible from * any stream on any device. If ::cudaMemAttachHost is specified, then the * allocation should not be accessed from devices that have a zero value for the * device attribute ::cudaDevAttrConcurrentManagedAccess; an explicit call to * ::cudaStreamAttachMemAsync will be required to enable access on such devices. * * If the association is later changed via ::cudaStreamAttachMemAsync to * a single stream, the default association, as specifed during ::cudaMallocManaged, * is restored when that stream is destroyed. For __managed__ variables, the * default association is always ::cudaMemAttachGlobal. Note that destroying a * stream is an asynchronous operation, and as a result, the change to default * association won't happen until all work in the stream has completed. * * Memory allocated with ::cudaMallocManaged should be released with ::cudaFree. * * Device memory oversubscription is possible for GPUs that have a non-zero value for the * device attribute ::cudaDevAttrConcurrentManagedAccess. 
Managed memory on * such GPUs may be evicted from device memory to host memory at any time by the Unified * Memory driver in order to make room for other allocations. * * In a multi-GPU system where all GPUs have a non-zero value for the device attribute * ::cudaDevAttrConcurrentManagedAccess, managed memory may not be populated when this * API returns and instead may be populated on access. In such systems, managed memory can * migrate to any processor's memory at any time. The Unified Memory driver will employ heuristics to * maintain data locality and prevent excessive page faults to the extent possible. The application * can also guide the driver about memory usage patterns via ::cudaMemAdvise. The application * can also explicitly migrate memory to a desired processor's memory via * ::cudaMemPrefetchAsync. * * In a multi-GPU system where all of the GPUs have a zero value for the device attribute * ::cudaDevAttrConcurrentManagedAccess and all the GPUs have peer-to-peer support * with each other, the physical storage for managed memory is created on the GPU which is active * at the time ::cudaMallocManaged is called. All other GPUs will reference the data at reduced * bandwidth via peer mappings over the PCIe bus. The Unified Memory driver does not migrate * memory among such GPUs. * * In a multi-GPU system where not all GPUs have peer-to-peer support with each other and * where the value of the device attribute ::cudaDevAttrConcurrentManagedAccess * is zero for at least one of those GPUs, the location chosen for physical storage of managed * memory is system-dependent. * - On Linux, the location chosen will be device memory as long as the current set of active * contexts are on devices that either have peer-to-peer support with each other or have a * non-zero value for the device attribute ::cudaDevAttrConcurrentManagedAccess. * If there is an active context on a GPU that does not have a non-zero value for that device * attribute and it does not have peer-to-peer support with the other devices that have active * contexts on them, then the location for physical storage will be 'zero-copy' or host memory. * Note that this means that managed memory that is located in device memory is migrated to * host memory if a new context is created on a GPU that doesn't have a non-zero value for * the device attribute and does not support peer-to-peer with at least one of the other devices * that has an active context. This in turn implies that context creation may fail if there is * insufficient host memory to migrate all managed allocations. * - On Windows, the physical storage is always created in 'zero-copy' or host memory. * All GPUs will reference the data at reduced bandwidth over the PCIe bus. In these * circumstances, use of the environment variable CUDA_VISIBLE_DEVICES is recommended to * restrict CUDA to only use those GPUs that have peer-to-peer support. * Alternatively, users can also set CUDA_MANAGED_FORCE_DEVICE_ALLOC to a non-zero * value to force the driver to always use device memory for physical storage. * When this environment variable is set to a non-zero value, all devices used in * that process that support managed memory have to be peer-to-peer compatible * with each other. The error ::cudaErrorInvalidDevice will be returned if a device * that supports managed memory is used and it is not peer-to-peer compatible with * any of the other managed memory supporting devices that were previously used in * that process, even if ::cudaDeviceReset has been called on those devices. 
These * environment variables are described in the CUDA programming guide under the * "CUDA environment variables" section. * * @param devPtr - Pointer to allocated device memory * @param size - Requested allocation size in bytes * @param flags - Must be either ::cudaMemAttachGlobal or ::cudaMemAttachHost (defaults to ::cudaMemAttachGlobal) * * @return * ::cudaSuccess, * ::cudaErrorMemoryAllocation * ::cudaErrorNotSupported * ::cudaErrorInvalidValue * * \sa ::cudaMallocPitch, ::cudaFree, ::cudaMallocArray, ::cudaFreeArray, * ::cudaMalloc3D, ::cudaMalloc3DArray, * \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)", * ::cudaFreeHost, ::cudaHostAlloc, ::cudaDeviceGetAttribute, ::cudaStreamAttachMemAsync */ public static native @Cast("cudaError_t") int cudaMallocManaged(@Cast("void**") PointerPointer devPtr, @Cast("size_t") long size, @Cast("unsigned int") int flags/*=cudaMemAttachGlobal*/); public static native @Cast("cudaError_t") int cudaMallocManaged(@Cast("void**") @ByPtrPtr Pointer devPtr, @Cast("size_t") long size); public static native @Cast("cudaError_t") int cudaMallocManaged(@Cast("void**") @ByPtrPtr Pointer devPtr, @Cast("size_t") long size, @Cast("unsigned int") int flags/*=cudaMemAttachGlobal*/); /** * \brief Allocate memory on the device * * Allocates \p size bytes of linear memory on the device and returns in * \p *devPtr a pointer to the allocated memory. The allocated memory is * suitably aligned for any kind of variable. The memory is not cleared. * ::cudaMalloc() returns ::cudaErrorMemoryAllocation in case of failure. * * The device version of ::cudaFree cannot be used with a \p *devPtr * allocated using the host API, and vice versa. * * @param devPtr - Pointer to allocated device memory * @param size - Requested allocation size in bytes * * @return * ::cudaSuccess, * ::cudaErrorMemoryAllocation * * \sa ::cudaMallocPitch, ::cudaFree, ::cudaMallocArray, ::cudaFreeArray, * ::cudaMalloc3D, ::cudaMalloc3DArray, * \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)", * ::cudaFreeHost, ::cudaHostAlloc */ public static native @Cast("cudaError_t") int cudaMalloc(@Cast("void**") PointerPointer devPtr, @Cast("size_t") long size); public static native @Cast("cudaError_t") int cudaMalloc(@Cast("void**") @ByPtrPtr Pointer devPtr, @Cast("size_t") long size); /** * \brief Allocates page-locked memory on the host * * Allocates \p size bytes of host memory that is page-locked and accessible * to the device. The driver tracks the virtual memory ranges allocated with * this function and automatically accelerates calls to functions such as * ::cudaMemcpy*(). Since the memory can be accessed directly by the device, * it can be read or written with much higher bandwidth than pageable memory * obtained with functions such as ::malloc(). Allocating excessive amounts of * memory with ::cudaMallocHost() may degrade system performance, since it * reduces the amount of memory available to the system for paging. As a * result, this function is best used sparingly to allocate staging areas for * data exchange between host and device. 
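* * As an added illustration (not part of the original header), a minimal Java sketch that pairs a pinned host staging buffer with a plain device allocation through the bindings in this class; it assumes the Pointer-based overloads write the returned addresses back into the Pointer objects, which is JavaCPP's usual behavior for these signatures: *
 {@code
 // assumes: import static org.bytedeco.javacpp.cuda.*; import org.bytedeco.javacpp.*;
 long bytes = 1 << 20;                              // 1 MiB

 SizeTPointer free = new SizeTPointer(1), total = new SizeTPointer(1);
 cudaMemGetInfo(free, total);                       // optional: check available device memory first

 Pointer devBuf  = new Pointer();                   // filled in by cudaMalloc
 Pointer hostBuf = new Pointer();                   // filled in by cudaMallocHost
 int err = cudaMalloc(devBuf, bytes);
 if (err == cudaSuccess) err = cudaMallocHost(hostBuf, bytes);
 if (err != cudaSuccess) throw new RuntimeException("CUDA error " + err);

 // ... stage data in hostBuf, copy it to devBuf, launch kernels ...

 cudaFreeHost(hostBuf);
 cudaFree(devBuf);
 }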
* * @param ptr - Pointer to allocated host memory * @param size - Requested allocation size in bytes * * @return * ::cudaSuccess, * ::cudaErrorMemoryAllocation * \notefnerr * * \sa ::cudaMalloc, ::cudaMallocPitch, ::cudaMallocArray, ::cudaMalloc3D, * ::cudaMalloc3DArray, ::cudaHostAlloc, ::cudaFree, ::cudaFreeArray, * \ref ::cudaMallocHost(void**, size_t, unsigned int) "cudaMallocHost (C++ API)", * ::cudaFreeHost, ::cudaHostAlloc */ public static native @Cast("cudaError_t") int cudaMallocHost(@Cast("void**") PointerPointer ptr, @Cast("size_t") long size); public static native @Cast("cudaError_t") int cudaMallocHost(@Cast("void**") @ByPtrPtr Pointer ptr, @Cast("size_t") long size); /** * \brief Allocates pitched memory on the device * * Allocates at least \p width (in bytes) * \p height bytes of linear memory * on the device and returns in \p *devPtr a pointer to the allocated memory. * The function may pad the allocation to ensure that corresponding pointers * in any given row will continue to meet the alignment requirements for * coalescing as the address is updated from row to row. The pitch returned in * \p *pitch by ::cudaMallocPitch() is the width in bytes of the allocation. * The intended usage of \p pitch is as a separate parameter of the allocation, * used to compute addresses within the 2D array. Given the row and column of * an array element of type \p T, the address is computed as: *
{@code
    T* pElement = (T*)((char*)BaseAddress + Row * pitch) + Column;
   }
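* * As an added illustration (not part of the original header), the same kind of pitched allocation driven from Java through the bindings in this class: *
 {@code
 // assumes: import static org.bytedeco.javacpp.cuda.*; import org.bytedeco.javacpp.*;
 long widthBytes = 512 * 4;                         // 512 floats per row
 long height = 256;

 Pointer devPtr = new Pointer();
 SizeTPointer pitch = new SizeTPointer(1);          // receives the padded row size in bytes
 int err = cudaMallocPitch(devPtr, pitch, widthBytes, height);
 if (err != cudaSuccess) throw new RuntimeException("CUDA error " + err);

 long rowPitch = pitch.get(0);                      // step between rows with rowPitch, not widthBytes
 // ... 2D copies and kernels should honor rowPitch ...
 cudaFree(devPtr);
 }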
* * For allocations of 2D arrays, it is recommended that programmers consider * performing pitch allocations using ::cudaMallocPitch(). Due to pitch * alignment restrictions in the hardware, this is especially true if the * application will be performing 2D memory copies between different regions * of device memory (whether linear memory or CUDA arrays). * * @param devPtr - Pointer to allocated pitched device memory * @param pitch - Pitch for allocation * @param width - Requested pitched allocation width (in bytes) * @param height - Requested pitched allocation height * * @return * ::cudaSuccess, * ::cudaErrorMemoryAllocation * \notefnerr * * \sa ::cudaMalloc, ::cudaFree, ::cudaMallocArray, ::cudaFreeArray, * \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)", * ::cudaFreeHost, ::cudaMalloc3D, ::cudaMalloc3DArray, * ::cudaHostAlloc */ public static native @Cast("cudaError_t") int cudaMallocPitch(@Cast("void**") PointerPointer devPtr, @Cast("size_t*") SizeTPointer pitch, @Cast("size_t") long width, @Cast("size_t") long height); public static native @Cast("cudaError_t") int cudaMallocPitch(@Cast("void**") @ByPtrPtr Pointer devPtr, @Cast("size_t*") SizeTPointer pitch, @Cast("size_t") long width, @Cast("size_t") long height); /** * \brief Allocate an array on the device * * Allocates a CUDA array according to the ::cudaChannelFormatDesc structure * \p desc and returns a handle to the new CUDA array in \p *array. * * The ::cudaChannelFormatDesc is defined as: *
{@code
    struct cudaChannelFormatDesc {
        int x, y, z, w;
        enum cudaChannelFormatKind f;
    };
    }
* where ::cudaChannelFormatKind is one of ::cudaChannelFormatKindSigned, * ::cudaChannelFormatKindUnsigned, or ::cudaChannelFormatKindFloat. * * The \p flags parameter enables different options to be specified that affect * the allocation, as follows. * - ::cudaArrayDefault: This flag's value is defined to be 0 and provides default array allocation * - ::cudaArraySurfaceLoadStore: Allocates an array that can be read from or written to using a surface reference * - ::cudaArrayTextureGather: This flag indicates that texture gather operations will be performed on the array. * * \p width and \p height must meet certain size requirements. See ::cudaMalloc3DArray() for more details. * * @param array - Pointer to allocated array in device memory * @param desc - Requested channel format * @param width - Requested array allocation width * @param height - Requested array allocation height * @param flags - Requested properties of allocated array * * @return * ::cudaSuccess, * ::cudaErrorMemoryAllocation * \notefnerr * * \sa ::cudaMalloc, ::cudaMallocPitch, ::cudaFree, ::cudaFreeArray, * \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)", * ::cudaFreeHost, ::cudaMalloc3D, ::cudaMalloc3DArray, * ::cudaHostAlloc */ public static native @Cast("cudaError_t") int cudaMallocArray(@ByPtrPtr cudaArray array, @Const cudaChannelFormatDesc desc, @Cast("size_t") long width, @Cast("size_t") long height/*=0*/, @Cast("unsigned int") int flags/*=0*/); public static native @Cast("cudaError_t") int cudaMallocArray(@ByPtrPtr cudaArray array, @Const cudaChannelFormatDesc desc, @Cast("size_t") long width); /** * \brief Frees memory on the device * * Frees the memory space pointed to by \p devPtr, which must have been * returned by a previous call to ::cudaMalloc() or ::cudaMallocPitch(). * Otherwise, or if ::cudaFree(\p devPtr) has already been called before, * an error is returned. If \p devPtr is 0, no operation is performed. * ::cudaFree() returns ::cudaErrorInvalidDevicePointer in case of failure. * * The device version of ::cudaFree cannot be used with a \p *devPtr * allocated using the host API, and vice versa. * * @param devPtr - Device pointer to memory to free * * @return * ::cudaSuccess, * ::cudaErrorInvalidDevicePointer, * ::cudaErrorInitializationError * \notefnerr * * \sa ::cudaMalloc, ::cudaMallocPitch, ::cudaMallocArray, ::cudaFreeArray, * \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)", * ::cudaFreeHost, ::cudaMalloc3D, ::cudaMalloc3DArray, * ::cudaHostAlloc */ public static native @Cast("cudaError_t") int cudaFree(Pointer devPtr); /** * \brief Frees page-locked memory * * Frees the memory space pointed to by \p hostPtr, which must have been * returned by a previous call to ::cudaMallocHost() or ::cudaHostAlloc(). * * @param ptr - Pointer to memory to free * * @return * ::cudaSuccess, * ::cudaErrorInitializationError * \notefnerr * * \sa ::cudaMalloc, ::cudaMallocPitch, ::cudaFree, ::cudaMallocArray, * ::cudaFreeArray, * \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)", * ::cudaMalloc3D, ::cudaMalloc3DArray, ::cudaHostAlloc */ public static native @Cast("cudaError_t") int cudaFreeHost(Pointer ptr); /** * \brief Frees an array on the device * * Frees the CUDA array \p array, which must have been * returned by a * previous call to ::cudaMallocArray(). If ::cudaFreeArray(\p array) has * already been called before, ::cudaErrorInvalidValue is returned. If * \p devPtr is 0, no operation is performed. 
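* * As an added illustration (not part of the original header), allocating a 2D array of float texels with ::cudaMallocArray and releasing it again with ::cudaFreeArray might look like this from Java; the cudaChannelFormatDesc accessors and the flag/enum constants used below are assumed to be the ones generated elsewhere in this class: *
 {@code
 // assumes: import static org.bytedeco.javacpp.cuda.*;
 cudaChannelFormatDesc desc = new cudaChannelFormatDesc();
 desc.x(32).y(0).z(0).w(0).f(cudaChannelFormatKindFloat);   // one 32-bit float channel

 cudaArray arr = new cudaArray();                   // filled in by cudaMallocArray
 int err = cudaMallocArray(arr, desc, 1024, 768, cudaArrayDefault);
 if (err != cudaSuccess) throw new RuntimeException("CUDA error " + err);

 // ... copy data in, bind to a texture or surface, launch kernels ...

 cudaFreeArray(arr);
 }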
* * @param array - Pointer to array to free * * @return * ::cudaSuccess, * ::cudaErrorInvalidValue, * ::cudaErrorInitializationError * \notefnerr * * \sa ::cudaMalloc, ::cudaMallocPitch, ::cudaFree, ::cudaMallocArray, * \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)", * ::cudaFreeHost, ::cudaHostAlloc */ public static native @Cast("cudaError_t") int cudaFreeArray(cudaArray array); /** * \brief Frees a mipmapped array on the device * * Frees the CUDA mipmapped array \p mipmappedArray, which must have been * returned by a previous call to ::cudaMallocMipmappedArray(). * If ::cudaFreeMipmappedArray(\p mipmappedArray) has already been called before, * ::cudaErrorInvalidValue is returned. * * @param mipmappedArray - Pointer to mipmapped array to free * * @return * ::cudaSuccess, * ::cudaErrorInvalidValue, * ::cudaErrorInitializationError * \notefnerr * * \sa ::cudaMalloc, ::cudaMallocPitch, ::cudaFree, ::cudaMallocArray, * \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)", * ::cudaFreeHost, ::cudaHostAlloc */ public static native @Cast("cudaError_t") int cudaFreeMipmappedArray(cudaMipmappedArray mipmappedArray); /** * \brief Allocates page-locked memory on the host * * Allocates \p size bytes of host memory that is page-locked and accessible * to the device. The driver tracks the virtual memory ranges allocated with * this function and automatically accelerates calls to functions such as * ::cudaMemcpy(). Since the memory can be accessed directly by the device, it * can be read or written with much higher bandwidth than pageable memory * obtained with functions such as ::malloc(). Allocating excessive amounts of * pinned memory may degrade system performance, since it reduces the amount * of memory available to the system for paging. As a result, this function is * best used sparingly to allocate staging areas for data exchange between host * and device. * * The \p flags parameter enables different options to be specified that affect * the allocation, as follows. * - ::cudaHostAllocDefault: This flag's value is defined to be 0 and causes * ::cudaHostAlloc() to emulate ::cudaMallocHost(). * - ::cudaHostAllocPortable: The memory returned by this call will be * considered as pinned memory by all CUDA contexts, not just the one that * performed the allocation. * - ::cudaHostAllocMapped: Maps the allocation into the CUDA address space. * The device pointer to the memory may be obtained by calling * ::cudaHostGetDevicePointer(). * - ::cudaHostAllocWriteCombined: Allocates the memory as write-combined (WC). * WC memory can be transferred across the PCI Express bus more quickly on some * system configurations, but cannot be read efficiently by most CPUs. WC * memory is a good option for buffers that will be written by the CPU and read * by the device via mapped pinned memory or host->device transfers. * * All of these flags are orthogonal to one another: a developer may allocate * memory that is portable, mapped and/or write-combined with no restrictions. * * ::cudaSetDeviceFlags() must have been called with the ::cudaDeviceMapHost * flag in order for the ::cudaHostAllocMapped flag to have any effect. * * The ::cudaHostAllocMapped flag may be specified on CUDA contexts for devices * that do not support mapped pinned memory. The failure is deferred to * ::cudaHostGetDevicePointer() because the memory may be mapped into other * CUDA contexts via the ::cudaHostAllocPortable flag. * * Memory allocated by this function must be freed with ::cudaFreeHost(). 
* * @param pHost - Device pointer to allocated memory * @param size - Requested allocation size in bytes * @param flags - Requested properties of allocated memory * * @return * ::cudaSuccess, * ::cudaErrorMemoryAllocation * \notefnerr * * \sa ::cudaSetDeviceFlags, * \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)", * ::cudaFreeHost */ public static native @Cast("cudaError_t") int cudaHostAlloc(@Cast("void**") PointerPointer pHost, @Cast("size_t") long size, @Cast("unsigned int") int flags); public static native @Cast("cudaError_t") int cudaHostAlloc(@Cast("void**") @ByPtrPtr Pointer pHost, @Cast("size_t") long size, @Cast("unsigned int") int flags); /** * \brief Registers an existing host memory range for use by CUDA * * Page-locks the memory range specified by \p ptr and \p size and maps it * for the device(s) as specified by \p flags. This memory range also is added * to the same tracking mechanism as ::cudaHostAlloc() to automatically accelerate * calls to functions such as ::cudaMemcpy(). Since the memory can be accessed * directly by the device, it can be read or written with much higher bandwidth * than pageable memory that has not been registered. Page-locking excessive * amounts of memory may degrade system performance, since it reduces the amount * of memory available to the system for paging. As a result, this function is * best used sparingly to register staging areas for data exchange between * host and device. * * The \p flags parameter enables different options to be specified that * affect the allocation, as follows. * * - ::cudaHostRegisterDefault: On a system with unified virtual addressing, * the memory will be both mapped and portable. On a system with no unified * virtual addressing, the memory will be neither mapped nor portable. * * - ::cudaHostRegisterPortable: The memory returned by this call will be * considered as pinned memory by all CUDA contexts, not just the one that * performed the allocation. * * - ::cudaHostRegisterMapped: Maps the allocation into the CUDA address * space. The device pointer to the memory may be obtained by calling * ::cudaHostGetDevicePointer(). * * - ::cudaHostRegisterIoMemory: The passed memory pointer is treated as * pointing to some memory-mapped I/O space, e.g. belonging to a * third-party PCIe device, and it will marked as non cache-coherent and * contiguous. * * All of these flags are orthogonal to one another: a developer may page-lock * memory that is portable or mapped with no restrictions. * * The CUDA context must have been created with the ::cudaMapHost flag in * order for the ::cudaHostRegisterMapped flag to have any effect. * * The ::cudaHostRegisterMapped flag may be specified on CUDA contexts for * devices that do not support mapped pinned memory. The failure is deferred * to ::cudaHostGetDevicePointer() because the memory may be mapped into * other CUDA contexts via the ::cudaHostRegisterPortable flag. * * For devices that have a non-zero value for the device attribute * ::cudaDevAttrCanUseHostPointerForRegisteredMem, the memory * can also be accessed from the device using the host pointer \p ptr. * The device pointer returned by ::cudaHostGetDevicePointer() may or may not * match the original host pointer \p ptr and depends on the devices visible to the * application. If all devices visible to the application have a non-zero value for the * device attribute, the device pointer returned by ::cudaHostGetDevicePointer() * will match the original pointer \p ptr. 
If any device visible to the application * has a zero value for the device attribute, the device pointer returned by * ::cudaHostGetDevicePointer() will not match the original host pointer \p ptr, * but it will be suitable for use on all devices provided Unified Virtual Addressing * is enabled. In such systems, it is valid to access the memory using either pointer * on devices that have a non-zero value for the device attribute. Note however that * such devices should access the memory using only one of the two pointers and not both. * * The memory page-locked by this function must be unregistered with ::cudaHostUnregister(). * * @param ptr - Host pointer to memory to page-lock * @param size - Size in bytes of the address range to page-lock * @param flags - Flags for allocation request * * @return * ::cudaSuccess, * ::cudaErrorInvalidValue, * ::cudaErrorMemoryAllocation, * ::cudaErrorHostMemoryAlreadyRegistered * \notefnerr * * \sa ::cudaHostUnregister, ::cudaHostGetFlags, ::cudaHostGetDevicePointer */ public static native @Cast("cudaError_t") int cudaHostRegister(Pointer ptr, @Cast("size_t") long size, @Cast("unsigned int") int flags); /** * \brief Unregisters a memory range that was registered with cudaHostRegister * * Unmaps the memory range whose base address is specified by \p ptr, and makes * it pageable again. * * The base address must be the same one specified to ::cudaHostRegister(). * * @param ptr - Host pointer to memory to unregister * * @return * ::cudaSuccess, * ::cudaErrorInvalidValue * \notefnerr * * \sa ::cudaHostRegister */ public static native @Cast("cudaError_t") int cudaHostUnregister(Pointer ptr); /** * \brief Passes back device pointer of mapped host memory allocated by * cudaHostAlloc or registered by cudaHostRegister * * Passes back the device pointer corresponding to the mapped, pinned host * buffer allocated by ::cudaHostAlloc() or registered by ::cudaHostRegister(). * * ::cudaHostGetDevicePointer() will fail if the ::cudaDeviceMapHost flag was * not specified before deferred context creation occurred, or if called on a * device that does not support mapped, pinned memory. * * For devices that have a non-zero value for the device attribute * ::cudaDevAttrCanUseHostPointerForRegisteredMem, the memory * can also be accessed from the device using the host pointer \p pHost. * The device pointer returned by ::cudaHostGetDevicePointer() may or may not * match the original host pointer \p pHost and depends on the devices visible to the * application. If all devices visible to the application have a non-zero value for the * device attribute, the device pointer returned by ::cudaHostGetDevicePointer() * will match the original pointer \p pHost. If any device visible to the application * has a zero value for the device attribute, the device pointer returned by * ::cudaHostGetDevicePointer() will not match the original host pointer \p pHost, * but it will be suitable for use on all devices provided Unified Virtual Addressing * is enabled. In such systems, it is valid to access the memory using either pointer * on devices that have a non-zero value for the device attribute. Note however that * such devices should access the memory using only one of the two pointers and not both. * * \p flags is provided for future releases. For now, it must be set to 0. 
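* * As an added illustration (not part of the original header), obtaining the device alias of a mapped, pinned host buffer from Java might look like this; it assumes ::cudaSetDeviceFlags with ::cudaDeviceMapHost has been called where the platform requires it: *
 {@code
 // Hypothetical sketch (assumes: import static org.bytedeco.javacpp.cuda.*; import org.bytedeco.javacpp.Pointer;)
 long bytes = 1 << 20;

 Pointer hostBuf = new Pointer();
 int err = cudaHostAlloc(hostBuf, bytes, cudaHostAllocMapped);
 if (err != cudaSuccess) throw new RuntimeException("CUDA error " + err);

 Pointer devAlias = new Pointer();                  // device-side view of the same buffer
 err = cudaHostGetDevicePointer(devAlias, hostBuf, 0);
 if (err != cudaSuccess) throw new RuntimeException("CUDA error " + err);

 // ... pass devAlias to kernels, read and write hostBuf from the CPU ...

 cudaFreeHost(hostBuf);
 }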
* * @param pDevice - Returned device pointer for mapped memory * @param pHost - Requested host pointer mapping * @param flags - Flags for extensions (must be 0 for now) * * @return * ::cudaSuccess, * ::cudaErrorInvalidValue, * ::cudaErrorMemoryAllocation * \notefnerr * * \sa ::cudaSetDeviceFlags, ::cudaHostAlloc */ public static native @Cast("cudaError_t") int cudaHostGetDevicePointer(@Cast("void**") PointerPointer pDevice, Pointer pHost, @Cast("unsigned int") int flags); public static native @Cast("cudaError_t") int cudaHostGetDevicePointer(@Cast("void**") @ByPtrPtr Pointer pDevice, Pointer pHost, @Cast("unsigned int") int flags); /** * \brief Passes back flags used to allocate pinned host memory allocated by * cudaHostAlloc * * ::cudaHostGetFlags() will fail if the input pointer does not * reside in an address range allocated by ::cudaHostAlloc(). * * @param pFlags - Returned flags word * @param pHost - Host pointer * * @return * ::cudaSuccess, * ::cudaErrorInvalidValue * \notefnerr * * \sa ::cudaHostAlloc */ public static native @Cast("cudaError_t") int cudaHostGetFlags(@Cast("unsigned int*") IntPointer pFlags, Pointer pHost); public static native @Cast("cudaError_t") int cudaHostGetFlags(@Cast("unsigned int*") IntBuffer pFlags, Pointer pHost); public static native @Cast("cudaError_t") int cudaHostGetFlags(@Cast("unsigned int*") int[] pFlags, Pointer pHost); /** * \brief Allocates logical 1D, 2D, or 3D memory objects on the device * * Allocates at least \p width * \p height * \p depth bytes of linear memory * on the device and returns a ::cudaPitchedPtr in which \p ptr is a pointer * to the allocated memory. The function may pad the allocation to ensure * hardware alignment requirements are met. The pitch returned in the \p pitch * field of \p pitchedDevPtr is the width in bytes of the allocation. * * The returned ::cudaPitchedPtr contains additional fields \p xsize and * \p ysize, the logical width and height of the allocation, which are * equivalent to the \p width and \p height \p extent parameters provided by * the programmer during allocation. * * For allocations of 2D and 3D objects, it is highly recommended that * programmers perform allocations using ::cudaMalloc3D() or * ::cudaMallocPitch(). Due to alignment restrictions in the hardware, this is * especially true if the application will be performing memory copies * involving 2D or 3D objects (whether linear memory or CUDA arrays). * * @param pitchedDevPtr - Pointer to allocated pitched device memory * @param extent - Requested allocation size (\p width field in bytes) * * @return * ::cudaSuccess, * ::cudaErrorMemoryAllocation * \notefnerr * * \sa ::cudaMallocPitch, ::cudaFree, ::cudaMemcpy3D, ::cudaMemset3D, * ::cudaMalloc3DArray, ::cudaMallocArray, ::cudaFreeArray, * \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)", * ::cudaFreeHost, ::cudaHostAlloc, ::make_cudaPitchedPtr, ::make_cudaExtent */ public static native @Cast("cudaError_t") int cudaMalloc3D(cudaPitchedPtr pitchedDevPtr, @ByVal cudaExtent extent); /** * \brief Allocate an array on the device * * Allocates a CUDA array according to the ::cudaChannelFormatDesc structure * \p desc and returns a handle to the new CUDA array in \p *array. * * The ::cudaChannelFormatDesc is defined as: *
{@code
    struct cudaChannelFormatDesc {
        int x, y, z, w;
        enum cudaChannelFormatKind f;
    };
    }
* where ::cudaChannelFormatKind is one of ::cudaChannelFormatKindSigned, * ::cudaChannelFormatKindUnsigned, or ::cudaChannelFormatKindFloat. * * ::cudaMalloc3DArray() can allocate the following: * * - A 1D array is allocated if the height and depth extents are both zero. * - A 2D array is allocated if only the depth extent is zero. * - A 3D array is allocated if all three extents are non-zero. * - A 1D layered CUDA array is allocated if only the height extent is zero and * the cudaArrayLayered flag is set. Each layer is a 1D array. The number of layers is * determined by the depth extent. * - A 2D layered CUDA array is allocated if all three extents are non-zero and * the cudaArrayLayered flag is set. Each layer is a 2D array. The number of layers is * determined by the depth extent. * - A cubemap CUDA array is allocated if all three extents are non-zero and the * cudaArrayCubemap flag is set. Width must be equal to height, and depth must be six. A cubemap is * a special type of 2D layered CUDA array, where the six layers represent the six faces of a cube. * The order of the six layers in memory is the same as that listed in ::cudaGraphicsCubeFace. * - A cubemap layered CUDA array is allocated if all three extents are non-zero, and both, * cudaArrayCubemap and cudaArrayLayered flags are set. Width must be equal to height, and depth must be * a multiple of six. A cubemap layered CUDA array is a special type of 2D layered CUDA array that consists * of a collection of cubemaps. The first six layers represent the first cubemap, the next six layers form * the second cubemap, and so on. * * * The \p flags parameter enables different options to be specified that affect * the allocation, as follows. * - ::cudaArrayDefault: This flag's value is defined to be 0 and provides default array allocation * - ::cudaArrayLayered: Allocates a layered CUDA array, with the depth extent indicating the number of layers * - ::cudaArrayCubemap: Allocates a cubemap CUDA array. Width must be equal to height, and depth must be six. * If the cudaArrayLayered flag is also set, depth must be a multiple of six. * - ::cudaArraySurfaceLoadStore: Allocates a CUDA array that could be read from or written to using a surface * reference. * - ::cudaArrayTextureGather: This flag indicates that texture gather operations will be performed on the CUDA * array. Texture gather can only be performed on 2D CUDA arrays. * * The width, height and depth extents must meet certain size requirements as listed in the following table. * All values are specified in elements. * * Note that 2D CUDA arrays have different size requirements if the ::cudaArrayTextureGather flag is set. In that * case, the valid range for (width, height, depth) is ((1,maxTexture2DGather[0]), (1,maxTexture2DGather[1]), 0). 
* * \xmlonly * CUDA array type: valid extents that must always be met {(width range in elements), (height range), (depth range)}, followed by the valid extents with cudaArraySurfaceLoadStore set: * - 1D: { (1,maxTexture1D), 0, 0 } / { (1,maxSurface1D), 0, 0 } * - 2D: { (1,maxTexture2D[0]), (1,maxTexture2D[1]), 0 } / { (1,maxSurface2D[0]), (1,maxSurface2D[1]), 0 } * - 3D: { (1,maxTexture3D[0]), (1,maxTexture3D[1]), (1,maxTexture3D[2]) } OR { (1,maxTexture3DAlt[0]), (1,maxTexture3DAlt[1]), (1,maxTexture3DAlt[2]) } / { (1,maxSurface3D[0]), (1,maxSurface3D[1]), (1,maxSurface3D[2]) } * - 1D Layered: { (1,maxTexture1DLayered[0]), 0, (1,maxTexture1DLayered[1]) } / { (1,maxSurface1DLayered[0]), 0, (1,maxSurface1DLayered[1]) } * - 2D Layered: { (1,maxTexture2DLayered[0]), (1,maxTexture2DLayered[1]), (1,maxTexture2DLayered[2]) } / { (1,maxSurface2DLayered[0]), (1,maxSurface2DLayered[1]), (1,maxSurface2DLayered[2]) } * - Cubemap: { (1,maxTextureCubemap), (1,maxTextureCubemap), 6 } / { (1,maxSurfaceCubemap), (1,maxSurfaceCubemap), 6 } * - Cubemap Layered: { (1,maxTextureCubemapLayered[0]), (1,maxTextureCubemapLayered[0]), (1,maxTextureCubemapLayered[1]) } / { (1,maxSurfaceCubemapLayered[0]), (1,maxSurfaceCubemapLayered[0]), (1,maxSurfaceCubemapLayered[1]) } *
* \endxmlonly * * @param array - Pointer to allocated array in device memory * @param desc - Requested channel format * @param extent - Requested allocation size (\p width field in elements) * @param flags - Flags for extensions * * @return * ::cudaSuccess, * ::cudaErrorMemoryAllocation * \notefnerr * * \sa ::cudaMalloc3D, ::cudaMalloc, ::cudaMallocPitch, ::cudaFree, * ::cudaFreeArray, * \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)", * ::cudaFreeHost, ::cudaHostAlloc, * ::make_cudaExtent */ public static native @Cast("cudaError_t") int cudaMalloc3DArray(@ByPtrPtr cudaArray array, @Const cudaChannelFormatDesc desc, @ByVal cudaExtent extent, @Cast("unsigned int") int flags/*=0*/); public static native @Cast("cudaError_t") int cudaMalloc3DArray(@ByPtrPtr cudaArray array, @Const cudaChannelFormatDesc desc, @ByVal cudaExtent extent); /** * \brief Allocate a mipmapped array on the device * * Allocates a CUDA mipmapped array according to the ::cudaChannelFormatDesc structure * \p desc and returns a handle to the new CUDA mipmapped array in \p *mipmappedArray. * \p numLevels specifies the number of mipmap levels to be allocated. This value is * clamped to the range [1, 1 + floor(log2(max(width, height, depth)))]. * * The ::cudaChannelFormatDesc is defined as: *
{@code
    struct cudaChannelFormatDesc {
        int x, y, z, w;
        enum cudaChannelFormatKind f;
    };
    }
* where ::cudaChannelFormatKind is one of ::cudaChannelFormatKindSigned, * ::cudaChannelFormatKindUnsigned, or ::cudaChannelFormatKindFloat. * * ::cudaMallocMipmappedArray() can allocate the following: * * - A 1D mipmapped array is allocated if the height and depth extents are both zero. * - A 2D mipmapped array is allocated if only the depth extent is zero. * - A 3D mipmapped array is allocated if all three extents are non-zero. * - A 1D layered CUDA mipmapped array is allocated if only the height extent is zero and * the cudaArrayLayered flag is set. Each layer is a 1D mipmapped array. The number of layers is * determined by the depth extent. * - A 2D layered CUDA mipmapped array is allocated if all three extents are non-zero and * the cudaArrayLayered flag is set. Each layer is a 2D mipmapped array. The number of layers is * determined by the depth extent. * - A cubemap CUDA mipmapped array is allocated if all three extents are non-zero and the * cudaArrayCubemap flag is set. Width must be equal to height, and depth must be six. * The order of the six layers in memory is the same as that listed in ::cudaGraphicsCubeFace. * - A cubemap layered CUDA mipmapped array is allocated if all three extents are non-zero, and both, * cudaArrayCubemap and cudaArrayLayered flags are set. Width must be equal to height, and depth must be * a multiple of six. A cubemap layered CUDA mipmapped array is a special type of 2D layered CUDA mipmapped * array that consists of a collection of cubemap mipmapped arrays. The first six layers represent the * first cubemap mipmapped array, the next six layers form the second cubemap mipmapped array, and so on. * * * The \p flags parameter enables different options to be specified that affect * the allocation, as follows. * - ::cudaArrayDefault: This flag's value is defined to be 0 and provides default mipmapped array allocation * - ::cudaArrayLayered: Allocates a layered CUDA mipmapped array, with the depth extent indicating the number of layers * - ::cudaArrayCubemap: Allocates a cubemap CUDA mipmapped array. Width must be equal to height, and depth must be six. * If the cudaArrayLayered flag is also set, depth must be a multiple of six. * - ::cudaArraySurfaceLoadStore: This flag indicates that individual mipmap levels of the CUDA mipmapped array * will be read from or written to using a surface reference. * - ::cudaArrayTextureGather: This flag indicates that texture gather operations will be performed on the CUDA * array. Texture gather can only be performed on 2D CUDA mipmapped arrays, and the gather operations are * performed only on the most detailed mipmap level. * * The width, height and depth extents must meet certain size requirements as listed in the following table. * All values are specified in elements. * * \xmlonly * * * * * * * CUDA array type * Valid extents {(width range in elements), (height range), (depth * range)} * * * * * 1D * { (1,maxTexture1DMipmap), 0, 0 } * * * 2D * { (1,maxTexture2DMipmap[0]), (1,maxTexture2DMipmap[1]), 0 } * * * 3D * { (1,maxTexture3D[0]), (1,maxTexture3D[1]), (1,maxTexture3D[2]) } * * * 1D Layered * { (1,maxTexture1DLayered[0]), 0, (1,maxTexture1DLayered[1]) } * * * 2D Layered * { (1,maxTexture2DLayered[0]), (1,maxTexture2DLayered[1]), * (1,maxTexture2DLayered[2]) } * * * Cubemap * { (1,maxTextureCubemap), (1,maxTextureCubemap), 6 } * * * Cubemap Layered * { (1,maxTextureCubemapLayered[0]), (1,maxTextureCubemapLayered[0]), * (1,maxTextureCubemapLayered[1]) } * * * *
* \endxmlonly * * @param mipmappedArray - Pointer to allocated mipmapped array in device memory * @param desc - Requested channel format * @param extent - Requested allocation size (\p width field in elements) * @param numLevels - Number of mipmap levels to allocate * @param flags - Flags for extensions * * @return * ::cudaSuccess, * ::cudaErrorMemoryAllocation * \notefnerr * * \sa ::cudaMalloc3D, ::cudaMalloc, ::cudaMallocPitch, ::cudaFree, * ::cudaFreeArray, * \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)", * ::cudaFreeHost, ::cudaHostAlloc, * ::make_cudaExtent */ public static native @Cast("cudaError_t") int cudaMallocMipmappedArray(@ByPtrPtr cudaMipmappedArray mipmappedArray, @Const cudaChannelFormatDesc desc, @ByVal cudaExtent extent, @Cast("unsigned int") int numLevels, @Cast("unsigned int") int flags/*=0*/); public static native @Cast("cudaError_t") int cudaMallocMipmappedArray(@ByPtrPtr cudaMipmappedArray mipmappedArray, @Const cudaChannelFormatDesc desc, @ByVal cudaExtent extent, @Cast("unsigned int") int numLevels); /** * \brief Gets a mipmap level of a CUDA mipmapped array * * Returns in \p *levelArray a CUDA array that represents a single mipmap level * of the CUDA mipmapped array \p mipmappedArray. * * If \p level is greater than the maximum number of levels in this mipmapped array, * ::cudaErrorInvalidValue is returned. * * @param levelArray - Returned mipmap level CUDA array * @param mipmappedArray - CUDA mipmapped array * @param level - Mipmap level * * @return * ::cudaSuccess, * ::cudaErrorInvalidValue * \notefnerr * * \sa ::cudaMalloc3D, ::cudaMalloc, ::cudaMallocPitch, ::cudaFree, * ::cudaFreeArray, * \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)", * ::cudaFreeHost, ::cudaHostAlloc, * ::make_cudaExtent */ public static native @Cast("cudaError_t") int cudaGetMipmappedArrayLevel(@ByPtrPtr cudaArray levelArray, cudaMipmappedArray mipmappedArray, @Cast("unsigned int") int level); /** * \brief Copies data between 3D objects *
{@code
struct cudaExtent {
  size_t width;
  size_t height;
  size_t depth;
};
struct cudaExtent make_cudaExtent(size_t w, size_t h, size_t d);

struct cudaPos {
  size_t x;
  size_t y;
  size_t z;
};
struct cudaPos make_cudaPos(size_t x, size_t y, size_t z);

struct cudaMemcpy3DParms {
  cudaArray_t           srcArray;
  struct cudaPos        srcPos;
  struct cudaPitchedPtr srcPtr;
  cudaArray_t           dstArray;
  struct cudaPos        dstPos;
  struct cudaPitchedPtr dstPtr;
  struct cudaExtent     extent;
  enum cudaMemcpyKind   kind;
};
}
* * ::cudaMemcpy3D() copies data between two 3D objects. The source and * destination objects may be in either host memory, device memory, or a CUDA * array. The source, destination, extent, and kind of copy performed are * specified by the ::cudaMemcpy3DParms struct which should be initialized to * zero before use:
{@code
cudaMemcpy3DParms myParms = {0};
}
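* * As an added illustration (not part of the original header), a host-to-device 3D copy set up from Java through the bindings in this class; the struct accessors used below (width(), ptr(), srcPtr(), kind(), and so on) are assumed to be the ones JavaCPP generates for these structs: *
 {@code
 // Hypothetical sketch (assumes: import static org.bytedeco.javacpp.cuda.*; import org.bytedeco.javacpp.*;)
 long w = 256, h = 64, d = 16;                      // transfer extent: bytes x rows x slices
 cudaExtent extent = new cudaExtent(); extent.width(w).height(h).depth(d);

 cudaPitchedPtr devPitched = new cudaPitchedPtr();
 int err = cudaMalloc3D(devPitched, extent);        // pitched device allocation
 if (err != cudaSuccess) throw new RuntimeException("CUDA error " + err);

 Pointer hostBuf = new Pointer();
 cudaMallocHost(hostBuf, w * h * d);                // tightly packed pinned host buffer

 cudaPitchedPtr hostPitched = new cudaPitchedPtr();
 hostPitched.ptr(hostBuf).pitch(w).xsize(w).ysize(h);

 cudaPos zeroPos = new cudaPos(); zeroPos.x(0).y(0).z(0);

 cudaMemcpy3DParms parms = new cudaMemcpy3DParms();
 parms.srcArray(null).dstArray(null);               // using pitched pointers, not CUDA arrays
 parms.srcPos(zeroPos).dstPos(zeroPos);
 parms.srcPtr(hostPitched).dstPtr(devPitched);
 parms.extent(extent).kind(cudaMemcpyHostToDevice);
 err = cudaMemcpy3D(parms);
 if (err != cudaSuccess) throw new RuntimeException("CUDA error " + err);
 }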
* * The struct passed to ::cudaMemcpy3D() must specify one of \p srcArray or * \p srcPtr and one of \p dstArray or \p dstPtr. Passing more than one * non-zero source or destination will cause ::cudaMemcpy3D() to return an * error. * * The \p srcPos and \p dstPos fields are optional offsets into the source and * destination objects and are defined in units of each object's elements. The * element for a host or device pointer is assumed to be unsigned char. * For CUDA arrays, positions must be in the range [0, 2048) for any * dimension. * * The \p extent field defines the dimensions of the transferred area in * elements. If a CUDA array is participating in the copy, the extent is * defined in terms of that array's elements. If no CUDA array is * participating in the copy then the extents are defined in elements of * unsigned char. * * The \p kind field defines the direction of the copy. It must be one of * ::cudaMemcpyHostToHost, ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost, * ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. Passing * ::cudaMemcpyDefault is recommended, in which case the type of transfer is * inferred from the pointer values. However, ::cudaMemcpyDefault is only * allowed on systems that support unified virtual addressing. * * If the source and destination are both arrays, ::cudaMemcpy3D() will return * an error if they do not have the same element size. * * The source and destination object may not overlap. If overlapping source * and destination objects are specified, undefined behavior will result. * * The source object must lie entirely within the region defined by \p srcPos * and \p extent. The destination object must lie entirely within the region * defined by \p dstPos and \p extent. * * ::cudaMemcpy3D() returns an error if the pitch of \p srcPtr or \p dstPtr * exceeds the maximum allowed. The pitch of a ::cudaPitchedPtr allocated * with ::cudaMalloc3D() will always be valid. * * @param p - 3D memory copy parameters * * @return * ::cudaSuccess, * ::cudaErrorInvalidValue, * ::cudaErrorInvalidDevicePointer, * ::cudaErrorInvalidPitchValue, * ::cudaErrorInvalidMemcpyDirection * \notefnerr * \note_sync * * \sa ::cudaMalloc3D, ::cudaMalloc3DArray, ::cudaMemset3D, ::cudaMemcpy3DAsync, * ::cudaMemcpy, ::cudaMemcpy2D, ::cudaMemcpyToArray, * ::cudaMemcpy2DToArray, ::cudaMemcpyFromArray, ::cudaMemcpy2DFromArray, * ::cudaMemcpyArrayToArray, ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol, * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync, * ::cudaMemcpyToArrayAsync, ::cudaMemcpy2DToArrayAsync, * ::cudaMemcpyFromArrayAsync, ::cudaMemcpy2DFromArrayAsync, * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync, * ::make_cudaExtent, ::make_cudaPos */ public static native @Cast("cudaError_t") int cudaMemcpy3D(@Const cudaMemcpy3DParms p); /** * \brief Copies memory between devices * * Perform a 3D memory copy according to the parameters specified in * \p p. See the definition of the ::cudaMemcpy3DPeerParms structure * for documentation of its parameters. * * Note that this function is synchronous with respect to the host only if * the source or destination of the transfer is host memory. Note also * that this copy is serialized with respect to all pending and future * asynchronous work in to the current device, the copy's source device, * and the copy's destination device (use ::cudaMemcpy3DPeerAsync to avoid * this synchronization). 
* * @param p - Parameters for the memory copy * * @return * ::cudaSuccess, * ::cudaErrorInvalidValue, * ::cudaErrorInvalidDevice * \notefnerr * \note_sync * * \sa ::cudaMemcpy, ::cudaMemcpyPeer, ::cudaMemcpyAsync, ::cudaMemcpyPeerAsync, * ::cudaMemcpy3DPeerAsync */ public static native @Cast("cudaError_t") int cudaMemcpy3DPeer(@Const cudaMemcpy3DPeerParms p); /** * \brief Copies data between 3D objects *
{@code
struct cudaExtent {
  size_t width;
  size_t height;
  size_t depth;
};
struct cudaExtent make_cudaExtent(size_t w, size_t h, size_t d);

struct cudaPos {
  size_t x;
  size_t y;
  size_t z;
};
struct cudaPos make_cudaPos(size_t x, size_t y, size_t z);

struct cudaMemcpy3DParms {
  cudaArray_t           srcArray;
  struct cudaPos        srcPos;
  struct cudaPitchedPtr srcPtr;
  cudaArray_t           dstArray;
  struct cudaPos        dstPos;
  struct cudaPitchedPtr dstPtr;
  struct cudaExtent     extent;
  enum cudaMemcpyKind   kind;
};
}
 *
 * ::cudaMemcpy3DAsync() copies data between two 3D objects. The source and
 * destination objects may be in either host memory, device memory, or a CUDA
 * array. The source, destination, extent, and kind of copy performed are
 * specified by the ::cudaMemcpy3DParms struct, which should be initialized to
 * zero before use:
{@code
cudaMemcpy3DParms myParms = {0};
}
* * The struct passed to ::cudaMemcpy3DAsync() must specify one of \p srcArray * or \p srcPtr and one of \p dstArray or \p dstPtr. Passing more than one * non-zero source or destination will cause ::cudaMemcpy3DAsync() to return an * error. * * The \p srcPos and \p dstPos fields are optional offsets into the source and * destination objects and are defined in units of each object's elements. The * element for a host or device pointer is assumed to be unsigned char. * For CUDA arrays, positions must be in the range [0, 2048) for any * dimension. * * The \p extent field defines the dimensions of the transferred area in * elements. If a CUDA array is participating in the copy, the extent is * defined in terms of that array's elements. If no CUDA array is * participating in the copy then the extents are defined in elements of * unsigned char. * * The \p kind field defines the direction of the copy. It must be one of * ::cudaMemcpyHostToHost, ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost, * ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. Passing * ::cudaMemcpyDefault is recommended, in which case the type of transfer is * inferred from the pointer values. However, ::cudaMemcpyDefault is only * allowed on systems that support unified virtual addressing. * * If the source and destination are both arrays, ::cudaMemcpy3DAsync() will * return an error if they do not have the same element size. * * The source and destination object may not overlap. If overlapping source * and destination objects are specified, undefined behavior will result. * * The source object must lie entirely within the region defined by \p srcPos * and \p extent. The destination object must lie entirely within the region * defined by \p dstPos and \p extent. * * ::cudaMemcpy3DAsync() returns an error if the pitch of \p srcPtr or * \p dstPtr exceeds the maximum allowed. The pitch of a * ::cudaPitchedPtr allocated with ::cudaMalloc3D() will always be valid. * * ::cudaMemcpy3DAsync() is asynchronous with respect to the host, so * the call may return before the copy is complete. The copy can optionally * be associated to a stream by passing a non-zero \p stream argument. If * \p kind is ::cudaMemcpyHostToDevice or ::cudaMemcpyDeviceToHost and \p stream * is non-zero, the copy may overlap with operations in other streams. * * The device version of this function only handles device to device copies and * cannot be given local or shared pointers. 
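 *
 * A minimal JavaCPP usage sketch (hypothetical names; assumes {@code devPitched} is the
 * ::cudaPitchedPtr filled in by ::cudaMalloc3D for a matching extent, {@code hostPtr}
 * points to a host buffer of at least 64 x 32 x 8 bytes, {@code stream} was created with
 * ::cudaStreamCreate, a newly allocated parameter struct starts out zeroed, and the
 * generated accessors mirror the C struct fields; error checking omitted):
 * {@code
 * cudaMemcpy3DParms parms = new cudaMemcpy3DParms();
 * // Fields not set below are assumed to remain zero/null.
 * parms.srcPtr(make_cudaPitchedPtr(hostPtr, 64, 64, 32));   // source: plain host memory
 * parms.dstPtr(devPitched);                                 // destination: pitched device memory
 * parms.extent(make_cudaExtent(64, 32, 8));                 // width in bytes
 * parms.kind(cudaMemcpyHostToDevice);
 * int err = cudaMemcpy3DAsync(parms, stream);
 * }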
* * @param p - 3D memory copy parameters * @param stream - Stream identifier * * @return * ::cudaSuccess, * ::cudaErrorInvalidValue, * ::cudaErrorInvalidDevicePointer, * ::cudaErrorInvalidPitchValue, * ::cudaErrorInvalidMemcpyDirection * \notefnerr * \note_async * \note_null_stream * * \sa ::cudaMalloc3D, ::cudaMalloc3DArray, ::cudaMemset3D, ::cudaMemcpy3D, * ::cudaMemcpy, ::cudaMemcpy2D, ::cudaMemcpyToArray, * ::cudaMemcpy2DToArray, ::cudaMemcpyFromArray, ::cudaMemcpy2DFromArray, * ::cudaMemcpyArrayToArray, ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol, * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync, * ::cudaMemcpyToArrayAsync, ::cudaMemcpy2DToArrayAsync, * ::cudaMemcpyFromArrayAsync, ::cudaMemcpy2DFromArrayAsync, * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync, * ::make_cudaExtent, ::make_cudaPos */ public static native @Cast("cudaError_t") int cudaMemcpy3DAsync(@Const cudaMemcpy3DParms p, CUstream_st stream/*=0*/); public static native @Cast("cudaError_t") int cudaMemcpy3DAsync(@Const cudaMemcpy3DParms p); /** * \brief Copies memory between devices asynchronously. * * Perform a 3D memory copy according to the parameters specified in * \p p. See the definition of the ::cudaMemcpy3DPeerParms structure * for documentation of its parameters. * * @param p - Parameters for the memory copy * @param stream - Stream identifier * * @return * ::cudaSuccess, * ::cudaErrorInvalidValue, * ::cudaErrorInvalidDevice * \notefnerr * \note_async * \note_null_stream * * \sa ::cudaMemcpy, ::cudaMemcpyPeer, ::cudaMemcpyAsync, ::cudaMemcpyPeerAsync, * ::cudaMemcpy3DPeerAsync */ public static native @Cast("cudaError_t") int cudaMemcpy3DPeerAsync(@Const cudaMemcpy3DPeerParms p, CUstream_st stream/*=0*/); public static native @Cast("cudaError_t") int cudaMemcpy3DPeerAsync(@Const cudaMemcpy3DPeerParms p); /** * \brief Gets free and total device memory * * Returns in \p *free and \p *total respectively, the free and total amount of * memory available for allocation by the device in bytes. * * @param free - Returned free memory in bytes * @param total - Returned total memory in bytes * * @return * ::cudaSuccess, * ::cudaErrorInitializationError, * ::cudaErrorInvalidValue, * ::cudaErrorLaunchFailure * \notefnerr * */ public static native @Cast("cudaError_t") int cudaMemGetInfo(@Cast("size_t*") SizeTPointer free, @Cast("size_t*") SizeTPointer total); /** * \brief Gets info about the specified cudaArray * * Returns in \p *desc, \p *extent and \p *flags respectively, the type, shape * and flags of \p array. * * Any of \p *desc, \p *extent and \p *flags may be specified as NULL. * * @param desc - Returned array type * @param extent - Returned array shape. 
2D arrays will have depth of zero * @param flags - Returned array flags * @param array - The ::cudaArray to get info for * * @return * ::cudaSuccess, * ::cudaErrorInvalidValue * \notefnerr * */ public static native @Cast("cudaError_t") int cudaArrayGetInfo(cudaChannelFormatDesc desc, cudaExtent extent, @Cast("unsigned int*") IntPointer flags, cudaArray array); public static native @Cast("cudaError_t") int cudaArrayGetInfo(cudaChannelFormatDesc desc, cudaExtent extent, @Cast("unsigned int*") IntBuffer flags, cudaArray array); public static native @Cast("cudaError_t") int cudaArrayGetInfo(cudaChannelFormatDesc desc, cudaExtent extent, @Cast("unsigned int*") int[] flags, cudaArray array); /** * \brief Copies data between host and device * * Copies \p count bytes from the memory area pointed to by \p src to the * memory area pointed to by \p dst, where \p kind specifies the direction * of the copy, and must be one of ::cudaMemcpyHostToHost, * ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost, * ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. Passing * ::cudaMemcpyDefault is recommended, in which case the type of transfer is * inferred from the pointer values. However, ::cudaMemcpyDefault is only * allowed on systems that support unified virtual addressing. Calling * ::cudaMemcpy() with dst and src pointers that do not match the direction of * the copy results in an undefined behavior. * * @param dst - Destination memory address * @param src - Source memory address * @param count - Size in bytes to copy * @param kind - Type of transfer * * @return * ::cudaSuccess, * ::cudaErrorInvalidValue, * ::cudaErrorInvalidDevicePointer, * ::cudaErrorInvalidMemcpyDirection * \notefnerr * * \note_sync * * \sa ::cudaMemcpy2D, ::cudaMemcpyToArray, * ::cudaMemcpy2DToArray, ::cudaMemcpyFromArray, ::cudaMemcpy2DFromArray, * ::cudaMemcpyArrayToArray, ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol, * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync, * ::cudaMemcpyToArrayAsync, ::cudaMemcpy2DToArrayAsync, * ::cudaMemcpyFromArrayAsync, ::cudaMemcpy2DFromArrayAsync, * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync */ public static native @Cast("cudaError_t") int cudaMemcpy(Pointer dst, @Const Pointer src, @Cast("size_t") long count, @Cast("cudaMemcpyKind") int kind); /** * \brief Copies memory between two devices * * Copies memory from one device to memory on another device. \p dst is the * base device pointer of the destination memory and \p dstDevice is the * destination device. \p src is the base device pointer of the source memory * and \p srcDevice is the source device. \p count specifies the number of bytes * to copy. * * Note that this function is asynchronous with respect to the host, but * serialized with respect all pending and future asynchronous work in to the * current device, \p srcDevice, and \p dstDevice (use ::cudaMemcpyPeerAsync * to avoid this synchronization). 
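 *
 * A minimal JavaCPP usage sketch (hypothetical names; assumes {@code dstOn1} and
 * {@code srcOn0} are device allocations of at least 1024 bytes on devices 1 and 0,
 * obtained earlier with ::cudaMalloc; error checking omitted):
 * {@code
 * // Copy 1024 bytes from the buffer on device 0 to the buffer on device 1.
 * int err = cudaMemcpyPeer(dstOn1, 1, srcOn0, 0, 1024);
 * }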
* * @param dst - Destination device pointer * @param dstDevice - Destination device * @param src - Source device pointer * @param srcDevice - Source device * @param count - Size of memory copy in bytes * * @return * ::cudaSuccess, * ::cudaErrorInvalidValue, * ::cudaErrorInvalidDevice * \notefnerr * \note_sync * * \sa ::cudaMemcpy, ::cudaMemcpyAsync, ::cudaMemcpyPeerAsync, * ::cudaMemcpy3DPeerAsync */ public static native @Cast("cudaError_t") int cudaMemcpyPeer(Pointer dst, int dstDevice, @Const Pointer src, int srcDevice, @Cast("size_t") long count); /** * \brief Copies data between host and device * * Copies \p count bytes from the memory area pointed to by \p src to the * CUDA array \p dst starting at the upper left corner * (\p wOffset, \p hOffset), where \p kind specifies the direction * of the copy, and must be one of ::cudaMemcpyHostToHost, * ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost, * ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. Passing * ::cudaMemcpyDefault is recommended, in which case the type of transfer is * inferred from the pointer values. However, ::cudaMemcpyDefault is only * allowed on systems that support unified virtual addressing. * * @param dst - Destination memory address * @param wOffset - Destination starting X offset * @param hOffset - Destination starting Y offset * @param src - Source memory address * @param count - Size in bytes to copy * @param kind - Type of transfer * * @return * ::cudaSuccess, * ::cudaErrorInvalidValue, * ::cudaErrorInvalidDevicePointer, * ::cudaErrorInvalidMemcpyDirection * \notefnerr * \note_sync * * \sa ::cudaMemcpy, ::cudaMemcpy2D, * ::cudaMemcpy2DToArray, ::cudaMemcpyFromArray, ::cudaMemcpy2DFromArray, * ::cudaMemcpyArrayToArray, ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol, * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync, * ::cudaMemcpyToArrayAsync, ::cudaMemcpy2DToArrayAsync, * ::cudaMemcpyFromArrayAsync, ::cudaMemcpy2DFromArrayAsync, * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync */ public static native @Cast("cudaError_t") int cudaMemcpyToArray(cudaArray dst, @Cast("size_t") long wOffset, @Cast("size_t") long hOffset, @Const Pointer src, @Cast("size_t") long count, @Cast("cudaMemcpyKind") int kind); /** * \brief Copies data between host and device * * Copies \p count bytes from the CUDA array \p src starting at the upper * left corner (\p wOffset, hOffset) to the memory area pointed to by \p dst, * where \p kind specifies the direction of the copy, and must be one of * ::cudaMemcpyHostToHost, ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost, * ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. Passing * ::cudaMemcpyDefault is recommended, in which case the type of transfer is * inferred from the pointer values. However, ::cudaMemcpyDefault is only * allowed on systems that support unified virtual addressing. 
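 *
 * A minimal JavaCPP usage sketch (hypothetical names; assumes {@code array} is a
 * ::cudaArray created earlier, e.g. with ::cudaMallocArray, that is large enough to
 * supply 4096 bytes starting at its upper left corner; error checking omitted):
 * {@code
 * BytePointer host = new BytePointer(4096);
 * // Read 4096 bytes from the array starting at (wOffset, hOffset) = (0, 0).
 * int err = cudaMemcpyFromArray(host, array, 0, 0, 4096, cudaMemcpyDeviceToHost);
 * }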
* * @param dst - Destination memory address * @param src - Source memory address * @param wOffset - Source starting X offset * @param hOffset - Source starting Y offset * @param count - Size in bytes to copy * @param kind - Type of transfer * * @return * ::cudaSuccess, * ::cudaErrorInvalidValue, * ::cudaErrorInvalidDevicePointer, * ::cudaErrorInvalidMemcpyDirection * \notefnerr * \note_sync * * \sa ::cudaMemcpy, ::cudaMemcpy2D, ::cudaMemcpyToArray, * ::cudaMemcpy2DToArray, ::cudaMemcpy2DFromArray, * ::cudaMemcpyArrayToArray, ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol, * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync, * ::cudaMemcpyToArrayAsync, ::cudaMemcpy2DToArrayAsync, * ::cudaMemcpyFromArrayAsync, ::cudaMemcpy2DFromArrayAsync, * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync */ public static native @Cast("cudaError_t") int cudaMemcpyFromArray(Pointer dst, cudaArray src, @Cast("size_t") long wOffset, @Cast("size_t") long hOffset, @Cast("size_t") long count, @Cast("cudaMemcpyKind") int kind); /** * \brief Copies data between host and device * * Copies \p count bytes from the CUDA array \p src starting at the upper * left corner (\p wOffsetSrc, \p hOffsetSrc) to the CUDA array \p dst * starting at the upper left corner (\p wOffsetDst, \p hOffsetDst) where * \p kind specifies the direction of the copy, and must be one of * ::cudaMemcpyHostToHost, ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost, * ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. Passing * ::cudaMemcpyDefault is recommended, in which case the type of transfer is * inferred from the pointer values. However, ::cudaMemcpyDefault is only * allowed on systems that support unified virtual addressing. * * @param dst - Destination memory address * @param wOffsetDst - Destination starting X offset * @param hOffsetDst - Destination starting Y offset * @param src - Source memory address * @param wOffsetSrc - Source starting X offset * @param hOffsetSrc - Source starting Y offset * @param count - Size in bytes to copy * @param kind - Type of transfer * * @return * ::cudaSuccess, * ::cudaErrorInvalidValue, * ::cudaErrorInvalidMemcpyDirection * \notefnerr * * \sa ::cudaMemcpy, ::cudaMemcpy2D, ::cudaMemcpyToArray, * ::cudaMemcpy2DToArray, ::cudaMemcpyFromArray, ::cudaMemcpy2DFromArray, * ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol, * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync, * ::cudaMemcpyToArrayAsync, ::cudaMemcpy2DToArrayAsync, * ::cudaMemcpyFromArrayAsync, ::cudaMemcpy2DFromArrayAsync, * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync */ public static native @Cast("cudaError_t") int cudaMemcpyArrayToArray(cudaArray dst, @Cast("size_t") long wOffsetDst, @Cast("size_t") long hOffsetDst, cudaArray src, @Cast("size_t") long wOffsetSrc, @Cast("size_t") long hOffsetSrc, @Cast("size_t") long count, @Cast("cudaMemcpyKind") int kind/*=cudaMemcpyDeviceToDevice*/); public static native @Cast("cudaError_t") int cudaMemcpyArrayToArray(cudaArray dst, @Cast("size_t") long wOffsetDst, @Cast("size_t") long hOffsetDst, cudaArray src, @Cast("size_t") long wOffsetSrc, @Cast("size_t") long hOffsetSrc, @Cast("size_t") long count); /** * \brief Copies data between host and device * * Copies a matrix (\p height rows of \p width bytes each) from the memory * area pointed to by \p src to the memory area pointed to by \p dst, where * \p kind specifies the direction of the copy, and must be one of * ::cudaMemcpyHostToHost, ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost, * 
::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. Passing * ::cudaMemcpyDefault is recommended, in which case the type of transfer is * inferred from the pointer values. However, ::cudaMemcpyDefault is only * allowed on systems that support unified virtual addressing. \p dpitch and * \p spitch are the widths in memory in bytes of the 2D arrays pointed to by * \p dst and \p src, including any padding added to the end of each row. The * memory areas may not overlap. \p width must not exceed either \p dpitch or * \p spitch. Calling ::cudaMemcpy2D() with \p dst and \p src pointers that do * not match the direction of the copy results in an undefined behavior. * ::cudaMemcpy2D() returns an error if \p dpitch or \p spitch exceeds * the maximum allowed. * * @param dst - Destination memory address * @param dpitch - Pitch of destination memory * @param src - Source memory address * @param spitch - Pitch of source memory * @param width - Width of matrix transfer (columns in bytes) * @param height - Height of matrix transfer (rows) * @param kind - Type of transfer * * @return * ::cudaSuccess, * ::cudaErrorInvalidValue, * ::cudaErrorInvalidPitchValue, * ::cudaErrorInvalidDevicePointer, * ::cudaErrorInvalidMemcpyDirection * \notefnerr * * \sa ::cudaMemcpy, ::cudaMemcpyToArray, * ::cudaMemcpy2DToArray, ::cudaMemcpyFromArray, ::cudaMemcpy2DFromArray, * ::cudaMemcpyArrayToArray, ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol, * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync, * ::cudaMemcpyToArrayAsync, ::cudaMemcpy2DToArrayAsync, * ::cudaMemcpyFromArrayAsync, ::cudaMemcpy2DFromArrayAsync, * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync */ public static native @Cast("cudaError_t") int cudaMemcpy2D(Pointer dst, @Cast("size_t") long dpitch, @Const Pointer src, @Cast("size_t") long spitch, @Cast("size_t") long width, @Cast("size_t") long height, @Cast("cudaMemcpyKind") int kind); /** * \brief Copies data between host and device * * Copies a matrix (\p height rows of \p width bytes each) from the memory * area pointed to by \p src to the CUDA array \p dst starting at the * upper left corner (\p wOffset, \p hOffset) where \p kind specifies the * direction of the copy, and must be one of ::cudaMemcpyHostToHost, * ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost, * ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. Passing * ::cudaMemcpyDefault is recommended, in which case the type of transfer is * inferred from the pointer values. However, ::cudaMemcpyDefault is only * allowed on systems that support unified virtual addressing. * \p spitch is the width in memory in bytes of the 2D array pointed to by * \p src, including any padding added to the end of each row. \p wOffset + * \p width must not exceed the width of the CUDA array \p dst. \p width must * not exceed \p spitch. ::cudaMemcpy2DToArray() returns an error if \p spitch * exceeds the maximum allowed. 
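 *
 * A minimal JavaCPP usage sketch (hypothetical names; assumes {@code array} is a
 * ::cudaArray at least 256 bytes wide and 128 rows tall, created earlier with
 * ::cudaMallocArray for a single-byte channel format; error checking omitted):
 * {@code
 * long width = 256, height = 128, spitch = 256;   // width and pitch in bytes
 * BytePointer host = new BytePointer(spitch * height);
 * int err = cudaMemcpy2DToArray(array, 0, 0, host, spitch, width, height,
 *                               cudaMemcpyHostToDevice);
 * }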
* * @param dst - Destination memory address * @param wOffset - Destination starting X offset * @param hOffset - Destination starting Y offset * @param src - Source memory address * @param spitch - Pitch of source memory * @param width - Width of matrix transfer (columns in bytes) * @param height - Height of matrix transfer (rows) * @param kind - Type of transfer * * @return * ::cudaSuccess, * ::cudaErrorInvalidValue, * ::cudaErrorInvalidDevicePointer, * ::cudaErrorInvalidPitchValue, * ::cudaErrorInvalidMemcpyDirection * \notefnerr * \note_sync * * \sa ::cudaMemcpy, ::cudaMemcpy2D, ::cudaMemcpyToArray, * ::cudaMemcpyFromArray, ::cudaMemcpy2DFromArray, * ::cudaMemcpyArrayToArray, ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol, * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync, * ::cudaMemcpyToArrayAsync, ::cudaMemcpy2DToArrayAsync, * ::cudaMemcpyFromArrayAsync, ::cudaMemcpy2DFromArrayAsync, * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync */ public static native @Cast("cudaError_t") int cudaMemcpy2DToArray(cudaArray dst, @Cast("size_t") long wOffset, @Cast("size_t") long hOffset, @Const Pointer src, @Cast("size_t") long spitch, @Cast("size_t") long width, @Cast("size_t") long height, @Cast("cudaMemcpyKind") int kind); /** * \brief Copies data between host and device * * Copies a matrix (\p height rows of \p width bytes each) from the CUDA * array \p srcArray starting at the upper left corner * (\p wOffset, \p hOffset) to the memory area pointed to by \p dst, where * \p kind specifies the direction of the copy, and must be one of * ::cudaMemcpyHostToHost, ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost, * ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. Passing * ::cudaMemcpyDefault is recommended, in which case the type of transfer is * inferred from the pointer values. However, ::cudaMemcpyDefault is only * allowed on systems that support unified virtual addressing. \p dpitch is the * width in memory in bytes of the 2D array pointed to by \p dst, including any * padding added to the end of each row. \p wOffset + \p width must not exceed * the width of the CUDA array \p src. \p width must not exceed \p dpitch. * ::cudaMemcpy2DFromArray() returns an error if \p dpitch exceeds the maximum * allowed. 
* * @param dst - Destination memory address * @param dpitch - Pitch of destination memory * @param src - Source memory address * @param wOffset - Source starting X offset * @param hOffset - Source starting Y offset * @param width - Width of matrix transfer (columns in bytes) * @param height - Height of matrix transfer (rows) * @param kind - Type of transfer * * @return * ::cudaSuccess, * ::cudaErrorInvalidValue, * ::cudaErrorInvalidDevicePointer, * ::cudaErrorInvalidPitchValue, * ::cudaErrorInvalidMemcpyDirection * \notefnerr * \note_sync * * \sa ::cudaMemcpy, ::cudaMemcpy2D, ::cudaMemcpyToArray, * ::cudaMemcpy2DToArray, ::cudaMemcpyFromArray, * ::cudaMemcpyArrayToArray, ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol, * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync, * ::cudaMemcpyToArrayAsync, ::cudaMemcpy2DToArrayAsync, * ::cudaMemcpyFromArrayAsync, ::cudaMemcpy2DFromArrayAsync, * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync */ public static native @Cast("cudaError_t") int cudaMemcpy2DFromArray(Pointer dst, @Cast("size_t") long dpitch, cudaArray src, @Cast("size_t") long wOffset, @Cast("size_t") long hOffset, @Cast("size_t") long width, @Cast("size_t") long height, @Cast("cudaMemcpyKind") int kind); /** * \brief Copies data between host and device * * Copies a matrix (\p height rows of \p width bytes each) from the CUDA * array \p srcArray starting at the upper left corner * (\p wOffsetSrc, \p hOffsetSrc) to the CUDA array \p dst starting at * the upper left corner (\p wOffsetDst, \p hOffsetDst), where \p kind * specifies the direction of the copy, and must be one of * ::cudaMemcpyHostToHost, ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost, * ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. Passing * ::cudaMemcpyDefault is recommended, in which case the type of transfer is * inferred from the pointer values. However, ::cudaMemcpyDefault is only * allowed on systems that support unified virtual addressing. * \p wOffsetDst + \p width must not exceed the width of the CUDA array \p dst. * \p wOffsetSrc + \p width must not exceed the width of the CUDA array \p src. 
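 *
 * A minimal JavaCPP usage sketch (hypothetical names; assumes {@code srcArray} and
 * {@code dstArray} are ::cudaArray objects with the same element size, each at least
 * 128 bytes wide and 64 rows tall; the trailing kind argument can be omitted in this
 * binding and defaults to ::cudaMemcpyDeviceToDevice; error checking omitted):
 * {@code
 * // Copy a 128-byte-wide, 64-row region between two CUDA arrays on the device.
 * int err = cudaMemcpy2DArrayToArray(dstArray, 0, 0, srcArray, 0, 0, 128, 64);
 * }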
* * @param dst - Destination memory address * @param wOffsetDst - Destination starting X offset * @param hOffsetDst - Destination starting Y offset * @param src - Source memory address * @param wOffsetSrc - Source starting X offset * @param hOffsetSrc - Source starting Y offset * @param width - Width of matrix transfer (columns in bytes) * @param height - Height of matrix transfer (rows) * @param kind - Type of transfer * * @return * ::cudaSuccess, * ::cudaErrorInvalidValue, * ::cudaErrorInvalidMemcpyDirection * \notefnerr * \note_sync * * \sa ::cudaMemcpy, ::cudaMemcpy2D, ::cudaMemcpyToArray, * ::cudaMemcpy2DToArray, ::cudaMemcpyFromArray, ::cudaMemcpy2DFromArray, * ::cudaMemcpyArrayToArray, ::cudaMemcpyToSymbol, * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync, * ::cudaMemcpyToArrayAsync, ::cudaMemcpy2DToArrayAsync, * ::cudaMemcpyFromArrayAsync, ::cudaMemcpy2DFromArrayAsync, * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync */ public static native @Cast("cudaError_t") int cudaMemcpy2DArrayToArray(cudaArray dst, @Cast("size_t") long wOffsetDst, @Cast("size_t") long hOffsetDst, cudaArray src, @Cast("size_t") long wOffsetSrc, @Cast("size_t") long hOffsetSrc, @Cast("size_t") long width, @Cast("size_t") long height, @Cast("cudaMemcpyKind") int kind/*=cudaMemcpyDeviceToDevice*/); public static native @Cast("cudaError_t") int cudaMemcpy2DArrayToArray(cudaArray dst, @Cast("size_t") long wOffsetDst, @Cast("size_t") long hOffsetDst, cudaArray src, @Cast("size_t") long wOffsetSrc, @Cast("size_t") long hOffsetSrc, @Cast("size_t") long width, @Cast("size_t") long height); /** * \brief Copies data to the given symbol on the device * * Copies \p count bytes from the memory area pointed to by \p src * to the memory area pointed to by \p offset bytes from the start of symbol * \p symbol. The memory areas may not overlap. \p symbol is a variable that * resides in global or constant memory space. \p kind can be either * ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. * Passing ::cudaMemcpyDefault is recommended, in which case the type of * transfer is inferred from the pointer values. However, ::cudaMemcpyDefault * is only allowed on systems that support unified virtual addressing. 
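 *
 * A minimal JavaCPP usage sketch (hypothetical names; assumes {@code symbol} is a
 * Pointer to a __device__ or __constant__ variable defined in device code linked into
 * the running application, and {@code hostData} is a host buffer of at least 64 bytes;
 * error checking omitted):
 * {@code
 * // Write 64 bytes into the device symbol, starting at byte offset 0.
 * int err = cudaMemcpyToSymbol(symbol, hostData, 64, 0, cudaMemcpyHostToDevice);
 * }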
* * @param symbol - Device symbol address * @param src - Source memory address * @param count - Size in bytes to copy * @param offset - Offset from start of symbol in bytes * @param kind - Type of transfer * * @return * ::cudaSuccess, * ::cudaErrorInvalidValue, * ::cudaErrorInvalidSymbol, * ::cudaErrorInvalidDevicePointer, * ::cudaErrorInvalidMemcpyDirection * \notefnerr * \note_sync * \note_string_api_deprecation * * \sa ::cudaMemcpy, ::cudaMemcpy2D, ::cudaMemcpyToArray, * ::cudaMemcpy2DToArray, ::cudaMemcpyFromArray, ::cudaMemcpy2DFromArray, * ::cudaMemcpyArrayToArray, ::cudaMemcpy2DArrayToArray, * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync, * ::cudaMemcpyToArrayAsync, ::cudaMemcpy2DToArrayAsync, * ::cudaMemcpyFromArrayAsync, ::cudaMemcpy2DFromArrayAsync, * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync */ public static native @Cast("cudaError_t") int cudaMemcpyToSymbol(@Const Pointer symbol, @Const Pointer src, @Cast("size_t") long count, @Cast("size_t") long offset/*=0*/, @Cast("cudaMemcpyKind") int kind/*=cudaMemcpyHostToDevice*/); public static native @Cast("cudaError_t") int cudaMemcpyToSymbol(@Const Pointer symbol, @Const Pointer src, @Cast("size_t") long count); /** * \brief Copies data from the given symbol on the device * * Copies \p count bytes from the memory area pointed to by \p offset bytes * from the start of symbol \p symbol to the memory area pointed to by \p dst. * The memory areas may not overlap. \p symbol is a variable that * resides in global or constant memory space. \p kind can be either * ::cudaMemcpyDeviceToHost, ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. * Passing ::cudaMemcpyDefault is recommended, in which case the type of * transfer is inferred from the pointer values. However, ::cudaMemcpyDefault * is only allowed on systems that support unified virtual addressing. * * @param dst - Destination memory address * @param symbol - Device symbol address * @param count - Size in bytes to copy * @param offset - Offset from start of symbol in bytes * @param kind - Type of transfer * * @return * ::cudaSuccess, * ::cudaErrorInvalidValue, * ::cudaErrorInvalidSymbol, * ::cudaErrorInvalidDevicePointer, * ::cudaErrorInvalidMemcpyDirection * \notefnerr * \note_sync * \note_string_api_deprecation * * \sa ::cudaMemcpy, ::cudaMemcpy2D, ::cudaMemcpyToArray, * ::cudaMemcpy2DToArray, ::cudaMemcpyFromArray, ::cudaMemcpy2DFromArray, * ::cudaMemcpyArrayToArray, ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol, * ::cudaMemcpyAsync, ::cudaMemcpy2DAsync, * ::cudaMemcpyToArrayAsync, ::cudaMemcpy2DToArrayAsync, * ::cudaMemcpyFromArrayAsync, ::cudaMemcpy2DFromArrayAsync, * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync */ public static native @Cast("cudaError_t") int cudaMemcpyFromSymbol(Pointer dst, @Const Pointer symbol, @Cast("size_t") long count, @Cast("size_t") long offset/*=0*/, @Cast("cudaMemcpyKind") int kind/*=cudaMemcpyDeviceToHost*/); public static native @Cast("cudaError_t") int cudaMemcpyFromSymbol(Pointer dst, @Const Pointer symbol, @Cast("size_t") long count); /** * \brief Copies data between host and device * * Copies \p count bytes from the memory area pointed to by \p src to the * memory area pointed to by \p dst, where \p kind specifies the * direction of the copy, and must be one of ::cudaMemcpyHostToHost, * ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost, * ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. 
Passing * ::cudaMemcpyDefault is recommended, in which case the type of transfer is * inferred from the pointer values. However, ::cudaMemcpyDefault is only * allowed on systems that support unified virtual addressing. * * The memory areas may not overlap. Calling ::cudaMemcpyAsync() with \p dst and * \p src pointers that do not match the direction of the copy results in an * undefined behavior. * * ::cudaMemcpyAsync() is asynchronous with respect to the host, so the call * may return before the copy is complete. The copy can optionally be * associated to a stream by passing a non-zero \p stream argument. If \p kind * is ::cudaMemcpyHostToDevice or ::cudaMemcpyDeviceToHost and the \p stream is * non-zero, the copy may overlap with operations in other streams. * * The device version of this function only handles device to device copies and * cannot be given local or shared pointers. * * @param dst - Destination memory address * @param src - Source memory address * @param count - Size in bytes to copy * @param kind - Type of transfer * @param stream - Stream identifier * * @return * ::cudaSuccess, * ::cudaErrorInvalidValue, * ::cudaErrorInvalidDevicePointer, * ::cudaErrorInvalidMemcpyDirection * \notefnerr * \note_async * \note_null_stream * * \sa ::cudaMemcpy, ::cudaMemcpy2D, ::cudaMemcpyToArray, * ::cudaMemcpy2DToArray, ::cudaMemcpyFromArray, ::cudaMemcpy2DFromArray, * ::cudaMemcpyArrayToArray, ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol, * ::cudaMemcpyFromSymbol, ::cudaMemcpy2DAsync, * ::cudaMemcpyToArrayAsync, ::cudaMemcpy2DToArrayAsync, * ::cudaMemcpyFromArrayAsync, ::cudaMemcpy2DFromArrayAsync, * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync */ public static native @Cast("cudaError_t") int cudaMemcpyAsync(Pointer dst, @Const Pointer src, @Cast("size_t") long count, @Cast("cudaMemcpyKind") int kind, CUstream_st stream/*=0*/); public static native @Cast("cudaError_t") int cudaMemcpyAsync(Pointer dst, @Const Pointer src, @Cast("size_t") long count, @Cast("cudaMemcpyKind") int kind); /** * \brief Copies memory between two devices asynchronously. * * Copies memory from one device to memory on another device. \p dst is the * base device pointer of the destination memory and \p dstDevice is the * destination device. \p src is the base device pointer of the source memory * and \p srcDevice is the source device. \p count specifies the number of bytes * to copy. * * Note that this function is asynchronous with respect to the host and all work * on other devices. 
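 *
 * A minimal JavaCPP usage sketch (hypothetical names; assumes {@code dstOn1} and
 * {@code srcOn0} are device allocations of at least 1 MB on devices 1 and 0, and
 * {@code stream} was created with ::cudaStreamCreate; error checking omitted):
 * {@code
 * int err = cudaMemcpyPeerAsync(dstOn1, 1, srcOn0, 0, 1 << 20, stream);
 * // ... enqueue more work, then wait for the stream when the data is needed:
 * err = cudaStreamSynchronize(stream);
 * }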
* * @param dst - Destination device pointer * @param dstDevice - Destination device * @param src - Source device pointer * @param srcDevice - Source device * @param count - Size of memory copy in bytes * @param stream - Stream identifier * * @return * ::cudaSuccess, * ::cudaErrorInvalidValue, * ::cudaErrorInvalidDevice * \notefnerr * \note_async * \note_null_stream * * \sa ::cudaMemcpy, ::cudaMemcpyPeer, ::cudaMemcpyAsync, * ::cudaMemcpy3DPeerAsync */ public static native @Cast("cudaError_t") int cudaMemcpyPeerAsync(Pointer dst, int dstDevice, @Const Pointer src, int srcDevice, @Cast("size_t") long count, CUstream_st stream/*=0*/); public static native @Cast("cudaError_t") int cudaMemcpyPeerAsync(Pointer dst, int dstDevice, @Const Pointer src, int srcDevice, @Cast("size_t") long count); /** * \brief Copies data between host and device * * Copies \p count bytes from the memory area pointed to by \p src to the * CUDA array \p dst starting at the upper left corner * (\p wOffset, \p hOffset), where \p kind specifies the * direction of the copy, and must be one of ::cudaMemcpyHostToHost, * ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost, * ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. Passing * ::cudaMemcpyDefault is recommended, in which case the type of transfer is * inferred from the pointer values. However, ::cudaMemcpyDefault is only * allowed on systems that support unified virtual addressing. * * ::cudaMemcpyToArrayAsync() is asynchronous with respect to the host, so * the call may return before the copy is complete. The copy can optionally * be associated to a stream by passing a non-zero \p stream argument. If \p * kind is ::cudaMemcpyHostToDevice or ::cudaMemcpyDeviceToHost and \p stream * is non-zero, the copy may overlap with operations in other streams. 
* * @param dst - Destination memory address * @param wOffset - Destination starting X offset * @param hOffset - Destination starting Y offset * @param src - Source memory address * @param count - Size in bytes to copy * @param kind - Type of transfer * @param stream - Stream identifier * * @return * ::cudaSuccess, * ::cudaErrorInvalidValue, * ::cudaErrorInvalidDevicePointer, * ::cudaErrorInvalidMemcpyDirection * \notefnerr * \note_async * \note_null_stream * * \sa ::cudaMemcpy, ::cudaMemcpy2D, ::cudaMemcpyToArray, * ::cudaMemcpy2DToArray, ::cudaMemcpyFromArray, ::cudaMemcpy2DFromArray, * ::cudaMemcpyArrayToArray, ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol, * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync, * ::cudaMemcpy2DToArrayAsync, * ::cudaMemcpyFromArrayAsync, ::cudaMemcpy2DFromArrayAsync, * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync */ public static native @Cast("cudaError_t") int cudaMemcpyToArrayAsync(cudaArray dst, @Cast("size_t") long wOffset, @Cast("size_t") long hOffset, @Const Pointer src, @Cast("size_t") long count, @Cast("cudaMemcpyKind") int kind, CUstream_st stream/*=0*/); public static native @Cast("cudaError_t") int cudaMemcpyToArrayAsync(cudaArray dst, @Cast("size_t") long wOffset, @Cast("size_t") long hOffset, @Const Pointer src, @Cast("size_t") long count, @Cast("cudaMemcpyKind") int kind); /** * \brief Copies data between host and device * * Copies \p count bytes from the CUDA array \p src starting at the upper * left corner (\p wOffset, hOffset) to the memory area pointed to by \p dst, * where \p kind specifies the direction of the copy, and must be one of * ::cudaMemcpyHostToHost, ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost, * ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. Passing * ::cudaMemcpyDefault is recommended, in which case the type of transfer is * inferred from the pointer values. However, ::cudaMemcpyDefault is only * allowed on systems that support unified virtual addressing. * * ::cudaMemcpyFromArrayAsync() is asynchronous with respect to the host, so * the call may return before the copy is complete. The copy can optionally * be associated to a stream by passing a non-zero \p stream argument. If \p * kind is ::cudaMemcpyHostToDevice or ::cudaMemcpyDeviceToHost and \p stream * is non-zero, the copy may overlap with operations in other streams. 
* * @param dst - Destination memory address * @param src - Source memory address * @param wOffset - Source starting X offset * @param hOffset - Source starting Y offset * @param count - Size in bytes to copy * @param kind - Type of transfer * @param stream - Stream identifier * * @return * ::cudaSuccess, * ::cudaErrorInvalidValue, * ::cudaErrorInvalidDevicePointer, * ::cudaErrorInvalidMemcpyDirection * \notefnerr * \note_async * \note_null_stream * * \sa ::cudaMemcpy, ::cudaMemcpy2D, ::cudaMemcpyToArray, * ::cudaMemcpy2DToArray, ::cudaMemcpyFromArray, ::cudaMemcpy2DFromArray, * ::cudaMemcpyArrayToArray, ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol, * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync, * ::cudaMemcpyToArrayAsync, ::cudaMemcpy2DToArrayAsync, * ::cudaMemcpy2DFromArrayAsync, * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync */ public static native @Cast("cudaError_t") int cudaMemcpyFromArrayAsync(Pointer dst, cudaArray src, @Cast("size_t") long wOffset, @Cast("size_t") long hOffset, @Cast("size_t") long count, @Cast("cudaMemcpyKind") int kind, CUstream_st stream/*=0*/); public static native @Cast("cudaError_t") int cudaMemcpyFromArrayAsync(Pointer dst, cudaArray src, @Cast("size_t") long wOffset, @Cast("size_t") long hOffset, @Cast("size_t") long count, @Cast("cudaMemcpyKind") int kind); /** * \brief Copies data between host and device * * Copies a matrix (\p height rows of \p width bytes each) from the memory * area pointed to by \p src to the memory area pointed to by \p dst, where * \p kind specifies the direction of the copy, and must be one of * ::cudaMemcpyHostToHost, ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost, * ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. Passing * ::cudaMemcpyDefault is recommended, in which case the type of transfer is * inferred from the pointer values. However, ::cudaMemcpyDefault is only * allowed on systems that support unified virtual addressing. * \p dpitch and \p spitch are the widths in memory in bytes of the 2D arrays * pointed to by \p dst and \p src, including any padding added to the end of * each row. The memory areas may not overlap. \p width must not exceed either * \p dpitch or \p spitch. * * Calling ::cudaMemcpy2DAsync() with \p dst and \p src pointers that do not * match the direction of the copy results in an undefined behavior. * ::cudaMemcpy2DAsync() returns an error if \p dpitch or \p spitch is greater * than the maximum allowed. * * ::cudaMemcpy2DAsync() is asynchronous with respect to the host, so * the call may return before the copy is complete. The copy can optionally * be associated to a stream by passing a non-zero \p stream argument. If * \p kind is ::cudaMemcpyHostToDevice or ::cudaMemcpyDeviceToHost and * \p stream is non-zero, the copy may overlap with operations in other * streams. * * The device version of this function only handles device to device copies and * cannot be given local or shared pointers. 
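 *
 * A minimal JavaCPP usage sketch (hypothetical names; assumes {@code devPtr} and the
 * pitch {@code devPitch} were obtained from ::cudaMallocPitch, {@code hostRows} is
 * pinned host memory holding 128 tightly packed rows of 512 bytes, and {@code stream}
 * was created with ::cudaStreamCreate; error checking omitted):
 * {@code
 * long width = 512, height = 128;   // width in bytes, height in rows
 * int err = cudaMemcpy2DAsync(devPtr, devPitch, hostRows, width, width, height,
 *                             cudaMemcpyHostToDevice, stream);
 * }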
* * @param dst - Destination memory address * @param dpitch - Pitch of destination memory * @param src - Source memory address * @param spitch - Pitch of source memory * @param width - Width of matrix transfer (columns in bytes) * @param height - Height of matrix transfer (rows) * @param kind - Type of transfer * @param stream - Stream identifier * * @return * ::cudaSuccess, * ::cudaErrorInvalidValue, * ::cudaErrorInvalidPitchValue, * ::cudaErrorInvalidDevicePointer, * ::cudaErrorInvalidMemcpyDirection * \notefnerr * \note_async * \note_null_stream * * \sa ::cudaMemcpy, ::cudaMemcpy2D, ::cudaMemcpyToArray, * ::cudaMemcpy2DToArray, ::cudaMemcpyFromArray, ::cudaMemcpy2DFromArray, * ::cudaMemcpyArrayToArray, ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol, * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, * ::cudaMemcpyToArrayAsync, ::cudaMemcpy2DToArrayAsync, * ::cudaMemcpyFromArrayAsync, ::cudaMemcpy2DFromArrayAsync, * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync */ public static native @Cast("cudaError_t") int cudaMemcpy2DAsync(Pointer dst, @Cast("size_t") long dpitch, @Const Pointer src, @Cast("size_t") long spitch, @Cast("size_t") long width, @Cast("size_t") long height, @Cast("cudaMemcpyKind") int kind, CUstream_st stream/*=0*/); public static native @Cast("cudaError_t") int cudaMemcpy2DAsync(Pointer dst, @Cast("size_t") long dpitch, @Const Pointer src, @Cast("size_t") long spitch, @Cast("size_t") long width, @Cast("size_t") long height, @Cast("cudaMemcpyKind") int kind); /** * \brief Copies data between host and device * * Copies a matrix (\p height rows of \p width bytes each) from the memory * area pointed to by \p src to the CUDA array \p dst starting at the * upper left corner (\p wOffset, \p hOffset) where \p kind specifies the * direction of the copy, and must be one of ::cudaMemcpyHostToHost, * ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost, * ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. Passing * ::cudaMemcpyDefault is recommended, in which case the type of transfer is * inferred from the pointer values. However, ::cudaMemcpyDefault is only * allowed on systems that support unified virtual addressing. * \p spitch is the width in memory in bytes of the 2D array pointed to by * \p src, including any padding added to the end of each row. \p wOffset + * \p width must not exceed the width of the CUDA array \p dst. \p width must * not exceed \p spitch. ::cudaMemcpy2DToArrayAsync() returns an error if * \p spitch exceeds the maximum allowed. * * ::cudaMemcpy2DToArrayAsync() is asynchronous with respect to the host, so * the call may return before the copy is complete. The copy can optionally * be associated to a stream by passing a non-zero \p stream argument. If * \p kind is ::cudaMemcpyHostToDevice or ::cudaMemcpyDeviceToHost and * \p stream is non-zero, the copy may overlap with operations in other * streams. 
* * @param dst - Destination memory address * @param wOffset - Destination starting X offset * @param hOffset - Destination starting Y offset * @param src - Source memory address * @param spitch - Pitch of source memory * @param width - Width of matrix transfer (columns in bytes) * @param height - Height of matrix transfer (rows) * @param kind - Type of transfer * @param stream - Stream identifier * * @return * ::cudaSuccess, * ::cudaErrorInvalidValue, * ::cudaErrorInvalidDevicePointer, * ::cudaErrorInvalidPitchValue, * ::cudaErrorInvalidMemcpyDirection * \notefnerr * \note_async * \note_null_stream * * \sa ::cudaMemcpy, ::cudaMemcpy2D, ::cudaMemcpyToArray, * ::cudaMemcpy2DToArray, ::cudaMemcpyFromArray, ::cudaMemcpy2DFromArray, * ::cudaMemcpyArrayToArray, ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol, * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync, * ::cudaMemcpyToArrayAsync, * ::cudaMemcpyFromArrayAsync, ::cudaMemcpy2DFromArrayAsync, * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync */ public static native @Cast("cudaError_t") int cudaMemcpy2DToArrayAsync(cudaArray dst, @Cast("size_t") long wOffset, @Cast("size_t") long hOffset, @Const Pointer src, @Cast("size_t") long spitch, @Cast("size_t") long width, @Cast("size_t") long height, @Cast("cudaMemcpyKind") int kind, CUstream_st stream/*=0*/); public static native @Cast("cudaError_t") int cudaMemcpy2DToArrayAsync(cudaArray dst, @Cast("size_t") long wOffset, @Cast("size_t") long hOffset, @Const Pointer src, @Cast("size_t") long spitch, @Cast("size_t") long width, @Cast("size_t") long height, @Cast("cudaMemcpyKind") int kind); /** * \brief Copies data between host and device * * Copies a matrix (\p height rows of \p width bytes each) from the CUDA * array \p srcArray starting at the upper left corner * (\p wOffset, \p hOffset) to the memory area pointed to by \p dst, where * \p kind specifies the direction of the copy, and must be one of * ::cudaMemcpyHostToHost, ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost, * ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. Passing * ::cudaMemcpyDefault is recommended, in which case the type of transfer is * inferred from the pointer values. However, ::cudaMemcpyDefault is only * allowed on systems that support unified virtual addressing. * \p dpitch is the width in memory in bytes of the 2D * array pointed to by \p dst, including any padding added to the end of each * row. \p wOffset + \p width must not exceed the width of the CUDA array * \p src. \p width must not exceed \p dpitch. ::cudaMemcpy2DFromArrayAsync() * returns an error if \p dpitch exceeds the maximum allowed. * * ::cudaMemcpy2DFromArrayAsync() is asynchronous with respect to the host, so * the call may return before the copy is complete. The copy can optionally be * associated to a stream by passing a non-zero \p stream argument. If \p kind * is ::cudaMemcpyHostToDevice or ::cudaMemcpyDeviceToHost and \p stream is * non-zero, the copy may overlap with operations in other streams. 
* * @param dst - Destination memory address * @param dpitch - Pitch of destination memory * @param src - Source memory address * @param wOffset - Source starting X offset * @param hOffset - Source starting Y offset * @param width - Width of matrix transfer (columns in bytes) * @param height - Height of matrix transfer (rows) * @param kind - Type of transfer * @param stream - Stream identifier * * @return * ::cudaSuccess, * ::cudaErrorInvalidValue, * ::cudaErrorInvalidDevicePointer, * ::cudaErrorInvalidPitchValue, * ::cudaErrorInvalidMemcpyDirection * \notefnerr * \note_async * \note_null_stream * * \sa ::cudaMemcpy, ::cudaMemcpy2D, ::cudaMemcpyToArray, * ::cudaMemcpy2DToArray, ::cudaMemcpyFromArray, ::cudaMemcpy2DFromArray, * ::cudaMemcpyArrayToArray, ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol, * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync, * ::cudaMemcpyToArrayAsync, ::cudaMemcpy2DToArrayAsync, * ::cudaMemcpyFromArrayAsync, * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync */ public static native @Cast("cudaError_t") int cudaMemcpy2DFromArrayAsync(Pointer dst, @Cast("size_t") long dpitch, cudaArray src, @Cast("size_t") long wOffset, @Cast("size_t") long hOffset, @Cast("size_t") long width, @Cast("size_t") long height, @Cast("cudaMemcpyKind") int kind, CUstream_st stream/*=0*/); public static native @Cast("cudaError_t") int cudaMemcpy2DFromArrayAsync(Pointer dst, @Cast("size_t") long dpitch, cudaArray src, @Cast("size_t") long wOffset, @Cast("size_t") long hOffset, @Cast("size_t") long width, @Cast("size_t") long height, @Cast("cudaMemcpyKind") int kind); /** * \brief Copies data to the given symbol on the device * * Copies \p count bytes from the memory area pointed to by \p src * to the memory area pointed to by \p offset bytes from the start of symbol * \p symbol. The memory areas may not overlap. \p symbol is a variable that * resides in global or constant memory space. \p kind can be either * ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. * Passing ::cudaMemcpyDefault is recommended, in which case the type of transfer * is inferred from the pointer values. However, ::cudaMemcpyDefault is only * allowed on systems that support unified virtual addressing. * * ::cudaMemcpyToSymbolAsync() is asynchronous with respect to the host, so * the call may return before the copy is complete. The copy can optionally * be associated to a stream by passing a non-zero \p stream argument. If * \p kind is ::cudaMemcpyHostToDevice and \p stream is non-zero, the copy * may overlap with operations in other streams. 
* * @param symbol - Device symbol address * @param src - Source memory address * @param count - Size in bytes to copy * @param offset - Offset from start of symbol in bytes * @param kind - Type of transfer * @param stream - Stream identifier * * @return * ::cudaSuccess, * ::cudaErrorInvalidValue, * ::cudaErrorInvalidSymbol, * ::cudaErrorInvalidDevicePointer, * ::cudaErrorInvalidMemcpyDirection * \notefnerr * \note_async * \note_null_stream * \note_string_api_deprecation * * \sa ::cudaMemcpy, ::cudaMemcpy2D, ::cudaMemcpyToArray, * ::cudaMemcpy2DToArray, ::cudaMemcpyFromArray, ::cudaMemcpy2DFromArray, * ::cudaMemcpyArrayToArray, ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol, * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync, * ::cudaMemcpyToArrayAsync, ::cudaMemcpy2DToArrayAsync, * ::cudaMemcpyFromArrayAsync, ::cudaMemcpy2DFromArrayAsync, * ::cudaMemcpyFromSymbolAsync */ public static native @Cast("cudaError_t") int cudaMemcpyToSymbolAsync(@Const Pointer symbol, @Const Pointer src, @Cast("size_t") long count, @Cast("size_t") long offset, @Cast("cudaMemcpyKind") int kind, CUstream_st stream/*=0*/); public static native @Cast("cudaError_t") int cudaMemcpyToSymbolAsync(@Const Pointer symbol, @Const Pointer src, @Cast("size_t") long count, @Cast("size_t") long offset, @Cast("cudaMemcpyKind") int kind); /** * \brief Copies data from the given symbol on the device * * Copies \p count bytes from the memory area pointed to by \p offset bytes * from the start of symbol \p symbol to the memory area pointed to by \p dst. * The memory areas may not overlap. \p symbol is a variable that resides in * global or constant memory space. \p kind can be either * ::cudaMemcpyDeviceToHost, ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. * Passing ::cudaMemcpyDefault is recommended, in which case the type of transfer * is inferred from the pointer values. However, ::cudaMemcpyDefault is only * allowed on systems that support unified virtual addressing. * * ::cudaMemcpyFromSymbolAsync() is asynchronous with respect to the host, so * the call may return before the copy is complete. The copy can optionally be * associated to a stream by passing a non-zero \p stream argument. If \p kind * is ::cudaMemcpyDeviceToHost and \p stream is non-zero, the copy may overlap * with operations in other streams. 
* * @param dst - Destination memory address * @param symbol - Device symbol address * @param count - Size in bytes to copy * @param offset - Offset from start of symbol in bytes * @param kind - Type of transfer * @param stream - Stream identifier * * @return * ::cudaSuccess, * ::cudaErrorInvalidValue, * ::cudaErrorInvalidSymbol, * ::cudaErrorInvalidDevicePointer, * ::cudaErrorInvalidMemcpyDirection * \notefnerr * \note_async * \note_null_stream * \note_string_api_deprecation * * \sa ::cudaMemcpy, ::cudaMemcpy2D, ::cudaMemcpyToArray, * ::cudaMemcpy2DToArray, ::cudaMemcpyFromArray, ::cudaMemcpy2DFromArray, * ::cudaMemcpyArrayToArray, ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol, * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync, * ::cudaMemcpyToArrayAsync, ::cudaMemcpy2DToArrayAsync, * ::cudaMemcpyFromArrayAsync, ::cudaMemcpy2DFromArrayAsync, * ::cudaMemcpyToSymbolAsync */ public static native @Cast("cudaError_t") int cudaMemcpyFromSymbolAsync(Pointer dst, @Const Pointer symbol, @Cast("size_t") long count, @Cast("size_t") long offset, @Cast("cudaMemcpyKind") int kind, CUstream_st stream/*=0*/); public static native @Cast("cudaError_t") int cudaMemcpyFromSymbolAsync(Pointer dst, @Const Pointer symbol, @Cast("size_t") long count, @Cast("size_t") long offset, @Cast("cudaMemcpyKind") int kind); /** * \brief Initializes or sets device memory to a value * * Fills the first \p count bytes of the memory area pointed to by \p devPtr * with the constant byte value \p value. * * Note that this function is asynchronous with respect to the host unless * \p devPtr refers to pinned host memory. * * @param devPtr - Pointer to device memory * @param value - Value to set for each byte of specified memory * @param count - Size in bytes to set * * @return * ::cudaSuccess, * ::cudaErrorInvalidValue, * ::cudaErrorInvalidDevicePointer * \notefnerr * \note_memset * * \sa ::cudaMemset2D, ::cudaMemset3D, ::cudaMemsetAsync, * ::cudaMemset2DAsync, ::cudaMemset3DAsync */ public static native @Cast("cudaError_t") int cudaMemset(Pointer devPtr, int value, @Cast("size_t") long count); /** * \brief Initializes or sets device memory to a value * * Sets to the specified value \p value a matrix (\p height rows of \p width * bytes each) pointed to by \p dstPtr. \p pitch is the width in bytes of the * 2D array pointed to by \p dstPtr, including any padding added to the end * of each row. This function performs fastest when the pitch is one that has * been passed back by ::cudaMallocPitch(). * * Note that this function is asynchronous with respect to the host unless * \p devPtr refers to pinned host memory. * * @param devPtr - Pointer to 2D device memory * @param pitch - Pitch in bytes of 2D device memory * @param value - Value to set for each byte of specified memory * @param width - Width of matrix set (columns in bytes) * @param height - Height of matrix set (rows) * * @return * ::cudaSuccess, * ::cudaErrorInvalidValue, * ::cudaErrorInvalidDevicePointer * \notefnerr * \note_memset * * \sa ::cudaMemset, ::cudaMemset3D, ::cudaMemsetAsync, * ::cudaMemset2DAsync, ::cudaMemset3DAsync */ public static native @Cast("cudaError_t") int cudaMemset2D(Pointer devPtr, @Cast("size_t") long pitch, int value, @Cast("size_t") long width, @Cast("size_t") long height); /** * \brief Initializes or sets device memory to a value * * Initializes each element of a 3D array to the specified value \p value. * The object to initialize is defined by \p pitchedDevPtr. 
The \p pitch field * of \p pitchedDevPtr is the width in memory in bytes of the 3D array pointed * to by \p pitchedDevPtr, including any padding added to the end of each row. * The \p xsize field specifies the logical width of each row in bytes, while * the \p ysize field specifies the height of each 2D slice in rows. * * The extents of the initialized region are specified as a \p width in bytes, * a \p height in rows, and a \p depth in slices. * * Extents with \p width greater than or equal to the \p xsize of * \p pitchedDevPtr may perform significantly faster than extents narrower * than the \p xsize. Secondarily, extents with \p height equal to the * \p ysize of \p pitchedDevPtr will perform faster than when the \p height is * shorter than the \p ysize. * * This function performs fastest when the \p pitchedDevPtr has been allocated * by ::cudaMalloc3D(). * * Note that this function is asynchronous with respect to the host unless * \p pitchedDevPtr refers to pinned host memory. * * @param pitchedDevPtr - Pointer to pitched device memory * @param value - Value to set for each byte of specified memory * @param extent - Size parameters for where to set device memory (\p width field in bytes) * * @return * ::cudaSuccess, * ::cudaErrorInvalidValue, * ::cudaErrorInvalidDevicePointer * \notefnerr * \note_memset * * \sa ::cudaMemset, ::cudaMemset2D, * ::cudaMemsetAsync, ::cudaMemset2DAsync, ::cudaMemset3DAsync, * ::cudaMalloc3D, ::make_cudaPitchedPtr, * ::make_cudaExtent */ public static native @Cast("cudaError_t") int cudaMemset3D(@ByVal cudaPitchedPtr pitchedDevPtr, int value, @ByVal cudaExtent extent); /** * \brief Initializes or sets device memory to a value * * Fills the first \p count bytes of the memory area pointed to by \p devPtr * with the constant byte value \p value. * * ::cudaMemsetAsync() is asynchronous with respect to the host, so * the call may return before the memset is complete. The operation can optionally * be associated to a stream by passing a non-zero \p stream argument. * If \p stream is non-zero, the operation may overlap with operations in other streams. * * The device version of this function only handles device to device copies and * cannot be given local or shared pointers. * * @param devPtr - Pointer to device memory * @param value - Value to set for each byte of specified memory * @param count - Size in bytes to set * @param stream - Stream identifier * * @return * ::cudaSuccess, * ::cudaErrorInvalidValue, * ::cudaErrorInvalidDevicePointer * \notefnerr * \note_memset * \note_null_stream * * \sa ::cudaMemset, ::cudaMemset2D, ::cudaMemset3D, * ::cudaMemset2DAsync, ::cudaMemset3DAsync */ public static native @Cast("cudaError_t") int cudaMemsetAsync(Pointer devPtr, int value, @Cast("size_t") long count, CUstream_st stream/*=0*/); public static native @Cast("cudaError_t") int cudaMemsetAsync(Pointer devPtr, int value, @Cast("size_t") long count); /** * \brief Initializes or sets device memory to a value * * Sets to the specified value \p value a matrix (\p height rows of \p width * bytes each) pointed to by \p dstPtr. \p pitch is the width in bytes of the * 2D array pointed to by \p dstPtr, including any padding added to the end * of each row. This function performs fastest when the pitch is one that has * been passed back by ::cudaMallocPitch(). * * ::cudaMemset2DAsync() is asynchronous with respect to the host, so * the call may return before the memset is complete. The operation can optionally * be associated to a stream by passing a non-zero \p stream argument. 
* If \p stream is non-zero, the operation may overlap with operations in other streams. * * The device version of this function only handles device to device copies and * cannot be given local or shared pointers. * * @param devPtr - Pointer to 2D device memory * @param pitch - Pitch in bytes of 2D device memory * @param value - Value to set for each byte of specified memory * @param width - Width of matrix set (columns in bytes) * @param height - Height of matrix set (rows) * @param stream - Stream identifier * * @return * ::cudaSuccess, * ::cudaErrorInvalidValue, * ::cudaErrorInvalidDevicePointer * \notefnerr * \note_memset * \note_null_stream * * \sa ::cudaMemset, ::cudaMemset2D, ::cudaMemset3D, * ::cudaMemsetAsync, ::cudaMemset3DAsync */ public static native @Cast("cudaError_t") int cudaMemset2DAsync(Pointer devPtr, @Cast("size_t") long pitch, int value, @Cast("size_t") long width, @Cast("size_t") long height, CUstream_st stream/*=0*/); public static native @Cast("cudaError_t") int cudaMemset2DAsync(Pointer devPtr, @Cast("size_t") long pitch, int value, @Cast("size_t") long width, @Cast("size_t") long height); /** * \brief Initializes or sets device memory to a value * * Initializes each element of a 3D array to the specified value \p value. * The object to initialize is defined by \p pitchedDevPtr. The \p pitch field * of \p pitchedDevPtr is the width in memory in bytes of the 3D array pointed * to by \p pitchedDevPtr, including any padding added to the end of each row. * The \p xsize field specifies the logical width of each row in bytes, while * the \p ysize field specifies the height of each 2D slice in rows. * * The extents of the initialized region are specified as a \p width in bytes, * a \p height in rows, and a \p depth in slices. * * Extents with \p width greater than or equal to the \p xsize of * \p pitchedDevPtr may perform significantly faster than extents narrower * than the \p xsize. Secondarily, extents with \p height equal to the * \p ysize of \p pitchedDevPtr will perform faster than when the \p height is * shorter than the \p ysize. * * This function performs fastest when the \p pitchedDevPtr has been allocated * by ::cudaMalloc3D(). * * ::cudaMemset3DAsync() is asynchronous with respect to the host, so * the call may return before the memset is complete. The operation can optionally * be associated to a stream by passing a non-zero \p stream argument. * If \p stream is non-zero, the operation may overlap with operations in other streams. * * The device version of this function only handles device to device copies and * cannot be given local or shared pointers. 
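 *
 * A minimal JavaCPP usage sketch (hypothetical names; assumes {@code pitchedDevPtr} is
 * the ::cudaPitchedPtr filled in by ::cudaMalloc3D for the same extent, and
 * {@code stream} was created with ::cudaStreamCreate; error checking omitted):
 * {@code
 * cudaExtent extent = make_cudaExtent(256, 64, 16);   // width in bytes
 * int err = cudaMemset3DAsync(pitchedDevPtr, 0, extent, stream);
 * }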
* * @param pitchedDevPtr - Pointer to pitched device memory * @param value - Value to set for each byte of specified memory * @param extent - Size parameters for where to set device memory (\p width field in bytes) * @param stream - Stream identifier * * @return * ::cudaSuccess, * ::cudaErrorInvalidValue, * ::cudaErrorInvalidDevicePointer * \notefnerr * \note_memset * \note_null_stream * * \sa ::cudaMemset, ::cudaMemset2D, ::cudaMemset3D, * ::cudaMemsetAsync, ::cudaMemset2DAsync, * ::cudaMalloc3D, ::make_cudaPitchedPtr, * ::make_cudaExtent */ public static native @Cast("cudaError_t") int cudaMemset3DAsync(@ByVal cudaPitchedPtr pitchedDevPtr, int value, @ByVal cudaExtent extent, CUstream_st stream/*=0*/); public static native @Cast("cudaError_t") int cudaMemset3DAsync(@ByVal cudaPitchedPtr pitchedDevPtr, int value, @ByVal cudaExtent extent); /** * \brief Finds the address associated with a CUDA symbol * * Returns in \p *devPtr the address of symbol \p symbol on the device. * \p symbol is a variable that resides in global or constant memory space. * If \p symbol cannot be found, or if \p symbol is not declared in the * global or constant memory space, \p *devPtr is unchanged and the error * ::cudaErrorInvalidSymbol is returned. * * @param devPtr - Return device pointer associated with symbol * @param symbol - Device symbol address * * @return * ::cudaSuccess, * ::cudaErrorInvalidSymbol * \notefnerr * \note_string_api_deprecation * * \sa \ref ::cudaGetSymbolAddress(void**, const T&) "cudaGetSymbolAddress (C++ API)", * \ref ::cudaGetSymbolSize(size_t*, const void*) "cudaGetSymbolSize (C API)" */ public static native @Cast("cudaError_t") int cudaGetSymbolAddress(@Cast("void**") PointerPointer devPtr, @Const Pointer symbol); public static native @Cast("cudaError_t") int cudaGetSymbolAddress(@Cast("void**") @ByPtrPtr Pointer devPtr, @Const Pointer symbol); /** * \brief Finds the size of the object associated with a CUDA symbol * * Returns in \p *size the size of symbol \p symbol. \p symbol is a variable that * resides in global or constant memory space. If \p symbol cannot be found, or * if \p symbol is not declared in global or constant memory space, \p *size is * unchanged and the error ::cudaErrorInvalidSymbol is returned. * * @param size - Size of object associated with symbol * @param symbol - Device symbol address * * @return * ::cudaSuccess, * ::cudaErrorInvalidSymbol * \notefnerr * \note_string_api_deprecation * * \sa \ref ::cudaGetSymbolAddress(void**, const void*) "cudaGetSymbolAddress (C API)", * \ref ::cudaGetSymbolSize(size_t*, const T&) "cudaGetSymbolSize (C++ API)" */ public static native @Cast("cudaError_t") int cudaGetSymbolSize(@Cast("size_t*") SizeTPointer size, @Const Pointer symbol); /** * \brief Prefetches memory to the specified destination device * * Prefetches memory to the specified destination device. \p devPtr is the * base device pointer of the memory to be prefetched and \p dstDevice is the * destination device. \p count specifies the number of bytes to copy. \p stream * is the stream in which the operation is enqueued. The memory range must refer * to managed memory allocated via ::cudaMallocManaged or declared via __managed__ variables. * * Passing in cudaCpuDeviceId for \p dstDevice will prefetch the data to host memory. If * \p dstDevice is a GPU, then the device attribute ::cudaDevAttrConcurrentManagedAccess * must be non-zero. 
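 * A hedged sketch of the typical prefetch pattern (::cudaMallocManaged and the
 * ::cudaMemAttachGlobal flag are assumed from elsewhere in this class, and device 0 is assumed to
 * report a non-zero ::cudaDevAttrConcurrentManagedAccess):
{@code
    // Illustrative sketch only; see cudaMemAdvise below for the optional hint.
    long size = 1 << 20;
    Pointer managed = new Pointer();
    cudaMallocManaged(managed, size, cudaMemAttachGlobal);
    cudaMemAdvise(managed, size, cudaMemAdviseSetReadMostly, 0);  // optional usage hint
    cudaMemPrefetchAsync(managed, size, 0);                       // migrate to device 0, default stream
    cudaMemPrefetchAsync(managed, size, cudaCpuDeviceId);         // or bring it back to host memory
 * }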
Additionally, \p stream must be associated with a device that has a * non-zero value for the device attribute ::cudaDevAttrConcurrentManagedAccess. * * The start address and end address of the memory range will be rounded down and rounded up * respectively to be aligned to CPU page size before the prefetch operation is enqueued * in the stream. * * If no physical memory has been allocated for this region, then this memory region * will be populated and mapped on the destination device. If there's insufficient * memory to prefetch the desired region, the Unified Memory driver may evict pages from other * ::cudaMallocManaged allocations to host memory in order to make room. Device memory * allocated using ::cudaMalloc or ::cudaMallocArray will not be evicted. * * By default, any mappings to the previous location of the migrated pages are removed and * mappings for the new location are only setup on \p dstDevice. The exact behavior however * also depends on the settings applied to this memory range via ::cudaMemAdvise as described * below: * * If ::cudaMemAdviseSetReadMostly was set on any subset of this memory range, * then that subset will create a read-only copy of the pages on \p dstDevice. * * If ::cudaMemAdviseSetPreferredLocation was called on any subset of this memory * range, then the pages will be migrated to \p dstDevice even if \p dstDevice is not the * preferred location of any pages in the memory range. * * If ::cudaMemAdviseSetAccessedBy was called on any subset of this memory range, * then mappings to those pages from all the appropriate processors are updated to * refer to the new location if establishing such a mapping is possible. Otherwise, * those mappings are cleared. * * Note that this API is not required for functionality and only serves to improve performance * by allowing the application to migrate data to a suitable location before it is accessed. * Memory accesses to this range are always coherent and are allowed even when the data is * actively being migrated. * * Note that this function is asynchronous with respect to the host and all work * on other devices. * * @param devPtr - Pointer to be prefetched * @param count - Size in bytes * @param dstDevice - Destination device to prefetch to * @param stream - Stream to enqueue prefetch operation * * @return * ::cudaSuccess, * ::cudaErrorInvalidValue, * ::cudaErrorInvalidDevice * \notefnerr * \note_async * \note_null_stream * * \sa ::cudaMemcpy, ::cudaMemcpyPeer, ::cudaMemcpyAsync, * ::cudaMemcpy3DPeerAsync, ::cudaMemAdvise */ public static native @Cast("cudaError_t") int cudaMemPrefetchAsync(@Const Pointer devPtr, @Cast("size_t") long count, int dstDevice, CUstream_st stream/*=0*/); public static native @Cast("cudaError_t") int cudaMemPrefetchAsync(@Const Pointer devPtr, @Cast("size_t") long count, int dstDevice); /** * \brief Advise about the usage of a given memory range * * Advise the Unified Memory subsystem about the usage pattern for the memory range * starting at \p devPtr with a size of \p count bytes. The start address and end address of the memory * range will be rounded down and rounded up respectively to be aligned to CPU page size before the * advice is applied. The memory range must refer to managed memory allocated via ::cudaMallocManaged * or declared via __managed__ variables. * * The \p advice parameter can take the following values: * - ::cudaMemAdviseSetReadMostly: This implies that the data is mostly going to be read * from and only occasionally written to. 
Any read accesses from any processor to this region will create a * read-only copy of at least the accessed pages in that processor's memory. Additionally, if ::cudaMemPrefetchAsync * is called on this region, it will create a read-only copy of the data on the destination processor. * If any processor writes to this region, all copies of the corresponding page will be invalidated * except for the one where the write occurred. The \p device argument is ignored for this advice. * Note that for a page to be read-duplicated, the accessing processor must either be the CPU or a GPU * that has a non-zero value for the device attribute ::cudaDevAttrConcurrentManagedAccess. * Also, if a context is created on a device that does not have the device attribute * ::cudaDevAttrConcurrentManagedAccess set, then read-duplication will not occur until * all such contexts are destroyed. * - ::cudaMemAdviseUnsetReadMostly: Undoes the effect of ::cudaMemAdviseSetReadMostly and also prevents the * Unified Memory driver from attempting heuristic read-duplication on the memory range. Any read-duplicated * copies of the data will be collapsed into a single copy. The location for the collapsed * copy will be the preferred location if the page has a preferred location and one of the read-duplicated * copies was resident at that location. Otherwise, the location chosen is arbitrary. * - ::cudaMemAdviseSetPreferredLocation: This advice sets the preferred location for the * data to be the memory belonging to \p device. Passing in cudaCpuDeviceId for \p device sets the * preferred location as host memory. If \p device is a GPU, then it must have a non-zero value for the * device attribute ::cudaDevAttrConcurrentManagedAccess. Setting the preferred location * does not cause data to migrate to that location immediately. Instead, it guides the migration policy * when a fault occurs on that memory region. If the data is already in its preferred location and the * faulting processor can establish a mapping without requiring the data to be migrated, then * data migration will be avoided. On the other hand, if the data is not in its preferred location * or if a direct mapping cannot be established, then it will be migrated to the processor accessing * it. It is important to note that setting the preferred location does not prevent data prefetching * done using ::cudaMemPrefetchAsync. * Having a preferred location can override the page thrash detection and resolution logic in the Unified * Memory driver. Normally, if a page is detected to be constantly thrashing between, for example, host and device * memory, the page may eventually be pinned to host memory by the Unified Memory driver. But * if the preferred location is set as device memory, then the page will continue to thrash indefinitely. * If ::cudaMemAdviseSetReadMostly is also set on this memory region or any subset of it, then the * policies associated with that advice will override the policies of this advice. * - ::cudaMemAdviseUnsetPreferredLocation: Undoes the effect of ::cudaMemAdviseSetPreferredLocation * and changes the preferred location to none. * - ::cudaMemAdviseSetAccessedBy: This advice implies that the data will be accessed by \p device. * Passing in ::cudaCpuDeviceId for \p device will set the advice for the CPU. If \p device is a GPU, then * the device attribute ::cudaDevAttrConcurrentManagedAccess must be non-zero. * This advice does not cause data migration and has no impact on the location of the data per se. 
Instead, * it causes the data to always be mapped in the specified processor's page tables, as long as the * location of the data permits a mapping to be established. If the data gets migrated for any reason, * the mappings are updated accordingly. * This advice is recommended in scenarios where data locality is not important, but avoiding faults is. * Consider, for example, a system containing multiple GPUs with peer-to-peer access enabled, where the * data located on one GPU is occasionally accessed by peer GPUs. In such scenarios, migrating data * over to the other GPUs is not as important because the accesses are infrequent and the overhead of * migration may be too high. But preventing faults can still help improve performance, and so having * a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated * to host memory because the CPU typically cannot access device memory directly. Any GPU that had the * ::cudaMemAdviseSetAccessedBy flag set for this data will now have its mapping updated to point to the * page in host memory. * If ::cudaMemAdviseSetReadMostly is also set on this memory region or any subset of it, then the * policies associated with that advice will override the policies of this advice. Additionally, if the * preferred location of this memory region or any subset of it is also \p device, then the policies * associated with ::cudaMemAdviseSetPreferredLocation will override the policies of this advice. * - ::cudaMemAdviseUnsetAccessedBy: Undoes the effect of ::cudaMemAdviseSetAccessedBy. Any mappings to * the data from \p device may be removed at any time, causing accesses to result in non-fatal page faults. * * @param devPtr - Pointer to memory to set the advice for * @param count - Size in bytes of the memory range * @param advice - Advice to be applied for the specified memory range * @param device - Device to apply the advice for * * @return * ::cudaSuccess, * ::cudaErrorInvalidValue, * ::cudaErrorInvalidDevice * \notefnerr * \note_async * \note_null_stream * * \sa ::cudaMemcpy, ::cudaMemcpyPeer, ::cudaMemcpyAsync, * ::cudaMemcpy3DPeerAsync, ::cudaMemPrefetchAsync */ public static native @Cast("cudaError_t") int cudaMemAdvise(@Const Pointer devPtr, @Cast("size_t") long count, @Cast("cudaMemoryAdvise") int advice, int device); /** * \brief Query an attribute of a given memory range * * Query an attribute about the memory range starting at \p devPtr with a size of \p count bytes. The * memory range must refer to managed memory allocated via ::cudaMallocManaged or declared via * __managed__ variables. * * The \p attribute parameter can take the following values: * - ::cudaMemRangeAttributeReadMostly: If this attribute is specified, \p data will be interpreted * as a 32-bit integer, and \p dataSize must be 4. The result returned will be 1 if all pages in the given * memory range have read-duplication enabled, or 0 otherwise. * - ::cudaMemRangeAttributePreferredLocation: If this attribute is specified, \p data will be * interpreted as a 32-bit integer, and \p dataSize must be 4. The result returned will be a GPU device * id if all pages in the memory range have that GPU as their preferred location, or it will be cudaCpuDeviceId * if all pages in the memory range have the CPU as their preferred location, or it will be cudaInvalidDeviceId * if either all the pages don't have the same preferred location or some of the pages don't have a * preferred location at all. 
Note that the actual location of the pages in the memory range at the time of * the query may be different from the preferred location. * - ::cudaMemRangeAttributeAccessedBy: If this attribute is specified, \p data will be interpreted * as an array of 32-bit integers, and \p dataSize must be a non-zero multiple of 4. The result returned * will be a list of device ids that had ::cudaMemAdviseSetAccessedBy set for that entire memory range. * If any device does not have that advice set for the entire memory range, that device will not be included. * If \p data is larger than the number of devices that have that advice set for that memory range, * cudaInvalidDeviceId will be returned in all the extra space provided. For example, if \p dataSize is 12 * (i.e. \p data has 3 elements) and only device 0 has the advice set, then the result returned will be * { 0, cudaInvalidDeviceId, cudaInvalidDeviceId }. If \p data is smaller than the number of devices that have * that advice set, then only as many devices will be returned as can fit in the array. There is no * guarantee on which specific devices will be returned, however. * - ::cudaMemRangeAttributeLastPrefetchLocation: If this attribute is specified, \p data will be * interpreted as a 32-bit integer, and \p dataSize must be 4. The result returned will be the last location * to which all pages in the memory range were prefetched explicitly via ::cudaMemPrefetchAsync. This will either be * a GPU id or cudaCpuDeviceId depending on whether the last location for prefetch was a GPU or the CPU * respectively. If any page in the memory range was never explicitly prefetched or if all pages were not * prefetched to the same location, cudaInvalidDeviceId will be returned. Note that this simply returns the * last location that the application requested to prefetch the memory range to. It gives no indication as to * whether the prefetch operation to that location has completed or even begun. * * @param data - A pointer to a memory location where the result * of the attribute query will be written to. * @param dataSize - The size of \p data * @param attribute - The attribute to query * @param devPtr - Start of the range to query * @param count - Size of the range to query * * @return * ::cudaSuccess, * ::cudaErrorInvalidValue * \notefnerr * \note_async * \note_null_stream * * \sa ::cudaMemRangeGetAttributes, ::cudaMemPrefetchAsync, * ::cudaMemAdvise */ public static native @Cast("cudaError_t") int cudaMemRangeGetAttribute(Pointer data, @Cast("size_t") long dataSize, @Cast("cudaMemRangeAttribute") int attribute, @Const Pointer devPtr, @Cast("size_t") long count); /** * \brief Query attributes of a given memory range. * * Query attributes of the memory range starting at \p devPtr with a size of \p count bytes. The * memory range must refer to managed memory allocated via ::cudaMallocManaged or declared via * __managed__ variables. The \p attributes array will be interpreted to have \p numAttributes * entries. The \p dataSizes array will also be interpreted to have \p numAttributes entries. * The results of the query will be stored in \p data. * * The list of supported attributes is given below. Please refer to ::cudaMemRangeGetAttribute for * attribute descriptions and restrictions. 
* * - ::cudaMemRangeAttributeReadMostly * - ::cudaMemRangeAttributePreferredLocation * - ::cudaMemRangeAttributeAccessedBy * - ::cudaMemRangeAttributeLastPrefetchLocation * * @param data - A two-dimensional array containing pointers to memory * locations where the result of each attribute query will be written to. * @param dataSizes - Array containing the sizes of each result * @param attributes - An array of attributes to query * (numAttributes and the number of attributes in this array should match) * @param numAttributes - Number of attributes to query * @param devPtr - Start of the range to query * @param count - Size of the range to query * * @return * ::cudaSuccess, * ::cudaErrorInvalidValue * \notefnerr * * \sa ::cudaMemRangeGetAttribute, ::cudaMemAdvise * ::cudaMemPrefetchAsync */ public static native @Cast("cudaError_t") int cudaMemRangeGetAttributes(@Cast("void**") PointerPointer data, @Cast("size_t*") SizeTPointer dataSizes, @Cast("cudaMemRangeAttribute*") IntPointer attributes, @Cast("size_t") long numAttributes, @Const Pointer devPtr, @Cast("size_t") long count); public static native @Cast("cudaError_t") int cudaMemRangeGetAttributes(@Cast("void**") @ByPtrPtr Pointer data, @Cast("size_t*") SizeTPointer dataSizes, @Cast("cudaMemRangeAttribute*") IntPointer attributes, @Cast("size_t") long numAttributes, @Const Pointer devPtr, @Cast("size_t") long count); public static native @Cast("cudaError_t") int cudaMemRangeGetAttributes(@Cast("void**") @ByPtrPtr Pointer data, @Cast("size_t*") SizeTPointer dataSizes, @Cast("cudaMemRangeAttribute*") IntBuffer attributes, @Cast("size_t") long numAttributes, @Const Pointer devPtr, @Cast("size_t") long count); public static native @Cast("cudaError_t") int cudaMemRangeGetAttributes(@Cast("void**") @ByPtrPtr Pointer data, @Cast("size_t*") SizeTPointer dataSizes, @Cast("cudaMemRangeAttribute*") int[] attributes, @Cast("size_t") long numAttributes, @Const Pointer devPtr, @Cast("size_t") long count); /** \} */ /* END CUDART_MEMORY */ /** * \defgroup CUDART_UNIFIED Unified Addressing * * ___MANBRIEF___ unified addressing functions of the CUDA runtime API * (___CURRENT_FILE___) ___ENDMANBRIEF___ * * This section describes the unified addressing functions of the CUDA * runtime application programming interface. * * \{ * * \section CUDART_UNIFIED_overview Overview * * CUDA devices can share a unified address space with the host. * For these devices there is no distinction between a device * pointer and a host pointer -- the same pointer value may be * used to access memory from the host program and from a kernel * running on the device (with exceptions enumerated below). * * \section CUDART_UNIFIED_support Supported Platforms * * Whether or not a device supports unified addressing may be * queried by calling ::cudaGetDeviceProperties() with the device * property ::cudaDeviceProp::unifiedAddressing. * * Unified addressing is automatically enabled in 64-bit processes . * * Unified addressing is not yet supported on Windows Vista or * Windows 7 for devices that do not use the TCC driver model. * * \section CUDART_UNIFIED_lookup Looking Up Information from Pointer Values * * It is possible to look up information about the memory which backs a * pointer value. For instance, one may want to know if a pointer points * to host or device memory. As another example, in the case of device * memory, one may want to know on which CUDA device the memory * resides. 
These properties may be queried using the function * ::cudaPointerGetAttributes() * * Since pointers are unique, it is not necessary to specify information * about the pointers specified to ::cudaMemcpy() and other copy functions. * The copy direction ::cudaMemcpyDefault may be used to specify that the * CUDA runtime should infer the location of the pointer from its value. * * \section CUDART_UNIFIED_automaphost Automatic Mapping of Host Allocated Host Memory * * All host memory allocated through all devices using ::cudaMallocHost() and * ::cudaHostAlloc() is always directly accessible from all devices that * support unified addressing. This is the case regardless of whether or * not the flags ::cudaHostAllocPortable and ::cudaHostAllocMapped are * specified. * * The pointer value through which allocated host memory may be accessed * in kernels on all devices that support unified addressing is the same * as the pointer value through which that memory is accessed on the host. * It is not necessary to call ::cudaHostGetDevicePointer() to get the device * pointer for these allocations. * * Note that this is not the case for memory allocated using the flag * ::cudaHostAllocWriteCombined, as discussed below. * * \section CUDART_UNIFIED_autopeerregister Direct Access of Peer Memory * Upon enabling direct access from a device that supports unified addressing * to another peer device that supports unified addressing using * ::cudaDeviceEnablePeerAccess() all memory allocated in the peer device using * ::cudaMalloc() and ::cudaMallocPitch() will immediately be accessible * by the current device. The device pointer value through * which any peer's memory may be accessed in the current device * is the same pointer value through which that memory may be * accessed from the peer device. * * \section CUDART_UNIFIED_exceptions Exceptions, Disjoint Addressing * * Not all memory may be accessed on devices through the same pointer * value through which they are accessed on the host. These exceptions * are host memory registered using ::cudaHostRegister() and host memory * allocated using the flag ::cudaHostAllocWriteCombined. For these * exceptions, there exists a distinct host and device address for the * memory. The device address is guaranteed to not overlap any valid host * pointer range and is guaranteed to have the same value across all devices * that support unified addressing. * * This device address may be queried using ::cudaHostGetDevicePointer() * when a device using unified addressing is current. Either the host * or the unified device pointer value may be used to refer to this memory * in ::cudaMemcpy() and similar functions using the ::cudaMemcpyDefault * memory direction. * */ /** * \brief Returns attributes about a specified pointer * * Returns in \p *attributes the attributes of the pointer \p ptr. * If pointer was not allocated in, mapped by or registered with context * supporting unified addressing ::cudaErrorInvalidValue is returned. * * The ::cudaPointerAttributes structure is defined as: *
{@code
    struct cudaPointerAttributes {
        enum cudaMemoryType memoryType;
        int device;
        void *devicePointer;
        void *hostPointer;
        int isManaged;
    }
    }
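 * A hedged JavaCPP-level sketch of querying these fields (the accessors below follow JavaCPP's
 * usual field-name convention and are an assumption, as is the pre-existing allocation \p devPtr):
{@code
    // Illustrative only: devPtr is assumed to come from an earlier cudaMalloc() call.
    cudaPointerAttributes attrs = new cudaPointerAttributes();
    if (cudaPointerGetAttributes(attrs, devPtr) == cudaSuccess) {
        int type    = attrs.memoryType();     // cudaMemoryTypeHost or cudaMemoryTypeDevice
        int device  = attrs.device();
        Pointer dev = attrs.devicePointer();  // null if not accessible from the current device
        Pointer hst = attrs.hostPointer();    // null if not accessible from the host
        int managed = attrs.isManaged();
    }
 * }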
* In this structure, the individual fields mean * * - \ref ::cudaPointerAttributes::memoryType "memoryType" identifies the physical * location of the memory associated with pointer \p ptr. It can be * ::cudaMemoryTypeHost for host memory or ::cudaMemoryTypeDevice for device * memory. * * - \ref ::cudaPointerAttributes::device "device" is the device against which * \p ptr was allocated. If \p ptr has memory type ::cudaMemoryTypeDevice * then this identifies the device on which the memory referred to by \p ptr * physically resides. If \p ptr has memory type ::cudaMemoryTypeHost then this * identifies the device which was current when the allocation was made * (and if that device is deinitialized then this allocation will vanish * with that device's state). * * - \ref ::cudaPointerAttributes::devicePointer "devicePointer" is * the device pointer alias through which the memory referred to by \p ptr * may be accessed on the current device. * If the memory referred to by \p ptr cannot be accessed directly by the * current device then this is NULL. * * - \ref ::cudaPointerAttributes::hostPointer "hostPointer" is * the host pointer alias through which the memory referred to by \p ptr * may be accessed on the host. * If the memory referred to by \p ptr cannot be accessed directly by the * host then this is NULL. * * - \ref ::cudaPointerAttributes::isManaged "isManaged" indicates if * the pointer \p ptr points to managed memory or not. * * @param attributes - Attributes for the specified pointer * @param ptr - Pointer to get attributes for * * @return * ::cudaSuccess, * ::cudaErrorInvalidDevice, * ::cudaErrorInvalidValue * * \sa ::cudaGetDeviceCount, ::cudaGetDevice, ::cudaSetDevice, * ::cudaChooseDevice */ public static native @Cast("cudaError_t") int cudaPointerGetAttributes(cudaPointerAttributes attributes, @Const Pointer ptr); /** \} */ /* END CUDART_UNIFIED */ /** * \defgroup CUDART_PEER Peer Device Memory Access * * ___MANBRIEF___ peer device memory access functions of the CUDA runtime API * (___CURRENT_FILE___) ___ENDMANBRIEF___ * * This section describes the peer device memory access functions of the CUDA runtime * application programming interface. * * \{ */ /** * \brief Queries if a device may directly access a peer device's memory. * * Returns in \p *canAccessPeer a value of 1 if device \p device is capable of * directly accessing memory from \p peerDevice and 0 otherwise. If direct * access of \p peerDevice from \p device is possible, then access may be * enabled by calling ::cudaDeviceEnablePeerAccess(). * * @param canAccessPeer - Returned access capability * @param device - Device from which allocations on \p peerDevice are to * be directly accessed. * @param peerDevice - Device on which the allocations to be directly accessed * by \p device reside. * * @return * ::cudaSuccess, * ::cudaErrorInvalidDevice * \notefnerr * * \sa ::cudaDeviceEnablePeerAccess, * ::cudaDeviceDisablePeerAccess */ public static native @Cast("cudaError_t") int cudaDeviceCanAccessPeer(IntPointer canAccessPeer, int device, int peerDevice); public static native @Cast("cudaError_t") int cudaDeviceCanAccessPeer(IntBuffer canAccessPeer, int device, int peerDevice); public static native @Cast("cudaError_t") int cudaDeviceCanAccessPeer(int[] canAccessPeer, int device, int peerDevice); /** * \brief Enables direct access to memory allocations on a peer device. * * On success, all allocations from \p peerDevice will immediately be accessible by * the current device. 
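 * A hedged sketch of the query-then-enable pattern (::cudaSetDevice is assumed from elsewhere in
 * this class; devices 0 and 1 are placeholders):
{@code
    // Illustrative sketch: enable one-directional access from device 0 to device 1.
    cudaSetDevice(0);
    int[] canAccess = {0};
    cudaDeviceCanAccessPeer(canAccess, 0, 1);
    if (canAccess[0] == 1) {
        cudaDeviceEnablePeerAccess(1, 0);   // flags must be 0
    }
 * }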
They will remain accessible until access is explicitly * disabled using ::cudaDeviceDisablePeerAccess() or either device is reset using * ::cudaDeviceReset(). * * Note that access granted by this call is unidirectional and that in order to access * memory on the current device from \p peerDevice, a separate symmetric call * to ::cudaDeviceEnablePeerAccess() is required. * * Each device can support a system-wide maximum of eight peer connections. * * Peer access is not supported in 32 bit applications. * * Returns ::cudaErrorInvalidDevice if ::cudaDeviceCanAccessPeer() indicates * that the current device cannot directly access memory from \p peerDevice. * * Returns ::cudaErrorPeerAccessAlreadyEnabled if direct access of * \p peerDevice from the current device has already been enabled. * * Returns ::cudaErrorInvalidValue if \p flags is not 0. * * @param peerDevice - Peer device to enable direct access to from the current device * @param flags - Reserved for future use and must be set to 0 * * @return * ::cudaSuccess, * ::cudaErrorInvalidDevice, * ::cudaErrorPeerAccessAlreadyEnabled, * ::cudaErrorInvalidValue * \notefnerr * * \sa ::cudaDeviceCanAccessPeer, * ::cudaDeviceDisablePeerAccess */ public static native @Cast("cudaError_t") int cudaDeviceEnablePeerAccess(int peerDevice, @Cast("unsigned int") int flags); /** * \brief Disables direct access to memory allocations on a peer device. * * Returns ::cudaErrorPeerAccessNotEnabled if direct access to memory on * \p peerDevice has not yet been enabled from the current device. * * @param peerDevice - Peer device to disable direct access to * * @return * ::cudaSuccess, * ::cudaErrorPeerAccessNotEnabled, * ::cudaErrorInvalidDevice * \notefnerr * * \sa ::cudaDeviceCanAccessPeer, * ::cudaDeviceEnablePeerAccess */ public static native @Cast("cudaError_t") int cudaDeviceDisablePeerAccess(int peerDevice); /** \} */ /* END CUDART_PEER */ /** \defgroup CUDART_OPENGL OpenGL Interoperability */ /** \defgroup CUDART_OPENGL_DEPRECATED OpenGL Interoperability [DEPRECATED] */ /** \defgroup CUDART_D3D9 Direct3D 9 Interoperability */ /** \defgroup CUDART_D3D9_DEPRECATED Direct3D 9 Interoperability [DEPRECATED] */ /** \defgroup CUDART_D3D10 Direct3D 10 Interoperability */ /** \defgroup CUDART_D3D10_DEPRECATED Direct3D 10 Interoperability [DEPRECATED] */ /** \defgroup CUDART_D3D11 Direct3D 11 Interoperability */ /** \defgroup CUDART_D3D11_DEPRECATED Direct3D 11 Interoperability [DEPRECATED] */ /** \defgroup CUDART_VDPAU VDPAU Interoperability */ /** \defgroup CUDART_EGL EGL Interoperability */ /** * \defgroup CUDART_INTEROP Graphics Interoperability * * ___MANBRIEF___ graphics interoperability functions of the CUDA runtime API * (___CURRENT_FILE___) ___ENDMANBRIEF___ * * This section describes the graphics interoperability functions of the CUDA * runtime application programming interface. * * \{ */ /** * \brief Unregisters a graphics resource for access by CUDA * * Unregisters the graphics resource \p resource so it is not accessible by * CUDA unless registered again. * * If \p resource is invalid then ::cudaErrorInvalidResourceHandle is * returned. 
* * @param resource - Resource to unregister * * @return * ::cudaSuccess, * ::cudaErrorInvalidResourceHandle, * ::cudaErrorUnknown * \notefnerr * * \sa * ::cudaGraphicsD3D9RegisterResource, * ::cudaGraphicsD3D10RegisterResource, * ::cudaGraphicsD3D11RegisterResource, * ::cudaGraphicsGLRegisterBuffer, * ::cudaGraphicsGLRegisterImage */ public static native @Cast("cudaError_t") int cudaGraphicsUnregisterResource(cudaGraphicsResource resource); /** * \brief Set usage flags for mapping a graphics resource * * Set \p flags for mapping the graphics resource \p resource. * * Changes to \p flags will take effect the next time \p resource is mapped. * The \p flags argument may be any of the following: * - ::cudaGraphicsMapFlagsNone: Specifies no hints about how \p resource will * be used. It is therefore assumed that CUDA may read from or write to \p resource. * - ::cudaGraphicsMapFlagsReadOnly: Specifies that CUDA will not write to \p resource. * - ::cudaGraphicsMapFlagsWriteDiscard: Specifies CUDA will not read from \p resource and will * write over the entire contents of \p resource, so none of the data * previously stored in \p resource will be preserved. * * If \p resource is presently mapped for access by CUDA then ::cudaErrorUnknown is returned. * If \p flags is not one of the above values then ::cudaErrorInvalidValue is returned. * * @param resource - Registered resource to set flags for * @param flags - Parameters for resource mapping * * @return * ::cudaSuccess, * ::cudaErrorInvalidValue, * ::cudaErrorInvalidResourceHandle, * ::cudaErrorUnknown, * \notefnerr * * \sa * ::cudaGraphicsMapResources */ public static native @Cast("cudaError_t") int cudaGraphicsResourceSetMapFlags(cudaGraphicsResource resource, @Cast("unsigned int") int flags); /** * \brief Map graphics resources for access by CUDA * * Maps the \p count graphics resources in \p resources for access by CUDA. * * The resources in \p resources may be accessed by CUDA until they * are unmapped. The graphics API from which \p resources were registered * should not access any resources while they are mapped by CUDA. If an * application does so, the results are undefined. * * This function provides the synchronization guarantee that any graphics calls * issued before ::cudaGraphicsMapResources() will complete before any subsequent CUDA * work issued in \p stream begins. * * If \p resources contains any duplicate entries then ::cudaErrorInvalidResourceHandle * is returned. If any of \p resources are presently mapped for access by * CUDA then ::cudaErrorUnknown is returned. * * @param count - Number of resources to map * @param resources - Resources to map for CUDA * @param stream - Stream for synchronization * * @return * ::cudaSuccess, * ::cudaErrorInvalidResourceHandle, * ::cudaErrorUnknown * \note_null_stream * \notefnerr * * \sa * ::cudaGraphicsResourceGetMappedPointer, * ::cudaGraphicsSubResourceGetMappedArray, * ::cudaGraphicsUnmapResources */ public static native @Cast("cudaError_t") int cudaGraphicsMapResources(int count, @ByPtrPtr cudaGraphicsResource resources, CUstream_st stream/*=0*/); public static native @Cast("cudaError_t") int cudaGraphicsMapResources(int count, @ByPtrPtr cudaGraphicsResource resources); /** * \brief Unmap graphics resources. * * Unmaps the \p count graphics resources in \p resources. * * Once unmapped, the resources in \p resources may not be accessed by CUDA * until they are mapped again. 
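 * A hedged sketch of the usual map / access / unmap cycle (the registered \p resource is an
 * assumption here, e.g. obtained earlier from ::cudaGraphicsGLRegisterBuffer):
{@code
    // resource is assumed to have been registered already (registration not shown).
    cudaGraphicsMapResources(1, resource);                        // default (NULL) stream overload
    Pointer devPtr = new Pointer();
    SizeTPointer size = new SizeTPointer(1);
    cudaGraphicsResourceGetMappedPointer(devPtr, size, resource);
    // ... launch work that reads or writes devPtr ...
    cudaGraphicsUnmapResources(1, resource);                      // the graphics API may use it again afterwards
 * }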
* * This function provides the synchronization guarantee that any CUDA work issued * in \p stream before ::cudaGraphicsUnmapResources() will complete before any * subsequently issued graphics work begins. * * If \p resources contains any duplicate entries then ::cudaErrorInvalidResourceHandle * is returned. If any of \p resources are not presently mapped for access by * CUDA then ::cudaErrorUnknown is returned. * * @param count - Number of resources to unmap * @param resources - Resources to unmap * @param stream - Stream for synchronization * * @return * ::cudaSuccess, * ::cudaErrorInvalidResourceHandle, * ::cudaErrorUnknown * \note_null_stream * \notefnerr * * \sa * ::cudaGraphicsMapResources */ public static native @Cast("cudaError_t") int cudaGraphicsUnmapResources(int count, @ByPtrPtr cudaGraphicsResource resources, CUstream_st stream/*=0*/); public static native @Cast("cudaError_t") int cudaGraphicsUnmapResources(int count, @ByPtrPtr cudaGraphicsResource resources); /** * \brief Get an device pointer through which to access a mapped graphics resource. * * Returns in \p *devPtr a pointer through which the mapped graphics resource * \p resource may be accessed. * Returns in \p *size the size of the memory in bytes which may be accessed from that pointer. * The value set in \p devPtr may change every time that \p resource is mapped. * * If \p resource is not a buffer then it cannot be accessed via a pointer and * ::cudaErrorUnknown is returned. * If \p resource is not mapped then ::cudaErrorUnknown is returned. * * * @param devPtr - Returned pointer through which \p resource may be accessed * @param size - Returned size of the buffer accessible starting at \p *devPtr * @param resource - Mapped resource to access * * @return * ::cudaSuccess, * ::cudaErrorInvalidValue, * ::cudaErrorInvalidResourceHandle, * ::cudaErrorUnknown * \notefnerr * * \sa * ::cudaGraphicsMapResources, * ::cudaGraphicsSubResourceGetMappedArray */ public static native @Cast("cudaError_t") int cudaGraphicsResourceGetMappedPointer(@Cast("void**") PointerPointer devPtr, @Cast("size_t*") SizeTPointer size, cudaGraphicsResource resource); public static native @Cast("cudaError_t") int cudaGraphicsResourceGetMappedPointer(@Cast("void**") @ByPtrPtr Pointer devPtr, @Cast("size_t*") SizeTPointer size, cudaGraphicsResource resource); /** * \brief Get an array through which to access a subresource of a mapped graphics resource. * * Returns in \p *array an array through which the subresource of the mapped * graphics resource \p resource which corresponds to array index \p arrayIndex * and mipmap level \p mipLevel may be accessed. The value set in \p array may * change every time that \p resource is mapped. * * If \p resource is not a texture then it cannot be accessed via an array and * ::cudaErrorUnknown is returned. * If \p arrayIndex is not a valid array index for \p resource then * ::cudaErrorInvalidValue is returned. * If \p mipLevel is not a valid mipmap level for \p resource then * ::cudaErrorInvalidValue is returned. * If \p resource is not mapped then ::cudaErrorUnknown is returned. 
* * @param array - Returned array through which a subresource of \p resource may be accessed * @param resource - Mapped resource to access * @param arrayIndex - Array index for array textures or cubemap face * index as defined by ::cudaGraphicsCubeFace for * cubemap textures for the subresource to access * @param mipLevel - Mipmap level for the subresource to access * * @return * ::cudaSuccess, * ::cudaErrorInvalidValue, * ::cudaErrorInvalidResourceHandle, * ::cudaErrorUnknown * \notefnerr * * \sa ::cudaGraphicsResourceGetMappedPointer */ public static native @Cast("cudaError_t") int cudaGraphicsSubResourceGetMappedArray(@ByPtrPtr cudaArray array, cudaGraphicsResource resource, @Cast("unsigned int") int arrayIndex, @Cast("unsigned int") int mipLevel); /** * \brief Get a mipmapped array through which to access a mapped graphics resource. * * Returns in \p *mipmappedArray a mipmapped array through which the mapped * graphics resource \p resource may be accessed. The value set in \p mipmappedArray may * change every time that \p resource is mapped. * * If \p resource is not a texture then it cannot be accessed via an array and * ::cudaErrorUnknown is returned. * If \p resource is not mapped then ::cudaErrorUnknown is returned. * * @param mipmappedArray - Returned mipmapped array through which \p resource may be accessed * @param resource - Mapped resource to access * * @return * ::cudaSuccess, * ::cudaErrorInvalidValue, * ::cudaErrorInvalidResourceHandle, * ::cudaErrorUnknown * \notefnerr * * \sa ::cudaGraphicsResourceGetMappedPointer */ public static native @Cast("cudaError_t") int cudaGraphicsResourceGetMappedMipmappedArray(@ByPtrPtr cudaMipmappedArray mipmappedArray, cudaGraphicsResource resource); /** \} */ /* END CUDART_INTEROP */ /** * \defgroup CUDART_TEXTURE Texture Reference Management * * ___MANBRIEF___ texture reference management functions of the CUDA runtime * API (___CURRENT_FILE___) ___ENDMANBRIEF___ * * This section describes the low level texture reference management functions * of the CUDA runtime application programming interface. * * Some functions have overloaded C++ API template versions documented separately in the * \ref CUDART_HIGHLEVEL "C++ API Routines" module. * * \{ */ /** * \brief Get the channel descriptor of an array * * Returns in \p *desc the channel descriptor of the CUDA array \p array. 
* * @param desc - Channel format * @param array - Memory array on device * * @return * ::cudaSuccess, * ::cudaErrorInvalidValue * \notefnerr * * \sa \ref ::cudaCreateChannelDesc(int, int, int, int, cudaChannelFormatKind) "cudaCreateChannelDesc (C API)", * ::cudaGetTextureReference, * \ref ::cudaBindTexture(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t) "cudaBindTexture (C API)", * \ref ::cudaBindTexture2D(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t, size_t, size_t) "cudaBindTexture2D (C API)", * \ref ::cudaBindTextureToArray(const struct textureReference*, cudaArray_const_t, const struct cudaChannelFormatDesc*) "cudaBindTextureToArray (C API)", * \ref ::cudaUnbindTexture(const struct textureReference*) "cudaUnbindTexture (C API)", * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct textureReference*) "cudaGetTextureAlignmentOffset (C API)" */ public static native @Cast("cudaError_t") int cudaGetChannelDesc(cudaChannelFormatDesc desc, cudaArray array); /** * \brief Returns a channel descriptor using the specified format * * Returns a channel descriptor with format \p f and number of bits of each * component \p x, \p y, \p z, and \p w. The ::cudaChannelFormatDesc is * defined as: *
{@code
  struct cudaChannelFormatDesc {
    int x, y, z, w;
    enum cudaChannelFormatKind f;
  };
 * }
* * where ::cudaChannelFormatKind is one of ::cudaChannelFormatKindSigned, * ::cudaChannelFormatKindUnsigned, or ::cudaChannelFormatKindFloat. * * @param x - X component * @param y - Y component * @param z - Z component * @param w - W component * @param f - Channel format * * @return * Channel descriptor with format \p f * * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API)", * ::cudaGetChannelDesc, ::cudaGetTextureReference, * \ref ::cudaBindTexture(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t) "cudaBindTexture (C API)", * \ref ::cudaBindTexture2D(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t, size_t, size_t) "cudaBindTexture2D (C API)", * \ref ::cudaBindTextureToArray(const struct textureReference*, cudaArray_const_t, const struct cudaChannelFormatDesc*) "cudaBindTextureToArray (C API)", * \ref ::cudaUnbindTexture(const struct textureReference*) "cudaUnbindTexture (C API)", * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct textureReference*) "cudaGetTextureAlignmentOffset (C API)" */ public static native @ByVal cudaChannelFormatDesc cudaCreateChannelDesc(int x, int y, int z, int w, @Cast("cudaChannelFormatKind") int f); /** * \brief Binds a memory area to a texture * * Binds \p size bytes of the memory area pointed to by \p devPtr to the * texture reference \p texref. \p desc describes how the memory is interpreted * when fetching values from the texture. Any memory previously bound to * \p texref is unbound. * * Since the hardware enforces an alignment requirement on texture base * addresses, * \ref ::cudaBindTexture(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t) "cudaBindTexture()" * returns in \p *offset a byte offset that * must be applied to texture fetches in order to read from the desired memory. * This offset must be divided by the texel size and passed to kernels that * read from the texture so they can be applied to the ::tex1Dfetch() function. * If the device memory pointer was returned from ::cudaMalloc(), the offset is * guaranteed to be 0 and NULL may be passed as the \p offset parameter. * * The total number of elements (or texels) in the linear address range * cannot exceed ::cudaDeviceProp::maxTexture1DLinear[0]. * The number of elements is computed as (\p size / elementSize), * where elementSize is determined from \p desc. 
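 * A hedged sketch of binding linear memory (the texture reference \p texref and the device buffer
 * \p devPtr are assumptions, e.g. from ::cudaGetTextureReference and ::cudaMalloc respectively):
{@code
    // Channel descriptor for one 32-bit float component per texel.
    cudaChannelFormatDesc desc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
    SizeTPointer offset = new SizeTPointer(1);
    cudaBindTexture(offset, texref, devPtr, desc, 4 * 1024);  // bind 1024 floats of linear device memory
 * }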
* * @param offset - Offset in bytes * @param texref - Texture to bind * @param devPtr - Memory area on device * @param desc - Channel format * @param size - Size of the memory area pointed to by devPtr * * @return * ::cudaSuccess, * ::cudaErrorInvalidValue, * ::cudaErrorInvalidDevicePointer, * ::cudaErrorInvalidTexture * \notefnerr * * \sa \ref ::cudaCreateChannelDesc(int, int, int, int, cudaChannelFormatKind) "cudaCreateChannelDesc (C API)", * ::cudaGetChannelDesc, ::cudaGetTextureReference, * \ref ::cudaBindTexture(size_t*, const struct texture< T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTexture (C++ API)", * \ref ::cudaBindTexture2D(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t, size_t, size_t) "cudaBindTexture2D (C API)", * \ref ::cudaBindTextureToArray(const struct textureReference*, cudaArray_const_t, const struct cudaChannelFormatDesc*) "cudaBindTextureToArray (C API)", * \ref ::cudaUnbindTexture(const struct textureReference*) "cudaUnbindTexture (C API)", * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct textureReference*) "cudaGetTextureAlignmentOffset (C API)" */ public static native @Cast("cudaError_t") int cudaBindTexture(@Cast("size_t*") SizeTPointer offset, @Const textureReference texref, @Const Pointer devPtr, @Const cudaChannelFormatDesc desc, @Cast("size_t") long size/*=UINT_MAX*/); public static native @Cast("cudaError_t") int cudaBindTexture(@Cast("size_t*") SizeTPointer offset, @Const textureReference texref, @Const Pointer devPtr, @Const cudaChannelFormatDesc desc); /** * \brief Binds a 2D memory area to a texture * * Binds the 2D memory area pointed to by \p devPtr to the * texture reference \p texref. The size of the area is constrained by * \p width in texel units, \p height in texel units, and \p pitch in byte * units. \p desc describes how the memory is interpreted when fetching values * from the texture. Any memory previously bound to \p texref is unbound. * * Since the hardware enforces an alignment requirement on texture base * addresses, ::cudaBindTexture2D() returns in \p *offset a byte offset that * must be applied to texture fetches in order to read from the desired memory. * This offset must be divided by the texel size and passed to kernels that * read from the texture so they can be applied to the ::tex2D() function. * If the device memory pointer was returned from ::cudaMalloc(), the offset is * guaranteed to be 0 and NULL may be passed as the \p offset parameter. * * \p width and \p height, which are specified in elements (or texels), cannot * exceed ::cudaDeviceProp::maxTexture2DLinear[0] and ::cudaDeviceProp::maxTexture2DLinear[1] * respectively. \p pitch, which is specified in bytes, cannot exceed * ::cudaDeviceProp::maxTexture2DLinear[2]. * * The driver returns ::cudaErrorInvalidValue if \p pitch is not a multiple of * ::cudaDeviceProp::texturePitchAlignment. 
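 * A hedged 2D variant of the same idea (\p devPtr and \p pitch are assumed to come from
 * ::cudaMallocPitch, and \p texref from ::cudaGetTextureReference):
{@code
    cudaChannelFormatDesc desc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
    SizeTPointer offset = new SizeTPointer(1);
    // width and height are in texels; pitch (in bytes) must satisfy texturePitchAlignment.
    cudaBindTexture2D(offset, texref, devPtr, desc, 512, 512, pitch);
 * }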
* * @param offset - Offset in bytes * @param texref - Texture reference to bind * @param devPtr - 2D memory area on device * @param desc - Channel format * @param width - Width in texel units * @param height - Height in texel units * @param pitch - Pitch in bytes * * @return * ::cudaSuccess, * ::cudaErrorInvalidValue, * ::cudaErrorInvalidDevicePointer, * ::cudaErrorInvalidTexture * \notefnerr * * \sa \ref ::cudaCreateChannelDesc(int, int, int, int, cudaChannelFormatKind) "cudaCreateChannelDesc (C API)", * ::cudaGetChannelDesc, ::cudaGetTextureReference, * \ref ::cudaBindTexture(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t) "cudaBindTexture (C API)", * \ref ::cudaBindTexture2D(size_t*, const struct texture< T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D (C++ API)", * \ref ::cudaBindTexture2D(size_t*, const struct texture&, const void*, size_t, size_t, size_t) "cudaBindTexture2D (C++ API, inherited channel descriptor)", * \ref ::cudaBindTextureToArray(const struct textureReference*, cudaArray_const_t, const struct cudaChannelFormatDesc*) "cudaBindTextureToArray (C API)", * \ref ::cudaUnbindTexture(const struct textureReference*) "cudaBindTextureToArray (C API)", * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct textureReference*) "cudaGetTextureAlignmentOffset (C API)" */ public static native @Cast("cudaError_t") int cudaBindTexture2D(@Cast("size_t*") SizeTPointer offset, @Const textureReference texref, @Const Pointer devPtr, @Const cudaChannelFormatDesc desc, @Cast("size_t") long width, @Cast("size_t") long height, @Cast("size_t") long pitch); /** * \brief Binds an array to a texture * * Binds the CUDA array \p array to the texture reference \p texref. * \p desc describes how the memory is interpreted when fetching values from * the texture. Any CUDA array previously bound to \p texref is unbound. * * @param texref - Texture to bind * @param array - Memory array on device * @param desc - Channel format * * @return * ::cudaSuccess, * ::cudaErrorInvalidValue, * ::cudaErrorInvalidDevicePointer, * ::cudaErrorInvalidTexture * \notefnerr * * \sa \ref ::cudaCreateChannelDesc(int, int, int, int, cudaChannelFormatKind) "cudaCreateChannelDesc (C API)", * ::cudaGetChannelDesc, ::cudaGetTextureReference, * \ref ::cudaBindTexture(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t) "cudaBindTexture (C API)", * \ref ::cudaBindTexture2D(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t, size_t, size_t) "cudaBindTexture2D (C API)", * \ref ::cudaBindTextureToArray(const struct texture< T, dim, readMode>&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindTextureToArray (C++ API)", * \ref ::cudaUnbindTexture(const struct textureReference*) "cudaUnbindTexture (C API)", * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct textureReference*) "cudaGetTextureAlignmentOffset (C API)" */ public static native @Cast("cudaError_t") int cudaBindTextureToArray(@Const textureReference texref, cudaArray array, @Const cudaChannelFormatDesc desc); /** * \brief Binds a mipmapped array to a texture * * Binds the CUDA mipmapped array \p mipmappedArray to the texture reference \p texref. * \p desc describes how the memory is interpreted when fetching values from * the texture. Any CUDA mipmapped array previously bound to \p texref is unbound. 
* * @param texref - Texture to bind * @param mipmappedArray - Memory mipmapped array on device * @param desc - Channel format * * @return * ::cudaSuccess, * ::cudaErrorInvalidValue, * ::cudaErrorInvalidDevicePointer, * ::cudaErrorInvalidTexture * \notefnerr * * \sa \ref ::cudaCreateChannelDesc(int, int, int, int, cudaChannelFormatKind) "cudaCreateChannelDesc (C API)", * ::cudaGetChannelDesc, ::cudaGetTextureReference, * \ref ::cudaBindTexture(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t) "cudaBindTexture (C API)", * \ref ::cudaBindTexture2D(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t, size_t, size_t) "cudaBindTexture2D (C API)", * \ref ::cudaBindTextureToArray(const struct texture< T, dim, readMode>&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindTextureToArray (C++ API)", * \ref ::cudaUnbindTexture(const struct textureReference*) "cudaUnbindTexture (C API)", * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct textureReference*) "cudaGetTextureAlignmentOffset (C API)" */ public static native @Cast("cudaError_t") int cudaBindTextureToMipmappedArray(@Const textureReference texref, cudaMipmappedArray mipmappedArray, @Const cudaChannelFormatDesc desc); /** * \brief Unbinds a texture * * Unbinds the texture bound to \p texref. * * @param texref - Texture to unbind * * @return * ::cudaSuccess * \notefnerr * * \sa \ref ::cudaCreateChannelDesc(int, int, int, int, cudaChannelFormatKind) "cudaCreateChannelDesc (C API)", * ::cudaGetChannelDesc, ::cudaGetTextureReference, * \ref ::cudaBindTexture(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t) "cudaBindTexture (C API)", * \ref ::cudaBindTexture2D(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t, size_t, size_t) "cudaBindTexture2D (C API)", * \ref ::cudaBindTextureToArray(const struct textureReference*, cudaArray_const_t, const struct cudaChannelFormatDesc*) "cudaBindTextureToArray (C API)", * \ref ::cudaUnbindTexture(const struct texture< T, dim, readMode>&) "cudaUnbindTexture (C++ API)", * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct textureReference*) "cudaGetTextureAlignmentOffset (C API)" */ public static native @Cast("cudaError_t") int cudaUnbindTexture(@Const textureReference texref); /** * \brief Get the alignment offset of a texture * * Returns in \p *offset the offset that was returned when texture reference * \p texref was bound. 
* * @param offset - Offset of texture reference in bytes * @param texref - Texture to get offset of * * @return * ::cudaSuccess, * ::cudaErrorInvalidTexture, * ::cudaErrorInvalidTextureBinding * \notefnerr * * \sa \ref ::cudaCreateChannelDesc(int, int, int, int, cudaChannelFormatKind) "cudaCreateChannelDesc (C API)", * ::cudaGetChannelDesc, ::cudaGetTextureReference, * \ref ::cudaBindTexture(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t) "cudaBindTexture (C API)", * \ref ::cudaBindTexture2D(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t, size_t, size_t) "cudaBindTexture2D (C API)", * \ref ::cudaBindTextureToArray(const struct textureReference*, cudaArray_const_t, const struct cudaChannelFormatDesc*) "cudaBindTextureToArray (C API)", * \ref ::cudaUnbindTexture(const struct textureReference*) "cudaUnbindTexture (C API)", * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture< T, dim, readMode>&) "cudaGetTextureAlignmentOffset (C++ API)" */ public static native @Cast("cudaError_t") int cudaGetTextureAlignmentOffset(@Cast("size_t*") SizeTPointer offset, @Const textureReference texref); /** * \brief Get the texture reference associated with a symbol * * Returns in \p *texref the structure associated to the texture reference * defined by symbol \p symbol. * * @param texref - Texture reference associated with symbol * @param symbol - Texture to get reference for * * @return * ::cudaSuccess, * ::cudaErrorInvalidTexture * \notefnerr * \note_string_api_deprecation_50 * * \sa \ref ::cudaCreateChannelDesc(int, int, int, int, cudaChannelFormatKind) "cudaCreateChannelDesc (C API)", * ::cudaGetChannelDesc, * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct textureReference*) "cudaGetTextureAlignmentOffset (C API)", * \ref ::cudaBindTexture(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t) "cudaBindTexture (C API)", * \ref ::cudaBindTexture2D(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t, size_t, size_t) "cudaBindTexture2D (C API)", * \ref ::cudaBindTextureToArray(const struct textureReference*, cudaArray_const_t, const struct cudaChannelFormatDesc*) "cudaBindTextureToArray (C API)", * \ref ::cudaUnbindTexture(const struct textureReference*) "cudaUnbindTexture (C API)" */ public static native @Cast("cudaError_t") int cudaGetTextureReference(@Cast("const textureReference**") PointerPointer texref, @Const Pointer symbol); public static native @Cast("cudaError_t") int cudaGetTextureReference(@Const @ByPtrPtr textureReference texref, @Const Pointer symbol); /** \} */ /* END CUDART_TEXTURE */ /** * \defgroup CUDART_SURFACE Surface Reference Management * * ___MANBRIEF___ surface reference management functions of the CUDA runtime * API (___CURRENT_FILE___) ___ENDMANBRIEF___ * * This section describes the low level surface reference management functions * of the CUDA runtime application programming interface. * * Some functions have overloaded C++ API template versions documented separately in the * \ref CUDART_HIGHLEVEL "C++ API Routines" module. * * \{ */ /** * \brief Binds an array to a surface * * Binds the CUDA array \p array to the surface reference \p surfref. * \p desc describes how the memory is interpreted when fetching values from * the surface. Any CUDA array previously bound to \p surfref is unbound. 
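 * A hedged sketch (the CUDA array \p cuArray and the surface reference \p surfref are assumptions,
 * the latter typically obtained via ::cudaGetSurfaceReference documented below):
{@code
    // cuArray and surfref are assumed to exist already (lookups not shown).
    cudaChannelFormatDesc desc = new cudaChannelFormatDesc();
    cudaGetChannelDesc(desc, cuArray);               // reuse the array's own channel format
    cudaBindSurfaceToArray(surfref, cuArray, desc);
 * }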
* * @param surfref - Surface to bind * @param array - Memory array on device * @param desc - Channel format * * @return * ::cudaSuccess, * ::cudaErrorInvalidValue, * ::cudaErrorInvalidSurface * \notefnerr * * \sa \ref ::cudaBindSurfaceToArray(const struct surface< T, dim>&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindSurfaceToArray (C++ API)", * \ref ::cudaBindSurfaceToArray(const struct surface< T, dim>&, cudaArray_const_t) "cudaBindSurfaceToArray (C++ API, inherited channel descriptor)", * ::cudaGetSurfaceReference */ public static native @Cast("cudaError_t") int cudaBindSurfaceToArray(@Const surfaceReference surfref, cudaArray array, @Const cudaChannelFormatDesc desc); /** * \brief Get the surface reference associated with a symbol * * Returns in \p *surfref the structure associated to the surface reference * defined by symbol \p symbol. * * @param surfref - Surface reference associated with symbol * @param symbol - Surface to get reference for * * @return * ::cudaSuccess, * ::cudaErrorInvalidSurface * \notefnerr * \note_string_api_deprecation_50 * * \sa \ref ::cudaBindSurfaceToArray(const struct surfaceReference*, cudaArray_const_t, const struct cudaChannelFormatDesc*) "cudaBindSurfaceToArray (C API)" */ public static native @Cast("cudaError_t") int cudaGetSurfaceReference(@Cast("const surfaceReference**") PointerPointer surfref, @Const Pointer symbol); public static native @Cast("cudaError_t") int cudaGetSurfaceReference(@Const @ByPtrPtr surfaceReference surfref, @Const Pointer symbol); /** \} */ /* END CUDART_SURFACE */ /** * \defgroup CUDART_TEXTURE_OBJECT Texture Object Management * * ___MANBRIEF___ texture object management functions of the CUDA runtime API * (___CURRENT_FILE___) ___ENDMANBRIEF___ * * This section describes the low level texture object management functions * of the CUDA runtime application programming interface. The texture * object API is only supported on devices of compute capability 3.0 or higher. * * \{ */ /** * \brief Creates a texture object * * Creates a texture object and returns it in \p pTexObject. \p pResDesc describes * the data to texture from. \p pTexDesc describes how the data should be sampled. * \p pResViewDesc is an optional argument that specifies an alternate format for * the data described by \p pResDesc, and also describes the subresource region * to restrict access to when texturing. \p pResViewDesc can only be specified if * the type of resource is a CUDA array or a CUDA mipmapped array. * * Texture objects are only supported on devices of compute capability 3.0 or higher. * Additionally, a texture object is an opaque value, and, as such, should only be * accessed through CUDA API calls. * * The ::cudaResourceDesc structure is defined as: *
{@code
        struct cudaResourceDesc {
            enum cudaResourceType resType;

            union {
                struct {
                    cudaArray_t array;
                } array;
                struct {
                    cudaMipmappedArray_t mipmap;
                } mipmap;
                struct {
                    void *devPtr;
                    struct cudaChannelFormatDesc desc;
                    size_t sizeInBytes;
                } linear;
                struct {
                    void *devPtr;
                    struct cudaChannelFormatDesc desc;
                    size_t width;
                    size_t height;
                    size_t pitchInBytes;
                } pitch2D;
            } res;
        };
 * }
 * where: * - ::cudaResourceDesc::resType specifies the type of resource to texture from. * ::cudaResourceType is defined as: *
{@code
        enum cudaResourceType {
            cudaResourceTypeArray          = 0x00,
            cudaResourceTypeMipmappedArray = 0x01,
            cudaResourceTypeLinear         = 0x02,
            cudaResourceTypePitch2D        = 0x03
        };
 * }
* * \par * If ::cudaResourceDesc::resType is set to ::cudaResourceTypeArray, ::cudaResourceDesc::res::array::array * must be set to a valid CUDA array handle. * * \par * If ::cudaResourceDesc::resType is set to ::cudaResourceTypeMipmappedArray, ::cudaResourceDesc::res::mipmap::mipmap * must be set to a valid CUDA mipmapped array handle and ::cudaTextureDesc::normalizedCoords must be set to true. * * \par * If ::cudaResourceDesc::resType is set to ::cudaResourceTypeLinear, ::cudaResourceDesc::res::linear::devPtr * must be set to a valid device pointer, that is aligned to ::cudaDeviceProp::textureAlignment. * ::cudaResourceDesc::res::linear::desc describes the format and the number of components per array element. ::cudaResourceDesc::res::linear::sizeInBytes * specifies the size of the array in bytes. The total number of elements in the linear address range cannot exceed * ::cudaDeviceProp::maxTexture1DLinear. The number of elements is computed as (sizeInBytes / sizeof(desc)). * * \par * If ::cudaResourceDesc::resType is set to ::cudaResourceTypePitch2D, ::cudaResourceDesc::res::pitch2D::devPtr * must be set to a valid device pointer, that is aligned to ::cudaDeviceProp::textureAlignment. * ::cudaResourceDesc::res::pitch2D::desc describes the format and the number of components per array element. ::cudaResourceDesc::res::pitch2D::width * and ::cudaResourceDesc::res::pitch2D::height specify the width and height of the array in elements, and cannot exceed * ::cudaDeviceProp::maxTexture2DLinear[0] and ::cudaDeviceProp::maxTexture2DLinear[1] respectively. * ::cudaResourceDesc::res::pitch2D::pitchInBytes specifies the pitch between two rows in bytes and has to be aligned to * ::cudaDeviceProp::texturePitchAlignment. Pitch cannot exceed ::cudaDeviceProp::maxTexture2DLinear[2]. * * * The ::cudaTextureDesc struct is defined as *
{@code
        struct cudaTextureDesc {
            enum cudaTextureAddressMode addressMode[3];
            enum cudaTextureFilterMode  filterMode;
            enum cudaTextureReadMode    readMode;
            int                         sRGB;
            float                       borderColor[4];
            int                         normalizedCoords;
            unsigned int                maxAnisotropy;
            enum cudaTextureFilterMode  mipmapFilterMode;
            float                       mipmapLevelBias;
            float                       minMipmapLevelClamp;
            float                       maxMipmapLevelClamp;
        };
 * }
* where * - ::cudaTextureDesc::addressMode specifies the addressing mode for each dimension of the texture data. ::cudaTextureAddressMode is defined as: *
{@code
        enum cudaTextureAddressMode {
            cudaAddressModeWrap   = 0,
            cudaAddressModeClamp  = 1,
            cudaAddressModeMirror = 2,
            cudaAddressModeBorder = 3
        };
 *   }
* This is ignored if ::cudaResourceDesc::resType is ::cudaResourceTypeLinear. Also, if ::cudaTextureDesc::normalizedCoords * is set to zero, ::cudaAddressModeWrap and ::cudaAddressModeMirror won't be supported and will be switched to ::cudaAddressModeClamp. * * - ::cudaTextureDesc::filterMode specifies the filtering mode to be used when fetching from the texture. ::cudaTextureFilterMode is defined as: *
{@code
        enum cudaTextureFilterMode {
            cudaFilterModePoint  = 0,
            cudaFilterModeLinear = 1
        };
 *   }
* This is ignored if ::cudaResourceDesc::resType is ::cudaResourceTypeLinear. * * - ::cudaTextureDesc::readMode specifies whether integer data should be converted to floating point or not. ::cudaTextureReadMode is defined as: *
{@code
        enum cudaTextureReadMode {
            cudaReadModeElementType     = 0,
            cudaReadModeNormalizedFloat = 1
        };
 *   }
 * Note that this applies only to 8-bit and 16-bit integer formats. A 32-bit integer format would not be promoted, regardless of * whether or not ::cudaTextureDesc::readMode is set to ::cudaReadModeNormalizedFloat. * * - ::cudaTextureDesc::sRGB specifies whether sRGB to linear conversion should be performed during texture fetch. * * - ::cudaTextureDesc::borderColor specifies the float values of the border color, where: * ::cudaTextureDesc::borderColor[0] contains the value of 'R', * ::cudaTextureDesc::borderColor[1] contains the value of 'G', * ::cudaTextureDesc::borderColor[2] contains the value of 'B', * ::cudaTextureDesc::borderColor[3] contains the value of 'A'. * Note that applications using integer border color values will need to reinterpret these values as float. * The values are set only when the addressing mode specified by ::cudaTextureDesc::addressMode is cudaAddressModeBorder. * * - ::cudaTextureDesc::normalizedCoords specifies whether the texture coordinates will be normalized or not. * * - ::cudaTextureDesc::maxAnisotropy specifies the maximum anisotropy ratio to be used when doing anisotropic filtering. This value will be * clamped to the range [1,16]. * * - ::cudaTextureDesc::mipmapFilterMode specifies the filter mode when the calculated mipmap level lies between two defined mipmap levels. * * - ::cudaTextureDesc::mipmapLevelBias specifies the offset to be applied to the calculated mipmap level. * * - ::cudaTextureDesc::minMipmapLevelClamp specifies the lower end of the mipmap level range to clamp access to. * * - ::cudaTextureDesc::maxMipmapLevelClamp specifies the upper end of the mipmap level range to clamp access to. * * * The ::cudaResourceViewDesc struct is defined as *
{@code
        struct cudaResourceViewDesc {
            enum cudaResourceViewFormat format;
            size_t                      width;
            size_t                      height;
            size_t                      depth;
            unsigned int                firstMipmapLevel;
            unsigned int                lastMipmapLevel;
            unsigned int                firstLayer;
            unsigned int                lastLayer;
        };
 * }
* where: * - ::cudaResourceViewDesc::format specifies how the data contained in the CUDA array or CUDA mipmapped array should * be interpreted. Note that this can incur a change in size of the texture data. If the resource view format is a block * compressed format, then the underlying CUDA array or CUDA mipmapped array has to have a 32-bit unsigned integer format * with 2 or 4 channels, depending on the block compressed format. For ex., BC1 and BC4 require the underlying CUDA array to have * a 32-bit unsigned int with 2 channels. The other BC formats require the underlying resource to have the same 32-bit unsigned int * format but with 4 channels. * * - ::cudaResourceViewDesc::width specifies the new width of the texture data. If the resource view format is a block * compressed format, this value has to be 4 times the original width of the resource. For non block compressed formats, * this value has to be equal to that of the original resource. * * - ::cudaResourceViewDesc::height specifies the new height of the texture data. If the resource view format is a block * compressed format, this value has to be 4 times the original height of the resource. For non block compressed formats, * this value has to be equal to that of the original resource. * * - ::cudaResourceViewDesc::depth specifies the new depth of the texture data. This value has to be equal to that of the * original resource. * * - ::cudaResourceViewDesc::firstMipmapLevel specifies the most detailed mipmap level. This will be the new mipmap level zero. * For non-mipmapped resources, this value has to be zero.::cudaTextureDesc::minMipmapLevelClamp and ::cudaTextureDesc::maxMipmapLevelClamp * will be relative to this value. For ex., if the firstMipmapLevel is set to 2, and a minMipmapLevelClamp of 1.2 is specified, * then the actual minimum mipmap level clamp will be 3.2. * * - ::cudaResourceViewDesc::lastMipmapLevel specifies the least detailed mipmap level. For non-mipmapped resources, this value * has to be zero. * * - ::cudaResourceViewDesc::firstLayer specifies the first layer index for layered textures. This will be the new layer zero. * For non-layered resources, this value has to be zero. * * - ::cudaResourceViewDesc::lastLayer specifies the last layer index for layered textures. For non-layered resources, * this value has to be zero. * * * @param pTexObject - Texture object to create * @param pResDesc - Resource descriptor * @param pTexDesc - Texture descriptor * @param pResViewDesc - Resource view descriptor * * @return * ::cudaSuccess, * ::cudaErrorInvalidValue * * \sa ::cudaDestroyTextureObject */ public static native @Cast("cudaError_t") int cudaCreateTextureObject(@Cast("cudaTextureObject_t*") LongPointer pTexObject, @Const cudaResourceDesc pResDesc, @Const cudaTextureDesc pTexDesc, @Const cudaResourceViewDesc pResViewDesc); public static native @Cast("cudaError_t") int cudaCreateTextureObject(@Cast("cudaTextureObject_t*") LongBuffer pTexObject, @Const cudaResourceDesc pResDesc, @Const cudaTextureDesc pTexDesc, @Const cudaResourceViewDesc pResViewDesc); public static native @Cast("cudaError_t") int cudaCreateTextureObject(@Cast("cudaTextureObject_t*") long[] pTexObject, @Const cudaResourceDesc pResDesc, @Const cudaTextureDesc pTexDesc, @Const cudaResourceViewDesc pResViewDesc); /** * \brief Destroys a texture object * * Destroys the texture object specified by \p texObject. 
* * @param texObject - Texture object to destroy * * @return * ::cudaSuccess, * ::cudaErrorInvalidValue * * \sa ::cudaCreateTextureObject */ public static native @Cast("cudaError_t") int cudaDestroyTextureObject(@Cast("cudaTextureObject_t") long texObject); /** * \brief Returns a texture object's resource descriptor * * Returns the resource descriptor for the texture object specified by \p texObject. * * @param pResDesc - Resource descriptor * @param texObject - Texture object * * @return * ::cudaSuccess, * ::cudaErrorInvalidValue * * \sa ::cudaCreateTextureObject */ public static native @Cast("cudaError_t") int cudaGetTextureObjectResourceDesc(cudaResourceDesc pResDesc, @Cast("cudaTextureObject_t") long texObject); /** * \brief Returns a texture object's texture descriptor * * Returns the texture descriptor for the texture object specified by \p texObject. * * @param pTexDesc - Texture descriptor * @param texObject - Texture object * * @return * ::cudaSuccess, * ::cudaErrorInvalidValue * * \sa ::cudaCreateTextureObject */ public static native @Cast("cudaError_t") int cudaGetTextureObjectTextureDesc(cudaTextureDesc pTexDesc, @Cast("cudaTextureObject_t") long texObject); /** * \brief Returns a texture object's resource view descriptor * * Returns the resource view descriptor for the texture object specified by \p texObject. * If no resource view was specified, ::cudaErrorInvalidValue is returned. * * @param pResViewDesc - Resource view descriptor * @param texObject - Texture object * * @return * ::cudaSuccess, * ::cudaErrorInvalidValue * * \sa ::cudaCreateTextureObject */ public static native @Cast("cudaError_t") int cudaGetTextureObjectResourceViewDesc(cudaResourceViewDesc pResViewDesc, @Cast("cudaTextureObject_t") long texObject); /** \} */ /* END CUDART_TEXTURE_OBJECT */ /** * \defgroup CUDART_SURFACE_OBJECT Surface Object Management * * ___MANBRIEF___ surface object management functions of the CUDA runtime API * (___CURRENT_FILE___) ___ENDMANBRIEF___ * * This section describes the low level texture object management functions * of the CUDA runtime application programming interface. The surface object * API is only supported on devices of compute capability 3.0 or higher. * * \{ */ /** * \brief Creates a surface object * * Creates a surface object and returns it in \p pSurfObject. \p pResDesc describes * the data to perform surface load/stores on. ::cudaResourceDesc::resType must be * ::cudaResourceTypeArray and ::cudaResourceDesc::res::array::array * must be set to a valid CUDA array handle. * * Surface objects are only supported on devices of compute capability 3.0 or higher. * Additionally, a surface object is an opaque value, and, as such, should only be * accessed through CUDA API calls. * * @param pSurfObject - Surface object to create * @param pResDesc - Resource descriptor * * @return * ::cudaSuccess, * ::cudaErrorInvalidValue * * \sa ::cudaDestroySurfaceObject */ public static native @Cast("cudaError_t") int cudaCreateSurfaceObject(@Cast("cudaSurfaceObject_t*") LongPointer pSurfObject, @Const cudaResourceDesc pResDesc); public static native @Cast("cudaError_t") int cudaCreateSurfaceObject(@Cast("cudaSurfaceObject_t*") LongBuffer pSurfObject, @Const cudaResourceDesc pResDesc); public static native @Cast("cudaError_t") int cudaCreateSurfaceObject(@Cast("cudaSurfaceObject_t*") long[] pSurfObject, @Const cudaResourceDesc pResDesc); /** * \brief Destroys a surface object * * Destroys the surface object specified by \p surfObject. 
* * @param surfObject - Surface object to destroy * * @return * ::cudaSuccess, * ::cudaErrorInvalidValue * * \sa ::cudaCreateSurfaceObject */ public static native @Cast("cudaError_t") int cudaDestroySurfaceObject(@Cast("cudaSurfaceObject_t") long surfObject); /** * \brief Returns a surface object's resource descriptor * Returns the resource descriptor for the surface object specified by \p surfObject. * * @param pResDesc - Resource descriptor * @param surfObject - Surface object * * @return * ::cudaSuccess, * ::cudaErrorInvalidValue * * \sa ::cudaCreateSurfaceObject */ public static native @Cast("cudaError_t") int cudaGetSurfaceObjectResourceDesc(cudaResourceDesc pResDesc, @Cast("cudaSurfaceObject_t") long surfObject); /** \} */ /* END CUDART_SURFACE_OBJECT */ /** * \defgroup CUDART__VERSION Version Management * * \{ */ /** * \brief Returns the CUDA driver version * * Returns in \p *driverVersion the version number of the installed CUDA * driver. If no driver is installed, then 0 is returned as the driver * version (via \p driverVersion). This function automatically returns * ::cudaErrorInvalidValue if the \p driverVersion argument is NULL. * * @param driverVersion - Returns the CUDA driver version. * * @return * ::cudaSuccess, * ::cudaErrorInvalidValue * \notefnerr * * \sa ::cudaRuntimeGetVersion */ public static native @Cast("cudaError_t") int cudaDriverGetVersion(IntPointer driverVersion); public static native @Cast("cudaError_t") int cudaDriverGetVersion(IntBuffer driverVersion); public static native @Cast("cudaError_t") int cudaDriverGetVersion(int[] driverVersion); /** * \brief Returns the CUDA Runtime version * * Returns in \p *runtimeVersion the version number of the installed CUDA * Runtime. This function automatically returns ::cudaErrorInvalidValue if * the \p runtimeVersion argument is NULL. * * @param runtimeVersion - Returns the CUDA Runtime version. * * @return * ::cudaSuccess, * ::cudaErrorInvalidValue * * \sa ::cudaDriverGetVersion */ public static native @Cast("cudaError_t") int cudaRuntimeGetVersion(IntPointer runtimeVersion); public static native @Cast("cudaError_t") int cudaRuntimeGetVersion(IntBuffer runtimeVersion); public static native @Cast("cudaError_t") int cudaRuntimeGetVersion(int[] runtimeVersion); /** \} */ /* END CUDART__VERSION */ /** \cond impl_private */ public static native @Cast("cudaError_t") int cudaGetExportTable(@Cast("const void**") PointerPointer ppExportTable, @Const cudaUUID_t pExportTableId); public static native @Cast("cudaError_t") int cudaGetExportTable(@Cast("const void**") @ByPtrPtr Pointer ppExportTable, @Const cudaUUID_t pExportTableId); /** \endcond impl_private */ /** * \defgroup CUDART_HIGHLEVEL C++ API Routines * * ___MANBRIEF___ C++ high level API functions of the CUDA runtime API * (___CURRENT_FILE___) ___ENDMANBRIEF___ * * This section describes the C++ high level API functions of the CUDA runtime * application programming interface. To use these functions, your * application needs to be compiled with the \p nvcc compiler. 
* * \brief C++-style interface built on top of CUDA runtime API */ /** * \defgroup CUDART_DRIVER Interactions with the CUDA Driver API * * ___MANBRIEF___ interactions between CUDA Driver API and CUDA Runtime API * (___CURRENT_FILE___) ___ENDMANBRIEF___ * * This section describes the interactions between the CUDA Driver API and the CUDA Runtime API * * \{ * * \section CUDART_CUDA_primary Primary Contexts * * There exists a one to one relationship between CUDA devices in the CUDA Runtime * API and ::CUcontext s in the CUDA Driver API within a process. The specific * context which the CUDA Runtime API uses for a device is called the device's * primary context. From the perspective of the CUDA Runtime API, a device and * its primary context are synonymous. * * \section CUDART_CUDA_init Initialization and Tear-Down * * CUDA Runtime API calls operate on the CUDA Driver API ::CUcontext which is current to * to the calling host thread. * * The function ::cudaSetDevice() makes the primary context for the * specified device current to the calling thread by calling ::cuCtxSetCurrent(). * * The CUDA Runtime API will automatically initialize the primary context for * a device at the first CUDA Runtime API call which requires an active context. * If no ::CUcontext is current to the calling thread when a CUDA Runtime API call * which requires an active context is made, then the primary context for a device * will be selected, made current to the calling thread, and initialized. * * The context which the CUDA Runtime API initializes will be initialized using * the parameters specified by the CUDA Runtime API functions * ::cudaSetDeviceFlags(), * ::cudaD3D9SetDirect3DDevice(), * ::cudaD3D10SetDirect3DDevice(), * ::cudaD3D11SetDirect3DDevice(), * ::cudaGLSetGLDevice(), and * ::cudaVDPAUSetVDPAUDevice(). * Note that these functions will fail with ::cudaErrorSetOnActiveProcess if they are * called when the primary context for the specified device has already been initialized. * (or if the current device has already been initialized, in the case of * ::cudaSetDeviceFlags()). * * Primary contexts will remain active until they are explicitly deinitialized * using ::cudaDeviceReset(). The function ::cudaDeviceReset() will deinitialize the * primary context for the calling thread's current device immediately. The context * will remain current to all of the threads that it was current to. The next CUDA * Runtime API call on any thread which requires an active context will trigger the * reinitialization of that device's primary context. * * Note that there is no reference counting of the primary context's lifetime. It is * recommended that the primary context not be deinitialized except just before exit * or to recover from an unspecified launch failure. * * \section CUDART_CUDA_context Context Interoperability * * Note that the use of multiple ::CUcontext s per device within a single process * will substantially degrade performance and is strongly discouraged. Instead, * it is highly recommended that the implicit one-to-one device-to-context mapping * for the process provided by the CUDA Runtime API be used. * * If a non-primary ::CUcontext created by the CUDA Driver API is current to a * thread then the CUDA Runtime API calls to that thread will operate on that * ::CUcontext, with some exceptions listed below. Interoperability between data * types is discussed in the following sections. 
* * The function ::cudaPointerGetAttributes() will return the error * ::cudaErrorIncompatibleDriverContext if the pointer being queried was allocated by a * non-primary context. The function ::cudaDeviceEnablePeerAccess() and the rest of * the peer access API may not be called when a non-primary ::CUcontext is current. * To use the pointer query and peer access APIs with a context created using the * CUDA Driver API, it is necessary that the CUDA Driver API be used to access * these features. * * All CUDA Runtime API state (e.g, global variables' addresses and values) travels * with its underlying ::CUcontext. In particular, if a ::CUcontext is moved from one * thread to another then all CUDA Runtime API state will move to that thread as well. * * Please note that attaching to legacy contexts (those with a version of 3010 as returned * by ::cuCtxGetApiVersion()) is not possible. The CUDA Runtime will return * ::cudaErrorIncompatibleDriverContext in such cases. * * \section CUDART_CUDA_stream Interactions between CUstream and cudaStream_t * * The types ::CUstream and ::cudaStream_t are identical and may be used interchangeably. * * \section CUDART_CUDA_event Interactions between CUevent and cudaEvent_t * * The types ::CUevent and ::cudaEvent_t are identical and may be used interchangeably. * * \section CUDART_CUDA_array Interactions between CUarray and cudaArray_t * * The types ::CUarray and struct ::cudaArray * represent the same data type and may be used * interchangeably by casting the two types between each other. * * In order to use a ::CUarray in a CUDA Runtime API function which takes a struct ::cudaArray *, * it is necessary to explicitly cast the ::CUarray to a struct ::cudaArray *. * * In order to use a struct ::cudaArray * in a CUDA Driver API function which takes a ::CUarray, * it is necessary to explicitly cast the struct ::cudaArray * to a ::CUarray . * * \section CUDART_CUDA_graphicsResource Interactions between CUgraphicsResource and cudaGraphicsResource_t * * The types ::CUgraphicsResource and ::cudaGraphicsResource_t represent the same data type and may be used * interchangeably by casting the two types between each other. * * In order to use a ::CUgraphicsResource in a CUDA Runtime API function which takes a * ::cudaGraphicsResource_t, it is necessary to explicitly cast the ::CUgraphicsResource * to a ::cudaGraphicsResource_t. * * In order to use a ::cudaGraphicsResource_t in a CUDA Driver API function which takes a * ::CUgraphicsResource, it is necessary to explicitly cast the ::cudaGraphicsResource_t * to a ::CUgraphicsResource. * * \} */ // #if defined(__CUDA_API_VERSION_INTERNAL) // #elif defined(__CUDART_API_PER_THREAD_DEFAULT_STREAM) // #endif // #if defined(__cplusplus) // #endif /* __cplusplus */ // #undef __dv // #endif /* !__CUDA_RUNTIME_API_H__ */ // Parsed from /* * Copyright 1993-2012 NVIDIA Corporation. All rights reserved. * * NOTICE TO LICENSEE: * * This source code and/or documentation ("Licensed Deliverables") are * subject to NVIDIA intellectual property rights under U.S. and * international Copyright laws. * * These Licensed Deliverables contained herein is PROPRIETARY and * CONFIDENTIAL to NVIDIA and is being provided under the terms and * conditions of a form of NVIDIA software license agreement by and * between NVIDIA and Licensee ("License Agreement") or electronically * accepted by Licensee. 
Notwithstanding any terms or conditions to * the contrary in the License Agreement, reproduction or disclosure * of the Licensed Deliverables to any third party without the express * written consent of NVIDIA is prohibited. * * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE * OF THESE LICENSED DELIVERABLES. * * U.S. Government End Users. These Licensed Deliverables are a * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT * 1995), consisting of "commercial computer software" and "commercial * computer software documentation" as such terms are used in 48 * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government * only as a commercial end item. Consistent with 48 C.F.R.12.212 and * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all * U.S. Government End Users acquire the Licensed Deliverables with * only those rights set forth herein. * * Any use of the Licensed Deliverables in individual and commercial * software must include, in the user documentation and internal * comments to the code, the above Disclaimer and U.S. Government End * Users Notice. */ // #if !defined(__DRIVER_FUNCTIONS_H__) // #define __DRIVER_FUNCTIONS_H__ // #include "builtin_types.h" // #include "host_defines.h" // #include "driver_types.h" /** * \addtogroup CUDART_MEMORY * * \{ */ /** * \brief Returns a cudaPitchedPtr based on input parameters * * Returns a ::cudaPitchedPtr based on the specified input parameters \p d, * \p p, \p xsz, and \p ysz. * * @param d - Pointer to allocated memory * @param p - Pitch of allocated memory in bytes * @param xsz - Logical width of allocation in elements * @param ysz - Logical height of allocation in elements * * @return * ::cudaPitchedPtr specified by \p d, \p p, \p xsz, and \p ysz * * \sa make_cudaExtent, make_cudaPos */ public static native @ByVal cudaPitchedPtr make_cudaPitchedPtr(Pointer d, @Cast("size_t") long p, @Cast("size_t") long xsz, @Cast("size_t") long ysz); /** * \brief Returns a cudaPos based on input parameters * * Returns a ::cudaPos based on the specified input parameters \p x, * \p y, and \p z. * * @param x - X position * @param y - Y position * @param z - Z position * * @return * ::cudaPos specified by \p x, \p y, and \p z * * \sa make_cudaExtent, make_cudaPitchedPtr */ public static native @ByVal cudaPos make_cudaPos(@Cast("size_t") long x, @Cast("size_t") long y, @Cast("size_t") long z); /** * \brief Returns a cudaExtent based on input parameters * * Returns a ::cudaExtent based on the specified input parameters \p w, * \p h, and \p d. 
* * @param w - Width in elements when referring to array memory, in bytes when referring to linear memory * @param h - Height in elements * @param d - Depth in elements * * @return * ::cudaExtent specified by \p w, \p h, and \p d * * \sa make_cudaPitchedPtr, make_cudaPos */ public static native @ByVal cudaExtent make_cudaExtent(@Cast("size_t") long w, @Cast("size_t") long h, @Cast("size_t") long d); /** \} */ /* END CUDART_MEMORY */ // #endif /* !__DRIVER_FUNCTIONS_H__ */ // Parsed from /* * Copyright 1993-2014 NVIDIA Corporation. All rights reserved. * * NOTICE TO LICENSEE: * * This source code and/or documentation ("Licensed Deliverables") are * subject to NVIDIA intellectual property rights under U.S. and * international Copyright laws. * * These Licensed Deliverables contained herein is PROPRIETARY and * CONFIDENTIAL to NVIDIA and is being provided under the terms and * conditions of a form of NVIDIA software license agreement by and * between NVIDIA and Licensee ("License Agreement") or electronically * accepted by Licensee. Notwithstanding any terms or conditions to * the contrary in the License Agreement, reproduction or disclosure * of the Licensed Deliverables to any third party without the express * written consent of NVIDIA is prohibited. * * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE * OF THESE LICENSED DELIVERABLES. * * U.S. Government End Users. These Licensed Deliverables are a * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT * 1995), consisting of "commercial computer software" and "commercial * computer software documentation" as such terms are used in 48 * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government * only as a commercial end item. Consistent with 48 C.F.R.12.212 and * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all * U.S. Government End Users acquire the Licensed Deliverables with * only those rights set forth herein. * * Any use of the Licensed Deliverables in individual and commercial * software must include, in the user documentation and internal * comments to the code, the above Disclaimer and U.S. Government End * Users Notice. 
*/ // #if !defined(__VECTOR_FUNCTIONS_H__) // #define __VECTOR_FUNCTIONS_H__ /******************************************************************************* * * * * * * *******************************************************************************/ // #include "builtin_types.h" // #include "host_defines.h" // #include "vector_types.h" // #if defined(__CUDACC_RTC__) // #define __VECTOR_FUNCTIONS_DECL__ __host__ __device__ // #else /* !__CUDACC_RTC__ */ // #define __VECTOR_FUNCTIONS_DECL__ static __inline__ __host__ __device__ // #endif /* __CUDACC_RTC__ */ // #if defined(__CUDACC_RTC__) /******************************************************************************* * * * * * * *******************************************************************************/ public static native @ByVal char1 make_char1(byte x); public static native @ByVal uchar1 make_uchar1(@Cast("unsigned char") byte x); public static native @ByVal char2 make_char2(byte x, byte y); public static native @ByVal uchar2 make_uchar2(@Cast("unsigned char") byte x, @Cast("unsigned char") byte y); public static native @ByVal char3 make_char3(byte x, byte y, byte z); public static native @ByVal uchar3 make_uchar3(@Cast("unsigned char") byte x, @Cast("unsigned char") byte y, @Cast("unsigned char") byte z); public static native @ByVal char4 make_char4(byte x, byte y, byte z, byte w); public static native @ByVal uchar4 make_uchar4(@Cast("unsigned char") byte x, @Cast("unsigned char") byte y, @Cast("unsigned char") byte z, @Cast("unsigned char") byte w); public static native @ByVal short1 make_short1(short x); public static native @ByVal ushort1 make_ushort1(@Cast("unsigned short") short x); public static native @ByVal short2 make_short2(short x, short y); public static native @ByVal ushort2 make_ushort2(@Cast("unsigned short") short x, @Cast("unsigned short") short y); public static native @ByVal short3 make_short3(short x,short y, short z); public static native @ByVal ushort3 make_ushort3(@Cast("unsigned short") short x, @Cast("unsigned short") short y, @Cast("unsigned short") short z); public static native @ByVal short4 make_short4(short x, short y, short z, short w); public static native @ByVal ushort4 make_ushort4(@Cast("unsigned short") short x, @Cast("unsigned short") short y, @Cast("unsigned short") short z, @Cast("unsigned short") short w); public static native @ByVal int1 make_int1(int x); public static native @ByVal uint1 make_uint1(@Cast("unsigned int") int x); public static native @ByVal int2 make_int2(int x, int y); public static native @ByVal uint2 make_uint2(@Cast("unsigned int") int x, @Cast("unsigned int") int y); public static native @ByVal int3 make_int3(int x, int y, int z); public static native @ByVal uint3 make_uint3(@Cast("unsigned int") int x, @Cast("unsigned int") int y, @Cast("unsigned int") int z); public static native @ByVal int4 make_int4(int x, int y, int z, int w); public static native @ByVal uint4 make_uint4(@Cast("unsigned int") int x, @Cast("unsigned int") int y, @Cast("unsigned int") int z, @Cast("unsigned int") int w); public static native @ByVal long1 make_long1(long x); public static native @ByVal ulong1 make_ulong1(@Cast("unsigned long int") long x); public static native @ByVal long2 make_long2(long x, long y); public static native @ByVal ulong2 make_ulong2(@Cast("unsigned long int") long x, @Cast("unsigned long int") long y); public static native @ByVal long3 make_long3(long x, long y, long z); public static native @ByVal ulong3 make_ulong3(@Cast("unsigned long int") long x, @Cast("unsigned 
long int") long y, @Cast("unsigned long int") long z); public static native @ByVal long4 make_long4(long x, long y, long z, long w); public static native @ByVal ulong4 make_ulong4(@Cast("unsigned long int") long x, @Cast("unsigned long int") long y, @Cast("unsigned long int") long z, @Cast("unsigned long int") long w); public static native @ByVal float1 make_float1(float x); public static native @ByVal float2 make_float2(float x, float y); public static native @ByVal float3 make_float3(float x, float y, float z); public static native @ByVal float4 make_float4(float x, float y, float z, float w); public static native @ByVal longlong1 make_longlong1(long x); public static native @ByVal ulonglong1 make_ulonglong1(@Cast("unsigned long long int") long x); public static native @ByVal longlong2 make_longlong2(long x, long y); public static native @ByVal ulonglong2 make_ulonglong2(@Cast("unsigned long long int") long x, @Cast("unsigned long long int") long y); public static native @ByVal longlong3 make_longlong3(long x, long y, long z); public static native @ByVal ulonglong3 make_ulonglong3(@Cast("unsigned long long int") long x, @Cast("unsigned long long int") long y, @Cast("unsigned long long int") long z); public static native @ByVal longlong4 make_longlong4(long x, long y, long z, long w); public static native @ByVal ulonglong4 make_ulonglong4(@Cast("unsigned long long int") long x, @Cast("unsigned long long int") long y, @Cast("unsigned long long int") long z, @Cast("unsigned long long int") long w); public static native @ByVal double1 make_double1(double x); public static native @ByVal double2 make_double2(double x, double y); public static native @ByVal double3 make_double3(double x, double y, double z); public static native @ByVal double4 make_double4(double x, double y, double z, double w); // #endif /* __CUDACC_RTC__ */ // #undef __VECTOR_FUNCTIONS_DECL__ // #if !defined(__CUDACC_RTC__) // #include "vector_functions.hpp" // #endif /* !__CUDACC_RTC__ */ // #endif /* !__VECTOR_FUNCTIONS_H__ */ // Parsed from /* * Copyright 1993-2012 NVIDIA Corporation. All rights reserved. * * NOTICE TO LICENSEE: * * This source code and/or documentation ("Licensed Deliverables") are * subject to NVIDIA intellectual property rights under U.S. and * international Copyright laws. * * These Licensed Deliverables contained herein is PROPRIETARY and * CONFIDENTIAL to NVIDIA and is being provided under the terms and * conditions of a form of NVIDIA software license agreement by and * between NVIDIA and Licensee ("License Agreement") or electronically * accepted by Licensee. Notwithstanding any terms or conditions to * the contrary in the License Agreement, reproduction or disclosure * of the Licensed Deliverables to any third party without the express * written consent of NVIDIA is prohibited. * * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. 
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE * OF THESE LICENSED DELIVERABLES. * * U.S. Government End Users. These Licensed Deliverables are a * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT * 1995), consisting of "commercial computer software" and "commercial * computer software documentation" as such terms are used in 48 * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government * only as a commercial end item. Consistent with 48 C.F.R.12.212 and * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all * U.S. Government End Users acquire the Licensed Deliverables with * only those rights set forth herein. * * Any use of the Licensed Deliverables in individual and commercial * software must include, in the user documentation and internal * comments to the code, the above Disclaimer and U.S. Government End * Users Notice. */ // #if !defined(CU_COMPLEX_H_) // #define CU_COMPLEX_H_ /* When trying to include C header file in C++ Code extern "C" is required * But the Standard QNX headers already have ifdef extern in them when compiling C++ Code * extern "C" cannot be nested * Hence keep the header out of extern "C" block */ // #include /* import fabsf, sqrt */ // #if defined(__cplusplus) // #endif /* __cplusplus */ // #include "vector_types.h" public static native float cuCrealf(@ByVal @Cast("cuFloatComplex*") float2 x); public static native float cuCimagf(@ByVal @Cast("cuFloatComplex*") float2 x); public static native @ByVal @Cast("cuFloatComplex*") float2 make_cuFloatComplex(float r, float i); public static native @ByVal @Cast("cuFloatComplex*") float2 cuConjf(@ByVal @Cast("cuFloatComplex*") float2 x); public static native @ByVal @Cast("cuFloatComplex*") float2 cuCaddf(@ByVal @Cast("cuFloatComplex*") float2 x, @ByVal @Cast("cuFloatComplex*") float2 y); public static native @ByVal @Cast("cuFloatComplex*") float2 cuCsubf(@ByVal @Cast("cuFloatComplex*") float2 x, @ByVal @Cast("cuFloatComplex*") float2 y); /* This implementation could suffer from intermediate overflow even though * the final result would be in range. However, various implementations do * not guard against this (presumably to avoid losing performance), so we * don't do it either to stay competitive. */ public static native @ByVal @Cast("cuFloatComplex*") float2 cuCmulf(@ByVal @Cast("cuFloatComplex*") float2 x, @ByVal @Cast("cuFloatComplex*") float2 y); /* This implementation guards against intermediate underflow and overflow * by scaling. Such guarded implementations are usually the default for * complex library implementations, with some also offering an unguarded, * faster version. */ public static native @ByVal @Cast("cuFloatComplex*") float2 cuCdivf(@ByVal @Cast("cuFloatComplex*") float2 x, @ByVal @Cast("cuFloatComplex*") float2 y); /* * We would like to call hypotf(), but it's not available on all platforms. * This discrete implementation guards against intermediate underflow and * overflow by scaling. Otherwise we would lose half the exponent range. * There are various ways of doing guarded computation. For now chose the * simplest and fastest solution, however this may suffer from inaccuracies * if sqrt and division are not IEEE compliant. 
*/ public static native float cuCabsf(@ByVal @Cast("cuFloatComplex*") float2 x); /* Double precision */ public static native double cuCreal(@ByVal @Cast("cuDoubleComplex*") double2 x); public static native double cuCimag(@ByVal @Cast("cuDoubleComplex*") double2 x); public static native @ByVal @Cast("cuDoubleComplex*") double2 make_cuDoubleComplex(double r, double i); public static native @ByVal @Cast("cuDoubleComplex*") double2 cuConj(@ByVal @Cast("cuDoubleComplex*") double2 x); public static native @ByVal @Cast("cuDoubleComplex*") double2 cuCadd(@ByVal @Cast("cuDoubleComplex*") double2 x, @ByVal @Cast("cuDoubleComplex*") double2 y); public static native @ByVal @Cast("cuDoubleComplex*") double2 cuCsub(@ByVal @Cast("cuDoubleComplex*") double2 x, @ByVal @Cast("cuDoubleComplex*") double2 y); /* This implementation could suffer from intermediate overflow even though * the final result would be in range. However, various implementations do * not guard against this (presumably to avoid losing performance), so we * don't do it either to stay competitive. */ public static native @ByVal @Cast("cuDoubleComplex*") double2 cuCmul(@ByVal @Cast("cuDoubleComplex*") double2 x, @ByVal @Cast("cuDoubleComplex*") double2 y); /* This implementation guards against intermediate underflow and overflow * by scaling. Such guarded implementations are usually the default for * complex library implementations, with some also offering an unguarded, * faster version. */ public static native @ByVal @Cast("cuDoubleComplex*") double2 cuCdiv(@ByVal @Cast("cuDoubleComplex*") double2 x, @ByVal @Cast("cuDoubleComplex*") double2 y); /* This implementation guards against intermediate underflow and overflow * by scaling. Otherwise we would lose half the exponent range. There are * various ways of doing guarded computation. For now chose the simplest * and fastest solution, however this may suffer from inaccuracies if sqrt * and division are not IEEE compliant. */ public static native double cuCabs(@ByVal @Cast("cuDoubleComplex*") double2 x); // #if defined(__cplusplus) // #endif /* __cplusplus */ /* aliases */ public static native @ByVal @Cast("cuComplex*") float2 make_cuComplex(float x, float y); /* float-to-double promotion */ public static native @ByVal @Cast("cuDoubleComplex*") double2 cuComplexFloatToDouble(@ByVal @Cast("cuFloatComplex*") float2 c); public static native @ByVal @Cast("cuFloatComplex*") float2 cuComplexDoubleToFloat(@ByVal @Cast("cuDoubleComplex*") double2 c); public static native @ByVal @Cast("cuComplex*") float2 cuCfmaf( @ByVal @Cast("cuComplex*") float2 x, @ByVal @Cast("cuComplex*") float2 y, @ByVal @Cast("cuComplex*") float2 d); public static native @ByVal @Cast("cuDoubleComplex*") double2 cuCfma( @ByVal @Cast("cuDoubleComplex*") double2 x, @ByVal @Cast("cuDoubleComplex*") double2 y, @ByVal @Cast("cuDoubleComplex*") double2 d); // #endif /* !defined(CU_COMPLEX_H_) */ // Parsed from /* * Copyright 1993-2014 NVIDIA Corporation. All rights reserved. * * NOTICE TO LICENSEE: * * This source code and/or documentation ("Licensed Deliverables") are * subject to NVIDIA intellectual property rights under U.S. and * international Copyright laws. * * These Licensed Deliverables contained herein is PROPRIETARY and * CONFIDENTIAL to NVIDIA and is being provided under the terms and * conditions of a form of NVIDIA software license agreement by and * between NVIDIA and Licensee ("License Agreement") or electronically * accepted by Licensee. 
Notwithstanding any terms or conditions to * the contrary in the License Agreement, reproduction or disclosure * of the Licensed Deliverables to any third party without the express * written consent of NVIDIA is prohibited. * * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE * OF THESE LICENSED DELIVERABLES. * * U.S. Government End Users. These Licensed Deliverables are a * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT * 1995), consisting of "commercial computer software" and "commercial * computer software documentation" as such terms are used in 48 * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government * only as a commercial end item. Consistent with 48 C.F.R.12.212 and * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all * U.S. Government End Users acquire the Licensed Deliverables with * only those rights set forth herein. * * Any use of the Licensed Deliverables in individual and commercial * software must include, in the user documentation and internal * comments to the code, the above Disclaimer and U.S. Government End * Users Notice. */ /** * \defgroup CUDA_MATH_INTRINSIC_HALF Half Precision Intrinsics * This section describes half precision intrinsic functions that are * only supported in device code. */ /** * \defgroup CUDA_MATH__HALF_ARITHMETIC Half Arithmetic Functions * \ingroup CUDA_MATH_INTRINSIC_HALF */ /** * \defgroup CUDA_MATH__HALF2_ARITHMETIC Half2 Arithmetic Functions * \ingroup CUDA_MATH_INTRINSIC_HALF */ /** * \defgroup CUDA_MATH__HALF_COMPARISON Half Comparison Functions * \ingroup CUDA_MATH_INTRINSIC_HALF */ /** * \defgroup CUDA_MATH__HALF2_COMPARISON Half2 Comparison Functions * \ingroup CUDA_MATH_INTRINSIC_HALF */ /** * \defgroup CUDA_MATH__HALF_MISC Half Precision Conversion And Data Movement * \ingroup CUDA_MATH_INTRINSIC_HALF */ /** * \defgroup CUDA_MATH__HALF_FUNCTIONS Half Math Functions * \ingroup CUDA_MATH_INTRINSIC_HALF */ /** * \defgroup CUDA_MATH__HALF2_FUNCTIONS Half2 Math Functions * \ingroup CUDA_MATH_INTRINSIC_HALF */ // #ifndef CUDA_FP16_H_JNESTUG4 // #define CUDA_FP16_H_JNESTUG4 public static class __half extends Pointer { static { Loader.load(); } /** Default native constructor. */ public __half() { super((Pointer)null); allocate(); } /** Native array allocator. Access with {@link Pointer#position(long)}. */ public __half(long size) { super((Pointer)null); allocateArray(size); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. 
*/ public __half(Pointer p) { super(p); } private native void allocate(); private native void allocateArray(long size); @Override public __half position(long position) { return (__half)super.position(position); } public native @Cast("unsigned short") short x(); public native __half x(short x); } public static class __half2 extends Pointer { static { Loader.load(); } /** Default native constructor. */ public __half2() { super((Pointer)null); allocate(); } /** Native array allocator. Access with {@link Pointer#position(long)}. */ public __half2(long size) { super((Pointer)null); allocateArray(size); } /** Pointer cast constructor. Invokes {@link Pointer#Pointer(Pointer)}. */ public __half2(Pointer p) { super(p); } private native void allocate(); private native void allocateArray(long size); @Override public __half2 position(long position) { return (__half2)super.position(position); } public native @Cast("unsigned int") int x(); public native __half2 x(int x); } // #ifndef CUDA_NO_HALF // #endif /*CUDA_NO_HALF*/ // #if defined(__CUDACC__) // #endif /*defined(__CUDACC__)*/ // #endif /* end of include guard: CUDA_FP16_H_JNESTUG4 */ // Parsed from /* * Copyright 1993-2015 NVIDIA Corporation. All rights reserved. * * NOTICE TO LICENSEE: * * This source code and/or documentation ("Licensed Deliverables") are * subject to NVIDIA intellectual property rights under U.S. and * international Copyright laws. * * These Licensed Deliverables contained herein is PROPRIETARY and * CONFIDENTIAL to NVIDIA and is being provided under the terms and * conditions of a form of NVIDIA software license agreement by and * between NVIDIA and Licensee ("License Agreement") or electronically * accepted by Licensee. Notwithstanding any terms or conditions to * the contrary in the License Agreement, reproduction or disclosure * of the Licensed Deliverables to any third party without the express * written consent of NVIDIA is prohibited. * * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE * OF THESE LICENSED DELIVERABLES. * * U.S. Government End Users. These Licensed Deliverables are a * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT * 1995), consisting of "commercial computer software" and "commercial * computer software documentation" as such terms are used in 48 * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government * only as a commercial end item. Consistent with 48 C.F.R.12.212 and * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all * U.S. Government End Users acquire the Licensed Deliverables with * only those rights set forth herein. 
* * Any use of the Licensed Deliverables in individual and commercial * software must include, in the user documentation and internal * comments to the code, the above Disclaimer and U.S. Government End * Users Notice. */ // #if !defined(__LIBRARY_TYPES_H__) // #define __LIBRARY_TYPES_H__ /** enum cudaDataType_t */ public static final int CUDA_R_16F = 2, /* real as a half */ CUDA_C_16F = 6, /* complex as a pair of half numbers */ CUDA_R_32F = 0, /* real as a float */ CUDA_C_32F = 4, /* complex as a pair of float numbers */ CUDA_R_64F = 1, /* real as a double */ CUDA_C_64F = 5, /* complex as a pair of double numbers */ CUDA_R_8I = 3, /* real as a signed char */ CUDA_C_8I = 7, /* complex as a pair of signed char numbers */ CUDA_R_8U = 8, /* real as a unsigned char */ CUDA_C_8U = 9, /* complex as a pair of unsigned char numbers */ CUDA_R_32I = 10, /* real as a signed int */ CUDA_C_32I = 11, /* complex as a pair of signed int numbers */ CUDA_R_32U = 12, /* real as a unsigned int */ CUDA_C_32U = 13; /* complex as a pair of unsigned int numbers */ /** enum libraryPropertyType_t */ public static final int MAJOR_VERSION = 0, MINOR_VERSION = 1, PATCH_LEVEL = 2; // #endif /* !__LIBRARY_TYPES_H__ */ }
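// Illustrative usage sketch, not part of the generated bindings above: it only demonstrates the
// Java-side calling convention of the texture-object API documented in this file
// (cudaCreateTextureObject / cudaDestroyTextureObject). The class name is hypothetical, and the
// no-argument constructors of cudaResourceDesc and cudaTextureDesc are assumed to allocate
// zero-initialized structs as JavaCPP normally generates them; in real code the descriptors must
// be populated from an actual device allocation before the call, otherwise an error is returned.
class TextureObjectExample {
    public static void main(String[] args) {
        cudaResourceDesc resDesc = new cudaResourceDesc();  // describes the data to texture from
        cudaTextureDesc  texDesc = new cudaTextureDesc();   // describes how the data is sampled
        LongPointer texObject = new LongPointer(1);         // receives the cudaTextureObject_t handle

        // pResViewDesc is optional, so null is passed here; 0 corresponds to ::cudaSuccess.
        int status = cuda.cudaCreateTextureObject(texObject, resDesc, texDesc, null);
        if (status == 0) {
            // ... launch kernels that sample through texObject.get(0) ...
            cuda.cudaDestroyTextureObject(texObject.get(0));
        } else {
            System.out.println("cudaCreateTextureObject returned error " + status);
        }
    }
}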

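// Illustrative usage sketch, not part of the generated bindings: querying the installed CUDA
// driver and runtime versions through the cudaDriverGetVersion and cudaRuntimeGetVersion
// overloads declared above. The class name is hypothetical; it assumes the native CUDA runtime
// can be loaded on the machine, and compares against 0, the value of ::cudaSuccess.
class CudaVersionExample {
    public static void main(String[] args) {
        int[] driverVersion = new int[1];   // e.g. 8000 for CUDA 8.0
        int[] runtimeVersion = new int[1];
        if (cuda.cudaDriverGetVersion(driverVersion) == 0
                && cuda.cudaRuntimeGetVersion(runtimeVersion) == 0) {
            System.out.println("CUDA driver " + driverVersion[0]
                    + ", runtime " + runtimeVersion[0]);
        } else {
            System.out.println("No usable CUDA driver/runtime found");
        }
    }
}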

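// Illustrative usage sketch, not part of the generated bindings: building the small helper
// structs documented above with make_cudaExtent and make_cudaPitchedPtr. The class name is
// hypothetical, and the per-field accessors (width(), height(), depth(), pitch()) are assumed to
// follow the usual JavaCPP naming for mapped struct fields, as the __half class in this file does.
class DriverFunctionsExample {
    public static void main(String[] args) {
        // A 640 x 480 x 1 extent; width is in elements for array memory, bytes for linear memory.
        cudaExtent extent = cuda.make_cudaExtent(640, 480, 1);

        // A pitched pointer description; normally the base pointer and pitch would come from
        // cudaMallocPitch or cudaMalloc3D rather than being filled in by hand.
        cudaPitchedPtr pitched = cuda.make_cudaPitchedPtr(null, 640 * 4, 640, 480);

        System.out.println("extent: " + extent.width() + " x " + extent.height()
                + " x " + extent.depth());
        System.out.println("pitch in bytes: " + pitched.pitch());
    }
}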

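// Illustrative usage sketch, not part of the generated bindings: host-side complex arithmetic
// with the cuComplex helpers mapped above. cuFloatComplex is cast to the float2 vector type, so
// the Java-side values are float2 instances. The class name is hypothetical.
class CuComplexExample {
    public static void main(String[] args) {
        float2 a = cuda.make_cuFloatComplex(1.0f, 2.0f);    // 1 + 2i
        float2 b = cuda.make_cuFloatComplex(3.0f, -1.0f);   // 3 - 1i

        float2 sum  = cuda.cuCaddf(a, b);                   // (1+3) + (2-1)i = 4 + 1i
        float2 prod = cuda.cuCmulf(a, b);                   // (1+2i)(3-1i)   = 5 + 5i

        System.out.println("sum  = " + cuda.cuCrealf(sum)  + " + " + cuda.cuCimagf(sum)  + "i");
        System.out.println("prod = " + cuda.cuCrealf(prod) + " + " + cuda.cuCimagf(prod) + "i");
        System.out.println("|a|  = " + cuda.cuCabsf(a));    // sqrt(1*1 + 2*2)
    }
}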

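// Illustrative usage sketch, not part of the generated bindings: the __half type mapped above
// exposes only its raw 16-bit storage (the x field), so host code works with IEEE 754
// half-precision bit patterns directly. 0x3C00 is the half-precision encoding of 1.0.
// The class name is hypothetical.
class HalfBitsExample {
    public static void main(String[] args) {
        __half one = new __half();
        one.x((short) 0x3C00);                               // store the bit pattern of 1.0
        System.out.printf("raw half bits: 0x%04X%n", one.x() & 0xFFFF);
    }
}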