jcuda.jcufft.JCufft Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of nd4j-jcublas-common Show documentation
There is a newer version: 0.4-rc3.7
/*
 *
 *  * Copyright 2015 Skymind,Inc.
 *  *
 *  *    Licensed under the Apache License, Version 2.0 (the "License");
 *  *    you may not use this file except in compliance with the License.
 *  *    You may obtain a copy of the License at
 *  *
 *  *        http://www.apache.org/licenses/LICENSE-2.0
 *  *
 *  *    Unless required by applicable law or agreed to in writing, software
 *  *    distributed under the License is distributed on an "AS IS" BASIS,
 *  *    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  *    See the License for the specific language governing permissions and
 *  *    limitations under the License.
 *
 *
 */

package jcuda.jcufft;

import jcuda.*;
import jcuda.runtime.JCuda;
import jcuda.runtime.cudaError;
import jcuda.runtime.cudaMemcpyKind;
import jcuda.runtime.cudaStream_t;

/**
 * Java bindings for CUFFT, the NVIDIA CUDA FFT library.

 * 

 * Most comments are taken from the CUFFT library documentation

 * 

 */
public class JCufft
{
    /**
     * CUFFT transform direction
     */
    public static final int CUFFT_FORWARD = -1;

    /**
     * CUFFT transform direction
     */
    public static final int CUFFT_INVERSE = 1;


    /**
     * The flag that indicates whether the native library has been
     * loaded
     */
    private static boolean initialized = false;

    /**
     * Whether a CudaException should be thrown if a method is about
     * to return a result code that is not cufftResult.CUFFT_SUCCESS
     */
    private static boolean exceptionsEnabled = false;
    
    /* Private constructor to prevent instantiation */
    private JCufft()
    {
    }

    // Initialize the native library.
    static
    {
        initialize();
    }
    
    /**
     * Initializes the native library. Note that this method
     * does not have to be called explicitly by the user of
     * the library: The library will automatically be 
     * loaded when this class is loaded.
     */
    public static void initialize()
    {
        if (!initialized)
        {
            LibUtils.loadLibrary("JCufft");
            initialized = true;
        }
    }

    
    /**
     * Set the specified log level for the JCufft library.

     * 

     * Currently supported log levels:
     * 

     * LOG_QUIET: Never print anything 

     * LOG_ERROR: Print error messages 

     * LOG_TRACE: Print a trace of all native function calls 

     *
     * @param logLevel The log level to use.
     */
    public static void setLogLevel(LogLevel logLevel)
    {
        setLogLevel(logLevel.ordinal());
    }

    private static native void setLogLevel(int logLevel);


    /**
     * Enables or disables exceptions. By default, the methods of this class
     * only return the cufftResult error code from the underlying CUDA function.
     * If exceptions are enabled, a CudaException with a detailed error 
     * message will be thrown if a method is about to return a result code 
     * that is not cufftResult.CUFFT_SUCCESS
     * 
     * @param enabled Whether exceptions are enabled
     */
    public static void setExceptionsEnabled(boolean enabled)
    {
        exceptionsEnabled = enabled;
    }

    /**
     * If the given result is different to cufftResult.CUFFT_SUCCESS and
     * exceptions have been enabled, this method will throw a 
     * CudaException with an error message that corresponds to the
     * given result code. Otherwise, the given result is simply
     * returned.
     * 
     * @param result The result to check
     * @return The result that was given as the parameter
     * @throws CudaException If exceptions have been enabled and
     * the given result code is not cufftResult.CUFFT_SUCCESS
     */
    private static int checkResult(int result)
    {
        if (exceptionsEnabled && result != cufftResult.CUFFT_SUCCESS)
        {
            throw new CudaException(cufftResult.stringFor(result));
        }
        return result;
    }
    
    
    /**
     * Writes the CUFFT version into the given argument.
     * 
     * @param version The version
     * @return The cufftResult code.
     */
    public static int cufftGetVersion(int version[])
    {
        return cufftGetVersionNative(version);
    }
    
    private static native int cufftGetVersionNative(int version[]);

    /**
     *      * Creates a 1D FFT plan configuration for a specified signal size and data
     * type.
     *
     * cufftResult cufftPlan1d( cufftHandle *plan, int nx, cufftType type, int batch );
     *
     * The batch input parameter tells CUFFT how many 1D transforms to configure.
     *
     * Input
     * ----
     * plan  Pointer to a cufftHandle object
     * nx    The transform size (e.g., 256 for a 256-point FFT)
     * type  The transform data type (e.g., CUFFT_C2C for complex to complex)
     * batch Number of transforms of size nx
     *
     * Output
     * ----
     * plan     Contains a CUFFT 1D plan handle value
     *
     * Return Values
     * ----
     * CUFFT_SETUP_FAILED CUFFT library failed to initialize.
     * CUFFT_INVALID_SIZE The nx parameter is not a supported size.
     * CUFFT_INVALID_TYPE The type parameter is not supported.
     * CUFFT_ALLOC_FAILED Allocation of GPU resources for the plan failed.
     * CUFFT_SUCCESS      CUFFT successfully created the FFT plan.
     *
     * JCUFFT_INTERNAL_ERROR If an internal JCufft error occurred
     *      * NOTE: Batch sizes other than 1 for cufftPlan1d() have been 
     * deprecated as of CUDA 6.0RC. Use cufftPlanMany() for 
     * multiple batch execution.
     */
    public static int cufftPlan1d(cufftHandle plan, int nx, int type, int batch)
    {
        plan.setDimension(1);
        plan.setType(type);
        plan.setSize(nx, 0, 0);
        plan.setBatchSize(batch);
        return checkResult(cufftPlan1dNative(plan, nx, type, batch));
    }
    private static native int cufftPlan1dNative(cufftHandle plan, int nx, int type, int batch);


    /**
     *      * Creates a 2D FFT plan configuration according to specified signal sizes
     * and data type.
     *
     * cufftResult cufftPlan2d( cufftHandle *plan, int nx, int ny, cufftType type );
     *
     * This function is the same as cufftPlan1d() except that
     * it takes a second size parameter, ny, and does not support batching.
     *
     * Input
     * ----
     * plan Pointer to a cufftHandle object
     * nx   The transform size in the X dimension (number of rows)
     * ny   The transform size in the Y dimension (number of columns)
     * type The transform data type (e.g., CUFFT_C2R for complex to real)
     *
     * Output
     * ----
     * plan Contains a CUFFT 2D plan handle value
     *
     * Return Values
     * ----
     * CUFFT_SETUP_FAILED CUFFT library failed to initialize.
     * CUFFT_INVALID_SIZE The nx or ny parameter is not a supported size.
     * CUFFT_INVALID_TYPE The type parameter is not supported.
     * CUFFT_ALLOC_FAILED Allocation of GPU resources for the plan failed.
     * CUFFT_SUCCESS      CUFFT successfully created the FFT plan.
     *
     * JCUFFT_INTERNAL_ERROR If an internal JCufft error occurred
     *      */
    public static int cufftPlan2d(cufftHandle plan, int nx, int ny, int type)
    {
        plan.setDimension(2);
        plan.setType(type);
        plan.setSize(nx, ny, 0);
        return checkResult(cufftPlan2dNative(plan, nx, ny, type));
    }
    private static native int cufftPlan2dNative(cufftHandle plan, int nx, int ny, int type);

    /**
     *      * Creates a 3D FFT plan configuration according to specified signal sizes
     * and data type.
     *
     * cufftResult cufftPlan3d( cufftHandle *plan, int nx, int ny, int nz, cufftType type );
     *
     * This function is the same as cufftPlan2d() except that
     * it takes a third size parameter nz.
     *
     * Input
     * ----
     * plan Pointer to a cufftHandle object
     * nx The transform size in the X dimension
     * ny The transform size in the Y dimension
     * nz The transform size in the Z dimension
     * type The transform data type (e.g., CUFFT_R2C for real to complex)
     *
     * Output
     * ----
     * plan Contains a CUFFT 3D plan handle value
     *
     * Return Values
     * ----
     * CUFFT_SETUP_FAILED CUFFT library failed to initialize.
     * CUFFT_INVALID_SIZE Parameter nx, ny, or nz is not a supported size.
     * CUFFT_INVALID_TYPE The type parameter is not supported.
     * CUFFT_ALLOC_FAILED Allocation of GPU resources for the plan failed.
     * CUFFT_SUCCESS      CUFFT successfully created the FFT plan.
     *
     * JCUFFT_INTERNAL_ERROR If an internal JCufft error occurred
     *      */
    public static int cufftPlan3d(cufftHandle plan, int nx, int ny, int nz, int type)
    {
        plan.setDimension(3);
        plan.setType(type);
        plan.setSize(nx, ny, nz);
        return checkResult(cufftPlan3dNative(plan, nx, ny, nz, type));
    }

    private static native int cufftPlan3dNative(cufftHandle plan, int nx, int ny, int nz, int type);

    
    /**
     *      * Creates a FFT plan configuration of dimension rank, with sizes
     * specified in the array n.
     * 
     * cufftResult cufftPlanMany(cufftHandle *plan, int rank, int *n,
     *     int *inembed, int istride, int idist,
     *     int *onembed, int ostride, int odist,
     *     cufftType type, int batch );
     * 
     * The batch input parameter tells CUFFT how many transforms to 
     * configure in parallel. With this function, batched plans of 
     * any dimension may be created. Input parameters inembed, istride, 
     * and idist and output parameters onembed, ostride, and odist 
     * will allow setup of noncontiguous input data in a future version. 
     * Note that for CUFFT 3.0, these parameters are ignored and the 
     * layout of batched data must be side-by-side and not interleaved.
     *  
     * Input
     * ----
     * plan Pointer to a cufftHandle object
     * rank Dimensionality of the transform (1, 2, or 3)
     * n An array of size rank, describing the size of each dimension
     * inembed Unused: pass NULL
     * istride Unused: pass 1
     * idist Unused: pass 0
     * onembed Unused: pass NULL
     * ostride Unused: pass 1
     * odist Unused: pass 0
     * type Transform data type (e.g., CUFFT_C2C, as per other CUFFT calls)
     * batch Batch size for this transform
     *
     * Output
     * ----
     * plan Contains a CUFFT plan handle
     *
     * Return Values
     * ----
     * CUFFT_SETUP_FAILED CUFFT library failed to initialize.
     * CUFFT_INVALID_SIZE Parameter is not a supported size.
     * CUFFT_INVALID_TYPE The type parameter is not supported     
     *  
     */
    public static int cufftPlanMany(cufftHandle plan, int rank, int n[],
        int inembed[], int istride, int idist,
        int onembed[], int ostride, int odist,
        int type, int batch)
    {
        return checkResult(cufftPlanManyNative(plan, rank, n, inembed, istride, idist, onembed, ostride, odist, type, batch));
    }
    
    private static native int cufftPlanManyNative(cufftHandle plan, int rank, int n[],
        int inembed[], int istride, int idist,
        int onembed[], int ostride, int odist,
        int type, int batch);
    
    
    
    
    
    public static int cufftMakePlan1d(
        cufftHandle plan, int nx, int type, 
        int batch, /* deprecated - use cufftPlanMany */
        long workSize[])
    {
        return checkResult(cufftMakePlan1dNative(plan, nx, type, batch, workSize));
    }
    private static native int cufftMakePlan1dNative(
        cufftHandle plan, int nx, int type, 
        int batch, /* deprecated - use cufftPlanMany */
        long workSize[]);

    
    
    public static int cufftMakePlan2d(
        cufftHandle plan, int nx, int ny, int type,
        long workSize[])
    {
        return checkResult(cufftMakePlan2dNative(plan, nx, ny, type, workSize));
    }
    private static native int cufftMakePlan2dNative(
        cufftHandle plan, int nx, int ny, int type,
        long workSize[]);

    
    
    public static int cufftMakePlan3d(
        cufftHandle plan, int nx, int ny, int nz, int type,
        long workSize[])
    {
        return checkResult(cufftMakePlan3dNative(plan, nx, ny, nz, type, workSize));
    }
    private static native int cufftMakePlan3dNative(
        cufftHandle plan, int nx, int ny, int nz, int type,
        long workSize[]);

    
    
    public static int cufftMakePlanMany(
        cufftHandle plan, int rank, int n[],
        int inembed[], int istride, int idist,
        int onembed[], int ostride, int odist,
        int type, int batch, long workSize[])
    {
        return checkResult(cufftMakePlanManyNative(
            plan, rank, n, 
            inembed, istride, idist, 
            onembed, ostride, odist, 
            type, batch, workSize));
    }
    private static native int cufftMakePlanManyNative(
        cufftHandle plan, int rank, int n[],
        int inembed[], int istride, int idist,
        int onembed[], int ostride, int odist,
        int type, int batch, long workSize[]);
    
    
    
    public static int cufftEstimate1d(int nx, 
        int type, 
            int batch, /* deprecated - use cufftPlanMany */
            long workSize[])
    {
        return checkResult(cufftEstimate1dNative(nx, type, batch, workSize));
    }
    private static native int cufftEstimate1dNative(
        int nx, int type, 
        int batch, /* deprecated - use cufftPlanMany */
        long workSize[]);

    
    
    public static int cufftEstimate2d(
        int nx, int ny, int type,
        long workSize[])
    {
        return checkResult(cufftEstimate2dNative(nx, ny, type, workSize));
    }
    private static native int cufftEstimate2dNative(
        int nx, int ny, int type,
        long workSize[]);

    
    
    public static int cufftEstimate3d(
        int nx, int ny, int nz, int type,
        long workSize[])
    {
        return checkResult(cufftEstimate3dNative(nx, ny, nz, type, workSize));
    }
    private static native int cufftEstimate3dNative(
        int nx, int ny, int nz, int type,
        long workSize[]);   

    
    
    public static int cufftEstimateMany(
        int rank, int n[],
        int inembed[], int istride, int idist,
        int onembed[], int ostride, int odist,
        int type,
        int batch,
        long workSize[])
    {
        return checkResult(cufftEstimateManyNative(rank, n, 
            inembed, istride, idist, 
            onembed, ostride, odist, type, batch, workSize));
    }
    private static native int cufftEstimateManyNative(
        int rank, int n[],
        int inembed[], int istride, int idist,
        int onembed[], int ostride, int odist,
        int type,
        int batch,
        long workSize[]);
    
    
            
    public static int cufftCreate(cufftHandle cufftHandle)
    {
        return checkResult(cufftCreateNative(cufftHandle));
    }
    private static native int cufftCreateNative(cufftHandle cufftHandle);

    
    public static int cufftGetSize1d(cufftHandle handle, 
        int nx, 
        int type, 
        int batch, 
        long workSize[])
    {
        return checkResult(cufftGetSize1dNative(handle, nx, type, batch, workSize));
    }
    private static native int cufftGetSize1dNative(cufftHandle handle, 
        int nx, 
        int type, 
        int batch, 
        long workSize[]);
    
    
    public static int cufftGetSize2d(cufftHandle handle, 
        int nx, int ny,
        int type,
        long workSize[])
    {
        return checkResult(cufftGetSize2dNative(handle, nx, ny, type, workSize));
    }
    private static native int cufftGetSize2dNative(cufftHandle handle, 
        int nx, int ny,
        int type,
        long workSize[]);

    
    
    public static int cufftGetSize3d(cufftHandle handle,
        int nx, int ny, int nz, 
        int type,
        long workSize[])
    {
        return checkResult(cufftGetSize3dNative(handle, nx, ny, nz, type, workSize));
    }
    private static native int cufftGetSize3dNative(cufftHandle handle,
        int nx, int ny, int nz, 
        int type,
        long workSize[]);

    
    
    public static int cufftGetSizeMany(cufftHandle handle, 
        int rank, int n[],
        int inembed[], int istride, int idist,
        int onembed[], int ostride, int odist,
        int type, int batch, long workArea[])
    {
        return checkResult(cufftGetSizeManyNative(handle, rank, n, 
            inembed, istride, idist, 
            onembed, ostride, odist, type, batch, workArea));
    }
    private static native int cufftGetSizeManyNative(cufftHandle handle, 
        int rank, int n[],
        int inembed[], int istride, int idist,
        int onembed[], int ostride, int odist,
        int type, int batch, long workArea[]);

    
    public static int cufftGetSize(cufftHandle handle, long workSize[])
    {
        return checkResult(cufftGetSizeNative(handle, workSize));
    }
    private static native int cufftGetSizeNative(cufftHandle handle, long workSize[]);
    
    public static int cufftSetWorkArea(cufftHandle plan, Pointer workArea)
    {
        return checkResult(cufftSetWorkAreaNative(plan, workArea));
    }
    private static native int cufftSetWorkAreaNative(cufftHandle plan, Pointer workArea);

    public static int cufftSetAutoAllocation(cufftHandle plan, int autoAllocate)
    {
        return checkResult(cufftSetAutoAllocationNative(plan, autoAllocate));
    }
    private static native int cufftSetAutoAllocationNative(cufftHandle plan, int autoAllocate);
    
    
    
    
    /**
     *      * Frees all GPU resources associated with a CUFFT plan and destroys the
     * internal plan data structure.
     *
     * cufftResult cufftDestroy( cufftHandle plan );
     *
     * This function should be called once a plan
     * is no longer needed to avoid wasting GPU memory.
     *
     * Input
     * ----
     * plan The cufftHandle object of the plan to be destroyed.
     *
     * Return Values
     * ----
     * CUFFT_SETUP_FAILED    CUFFT library failed to initialize.
     * CUFFT_SHUTDOWN_FAILED CUFFT library failed to shut down.
     * CUFFT_INVALID_PLAN    The plan parameter is not a valid handle.
     * CUFFT_SUCCESS         CUFFT successfully destroyed the FFT plan.
     *
     * JCUFFT_INTERNAL_ERROR If an internal JCufft error occurred
     *      */
    public static int cufftDestroy(cufftHandle plan)
    {
        return checkResult(cufftDestroyNative(plan));
    }

    private static native int cufftDestroyNative(cufftHandle plan);

    /**
     *      * Associates a CUDA stream with a CUFFT plan.
     * 
     * cufftResult cufftSetStream( cufftHandle plan, cudaStream_t stream );
     *
     * All kernel launches made during plan execution are now done through 
     * the associated stream, enabling overlap with activity in other 
     * streams (for example, data copying). The association remains until 
     * the plan is destroyed or the stream is changed with another call 
     * to cufftSetStream(). 
     *
     * Input
     * plan The cufftHandle object to associate with the stream
     * stream A valid CUDA stream created with cudaStreamCreate() (or 0
     * for the default stream)
     * 
     * Return Values
     * CUFFT_INVALID_PLAN The plan parameter is not a valid handle.
     * CUFFT_SUCCESS The stream was successfully associated with the plan.
     * 
     */
    public static int cufftSetStream(cufftHandle plan, cudaStream_t stream)
    {
        return checkResult(cufftSetStreamNative(plan, stream));
    }
    
    private static native int cufftSetStreamNative(cufftHandle plan, cudaStream_t stream);
    
    
    /**
     *      * Configures the layout of CUFFT output in FFTW-compatible modes.
     * 
     * When FFTW compatibility is desired, it can be configured for padding
     * only, for asymmetric complex inputs only, or to be fully compatible. 
     *
     * Input
     * plan The cufftHandle object to associate with the stream
     * mode The cufftCompatibility option to be used:
     *     CUFFT_COMPATIBILITY_NATIVE:
     *     Disable any FFTW compatibility mode.
     *     CUFFT_COMPATIBILITY_FFTW_PADDING:
     *     Support FFTW data padding. (Default)
     *     CUFFT_COMPATIBILITY_FFTW_ASYMMETRIC:
     *     Waive the C2R symmetry requirement.
     *     Should be used with asymmetric input.
     *     CUFFT_COMPATIBILITY_FFTW_ALL:
     *     Enable full FFTW compatibility.
     *     
     * Return Values
     * CUFFT_SETUP_FAILED CUFFT library failed to initialize.
     * CUFFT_INVALID_PLAN The plan parameter is not a valid handle.
     * CUFFT_SUCCESS CUFFT successfully executed the FFT plan.
     *  
     */
    public static int cufftSetCompatibilityMode(cufftHandle plan, int mode)
    {
        return checkResult(cufftSetCompatibilityModeNative(plan, mode));
    }
    private static native int cufftSetCompatibilityModeNative(cufftHandle plan, int mode);
    
    

    //=== Single precision ===================================================

    /**
     *      * Executes a CUFFT complex-to-complex transform plan.
     *
     * cufftResult cufftExecC2C( cufftHandle plan, cufftComplex *idata, cufftComplex *odata, int direction );
     *
     * CUFFT uses as input data the GPU memory pointed to by the idata parameter. This
     * function stores the Fourier coefficients in the odata array. If idata and
     * odata are the same, this method does an in-place transform.
     *
     * Input
     * ----
     * plan      The cufftHandle object for the plan to update
     * idata     Pointer to the input data (in GPU memory) to transform
     * odata     Pointer to the output data (in GPU memory)
     * direction The transform direction: CUFFT_FORWARD or CUFFT_INVERSE
     *
     * Output
     * ----
     * odata Contains the complex Fourier coefficients
     *
     * Return Values
     * ----
     * CUFFT_SETUP_FAILED  CUFFT library failed to initialize.
     * CUFFT_INVALID_PLAN  The plan parameter is not a valid handle.
     * CUFFT_INVALID_VALUE The idata, odata, and/or direction parameter is not valid.
     * CUFFT_EXEC_FAILED   CUFFT failed to execute the transform on GPU.
     * CUFFT_SUCCESS       CUFFT successfully executed the FFT plan
     *
     * JCUFFT_INTERNAL_ERROR If an internal JCufft error occurred
     *      */
    public static int cufftExecC2C(cufftHandle plan, Pointer cIdata, Pointer cOdata, int direction)
    {
        return checkResult(cufftExecC2CNative(plan, cIdata, cOdata, direction));
    }
    private static native int cufftExecC2CNative(cufftHandle plan, Pointer cIdata, Pointer cOdata, int direction);


    /**
     * Convenience method for {@link JCufft#cufftExecC2C(cufftHandle, Pointer, Pointer, int)}.
     * Accepts arrays for input and output data and automatically performs the host-device 
     * and device-host copies.
     *
     * @see JCufft#cufftExecC2C(cufftHandle, Pointer, Pointer, int)
     */
    public static int cufftExecC2C(cufftHandle plan, float cIdata[], float cOdata[], int direction)
    {
        int cudaResult = 0;

        boolean inPlace = (cIdata == cOdata);

        // Allocate space for the input data on the device
        Pointer hostCIdata = Pointer.to(cIdata);
        Pointer deviceCIdata = new Pointer();
        cudaResult = JCuda.cudaMalloc(deviceCIdata, cIdata.length * Sizeof.FLOAT);
        if (cudaResult != cudaError.cudaSuccess)
        {
            if (exceptionsEnabled)
            {
                throw new CudaException("JCuda error: "+cudaError.stringFor(cudaResult));
            }
            return cufftResult.JCUFFT_INTERNAL_ERROR;
        }

        // Set the output device data to be equal to the input
        // device data for in-place transforms, or allocate
        // the output device data if the transform is not
        // in-place
        Pointer hostCOdata = null;
        Pointer deviceCOdata = null;
        if (inPlace)
        {
            hostCOdata = hostCIdata;
            deviceCOdata = deviceCIdata;
        }
        else
        {
            hostCOdata = Pointer.to(cOdata);
            deviceCOdata = new Pointer();
            cudaResult = JCuda.cudaMalloc(deviceCOdata, cOdata.length * Sizeof.FLOAT);
            if (cudaResult != cudaError.cudaSuccess)
            {
                JCuda.cudaFree(deviceCIdata);
                if (exceptionsEnabled)
                {
                    throw new CudaException("JCuda error: "+cudaError.stringFor(cudaResult));
                }
                return cufftResult.JCUFFT_INTERNAL_ERROR;
            }
        }

        // Copy the host input data to the device
        cudaResult = JCuda.cudaMemcpy(deviceCIdata, hostCIdata, cIdata.length * Sizeof.FLOAT, cudaMemcpyKind.cudaMemcpyHostToDevice);
        if (cudaResult != cudaError.cudaSuccess)
        {
            JCuda.cudaFree(deviceCIdata);
            if (!inPlace)
            {
                JCuda.cudaFree(deviceCOdata);
            }

            if (exceptionsEnabled)
            {
                throw new CudaException("JCuda error: "+cudaError.stringFor(cudaResult));
            }
            return cufftResult.JCUFFT_INTERNAL_ERROR;
        }

        // Execute the transform
        int result = cufftResult.CUFFT_SUCCESS;
        try
        {
            result = JCufft.cufftExecC2C(plan, deviceCIdata, deviceCOdata, direction);
        }
        catch (CudaException e)
        {
            JCuda.cudaFree(deviceCIdata);
            if (!inPlace)
            {
                JCuda.cudaFree(deviceCOdata);
            }
            result = cufftResult.JCUFFT_INTERNAL_ERROR;
        }
        if (result != cufftResult.CUFFT_SUCCESS)
        {
            if (exceptionsEnabled)
            {
                throw new CudaException(cufftResult.stringFor(result));
            }
            return result;
        }

        // Copy the device output data to the host
        cudaResult = JCuda.cudaMemcpy(hostCOdata, deviceCOdata, cOdata.length * Sizeof.FLOAT, cudaMemcpyKind.cudaMemcpyDeviceToHost);
        if (cudaResult != cudaError.cudaSuccess)
        {
            JCuda.cudaFree(deviceCIdata);
            if (!inPlace)
            {
                JCuda.cudaFree(deviceCOdata);
            }
            if (exceptionsEnabled)
            {
                throw new CudaException("JCuda error: "+cudaError.stringFor(cudaResult));
            }
            return cufftResult.JCUFFT_INTERNAL_ERROR;
        }

        // Free the device data
        cudaResult = JCuda.cudaFree(deviceCIdata);
        if (cudaResult != cudaError.cudaSuccess)
        {
            if (exceptionsEnabled)
            {
                throw new CudaException("JCuda error: "+cudaError.stringFor(cudaResult));
            }
            return cufftResult.JCUFFT_INTERNAL_ERROR;
        }
        if (!inPlace)
        {
            cudaResult = JCuda.cudaFree(deviceCOdata);
            if (cudaResult != cudaError.cudaSuccess)
            {
                if (exceptionsEnabled)
                {
                    throw new CudaException("JCuda error: "+cudaError.stringFor(cudaResult));
                }
                return cufftResult.JCUFFT_INTERNAL_ERROR;
            }
        }
        return result;
    }



    /**
     *      * Executes a CUFFT real-to-complex (implicitly forward) transform plan.
     *
     * cufftResult cufftExecR2C( cufftHandle plan, cufftReal *idata, cufftComplex *odata );
     *
     * CUFFT uses as input data the GPU memory pointed to by the idata
     * parameter. This function stores the non-redundant Fourier coefficients
     * in the odata array. If idata and odata are the same, this method does
     * an in-place transform (See CUFFT documentation for details on real
     * data FFTs.)
     *
     * Input
     * ----
     * plan      The cufftHandle object for the plan to update
     * idata     Pointer to the input data (in GPU memory) to transform
     * odata     Pointer to the output data (in GPU memory)
     * direction The transform direction: CUFFT_FORWARD or CUFFT_INVERSE
     *
     * Output
     * ----
     * odata Contains the complex Fourier coefficients
     *
     * Return Values
     * ----
     * CUFFT_SETUP_FAILED CUFFT library failed to initialize.
     * CUFFT_INVALID_PLAN The plan parameter is not a valid handle.
     * CUFFT_INVALID_VALUE The idata, odata, and/or direction parameter is not valid.
     * CUFFT_EXEC_FAILED CUFFT failed to execute the transform on GPU.
     * CUFFT_SUCCESS CUFFT successfully executed the FFT plan.
     *
     * JCUFFT_INTERNAL_ERROR If an internal JCufft error occurred
     *      */
    public static int cufftExecR2C(cufftHandle plan, Pointer rIdata, Pointer cOdata)
    {
        return checkResult(cufftExecR2CNative(plan, rIdata, cOdata));
    }
    private static native int cufftExecR2CNative(cufftHandle plan, Pointer rIdata, Pointer cOdata);



    /**
     * Convenience method for {@link JCufft#cufftExecR2C(cufftHandle, Pointer, Pointer)}.
     * Accepts arrays for input and output data and automatically performs the host-device
     * and device-host copies.
     *
     * @see JCufft#cufftExecR2C(cufftHandle, Pointer, Pointer)
     */
    public static int cufftExecR2C(cufftHandle plan, float rIdata[], float cOdata[])
    {
        int cudaResult = 0;

        boolean inPlace = (rIdata == cOdata);

        // Allocate space for the input data on the device
        Pointer hostRIdata = Pointer.to(rIdata);
        Pointer deviceRIdata = new Pointer();
        cudaResult = JCuda.cudaMalloc(deviceRIdata, rIdata.length * Sizeof.FLOAT);
        if (cudaResult != cudaError.cudaSuccess)
        {
            if (exceptionsEnabled)
            {
                throw new CudaException("JCuda error: "+cudaError.stringFor(cudaResult));
            }
            return cufftResult.JCUFFT_INTERNAL_ERROR;
        }

        // Allocate the output device data
        Pointer hostCOdata = null;
        Pointer deviceCOdata = null;
        if (inPlace)
        {
            hostCOdata = hostRIdata;
            deviceCOdata = deviceRIdata;
        }
        else
        {
            hostCOdata = Pointer.to(cOdata);
            deviceCOdata = new Pointer();
            cudaResult = JCuda.cudaMalloc(deviceCOdata, cOdata.length * Sizeof.FLOAT);
            if (cudaResult != cudaError.cudaSuccess)
            {
                JCuda.cudaFree(deviceCOdata);
                if (exceptionsEnabled)
                {
                    throw new CudaException("JCuda error: "+cudaError.stringFor(cudaResult));
                }
                return cufftResult.JCUFFT_INTERNAL_ERROR;
            }
        }

        // Copy the host input data to the device
        cudaResult = JCuda.cudaMemcpy(deviceRIdata, hostRIdata, rIdata.length * Sizeof.FLOAT, cudaMemcpyKind.cudaMemcpyHostToDevice);
        if (cudaResult != cudaError.cudaSuccess)
        {
            JCuda.cudaFree(deviceRIdata);
            if (!inPlace)
            {
                JCuda.cudaFree(deviceCOdata);
            }
            if (exceptionsEnabled)
            {
                throw new CudaException("JCuda error: "+cudaError.stringFor(cudaResult));
            }
            return cufftResult.JCUFFT_INTERNAL_ERROR;
        }

        // Execute the transform
        int result = cufftResult.CUFFT_SUCCESS;
        try
        {
            result = JCufft.cufftExecR2C(plan, deviceRIdata, deviceCOdata);
        }
        catch (CudaException e)
        {
            JCuda.cudaFree(deviceRIdata);
            if (!inPlace)
            {
                JCuda.cudaFree(deviceCOdata);
            }
            result = cufftResult.JCUFFT_INTERNAL_ERROR;
        }
        if (result != cufftResult.CUFFT_SUCCESS)
        {
            if (exceptionsEnabled)
            {
                throw new CudaException(cufftResult.stringFor(cudaResult));
            }
            return result;
        }

        // Copy the device output data to the host
        cudaResult = JCuda.cudaMemcpy(hostCOdata, deviceCOdata, cOdata.length * Sizeof.FLOAT, cudaMemcpyKind.cudaMemcpyDeviceToHost);
        if (cudaResult != cudaError.cudaSuccess)
        {
            JCuda.cudaFree(deviceRIdata);
            if (!inPlace)
            {
                JCuda.cudaFree(deviceCOdata);
            }
            if (exceptionsEnabled)
            {
                throw new CudaException("JCuda error: "+cudaError.stringFor(cudaResult));
            }
            return cufftResult.JCUFFT_INTERNAL_ERROR;
        }

        // Free the device data
        cudaResult = JCuda.cudaFree(deviceRIdata);
        if (cudaResult != cudaError.cudaSuccess)
        {
            if (exceptionsEnabled)
            {
                throw new CudaException("JCuda error: "+cudaError.stringFor(cudaResult));
            }
            return cufftResult.JCUFFT_INTERNAL_ERROR;
        }
        if (!inPlace)
        {
            cudaResult = JCuda.cudaFree(deviceCOdata);
            if (cudaResult != cudaError.cudaSuccess)
            {
                if (exceptionsEnabled)
                {
                    throw new CudaException("JCuda error: "+cudaError.stringFor(cudaResult));
                }
                return cufftResult.JCUFFT_INTERNAL_ERROR;
            }
        }
        return result;
    }








    /**
     *      * Executes a CUFFT complex-to-real (implicitly inverse) transform plan.
     *
     * cufftResult cufftExecC2R( cufftHandle plan, cufftComplex *idata, cufftReal *odata );
     *
     * CUFFT uses as input data the GPU memory pointed to by the idata
     * parameter. The input array holds only the non-redundant complex
     * Fourier coefficients. This function stores the real output values in the
     * odata array. If idata and odata are the same, this method does an inplace
     * transform. (See CUFFT documentation for details on real data FFTs.)
     *
     * Input
     * ----
     * plan The cufftHandle object for the plan to update
     * idata Pointer to the complex input data (in GPU memory) to transform
     * odata Pointer to the real output data (in GPU memory)
     *
     * Output
     * ----
     * odata Contains the real-valued output data
     *
     * Return Values
     * ----
     * CUFFT_SETUP_FAILED  CUFFT library failed to initialize.
     * CUFFT_INVALID_PLAN  The plan parameter is not a valid handle.
     * CUFFT_INVALID_VALUE The idata and/or odata parameter is not valid.
     * CUFFT_EXEC_FAILED   CUFFT failed to execute the transform on GPU.
     * CUFFT_SUCCESS       CUFFT successfully executed the FFT plan.
     *
     * JCUFFT_INTERNAL_ERROR If an internal JCufft error occurred
     *      */
    public static int cufftExecC2R(cufftHandle plan, Pointer cIdata, Pointer rOdata)
    {
        return checkResult(cufftExecC2RNative(plan, cIdata, rOdata));
    }
    private static native int cufftExecC2RNative(cufftHandle plan, Pointer cIdata, Pointer rOdata);



    /**
     * Convenience method for {@link JCufft#cufftExecC2R(cufftHandle, Pointer, Pointer)}.
     * Accepts arrays for input and output data and automatically performs the host-device
     * and device-host copies.
     *
     * @see JCufft#cufftExecC2R(cufftHandle, Pointer, Pointer)
     */
    public static int cufftExecC2R(cufftHandle plan, float cIdata[], float rOdata[])
    {
        int cudaResult = 0;

        boolean inPlace = (cIdata == rOdata);

        // Allocate space for the input data on the device
        Pointer hostCIdata = Pointer.to(cIdata);
        Pointer deviceCIdata = new Pointer();
        cudaResult = JCuda.cudaMalloc(deviceCIdata, cIdata.length * Sizeof.FLOAT);
        if (cudaResult != cudaError.cudaSuccess)
        {
            if (exceptionsEnabled)
            {
                throw new CudaException("JCuda error: "+cudaError.stringFor(cudaResult));
            }
            return cufftResult.JCUFFT_INTERNAL_ERROR;
        }

        // Allocate the output device data
        Pointer hostROdata = null;
        Pointer deviceROdata = null;
        if (inPlace)
        {
            hostROdata = hostCIdata;
            deviceROdata = deviceCIdata;
        }
        else
        {
            hostROdata = Pointer.to(rOdata);
            deviceROdata = new Pointer();
            cudaResult = JCuda.cudaMalloc(deviceROdata, rOdata.length * Sizeof.FLOAT);
            if (cudaResult != cudaError.cudaSuccess)
            {
                JCuda.cudaFree(deviceCIdata);
                if (exceptionsEnabled)
                {
                    throw new CudaException("JCuda error: "+cudaError.stringFor(cudaResult));
                }
                return cufftResult.JCUFFT_INTERNAL_ERROR;
            }
        }

        // Copy the host input data to the device
        cudaResult = JCuda.cudaMemcpy(deviceCIdata, hostCIdata, cIdata.length * Sizeof.FLOAT, cudaMemcpyKind.cudaMemcpyHostToDevice);
        if (cudaResult != cudaError.cudaSuccess)
        {
            JCuda.cudaFree(deviceCIdata);
            if (!inPlace)
            {
                JCuda.cudaFree(deviceROdata);
            }
            if (exceptionsEnabled)
            {
                throw new CudaException("JCuda error: "+cudaError.stringFor(cudaResult));
            }
            return cufftResult.JCUFFT_INTERNAL_ERROR;
        }

        // Execute the transform
        int result = cufftResult.CUFFT_SUCCESS;
        try
        {
            result = JCufft.cufftExecC2R(plan, deviceCIdata, deviceROdata);
        }
        catch (CudaException e)
        {
            JCuda.cudaFree(deviceCIdata);
            if (!inPlace)
            {
                JCuda.cudaFree(deviceROdata);
            }
            result = cufftResult.JCUFFT_INTERNAL_ERROR;
        }
        if (result != cufftResult.CUFFT_SUCCESS)
        {
            if (exceptionsEnabled)
            {
                throw new CudaException(cufftResult.stringFor(cudaResult));
            }
            return result;
        }

        // Copy the device output data to the host
        cudaResult = JCuda.cudaMemcpy(hostROdata, deviceROdata, rOdata.length * Sizeof.FLOAT, cudaMemcpyKind.cudaMemcpyDeviceToHost);
        if (cudaResult != cudaError.cudaSuccess)
        {
            JCuda.cudaFree(deviceCIdata);
            if (!inPlace)
            {
                JCuda.cudaFree(deviceROdata);
            }
            if (exceptionsEnabled)
            {
                throw new CudaException("JCuda error: "+cudaError.stringFor(cudaResult));
            }
            return cufftResult.JCUFFT_INTERNAL_ERROR;
        }

        // Free the device data
        cudaResult = JCuda.cudaFree(deviceCIdata);
        if (cudaResult != cudaError.cudaSuccess)
        {
            if (exceptionsEnabled)
            {
                throw new CudaException("JCuda error: "+cudaError.stringFor(cudaResult));
            }
            return cufftResult.JCUFFT_INTERNAL_ERROR;
        }
        if (!inPlace)
        {
            cudaResult = JCuda.cudaFree(deviceROdata);
            if (cudaResult != cudaError.cudaSuccess)
            {
                if (exceptionsEnabled)
                {
                    throw new CudaException("JCuda error: "+cudaError.stringFor(cudaResult));
                }
                return cufftResult.JCUFFT_INTERNAL_ERROR;
            }
        }
        return result;
    }





    //=== Double precision ===================================================

    /**
     *      * Executes a CUFFT complex-to-complex transform plan for double precision
     * values.
     *
     * cufftResult cufftExecZ2Z( cufftHandle plan, cufftDoubleComplex *idata, cufftDoubleComplex *odata, int direction );
     *
     * CUFFT uses as input data the GPU memory pointed to by the idata parameter. This
     * function stores the Fourier coefficients in the odata array. If idata and
     * odata are the same, this method does an in-place transform.
     *
     * Input
     * ----
     * plan      The cufftHandle object for the plan to update
     * idata     Pointer to the input data (in GPU memory) to transform
     * odata     Pointer to the output data (in GPU memory)
     * direction The transform direction: CUFFT_FORWARD or CUFFT_INVERSE
     *
     * Output
     * ----
     * odata Contains the complex Fourier coefficients
     *
     * Return Values
     * ----
     * CUFFT_SETUP_FAILED  CUFFT library failed to initialize.
     * CUFFT_INVALID_PLAN  The plan parameter is not a valid handle.
     * CUFFT_INVALID_VALUE The idata, odata, and/or direction parameter is not valid.
     * CUFFT_EXEC_FAILED   CUFFT failed to execute the transform on GPU.
     * CUFFT_SUCCESS       CUFFT successfully executed the FFT plan
     *
     * JCUFFT_INTERNAL_ERROR If an internal JCufft error occurred
     *      */
    public static int cufftExecZ2Z(cufftHandle plan, Pointer cIdata, Pointer cOdata, int direction)
    {
        return checkResult(cufftExecZ2ZNative(plan, cIdata, cOdata, direction));
    }
    private static native int cufftExecZ2ZNative(cufftHandle plan, Pointer cIdata, Pointer cOdata, int direction);


    /**
     * Convenience method for {@link JCufft#cufftExecZ2Z(cufftHandle, Pointer, Pointer, int)}.
     * Accepts arrays for input and output data and automatically performs the host-device
     * and device-host copies.
     *
     * @see JCufft#cufftExecZ2Z(cufftHandle, Pointer, Pointer, int)
     */
    public static int cufftExecZ2Z(cufftHandle plan, double cIdata[], double cOdata[], int direction)
    {
        int cudaResult = 0;

        boolean inPlace = (cIdata == cOdata);

        // Allocate space for the input data on the device
        Pointer hostCIdata = Pointer.to(cIdata);
        Pointer deviceCIdata = new Pointer();
        cudaResult = JCuda.cudaMalloc(deviceCIdata, cIdata.length * Sizeof.DOUBLE);
        if (cudaResult != cudaError.cudaSuccess)
        {
            if (exceptionsEnabled)
            {
                throw new CudaException("JCuda error: "+cudaError.stringFor(cudaResult));
            }
            return cufftResult.JCUFFT_INTERNAL_ERROR;
        }

        // Set the output device data to be equal to the input
        // device data for in-place transforms, or allocate
        // the output device data if the transform is not
        // in-place
        Pointer hostCOdata = null;
        Pointer deviceCOdata = null;
        if (inPlace)
        {
            hostCOdata = hostCIdata;
            deviceCOdata = deviceCIdata;
        }
        else
        {
            hostCOdata = Pointer.to(cOdata);
            deviceCOdata = new Pointer();
            cudaResult = JCuda.cudaMalloc(deviceCOdata, cOdata.length * Sizeof.DOUBLE);
            if (cudaResult != cudaError.cudaSuccess)
            {
                JCuda.cudaFree(deviceCIdata);
                if (exceptionsEnabled)
                {
                    throw new CudaException("JCuda error: "+cudaError.stringFor(cudaResult));
                }
                return cufftResult.JCUFFT_INTERNAL_ERROR;
            }
        }

        // Copy the host input data to the device
        cudaResult = JCuda.cudaMemcpy(deviceCIdata, hostCIdata, cIdata.length * Sizeof.DOUBLE, cudaMemcpyKind.cudaMemcpyHostToDevice);
        if (cudaResult != cudaError.cudaSuccess)
        {
            JCuda.cudaFree(deviceCIdata);
            if (!inPlace)
            {
                JCuda.cudaFree(deviceCOdata);
            }

            if (exceptionsEnabled)
            {
                throw new CudaException("JCuda error: "+cudaError.stringFor(cudaResult));
            }
            return cufftResult.JCUFFT_INTERNAL_ERROR;
        }

        // Execute the transform
        int result = cufftResult.CUFFT_SUCCESS;
        try
        {
            result = JCufft.cufftExecZ2Z(plan, deviceCIdata, deviceCOdata, direction);
        }
        catch (CudaException e)
        {
            JCuda.cudaFree(deviceCIdata);
            if (!inPlace)
            {
                JCuda.cudaFree(deviceCOdata);
            }
            result = cufftResult.JCUFFT_INTERNAL_ERROR;
        }
        if (result != cufftResult.CUFFT_SUCCESS)
        {
            if (exceptionsEnabled)
            {
                throw new CudaException(cufftResult.stringFor(cudaResult));
            }
            return result;
        }

        // Copy the device output data to the host
        cudaResult = JCuda.cudaMemcpy(hostCOdata, deviceCOdata, cOdata.length * Sizeof.DOUBLE, cudaMemcpyKind.cudaMemcpyDeviceToHost);
        if (cudaResult != cudaError.cudaSuccess)
        {
            JCuda.cudaFree(deviceCIdata);
            if (!inPlace)
            {
                JCuda.cudaFree(deviceCOdata);
            }
            if (exceptionsEnabled)
            {
                throw new CudaException("JCuda error: "+cudaError.stringFor(cudaResult));
            }
            return cufftResult.JCUFFT_INTERNAL_ERROR;
        }

        // Free the device data
        cudaResult = JCuda.cudaFree(deviceCIdata);
        if (cudaResult != cudaError.cudaSuccess)
        {
            if (exceptionsEnabled)
            {
                throw new CudaException("JCuda error: "+cudaError.stringFor(cudaResult));
            }
            return cufftResult.JCUFFT_INTERNAL_ERROR;
        }
        if (!inPlace)
        {
            cudaResult = JCuda.cudaFree(deviceCOdata);
            if (cudaResult != cudaError.cudaSuccess)
            {
                if (exceptionsEnabled)
                {
                    throw new CudaException("JCuda error: "+cudaError.stringFor(cudaResult));
                }
                return cufftResult.JCUFFT_INTERNAL_ERROR;
            }
        }
        return result;
    }



    /**
     *      * Executes a CUFFT real-to-complex (implicitly forward) transform plan
     * for double precision values.
     *
     * cufftResult cufftExecD2Z( cufftHandle plan, cufftDoubleReal *idata, cufftDoubleComplex *odata );
     *
     * CUFFT uses as input data the GPU memory pointed to by the idata
     * parameter. This function stores the non-redundant Fourier coefficients
     * in the odata array. If idata and odata are the same, this method does
     * an in-place transform (See CUFFT documentation for details on real
     * data FFTs.)
     *
     * Input
     * ----
     * plan      The cufftHandle object for the plan to update
     * idata     Pointer to the input data (in GPU memory) to transform
     * odata     Pointer to the output data (in GPU memory)
     * direction The transform direction: CUFFT_FORWARD or CUFFT_INVERSE
     *
     * Output
     * ----
     * odata Contains the complex Fourier coefficients
     *
     * Return Values
     * ----
     * CUFFT_SETUP_FAILED CUFFT library failed to initialize.
     * CUFFT_INVALID_PLAN The plan parameter is not a valid handle.
     * CUFFT_INVALID_VALUE The idata, odata, and/or direction parameter is not valid.
     * CUFFT_EXEC_FAILED CUFFT failed to execute the transform on GPU.
     * CUFFT_SUCCESS CUFFT successfully executed the FFT plan.
     *
     * JCUFFT_INTERNAL_ERROR If an internal JCufft error occurred
     *      */
    public static int cufftExecD2Z(cufftHandle plan, Pointer rIdata, Pointer cOdata)
    {
        return checkResult(cufftExecD2ZNative(plan, rIdata, cOdata));
    }
    private static native int cufftExecD2ZNative(cufftHandle plan, Pointer rIdata, Pointer cOdata);



    /**
     * Convenience method for {@link JCufft#cufftExecD2Z(cufftHandle, Pointer, Pointer)}.
     * Accepts arrays for input and output data and automatically performs the host-device
     * and device-host copies.
     *
     * @see JCufft#cufftExecD2Z(cufftHandle, Pointer, Pointer)
     */
    public static int cufftExecD2Z(cufftHandle plan, double rIdata[], double cOdata[])
    {
        return cufftExecR2C(plan, rIdata, cOdata);
    }

    /**
     * @see JCufft#cufftExecD2Z(cufftHandle, Pointer, Pointer)
     * @see JCufft#cufftExecD2Z(cufftHandle, double[], double[])
     * @deprecated This method will be removed in a future release.
     * It should have been called 'cufftExecD2Z'.
     */
    public static int cufftExecR2C(cufftHandle plan, double rIdata[], double cOdata[])
    {
        int cudaResult = 0;

        boolean inPlace = (rIdata == cOdata);

        // Allocate space for the input data on the device
        Pointer hostRIdata = Pointer.to(rIdata);
        Pointer deviceRIdata = new Pointer();
        cudaResult = JCuda.cudaMalloc(deviceRIdata, rIdata.length * Sizeof.DOUBLE);
        if (cudaResult != cudaError.cudaSuccess)
        {
            if (exceptionsEnabled)
            {
                throw new CudaException("JCuda error: "+cudaError.stringFor(cudaResult));
            }
            return cufftResult.JCUFFT_INTERNAL_ERROR;
        }

        // Allocate the output device data
        Pointer hostCOdata = null;
        Pointer deviceCOdata = null;
        if (inPlace)
        {
            hostCOdata = hostRIdata;
            deviceCOdata = deviceRIdata;
        }
        else
        {
            hostCOdata = Pointer.to(cOdata);
            deviceCOdata = new Pointer();
            cudaResult = JCuda.cudaMalloc(deviceCOdata, cOdata.length * Sizeof.DOUBLE);
            if (cudaResult != cudaError.cudaSuccess)
            {
                JCuda.cudaFree(deviceCOdata);
                if (exceptionsEnabled)
                {
                    throw new CudaException("JCuda error: "+cudaError.stringFor(cudaResult));
                }
                return cufftResult.JCUFFT_INTERNAL_ERROR;
            }
        }

        // Copy the host input data to the device
        cudaResult = JCuda.cudaMemcpy(deviceRIdata, hostRIdata, rIdata.length * Sizeof.DOUBLE, cudaMemcpyKind.cudaMemcpyHostToDevice);
        if (cudaResult != cudaError.cudaSuccess)
        {
            JCuda.cudaFree(deviceRIdata);
            if (!inPlace)
            {
                JCuda.cudaFree(deviceCOdata);
            }
            if (exceptionsEnabled)
            {
                throw new CudaException("JCuda error: "+cudaError.stringFor(cudaResult));
            }
            return cufftResult.JCUFFT_INTERNAL_ERROR;
        }

        // Execute the transform
        int result = cufftResult.CUFFT_SUCCESS;
        try
        {
            result = JCufft.cufftExecD2Z(plan, deviceRIdata, deviceCOdata);
        }
        catch (CudaException e)
        {
            JCuda.cudaFree(deviceRIdata);
            if (!inPlace)
            {
                JCuda.cudaFree(deviceCOdata);
            }
            result = cufftResult.JCUFFT_INTERNAL_ERROR;
        }
        if (result != cufftResult.CUFFT_SUCCESS)
        {
            if (exceptionsEnabled)
            {
                throw new CudaException(cufftResult.stringFor(cudaResult));
            }
            return result;
        }

        // Copy the device output data to the host
        cudaResult = JCuda.cudaMemcpy(hostCOdata, deviceCOdata, cOdata.length * Sizeof.DOUBLE, cudaMemcpyKind.cudaMemcpyDeviceToHost);
        if (cudaResult != cudaError.cudaSuccess)
        {
            JCuda.cudaFree(deviceRIdata);
            if (!inPlace)
            {
                JCuda.cudaFree(deviceCOdata);
            }
            if (exceptionsEnabled)
            {
                throw new CudaException("JCuda error: "+cudaError.stringFor(cudaResult));
            }
            return cufftResult.JCUFFT_INTERNAL_ERROR;
        }

        // Free the device data
        cudaResult = JCuda.cudaFree(deviceRIdata);
        if (cudaResult != cudaError.cudaSuccess)
        {
            if (exceptionsEnabled)
            {
                throw new CudaException("JCuda error: "+cudaError.stringFor(cudaResult));
            }
            return cufftResult.JCUFFT_INTERNAL_ERROR;
        }
        if (!inPlace)
        {
            cudaResult = JCuda.cudaFree(deviceCOdata);
            if (cudaResult != cudaError.cudaSuccess)
            {
                if (exceptionsEnabled)
                {
                    throw new CudaException("JCuda error: "+cudaError.stringFor(cudaResult));
                }
                return cufftResult.JCUFFT_INTERNAL_ERROR;
            }
        }
        return result;
    }








    /**
     *      * Executes a CUFFT complex-to-real (implicitly inverse) transform plan
     * for double precision values.
     *
     * cufftResult cufftExecZ2D( cufftHandle plan, cufftDoubleComplex *idata, cufftDoubleReal *odata );
     *
     * CUFFT uses as input data the GPU memory pointed to by the idata
     * parameter. The input array holds only the non-redundant complex
     * Fourier coefficients. This function stores the real output values in the
     * odata array. If idata and odata are the same, this method does an inplace
     * transform. (See CUFFT documentation for details on real data FFTs.)
     *
     * Input
     * ----
     * plan The cufftHandle object for the plan to update
     * idata Pointer to the complex input data (in GPU memory) to transform
     * odata Pointer to the real output data (in GPU memory)
     *
     * Output
     * ----
     * odata Contains the real-valued output data
     *
     * Return Values
     * ----
     * CUFFT_SETUP_FAILED  CUFFT library failed to initialize.
     * CUFFT_INVALID_PLAN  The plan parameter is not a valid handle.
     * CUFFT_INVALID_VALUE The idata and/or odata parameter is not valid.
     * CUFFT_EXEC_FAILED   CUFFT failed to execute the transform on GPU.
     * CUFFT_SUCCESS       CUFFT successfully executed the FFT plan.
     *
     * JCUFFT_INTERNAL_ERROR If an internal JCufft error occurred
     *      */
    public static int cufftExecZ2D(cufftHandle plan, Pointer cIdata, Pointer rOdata)
    {
        return checkResult(cufftExecZ2DNative(plan, cIdata, rOdata));
    }
    private static native int cufftExecZ2DNative(cufftHandle plan, Pointer cIdata, Pointer rOdata);



    /**
     * Convenience method for {@link JCufft#cufftExecZ2D(cufftHandle, Pointer, Pointer)}.
     * Accepts arrays for input and output data and automatically performs the host-device
     * and device-host copies.
     *
     * @see JCufft#cufftExecZ2D(cufftHandle, Pointer, Pointer)
     */
    public static int cufftExecZ2D(cufftHandle plan, double cIdata[], double rOdata[])
    {
        int cudaResult = 0;

        boolean inPlace = (cIdata == rOdata);
        
        // Allocate space for the input data on the device
        Pointer hostCIdata = Pointer.to(cIdata);
        Pointer deviceCIdata = new Pointer();
        cudaResult = JCuda.cudaMalloc(deviceCIdata, cIdata.length * Sizeof.DOUBLE);
        if (cudaResult != cudaError.cudaSuccess)
        {
            if (exceptionsEnabled)
            {
                throw new CudaException("JCuda error: "+cudaError.stringFor(cudaResult));
            }
            return cufftResult.JCUFFT_INTERNAL_ERROR;
        }

        // Allocate the output device data
        Pointer hostROdata = null;
        Pointer deviceROdata = null;
        if (inPlace)
        {
            hostROdata = hostCIdata;
            deviceROdata = deviceCIdata;
        }
        else
        {
            hostROdata = Pointer.to(rOdata);
            deviceROdata = new Pointer();
            cudaResult = JCuda.cudaMalloc(deviceROdata, rOdata.length * Sizeof.DOUBLE);
            if (cudaResult != cudaError.cudaSuccess)
            {
                JCuda.cudaFree(deviceCIdata);
                if (exceptionsEnabled)
                {
                    throw new CudaException("JCuda error: "+cudaError.stringFor(cudaResult));
                }
                return cufftResult.JCUFFT_INTERNAL_ERROR;
            }
        }

        // Copy the host input data to the device
        cudaResult = JCuda.cudaMemcpy(deviceCIdata, hostCIdata, cIdata.length * Sizeof.DOUBLE, cudaMemcpyKind.cudaMemcpyHostToDevice);
        if (cudaResult != cudaError.cudaSuccess)
        {
            JCuda.cudaFree(deviceCIdata);
            if (!inPlace)
            {
                JCuda.cudaFree(deviceROdata);
            }
            if (exceptionsEnabled)
            {
                throw new CudaException("JCuda error: "+cudaError.stringFor(cudaResult));
            }
            return cufftResult.JCUFFT_INTERNAL_ERROR;
        }

        // Execute the transform
        int result = cufftResult.CUFFT_SUCCESS;
        try
        {
            result = JCufft.cufftExecZ2D(plan, deviceCIdata, deviceROdata);
        }
        catch (CudaException e)
        {
            JCuda.cudaFree(deviceCIdata);
            if (!inPlace)
            {
                JCuda.cudaFree(deviceROdata);
            }
            result = cufftResult.JCUFFT_INTERNAL_ERROR;
        }
        if (result != cufftResult.CUFFT_SUCCESS)
        {
            if (exceptionsEnabled)
            {
                throw new CudaException(cufftResult.stringFor(cudaResult));
            }
            return result;
        }

        // Copy the device output data to the host
        cudaResult = JCuda.cudaMemcpy(hostROdata, deviceROdata, rOdata.length * Sizeof.DOUBLE, cudaMemcpyKind.cudaMemcpyDeviceToHost);
        if (cudaResult != cudaError.cudaSuccess)
        {
            JCuda.cudaFree(deviceCIdata);
            if (!inPlace)
            {
                JCuda.cudaFree(deviceROdata);
            }
            if (exceptionsEnabled)
            {
                throw new CudaException("JCuda error: "+cudaError.stringFor(cudaResult));
            }
            return cufftResult.JCUFFT_INTERNAL_ERROR;
        }

        // Free the device data
        cudaResult = JCuda.cudaFree(deviceCIdata);
        if (cudaResult != cudaError.cudaSuccess)
        {
            if (exceptionsEnabled)
            {
                throw new CudaException("JCuda error: "+cudaError.stringFor(cudaResult));
            }
            return cufftResult.JCUFFT_INTERNAL_ERROR;
        }
        if (!inPlace)
        {
            cudaResult = JCuda.cudaFree(deviceROdata);
            if (cudaResult != cudaError.cudaSuccess)
            {
                if (exceptionsEnabled)
                {
                    throw new CudaException("JCuda error: "+cudaError.stringFor(cudaResult));
                }
                return cufftResult.JCUFFT_INTERNAL_ERROR;
            }
        }
        return result;
    }

}