
opencv2/gpu/device/detail/reduce.hpp

/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistributions of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistributions in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#ifndef __OPENCV_GPU_REDUCE_DETAIL_HPP__
#define __OPENCV_GPU_REDUCE_DETAIL_HPP__

#include <thrust/tuple.h>
#include "../warp.hpp"
#include "../warp_shuffle.hpp"

namespace cv { namespace gpu { namespace device
{
    namespace reduce_detail
    {
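        // GetType<T> recovers the element type behind a pointer, volatile
        // pointer, or reference; merge() uses it to declare a register
        // temporary for each tuple element.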
        template <typename T> struct GetType;
        template <typename T> struct GetType<T*>
        {
            typedef T type;
        };
        template <typename T> struct GetType<volatile T*>
        {
            typedef T type;
        };
        template <typename T> struct GetType<T&>
        {
            typedef T type;
        };

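        // For<I, N> unrolls a compile-time loop over tuple elements [I, N):
        // each method handles element I, then recurses on I + 1. The
        // For<N, N> specialization terminates the recursion with no-ops.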
        template <unsigned int I, unsigned int N>
        struct For
        {
            template <class PointerTuple, class ValTuple>
            static __device__ void loadToSmem(const PointerTuple& smem, const ValTuple& val, unsigned int tid)
            {
                thrust::get<I>(smem)[tid] = thrust::get<I>(val);

                For<I + 1, N>::loadToSmem(smem, val, tid);
            }
            template <class PointerTuple, class ValTuple>
            static __device__ void loadFromSmem(const PointerTuple& smem, const ValTuple& val, unsigned int tid)
            {
                thrust::get<I>(val) = thrust::get<I>(smem)[tid];

                For<I + 1, N>::loadFromSmem(smem, val, tid);
            }

            template <class PointerTuple, class ValTuple, class OpTuple>
            static __device__ void merge(const PointerTuple& smem, const ValTuple& val, unsigned int tid, unsigned int delta, const OpTuple& op)
            {
                typename GetType<typename thrust::tuple_element<I, PointerTuple>::type>::type reg = thrust::get<I>(smem)[tid + delta];
                thrust::get<I>(smem)[tid] = thrust::get<I>(val) = thrust::get<I>(op)(thrust::get<I>(val), reg);

                For<I + 1, N>::merge(smem, val, tid, delta, op);
            }
            template <class ValTuple, class OpTuple>
            static __device__ void mergeShfl(const ValTuple& val, unsigned int delta, unsigned int width, const OpTuple& op)
            {
                typename GetType<typename thrust::tuple_element<I, ValTuple>::type>::type reg = shfl_down(thrust::get<I>(val), delta, width);
                thrust::get<I>(val) = thrust::get<I>(op)(thrust::get<I>(val), reg);

                For<I + 1, N>::mergeShfl(val, delta, width, op);
            }
        };
        template <unsigned int N>
        struct For<N, N>
        {
            template <class PointerTuple, class ValTuple>
            static __device__ void loadToSmem(const PointerTuple&, const ValTuple&, unsigned int)
            {
            }
            template <class PointerTuple, class ValTuple>
            static __device__ void loadFromSmem(const PointerTuple&, const ValTuple&, unsigned int)
            {
            }

            template <class PointerTuple, class ValTuple, class OpTuple>
            static __device__ void merge(const PointerTuple&, const ValTuple&, unsigned int, unsigned int, const OpTuple&)
            {
            }
            template <class ValTuple, class OpTuple>
            static __device__ void mergeShfl(const ValTuple&, unsigned int, unsigned int, const OpTuple&)
            {
            }
        };

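        // Shared-memory staging. The scalar overloads read/write one value at
        // smem[tid]; the ten-parameter tuple overloads (thrust::tuple's
        // maximum arity) let several reductions proceed in lockstep.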
        template <typename T>
        __device__ __forceinline__ void loadToSmem(volatile T* smem, T& val, unsigned int tid)
        {
            smem[tid] = val;
        }
        template <typename T>
        __device__ __forceinline__ void loadFromSmem(volatile T* smem, T& val, unsigned int tid)
        {
            val = smem[tid];
        }
        template <typename P0, typename P1, typename P2, typename P3, typename P4, typename P5, typename P6, typename P7, typename P8, typename P9,
                  typename R0, typename R1, typename R2, typename R3, typename R4, typename R5, typename R6, typename R7, typename R8, typename R9>
        __device__ __forceinline__ void loadToSmem(const thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9>& smem,
                                                       const thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9>& val,
                                                       unsigned int tid)
        {
            For<0, thrust::tuple_size<thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9> >::value>::loadToSmem(smem, val, tid);
        }
        template <typename P0, typename P1, typename P2, typename P3, typename P4, typename P5, typename P6, typename P7, typename P8, typename P9,
                  typename R0, typename R1, typename R2, typename R3, typename R4, typename R5, typename R6, typename R7, typename R8, typename R9>
        __device__ __forceinline__ void loadFromSmem(const thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9>& smem,
                                                         const thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9>& val,
                                                         unsigned int tid)
        {
            For<0, thrust::tuple_size<thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9> >::value>::loadFromSmem(smem, val, tid);
        }

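        // merge() combines the caller's value with its partner at tid + delta
        // through op, keeping smem and val consistent; mergeShfl() does the
        // same with warp shuffles and no shared memory at all.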
        template <typename T, class Op>
        __device__ __forceinline__ void merge(volatile T* smem, T& val, unsigned int tid, unsigned int delta, const Op& op)
        {
            T reg = smem[tid + delta];
            smem[tid] = val = op(val, reg);
        }
        template <typename T, class Op>
        __device__ __forceinline__ void mergeShfl(T& val, unsigned int delta, unsigned int width, const Op& op)
        {
            T reg = shfl_down(val, delta, width);
            val = op(val, reg);
        }
        template <typename P0, typename P1, typename P2, typename P3, typename P4, typename P5, typename P6, typename P7, typename P8, typename P9,
                  typename R0, typename R1, typename R2, typename R3, typename R4, typename R5, typename R6, typename R7, typename R8, typename R9,
                  class Op0, class Op1, class Op2, class Op3, class Op4, class Op5, class Op6, class Op7, class Op8, class Op9>
        __device__ __forceinline__ void merge(const thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9>& smem,
                                              const thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9>& val,
                                              unsigned int tid,
                                              unsigned int delta,
                                              const thrust::tuple<Op0, Op1, Op2, Op3, Op4, Op5, Op6, Op7, Op8, Op9>& op)
        {
            For<0, thrust::tuple_size<thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9> >::value>::merge(smem, val, tid, delta, op);
        }
        template <typename R0, typename R1, typename R2, typename R3, typename R4, typename R5, typename R6, typename R7, typename R8, typename R9,
                  class Op0, class Op1, class Op2, class Op3, class Op4, class Op5, class Op6, class Op7, class Op8, class Op9>
        __device__ __forceinline__ void mergeShfl(const thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9>& val,
                                                  unsigned int delta,
                                                  unsigned int width,
                                                  const thrust::tuple<Op0, Op1, Op2, Op3, Op4, Op5, Op6, Op7, Op8, Op9>& op)
        {
            For<0, thrust::tuple_size<thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9> >::value>::mergeShfl(val, delta, width, op);
        }

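        // Generic<N>: classic shared-memory tree reduction for any block size
        // N. Each step halves the active thread count; the final five steps
        // run inside a single warp, so they need no __syncthreads()
        // (warp-synchronous, hence the volatile smem pointers).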
        template <unsigned int N> struct Generic
        {
            template <typename Pointer, typename Reference, class Op>
            static __device__ void reduce(Pointer smem, Reference val, unsigned int tid, Op op)
            {
                loadToSmem(smem, val, tid);
                if (N >= 32)
                    __syncthreads();

                if (N >= 2048)
                {
                    if (tid < 1024)
                        merge(smem, val, tid, 1024, op);

                    __syncthreads();
                }
                if (N >= 1024)
                {
                    if (tid < 512)
                        merge(smem, val, tid, 512, op);

                    __syncthreads();
                }
                if (N >= 512)
                {
                    if (tid < 256)
                        merge(smem, val, tid, 256, op);

                    __syncthreads();
                }
                if (N >= 256)
                {
                    if (tid < 128)
                        merge(smem, val, tid, 128, op);

                    __syncthreads();
                }
                if (N >= 128)
                {
                    if (tid < 64)
                        merge(smem, val, tid, 64, op);

                    __syncthreads();
                }
                if (N >= 64)
                {
                    if (tid < 32)
                        merge(smem, val, tid, 32, op);
                }

                if (tid < 16)
                {
                    merge(smem, val, tid, 16, op);
                    merge(smem, val, tid, 8, op);
                    merge(smem, val, tid, 4, op);
                    merge(smem, val, tid, 2, op);
                    merge(smem, val, tid, 1, op);
                }
            }
        };

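        // Unroll<I, ...> emits the halving merge sequence I, I/2, ..., 1 at
        // compile time; the I == 0 specialization stops the recursion.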
        template <unsigned int I, typename Pointer, typename Reference, class Op>
        struct Unroll
        {
            static __device__ void loopShfl(Reference val, Op op, unsigned int N)
            {
                mergeShfl(val, I, N, op);
                Unroll<I / 2, Pointer, Reference, Op>::loopShfl(val, op, N);
            }
            static __device__ void loop(Pointer smem, Reference val, unsigned int tid, Op op)
            {
                merge(smem, val, tid, I, op);
                Unroll<I / 2, Pointer, Reference, Op>::loop(smem, val, tid, op);
            }
        };
        template <typename Pointer, typename Reference, class Op>
        struct Unroll<0, Pointer, Reference, Op>
        {
            static __device__ void loopShfl(Reference, Op, unsigned int)
            {
            }
            static __device__ void loop(Pointer, Reference, unsigned int, Op)
            {
            }
        };

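        // WarpOptimized<N>: for N <= 32 (a single warp) no synchronization is
        // needed; on sm_30+ the reduction runs entirely in registers via
        // shfl_down, otherwise it falls back to warp-synchronous shared memory.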
        template <unsigned int N> struct WarpOptimized
        {
            template <typename Pointer, typename Reference, class Op>
            static __device__ void reduce(Pointer smem, Reference val, unsigned int tid, Op op)
            {
            #if __CUDA_ARCH__ >= 300
                (void) smem;
                (void) tid;

                Unroll<N / 2, Pointer, Reference, Op>::loopShfl(val, op, N);
            #else
                loadToSmem(smem, val, tid);

                if (tid < N / 2)
                    Unroll<N / 2, Pointer, Reference, Op>::loop(smem, val, tid, op);
            #endif
            }
        };

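        // GenericOptimized32<N>: for power-of-two N up to 1024, each of the
        // M = N / 32 warps first reduces its own values, lane 0 stores the
        // warp's partial at smem[tid / 32], and the first warp then reduces
        // the M partials.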
        template <unsigned int N> struct GenericOptimized32
        {
            enum { M = N / 32 };

            template <typename Pointer, typename Reference, class Op>
            static __device__ void reduce(Pointer smem, Reference val, unsigned int tid, Op op)
            {
                const unsigned int laneId = Warp::laneId();

            #if __CUDA_ARCH__ >= 300
                Unroll<16, Pointer, Reference, Op>::loopShfl(val, op, warpSize);

                if (laneId == 0)
                    loadToSmem(smem, val, tid / 32);
            #else
                loadToSmem(smem, val, tid);

                if (laneId < 16)
                    Unroll<16, Pointer, Reference, Op>::loop(smem, val, tid, op);

                __syncthreads();

                if (laneId == 0)
                    loadToSmem(smem, val, tid / 32);
            #endif

                __syncthreads();

                loadFromSmem(smem, val, tid);

                if (tid < 32)
                {
                #if __CUDA_ARCH__ >= 300
                    Unroll<M / 2, Pointer, Reference, Op>::loopShfl(val, op, M);
                #else
                    Unroll<M / 2, Pointer, Reference, Op>::loop(smem, val, tid, op);
                #endif
                }
            }
        };

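        // Compile-time plumbing: StaticIf selects between two types on a
        // boolean flag, and IsPowerOf2 tests N via the N & (N - 1) trick.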
        template <bool Flag, typename T1, typename T2> struct StaticIf;
        template <typename T1, typename T2> struct StaticIf<true, T1, T2>
        {
            typedef T1 type;
        };
        template <typename T1, typename T2> struct StaticIf<false, T1, T2>
        {
            typedef T2 type;
        };

        template <unsigned int N> struct IsPowerOf2
        {
            enum { value = ((N != 0) && !(N & (N - 1))) };
        };

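        // Dispatcher<N> picks the best reductor for a block of N threads:
        // WarpOptimized for a single power-of-two warp, GenericOptimized32
        // for power-of-two sizes up to 1024, and Generic otherwise.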
        template <unsigned int N> struct Dispatcher
        {
            typedef typename StaticIf<
                (N <= 32) && IsPowerOf2<N>::value,
                WarpOptimized<N>,
                typename StaticIf<
                    (N <= 1024) && IsPowerOf2<N>::value,
                    GenericOptimized32<N>,
                    Generic<N>
                >::type
            >::type reductor;
        };
    }
}}}

#endif // __OPENCV_GPU_REDUCE_DETAIL_HPP__
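
For reference, here is a minimal sketch of how a kernel can drive this dispatcher. The blockSum kernel and SumOp functor below are illustrative assumptions, not part of OpenCV; real callers go through the reduce<N>() wrapper in opencv2/gpu/device/reduce.hpp, which forwards to Dispatcher<N>::reductor exactly as shown.

// Hypothetical example, not part of the header above.
#include "opencv2/gpu/device/detail/reduce.hpp"

// Hypothetical functor: addition on float (OpenCV's own functional.hpp
// provides an equivalent plus<T>).
struct SumOp
{
    __device__ float operator ()(float a, float b) const { return a + b; }
};

// Hypothetical kernel: each block reduces BLOCK_SIZE floats to one partial sum.
template <unsigned int BLOCK_SIZE>
__global__ void blockSum(const float* src, float* dst, unsigned int n)
{
    __shared__ float smem[BLOCK_SIZE];

    const unsigned int tid = threadIdx.x;
    const unsigned int idx = blockIdx.x * BLOCK_SIZE + tid;

    float val = (idx < n) ? src[idx] : 0.0f;

    // For BLOCK_SIZE == 256 the Dispatcher selects GenericOptimized32<256>:
    // eight per-warp reductions, then one pass over the eight partials.
    cv::gpu::device::reduce_detail::Dispatcher<BLOCK_SIZE>::reductor
        ::template reduce<volatile float*, float&, const SumOp&>(smem, val, tid, SumOp());

    if (tid == 0)
        dst[blockIdx.x] = val; // thread 0 holds the block's total
}

Launched as blockSum<256><<<numBlocks, 256>>>(src, dst, n), each block writes one partial sum; a second pass (or a host-side sum) combines the per-block results.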