init

2024-03-06 14:54:30 +08:00
commit edac2715f0
1525 changed files with 809982 additions and 0 deletions
--- a/OpenCV/include/opencv2/core/cuda/detail/color_detail.hpp
+++ b/OpenCV/include/opencv2/core/cuda/detail/color_detail.hpp
--- a/OpenCV/include/opencv2/core/cuda/detail/reduce.hpp
+++ b/OpenCV/include/opencv2/core/cuda/detail/reduce.hpp
@@ -0,0 +1,365 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CUDA_REDUCE_DETAIL_HPP
+#define OPENCV_CUDA_REDUCE_DETAIL_HPP
+
+#include <thrust/tuple.h>
+#include "../warp.hpp"
+#include "../warp_shuffle.hpp"
+
+//! @cond IGNORED
+
+namespace cv { namespace cuda { namespace device
+{
+    namespace reduce_detail
+    {
+        template <typename T> struct GetType;
+        template <typename T> struct GetType<T*>
+        {
+            typedef T type;
+        };
+        template <typename T> struct GetType<volatile T*>
+        {
+            typedef T type;
+        };
+        template <typename T> struct GetType<T&>
+        {
+            typedef T type;
+        };
+
+        template <unsigned int I, unsigned int N>
+        struct For
+        {
+            template <class PointerTuple, class ValTuple>
+            static __device__ void loadToSmem(const PointerTuple& smem, const ValTuple& val, unsigned int tid)
+            {
+                thrust::get<I>(smem)[tid] = thrust::get<I>(val);
+
+                For<I + 1, N>::loadToSmem(smem, val, tid);
+            }
+            template <class PointerTuple, class ValTuple>
+            static __device__ void loadFromSmem(const PointerTuple& smem, const ValTuple& val, unsigned int tid)
+            {
+                thrust::get<I>(val) = thrust::get<I>(smem)[tid];
+
+                For<I + 1, N>::loadFromSmem(smem, val, tid);
+            }
+
+            template <class PointerTuple, class ValTuple, class OpTuple>
+            static __device__ void merge(const PointerTuple& smem, const ValTuple& val, unsigned int tid, unsigned int delta, const OpTuple& op)
+            {
+                typename GetType<typename thrust::tuple_element<I, PointerTuple>::type>::type reg = thrust::get<I>(smem)[tid + delta];
+                thrust::get<I>(smem)[tid] = thrust::get<I>(val) = thrust::get<I>(op)(thrust::get<I>(val), reg);
+
+                For<I + 1, N>::merge(smem, val, tid, delta, op);
+            }
+            template <class ValTuple, class OpTuple>
+            static __device__ void mergeShfl(const ValTuple& val, unsigned int delta, unsigned int width, const OpTuple& op)
+            {
+                typename GetType<typename thrust::tuple_element<I, ValTuple>::type>::type reg = shfl_down(thrust::get<I>(val), delta, width);
+                thrust::get<I>(val) = thrust::get<I>(op)(thrust::get<I>(val), reg);
+
+                For<I + 1, N>::mergeShfl(val, delta, width, op);
+            }
+        };
+        template <unsigned int N>
+        struct For<N, N>
+        {
+            template <class PointerTuple, class ValTuple>
+            static __device__ void loadToSmem(const PointerTuple&, const ValTuple&, unsigned int)
+            {
+            }
+            template <class PointerTuple, class ValTuple>
+            static __device__ void loadFromSmem(const PointerTuple&, const ValTuple&, unsigned int)
+            {
+            }
+
+            template <class PointerTuple, class ValTuple, class OpTuple>
+            static __device__ void merge(const PointerTuple&, const ValTuple&, unsigned int, unsigned int, const OpTuple&)
+            {
+            }
+            template <class ValTuple, class OpTuple>
+            static __device__ void mergeShfl(const ValTuple&, unsigned int, unsigned int, const OpTuple&)
+            {
+            }
+        };
+
+        template <typename T>
+        __device__ __forceinline__ void loadToSmem(volatile T* smem, T& val, unsigned int tid)
+        {
+            smem[tid] = val;
+        }
+        template <typename T>
+        __device__ __forceinline__ void loadFromSmem(volatile T* smem, T& val, unsigned int tid)
+        {
+            val = smem[tid];
+        }
+        template <typename P0, typename P1, typename P2, typename P3, typename P4, typename P5, typename P6, typename P7, typename P8, typename P9,
+                  typename R0, typename R1, typename R2, typename R3, typename R4, typename R5, typename R6, typename R7, typename R8, typename R9>
+        __device__ __forceinline__ void loadToSmem(const thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9>& smem,
+                                                       const thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9>& val,
+                                                       unsigned int tid)
+        {
+            For<0, thrust::tuple_size<thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9> >::value>::loadToSmem(smem, val, tid);
+        }
+        template <typename P0, typename P1, typename P2, typename P3, typename P4, typename P5, typename P6, typename P7, typename P8, typename P9,
+                  typename R0, typename R1, typename R2, typename R3, typename R4, typename R5, typename R6, typename R7, typename R8, typename R9>
+        __device__ __forceinline__ void loadFromSmem(const thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9>& smem,
+                                                         const thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9>& val,
+                                                         unsigned int tid)
+        {
+            For<0, thrust::tuple_size<thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9> >::value>::loadFromSmem(smem, val, tid);
+        }
+
+        template <typename T, class Op>
+        __device__ __forceinline__ void merge(volatile T* smem, T& val, unsigned int tid, unsigned int delta, const Op& op)
+        {
+            T reg = smem[tid + delta];
+            smem[tid] = val = op(val, reg);
+        }
+        template <typename T, class Op>
+        __device__ __forceinline__ void mergeShfl(T& val, unsigned int delta, unsigned int width, const Op& op)
+        {
+            T reg = shfl_down(val, delta, width);
+            val = op(val, reg);
+        }
+        template <typename P0, typename P1, typename P2, typename P3, typename P4, typename P5, typename P6, typename P7, typename P8, typename P9,
+                  typename R0, typename R1, typename R2, typename R3, typename R4, typename R5, typename R6, typename R7, typename R8, typename R9,
+                  class Op0, class Op1, class Op2, class Op3, class Op4, class Op5, class Op6, class Op7, class Op8, class Op9>
+        __device__ __forceinline__ void merge(const thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9>& smem,
+                                              const thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9>& val,
+                                              unsigned int tid,
+                                              unsigned int delta,
+                                              const thrust::tuple<Op0, Op1, Op2, Op3, Op4, Op5, Op6, Op7, Op8, Op9>& op)
+        {
+            For<0, thrust::tuple_size<thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9> >::value>::merge(smem, val, tid, delta, op);
+        }
+        template <typename R0, typename R1, typename R2, typename R3, typename R4, typename R5, typename R6, typename R7, typename R8, typename R9,
+                  class Op0, class Op1, class Op2, class Op3, class Op4, class Op5, class Op6, class Op7, class Op8, class Op9>
+        __device__ __forceinline__ void mergeShfl(const thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9>& val,
+                                                  unsigned int delta,
+                                                  unsigned int width,
+                                                  const thrust::tuple<Op0, Op1, Op2, Op3, Op4, Op5, Op6, Op7, Op8, Op9>& op)
+        {
+            For<0, thrust::tuple_size<thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9> >::value>::mergeShfl(val, delta, width, op);
+        }
+
+        template <unsigned int N> struct Generic
+        {
+            template <typename Pointer, typename Reference, class Op>
+            static __device__ void reduce(Pointer smem, Reference val, unsigned int tid, Op op)
+            {
+                loadToSmem(smem, val, tid);
+                if (N >= 32)
+                    __syncthreads();
+
+                if (N >= 2048)
+                {
+                    if (tid < 1024)
+                        merge(smem, val, tid, 1024, op);
+
+                    __syncthreads();
+                }
+                if (N >= 1024)
+                {
+                    if (tid < 512)
+                        merge(smem, val, tid, 512, op);
+
+                    __syncthreads();
+                }
+                if (N >= 512)
+                {
+                    if (tid < 256)
+                        merge(smem, val, tid, 256, op);
+
+                    __syncthreads();
+                }
+                if (N >= 256)
+                {
+                    if (tid < 128)
+                        merge(smem, val, tid, 128, op);
+
+                    __syncthreads();
+                }
+                if (N >= 128)
+                {
+                    if (tid < 64)
+                        merge(smem, val, tid, 64, op);
+
+                    __syncthreads();
+                }
+                if (N >= 64)
+                {
+                    if (tid < 32)
+                        merge(smem, val, tid, 32, op);
+                }
+
+                if (tid < 16)
+                {
+                    merge(smem, val, tid, 16, op);
+                    merge(smem, val, tid, 8, op);
+                    merge(smem, val, tid, 4, op);
+                    merge(smem, val, tid, 2, op);
+                    merge(smem, val, tid, 1, op);
+                }
+            }
+        };
+
+        template <unsigned int I, typename Pointer, typename Reference, class Op>
+        struct Unroll
+        {
+            static __device__ void loopShfl(Reference val, Op op, unsigned int N)
+            {
+                mergeShfl(val, I, N, op);
+                Unroll<I / 2, Pointer, Reference, Op>::loopShfl(val, op, N);
+            }
+            static __device__ void loop(Pointer smem, Reference val, unsigned int tid, Op op)
+            {
+                merge(smem, val, tid, I, op);
+                Unroll<I / 2, Pointer, Reference, Op>::loop(smem, val, tid, op);
+            }
+        };
+        template <typename Pointer, typename Reference, class Op>
+        struct Unroll<0, Pointer, Reference, Op>
+        {
+            static __device__ void loopShfl(Reference, Op, unsigned int)
+            {
+            }
+            static __device__ void loop(Pointer, Reference, unsigned int, Op)
+            {
+            }
+        };
+
+        template <unsigned int N> struct WarpOptimized
+        {
+            template <typename Pointer, typename Reference, class Op>
+            static __device__ void reduce(Pointer smem, Reference val, unsigned int tid, Op op)
+            {
+            #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300
+                CV_UNUSED(smem);
+                CV_UNUSED(tid);
+
+                Unroll<N / 2, Pointer, Reference, Op>::loopShfl(val, op, N);
+            #else
+                loadToSmem(smem, val, tid);
+
+                if (tid < N / 2)
+                    Unroll<N / 2, Pointer, Reference, Op>::loop(smem, val, tid, op);
+            #endif
+            }
+        };
+
+        template <unsigned int N> struct GenericOptimized32
+        {
+            enum { M = N / 32 };
+
+            template <typename Pointer, typename Reference, class Op>
+            static __device__ void reduce(Pointer smem, Reference val, unsigned int tid, Op op)
+            {
+                const unsigned int laneId = Warp::laneId();
+
+            #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300
+                Unroll<16, Pointer, Reference, Op>::loopShfl(val, op, warpSize);
+
+                if (laneId == 0)
+                    loadToSmem(smem, val, tid / 32);
+            #else
+                loadToSmem(smem, val, tid);
+
+                if (laneId < 16)
+                    Unroll<16, Pointer, Reference, Op>::loop(smem, val, tid, op);
+
+                __syncthreads();
+
+                if (laneId == 0)
+                    loadToSmem(smem, val, tid / 32);
+            #endif
+
+                __syncthreads();
+
+                loadFromSmem(smem, val, tid);
+
+                if (tid < 32)
+                {
+                #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300
+                    Unroll<M / 2, Pointer, Reference, Op>::loopShfl(val, op, M);
+                #else
+                    Unroll<M / 2, Pointer, Reference, Op>::loop(smem, val, tid, op);
+                #endif
+                }
+            }
+        };
+
+        template <bool val, class T1, class T2> struct StaticIf;
+        template <class T1, class T2> struct StaticIf<true, T1, T2>
+        {
+            typedef T1 type;
+        };
+        template <class T1, class T2> struct StaticIf<false, T1, T2>
+        {
+            typedef T2 type;
+        };
+
+        template <unsigned int N> struct IsPowerOf2
+        {
+            enum { value = ((N != 0) && !(N & (N - 1))) };
+        };
+
+        template <unsigned int N> struct Dispatcher
+        {
+            typedef typename StaticIf<
+                (N <= 32) && IsPowerOf2<N>::value,
+                WarpOptimized<N>,
+                typename StaticIf<
+                    (N <= 1024) && IsPowerOf2<N>::value,
+                    GenericOptimized32<N>,
+                    Generic<N>
+                >::type
+            >::type reductor;
+        };
+    }
+}}}
+
+//! @endcond
+
+#endif // OPENCV_CUDA_REDUCE_DETAIL_HPP
--- a/OpenCV/include/opencv2/core/cuda/detail/reduce_key_val.hpp
+++ b/OpenCV/include/opencv2/core/cuda/detail/reduce_key_val.hpp
@@ -0,0 +1,502 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CUDA_PRED_VAL_REDUCE_DETAIL_HPP
+#define OPENCV_CUDA_PRED_VAL_REDUCE_DETAIL_HPP
+
+#include <thrust/tuple.h>
+#include "../warp.hpp"
+#include "../warp_shuffle.hpp"
+
+//! @cond IGNORED
+
+namespace cv { namespace cuda { namespace device
+{
+    namespace reduce_key_val_detail
+    {
+        template <typename T> struct GetType;
+        template <typename T> struct GetType<T*>
+        {
+            typedef T type;
+        };
+        template <typename T> struct GetType<volatile T*>
+        {
+            typedef T type;
+        };
+        template <typename T> struct GetType<T&>
+        {
+            typedef T type;
+        };
+
+        template <unsigned int I, unsigned int N>
+        struct For
+        {
+            template <class PointerTuple, class ReferenceTuple>
+            static __device__ void loadToSmem(const PointerTuple& smem, const ReferenceTuple& data, unsigned int tid)
+            {
+                thrust::get<I>(smem)[tid] = thrust::get<I>(data);
+
+                For<I + 1, N>::loadToSmem(smem, data, tid);
+            }
+            template <class PointerTuple, class ReferenceTuple>
+            static __device__ void loadFromSmem(const PointerTuple& smem, const ReferenceTuple& data, unsigned int tid)
+            {
+                thrust::get<I>(data) = thrust::get<I>(smem)[tid];
+
+                For<I + 1, N>::loadFromSmem(smem, data, tid);
+            }
+
+            template <class ReferenceTuple>
+            static __device__ void copyShfl(const ReferenceTuple& val, unsigned int delta, int width)
+            {
+                thrust::get<I>(val) = shfl_down(thrust::get<I>(val), delta, width);
+
+                For<I + 1, N>::copyShfl(val, delta, width);
+            }
+            template <class PointerTuple, class ReferenceTuple>
+            static __device__ void copy(const PointerTuple& svals, const ReferenceTuple& val, unsigned int tid, unsigned int delta)
+            {
+                thrust::get<I>(svals)[tid] = thrust::get<I>(val) = thrust::get<I>(svals)[tid + delta];
+
+                For<I + 1, N>::copy(svals, val, tid, delta);
+            }
+
+            template <class KeyReferenceTuple, class ValReferenceTuple, class CmpTuple>
+            static __device__ void mergeShfl(const KeyReferenceTuple& key, const ValReferenceTuple& val, const CmpTuple& cmp, unsigned int delta, int width)
+            {
+                typename GetType<typename thrust::tuple_element<I, KeyReferenceTuple>::type>::type reg = shfl_down(thrust::get<I>(key), delta, width);
+
+                if (thrust::get<I>(cmp)(reg, thrust::get<I>(key)))
+                {
+                    thrust::get<I>(key) = reg;
+                    thrust::get<I>(val) = shfl_down(thrust::get<I>(val), delta, width);
+                }
+
+                For<I + 1, N>::mergeShfl(key, val, cmp, delta, width);
+            }
+            template <class KeyPointerTuple, class KeyReferenceTuple, class ValPointerTuple, class ValReferenceTuple, class CmpTuple>
+            static __device__ void merge(const KeyPointerTuple& skeys, const KeyReferenceTuple& key,
+                                         const ValPointerTuple& svals, const ValReferenceTuple& val,
+                                         const CmpTuple& cmp,
+                                         unsigned int tid, unsigned int delta)
+            {
+                typename GetType<typename thrust::tuple_element<I, KeyPointerTuple>::type>::type reg = thrust::get<I>(skeys)[tid + delta];
+
+                if (thrust::get<I>(cmp)(reg, thrust::get<I>(key)))
+                {
+                    thrust::get<I>(skeys)[tid] = thrust::get<I>(key) = reg;
+                    thrust::get<I>(svals)[tid] = thrust::get<I>(val) = thrust::get<I>(svals)[tid + delta];
+                }
+
+                For<I + 1, N>::merge(skeys, key, svals, val, cmp, tid, delta);
+            }
+        };
+        template <unsigned int N>
+        struct For<N, N>
+        {
+            template <class PointerTuple, class ReferenceTuple>
+            static __device__ void loadToSmem(const PointerTuple&, const ReferenceTuple&, unsigned int)
+            {
+            }
+            template <class PointerTuple, class ReferenceTuple>
+            static __device__ void loadFromSmem(const PointerTuple&, const ReferenceTuple&, unsigned int)
+            {
+            }
+
+            template <class ReferenceTuple>
+            static __device__ void copyShfl(const ReferenceTuple&, unsigned int, int)
+            {
+            }
+            template <class PointerTuple, class ReferenceTuple>
+            static __device__ void copy(const PointerTuple&, const ReferenceTuple&, unsigned int, unsigned int)
+            {
+            }
+
+            template <class KeyReferenceTuple, class ValReferenceTuple, class CmpTuple>
+            static __device__ void mergeShfl(const KeyReferenceTuple&, const ValReferenceTuple&, const CmpTuple&, unsigned int, int)
+            {
+            }
+            template <class KeyPointerTuple, class KeyReferenceTuple, class ValPointerTuple, class ValReferenceTuple, class CmpTuple>
+            static __device__ void merge(const KeyPointerTuple&, const KeyReferenceTuple&,
+                                         const ValPointerTuple&, const ValReferenceTuple&,
+                                         const CmpTuple&,
+                                         unsigned int, unsigned int)
+            {
+            }
+        };
+
+        //////////////////////////////////////////////////////
+        // loadToSmem
+
+        template <typename T>
+        __device__ __forceinline__ void loadToSmem(volatile T* smem, T& data, unsigned int tid)
+        {
+            smem[tid] = data;
+        }
+        template <typename T>
+        __device__ __forceinline__ void loadFromSmem(volatile T* smem, T& data, unsigned int tid)
+        {
+            data = smem[tid];
+        }
+        template <typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
+                  typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9>
+        __device__ __forceinline__ void loadToSmem(const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& smem,
+                                                   const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& data,
+                                                   unsigned int tid)
+        {
+            For<0, thrust::tuple_size<thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9> >::value>::loadToSmem(smem, data, tid);
+        }
+        template <typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
+                  typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9>
+        __device__ __forceinline__ void loadFromSmem(const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& smem,
+                                                     const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& data,
+                                                     unsigned int tid)
+        {
+            For<0, thrust::tuple_size<thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9> >::value>::loadFromSmem(smem, data, tid);
+        }
+
+        //////////////////////////////////////////////////////
+        // copyVals
+
+        template <typename V>
+        __device__ __forceinline__ void copyValsShfl(V& val, unsigned int delta, int width)
+        {
+            val = shfl_down(val, delta, width);
+        }
+        template <typename V>
+        __device__ __forceinline__ void copyVals(volatile V* svals, V& val, unsigned int tid, unsigned int delta)
+        {
+            svals[tid] = val = svals[tid + delta];
+        }
+        template <typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9>
+        __device__ __forceinline__ void copyValsShfl(const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
+                                                     unsigned int delta,
+                                                     int width)
+        {
+            For<0, thrust::tuple_size<thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9> >::value>::copyShfl(val, delta, width);
+        }
+        template <typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
+                  typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9>
+        __device__ __forceinline__ void copyVals(const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& svals,
+                                                 const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
+                                                 unsigned int tid, unsigned int delta)
+        {
+            For<0, thrust::tuple_size<thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9> >::value>::copy(svals, val, tid, delta);
+        }
+
+        //////////////////////////////////////////////////////
+        // merge
+
+        template <typename K, typename V, class Cmp>
+        __device__ __forceinline__ void mergeShfl(K& key, V& val, const Cmp& cmp, unsigned int delta, int width)
+        {
+            K reg = shfl_down(key, delta, width);
+
+            if (cmp(reg, key))
+            {
+                key = reg;
+                copyValsShfl(val, delta, width);
+            }
+        }
+        template <typename K, typename V, class Cmp>
+        __device__ __forceinline__ void merge(volatile K* skeys, K& key, volatile V* svals, V& val, const Cmp& cmp, unsigned int tid, unsigned int delta)
+        {
+            K reg = skeys[tid + delta];
+
+            if (cmp(reg, key))
+            {
+                skeys[tid] = key = reg;
+                copyVals(svals, val, tid, delta);
+            }
+        }
+        template <typename K,
+                  typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9,
+                  class Cmp>
+        __device__ __forceinline__ void mergeShfl(K& key,
+                                                  const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
+                                                  const Cmp& cmp,
+                                                  unsigned int delta, int width)
+        {
+            K reg = shfl_down(key, delta, width);
+
+            if (cmp(reg, key))
+            {
+                key = reg;
+                copyValsShfl(val, delta, width);
+            }
+        }
+        template <typename K,
+                  typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
+                  typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9,
+                  class Cmp>
+        __device__ __forceinline__ void merge(volatile K* skeys, K& key,
+                                              const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& svals,
+                                              const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
+                                              const Cmp& cmp, unsigned int tid, unsigned int delta)
+        {
+            K reg = skeys[tid + delta];
+
+            if (cmp(reg, key))
+            {
+                skeys[tid] = key = reg;
+                copyVals(svals, val, tid, delta);
+            }
+        }
+        template <typename KR0, typename KR1, typename KR2, typename KR3, typename KR4, typename KR5, typename KR6, typename KR7, typename KR8, typename KR9,
+                  typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9,
+                  class Cmp0, class Cmp1, class Cmp2, class Cmp3, class Cmp4, class Cmp5, class Cmp6, class Cmp7, class Cmp8, class Cmp9>
+        __device__ __forceinline__ void mergeShfl(const thrust::tuple<KR0, KR1, KR2, KR3, KR4, KR5, KR6, KR7, KR8, KR9>& key,
+                                                  const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
+                                                  const thrust::tuple<Cmp0, Cmp1, Cmp2, Cmp3, Cmp4, Cmp5, Cmp6, Cmp7, Cmp8, Cmp9>& cmp,
+                                                  unsigned int delta, int width)
+        {
+            For<0, thrust::tuple_size<thrust::tuple<KR0, KR1, KR2, KR3, KR4, KR5, KR6, KR7, KR8, KR9> >::value>::mergeShfl(key, val, cmp, delta, width);
+        }
+        template <typename KP0, typename KP1, typename KP2, typename KP3, typename KP4, typename KP5, typename KP6, typename KP7, typename KP8, typename KP9,
+                  typename KR0, typename KR1, typename KR2, typename KR3, typename KR4, typename KR5, typename KR6, typename KR7, typename KR8, typename KR9,
+                  typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
+                  typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9,
+                  class Cmp0, class Cmp1, class Cmp2, class Cmp3, class Cmp4, class Cmp5, class Cmp6, class Cmp7, class Cmp8, class Cmp9>
+        __device__ __forceinline__ void merge(const thrust::tuple<KP0, KP1, KP2, KP3, KP4, KP5, KP6, KP7, KP8, KP9>& skeys,
+                                              const thrust::tuple<KR0, KR1, KR2, KR3, KR4, KR5, KR6, KR7, KR8, KR9>& key,
+                                              const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& svals,
+                                              const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
+                                              const thrust::tuple<Cmp0, Cmp1, Cmp2, Cmp3, Cmp4, Cmp5, Cmp6, Cmp7, Cmp8, Cmp9>& cmp,
+                                              unsigned int tid, unsigned int delta)
+        {
+            For<0, thrust::tuple_size<thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9> >::value>::merge(skeys, key, svals, val, cmp, tid, delta);
+        }
+
+        //////////////////////////////////////////////////////
+        // Generic
+
+        template <unsigned int N> struct Generic
+        {
+            template <class KP, class KR, class VP, class VR, class Cmp>
+            static __device__ void reduce(KP skeys, KR key, VP svals, VR val, unsigned int tid, Cmp cmp)
+            {
+                loadToSmem(skeys, key, tid);
+                loadValsToSmem(svals, val, tid);
+                if (N >= 32)
+                    __syncthreads();
+
+                if (N >= 2048)
+                {
+                    if (tid < 1024)
+                        merge(skeys, key, svals, val, cmp, tid, 1024);
+
+                    __syncthreads();
+                }
+                if (N >= 1024)
+                {
+                    if (tid < 512)
+                        merge(skeys, key, svals, val, cmp, tid, 512);
+
+                    __syncthreads();
+                }
+                if (N >= 512)
+                {
+                    if (tid < 256)
+                        merge(skeys, key, svals, val, cmp, tid, 256);
+
+                    __syncthreads();
+                }
+                if (N >= 256)
+                {
+                    if (tid < 128)
+                        merge(skeys, key, svals, val, cmp, tid, 128);
+
+                    __syncthreads();
+                }
+                if (N >= 128)
+                {
+                    if (tid < 64)
+                        merge(skeys, key, svals, val, cmp, tid, 64);
+
+                    __syncthreads();
+                }
+                if (N >= 64)
+                {
+                    if (tid < 32)
+                        merge(skeys, key, svals, val, cmp, tid, 32);
+                }
+
+                if (tid < 16)
+                {
+                    merge(skeys, key, svals, val, cmp, tid, 16);
+                    merge(skeys, key, svals, val, cmp, tid, 8);
+                    merge(skeys, key, svals, val, cmp, tid, 4);
+                    merge(skeys, key, svals, val, cmp, tid, 2);
+                    merge(skeys, key, svals, val, cmp, tid, 1);
+                }
+            }
+        };
+
+        template <unsigned int I, class KP, class KR, class VP, class VR, class Cmp>
+        struct Unroll
+        {
+            static __device__ void loopShfl(KR key, VR val, Cmp cmp, unsigned int N)
+            {
+                mergeShfl(key, val, cmp, I, N);
+                Unroll<I / 2, KP, KR, VP, VR, Cmp>::loopShfl(key, val, cmp, N);
+            }
+            static __device__ void loop(KP skeys, KR key, VP svals, VR val, unsigned int tid, Cmp cmp)
+            {
+                merge(skeys, key, svals, val, cmp, tid, I);
+                Unroll<I / 2, KP, KR, VP, VR, Cmp>::loop(skeys, key, svals, val, tid, cmp);
+            }
+        };
+        template <class KP, class KR, class VP, class VR, class Cmp>
+        struct Unroll<0, KP, KR, VP, VR, Cmp>
+        {
+            static __device__ void loopShfl(KR, VR, Cmp, unsigned int)
+            {
+            }
+            static __device__ void loop(KP, KR, VP, VR, unsigned int, Cmp)
+            {
+            }
+        };
+
+        template <unsigned int N> struct WarpOptimized
+        {
+            template <class KP, class KR, class VP, class VR, class Cmp>
+            static __device__ void reduce(KP skeys, KR key, VP svals, VR val, unsigned int tid, Cmp cmp)
+            {
+            #if 0 // __CUDA_ARCH__ >= 300
+                CV_UNUSED(skeys);
+                CV_UNUSED(svals);
+                CV_UNUSED(tid);
+
+                Unroll<N / 2, KP, KR, VP, VR, Cmp>::loopShfl(key, val, cmp, N);
+            #else
+                loadToSmem(skeys, key, tid);
+                loadToSmem(svals, val, tid);
+
+                if (tid < N / 2)
+                    Unroll<N / 2, KP, KR, VP, VR, Cmp>::loop(skeys, key, svals, val, tid, cmp);
+            #endif
+            }
+        };
+
+        template <unsigned int N> struct GenericOptimized32
+        {
+            enum { M = N / 32 };
+
+            template <class KP, class KR, class VP, class VR, class Cmp>
+            static __device__ void reduce(KP skeys, KR key, VP svals, VR val, unsigned int tid, Cmp cmp)
+            {
+                const unsigned int laneId = Warp::laneId();
+
+            #if 0 // __CUDA_ARCH__ >= 300
+                Unroll<16, KP, KR, VP, VR, Cmp>::loopShfl(key, val, cmp, warpSize);
+
+                if (laneId == 0)
+                {
+                    loadToSmem(skeys, key, tid / 32);
+                    loadToSmem(svals, val, tid / 32);
+                }
+            #else
+                loadToSmem(skeys, key, tid);
+                loadToSmem(svals, val, tid);
+
+                if (laneId < 16)
+                    Unroll<16, KP, KR, VP, VR, Cmp>::loop(skeys, key, svals, val, tid, cmp);
+
+                __syncthreads();
+
+                if (laneId == 0)
+                {
+                    loadToSmem(skeys, key, tid / 32);
+                    loadToSmem(svals, val, tid / 32);
+                }
+            #endif
+
+                __syncthreads();
+
+                loadFromSmem(skeys, key, tid);
+
+                if (tid < 32)
+                {
+                #if 0 // __CUDA_ARCH__ >= 300
+                    loadFromSmem(svals, val, tid);
+
+                    Unroll<M / 2, KP, KR, VP, VR, Cmp>::loopShfl(key, val, cmp, M);
+                #else
+                    Unroll<M / 2, KP, KR, VP, VR, Cmp>::loop(skeys, key, svals, val, tid, cmp);
+                #endif
+                }
+            }
+        };
+
+        template <bool val, class T1, class T2> struct StaticIf;
+        template <class T1, class T2> struct StaticIf<true, T1, T2>
+        {
+            typedef T1 type;
+        };
+        template <class T1, class T2> struct StaticIf<false, T1, T2>
+        {
+            typedef T2 type;
+        };
+
+        template <unsigned int N> struct IsPowerOf2
+        {
+            enum { value = ((N != 0) && !(N & (N - 1))) };
+        };
+
+        template <unsigned int N> struct Dispatcher
+        {
+            typedef typename StaticIf<
+                (N <= 32) && IsPowerOf2<N>::value,
+                WarpOptimized<N>,
+                typename StaticIf<
+                    (N <= 1024) && IsPowerOf2<N>::value,
+                    GenericOptimized32<N>,
+                    Generic<N>
+                >::type
+            >::type reductor;
+        };
+    }
+}}}
+
+//! @endcond
+
+#endif // OPENCV_CUDA_PRED_VAL_REDUCE_DETAIL_HPP
--- a/OpenCV/include/opencv2/core/cuda/detail/transform_detail.hpp
+++ b/OpenCV/include/opencv2/core/cuda/detail/transform_detail.hpp
@@ -0,0 +1,392 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CUDA_TRANSFORM_DETAIL_HPP
+#define OPENCV_CUDA_TRANSFORM_DETAIL_HPP
+
+#include "../common.hpp"
+#include "../vec_traits.hpp"
+#include "../functional.hpp"
+
+//! @cond IGNORED
+
+namespace cv { namespace cuda { namespace device
+{
+    namespace transform_detail
+    {
+        //! Read Write Traits
+
+        template <typename T, typename D, int shift> struct UnaryReadWriteTraits
+        {
+            typedef typename TypeVec<T, shift>::vec_type read_type;
+            typedef typename TypeVec<D, shift>::vec_type write_type;
+        };
+
+        template <typename T1, typename T2, typename D, int shift> struct BinaryReadWriteTraits
+        {
+            typedef typename TypeVec<T1, shift>::vec_type read_type1;
+            typedef typename TypeVec<T2, shift>::vec_type read_type2;
+            typedef typename TypeVec<D, shift>::vec_type write_type;
+        };
+
+        //! Transform kernels
+
+        template <int shift> struct OpUnroller;
+        template <> struct OpUnroller<1>
+        {
+            template <typename T, typename D, typename UnOp, typename Mask>
+            static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, UnOp& op, int x_shifted, int y)
+            {
+                if (mask(y, x_shifted))
+                    dst.x = op(src.x);
+            }
+
+            template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
+            static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, BinOp& op, int x_shifted, int y)
+            {
+                if (mask(y, x_shifted))
+                    dst.x = op(src1.x, src2.x);
+            }
+        };
+        template <> struct OpUnroller<2>
+        {
+            template <typename T, typename D, typename UnOp, typename Mask>
+            static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, UnOp& op, int x_shifted, int y)
+            {
+                if (mask(y, x_shifted))
+                    dst.x = op(src.x);
+                if (mask(y, x_shifted + 1))
+                    dst.y = op(src.y);
+            }
+
+            template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
+            static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, BinOp& op, int x_shifted, int y)
+            {
+                if (mask(y, x_shifted))
+                    dst.x = op(src1.x, src2.x);
+                if (mask(y, x_shifted + 1))
+                    dst.y = op(src1.y, src2.y);
+            }
+        };
+        template <> struct OpUnroller<3>
+        {
+            template <typename T, typename D, typename UnOp, typename Mask>
+            static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, const UnOp& op, int x_shifted, int y)
+            {
+                if (mask(y, x_shifted))
+                    dst.x = op(src.x);
+                if (mask(y, x_shifted + 1))
+                    dst.y = op(src.y);
+                if (mask(y, x_shifted + 2))
+                    dst.z = op(src.z);
+            }
+
+            template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
+            static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, const BinOp& op, int x_shifted, int y)
+            {
+                if (mask(y, x_shifted))
+                    dst.x = op(src1.x, src2.x);
+                if (mask(y, x_shifted + 1))
+                    dst.y = op(src1.y, src2.y);
+                if (mask(y, x_shifted + 2))
+                    dst.z = op(src1.z, src2.z);
+            }
+        };
+        template <> struct OpUnroller<4>
+        {
+            template <typename T, typename D, typename UnOp, typename Mask>
+            static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, const UnOp& op, int x_shifted, int y)
+            {
+                if (mask(y, x_shifted))
+                    dst.x = op(src.x);
+                if (mask(y, x_shifted + 1))
+                    dst.y = op(src.y);
+                if (mask(y, x_shifted + 2))
+                    dst.z = op(src.z);
+                if (mask(y, x_shifted + 3))
+                    dst.w = op(src.w);
+            }
+
+            template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
+            static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, const BinOp& op, int x_shifted, int y)
+            {
+                if (mask(y, x_shifted))
+                    dst.x = op(src1.x, src2.x);
+                if (mask(y, x_shifted + 1))
+                    dst.y = op(src1.y, src2.y);
+                if (mask(y, x_shifted + 2))
+                    dst.z = op(src1.z, src2.z);
+                if (mask(y, x_shifted + 3))
+                    dst.w = op(src1.w, src2.w);
+            }
+        };
+        template <> struct OpUnroller<8>
+        {
+            template <typename T, typename D, typename UnOp, typename Mask>
+            static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, const UnOp& op, int x_shifted, int y)
+            {
+                if (mask(y, x_shifted))
+                    dst.a0 = op(src.a0);
+                if (mask(y, x_shifted + 1))
+                    dst.a1 = op(src.a1);
+                if (mask(y, x_shifted + 2))
+                    dst.a2 = op(src.a2);
+                if (mask(y, x_shifted + 3))
+                    dst.a3 = op(src.a3);
+                if (mask(y, x_shifted + 4))
+                    dst.a4 = op(src.a4);
+                if (mask(y, x_shifted + 5))
+                    dst.a5 = op(src.a5);
+                if (mask(y, x_shifted + 6))
+                    dst.a6 = op(src.a6);
+                if (mask(y, x_shifted + 7))
+                    dst.a7 = op(src.a7);
+            }
+
+            template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
+            static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, const BinOp& op, int x_shifted, int y)
+            {
+                if (mask(y, x_shifted))
+                    dst.a0 = op(src1.a0, src2.a0);
+                if (mask(y, x_shifted + 1))
+                    dst.a1 = op(src1.a1, src2.a1);
+                if (mask(y, x_shifted + 2))
+                    dst.a2 = op(src1.a2, src2.a2);
+                if (mask(y, x_shifted + 3))
+                    dst.a3 = op(src1.a3, src2.a3);
+                if (mask(y, x_shifted + 4))
+                    dst.a4 = op(src1.a4, src2.a4);
+                if (mask(y, x_shifted + 5))
+                    dst.a5 = op(src1.a5, src2.a5);
+                if (mask(y, x_shifted + 6))
+                    dst.a6 = op(src1.a6, src2.a6);
+                if (mask(y, x_shifted + 7))
+                    dst.a7 = op(src1.a7, src2.a7);
+            }
+        };
+
+        template <typename T, typename D, typename UnOp, typename Mask>
+        static __global__ void transformSmart(const PtrStepSz<T> src_, PtrStep<D> dst_, const Mask mask, const UnOp op)
+        {
+            typedef TransformFunctorTraits<UnOp> ft;
+            typedef typename UnaryReadWriteTraits<T, D, ft::smart_shift>::read_type read_type;
+            typedef typename UnaryReadWriteTraits<T, D, ft::smart_shift>::write_type write_type;
+
+            const int x = threadIdx.x + blockIdx.x * blockDim.x;
+            const int y = threadIdx.y + blockIdx.y * blockDim.y;
+            const int x_shifted = x * ft::smart_shift;
+
+            if (y < src_.rows)
+            {
+                const T* src = src_.ptr(y);
+                D* dst = dst_.ptr(y);
+
+                if (x_shifted + ft::smart_shift - 1 < src_.cols)
+                {
+                    const read_type src_n_el = ((const read_type*)src)[x];
+                    OpUnroller<ft::smart_shift>::unroll(src_n_el, ((write_type*)dst)[x], mask, op, x_shifted, y);
+                }
+                else
+                {
+                    for (int real_x = x_shifted; real_x < src_.cols; ++real_x)
+                    {
+                        if (mask(y, real_x))
+                            dst[real_x] = op(src[real_x]);
+                    }
+                }
+            }
+        }
+
+        template <typename T, typename D, typename UnOp, typename Mask>
+        __global__ static void transformSimple(const PtrStepSz<T> src, PtrStep<D> dst, const Mask mask, const UnOp op)
+        {
+            const int x = blockDim.x * blockIdx.x + threadIdx.x;
+            const int y = blockDim.y * blockIdx.y + threadIdx.y;
+
+            if (x < src.cols && y < src.rows && mask(y, x))
+            {
+                dst.ptr(y)[x] = op(src.ptr(y)[x]);
+            }
+        }
+
+        template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
+        static __global__ void transformSmart(const PtrStepSz<T1> src1_, const PtrStep<T2> src2_, PtrStep<D> dst_,
+            const Mask mask, const BinOp op)
+        {
+            typedef TransformFunctorTraits<BinOp> ft;
+            typedef typename BinaryReadWriteTraits<T1, T2, D, ft::smart_shift>::read_type1 read_type1;
+            typedef typename BinaryReadWriteTraits<T1, T2, D, ft::smart_shift>::read_type2 read_type2;
+            typedef typename BinaryReadWriteTraits<T1, T2, D, ft::smart_shift>::write_type write_type;
+
+            const int x = threadIdx.x + blockIdx.x * blockDim.x;
+            const int y = threadIdx.y + blockIdx.y * blockDim.y;
+            const int x_shifted = x * ft::smart_shift;
+
+            if (y < src1_.rows)
+            {
+                const T1* src1 = src1_.ptr(y);
+                const T2* src2 = src2_.ptr(y);
+                D* dst = dst_.ptr(y);
+
+                if (x_shifted + ft::smart_shift - 1 < src1_.cols)
+                {
+                    const read_type1 src1_n_el = ((const read_type1*)src1)[x];
+                    const read_type2 src2_n_el = ((const read_type2*)src2)[x];
+
+                    OpUnroller<ft::smart_shift>::unroll(src1_n_el, src2_n_el, ((write_type*)dst)[x], mask, op, x_shifted, y);
+                }
+                else
+                {
+                    for (int real_x = x_shifted; real_x < src1_.cols; ++real_x)
+                    {
+                        if (mask(y, real_x))
+                            dst[real_x] = op(src1[real_x], src2[real_x]);
+                    }
+                }
+            }
+        }
+
+        template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
+        static __global__ void transformSimple(const PtrStepSz<T1> src1, const PtrStep<T2> src2, PtrStep<D> dst,
+            const Mask mask, const BinOp op)
+        {
+            const int x = blockDim.x * blockIdx.x + threadIdx.x;
+            const int y = blockDim.y * blockIdx.y + threadIdx.y;
+
+            if (x < src1.cols && y < src1.rows && mask(y, x))
+            {
+                const T1 src1_data = src1.ptr(y)[x];
+                const T2 src2_data = src2.ptr(y)[x];
+                dst.ptr(y)[x] = op(src1_data, src2_data);
+            }
+        }
+
+        template <bool UseSmart> struct TransformDispatcher;
+        template<> struct TransformDispatcher<false>
+        {
+            template <typename T, typename D, typename UnOp, typename Mask>
+            static void call(PtrStepSz<T> src, PtrStepSz<D> dst, UnOp op, Mask mask, cudaStream_t stream)
+            {
+                typedef TransformFunctorTraits<UnOp> ft;
+
+                const dim3 threads(ft::simple_block_dim_x, ft::simple_block_dim_y, 1);
+                const dim3 grid(divUp(src.cols, threads.x), divUp(src.rows, threads.y), 1);
+
+                transformSimple<T, D><<<grid, threads, 0, stream>>>(src, dst, mask, op);
+                cudaSafeCall( cudaGetLastError() );
+
+                if (stream == 0)
+                    cudaSafeCall( cudaDeviceSynchronize() );
+            }
+
+            template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
+            static void call(PtrStepSz<T1> src1, PtrStepSz<T2> src2, PtrStepSz<D> dst, BinOp op, Mask mask, cudaStream_t stream)
+            {
+                typedef TransformFunctorTraits<BinOp> ft;
+
+                const dim3 threads(ft::simple_block_dim_x, ft::simple_block_dim_y, 1);
+                const dim3 grid(divUp(src1.cols, threads.x), divUp(src1.rows, threads.y), 1);
+
+                transformSimple<T1, T2, D><<<grid, threads, 0, stream>>>(src1, src2, dst, mask, op);
+                cudaSafeCall( cudaGetLastError() );
+
+                if (stream == 0)
+                    cudaSafeCall( cudaDeviceSynchronize() );
+            }
+        };
+        template<> struct TransformDispatcher<true>
+        {
+            template <typename T, typename D, typename UnOp, typename Mask>
+            static void call(PtrStepSz<T> src, PtrStepSz<D> dst, UnOp op, Mask mask, cudaStream_t stream)
+            {
+                typedef TransformFunctorTraits<UnOp> ft;
+
+                CV_StaticAssert(ft::smart_shift != 1, "");
+
+                if (!isAligned(src.data, ft::smart_shift * sizeof(T)) || !isAligned(src.step, ft::smart_shift * sizeof(T)) ||
+                    !isAligned(dst.data, ft::smart_shift * sizeof(D)) || !isAligned(dst.step, ft::smart_shift * sizeof(D)))
+                {
+                    TransformDispatcher<false>::call(src, dst, op, mask, stream);
+                    return;
+                }
+
+                const dim3 threads(ft::smart_block_dim_x, ft::smart_block_dim_y, 1);
+                const dim3 grid(divUp(src.cols, threads.x * ft::smart_shift), divUp(src.rows, threads.y), 1);
+
+                transformSmart<T, D><<<grid, threads, 0, stream>>>(src, dst, mask, op);
+                cudaSafeCall( cudaGetLastError() );
+
+                if (stream == 0)
+                    cudaSafeCall( cudaDeviceSynchronize() );
+            }
+
+            template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
+            static void call(PtrStepSz<T1> src1, PtrStepSz<T2> src2, PtrStepSz<D> dst, BinOp op, Mask mask, cudaStream_t stream)
+            {
+                typedef TransformFunctorTraits<BinOp> ft;
+
+                CV_StaticAssert(ft::smart_shift != 1, "");
+
+                if (!isAligned(src1.data, ft::smart_shift * sizeof(T1)) || !isAligned(src1.step, ft::smart_shift * sizeof(T1)) ||
+                    !isAligned(src2.data, ft::smart_shift * sizeof(T2)) || !isAligned(src2.step, ft::smart_shift * sizeof(T2)) ||
+                    !isAligned(dst.data, ft::smart_shift * sizeof(D)) || !isAligned(dst.step, ft::smart_shift * sizeof(D)))
+                {
+                    TransformDispatcher<false>::call(src1, src2, dst, op, mask, stream);
+                    return;
+                }
+
+                const dim3 threads(ft::smart_block_dim_x, ft::smart_block_dim_y, 1);
+                const dim3 grid(divUp(src1.cols, threads.x * ft::smart_shift), divUp(src1.rows, threads.y), 1);
+
+                transformSmart<T1, T2, D><<<grid, threads, 0, stream>>>(src1, src2, dst, mask, op);
+                cudaSafeCall( cudaGetLastError() );
+
+                if (stream == 0)
+                    cudaSafeCall( cudaDeviceSynchronize() );
+            }
+        };
+    } // namespace transform_detail
+}}} // namespace cv { namespace cuda { namespace cudev
+
+//! @endcond
+
+#endif // OPENCV_CUDA_TRANSFORM_DETAIL_HPP
--- a/OpenCV/include/opencv2/core/cuda/detail/type_traits_detail.hpp
+++ b/OpenCV/include/opencv2/core/cuda/detail/type_traits_detail.hpp
@@ -0,0 +1,191 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CUDA_TYPE_TRAITS_DETAIL_HPP
+#define OPENCV_CUDA_TYPE_TRAITS_DETAIL_HPP
+
+#include "../common.hpp"
+#include "../vec_traits.hpp"
+
+//! @cond IGNORED
+
+namespace cv { namespace cuda { namespace device
+{
+    namespace type_traits_detail
+    {
+        template <bool, typename T1, typename T2> struct Select { typedef T1 type; };
+        template <typename T1, typename T2> struct Select<false, T1, T2> { typedef T2 type; };
+
+        template <typename T> struct IsSignedIntergral { enum {value = 0}; };
+        template <> struct IsSignedIntergral<schar> { enum {value = 1}; };
+        template <> struct IsSignedIntergral<char1> { enum {value = 1}; };
+        template <> struct IsSignedIntergral<short> { enum {value = 1}; };
+        template <> struct IsSignedIntergral<short1> { enum {value = 1}; };
+        template <> struct IsSignedIntergral<int> { enum {value = 1}; };
+        template <> struct IsSignedIntergral<int1> { enum {value = 1}; };
+
+        template <typename T> struct IsUnsignedIntegral { enum {value = 0}; };
+        template <> struct IsUnsignedIntegral<uchar> { enum {value = 1}; };
+        template <> struct IsUnsignedIntegral<uchar1> { enum {value = 1}; };
+        template <> struct IsUnsignedIntegral<ushort> { enum {value = 1}; };
+        template <> struct IsUnsignedIntegral<ushort1> { enum {value = 1}; };
+        template <> struct IsUnsignedIntegral<uint> { enum {value = 1}; };
+        template <> struct IsUnsignedIntegral<uint1> { enum {value = 1}; };
+
+        template <typename T> struct IsIntegral { enum {value = IsSignedIntergral<T>::value || IsUnsignedIntegral<T>::value}; };
+        template <> struct IsIntegral<char> { enum {value = 1}; };
+        template <> struct IsIntegral<bool> { enum {value = 1}; };
+
+        template <typename T> struct IsFloat { enum {value = 0}; };
+        template <> struct IsFloat<float> { enum {value = 1}; };
+        template <> struct IsFloat<double> { enum {value = 1}; };
+
+        template <typename T> struct IsVec { enum {value = 0}; };
+        template <> struct IsVec<uchar1> { enum {value = 1}; };
+        template <> struct IsVec<uchar2> { enum {value = 1}; };
+        template <> struct IsVec<uchar3> { enum {value = 1}; };
+        template <> struct IsVec<uchar4> { enum {value = 1}; };
+        template <> struct IsVec<uchar8> { enum {value = 1}; };
+        template <> struct IsVec<char1> { enum {value = 1}; };
+        template <> struct IsVec<char2> { enum {value = 1}; };
+        template <> struct IsVec<char3> { enum {value = 1}; };
+        template <> struct IsVec<char4> { enum {value = 1}; };
+        template <> struct IsVec<char8> { enum {value = 1}; };
+        template <> struct IsVec<ushort1> { enum {value = 1}; };
+        template <> struct IsVec<ushort2> { enum {value = 1}; };
+        template <> struct IsVec<ushort3> { enum {value = 1}; };
+        template <> struct IsVec<ushort4> { enum {value = 1}; };
+        template <> struct IsVec<ushort8> { enum {value = 1}; };
+        template <> struct IsVec<short1> { enum {value = 1}; };
+        template <> struct IsVec<short2> { enum {value = 1}; };
+        template <> struct IsVec<short3> { enum {value = 1}; };
+        template <> struct IsVec<short4> { enum {value = 1}; };
+        template <> struct IsVec<short8> { enum {value = 1}; };
+        template <> struct IsVec<uint1> { enum {value = 1}; };
+        template <> struct IsVec<uint2> { enum {value = 1}; };
+        template <> struct IsVec<uint3> { enum {value = 1}; };
+        template <> struct IsVec<uint4> { enum {value = 1}; };
+        template <> struct IsVec<uint8> { enum {value = 1}; };
+        template <> struct IsVec<int1> { enum {value = 1}; };
+        template <> struct IsVec<int2> { enum {value = 1}; };
+        template <> struct IsVec<int3> { enum {value = 1}; };
+        template <> struct IsVec<int4> { enum {value = 1}; };
+        template <> struct IsVec<int8> { enum {value = 1}; };
+        template <> struct IsVec<float1> { enum {value = 1}; };
+        template <> struct IsVec<float2> { enum {value = 1}; };
+        template <> struct IsVec<float3> { enum {value = 1}; };
+        template <> struct IsVec<float4> { enum {value = 1}; };
+        template <> struct IsVec<float8> { enum {value = 1}; };
+        template <> struct IsVec<double1> { enum {value = 1}; };
+        template <> struct IsVec<double2> { enum {value = 1}; };
+        template <> struct IsVec<double3> { enum {value = 1}; };
+        template <> struct IsVec<double4> { enum {value = 1}; };
+        template <> struct IsVec<double8> { enum {value = 1}; };
+
+        template <class U> struct AddParameterType { typedef const U& type; };
+        template <class U> struct AddParameterType<U&> { typedef U& type; };
+        template <> struct AddParameterType<void> { typedef void type; };
+
+        template <class U> struct ReferenceTraits
+        {
+            enum { value = false };
+            typedef U type;
+        };
+        template <class U> struct ReferenceTraits<U&>
+        {
+            enum { value = true };
+            typedef U type;
+        };
+
+        template <class U> struct PointerTraits
+        {
+            enum { value = false };
+            typedef void type;
+        };
+        template <class U> struct PointerTraits<U*>
+        {
+            enum { value = true };
+            typedef U type;
+        };
+        template <class U> struct PointerTraits<U*&>
+        {
+            enum { value = true };
+            typedef U type;
+        };
+
+        template <class U> struct UnConst
+        {
+            typedef U type;
+            enum { value = 0 };
+        };
+        template <class U> struct UnConst<const U>
+        {
+            typedef U type;
+            enum { value = 1 };
+        };
+        template <class U> struct UnConst<const U&>
+        {
+            typedef U& type;
+            enum { value = 1 };
+        };
+
+        template <class U> struct UnVolatile
+        {
+            typedef U type;
+            enum { value = 0 };
+        };
+        template <class U> struct UnVolatile<volatile U>
+        {
+            typedef U type;
+            enum { value = 1 };
+        };
+        template <class U> struct UnVolatile<volatile U&>
+        {
+            typedef U& type;
+            enum { value = 1 };
+        };
+    } // namespace type_traits_detail
+}}} // namespace cv { namespace cuda { namespace cudev
+
+//! @endcond
+
+#endif // OPENCV_CUDA_TYPE_TRAITS_DETAIL_HPP
--- a/OpenCV/include/opencv2/core/cuda/detail/vec_distance_detail.hpp
+++ b/OpenCV/include/opencv2/core/cuda/detail/vec_distance_detail.hpp
@@ -0,0 +1,121 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CUDA_VEC_DISTANCE_DETAIL_HPP
+#define OPENCV_CUDA_VEC_DISTANCE_DETAIL_HPP
+
+#include "../datamov_utils.hpp"
+
+//! @cond IGNORED
+
+namespace cv { namespace cuda { namespace device
+{
+    namespace vec_distance_detail
+    {
+        template <int THREAD_DIM, int N> struct UnrollVecDiffCached
+        {
+            template <typename Dist, typename T1, typename T2>
+            static __device__ void calcCheck(const T1* vecCached, const T2* vecGlob, int len, Dist& dist, int ind)
+            {
+                if (ind < len)
+                {
+                    T1 val1 = *vecCached++;
+
+                    T2 val2;
+                    ForceGlob<T2>::Load(vecGlob, ind, val2);
+
+                    dist.reduceIter(val1, val2);
+
+                    UnrollVecDiffCached<THREAD_DIM, N - 1>::calcCheck(vecCached, vecGlob, len, dist, ind + THREAD_DIM);
+                }
+            }
+
+            template <typename Dist, typename T1, typename T2>
+            static __device__ void calcWithoutCheck(const T1* vecCached, const T2* vecGlob, Dist& dist)
+            {
+                T1 val1 = *vecCached++;
+
+                T2 val2;
+                ForceGlob<T2>::Load(vecGlob, 0, val2);
+                vecGlob += THREAD_DIM;
+
+                dist.reduceIter(val1, val2);
+
+                UnrollVecDiffCached<THREAD_DIM, N - 1>::calcWithoutCheck(vecCached, vecGlob, dist);
+            }
+        };
+        template <int THREAD_DIM> struct UnrollVecDiffCached<THREAD_DIM, 0>
+        {
+            template <typename Dist, typename T1, typename T2>
+            static __device__ __forceinline__ void calcCheck(const T1*, const T2*, int, Dist&, int)
+            {
+            }
+
+            template <typename Dist, typename T1, typename T2>
+            static __device__ __forceinline__ void calcWithoutCheck(const T1*, const T2*, Dist&)
+            {
+            }
+        };
+
+        template <int THREAD_DIM, int MAX_LEN, bool LEN_EQ_MAX_LEN> struct VecDiffCachedCalculator;
+        template <int THREAD_DIM, int MAX_LEN> struct VecDiffCachedCalculator<THREAD_DIM, MAX_LEN, false>
+        {
+            template <typename Dist, typename T1, typename T2>
+            static __device__ __forceinline__ void calc(const T1* vecCached, const T2* vecGlob, int len, Dist& dist, int tid)
+            {
+                UnrollVecDiffCached<THREAD_DIM, MAX_LEN / THREAD_DIM>::calcCheck(vecCached, vecGlob, len, dist, tid);
+            }
+        };
+        template <int THREAD_DIM, int MAX_LEN> struct VecDiffCachedCalculator<THREAD_DIM, MAX_LEN, true>
+        {
+            template <typename Dist, typename T1, typename T2>
+            static __device__ __forceinline__ void calc(const T1* vecCached, const T2* vecGlob, int len, Dist& dist, int tid)
+            {
+                UnrollVecDiffCached<THREAD_DIM, MAX_LEN / THREAD_DIM>::calcWithoutCheck(vecCached, vecGlob + tid, dist);
+            }
+        };
+    } // namespace vec_distance_detail
+}}} // namespace cv { namespace cuda { namespace cudev
+
+//! @endcond
+
+#endif // OPENCV_CUDA_VEC_DISTANCE_DETAIL_HPP