init
This commit is contained in:
2018
OpenCV/include/opencv2/core/cuda/detail/color_detail.hpp
Normal file
2018
OpenCV/include/opencv2/core/cuda/detail/color_detail.hpp
Normal file
File diff suppressed because one or more lines are too long
365
OpenCV/include/opencv2/core/cuda/detail/reduce.hpp
Normal file
365
OpenCV/include/opencv2/core/cuda/detail/reduce.hpp
Normal file
@@ -0,0 +1,365 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#ifndef OPENCV_CUDA_REDUCE_DETAIL_HPP
|
||||
#define OPENCV_CUDA_REDUCE_DETAIL_HPP
|
||||
|
||||
#include <thrust/tuple.h>
|
||||
#include "../warp.hpp"
|
||||
#include "../warp_shuffle.hpp"
|
||||
|
||||
//! @cond IGNORED
|
||||
|
||||
namespace cv { namespace cuda { namespace device
|
||||
{
|
||||
namespace reduce_detail
|
||||
{
|
||||
// GetType<X>::type names the element type behind X:
//   Elem*, volatile Elem* and Elem& all yield Elem.
// The primary template is declared but never defined, so using GetType with
// any other kind of type is a compile-time error.
template <typename X> struct GetType;

template <typename Elem> struct GetType<Elem*>          { typedef Elem type; };
template <typename Elem> struct GetType<volatile Elem*> { typedef Elem type; };
template <typename Elem> struct GetType<Elem&>          { typedef Elem type; };
|
||||
|
||||
template <unsigned int I, unsigned int N>
|
||||
struct For
|
||||
{
|
||||
template <class PointerTuple, class ValTuple>
|
||||
static __device__ void loadToSmem(const PointerTuple& smem, const ValTuple& val, unsigned int tid)
|
||||
{
|
||||
thrust::get<I>(smem)[tid] = thrust::get<I>(val);
|
||||
|
||||
For<I + 1, N>::loadToSmem(smem, val, tid);
|
||||
}
|
||||
template <class PointerTuple, class ValTuple>
|
||||
static __device__ void loadFromSmem(const PointerTuple& smem, const ValTuple& val, unsigned int tid)
|
||||
{
|
||||
thrust::get<I>(val) = thrust::get<I>(smem)[tid];
|
||||
|
||||
For<I + 1, N>::loadFromSmem(smem, val, tid);
|
||||
}
|
||||
|
||||
template <class PointerTuple, class ValTuple, class OpTuple>
|
||||
static __device__ void merge(const PointerTuple& smem, const ValTuple& val, unsigned int tid, unsigned int delta, const OpTuple& op)
|
||||
{
|
||||
typename GetType<typename thrust::tuple_element<I, PointerTuple>::type>::type reg = thrust::get<I>(smem)[tid + delta];
|
||||
thrust::get<I>(smem)[tid] = thrust::get<I>(val) = thrust::get<I>(op)(thrust::get<I>(val), reg);
|
||||
|
||||
For<I + 1, N>::merge(smem, val, tid, delta, op);
|
||||
}
|
||||
template <class ValTuple, class OpTuple>
|
||||
static __device__ void mergeShfl(const ValTuple& val, unsigned int delta, unsigned int width, const OpTuple& op)
|
||||
{
|
||||
typename GetType<typename thrust::tuple_element<I, ValTuple>::type>::type reg = shfl_down(thrust::get<I>(val), delta, width);
|
||||
thrust::get<I>(val) = thrust::get<I>(op)(thrust::get<I>(val), reg);
|
||||
|
||||
For<I + 1, N>::mergeShfl(val, delta, width, op);
|
||||
}
|
||||
};
|
||||
template <unsigned int N>
|
||||
struct For<N, N>
|
||||
{
|
||||
template <class PointerTuple, class ValTuple>
|
||||
static __device__ void loadToSmem(const PointerTuple&, const ValTuple&, unsigned int)
|
||||
{
|
||||
}
|
||||
template <class PointerTuple, class ValTuple>
|
||||
static __device__ void loadFromSmem(const PointerTuple&, const ValTuple&, unsigned int)
|
||||
{
|
||||
}
|
||||
|
||||
template <class PointerTuple, class ValTuple, class OpTuple>
|
||||
static __device__ void merge(const PointerTuple&, const ValTuple&, unsigned int, unsigned int, const OpTuple&)
|
||||
{
|
||||
}
|
||||
template <class ValTuple, class OpTuple>
|
||||
static __device__ void mergeShfl(const ValTuple&, unsigned int, unsigned int, const OpTuple&)
|
||||
{
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
__device__ __forceinline__ void loadToSmem(volatile T* smem, T& val, unsigned int tid)
|
||||
{
|
||||
smem[tid] = val;
|
||||
}
|
||||
template <typename T>
|
||||
__device__ __forceinline__ void loadFromSmem(volatile T* smem, T& val, unsigned int tid)
|
||||
{
|
||||
val = smem[tid];
|
||||
}
|
||||
template <typename P0, typename P1, typename P2, typename P3, typename P4, typename P5, typename P6, typename P7, typename P8, typename P9,
|
||||
typename R0, typename R1, typename R2, typename R3, typename R4, typename R5, typename R6, typename R7, typename R8, typename R9>
|
||||
__device__ __forceinline__ void loadToSmem(const thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9>& smem,
|
||||
const thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9>& val,
|
||||
unsigned int tid)
|
||||
{
|
||||
For<0, thrust::tuple_size<thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9> >::value>::loadToSmem(smem, val, tid);
|
||||
}
|
||||
template <typename P0, typename P1, typename P2, typename P3, typename P4, typename P5, typename P6, typename P7, typename P8, typename P9,
|
||||
typename R0, typename R1, typename R2, typename R3, typename R4, typename R5, typename R6, typename R7, typename R8, typename R9>
|
||||
__device__ __forceinline__ void loadFromSmem(const thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9>& smem,
|
||||
const thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9>& val,
|
||||
unsigned int tid)
|
||||
{
|
||||
For<0, thrust::tuple_size<thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9> >::value>::loadFromSmem(smem, val, tid);
|
||||
}
|
||||
|
||||
template <typename T, class Op>
|
||||
__device__ __forceinline__ void merge(volatile T* smem, T& val, unsigned int tid, unsigned int delta, const Op& op)
|
||||
{
|
||||
T reg = smem[tid + delta];
|
||||
smem[tid] = val = op(val, reg);
|
||||
}
|
||||
template <typename T, class Op>
|
||||
__device__ __forceinline__ void mergeShfl(T& val, unsigned int delta, unsigned int width, const Op& op)
|
||||
{
|
||||
T reg = shfl_down(val, delta, width);
|
||||
val = op(val, reg);
|
||||
}
|
||||
template <typename P0, typename P1, typename P2, typename P3, typename P4, typename P5, typename P6, typename P7, typename P8, typename P9,
|
||||
typename R0, typename R1, typename R2, typename R3, typename R4, typename R5, typename R6, typename R7, typename R8, typename R9,
|
||||
class Op0, class Op1, class Op2, class Op3, class Op4, class Op5, class Op6, class Op7, class Op8, class Op9>
|
||||
__device__ __forceinline__ void merge(const thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9>& smem,
|
||||
const thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9>& val,
|
||||
unsigned int tid,
|
||||
unsigned int delta,
|
||||
const thrust::tuple<Op0, Op1, Op2, Op3, Op4, Op5, Op6, Op7, Op8, Op9>& op)
|
||||
{
|
||||
For<0, thrust::tuple_size<thrust::tuple<P0, P1, P2, P3, P4, P5, P6, P7, P8, P9> >::value>::merge(smem, val, tid, delta, op);
|
||||
}
|
||||
template <typename R0, typename R1, typename R2, typename R3, typename R4, typename R5, typename R6, typename R7, typename R8, typename R9,
|
||||
class Op0, class Op1, class Op2, class Op3, class Op4, class Op5, class Op6, class Op7, class Op8, class Op9>
|
||||
__device__ __forceinline__ void mergeShfl(const thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9>& val,
|
||||
unsigned int delta,
|
||||
unsigned int width,
|
||||
const thrust::tuple<Op0, Op1, Op2, Op3, Op4, Op5, Op6, Op7, Op8, Op9>& op)
|
||||
{
|
||||
For<0, thrust::tuple_size<thrust::tuple<R0, R1, R2, R3, R4, R5, R6, R7, R8, R9> >::value>::mergeShfl(val, delta, width, op);
|
||||
}
|
||||
|
||||
template <unsigned int N> struct Generic
|
||||
{
|
||||
template <typename Pointer, typename Reference, class Op>
|
||||
static __device__ void reduce(Pointer smem, Reference val, unsigned int tid, Op op)
|
||||
{
|
||||
loadToSmem(smem, val, tid);
|
||||
if (N >= 32)
|
||||
__syncthreads();
|
||||
|
||||
if (N >= 2048)
|
||||
{
|
||||
if (tid < 1024)
|
||||
merge(smem, val, tid, 1024, op);
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
if (N >= 1024)
|
||||
{
|
||||
if (tid < 512)
|
||||
merge(smem, val, tid, 512, op);
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
if (N >= 512)
|
||||
{
|
||||
if (tid < 256)
|
||||
merge(smem, val, tid, 256, op);
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
if (N >= 256)
|
||||
{
|
||||
if (tid < 128)
|
||||
merge(smem, val, tid, 128, op);
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
if (N >= 128)
|
||||
{
|
||||
if (tid < 64)
|
||||
merge(smem, val, tid, 64, op);
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
if (N >= 64)
|
||||
{
|
||||
if (tid < 32)
|
||||
merge(smem, val, tid, 32, op);
|
||||
}
|
||||
|
||||
if (tid < 16)
|
||||
{
|
||||
merge(smem, val, tid, 16, op);
|
||||
merge(smem, val, tid, 8, op);
|
||||
merge(smem, val, tid, 4, op);
|
||||
merge(smem, val, tid, 2, op);
|
||||
merge(smem, val, tid, 1, op);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template <unsigned int I, typename Pointer, typename Reference, class Op>
|
||||
struct Unroll
|
||||
{
|
||||
static __device__ void loopShfl(Reference val, Op op, unsigned int N)
|
||||
{
|
||||
mergeShfl(val, I, N, op);
|
||||
Unroll<I / 2, Pointer, Reference, Op>::loopShfl(val, op, N);
|
||||
}
|
||||
static __device__ void loop(Pointer smem, Reference val, unsigned int tid, Op op)
|
||||
{
|
||||
merge(smem, val, tid, I, op);
|
||||
Unroll<I / 2, Pointer, Reference, Op>::loop(smem, val, tid, op);
|
||||
}
|
||||
};
|
||||
template <typename Pointer, typename Reference, class Op>
|
||||
struct Unroll<0, Pointer, Reference, Op>
|
||||
{
|
||||
static __device__ void loopShfl(Reference, Op, unsigned int)
|
||||
{
|
||||
}
|
||||
static __device__ void loop(Pointer, Reference, unsigned int, Op)
|
||||
{
|
||||
}
|
||||
};
|
||||
|
||||
template <unsigned int N> struct WarpOptimized
|
||||
{
|
||||
template <typename Pointer, typename Reference, class Op>
|
||||
static __device__ void reduce(Pointer smem, Reference val, unsigned int tid, Op op)
|
||||
{
|
||||
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300
|
||||
CV_UNUSED(smem);
|
||||
CV_UNUSED(tid);
|
||||
|
||||
Unroll<N / 2, Pointer, Reference, Op>::loopShfl(val, op, N);
|
||||
#else
|
||||
loadToSmem(smem, val, tid);
|
||||
|
||||
if (tid < N / 2)
|
||||
Unroll<N / 2, Pointer, Reference, Op>::loop(smem, val, tid, op);
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
template <unsigned int N> struct GenericOptimized32
|
||||
{
|
||||
enum { M = N / 32 };
|
||||
|
||||
template <typename Pointer, typename Reference, class Op>
|
||||
static __device__ void reduce(Pointer smem, Reference val, unsigned int tid, Op op)
|
||||
{
|
||||
const unsigned int laneId = Warp::laneId();
|
||||
|
||||
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300
|
||||
Unroll<16, Pointer, Reference, Op>::loopShfl(val, op, warpSize);
|
||||
|
||||
if (laneId == 0)
|
||||
loadToSmem(smem, val, tid / 32);
|
||||
#else
|
||||
loadToSmem(smem, val, tid);
|
||||
|
||||
if (laneId < 16)
|
||||
Unroll<16, Pointer, Reference, Op>::loop(smem, val, tid, op);
|
||||
|
||||
__syncthreads();
|
||||
|
||||
if (laneId == 0)
|
||||
loadToSmem(smem, val, tid / 32);
|
||||
#endif
|
||||
|
||||
__syncthreads();
|
||||
|
||||
loadFromSmem(smem, val, tid);
|
||||
|
||||
if (tid < 32)
|
||||
{
|
||||
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300
|
||||
Unroll<M / 2, Pointer, Reference, Op>::loopShfl(val, op, M);
|
||||
#else
|
||||
Unroll<M / 2, Pointer, Reference, Op>::loop(smem, val, tid, op);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// Compile-time type selection: StaticIf<Cond, Then, Else>::type is Then when
// Cond is true and Else when it is false.
template <bool Cond, class Then, class Else> struct StaticIf;

template <class Then, class Else> struct StaticIf<true, Then, Else>  { typedef Then type; };
template <class Then, class Else> struct StaticIf<false, Then, Else> { typedef Else type; };
|
||||
|
||||
// IsPowerOf2<N>::value is 1 when N is a non-zero power of two, 0 otherwise.
// A power of two has exactly one bit set, so clearing its lowest set bit
// with N & (N - 1) leaves zero.
template <unsigned int N> struct IsPowerOf2
{
    enum { value = (N != 0) && ((N & (N - 1)) == 0) };
};
|
||||
|
||||
template <unsigned int N> struct Dispatcher
|
||||
{
|
||||
typedef typename StaticIf<
|
||||
(N <= 32) && IsPowerOf2<N>::value,
|
||||
WarpOptimized<N>,
|
||||
typename StaticIf<
|
||||
(N <= 1024) && IsPowerOf2<N>::value,
|
||||
GenericOptimized32<N>,
|
||||
Generic<N>
|
||||
>::type
|
||||
>::type reductor;
|
||||
};
|
||||
}
|
||||
}}}
|
||||
|
||||
//! @endcond
|
||||
|
||||
#endif // OPENCV_CUDA_REDUCE_DETAIL_HPP
|
||||
502
OpenCV/include/opencv2/core/cuda/detail/reduce_key_val.hpp
Normal file
502
OpenCV/include/opencv2/core/cuda/detail/reduce_key_val.hpp
Normal file
@@ -0,0 +1,502 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#ifndef OPENCV_CUDA_PRED_VAL_REDUCE_DETAIL_HPP
|
||||
#define OPENCV_CUDA_PRED_VAL_REDUCE_DETAIL_HPP
|
||||
|
||||
#include <thrust/tuple.h>
|
||||
#include "../warp.hpp"
|
||||
#include "../warp_shuffle.hpp"
|
||||
|
||||
//! @cond IGNORED
|
||||
|
||||
namespace cv { namespace cuda { namespace device
|
||||
{
|
||||
namespace reduce_key_val_detail
|
||||
{
|
||||
// GetType<X>::type names the element type behind X:
//   Elem*, volatile Elem* and Elem& all yield Elem.
// The primary template is declared but never defined, so using GetType with
// any other kind of type is a compile-time error.
template <typename X> struct GetType;

template <typename Elem> struct GetType<Elem*>          { typedef Elem type; };
template <typename Elem> struct GetType<volatile Elem*> { typedef Elem type; };
template <typename Elem> struct GetType<Elem&>          { typedef Elem type; };
|
||||
|
||||
template <unsigned int I, unsigned int N>
|
||||
struct For
|
||||
{
|
||||
template <class PointerTuple, class ReferenceTuple>
|
||||
static __device__ void loadToSmem(const PointerTuple& smem, const ReferenceTuple& data, unsigned int tid)
|
||||
{
|
||||
thrust::get<I>(smem)[tid] = thrust::get<I>(data);
|
||||
|
||||
For<I + 1, N>::loadToSmem(smem, data, tid);
|
||||
}
|
||||
template <class PointerTuple, class ReferenceTuple>
|
||||
static __device__ void loadFromSmem(const PointerTuple& smem, const ReferenceTuple& data, unsigned int tid)
|
||||
{
|
||||
thrust::get<I>(data) = thrust::get<I>(smem)[tid];
|
||||
|
||||
For<I + 1, N>::loadFromSmem(smem, data, tid);
|
||||
}
|
||||
|
||||
template <class ReferenceTuple>
|
||||
static __device__ void copyShfl(const ReferenceTuple& val, unsigned int delta, int width)
|
||||
{
|
||||
thrust::get<I>(val) = shfl_down(thrust::get<I>(val), delta, width);
|
||||
|
||||
For<I + 1, N>::copyShfl(val, delta, width);
|
||||
}
|
||||
template <class PointerTuple, class ReferenceTuple>
|
||||
static __device__ void copy(const PointerTuple& svals, const ReferenceTuple& val, unsigned int tid, unsigned int delta)
|
||||
{
|
||||
thrust::get<I>(svals)[tid] = thrust::get<I>(val) = thrust::get<I>(svals)[tid + delta];
|
||||
|
||||
For<I + 1, N>::copy(svals, val, tid, delta);
|
||||
}
|
||||
|
||||
template <class KeyReferenceTuple, class ValReferenceTuple, class CmpTuple>
|
||||
static __device__ void mergeShfl(const KeyReferenceTuple& key, const ValReferenceTuple& val, const CmpTuple& cmp, unsigned int delta, int width)
|
||||
{
|
||||
typename GetType<typename thrust::tuple_element<I, KeyReferenceTuple>::type>::type reg = shfl_down(thrust::get<I>(key), delta, width);
|
||||
|
||||
if (thrust::get<I>(cmp)(reg, thrust::get<I>(key)))
|
||||
{
|
||||
thrust::get<I>(key) = reg;
|
||||
thrust::get<I>(val) = shfl_down(thrust::get<I>(val), delta, width);
|
||||
}
|
||||
|
||||
For<I + 1, N>::mergeShfl(key, val, cmp, delta, width);
|
||||
}
|
||||
template <class KeyPointerTuple, class KeyReferenceTuple, class ValPointerTuple, class ValReferenceTuple, class CmpTuple>
|
||||
static __device__ void merge(const KeyPointerTuple& skeys, const KeyReferenceTuple& key,
|
||||
const ValPointerTuple& svals, const ValReferenceTuple& val,
|
||||
const CmpTuple& cmp,
|
||||
unsigned int tid, unsigned int delta)
|
||||
{
|
||||
typename GetType<typename thrust::tuple_element<I, KeyPointerTuple>::type>::type reg = thrust::get<I>(skeys)[tid + delta];
|
||||
|
||||
if (thrust::get<I>(cmp)(reg, thrust::get<I>(key)))
|
||||
{
|
||||
thrust::get<I>(skeys)[tid] = thrust::get<I>(key) = reg;
|
||||
thrust::get<I>(svals)[tid] = thrust::get<I>(val) = thrust::get<I>(svals)[tid + delta];
|
||||
}
|
||||
|
||||
For<I + 1, N>::merge(skeys, key, svals, val, cmp, tid, delta);
|
||||
}
|
||||
};
|
||||
template <unsigned int N>
|
||||
struct For<N, N>
|
||||
{
|
||||
template <class PointerTuple, class ReferenceTuple>
|
||||
static __device__ void loadToSmem(const PointerTuple&, const ReferenceTuple&, unsigned int)
|
||||
{
|
||||
}
|
||||
template <class PointerTuple, class ReferenceTuple>
|
||||
static __device__ void loadFromSmem(const PointerTuple&, const ReferenceTuple&, unsigned int)
|
||||
{
|
||||
}
|
||||
|
||||
template <class ReferenceTuple>
|
||||
static __device__ void copyShfl(const ReferenceTuple&, unsigned int, int)
|
||||
{
|
||||
}
|
||||
template <class PointerTuple, class ReferenceTuple>
|
||||
static __device__ void copy(const PointerTuple&, const ReferenceTuple&, unsigned int, unsigned int)
|
||||
{
|
||||
}
|
||||
|
||||
template <class KeyReferenceTuple, class ValReferenceTuple, class CmpTuple>
|
||||
static __device__ void mergeShfl(const KeyReferenceTuple&, const ValReferenceTuple&, const CmpTuple&, unsigned int, int)
|
||||
{
|
||||
}
|
||||
template <class KeyPointerTuple, class KeyReferenceTuple, class ValPointerTuple, class ValReferenceTuple, class CmpTuple>
|
||||
static __device__ void merge(const KeyPointerTuple&, const KeyReferenceTuple&,
|
||||
const ValPointerTuple&, const ValReferenceTuple&,
|
||||
const CmpTuple&,
|
||||
unsigned int, unsigned int)
|
||||
{
|
||||
}
|
||||
};
|
||||
|
||||
//////////////////////////////////////////////////////
|
||||
// loadToSmem
|
||||
|
||||
template <typename T>
|
||||
__device__ __forceinline__ void loadToSmem(volatile T* smem, T& data, unsigned int tid)
|
||||
{
|
||||
smem[tid] = data;
|
||||
}
|
||||
template <typename T>
|
||||
__device__ __forceinline__ void loadFromSmem(volatile T* smem, T& data, unsigned int tid)
|
||||
{
|
||||
data = smem[tid];
|
||||
}
|
||||
template <typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
|
||||
typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9>
|
||||
__device__ __forceinline__ void loadToSmem(const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& smem,
|
||||
const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& data,
|
||||
unsigned int tid)
|
||||
{
|
||||
For<0, thrust::tuple_size<thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9> >::value>::loadToSmem(smem, data, tid);
|
||||
}
|
||||
template <typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
|
||||
typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9>
|
||||
__device__ __forceinline__ void loadFromSmem(const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& smem,
|
||||
const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& data,
|
||||
unsigned int tid)
|
||||
{
|
||||
For<0, thrust::tuple_size<thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9> >::value>::loadFromSmem(smem, data, tid);
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////
|
||||
// copyVals
|
||||
|
||||
template <typename V>
|
||||
__device__ __forceinline__ void copyValsShfl(V& val, unsigned int delta, int width)
|
||||
{
|
||||
val = shfl_down(val, delta, width);
|
||||
}
|
||||
template <typename V>
|
||||
__device__ __forceinline__ void copyVals(volatile V* svals, V& val, unsigned int tid, unsigned int delta)
|
||||
{
|
||||
svals[tid] = val = svals[tid + delta];
|
||||
}
|
||||
template <typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9>
|
||||
__device__ __forceinline__ void copyValsShfl(const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
|
||||
unsigned int delta,
|
||||
int width)
|
||||
{
|
||||
For<0, thrust::tuple_size<thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9> >::value>::copyShfl(val, delta, width);
|
||||
}
|
||||
template <typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
|
||||
typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9>
|
||||
__device__ __forceinline__ void copyVals(const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& svals,
|
||||
const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
|
||||
unsigned int tid, unsigned int delta)
|
||||
{
|
||||
For<0, thrust::tuple_size<thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9> >::value>::copy(svals, val, tid, delta);
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////
|
||||
// merge
|
||||
|
||||
template <typename K, typename V, class Cmp>
|
||||
__device__ __forceinline__ void mergeShfl(K& key, V& val, const Cmp& cmp, unsigned int delta, int width)
|
||||
{
|
||||
K reg = shfl_down(key, delta, width);
|
||||
|
||||
if (cmp(reg, key))
|
||||
{
|
||||
key = reg;
|
||||
copyValsShfl(val, delta, width);
|
||||
}
|
||||
}
|
||||
template <typename K, typename V, class Cmp>
|
||||
__device__ __forceinline__ void merge(volatile K* skeys, K& key, volatile V* svals, V& val, const Cmp& cmp, unsigned int tid, unsigned int delta)
|
||||
{
|
||||
K reg = skeys[tid + delta];
|
||||
|
||||
if (cmp(reg, key))
|
||||
{
|
||||
skeys[tid] = key = reg;
|
||||
copyVals(svals, val, tid, delta);
|
||||
}
|
||||
}
|
||||
template <typename K,
|
||||
typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9,
|
||||
class Cmp>
|
||||
__device__ __forceinline__ void mergeShfl(K& key,
|
||||
const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
|
||||
const Cmp& cmp,
|
||||
unsigned int delta, int width)
|
||||
{
|
||||
K reg = shfl_down(key, delta, width);
|
||||
|
||||
if (cmp(reg, key))
|
||||
{
|
||||
key = reg;
|
||||
copyValsShfl(val, delta, width);
|
||||
}
|
||||
}
|
||||
template <typename K,
|
||||
typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
|
||||
typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9,
|
||||
class Cmp>
|
||||
__device__ __forceinline__ void merge(volatile K* skeys, K& key,
|
||||
const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& svals,
|
||||
const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
|
||||
const Cmp& cmp, unsigned int tid, unsigned int delta)
|
||||
{
|
||||
K reg = skeys[tid + delta];
|
||||
|
||||
if (cmp(reg, key))
|
||||
{
|
||||
skeys[tid] = key = reg;
|
||||
copyVals(svals, val, tid, delta);
|
||||
}
|
||||
}
|
||||
template <typename KR0, typename KR1, typename KR2, typename KR3, typename KR4, typename KR5, typename KR6, typename KR7, typename KR8, typename KR9,
|
||||
typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9,
|
||||
class Cmp0, class Cmp1, class Cmp2, class Cmp3, class Cmp4, class Cmp5, class Cmp6, class Cmp7, class Cmp8, class Cmp9>
|
||||
__device__ __forceinline__ void mergeShfl(const thrust::tuple<KR0, KR1, KR2, KR3, KR4, KR5, KR6, KR7, KR8, KR9>& key,
|
||||
const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
|
||||
const thrust::tuple<Cmp0, Cmp1, Cmp2, Cmp3, Cmp4, Cmp5, Cmp6, Cmp7, Cmp8, Cmp9>& cmp,
|
||||
unsigned int delta, int width)
|
||||
{
|
||||
For<0, thrust::tuple_size<thrust::tuple<KR0, KR1, KR2, KR3, KR4, KR5, KR6, KR7, KR8, KR9> >::value>::mergeShfl(key, val, cmp, delta, width);
|
||||
}
|
||||
template <typename KP0, typename KP1, typename KP2, typename KP3, typename KP4, typename KP5, typename KP6, typename KP7, typename KP8, typename KP9,
|
||||
typename KR0, typename KR1, typename KR2, typename KR3, typename KR4, typename KR5, typename KR6, typename KR7, typename KR8, typename KR9,
|
||||
typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
|
||||
typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9,
|
||||
class Cmp0, class Cmp1, class Cmp2, class Cmp3, class Cmp4, class Cmp5, class Cmp6, class Cmp7, class Cmp8, class Cmp9>
|
||||
__device__ __forceinline__ void merge(const thrust::tuple<KP0, KP1, KP2, KP3, KP4, KP5, KP6, KP7, KP8, KP9>& skeys,
|
||||
const thrust::tuple<KR0, KR1, KR2, KR3, KR4, KR5, KR6, KR7, KR8, KR9>& key,
|
||||
const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& svals,
|
||||
const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
|
||||
const thrust::tuple<Cmp0, Cmp1, Cmp2, Cmp3, Cmp4, Cmp5, Cmp6, Cmp7, Cmp8, Cmp9>& cmp,
|
||||
unsigned int tid, unsigned int delta)
|
||||
{
|
||||
For<0, thrust::tuple_size<thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9> >::value>::merge(skeys, key, svals, val, cmp, tid, delta);
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////
|
||||
// Generic
|
||||
|
||||
template <unsigned int N> struct Generic
|
||||
{
|
||||
template <class KP, class KR, class VP, class VR, class Cmp>
|
||||
static __device__ void reduce(KP skeys, KR key, VP svals, VR val, unsigned int tid, Cmp cmp)
|
||||
{
|
||||
loadToSmem(skeys, key, tid);
|
||||
loadValsToSmem(svals, val, tid);
|
||||
if (N >= 32)
|
||||
__syncthreads();
|
||||
|
||||
if (N >= 2048)
|
||||
{
|
||||
if (tid < 1024)
|
||||
merge(skeys, key, svals, val, cmp, tid, 1024);
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
if (N >= 1024)
|
||||
{
|
||||
if (tid < 512)
|
||||
merge(skeys, key, svals, val, cmp, tid, 512);
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
if (N >= 512)
|
||||
{
|
||||
if (tid < 256)
|
||||
merge(skeys, key, svals, val, cmp, tid, 256);
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
if (N >= 256)
|
||||
{
|
||||
if (tid < 128)
|
||||
merge(skeys, key, svals, val, cmp, tid, 128);
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
if (N >= 128)
|
||||
{
|
||||
if (tid < 64)
|
||||
merge(skeys, key, svals, val, cmp, tid, 64);
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
if (N >= 64)
|
||||
{
|
||||
if (tid < 32)
|
||||
merge(skeys, key, svals, val, cmp, tid, 32);
|
||||
}
|
||||
|
||||
if (tid < 16)
|
||||
{
|
||||
merge(skeys, key, svals, val, cmp, tid, 16);
|
||||
merge(skeys, key, svals, val, cmp, tid, 8);
|
||||
merge(skeys, key, svals, val, cmp, tid, 4);
|
||||
merge(skeys, key, svals, val, cmp, tid, 2);
|
||||
merge(skeys, key, svals, val, cmp, tid, 1);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template <unsigned int I, class KP, class KR, class VP, class VR, class Cmp>
|
||||
struct Unroll
|
||||
{
|
||||
static __device__ void loopShfl(KR key, VR val, Cmp cmp, unsigned int N)
|
||||
{
|
||||
mergeShfl(key, val, cmp, I, N);
|
||||
Unroll<I / 2, KP, KR, VP, VR, Cmp>::loopShfl(key, val, cmp, N);
|
||||
}
|
||||
static __device__ void loop(KP skeys, KR key, VP svals, VR val, unsigned int tid, Cmp cmp)
|
||||
{
|
||||
merge(skeys, key, svals, val, cmp, tid, I);
|
||||
Unroll<I / 2, KP, KR, VP, VR, Cmp>::loop(skeys, key, svals, val, tid, cmp);
|
||||
}
|
||||
};
|
||||
// Recursion terminator for Unroll: stride 0 means there is nothing left to
// merge, so both entry points are no-ops.
template <class KP, class KR, class VP, class VR, class Cmp>
struct Unroll<0, KP, KR, VP, VR, Cmp>
{
    static __device__ void loopShfl(KR, VR, Cmp, unsigned int) {}

    static __device__ void loop(KP, KR, VP, VR, unsigned int, Cmp) {}
};
|
||||
|
||||
// Reductor selected by Dispatcher for power-of-two N <= 32.
// No barrier is issued anywhere in this path.
template <unsigned int N> struct WarpOptimized
{
    // Reduces N (key, val) pairs.
    //   skeys, svals - scratch storage for keys / values, indexed by tid
    //   key, val     - the calling thread's element
    //   tid          - index of the calling thread inside the group
    //   cmp          - comparison functor forwarded to the merge helpers
    template <class KP, class KR, class VP, class VR, class Cmp>
    static __device__ void reduce(KP skeys, KR key, VP svals, VR val, unsigned int tid, Cmp cmp)
    {
        typedef Unroll<N / 2, KP, KR, VP, VR, Cmp> Steps;

    #if 0 // __CUDA_ARCH__ >= 300
        // Disabled shuffle-based path; the buffers and tid are not needed here.
        CV_UNUSED(skeys);
        CV_UNUSED(svals);
        CV_UNUSED(tid);

        Steps::loopShfl(key, val, cmp, N);
    #else
        // Publish this thread's element ...
        loadToSmem(skeys, key, tid);
        loadToSmem(svals, val, tid);

        // ... then let the lower half of the group run every merge step.
        if (tid < N / 2)
        {
            Steps::loop(skeys, key, svals, val, tid, cmp);
        }
    #endif
    }
};
|
||||
|
||||
// Reductor selected by Dispatcher for power-of-two N in (32, 1024].
// Works in two phases: every 32-thread group reduces its own elements, the
// per-group results are gathered into the first M slots, and the first 32
// threads reduce those.
template <unsigned int N> struct GenericOptimized32
{
    // Number of 32-thread groups that cover N elements.
    enum { M = N / 32 };

    //   skeys, svals - scratch storage for keys / values, indexed by tid
    //   key, val     - the calling thread's element
    //   tid          - index of the calling thread inside the group of N
    //   cmp          - comparison functor forwarded to the merge helpers
    template <class KP, class KR, class VP, class VR, class Cmp>
    static __device__ void reduce(KP skeys, KR key, VP svals, VR val, unsigned int tid, Cmp cmp)
    {
        typedef Unroll<16, KP, KR, VP, VR, Cmp>    PerGroupSteps;
        typedef Unroll<M / 2, KP, KR, VP, VR, Cmp> FinalSteps;

        const unsigned int laneId = Warp::laneId();

    #if 0 // __CUDA_ARCH__ >= 300
        // Disabled shuffle-based first phase.
        PerGroupSteps::loopShfl(key, val, cmp, warpSize);

        if (laneId == 0)
        {
            loadToSmem(skeys, key, tid / 32);
            loadToSmem(svals, val, tid / 32);
        }
    #else
        // Phase 1: publish, then reduce inside each 32-thread group.
        loadToSmem(skeys, key, tid);
        loadToSmem(svals, val, tid);

        if (laneId < 16)
        {
            PerGroupSteps::loop(skeys, key, svals, val, tid, cmp);
        }

        // All groups must be done before slots [0, M) are overwritten.
        __syncthreads();

        // Lane 0 of each group stores the group's result in slot tid / 32.
        if (laneId == 0)
        {
            loadToSmem(skeys, key, tid / 32);
            loadToSmem(svals, val, tid / 32);
        }
    #endif

        __syncthreads();

        // Only the key is re-read here; the value register is left as is.
        loadFromSmem(skeys, key, tid);

        // Phase 2: the first 32 threads reduce the M per-group results.
        if (tid < 32)
        {
        #if 0 // __CUDA_ARCH__ >= 300
            loadFromSmem(svals, val, tid);

            FinalSteps::loopShfl(key, val, cmp, M);
        #else
            FinalSteps::loop(skeys, key, svals, val, tid, cmp);
        #endif
        }
    }
};
|
||||
|
||||
// Compile-time type selection: StaticIf<c, A, B>::type is A when c is true
// and B when c is false.
template <bool Cond, class Then, class Else> struct StaticIf;

template <class Then, class Else>
struct StaticIf<true, Then, Else>
{
    typedef Then type;
};

template <class Then, class Else>
struct StaticIf<false, Then, Else>
{
    typedef Else type;
};
|
||||
|
||||
// IsPowerOf2<N>::value is non-zero exactly when N is a power of two.
// Zero is rejected explicitly; for any other N, clearing the lowest set bit
// with N & (N - 1) leaves zero only if a single bit was set.
template <unsigned int N>
struct IsPowerOf2
{
    enum
    {
        value = ((N != 0) && ((N & (N - 1)) == 0))
    };
};
|
||||
|
||||
// Picks the reductor implementation for a group of N elements:
//   power-of-two N <= 32    -> WarpOptimized<N>
//   power-of-two N <= 1024  -> GenericOptimized32<N>
//   anything else           -> Generic<N>
template <unsigned int N>
struct Dispatcher
{
    typedef typename StaticIf<
                (N <= 32) && IsPowerOf2<N>::value,
                WarpOptimized<N>,
                typename StaticIf<
                    (N <= 1024) && IsPowerOf2<N>::value,
                    GenericOptimized32<N>,
                    Generic<N>
                >::type
            >::type reductor;
};
|
||||
}
|
||||
}}}
|
||||
|
||||
//! @endcond
|
||||
|
||||
#endif // OPENCV_CUDA_PRED_VAL_REDUCE_DETAIL_HPP
|
||||
392
OpenCV/include/opencv2/core/cuda/detail/transform_detail.hpp
Normal file
392
OpenCV/include/opencv2/core/cuda/detail/transform_detail.hpp
Normal file
@@ -0,0 +1,392 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#ifndef OPENCV_CUDA_TRANSFORM_DETAIL_HPP
|
||||
#define OPENCV_CUDA_TRANSFORM_DETAIL_HPP
|
||||
|
||||
#include "../common.hpp"
|
||||
#include "../vec_traits.hpp"
|
||||
#include "../functional.hpp"
|
||||
|
||||
//! @cond IGNORED
|
||||
|
||||
namespace cv { namespace cuda { namespace device
|
||||
{
|
||||
namespace transform_detail
|
||||
{
|
||||
//! Read Write Traits
|
||||
|
||||
// Vector types used by the unary "smart" kernel: `shift` source elements of
// type T are read as one read_type, and `shift` results of type D are written
// as one write_type. Both come from TypeVec.
template <typename T, typename D, int shift>
struct UnaryReadWriteTraits
{
    typedef typename TypeVec<T, shift>::vec_type read_type;
    typedef typename TypeVec<D, shift>::vec_type write_type;
};
|
||||
|
||||
// Vector types used by the binary "smart" kernel: `shift` elements of each
// source (T1, T2) are read as read_type1 / read_type2, and `shift` results of
// type D are written as one write_type. All three come from TypeVec.
template <typename T1, typename T2, typename D, int shift>
struct BinaryReadWriteTraits
{
    typedef typename TypeVec<T1, shift>::vec_type read_type1;
    typedef typename TypeVec<T2, shift>::vec_type read_type2;
    typedef typename TypeVec<D, shift>::vec_type  write_type;
};
|
||||
|
||||
//! Transform kernels
|
||||
|
||||
// OpUnroller<shift> applies a functor component-wise to a `shift`-wide vector.
// Component i is written only when mask(y, x_shifted + i) is true; the other
// components of dst are left untouched. Only the specializations are defined.
template <int shift> struct OpUnroller;

// One component (.x).
// NOTE(review): `op` is taken by non-const reference here and in
// OpUnroller<2>, whereas the 3-, 4- and 8-wide specializations take it by
// const reference.
template <> struct OpUnroller<1>
{
    // Unary: dst.x = op(src.x)
    template <typename T, typename D, typename UnOp, typename Mask>
    static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, UnOp& op, int x_shifted, int y)
    {
        if (mask(y, x_shifted))
        {
            dst.x = op(src.x);
        }
    }

    // Binary: dst.x = op(src1.x, src2.x)
    template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
    static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, BinOp& op, int x_shifted, int y)
    {
        if (mask(y, x_shifted))
        {
            dst.x = op(src1.x, src2.x);
        }
    }
};
|
||||
// Two components (.x, .y). Component i is written only when
// mask(y, x_shifted + i) is true.
// NOTE(review): `op` is taken by non-const reference here, whereas the 3-, 4-
// and 8-wide specializations take it by const reference.
template <> struct OpUnroller<2>
{
    // Unary: dst.c = op(src.c) for each enabled component c.
    template <typename T, typename D, typename UnOp, typename Mask>
    static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, UnOp& op, int x_shifted, int y)
    {
        if (mask(y, x_shifted))
        {
            dst.x = op(src.x);
        }
        if (mask(y, x_shifted + 1))
        {
            dst.y = op(src.y);
        }
    }

    // Binary: dst.c = op(src1.c, src2.c) for each enabled component c.
    template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
    static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, BinOp& op, int x_shifted, int y)
    {
        if (mask(y, x_shifted))
        {
            dst.x = op(src1.x, src2.x);
        }
        if (mask(y, x_shifted + 1))
        {
            dst.y = op(src1.y, src2.y);
        }
    }
};
|
||||
// Three components (.x, .y, .z). Component i is written only when
// mask(y, x_shifted + i) is true.
template <> struct OpUnroller<3>
{
    // Unary: dst.c = op(src.c) for each enabled component c.
    template <typename T, typename D, typename UnOp, typename Mask>
    static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, const UnOp& op, int x_shifted, int y)
    {
        if (mask(y, x_shifted))
        {
            dst.x = op(src.x);
        }
        if (mask(y, x_shifted + 1))
        {
            dst.y = op(src.y);
        }
        if (mask(y, x_shifted + 2))
        {
            dst.z = op(src.z);
        }
    }

    // Binary: dst.c = op(src1.c, src2.c) for each enabled component c.
    template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
    static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, const BinOp& op, int x_shifted, int y)
    {
        if (mask(y, x_shifted))
        {
            dst.x = op(src1.x, src2.x);
        }
        if (mask(y, x_shifted + 1))
        {
            dst.y = op(src1.y, src2.y);
        }
        if (mask(y, x_shifted + 2))
        {
            dst.z = op(src1.z, src2.z);
        }
    }
};
|
||||
// Four components (.x, .y, .z, .w). Component i is written only when
// mask(y, x_shifted + i) is true.
template <> struct OpUnroller<4>
{
    // Unary: dst.c = op(src.c) for each enabled component c.
    template <typename T, typename D, typename UnOp, typename Mask>
    static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, const UnOp& op, int x_shifted, int y)
    {
        if (mask(y, x_shifted))
        {
            dst.x = op(src.x);
        }
        if (mask(y, x_shifted + 1))
        {
            dst.y = op(src.y);
        }
        if (mask(y, x_shifted + 2))
        {
            dst.z = op(src.z);
        }
        if (mask(y, x_shifted + 3))
        {
            dst.w = op(src.w);
        }
    }

    // Binary: dst.c = op(src1.c, src2.c) for each enabled component c.
    template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
    static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, const BinOp& op, int x_shifted, int y)
    {
        if (mask(y, x_shifted))
        {
            dst.x = op(src1.x, src2.x);
        }
        if (mask(y, x_shifted + 1))
        {
            dst.y = op(src1.y, src2.y);
        }
        if (mask(y, x_shifted + 2))
        {
            dst.z = op(src1.z, src2.z);
        }
        if (mask(y, x_shifted + 3))
        {
            dst.w = op(src1.w, src2.w);
        }
    }
};
|
||||
// Eight components (.a0 ... .a7). Component i is written only when
// mask(y, x_shifted + i) is true.
template <> struct OpUnroller<8>
{
    // Unary: dst.aI = op(src.aI) for each enabled component I.
    template <typename T, typename D, typename UnOp, typename Mask>
    static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, const UnOp& op, int x_shifted, int y)
    {
        if (mask(y, x_shifted))
        {
            dst.a0 = op(src.a0);
        }
        if (mask(y, x_shifted + 1))
        {
            dst.a1 = op(src.a1);
        }
        if (mask(y, x_shifted + 2))
        {
            dst.a2 = op(src.a2);
        }
        if (mask(y, x_shifted + 3))
        {
            dst.a3 = op(src.a3);
        }
        if (mask(y, x_shifted + 4))
        {
            dst.a4 = op(src.a4);
        }
        if (mask(y, x_shifted + 5))
        {
            dst.a5 = op(src.a5);
        }
        if (mask(y, x_shifted + 6))
        {
            dst.a6 = op(src.a6);
        }
        if (mask(y, x_shifted + 7))
        {
            dst.a7 = op(src.a7);
        }
    }

    // Binary: dst.aI = op(src1.aI, src2.aI) for each enabled component I.
    template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
    static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, const BinOp& op, int x_shifted, int y)
    {
        if (mask(y, x_shifted))
        {
            dst.a0 = op(src1.a0, src2.a0);
        }
        if (mask(y, x_shifted + 1))
        {
            dst.a1 = op(src1.a1, src2.a1);
        }
        if (mask(y, x_shifted + 2))
        {
            dst.a2 = op(src1.a2, src2.a2);
        }
        if (mask(y, x_shifted + 3))
        {
            dst.a3 = op(src1.a3, src2.a3);
        }
        if (mask(y, x_shifted + 4))
        {
            dst.a4 = op(src1.a4, src2.a4);
        }
        if (mask(y, x_shifted + 5))
        {
            dst.a5 = op(src1.a5, src2.a5);
        }
        if (mask(y, x_shifted + 6))
        {
            dst.a6 = op(src1.a6, src2.a6);
        }
        if (mask(y, x_shifted + 7))
        {
            dst.a7 = op(src1.a7, src2.a7);
        }
    }
};
|
||||
|
||||
// Unary element-wise transform, vectorized variant.
//
// Each thread handles ft::smart_shift consecutive elements of row y, starting
// at column x * ft::smart_shift. When the whole group lies inside the row it
// is read as one vector and processed by OpUnroller; otherwise the remaining
// columns of the row are processed one by one. Elements whose mask(y, col) is
// false are not written.
//
// Launch layout: 2D, x over element groups and y over rows (see
// TransformDispatcher<true>, which also checks the pointer/step alignment the
// vector accesses rely on).
template <typename T, typename D, typename UnOp, typename Mask>
static __global__ void transformSmart(const PtrStepSz<T> src_, PtrStep<D> dst_, const Mask mask, const UnOp op)
{
    typedef TransformFunctorTraits<UnOp> ft;
    typedef UnaryReadWriteTraits<T, D, ft::smart_shift> rw;
    typedef typename rw::read_type read_type;
    typedef typename rw::write_type write_type;

    const int x = threadIdx.x + blockIdx.x * blockDim.x;
    const int y = threadIdx.y + blockIdx.y * blockDim.y;

    // Rows past the end of the image have nothing to do.
    if (y >= src_.rows)
        return;

    const int firstCol = x * ft::smart_shift;

    const T* srcRow = src_.ptr(y);
    D* dstRow = dst_.ptr(y);

    if (firstCol + ft::smart_shift - 1 < src_.cols)
    {
        // Full group: one vector load, component-wise masked stores.
        const read_type packed = ((const read_type*)srcRow)[x];
        OpUnroller<ft::smart_shift>::unroll(packed, ((write_type*)dstRow)[x], mask, op, firstCol, y);
        return;
    }

    // Partial group at the end of the row (no iterations if firstCol >= cols).
    for (int col = firstCol; col < src_.cols; ++col)
    {
        if (mask(y, col))
        {
            dstRow[col] = op(srcRow[col]);
        }
    }
}
|
||||
|
||||
// Unary element-wise transform, one element per thread:
// dst(y, x) = op(src(y, x)) for every in-range (y, x) with mask(y, x) true.
// Launch layout: 2D, x over columns and y over rows.
template <typename T, typename D, typename UnOp, typename Mask>
static __global__ void transformSimple(const PtrStepSz<T> src, PtrStep<D> dst, const Mask mask, const UnOp op)
{
    const int col = blockDim.x * blockIdx.x + threadIdx.x;
    const int row = blockDim.y * blockIdx.y + threadIdx.y;

    // Threads outside the image do nothing (the mask is not queried for them).
    if (col >= src.cols || row >= src.rows)
        return;

    if (mask(row, col))
    {
        dst.ptr(row)[col] = op(src.ptr(row)[col]);
    }
}
|
||||
|
||||
// Binary element-wise transform, vectorized variant.
//
// Each thread handles ft::smart_shift consecutive elements of row y, starting
// at column x * ft::smart_shift. When the whole group lies inside the row,
// both sources are read as one vector each and processed by OpUnroller;
// otherwise the remaining columns of the row are processed one by one.
// Elements whose mask(y, col) is false are not written. The image size is
// taken from src1_.
//
// Launch layout: 2D, x over element groups and y over rows (see
// TransformDispatcher<true>, which also checks the pointer/step alignment the
// vector accesses rely on).
template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
static __global__ void transformSmart(const PtrStepSz<T1> src1_, const PtrStep<T2> src2_, PtrStep<D> dst_,
                                      const Mask mask, const BinOp op)
{
    typedef TransformFunctorTraits<BinOp> ft;
    typedef BinaryReadWriteTraits<T1, T2, D, ft::smart_shift> rw;
    typedef typename rw::read_type1 read_type1;
    typedef typename rw::read_type2 read_type2;
    typedef typename rw::write_type write_type;

    const int x = threadIdx.x + blockIdx.x * blockDim.x;
    const int y = threadIdx.y + blockIdx.y * blockDim.y;

    // Rows past the end of the image have nothing to do.
    if (y >= src1_.rows)
        return;

    const int firstCol = x * ft::smart_shift;

    const T1* lhsRow = src1_.ptr(y);
    const T2* rhsRow = src2_.ptr(y);
    D* dstRow = dst_.ptr(y);

    if (firstCol + ft::smart_shift - 1 < src1_.cols)
    {
        // Full group: one vector load per source, component-wise masked stores.
        const read_type1 lhsPacked = ((const read_type1*)lhsRow)[x];
        const read_type2 rhsPacked = ((const read_type2*)rhsRow)[x];

        OpUnroller<ft::smart_shift>::unroll(lhsPacked, rhsPacked, ((write_type*)dstRow)[x], mask, op, firstCol, y);
        return;
    }

    // Partial group at the end of the row (no iterations if firstCol >= cols).
    for (int col = firstCol; col < src1_.cols; ++col)
    {
        if (mask(y, col))
        {
            dstRow[col] = op(lhsRow[col], rhsRow[col]);
        }
    }
}
|
||||
|
||||
// Binary element-wise transform, one element per thread:
// dst(y, x) = op(src1(y, x), src2(y, x)) for every in-range (y, x) with
// mask(y, x) true. The image size is taken from src1.
// Launch layout: 2D, x over columns and y over rows.
template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
static __global__ void transformSimple(const PtrStepSz<T1> src1, const PtrStep<T2> src2, PtrStep<D> dst,
                                       const Mask mask, const BinOp op)
{
    const int col = blockDim.x * blockIdx.x + threadIdx.x;
    const int row = blockDim.y * blockIdx.y + threadIdx.y;

    // Threads outside the image do nothing (the mask is not queried for them).
    if (col >= src1.cols || row >= src1.rows)
        return;

    if (mask(row, col))
    {
        const T1 lhs = src1.ptr(row)[col];
        const T2 rhs = src2.ptr(row)[col];
        dst.ptr(row)[col] = op(lhs, rhs);
    }
}
|
||||
|
||||
// Host-side launcher for the transform kernels. UseSmart selects between the
// one-element-per-thread kernels (false) and the vectorized ones (true).
template <bool UseSmart> struct TransformDispatcher;

// Launches transformSimple with the block size given by
// TransformFunctorTraits<Op>::simple_block_dim_{x,y}.
template <> struct TransformDispatcher<false>
{
    // Unary transform: dst = op(src) under `mask`, enqueued on `stream`.
    // When the default stream (0) is used the call also waits for completion.
    template <typename T, typename D, typename UnOp, typename Mask>
    static void call(PtrStepSz<T> src, PtrStepSz<D> dst, UnOp op, Mask mask, cudaStream_t stream)
    {
        typedef TransformFunctorTraits<UnOp> ft;

        const dim3 block(ft::simple_block_dim_x, ft::simple_block_dim_y, 1);
        // One thread per element, rounded up to whole blocks.
        const dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y), 1);

        transformSimple<T, D><<<grid, block, 0, stream>>>(src, dst, mask, op);
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
        {
            cudaSafeCall( cudaDeviceSynchronize() );
        }
    }

    // Binary transform: dst = op(src1, src2) under `mask`, enqueued on `stream`.
    // The grid is sized from src1. When the default stream (0) is used the
    // call also waits for completion.
    template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
    static void call(PtrStepSz<T1> src1, PtrStepSz<T2> src2, PtrStepSz<D> dst, BinOp op, Mask mask, cudaStream_t stream)
    {
        typedef TransformFunctorTraits<BinOp> ft;

        const dim3 block(ft::simple_block_dim_x, ft::simple_block_dim_y, 1);
        // One thread per element, rounded up to whole blocks.
        const dim3 grid(divUp(src1.cols, block.x), divUp(src1.rows, block.y), 1);

        transformSimple<T1, T2, D><<<grid, block, 0, stream>>>(src1, src2, dst, mask, op);
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
        {
            cudaSafeCall( cudaDeviceSynchronize() );
        }
    }
};
|
||||
// Launches the vectorized transformSmart kernels, falling back to
// TransformDispatcher<false> when any data pointer or row step is not aligned
// to smart_shift elements of its type.
template<> struct TransformDispatcher<true>
{
    // Unary transform: dst = op(src) under `mask`, enqueued on `stream`.
    // When the default stream (0) is used the call also waits for completion.
    template <typename T, typename D, typename UnOp, typename Mask>
    static void call(PtrStepSz<T> src, PtrStepSz<D> dst, UnOp op, Mask mask, cudaStream_t stream)
    {
        typedef TransformFunctorTraits<UnOp> ft;

        // A shift of 1 would make the vectorized path pointless.
        CV_StaticAssert(ft::smart_shift != 1, "");

        const bool vectorizable =
            isAligned(src.data, ft::smart_shift * sizeof(T)) && isAligned(src.step, ft::smart_shift * sizeof(T)) &&
            isAligned(dst.data, ft::smart_shift * sizeof(D)) && isAligned(dst.step, ft::smart_shift * sizeof(D));

        if (!vectorizable)
        {
            TransformDispatcher<false>::call(src, dst, op, mask, stream);
            return;
        }

        const dim3 block(ft::smart_block_dim_x, ft::smart_block_dim_y, 1);
        // Each thread covers smart_shift columns.
        const dim3 grid(divUp(src.cols, block.x * ft::smart_shift), divUp(src.rows, block.y), 1);

        transformSmart<T, D><<<grid, block, 0, stream>>>(src, dst, mask, op);
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
        {
            cudaSafeCall( cudaDeviceSynchronize() );
        }
    }

    // Binary transform: dst = op(src1, src2) under `mask`, enqueued on `stream`.
    // The grid is sized from src1. When the default stream (0) is used the
    // call also waits for completion.
    template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
    static void call(PtrStepSz<T1> src1, PtrStepSz<T2> src2, PtrStepSz<D> dst, BinOp op, Mask mask, cudaStream_t stream)
    {
        typedef TransformFunctorTraits<BinOp> ft;

        // A shift of 1 would make the vectorized path pointless.
        CV_StaticAssert(ft::smart_shift != 1, "");

        const bool vectorizable =
            isAligned(src1.data, ft::smart_shift * sizeof(T1)) && isAligned(src1.step, ft::smart_shift * sizeof(T1)) &&
            isAligned(src2.data, ft::smart_shift * sizeof(T2)) && isAligned(src2.step, ft::smart_shift * sizeof(T2)) &&
            isAligned(dst.data, ft::smart_shift * sizeof(D)) && isAligned(dst.step, ft::smart_shift * sizeof(D));

        if (!vectorizable)
        {
            TransformDispatcher<false>::call(src1, src2, dst, op, mask, stream);
            return;
        }

        const dim3 block(ft::smart_block_dim_x, ft::smart_block_dim_y, 1);
        // Each thread covers smart_shift columns.
        const dim3 grid(divUp(src1.cols, block.x * ft::smart_shift), divUp(src1.rows, block.y), 1);

        transformSmart<T1, T2, D><<<grid, block, 0, stream>>>(src1, src2, dst, mask, op);
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
        {
            cudaSafeCall( cudaDeviceSynchronize() );
        }
    }
};
|
||||
} // namespace transform_detail
|
||||
}}} // namespace cv { namespace cuda { namespace device
|
||||
|
||||
//! @endcond
|
||||
|
||||
#endif // OPENCV_CUDA_TRANSFORM_DETAIL_HPP
|
||||
191
OpenCV/include/opencv2/core/cuda/detail/type_traits_detail.hpp
Normal file
191
OpenCV/include/opencv2/core/cuda/detail/type_traits_detail.hpp
Normal file
@@ -0,0 +1,191 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#ifndef OPENCV_CUDA_TYPE_TRAITS_DETAIL_HPP
|
||||
#define OPENCV_CUDA_TYPE_TRAITS_DETAIL_HPP
|
||||
|
||||
#include "../common.hpp"
|
||||
#include "../vec_traits.hpp"
|
||||
|
||||
//! @cond IGNORED
|
||||
|
||||
namespace cv { namespace cuda { namespace device
|
||||
{
|
||||
namespace type_traits_detail
|
||||
{
|
||||
// Compile-time type selection: Select<c, A, B>::type is A when c is true and
// B when c is false.
template <bool, typename T1, typename T2>
struct Select
{
    typedef T1 type;
};

template <typename T1, typename T2>
struct Select<false, T1, T2>
{
    typedef T2 type;
};
|
||||
|
||||
template <typename T> struct IsSignedIntergral { enum {value = 0}; };
|
||||
template <> struct IsSignedIntergral<schar> { enum {value = 1}; };
|
||||
template <> struct IsSignedIntergral<char1> { enum {value = 1}; };
|
||||
template <> struct IsSignedIntergral<short> { enum {value = 1}; };
|
||||
template <> struct IsSignedIntergral<short1> { enum {value = 1}; };
|
||||
template <> struct IsSignedIntergral<int> { enum {value = 1}; };
|
||||
template <> struct IsSignedIntergral<int1> { enum {value = 1}; };
|
||||
|
||||
template <typename T> struct IsUnsignedIntegral { enum {value = 0}; };
|
||||
template <> struct IsUnsignedIntegral<uchar> { enum {value = 1}; };
|
||||
template <> struct IsUnsignedIntegral<uchar1> { enum {value = 1}; };
|
||||
template <> struct IsUnsignedIntegral<ushort> { enum {value = 1}; };
|
||||
template <> struct IsUnsignedIntegral<ushort1> { enum {value = 1}; };
|
||||
template <> struct IsUnsignedIntegral<uint> { enum {value = 1}; };
|
||||
template <> struct IsUnsignedIntegral<uint1> { enum {value = 1}; };
|
||||
|
||||
// value is non-zero when T is classified as signed or unsigned integral by
// the two traits above; plain char and bool are added explicitly.
template <typename T>
struct IsIntegral
{
    enum { value = IsSignedIntergral<T>::value || IsUnsignedIntegral<T>::value };
};

template <> struct IsIntegral<char> { enum { value = 1 }; };
template <> struct IsIntegral<bool> { enum { value = 1 }; };
|
||||
|
||||
// value is 1 for float and double, 0 for every other type.
template <typename T>
struct IsFloat
{
    enum { value = 0 };
};

template <> struct IsFloat<float>  { enum { value = 1 }; };
template <> struct IsFloat<double> { enum { value = 1 }; };
|
||||
|
||||
template <typename T> struct IsVec { enum {value = 0}; };
|
||||
template <> struct IsVec<uchar1> { enum {value = 1}; };
|
||||
template <> struct IsVec<uchar2> { enum {value = 1}; };
|
||||
template <> struct IsVec<uchar3> { enum {value = 1}; };
|
||||
template <> struct IsVec<uchar4> { enum {value = 1}; };
|
||||
template <> struct IsVec<uchar8> { enum {value = 1}; };
|
||||
template <> struct IsVec<char1> { enum {value = 1}; };
|
||||
template <> struct IsVec<char2> { enum {value = 1}; };
|
||||
template <> struct IsVec<char3> { enum {value = 1}; };
|
||||
template <> struct IsVec<char4> { enum {value = 1}; };
|
||||
template <> struct IsVec<char8> { enum {value = 1}; };
|
||||
template <> struct IsVec<ushort1> { enum {value = 1}; };
|
||||
template <> struct IsVec<ushort2> { enum {value = 1}; };
|
||||
template <> struct IsVec<ushort3> { enum {value = 1}; };
|
||||
template <> struct IsVec<ushort4> { enum {value = 1}; };
|
||||
template <> struct IsVec<ushort8> { enum {value = 1}; };
|
||||
template <> struct IsVec<short1> { enum {value = 1}; };
|
||||
template <> struct IsVec<short2> { enum {value = 1}; };
|
||||
template <> struct IsVec<short3> { enum {value = 1}; };
|
||||
template <> struct IsVec<short4> { enum {value = 1}; };
|
||||
template <> struct IsVec<short8> { enum {value = 1}; };
|
||||
template <> struct IsVec<uint1> { enum {value = 1}; };
|
||||
template <> struct IsVec<uint2> { enum {value = 1}; };
|
||||
template <> struct IsVec<uint3> { enum {value = 1}; };
|
||||
template <> struct IsVec<uint4> { enum {value = 1}; };
|
||||
template <> struct IsVec<uint8> { enum {value = 1}; };
|
||||
template <> struct IsVec<int1> { enum {value = 1}; };
|
||||
template <> struct IsVec<int2> { enum {value = 1}; };
|
||||
template <> struct IsVec<int3> { enum {value = 1}; };
|
||||
template <> struct IsVec<int4> { enum {value = 1}; };
|
||||
template <> struct IsVec<int8> { enum {value = 1}; };
|
||||
template <> struct IsVec<float1> { enum {value = 1}; };
|
||||
template <> struct IsVec<float2> { enum {value = 1}; };
|
||||
template <> struct IsVec<float3> { enum {value = 1}; };
|
||||
template <> struct IsVec<float4> { enum {value = 1}; };
|
||||
template <> struct IsVec<float8> { enum {value = 1}; };
|
||||
template <> struct IsVec<double1> { enum {value = 1}; };
|
||||
template <> struct IsVec<double2> { enum {value = 1}; };
|
||||
template <> struct IsVec<double3> { enum {value = 1}; };
|
||||
template <> struct IsVec<double4> { enum {value = 1}; };
|
||||
template <> struct IsVec<double8> { enum {value = 1}; };
|
||||
|
||||
// Maps a type to the form used for passing it as a parameter:
//   U    -> const U&
//   U&   -> U&   (references are kept as they are)
//   void -> void
template <class U>
struct AddParameterType
{
    typedef const U& type;
};

template <class U>
struct AddParameterType<U&>
{
    typedef U& type;
};

template <>
struct AddParameterType<void>
{
    typedef void type;
};
|
||||
|
||||
// Reference detection: value tells whether the argument is an lvalue
// reference, type is the argument with that reference removed.
template <class U>
struct ReferenceTraits
{
    typedef U type;
    enum { value = false };
};

template <class U>
struct ReferenceTraits<U&>
{
    typedef U type;
    enum { value = true };
};
|
||||
|
||||
// Pointer detection: value tells whether the argument is a pointer (U*) or a
// reference to a pointer (U*&); type is the pointee U, or void when the
// argument is not a pointer.
template <class U>
struct PointerTraits
{
    typedef void type;
    enum { value = false };
};

template <class U>
struct PointerTraits<U*>
{
    typedef U type;
    enum { value = true };
};

template <class U>
struct PointerTraits<U*&>
{
    typedef U type;
    enum { value = true };
};
|
||||
|
||||
// Strips a top-level const (const U -> U) or the const behind a reference
// (const U& -> U&). value is 1 when something was stripped, 0 otherwise.
template <class U>
struct UnConst
{
    enum { value = 0 };
    typedef U type;
};

template <class U>
struct UnConst<const U>
{
    enum { value = 1 };
    typedef U type;
};

template <class U>
struct UnConst<const U&>
{
    enum { value = 1 };
    typedef U& type;
};
|
||||
|
||||
// Strips a top-level volatile (volatile U -> U) or the volatile behind a
// reference (volatile U& -> U&). value is 1 when something was stripped,
// 0 otherwise.
template <class U>
struct UnVolatile
{
    enum { value = 0 };
    typedef U type;
};

template <class U>
struct UnVolatile<volatile U>
{
    enum { value = 1 };
    typedef U type;
};

template <class U>
struct UnVolatile<volatile U&>
{
    enum { value = 1 };
    typedef U& type;
};
|
||||
} // namespace type_traits_detail
|
||||
}}} // namespace cv { namespace cuda { namespace device
|
||||
|
||||
//! @endcond
|
||||
|
||||
#endif // OPENCV_CUDA_TYPE_TRAITS_DETAIL_HPP
|
||||
121
OpenCV/include/opencv2/core/cuda/detail/vec_distance_detail.hpp
Normal file
121
OpenCV/include/opencv2/core/cuda/detail/vec_distance_detail.hpp
Normal file
@@ -0,0 +1,121 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#ifndef OPENCV_CUDA_VEC_DISTANCE_DETAIL_HPP
|
||||
#define OPENCV_CUDA_VEC_DISTANCE_DETAIL_HPP
|
||||
|
||||
#include "../datamov_utils.hpp"
|
||||
|
||||
//! @cond IGNORED
|
||||
|
||||
namespace cv { namespace cuda { namespace device
|
||||
{
|
||||
namespace vec_distance_detail
|
||||
{
|
||||
// Compile-time unrolled accumulation of up to N element pairs into `dist`.
// One vector is walked sequentially through vecCached, the other is read
// through ForceGlob<T2>::Load with a stride of THREAD_DIM elements.
// The recursion ends at the <THREAD_DIM, 0> specialization.
template <int THREAD_DIM, int N> struct UnrollVecDiffCached
{
    typedef UnrollVecDiffCached<THREAD_DIM, N - 1> Rest;

    // Bounds-checked variant: processes index `ind`, then ind + THREAD_DIM,
    // and so on, stopping at the first index that is not below `len`.
    template <typename Dist, typename T1, typename T2>
    static __device__ void calcCheck(const T1* vecCached, const T2* vecGlob, int len, Dist& dist, int ind)
    {
        if (ind >= len)
            return;

        T1 val1 = vecCached[0];

        T2 val2;
        ForceGlob<T2>::Load(vecGlob, ind, val2);

        dist.reduceIter(val1, val2);

        Rest::calcCheck(vecCached + 1, vecGlob, len, dist, ind + THREAD_DIM);
    }

    // Unchecked variant: always processes all N pairs. vecGlob must already
    // point at this thread's first element; it advances by THREAD_DIM per step.
    template <typename Dist, typename T1, typename T2>
    static __device__ void calcWithoutCheck(const T1* vecCached, const T2* vecGlob, Dist& dist)
    {
        T1 val1 = vecCached[0];

        T2 val2;
        ForceGlob<T2>::Load(vecGlob, 0, val2);

        dist.reduceIter(val1, val2);

        Rest::calcWithoutCheck(vecCached + 1, vecGlob + THREAD_DIM, dist);
    }
};
|
||||
template <int THREAD_DIM> struct UnrollVecDiffCached<THREAD_DIM, 0>
|
||||
{
|
||||
template <typename Dist, typename T1, typename T2>
|
||||
static __device__ __forceinline__ void calcCheck(const T1*, const T2*, int, Dist&, int)
|
||||
{
|
||||
}
|
||||
|
||||
template <typename Dist, typename T1, typename T2>
|
||||
static __device__ __forceinline__ void calcWithoutCheck(const T1*, const T2*, Dist&)
|
||||
{
|
||||
}
|
||||
};
|
||||
|
||||
template <int THREAD_DIM, int MAX_LEN, bool LEN_EQ_MAX_LEN> struct VecDiffCachedCalculator;
|
||||
template <int THREAD_DIM, int MAX_LEN> struct VecDiffCachedCalculator<THREAD_DIM, MAX_LEN, false>
|
||||
{
|
||||
template <typename Dist, typename T1, typename T2>
|
||||
static __device__ __forceinline__ void calc(const T1* vecCached, const T2* vecGlob, int len, Dist& dist, int tid)
|
||||
{
|
||||
UnrollVecDiffCached<THREAD_DIM, MAX_LEN / THREAD_DIM>::calcCheck(vecCached, vecGlob, len, dist, tid);
|
||||
}
|
||||
};
|
||||
template <int THREAD_DIM, int MAX_LEN> struct VecDiffCachedCalculator<THREAD_DIM, MAX_LEN, true>
|
||||
{
|
||||
template <typename Dist, typename T1, typename T2>
|
||||
static __device__ __forceinline__ void calc(const T1* vecCached, const T2* vecGlob, int len, Dist& dist, int tid)
|
||||
{
|
||||
UnrollVecDiffCached<THREAD_DIM, MAX_LEN / THREAD_DIM>::calcWithoutCheck(vecCached, vecGlob + tid, dist);
|
||||
}
|
||||
};
|
||||
} // namespace vec_distance_detail
|
||||
}}} // namespace cv { namespace cuda { namespace device
|
||||
|
||||
//! @endcond
|
||||
|
||||
#endif // OPENCV_CUDA_VEC_DISTANCE_DETAIL_HPP
|
||||
Reference in New Issue
Block a user