21#include <cuda_runtime.h>
31static constexpr int64_t OPEN3D_PARFOR_BLOCK = 128;
32static constexpr int64_t OPEN3D_PARFOR_THREAD = 4;
35template <
int64_t block_size,
int64_t thread_size,
typename func_t>
36__global__
void ElementWiseKernel_(int64_t n, func_t f) {
37 int64_t items_per_block = block_size * thread_size;
38 int64_t idx = blockIdx.x * items_per_block + threadIdx.x;
40 for (int64_t i = 0; i < thread_size; ++i) {
49template <
typename func_t>
50void ParallelForCUDA_(
const Device& device, int64_t n,
const func_t& func) {
52 utility::LogError(
"ParallelFor for CUDA cannot run on device {}.",
59 CUDAScopedDevice scoped_device(device);
60 int64_t items_per_block = OPEN3D_PARFOR_BLOCK * OPEN3D_PARFOR_THREAD;
61 int64_t grid_size = (n + items_per_block - 1) / items_per_block;
63 ElementWiseKernel_<OPEN3D_PARFOR_BLOCK, OPEN3D_PARFOR_THREAD>
64 <<<grid_size, OPEN3D_PARFOR_BLOCK, 0, core::cuda::GetStream()>>>(
72template <
typename func_t>
74 if (!device.
IsCPU()) {
75 utility::LogError(
"ParallelFor for CPU cannot run on device {}.",
82#pragma omp parallel for num_threads(utility::EstimateMaxThreads())
83 for (int64_t i = 0; i < n; ++i) {
107template <
typename func_t>
110 ParallelForCUDA_(device, n, func);
162template <
typename vec_func_t,
typename func_t>
166 const vec_func_t& vec_func) {
167#ifdef BUILD_ISPC_MODULE
170 ParallelForCUDA_(device, n, func);
174 int64_t start = n * i / num_threads;
175 int64_t end = std::min<int64_t>(n * (i + 1) / num_threads, n);
176 vec_func(start, end);
183 ParallelForCUDA_(device, n, func);
191#ifdef BUILD_ISPC_MODULE
194#define OPEN3D_CALL_ISPC_KERNEL_(ISPCKernel, start, end, ...) \
195 using namespace ispc; \
196 ISPCKernel(start, end, __VA_ARGS__);
201#define OPEN3D_CALL_ISPC_KERNEL_(ISPCKernel, start, end, ...) \
203 "ISPC module disabled. Unable to call vectorized kernel {}", \
204 OPEN3D_STRINGIFY(ISPCKernel));
209#define OPEN3D_OVERLOADED_LAMBDA_(T, ISPCKernel, ...) \
210 [&](T, int64_t start, int64_t end) { \
211 OPEN3D_CALL_ISPC_KERNEL_( \
212 OPEN3D_CONCAT(ISPCKernel, OPEN3D_CONCAT(_, T)), start, end, \
225#define OPEN3D_VECTORIZED(ISPCKernel, ...) \
226 [&](int64_t start, int64_t end) { \
227 OPEN3D_CALL_ISPC_KERNEL_(ISPCKernel, start, end, __VA_ARGS__); \
243#define OPEN3D_TEMPLATE_VECTORIZED(T, ISPCKernel, ...) \
244 [&](int64_t start, int64_t end) { \
245 static_assert(std::is_arithmetic<T>::value, \
246 "Data type is not an arithmetic type"); \
248 OPEN3D_OVERLOADED_LAMBDA_(bool, ISPCKernel, __VA_ARGS__), \
249 OPEN3D_OVERLOADED_LAMBDA_(uint8_t, ISPCKernel, __VA_ARGS__), \
250 OPEN3D_OVERLOADED_LAMBDA_(int8_t, ISPCKernel, __VA_ARGS__), \
251 OPEN3D_OVERLOADED_LAMBDA_(uint16_t, ISPCKernel, __VA_ARGS__), \
252 OPEN3D_OVERLOADED_LAMBDA_(int16_t, ISPCKernel, __VA_ARGS__), \
253 OPEN3D_OVERLOADED_LAMBDA_(uint32_t, ISPCKernel, __VA_ARGS__), \
254 OPEN3D_OVERLOADED_LAMBDA_(int32_t, ISPCKernel, __VA_ARGS__), \
255 OPEN3D_OVERLOADED_LAMBDA_(uint64_t, ISPCKernel, __VA_ARGS__), \
256 OPEN3D_OVERLOADED_LAMBDA_(int64_t, ISPCKernel, __VA_ARGS__), \
257 OPEN3D_OVERLOADED_LAMBDA_(float, ISPCKernel, __VA_ARGS__), \
258 OPEN3D_OVERLOADED_LAMBDA_(double, ISPCKernel, __VA_ARGS__), \
259 [&](auto&& generic, int64_t start, int64_t end) { \
261 "Unsupported data type {} for calling " \
262 "vectorized kernel {}", \
263 typeid(generic).name(), \
264 OPEN3D_STRINGIFY(ISPCKernel)); \
265 })(T{}, start, end); \
#define OPEN3D_GET_LAST_CUDA_ERROR(message)
Definition CUDAUtils.h:48
bool IsCPU() const
Returns true iff device type is CPU.
Definition Device.h:46
std::string ToString() const
Returns string representation of device, e.g. "CPU:0", "CUDA:0".
Definition Device.cpp:88
void ParallelForCPU_(const Device &device, int64_t n, const func_t &func)
Run a function in parallel on CPU.
Definition ParallelFor.h:73
void ParallelFor(const Device &device, int64_t n, const func_t &func)
Definition ParallelFor.h:108
int EstimateMaxThreads()
Estimate the maximum number of threads to be used in a parallel region.
Definition Parallel.cpp:31
Definition PinholeCameraIntrinsic.cpp:16