diff --git a/src/curobo/curobolib/cpp/geom_cuda.cpp b/src/curobo/curobolib/cpp/geom_cuda.cpp index 2ff5222..a83337f 100644 --- a/src/curobo/curobolib/cpp/geom_cuda.cpp +++ b/src/curobo/curobolib/cpp/geom_cuda.cpp @@ -32,8 +32,6 @@ std::vector<torch::Tensor>self_collision_distance( const int ndpt = 8, // Does this need to match template? const bool debug = false); -// CUDA forward declarations - std::vector<torch::Tensor>swept_sphere_obb_clpt( const torch::Tensor sphere_position, // batch_size, 3 torch::Tensor distance, // batch_size, 1 @@ -83,6 +81,7 @@ sphere_obb_clpt(const torch::Tensor sphere_position, // batch_size, 4 const bool use_batch_env, const bool sum_collisions, const bool compute_esdf); + std::vector<torch::Tensor> sphere_voxel_clpt(const torch::Tensor sphere_position, // batch_size, 3 torch::Tensor distance, @@ -126,6 +125,7 @@ swept_sphere_voxel_clpt(const torch::Tensor sphere_position, // batch_size, 3 const bool compute_distance, const bool use_batch_env, const bool sum_collisions); + std::vector<torch::Tensor>pose_distance( torch::Tensor out_distance, torch::Tensor out_position_distance, diff --git a/src/curobo/curobolib/cpp/self_collision_kernel.cu b/src/curobo/curobolib/cpp/self_collision_kernel.cu index 894cfcf..08e9b00 100644 --- a/src/curobo/curobolib/cpp/self_collision_kernel.cu +++ b/src/curobo/curobolib/cpp/self_collision_kernel.cu @@ -162,7 +162,7 @@ namespace Curobo ////////////////////////////////////////////////////// // Compute distances and store the maximum per thread // in registers (max_d). - // Each thread computes upto ndpt distances. + // Each thread computes up to ndpt distances. // two warps per row ////////////////////////////////////////////////////// // int nspheres_2 = nspheres * nspheres; @@ -175,8 +175,6 @@ namespace Curobo { sph2 = __rs_shared[j]; // we need not load sph2 in every iteration. 
-#pragma unroll 16 - for (int k = 0; k < ndpt; k++, i++) // increment i also here { if ((i < nspheres) && (j > i)) @@ -230,6 +228,8 @@ namespace Curobo if (threadIdx.x < blockDim.x) { // dist_t max_d = dist_sh[threadIdx.x]; +#pragma unroll 4 + for (int offset = 16; offset > 0; offset /= 2) { uint64_t nd = __shfl_down_sync(mask, *(uint64_t *)&max_d, offset);