diff --git a/src/curobo/curobolib/cpp/geom_cuda.cpp b/src/curobo/curobolib/cpp/geom_cuda.cpp
index 2ff5222..a83337f 100644
--- a/src/curobo/curobolib/cpp/geom_cuda.cpp
+++ b/src/curobo/curobolib/cpp/geom_cuda.cpp
@@ -32,8 +32,6 @@ std::vector<torch::Tensor>self_collision_distance(
   const int           ndpt         = 8, // Does this need to match template?
   const bool          debug        = false);
 
-// CUDA forward declarations
-
 std::vector<torch::Tensor>swept_sphere_obb_clpt(
   const torch::Tensor sphere_position, // batch_size, 3
   torch::Tensor       distance,        // batch_size, 1
@@ -83,6 +81,7 @@ sphere_obb_clpt(const torch::Tensor sphere_position, // batch_size, 4
                 const bool          use_batch_env,
                 const bool          sum_collisions,
                 const bool          compute_esdf);
+
 std::vector<torch::Tensor>
 sphere_voxel_clpt(const torch::Tensor sphere_position, // batch_size, 3
                 torch::Tensor distance,
@@ -126,6 +125,7 @@ swept_sphere_voxel_clpt(const torch::Tensor sphere_position, // batch_size, 3
                 const bool compute_distance, 
                 const bool use_batch_env,
                 const bool sum_collisions);
+
 std::vector<torch::Tensor>pose_distance(
   torch::Tensor       out_distance,
   torch::Tensor       out_position_distance,
diff --git a/src/curobo/curobolib/cpp/self_collision_kernel.cu b/src/curobo/curobolib/cpp/self_collision_kernel.cu
index 894cfcf..08e9b00 100644
--- a/src/curobo/curobolib/cpp/self_collision_kernel.cu
+++ b/src/curobo/curobolib/cpp/self_collision_kernel.cu
@@ -162,7 +162,7 @@ namespace Curobo
       //////////////////////////////////////////////////////
       // Compute distances and store the maximum per thread
       // in registers (max_d).
-      // Each thread computes upto ndpt distances.
+      // Each thread computes up to ndpt distances.
       // two warps per row
       //////////////////////////////////////////////////////
       // int nspheres_2 = nspheres * nspheres;
@@ -175,8 +175,6 @@ namespace Curobo
       {
         sph2 = __rs_shared[j]; // we need not load sph2 in every iteration.
 
-#pragma unroll 16
-
         for (int k = 0; k < ndpt; k++, i++) // increment i also here
         {
           if ((i < nspheres) && (j > i))
@@ -230,6 +228,8 @@ namespace Curobo
         if (threadIdx.x < blockDim.x)
         {
           // dist_t max_d = dist_sh[threadIdx.x];
+#pragma unroll 4
+
           for (int offset = 16; offset > 0; offset /= 2)
           {
             uint64_t nd     = __shfl_down_sync(mask, *(uint64_t *)&max_d, offset);