Add planning to grasp API

2024-11-22 14:15:18 -08:00
parent 18e9ebd35f
commit 36ea382dab
38 changed files with 939 additions and 535 deletions
--- a/src/curobo/curobolib/cpp/geom_cuda.cpp
+++ b/src/curobo/curobolib/cpp/geom_cuda.cpp
@@ -108,21 +108,21 @@ swept_sphere_voxel_clpt(const torch::Tensor sphere_position, // batch_size, 3
                torch::Tensor sparsity_idx, const torch::Tensor weight,
                const torch::Tensor activation_distance,
                const torch::Tensor max_distance,
-                const torch::Tensor speed_dt, 
+                const torch::Tensor speed_dt,
                const torch::Tensor grid_features,       // n_boxes, 4, 4
                const torch::Tensor grid_params,      // n_boxes, 3
                const torch::Tensor grid_pose,        // n_boxes, 4, 4
                const torch::Tensor grid_enable,      // n_boxes, 4, 4
                const torch::Tensor n_env_grid,
                const torch::Tensor env_query_idx,   // n_boxes, 4, 4
-                const int max_nobs, 
-                const int batch_size, 
+                const int max_nobs,
+                const int batch_size,
                const int horizon,
-                const int n_spheres, 
+                const int n_spheres,
                const int sweep_steps,
                const bool enable_speed_metric,
                const bool transform_back,
-                const bool compute_distance, 
+                const bool compute_distance,
                const bool use_batch_env,
                const bool sum_collisions);

@@ -145,14 +145,15 @@ std::vector<torch::Tensor>pose_distance(
  const torch::Tensor offset_waypoint,
  const torch::Tensor offset_tstep_fraction,
  const torch::Tensor batch_pose_idx,
+  const torch::Tensor project_distance,
  const int           batch_size,
  const int           horizon,
  const int           mode,
  const int           num_goals        = 1,
  const bool          compute_grad     = false,
  const bool          write_distance   = true,
-  const bool          use_metric       = false,
-  const bool          project_distance = true);
+  const bool          use_metric       = false
+  );

 std::vector<torch::Tensor>
 backward_pose_distance(torch::Tensor       out_grad_p,
@@ -202,7 +203,7 @@ std::vector<torch::Tensor>sphere_obb_clpt_wrapper(
  torch::Tensor closest_point,         // batch size, 3
  torch::Tensor sparsity_idx, const torch::Tensor weight,
  const torch::Tensor activation_distance,
-  const torch::Tensor max_distance, 
+  const torch::Tensor max_distance,
  const torch::Tensor obb_accel,       // n_boxes, 4, 4
  const torch::Tensor obb_bounds,      // n_boxes, 3
  const torch::Tensor obb_pose,        // n_boxes, 4, 4
@@ -210,9 +211,9 @@ std::vector<torch::Tensor>sphere_obb_clpt_wrapper(
  const torch::Tensor n_env_obb,       // n_boxes, 4, 4
  const torch::Tensor env_query_idx,   // n_boxes, 4, 4
  const int max_nobs, const int batch_size, const int horizon,
-  const int n_spheres, 
+  const int n_spheres,
  const bool transform_back, const bool compute_distance,
-  const bool use_batch_env, const bool sum_collisions = true, 
+  const bool use_batch_env, const bool sum_collisions = true,
  const bool compute_esdf = false)
 {
  const at::cuda::OptionalCUDAGuard guard(sphere_position.device());
@@ -305,10 +306,11 @@ std::vector<torch::Tensor>pose_distance_wrapper(
  const torch::Tensor weight, const torch::Tensor vec_convergence,
  const torch::Tensor run_weight, const torch::Tensor run_vec_weight,
  const torch::Tensor offset_waypoint, const torch::Tensor offset_tstep_fraction,
-  const torch::Tensor batch_pose_idx, const int batch_size, const int horizon,
+  const torch::Tensor batch_pose_idx,
+  const torch::Tensor project_distance,
+  const int batch_size, const int horizon,
  const int mode, const int num_goals = 1, const bool compute_grad = false,
-  const bool write_distance = false, const bool use_metric = false,
-  const bool project_distance = true)
+  const bool write_distance = false, const bool use_metric = false)
 {
  // at::cuda::DeviceGuard guard(angle.device());
  CHECK_INPUT(out_distance);
@@ -323,6 +325,7 @@ std::vector<torch::Tensor>pose_distance_wrapper(
  CHECK_INPUT(batch_pose_idx);
  CHECK_INPUT(offset_waypoint);
  CHECK_INPUT(offset_tstep_fraction);
+  CHECK_INPUT(project_distance);
  const at::cuda::OptionalCUDAGuard guard(current_position.device());

  return pose_distance(
@@ -332,8 +335,10 @@ std::vector<torch::Tensor>pose_distance_wrapper(
    vec_convergence, run_weight, run_vec_weight,
    offset_waypoint,
    offset_tstep_fraction,
-    batch_pose_idx, batch_size,
-    horizon, mode, num_goals, compute_grad, write_distance, use_metric, project_distance);
+    batch_pose_idx,
+    project_distance,
+    batch_size,
+    horizon, mode, num_goals, compute_grad, write_distance, use_metric);
 }

 std::vector<torch::Tensor>backward_pose_distance_wrapper(
@@ -372,8 +377,8 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
        "Closest Point Voxel(curobolib)");
  m.def("swept_closest_point_voxel",           &swept_sphere_voxel_clpt,
        "Swpet Closest Point Voxel(curobolib)");
-  
-  
+
+

  m.def("self_collision_distance", &self_collision_distance_wrapper,
        "Self Collision Distance (curobolib)");
--- a/src/curobo/curobolib/cpp/pose_distance_kernel.cu
+++ b/src/curobo/curobolib/cpp/pose_distance_kernel.cu
@@ -146,7 +146,6 @@ namespace Curobo
      }
    }

-    template<bool project_distance>
    __device__ __forceinline__ void
    compute_pose_distance_vector(float       *result_vec,
                                 const float3 goal_position,
@@ -156,7 +155,8 @@ namespace Curobo
                                 const float *vec_weight,
                                 const float3 offset_position,
                                 const float3 offset_rotation,
-                                 const bool   reach_offset)
+                                 const bool   reach_offset,
+                                 const bool project_distance)
    {
      // project current position to goal frame:
      float3 error_position = make_float3(0, 0, 0);
@@ -253,7 +253,7 @@ namespace Curobo
      }
    }

-    template<bool project_distance, bool use_metric>
+    template<bool use_metric>
    __device__ __forceinline__ void
    compute_pose_distance(float *distance_vec, float& distance, float& position_distance,
                          float& rotation_distance, const float3 current_position,
@@ -265,17 +265,19 @@ namespace Curobo
                          const float r_alpha,
                          const float3 offset_position,
                          const float3 offset_rotation,
-                          const bool reach_offset)
+                          const bool reach_offset,
+                          const bool project_distance)
    {
-      compute_pose_distance_vector<project_distance>(&distance_vec[0],
-                                                     goal_position,
-                                                     goal_quat,
-                                                     current_position,
-                                                     current_quat,
-                                                     &vec_weight[0],
-                                                     offset_position,
-                                                     offset_rotation,
-                                                     reach_offset);
+      compute_pose_distance_vector(&distance_vec[0],
+                                    goal_position,
+                                    goal_quat,
+                                    current_position,
+                                    current_quat,
+                                    &vec_weight[0],
+                                    offset_position,
+                                    offset_rotation,
+                                    reach_offset,
+                                    project_distance);

      position_distance = 0;
      rotation_distance = 0;
@@ -394,7 +396,7 @@ namespace Curobo
      *(float3 *)&out_grad_q[batch_idx * 4 + 1] = g_q;
    }

-    template<typename scalar_t, bool write_distance, bool project_distance, bool use_metric>
+    template<typename scalar_t, bool write_distance, bool use_metric>
    __global__ void goalset_pose_distance_kernel(
      scalar_t *out_distance, scalar_t *out_position_distance,
      scalar_t *out_rotation_distance, scalar_t *out_p_vec, scalar_t *out_q_vec,
@@ -405,7 +407,9 @@ namespace Curobo
      const scalar_t *run_weight, const scalar_t *run_vec_weight,
      const scalar_t *offset_waypoint,
      const scalar_t *offset_tstep_fraction,
-      const int32_t *batch_pose_idx, const int mode, const int num_goals,
+      const int32_t *batch_pose_idx,
+      const uint8_t *project_distance_tensor,
+      const int mode, const int num_goals,
      const int batch_size, const int horizon, const bool write_grad = false)
    {
      const int t_idx     = (blockDim.x * blockIdx.x + threadIdx.x);
@@ -416,7 +420,7 @@ namespace Curobo
      {
        return;
      }
-
+      const bool project_distance = project_distance_tensor[0];
      // read current pose:
      float3 position =
        *(float3 *)&current_position[batch_idx * horizon * 3 + h_idx * 3];
@@ -511,7 +515,7 @@ namespace Curobo
        float4 gq4 = *(float4 *)&goal_quat[(offset + k) * 4];
        l_goal_quat = make_float4(gq4.y, gq4.z, gq4.w, gq4.x);

-        compute_pose_distance<project_distance, use_metric>(&distance_vec[0],
+        compute_pose_distance<use_metric>(&distance_vec[0],
                                                            pose_distance,
                                                            position_distance,
                                                            rotation_distance,
@@ -531,7 +535,8 @@ namespace Curobo
                                                            r_w_alpha,
                                                            offset_position,
                                                            offset_rotation,
-                                                            reach_offset);
+                                                            reach_offset,
+                                                            project_distance);

        if (pose_distance <= best_distance)
        {
@@ -657,10 +662,10 @@ pose_distance(torch::Tensor out_distance, torch::Tensor out_position_distance,
              const torch::Tensor offset_waypoint,
              const torch::Tensor offset_tstep_fraction,
              const torch::Tensor batch_pose_idx, // batch_size, 1
+              const torch::Tensor project_distance,
              const int batch_size, const int horizon, const int mode,
              const int num_goals = 1, const bool compute_grad = false,
-              const bool write_distance = true, const bool use_metric = false,
-              const bool project_distance = true)
+              const bool write_distance = true, const bool use_metric = false)
 {
  using namespace Curobo::Pose;

@@ -684,8 +689,7 @@ pose_distance(torch::Tensor out_distance, torch::Tensor out_position_distance,

  cudaStream_t stream = at::cuda::getCurrentCUDAStream();

-  if (project_distance)
-  {
+
    if (use_metric)
    {
      if (write_distance)
@@ -693,7 +697,7 @@ pose_distance(torch::Tensor out_distance, torch::Tensor out_position_distance,
        AT_DISPATCH_FLOATING_TYPES(
          current_position.scalar_type(), "batch_pose_distance", ([&] {
          goalset_pose_distance_kernel
-          <scalar_t, true, true, true><< < blocksPerGrid, threadsPerBlock, 0,
+          <scalar_t, true, true><< < blocksPerGrid, threadsPerBlock, 0,
            stream >> > (
            out_distance.data_ptr<scalar_t>(),
            out_position_distance.data_ptr<scalar_t>(),
@@ -711,7 +715,9 @@ pose_distance(torch::Tensor out_distance, torch::Tensor out_position_distance,
            run_vec_weight.data_ptr<scalar_t>(),
            offset_waypoint.data_ptr<scalar_t>(),
            offset_tstep_fraction.data_ptr<scalar_t>(),
-            batch_pose_idx.data_ptr<int32_t>(), mode, num_goals,
+            batch_pose_idx.data_ptr<int32_t>(),
+            project_distance.data_ptr<uint8_t>(),
+            mode, num_goals,
            batch_size, horizon, compute_grad);
        }));
      }
@@ -720,7 +726,7 @@ pose_distance(torch::Tensor out_distance, torch::Tensor out_position_distance,
        AT_DISPATCH_FLOATING_TYPES(
          current_position.scalar_type(), "batch_pose_distance", ([&] {
          goalset_pose_distance_kernel
-          <scalar_t, false, true, true><< < blocksPerGrid, threadsPerBlock, 0,
+          <scalar_t, false, true><< < blocksPerGrid, threadsPerBlock, 0,
            stream >> > (
            out_distance.data_ptr<scalar_t>(),
            out_position_distance.data_ptr<scalar_t>(),
@@ -738,7 +744,9 @@ pose_distance(torch::Tensor out_distance, torch::Tensor out_position_distance,
            run_vec_weight.data_ptr<scalar_t>(),
            offset_waypoint.data_ptr<scalar_t>(),
            offset_tstep_fraction.data_ptr<scalar_t>(),
-            batch_pose_idx.data_ptr<int32_t>(), mode, num_goals,
+            batch_pose_idx.data_ptr<int32_t>(),
+            project_distance.data_ptr<uint8_t>(),
+            mode, num_goals,
            batch_size, horizon, compute_grad);
        }));
      }
@@ -749,7 +757,7 @@ pose_distance(torch::Tensor out_distance, torch::Tensor out_position_distance,
      {
        AT_DISPATCH_FLOATING_TYPES(
          current_position.scalar_type(), "batch_pose_distance", ([&] {
-          goalset_pose_distance_kernel<scalar_t, true, true, false>
+          goalset_pose_distance_kernel<scalar_t, true, false>
            << < blocksPerGrid, threadsPerBlock, 0, stream >> > (
            out_distance.data_ptr<scalar_t>(),
            out_position_distance.data_ptr<scalar_t>(),
@@ -767,7 +775,9 @@ pose_distance(torch::Tensor out_distance, torch::Tensor out_position_distance,
            run_vec_weight.data_ptr<scalar_t>(),
            offset_waypoint.data_ptr<scalar_t>(),
            offset_tstep_fraction.data_ptr<scalar_t>(),
-            batch_pose_idx.data_ptr<int32_t>(), mode, num_goals,
+            batch_pose_idx.data_ptr<int32_t>(),
+            project_distance.data_ptr<uint8_t>(),
+            mode, num_goals,
            batch_size, horizon, compute_grad);
        }));
      }
@@ -775,7 +785,7 @@ pose_distance(torch::Tensor out_distance, torch::Tensor out_position_distance,
      {
        AT_DISPATCH_FLOATING_TYPES(
          current_position.scalar_type(), "batch_pose_distance", ([&] {
-          goalset_pose_distance_kernel<scalar_t, false, true, false>
+          goalset_pose_distance_kernel<scalar_t, false, false>
            << < blocksPerGrid, threadsPerBlock, 0, stream >> > (
            out_distance.data_ptr<scalar_t>(),
            out_position_distance.data_ptr<scalar_t>(),
@@ -793,127 +803,15 @@ pose_distance(torch::Tensor out_distance, torch::Tensor out_position_distance,
            run_vec_weight.data_ptr<scalar_t>(),
            offset_waypoint.data_ptr<scalar_t>(),
            offset_tstep_fraction.data_ptr<scalar_t>(),
-            batch_pose_idx.data_ptr<int32_t>(), mode, num_goals,
+            batch_pose_idx.data_ptr<int32_t>(),
+            project_distance.data_ptr<uint8_t>(),
+            mode, num_goals,
            batch_size, horizon, compute_grad);
        }));
      }
    }
-  }
-  else
-  {
-    if (use_metric)
-    {
-      if (write_distance)
-      {
-        AT_DISPATCH_FLOATING_TYPES(
-          current_position.scalar_type(), "batch_pose_distance", ([&] {
-          goalset_pose_distance_kernel
-          <scalar_t, true, false, true><< < blocksPerGrid, threadsPerBlock, 0,
-            stream >> > (
-            out_distance.data_ptr<scalar_t>(),
-            out_position_distance.data_ptr<scalar_t>(),
-            out_rotation_distance.data_ptr<scalar_t>(),
-            distance_p_vector.data_ptr<scalar_t>(),
-            distance_q_vector.data_ptr<scalar_t>(),
-            out_gidx.data_ptr<int32_t>(),
-            current_position.data_ptr<scalar_t>(),
-            goal_position.data_ptr<scalar_t>(),
-            current_quat.data_ptr<scalar_t>(),
-            goal_quat.data_ptr<scalar_t>(),
-            vec_weight.data_ptr<scalar_t>(), weight.data_ptr<scalar_t>(),
-            vec_convergence.data_ptr<scalar_t>(),
-            run_weight.data_ptr<scalar_t>(),
-            run_vec_weight.data_ptr<scalar_t>(),
-            offset_waypoint.data_ptr<scalar_t>(),
-            offset_tstep_fraction.data_ptr<scalar_t>(),
-            batch_pose_idx.data_ptr<int32_t>(), mode, num_goals,
-            batch_size, horizon, compute_grad);
-        }));
-      }
-      else
-      {
-        AT_DISPATCH_FLOATING_TYPES(
-          current_position.scalar_type(), "batch_pose_distance", ([&] {
-          goalset_pose_distance_kernel
-          <scalar_t, false, false, true><< < blocksPerGrid, threadsPerBlock, 0,
-            stream >> > (
-            out_distance.data_ptr<scalar_t>(),
-            out_position_distance.data_ptr<scalar_t>(),
-            out_rotation_distance.data_ptr<scalar_t>(),
-            distance_p_vector.data_ptr<scalar_t>(),
-            distance_q_vector.data_ptr<scalar_t>(),
-            out_gidx.data_ptr<int32_t>(),
-            current_position.data_ptr<scalar_t>(),
-            goal_position.data_ptr<scalar_t>(),
-            current_quat.data_ptr<scalar_t>(),
-            goal_quat.data_ptr<scalar_t>(),
-            vec_weight.data_ptr<scalar_t>(), weight.data_ptr<scalar_t>(),
-            vec_convergence.data_ptr<scalar_t>(),
-            run_weight.data_ptr<scalar_t>(),
-            run_vec_weight.data_ptr<scalar_t>(),
-            offset_waypoint.data_ptr<scalar_t>(),
-            offset_tstep_fraction.data_ptr<scalar_t>(),
-            batch_pose_idx.data_ptr<int32_t>(), mode, num_goals,
-            batch_size, horizon, compute_grad);
-        }));
-      }
-    }
-    else
-    {
-      if (write_distance)
-      {
-        AT_DISPATCH_FLOATING_TYPES(
-          current_position.scalar_type(), "batch_pose_distance", ([&] {
-          goalset_pose_distance_kernel<scalar_t, true, false, false>
-            << < blocksPerGrid, threadsPerBlock, 0, stream >> > (
-            out_distance.data_ptr<scalar_t>(),
-            out_position_distance.data_ptr<scalar_t>(),
-            out_rotation_distance.data_ptr<scalar_t>(),
-            distance_p_vector.data_ptr<scalar_t>(),
-            distance_q_vector.data_ptr<scalar_t>(),
-            out_gidx.data_ptr<int32_t>(),
-            current_position.data_ptr<scalar_t>(),
-            goal_position.data_ptr<scalar_t>(),
-            current_quat.data_ptr<scalar_t>(),
-            goal_quat.data_ptr<scalar_t>(),
-            vec_weight.data_ptr<scalar_t>(), weight.data_ptr<scalar_t>(),
-            vec_convergence.data_ptr<scalar_t>(),
-            run_weight.data_ptr<scalar_t>(),
-            run_vec_weight.data_ptr<scalar_t>(),
-            offset_waypoint.data_ptr<scalar_t>(),
-            offset_tstep_fraction.data_ptr<scalar_t>(),
-            batch_pose_idx.data_ptr<int32_t>(), mode, num_goals,
-            batch_size, horizon, compute_grad);
-        }));
-      }
-      else
-      {
-        AT_DISPATCH_FLOATING_TYPES(
-          current_position.scalar_type(), "batch_pose_distance", ([&] {
-          goalset_pose_distance_kernel<scalar_t, false, false, false>
-            << < blocksPerGrid, threadsPerBlock, 0, stream >> > (
-            out_distance.data_ptr<scalar_t>(),
-            out_position_distance.data_ptr<scalar_t>(),
-            out_rotation_distance.data_ptr<scalar_t>(),
-            distance_p_vector.data_ptr<scalar_t>(),
-            distance_q_vector.data_ptr<scalar_t>(),
-            out_gidx.data_ptr<int32_t>(),
-            current_position.data_ptr<scalar_t>(),
-            goal_position.data_ptr<scalar_t>(),
-            current_quat.data_ptr<scalar_t>(),
-            goal_quat.data_ptr<scalar_t>(),
-            vec_weight.data_ptr<scalar_t>(), weight.data_ptr<scalar_t>(),
-            vec_convergence.data_ptr<scalar_t>(),
-            run_weight.data_ptr<scalar_t>(),
-            run_vec_weight.data_ptr<scalar_t>(),
-            offset_waypoint.data_ptr<scalar_t>(),
-            offset_tstep_fraction.data_ptr<scalar_t>(),
-            batch_pose_idx.data_ptr<int32_t>(), mode, num_goals,
-            batch_size, horizon, compute_grad);
-        }));
-      }
-    }
-  }
+
+
  C10_CUDA_KERNEL_LAUNCH_CHECK();

  return { out_distance,      out_position_distance, out_rotation_distance,
--- a/src/curobo/curobolib/cpp/self_collision_kernel.cu
+++ b/src/curobo/curobolib/cpp/self_collision_kernel.cu
@@ -41,7 +41,9 @@ namespace Curobo
      scalar_t *out_distance,        // batch x 1
      scalar_t *out_vec,             // batch x nspheres x 4
      const scalar_t *robot_spheres, // batch x nspheres x 4
-      const scalar_t *collision_threshold, const int batch_size,
+      const scalar_t *offsets,
+      const uint8_t *coll_matrix,
+      const int batch_size,
      const int nspheres, const scalar_t *weight, const bool write_grad = false)
    {
      const int batch_idx = blockDim.x * blockIdx.x + threadIdx.x;
@@ -52,37 +54,35 @@ namespace Curobo
      }
      float  r_diff, distance;
      float  max_penetration = 0;
-      float3 sph1, sph2, dist_vec;
+      float4 sph1, sph2;
      int    sph1_idx = -1;
      int    sph2_idx = -1;

      // iterate over spheres:
      for (int i = 0; i < nspheres; i++)
      {
-        sph1 = *(float3 *)&robot_spheres[batch_idx * nspheres * 4 + i * 4];
+        sph1 = *(float4 *)&robot_spheres[batch_idx * nspheres * 4 + i * 4];
+        sph1.w += offsets[i];

        for (int j = i + 1; j < nspheres; j++)
        {
-          r_diff = collision_threshold[i * nspheres + j];
-
-          if (isinf(r_diff))
+          if(coll_matrix[i * nspheres + j] == 1)
          {
-            continue;
-          }
-          sph2 = *(float3 *)&robot_spheres[batch_idx * nspheres * 4 + j * 4];
+            sph2 = *(float4 *)&robot_spheres[batch_idx * nspheres * 4 + j * 4];
+            sph2.w += offsets[j];

-          // compute sphere distance:
-          distance = relu(r_diff - length(sph1 - sph2));
+            // compute sphere distance:
+            r_diff = sph1.w + sph2.w;
+            float d      = sqrt((sph1.x - sph2.x) * (sph1.x - sph2.x) +
+                                  (sph1.y - sph2.y) * (sph1.y - sph2.y) +
+                                  (sph1.z - sph2.z) * (sph1.z - sph2.z));
+            distance = (r_diff - d);

-          if (distance > max_penetration)
-          {
-            max_penetration = distance;
-            sph1_idx        = i;
-            sph2_idx        = j;
-
-            if (write_grad)
+            if (distance > max_penetration)
            {
-              dist_vec = normalize(sph1 - sph2);// / distance;
+              max_penetration = distance;
+              sph1_idx        = i;
+              sph2_idx        = j;
            }
          }
        }
@@ -95,6 +95,11 @@ namespace Curobo

        if (write_grad)
        {
+          float3 sph1_g =
+              *(float3 *)&robot_spheres[4 * (batch_idx * nspheres + sph1_idx)];
+          float3 sph2_g =
+              *(float3 *)&robot_spheres[4 * (batch_idx * nspheres + sph2_idx)];
+            float3 dist_vec = normalize(sph1_g - sph2_g);
          *(float3 *)&out_vec[batch_idx * nspheres * 4 + sph1_idx * 4] =
            weight[0] * -1 * dist_vec;
          *(float3 *)&out_vec[batch_idx * nspheres * 4 + sph2_idx * 4] =
@@ -131,7 +136,7 @@ namespace Curobo
      int i         = ndpt * (warp_idx / nwpr); // starting row number for this warp
      int j         = (warp_idx % nwpr) * 32;   // starting column number for this warp

-      dist_t max_d = {0.0, 0.0, 0.0 };// .d, .i, .j
+      dist_t max_d = { 0.0, 0, 0 };// .d, .i, .j
      __shared__ dist_t max_darr[32];

      // Optimization: About 1/3 of the warps will have no work.
@@ -354,7 +359,7 @@ namespace Curobo
      // in registers (max_d).
      // Each thread computes upto ndpt distances.
      //////////////////////////////////////////////////////
-      dist_t  max_d[NBPB] = {{ 0.0, 0.0, 0.0}};
+      dist_t  max_d[NBPB] = {{ 0.0, 0, 0}};
      int16_t indices[ndpt * 2];

      for (uint8_t i = 0; i < ndpt * 2; i++)
@@ -698,7 +703,7 @@ std::vector<torch::Tensor>self_collision_distance(
    }
    else
    {
-      assert(false); // only ndpt of 32 or 64 is currently supported.
+      assert(false);
    }
  }

@@ -713,6 +718,8 @@ std::vector<torch::Tensor>self_collision_distance(
    assert(collision_matrix.size(0) == nspheres * nspheres);
    int smemSize = nspheres * sizeof(float4);

+    if (nspheres < 1024 && threadsPerBlock < 1024)
+    {

    AT_DISPATCH_FLOATING_TYPES(
      robot_spheres.scalar_type(), "self_collision_distance", ([&] {
@@ -726,6 +733,30 @@ std::vector<torch::Tensor>self_collision_distance(
        ndpt_n, nwpr, weight.data_ptr<scalar_t>(),
        sparse_index.data_ptr<uint8_t>(), compute_grad);
    }));
+    }
+    else
+    {
+      threadsPerBlock = batch_size;
+      if (threadsPerBlock > 128)
+      {
+        threadsPerBlock = 128;
+      }
+      blocksPerGrid = (batch_size + threadsPerBlock - 1) / threadsPerBlock;
+
+      AT_DISPATCH_FLOATING_TYPES(
+      robot_spheres.scalar_type(), "self_collision_distance", ([&] {
+      self_collision_distance_kernel<scalar_t>
+        << < blocksPerGrid, threadsPerBlock, smemSize, stream >> > (
+        out_distance.data_ptr<scalar_t>(),
+        out_vec.data_ptr<scalar_t>(),
+        robot_spheres.data_ptr<scalar_t>(),
+        collision_offset.data_ptr<scalar_t>(),
+        collision_matrix.data_ptr<uint8_t>(),
+        batch_size, nspheres,
+        weight.data_ptr<scalar_t>(),
+        compute_grad);
+    }));
+    }
  }
  C10_CUDA_KERNEL_LAUNCH_CHECK();

--- a/src/curobo/curobolib/cpp/sphere_obb_kernel.cu
+++ b/src/curobo/curobolib/cpp/sphere_obb_kernel.cu
@@ -52,6 +52,28 @@ namespace Curobo
      return max(0.0f, sphere_length(v1, v2) - v1.w - v2.w);
    }

+    __device__  __forceinline__ int3 robust_floor(const float3 f_grid, const float threshold=1e-04)
+    {
+      float3 nearest_grid = make_float3(round(f_grid.x), round(f_grid.y), round(f_grid.z));
+
+      float3 abs_diff = (f_grid - nearest_grid);
+
+      if (abs_diff.x >= threshold)
+      {
+        nearest_grid.x = floorf(f_grid.x);
+      }
+      if (abs_diff.y >= threshold)
+      {
+        nearest_grid.y = floorf(f_grid.y);
+      }
+
+      if (abs_diff.z >= threshold)
+      {
+        nearest_grid.z = floorf(f_grid.z);
+      }
+      return make_int3(nearest_grid);
+
+    }

    #if CHECK_FP8
    __device__ __forceinline__ float
@@ -487,21 +509,20 @@ namespace Curobo
      delta = make_float3(pt.x - sphere.x, pt.y - sphere.y, pt.z - sphere.z);

      distance = length(delta);
-
-      if (!inside)
+      if (distance == 0.0)
+      {
+        delta = -1.0 * make_float3(pt.x, pt.y, pt.z);
+      }
+      if (!inside) // outside
      {
        distance *= -1.0;
      }
-      else
+      else // inside
      {
        delta = -1 * delta;
      }

-      if (distance != 0.0)
-      {
-        delta = normalize(delta);
-
-      }
+      delta = normalize(delta);
      sph_distance = distance + sphere.w;
      //

@@ -685,29 +706,48 @@ float4 &sum_pt)

    template<typename grid_scalar_t, bool INTERPOLATION=false>
    __device__ __forceinline__ void
-    compute_voxel_location_params(
+    compute_voxel_index(
    const grid_scalar_t *grid_features,
    const float4& loc_grid_params,
    const float4& loc_sphere,
+    int &voxel_idx,
    int3 &xyz_loc,
    int3 &xyz_grid,
-    float &interpolated_distance,
-    int &voxel_idx)
+    float &interpolated_distance)
    {


-      // convert location to index: can use floor to cast to int.
-      // to account for negative values, add 0.5 * bounds.
-      const float3 loc_grid = make_float3(loc_grid_params.x, loc_grid_params.y, loc_grid_params.z);
+
+
+      const float3 loc_grid = make_float3(loc_grid_params.x, loc_grid_params.y, loc_grid_params.z);// - loc_grid_params.w;
      const  float3 sphere = make_float3(loc_sphere.x, loc_sphere.y, loc_sphere.z);
-      //xyz_loc = make_int3(floorf((sphere  + 0.5 * loc_grid) / loc_grid_params.w));
-      const float inv_voxel_size = 1.0/loc_grid_params.w;
-      //xyz_loc = make_int3(sphere * inv_voxel_size)  + make_int3(0.5 * loc_grid * inv_voxel_size);
+      const float inv_voxel_size = 1.0f / loc_grid_params.w;
+
+      float3 f_grid = (loc_grid) * inv_voxel_size;
+
+
+      xyz_grid = robust_floor(f_grid) + 1;
+
+
+      xyz_loc = make_int3(((sphere.x + 0.5f * loc_grid.x) * inv_voxel_size),
+                          ((sphere.y + 0.5f * loc_grid.y)* inv_voxel_size),
+                          ((sphere.z + 0.5f * loc_grid.z) * inv_voxel_size));
+
+
+      // check grid bounds:
+      // 2 to catch numerical precision errors. 1 can be used when exact.
+      // We need at least 1 as we
+      // look at neighbouring voxels for finite difference
+      const int offset = 2;
+      if (xyz_loc.x >= xyz_grid.x - offset || xyz_loc.y >= xyz_grid.y - offset || xyz_loc.z >= xyz_grid.z - offset
+          || xyz_loc.x <= offset || xyz_loc.y <= offset || xyz_loc.z <= offset
+      )
+      {
+        voxel_idx = -1;
+        return;
+      }

-      xyz_loc = make_int3((sphere  + 0.5 * loc_grid) * inv_voxel_size);

-      //xyz_loc = make_int3(sphere / loc_grid_params.w) + make_int3(floorf(0.5 * loc_grid/loc_grid_params.w));
-      xyz_grid = make_int3((loc_grid * inv_voxel_size)) + 1;

      // find next nearest voxel to current point and then do weighted interpolation:
      voxel_idx = xyz_loc.x * xyz_grid.y * xyz_grid.z  + xyz_loc.y * xyz_grid.z + xyz_loc.z;
@@ -715,10 +755,6 @@ float4 &sum_pt)

      // compute interpolation distance between voxel origin and sphere location:
      get_array_value(grid_features, voxel_idx, interpolated_distance);
-      if(!INTERPOLATION)
-      {
-        interpolated_distance += 0.5 * loc_grid_params.w;//max(0.0, (0.3 * loc_grid_params.w) - loc_sphere.w);
-      }
      if(INTERPOLATION)
      {
      //
@@ -739,41 +775,6 @@ float4 &sum_pt)



-    }
-
-    template<typename grid_scalar_t>
-    __device__ __forceinline__ void
-    compute_voxel_index(
-    const grid_scalar_t *grid_features,
-    const float4& loc_grid_params,
-    const float4& loc_sphere,
-    int &voxel_idx,
-    int3 &xyz_loc,
-    int3 &xyz_grid,
-    float &interpolated_distance)
-    {
-      // check if sphere is out of bounds
-      // loc_grid_params.x contains bounds
-      float4 local_bounds = 0.5*loc_grid_params - 2*loc_grid_params.w;
-
-      if (loc_sphere.x <= -1 * (local_bounds.x) ||
-          loc_sphere.x >= (local_bounds.x) ||
-          loc_sphere.y <= -1 * (local_bounds.y) ||
-          loc_sphere.y >=  (local_bounds.y) ||
-          loc_sphere.z <= -1 * (local_bounds.z) ||
-          loc_sphere.z >=  (local_bounds.z))
-      {
-        voxel_idx = -1;
-        return;
-      }
-
-      compute_voxel_location_params(grid_features, loc_grid_params, loc_sphere, xyz_loc, xyz_grid, interpolated_distance, voxel_idx);
-      // convert location to index: can use floor to cast to int.
-      // to account for negative values, add 0.5 * bounds.
-
-
-
-

    }

@@ -979,7 +980,7 @@ float4 &sum_pt)
      // Load sphere_position input
      float4 sphere_cache = *(float4 *)&sphere_position[bn_sph_idx * 4];

-      if (sphere_cache.w <= 0.0)
+      if (sphere_cache.w < 0.0)
      {
        // write zeros for cost:
        out_distance[bn_sph_idx] = 0;
@@ -1044,7 +1045,7 @@ float4 &sum_pt)
      // Load sphere_position input
      float4 sphere_cache = *(float4 *)&sphere_position[bn_sph_idx * 4];

-      if (sphere_cache.w <= 0.0)
+      if (sphere_cache.w < 0.0)
      {
        // write zeros for cost:
        out_distance[bn_sph_idx] = 0;
@@ -1173,7 +1174,7 @@ float4 &sum_pt)

      // Load sphere_position input
      float4 sphere_cache = *(float4 *)&sphere_position[bn_sph_idx * 4];
-      if (sphere_cache.w <= 0.0)
+      if (sphere_cache.w < 0.0)
      {
        // write zeros for cost:
        out_distance[bn_sph_idx] = 0;
@@ -1275,7 +1276,7 @@ float4 &sum_pt)
      // Load sphere_position input
      float4 sphere_cache = *(float4 *)&sphere_position[bn_sph_idx * 4];

-      if (sphere_cache.w <= 0.0)
+      if (sphere_cache.w < 0.0)
      {
        // write zeros for cost:
        out_distance[bn_sph_idx] = 0;
@@ -1452,7 +1453,7 @@ float4 &sum_pt)
      // Load sphere_position input
      float4 sphere_1_cache = *(float4 *)&sphere_position[bhs_idx * 4];

-      if (sphere_1_cache.w <= 0.0)
+      if (sphere_1_cache.w < 0.0)
      {
        // write zeros for cost:
        out_distance[bhs_idx] = 0;
@@ -1888,7 +1889,7 @@ float4 &sum_pt)
      // Load sphere_position input
      float4 sphere_cache = *(float4 *)&sphere_position[bn_sph_idx * 4];

-      if (sphere_cache.w <= 0.0)
+      if (sphere_cache.w < 0.0)
      {
        // write zeros for cost:
        out_distance[bn_sph_idx] = 0;
@@ -2001,7 +2002,7 @@ float4 &sum_pt)
      float4 sphere_1_cache = *(float4 *)&sphere_position[bhs_idx * 4];


-      if (sphere_1_cache.w <= 0.0)
+      if (sphere_1_cache.w < 0.0)
      {
        // write zeros for cost:
        out_distance[bhs_idx] = 0;
@@ -2303,7 +2304,7 @@ float4 &sum_pt)
      // if h_idx == horizon -1, we just read the same index
      float4 sphere_1_cache = *(float4 *)&sphere_position[bhs_idx * 4];

-      if (sphere_1_cache.w <= 0.0)
+      if (sphere_1_cache.w < 0.0)
      {
        out_distance[b_addrs + h_idx * nspheres + sph_idx] = 0.0;
        return;
--- a/src/curobo/curobolib/geom.py
+++ b/src/curobo/curobolib/geom.py
@@ -157,6 +157,7 @@ def get_pose_distance(
    offset_waypoint,
    offset_tstep_fraction,
    batch_pose_idx,
+    project_distance,
    batch_size,
    horizon,
    mode=1,
@@ -164,7 +165,6 @@ def get_pose_distance(
    write_grad=False,
    write_distance=False,
    use_metric=False,
-    project_distance=True,
 ):
    if batch_pose_idx.shape[0] != batch_size:
        raise ValueError("Index buffer size is different from batch size")
@@ -188,6 +188,7 @@ def get_pose_distance(
        offset_waypoint,
        offset_tstep_fraction,
        batch_pose_idx,
+        project_distance,
        batch_size,
        horizon,
        mode,
@@ -195,7 +196,6 @@ def get_pose_distance(
        write_grad,
        write_distance,
        use_metric,
-        project_distance,
    )

    out_distance = r[0]
@@ -272,6 +272,7 @@ class PoseErrorDistance(torch.autograd.Function):
        offset_waypoint,
        offset_tstep_fraction,
        batch_pose_idx,
+        project_distance,
        out_distance,
        out_position_distance,
        out_rotation_distance,
@@ -284,8 +285,7 @@ class PoseErrorDistance(torch.autograd.Function):
        horizon,
        mode,  # =PoseErrorType.BATCH_GOAL.value,
        num_goals,
-        use_metric,  # =False,
-        project_distance,  # =True,
+        use_metric,
    ):
        # out_distance = current_position[..., 0].detach().clone() * 0.0
        # out_position_distance = out_distance.detach().clone()
@@ -322,6 +322,7 @@ class PoseErrorDistance(torch.autograd.Function):
            offset_waypoint,
            offset_tstep_fraction,
            batch_pose_idx,
+            project_distance,
            batch_size,
            horizon,
            mode,
@@ -329,7 +330,6 @@ class PoseErrorDistance(torch.autograd.Function):
            current_position.requires_grad,
            True,
            use_metric,
-            project_distance,
        )
        ctx.save_for_backward(out_p_vec, out_r_vec, weight, out_p_grad, out_q_grad)
        return out_distance, out_position_distance, out_rotation_distance, out_idx  # .view(-1,1)
@@ -406,6 +406,7 @@ class PoseError(torch.autograd.Function):
        offset_waypoint,
        offset_tstep_fraction,
        batch_pose_idx,
+        project_distance,
        out_distance,
        out_position_distance,
        out_rotation_distance,
@@ -419,7 +420,6 @@ class PoseError(torch.autograd.Function):
        mode,
        num_goals,
        use_metric,
-        project_distance,
        return_loss,
    ):
        """Compute error in pose
@@ -494,6 +494,7 @@ class PoseError(torch.autograd.Function):
            offset_waypoint,
            offset_tstep_fraction,
            batch_pose_idx,
+            project_distance,
            batch_size,
            horizon,
            mode,
@@ -501,7 +502,6 @@ class PoseError(torch.autograd.Function):
            current_position.requires_grad,
            False,
            use_metric,
-            project_distance,
        )
        ctx.save_for_backward(out_p_vec, out_r_vec)
        return out_distance
--- a/src/curobo/curobolib/kinematics.py
+++ b/src/curobo/curobolib/kinematics.py
@@ -113,7 +113,6 @@ class KinematicsFusedFunction(Function):
    @staticmethod
    def backward(ctx, grad_out_link_pos, grad_out_link_quat, grad_out_spheres):
        grad_joint = None
-
        if ctx.needs_input_grad[4]:
            (
                joint_seq,
@@ -193,10 +192,14 @@ class KinematicsFusedFunction(Function):
        b_size = b_shape[0]
        n_spheres = robot_sphere_out.shape[1]
        n_joints = angle.shape[-1]
-        if grad_out.is_contiguous():
-            grad_out = grad_out.view(-1)
-        else:
-            grad_out = grad_out.reshape(-1)
+        grad_out = grad_out.contiguous()
+        link_pos_out = link_pos_out.contiguous()
+        link_quat_out = link_quat_out.contiguous()
+        # if grad_out.is_contiguous():
+        #    grad_out = grad_out.view(-1)
+        # else:
+        #    grad_out = grad_out.reshape(-1)
+
        r = kinematics_fused_cu.backward(
            grad_out,
            link_pos_out,
--- a/src/curobo/curobolib/ls.py
+++ b/src/curobo/curobolib/ls.py
@@ -58,7 +58,6 @@ def wolfe_line_search(
    l1 = g_x.shape[1]
    l2 = g_x.shape[2]
    r = line_search_cu.line_search(
-        # m_idx,
        best_x,
        best_c,
        best_grad,
@@ -76,7 +75,6 @@ def wolfe_line_search(
        l2,
        batchsize,
    )
-    # print("batchsize:" + str(batchsize))
    return (r[0], r[1], r[2])