Significantly improved convergence for mesh and cuboid, new ESDF collision.

2024-03-18 11:19:48 -07:00
parent 286b3820a5
commit b1f63e8778
100 changed files with 7587 additions and 2589 deletions
--- a/src/curobo/curobolib/cpp/geom_cuda.cpp
+++ b/src/curobo/curobolib/cpp/geom_cuda.cpp
@@ -57,7 +57,8 @@ std::vector<torch::Tensor>swept_sphere_obb_clpt(
  const bool          enable_speed_metric,
  const bool          transform_back,
  const bool          compute_distance,
-  const bool          use_batch_env);
+  const bool          use_batch_env,
+  const bool          sum_collisions);

 std::vector<torch::Tensor>
 sphere_obb_clpt(const torch::Tensor sphere_position, // batch_size, 4
@@ -66,6 +67,7 @@ sphere_obb_clpt(const torch::Tensor sphere_position, // batch_size, 4
                torch::Tensor       sparsity_idx,
                const torch::Tensor weight,
                const torch::Tensor activation_distance,
+                  const torch::Tensor max_distance,
                const torch::Tensor obb_accel,     // n_boxes, 4, 4
                const torch::Tensor obb_bounds,    // n_boxes, 3
                const torch::Tensor obb_pose,      // n_boxes, 4, 4
@@ -78,8 +80,52 @@ sphere_obb_clpt(const torch::Tensor sphere_position, // batch_size, 4
                const int           n_spheres,
                const bool          transform_back,
                const bool          compute_distance,
-                const bool          use_batch_env);
+                const bool          use_batch_env,
+                const bool          sum_collisions,
+                const bool          compute_esdf);
+std::vector<torch::Tensor>
+sphere_voxel_clpt(const torch::Tensor sphere_position, // batch_size, 3
+                torch::Tensor distance,
+                torch::Tensor closest_point,         // batch size, 3
+                torch::Tensor sparsity_idx, const torch::Tensor weight,
+                const torch::Tensor activation_distance,
+                const torch::Tensor max_distance,
+                const torch::Tensor grid_features,       // n_boxes, 4, 4
+                const torch::Tensor grid_params,      // n_boxes, 3
+                const torch::Tensor grid_pose,        // n_boxes, 4, 4
+                const torch::Tensor grid_enable,      // n_boxes, 4, 4
+                const torch::Tensor n_env_grid,
+                const torch::Tensor env_query_idx,   // n_boxes, 4, 4
+                const int max_nobs, const int batch_size, const int horizon,
+                const int n_spheres, const bool transform_back,
+                const bool compute_distance, const bool use_batch_env,
+                const bool sum_collisions,
+                const bool compute_esdf);

+std::vector<torch::Tensor>
+swept_sphere_voxel_clpt(const torch::Tensor sphere_position, // batch_size, 3
+                torch::Tensor distance,
+                torch::Tensor closest_point,         // batch size, 3
+                torch::Tensor sparsity_idx, const torch::Tensor weight,
+                const torch::Tensor activation_distance,
+                const torch::Tensor max_distance,
+                const torch::Tensor speed_dt, 
+                const torch::Tensor grid_features,       // n_boxes, 4, 4
+                const torch::Tensor grid_params,      // n_boxes, 3
+                const torch::Tensor grid_pose,        // n_boxes, 4, 4
+                const torch::Tensor grid_enable,      // n_boxes, 4, 4
+                const torch::Tensor n_env_grid,
+                const torch::Tensor env_query_idx,   // n_boxes, 4, 4
+                const int max_nobs, 
+                const int batch_size, 
+                const int horizon,
+                const int n_spheres, 
+                const int sweep_steps,
+                const bool enable_speed_metric,
+                const bool transform_back,
+                const bool compute_distance, 
+                const bool use_batch_env,
+                const bool sum_collisions);
 std::vector<torch::Tensor>pose_distance(
  torch::Tensor       out_distance,
  torch::Tensor       out_position_distance,
@@ -159,11 +205,11 @@ std::vector<torch::Tensor>self_collision_distance_wrapper(

 std::vector<torch::Tensor>sphere_obb_clpt_wrapper(
  const torch::Tensor sphere_position, // batch_size, 4
-
  torch::Tensor distance,
  torch::Tensor closest_point,         // batch size, 3
  torch::Tensor sparsity_idx, const torch::Tensor weight,
  const torch::Tensor activation_distance,
+  const torch::Tensor max_distance, 
  const torch::Tensor obb_accel,       // n_boxes, 4, 4
  const torch::Tensor obb_bounds,      // n_boxes, 3
  const torch::Tensor obb_pose,        // n_boxes, 4, 4
@@ -171,8 +217,10 @@ std::vector<torch::Tensor>sphere_obb_clpt_wrapper(
  const torch::Tensor n_env_obb,       // n_boxes, 4, 4
  const torch::Tensor env_query_idx,   // n_boxes, 4, 4
  const int max_nobs, const int batch_size, const int horizon,
-  const int n_spheres, const bool transform_back, const bool compute_distance,
-  const bool use_batch_env)
+  const int n_spheres, 
+  const bool transform_back, const bool compute_distance,
+  const bool use_batch_env, const bool sum_collisions = true, 
+  const bool compute_esdf = false)
 {
  const at::cuda::OptionalCUDAGuard guard(sphere_position.device());

@@ -185,9 +233,9 @@ std::vector<torch::Tensor>sphere_obb_clpt_wrapper(
  CHECK_INPUT(obb_accel);
  return sphere_obb_clpt(
    sphere_position, distance, closest_point, sparsity_idx, weight,
-    activation_distance, obb_accel, obb_bounds, obb_pose, obb_enable,
+    activation_distance, max_distance, obb_accel, obb_bounds, obb_pose, obb_enable,
    n_env_obb, env_query_idx, max_nobs, batch_size, horizon, n_spheres,
-    transform_back, compute_distance, use_batch_env);
+    transform_back, compute_distance, use_batch_env, sum_collisions, compute_esdf);
 }

 std::vector<torch::Tensor>swept_sphere_obb_clpt_wrapper(
@@ -205,7 +253,7 @@ std::vector<torch::Tensor>swept_sphere_obb_clpt_wrapper(
  const int max_nobs, const int batch_size, const int horizon,
  const int n_spheres, const int sweep_steps, const bool enable_speed_metric,
  const bool transform_back, const bool compute_distance,
-  const bool use_batch_env)
+  const bool use_batch_env, const bool sum_collisions = true)
 {
  const at::cuda::OptionalCUDAGuard guard(sphere_position.device());

@@ -218,7 +266,37 @@ std::vector<torch::Tensor>swept_sphere_obb_clpt_wrapper(
    distance, closest_point, sparsity_idx, weight, activation_distance,
    speed_dt, obb_accel, obb_bounds, obb_pose, obb_enable, n_env_obb,
    env_query_idx, max_nobs, batch_size, horizon, n_spheres, sweep_steps,
-    enable_speed_metric, transform_back, compute_distance, use_batch_env);
+    enable_speed_metric, transform_back, compute_distance, use_batch_env, sum_collisions);
+}
+
+std::vector<torch::Tensor>
+sphere_voxel_clpt_wrapper(const torch::Tensor sphere_position, // batch_size, 3
+                torch::Tensor distance,
+                torch::Tensor closest_point,         // batch size, 3
+                torch::Tensor sparsity_idx, const torch::Tensor weight,
+                const torch::Tensor activation_distance,
+                const torch::Tensor max_distance,
+                const torch::Tensor grid_features,       // n_boxes, 4, 4
+                const torch::Tensor grid_params,      // n_boxes, 3
+                const torch::Tensor grid_pose,        // n_boxes, 4, 4
+                const torch::Tensor grid_enable,      // n_boxes, 4, 4
+                const torch::Tensor n_env_grid,
+                const torch::Tensor env_query_idx,   // n_boxes, 4, 4
+                const int max_ngrid, const int batch_size, const int horizon,
+                const int n_spheres, const bool transform_back,
+                const bool compute_distance, const bool use_batch_env,
+                const bool sum_collisions,
+                const bool compute_esdf)
+{
+    const at::cuda::OptionalCUDAGuard guard(sphere_position.device());
+
+  CHECK_INPUT(distance);
+  CHECK_INPUT(closest_point);
+  CHECK_INPUT(sphere_position);
+  return sphere_voxel_clpt(sphere_position, distance, closest_point, sparsity_idx, weight,
+  activation_distance, max_distance, grid_features, grid_params,
+  grid_pose, grid_enable, n_env_grid, env_query_idx, max_ngrid, batch_size, horizon, n_spheres,
+  transform_back, compute_distance, use_batch_env, sum_collisions, compute_esdf);
 }

 std::vector<torch::Tensor>pose_distance_wrapper(
@@ -297,6 +375,12 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
        "Closest Point OBB(curobolib)");
  m.def("swept_closest_point",     &swept_sphere_obb_clpt_wrapper,
        "Swept Closest Point OBB(curobolib)");
+  m.def("closest_point_voxel",           &sphere_voxel_clpt_wrapper,
+        "Closest Point Voxel(curobolib)");
+  m.def("swept_closest_point_voxel",           &swept_sphere_voxel_clpt,
+        "Swept Closest Point Voxel(curobolib)");
+  
+  

  m.def("self_collision_distance", &self_collision_distance_wrapper,
        "Self Collision Distance (curobolib)");
--- a/src/curobo/curobolib/cpp/lbfgs_step_cuda.cpp
+++ b/src/curobo/curobolib/cpp/lbfgs_step_cuda.cpp
@@ -14,38 +14,6 @@

 #include <c10/cuda/CUDAGuard.h>

-// CUDA forward declarations
-std::vector<torch::Tensor>reduce_cuda(torch::Tensor vec,
-                                      torch::Tensor vec2,
-                                      torch::Tensor rho_buffer,
-                                      torch::Tensor sum,
-                                      const int     batch_size,
-                                      const int     v_dim,
-                                      const int     m);
-
-std::vector<torch::Tensor>
-lbfgs_step_cuda(torch::Tensor step_vec,
-                torch::Tensor rho_buffer,
-                torch::Tensor y_buffer,
-                torch::Tensor s_buffer,
-                torch::Tensor grad_q,
-                const float   epsilon,
-                const int     batch_size,
-                const int     m,
-                const int     v_dim);
-
-std::vector<torch::Tensor>
-lbfgs_update_cuda(torch::Tensor rho_buffer,
-                  torch::Tensor y_buffer,
-                  torch::Tensor s_buffer,
-                  torch::Tensor q,
-                  torch::Tensor grad_q,
-                  torch::Tensor x_0,
-                  torch::Tensor grad_0,
-                  const int     batch_size,
-                  const int     m,
-                  const int     v_dim);
-
 std::vector<torch::Tensor>
 lbfgs_cuda_fuse(torch::Tensor step_vec,
                torch::Tensor rho_buffer,
@@ -59,7 +27,8 @@ lbfgs_cuda_fuse(torch::Tensor step_vec,
                const int     batch_size,
                const int     m,
                const int     v_dim,
-                const bool    stable_mode);
+                const bool    stable_mode,
+                const bool use_shared_buffers);

 // C++ interface

@@ -71,58 +40,12 @@ lbfgs_cuda_fuse(torch::Tensor step_vec,
  CHECK_CUDA(x);       \
  CHECK_CONTIGUOUS(x)

-std::vector<torch::Tensor>
-lbfgs_step_call(torch::Tensor step_vec, torch::Tensor rho_buffer,
-                torch::Tensor y_buffer, torch::Tensor s_buffer,
-                torch::Tensor grad_q, const float epsilon, const int batch_size,
-                const int m, const int v_dim)
-{
-  CHECK_INPUT(step_vec);
-  CHECK_INPUT(rho_buffer);
-  CHECK_INPUT(y_buffer);
-  CHECK_INPUT(s_buffer);
-  CHECK_INPUT(grad_q);
-  const at::cuda::OptionalCUDAGuard guard(grad_q.device());
-
-  return lbfgs_step_cuda(step_vec, rho_buffer, y_buffer, s_buffer, grad_q,
-                         epsilon, batch_size, m, v_dim);
-}
-
-std::vector<torch::Tensor>
-lbfgs_update_call(torch::Tensor rho_buffer, torch::Tensor y_buffer,
-                  torch::Tensor s_buffer, torch::Tensor q, torch::Tensor grad_q,
-                  torch::Tensor x_0, torch::Tensor grad_0, const int batch_size,
-                  const int m, const int v_dim)
-{
-  CHECK_INPUT(rho_buffer);
-  CHECK_INPUT(y_buffer);
-  CHECK_INPUT(s_buffer);
-  CHECK_INPUT(grad_q);
-  CHECK_INPUT(x_0);
-  CHECK_INPUT(grad_0);
-  CHECK_INPUT(q);
-  const at::cuda::OptionalCUDAGuard guard(grad_q.device());
-
-  return lbfgs_update_cuda(rho_buffer, y_buffer, s_buffer, q, grad_q, x_0,
-                           grad_0, batch_size, m, v_dim);
-}
-
-std::vector<torch::Tensor>
-reduce_cuda_call(torch::Tensor vec, torch::Tensor vec2,
-                 torch::Tensor rho_buffer, torch::Tensor sum,
-                 const int batch_size, const int v_dim, const int m)
-{
-  const at::cuda::OptionalCUDAGuard guard(sum.device());
-
-  return reduce_cuda(vec, vec2, rho_buffer, sum, batch_size, v_dim, m);
-}
-
 std::vector<torch::Tensor>
 lbfgs_call(torch::Tensor step_vec, torch::Tensor rho_buffer,
           torch::Tensor y_buffer, torch::Tensor s_buffer, torch::Tensor q,
           torch::Tensor grad_q, torch::Tensor x_0, torch::Tensor grad_0,
           const float epsilon, const int batch_size, const int m,
-           const int v_dim, const bool stable_mode)
+           const int v_dim, const bool stable_mode, const bool use_shared_buffers)
 {
  CHECK_INPUT(step_vec);
  CHECK_INPUT(rho_buffer);
@@ -136,13 +59,11 @@ lbfgs_call(torch::Tensor step_vec, torch::Tensor rho_buffer,

  return lbfgs_cuda_fuse(step_vec, rho_buffer, y_buffer, s_buffer, q, grad_q,
                         x_0, grad_0, epsilon, batch_size, m, v_dim,
-                         stable_mode);
+                         stable_mode, use_shared_buffers);
 }

 PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
 {
-  m.def("step",         &lbfgs_step_call,   "L-BFGS step (CUDA)");
-  m.def("update",       &lbfgs_update_call, "L-BFGS Update (CUDA)");
+ 
  m.def("forward",      &lbfgs_call,        "L-BFGS Update + Step (CUDA)");
-  m.def("debug_reduce", &reduce_cuda_call,  "L-BFGS Debug");
 }
--- a/src/curobo/curobolib/cpp/lbfgs_step_kernel.cu
+++ b/src/curobo/curobolib/cpp/lbfgs_step_kernel.cu
--- a/src/curobo/curobolib/cpp/line_search_kernel.cu
+++ b/src/curobo/curobolib/cpp/line_search_kernel.cu
@@ -9,7 +9,6 @@
 * its affiliates is strictly prohibited.
 */

-#pragma once
 #include <cuda.h>
 #include <torch/extension.h>
 #include <vector>
--- a/src/curobo/curobolib/cpp/self_collision_kernel.cu
+++ b/src/curobo/curobolib/cpp/self_collision_kernel.cu
@@ -185,11 +185,11 @@ namespace Curobo
            if (coll_matrix[i * nspheres + j] == 1)
            {
              float4 sph1 = __rs_shared[i];
-
-              if ((sph1.w <= 0.0) || (sph2.w <= 0.0))
-              {
-                continue;
-              }
+              //
+              //if ((sph1.w <= 0.0) || (sph2.w <= 0.0))
+              //{
+              //  continue;
+              //}
              float r_diff = sph1.w + sph2.w;
              float d      = sqrt((sph1.x - sph2.x) * (sph1.x - sph2.x) +
                                  (sph1.y - sph2.y) * (sph1.y - sph2.y) +
@@ -380,10 +380,10 @@ namespace Curobo
          float4 sph1 = __rs_shared[NBPB * i + l];
          float4 sph2 = __rs_shared[NBPB * j + l];

-          if ((sph1.w <= 0.0) || (sph2.w <= 0.0))
-          {
-            continue;
-          }
+          //if ((sph1.w <= 0.0) || (sph2.w <= 0.0))
+          //{
+          //  continue;
+          //}
          float r_diff =
            sph1.w + sph2.w; // sum of two radii, radii include respective offsets
          float d = sqrt((sph1.x - sph2.x) * (sph1.x - sph2.x) +
--- a/src/curobo/curobolib/cpp/sphere_obb_kernel.cu
+++ b/src/curobo/curobolib/cpp/sphere_obb_kernel.cu
--- a/src/curobo/curobolib/cpp/tensor_step_kernel.cu
+++ b/src/curobo/curobolib/cpp/tensor_step_kernel.cu
@@ -342,10 +342,8 @@ namespace Curobo
      float out_pos = 0.0, out_vel = 0.0, out_acc = 0.0, out_jerk = 0.0;
      float st_pos = 0.0, st_vel = 0.0, st_acc = 0.0;

-      const int   b_addrs        = b_idx * horizon * dof;
      const int   b_addrs_action = b_idx * (horizon - 4) * dof;
      float       in_pos[5]; // create a 5 value scalar
-      const float acc_scale = 1.0;

  #pragma unroll 5

--- a/src/curobo/curobolib/geom.py
+++ b/src/curobo/curobolib/geom.py
@@ -13,6 +13,7 @@ import torch

 # CuRobo
 from curobo.util.logger import log_warn
+from curobo.util.torch_utils import get_torch_jit_decorator

 try:
    # CuRobo
@@ -235,7 +236,7 @@ def get_pose_distance_backward(
    return r[0], r[1]


-@torch.jit.script
+@get_torch_jit_decorator()
 def backward_PoseError_jit(grad_g_dist, grad_out_distance, weight, g_vec):
    grad_vec = grad_g_dist + (grad_out_distance * weight)
    grad = 1.0 * (grad_vec).unsqueeze(-1) * g_vec
@@ -243,7 +244,7 @@ def backward_PoseError_jit(grad_g_dist, grad_out_distance, weight, g_vec):


 # full method:
-@torch.jit.script
+@get_torch_jit_decorator()
 def backward_full_PoseError_jit(
    grad_out_distance, grad_g_dist, grad_r_err, p_w, q_w, g_vec_p, g_vec_q
 ):
@@ -570,6 +571,7 @@ class SdfSphereOBB(torch.autograd.Function):
        sparsity_idx,
        weight,
        activation_distance,
+        max_distance,
        box_accel,
        box_dims,
        box_pose,
@@ -584,6 +586,8 @@ class SdfSphereOBB(torch.autograd.Function):
        compute_distance,
        use_batch_env,
        return_loss: bool = False,
+        sum_collisions: bool = True,
+        compute_esdf: bool = False,
    ):
        r = geom_cu.closest_point(
            query_sphere,
@@ -592,6 +596,7 @@ class SdfSphereOBB(torch.autograd.Function):
            sparsity_idx,
            weight,
            activation_distance,
+            max_distance,
            box_accel,
            box_dims,
            box_pose,
@@ -605,8 +610,11 @@ class SdfSphereOBB(torch.autograd.Function):
            transform_back,
            compute_distance,
            use_batch_env,
+            sum_collisions,
+            compute_esdf,
        )
        # r[1][r[1]!=r[1]] = 0.0
+        ctx.compute_esdf = compute_esdf
        ctx.return_loss = return_loss
        ctx.save_for_backward(r[1])
        return r[0]
@@ -615,6 +623,8 @@ class SdfSphereOBB(torch.autograd.Function):
    def backward(ctx, grad_output):
        grad_pt = None
        if ctx.needs_input_grad[0]:
+            # if ctx.compute_esdf:
+            #    raise NotImplementedError("Gradients not implemented for compute_esdf=True")
            (r,) = ctx.saved_tensors
            if ctx.return_loss:
                r = r * grad_output.unsqueeze(-1)
@@ -640,6 +650,9 @@ class SdfSphereOBB(torch.autograd.Function):
            None,
            None,
            None,
+            None,
+            None,
+            None,
        )


@@ -670,6 +683,7 @@ class SdfSweptSphereOBB(torch.autograd.Function):
        compute_distance,
        use_batch_env,
        return_loss: bool = False,
+        sum_collisions: bool = True,
    ):
        r = geom_cu.swept_closest_point(
            query_sphere,
@@ -694,6 +708,7 @@ class SdfSweptSphereOBB(torch.autograd.Function):
            transform_back,
            compute_distance,
            use_batch_env,
+            sum_collisions,
        )
        ctx.return_loss = return_loss
        ctx.save_for_backward(
@@ -733,4 +748,200 @@ class SdfSweptSphereOBB(torch.autograd.Function):
            None,
            None,
            None,
+            None,
+        )
+
+
+class SdfSphereVoxel(torch.autograd.Function):
+    @staticmethod
+    def forward(
+        ctx,
+        query_sphere,
+        out_buffer,
+        grad_out_buffer,
+        sparsity_idx,
+        weight,
+        activation_distance,
+        max_distance,
+        grid_features,
+        grid_params,
+        grid_pose,
+        grid_enable,
+        n_env_grid,
+        env_query_idx,
+        max_nobs,
+        batch_size,
+        horizon,
+        n_spheres,
+        transform_back,
+        compute_distance,
+        use_batch_env,
+        return_loss: bool = False,
+        sum_collisions: bool = True,
+        compute_esdf: bool = False,
+    ):
+
+        r = geom_cu.closest_point_voxel(
+            query_sphere,
+            out_buffer,
+            grad_out_buffer,
+            sparsity_idx,
+            weight,
+            activation_distance,
+            max_distance,
+            grid_features,
+            grid_params,
+            grid_pose,
+            grid_enable,
+            n_env_grid,
+            env_query_idx,
+            max_nobs,
+            batch_size,
+            horizon,
+            n_spheres,
+            transform_back,
+            compute_distance,
+            use_batch_env,
+            sum_collisions,
+            compute_esdf,
+        )
+        ctx.compute_esdf = compute_esdf
+        ctx.return_loss = return_loss
+        ctx.save_for_backward(r[1])
+        return r[0]
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        grad_pt = None
+        if ctx.needs_input_grad[0]:
+            # if ctx.compute_esdf:
+            #    raise NotImplementedError("Gradients not implemented for compute_esdf=True")
+            (r,) = ctx.saved_tensors
+            if ctx.return_loss:
+                r = r * grad_output.unsqueeze(-1)
+            grad_pt = r
+        return (
+            grad_pt,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+        )
+
+
+class SdfSweptSphereVoxel(torch.autograd.Function):
+    @staticmethod
+    def forward(
+        ctx,
+        query_sphere,
+        out_buffer,
+        grad_out_buffer,
+        sparsity_idx,
+        weight,
+        activation_distance,
+        max_distance,
+        speed_dt,
+        grid_features,
+        grid_params,
+        grid_pose,
+        grid_enable,
+        n_env_grid,
+        env_query_idx,
+        max_nobs,
+        batch_size,
+        horizon,
+        n_spheres,
+        sweep_steps,
+        enable_speed_metric,
+        transform_back,
+        compute_distance,
+        use_batch_env,
+        return_loss: bool = False,
+        sum_collisions: bool = True,
+    ):
+        r = geom_cu.swept_closest_point_voxel(
+            query_sphere,
+            out_buffer,
+            grad_out_buffer,
+            sparsity_idx,
+            weight,
+            activation_distance,
+            max_distance,
+            speed_dt,
+            grid_features,
+            grid_params,
+            grid_pose,
+            grid_enable,
+            n_env_grid,
+            env_query_idx,
+            max_nobs,
+            batch_size,
+            horizon,
+            n_spheres,
+            sweep_steps,
+            enable_speed_metric,
+            transform_back,
+            compute_distance,
+            use_batch_env,
+            sum_collisions,
+        )
+
+        ctx.return_loss = return_loss
+        ctx.save_for_backward(
+            r[1],
+        )
+        return r[0]
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        grad_pt = None
+        if ctx.needs_input_grad[0]:
+            (r,) = ctx.saved_tensors
+            if ctx.return_loss:
+                r = r * grad_output.unsqueeze(-1)
+            grad_pt = r
+        return (
+            grad_pt,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
        )
--- a/src/curobo/curobolib/opt.py
+++ b/src/curobo/curobolib/opt.py
@@ -39,7 +39,8 @@ except ImportError:

 class LBFGScu(Function):
    @staticmethod
-    def _call_cuda(
+    def forward(
+        ctx,
        step_vec,
        rho_buffer,
        y_buffer,
@@ -50,6 +51,7 @@ class LBFGScu(Function):
        grad_0,
        epsilon=0.1,
        stable_mode=False,
+        use_shared_buffers=True,
    ):
        m, b, v_dim, _ = y_buffer.shape

@@ -67,39 +69,12 @@ class LBFGScu(Function):
            m,
            v_dim,
            stable_mode,
+            use_shared_buffers,
        )
        step_v = R[0].view(step_vec.shape)

-        return step_v
-
-    @staticmethod
-    def forward(
-        ctx,
-        step_vec,
-        rho_buffer,
-        y_buffer,
-        s_buffer,
-        q,
-        grad_q,
-        x_0,
-        grad_0,
-        epsilon=0.1,
-        stable_mode=False,
-    ):
-        R = LBFGScu._call_cuda(
-            step_vec,
-            rho_buffer,
-            y_buffer,
-            s_buffer,
-            q,
-            grad_q,
-            x_0,
-            grad_0,
-            epsilon=epsilon,
-            stable_mode=stable_mode,
-        )
        # ctx.save_for_backward(batch_spheres, robot_spheres, link_mats, link_sphere_map)
-        return R
+        return step_v

    @staticmethod
    def backward(ctx, grad_output):
@@ -109,4 +84,5 @@ class LBFGScu(Function):
            None,
            None,
            None,
+            None,
        )