release repository

Balakumar Sundaralingam
2023-10-26 04:17:19 -07:00
commit 07e6ccfc91
287 changed files with 70659 additions and 0 deletions


@@ -0,0 +1,11 @@
#
# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
# property and proprietary rights in and to this material, related
# documentation and any modifications thereto. Any use, reproduction,
# disclosure or distribution of this material and related documentation
# without an express license agreement from NVIDIA CORPORATION or
# its affiliates is strictly prohibited.
#
"""CuRoboLib Module."""


@@ -0,0 +1,260 @@
/*
* Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
*
* NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
* property and proprietary rights in and to this material, related
* documentation and any modifications thereto. Any use, reproduction,
* disclosure or distribution of this material and related documentation
* without an express license agreement from NVIDIA CORPORATION or
* its affiliates is strictly prohibited.
*/
#include <map>
#include <torch/extension.h>
#include <c10/cuda/CUDAGuard.h>
#include <vector>
// CUDA forward declarations
std::vector<torch::Tensor> self_collision_distance(
torch::Tensor out_distance, torch::Tensor out_vec,
torch::Tensor sparse_index,
const torch::Tensor robot_spheres, // batch_size x n_spheres x 4
const torch::Tensor collision_offset, // n_spheres x n_spheres
const torch::Tensor weight,
const torch::Tensor collision_matrix, // n_spheres x n_spheres
const torch::Tensor thread_locations, const int locations_size,
const int batch_size, const int nspheres, const bool compute_grad = false,
const int ndpt = 8, // Does this need to match template?
const bool debug = false);
// CUDA forward declarations
std::vector<torch::Tensor> swept_sphere_obb_clpt(
const torch::Tensor sphere_position, // batch_size, 3
torch::Tensor distance, // batch_size, 1
torch::Tensor
closest_point, // batch size, 4 -> written out as x,y,z,0 for gradient
torch::Tensor sparsity_idx, const torch::Tensor weight,
const torch::Tensor activation_distance, const torch::Tensor speed_dt,
const torch::Tensor obb_accel, // n_boxes, 4, 4
const torch::Tensor obb_bounds, // n_boxes, 3
const torch::Tensor obb_pose, // n_boxes, 4, 4
const torch::Tensor obb_enable, // n_boxes, 4,
const torch::Tensor n_env_obb, // n_boxes, 4, 4
const torch::Tensor env_query_idx, // n_boxes, 4, 4
const int max_nobs, const int batch_size, const int horizon,
const int n_spheres, const int sweep_steps, const bool enable_speed_metric,
const bool transform_back, const bool compute_distance,
const bool use_batch_env);
std::vector<torch::Tensor>
sphere_obb_clpt(const torch::Tensor sphere_position, // batch_size, 4
torch::Tensor distance,
torch::Tensor closest_point, // batch size, 3
torch::Tensor sparsity_idx, const torch::Tensor weight,
const torch::Tensor activation_distance,
const torch::Tensor obb_accel, // n_boxes, 4, 4
const torch::Tensor obb_bounds, // n_boxes, 3
const torch::Tensor obb_pose, // n_boxes, 4, 4
const torch::Tensor obb_enable, // n_boxes, 4, 4
const torch::Tensor n_env_obb, // n_boxes, 4, 4
const torch::Tensor env_query_idx, // n_boxes, 4, 4
const int max_nobs, const int batch_size, const int horizon,
const int n_spheres, const bool transform_back,
const bool compute_distance, const bool use_batch_env);
std::vector<torch::Tensor> pose_distance(
torch::Tensor out_distance, torch::Tensor out_position_distance,
torch::Tensor out_rotation_distance,
torch::Tensor distance_p_vector, // batch size, 3
torch::Tensor distance_q_vector, // batch size, 4
torch::Tensor out_gidx,
const torch::Tensor current_position, // batch_size, 3
const torch::Tensor goal_position, // n_boxes, 3
const torch::Tensor current_quat, const torch::Tensor goal_quat,
const torch::Tensor vec_weight, // n_boxes, 4, 4
const torch::Tensor weight, // n_boxes, 4, 4
const torch::Tensor vec_convergence, const torch::Tensor run_weight,
const torch::Tensor run_vec_weight, const torch::Tensor batch_pose_idx,
const int batch_size, const int horizon, const int mode,
const int num_goals = 1, const bool compute_grad = false,
const bool write_distance = true, const bool use_metric = false);
std::vector<torch::Tensor>
backward_pose_distance(torch::Tensor out_grad_p, torch::Tensor out_grad_q,
const torch::Tensor grad_distance, // batch_size, 3
const torch::Tensor grad_p_distance, // n_boxes, 3
const torch::Tensor grad_q_distance,
const torch::Tensor pose_weight,
const torch::Tensor grad_p_vec, // n_boxes, 4, 4
const torch::Tensor grad_q_vec, const int batch_size,
const bool use_distance = false);
// C++ interface
// NOTE: AT_ASSERT has become AT_CHECK on master after 0.4.
#define CHECK_CUDA(x) AT_ASSERTM(x.is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) \
AT_ASSERTM(x.is_contiguous(), #x " must be contiguous")
#define CHECK_INPUT(x) \
CHECK_CUDA(x); \
CHECK_CONTIGUOUS(x)
std::vector<torch::Tensor> self_collision_distance_wrapper(
torch::Tensor out_distance, torch::Tensor out_vec,
torch::Tensor sparse_index,
const torch::Tensor robot_spheres, // batch_size x n_spheres x 4
const torch::Tensor collision_offset, // n_spheres
const torch::Tensor weight,
const torch::Tensor collision_matrix, // n_spheres
const torch::Tensor thread_locations, const int thread_locations_size,
const int batch_size, const int nspheres, const bool compute_grad = false,
const int ndpt = 8, const bool debug = false) {
CHECK_INPUT(out_distance);
CHECK_INPUT(out_vec);
CHECK_INPUT(robot_spheres);
CHECK_INPUT(collision_offset);
CHECK_INPUT(sparse_index);
CHECK_INPUT(weight);
CHECK_INPUT(thread_locations);
CHECK_INPUT(collision_matrix);
const at::cuda::OptionalCUDAGuard guard(robot_spheres.device());
return self_collision_distance(
out_distance, out_vec, sparse_index, robot_spheres,
collision_offset, weight, collision_matrix, thread_locations,
thread_locations_size, batch_size, nspheres, compute_grad, ndpt, debug);
}
std::vector<torch::Tensor> sphere_obb_clpt_wrapper(
const torch::Tensor sphere_position, // batch_size, 4
torch::Tensor distance,
torch::Tensor closest_point, // batch size, 3
torch::Tensor sparsity_idx, const torch::Tensor weight,
const torch::Tensor activation_distance,
const torch::Tensor obb_accel, // n_boxes, 4, 4
const torch::Tensor obb_bounds, // n_boxes, 3
const torch::Tensor obb_pose, // n_boxes, 4, 4
const torch::Tensor obb_enable, // n_boxes, 4, 4
const torch::Tensor n_env_obb, // n_boxes, 4, 4
const torch::Tensor env_query_idx, // n_boxes, 4, 4
const int max_nobs, const int batch_size, const int horizon,
const int n_spheres, const bool transform_back, const bool compute_distance,
const bool use_batch_env) {
const at::cuda::OptionalCUDAGuard guard(sphere_position.device());
CHECK_INPUT(distance);
CHECK_INPUT(closest_point);
CHECK_INPUT(sphere_position);
CHECK_INPUT(sparsity_idx);
CHECK_INPUT(weight);
CHECK_INPUT(activation_distance);
CHECK_INPUT(obb_accel);
return sphere_obb_clpt(
sphere_position, distance, closest_point, sparsity_idx, weight,
activation_distance, obb_accel, obb_bounds, obb_pose, obb_enable,
n_env_obb, env_query_idx, max_nobs, batch_size, horizon, n_spheres,
transform_back, compute_distance, use_batch_env);
}
std::vector<torch::Tensor> swept_sphere_obb_clpt_wrapper(
const torch::Tensor sphere_position, // batch_size, 4
torch::Tensor distance,
torch::Tensor closest_point, // batch size, 3
torch::Tensor sparsity_idx, const torch::Tensor weight,
const torch::Tensor activation_distance, const torch::Tensor speed_dt,
const torch::Tensor obb_accel, // n_boxes, 4, 4
const torch::Tensor obb_bounds, // n_boxes, 3
const torch::Tensor obb_pose, // n_boxes, 4, 4
const torch::Tensor obb_enable, // n_boxes, 4, 4
const torch::Tensor n_env_obb, // n_boxes, 4, 4
const torch::Tensor env_query_idx, // n_boxes, 4, 4
const int max_nobs, const int batch_size, const int horizon,
const int n_spheres, const int sweep_steps, const bool enable_speed_metric,
const bool transform_back, const bool compute_distance,
const bool use_batch_env) {
const at::cuda::OptionalCUDAGuard guard(sphere_position.device());
CHECK_INPUT(distance);
CHECK_INPUT(closest_point);
CHECK_INPUT(sphere_position);
return swept_sphere_obb_clpt(
sphere_position,
distance, closest_point, sparsity_idx, weight, activation_distance,
speed_dt, obb_accel, obb_bounds, obb_pose, obb_enable, n_env_obb,
env_query_idx, max_nobs, batch_size, horizon, n_spheres, sweep_steps,
enable_speed_metric, transform_back, compute_distance, use_batch_env);
}
std::vector<torch::Tensor> pose_distance_wrapper(
torch::Tensor out_distance, torch::Tensor out_position_distance,
torch::Tensor out_rotation_distance,
torch::Tensor distance_p_vector, // batch size, 3
torch::Tensor distance_q_vector, // batch size, 4
torch::Tensor out_gidx,
const torch::Tensor current_position, // batch_size, 3
const torch::Tensor goal_position, // n_boxes, 3
const torch::Tensor current_quat, const torch::Tensor goal_quat,
const torch::Tensor vec_weight, // n_boxes, 4, 4
const torch::Tensor weight, const torch::Tensor vec_convergence,
const torch::Tensor run_weight, const torch::Tensor run_vec_weight,
const torch::Tensor batch_pose_idx, const int batch_size, const int horizon,
const int mode, const int num_goals = 1, const bool compute_grad = false,
const bool write_distance = false, const bool use_metric = false) {
// at::cuda::DeviceGuard guard(angle.device());
CHECK_INPUT(out_distance);
CHECK_INPUT(out_position_distance);
CHECK_INPUT(out_rotation_distance);
CHECK_INPUT(distance_p_vector);
CHECK_INPUT(distance_q_vector);
CHECK_INPUT(current_position);
CHECK_INPUT(goal_position);
CHECK_INPUT(current_quat);
CHECK_INPUT(goal_quat);
CHECK_INPUT(batch_pose_idx);
const at::cuda::OptionalCUDAGuard guard(current_position.device());
return pose_distance(
out_distance, out_position_distance, out_rotation_distance,
distance_p_vector, distance_q_vector, out_gidx, current_position,
goal_position, current_quat, goal_quat, vec_weight, weight,
vec_convergence, run_weight, run_vec_weight, batch_pose_idx, batch_size,
horizon, mode, num_goals, compute_grad, write_distance, use_metric);
}
std::vector<torch::Tensor> backward_pose_distance_wrapper(
torch::Tensor out_grad_p, torch::Tensor out_grad_q,
const torch::Tensor grad_distance, // batch_size, 3
const torch::Tensor grad_p_distance, // n_boxes, 3
const torch::Tensor grad_q_distance, const torch::Tensor pose_weight,
const torch::Tensor grad_p_vec, // n_boxes, 4, 4
const torch::Tensor grad_q_vec, const int batch_size,
const bool use_distance) {
CHECK_INPUT(out_grad_p);
CHECK_INPUT(out_grad_q);
CHECK_INPUT(grad_distance);
CHECK_INPUT(grad_p_distance);
CHECK_INPUT(grad_q_distance);
const at::cuda::OptionalCUDAGuard guard(grad_distance.device());
return backward_pose_distance(
out_grad_p, out_grad_q, grad_distance, grad_p_distance, grad_q_distance,
pose_weight, grad_p_vec, grad_q_vec, batch_size, use_distance);
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("pose_distance", &pose_distance_wrapper, "Pose Distance (curobolib)");
m.def("pose_distance_backward", &backward_pose_distance_wrapper,
"Pose Distance Backward (curobolib)");
m.def("closest_point", &sphere_obb_clpt_wrapper,
"Closest Point OBB(curobolib)");
m.def("swept_closest_point", &swept_sphere_obb_clpt_wrapper,
"Swept Closest Point OBB(curobolib)");
m.def("self_collision_distance", &self_collision_distance_wrapper,
"Self Collision Distance (curobolib)");
}
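Every wrapper above funnels through CHECK_INPUT, so each tensor handed to these bindings must be CUDA-resident and contiguous. A minimal sketch of building the extension and satisfying that contract from Python, assuming torch.utils.cpp_extension.load with hypothetical module and source file names (not taken from this commit):

import torch
from torch.utils.cpp_extension import load

# Hypothetical build; the extension name and source file names are placeholders.
geom_cu = load(name="geom_cu", sources=["geom_cpp.cpp", "geom_cuda.cu"])

# CHECK_INPUT contract: tensors must be CUDA-resident and contiguous.
x = torch.rand(4, 7).t()             # a non-contiguous CPU view
x = x.contiguous().to("cuda")        # what the wrappers expect
assert x.is_cuda and x.is_contiguous()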

File diff suppressed because it is too large


@@ -0,0 +1,106 @@
/*
* Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
*
* NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
* property and proprietary rights in and to this material, related
* documentation and any modifications thereto. Any use, reproduction,
* disclosure or distribution of this material and related documentation
* without an express license agreement from NVIDIA CORPORATION or
* its affiliates is strictly prohibited.
*/
#include <torch/extension.h>
#include <c10/cuda/CUDAGuard.h>
#include <vector>
// CUDA forward declarations
std::vector<torch::Tensor>
matrix_to_quaternion(torch::Tensor out_quat,
const torch::Tensor in_rot // batch_size, 3
);
std::vector<torch::Tensor> kin_fused_forward(
torch::Tensor link_pos, torch::Tensor link_quat,
torch::Tensor batch_robot_spheres, torch::Tensor global_cumul_mat,
const torch::Tensor joint_vec, const torch::Tensor fixed_transform,
const torch::Tensor robot_spheres, const torch::Tensor link_map,
const torch::Tensor joint_map, const torch::Tensor joint_map_type,
const torch::Tensor store_link_map, const torch::Tensor link_sphere_map,
const int batch_size, const int n_spheres,
const bool use_global_cumul = false);
std::vector<torch::Tensor> kin_fused_backward_16t(
torch::Tensor grad_out, const torch::Tensor grad_nlinks_pos,
const torch::Tensor grad_nlinks_quat, const torch::Tensor grad_spheres,
const torch::Tensor global_cumul_mat, const torch::Tensor joint_vec,
const torch::Tensor fixed_transform, const torch::Tensor robot_spheres,
const torch::Tensor link_map, const torch::Tensor joint_map,
const torch::Tensor joint_map_type, const torch::Tensor store_link_map,
const torch::Tensor link_sphere_map, const torch::Tensor link_chain_map,
const int batch_size, const int n_spheres, const bool sparsity_opt = true,
const bool use_global_cumul = false);
// C++ interface
// NOTE: AT_ASSERT has become AT_CHECK on master after 0.4.
#define CHECK_CUDA(x) AT_ASSERTM(x.is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) \
AT_ASSERTM(x.is_contiguous(), #x " must be contiguous")
#define CHECK_INPUT(x) \
CHECK_CUDA(x); \
CHECK_CONTIGUOUS(x)
std::vector<torch::Tensor> kin_forward_wrapper(
torch::Tensor link_pos, torch::Tensor link_quat,
torch::Tensor batch_robot_spheres, torch::Tensor global_cumul_mat,
const torch::Tensor joint_vec, const torch::Tensor fixed_transform,
const torch::Tensor robot_spheres, const torch::Tensor link_map,
const torch::Tensor joint_map, const torch::Tensor joint_map_type,
const torch::Tensor store_link_map, const torch::Tensor link_sphere_map,
const int batch_size, const int n_spheres,
const bool use_global_cumul = false) {
const at::cuda::OptionalCUDAGuard guard(joint_vec.device());
// TODO: add check input
return kin_fused_forward(
link_pos, link_quat, batch_robot_spheres, global_cumul_mat, joint_vec,
fixed_transform, robot_spheres, link_map, joint_map, joint_map_type,
store_link_map, link_sphere_map, batch_size, n_spheres, use_global_cumul);
}
std::vector<torch::Tensor> kin_backward_wrapper(
torch::Tensor grad_out, const torch::Tensor grad_nlinks_pos,
const torch::Tensor grad_nlinks_quat, const torch::Tensor grad_spheres,
const torch::Tensor global_cumul_mat, const torch::Tensor joint_vec,
const torch::Tensor fixed_transform, const torch::Tensor robot_spheres,
const torch::Tensor link_map, const torch::Tensor joint_map,
const torch::Tensor joint_map_type, const torch::Tensor store_link_map,
const torch::Tensor link_sphere_map, const torch::Tensor link_chain_map,
const int batch_size, const int n_spheres, const bool sparsity_opt = true,
const bool use_global_cumul = false) {
const at::cuda::OptionalCUDAGuard guard(joint_vec.device());
return kin_fused_backward_16t(
grad_out, grad_nlinks_pos, grad_nlinks_quat, grad_spheres,
global_cumul_mat, joint_vec, fixed_transform, robot_spheres, link_map,
joint_map, joint_map_type, store_link_map, link_sphere_map,
link_chain_map, batch_size, n_spheres, sparsity_opt, use_global_cumul);
}
std::vector<torch::Tensor>
matrix_to_quaternion_wrapper(torch::Tensor out_quat,
const torch::Tensor in_rot // batch_size, 3
) {
const at::cuda::OptionalCUDAGuard guard(in_rot.device());
CHECK_INPUT(in_rot);
CHECK_INPUT(out_quat);
return matrix_to_quaternion(out_quat, in_rot);
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("forward", &kin_forward_wrapper, "Kinematics fused forward (CUDA)");
m.def("backward", &kin_backward_wrapper, "Kinematics fused backward (CUDA)");
m.def("matrix_to_quaternion", &matrix_to_quaternion_wrapper,
"Rotation Matrix to Quaternion (CUDA)");
}
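For reading purposes, a small PyTorch reference of the conversion that matrix_to_quaternion names, restricted to the common trace(R) > 0 branch and assuming a (w, x, y, z) quaternion layout; the CUDA kernel's actual layout and its handling of the remaining branches are not shown in this file:

import torch

def matrix_to_quaternion_reference(R):
    # R: (..., 3, 3) rotation matrices -> (..., 4) quaternions laid out as (w, x, y, z).
    # Valid when trace(R) > 0; a full implementation also covers the other branches.
    trace = R[..., 0, 0] + R[..., 1, 1] + R[..., 2, 2]
    w = 0.5 * torch.sqrt(torch.clamp(1.0 + trace, min=1e-12))
    x = (R[..., 2, 1] - R[..., 1, 2]) / (4.0 * w)
    y = (R[..., 0, 2] - R[..., 2, 0]) / (4.0 * w)
    z = (R[..., 1, 0] - R[..., 0, 1]) / (4.0 * w)
    return torch.stack([w, x, y, z], dim=-1)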

File diff suppressed because it is too large


@@ -0,0 +1,121 @@
/*
* Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
*
* NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
* property and proprietary rights in and to this material, related
* documentation and any modifications thereto. Any use, reproduction,
* disclosure or distribution of this material and related documentation
* without an express license agreement from NVIDIA CORPORATION or
* its affiliates is strictly prohibited.
*/
#include <torch/extension.h>
#include <vector>
#include <c10/cuda/CUDAGuard.h>
// CUDA forward declarations
std::vector<torch::Tensor> reduce_cuda(torch::Tensor vec, torch::Tensor vec2,
torch::Tensor rho_buffer,
torch::Tensor sum, const int batch_size,
const int m, const int v_dim);
std::vector<torch::Tensor>
lbfgs_step_cuda(torch::Tensor step_vec, torch::Tensor rho_buffer,
torch::Tensor y_buffer, torch::Tensor s_buffer,
torch::Tensor grad_q, const float epsilon, const int batch_size,
const int m, const int v_dim);
std::vector<torch::Tensor>
lbfgs_update_cuda(torch::Tensor rho_buffer, torch::Tensor y_buffer,
torch::Tensor s_buffer, torch::Tensor q, torch::Tensor grad_q,
torch::Tensor x_0, torch::Tensor grad_0, const int batch_size,
const int m, const int v_dim);
std::vector<torch::Tensor>
lbfgs_cuda_fuse(torch::Tensor step_vec, torch::Tensor rho_buffer,
torch::Tensor y_buffer, torch::Tensor s_buffer, torch::Tensor q,
torch::Tensor grad_q, torch::Tensor x_0, torch::Tensor grad_0,
const float epsilon, const int batch_size, const int m,
const int v_dim, const bool stable_mode);
// C++ interface
// NOTE: AT_ASSERT has become AT_CHECK on master after 0.4.
#define CHECK_CUDA(x) AT_ASSERTM(x.is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) \
AT_ASSERTM(x.is_contiguous(), #x " must be contiguous")
#define CHECK_INPUT(x) \
CHECK_CUDA(x); \
CHECK_CONTIGUOUS(x)
std::vector<torch::Tensor>
lbfgs_step_call(torch::Tensor step_vec, torch::Tensor rho_buffer,
torch::Tensor y_buffer, torch::Tensor s_buffer,
torch::Tensor grad_q, const float epsilon, const int batch_size,
const int m, const int v_dim) {
CHECK_INPUT(step_vec);
CHECK_INPUT(rho_buffer);
CHECK_INPUT(y_buffer);
CHECK_INPUT(s_buffer);
CHECK_INPUT(grad_q);
const at::cuda::OptionalCUDAGuard guard(grad_q.device());
return lbfgs_step_cuda(step_vec, rho_buffer, y_buffer, s_buffer, grad_q,
epsilon, batch_size, m, v_dim);
}
std::vector<torch::Tensor>
lbfgs_update_call(torch::Tensor rho_buffer, torch::Tensor y_buffer,
torch::Tensor s_buffer, torch::Tensor q, torch::Tensor grad_q,
torch::Tensor x_0, torch::Tensor grad_0, const int batch_size,
const int m, const int v_dim) {
CHECK_INPUT(rho_buffer);
CHECK_INPUT(y_buffer);
CHECK_INPUT(s_buffer);
CHECK_INPUT(grad_q);
CHECK_INPUT(x_0);
CHECK_INPUT(grad_0);
CHECK_INPUT(q);
const at::cuda::OptionalCUDAGuard guard(grad_q.device());
return lbfgs_update_cuda(rho_buffer, y_buffer, s_buffer, q, grad_q, x_0,
grad_0, batch_size, m, v_dim);
}
std::vector<torch::Tensor>
reduce_cuda_call(torch::Tensor vec, torch::Tensor vec2,
torch::Tensor rho_buffer, torch::Tensor sum,
const int batch_size, const int v_dim, const int m) {
const at::cuda::OptionalCUDAGuard guard(sum.device());
// note: the CUDA definition takes (batch_size, m, v_dim) after the tensors
return reduce_cuda(vec, vec2, rho_buffer, sum, batch_size, m, v_dim);
}
std::vector<torch::Tensor>
lbfgs_call(torch::Tensor step_vec, torch::Tensor rho_buffer,
torch::Tensor y_buffer, torch::Tensor s_buffer, torch::Tensor q,
torch::Tensor grad_q, torch::Tensor x_0, torch::Tensor grad_0,
const float epsilon, const int batch_size, const int m,
const int v_dim, const bool stable_mode) {
CHECK_INPUT(step_vec);
CHECK_INPUT(rho_buffer);
CHECK_INPUT(y_buffer);
CHECK_INPUT(s_buffer);
CHECK_INPUT(grad_q);
CHECK_INPUT(x_0);
CHECK_INPUT(grad_0);
CHECK_INPUT(q);
const at::cuda::OptionalCUDAGuard guard(grad_q.device());
return lbfgs_cuda_fuse(step_vec, rho_buffer, y_buffer, s_buffer, q, grad_q,
x_0, grad_0, epsilon, batch_size, m, v_dim,
stable_mode);
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("step", &lbfgs_step_call, "L-BFGS step (CUDA)");
m.def("update", &lbfgs_update_call, "L-BFGS Update (CUDA)");
m.def("forward", &lbfgs_call, "L-BFGS Update + Step (CUDA)");
m.def("debug_reduce", &reduce_cuda_call, "L-BFGS Debug");
}
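The step and forward bindings implement the standard L-BFGS two-loop recursion over the (s, y, rho) history; a batched PyTorch sketch of that recursion, following the m x batch x v_dim buffer layout used in the kernel comments, offered as a reading aid rather than as the curobolib API:

import torch

def lbfgs_step_reference(rho, y, s, grad_q, epsilon):
    # rho: (m, b); y, s: (m, b, v); grad_q: (b, v) -> step_vec: (b, v)
    m = rho.shape[0]
    q = grad_q.clone()
    alpha = [None] * m
    for i in range(m - 1, -1, -1):                 # first loop, newest to oldest
        alpha[i] = rho[i] * (s[i] * q).sum(-1)     # (b,)
        q = q - alpha[i].unsqueeze(-1) * y[i]
    num = (s[-1] * y[-1]).sum(-1)                  # initial Hessian scaling
    den = (y[-1] * y[-1]).sum(-1)
    gamma = torch.clamp(num / den, min=0.0) + epsilon
    r = gamma.unsqueeze(-1) * q
    for i in range(m):                             # second loop, oldest to newest
        beta = rho[i] * (y[i] * r).sum(-1)
        r = r + (alpha[i] - beta).unsqueeze(-1) * s[i]
    return -r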


@@ -0,0 +1,869 @@
/*
* Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
*
* NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
* property and proprietary rights in and to this material, related
* documentation and any modifications thereto. Any use, reproduction,
* disclosure or distribution of this material and related documentation
* without an express license agreement from NVIDIA CORPORATION or
* its affiliates is strictly prohibited.
*/
#include <cuda.h>
#include <torch/extension.h>
#include <vector>
#include <c10/cuda/CUDAStream.h>
#include <cuda_fp16.h>
// #include "helper_cuda.h"
#include "helper_math.h"
#include <assert.h>
#include <cstdio>
#include <cstdlib>
#include <ctime>
#include <math.h>
// #include <stdio.h>
//
// For the CUDA runtime routines (prefixed with "cuda_")
// #include <cuda_runtime.h>
// #include <cuda_fp16.h>
// #include <helper_cuda.h>
#define M_MAX 512
#define HALF_MAX 65504.0
#define M 15
#define VDIM 175 // 25 * 7,
#define FULL_MASK 0xffffffff
namespace Curobo {
namespace Optimization {
template <typename scalar_t>
__device__ __forceinline__ void
scalar_vec_product(const scalar_t a, const scalar_t *b, scalar_t *out,
const int v_dim) {
for (int i = 0; i < v_dim; i++) {
out[i] = a * b[i];
}
}
template <typename scalar_t>
__device__ __forceinline__ void
m_scalar_vec_product(const scalar_t *a, const scalar_t *b, scalar_t *out,
const int v_dim, const int m) {
for (int j = 0; j < m; j++) {
for (int i = 0; i < v_dim; i++) {
out[j * v_dim + i] = a[j] * b[j * v_dim + i];
}
}
}
template <typename scalar_t>
__device__ __forceinline__ void vec_vec_dot(const scalar_t *a,
const scalar_t *b, scalar_t &out,
const int v_dim) {
for (int i = 0; i < v_dim; i++) {
out += a[i] * b[i];
}
}
template <typename scalar_t>
__device__ __forceinline__ void update_r(const scalar_t *rho_y,
const scalar_t *s_buffer, scalar_t *r,
scalar_t &alpha, const int v_dim) {
// dot product: and subtract with alpha
for (int i = 0; i < v_dim; i++) {
alpha -= rho_y[i] * r[i];
}
// scalar vector product:
for (int i = 0; i < v_dim; i++) {
r[i] = r[i] + alpha * s_buffer[i];
}
}
template <typename scalar_t>
__device__ __forceinline__ void update_q(const scalar_t *y_buffer, scalar_t *gq,
const scalar_t alpha,
const int v_dim) {
//
for (int i = 0; i < v_dim; i++) {
gq[i] = gq[i] - (alpha * y_buffer[i]);
}
}
template <typename scalar_t>
__global__ void
lbfgs_step_kernel_old(scalar_t *step_vec, scalar_t *rho_buffer,
const scalar_t *y_buffer, const scalar_t *s_buffer,
const scalar_t *grad_q, const float epsilon,
const int batchSize, const int m, const int v_dim) {
// each thread writes one sphere of some link
const int t = blockDim.x * blockIdx.x + threadIdx.x; // batch
const int b_idx = t;
if (t >= (batchSize)) {
return;
}
// get thread start address:
const int b_start_scalar_adrs = b_idx * m;
const int b_start_vec_adrs = b_idx * m * v_dim;
const int b_step_start_adrs = b_idx * v_dim;
scalar_t rho_s[M * VDIM];
// copy floats to local buffer?
// y_buffer, s_buffer, rho_buffer
// compute rho_s
scalar_t loc_ybuf[M * VDIM];
scalar_t loc_sbuf[M * VDIM];
scalar_t loc_rho[M];
scalar_t gq[VDIM]; //, l_q[VDIM];
scalar_t alpha_buffer[M];
scalar_t t_1 = 0, t_2 = 0; // vec_vec_dot accumulates into its output, so start at zero
for (int i = 0; i < m * v_dim; i++) {
loc_ybuf[i] = y_buffer[b_start_vec_adrs + i];
loc_sbuf[i] = s_buffer[b_start_vec_adrs + i];
}
for (int i = 0; i < v_dim; i++) {
gq[i] = grad_q[b_step_start_adrs + i];
}
for (int i = 0; i < m; i++) {
loc_rho[i] = rho_buffer[b_start_scalar_adrs + i];
}
m_scalar_vec_product(&loc_rho[0], &loc_sbuf[0], &rho_s[0], v_dim, m);
// for loop over m
for (int i = m - 1; i > -1; i--) { // full two-loop recursion: iterate over all m history entries
// l_start_vec_adrs = i * v_dim;
// scalar_vec_product(loc_rho[i], &loc_sbuf[i*v_dim], &rho_s[i*v_dim],
// v_dim);
alpha_buffer[i] = 0; // vec_vec_dot accumulates, so clear the entry first
vec_vec_dot(&rho_s[i * v_dim], &gq[0], alpha_buffer[i], v_dim);
update_q(&loc_ybuf[(i * v_dim)], &gq[0], alpha_buffer[i], v_dim);
}
// compute initial hessian:
vec_vec_dot(&loc_sbuf[(m - 1) * v_dim], &loc_ybuf[(m - 1) * v_dim], t_1,
v_dim);
vec_vec_dot(&loc_ybuf[(m - 1) * v_dim], &loc_ybuf[(m - 1) * v_dim], t_2,
v_dim);
t_1 = t_1 / t_2;
if (t_1 < 0) {
t_1 = 0;
}
t_1 += epsilon;
scalar_vec_product(t_1, &gq[0], &gq[0], v_dim);
m_scalar_vec_product(&loc_rho[0], &loc_ybuf[0], &rho_s[0], v_dim, m);
for (int i = 0; i < m; i++) {
// scalar_vec_product(loc_rho[i], &loc_ybuf[i*v_dim], &rho_s[i*v_dim],
// v_dim);
update_r(&rho_s[i * v_dim], &loc_sbuf[i * v_dim], &gq[0], alpha_buffer[i],
v_dim);
}
// write gq to out grad:
for (int i = 0; i < v_dim; i++) {
step_vec[b_step_start_adrs + i] = -1.0 * gq[i];
}
}
template <typename psum_t>
__forceinline__ __device__ psum_t warpReduce(psum_t v, int elems,
unsigned mask) {
psum_t val = v;
int shift = 1;
for (int i = elems; i > 1; i /= 2) {
val += __shfl_down_sync(mask, val, shift);
shift *= 2;
}
// val += __shfl_down_sync(mask, val, 1); // i=32
// val += __shfl_down_sync(mask, val, 2); // i=16
// val += __shfl_down_sync(mask, val, 4); // i=8
// val += __shfl_down_sync(mask, val, 8); // i=4
// val += __shfl_down_sync(mask, val, 16); // i=2
return val;
}
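// Block-wide sum of one value per participating thread: each warp reduces its
// lanes with shuffle-downs, warp leaders park their partial sums in data[], and
// the first warp folds those partials into result[0]; the trailing
// __syncthreads() makes the result visible to every thread in the block.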
template <typename scalar_t, typename psum_t>
__forceinline__ __device__ void reduce(scalar_t v, int m, psum_t *data,
scalar_t *result) {
psum_t val = v;
unsigned mask = __ballot_sync(FULL_MASK, threadIdx.x < m);
val += __shfl_down_sync(mask, val, 1);
val += __shfl_down_sync(mask, val, 2);
val += __shfl_down_sync(mask, val, 4);
val += __shfl_down_sync(mask, val, 8);
val += __shfl_down_sync(mask, val, 16);
// int leader = __ffs(mask) - 1; // select a leader lane
int leader = 0;
if (threadIdx.x % 32 == leader) {
if (m < 32) {
result[0] = (scalar_t)val;
} else {
data[(threadIdx.x + 1) / 32] = val;
}
}
if (m >= 32) {
__syncthreads();
int elems = (m + 31) / 32;
unsigned mask2 = __ballot_sync(FULL_MASK, threadIdx.x < elems);
if (threadIdx.x / 32 == 0) { // only the first warp will do this work
psum_t val2 = data[threadIdx.x % 32];
int shift = 1;
for (int i = elems - 1; i > 0; i /= 2) {
val2 += __shfl_down_sync(mask2, val2, shift);
shift *= 2;
}
// int leader = __ffs(mask2) - 1; // select a leader lane
int leader = 0;
if (threadIdx.x % 32 == leader) {
result[0] = (scalar_t)val2;
}
}
}
__syncthreads();
}
// blockReduce
template <typename scalar_t, typename psum_t>
__forceinline__ __device__ void reduce_v1(scalar_t v, int m, psum_t *data,
scalar_t *result) {
unsigned mask = __ballot_sync(FULL_MASK, threadIdx.x < m);
psum_t val = warpReduce(v, 32, mask);
// int leader = __ffs(mask) - 1; // select a leader lane
int leader = 0;
if (threadIdx.x % 32 == leader) {
data[(threadIdx.x + 1) / 32] = val;
}
if (m >= 32) {
__syncthreads();
int elems = (m + 31) / 32;
unsigned mask2 = __ballot_sync(FULL_MASK, threadIdx.x < elems);
if (threadIdx.x / 32 == 0) { // only the first warp will do this work
psum_t val2 = warpReduce(data[threadIdx.x % 32], elems, mask2);
// // int leader = __ffs(mask2) - 1; // select a leader lane
if (threadIdx.x == leader) {
result[0] = (scalar_t)val2;
}
}
} else {
if (threadIdx.x == leader) {
result[0] = (scalar_t)val;
}
}
__syncthreads();
}
template <typename scalar_t, typename psum_t>
__inline__ __device__ void dot(const scalar_t *mat1, const scalar_t *mat2,
const int m, psum_t *data, scalar_t *result) {
scalar_t val = mat1[threadIdx.x] * mat2[threadIdx.x];
reduce(val, m, data, result);
}
template <typename scalar_t> __inline__ __device__ scalar_t relu(scalar_t var) {
if (var < 0)
return 0;
else
return var;
}
//////////////////////////////////////////////////////////
// one block per batch
// v_dim threads per block
//////////////////////////////////////////////////////////
template <typename scalar_t, typename psum_t>
__global__ void lbfgs_step_kernel(scalar_t *step_vec, // b x 175
scalar_t *rho_buffer, // m x b x 1
const scalar_t *y_buffer, // m x b x 175
const scalar_t *s_buffer, // m x b x 175
const scalar_t *grad_q, // b x 175
const float epsilon, const int batchsize,
const int m, const int v_dim) {
__shared__ psum_t
data[32]; // temporary buffer needed for block-wide reduction
__shared__ scalar_t
result; // result of the reduction or vector-vector dot product
__shared__ scalar_t gq[175]; // per-block copy of grad_q for this batch; holds v_dim elements
assert(v_dim < 176);
int batch = blockIdx.x; // one block per batch
if (threadIdx.x >= v_dim)
return;
gq[threadIdx.x] = grad_q[batch * v_dim + threadIdx.x]; // copy grad_q to gq
scalar_t alpha_buffer[16];
// assert(m<16); // allocating a buffer assuming m < 16
for (int i = m - 1; i > -1; i--) {
dot(&gq[0], &s_buffer[i * batchsize * v_dim + batch * v_dim], v_dim,
&data[0], &result);
alpha_buffer[i] = result * rho_buffer[i * batchsize + batch];
gq[threadIdx.x] =
gq[threadIdx.x] -
alpha_buffer[i] *
y_buffer[i * batchsize * v_dim + batch * v_dim + threadIdx.x];
}
// compute var1
scalar_t val1 =
y_buffer[(m - 1) * batchsize * v_dim + batch * v_dim + threadIdx.x];
scalar_t val2 =
s_buffer[(m - 1) * batchsize * v_dim + batch * v_dim + threadIdx.x];
reduce(val1 * val1, v_dim, data, &result);
scalar_t denominator = result;
reduce(val1 * val2, v_dim, data, &result);
scalar_t numerator = result;
scalar_t var1 = numerator / denominator;
scalar_t gamma = relu(var1) + epsilon; // epsilon
gq[threadIdx.x] = gamma * gq[threadIdx.x];
for (int i = 0; i < m; i++) {
dot(&gq[0], &y_buffer[i * batchsize * v_dim + batch * v_dim], v_dim,
&data[0], &result);
gq[threadIdx.x] =
gq[threadIdx.x] +
(alpha_buffer[i] - result * rho_buffer[i * batchsize + batch]) *
s_buffer[i * batchsize * v_dim + batch * v_dim + threadIdx.x];
}
step_vec[batch * v_dim + threadIdx.x] =
-1 * gq[threadIdx.x]; // copy from shared memory to global memory
}
template <typename scalar_t, typename psum_t>
__global__ void lbfgs_update_buffer_kernel(scalar_t *rho_buffer, // m x b x 1
scalar_t *y_buffer, // m x b x 175
scalar_t *s_buffer, // m x b x 175
scalar_t *q, // b x 175
scalar_t *x_0, // b x 175
scalar_t *grad_0, // b x 175
const scalar_t *grad_q, // b x 175
const int batchsize, const int m,
const int v_dim) {
__shared__ psum_t
data[32]; // temporary buffer needed for block-wide reduction
__shared__ scalar_t
result; // result of the reduction or vector-vector dot product
// __shared__ scalar_t y[175]; // temporary shared memory storage
// __shared__ scalar_t s[175]; // temporary shared memory storage
assert(v_dim <= VDIM);
int batch = blockIdx.x; // one block per batch
if (threadIdx.x >= v_dim)
return;
scalar_t y =
grad_q[batch * v_dim + threadIdx.x] - grad_0[batch * v_dim + threadIdx.x];
scalar_t s =
q[batch * v_dim + threadIdx.x] - x_0[batch * v_dim + threadIdx.x];
reduce(y * s, v_dim, &data[0], &result);
for (int i = 1; i < m; i++) {
s_buffer[(i - 1) * batchsize * v_dim + batch * v_dim + threadIdx.x] =
s_buffer[i * batchsize * v_dim + batch * v_dim + threadIdx.x];
y_buffer[(i - 1) * batchsize * v_dim + batch * v_dim + threadIdx.x] =
y_buffer[i * batchsize * v_dim + batch * v_dim + threadIdx.x];
}
// __syncthreads();
s_buffer[(m - 1) * batchsize * v_dim + batch * v_dim + threadIdx.x] = s;
y_buffer[(m - 1) * batchsize * v_dim + batch * v_dim + threadIdx.x] = y;
grad_0[batch * v_dim + threadIdx.x] = grad_q[batch * v_dim + threadIdx.x];
x_0[batch * v_dim + threadIdx.x] = q[batch * v_dim + threadIdx.x];
if (threadIdx.x == 0) {
scalar_t rho = 1 / result;
for (int i = 1; i < m; i++) {
rho_buffer[(i - 1) * batchsize + batch] =
rho_buffer[i * batchsize + batch];
}
rho_buffer[(m - 1) * batchsize + batch] = rho;
}
}
template <typename scalar_t, typename psum_t>
__global__ void reduce_kernel(
scalar_t *vec1, // b x 175
scalar_t *vec2, // b x 175
scalar_t *rho_buffer, // m x b x 1
scalar_t *sum_out, // m x b x 1
const int batchsize, const int m,
const int v_dim) // s_buffer and y_buffer are not rolled by default
{
__shared__ psum_t
data[32]; // temporary buffer needed for block-wide reduction
__shared__ scalar_t
result; // result of the reduction or vector-vector dot product
int batch = blockIdx.x; // one block per batch
if (threadIdx.x >= v_dim)
return;
////////////////////
// update_buffer
////////////////////
scalar_t y = vec1[batch * v_dim + threadIdx.x];
scalar_t s = vec2[batch * v_dim + threadIdx.x];
reduce(y * s, v_dim, &data[0], &result);
scalar_t numerator = result;
if (threadIdx.x == 0) {
sum_out[batch] = 1 / numerator;
}
// return;
if (threadIdx.x < m - 1) {
// m threads participate to shift the values;
// this is safe as m < 32 and the warp executes in lockstep
rho_buffer[threadIdx.x * batchsize + batch] =
rho_buffer[(threadIdx.x + 1) * batchsize + batch];
} else if (threadIdx.x == m - 1) {
scalar_t rho = (1 / numerator);
rho_buffer[threadIdx.x * batchsize + batch] = rho;
}
}
template <typename scalar_t, typename psum_t>
__global__ void lbfgs_update_buffer_and_step_v1(
scalar_t *step_vec, // b x 175
scalar_t *rho_buffer, // m x b x 1
scalar_t *y_buffer, // m x b x 175
scalar_t *s_buffer, // m x b x 175
scalar_t *q, // b x 175
scalar_t *x_0, // b x 175
scalar_t *grad_0, // b x 175
const scalar_t *grad_q, // b x 175
const float epsilon, const int batchsize, const int m, const int v_dim,
const bool rolled_ys = false,
const bool stable_mode =
false) // s_buffer and y_buffer are not rolled by default
{
// extern __shared__ scalar_t alpha_buffer_sh[];
extern __shared__ __align__(sizeof(scalar_t)) unsigned char my_smem[];
scalar_t *my_smem_rc = reinterpret_cast<scalar_t *>(my_smem);
scalar_t *alpha_buffer_sh = &my_smem_rc[0]; // m*blockDim.x
scalar_t *rho_buffer_sh = &my_smem_rc[m * blockDim.x]; // batchsize*m
scalar_t *s_buffer_sh =
&my_smem_rc[m * blockDim.x + m * batchsize]; // m*blockDim.x
scalar_t *y_buffer_sh =
&my_smem_rc[2 * m * blockDim.x + m * batchsize]; // m*blockDim.x
__shared__ psum_t
data[32]; // temporary buffer needed for block-wide reduction
__shared__ scalar_t
result; // result of the reduction or vector-vector dot product
int batch = blockIdx.x; // one block per batch
if (threadIdx.x >= v_dim)
return;
scalar_t gq;
gq = grad_q[batch * v_dim + threadIdx.x]; // copy grad_q to gq
////////////////////
// update_buffer
////////////////////
scalar_t y = gq - grad_0[batch * v_dim + threadIdx.x];
// if y is close to zero
scalar_t s =
q[batch * v_dim + threadIdx.x] - x_0[batch * v_dim + threadIdx.x];
reduce_v1(y * s, v_dim, &data[0], &result);
scalar_t numerator = result;
// scalar_t rho = 1.0/numerator;
if (!rolled_ys) {
#pragma unroll
for (int i = 1; i < m; i++) {
scalar_t st =
s_buffer[i * batchsize * v_dim + batch * v_dim + threadIdx.x];
scalar_t yt =
y_buffer[i * batchsize * v_dim + batch * v_dim + threadIdx.x];
s_buffer[(i - 1) * batchsize * v_dim + batch * v_dim + threadIdx.x] = st;
y_buffer[(i - 1) * batchsize * v_dim + batch * v_dim + threadIdx.x] = yt;
s_buffer_sh[m * threadIdx.x + i - 1] = st;
y_buffer_sh[m * threadIdx.x + i - 1] = yt;
}
}
s_buffer[(m - 1) * batchsize * v_dim + batch * v_dim + threadIdx.x] = s;
y_buffer[(m - 1) * batchsize * v_dim + batch * v_dim + threadIdx.x] = y;
s_buffer_sh[m * threadIdx.x + m - 1] = s;
y_buffer_sh[m * threadIdx.x + m - 1] = y;
grad_0[batch * v_dim + threadIdx.x] = gq;
x_0[batch * v_dim + threadIdx.x] = q[batch * v_dim + threadIdx.x];
if (threadIdx.x < m - 1) {
// m threads participate to shift the values;
// this is safe as m < 32 and the warp executes in lockstep
scalar_t rho = rho_buffer[(threadIdx.x + 1) * batchsize + batch];
rho_buffer[threadIdx.x * batchsize + batch] = rho;
rho_buffer_sh[threadIdx.x * batchsize + batch] = rho;
}
if (threadIdx.x == m - 1) {
scalar_t rho = 1.0 / numerator;
// if this is nan, make it zero:
if (stable_mode && numerator == 0.0) {
rho = 0.0;
}
rho_buffer[threadIdx.x * batchsize + batch] = rho;
rho_buffer_sh[threadIdx.x * batchsize + batch] = rho;
}
// return;
__syncthreads();
////////////////////
// step
////////////////////
// scalar_t alpha_buffer[16];
// assert(m<16); // allocating a buffer assuming m < 16
#pragma unroll
for (int i = m - 1; i > -1; i--) {
// reduce(gq * s_buffer[i*batchsize*v_dim + batch*v_dim + threadIdx.x],
// v_dim, &data[0], &result);
reduce_v1(gq * s_buffer_sh[m * threadIdx.x + i], v_dim, &data[0], &result);
alpha_buffer_sh[threadIdx.x * m + i] =
result * rho_buffer_sh[i * batchsize + batch];
// gq = gq - alpha_buffer_sh[threadIdx.x*m+i]*y_buffer[i*batchsize*v_dim +
// batch*v_dim + threadIdx.x];
gq = gq - alpha_buffer_sh[threadIdx.x * m + i] *
y_buffer_sh[m * threadIdx.x + i];
}
// compute var1
reduce_v1(y * y, v_dim, data, &result);
scalar_t denominator = result;
// reduce(s*y, v_dim, data, &result); // redundant - already computed it above
// scalar_t numerator = result;
scalar_t var1 = numerator / denominator;
// To improve stability, uncomment below line: [this however leads to poor
// convergence]
if (stable_mode && denominator == 0.0) {
var1 = epsilon;
}
scalar_t gamma = relu(var1);
gq = gamma * gq;
#pragma unroll
for (int i = 0; i < m; i++) {
// reduce(gq * y_buffer[i*batchsize*v_dim + batch*v_dim + threadIdx.x],
// v_dim, &data[0], &result); gq = gq + (alpha_buffer_sh[threadIdx.x*m+i] -
// result * rho_buffer_sh[i*batchsize+batch]) * s_buffer[i*batchsize*v_dim +
// batch*v_dim + threadIdx.x];
reduce_v1(gq * y_buffer_sh[m * threadIdx.x + i], v_dim, &data[0], &result);
gq = gq + (alpha_buffer_sh[threadIdx.x * m + i] -
result * rho_buffer_sh[i * batchsize + batch]) *
s_buffer_sh[m * threadIdx.x + i];
}
step_vec[batch * v_dim + threadIdx.x] =
-1.0 * gq; // copy from shared memory to global memory
}
// (32/M) rolls per warp
// Threads in a warp on a GPU execute in lockstep. We leverage that to do
// the roll without using temporary storage or explicit synchronization.
template <typename scalar_t>
__global__ void lbfgs_roll(scalar_t *a, // m x b x 175
scalar_t *b, // m x b x 175
const int m_t, const int batchsize, const int m,
const int v_dim) {
assert(m_t <= 32);
int t = blockDim.x * blockIdx.x + threadIdx.x;
if (t >= m_t * v_dim * batchsize)
return;
int _m = t % m_t;
int _v_dim = (t / m_t) % v_dim;
int batch = t / (m_t * v_dim); // consistent with the _m and _v_dim decomposition above
if (_m < m - 1) {
a[_m * batchsize * v_dim + batch * v_dim + _v_dim] =
a[(_m + 1) * batchsize * v_dim + batch * v_dim + _v_dim];
b[_m * batchsize * v_dim + batch * v_dim + _v_dim] =
b[(_m + 1) * batchsize * v_dim + batch * v_dim + _v_dim];
}
}
template <typename scalar_t, typename psum_t, bool rolled_ys>
__global__ void lbfgs_update_buffer_and_step(
scalar_t *step_vec, // b x 175
scalar_t *rho_buffer, // m x b x 1
scalar_t *y_buffer, // m x b x 175
scalar_t *s_buffer, // m x b x 175
scalar_t *q, // b x 175
scalar_t *x_0, // b x 175
scalar_t *grad_0, // b x 175
const scalar_t *grad_q, // b x 175
const float epsilon, const int batchsize, const int m, const int v_dim,
const bool stable_mode =
false) // s_buffer and y_buffer are not rolled by default
{
// extern __shared__ scalar_t alpha_buffer_sh[];
extern __shared__ __align__(sizeof(scalar_t)) unsigned char my_smem[];
scalar_t *alpha_buffer_sh = reinterpret_cast<scalar_t *>(my_smem);
__shared__ psum_t
data[32]; // temporary buffer needed for block-wide reduction
__shared__ scalar_t
result; // result of the reduction or vector-vector dot product
int batch = blockIdx.x; // one block per batch
if (threadIdx.x >= v_dim)
return;
scalar_t gq;
gq = grad_q[batch * v_dim + threadIdx.x]; // copy grad_q to gq
////////////////////
// update_buffer
////////////////////
scalar_t y = gq - grad_0[batch * v_dim + threadIdx.x];
// if y is close to zero
scalar_t s =
q[batch * v_dim + threadIdx.x] - x_0[batch * v_dim + threadIdx.x];
reduce(y * s, v_dim, &data[0], &result);
scalar_t numerator = result;
// scalar_t rho = 1.0/numerator;
if (!rolled_ys) {
for (int i = 1; i < m; i++) {
s_buffer[(i - 1) * batchsize * v_dim + batch * v_dim + threadIdx.x] =
s_buffer[i * batchsize * v_dim + batch * v_dim + threadIdx.x];
y_buffer[(i - 1) * batchsize * v_dim + batch * v_dim + threadIdx.x] =
y_buffer[i * batchsize * v_dim + batch * v_dim + threadIdx.x];
}
}
s_buffer[(m - 1) * batchsize * v_dim + batch * v_dim + threadIdx.x] = s;
y_buffer[(m - 1) * batchsize * v_dim + batch * v_dim + threadIdx.x] = y;
grad_0[batch * v_dim + threadIdx.x] = gq;
x_0[batch * v_dim + threadIdx.x] = q[batch * v_dim + threadIdx.x];
if (threadIdx.x < m - 1) {
// m threads participate to shift the values;
// this is safe as m < 32 and the warp executes in lockstep
rho_buffer[threadIdx.x * batchsize + batch] =
rho_buffer[(threadIdx.x + 1) * batchsize + batch];
}
if (threadIdx.x == m - 1) {
scalar_t rho = 1.0 / numerator;
// if this is nan, make it zero:
if (stable_mode && numerator == 0.0) {
rho = 0.0;
}
rho_buffer[threadIdx.x * batchsize + batch] = rho;
}
// return;
//__syncthreads();
////////////////////
// step
////////////////////
// scalar_t alpha_buffer[16];
// assert(m<16); // allocating a buffer assuming m < 16
#pragma unroll
for (int i = m - 1; i > -1; i--) {
reduce(gq * s_buffer[i * batchsize * v_dim + batch * v_dim + threadIdx.x],
v_dim, &data[0], &result);
alpha_buffer_sh[threadIdx.x * m + i] =
result * rho_buffer[i * batchsize + batch];
gq = gq - alpha_buffer_sh[threadIdx.x * m + i] *
y_buffer[i * batchsize * v_dim + batch * v_dim + threadIdx.x];
}
// compute var1
reduce(y * y, v_dim, data, &result);
scalar_t denominator = result;
// reduce(s*y, v_dim, data, &result); // redundant - already computed it above
// scalar_t numerator = result;
scalar_t var1 = numerator / denominator;
// To improve stability, uncomment below line: [this however leads to poor
// convergence]
if (stable_mode && denominator == 0.0) {
var1 = epsilon;
}
scalar_t gamma = relu(var1);
gq = gamma * gq;
#pragma unroll
for (int i = 0; i < m; i++) {
reduce(gq * y_buffer[i * batchsize * v_dim + batch * v_dim + threadIdx.x],
v_dim, &data[0], &result);
gq = gq + (alpha_buffer_sh[threadIdx.x * m + i] -
result * rho_buffer[i * batchsize + batch]) *
s_buffer[i * batchsize * v_dim + batch * v_dim + threadIdx.x];
}
step_vec[batch * v_dim + threadIdx.x] =
-1.0 * gq; // copy from shared memory to global memory
}
} // namespace Optimization
} // namespace Curobo
std::vector<torch::Tensor>
lbfgs_step_cuda(torch::Tensor step_vec, torch::Tensor rho_buffer,
torch::Tensor y_buffer, torch::Tensor s_buffer,
torch::Tensor grad_q, const float epsilon, const int batch_size,
const int m, const int v_dim) {
using namespace Curobo::Optimization;
const int threadsPerBlock = 128;
const int blocksPerGrid =
((batch_size) + threadsPerBlock - 1) / threadsPerBlock;
// launch threads per batch:
// int threadsPerBlock = pow(2,((int)log2(v_dim))+1);
// const int blocksPerGrid = batch_size; //((batch_size) + threadsPerBlock -
// 1) / threadsPerBlock;
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES(
step_vec.scalar_type(), "lbfgs_step_cu", ([&] {
lbfgs_step_kernel_old<scalar_t>
<<<blocksPerGrid, threadsPerBlock,
v_dim * sizeof(step_vec.scalar_type()), stream>>>(
step_vec.data_ptr<scalar_t>(), rho_buffer.data_ptr<scalar_t>(),
y_buffer.data_ptr<scalar_t>(), s_buffer.data_ptr<scalar_t>(),
grad_q.data_ptr<scalar_t>(), epsilon, batch_size, m, v_dim);
}));
C10_CUDA_KERNEL_LAUNCH_CHECK();
return {step_vec};
}
std::vector<torch::Tensor>
lbfgs_update_cuda(torch::Tensor rho_buffer, torch::Tensor y_buffer,
torch::Tensor s_buffer, torch::Tensor q, torch::Tensor grad_q,
torch::Tensor x_0, torch::Tensor grad_0, const int batch_size,
const int m, const int v_dim) {
using namespace Curobo::Optimization;
// const int threadsPerBlock = 128;
// launch threads per batch:
// int threadsPerBlock = pow(2,((int)log2(v_dim))+1);
int threadsPerBlock = v_dim;
const int blocksPerGrid =
batch_size; //((batch_size) + threadsPerBlock - 1) / threadsPerBlock;
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES(
y_buffer.scalar_type(), "lbfgs_update_cu", ([&] {
lbfgs_update_buffer_kernel<scalar_t, scalar_t>
<<<blocksPerGrid, threadsPerBlock,
v_dim * sizeof(y_buffer.scalar_type()), stream>>>(
rho_buffer.data_ptr<scalar_t>(), y_buffer.data_ptr<scalar_t>(),
s_buffer.data_ptr<scalar_t>(), q.data_ptr<scalar_t>(),
x_0.data_ptr<scalar_t>(), grad_0.data_ptr<scalar_t>(),
grad_q.data_ptr<scalar_t>(), batch_size, m, v_dim);
}));
C10_CUDA_KERNEL_LAUNCH_CHECK();
return {rho_buffer, y_buffer, s_buffer, x_0, grad_0};
}
std::vector<torch::Tensor>
lbfgs_cuda_fuse(torch::Tensor step_vec, torch::Tensor rho_buffer,
torch::Tensor y_buffer, torch::Tensor s_buffer, torch::Tensor q,
torch::Tensor grad_q, torch::Tensor x_0, torch::Tensor grad_0,
const float epsilon, const int batch_size, const int m,
const int v_dim, const bool stable_mode) {
using namespace Curobo::Optimization;
// call first kernel:
int threadsPerBlock = v_dim;
assert(threadsPerBlock < 1024);
assert(m < M_MAX);
int blocksPerGrid =
batch_size; //((batch_size) + threadsPerBlock - 1) / threadsPerBlock;
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
int smemsize = 0;
if (true) {
AT_DISPATCH_FLOATING_TYPES(
y_buffer.scalar_type(), "lbfgs_cuda_fuse_kernel", [&] {
smemsize = m * threadsPerBlock * sizeof(scalar_t);
lbfgs_update_buffer_and_step<scalar_t, scalar_t, false>
<<<blocksPerGrid, threadsPerBlock, smemsize, stream>>>(
step_vec.data_ptr<scalar_t>(),
rho_buffer.data_ptr<scalar_t>(),
y_buffer.data_ptr<scalar_t>(), s_buffer.data_ptr<scalar_t>(),
q.data_ptr<scalar_t>(), x_0.data_ptr<scalar_t>(),
grad_0.data_ptr<scalar_t>(), grad_q.data_ptr<scalar_t>(),
epsilon, batch_size, m, v_dim, stable_mode);
});
} else {
// v1 does not work
AT_DISPATCH_FLOATING_TYPES(
y_buffer.scalar_type(), "lbfgs_cuda_fuse_kernel_v1", [&] {
smemsize = 3 * m * threadsPerBlock * sizeof(scalar_t) +
m * batch_size * sizeof(scalar_t);
lbfgs_update_buffer_and_step_v1<scalar_t, scalar_t>
<<<blocksPerGrid, threadsPerBlock, smemsize, stream>>>(
step_vec.data_ptr<scalar_t>(),
rho_buffer.data_ptr<scalar_t>(),
y_buffer.data_ptr<scalar_t>(), s_buffer.data_ptr<scalar_t>(),
q.data_ptr<scalar_t>(), x_0.data_ptr<scalar_t>(),
grad_0.data_ptr<scalar_t>(), grad_q.data_ptr<scalar_t>(),
epsilon, batch_size, m, v_dim, false, stable_mode);
});
}
C10_CUDA_KERNEL_LAUNCH_CHECK();
return {step_vec, rho_buffer, y_buffer, s_buffer, x_0, grad_0};
}
std::vector<torch::Tensor> reduce_cuda(torch::Tensor vec, torch::Tensor vec2,
torch::Tensor rho_buffer,
torch::Tensor sum, const int batch_size,
const int m, const int v_dim) {
using namespace Curobo::Optimization;
int threadsPerBlock = pow(2, ((int)log2(v_dim)) + 1);
int blocksPerGrid =
batch_size; //((batch_size) + threadsPerBlock - 1) / threadsPerBlock;
// printf("threadsPerBlock:%d, blocksPerGrid: %d\n",
// threadsPerBlock, blocksPerGrid);
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES(
vec.scalar_type(), "reduce_cu", ([&] {
reduce_kernel<scalar_t, scalar_t>
<<<blocksPerGrid, threadsPerBlock, 0, stream>>>(
vec.data_ptr<scalar_t>(), vec2.data_ptr<scalar_t>(),
rho_buffer.data_ptr<scalar_t>(), sum.data_ptr<scalar_t>(),
batch_size, m, v_dim);
}));
C10_CUDA_KERNEL_LAUNCH_CHECK();
return {sum, rho_buffer};
}
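For reference, the update half of these kernels (lbfgs_update_buffer_kernel and the first section of the fused kernel) shifts the history window and appends the newest (s, y, rho) triple; a compact PyTorch sketch of that bookkeeping, assuming the m x batch x v_dim layout from the kernel comments and offered only as a reading aid:

import torch

def lbfgs_update_reference(rho, y_buf, s_buf, q, grad_q, x_0, grad_0,
                           stable_mode=False):
    # rho: (m, b); y_buf, s_buf: (m, b, v); q, grad_q, x_0, grad_0: (b, v)
    y = grad_q - grad_0
    s = q - x_0
    ys = (y * s).sum(-1)
    new_rho = 1.0 / ys
    if stable_mode:  # zero rho where the curvature dot product vanishes
        new_rho = torch.where(ys == 0, torch.zeros_like(ys), new_rho)
    rho = torch.cat([rho[1:], new_rho[None]])        # roll window, append newest
    y_buf = torch.cat([y_buf[1:], y[None]])
    s_buf = torch.cat([s_buf[1:], s[None]])
    return rho, y_buf, s_buf, q.clone(), grad_q.clone()   # new x_0 and grad_0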


@@ -0,0 +1,103 @@
/*
* Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
*
* NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
* property and proprietary rights in and to this material, related
* documentation and any modifications thereto. Any use, reproduction,
* disclosure or distribution of this material and related documentation
* without an express license agreement from NVIDIA CORPORATION or
* its affiliates is strictly prohibited.
*/
#include <torch/extension.h>
#include <vector>
#include <c10/cuda/CUDAGuard.h>
// CUDA forward declarations
std::vector<torch::Tensor>
update_best_cuda(torch::Tensor best_cost, torch::Tensor best_q,
torch::Tensor best_iteration,
torch::Tensor current_iteration,
const torch::Tensor cost,
const torch::Tensor q, const int d_opt, const int cost_s1,
const int cost_s2, const int iteration,
const float delta_threshold,
const float relative_threshold = 0.999);
std::vector<torch::Tensor> line_search_cuda(
// torch::Tensor m,
torch::Tensor best_x, torch::Tensor best_c, torch::Tensor best_grad,
const torch::Tensor g_x, const torch::Tensor x_set,
const torch::Tensor step_vec, const torch::Tensor c_0,
const torch::Tensor alpha_list, const torch::Tensor c_idx, const float c_1,
const float c_2, const bool strong_wolfe, const bool approx_wolfe,
const int l1, const int l2, const int batchsize);
// C++ interface
// NOTE: AT_ASSERT has become AT_CHECK on master after 0.4.
#define CHECK_CUDA(x) AT_ASSERTM(x.is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) \
AT_ASSERTM(x.is_contiguous(), #x " must be contiguous")
#define CHECK_INPUT(x) \
CHECK_CUDA(x); \
CHECK_CONTIGUOUS(x)
std::vector<torch::Tensor> line_search_call(
// torch::Tensor m,
torch::Tensor best_x, torch::Tensor best_c, torch::Tensor best_grad,
const torch::Tensor g_x, const torch::Tensor x_set,
const torch::Tensor step_vec, const torch::Tensor c_0,
const torch::Tensor alpha_list, const torch::Tensor c_idx, const float c_1,
const float c_2, const bool strong_wolfe, const bool approx_wolfe,
const int l1, const int l2, const int batchsize) {
CHECK_INPUT(g_x);
CHECK_INPUT(x_set);
CHECK_INPUT(step_vec);
CHECK_INPUT(c_0);
CHECK_INPUT(alpha_list);
CHECK_INPUT(c_idx);
// CHECK_INPUT(m);
CHECK_INPUT(best_x);
CHECK_INPUT(best_c);
CHECK_INPUT(best_grad);
const at::cuda::OptionalCUDAGuard guard(best_x.device());
// return line_search_cuda(m, g_x, step_vec, c_0, alpha_list, c_1, c_2,
// strong_wolfe, l1, l2, batchsize);
return line_search_cuda(best_x, best_c, best_grad, g_x, x_set, step_vec, c_0,
alpha_list, c_idx, c_1, c_2, strong_wolfe,
approx_wolfe, l1, l2, batchsize);
}
std::vector<torch::Tensor>
update_best_call(torch::Tensor best_cost, torch::Tensor best_q,
torch::Tensor best_iteration,
torch::Tensor current_iteration,
const torch::Tensor cost,
const torch::Tensor q, const int d_opt, const int cost_s1,
const int cost_s2, const int iteration,
const float delta_threshold,
const float relative_threshold = 0.999) {
CHECK_INPUT(best_cost);
CHECK_INPUT(best_q);
CHECK_INPUT(cost);
CHECK_INPUT(q);
CHECK_INPUT(current_iteration);
CHECK_INPUT(best_iteration);
const at::cuda::OptionalCUDAGuard guard(cost.device());
return update_best_cuda(best_cost, best_q, best_iteration, current_iteration, cost, q, d_opt,
cost_s1, cost_s2, iteration, delta_threshold, relative_threshold);
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("update_best", &update_best_call, "Update Best (CUDA)");
m.def("line_search", &line_search_call, "Line search (CUDA)");
}
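line_search selects, per batch, the largest step that satisfies the (strong or weak) Wolfe conditions, falling back to the Armijo-only candidate and then to index 1 when approx_wolfe is off. A PyTorch sketch of that selection, written against the torch expressions quoted in the kernel comments of the next file; it indexes batched tensors directly instead of using the flattened c_idx offsets, and assumes alpha_list is a single (l1,) vector, as the kernels read it without a batch offset:

import torch

def line_search_reference(g_x, x_set, step_vec, c, alpha_list, c_1, c_2,
                          strong_wolfe, approx_wolfe):
    # g_x, x_set: (b, l1, d); step_vec: (b, d, 1); c: (b, l1, 1); alpha_list: (l1,)
    g_step = g_x @ step_vec                           # directional derivatives, (b, l1, 1)
    g0 = g_step[:, 0:1]
    alpha = alpha_list.view(1, -1, 1)
    wolfe_1 = c <= c[:, 0:1] + c_1 * alpha * g0       # sufficient decrease
    if strong_wolfe:
        wolfe_2 = g_step.abs() <= c_2 * g0.abs()      # curvature condition
    else:
        wolfe_2 = g_step >= c_2 * g0
    wolfe = wolfe_1 & wolfe_2
    _, m_idx = (wolfe * (alpha + 0.1)).max(dim=-2)    # best fully-satisfying candidate
    _, m1_idx = (wolfe_1 * (alpha + 0.1)).max(dim=-2) # best Armijo-only candidate
    if not approx_wolfe:
        m_idx = torch.where(m_idx == 0, m1_idx, m_idx)
        m_idx[m_idx == 0] = 1
    b = torch.arange(g_x.shape[0])
    sel = m_idx.view(-1)
    return x_set[b, sel], c[b, sel], g_x[b, sel]      # best_x, best_c, best_grad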


@@ -0,0 +1,398 @@
/*
* Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
*
* NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
* property and proprietary rights in and to this material, related
* documentation and any modifications thereto. Any use, reproduction,
* disclosure or distribution of this material and related documentation
* without an express license agreement from NVIDIA CORPORATION or
* its affiliates is strictly prohibited.
*/
#pragma once
#include <cuda.h>
#include <torch/extension.h>
#include <vector>
#include <c10/cuda/CUDAStream.h>
#include <cuda_fp16.h>
// #include "helper_cuda.h"
#include "helper_math.h"
#include <assert.h>
#include <cstdio>
#include <cstdlib>
#include <ctime>
#include <cub/cub.cuh>
#include <math.h>
// #include <stdio.h>
//
// For the CUDA runtime routines (prefixed with "cuda_")
// #include <cuda_runtime.h>
// #include <cuda_fp16.h>
// #include <helper_cuda.h>
#define FULL_MASK 0xffffffff
namespace Curobo {
namespace Optimization {
template <typename scalar_t, typename psum_t>
__inline__ __device__ void reduce(scalar_t v, int m, unsigned mask,
psum_t *data, scalar_t *result) {
psum_t val = v;
val += __shfl_down_sync(mask, val, 1);
val += __shfl_down_sync(mask, val, 2);
val += __shfl_down_sync(mask, val, 4);
val += __shfl_down_sync(mask, val, 8);
val += __shfl_down_sync(mask, val, 16);
// int leader = __ffs(mask) - 1; // select a leader lane
int leader = 0;
if (threadIdx.x % 32 == leader) {
if (m <= 32) {
result[0] = (scalar_t)val;
} else {
data[(threadIdx.x + 1) / 32] = val;
}
}
if (m > 32) {
__syncthreads();
int elems = (m + 31) / 32;
assert(elems <= 32);
unsigned mask2 = __ballot_sync(FULL_MASK, threadIdx.x < elems);
if (threadIdx.x < elems) { // only the first warp will do this work
psum_t val2 = data[threadIdx.x % 32];
int shift = 1;
for (int i = elems - 1; i > 0; i /= 2) {
val2 += __shfl_down_sync(mask2, val2, shift);
shift *= 2;
}
// int leader = __ffs(mask2) - 1; // select a leader lane
int leader = 0;
if (threadIdx.x % 32 == leader) {
result[0] = (scalar_t)val2;
}
}
}
__syncthreads();
}
// Launched with l2 threads/block and batchsize blocks
template <typename scalar_t, typename psum_t>
__global__ void line_search_kernel(
// int64_t *m_idx, // 4x1x1
scalar_t *best_x, // 4x280
scalar_t *best_c, // 4x1
scalar_t *best_grad, // 4x280
const scalar_t *g_x, // 4x6x280
const scalar_t *x_set, // 4x6x280
const scalar_t *step_vec, // 4x280x1
const scalar_t *c, // 4x6x1
const scalar_t *alpha_list, // 4x6x1
const int64_t *c_idx, // 4x1x1
const float c_1, const float c_2, const bool strong_wolfe,
const bool approx_wolfe,
const int l1, // 6
const int l2, // 280
const int batchsize) // 4
{
int batch = blockIdx.x;
__shared__ psum_t data[32];
__shared__ scalar_t result[32];
assert(l1 <= 32);
unsigned mask = __ballot_sync(FULL_MASK, threadIdx.x < l2);
if (threadIdx.x >= l2) {
return;
}
scalar_t sv_elem = step_vec[batch * l2 + threadIdx.x];
// g_step = g0 @ step_vec_T
// g_x @ step_vec_T
for (int i = 0; i < l1; i++) {
reduce(g_x[batch * l1 * l2 + l2 * i + threadIdx.x] * sv_elem, l2, mask,
&data[0], &result[i]);
}
__shared__ scalar_t step_success[32];
__shared__ scalar_t step_success_w1[32];
assert(blockDim.x >= l1);
bool wolfe_1 = false;
bool wolfe = false;
bool condition = threadIdx.x < l1;
if (condition) {
// scalar_t alpha_list_elem = alpha_list[batch*l1 + threadIdx.x];
scalar_t alpha_list_elem = alpha_list[threadIdx.x];
// condition 1:
wolfe_1 = c[batch * l1 + threadIdx.x] <=
(c[batch * l1] + c_1 * alpha_list_elem * result[0]);
// condition 2:
bool wolfe_2;
if (strong_wolfe) {
wolfe_2 = abs(result[threadIdx.x]) <= c_2 * abs(result[0]);
} else {
wolfe_2 = result[threadIdx.x] >= c_2 * result[0];
}
wolfe = wolfe_1 & wolfe_2;
step_success[threadIdx.x] = wolfe * (alpha_list_elem + 0.1);
step_success_w1[threadIdx.x] = wolfe_1 * (alpha_list_elem + 0.1);
}
__syncthreads();
__shared__ int idx_shared;
if (threadIdx.x == 0) {
int m_id = 0;
int m1_id = 0;
scalar_t max1 = step_success[0];
scalar_t max2 = step_success_w1[0];
for (int i = 1; i < l1; i++) {
if (max1 < step_success[i]) {
max1 = step_success[i];
m_id = i;
}
if (max2 < step_success_w1[i]) {
max2 = step_success_w1[i];
m1_id = i;
}
}
if (!approx_wolfe) {
// m_idx = torch.where(m_idx == 0, m1_idx, m_idx)
if (m_id == 0) {
m_id = m1_id;
}
// m_idx[m_idx == 0] = 1
if (m_id == 0) {
m_id = 1;
}
}
idx_shared = m_id + c_idx[batch];
}
////////////////////////////////////
// write outputs using the computed index.
// one index per batch is computed
////////////////////////////////////
// l2 is d_opt, l1 is line_search n.
// idx_shared contains index in l1
//
__syncthreads();
if (threadIdx.x < l2) {
if (threadIdx.x == 0) {
// printf("block: %d, idx_shared: %d\n", batch, idx_shared);
}
best_x[batch * l2 + threadIdx.x] = x_set[idx_shared * l2 + threadIdx.x];
best_grad[batch * l2 + threadIdx.x] = g_x[idx_shared * l2 + threadIdx.x];
}
if (threadIdx.x == 0) {
best_c[batch] = c[idx_shared];
}
}
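// line_search_kernel_mask below computes the same per-batch selection, but
// replaces the shared-memory argmax over step_success with warp ballots: the
// wolfe flags are packed into bitmasks and the last satisfying candidate is
// recovered with __brev/__ffs (see the commented-out cross-check further down).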
// Launched with l2 threads/block and #blocks = batchsize
template <typename scalar_t, typename psum_t>
__global__ void line_search_kernel_mask(
// int64_t *m_idx, // 4x1x1
scalar_t *best_x, // 4x280
scalar_t *best_c, // 4x1
scalar_t *best_grad, // 4x280
const scalar_t *g_x, // 4x6x280
const scalar_t *x_set, // 4x6x280
const scalar_t *step_vec, // 4x280x1
const scalar_t *c, // 4x6x1
const scalar_t *alpha_list, // 4x6x1
const int64_t *c_idx, // 4x1x1
const float c_1, const float c_2, const bool strong_wolfe,
const bool approx_wolfe,
const int l1, // 6
const int l2, // 280
const int batchsize) // 4
{
int batch = blockIdx.x;
__shared__ psum_t data[32];
__shared__ scalar_t result[32];
assert(l1 <= 32);
unsigned mask = __ballot_sync(FULL_MASK, threadIdx.x < l2);
if (threadIdx.x >= l2) {
return;
}
scalar_t sv_elem = step_vec[batch * l2 + threadIdx.x];
// g_step = g0 @ step_vec_T
// g_x @ step_vec_T
for (int i = 0; i < l1; i++) {
reduce(g_x[batch * l1 * l2 + l2 * i + threadIdx.x] * sv_elem, l2, mask,
&data[0], &result[i]);
}
// __shared__ scalar_t step_success[32];
// __shared__ scalar_t step_success_w1[32];
assert(blockDim.x >= l1);
bool wolfe_1 = false;
bool wolfe = false;
bool condition = threadIdx.x < l1;
if (condition) {
scalar_t alpha_list_elem = alpha_list[threadIdx.x];
// scalar_t alpha_list_elem = alpha_list[batch*l1 + threadIdx.x];
// condition 1:
wolfe_1 = c[batch * l1 + threadIdx.x] <=
(c[batch * l1] + c_1 * alpha_list_elem * result[0]);
// condition 2:
bool wolfe_2;
if (strong_wolfe) {
wolfe_2 = abs(result[threadIdx.x]) <= c_2 * abs(result[0]);
} else {
wolfe_2 = result[threadIdx.x] >= c_2 * result[0];
}
// wolfe = torch.logical_and(wolfe_1, wolfe_2)
wolfe = wolfe_1 & wolfe_2;
// // step_success = wolfe * (self.alpha_list[:, :, 0:1] + 0.1)
// // step_success_w1 = wolfe_1 * (self.alpha_list[:, :, 0:1] + 0.1)
// step_success[threadIdx.x] = wolfe * (alpha_list_elem + 0.1);
// step_success_w1[threadIdx.x] = wolfe_1 * (alpha_list_elem + 0.1);
}
unsigned msk1 = __ballot_sync(FULL_MASK, wolfe_1 & condition);
unsigned msk = __ballot_sync(FULL_MASK, wolfe & condition);
  // get the index of the last occurrence of true
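  // __ballot_sync sets bit k of the mask iff lane k passed the test, so after
  // __brev the least-significant set bit corresponds to the highest passing
  // lane; 32 - __ffs therefore yields the highest-index step that satisfied
  // the condition, while a value of 32 means no lane passed at all.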
unsigned msk1_brev = __brev(msk1);
unsigned msk_brev = __brev(msk);
  int id1 = 32 - __ffs(msk1_brev); // position of least significant bit set to 1
  int id = 32 - __ffs(msk_brev);   // position of least significant bit set to 1
__syncthreads();
__shared__ int idx_shared;
if (threadIdx.x == 0) {
if (!approx_wolfe) {
if (id == 32) { // msk is zero
id = id1;
}
if (id == 0) { // bit 0 is set
id = id1;
}
if (id == 32) { // msk is zero
id = 1;
}
if (id == 0) {
id = 1;
}
} else {
if (id == 32) { // msk is zero
id = 0;
}
}
// // _, m_idx = torch.max(step_success, dim=-2)
// // _, m1_idx = torch.max(step_success_w1, dim=-2)
// int m_id = 0;
// int m1_id = 0;
// scalar_t max1 = step_success[0];
// scalar_t max2 = step_success_w1[0];
// for (int i=1; i<l1; i++) {
// if (max1<step_success[i]) {
// max1 = step_success[i];
// m_id = i;
// }
// if (max2<step_success_w1[i]) {
// max2 = step_success_w1[i];
// m1_id = i;
// }
// }
// // m_idx = torch.where(m_idx == 0, m1_idx, m_idx)
// if (m_id == 0) {
// m_id = m1_id;
// }
// // m_idx[m_idx == 0] = 1
// if (m_id == 0) {
// m_id = 1;
// }
// if (id != m_id) {
// printf("id=%d, m_id=%d\n", id, m_id);
// printf("msk1=%x, msk=%x, raw id1=%d, raw id=%d\n", msk1, msk,
// 32-__ffs(msk1_brev), 32-__ffs(msk_brev));
// }
// m_idx[batch] = m_id;
// m_idx[batch] = id;
idx_shared = id + c_idx[batch];
}
////////////////////////////////////
// write outputs using the computed index.
// one index per batch is computed
////////////////////////////////////
__syncthreads();
if (threadIdx.x < l2) {
if (threadIdx.x == 0) {
// printf("block: %d, idx_shared: %d\n", batch, idx_shared);
}
best_x[batch * l2 + threadIdx.x] = x_set[idx_shared * l2 + threadIdx.x];
best_grad[batch * l2 + threadIdx.x] = g_x[idx_shared * l2 + threadIdx.x];
}
if (threadIdx.x == 0) {
best_c[batch] = c[idx_shared];
}
}
} // namespace Optimization
} // namespace Curobo
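// Host launcher for the masked line-search kernel. As read by the kernel:
// best_x/best_grad are [batch, l2], best_c is [batch], g_x/x_set are
// [batch, l1, l2], step_vec is [batch, l2], c_0 is [batch, l1], c_idx gives
// the row offset of each batch item into the flattened candidate set, and
// only the first l1 entries of alpha_list are used (the same step sizes are
// shared across the batch). One block is launched per batch item, with l2
// rounded up to a full warp.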
std::vector<torch::Tensor> line_search_cuda(
// torch::Tensor m_idx,
torch::Tensor best_x, torch::Tensor best_c, torch::Tensor best_grad,
const torch::Tensor g_x, const torch::Tensor x_set,
const torch::Tensor step_vec, const torch::Tensor c_0,
const torch::Tensor alpha_list, const torch::Tensor c_idx, const float c_1,
const float c_2, const bool strong_wolfe, const bool approx_wolfe,
const int l1, const int l2, const int batchsize) {
using namespace Curobo::Optimization;
assert(l2 <= 1024);
  // round l2 up to a multiple of 32 so full warps are launched
const int threadsPerBlock = 32 * ((l2 + 31) / 32); // l2;
const int blocksPerGrid = batchsize;
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES(
g_x.scalar_type(), "line_search_cu", ([&] {
line_search_kernel_mask<scalar_t, scalar_t>
<<<blocksPerGrid, threadsPerBlock, 0, stream>>>(
// m_idx.data_ptr<int>(),
best_x.data_ptr<scalar_t>(), best_c.data_ptr<scalar_t>(),
best_grad.data_ptr<scalar_t>(), g_x.data_ptr<scalar_t>(),
x_set.data_ptr<scalar_t>(), step_vec.data_ptr<scalar_t>(),
c_0.data_ptr<scalar_t>(), alpha_list.data_ptr<scalar_t>(),
c_idx.data_ptr<int64_t>(), c_1, c_2, strong_wolfe, approx_wolfe,
l1, l2, batchsize);
}));
C10_CUDA_KERNEL_LAUNCH_CHECK();
return {best_x, best_c, best_grad};
}

View File

@@ -0,0 +1,790 @@
/*
* Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
*
* NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
* property and proprietary rights in and to this material, related
* documentation and any modifications thereto. Any use, reproduction,
* disclosure or distribution of this material and related documentation
* without an express license agreement from NVIDIA CORPORATION or
* its affiliates is strictly prohibited.
*/
#include <c10/cuda/CUDAStream.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <torch/extension.h>
#include "helper_math.h"
#include <cmath>
#include <cuda_fp16.h>
#include <vector>
#define SINGLE_GOAL 0
#define BATCH_GOAL 1
#define GOALSET 2
#define BATCH_GOALSET 3
namespace Curobo
{
namespace Pose
{
__device__ __forceinline__ void
transform_vec_quat(const float4 q, // quaternion as (qx, qy, qz, qw)
const float *d_vec_weight,
float *C) {
  // Rotate each 3-vector half of d_vec_weight by quaternion q (equivalent to
  // q * v * q_conj); identity rotations skip the expansion below.
  if (q.x != 0 || q.y != 0 || q.z != 0) {
C[0] = q.w * q.w * d_vec_weight[0] + 2 * q.y * q.w * d_vec_weight[2] -
2 * q.z * q.w * d_vec_weight[1] + q.x * q.x * d_vec_weight[0] +
2 * q.y * q.x * d_vec_weight[1] + 2 * q.z * q.x * d_vec_weight[2] -
q.z * q.z * d_vec_weight[0] - q.y * q.y * d_vec_weight[0];
C[1] = 2 * q.x * q.y * d_vec_weight[0] + q.y * q.y * d_vec_weight[1] +
2 * q.z * q.y * d_vec_weight[2] + 2 * q.w * q.z * d_vec_weight[0] -
q.z * q.z * d_vec_weight[1] + q.w * q.w * d_vec_weight[1] -
2 * q.x * q.w * d_vec_weight[2] - q.x * q.x * d_vec_weight[1];
C[2] = 2 * q.x * q.z * d_vec_weight[0] + 2 * q.y * q.z * d_vec_weight[1] +
q.z * q.z * d_vec_weight[2] - 2 * q.w * q.y * d_vec_weight[0] -
q.y * q.y * d_vec_weight[2] + 2 * q.w * q.x * d_vec_weight[1] -
q.x * q.x * d_vec_weight[2] + q.w * q.w * d_vec_weight[2];
C[3] = q.w * q.w * d_vec_weight[3] + 2 * q.y * q.w * d_vec_weight[5] -
2 * q.z * q.w * d_vec_weight[4] + q.x * q.x * d_vec_weight[3] +
2 * q.y * q.x * d_vec_weight[4] + 2 * q.z * q.x * d_vec_weight[5] -
q.z * q.z * d_vec_weight[3] - q.y * q.y * d_vec_weight[3];
C[4] = 2 * q.x * q.y * d_vec_weight[3] + q.y * q.y * d_vec_weight[4] +
2 * q.z * q.y * d_vec_weight[5] + 2 * q.w * q.z * d_vec_weight[3] -
q.z * q.z * d_vec_weight[4] + q.w * q.w * d_vec_weight[4] -
2 * q.x * q.w * d_vec_weight[5] - q.x * q.x * d_vec_weight[4];
C[5] = 2 * q.x * q.z * d_vec_weight[3] + 2 * q.y * q.z * d_vec_weight[4] +
q.z * q.z * d_vec_weight[5] - 2 * q.w * q.y * d_vec_weight[3] -
q.y * q.y * d_vec_weight[5] + 2 * q.w * q.x * d_vec_weight[4] -
q.x * q.x * d_vec_weight[5] + q.w * q.w * d_vec_weight[5];
  } else {
C[0] = d_vec_weight[0];
C[1] = d_vec_weight[1];
C[2] = d_vec_weight[2];
C[3] = d_vec_weight[3];
C[4] = d_vec_weight[4];
C[5] = d_vec_weight[5];
}
}
__device__ __forceinline__ void
inv_transform_vec_quat(const float4 q_in, // quaternion as (qx, qy, qz, qw)
const float *d_vec_weight,
float *C) {
  // Rotate each 3-vector half of d_vec_weight by the conjugate (inverse) of
  // q_in; identity rotations skip the expansion below.
  if (q_in.x != 0 || q_in.y != 0 || q_in.z != 0) {
float4 q = make_float4(-q_in.x, -q_in.y, -q_in.z, q_in.w);
C[0] = q.w * q.w * d_vec_weight[0] + 2 * q.y * q.w * d_vec_weight[2] -
2 * q.z * q.w * d_vec_weight[1] + q.x * q.x * d_vec_weight[0] +
2 * q.y * q.x * d_vec_weight[1] + 2 * q.z * q.x * d_vec_weight[2] -
q.z * q.z * d_vec_weight[0] - q.y * q.y * d_vec_weight[0];
C[1] = 2 * q.x * q.y * d_vec_weight[0] + q.y * q.y * d_vec_weight[1] +
2 * q.z * q.y * d_vec_weight[2] + 2 * q.w * q.z * d_vec_weight[0] -
q.z * q.z * d_vec_weight[1] + q.w * q.w * d_vec_weight[1] -
2 * q.x * q.w * d_vec_weight[2] - q.x * q.x * d_vec_weight[1];
C[2] = 2 * q.x * q.z * d_vec_weight[0] + 2 * q.y * q.z * d_vec_weight[1] +
q.z * q.z * d_vec_weight[2] - 2 * q.w * q.y * d_vec_weight[0] -
q.y * q.y * d_vec_weight[2] + 2 * q.w * q.x * d_vec_weight[1] -
q.x * q.x * d_vec_weight[2] + q.w * q.w * d_vec_weight[2];
C[3] = q.w * q.w * d_vec_weight[3] + 2 * q.y * q.w * d_vec_weight[5] -
2 * q.z * q.w * d_vec_weight[4] + q.x * q.x * d_vec_weight[3] +
2 * q.y * q.x * d_vec_weight[4] + 2 * q.z * q.x * d_vec_weight[5] -
q.z * q.z * d_vec_weight[3] - q.y * q.y * d_vec_weight[3];
C[4] = 2 * q.x * q.y * d_vec_weight[3] + q.y * q.y * d_vec_weight[4] +
2 * q.z * q.y * d_vec_weight[5] + 2 * q.w * q.z * d_vec_weight[3] -
q.z * q.z * d_vec_weight[4] + q.w * q.w * d_vec_weight[4] -
2 * q.x * q.w * d_vec_weight[5] - q.x * q.x * d_vec_weight[4];
C[5] = 2 * q.x * q.z * d_vec_weight[3] + 2 * q.y * q.z * d_vec_weight[4] +
q.z * q.z * d_vec_weight[5] - 2 * q.w * q.y * d_vec_weight[3] -
q.y * q.y * d_vec_weight[5] + 2 * q.w * q.x * d_vec_weight[4] -
q.x * q.x * d_vec_weight[5] + q.w * q.w * d_vec_weight[5];
  } else {
C[0] = d_vec_weight[0];
C[1] = d_vec_weight[1];
C[2] = d_vec_weight[2];
C[3] = d_vec_weight[3];
C[4] = d_vec_weight[4];
C[5] = d_vec_weight[5];
}
}
__device__ __forceinline__ void
compute_quat_distance(float *result, const float4 a, const float4 b) {
// We compute distance with the conjugate of b
float r_w = (a.w * b.w + a.x * b.x + a.y * b.y + a.z * b.z);
if (r_w < 0.0) {
r_w = 1.0;
} else {
r_w = -1.0;
}
result[0] = r_w * (-1 * a.w * b.x + b.w * a.x - a.y * b.z + b.y * a.z);
result[1] = r_w * (-1 * a.w * b.y + b.w * a.y - a.z * b.x + b.z * a.x);
result[2] = r_w * (-1 * a.w * b.z + b.w * a.z - a.x * b.y + b.x * a.y);
}
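// The three components above are the vector part of a * conj(b) (the error
// quaternion), with the sign flipped whenever the scalar part is
// non-negative, so that the error always corresponds to the shorter of the
// two equivalent rotations (quaternion double cover).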
__device__ __forceinline__ void
compute_distance(float *distance_vec, float &distance, float &position_distance,
float &rotation_distance, const float3 current_position,
const float3 goal_position, const float4 current_quat,
const float4 goal_quat, const float *vec_weight,
const float *vec_convergence, const float position_weight,
const float rotation_weight) {
compute_quat_distance(&distance_vec[3], goal_quat, current_quat);
// compute position distance
*(float3 *)&distance_vec[0] = current_position - goal_position;
// distance_vec[0] = goal_position
position_distance = 0;
rotation_distance = 0;
// scale by vec weight and position weight:
#pragma unroll 3
for (int i = 0; i < 3; i++) {
distance_vec[i] = vec_weight[i + 3] * distance_vec[i];
position_distance += distance_vec[i] * distance_vec[i];
}
#pragma unroll 3
for (int i = 3; i < 6; i++) {
distance_vec[i] = vec_weight[i - 3] * distance_vec[i];
rotation_distance += distance_vec[i] * distance_vec[i];
}
distance = 0;
if (rotation_distance > vec_convergence[0] * vec_convergence[0]) {
rotation_distance = sqrtf(rotation_distance);
distance += rotation_weight * rotation_distance;
}
if (position_distance > vec_convergence[1] * vec_convergence[1]) {
position_distance = sqrtf(position_distance);
distance += position_weight * position_distance;
}
}
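// In short: position error = W_p * (p - p_goal) and rotation error =
// W_r * vec(q_err), where vec_weight stores the rotation weights in [0:3]
// and the position weights in [3:6]; each weighted norm is added to the
// total cost with its own weight, but only once it exceeds the matching
// convergence threshold in vec_convergence.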
__device__ __forceinline__ void compute_metric_distance(
float *distance_vec, float &distance, float &position_distance,
float &rotation_distance, const float3 current_position,
const float3 goal_position, const float4 current_quat,
const float4 goal_quat, const float *vec_weight,
const float *vec_convergence, const float position_weight,
const float p_alpha, const float rotation_weight, const float r_alpha) {
compute_quat_distance(&distance_vec[3], goal_quat, current_quat);
// compute position distance
*(float3 *)&distance_vec[0] = current_position - goal_position;
// distance_vec[0] = goal_position
position_distance = 0;
rotation_distance = 0;
// scale by vec weight and position weight:
#pragma unroll 3
for (int i = 0; i < 3; i++) {
distance_vec[i] = vec_weight[i + 3] * distance_vec[i];
// position_distance += powf(distance_vec[i],2);
position_distance += distance_vec[i] * distance_vec[i];
}
#pragma unroll 3
for (int i = 3; i < 6; i++) {
distance_vec[i] = vec_weight[i - 3] * distance_vec[i];
// rotation_distance += powf(distance_vec[i],2);
rotation_distance += distance_vec[i] * distance_vec[i];
}
distance = 0;
if (rotation_distance > vec_convergence[0] * vec_convergence[0]) {
rotation_distance = sqrtf(rotation_distance);
distance += rotation_weight * log2f(coshf(r_alpha * rotation_distance));
}
if (position_distance > vec_convergence[1] * vec_convergence[1]) {
position_distance = sqrtf(position_distance);
distance += position_weight * log2f(coshf(p_alpha * position_distance));
}
}
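// Same structure as compute_distance, but each term contributes
// weight * log2(cosh(alpha * distance)) instead of the raw distance: a
// smooth cost that is quadratic near zero and grows linearly far from the
// goal, with the matching gradient handled in the metric kernel below.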
template <typename scalar_t>
__global__ void
backward_pose_distance_kernel(scalar_t *out_grad_p, // [b,3]
scalar_t *out_grad_q, // [b,4]
const scalar_t *grad_distance, // [b,1]
const scalar_t *grad_p_distance, // [b,1]
const scalar_t *grad_q_distance, // [b,1]
const scalar_t *pose_weight, // [2]
const scalar_t *grad_p_vec, // [b,3]
const scalar_t *grad_q_vec, // [b,4]
const int batch_size) {
const int batch_idx = blockDim.x * blockIdx.x + threadIdx.x;
if (batch_idx >= batch_size) {
return;
}
// read data
const float g_distance = grad_distance[batch_idx];
const float2 p_weight = *(float2 *)&pose_weight[0];
const float3 g_p_v = *(float3 *)&grad_p_vec[batch_idx * 3];
const float3 g_q_v = *(float3 *)&grad_q_vec[batch_idx * 4 + 1];
const float g_p_distance = grad_p_distance[batch_idx];
const float g_q_distance = grad_q_distance[batch_idx];
// compute position gradient
float3 g_p =
(g_p_v) * ((g_p_distance + g_distance * p_weight.y)); // scalar * float3
float3 g_q =
(g_q_v) * ((g_q_distance + g_distance * p_weight.x)); // scalar * float3
// write out
*(float3 *)&out_grad_p[batch_idx * 3] = g_p;
*(float3 *)&out_grad_q[batch_idx * 4 + 1] = g_q;
}
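// Chain rule: the incoming gradient of the combined distance is scaled by
// the respective pose weight (pose_weight[0] for rotation, pose_weight[1]
// for position), added to the gradient flowing through the individual
// position/rotation distance outputs, and multiplied by the direction
// vectors stored during the forward pass.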
template <typename scalar_t>
__global__ void backward_pose_kernel(scalar_t *out_grad_p, // [b,3]
scalar_t *out_grad_q, // [b,4]
const scalar_t *grad_distance, // [b,1]
const scalar_t *pose_weight, // [2]
const scalar_t *grad_p_vec, // [b,3]
const scalar_t *grad_q_vec, // [b,4]
const int batch_size) {
const int batch_idx = blockDim.x * blockIdx.x + threadIdx.x;
if (batch_idx >= batch_size) {
return;
}
// read data
const float g_distance = grad_distance[batch_idx];
const float2 p_weight = *(float2 *)&pose_weight[0];
const float3 g_p_v = *(float3 *)&grad_p_vec[batch_idx * 3];
const float3 g_q_v = *(float3 *)&grad_q_vec[batch_idx * 4 + 1];
// compute position gradient
float3 g_p = (g_p_v) * ((g_distance * p_weight.y)); // scalar * float3
float3 g_q = (g_q_v) * ((g_distance * p_weight.x)); // scalar * float3
// write out
*(float3 *)&out_grad_p[batch_idx * 3] = g_p;
*(float3 *)&out_grad_q[batch_idx * 4 + 1] = g_q;
}
template <typename scalar_t, bool write_distance>
__global__ void goalset_pose_distance_kernel(
scalar_t *out_distance, scalar_t *out_position_distance,
scalar_t *out_rotation_distance, scalar_t *out_p_vec, scalar_t *out_q_vec,
int32_t *out_gidx, const scalar_t *current_position,
const scalar_t *goal_position, const scalar_t *current_quat,
const scalar_t *goal_quat, const scalar_t *vec_weight,
const scalar_t *weight, const scalar_t *vec_convergence,
const scalar_t *run_weight, const scalar_t *run_vec_weight,
const int32_t *batch_pose_idx, const int mode, const int num_goals,
const int batch_size, const int horizon, const bool write_grad = false) {
const int t_idx = (blockDim.x * blockIdx.x + threadIdx.x);
const int batch_idx = t_idx / horizon;
const int h_idx = t_idx - (batch_idx * horizon);
if (batch_idx >= batch_size || h_idx >= horizon) {
return;
}
// read current pose:
float3 position =
*(float3 *)&current_position[batch_idx * horizon * 3 + h_idx * 3];
float4 quat_4 = *(float4 *)&current_quat[batch_idx * horizon * 4 + h_idx * 4];
float4 quat = make_float4(quat_4.y, quat_4.z, quat_4.w, quat_4.x);
// read weights:
float position_weight = weight[1];
float rotation_weight = weight[0];
if (!write_distance) {
position_weight *= run_weight[h_idx];
rotation_weight *= run_weight[h_idx];
if (position_weight == 0.0 && rotation_weight == 0.0) {
return;
}
}
float3 l_goal_position;
float4 l_goal_quat;
float distance_vec[6]; // = {0.0};
float pose_distance = 0.0;
float position_distance = 0.0;
float rotation_distance = 0.0;
float best_distance = INFINITY;
float best_position_distance = 0.0;
float best_rotation_distance = 0.0;
float best_distance_vec[6] = {0.0};
float best_des_vec_weight[6] = {0.0};
float d_vec_convergence[2];
*(float2 *)&d_vec_convergence[0] = *(float2 *)&vec_convergence[0];
int best_idx = -1;
float d_vec_weight[6];
float des_vec_weight[6] = {0.0};
*(float3 *)&d_vec_weight[0] = *(float3 *)&vec_weight[0];
*(float3 *)&d_vec_weight[3] = *(float3 *)&vec_weight[3];
if (h_idx < horizon - 1) {
*(float3 *)&d_vec_weight[0] *= *(float3 *)&run_vec_weight[0];
*(float3 *)&d_vec_weight[3] *= *(float3 *)&run_vec_weight[3];
}
// read offset
int offset = batch_pose_idx[batch_idx];
if (mode == BATCH_GOALSET || mode == BATCH_GOAL) {
offset = (offset)*num_goals;
}
for (int k = 0; k < num_goals; k++) {
l_goal_position = *(float3 *)&goal_position[(offset + k) * 3];
float4 gq4 = *(float4 *)&goal_quat[(offset + k) * 4];
l_goal_quat = make_float4(gq4.y, gq4.z, gq4.w, gq4.x);
transform_vec_quat(l_goal_quat, &d_vec_weight[0], &des_vec_weight[0]);
compute_distance(&distance_vec[0], pose_distance, position_distance,
rotation_distance, position, l_goal_position, quat,
l_goal_quat,
&des_vec_weight[0], //&l_vec_weight[0],
&d_vec_convergence[0], //&l_vec_convergence[0],
position_weight, rotation_weight);
if (pose_distance <= best_distance) {
best_idx = k;
best_distance = pose_distance;
best_position_distance = position_distance;
best_rotation_distance = rotation_distance;
if (write_grad) {
//inv_transform_vec_quat(l_goal_quat, &d_vec_weight[0], &best_des_vec_weight[0]);
#pragma unroll 6
for (int i = 0; i < 6; i++) {
best_distance_vec[i] = distance_vec[i];
best_des_vec_weight[i] = des_vec_weight[i];
}
}
}
}
// write out:
// write out pose distance:
out_distance[batch_idx * horizon + h_idx] = best_distance;
if (write_distance) {
out_position_distance[batch_idx * horizon + h_idx] = best_position_distance;
out_rotation_distance[batch_idx * horizon + h_idx] = best_rotation_distance;
}
out_gidx[batch_idx * horizon + h_idx] = best_idx;
if (write_grad) {
if (write_distance) {
position_weight = 1;
rotation_weight = 1;
}
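    // Gradient write-out: d/dp of position_weight * ||W_p (p - p_goal)|| is
    // W_p^T W_p (p - p_goal) / ||W_p (p - p_goal)||, which is what the two
    // multiplications by the weights and the division by the best distance
    // implement below; the rotation gradient is formed analogously from the
    // error-quaternion components.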
if (best_position_distance > 0) {
best_position_distance = (position_weight / best_position_distance);
out_p_vec[batch_idx * horizon * 3 + h_idx * 3] =
best_des_vec_weight[3] * best_distance_vec[0] * best_position_distance;
out_p_vec[batch_idx * horizon * 3 + h_idx * 3 + 1] =
best_des_vec_weight[4] * best_distance_vec[1] * best_position_distance;
out_p_vec[batch_idx * horizon * 3 + h_idx * 3 + 2] =
best_des_vec_weight[5] * best_distance_vec[2] * best_position_distance;
} else {
out_p_vec[batch_idx * horizon * 3 + h_idx * 3] = 0.0;
out_p_vec[batch_idx * horizon * 3 + h_idx * 3 + 1] = 0.0;
out_p_vec[batch_idx * horizon * 3 + h_idx * 3 + 2] = 0.0;
}
if (best_rotation_distance > 0) {
best_rotation_distance = rotation_weight / best_rotation_distance;
out_q_vec[batch_idx * horizon * 4 + h_idx * 4 + 1] =
best_des_vec_weight[0] * best_distance_vec[3] * best_rotation_distance;
out_q_vec[batch_idx * horizon * 4 + h_idx * 4 + 2] =
best_des_vec_weight[1] * best_distance_vec[4] * best_rotation_distance;
out_q_vec[batch_idx * horizon * 4 + h_idx * 4 + 3] =
best_des_vec_weight[2] * best_distance_vec[5] * best_rotation_distance;
} else {
out_q_vec[batch_idx * horizon * 4 + h_idx * 4 + 1] = 0.0;
out_q_vec[batch_idx * horizon * 4 + h_idx * 4 + 2] = 0.0;
out_q_vec[batch_idx * horizon * 4 + h_idx * 4 + 3] = 0.0;
}
}
}
template <typename scalar_t, bool write_distance>
__global__ void goalset_pose_distance_metric_kernel(
scalar_t *out_distance, scalar_t *out_position_distance,
scalar_t *out_rotation_distance, scalar_t *out_p_vec, scalar_t *out_q_vec,
int32_t *out_gidx, const scalar_t *current_position,
const scalar_t *goal_position, const scalar_t *current_quat,
const scalar_t *goal_quat, const scalar_t *vec_weight,
const scalar_t *weight, const scalar_t *vec_convergence,
const scalar_t *run_weight, const scalar_t *run_vec_weight,
const int32_t *batch_pose_idx, const int mode, const int num_goals,
const int batch_size, const int horizon, const bool write_grad = false) {
const int t_idx = (blockDim.x * blockIdx.x + threadIdx.x);
const int batch_idx = t_idx / horizon;
const int h_idx = t_idx - (batch_idx * horizon);
if (batch_idx >= batch_size || h_idx >= horizon) {
return;
}
// read current pose:
float3 position =
*(float3 *)&current_position[batch_idx * horizon * 3 + h_idx * 3];
float4 quat_4 = *(float4 *)&current_quat[batch_idx * horizon * 4 + h_idx * 4];
float4 quat = make_float4(quat_4.y, quat_4.z, quat_4.w, quat_4.x);
// read weights:
float position_weight = weight[1];
float p_w_alpha = weight[3];
float r_w_alpha = weight[2];
float rotation_weight = weight[0];
if (!write_distance) {
position_weight *= run_weight[h_idx];
rotation_weight *= run_weight[h_idx];
p_w_alpha *= run_weight[h_idx];
r_w_alpha *= run_weight[h_idx];
if (position_weight == 0.0 && rotation_weight == 0.0) {
return;
}
}
float d_vec_convergence[2];
*(float2 *)&d_vec_convergence[0] = *(float2 *)&vec_convergence[0];
float d_vec_weight[6];
*(float3 *)&d_vec_weight[0] = *(float3 *)&vec_weight[0];
*(float3 *)&d_vec_weight[3] = *(float3 *)&vec_weight[3];
if (h_idx < horizon - 1) {
*(float3 *)&d_vec_weight[0] *= *(float3 *)&run_vec_weight[0];
*(float3 *)&d_vec_weight[3] *= *(float3 *)&run_vec_weight[3];
}
float des_vec_weight[6] = {0.0};
float3 l_goal_position;
float4 l_goal_quat;
float distance_vec[6]; // = {0.0};
float pose_distance = 0.0;
float position_distance = 0.0;
float rotation_distance = 0.0;
float best_distance = INFINITY;
float best_position_distance = 0.0;
float best_rotation_distance = 0.0;
float best_distance_vec[6] = {0.0};
float best_des_vec_weight[6] = {0.0};
  int best_idx = -1;
int offset = batch_pose_idx[batch_idx];
if (mode == BATCH_GOALSET || mode == BATCH_GOAL) {
offset = (offset)*num_goals;
}
for (int k = 0; k < num_goals; k++) {
l_goal_position = *(float3 *)&goal_position[(offset + k) * 3];
float4 gq4 =
*(float4 *)&goal_quat[(offset + k) * 4]; // reading qw, qx, qy, qz
l_goal_quat = make_float4(gq4.y, gq4.z, gq4.w, gq4.x);
transform_vec_quat(l_goal_quat, &d_vec_weight[0], &des_vec_weight[0]);
compute_metric_distance(
&distance_vec[0], pose_distance, position_distance, rotation_distance,
position, l_goal_position, quat, l_goal_quat,
&des_vec_weight[0], //&l_vec_weight[0],
&d_vec_convergence[0], //&l_vec_convergence[0],
position_weight, p_w_alpha, rotation_weight, r_w_alpha);
if (pose_distance <= best_distance) {
best_idx = k;
best_distance = pose_distance;
best_position_distance = position_distance;
best_rotation_distance = rotation_distance;
if (write_grad) {
// inv_transform_vec_quat(l_goal_quat, &d_vec_weight[0], &best_des_vec_weight[0]);
#pragma unroll 6
for (int i = 0; i < 6; i++) {
best_distance_vec[i] = distance_vec[i];
best_des_vec_weight[i] = des_vec_weight[i];
}
}
}
}
// add scaling metric:
// best_distance = log2f(acoshf(best_distance));
// write out:
// write out pose distance:
out_distance[batch_idx * horizon + h_idx] = best_distance;
if (write_distance) {
out_position_distance[batch_idx * horizon + h_idx] = best_position_distance;
out_rotation_distance[batch_idx * horizon + h_idx] = best_rotation_distance;
}
out_gidx[batch_idx * horizon + h_idx] = best_idx;
if (write_grad) {
// write gradient
// compute scalar term:
// -w * (d_vec)/ (length * length(negative +1) * acos(length)
// -w * (d_vec) * sinh(length) / (length * cosh(length))
// compute extra term:
if (write_distance) {
position_weight = 1.0;
rotation_weight = 1.0;
}
if (best_position_distance > 0) {
best_position_distance =
(p_w_alpha * position_weight *
sinhf(p_w_alpha * best_position_distance)) /
(best_position_distance * coshf(p_w_alpha * best_position_distance));
// best_position_distance = (position_weight/best_position_distance);
      // compute scalar gradient
out_p_vec[batch_idx * horizon * 3 + h_idx * 3] =
best_des_vec_weight[3] * best_distance_vec[0] * best_position_distance;
out_p_vec[batch_idx * horizon * 3 + h_idx * 3 + 1] =
best_des_vec_weight[4] * best_distance_vec[1] * best_position_distance;
out_p_vec[batch_idx * horizon * 3 + h_idx * 3 + 2] =
best_des_vec_weight[5] * best_distance_vec[2] * best_position_distance;
} else {
out_p_vec[batch_idx * horizon * 3 + h_idx * 3] = 0.0;
out_p_vec[batch_idx * horizon * 3 + h_idx * 3 + 1] = 0.0;
out_p_vec[batch_idx * horizon * 3 + h_idx * 3 + 2] = 0.0;
}
if (best_rotation_distance > 0) {
best_rotation_distance =
(r_w_alpha * rotation_weight *
sinhf(r_w_alpha * best_rotation_distance)) /
(best_rotation_distance * coshf(r_w_alpha * best_rotation_distance));
out_q_vec[batch_idx * horizon * 4 + h_idx * 4 + 1] =
best_des_vec_weight[0] * best_distance_vec[3] * best_rotation_distance;
out_q_vec[batch_idx * horizon * 4 + h_idx * 4 + 2] =
best_des_vec_weight[1] * best_distance_vec[4] * best_rotation_distance;
out_q_vec[batch_idx * horizon * 4 + h_idx * 4 + 3] =
best_des_vec_weight[2] * best_distance_vec[5] * best_rotation_distance;
} else {
out_q_vec[batch_idx * horizon * 4 + h_idx * 4 + 1] = 0.0;
out_q_vec[batch_idx * horizon * 4 + h_idx * 4 + 2] = 0.0;
out_q_vec[batch_idx * horizon * 4 + h_idx * 4 + 3] = 0.0;
}
}
}
} // namespace Pose
} // namespace Curobo
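// Host entry point for the pose cost. Picks one of four kernel
// instantiations (plain vs. log2(cosh) metric, with or without writing the
// per-term distances) and launches one thread per (batch, horizon) element,
// capped at 128 threads per block, on the current CUDA stream.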
std::vector<torch::Tensor>
pose_distance(torch::Tensor out_distance, torch::Tensor out_position_distance,
torch::Tensor out_rotation_distance,
torch::Tensor distance_p_vector, // batch size, 3
torch::Tensor distance_q_vector, // batch size, 4
torch::Tensor out_gidx,
const torch::Tensor current_position, // batch_size, 3
const torch::Tensor goal_position, // n_boxes, 3
const torch::Tensor current_quat, const torch::Tensor goal_quat,
const torch::Tensor vec_weight, // n_boxes, 4, 4
const torch::Tensor weight, const torch::Tensor vec_convergence,
const torch::Tensor run_weight,
const torch::Tensor run_vec_weight,
const torch::Tensor batch_pose_idx, // batch_size, 1
const int batch_size, const int horizon, const int mode,
const int num_goals = 1, const bool compute_grad = false,
const bool write_distance = true, const bool use_metric = false) {
using namespace Curobo::Pose;
// we compute the warp threads based on number of boxes:
assert(batch_pose_idx.size(0) == batch_size);
// TODO: verify this math
// const int batch_size = out_distance.size(0);
assert(run_weight.size(-1) == horizon);
const int bh = batch_size * horizon;
int threadsPerBlock = bh;
if (bh > 128) {
threadsPerBlock = 128;
}
// we fit warp thread spheres in a threadsPerBlock
int blocksPerGrid = (bh + threadsPerBlock - 1) / threadsPerBlock;
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
if (use_metric) {
if (write_distance)
{
AT_DISPATCH_FLOATING_TYPES(
current_position.scalar_type(), "batch_pose_distance", ([&] {
goalset_pose_distance_metric_kernel
<scalar_t, true><<<blocksPerGrid, threadsPerBlock, 0, stream>>>(
out_distance.data_ptr<scalar_t>(),
out_position_distance.data_ptr<scalar_t>(),
out_rotation_distance.data_ptr<scalar_t>(),
distance_p_vector.data_ptr<scalar_t>(),
distance_q_vector.data_ptr<scalar_t>(),
out_gidx.data_ptr<int32_t>(),
current_position.data_ptr<scalar_t>(),
goal_position.data_ptr<scalar_t>(),
current_quat.data_ptr<scalar_t>(),
goal_quat.data_ptr<scalar_t>(),
vec_weight.data_ptr<scalar_t>(), weight.data_ptr<scalar_t>(),
vec_convergence.data_ptr<scalar_t>(),
run_weight.data_ptr<scalar_t>(),
run_vec_weight.data_ptr<scalar_t>(),
batch_pose_idx.data_ptr<int32_t>(), mode, num_goals,
batch_size, horizon, compute_grad);
}));
}
else
{
AT_DISPATCH_FLOATING_TYPES(
current_position.scalar_type(), "batch_pose_distance", ([&] {
goalset_pose_distance_metric_kernel
// goalset_pose_distance_kernel
<scalar_t, false><<<blocksPerGrid, threadsPerBlock, 0, stream>>>(
out_distance.data_ptr<scalar_t>(),
out_position_distance.data_ptr<scalar_t>(),
out_rotation_distance.data_ptr<scalar_t>(),
distance_p_vector.data_ptr<scalar_t>(),
distance_q_vector.data_ptr<scalar_t>(),
out_gidx.data_ptr<int32_t>(),
current_position.data_ptr<scalar_t>(),
goal_position.data_ptr<scalar_t>(),
current_quat.data_ptr<scalar_t>(),
goal_quat.data_ptr<scalar_t>(),
vec_weight.data_ptr<scalar_t>(), weight.data_ptr<scalar_t>(),
vec_convergence.data_ptr<scalar_t>(),
run_weight.data_ptr<scalar_t>(),
run_vec_weight.data_ptr<scalar_t>(),
batch_pose_idx.data_ptr<int32_t>(), mode, num_goals,
batch_size, horizon, compute_grad);
}));
}
} else {
if(write_distance)
{
AT_DISPATCH_FLOATING_TYPES(
current_position.scalar_type(), "batch_pose_distance", ([&] {
// goalset_pose_distance_metric_kernel
goalset_pose_distance_kernel<scalar_t, true>
<<<blocksPerGrid, threadsPerBlock, 0, stream>>>(
out_distance.data_ptr<scalar_t>(),
out_position_distance.data_ptr<scalar_t>(),
out_rotation_distance.data_ptr<scalar_t>(),
distance_p_vector.data_ptr<scalar_t>(),
distance_q_vector.data_ptr<scalar_t>(),
out_gidx.data_ptr<int32_t>(),
current_position.data_ptr<scalar_t>(),
goal_position.data_ptr<scalar_t>(),
current_quat.data_ptr<scalar_t>(),
goal_quat.data_ptr<scalar_t>(),
vec_weight.data_ptr<scalar_t>(), weight.data_ptr<scalar_t>(),
vec_convergence.data_ptr<scalar_t>(),
run_weight.data_ptr<scalar_t>(),
run_vec_weight.data_ptr<scalar_t>(),
batch_pose_idx.data_ptr<int32_t>(), mode, num_goals,
batch_size, horizon, compute_grad);
}));
}
else
{
AT_DISPATCH_FLOATING_TYPES(
current_position.scalar_type(), "batch_pose_distance", ([&] {
// goalset_pose_distance_metric_kernel
goalset_pose_distance_kernel<scalar_t, false>
<<<blocksPerGrid, threadsPerBlock, 0, stream>>>(
out_distance.data_ptr<scalar_t>(),
out_position_distance.data_ptr<scalar_t>(),
out_rotation_distance.data_ptr<scalar_t>(),
distance_p_vector.data_ptr<scalar_t>(),
distance_q_vector.data_ptr<scalar_t>(),
out_gidx.data_ptr<int32_t>(),
current_position.data_ptr<scalar_t>(),
goal_position.data_ptr<scalar_t>(),
current_quat.data_ptr<scalar_t>(),
goal_quat.data_ptr<scalar_t>(),
vec_weight.data_ptr<scalar_t>(), weight.data_ptr<scalar_t>(),
vec_convergence.data_ptr<scalar_t>(),
run_weight.data_ptr<scalar_t>(),
run_vec_weight.data_ptr<scalar_t>(),
batch_pose_idx.data_ptr<int32_t>(), mode, num_goals,
batch_size, horizon, compute_grad);
}));
}
}
C10_CUDA_KERNEL_LAUNCH_CHECK();
return {out_distance, out_position_distance, out_rotation_distance,
distance_p_vector, distance_q_vector, out_gidx};
}
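// Note on layouts as read by the kernels above: current_position is
// [batch, horizon, 3], current_quat is [batch, horizon, 4] stored as
// (qw, qx, qy, qz) and reordered internally, goal_position/goal_quat are
// indexed through batch_pose_idx (scaled by num_goals in the batch modes),
// and weight packs [rotation_weight, position_weight, r_alpha, p_alpha].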
std::vector<torch::Tensor>
backward_pose_distance(torch::Tensor out_grad_p, torch::Tensor out_grad_q,
const torch::Tensor grad_distance, // batch_size, 3
const torch::Tensor grad_p_distance, // n_boxes, 3
const torch::Tensor grad_q_distance,
const torch::Tensor pose_weight,
const torch::Tensor grad_p_vec, // n_boxes, 4, 4
const torch::Tensor grad_q_vec, const int batch_size,
const bool use_distance = false) {
// we compute the warp threads based on number of boxes:
// TODO: verify this math
// const int batch_size = grad_distance.size(0);
using namespace Curobo::Pose;
int threadsPerBlock = batch_size;
if (batch_size > 128) {
threadsPerBlock = 128;
}
// we fit warp thread spheres in a threadsPerBlock
int blocksPerGrid = (batch_size + threadsPerBlock - 1) / threadsPerBlock;
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
if (use_distance) {
AT_DISPATCH_FLOATING_TYPES(
grad_distance.scalar_type(), "backward_pose_distance", ([&] {
backward_pose_distance_kernel<scalar_t>
<<<blocksPerGrid, threadsPerBlock, 0, stream>>>(
out_grad_p.data_ptr<scalar_t>(),
out_grad_q.data_ptr<scalar_t>(),
grad_distance.data_ptr<scalar_t>(),
grad_p_distance.data_ptr<scalar_t>(),
grad_q_distance.data_ptr<scalar_t>(),
pose_weight.data_ptr<scalar_t>(),
grad_p_vec.data_ptr<scalar_t>(),
grad_q_vec.data_ptr<scalar_t>(), batch_size);
}));
} else {
AT_DISPATCH_FLOATING_TYPES(
grad_distance.scalar_type(), "backward_pose", ([&] {
backward_pose_kernel<scalar_t>
<<<blocksPerGrid, threadsPerBlock, 0, stream>>>(
out_grad_p.data_ptr<scalar_t>(),
out_grad_q.data_ptr<scalar_t>(),
grad_distance.data_ptr<scalar_t>(),
pose_weight.data_ptr<scalar_t>(),
grad_p_vec.data_ptr<scalar_t>(),
grad_q_vec.data_ptr<scalar_t>(), batch_size);
}));
}
C10_CUDA_KERNEL_LAUNCH_CHECK();
return {out_grad_p, out_grad_q};
}

View File

@@ -0,0 +1,643 @@
/*
* Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
*
* NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
* property and proprietary rights in and to this material, related
* documentation and any modifications thereto. Any use, reproduction,
* disclosure or distribution of this material and related documentation
* without an express license agreement from NVIDIA CORPORATION or
* its affiliates is strictly prohibited.
*/
#include <c10/cuda/CUDAStream.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <torch/extension.h>
#include "helper_math.h"
#include <cmath>
#include <cuda_fp16.h>
#include <vector>
#define SINGLE_GOAL 0
#define BATCH_GOAL 1
#define GOALSET 2
#define BATCH_GOALSET 3
namespace Curobo {
namespace Geometry {
template <typename scalar_t> __inline__ __device__ scalar_t relu(scalar_t var) {
if (var < 0)
return 0;
else
return var;
}
template <typename scalar_t>
__global__ void self_collision_distance_kernel(
scalar_t *out_distance, // batch x 1
scalar_t *out_vec, // batch x nspheres x 4
const scalar_t *robot_spheres, // batch x nspheres x 4
const scalar_t *collision_threshold, const int batch_size,
const int nspheres, const scalar_t *weight, const bool write_grad = false) {
const int batch_idx = blockDim.x * blockIdx.x + threadIdx.x;
if (batch_idx >= batch_size) {
return;
}
float r_diff, distance;
float max_penetration = 0;
float3 sph1, sph2, dist_vec;
int sph1_idx = -1;
int sph2_idx = -1;
// iterate over spheres:
for (int i = 0; i < nspheres; i++) {
sph1 = *(float3 *)&robot_spheres[batch_idx * nspheres * 4 + i * 4];
for (int j = i + 1; j < nspheres; j++) {
r_diff = collision_threshold[i * nspheres + j];
if (isinf(r_diff)) {
continue;
}
sph2 = *(float3 *)&robot_spheres[batch_idx * nspheres * 4 + j * 4];
// compute sphere distance:
distance = relu(r_diff - length(sph1 - sph2));
if (distance > max_penetration) {
max_penetration = distance;
sph1_idx = i;
sph2_idx = j;
if (write_grad) {
dist_vec = (sph1 - sph2) / distance;
}
}
}
}
// write out pose distance:
if (max_penetration > 0) {
out_distance[batch_idx] = weight[0] * max_penetration;
if (write_grad) {
*(float3 *)&out_vec[batch_idx * nspheres * 4 + sph1_idx * 4] =
weight[0] * -1 * dist_vec;
*(float3 *)&out_vec[batch_idx * nspheres * 4 + sph2_idx * 4] =
weight[0] * dist_vec;
}
}
}
typedef struct {
float d;
int16_t i;
int16_t j;
} dist_t;
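// dist_t is exactly 8 bytes (float + two int16), so a whole entry can be
// exchanged as a single uint64_t via __shfl_down_sync in the warp
// reductions below.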
///////////////////////////////////////////////////////////
// n warps per row
// ndpt rows per warp
///////////////////////////////////////////////////////////
template <typename scalar_t>
__global__ void self_collision_distance_kernel4(
scalar_t *out_distance, // batch x 1
scalar_t *out_vec, // batch x nspheres x 3
const scalar_t *robot_spheres, // batch x nspheres x 3
const scalar_t *offsets, const uint8_t *coll_matrix, const int batch_size,
const int nspheres,
const int ndpt, // number of distances to be computed per thread
const int nwpr, // number of warps per row
const scalar_t *weight, uint8_t *sparse_index,
const bool write_grad = false) {
int batch_idx = blockIdx.x;
int warp_idx = threadIdx.x / 32;
int i = ndpt * (warp_idx / nwpr); // starting row number for this warp
int j = (warp_idx % nwpr) * 32; // starting column number for this warp
dist_t max_d = {.d = 0.0, .i = 0, .j = 0};
__shared__ dist_t max_darr[32];
// Optimization: About 1/3 of the warps will have no work.
// We compute distances only when i<j. If i is never <j in this warp, we have
// no work
if (i > j + 31) { // this warp has no work
max_darr[warp_idx] = max_d;
return;
}
// load robot_spheres to shared memory
extern __shared__ float4 __rs_shared[];
if (threadIdx.x < nspheres) {
float4 sph = *(float4 *)&robot_spheres[4 * (batch_idx * nspheres + threadIdx.x)];
//float4 sph = make_float4(robot_spheres[3 * (batch_idx * nspheres + threadIdx.x)],
//robot_spheres[3 * (batch_idx * nspheres + threadIdx.x) + 1],
//robot_spheres[3 * (batch_idx * nspheres + threadIdx.x) + 2],
//robot_spheres_radius[threadIdx.x]) ;
sph.w += offsets[threadIdx.x];
__rs_shared[threadIdx.x] = sph;
}
__syncthreads();
//////////////////////////////////////////////////////
// Compute distances and store the maximum per thread
// in registers (max_d).
  // Each thread computes up to ndpt distances.
// two warps per row
//////////////////////////////////////////////////////
// int nspheres_2 = nspheres * nspheres;
j = j + threadIdx.x % 32; // column number for this thread
float4 sph2;
if (j < nspheres) {
sph2 = __rs_shared[j]; // we need not load sph2 in every iteration.
#pragma unroll 16
for (int k = 0; k < ndpt; k++, i++) { // increment i also here
if (i < nspheres && j > i) {
// check if self collision is allowed here:
if (coll_matrix[i * nspheres + j] == 1) {
float4 sph1 = __rs_shared[i];
if (sph1.w <= 0.0 || sph2.w <= 0.0)
{
continue;
}
float r_diff = sph1.w + sph2.w;
float d = sqrt((sph1.x - sph2.x) * (sph1.x - sph2.x) +
(sph1.y - sph2.y) * (sph1.y - sph2.y) +
(sph1.z - sph2.z) * (sph1.z - sph2.z));
//float distance = max((float)0.0, (float)(r_diff - d));
float distance = r_diff - d;
// printf("%d, %d: (%d, %d) %f new\n", blockIdx.x, threadIdx.x, i, j,
// distance);
if (distance > max_d.d) {
max_d.d = distance;
max_d.i = i;
max_d.j = j;
}
}
}
}
}
// max_d has the result max for this thread
//////////////////////////////////////////////////////
  // Reduce blockDim.x values using blockDim.x threads
//////////////////////////////////////////////////////
// Perform warp-wide reductions
// Optimization: Skip the reduction if all the values are zero
unsigned zero_mask = __ballot_sync(
0xffffffff, max_d.d != 0.0); // we expect most values to be 0. So,
// zero_mask should be 0 in the common case.
if (zero_mask != 0) { // some of the values are non-zero
unsigned mask = __ballot_sync(0xffffffff, threadIdx.x < blockDim.x);
if (threadIdx.x < blockDim.x) {
// dist_t max_d = dist_sh[threadIdx.x];
for (int offset = 16; offset > 0; offset /= 2) {
uint64_t nd = __shfl_down_sync(mask, *(uint64_t *)&max_d, offset);
dist_t d_temp = *(dist_t *)&nd;
if (d_temp.d > max_d.d) {
max_d = d_temp;
}
}
}
}
// thread0 in the warp has the max_d for the warp
if (threadIdx.x % 32 == 0) {
max_darr[warp_idx] = max_d;
// printf("threadIdx.x=%d, blockIdx.x=%d, max_d=%f\n", threadIdx.x,
// blockIdx.x, max_d);
}
if (threadIdx.x < nspheres) {
if (write_grad && sparse_index[batch_idx * nspheres + threadIdx.x] != 0) {
*(float4 *)&out_vec[batch_idx * nspheres * 4 + threadIdx.x * 4] =
make_float4(0.0);
sparse_index[batch_idx * nspheres + threadIdx.x] = 0;
}
}
__syncthreads();
if (threadIdx.x == 0) {
dist_t max_d = max_darr[0];
    // TODO: This can be parallelized
for (int i = 1; i < (blockDim.x + 31) / 32; i++) {
if (max_darr[i].d > max_d.d) {
max_d = max_darr[i];
}
}
//////////////////////////////////////////////////////
// Write out the final results
//////////////////////////////////////////////////////
if (max_d.d != 0.0) {
out_distance[batch_idx] = weight[0] * max_d.d;
if (write_grad) {
float3 sph1 =
*(float3 *)&robot_spheres[4 * (batch_idx * nspheres + max_d.i)];
float3 sph2 =
*(float3 *)&robot_spheres[4 * (batch_idx * nspheres + max_d.j)];
float3 dist_vec = (sph1 - sph2) / max_d.d;
*(float3 *)&out_vec[batch_idx * nspheres * 4 + max_d.i * 4] =
weight[0] * -1 * dist_vec;
*(float3 *)&out_vec[batch_idx * nspheres * 4 + max_d.j * 4] =
weight[0] * dist_vec;
sparse_index[batch_idx * nspheres + max_d.i] = 1;
sparse_index[batch_idx * nspheres + max_d.j] = 1;
}
} else {
out_distance[batch_idx] = 0;
}
}
}
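// Variant used when the thread pair assignments are precomputed on the host:
// each thread reads ndpt (i, j) sphere-pair indices from locations_ (padded
// with -1), evaluates them for up to NBPB batch items per block, and the
// per-thread maxima are reduced with warp shuffles and then across warps by
// thread 0.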
template <typename scalar_t, int ndpt, int NBPB>
__global__ void self_collision_distance_kernel7(
scalar_t *out_distance, // batch x 1
scalar_t *out_vec, // batch x nspheres x 3
uint8_t *sparse_index,
const scalar_t *robot_spheres, // batch x nspheres x 3
const scalar_t *offsets, // nspheres
const scalar_t *weight, const int16_t *locations_, const int batch_size,
const int nspheres, const bool write_grad = false) {
uint32_t batch_idx = blockIdx.x * NBPB;
uint8_t nbpb = min(NBPB, batch_size - batch_idx);
if (nbpb == 0)
return;
// Layout in shared memory:
// sphere1[batch=0] sphere1[batch=1] sphere1[batch=2] sphere1[batch=4]
// sphere2[batch=0] sphere2[batch=1] sphere2[batch=2] sphere2[batch=4]
// ...
extern __shared__ float4 __rs_shared[];
if (threadIdx.x < nspheres) { // threadIdx.x is sphere index
#pragma unroll
for (int l = 0; l < nbpb; l++) {
float4 sph = *(float4 *)&robot_spheres[4 * ((batch_idx + l) * nspheres + threadIdx.x)];
//float4 sph = make_float4(
// robot_spheres[3 * ((batch_idx + l) * nspheres + threadIdx.x)],
// robot_spheres[3 * ((batch_idx + l) * nspheres + threadIdx.x) + 1],
// robot_spheres[3 * ((batch_idx + l) * nspheres + threadIdx.x) + 2],
// robot_spheres_radius[threadIdx.x]
//);
sph.w += offsets[threadIdx.x];
__rs_shared[NBPB * threadIdx.x + l] = sph;
}
}
__syncthreads();
//////////////////////////////////////////////////////
// Compute distances and store the maximum per thread
// in registers (max_d).
  // Each thread computes up to ndpt distances.
//////////////////////////////////////////////////////
dist_t max_d[NBPB] = {{.d = 0.0, .i = 0, .j = 0}};
int16_t indices[ndpt * 2];
for (uint8_t i =0; i< ndpt * 2; i++)
{
indices[i] = locations_[(threadIdx.x) * 2 * ndpt + i];
}
#pragma unroll
for (uint8_t k = 0; k < ndpt; k++) {
    // We are iterating through ndpt pairs of spheres across the batch;
    // a larger ndpt lets each thread cover more sphere pairs.
int i = indices[k * 2];
int j = indices[k * 2 + 1];
if (i == -1 || j == -1)
continue;
#pragma unroll
for (uint16_t l = 0; l < nbpb; l++) { // iterate through nbpb batches
float4 sph1 = __rs_shared[NBPB * i + l];
float4 sph2 = __rs_shared[NBPB * j + l];
if (sph1.w <= 0.0 || sph2.w <= 0.0)
{
continue;
}
float r_diff =
sph1.w + sph2.w; // sum of two radii, radii include respective offsets
float d = sqrt((sph1.x - sph2.x) * (sph1.x - sph2.x) +
(sph1.y - sph2.y) * (sph1.y - sph2.y) +
(sph1.z - sph2.z) * (sph1.z - sph2.z));
float f_diff = r_diff - d;
if (f_diff > max_d[l].d) {
max_d[l].d = f_diff;
max_d[l].i = i;
max_d[l].j = j;
}
}
}
// max_d has the result max for this thread
//////////////////////////////////////////////////////
  // Reduce blockDim.x values using blockDim.x threads
//////////////////////////////////////////////////////
  // We reduce the per-thread maxima within each warp (32 lanes) using
  // shuffles; per-warp results go into max_darr (up to 32 warps per batch
  // item) and are combined by thread 0 below.
__shared__ dist_t max_darr[32 * NBPB];
#pragma unroll
for (uint8_t l = 0; l < nbpb; l++) {
// Perform warp-wide reductions
// Optimization: Skip the reduction if all the values are zero
unsigned zero_mask = __ballot_sync(
0xffffffff,
max_d[l].d != 0.0); // we expect most values to be 0. So, zero_mask
// should be 0 in the common case.
if (zero_mask != 0) { // some of the values are non-zero
unsigned mask = __ballot_sync(0xffffffff, threadIdx.x < blockDim.x);
if (threadIdx.x < blockDim.x) {
// dist_t max_d = dist_sh[threadIdx.x];
for (int offset = 16; offset > 0; offset /= 2) {
// the offset here is linked to ndpt?
uint64_t nd = __shfl_down_sync(mask, *(uint64_t *)&max_d[l], offset);
dist_t d_temp = *(dist_t *)&nd;
if (d_temp.d > max_d[l].d) {
max_d[l] = d_temp;
}
}
}
}
// thread0 in the warp has the max_d for the warp
if (threadIdx.x % 32 == 0) {
max_darr[(threadIdx.x / 32) + 32 * l] = max_d[l];
// printf("threadIdx.x=%d, blockIdx.x=%d, max_d=%f\n", threadIdx.x,
// blockIdx.x, max_d);
}
}
if (threadIdx.x < nspheres) {
for (int l = 0; l < nbpb; l++) {
if (write_grad &&
sparse_index[(batch_idx + l) * nspheres + threadIdx.x] != 0) {
*(float4 *)&out_vec[(batch_idx + l) * nspheres * 4 + threadIdx.x * 4] =
make_float4(0.0);
sparse_index[(batch_idx + l) * nspheres + threadIdx.x] = 0;
}
}
}
__syncthreads();
if (threadIdx.x == 0) {
#pragma unroll
for (uint8_t l = 0; l < nbpb; l++) {
dist_t max_d = max_darr[l * 32];
      // TODO: This can be parallelized
for (int i = 1; i < (blockDim.x + 31) / 32; i++) {
if (max_darr[l * 32 + i].d > max_d.d) {
max_d = max_darr[l * 32 + i];
}
}
//////////////////////////////////////////////////////
// Write out the final results
//////////////////////////////////////////////////////
if (max_d.d != 0.0) {
out_distance[batch_idx + l] = weight[0] * max_d.d;
if (write_grad) {
float3 sph1 =
*(float3 *)&robot_spheres[4 *
((batch_idx + l) * nspheres + max_d.i)];
float3 sph2 =
*(float3 *)&robot_spheres[4 *
((batch_idx + l) * nspheres + max_d.j)];
float3 dist_vec = (sph1 - sph2) / max_d.d;
*(float3 *)&out_vec[(batch_idx + l) * nspheres * 4 + max_d.i * 4] =
weight[0] * -1 * dist_vec;
*(float3 *)&out_vec[(batch_idx + l) * nspheres * 4 + max_d.j * 4] =
weight[0] * dist_vec;
sparse_index[(batch_idx + l) * nspheres + max_d.i] = 1;
sparse_index[(batch_idx + l) * nspheres + max_d.j] = 1;
}
} else {
out_distance[batch_idx + l] = 0;
}
}
}
}
} // namespace Geometry
} // namespace Curobo
// This is the best version so far.
// It precomputes the start addresses per thread on the cpu.
// The rest is similar to the version above.
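// Dispatches between the precomputed-pair kernel (kernel7, templated on
// ndpt with supported values 1, 2, 4, 8, 32, 64, 128 and 512) and the
// warp-per-row fallback (kernel4, one block per batch item with 32
// distances per thread).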
std::vector<torch::Tensor> self_collision_distance(
torch::Tensor out_distance, torch::Tensor out_vec,
torch::Tensor sparse_index,
const torch::Tensor robot_spheres, // batch_size x n_spheres x 3
const torch::Tensor collision_offset, // n_spheres x n_spheres
const torch::Tensor weight, const torch::Tensor collision_matrix,
const torch::Tensor thread_locations, const int thread_locations_size,
const int batch_size, const int nspheres, const bool compute_grad = false,
const int ndpt = 8, // Does this need to match template?
const bool experimental_kernel = false) {
using namespace Curobo::Geometry;
// use efficient kernel based on number of threads:
const int nbpb = 1;
assert(nspheres < 1024);
int threadsPerBlock = ((thread_locations_size / 2) + ndpt - 1) /
ndpt; // location_size must be an even number. We store
// i,j for each sphere pair.
//assert(threadsPerBlock/nbpb <=32);
if (threadsPerBlock < 32*nbpb)
{
threadsPerBlock = 32 * nbpb;
}
if (threadsPerBlock < nspheres)
{
threadsPerBlock = nspheres;
}
int blocksPerGrid = (batch_size + nbpb - 1) / nbpb; // number of batches per block
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
// ((threadsPerBlock >= nspheres))&&
if (experimental_kernel) {
int smemSize = nbpb * nspheres * sizeof(float4);
if (ndpt == 1)
{
AT_DISPATCH_FLOATING_TYPES(
robot_spheres.scalar_type(), "self_collision_distance", ([&] {
self_collision_distance_kernel7<scalar_t, 1, nbpb>
<<<blocksPerGrid, threadsPerBlock, smemSize, stream>>>(
out_distance.data_ptr<scalar_t>(),
out_vec.data_ptr<scalar_t>(),
sparse_index.data_ptr<uint8_t>(),
robot_spheres.data_ptr<scalar_t>(),
collision_offset.data_ptr<scalar_t>(),
weight.data_ptr<scalar_t>(),
thread_locations.data_ptr<int16_t>(), batch_size, nspheres,
compute_grad);
}));
}
else if (ndpt == 2)
{
AT_DISPATCH_FLOATING_TYPES(
robot_spheres.scalar_type(), "self_collision_distance", ([&] {
self_collision_distance_kernel7<scalar_t, 2, nbpb>
<<<blocksPerGrid, threadsPerBlock, smemSize, stream>>>(
out_distance.data_ptr<scalar_t>(),
out_vec.data_ptr<scalar_t>(),
sparse_index.data_ptr<uint8_t>(),
robot_spheres.data_ptr<scalar_t>(),
collision_offset.data_ptr<scalar_t>(),
weight.data_ptr<scalar_t>(),
thread_locations.data_ptr<int16_t>(), batch_size, nspheres,
compute_grad);
}));
}
else if (ndpt == 4)
{
AT_DISPATCH_FLOATING_TYPES(
robot_spheres.scalar_type(), "self_collision_distance", ([&] {
self_collision_distance_kernel7<scalar_t, 4, nbpb>
<<<blocksPerGrid, threadsPerBlock, smemSize, stream>>>(
out_distance.data_ptr<scalar_t>(),
out_vec.data_ptr<scalar_t>(),
sparse_index.data_ptr<uint8_t>(),
robot_spheres.data_ptr<scalar_t>(),
collision_offset.data_ptr<scalar_t>(),
weight.data_ptr<scalar_t>(),
thread_locations.data_ptr<int16_t>(), batch_size, nspheres,
compute_grad);
}));
}
else if (ndpt == 8)
{
AT_DISPATCH_FLOATING_TYPES(
robot_spheres.scalar_type(), "self_collision_distance", ([&] {
self_collision_distance_kernel7<scalar_t, 8, nbpb>
<<<blocksPerGrid, threadsPerBlock, smemSize, stream>>>(
out_distance.data_ptr<scalar_t>(),
out_vec.data_ptr<scalar_t>(),
sparse_index.data_ptr<uint8_t>(),
robot_spheres.data_ptr<scalar_t>(),
collision_offset.data_ptr<scalar_t>(),
weight.data_ptr<scalar_t>(),
thread_locations.data_ptr<int16_t>(), batch_size, nspheres,
compute_grad);
}));
}
else if (ndpt == 32)
{
AT_DISPATCH_FLOATING_TYPES(
robot_spheres.scalar_type(), "self_collision_distance", ([&] {
self_collision_distance_kernel7<scalar_t, 32, nbpb>
<<<blocksPerGrid, threadsPerBlock, smemSize, stream>>>(
out_distance.data_ptr<scalar_t>(),
out_vec.data_ptr<scalar_t>(),
sparse_index.data_ptr<uint8_t>(),
robot_spheres.data_ptr<scalar_t>(),
collision_offset.data_ptr<scalar_t>(),
weight.data_ptr<scalar_t>(),
thread_locations.data_ptr<int16_t>(), batch_size, nspheres,
compute_grad);
}));
}
else if (ndpt == 64)
{
AT_DISPATCH_FLOATING_TYPES(
robot_spheres.scalar_type(), "self_collision_distance", ([&] {
self_collision_distance_kernel7<scalar_t, 64, nbpb>
<<<blocksPerGrid, threadsPerBlock, smemSize, stream>>>(
out_distance.data_ptr<scalar_t>(),
out_vec.data_ptr<scalar_t>(),
sparse_index.data_ptr<uint8_t>(),
robot_spheres.data_ptr<scalar_t>(),
collision_offset.data_ptr<scalar_t>(),
weight.data_ptr<scalar_t>(),
thread_locations.data_ptr<int16_t>(), batch_size, nspheres,
compute_grad);
}));
}
else if (ndpt == 128)
{
AT_DISPATCH_FLOATING_TYPES(
robot_spheres.scalar_type(), "self_collision_distance", ([&] {
self_collision_distance_kernel7<scalar_t, 128, nbpb>
<<<blocksPerGrid, threadsPerBlock, smemSize, stream>>>(
out_distance.data_ptr<scalar_t>(),
out_vec.data_ptr<scalar_t>(),
sparse_index.data_ptr<uint8_t>(),
robot_spheres.data_ptr<scalar_t>(),
collision_offset.data_ptr<scalar_t>(),
weight.data_ptr<scalar_t>(),
thread_locations.data_ptr<int16_t>(), batch_size, nspheres,
compute_grad);
}));
}
else if (ndpt == 512)
{
AT_DISPATCH_FLOATING_TYPES(
robot_spheres.scalar_type(), "self_collision_distance", ([&] {
self_collision_distance_kernel7<scalar_t, 512, nbpb>
<<<blocksPerGrid, threadsPerBlock, smemSize, stream>>>(
out_distance.data_ptr<scalar_t>(),
out_vec.data_ptr<scalar_t>(),
sparse_index.data_ptr<uint8_t>(),
robot_spheres.data_ptr<scalar_t>(),
collision_offset.data_ptr<scalar_t>(),
weight.data_ptr<scalar_t>(),
thread_locations.data_ptr<int16_t>(), batch_size, nspheres,
compute_grad);
}));
}
else
{
      assert(false); // only ndpt values of 1, 2, 4, 8, 32, 64, 128 and 512 are supported.
}
}
else {
int ndpt_n = 32; // number of distances to be computed per thread
int nwpr = (nspheres + 31) / 32;
int warpsPerBlock = nwpr * ((nspheres + ndpt_n - 1) / ndpt_n);
threadsPerBlock = warpsPerBlock * 32;
blocksPerGrid = batch_size;
// printf("Blocks: %d, threads/block:%d, ndpt=%d, nwpr=%d\n", blocksPerGrid,
// threadsPerBlock, ndpt, nwpr);
int smemSize = nspheres * sizeof(float4);
AT_DISPATCH_FLOATING_TYPES(
robot_spheres.scalar_type(), "self_collision_distance", ([&] {
self_collision_distance_kernel4<scalar_t>
<<<blocksPerGrid, threadsPerBlock, smemSize, stream>>>(
out_distance.data_ptr<scalar_t>(),
out_vec.data_ptr<scalar_t>(),
robot_spheres.data_ptr<scalar_t>(),
collision_offset.data_ptr<scalar_t>(),
collision_matrix.data_ptr<uint8_t>(), batch_size, nspheres,
ndpt_n, nwpr, weight.data_ptr<scalar_t>(),
sparse_index.data_ptr<uint8_t>(), compute_grad);
}));
}
C10_CUDA_KERNEL_LAUNCH_CHECK();
return {out_distance, out_vec, sparse_index};
}

File diff suppressed because it is too large

View File

@@ -0,0 +1,257 @@
/*
* Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
*
* NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
* property and proprietary rights in and to this material, related
* documentation and any modifications thereto. Any use, reproduction,
* disclosure or distribution of this material and related documentation
* without an express license agreement from NVIDIA CORPORATION or
* its affiliates is strictly prohibited.
*/
#include <torch/extension.h>
#include <c10/cuda/CUDAGuard.h>
#include <vector>
std::vector<torch::Tensor> step_position_clique(
torch::Tensor out_position, torch::Tensor out_velocity,
torch::Tensor out_acceleration, torch::Tensor out_jerk,
const torch::Tensor u_position, const torch::Tensor start_position,
const torch::Tensor start_velocity, const torch::Tensor start_acceleration,
const torch::Tensor traj_dt, const int batch_size, const int horizon,
const int dof);
std::vector<torch::Tensor> step_position_clique2(
torch::Tensor out_position, torch::Tensor out_velocity,
torch::Tensor out_acceleration, torch::Tensor out_jerk,
const torch::Tensor u_position, const torch::Tensor start_position,
const torch::Tensor start_velocity, const torch::Tensor start_acceleration,
const torch::Tensor traj_dt, const int batch_size, const int horizon,
const int dof, const int mode);
std::vector<torch::Tensor> step_position_clique2_idx(
torch::Tensor out_position, torch::Tensor out_velocity,
torch::Tensor out_acceleration, torch::Tensor out_jerk,
const torch::Tensor u_position, const torch::Tensor start_position,
const torch::Tensor start_velocity, const torch::Tensor start_acceleration,
const torch::Tensor start_idx, const torch::Tensor traj_dt,
const int batch_size, const int horizon, const int dof, const int mode);
std::vector<torch::Tensor> backward_step_position_clique(
torch::Tensor out_grad_position, const torch::Tensor grad_position,
const torch::Tensor grad_velocity, const torch::Tensor grad_acceleration,
const torch::Tensor grad_jerk, const torch::Tensor traj_dt,
const int batch_size, const int horizon, const int dof);
std::vector<torch::Tensor> backward_step_position_clique2(
torch::Tensor out_grad_position, const torch::Tensor grad_position,
const torch::Tensor grad_velocity, const torch::Tensor grad_acceleration,
const torch::Tensor grad_jerk, const torch::Tensor traj_dt,
const int batch_size, const int horizon, const int dof, const int mode);
std::vector<torch::Tensor>
step_acceleration(torch::Tensor out_position, torch::Tensor out_velocity,
torch::Tensor out_acceleration, torch::Tensor out_jerk,
const torch::Tensor u_acc, const torch::Tensor start_position,
const torch::Tensor start_velocity,
const torch::Tensor start_acceleration,
const torch::Tensor traj_dt, const int batch_size,
const int horizon, const int dof, const bool use_rk2 = true);
std::vector<torch::Tensor> step_acceleration_idx(
torch::Tensor out_position, torch::Tensor out_velocity,
torch::Tensor out_acceleration, torch::Tensor out_jerk,
const torch::Tensor u_acc, const torch::Tensor start_position,
const torch::Tensor start_velocity, const torch::Tensor start_acceleration,
const torch::Tensor start_idx, const torch::Tensor traj_dt,
const int batch_size, const int horizon, const int dof,
const bool use_rk2 = true);
// NOTE: AT_ASSERT has become AT_CHECK on master after 0.4.
#define CHECK_CUDA(x) AT_ASSERTM(x.is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) \
AT_ASSERTM(x.is_contiguous(), #x " must be contiguous")
#define CHECK_INPUT(x) \
CHECK_CUDA(x); \
CHECK_CONTIGUOUS(x)
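// Each wrapper below pins the CUDA device via OptionalCUDAGuard, validates
// that every tensor is a contiguous CUDA tensor, and then forwards to the
// corresponding kernel launcher declared above.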
std::vector<torch::Tensor> step_position_clique_wrapper(
torch::Tensor out_position, torch::Tensor out_velocity,
torch::Tensor out_acceleration, torch::Tensor out_jerk,
const torch::Tensor u_position, const torch::Tensor start_position,
const torch::Tensor start_velocity, const torch::Tensor start_acceleration,
const torch::Tensor traj_dt, const int batch_size, const int horizon,
const int dof) {
const at::cuda::OptionalCUDAGuard guard(u_position.device());
CHECK_INPUT(u_position);
CHECK_INPUT(out_position);
CHECK_INPUT(out_velocity);
CHECK_INPUT(out_acceleration);
CHECK_INPUT(out_jerk);
CHECK_INPUT(start_position);
CHECK_INPUT(start_velocity);
CHECK_INPUT(start_acceleration);
CHECK_INPUT(traj_dt);
return step_position_clique(out_position, out_velocity, out_acceleration,
out_jerk, u_position, start_position,
start_velocity, start_acceleration, traj_dt,
batch_size, horizon, dof);
}
std::vector<torch::Tensor> step_position_clique2_wrapper(
torch::Tensor out_position, torch::Tensor out_velocity,
torch::Tensor out_acceleration, torch::Tensor out_jerk,
const torch::Tensor u_position, const torch::Tensor start_position,
const torch::Tensor start_velocity, const torch::Tensor start_acceleration,
const torch::Tensor traj_dt, const int batch_size, const int horizon,
const int dof,
const int mode) {
const at::cuda::OptionalCUDAGuard guard(u_position.device());
CHECK_INPUT(u_position);
CHECK_INPUT(out_position);
CHECK_INPUT(out_velocity);
CHECK_INPUT(out_acceleration);
CHECK_INPUT(out_jerk);
CHECK_INPUT(start_position);
CHECK_INPUT(start_velocity);
CHECK_INPUT(start_acceleration);
CHECK_INPUT(traj_dt);
return step_position_clique2(out_position, out_velocity, out_acceleration,
out_jerk, u_position, start_position,
start_velocity, start_acceleration, traj_dt,
batch_size, horizon, dof, mode);
}
std::vector<torch::Tensor> step_position_clique2_idx_wrapper(
torch::Tensor out_position, torch::Tensor out_velocity,
torch::Tensor out_acceleration, torch::Tensor out_jerk,
const torch::Tensor u_position, const torch::Tensor start_position,
const torch::Tensor start_velocity, const torch::Tensor start_acceleration,
const torch::Tensor start_idx, const torch::Tensor traj_dt,
const int batch_size, const int horizon, const int dof,
const int mode) {
const at::cuda::OptionalCUDAGuard guard(u_position.device());
CHECK_INPUT(u_position);
CHECK_INPUT(out_position);
CHECK_INPUT(out_velocity);
CHECK_INPUT(out_acceleration);
CHECK_INPUT(out_jerk);
CHECK_INPUT(start_position);
CHECK_INPUT(start_velocity);
CHECK_INPUT(start_acceleration);
CHECK_INPUT(traj_dt);
CHECK_INPUT(start_idx);
return step_position_clique2_idx(
out_position, out_velocity, out_acceleration, out_jerk, u_position,
start_position, start_velocity, start_acceleration, start_idx, traj_dt,
batch_size, horizon, dof, mode);
}
std::vector<torch::Tensor> backward_step_position_clique_wrapper(
torch::Tensor out_grad_position, const torch::Tensor grad_position,
const torch::Tensor grad_velocity, const torch::Tensor grad_acceleration,
const torch::Tensor grad_jerk, const torch::Tensor traj_dt,
const int batch_size, const int horizon, const int dof) {
const at::cuda::OptionalCUDAGuard guard(grad_position.device());
CHECK_INPUT(out_grad_position);
CHECK_INPUT(grad_position);
CHECK_INPUT(grad_velocity);
CHECK_INPUT(grad_acceleration);
CHECK_INPUT(grad_jerk);
CHECK_INPUT(traj_dt);
return backward_step_position_clique(
out_grad_position, grad_position, grad_velocity, grad_acceleration,
grad_jerk, traj_dt, batch_size, horizon, dof);
}
std::vector<torch::Tensor> backward_step_position_clique2_wrapper(
torch::Tensor out_grad_position, const torch::Tensor grad_position,
const torch::Tensor grad_velocity, const torch::Tensor grad_acceleration,
const torch::Tensor grad_jerk, const torch::Tensor traj_dt,
const int batch_size, const int horizon, const int dof,
const int mode) {
const at::cuda::OptionalCUDAGuard guard(grad_position.device());
CHECK_INPUT(out_grad_position);
CHECK_INPUT(grad_position);
CHECK_INPUT(grad_velocity);
CHECK_INPUT(grad_acceleration);
CHECK_INPUT(grad_jerk);
CHECK_INPUT(traj_dt);
return backward_step_position_clique2(
out_grad_position, grad_position, grad_velocity, grad_acceleration,
grad_jerk, traj_dt, batch_size, horizon, dof, mode);
}
std::vector<torch::Tensor> step_acceleration_wrapper(
torch::Tensor out_position, torch::Tensor out_velocity,
torch::Tensor out_acceleration, torch::Tensor out_jerk,
const torch::Tensor u_acc, const torch::Tensor start_position,
const torch::Tensor start_velocity, const torch::Tensor start_acceleration,
const torch::Tensor traj_dt, const int batch_size, const int horizon,
const int dof, const bool use_rk2 = true) {
const at::cuda::OptionalCUDAGuard guard(u_acc.device());
CHECK_INPUT(u_acc);
CHECK_INPUT(out_position);
CHECK_INPUT(out_velocity);
CHECK_INPUT(out_acceleration);
CHECK_INPUT(out_jerk);
CHECK_INPUT(start_position);
CHECK_INPUT(start_velocity);
CHECK_INPUT(start_acceleration);
CHECK_INPUT(traj_dt);
return step_acceleration(out_position, out_velocity, out_acceleration,
out_jerk, u_acc, start_position, start_velocity,
start_acceleration, traj_dt, batch_size, horizon,
dof, use_rk2);
}
std::vector<torch::Tensor> step_acceleration_idx_wrapper(
torch::Tensor out_position, torch::Tensor out_velocity,
torch::Tensor out_acceleration, torch::Tensor out_jerk,
const torch::Tensor u_acc, const torch::Tensor start_position,
const torch::Tensor start_velocity, const torch::Tensor start_acceleration,
const torch::Tensor start_idx, const torch::Tensor traj_dt,
const int batch_size, const int horizon, const int dof,
const bool use_rk2 = true) {
const at::cuda::OptionalCUDAGuard guard(u_acc.device());
CHECK_INPUT(u_acc);
CHECK_INPUT(out_position);
CHECK_INPUT(out_velocity);
CHECK_INPUT(out_acceleration);
CHECK_INPUT(out_jerk);
CHECK_INPUT(start_position);
CHECK_INPUT(start_velocity);
CHECK_INPUT(start_acceleration);
CHECK_INPUT(start_idx);
CHECK_INPUT(traj_dt);
return step_acceleration_idx(out_position, out_velocity, out_acceleration,
out_jerk, u_acc, start_position, start_velocity,
start_acceleration, start_idx, traj_dt,
batch_size, horizon, dof, use_rk2);
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("step_position", &step_position_clique_wrapper,
"Tensor Step Position (curobolib)");
m.def("step_position2", &step_position_clique2_wrapper,
"Tensor Step Position (curobolib)");
m.def("step_idx_position2", &step_position_clique2_idx_wrapper,
"Tensor Step Position (curobolib)");
m.def("step_position_backward", &backward_step_position_clique_wrapper,
"Tensor Step Position (curobolib)");
m.def("step_position_backward2", &backward_step_position_clique2_wrapper,
"Tensor Step Position (curobolib)");
m.def("step_acceleration", &step_acceleration_wrapper,
"Tensor Step Acceleration (curobolib)");
m.def("step_acceleration_idx", &step_acceleration_idx_wrapper,
"Tensor Step Acceleration (curobolib)");
}

File diff suppressed because it is too large

View File

@@ -0,0 +1,126 @@
/*
* Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
*
* NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
* property and proprietary rights in and to this material, related
* documentation and any modifications thereto. Any use, reproduction,
* disclosure or distribution of this material and related documentation
* without an express license agreement from NVIDIA CORPORATION or
* its affiliates is strictly prohibited.
*/
#include <cuda.h>
#include <torch/extension.h>
#include <vector>
#include <c10/cuda/CUDAStream.h>
#include <cuda_fp16.h>
// #include "helper_cuda.h"
#include "helper_math.h"
#include <assert.h>
#include <cstdio>
#include <cstdlib>
#include <ctime>
#include <cub/cub.cuh>
#include <math.h>
namespace Curobo {
namespace Optimization {
// We launch with d_opt*cost_s1 threads.
// We assume that cost_s2 is always 1.
template <typename scalar_t>
__global__ void update_best_kernel(scalar_t *best_cost, // 200x1
scalar_t *best_q, // 200x7
int16_t *best_iteration, // 200 x 1
int16_t *current_iteration, // 1
const scalar_t *cost, // 200x1
const scalar_t *q, // 200x7
const int d_opt, // 7
const int cost_s1, // 200
const int cost_s2,
const int iteration,
const float delta_threshold,
const float relative_threshold) // 1
{
int tid = blockIdx.x * blockDim.x + threadIdx.x;
int size = cost_s1 * d_opt; // size of best_q
if (tid >= size) {
return;
}
const int cost_idx = tid / d_opt;
const float cost_new = cost[cost_idx];
const float best_cost_in = best_cost[cost_idx];
const bool change = (best_cost_in - cost_new) > delta_threshold && cost_new < best_cost_in * relative_threshold;
if (change) {
best_q[tid] = q[tid]; // update best_q
if (tid % d_opt == 0) {
      best_cost[cost_idx] = cost_new; // update best_cost
      // best_iteration[cost_idx] = curr_iter + iteration;
      // best_iteration keeps track of whether the cost reduced by at least
      // delta_threshold; `iteration` here is the last_best parameter.
}
}
if (tid % d_opt == 0)
{
if (change)
{
best_iteration[cost_idx] = 0;
}
else
{
best_iteration[cost_idx] -= 1;
}
}
  // if (tid == 0)
//{
// curr_iter += 1;
// current_iteration[0] = curr_iter;
//}
}
} // namespace Optimization
} // namespace Curobo
std::vector<torch::Tensor>
update_best_cuda(torch::Tensor best_cost, torch::Tensor best_q,
torch::Tensor best_iteration,
torch::Tensor current_iteration,
const torch::Tensor cost,
const torch::Tensor q, const int d_opt, const int cost_s1,
const int cost_s2, const int iteration,
const float delta_threshold,
const float relative_threshold = 0.999) {
using namespace Curobo::Optimization;
const int threadsPerBlock = 128;
const int cost_size = cost_s1 * d_opt;
assert(cost_s2 == 1); // assumption
const int blocksPerGrid = (cost_size + threadsPerBlock - 1) / threadsPerBlock;
// printf("cost_s1=%d, d_opt=%d, blocksPerGrid=%d\n", cost_s1, d_opt,
// blocksPerGrid);
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES(
cost.scalar_type(), "update_best_cu", ([&] {
update_best_kernel<scalar_t>
<<<blocksPerGrid, threadsPerBlock, 0, stream>>>(
best_cost.data_ptr<scalar_t>(), best_q.data_ptr<scalar_t>(),
best_iteration.data_ptr<int16_t>(),
current_iteration.data_ptr<int16_t>(),
cost.data_ptr<scalar_t>(),
q.data_ptr<scalar_t>(), d_opt, cost_s1, cost_s2, iteration,
delta_threshold, relative_threshold);
}));
C10_CUDA_KERNEL_LAUNCH_CHECK();
return {best_cost, best_q, best_iteration};
}
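For readability, the acceptance rule implemented by update_best_kernel can be restated with plain PyTorch ops. The sketch below is documentation only, not part of the extension; the shapes follow the kernel comments (costs are cost_s1 x 1, solutions are cost_s1 x d_opt) and everything else is an assumption.

# Hedged PyTorch mirror of the acceptance rule above (for documentation only).
# Assumed shapes: cost, best_cost -> (n, 1); q, best_q -> (n, d_opt); best_iteration -> (n, 1).
import torch

def update_best_reference(best_cost, best_q, best_iteration, cost, q,
                          delta_threshold=1e-5, relative_threshold=0.999):
    # Accept a candidate only if it improves on the stored best by more than
    # delta_threshold AND is below relative_threshold * best (both tests, as in the kernel).
    change = ((best_cost - cost) > delta_threshold) & (cost < best_cost * relative_threshold)
    best_q = torch.where(change, q, best_q)          # broadcasts over d_opt
    best_cost = torch.where(change, cost, best_cost)
    # best_iteration counts iterations since the last accepted improvement.
    best_iteration = torch.where(change, torch.zeros_like(best_iteration), best_iteration - 1)
    return best_cost, best_q, best_iteration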

View File

@@ -0,0 +1,405 @@
#
# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
# property and proprietary rights in and to this material, related
# documentation and any modifications thereto. Any use, reproduction,
# disclosure or distribution of this material and related documentation
# without an express license agreement from NVIDIA CORPORATION or
# its affiliates is strictly prohibited.
#
# Third Party
import torch
# CuRobo
from curobo.util.logger import log_warn
try:
# CuRobo
from curobo.curobolib import geom_cu
except ImportError:
log_warn("geom_cu binary not found, jit compiling...")
# Third Party
from torch.utils.cpp_extension import load
# CuRobo
    from curobo.curobolib.util_file import add_cpp_path
geom_cu = load(
name="geom_cu",
sources=add_cpp_path(
[
"geom_cuda.cpp",
"sphere_obb_kernel.cu",
"pose_distance_kernel.cu",
"self_collision_kernel.cu",
]
),
)
def get_self_collision_distance(
out_distance,
out_vec,
sparse_index,
robot_spheres,
collision_offset,
weight,
coll_matrix,
thread_locations,
thread_size,
b_size,
nspheres,
compute_grad,
checks_per_thread=32,
experimental_kernel=True,
):
r = geom_cu.self_collision_distance(
out_distance,
out_vec,
sparse_index,
robot_spheres,
collision_offset,
weight,
coll_matrix,
thread_locations,
thread_size,
b_size,
nspheres,
compute_grad,
checks_per_thread,
experimental_kernel,
)
out_distance = r[0]
out_vec = r[1]
return out_distance, out_vec
class SelfCollisionDistance(torch.autograd.Function):
@staticmethod
def forward(
ctx,
out_distance,
out_vec,
sparse_idx,
robot_spheres,
sphere_offset,
weight,
coll_matrix,
thread_locations,
max_thread,
checks_per_thread: int,
experimental_kernel: bool,
return_loss: bool = False,
):
# get batch size
b, h, n_spheres, _ = robot_spheres.shape
out_distance, out_vec = get_self_collision_distance(
out_distance,
out_vec,
sparse_idx,
robot_spheres, # .view(-1, 4),
sphere_offset,
weight,
coll_matrix,
thread_locations,
max_thread,
b * h,
n_spheres,
robot_spheres.requires_grad,
checks_per_thread,
experimental_kernel,
)
ctx.return_loss = return_loss
ctx.save_for_backward(out_vec)
return out_distance
@staticmethod
def backward(ctx, grad_out_distance):
sphere_grad = None
if ctx.needs_input_grad[3]:
(g_vec,) = ctx.saved_tensors
if ctx.return_loss:
g_vec = g_vec * grad_out_distance.unsqueeze(1)
sphere_grad = g_vec
return None, None, None, sphere_grad, None, None, None, None, None, None, None, None
class SelfCollisionDistanceLoss(SelfCollisionDistance):
@staticmethod
def backward(ctx, grad_out_distance):
sphere_grad = None
if ctx.needs_input_grad[3]:
(g_vec,) = ctx.saved_tensors
sphere_grad = g_vec * grad_out_distance.unsqueeze(1)
        return None, None, None, sphere_grad, None, None, None, None, None, None, None, None
def get_pose_distance(
out_distance,
out_position_distance,
out_rotation_distance,
out_p_vec,
out_q_vec,
out_idx,
current_position,
goal_position,
current_quat,
goal_quat,
vec_weight,
weight,
vec_convergence,
run_weight,
run_vec_weight,
batch_pose_idx,
batch_size,
horizon,
mode=1,
num_goals=1,
write_grad=False,
write_distance=False,
use_metric=False,
):
if batch_pose_idx.shape[0] != batch_size:
raise ValueError("Index buffer size is different from batch size")
r = geom_cu.pose_distance(
out_distance,
out_position_distance,
out_rotation_distance,
out_p_vec,
out_q_vec,
out_idx,
current_position,
goal_position.view(-1),
current_quat,
goal_quat.view(-1),
vec_weight,
weight,
vec_convergence,
run_weight,
run_vec_weight,
batch_pose_idx,
batch_size,
horizon,
mode,
num_goals,
write_grad,
write_distance,
use_metric,
)
out_distance = r[0]
out_position_distance = r[1]
out_rotation_distance = r[2]
out_p_vec = r[3]
out_q_vec = r[4]
out_idx = r[5]
return out_distance, out_position_distance, out_rotation_distance, out_p_vec, out_q_vec, out_idx
def get_pose_distance_backward(
out_grad_p,
out_grad_q,
grad_distance,
grad_p_distance,
grad_q_distance,
pose_weight,
grad_p_vec,
grad_q_vec,
batch_size,
use_distance=False,
):
r = geom_cu.pose_distance_backward(
out_grad_p,
out_grad_q,
grad_distance,
grad_p_distance,
grad_q_distance,
pose_weight,
grad_p_vec,
grad_q_vec,
batch_size,
use_distance,
)
return r[0], r[1]
class SdfSphereOBB(torch.autograd.Function):
@staticmethod
def forward(
ctx,
query_sphere,
out_buffer,
grad_out_buffer,
sparsity_idx,
weight,
activation_distance,
box_accel,
box_dims,
box_pose,
box_enable,
n_env_obb,
env_query_idx,
max_nobs,
batch_size,
horizon,
n_spheres,
transform_back,
compute_distance,
use_batch_env,
return_loss: bool = False,
):
r = geom_cu.closest_point(
query_sphere,
out_buffer,
grad_out_buffer,
sparsity_idx,
weight,
activation_distance,
box_accel,
box_dims,
box_pose,
box_enable,
n_env_obb,
env_query_idx,
max_nobs,
batch_size,
horizon,
n_spheres,
transform_back,
compute_distance,
use_batch_env,
)
# r[1][r[1]!=r[1]] = 0.0
ctx.return_loss = return_loss
ctx.save_for_backward(r[1])
return r[0]
@staticmethod
def backward(ctx, grad_output):
grad_pt = None
if ctx.needs_input_grad[0]:
(r,) = ctx.saved_tensors
if ctx.return_loss:
r = r * grad_output.unsqueeze(-1)
grad_pt = r
return (
grad_pt,
None,
None,
None,
None,
None,
None,
None,
None,
None,
None,
None,
None,
None,
None,
None,
None,
None,
None,
None,
)
class SdfSweptSphereOBB(torch.autograd.Function):
@staticmethod
def forward(
ctx,
query_sphere,
out_buffer,
grad_out_buffer,
sparsity_idx,
weight,
activation_distance,
speed_dt,
box_accel,
box_dims,
box_pose,
box_enable,
n_env_obb,
env_query_idx,
max_nobs,
batch_size,
horizon,
n_spheres,
sweep_steps,
enable_speed_metric,
transform_back,
compute_distance,
use_batch_env,
return_loss: bool = False,
):
r = geom_cu.swept_closest_point(
query_sphere,
out_buffer,
grad_out_buffer,
sparsity_idx,
weight,
activation_distance,
speed_dt,
box_accel,
box_dims,
box_pose,
box_enable,
n_env_obb,
env_query_idx,
max_nobs,
batch_size,
horizon,
n_spheres,
sweep_steps,
enable_speed_metric,
transform_back,
compute_distance,
use_batch_env,
)
ctx.return_loss = return_loss
ctx.save_for_backward(
r[1],
)
return r[0]
@staticmethod
def backward(ctx, grad_output):
grad_pt = None
if ctx.needs_input_grad[0]:
(r,) = ctx.saved_tensors
if ctx.return_loss:
r = r * grad_output.unsqueeze(-1)
grad_pt = r
return (
grad_pt,
None,
None,
None,
None,
None,
None,
None,
None,
None,
None,
None,
None,
None,
None,
None,
None,
None,
None,
None,
None,
None,
None,
)
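The autograd Functions in this file follow the same buffer-passing pattern used throughout curobolib: outputs are pre-allocated by the caller, filled in place by the CUDA kernel, saved as needed for backward, and backward returns None for every non-differentiable argument. Below is a minimal, self-contained sketch of that pattern using a toy in-place op (not a cuRobo kernel) purely for illustration.

# Toy illustration of the buffer-passing autograd pattern; `ScaleInto` stands in
# for a CUDA kernel that writes into a caller-owned output buffer.
import torch
from torch.autograd import Function

class ScaleInto(Function):
    @staticmethod
    def forward(ctx, out_buffer, x, scale):
        out_buffer.copy_(x * scale)   # kernel writes into the pre-allocated buffer
        ctx.mark_dirty(out_buffer)    # declare the in-place write to autograd
        ctx.scale = scale
        return out_buffer
    @staticmethod
    def backward(ctx, grad_out):
        # One gradient slot per forward input: None for the buffer and the Python scalar.
        return None, grad_out * ctx.scale, None

x = torch.randn(4, requires_grad=True)
out = torch.empty(4)
y = ScaleInto.apply(out, x, 2.0)
y.sum().backward()   # x.grad is filled with 2.0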

View File

@@ -0,0 +1,498 @@
#
# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
# property and proprietary rights in and to this material, related
# documentation and any modifications thereto. Any use, reproduction,
# disclosure or distribution of this material and related documentation
# without an express license agreement from NVIDIA CORPORATION or
# its affiliates is strictly prohibited.
#
# Third Party
import torch
from torch.autograd import Function
# CuRobo
from curobo.util.logger import log_warn
try:
# CuRobo
from curobo.curobolib import kinematics_fused_cu
except ImportError:
log_warn("kinematics_fused_cu not found, JIT compiling...")
# Third Party
from torch.utils.cpp_extension import load
# CuRobo
from curobo.curobolib.util_file import add_cpp_path
kinematics_fused_cu = load(
name="kinematics_fused_cu",
sources=add_cpp_path(
[
"kinematics_fused_cuda.cpp",
"kinematics_fused_kernel.cu",
]
),
)
def rotation_matrix_to_quaternion(in_mat, out_quat):
r = kinematics_fused_cu.matrix_to_quaternion(out_quat, in_mat.reshape(-1, 9))
return r[0]
class KinematicsFusedFunction(Function):
@staticmethod
def _call_cuda(
link_pos_out,
link_quat_out,
robot_sphere_out,
global_cumul_mat,
angle,
fixed_transform,
robot_spheres,
link_map,
joint_map,
joint_map_type,
store_link_map,
link_sphere_map,
):
b_shape = link_pos_out.shape
b_size = b_shape[0]
n_spheres = robot_sphere_out.shape[1]
r = kinematics_fused_cu.forward(
link_pos_out,
link_quat_out,
robot_sphere_out,
global_cumul_mat,
angle,
fixed_transform,
robot_spheres,
link_map,
joint_map,
joint_map_type,
store_link_map,
link_sphere_map,
b_size,
n_spheres,
False,
)
out_link_pos = r[0]
out_link_quat = r[1]
out_spheres = r[2]
global_cumul_mat = r[3]
return out_link_pos, out_link_quat, out_spheres, global_cumul_mat
@staticmethod
def _call_backward_cuda(
grad_out,
link_pos_out,
link_quat_out,
robot_sphere_out,
global_cumul_mat,
angle,
fixed_transform,
robot_spheres,
link_map,
joint_map,
joint_map_type,
store_link_map,
link_sphere_map,
link_chain_map,
sparsity_opt=True,
):
b_shape = link_pos_out.shape
b_size = b_shape[0]
n_spheres = robot_sphere_out.shape[1]
if grad_out.is_contiguous():
grad_out = grad_out.view(-1)
else:
grad_out = grad_out.reshape(-1)
r = kinematics_fused_cu.backward(
grad_out,
link_pos_out,
link_quat_out,
robot_sphere_out,
global_cumul_mat,
angle,
fixed_transform,
robot_spheres,
link_map,
joint_map,
joint_map_type,
store_link_map,
link_sphere_map,
link_chain_map,
b_size,
n_spheres,
sparsity_opt,
False,
)
out_q = r[0].view(b_size, -1)
return out_q
@staticmethod
def forward(
ctx,
# link_mats: torch.Tensor,
link_pos: torch.Tensor,
link_quat: torch.Tensor,
        b_robot_spheres: torch.Tensor,
        global_cumul_mat: torch.Tensor,
        joint_seq: torch.Tensor,
        fixed_transform: torch.Tensor,
        robot_spheres: torch.Tensor,
        link_map: torch.Tensor,
joint_map: torch.Tensor,
joint_map_type: torch.Tensor,
store_link_map: torch.Tensor,
link_sphere_map: torch.Tensor,
link_chain_map: torch.Tensor,
grad_out: torch.Tensor,
):
link_pos, link_quat, b_robot_spheres, global_cumul_mat = KinematicsFusedFunction._call_cuda(
link_pos,
link_quat,
b_robot_spheres,
global_cumul_mat,
joint_seq,
fixed_transform,
robot_spheres,
link_map,
joint_map,
joint_map_type,
store_link_map,
link_sphere_map,
)
ctx.save_for_backward(
joint_seq,
fixed_transform,
robot_spheres,
link_map,
joint_map,
joint_map_type,
store_link_map,
link_sphere_map,
link_chain_map,
grad_out,
global_cumul_mat,
)
return link_pos, link_quat, b_robot_spheres
@staticmethod
def backward(ctx, grad_out_link_pos, grad_out_link_quat, grad_out_spheres):
grad_joint = None
if ctx.needs_input_grad[4]:
(
joint_seq,
fixed_transform,
robot_spheres,
link_map,
joint_map,
joint_map_type,
store_link_map,
link_sphere_map,
link_chain_map,
grad_out,
global_cumul_mat,
) = ctx.saved_tensors
grad_joint = KinematicsFusedFunction._call_backward_cuda(
grad_out,
grad_out_link_pos,
grad_out_link_quat,
grad_out_spheres,
global_cumul_mat,
joint_seq,
fixed_transform,
robot_spheres,
link_map,
joint_map,
joint_map_type,
store_link_map,
link_sphere_map,
link_chain_map,
True,
)
return (
None,
None,
None,
None,
grad_joint,
None,
None,
None,
None,
None,
None,
None,
None,
None,
)
class KinematicsFusedGlobalCumulFunction(Function):
@staticmethod
def _call_cuda(
link_pos_out,
link_quat_out,
robot_sphere_out,
global_cumul_mat,
angle,
fixed_transform,
robot_spheres,
link_map,
joint_map,
joint_map_type,
store_link_map,
link_sphere_map,
):
b_shape = link_pos_out.shape
b_size = b_shape[0]
n_spheres = robot_sphere_out.shape[1]
# print(n_spheres)
# print(angle)
r = kinematics_fused_cu.forward(
link_pos_out,
link_quat_out,
robot_sphere_out,
global_cumul_mat,
angle,
fixed_transform,
robot_spheres,
link_map,
joint_map,
joint_map_type,
store_link_map,
link_sphere_map,
b_size,
n_spheres,
True,
)
out_link_pos = r[0]
out_link_quat = r[1]
out_spheres = r[2]
global_cumul_mat = r[3]
return out_link_pos, out_link_quat, out_spheres, global_cumul_mat
@staticmethod
def _call_backward_cuda(
grad_out,
link_pos_out,
link_quat_out,
robot_sphere_out,
global_cumul_mat,
angle,
fixed_transform,
robot_spheres,
link_map,
joint_map,
joint_map_type,
store_link_map,
link_sphere_map,
link_chain_map,
sparsity_opt=True,
):
b_shape = link_pos_out.shape
b_size = b_shape[0]
# b_size = link_mat_out.shape[0]
n_spheres = robot_sphere_out.shape[1]
if grad_out.is_contiguous():
grad_out = grad_out.view(-1)
else:
grad_out = grad_out.reshape(-1)
# create backward tensors?
r = kinematics_fused_cu.backward(
grad_out,
link_pos_out,
link_quat_out,
robot_sphere_out,
global_cumul_mat,
angle,
fixed_transform,
robot_spheres,
link_map,
joint_map,
joint_map_type,
store_link_map,
link_sphere_map,
link_chain_map,
b_size,
n_spheres,
sparsity_opt,
True,
)
out_q = r[0] # .view(angle.shape)
out_q = out_q.view(b_size, -1)
return out_q
@staticmethod
def forward(
ctx,
# link_mats: torch.Tensor,
link_pos: torch.Tensor,
link_quat: torch.Tensor,
        b_robot_spheres: torch.Tensor,
        global_cumul_mat: torch.Tensor,
        joint_seq: torch.Tensor,
        fixed_transform: torch.Tensor,
        robot_spheres: torch.Tensor,
        link_map: torch.Tensor,
joint_map: torch.Tensor,
joint_map_type: torch.Tensor,
store_link_map: torch.Tensor,
link_sphere_map: torch.Tensor,
link_chain_map: torch.Tensor,
grad_out: torch.Tensor,
):
if joint_seq.is_contiguous():
joint_seq = joint_seq.view(-1)
else:
joint_seq = joint_seq.reshape(-1)
(
link_pos,
link_quat,
b_robot_spheres,
global_cumul_mat,
) = KinematicsFusedGlobalCumulFunction._call_cuda(
link_pos,
link_quat,
b_robot_spheres,
global_cumul_mat,
joint_seq,
fixed_transform,
robot_spheres,
link_map,
joint_map,
joint_map_type,
store_link_map,
link_sphere_map,
)
ctx.save_for_backward(
joint_seq,
fixed_transform,
robot_spheres,
link_map,
joint_map,
joint_map_type,
store_link_map,
link_sphere_map,
link_chain_map,
grad_out,
global_cumul_mat,
)
return link_pos, link_quat, b_robot_spheres
@staticmethod
def backward(ctx, grad_out_link_pos, grad_out_link_quat, grad_out_spheres):
grad_joint = None
if ctx.needs_input_grad[4]:
(
joint_seq,
fixed_transform,
robot_spheres,
link_map,
joint_map,
joint_map_type,
store_link_map,
link_sphere_map,
link_chain_map,
grad_out,
global_cumul_mat,
) = ctx.saved_tensors
grad_joint = KinematicsFusedGlobalCumulFunction._call_backward_cuda(
grad_out,
grad_out_link_pos,
grad_out_link_quat,
grad_out_spheres,
global_cumul_mat,
joint_seq,
fixed_transform,
robot_spheres,
link_map,
joint_map,
joint_map_type,
store_link_map,
link_sphere_map,
link_chain_map,
True,
)
return (
None,
None,
None,
None,
grad_joint,
None,
None,
None,
None,
None,
None,
None,
None,
None,
)
def get_cuda_kinematics(
link_pos_seq,
link_quat_seq,
batch_robot_spheres,
global_cumul_mat,
q_in,
fixed_transform,
link_spheres_tensor,
link_map, # tells which link is attached to which link i
joint_map, # tells which joint is attached to a link i
joint_map_type, # joint type
store_link_map,
link_sphere_idx_map, # sphere idx map
link_chain_map,
grad_out_q,
use_global_cumul: bool = True,
):
if use_global_cumul:
link_pos, link_quat, robot_spheres = KinematicsFusedGlobalCumulFunction.apply(
link_pos_seq,
link_quat_seq,
batch_robot_spheres,
global_cumul_mat,
q_in,
fixed_transform,
link_spheres_tensor,
link_map, # tells which link is attached to which link i
joint_map, # tells which joint is attached to a link i
joint_map_type, # joint type
store_link_map,
link_sphere_idx_map, # sphere idx map
link_chain_map,
grad_out_q,
)
else:
link_pos, link_quat, robot_spheres = KinematicsFusedFunction.apply(
link_pos_seq,
link_quat_seq,
batch_robot_spheres,
global_cumul_mat,
q_in,
fixed_transform,
link_spheres_tensor,
link_map, # tells which link is attached to which link i
joint_map, # tells which joint is attached to a link i
joint_map_type, # joint type
store_link_map,
link_sphere_idx_map, # sphere idx map
link_chain_map,
grad_out_q,
)
return link_pos, link_quat, robot_spheres

112
src/curobo/curobolib/ls.py Normal file
View File

@@ -0,0 +1,112 @@
#
# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
# property and proprietary rights in and to this material, related
# documentation and any modifications thereto. Any use, reproduction,
# disclosure or distribution of this material and related documentation
# without an express license agreement from NVIDIA CORPORATION or
# its affiliates is strictly prohibited.
#
# Third Party
import torch
# CuRobo
from curobo.util.logger import log_warn
try:
# CuRobo
from curobo.curobolib import line_search_cu
except ImportError:
log_warn("line_search_cu not found, JIT compiling...")
# Third Party
from torch.utils.cpp_extension import load
# CuRobo
from curobo.curobolib.util_file import add_cpp_path
line_search_cu = load(
name="line_search_cu",
sources=add_cpp_path(
[
"line_search_cuda.cpp",
"line_search_kernel.cu",
"update_best_kernel.cu",
]
),
)
def wolfe_line_search(
# m_idx,
best_x,
best_c,
best_grad,
g_x,
x_set,
sv,
c,
c_idx,
c_1: float,
c_2: float,
al,
sw: bool,
aw: bool,
):
batchsize = g_x.shape[0]
l1 = g_x.shape[1]
l2 = g_x.shape[2]
r = line_search_cu.line_search(
# m_idx,
best_x,
best_c,
best_grad,
g_x,
x_set,
sv,
c,
al,
c_idx,
c_1,
c_2,
sw,
aw,
l1,
l2,
batchsize,
)
# print("batchsize:" + str(batchsize))
return (r[0], r[1], r[2])
def update_best(
best_cost,
best_q,
best_iteration,
current_iteration,
cost,
q,
d_opt: int,
iteration: int,
delta_threshold: float = 1e-5,
relative_threshold: float = 0.999,
):
cost_s1 = cost.shape[0]
cost_s2 = cost.shape[1]
r = line_search_cu.update_best(
best_cost,
best_q,
best_iteration,
current_iteration,
cost,
q,
d_opt,
cost_s1,
cost_s2,
iteration,
delta_threshold,
relative_threshold,
)
# print("batchsize:" + str(batchsize))
return (r[0], r[1], r[2]) # output: best_cost, best_q, best_iteration
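A hedged usage sketch for update_best follows; the buffer shapes are assumptions inferred from the comments in update_best_kernel.cu (per-problem costs of shape (n, 1), solutions of shape (n, d_opt)), not an official example.

# Hedged sketch: allocate the tracking buffers and feed one optimizer iteration.
import torch

n_problems, d_opt = 200, 7
device = torch.device("cuda")

best_cost = torch.full((n_problems, 1), float("inf"), device=device)
best_q = torch.zeros((n_problems, d_opt), device=device)
best_iteration = torch.zeros((n_problems, 1), dtype=torch.int16, device=device)
current_iteration = torch.zeros((1,), dtype=torch.int16, device=device)

cost = torch.rand((n_problems, 1), device=device)   # stand-in for the optimizer's current costs
q = torch.rand((n_problems, d_opt), device=device)  # stand-in for the candidate solutions

best_cost, best_q, best_iteration = update_best(
    best_cost, best_q, best_iteration, current_iteration, cost, q, d_opt, iteration=0
)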

112
src/curobo/curobolib/opt.py Normal file
View File

@@ -0,0 +1,112 @@
#
# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
# property and proprietary rights in and to this material, related
# documentation and any modifications thereto. Any use, reproduction,
# disclosure or distribution of this material and related documentation
# without an express license agreement from NVIDIA CORPORATION or
# its affiliates is strictly prohibited.
#
# Third Party
import torch
from torch.autograd import Function
# CuRobo
from curobo.util.logger import log_warn
try:
# CuRobo
from curobo.curobolib import lbfgs_step_cu
except ImportError:
log_warn("lbfgs_step_cu not found, JIT compiling...")
# Third Party
from torch.utils.cpp_extension import load
# CuRobo
from curobo.curobolib.util_file import add_cpp_path
lbfgs_step_cu = load(
name="lbfgs_step_cu",
sources=add_cpp_path(
[
"lbfgs_step_cuda.cpp",
"lbfgs_step_kernel.cu",
]
),
)
class LBFGScu(Function):
@staticmethod
def _call_cuda(
step_vec,
rho_buffer,
y_buffer,
s_buffer,
q,
grad_q,
x_0,
grad_0,
epsilon=0.1,
stable_mode=False,
):
m, b, v_dim, _ = y_buffer.shape
R = lbfgs_step_cu.forward(
step_vec, # .view(-1),
rho_buffer, # .view(-1),
y_buffer, # .view(-1),
s_buffer, # .view(-1),
q,
grad_q, # .view(-1),
x_0,
grad_0,
epsilon,
b,
m,
v_dim,
stable_mode,
)
step_v = R[0].view(step_vec.shape)
return step_v
@staticmethod
def forward(
ctx,
step_vec,
rho_buffer,
y_buffer,
s_buffer,
q,
grad_q,
x_0,
grad_0,
epsilon=0.1,
stable_mode=False,
):
R = LBFGScu._call_cuda(
step_vec,
rho_buffer,
y_buffer,
s_buffer,
q,
grad_q,
x_0,
grad_0,
epsilon=epsilon,
stable_mode=stable_mode,
)
# ctx.save_for_backward(batch_spheres, robot_spheres, link_mats, link_sphere_map)
return R
@staticmethod
def backward(ctx, grad_output):
return (
None,
None,
None,
None,
None,
)

View File

@@ -0,0 +1,195 @@
#
# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
# property and proprietary rights in and to this material, related
# documentation and any modifications thereto. Any use, reproduction,
# disclosure or distribution of this material and related documentation
# without an express license agreement from NVIDIA CORPORATION or
# its affiliates is strictly prohibited.
#
# Third Party
import torch
# CuRobo
from curobo.util.logger import log_warn
try:
# CuRobo
from curobo.curobolib import tensor_step_cu
except ImportError:
# Third Party
from torch.utils.cpp_extension import load
# CuRobo
from curobo.curobolib.util_file import add_cpp_path
log_warn("tensor_step_cu not found, jit compiling...")
tensor_step_cu = load(
name="tensor_step_cu",
sources=add_cpp_path(["tensor_step_cuda.cpp", "tensor_step_kernel.cu"]),
)
def tensor_step_pos_clique_idx_fwd(
out_position,
out_velocity,
out_acceleration,
out_jerk,
u_position,
start_position,
start_velocity,
start_acceleration,
start_idx,
traj_dt,
batch_size,
horizon,
dof,
mode=-1,
):
r = tensor_step_cu.step_idx_position2(
out_position,
out_velocity,
out_acceleration,
out_jerk,
u_position,
start_position,
start_velocity,
start_acceleration,
start_idx,
traj_dt,
batch_size,
horizon,
dof,
mode,
)
return (r[0], r[1], r[2], r[3])
def tensor_step_pos_clique_fwd(
out_position,
out_velocity,
out_acceleration,
out_jerk,
u_position,
start_position,
start_velocity,
start_acceleration,
traj_dt,
batch_size,
horizon,
dof,
mode=-1,
):
r = tensor_step_cu.step_position2(
out_position,
out_velocity,
out_acceleration,
out_jerk,
u_position,
start_position,
start_velocity,
start_acceleration,
traj_dt,
batch_size,
horizon,
dof,
mode,
)
return (r[0], r[1], r[2], r[3])
def tensor_step_acc_fwd(
out_position,
out_velocity,
out_acceleration,
out_jerk,
u_acc,
start_position,
start_velocity,
start_acceleration,
traj_dt,
batch_size,
horizon,
dof,
use_rk2=True,
):
r = tensor_step_cu.step_acceleration(
out_position,
out_velocity,
out_acceleration,
out_jerk,
u_acc,
start_position,
start_velocity,
start_acceleration,
traj_dt,
batch_size,
horizon,
dof,
use_rk2,
)
    return (r[0], r[1], r[2], r[3])  # output: position, velocity, acceleration, jerk
def tensor_step_acc_idx_fwd(
out_position,
out_velocity,
out_acceleration,
out_jerk,
u_acc,
start_position,
start_velocity,
start_acceleration,
start_idx,
traj_dt,
batch_size,
horizon,
dof,
use_rk2=True,
):
r = tensor_step_cu.step_acceleration_idx(
out_position,
out_velocity,
out_acceleration,
out_jerk,
u_acc,
start_position,
start_velocity,
start_acceleration,
start_idx,
traj_dt,
batch_size,
horizon,
dof,
use_rk2,
)
    return (r[0], r[1], r[2], r[3])  # output: position, velocity, acceleration, jerk
def tensor_step_pos_clique_bwd(
out_grad_position,
grad_position,
grad_velocity,
grad_acceleration,
grad_jerk,
traj_dt,
batch_size,
horizon,
dof,
mode=-1,
):
r = tensor_step_cu.step_position_backward2(
out_grad_position,
grad_position,
grad_velocity,
grad_acceleration,
grad_jerk,
traj_dt,
batch_size,
horizon,
dof,
mode,
)
return r[0]
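A hedged end-to-end sketch of the position-clique stepping wrapper follows. The (batch, horizon, dof) buffer layouts and the traj_dt shape are assumptions for illustration only; the authoritative callers are cuRobo's rollout classes.

# NOTE: shapes below are assumptions; verify against the rollout code before relying on them.
import torch

batch_size, horizon, dof = 32, 30, 7
device, dtype = torch.device("cuda"), torch.float32

def buf():
    return torch.zeros(batch_size, horizon, dof, device=device, dtype=dtype)

out_position, out_velocity, out_acceleration, out_jerk = buf(), buf(), buf(), buf()
u_position = torch.randn(batch_size, horizon, dof, device=device, dtype=dtype)
start_position = torch.zeros(batch_size, dof, device=device, dtype=dtype)
start_velocity = torch.zeros(batch_size, dof, device=device, dtype=dtype)
start_acceleration = torch.zeros(batch_size, dof, device=device, dtype=dtype)
traj_dt = torch.full((horizon,), 0.02, device=device, dtype=dtype)

pos, vel, acc, jerk = tensor_step_pos_clique_fwd(
    out_position, out_velocity, out_acceleration, out_jerk,
    u_position, start_position, start_velocity, start_acceleration,
    traj_dt, batch_size, horizon, dof,
)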

View File

@@ -0,0 +1,34 @@
#
# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
# property and proprietary rights in and to this material, related
# documentation and any modifications thereto. Any use, reproduction,
# disclosure or distribution of this material and related documentation
# without an express license agreement from NVIDIA CORPORATION or
# its affiliates is strictly prohibited.
#
# Standard Library
import os
def get_cpp_path():
path = os.path.dirname(__file__)
return os.path.join(path, "cpp")
def join_path(path1, path2):
if isinstance(path2, str):
return os.path.join(path1, path2)
else:
return path2
def add_cpp_path(sources):
cpp_path = get_cpp_path()
new_list = []
for s in sources:
s = join_path(cpp_path, s)
new_list.append(s)
return new_list
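For example, the JIT fallbacks in the modules above pass kernel source names through add_cpp_path, which resolves them against the packaged cpp/ directory:

sources = add_cpp_path(["tensor_step_cuda.cpp", "tensor_step_kernel.cu"])
# -> [".../curobo/curobolib/cpp/tensor_step_cuda.cpp",
#     ".../curobo/curobolib/cpp/tensor_step_kernel.cu"]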