kinematics refactor with mimic joint support

2024-04-03 18:42:01 -07:00
parent b481ee201a
commit 774dcfd609
60 changed files with 2177 additions and 810 deletions
--- a/src/curobo/curobolib/cpp/check_cuda.h
+++ b/src/curobo/curobolib/cpp/check_cuda.h
@@ -0,0 +1,21 @@
+/*
+ * Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ *
+ * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
+ * property and proprietary rights in and to this material, related
+ * documentation and any modifications thereto. Any use, reproduction,
+ * disclosure or distribution of this material and related documentation
+ * without an express license agreement from NVIDIA CORPORATION or
+ * its affiliates is strictly prohibited.
+ */
+#include <torch/extension.h>
+#include <c10/cuda/CUDAGuard.h>
+
+
+// NOTE: AT_ASSERT has become AT_CHECK on master after 0.4.
+#define CHECK_CUDA(x) AT_ASSERTM(x.is_cuda(), # x " must be a CUDA tensor")
+#define CHECK_CONTIGUOUS(x) AT_ASSERTM(x.is_contiguous(), # x " must be contiguous")
+#define CHECK_INPUT(x) CHECK_CUDA(x);  CHECK_CONTIGUOUS(x)
+
+#define CHECK_FP8 defined(CUDA_VERSION) && CUDA_VERSION >= 11080 && TORCH_VERSION_MAJOR >= 2 && TORCH_VERSION_MINOR >= 2
+#define CHECK_INPUT_GUARD(x) CHECK_INPUT(x);   const at::cuda::OptionalCUDAGuard guard(x.device())
--- a/src/curobo/curobolib/cpp/cuda_precisions.h
+++ b/src/curobo/curobolib/cpp/cuda_precisions.h
@@ -0,0 +1,17 @@
+/*
+ * Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ *
+ * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
+ * property and proprietary rights in and to this material, related
+ * documentation and any modifications thereto. Any use, reproduction,
+ * disclosure or distribution of this material and related documentation
+ * without an express license agreement from NVIDIA CORPORATION or
+ * its affiliates is strictly prohibited.
+ */
+#include "check_cuda.h"
+
+#include <cuda_fp16.h>
+#include <cuda_bf16.h>
+#if CHECK_FP8
+#include <cuda_fp8.h>
+#endif
--- a/src/curobo/curobolib/cpp/geom_cuda.cpp
+++ b/src/curobo/curobolib/cpp/geom_cuda.cpp
@@ -13,7 +13,7 @@

 #include <c10/cuda/CUDAGuard.h>
 #include <vector>
-
+#include "check_cuda.h"
 // CUDA forward declarations

 std::vector<torch::Tensor>self_collision_distance(
@@ -168,13 +168,6 @@ backward_pose_distance(torch::Tensor       out_grad_p,

 // C++ interface

-// NOTE: AT_ASSERT has become AT_CHECK on master after 0.4.
-#define CHECK_CUDA(x) AT_ASSERTM(x.is_cuda(), # x " must be a CUDA tensor")
-#define CHECK_CONTIGUOUS(x) \
-  AT_ASSERTM(x.is_contiguous(), # x " must be contiguous")
-#define CHECK_INPUT(x) \
-  CHECK_CUDA(x);       \
-  CHECK_CONTIGUOUS(x)

 std::vector<torch::Tensor>self_collision_distance_wrapper(
  torch::Tensor out_distance, torch::Tensor out_vec,
--- a/src/curobo/curobolib/cpp/kinematics_fused_cuda.cpp
+++ b/src/curobo/curobolib/cpp/kinematics_fused_cuda.cpp
@@ -13,6 +13,8 @@
 #include <c10/cuda/CUDAGuard.h>
 #include <vector>

+#include "check_cuda.h"
+
 // CUDA forward declarations

 std::vector<torch::Tensor>
@@ -33,7 +35,9 @@ std::vector<torch::Tensor>kin_fused_forward(
  const torch::Tensor joint_map_type,
  const torch::Tensor store_link_map,
  const torch::Tensor link_sphere_map,
+  const torch::Tensor joint_offset_map,
  const int           batch_size,
+  const int           n_joints,
  const int           n_spheres,
  const bool          use_global_cumul = false);

@@ -52,76 +56,20 @@ std::vector<torch::Tensor>kin_fused_backward_16t(
  const torch::Tensor store_link_map,
  const torch::Tensor link_sphere_map,
  const torch::Tensor link_chain_map,
+  const torch::Tensor joint_offset_map,
  const int           batch_size,
+  const int           n_joints,
  const int           n_spheres,
  const bool          sparsity_opt     = true,
  const bool          use_global_cumul = false);

 // C++ interface

-// NOTE: AT_ASSERT has become AT_CHECK on master after 0.4.
-#define CHECK_CUDA(x) AT_ASSERTM(x.is_cuda(), # x " must be a CUDA tensor")
-#define CHECK_CONTIGUOUS(x) \
-  AT_ASSERTM(x.is_contiguous(), # x " must be contiguous")
-#define CHECK_INPUT(x) \
-  CHECK_CUDA(x);       \
-  CHECK_CONTIGUOUS(x)
-
-std::vector<torch::Tensor>kin_forward_wrapper(
-  torch::Tensor link_pos, torch::Tensor link_quat,
-  torch::Tensor batch_robot_spheres, torch::Tensor global_cumul_mat,
-  const torch::Tensor joint_vec, const torch::Tensor fixed_transform,
-  const torch::Tensor robot_spheres, const torch::Tensor link_map,
-  const torch::Tensor joint_map, const torch::Tensor joint_map_type,
-  const torch::Tensor store_link_map, const torch::Tensor link_sphere_map,
-  const int batch_size, const int n_spheres,
-  const bool use_global_cumul = false)
-{
-  const at::cuda::OptionalCUDAGuard guard(joint_vec.device());
-
-  // TODO: add check input
-  return kin_fused_forward(
-    link_pos, link_quat, batch_robot_spheres, global_cumul_mat, joint_vec,
-    fixed_transform, robot_spheres, link_map, joint_map, joint_map_type,
-    store_link_map, link_sphere_map, batch_size, n_spheres, use_global_cumul);
-}
-
-std::vector<torch::Tensor>kin_backward_wrapper(
-  torch::Tensor grad_out, const torch::Tensor grad_nlinks_pos,
-  const torch::Tensor grad_nlinks_quat, const torch::Tensor grad_spheres,
-  const torch::Tensor global_cumul_mat, const torch::Tensor joint_vec,
-  const torch::Tensor fixed_transform, const torch::Tensor robot_spheres,
-  const torch::Tensor link_map, const torch::Tensor joint_map,
-  const torch::Tensor joint_map_type, const torch::Tensor store_link_map,
-  const torch::Tensor link_sphere_map, const torch::Tensor link_chain_map,
-  const int batch_size, const int n_spheres, const bool sparsity_opt = true,
-  const bool use_global_cumul = false)
-{
-  const at::cuda::OptionalCUDAGuard guard(joint_vec.device());
-
-  return kin_fused_backward_16t(
-    grad_out, grad_nlinks_pos, grad_nlinks_quat, grad_spheres,
-    global_cumul_mat, joint_vec, fixed_transform, robot_spheres, link_map,
-    joint_map, joint_map_type, store_link_map, link_sphere_map,
-    link_chain_map, batch_size, n_spheres, sparsity_opt, use_global_cumul);
-}
-
-std::vector<torch::Tensor>
-matrix_to_quaternion_wrapper(torch::Tensor       out_quat,
-                             const torch::Tensor in_rot // batch_size, 3
-                             )
-{
-  const at::cuda::OptionalCUDAGuard guard(in_rot.device());
-
-  CHECK_INPUT(in_rot);
-  CHECK_INPUT(out_quat);
-  return matrix_to_quaternion(out_quat, in_rot);
-}

 PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
 {
-  m.def("forward",              &kin_forward_wrapper,  "Kinematics fused forward (CUDA)");
-  m.def("backward",             &kin_backward_wrapper, "Kinematics fused backward (CUDA)");
-  m.def("matrix_to_quaternion", &matrix_to_quaternion_wrapper,
+  m.def("forward",              &kin_fused_forward,  "Kinematics fused forward (CUDA)");
+  m.def("backward",             &kin_fused_backward_16t, "Kinematics fused backward (CUDA)");
+  m.def("matrix_to_quaternion", &matrix_to_quaternion,
        "Rotation Matrix to Quaternion (CUDA)");
 }
--- a/src/curobo/curobolib/cpp/kinematics_fused_kernel.cu
+++ b/src/curobo/curobolib/cpp/kinematics_fused_kernel.cu
@@ -10,12 +10,13 @@
 */

 #include <c10/cuda/CUDAStream.h>
+
 #include <cuda.h>
 #include <cuda_runtime.h>
 #include <torch/extension.h>

 #include "helper_math.h"
-#include <cuda_fp16.h>
+#include "check_cuda.h"
 #include <vector>

 #define M 4
@@ -28,13 +29,6 @@
 #define Y_ROT 4
 #define Z_ROT 5

-#define X_PRISM_NEG 6
-#define Y_PRISM_NEG 7
-#define Z_PRISM_NEG 8
-#define X_ROT_NEG 9
-#define Y_ROT_NEG 10
-#define Z_ROT_NEG 11
-
 #define MAX_BATCH_PER_BLOCK 32    // tunable parameter for improving occupancy
 #define MAX_BW_BATCH_PER_BLOCK 16 // tunable parameter for improving occupancy

@@ -46,6 +40,8 @@ namespace Curobo
 {
  namespace Kinematics
  {
+
+
    template<typename psum_t>
    __device__ __forceinline__ void scale_cross_sum(float3 a, float3 b,
                                                    float3 scale, psum_t& sum_out)
@@ -293,36 +289,11 @@ namespace Curobo
      }
    }

-    __device__ __forceinline__ void update_joint_type_direction(int& j_type, int8_t& axis_sign)
-    {
-      // Assume that input j_type >= 0 . Check fixed joint outside of this function.
-
-      // Don't do anything if j_type < 6. j_type range is [0, 11]
-      // divergence here.
-      axis_sign = 1;
-
-      if (j_type >= 6)
-      {
-        j_type   -= 6;
-        axis_sign = -1;
-      }
-    }
-
-    __device__ __forceinline__ void update_joint_type_direction(int& j_type)
-    {
-      // Assume that input j_type >= 0 . Check fixed joint outside of this function.
-
-      // Don't do anything if j_type < 6. j_type range is [0, 11]
-      // divergence here.
-      if (j_type >= 6)
-      {
-        j_type -= 6;
-      }
-    }

    __device__ __forceinline__ void update_axis_direction(
      float& angle,
-      int  & j_type)
+      int  & j_type,
+      const float2 &j_offset)
    {
      // Assume that input j_type >= 0 . Check fixed joint outside of this function.
      // sign should be +ve <= 5 and -ve >5
@@ -330,9 +301,7 @@ namespace Curobo
      // cuda code treats -1.0 * 0.0 as negative. Hence we subtract 6. If in future, -1.0 * 0.0 =
      // +ve,
      // then this code should be j_type - 5.
-      angle = -1 * copysignf(1.0, j_type - 6) * angle;
-
-      update_joint_type_direction(j_type);
+      angle = j_offset.x * angle + j_offset.y;
    }

    // In the following versions of rot_fn, some non-nan values may become nan as we
@@ -462,7 +431,7 @@ namespace Curobo
    template<typename psum_t>
    __device__ __forceinline__ void
    rot_backward_translation(const float3& vec, float *cumul_mat, float *l_pos,
-                             const float3& loc_grad, psum_t& grad_q, const int8_t axis_sign = 1)
+                             const float3& loc_grad, psum_t& grad_q, const float axis_sign = 1)
    {
      float3 e_pos, j_pos;

@@ -481,7 +450,7 @@ namespace Curobo
    rot_backward_rotation(const float3 vec,
                          const float3 grad_vec,
                          psum_t     & grad_q,
-                          const int8_t axis_sign = 1)
+                          const float axis_sign = 1)
    {
      grad_q += axis_sign * dot(vec, grad_vec);
    }
@@ -489,7 +458,7 @@ namespace Curobo
    template<typename psum_t>
    __device__ __forceinline__ void
    prism_backward_translation(const float3 vec, const float3 grad_vec,
-                               psum_t& grad_q, const int8_t axis_sign = 1)
+                               psum_t& grad_q, const float axis_sign = 1)
    {
      grad_q += axis_sign * dot(vec, grad_vec);
    }
@@ -497,7 +466,7 @@ namespace Curobo
    template<typename psum_t>
    __device__ __forceinline__ void
    z_rot_backward(float *link_cumul_mat, float *l_pos, float3& loc_grad_position,
-                   float3& loc_grad_orientation, psum_t& grad_q, const int8_t axis_sign = 1)
+                   float3& loc_grad_orientation, psum_t& grad_q, const float axis_sign = 1)
    {
      float3 vec =
        make_float3(link_cumul_mat[2], link_cumul_mat[6], link_cumul_mat[10]);
@@ -512,7 +481,7 @@ namespace Curobo
    template<typename psum_t>
    __device__ __forceinline__ void
    x_rot_backward(float *link_cumul_mat, float *l_pos, float3& loc_grad_position,
-                   float3& loc_grad_orientation, psum_t& grad_q,  const int8_t axis_sign = 1)
+                   float3& loc_grad_orientation, psum_t& grad_q,  const float axis_sign = 1)
    {
      float3 vec =
        make_float3(link_cumul_mat[0], link_cumul_mat[4], link_cumul_mat[8]);
@@ -527,7 +496,7 @@ namespace Curobo
    template<typename psum_t>
    __device__ __forceinline__ void
    y_rot_backward(float *link_cumul_mat, float *l_pos, float3& loc_grad_position,
-                   float3& loc_grad_orientation, psum_t& grad_q,  const int8_t axis_sign = 1)
+                   float3& loc_grad_orientation, psum_t& grad_q,  const float axis_sign = 1)
    {
      float3 vec =
        make_float3(link_cumul_mat[1], link_cumul_mat[5], link_cumul_mat[9]);
@@ -542,7 +511,7 @@ namespace Curobo
    template<typename psum_t>
    __device__ __forceinline__ void
    xyz_prism_backward_translation(float *cumul_mat, float3& loc_grad,
-                                   psum_t& grad_q, int xyz,  const int8_t axis_sign = 1)
+                                   psum_t& grad_q, int xyz,  const float axis_sign = 1)
    {
      prism_backward_translation(
        make_float3(cumul_mat[0 + xyz], cumul_mat[4 + xyz], cumul_mat[8 + xyz]),
@@ -553,7 +522,7 @@ namespace Curobo
    __device__ __forceinline__ void x_prism_backward_translation(float       *cumul_mat,
                                                                 float3     & loc_grad,
                                                                 psum_t     & grad_q,
-                                                                 const int8_t axis_sign = 1)
+                                                                 const float axis_sign = 1)
    {
      // get rotation vector:
      prism_backward_translation(
@@ -564,7 +533,7 @@ namespace Curobo
    __device__ __forceinline__ void y_prism_backward_translation(float       *cumul_mat,
                                                                 float3     & loc_grad,
                                                                 psum_t     & grad_q,
-                                                                 const int8_t axis_sign = 1)
+                                                                 const float axis_sign = 1)
    {
      // get rotation vector:
      prism_backward_translation(
@@ -575,16 +544,15 @@ namespace Curobo
    __device__ __forceinline__ void z_prism_backward_translation(float       *cumul_mat,
                                                                 float3     & loc_grad,
                                                                 psum_t     & grad_q,
-                                                                 const int8_t axis_sign = 1)
+                                                                 const float axis_sign = 1)
    {
      // get rotation vector:
      prism_backward_translation(
        make_float3(cumul_mat[2], cumul_mat[6], cumul_mat[10]), loc_grad, grad_q, axis_sign);
    }
-
    __device__ __forceinline__ void
    xyz_rot_backward_translation(float *cumul_mat, float *l_pos, float3& loc_grad,
-                                 float& grad_q, int xyz,  const int8_t axis_sign = 1)
+                                 float& grad_q, int xyz,  const float axis_sign = 1)
    {
      // get rotation vector:
      rot_backward_translation(
@@ -592,10 +560,11 @@ namespace Curobo
        &cumul_mat[0], &l_pos[0], loc_grad, grad_q, axis_sign);
    }

+
    template<typename psum_t>
    __device__ __forceinline__ void
    x_rot_backward_translation(float *cumul_mat, float *l_pos, float3& loc_grad,
-                               psum_t& grad_q,  const int8_t axis_sign = 1)
+                               psum_t& grad_q,  const float axis_sign = 1)
    {
      // get rotation vector:
      rot_backward_translation(
@@ -606,7 +575,7 @@ namespace Curobo
    template<typename psum_t>
    __device__ __forceinline__ void
    y_rot_backward_translation(float *cumul_mat, float *l_pos, float3& loc_grad,
-                               psum_t& grad_q,  const int8_t axis_sign = 1)
+                               psum_t& grad_q,  const float axis_sign = 1)
    {
      // get rotation vector:
      rot_backward_translation(
@@ -617,7 +586,7 @@ namespace Curobo
    template<typename psum_t>
    __device__ __forceinline__ void
    z_rot_backward_translation(float *cumul_mat, float *l_pos, float3& loc_grad,
-                               psum_t& grad_q,  const int8_t axis_sign = 1)
+                               psum_t& grad_q,  const float axis_sign = 1)
    {
      // get rotation vector:
      rot_backward_translation(
@@ -629,18 +598,19 @@ namespace Curobo
    // This one should be about 10% faster.
    template<typename scalar_t, bool use_global_cumul>
    __global__ void
-    kin_fused_warp_kernel2(scalar_t *link_pos,             // batchSize xz store_n_links x M x M
-                           scalar_t *link_quat,            // batchSize x store_n_links x M x M
+    kin_fused_warp_kernel2(float *link_pos,             // batchSize xz store_n_links x M x M
+                           float *link_quat,            // batchSize x store_n_links x M x M
                           scalar_t *b_robot_spheres,      // batchSize x nspheres x M
-                           scalar_t *global_cumul_mat,
-                           const scalar_t *q,              // batchSize x njoints
-                           const scalar_t *fixedTransform, // nlinks x M x M
-                           const scalar_t *robot_spheres,  // nspheres x M
+                           float *global_cumul_mat, // batchSize x nlinks x M x M
+                           const float *q,              // batchSize x njoints
+                           const float *fixedTransform, // nlinks x M x M
+                           const float *robot_spheres,  // nspheres x M
                           const int8_t *jointMapType,     // nlinks
                           const int16_t *jointMap,        // nlinks
                           const int16_t *linkMap,         // nlinks
                           const int16_t *storeLinkMap,    // store_n_links
                           const int16_t *linkSphereMap,   // nspheres
+                           const float *jointOffset, // nlinks
                           const int batchSize, const int nspheres,
                           const int nlinks, const int njoints,
                           const int store_n_links)
@@ -688,7 +658,8 @@ namespace Curobo
        else
        {
          float angle = q[batch * njoints + jointMap[l]];
-          update_axis_direction(angle, j_type);
+          float2 angle_offset = *(float2 *)&jointOffset[l*2];
+          update_axis_direction(angle, j_type, angle_offset);

          if (j_type <= Z_PRISM)
          {
@@ -794,20 +765,21 @@ namespace Curobo
    template<typename scalar_t, typename psum_t, bool use_global_cumul,
             bool enable_sparsity_opt, int16_t MAX_JOINTS, bool PARALLEL_WRITE>
    __global__ void kin_fused_backward_kernel3(
-      scalar_t *grad_out_link_q,       // batchSize * njoints
-      const scalar_t *grad_nlinks_pos, // batchSize * store_n_links * 16
-      const scalar_t *grad_nlinks_quat,
+      float *grad_out_link_q,       // batchSize * njoints
+      const float *grad_nlinks_pos, // batchSize * store_n_links * 16
+      const float *grad_nlinks_quat,
      const scalar_t *grad_spheres,    // batchSize * nspheres * 4
-      const scalar_t *global_cumul_mat,
-      const scalar_t *q,               // batchSize * njoints
-      const scalar_t *fixedTransform,  // nlinks * 16
-      const scalar_t *robotSpheres,    // batchSize * nspheres * 4
+      const float *global_cumul_mat,
+      const float *q,               // batchSize * njoints
+      const float *fixedTransform,  // nlinks * 16
+      const float *robotSpheres,    // batchSize * nspheres * 4
      const int8_t *jointMapType,      // nlinks
      const int16_t *jointMap,         // nlinks
      const int16_t *linkMap,          // nlinks
      const int16_t *storeLinkMap,     // store_n_links
      const int16_t *linkSphereMap,    // nspheres
      const int16_t *linkChainMap,     // nlinks*nlinks
+      const float *jointOffset,        // nlinks*2
      const int batchSize, const int nspheres, const int nlinks,
      const int njoints, const int store_n_links)
    {
@@ -819,8 +791,7 @@ namespace Curobo

      if (batch >= batchSize)
        return;
-
-      // Each thread computes one element of the cumul_mat.
+        // Each thread computes one element of the cumul_mat.
      // first 4 threads compute a row of the output;
      const int elem_idx    = threadIdx.x % 16;
      const int col_idx     = elem_idx % 4;
@@ -832,6 +803,7 @@ namespace Curobo
        for (int l = 0; l < nlinks; l++)
        {
          int outAddrStart = matAddrBase + l * M * M; // + (t % M) * M;
+
          cumul_mat[outAddrStart + elem_idx] =
            global_cumul_mat[batch * nlinks * M * M + l * M * M + elem_idx];
        }
@@ -857,8 +829,8 @@ namespace Curobo
          else
          {
            float angle = q[batch * njoints + jointMap[l]];
-
-            update_axis_direction(angle, j_type);
+            float2 angle_offset = *(float2 *)&jointOffset[l*2];
+            update_axis_direction(angle, j_type, angle_offset);

            if (j_type <= Z_PRISM)
            {
@@ -949,15 +921,11 @@ namespace Curobo
          {
            continue;
          }
-          int8_t axis_sign = 1;
+          float axis_sign = jointOffset[j*2];

          int j_type = jointMapType[j];

-          if (j_type != FIXED)
-          {
-            update_joint_type_direction(j_type, axis_sign);
-          }
-
+          
          if (j_type == Z_ROT)
          {
            float result = 0.0;
@@ -1046,12 +1014,13 @@ namespace Curobo
        // for (int16_t k = joints_per_thread; k >= 0; k--)
        for (int16_t k = 0; k < joints_per_thread; k++)
        {
-          // int16_t j = elem_idx * joints_per_thread + k;
-          int16_t j = elem_idx + k * 16;
+          int16_t j = elem_idx * joints_per_thread + k;
+          //int16_t j = elem_idx + k * 16;
+          // int16_t j = elem_idx + k * 16; // (threadidx.x % 16) + k * 16 (0 to 16)

          // int16_t j = k * M + elem_idx;
-          if ((j > max_lmap) || (j < 0))
-            continue;
+          if ((j > max_lmap))
+            break;

          // This can be spread across threads as they are not sequential?
          if (linkChainMap[l_map * nlinks + j] == 0.0)
@@ -1061,12 +1030,8 @@ namespace Curobo
          int16_t j_idx  = jointMap[j];
          int     j_type = jointMapType[j];

-          int8_t axis_sign = 1;
+          float axis_sign = jointOffset[j*2];

-          if (j_type != FIXED)
-          {
-            update_joint_type_direction(j_type, axis_sign);
-          }

          // get rotation vector:
          if (j_type == Z_ROT)
@@ -1090,8 +1055,9 @@ namespace Curobo
                           g_position, g_orientation, psum_grad[j_idx], axis_sign);
          }
        }
-      }

+      }
+      __syncthreads();
      if (PARALLEL_WRITE)
      {
        // accumulate the partial sums across the 16 threads
@@ -1114,8 +1080,8 @@ namespace Curobo

        for (int16_t j = 0; j < joints_per_thread; j++)
        {
-          // const int16_t j_idx = elem_idx * joints_per_thread + j;
-          const int16_t j_idx = elem_idx + j * 16;
+          const int16_t j_idx = elem_idx * joints_per_thread + j;
+          //const int16_t j_idx = elem_idx + j * 16;

          if (j_idx >= njoints)
          {
@@ -1151,7 +1117,7 @@ namespace Curobo
        {
          {
            grad_out_link_q[batch * njoints + j] =
-              psum_grad[j]; // write the sum to memory
+              (float) psum_grad[j]; // write the sum to memory
          }
        }
      }
@@ -1195,11 +1161,27 @@ std::vector<torch::Tensor>kin_fused_forward(
  const torch::Tensor robot_spheres, const torch::Tensor link_map,
  const torch::Tensor joint_map, const torch::Tensor joint_map_type,
  const torch::Tensor store_link_map, const torch::Tensor link_sphere_map,
-  const int batch_size, const int n_spheres,
+  const torch::Tensor joint_offset_map, 
+  const int batch_size, 
+  const int n_joints,const int n_spheres,
  const bool use_global_cumul = false)
 {
  using namespace Curobo::Kinematics;
-  const int n_joints      = joint_vec.size(0) / batch_size;
+  CHECK_INPUT_GUARD(joint_vec);
+  CHECK_INPUT(link_pos);
+  CHECK_INPUT(link_quat);
+  CHECK_INPUT(global_cumul_mat);
+  CHECK_INPUT(batch_robot_spheres);
+  CHECK_INPUT(fixed_transform);
+  CHECK_INPUT(robot_spheres);
+  CHECK_INPUT(link_map);
+  CHECK_INPUT(joint_map);
+  CHECK_INPUT(joint_map_type);
+  CHECK_INPUT(store_link_map);
+  CHECK_INPUT(link_sphere_map);
+  //CHECK_INPUT(link_chain_map);
+  CHECK_INPUT(joint_offset_map);
+  
  const int n_links       = link_map.size(0);
  const int store_n_links = link_pos.size(1);
  assert(joint_map.dtype() == torch::kInt16);
@@ -1238,38 +1220,42 @@ std::vector<torch::Tensor>kin_fused_forward(
  if (use_global_cumul)
  {
    AT_DISPATCH_FLOATING_TYPES(
-      link_pos.scalar_type(), "kin_fused_forward", ([&] {
+      batch_robot_spheres.scalar_type(), "kin_fused_forward", ([&] {
      kin_fused_warp_kernel2<scalar_t, true>
        << < blocksPerGrid, threadsPerBlock, sharedMemSize, stream >> > (
-        link_pos.data_ptr<scalar_t>(), link_quat.data_ptr<scalar_t>(),
+        link_pos.data_ptr<float>(), link_quat.data_ptr<float>(),
        batch_robot_spheres.data_ptr<scalar_t>(),
-        global_cumul_mat.data_ptr<scalar_t>(),
-        joint_vec.data_ptr<scalar_t>(),
-        fixed_transform.data_ptr<scalar_t>(),
-        robot_spheres.data_ptr<scalar_t>(),
+        global_cumul_mat.data_ptr<float>(),
+        joint_vec.data_ptr<float>(),
+        fixed_transform.data_ptr<float>(),
+        robot_spheres.data_ptr<float>(),
        joint_map_type.data_ptr<int8_t>(),
        joint_map.data_ptr<int16_t>(), link_map.data_ptr<int16_t>(),
        store_link_map.data_ptr<int16_t>(),
-        link_sphere_map.data_ptr<int16_t>(), batch_size, n_spheres,
+        link_sphere_map.data_ptr<int16_t>(), 
+        joint_offset_map.data_ptr<float>(),
+        batch_size, n_spheres,
        n_links, n_joints, store_n_links);
    }));
  }
  else
  {
    AT_DISPATCH_FLOATING_TYPES(
-      link_pos.scalar_type(), "kin_fused_forward", ([&] {
+      batch_robot_spheres.scalar_type(), "kin_fused_forward", ([&] {
      kin_fused_warp_kernel2<scalar_t, false>
        << < blocksPerGrid, threadsPerBlock, sharedMemSize, stream >> > (
-        link_pos.data_ptr<scalar_t>(), link_quat.data_ptr<scalar_t>(),
+        link_pos.data_ptr<float>(), link_quat.data_ptr<float>(),
        batch_robot_spheres.data_ptr<scalar_t>(),
-        global_cumul_mat.data_ptr<scalar_t>(),
-        joint_vec.data_ptr<scalar_t>(),
-        fixed_transform.data_ptr<scalar_t>(),
-        robot_spheres.data_ptr<scalar_t>(),
+        global_cumul_mat.data_ptr<float>(),
+        joint_vec.data_ptr<float>(),
+        fixed_transform.data_ptr<float>(),
+        robot_spheres.data_ptr<float>(),
        joint_map_type.data_ptr<int8_t>(),
        joint_map.data_ptr<int16_t>(), link_map.data_ptr<int16_t>(),
        store_link_map.data_ptr<int16_t>(),
-        link_sphere_map.data_ptr<int16_t>(), batch_size, n_spheres,
+        link_sphere_map.data_ptr<int16_t>(), 
+        joint_offset_map.data_ptr<float>(),
+        batch_size, n_spheres,
        n_links, n_joints, store_n_links);
    }));
  }
@@ -1292,15 +1278,29 @@ std::vector<torch::Tensor>kin_fused_backward_16t(
  const torch::Tensor link_map, const torch::Tensor joint_map,
  const torch::Tensor joint_map_type, const torch::Tensor store_link_map,
  const torch::Tensor link_sphere_map, const torch::Tensor link_chain_map,
-  const int batch_size, const int n_spheres, const bool sparsity_opt = true,
+  const torch::Tensor joint_offset_map,
+  const int batch_size,  const int n_joints, const int n_spheres, const bool sparsity_opt = true,
  const bool use_global_cumul = false)
 {
  using namespace Curobo::Kinematics;
-
-  const int n_joints      = joint_vec.size(0) / batch_size;
+  CHECK_INPUT_GUARD(joint_vec);
+  CHECK_INPUT(grad_out);
+  CHECK_INPUT(grad_nlinks_pos);  
+  CHECK_INPUT(grad_nlinks_quat);
+  CHECK_INPUT(global_cumul_mat);
+  CHECK_INPUT(fixed_transform);
+  CHECK_INPUT(robot_spheres);
+  CHECK_INPUT(link_map);
+  CHECK_INPUT(joint_map);
+  CHECK_INPUT(joint_map_type);
+  CHECK_INPUT(store_link_map);
+  CHECK_INPUT(link_sphere_map);
+  CHECK_INPUT(link_chain_map);
+  CHECK_INPUT(joint_offset_map);
+  
  const int n_links       = link_map.size(0);
  const int store_n_links = store_link_map.size(0);
-
+  
  // assert(n_links < 128);
  assert(n_joints < 128); // for larger num. of joints, change kernel3's
                          // MAX_JOINTS template value.
@@ -1341,72 +1341,77 @@ std::vector<torch::Tensor>kin_fused_backward_16t(

  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  assert(sparsity_opt);
-
  if (use_global_cumul)
  {
    if (n_joints < 16)
    {
      AT_DISPATCH_FLOATING_TYPES(
-        grad_nlinks_pos.scalar_type(), "kin_fused_backward_16t", ([&] {
-        kin_fused_backward_kernel3<scalar_t, float, true, true, 16, true>
+        grad_spheres.scalar_type(), "kin_fused_backward_16t", ([&] {
+        kin_fused_backward_kernel3<scalar_t, double, true, true, 16, true>
          << < blocksPerGrid, threadsPerBlock, sharedMemSize, stream >> > (
-          grad_out.data_ptr<scalar_t>(),
-          grad_nlinks_pos.data_ptr<scalar_t>(),
-          grad_nlinks_quat.data_ptr<scalar_t>(),
+          grad_out.data_ptr<float>(),
+          grad_nlinks_pos.data_ptr<float>(),
+          grad_nlinks_quat.data_ptr<float>(),
          grad_spheres.data_ptr<scalar_t>(),
-          global_cumul_mat.data_ptr<scalar_t>(),
-          joint_vec.data_ptr<scalar_t>(),
-          fixed_transform.data_ptr<scalar_t>(),
-          robot_spheres.data_ptr<scalar_t>(),
+          global_cumul_mat.data_ptr<float>(),
+          joint_vec.data_ptr<float>(),
+          fixed_transform.data_ptr<float>(),
+          robot_spheres.data_ptr<float>(),
          joint_map_type.data_ptr<int8_t>(),
          joint_map.data_ptr<int16_t>(), link_map.data_ptr<int16_t>(),
          store_link_map.data_ptr<int16_t>(),
          link_sphere_map.data_ptr<int16_t>(),
-          link_chain_map.data_ptr<int16_t>(), batch_size, n_spheres,
+          link_chain_map.data_ptr<int16_t>(), 
+          joint_offset_map.data_ptr<float>(),
+          batch_size, n_spheres,
          n_links, n_joints, store_n_links);
      }));
    }
    else if (n_joints < 64)
    {
      AT_DISPATCH_FLOATING_TYPES(
-        grad_nlinks_pos.scalar_type(), "kin_fused_backward_16t", ([&] {
-        kin_fused_backward_kernel3<scalar_t, float, true, true, 64, true>
+        grad_spheres.scalar_type(), "kin_fused_backward_16t", ([&] {
+        kin_fused_backward_kernel3<scalar_t, double, true, true, 64, true>
          << < blocksPerGrid, threadsPerBlock, sharedMemSize, stream >> > (
-          grad_out.data_ptr<scalar_t>(),
-          grad_nlinks_pos.data_ptr<scalar_t>(),
-          grad_nlinks_quat.data_ptr<scalar_t>(),
+          grad_out.data_ptr<float>(),
+          grad_nlinks_pos.data_ptr<float>(),
+          grad_nlinks_quat.data_ptr<float>(),
          grad_spheres.data_ptr<scalar_t>(),
-          global_cumul_mat.data_ptr<scalar_t>(),
-          joint_vec.data_ptr<scalar_t>(),
-          fixed_transform.data_ptr<scalar_t>(),
-          robot_spheres.data_ptr<scalar_t>(),
+          global_cumul_mat.data_ptr<float>(),
+          joint_vec.data_ptr<float>(),
+          fixed_transform.data_ptr<float>(),
+          robot_spheres.data_ptr<float>(),
          joint_map_type.data_ptr<int8_t>(),
          joint_map.data_ptr<int16_t>(), link_map.data_ptr<int16_t>(),
          store_link_map.data_ptr<int16_t>(),
          link_sphere_map.data_ptr<int16_t>(),
-          link_chain_map.data_ptr<int16_t>(), batch_size, n_spheres,
+          link_chain_map.data_ptr<int16_t>(), 
+          joint_offset_map.data_ptr<float>(),
+          batch_size, n_spheres,
          n_links, n_joints, store_n_links);
      }));
    }
    else
    {
      AT_DISPATCH_FLOATING_TYPES(
-        grad_nlinks_pos.scalar_type(), "kin_fused_backward_16t", ([&] {
-        kin_fused_backward_kernel3<scalar_t, float, true, true, 128, true>
+        grad_spheres.scalar_type(), "kin_fused_backward_16t", ([&] {
+        kin_fused_backward_kernel3<scalar_t, double, true, true, 128, true>
          << < blocksPerGrid, threadsPerBlock, sharedMemSize, stream >> > (
-          grad_out.data_ptr<scalar_t>(),
-          grad_nlinks_pos.data_ptr<scalar_t>(),
-          grad_nlinks_quat.data_ptr<scalar_t>(),
+          grad_out.data_ptr<float>(),
+          grad_nlinks_pos.data_ptr<float>(),
+          grad_nlinks_quat.data_ptr<float>(),
          grad_spheres.data_ptr<scalar_t>(),
-          global_cumul_mat.data_ptr<scalar_t>(),
-          joint_vec.data_ptr<scalar_t>(),
-          fixed_transform.data_ptr<scalar_t>(),
-          robot_spheres.data_ptr<scalar_t>(),
+          global_cumul_mat.data_ptr<float>(),
+          joint_vec.data_ptr<float>(),
+          fixed_transform.data_ptr<float>(),
+          robot_spheres.data_ptr<float>(),
          joint_map_type.data_ptr<int8_t>(),
          joint_map.data_ptr<int16_t>(), link_map.data_ptr<int16_t>(),
          store_link_map.data_ptr<int16_t>(),
          link_sphere_map.data_ptr<int16_t>(),
-          link_chain_map.data_ptr<int16_t>(), batch_size, n_spheres,
+          link_chain_map.data_ptr<int16_t>(), 
+          joint_offset_map.data_ptr<float>(),
+          batch_size, n_spheres,
          n_links, n_joints, store_n_links);
      }));
    }
@@ -1417,22 +1422,25 @@ std::vector<torch::Tensor>kin_fused_backward_16t(
  {
    //
    AT_DISPATCH_FLOATING_TYPES(
-      grad_nlinks_pos.scalar_type(), "kin_fused_backward_16t", ([&] {
-      kin_fused_backward_kernel3<scalar_t, float, false, true, 128, true>
+      grad_spheres.scalar_type(), "kin_fused_backward_16t", ([&] {
+      kin_fused_backward_kernel3<scalar_t, double, false, true, 128, true>
        << < blocksPerGrid, threadsPerBlock, sharedMemSize, stream >> > (
-        grad_out.data_ptr<scalar_t>(),
-        grad_nlinks_pos.data_ptr<scalar_t>(),
-        grad_nlinks_quat.data_ptr<scalar_t>(),
+        grad_out.data_ptr<float>(),
+        grad_nlinks_pos.data_ptr<float>(),
+        grad_nlinks_quat.data_ptr<float>(),
        grad_spheres.data_ptr<scalar_t>(),
-        global_cumul_mat.data_ptr<scalar_t>(),
-        joint_vec.data_ptr<scalar_t>(),
-        fixed_transform.data_ptr<scalar_t>(),
-        robot_spheres.data_ptr<scalar_t>(),
+        global_cumul_mat.data_ptr<float>(),
+        joint_vec.data_ptr<float>(),
+        fixed_transform.data_ptr<float>(),
+        robot_spheres.data_ptr<float>(),
        joint_map_type.data_ptr<int8_t>(),
-        joint_map.data_ptr<int16_t>(), link_map.data_ptr<int16_t>(),
+        joint_map.data_ptr<int16_t>(), 
+        link_map.data_ptr<int16_t>(),
        store_link_map.data_ptr<int16_t>(),
        link_sphere_map.data_ptr<int16_t>(),
-        link_chain_map.data_ptr<int16_t>(), batch_size, n_spheres,
+        link_chain_map.data_ptr<int16_t>(), 
+        joint_offset_map.data_ptr<float>(),
+        batch_size, n_spheres,
        n_links, n_joints, store_n_links);
    }));
  }
@@ -1447,7 +1455,8 @@ matrix_to_quaternion(torch::Tensor       out_quat,
                     )
 {
  using namespace Curobo::Kinematics;
-
+  CHECK_INPUT(out_quat);
+  CHECK_INPUT_GUARD(in_rot);
  // we compute the warp threads based on number of boxes:

  // TODO: verify this math
--- a/src/curobo/curobolib/cpp/lbfgs_step_kernel.cu
+++ b/src/curobo/curobolib/cpp/lbfgs_step_kernel.cu
@@ -807,6 +807,7 @@ lbfgs_cuda_fuse(torch::Tensor step_vec, torch::Tensor rho_buffer,
 {
  using namespace Curobo::Optimization;

+
  // call first kernel:
  //const bool use_experimental = true;
  const bool use_fixed_m = true;
--- a/src/curobo/curobolib/cpp/sphere_obb_kernel.cu
+++ b/src/curobo/curobolib/cpp/sphere_obb_kernel.cu
@@ -15,23 +15,13 @@
 #include <torch/extension.h>

 #include "helper_math.h"
-#include <cuda_fp16.h>
-#include <cuda_bf16.h>
+
 #include <vector>
 #include <torch/torch.h>
 #include <c10/cuda/CUDAGuard.h>
+#include "check_cuda.h"
+#include "cuda_precisions.h"

-// NOTE: AT_ASSERT has become AT_CHECK on master after 0.4.
-#define CHECK_CUDA(x) AT_ASSERTM(x.is_cuda(), # x " must be a CUDA tensor")
-#define CHECK_CONTIGUOUS(x) \
-  AT_ASSERTM(x.is_contiguous(), # x " must be contiguous")
-#define CHECK_INPUT(x) \
-  CHECK_CUDA(x);       \
-  CHECK_CONTIGUOUS(x)
-
-#if defined(CUDA_VERSION) && CUDA_VERSION >= 11080 && TORCH_VERSION_MAJOR >= 2 && TORCH_VERSION_MINOR >= 2
-#include <cuda_fp8.h>
-#endif
 #define M 4
 #define VOXEL_DEBUG true
 #define VOXEL_UNOBSERVED_DISTANCE -1000.0
@@ -63,8 +53,7 @@ namespace Curobo
    }


-    #if defined(CUDA_VERSION) && CUDA_VERSION >= 11080 && TORCH_VERSION_MAJOR >= 2 && TORCH_VERSION_MINOR >= 2
-
+    #if CHECK_FP8
    __device__ __forceinline__ float
    get_array_value(const at::Float8_e4m3fn *grid_features, const int voxel_idx)
    {
@@ -704,16 +693,21 @@ float4 &sum_pt)
    int3 &xyz_grid,
    float &interpolated_distance,
    int &voxel_idx)
-    {
+    { 

      
      // convert location to index: can use floor to cast to int.
      // to account for negative values, add 0.5 * bounds.
      const float3 loc_grid = make_float3(loc_grid_params.x, loc_grid_params.y, loc_grid_params.z);
      const  float3 sphere = make_float3(loc_sphere.x, loc_sphere.y, loc_sphere.z);
-      // xyz_loc = make_int3(floorf((sphere  + 0.5 * loc_grid) / loc_grid_params.w));
-      xyz_loc = make_int3(floorf((sphere) / loc_grid_params.w) + (0.5 * loc_grid/loc_grid_params.w));
-      xyz_grid = make_int3(floorf(loc_grid / loc_grid_params.w));
+      //xyz_loc = make_int3(floorf((sphere  + 0.5 * loc_grid) / loc_grid_params.w));
+      const float inv_voxel_size = 1.0/loc_grid_params.w;
+      //xyz_loc = make_int3(sphere * inv_voxel_size)  + make_int3(0.5 * loc_grid * inv_voxel_size);
+      
+      xyz_loc = make_int3((sphere  + 0.5 * loc_grid) * inv_voxel_size);
+
+      //xyz_loc = make_int3(sphere / loc_grid_params.w) + make_int3(floorf(0.5 * loc_grid/loc_grid_params.w));
+      xyz_grid = make_int3((loc_grid * inv_voxel_size)) + 1;
      
      // find next nearest voxel to current point and then do weighted interpolation:
      voxel_idx = xyz_loc.x * xyz_grid.y * xyz_grid.z  + xyz_loc.y * xyz_grid.z + xyz_loc.z;
@@ -732,8 +726,8 @@ float4 &sum_pt)
      
      
      float3 delta =  sphere - voxel_origin;
-      int3 next_loc = make_int3(floorf((make_float3(xyz_loc) + normalize(delta))));
-      float ratio = length(delta)/loc_grid_params.w;
+      int3 next_loc = make_int3(((make_float3(xyz_loc) + normalize(delta))));
+      float ratio = length(delta) * inv_voxel_size;

      int next_voxel_idx = next_loc.x * xyz_grid.y * xyz_grid.z  + next_loc.y * xyz_grid.z + next_loc.z;

@@ -2752,7 +2746,7 @@ sphere_obb_clpt(const torch::Tensor sphere_position, // batch_size, 3
  else
  {
      
-    #if defined(CUDA_VERSION) && CUDA_VERSION >= 11080 && TORCH_VERSION_MAJOR >= 2 && TORCH_VERSION_MINOR >= 2
+    #if CHECK_FP8
    const auto fp8_type = torch::kFloat8_e4m3fn;
  #else
    const auto fp8_type = torch::kHalf;
@@ -3217,7 +3211,7 @@ sphere_voxel_clpt(const torch::Tensor sphere_position, // batch_size, 3

  int blocksPerGrid = (bnh_spheres + threadsPerBlock - 1) / threadsPerBlock;

-  #if defined(CUDA_VERSION) && CUDA_VERSION >= 11080 && TORCH_VERSION_MAJOR >= 2 && TORCH_VERSION_MINOR >= 2
+  #if CHECK_FP8
    const auto fp8_type = torch::kFloat8_e4m3fn;
  #else
    const auto fp8_type = torch::kHalf;
@@ -3352,7 +3346,7 @@ swept_sphere_voxel_clpt(const torch::Tensor sphere_position, // batch_size, 3

  int blocksPerGrid = (bnh_spheres + threadsPerBlock - 1) / threadsPerBlock;

-  #if defined(CUDA_VERSION) && CUDA_VERSION >= 11080 && TORCH_VERSION_MAJOR >= 2 && TORCH_VERSION_MINOR >= 2
+  #if CHECK_FP8
    const auto fp8_type = torch::kFloat8_e4m3fn;
  #else
    const auto fp8_type = torch::kHalf;
--- a/src/curobo/curobolib/kinematics.py
+++ b/src/curobo/curobolib/kinematics.py
@@ -43,100 +43,10 @@ def rotation_matrix_to_quaternion(in_mat, out_quat):


 class KinematicsFusedFunction(Function):
-    @staticmethod
-    def _call_cuda(
-        link_pos_out,
-        link_quat_out,
-        robot_sphere_out,
-        global_cumul_mat,
-        angle,
-        fixed_transform,
-        robot_spheres,
-        link_map,
-        joint_map,
-        joint_map_type,
-        store_link_map,
-        link_sphere_map,
-    ):
-        b_shape = link_pos_out.shape
-        b_size = b_shape[0]
-        n_spheres = robot_sphere_out.shape[1]
-        r = kinematics_fused_cu.forward(
-            link_pos_out,
-            link_quat_out,
-            robot_sphere_out,
-            global_cumul_mat,
-            angle,
-            fixed_transform,
-            robot_spheres,
-            link_map,
-            joint_map,
-            joint_map_type,
-            store_link_map,
-            link_sphere_map,
-            b_size,
-            n_spheres,
-            False,
-        )
-        out_link_pos = r[0]
-        out_link_quat = r[1]
-        out_spheres = r[2]
-        global_cumul_mat = r[3]
-        return out_link_pos, out_link_quat, out_spheres, global_cumul_mat
-
-    @staticmethod
-    def _call_backward_cuda(
-        grad_out,
-        link_pos_out,
-        link_quat_out,
-        robot_sphere_out,
-        global_cumul_mat,
-        angle,
-        fixed_transform,
-        robot_spheres,
-        link_map,
-        joint_map,
-        joint_map_type,
-        store_link_map,
-        link_sphere_map,
-        link_chain_map,
-        sparsity_opt=True,
-    ):
-        b_shape = link_pos_out.shape
-        b_size = b_shape[0]
-        n_spheres = robot_sphere_out.shape[1]
-        if grad_out.is_contiguous():
-            grad_out = grad_out.view(-1)
-        else:
-            grad_out = grad_out.reshape(-1)
-        r = kinematics_fused_cu.backward(
-            grad_out,
-            link_pos_out,
-            link_quat_out,
-            robot_sphere_out,
-            global_cumul_mat,
-            angle,
-            fixed_transform,
-            robot_spheres,
-            link_map,
-            joint_map,
-            joint_map_type,
-            store_link_map,
-            link_sphere_map,
-            link_chain_map,
-            b_size,
-            n_spheres,
-            sparsity_opt,
-            False,
-        )
-        out_q = r[0].view(b_size, -1)
-
-        return out_q

    @staticmethod
    def forward(
        ctx,
-        # link_mats: torch.Tensor,
        link_pos: torch.Tensor,
        link_quat: torch.Tensor,
        b_robot_spheres: torch.tensor,
@@ -150,9 +60,17 @@ class KinematicsFusedFunction(Function):
        store_link_map: torch.Tensor,
        link_sphere_map: torch.Tensor,
        link_chain_map: torch.Tensor,
+        joint_offset_map: torch.Tensor,
        grad_out: torch.Tensor,
+        use_global_cumul: bool = True,
    ):
-        link_pos, link_quat, b_robot_spheres, global_cumul_mat = KinematicsFusedFunction._call_cuda(
+        ctx.use_global_cumul = use_global_cumul
+        b_shape = link_pos.shape
+        b_size = b_shape[0]
+        n_spheres = b_robot_spheres.shape[1]
+        n_joints = joint_seq.shape[-1]
+
+        r = kinematics_fused_cu.forward(
            link_pos,
            link_quat,
            b_robot_spheres,
@@ -165,7 +83,17 @@ class KinematicsFusedFunction(Function):
            joint_map_type,
            store_link_map,
            link_sphere_map,
+            joint_offset_map,  # offset_joint_map
+            b_size,
+            n_joints,
+            n_spheres,
+            True,
        )
+        out_link_pos = r[0]
+        out_link_quat = r[1]
+        out_spheres = r[2]
+        global_cumul_mat = r[3]
+
        ctx.save_for_backward(
            joint_seq,
            fixed_transform,
@@ -176,11 +104,11 @@ class KinematicsFusedFunction(Function):
            store_link_map,
            link_sphere_map,
            link_chain_map,
+            joint_offset_map,
            grad_out,
            global_cumul_mat,
        )
-
-        return link_pos, link_quat, b_robot_spheres
+        return out_link_pos, out_link_quat, out_spheres

    @staticmethod
    def backward(ctx, grad_out_link_pos, grad_out_link_quat, grad_out_spheres):
@@ -197,9 +125,11 @@ class KinematicsFusedFunction(Function):
                store_link_map,
                link_sphere_map,
                link_chain_map,
+                joint_offset_map,
                grad_out,
                global_cumul_mat,
            ) = ctx.saved_tensors
+
            grad_joint = KinematicsFusedFunction._call_backward_cuda(
                grad_out,
                grad_out_link_pos,
@@ -215,7 +145,9 @@ class KinematicsFusedFunction(Function):
                store_link_map,
                link_sphere_map,
                link_chain_map,
+                joint_offset_map,
                True,
+                use_global_cumul=ctx.use_global_cumul,
            )

        return (
@@ -233,53 +165,10 @@ class KinematicsFusedFunction(Function):
            None,
            None,
            None,
+            None,
+            None,
        )

-
-class KinematicsFusedGlobalCumulFunction(Function):
-    @staticmethod
-    def _call_cuda(
-        link_pos_out,
-        link_quat_out,
-        robot_sphere_out,
-        global_cumul_mat,
-        angle,
-        fixed_transform,
-        robot_spheres,
-        link_map,
-        joint_map,
-        joint_map_type,
-        store_link_map,
-        link_sphere_map,
-    ):
-        b_shape = link_pos_out.shape
-        b_size = b_shape[0]
-        n_spheres = robot_sphere_out.shape[1]
-        # print(n_spheres)
-        # print(angle)
-        r = kinematics_fused_cu.forward(
-            link_pos_out,
-            link_quat_out,
-            robot_sphere_out,
-            global_cumul_mat,
-            angle,
-            fixed_transform,
-            robot_spheres,
-            link_map,
-            joint_map,
-            joint_map_type,
-            store_link_map,
-            link_sphere_map,
-            b_size,
-            n_spheres,
-            True,
-        )
-        out_link_pos = r[0]
-        out_link_quat = r[1]
-        out_spheres = r[2]
-        global_cumul_mat = r[3]
-        return out_link_pos, out_link_quat, out_spheres, global_cumul_mat
-
    @staticmethod
    def _call_backward_cuda(
        grad_out,
@@ -296,18 +185,18 @@ class KinematicsFusedGlobalCumulFunction(Function):
        store_link_map,
        link_sphere_map,
        link_chain_map,
+        joint_offset_map,
        sparsity_opt=True,
+        use_global_cumul=False,
    ):
-        b_shape = link_pos_out.shape
+        b_shape = grad_out.shape
        b_size = b_shape[0]
-        # b_size = link_mat_out.shape[0]
        n_spheres = robot_sphere_out.shape[1]
+        n_joints = angle.shape[-1]
        if grad_out.is_contiguous():
            grad_out = grad_out.view(-1)
        else:
            grad_out = grad_out.reshape(-1)
-        # create backward tensors?
-
        r = kinematics_fused_cu.backward(
            grad_out,
            link_pos_out,
@@ -323,126 +212,17 @@ class KinematicsFusedGlobalCumulFunction(Function):
            store_link_map,
            link_sphere_map,
            link_chain_map,
+            joint_offset_map,  # offset_joint_map
            b_size,
+            n_joints,
            n_spheres,
            sparsity_opt,
-            True,
+            use_global_cumul,
        )
-        out_q = r[0]  # .view(angle.shape)
-        out_q = out_q.view(b_size, -1)
+        out_q = r[0].view(b_size, -1)
+
        return out_q

-    @staticmethod
-    def forward(
-        ctx,
-        # link_mats: torch.Tensor,
-        link_pos: torch.Tensor,
-        link_quat: torch.Tensor,
-        b_robot_spheres: torch.tensor,
-        global_cumul_mat: torch.Tensor,
-        joint_seq: torch.Tensor,
-        fixed_transform: torch.tensor,
-        robot_spheres: torch.tensor,
-        link_map: torch.tensor,
-        joint_map: torch.Tensor,
-        joint_map_type: torch.Tensor,
-        store_link_map: torch.Tensor,
-        link_sphere_map: torch.Tensor,
-        link_chain_map: torch.Tensor,
-        grad_out: torch.Tensor,
-    ):
-        if joint_seq.is_contiguous():
-            joint_seq = joint_seq.view(-1)
-        else:
-            joint_seq = joint_seq.reshape(-1)
-        (
-            link_pos,
-            link_quat,
-            b_robot_spheres,
-            global_cumul_mat,
-        ) = KinematicsFusedGlobalCumulFunction._call_cuda(
-            link_pos,
-            link_quat,
-            b_robot_spheres,
-            global_cumul_mat,
-            joint_seq,
-            fixed_transform,
-            robot_spheres,
-            link_map,
-            joint_map,
-            joint_map_type,
-            store_link_map,
-            link_sphere_map,
-        )
-        ctx.save_for_backward(
-            joint_seq,
-            fixed_transform,
-            robot_spheres,
-            link_map,
-            joint_map,
-            joint_map_type,
-            store_link_map,
-            link_sphere_map,
-            link_chain_map,
-            grad_out,
-            global_cumul_mat,
-        )
-
-        return link_pos, link_quat, b_robot_spheres
-
-    @staticmethod
-    def backward(ctx, grad_out_link_pos, grad_out_link_quat, grad_out_spheres):
-        grad_joint = None
-
-        if ctx.needs_input_grad[4]:
-            (
-                joint_seq,
-                fixed_transform,
-                robot_spheres,
-                link_map,
-                joint_map,
-                joint_map_type,
-                store_link_map,
-                link_sphere_map,
-                link_chain_map,
-                grad_out,
-                global_cumul_mat,
-            ) = ctx.saved_tensors
-            grad_joint = KinematicsFusedGlobalCumulFunction._call_backward_cuda(
-                grad_out,
-                grad_out_link_pos,
-                grad_out_link_quat,
-                grad_out_spheres,
-                global_cumul_mat,
-                joint_seq,
-                fixed_transform,
-                robot_spheres,
-                link_map,
-                joint_map,
-                joint_map_type,
-                store_link_map,
-                link_sphere_map,
-                link_chain_map,
-                True,
-            )
-
-        return (
-            None,
-            None,
-            None,
-            None,
-            grad_joint,
-            None,
-            None,
-            None,
-            None,
-            None,
-            None,
-            None,
-            None,
-            None,
-        )
-

 def get_cuda_kinematics(
    link_pos_seq,
@@ -458,41 +238,28 @@ def get_cuda_kinematics(
    store_link_map,
    link_sphere_idx_map,  # sphere idx map
    link_chain_map,
+    joint_offset_map,
    grad_out_q,
    use_global_cumul: bool = True,
 ):
-    if use_global_cumul:
-        link_pos, link_quat, robot_spheres = KinematicsFusedGlobalCumulFunction.apply(
-            link_pos_seq,
-            link_quat_seq,
-            batch_robot_spheres,
-            global_cumul_mat,
-            q_in,
-            fixed_transform,
-            link_spheres_tensor,
-            link_map,  # tells which link is attached to which link i
-            joint_map,  # tells which joint is attached to a link i
-            joint_map_type,  # joint type
-            store_link_map,
-            link_sphere_idx_map,  # sphere idx map
-            link_chain_map,
-            grad_out_q,
-        )
-    else:
-        link_pos, link_quat, robot_spheres = KinematicsFusedFunction.apply(
-            link_pos_seq,
-            link_quat_seq,
-            batch_robot_spheres,
-            global_cumul_mat,
-            q_in,
-            fixed_transform,
-            link_spheres_tensor,
-            link_map,  # tells which link is attached to which link i
-            joint_map,  # tells which joint is attached to a link i
-            joint_map_type,  # joint type
-            store_link_map,
-            link_sphere_idx_map,  # sphere idx map
-            link_chain_map,
-            grad_out_q,
-        )
+    # if not q_in.is_contiguous():
+    #    q_in = q_in.contiguous()
+    link_pos, link_quat, robot_spheres = KinematicsFusedFunction.apply(
+        link_pos_seq,
+        link_quat_seq,
+        batch_robot_spheres,
+        global_cumul_mat,
+        q_in,
+        fixed_transform,
+        link_spheres_tensor,
+        link_map,  # tells which link is attached to which link i
+        joint_map,  # tells which joint is attached to a link i
+        joint_map_type,  # joint type
+        store_link_map,
+        link_sphere_idx_map,  # sphere idx map
+        link_chain_map,
+        joint_offset_map,
+        grad_out_q,
+        use_global_cumul,
+    )
    return link_pos, link_quat, robot_spheres