gen_data_curobo/src/curobo/curobolib/cpp/sphere_obb_kernel.cu

/*
 * Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 *
 * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
 * property and proprietary rights in and to this material, related
 * documentation and any modifications thereto. Any use, reproduction,
 * disclosure or distribution of this material and related documentation
 * without an express license agreement from NVIDIA CORPORATION or
 * its affiliates is strictly prohibited.
 */

#include <c10/cuda/CUDAStream.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <torch/extension.h>

#include "helper_math.h"

#include <vector>
#include <torch/torch.h>
#include <c10/cuda/CUDAGuard.h>
#include "check_cuda.h"
#include "cuda_precisions.h"

#define M 4
#define VOXEL_DEBUG true
#define VOXEL_UNOBSERVED_DISTANCE -1000.0

// #define MAX_WARP_THREADS 512 // warp level batching. 8 x M = 32
#define MAX_BOX_SHARED 256 // maximum number of boxes we can store distance and closest point
#define DEBUG false
namespace Curobo
{
  namespace Geometry
  {
    /**
     * @brief Compute length of sphere
     *
     * @param v1
     * @param v2
     * @return float
     */
    __device__ __forceinline__ float sphere_length(const float4& v1,
                                                   const float4& v2)
    {
      return norm3df(v1.x - v2.x, v1.y - v2.y, v1.z - v2.z);
    }

    __device__ __forceinline__ float sphere_distance(const float4& v1,
                                                     const float4& v2)
    {
      return max(0.0f, sphere_length(v1, v2) - v1.w - v2.w);
    }


    #if CHECK_FP8
    __device__ __forceinline__ float
    get_array_value(const at::Float8_e4m3fn *grid_features, const int voxel_idx)
    {

      __nv_fp8_storage_t value_in = (reinterpret_cast<const __nv_fp8_storage_t*> (grid_features))[voxel_idx];

      float value = __half2float(__nv_cvt_fp8_to_halfraw(value_in, __NV_E4M3));

      return value;
    }

    __device__ __forceinline__ void
    get_array_value(const at::Float8_e4m3fn *grid_features, const int voxel_idx, float &value)
    {

      __nv_fp8_storage_t value_in = (reinterpret_cast<const __nv_fp8_storage_t*> (grid_features))[voxel_idx];

      value = __half2float(__nv_cvt_fp8_to_halfraw(value_in, __NV_E4M3));

    }

    #endif

    __device__ __forceinline__ float
    get_array_value(const at::BFloat16 *grid_features, const int voxel_idx)
    {

      __nv_bfloat16 value_in = (reinterpret_cast<const __nv_bfloat16*> (grid_features))[voxel_idx];

      float value = __bfloat162float(value_in);

      return value;
    }

    __device__ __forceinline__ float
    get_array_value(const at::Half *grid_features, const int voxel_idx)
    {

      __nv_half value_in = (reinterpret_cast<const __nv_half*> (grid_features))[voxel_idx];

      float value = __half2float(value_in);

      return value;
    }

    __device__ __forceinline__ float
    get_array_value(const float *grid_features, const int voxel_idx)
    {

      float value = grid_features[voxel_idx];

      return value;
    }

    __device__ __forceinline__ float
    get_array_value(const double *grid_features, const int voxel_idx)
    {

      float value = (float) grid_features[voxel_idx];

      return value;
    }


    __device__ __forceinline__ void
    get_array_value(const at::BFloat16 *grid_features, const int voxel_idx, float &value)
    {

      __nv_bfloat16 value_in = (reinterpret_cast<const __nv_bfloat16*> (grid_features))[voxel_idx];

      value = __bfloat162float(value_in);

    }

    __device__ __forceinline__ void
    get_array_value(const at::Half *grid_features, const int voxel_idx, float &value)
    {

      __nv_half value_in = (reinterpret_cast<const __nv_half*> (grid_features))[voxel_idx];

      value = __half2float(value_in);

    }

    __device__ __forceinline__ void
    get_array_value(const float *grid_features, const int voxel_idx, float &value)
    {

      value = grid_features[voxel_idx];

    }

    __device__ __forceinline__ void
    get_array_value(const double *grid_features, const int voxel_idx, float &value)
    {

      value = (float) grid_features[voxel_idx];

    }

    template<typename scalar_t>
    __device__ __forceinline__ void load_obb_pose(const scalar_t *obb_mat,
                                                  float3& position, float4& quat)
    { // obb_mat has x,y,z, qw, qx, qy, qz, 0 with an extra 0 padding for better use of memory
      float4 temp = *(float4 *)&obb_mat[0];

      position.x = temp.x;
      position.y = temp.y;
      position.z = temp.z;
      quat.w     = temp.w;
      temp       = *(float4 *)&obb_mat[4];
      quat.x     = temp.x;
      quat.y     = temp.y;
      quat.z     = temp.z;
    }

    template<typename scalar_t>
    __device__ __forceinline__ void load_obb_bounds(const scalar_t *obb_bounds,
                                                    float3        & bounds)
    { // obb_bounds has x,y,z, 0 with an extra 0 padding.
      float4 loc_bounds = *(float4 *)&obb_bounds[0];

      bounds.x = loc_bounds.x / 2;
      bounds.y = loc_bounds.y / 2;
      bounds.z = loc_bounds.z / 2;
    }


    template<typename scalar_t>
    __device__ __forceinline__ void
    transform_sphere_quat(const scalar_t *transform_mat, // x,y,z, qw, qx,qy,qz
                          const float4& sphere_pos, float4& C)
    {
      // do dot product:
      // new_p = q * p * q_inv + obs_p
      const float3   p_arr = *(float3 *)&transform_mat[0];
      const scalar_t w     = transform_mat[3];
      const scalar_t x     = transform_mat[4];
      const scalar_t y     = transform_mat[5];
      const scalar_t z     = transform_mat[6];
      if(x != 0 || y!= 0 || z!=0)
      {
        C.x = p_arr.x + w * w * sphere_pos.x + 2 * y * w * sphere_pos.z -
              2 * z * w * sphere_pos.y + x * x * sphere_pos.x +
              2 * y * x * sphere_pos.y + 2 * z * x * sphere_pos.z -
              z * z * sphere_pos.x - y * y * sphere_pos.x;

        C.y = p_arr.y + 2 * x * y * sphere_pos.x + y * y * sphere_pos.y +
              2 * z * y * sphere_pos.z + 2 * w * z * sphere_pos.x -
              z * z * sphere_pos.y + w * w * sphere_pos.y - 2 * x * w * sphere_pos.z -
              x * x * sphere_pos.y;
        C.z = p_arr.z + 2 * x * z * sphere_pos.x + 2 * y * z * sphere_pos.y +
              z * z * sphere_pos.z - 2 * w * y * sphere_pos.x - y * y * sphere_pos.z +
              2 * w * x * sphere_pos.y - x * x * sphere_pos.z + w * w * sphere_pos.z;
      }
      else
      {
        C.x = p_arr.x + sphere_pos.x;
        C.y = p_arr.y + sphere_pos.y;
        C.z = p_arr.z + sphere_pos.z;
      }
      C.w = sphere_pos.w;
    }

    __device__ __forceinline__ void transform_sphere_quat(const float3  p,
                                                          const float4  q,
                                                          const float4& sphere_pos,
                                                          float4      & C)
    {
      // do dot product:
      // new_p = q * p * q_inv + obs_p

      if ((q.x != 0) || (q.y != 0) || (q.z != 0))
      {
        C.x = p.x + q.w * q.w * sphere_pos.x + 2 * q.y * q.w * sphere_pos.z -
              2 * q.z * q.w * sphere_pos.y + q.x * q.x * sphere_pos.x +
              2 * q.y * q.x * sphere_pos.y + 2 * q.z * q.x * sphere_pos.z -
              q.z * q.z * sphere_pos.x - q.y * q.y * sphere_pos.x;
        C.y = p.y + 2 * q.x * q.y * sphere_pos.x + q.y * q.y * sphere_pos.y +
              2 * q.z * q.y * sphere_pos.z + 2 * q.w * q.z * sphere_pos.x -
              q.z * q.z * sphere_pos.y + q.w * q.w * sphere_pos.y - 2 * q.x * q.w * sphere_pos.z -
              q.x * q.x * sphere_pos.y;
        C.z = p.z + 2 * q.x * q.z * sphere_pos.x + 2 * q.y * q.z * sphere_pos.y +
              q.z * q.z * sphere_pos.z - 2 * q.w * q.y * sphere_pos.x - q.y * q.y * sphere_pos.z +
              2 * q.w * q.x * sphere_pos.y - q.x * q.x * sphere_pos.z + q.w * q.w * sphere_pos.z;
      }
      else
      {
        C.x = p.x + sphere_pos.x;
        C.y = p.y + sphere_pos.y;
        C.z = p.z + sphere_pos.z;
      }
      C.w = sphere_pos.w;
    }

    __device__ __forceinline__ void
    inv_transform_vec_quat(
      const float3 p,
      const float4 q,
      const float4& sphere_pos, float3& C)
    {
      // do dot product:
      // new_p = q * p * q_inv + obs_p
      if ((q.x != 0) || (q.y != 0) || (q.z != 0))
      {
        C.x =  q.w *  q.w * sphere_pos.x - 2 * q.y *  q.w * sphere_pos.z +
              2 * q.z *  q.w * sphere_pos.y + q.x * q.x * sphere_pos.x +
              2 * q.y * q.x * sphere_pos.y + 2 * q.z * q.x * sphere_pos.z -
              q.z * q.z * sphere_pos.x - q.y * q.y * sphere_pos.x;
        C.y = 2 * q.x * q.y * sphere_pos.x + q.y * q.y * sphere_pos.y +
              2 * q.z * q.y * sphere_pos.z - 2 *  q.w * q.z * sphere_pos.x -
              q.z * q.z * sphere_pos.y +  q.w *  q.w * sphere_pos.y + 2 * q.x *  q.w *
              sphere_pos.z -
              q.x * q.x * sphere_pos.y;
        C.z = 2 * q.x * q.z * sphere_pos.x + 2 * q.y * q.z * sphere_pos.y +
              q.z * q.z * sphere_pos.z + 2 *  q.w * q.y * sphere_pos.x - q.y * q.y * sphere_pos.z -
              2 *  q.w * q.x * sphere_pos.y - q.x * q.x * sphere_pos.z +  q.w *  q.w * sphere_pos.z;
      }
      else
      {
        C.x = sphere_pos.x ;
        C.y = sphere_pos.y ;
        C.z = sphere_pos.z ;
      }
    }

    __device__ __forceinline__ void
    inv_transform_vec_quat_add(const float3 p,
                               const float4 q, // x,y,z, qw, qx,qy,qz
                               const float4& sphere_pos, float3& C)
    {
      // do dot product:
      // new_p = q * p * q_inv + obs_p
      float3 temp_C = make_float3(0.0);

      inv_transform_vec_quat(p, q, sphere_pos, temp_C);
      C = C + temp_C;
    }


    /**
     * @brief Scales the Collision across the trajectory by sphere velocity. This is
     * implemented from CHOMP motion planner (ICRA 2009). We use central difference
     * to compute the velocity and acceleration of the sphere.
     *
     * @param sphere_0_cache
     * @param sphere_1_cache
     * @param sphere_2_cache
     * @param dt
     * @param transform_back
     * @param max_dist
     * @param max_grad
     * @return void
     */
    __device__ __forceinline__ void
    scale_speed_metric(const float4& sphere_0_cache, const float4& sphere_1_cache,
                       const float4& sphere_2_cache, const float& dt,
                       const bool& transform_back, float& max_dist,
                       float3& max_grad)
    {
      float3 norm_vel_vec = make_float3(sphere_2_cache.x - sphere_0_cache.x,
                                        sphere_2_cache.y - sphere_0_cache.y,
                                        sphere_2_cache.z - sphere_0_cache.z);

      norm_vel_vec = (0.5 / dt) * norm_vel_vec;
      const float sph_vel = length(norm_vel_vec);
      if(sph_vel < 0.001)
      {
        return;
      }
      if (transform_back)
      {
        float3 sph_acc_vec = make_float3(
          sphere_0_cache.x + sphere_2_cache.x - 2 * sphere_1_cache.x,
          sphere_0_cache.y + sphere_2_cache.y - 2 * sphere_1_cache.y,
          sphere_0_cache.z + sphere_2_cache.z - 2 * sphere_1_cache.z);

        sph_acc_vec  = (1 / (dt * dt)) * sph_acc_vec;
        norm_vel_vec = norm_vel_vec * (1 / sph_vel);

        const float3 curvature_vec = (sph_acc_vec) / (sph_vel * sph_vel);

        // compute orthogonal projection:
        float orth_proj[9] = { 0.0 };

        // load float3 into array for easier matmul later:
        float vel_arr[3];
        vel_arr[0] = norm_vel_vec.x;
        vel_arr[1] = norm_vel_vec.y;
        vel_arr[2] = norm_vel_vec.z;

        // calculate projection ( I - (v * v^T)):
#pragma unroll

        for (int i = 0; i < 3; i++)
        {
#pragma unroll

          for (int j = 0; j < 3; j++)
          {
            orth_proj[i * 3 + j] = -1 * vel_arr[i] * vel_arr[j];
          }
        }
        orth_proj[0] += 1;
        orth_proj[4] += 1;
        orth_proj[8] += 1;

        // curvature vec:

        // multiply by orth projection:
        // two matmuls:
        float orth_pt[3];    // orth_proj(3x3) * max_grad(3x1)
        float orth_curve[3]; // max_dist(1) * orth_proj (3x3) * curvature_vec (3x1)

#pragma unroll

        for (int i = 0; i < 3; i++) // matrix - vector product
        {
          orth_pt[i] = orth_proj[i * 3 + 0] * max_grad.x +
                       orth_proj[i * 3 + 1] * max_grad.y +
                       orth_proj[i * 3 + 2] * max_grad.z;

          orth_curve[i] = max_dist * (orth_proj[i * 3 + 0] * curvature_vec.x +
                                      orth_proj[i * 3 + 1] * curvature_vec.y +
                                      orth_proj[i * 3 + 2] * curvature_vec.z);
        }

        // max_grad =  sph_vel * ((orth_proj * max_grad) - max_dist *  orth_proj *
        // curvature)

        max_grad.x = sph_vel * (orth_pt[0] - orth_curve[0]); // orth_proj[0];// * (orth_pt[0] -
                                                             // orth_curve[0]);
        max_grad.y = sph_vel * (orth_pt[1] - orth_curve[1]);
        max_grad.z = sph_vel * (orth_pt[2] - orth_curve[2]);
      }
      max_dist = sph_vel * max_dist;
    }

    //


    __device__ __forceinline__ void
    compute_closest_point(const float3& bounds, const float4& sphere,
    float3& delta, float& distance, float& sph_distance)
    {
      float3 pt = make_float3(sphere.x, sphere.y, sphere.z);
      bool inside = true;

      if (max(max(fabs(sphere.x) - bounds.x, fabs(sphere.y) - bounds.y),
              fabs(sphere.z) - bounds.z) >= (0.0))
      {
        inside = false;
      }


      float3 val = make_float3(sphere.x,sphere.y,sphere.z);
      val = bounds - fabs(val);

      if(!inside)
      {


        if (val.x < 0) // it's outside limits, clamp:
        {
          pt.x = copysignf(bounds.x, sphere.x);
        }


        if (val.y < 0) // it's outside limits, clamp:
        {
          pt.y = copysignf(bounds.y, sphere.y);
        }

        if (val.z < 0) // it's outside limits, clamp:
        {
          pt.z = copysignf(bounds.z, sphere.z);
        }
      }
      else
      {


        val = fabs(val);


        if (val.y <= val.x && val.y <= val.z)
        {

          if(sphere.y > 0)
          {
            pt.y = bounds.y;
          }
          else
          {
            pt.y = -1 * bounds.y;
          }
        }

        else if (val.x <= val.y && val.x <= val.z)
        {
          if(sphere.x > 0)
          {
            pt.x = bounds.x;
          }
          else
          {
            pt.x = -1 * bounds.x;
          }
        }
        else if (val.z <= val.x && val.z <= val.y)
        {

          if(sphere.z > 0)
          {
            pt.z = bounds.z;
          }
          else
          {
            pt.z = -1 * bounds.z;
          }
        }


      }

      delta = make_float3(pt.x - sphere.x, pt.y - sphere.y, pt.z - sphere.z);

      distance = length(delta);

      if (!inside)
      {
        distance *= -1.0;
      }
      else
      {
        delta = -1 * delta;
      }

      if (distance != 0.0)
      {
        delta = normalize(delta);

      }
      sph_distance = distance + sphere.w;
      //


    }

    /**
     * @brief check if sphere is inside. For numerical stability, we assume that if
     * sphere is exactly at bound of cuboid, we are not in collision. Note: this is
     * not warp safe.
     *
     * @param bounds bounds of cuboid
     * @param sphere sphere as float4 (in cuboid frame of reference)
     * @return bool
     */
    __device__ __forceinline__ bool
    check_sphere_aabb(const float3 bounds, const float4 sphere,  bool &inside,
    float3& delta, float& distance, float& sph_distance)
    {
      // if((fabs(sphere.x) - bounds.x) >= sphere.w || fabs(sphere.y) - bounds.y >=
      // sphere.w || (fabs(sphere.z) - bounds.z) >= sphere.w)

      inside = false;


      if (max(max(fabs(sphere.x) - bounds.x, fabs(sphere.y) - bounds.y),
              fabs(sphere.z) - bounds.z) >= (sphere.w))
      {
        return false;
      }
      // if it's within aabb, check more accurately:
      // compute closest point:

      compute_closest_point(bounds, sphere, delta, distance, sph_distance);
      if (sph_distance > 0)
      {
        inside = true;
      }

      return inside;
    }
    __device__ __forceinline__ float
    compute_distance_fn(
      const float3& bounds,
      const float4& sphere,
      const float max_distance,
      float3& delta,
      float& sph_dist,
      float& distance,
      bool& inside) // pass in cl_pt
    {

      // compute distance:
      float4 loc_sphere = sphere;
      loc_sphere.w = max_distance;
      distance = max_distance;
      check_sphere_aabb(bounds, loc_sphere, inside, delta, distance, sph_dist);

      //distance = fabsf(distance);
      return distance;
    }


template<bool SCALE_METRIC=true>
__device__ __forceinline__ void scale_eta_metric_vector(
const float eta,
float4 &sum_pt)
{
  float sph_dist = sum_pt.w;

  if (sph_dist == 0)
  {
    sum_pt.x = 0;
    sum_pt.y = 0;
    sum_pt.z = 0;
    sum_pt.w = 0;

    return;
  }
  sum_pt.w = sph_dist - eta;
  //sum_pt.x = sum_pt.x / sph_dist;
  //sum_pt.y = sum_pt.y / sph_dist;
  //sum_pt.z = sum_pt.z / sph_dist;

  if (SCALE_METRIC)
  {
    if (eta > 0.0 && sph_dist > 0)
    {
      //sum_pt.x = sum_pt.x * (1/sph_dist);
      //sum_pt.y = sum_pt.y * (1/sph_dist);
      //sum_pt.z = sum_pt.z * (1/sph_dist);

    if (sph_dist> eta)
    {
      sum_pt.w = sph_dist - 0.5 * eta;


    } else if (sph_dist <= eta)
    {

      sum_pt.w = (0.5 / eta) * (sph_dist) * (sph_dist);
      const float scale = (1 / eta) * (sph_dist);
      sum_pt.x = scale * sum_pt.x;
      sum_pt.y = scale * sum_pt.y;
      sum_pt.z = scale * sum_pt.z;
    }

    }

  }
}

    template<bool SCALE_METRIC=true>
    __device__ __forceinline__ void scale_eta_metric(
      const float3 delta,
      const float sph_dist,
      const float eta,
      float4& sum_pt)
    {
      // compute distance:
      //float sph_dist = 0;

      sum_pt.x =  delta.x;
      sum_pt.y =  delta.y;
      sum_pt.z =  delta.z;
      sum_pt.w = sph_dist;


      if(SCALE_METRIC)
      {

        if (sph_dist > 0)
        {

          if (sph_dist > eta)
          {
            sum_pt.w = sph_dist - 0.5 * eta;
          }
          else if (sph_dist <= eta)
          {
            sum_pt.w = (0.5 / eta) * (sph_dist) * (sph_dist);
            const float scale = (1.0 / eta) * (sph_dist);
            sum_pt.x = scale * sum_pt.x;
            sum_pt.y = scale * sum_pt.y;
            sum_pt.z = scale * sum_pt.z;
          }

        }
        else
        {
          sum_pt.x = 0.0;
          sum_pt.y = 0.0;
          sum_pt.z = 0.0;
          sum_pt.w = 0.0;

        }

      }


    }


    template<bool SCALE_METRIC=true>
    __device__ __forceinline__ void scale_eta_metric(const float4& sphere, const float4& cl_pt,
                                                     const float eta,
                                                     const bool inside,
                                                     float4& sum_pt)
    {
      // compute distance:
      float distance = 0;

      scale_eta_metric<SCALE_METRIC>(sphere, cl_pt, eta, inside, sum_pt, distance);


    }


    template<typename grid_scalar_t, bool INTERPOLATION=false>
    __device__ __forceinline__ void
    compute_voxel_location_params(
    const grid_scalar_t *grid_features,
    const float4& loc_grid_params,
    const float4& loc_sphere,
    int3 &xyz_loc,
    int3 &xyz_grid,
    float &interpolated_distance,
    int &voxel_idx)
    {


      // convert location to index: can use floor to cast to int.
      // to account for negative values, add 0.5 * bounds.
      const float3 loc_grid = make_float3(loc_grid_params.x, loc_grid_params.y, loc_grid_params.z);
      const  float3 sphere = make_float3(loc_sphere.x, loc_sphere.y, loc_sphere.z);
      //xyz_loc = make_int3(floorf((sphere  + 0.5 * loc_grid) / loc_grid_params.w));
      const float inv_voxel_size = 1.0/loc_grid_params.w;
      //xyz_loc = make_int3(sphere * inv_voxel_size)  + make_int3(0.5 * loc_grid * inv_voxel_size);

      xyz_loc = make_int3((sphere  + 0.5 * loc_grid) * inv_voxel_size);

      //xyz_loc = make_int3(sphere / loc_grid_params.w) + make_int3(floorf(0.5 * loc_grid/loc_grid_params.w));
      xyz_grid = make_int3((loc_grid * inv_voxel_size)) + 1;

      // find next nearest voxel to current point and then do weighted interpolation:
      voxel_idx = xyz_loc.x * xyz_grid.y * xyz_grid.z  + xyz_loc.y * xyz_grid.z + xyz_loc.z;


      // compute interpolation distance between voxel origin and sphere location:
      get_array_value(grid_features, voxel_idx, interpolated_distance);
      if(!INTERPOLATION)
      {
        interpolated_distance += 0.5 * loc_grid_params.w;//max(0.0, (0.3 * loc_grid_params.w) - loc_sphere.w);
      }
      if(INTERPOLATION)
      {
      //
      float3 voxel_origin = (make_float3(xyz_loc) * loc_grid_params.w) - (loc_grid/2);


      float3 delta =  sphere - voxel_origin;
      int3 next_loc = make_int3(((make_float3(xyz_loc) + normalize(delta))));
      float ratio = length(delta) * inv_voxel_size;

      int next_voxel_idx = next_loc.x * xyz_grid.y * xyz_grid.z  + next_loc.y * xyz_grid.z + next_loc.z;

      interpolated_distance = ratio * interpolated_distance + (1 - ratio) * get_array_value(grid_features, next_voxel_idx) +
       max(0.0, (ratio * loc_grid_params.w) - loc_sphere.w);;

      }


    }

    template<typename grid_scalar_t>
    __device__ __forceinline__ void
    compute_voxel_index(
    const grid_scalar_t *grid_features,
    const float4& loc_grid_params,
    const float4& loc_sphere,
    int &voxel_idx,
    int3 &xyz_loc,
    int3 &xyz_grid,
    float &interpolated_distance)
    {
      // check if sphere is out of bounds
      // loc_grid_params.x contains bounds
      float4 local_bounds = 0.5*loc_grid_params - 2*loc_grid_params.w;

      if (loc_sphere.x <= -1 * (local_bounds.x) ||
          loc_sphere.x >= (local_bounds.x) ||
          loc_sphere.y <= -1 * (local_bounds.y) ||
          loc_sphere.y >=  (local_bounds.y) ||
          loc_sphere.z <= -1 * (local_bounds.z) ||
          loc_sphere.z >=  (local_bounds.z))
      {
        voxel_idx = -1;
        return;
      }

      compute_voxel_location_params(grid_features, loc_grid_params, loc_sphere, xyz_loc, xyz_grid, interpolated_distance, voxel_idx);
      // convert location to index: can use floor to cast to int.
      // to account for negative values, add 0.5 * bounds.


    }


    template<typename grid_scalar_t, bool NORMALIZE=true, bool CENTRAL_DIFFERENCE=true, bool ADD_NOISE=false>
    __device__ __forceinline__ void
    compute_voxel_fd_gradient(
    const grid_scalar_t *grid_features,
    const int voxel_layer_start_idx,
    const int3& xyz_loc,
    const int3& xyz_grid,
    const float voxel_size,
    float4 &cl_pt)
    {

    float3 d_grad;
    if (CENTRAL_DIFFERENCE)
    {

    // x difference:
    int x_plus, x_minus, y_plus, y_minus, z_plus, z_minus;

    x_plus = (xyz_loc.x + 1) * xyz_grid.y * xyz_grid.z + xyz_loc.y * xyz_grid.z + xyz_loc.z;
    x_minus = (xyz_loc.x - 1)* xyz_grid.y * xyz_grid.z + xyz_loc.y * xyz_grid.z + xyz_loc.z;

    y_plus = (xyz_loc.x) * xyz_grid.y * xyz_grid.z + (xyz_loc.y + 1) * xyz_grid.z + xyz_loc.z;
    y_minus = (xyz_loc.x )* xyz_grid.y * xyz_grid.z + (xyz_loc.y -1) * xyz_grid.z + xyz_loc.z;

    z_plus = (xyz_loc.x) * xyz_grid.y * xyz_grid.z + xyz_loc.y * xyz_grid.z + xyz_loc.z + 1;
    z_minus = (xyz_loc.x)* xyz_grid.y * xyz_grid.z + xyz_loc.y * xyz_grid.z + xyz_loc.z - 1;


    float3 d_plus = make_float3(
      get_array_value(grid_features,voxel_layer_start_idx + x_plus),
      get_array_value(grid_features, voxel_layer_start_idx + y_plus),
      get_array_value(grid_features,voxel_layer_start_idx + z_plus));
    float3 d_minus = make_float3(
      get_array_value(grid_features,voxel_layer_start_idx + x_minus),
      get_array_value(grid_features, voxel_layer_start_idx + y_minus),
      get_array_value(grid_features,voxel_layer_start_idx + z_minus));


    d_grad = (d_plus - d_minus) * (1/(2*voxel_size));
    }
    if (!CENTRAL_DIFFERENCE)
    {
        // x difference:
    int x_minus,y_minus, z_minus;

    x_minus = (xyz_loc.x - 1)* xyz_grid.y * xyz_grid.z + xyz_loc.y * xyz_grid.z + xyz_loc.z;
    y_minus = (xyz_loc.x )* xyz_grid.y * xyz_grid.z + (xyz_loc.y -1) * xyz_grid.z + xyz_loc.z;
    z_minus = (xyz_loc.x)* xyz_grid.y * xyz_grid.z + xyz_loc.y * xyz_grid.z + xyz_loc.z - 1;


    float3 d_plus = make_float3(cl_pt.w, cl_pt.w, cl_pt.w);
    float3 d_minus = make_float3(
      get_array_value(grid_features,voxel_layer_start_idx + x_minus),
      get_array_value(grid_features, voxel_layer_start_idx + y_minus),
      get_array_value(grid_features,voxel_layer_start_idx + z_minus));


    d_grad = (d_plus - d_minus) * (1/voxel_size);
    }


    if (NORMALIZE)
    {
      if (!(d_grad.x ==0 && d_grad.y == 0 && d_grad.z == 0))
      {
        d_grad = normalize(d_grad);
      }
    }
    cl_pt.x = d_grad.x;
    cl_pt.y = d_grad.y;
    cl_pt.z = d_grad.z;
    if (ADD_NOISE)
    {
      if (cl_pt.z == 0 && cl_pt.x == 0 && cl_pt.y == 0)
      {
      cl_pt.x = 0.001;
      cl_pt.y = 0.001;
      }
    }

    }

    template<typename grid_scalar_t, bool SCALE_METRIC=true>
    __device__ __forceinline__ void
    compute_sphere_voxel_gradient(const grid_scalar_t *grid_features,
    const int voxel_layer_start_idx,
    const int num_voxels,
    const float4& loc_grid_params,
    const float4& loc_sphere,
    float4 &sum_pt,
    float &signed_distance,
    const float eta = 0.0,
    const float max_distance = -10.0,
    const bool transform_back = true)
    {
      int voxel_idx = 0;
      int3 xyz_loc = make_int3(0,0,0);
      int3 xyz_grid = make_int3(0,0,0);
      float interpolated_distance = 0.0;
      compute_voxel_index(grid_features, loc_grid_params, loc_sphere, voxel_idx, xyz_loc, xyz_grid, interpolated_distance);
      if (voxel_idx < 0 || voxel_idx >= num_voxels)
      {
        sum_pt.w = VOXEL_UNOBSERVED_DISTANCE;
        signed_distance = VOXEL_UNOBSERVED_DISTANCE;
        return;
      }


        //sum_pt.w = get_array_value(grid_features,voxel_layer_start_idx + voxel_idx);
        sum_pt.w = interpolated_distance;

        if ((!SCALE_METRIC && transform_back)|| (transform_back && sum_pt.w > -loc_sphere.w ))
        {
          // compute closest point:
          compute_voxel_fd_gradient(grid_features, voxel_layer_start_idx, xyz_loc, xyz_grid, loc_grid_params.w, sum_pt);
        }

        signed_distance = sum_pt.w;

        sum_pt.w += loc_sphere.w;


        scale_eta_metric_vector<SCALE_METRIC>(eta, sum_pt);


    }


    __device__ __forceinline__ void check_jump_distance(
      const float4& loc_sphere_1, const float4 loc_sphere_0, const float k0,
      const float3& bounds,
      const float max_distance,
      float3& delta,
      float& sph_dist,
      float& distance,
      bool& inside,
      const float eta,
      float4& sum_pt,
      float& curr_jump_distance) // we can pass in interpolated sphere here, also
                                 // need to pass cl_pt for use in
                                 // compute_sphere_gradient & compute_distance
    {
      const float4 interpolated_sphere =
        (k0) * loc_sphere_1 + (1 - k0) * loc_sphere_0;

      if (check_sphere_aabb(bounds, interpolated_sphere, inside, delta, distance, sph_dist))
      {
        float4 loc_grad = make_float4(0,0,0,0);
        scale_eta_metric<true>(delta, sph_dist, eta, loc_grad);
        sum_pt += loc_grad;

      }
      else
      {
        compute_distance_fn(
          bounds,
          interpolated_sphere,
          max_distance,
          delta,
          sph_dist,
          distance,
          inside
         );
      }
      curr_jump_distance += max(fabsf(distance), interpolated_sphere.w);

    }

    ///////////////////////////////////////////////////////////////
    // We write out the distance and gradients for all the spheres.
    // So we do not need to initize the output tensor to 0.
    // Each thread computes the max distance and gradients per sphere.
    // This version should be faster if we have enough spheres/threads
    // to fill the GPU as it avoid inter-thread communication and the
    // use of shared memory.
    ///////////////////////////////////////////////////////////////

    template<typename scalar_t, typename dist_scalar_t=float, bool SCALE_METRIC=true>
    __device__ __forceinline__ void sphere_obb_collision_fn(
      const scalar_t *sphere_position,
      const int env_idx,
      const int bn_sph_idx,
      const int sph_idx,
      dist_scalar_t *out_distance, const float *weight,
      const float *activation_distance, const float *obb_accel,
      const float *obb_bounds, const float *obb_mat,
      const uint8_t *obb_enable, const int max_nobs, const int nboxes)
    {
      float max_dist            = 0;
      const int   start_box_idx = max_nobs * env_idx;
      const float eta           = activation_distance[0];

      // Load sphere_position input
      float4 sphere_cache = *(float4 *)&sphere_position[bn_sph_idx * 4];

      if (sphere_cache.w <= 0.0)
      {
        // write zeros for cost:
        out_distance[bn_sph_idx] = 0;

        return;
      }
      sphere_cache.w += eta;

      float4 loc_sphere = make_float4(0.0);
      float4 obb_quat   = make_float4(0.0);
      float3 obb_pos    = make_float3(0.0);
      float3 loc_bounds = make_float3(0.0);
      bool inside = false;
      float distance = 0.0;
      float sph_dist = 0.0;
      float3 delta = make_float3(0,0,0);

      for (int box_idx = 0; box_idx < nboxes; box_idx++)
      {
        if (obb_enable[start_box_idx + box_idx] == 0) // disabled obstacle
        {
          continue;
        }
        load_obb_pose(&obb_mat[(start_box_idx + box_idx) * 8], obb_pos,
                      obb_quat);
        load_obb_bounds(&obb_bounds[(start_box_idx + box_idx) * 4], loc_bounds);

        transform_sphere_quat(obb_pos, obb_quat, sphere_cache, loc_sphere);


        // first check if point is inside or outside box:

        if (check_sphere_aabb(loc_bounds, loc_sphere, inside, delta, distance, sph_dist))
        {
          // using same primitive functions:
            max_dist = 1;
            break; // we exit without checking other cuboids if we found a collision.
        }

      }

      out_distance[bn_sph_idx] = weight[0] * max_dist;
    }

    template<typename scalar_t, typename dist_scalar_t=float, bool SCALE_METRIC=true, bool SUM_COLLISIONS=true>
    __device__ __forceinline__ void sphere_obb_distance_fn(
      const scalar_t *sphere_position,
      const int32_t env_idx,
      const int bn_sph_idx,
      const int sph_idx,
      dist_scalar_t *out_distance, scalar_t *closest_pt, uint8_t *sparsity_idx,
      const float *weight, const float *activation_distance,
      const float *obb_accel, const float *obb_bounds,
      const float *obb_mat, const uint8_t *obb_enable, const int max_nobs,
      const int nboxes, const bool transform_back)
    {
      float max_dist  = 0.0;

      const float eta = activation_distance[0];
      float3 max_grad = make_float3(0.0, 0.0, 0.0);

      // Load sphere_position input
      float4 sphere_cache = *(float4 *)&sphere_position[bn_sph_idx * 4];

      if (sphere_cache.w <= 0.0)
      {
        // write zeros for cost:
        out_distance[bn_sph_idx] = 0;

        // write zeros for gradient if not zero:
        if (sparsity_idx[bn_sph_idx] != 0)
        {
          sparsity_idx[bn_sph_idx]               = 0;
          *(float4 *)&closest_pt[bn_sph_idx * 4] = make_float4(0.0);
        }
        return;
      }
      sphere_cache.w += eta;
      const int start_box_idx = max_nobs * env_idx;

      float4 loc_sphere = make_float4(0.0);
      float4 obb_quat   = make_float4(0.0);
      float3 obb_pos    = make_float3(0.0);
      float3 loc_bounds = make_float3(0.0);
      float4 loc_grad = make_float4(0,0,0,0);
      bool inside = false;
      float distance = 0.0;
      float sph_dist = 0.0;
      float3 delta = make_float3(0,0,0);
      for (int box_idx = 0; box_idx < nboxes; box_idx++)
      {
        if (obb_enable[start_box_idx + box_idx] == 0) // disabled obstacle
        {
          continue;
        }

        load_obb_pose(&obb_mat[(start_box_idx + box_idx) * 8], obb_pos,
                      obb_quat);
        load_obb_bounds(&obb_bounds[(start_box_idx + box_idx) * 4], loc_bounds);

        transform_sphere_quat(obb_pos, obb_quat, sphere_cache, loc_sphere);

        // first check if point is inside or outside box:
        if (check_sphere_aabb(loc_bounds, loc_sphere, inside, delta, distance, sph_dist))
        {
          // compute closest point:
          //loc_bounds = loc_bounds + loc_sphere.w;

          // using same primitive functions:
          scale_eta_metric<SCALE_METRIC>(delta, sph_dist, eta, loc_grad);


          if (SUM_COLLISIONS)
          {
            if (loc_grad.w > 0)
            {
              max_dist += loc_grad.w;

              if (transform_back)
              {

                inv_transform_vec_quat_add(obb_pos, obb_quat, loc_grad, max_grad);
              }
            }
          }
          else
          {
          if (loc_grad.w > max_dist)
          {
            max_dist = loc_grad.w;

            if (transform_back)
            {
              inv_transform_vec_quat(obb_pos, obb_quat, loc_grad, max_grad);

            }
          }

          }

        }
      }

      // sparsity opt:
      if (max_dist == 0)
      {
        if (sparsity_idx[bn_sph_idx] == 0)
        {
          return;
        }
        sparsity_idx[bn_sph_idx] = 0;

        if (transform_back)
        {
          *(float3 *)&closest_pt[bn_sph_idx * 4] = max_grad; // max_grad is all zeros
        }
        out_distance[bn_sph_idx] = 0.0;
        return;
      }

      // else max_dist != 0
      max_dist = weight[0] * max_dist;

      if (transform_back)
      {
        *(float3 *)&closest_pt[bn_sph_idx * 4] = weight[0] * max_grad;
      }
      out_distance[bn_sph_idx] = max_dist;
      sparsity_idx[bn_sph_idx] = 1;
    }


    template<typename scalar_t, typename dist_scalar_t=float,  bool SCALE_METRIC=false, bool SUM_COLLISIONS=false>
    __device__ __forceinline__ void sphere_obb_esdf_fn(
      const scalar_t *sphere_position,
      const int32_t env_idx,
      const int bn_sph_idx,
      const int sph_idx,
      dist_scalar_t *out_distance, scalar_t *closest_pt, uint8_t *sparsity_idx,
      const float *weight, const float *activation_distance,
      const float *max_distance,
      const float *obb_accel, const float *obb_bounds,
      const float *obb_mat, const uint8_t *obb_enable, const int max_nobs,
      const int nboxes, const bool transform_back)
    {

      const float eta = max_distance[0];
      float max_dist  = -1 * eta;

      float3 max_grad = make_float3(0.0, 0.0, 0.0);

      // Load sphere_position input
      float4 sphere_cache = *(float4 *)&sphere_position[bn_sph_idx * 4];
      if (sphere_cache.w <= 0.0)
      {
        // write zeros for cost:
        out_distance[bn_sph_idx] = 0;

        // write zeros for gradient if not zero:
        if (sparsity_idx[bn_sph_idx] != 0)
        {
          sparsity_idx[bn_sph_idx]               = 0;
          *(float4 *)&closest_pt[bn_sph_idx * 4] = make_float4(0.0);
        }
        return;
      }
      sphere_cache.w += eta;

      //const float sphere_radius = sphere_cache.w + eta;

      const int start_box_idx = max_nobs * env_idx;

      float4 loc_sphere = make_float4(0.0);
      float4 obb_quat   = make_float4(0.0);
      float3 obb_pos    = make_float3(0.0);
      float3 loc_bounds = make_float3(0.0);
      bool inside = false;
      float distance = 0.0;
      float sph_dist = 0.0;
      float3 delta = make_float3(0,0,0);
      float4 loc_grad = make_float4(0,0,0,0);

      for (int box_idx = 0; box_idx < nboxes; box_idx++)
      {
        if (obb_enable[start_box_idx + box_idx] == 0) // disabled obstacle
        {
          continue;
        }

        load_obb_pose(&obb_mat[(start_box_idx + box_idx) * 8], obb_pos,
                      obb_quat);
        load_obb_bounds(&obb_bounds[(start_box_idx + box_idx) * 4], loc_bounds);

        transform_sphere_quat(obb_pos, obb_quat, sphere_cache, loc_sphere);

        // first check if point is inside or outside box:
        if (check_sphere_aabb(loc_bounds, loc_sphere, inside, delta, distance, sph_dist))
        {
          // compute closest point:


          // using same primitive functions:
          scale_eta_metric<SCALE_METRIC>(delta, sph_dist, eta, loc_grad);


          if (loc_grad.w > max_dist)
          {
            max_dist = loc_grad.w;

            if (transform_back)
            {
              inv_transform_vec_quat(obb_pos, obb_quat, loc_grad, max_grad);

            }
          }
        }
      }
      // subtract radius:
      max_dist = max_dist - sphere_cache.w;
      if (transform_back)
      {
        *(float3 *)&closest_pt[bn_sph_idx * 4] = max_grad;
      }
      out_distance[bn_sph_idx] = max_dist;
    }

    template<typename grid_scalar_t, typename geom_scalar_t=float, typename dist_scalar_t=float, typename grad_scalar_t=float, bool SCALE_METRIC=true, int NUM_LAYERS=100>
    __device__ __forceinline__ void sphere_voxel_distance_fn(
      const geom_scalar_t *sphere_position,
      const int32_t env_idx,
      const int bn_sph_idx,
      const int sph_idx,
      dist_scalar_t *out_distance,
      grad_scalar_t *closest_pt,
      uint8_t *sparsity_idx,
      const float *weight,
      const float *activation_distance,
      const float *max_distance,
      const grid_scalar_t *grid_features,
      const float *grid_params,
      const float *obb_mat,
      const uint8_t *obb_enable,
      const int max_nobs,
      const int num_voxels,
      const bool transform_back)
    {
      float max_dist  = 0.0;
      float max_distance_local = max_distance[0];
      max_distance_local = -1 * max_distance_local;
      const float eta = activation_distance[0];
      float3 max_grad = make_float3(0.0, 0.0, 0.0);

      // Load sphere_position input
      float4 sphere_cache = *(float4 *)&sphere_position[bn_sph_idx * 4];

      if (sphere_cache.w <= 0.0)
      {
        // write zeros for cost:
        out_distance[bn_sph_idx] = 0;

        // write zeros for gradient if not zero:
        if (sparsity_idx[bn_sph_idx] != 0)
        {
          sparsity_idx[bn_sph_idx]               = 0;
          *(float4 *)&closest_pt[bn_sph_idx * 4] = make_float4(0.0);
        }
        return;
      }
      sphere_cache.w += eta;
      const int local_env_idx = max_nobs * env_idx;
      float signed_distance = 0;

      float4 loc_sphere = make_float4(0.0);
      float4 obb_quat   = make_float4(0.0);
      float3 obb_pos    = make_float3(0.0);
      float4 loc_grid_params = make_float4(0.0);

      if (NUM_LAYERS <= 4)
      {

      #pragma unroll
      for (int layer_idx=0; layer_idx < NUM_LAYERS; layer_idx++)
      {


      int local_env_layer_idx = local_env_idx + layer_idx;
      if (obb_enable[local_env_layer_idx] != 0) // disabled obstacle
      {

        load_obb_pose(&obb_mat[(local_env_layer_idx) * 8], obb_pos,
                      obb_quat);
        loc_grid_params = *(float4 *)&grid_params[local_env_layer_idx*4];

        transform_sphere_quat(obb_pos, obb_quat, sphere_cache, loc_sphere);
        int voxel_layer_start_idx = local_env_layer_idx * num_voxels;
        // check distance:
        float4 cl;
        compute_sphere_voxel_gradient<grid_scalar_t,SCALE_METRIC>(grid_features,
        voxel_layer_start_idx, num_voxels,
        loc_grid_params, loc_sphere, cl, signed_distance, eta,
        max_distance_local, transform_back);
        if (cl.w>0.0)
        {
          max_dist += cl.w;
          if (transform_back)
          {
            inv_transform_vec_quat_add(obb_pos, obb_quat, cl, max_grad);

          }
        }
      }
      }
      }
      else
      {


      for (int layer_idx=0; layer_idx < max_nobs; layer_idx++)
      {


      int local_env_layer_idx = local_env_idx + layer_idx;
      if (obb_enable[local_env_layer_idx] != 0) // disabled obstacle
      {

        load_obb_pose(&obb_mat[(local_env_layer_idx) * 8], obb_pos,
                      obb_quat);
        loc_grid_params = *(float4 *)&grid_params[local_env_layer_idx*4];

        transform_sphere_quat(obb_pos, obb_quat, sphere_cache, loc_sphere);
        int voxel_layer_start_idx = local_env_layer_idx * num_voxels;
        // check distance:
        float4 cl;
        compute_sphere_voxel_gradient<grid_scalar_t,SCALE_METRIC>(grid_features,
        voxel_layer_start_idx, num_voxels,
        loc_grid_params, loc_sphere, cl, signed_distance, eta,
        max_distance_local, transform_back);
        if (cl.w>0.0)
        {
          max_dist += cl.w;
          if (transform_back)
          {
            inv_transform_vec_quat_add(obb_pos, obb_quat, cl, max_grad);

          }
        }
      }
      }
      }
      // sparsity opt:
      if (max_dist == 0)
      {
        if (sparsity_idx[bn_sph_idx] == 0)
        {
          return;
        }
        sparsity_idx[bn_sph_idx] = 0;

        if (transform_back)
        {
          *(float3 *)&closest_pt[bn_sph_idx * 4] = max_grad; // max_grad is all zeros
        }
        out_distance[bn_sph_idx] = 0.0;
        return;
      }

      // else max_dist != 0
      max_dist = weight[0] * max_dist;

      if (transform_back)
      {
        *(float3 *)&closest_pt[bn_sph_idx * 4] = weight[0] * max_grad;
      }
      out_distance[bn_sph_idx] = max_dist;
      sparsity_idx[bn_sph_idx] = 1;
    }


    template<typename grid_scalar_t, typename scalar_t=float, typename dist_scalar_t=float,
    bool SCALE_METRIC=true, bool ENABLE_SPEED_METRIC=true, bool SUM_COLLISIONS=true,
    int NUM_LAYERS=100>
    __device__ __forceinline__ void swept_sphere_voxel_distance_fn(
      const scalar_t *sphere_position,
      const int env_idx,
      const int b_idx,
      const int h_idx,
      const int sph_idx,
      dist_scalar_t *out_distance,
      scalar_t *closest_pt,
      uint8_t *sparsity_idx,
      const float *weight,
      const float *activation_distance,
      const float *max_distance,
      const float *speed_dt,
      const grid_scalar_t *grid_features,
      const float *grid_params,
      const float *grid_pose,
      const uint8_t *grid_enable,
      const int max_nobs,
      const int env_ngrid,
      const int num_voxels,
      const int batch_size,
      const int horizon,
      const int nspheres,
      const int sweep_steps,
      const bool transform_back)
    {
      const int   sw_steps      = sweep_steps;
      const int   b_addrs       =
        b_idx * horizon * nspheres; // + h_idx * n_spheres + sph_idx;

      // We read the same obstacles across

      // Load sphere_position input
      // if h_idx == horizon -1, we just read the same index
      const int bhs_idx = b_addrs + h_idx * nspheres + sph_idx;


      float max_dist  = 0.0;
      float max_distance_local = max_distance[0];
      max_distance_local = -1 * max_distance_local;
      const float eta = activation_distance[0];
      float3 max_grad = make_float3(0.0, 0.0, 0.0);

      // Load sphere_position input
      float4 sphere_1_cache = *(float4 *)&sphere_position[bhs_idx * 4];

      if (sphere_1_cache.w <= 0.0)
      {
        // write zeros for cost:
        out_distance[bhs_idx] = 0;

        // write zeros for gradient if not zero:
        if (sparsity_idx[bhs_idx] != 0)
        {
          sparsity_idx[bhs_idx]               = 0;
          *(float4 *)&closest_pt[bhs_idx * 4] = make_float4(0.0);
        }
        return;
      }
      sphere_1_cache.w += eta;
      float4 loc_sphere_0, loc_sphere_2;

      bool  sweep_back = false;
      bool  sweep_fwd = false;
      float sphere_0_distance, sphere_2_distance, sphere_0_len, sphere_2_len;


      const float dt  = speed_dt[0];
      float4 sphere_0_cache = make_float4(0,0,0,0);
      float4 sphere_2_cache = make_float4(0,0,0,0);

      if (h_idx > 0)
      {
        sphere_0_cache =
          *(float4 *)&sphere_position[b_addrs * 4 + (h_idx - 1) * nspheres * 4 + sph_idx * 4];
        sphere_0_cache.w  = sphere_1_cache.w;
        sphere_0_distance = sphere_distance(sphere_0_cache, sphere_1_cache);
        sphere_0_len      = sphere_0_distance + sphere_0_cache.w * 2;

        if (sphere_0_distance > 0.0)
        {
          sweep_back = true;
        }
      }

      if (h_idx < horizon - 1)
      {
        sphere_2_cache =
          *(float4 *)&sphere_position[b_addrs * 4 + (h_idx + 1) * nspheres * 4 + sph_idx * 4];
        sphere_2_cache.w  = sphere_1_cache.w;
        sphere_2_distance = sphere_distance(sphere_2_cache, sphere_1_cache);
        sphere_2_len      = sphere_2_distance + sphere_2_cache.w * 2;

        if (sphere_2_distance > 0.0)
        {
          sweep_fwd = true;
        }
      }

      float signed_distance = 0.0;
      const int local_env_idx = max_nobs * env_idx;

      float4 loc_sphere = make_float4(0.0);
      float4 obb_quat   = make_float4(0.0);
      float3 obb_pos    = make_float3(0.0);
      float4 loc_grid_params = make_float4(0.0);
      float4 sum_grad = make_float4(0.0, 0.0, 0.0, 0.0);
      float4 cl;
      float jump_mid_distance = 0.0;
      float k0;
      float temp_jump_distance = 0.0;

      if (NUM_LAYERS <= 4)
      {


      #pragma unroll
      for (int layer_idx=0; layer_idx < NUM_LAYERS; layer_idx++)
      {
      float curr_jump_distance = 0.0;

      int local_env_layer_idx = local_env_idx + layer_idx;
      sum_grad *= 0.0;
      if (grid_enable[local_env_layer_idx] != 0) // disabled obstacle
      {

        load_obb_pose(&grid_pose[(local_env_layer_idx) * 8], obb_pos,
                      obb_quat);
        loc_grid_params = *(float4 *)&grid_params[local_env_layer_idx*4];

        transform_sphere_quat(obb_pos, obb_quat, sphere_1_cache, loc_sphere);
        transform_sphere_quat(obb_pos, obb_quat, sphere_0_cache, loc_sphere_0);
        transform_sphere_quat(obb_pos, obb_quat, sphere_2_cache, loc_sphere_2);

        int voxel_layer_start_idx = local_env_layer_idx * num_voxels;
        // check distance:
        compute_sphere_voxel_gradient<grid_scalar_t,SCALE_METRIC>(grid_features,
        voxel_layer_start_idx, num_voxels,
        loc_grid_params, loc_sphere, cl, signed_distance, eta,
        max_distance_local, transform_back);
        if (cl.w>0.0)
        {
          sum_grad += cl;
          jump_mid_distance = signed_distance;
        }
        else if (signed_distance != VOXEL_UNOBSERVED_DISTANCE)
        {
          jump_mid_distance = -1 * signed_distance;
        }


        jump_mid_distance = max(jump_mid_distance, loc_sphere.w);
        curr_jump_distance = jump_mid_distance;
        if (sweep_back && curr_jump_distance < sphere_0_distance/2)
        {
          for (int j=0; j<sw_steps; j++)
          {
            if (curr_jump_distance >= sphere_0_len/2)
            {
              break;
            }
            temp_jump_distance = 0.0;
            k0 = 1 - (curr_jump_distance/sphere_0_len);
            // compute collision
            const float4 interpolated_sphere =
            (k0)*loc_sphere + (1 - k0) * loc_sphere_0;

            compute_sphere_voxel_gradient<grid_scalar_t,SCALE_METRIC>(
              grid_features,
              voxel_layer_start_idx, num_voxels,
              loc_grid_params, interpolated_sphere, cl, signed_distance, eta,
              max_distance_local, transform_back);
            if (cl.w>0.0)
            {
              sum_grad += cl;
              temp_jump_distance = signed_distance;
            }
            else if (signed_distance != VOXEL_UNOBSERVED_DISTANCE)
            {
              temp_jump_distance = -1 * signed_distance;
            }
            temp_jump_distance = max(temp_jump_distance, loc_sphere.w);
            curr_jump_distance += temp_jump_distance;


          }
        }
        curr_jump_distance = jump_mid_distance;
        if (sweep_fwd && curr_jump_distance < sphere_2_distance/2)
        {
          for (int j=0; j<sw_steps; j++)
          {
            if (curr_jump_distance >= sphere_2_len/2)
            {
              break;
            }
            temp_jump_distance = 0.0;
            k0 = 1 - (curr_jump_distance/sphere_2_len);
            // compute collision
            const float4 interpolated_sphere =
            (k0)*loc_sphere + (1 - k0) * loc_sphere_2;

             compute_sphere_voxel_gradient<grid_scalar_t,SCALE_METRIC>(
             grid_features,
             voxel_layer_start_idx, num_voxels,
             loc_grid_params, interpolated_sphere, cl, signed_distance, eta,
             max_distance_local, transform_back);
             if (cl.w>0.0)
            {
              sum_grad += cl;
              temp_jump_distance = signed_distance;
            }
            else if (signed_distance != VOXEL_UNOBSERVED_DISTANCE)
            {
              temp_jump_distance = -1 * signed_distance;
            }
            temp_jump_distance = max(temp_jump_distance, loc_sphere.w);
            curr_jump_distance += temp_jump_distance;

        }
        }
        if (sum_grad.w > 0.0 )
        {
          max_dist += sum_grad.w;
          if (transform_back)
          {
            inv_transform_vec_quat_add(obb_pos, obb_quat, sum_grad, max_grad);
          }

        }


      }
      }
      }
      else
      {


      for (int layer_idx=0; layer_idx < max_nobs; layer_idx++)
      {
      float curr_jump_distance = 0.0;

      int local_env_layer_idx = local_env_idx + layer_idx;
      sum_grad *= 0.0;
      if (grid_enable[local_env_layer_idx] != 0) // disabled obstacle
      {

        load_obb_pose(&grid_pose[(local_env_layer_idx) * 8], obb_pos,
                      obb_quat);
        loc_grid_params = *(float4 *)&grid_params[local_env_layer_idx*4];

        transform_sphere_quat(obb_pos, obb_quat, sphere_1_cache, loc_sphere);
        transform_sphere_quat(obb_pos, obb_quat, sphere_0_cache, loc_sphere_0);
        transform_sphere_quat(obb_pos, obb_quat, sphere_2_cache, loc_sphere_2);

        int voxel_layer_start_idx = local_env_layer_idx * num_voxels;
        // check distance:
        compute_sphere_voxel_gradient<grid_scalar_t,SCALE_METRIC>(grid_features,
        voxel_layer_start_idx, num_voxels,
        loc_grid_params, loc_sphere, cl, signed_distance, eta,
        max_distance_local, transform_back);
        if (cl.w>0.0)
        {
          sum_grad += cl;
          jump_mid_distance = signed_distance;
        }
        else if (signed_distance != VOXEL_UNOBSERVED_DISTANCE)
        {
          jump_mid_distance = -1 * signed_distance;
        }


        jump_mid_distance = max(jump_mid_distance, loc_sphere.w);
        curr_jump_distance = jump_mid_distance;
        if (sweep_back && curr_jump_distance < sphere_0_distance/2)
        {
          for (int j=0; j<sw_steps; j++)
          {
            if (curr_jump_distance >= sphere_0_len/2)
            {
              break;
            }
            temp_jump_distance = 0.0;
            k0 = 1 - (curr_jump_distance/sphere_0_len);
            // compute collision
            const float4 interpolated_sphere =
            (k0)*loc_sphere + (1 - k0) * loc_sphere_0;

            compute_sphere_voxel_gradient<grid_scalar_t,SCALE_METRIC>(
              grid_features,
              voxel_layer_start_idx, num_voxels,
              loc_grid_params, interpolated_sphere, cl, signed_distance, eta,
              max_distance_local, transform_back);
            if (cl.w>0.0)
            {
              sum_grad += cl;
              temp_jump_distance = signed_distance;
            }
            else if (signed_distance != VOXEL_UNOBSERVED_DISTANCE)
            {
              temp_jump_distance = -1 * signed_distance;
            }
            temp_jump_distance = max(temp_jump_distance, loc_sphere.w);
            curr_jump_distance += temp_jump_distance;


          }
        }
        curr_jump_distance = jump_mid_distance;
        if (sweep_fwd && curr_jump_distance < sphere_2_distance/2)
        {
          for (int j=0; j<sw_steps; j++)
          {
            if (curr_jump_distance >= sphere_2_len/2)
            {
              break;
            }
            temp_jump_distance = 0.0;
            k0 = 1 - (curr_jump_distance/sphere_2_len);
            // compute collision
            const float4 interpolated_sphere =
            (k0)*loc_sphere + (1 - k0) * loc_sphere_2;

             compute_sphere_voxel_gradient<grid_scalar_t,SCALE_METRIC>(
             grid_features,
             voxel_layer_start_idx, num_voxels,
             loc_grid_params, interpolated_sphere, cl, signed_distance, eta,
             max_distance_local, transform_back);
             if (cl.w>0.0)
            {
              sum_grad += cl;
              temp_jump_distance = signed_distance;
            }
            else if (signed_distance != VOXEL_UNOBSERVED_DISTANCE)
            {
              temp_jump_distance = -1 * signed_distance;
            }
            temp_jump_distance = max(temp_jump_distance, loc_sphere.w);
            curr_jump_distance += temp_jump_distance;

        }
        }
        if (sum_grad.w > 0.0 )
        {
          max_dist += sum_grad.w;
          if (transform_back)
          {
            inv_transform_vec_quat_add(obb_pos, obb_quat, sum_grad, max_grad);
          }

        }


      }
      }
      }
      // sparsity opt:
      if (max_dist == 0)
      {
        if (sparsity_idx[bhs_idx] == 0)
        {
          return;
        }
        sparsity_idx[bhs_idx] = 0;

        if (transform_back)
        {
          *(float3 *)&closest_pt[bhs_idx * 4] = max_grad; // max_grad is all zeros
        }
        out_distance[bhs_idx] = 0.0;
        return;
      }

       // computer speed metric here:
      if (ENABLE_SPEED_METRIC)
      {
        if (sweep_back && sweep_fwd)
        {
          scale_speed_metric(sphere_0_cache, sphere_1_cache, sphere_2_cache, dt,
                             transform_back, max_dist, max_grad);
        }
      }
      // else max_dist != 0
      max_dist = weight[0] * max_dist;

      if (transform_back)
      {
        *(float3 *)&closest_pt[bhs_idx * 4] = weight[0] * max_grad;
      }
      out_distance[bhs_idx] = max_dist;
      sparsity_idx[bhs_idx] = 1;

    }


    template<typename grid_scalar_t, typename scalar_t,   typename dist_scalar_t=float,
    bool BATCH_ENV_T=true,
    bool SCALE_METRIC=true,
    bool ENABLE_SPEED_METRIC=true,
    bool SUM_COLLISIONS=false,
    int NUM_LAYERS=100>
    __global__ void swept_sphere_voxel_distance_jump_kernel(
      const scalar_t *sphere_position,
      dist_scalar_t *out_distance,
      scalar_t *closest_pt, uint8_t *sparsity_idx, const float *weight,
      const float *activation_distance,
      const float *max_distance,
      const float *speed_dt,
      const grid_scalar_t *grid_features, const float *grid_params,
      const float *grid_pose, const uint8_t *grid_enable,
      const int32_t *n_env_grid, const int32_t *env_query_idx, const int max_nobs,
      const int num_voxels,
      const int batch_size, const int horizon, const int nspheres,
      const int sweep_steps,
      const bool transform_back)
      {
      const int t_idx = blockIdx.x * blockDim.x + threadIdx.x;
      const int b_idx = t_idx / (horizon * nspheres);

      const int h_idx   = (t_idx - b_idx * (horizon * nspheres)) / nspheres;
      const int sph_idx = (t_idx - b_idx * horizon * nspheres - h_idx * nspheres);

      if ((sph_idx >= nspheres) || (b_idx >= batch_size) || (h_idx >= horizon))
      {
        return;
      }

      int env_idx    = 0;

      if (BATCH_ENV_T)
      {
        env_idx = env_query_idx[b_idx];
      }

      const int env_nboxes = n_env_grid[env_idx];
      swept_sphere_voxel_distance_fn<grid_scalar_t, scalar_t,dist_scalar_t, SCALE_METRIC,
      ENABLE_SPEED_METRIC, SUM_COLLISIONS, NUM_LAYERS>(
        sphere_position,  env_idx, b_idx, h_idx, sph_idx,
        out_distance, closest_pt,
        sparsity_idx, weight, activation_distance, max_distance,
        speed_dt,
        grid_features,
        grid_params, grid_pose, grid_enable, max_nobs, env_nboxes, num_voxels, batch_size,
        horizon, nspheres, sweep_steps, transform_back);
      }

    template<typename grid_scalar_t, typename geom_scalar_t=float, typename dist_scalar_t=float, typename grad_scalar_t=float, bool SCALE_METRIC=true, int NUM_LAYERS=100>
    __device__ __forceinline__ void sphere_voxel_esdf_fn(
      const geom_scalar_t *sphere_position,
      const int32_t env_idx,
      const int bn_sph_idx,
      const int sph_idx,
      dist_scalar_t *out_distance,
      grad_scalar_t *closest_pt,
      uint8_t *sparsity_idx,
      const float *weight,
      const float *activation_distance,
      const float *max_distance,
      const grid_scalar_t *grid_features,
      const float *grid_params,
      const float *obb_mat,
      const uint8_t *obb_enable,
      const int max_nobs,
      const int num_voxels,
      const bool transform_back)
    {
      float max_distance_local = max_distance[0];
      const float eta = max_distance_local;
      float max_dist  = -1 * max_distance_local;
      max_distance_local = -1 * max_distance_local;

      float3 max_grad = make_float3(0.0, 0.0, 0.0);

      // Load sphere_position input
      float4 sphere_cache = *(float4 *)&sphere_position[bn_sph_idx * 4];

      if (sphere_cache.w <= 0.0)
      {
        // write zeros for cost:
        out_distance[bn_sph_idx] = 0;

        // write zeros for gradient if not zero:
        if (sparsity_idx[bn_sph_idx] != 0)
        {
          sparsity_idx[bn_sph_idx]               = 0;
          *(float4 *)&closest_pt[bn_sph_idx * 4] = make_float4(0.0);
        }
        return;
      }
      const float sphere_radius = sphere_cache.w;

      sphere_cache.w += eta;
      const int local_env_idx = max_nobs * env_idx;

      float4 loc_sphere = make_float4(0.0);
      float4 obb_quat   = make_float4(0.0);
      float3 obb_pos    = make_float3(0.0);
      float4 loc_grid_params = make_float4(0.0);

      float signed_distance = 0;

      for (int layer_idx=0; layer_idx < max_nobs; layer_idx++)
      {


      int local_env_layer_idx = local_env_idx + layer_idx;
      if (obb_enable[local_env_layer_idx] != 0) // disabled obstacle
      {

        load_obb_pose(&obb_mat[(local_env_layer_idx) * 8], obb_pos,
                      obb_quat);
        loc_grid_params = *(float4 *)&grid_params[local_env_layer_idx*4];

        transform_sphere_quat(obb_pos, obb_quat, sphere_cache, loc_sphere);
        int voxel_layer_start_idx = local_env_layer_idx * num_voxels;
        // check distance:
        float4 cl;

        compute_sphere_voxel_gradient<grid_scalar_t,SCALE_METRIC>(grid_features,
        voxel_layer_start_idx, num_voxels,
        loc_grid_params, loc_sphere, cl, signed_distance, eta,
        max_distance_local, transform_back);
        if (cl.w>max_dist)
        {
          max_dist = cl.w;
          if (transform_back)
          {
            inv_transform_vec_quat(obb_pos, obb_quat, cl, max_grad);

          }
        }
      }
      }


      max_dist = max_dist - sphere_radius;
      if (transform_back)
      {
        *(float3 *)&closest_pt[bn_sph_idx * 4] = max_grad;
      }
      out_distance[bn_sph_idx] = max_dist;

    }


    template<typename scalar_t,  typename dist_scalar_t=float, bool ENABLE_SPEED_METRIC=true, bool SUM_COLLISIONS=true, int SWEEP_STEPS=-1>
    __device__ __forceinline__ void swept_sphere_obb_distance_fn(
      const scalar_t *sphere_position,
      const int env_idx, const int b_idx,
      const int h_idx, const int sph_idx,
      dist_scalar_t *out_distance,
      scalar_t *closest_pt,
      uint8_t *sparsity_idx,
      const float *weight,
      const float *activation_distance, const float *speed_dt,
      const float *obb_accel, const float *obb_bounds,
      const float *obb_mat,
      const uint8_t *obb_enable,
      const int max_nobs,
      const int nboxes, const int batch_size, const int horizon,
      const int nspheres, const int sweep_steps,
      const bool transform_back)
    {
      // create shared memory to do warp wide reductions:
      // warp wide reductions should only happen across nspheres in same batch and horizon
      //
      // extern __shared__ float psum[];
      int   sw_steps      = SWEEP_STEPS;
      const float max_distance = 1000.0;

      if (SWEEP_STEPS == -1)
      {
        sw_steps = sweep_steps;
      }
      const int   start_box_idx = max_nobs * env_idx;
      const int   b_addrs       =
        b_idx * horizon * nspheres; // + h_idx * n_spheres + sph_idx;

      // We read the same obstacles across

      // Load sphere_position input
      // if h_idx == horizon -1, we just read the same index
      const int bhs_idx = b_addrs + h_idx * nspheres + sph_idx;

      float4 sphere_1_cache = *(float4 *)&sphere_position[bhs_idx * 4];


      if (sphere_1_cache.w <= 0.0)
      {
        // write zeros for cost:
        out_distance[bhs_idx] = 0;

        // write zeros for gradient if not zero:
        if (sparsity_idx[bhs_idx] != 0)
        {
          sparsity_idx[b_addrs + h_idx * nspheres + sph_idx] = 0;
          *(float4 *)&closest_pt[bhs_idx * 4]                = make_float4(0.0);
        }

        return;
      }
      bool  sweep_back = false;
      bool  sweep_fwd = false;
      float sphere_0_distance, sphere_2_distance, sphere_0_len, sphere_2_len;

      float  max_dist = 0.0;
      float3 max_grad = make_float3(0.0, 0.0, 0.0);

      const float dt  = speed_dt[0];
      const float eta = activation_distance[0];
      sphere_1_cache.w += eta;
      float4 sphere_0_cache = make_float4(0,0,0,0);
      float4 sphere_2_cache = make_float4(0,0,0,0);
      float4 loc_grad = make_float4(0,0,0,0);
      bool inside = false;
      float distance = 0.0;
      float sph_dist = 0.0;
      float3 delta = make_float3(0,0,0);
      if (h_idx > 0)
      {
        sphere_0_cache =
          *(float4 *)&sphere_position[b_addrs * 4 + (h_idx - 1) * nspheres * 4 + sph_idx * 4];
        sphere_0_cache.w  = sphere_1_cache.w;
        sphere_0_distance = sphere_distance(sphere_0_cache, sphere_1_cache);
        sphere_0_len      = sphere_0_distance + sphere_0_cache.w * 2;

        if (sphere_0_distance > 0.0)
        {
          sweep_back = true;
        }
      }

      if (h_idx < horizon - 1)
      {
        sphere_2_cache =
          *(float4 *)&sphere_position[b_addrs * 4 + (h_idx + 1) * nspheres * 4 + sph_idx * 4];
        sphere_2_cache.w  = sphere_1_cache.w;
        sphere_2_distance = sphere_distance(sphere_2_cache, sphere_1_cache);
        sphere_2_len      = sphere_2_distance + sphere_2_cache.w * 2;

        if (sphere_2_distance > 0.0)
        {
          sweep_fwd = true;
        }
      }
      float4 loc_sphere_0, loc_sphere_1, loc_sphere_2;
      float  k0 = 0.0;


      // float4 loc_sphere = make_float4(0.0);
      float4 obb_quat   = make_float4(0.0);
      float3 obb_pos    = make_float3(0.0);
      float3 loc_bounds = make_float3(0.0);

      for (int box_idx = 0; box_idx < nboxes; box_idx++)
      {
        if (obb_enable[start_box_idx + box_idx] == 0) // disabled obstacle
        {
          continue;
        }

        // read position and quaternion:
        load_obb_pose(&obb_mat[(start_box_idx + box_idx) * 8], obb_pos,
                      obb_quat);
        load_obb_bounds(&obb_bounds[(start_box_idx + box_idx) * 4], loc_bounds);
        float curr_jump_distance = 0.0;

        //const float3 grad_loc_bounds = loc_bounds + sphere_1_cache.w; // assuming sphere radius
                                                                      // doesn't change

        transform_sphere_quat(obb_pos, obb_quat, sphere_1_cache, loc_sphere_1);
        transform_sphere_quat(obb_pos, obb_quat, sphere_0_cache, loc_sphere_0);
        transform_sphere_quat(obb_pos, obb_quat, sphere_2_cache, loc_sphere_2);


        // assuming sphere position is in box frame:
        // read data:
        float4 sum_pt = make_float4(0.0, 0.0, 0.0, 0.0);
        curr_jump_distance = 0.0;
        // check at exact timestep:
        if (check_sphere_aabb(loc_bounds, loc_sphere_1, inside, delta, curr_jump_distance, sph_dist))
        {

          scale_eta_metric<true>(delta, sph_dist, eta, sum_pt);
        }
        else if (sweep_back || sweep_fwd)
        {
          // there is no collision, compute the distance to obstacle:
          curr_jump_distance = compute_distance_fn(loc_bounds, loc_sphere_1,
          max_distance, delta, sph_dist, distance, inside);

        }
        curr_jump_distance = fabsf(curr_jump_distance) - loc_sphere_1.w;
        curr_jump_distance = max(curr_jump_distance, loc_sphere_1.w);

        const float jump_mid_distance = curr_jump_distance;

        // compute distance between sweep spheres:
        if (sweep_back && (jump_mid_distance < sphere_0_distance / 2))
        {

          // get unit vector:
          // loc_sphere_0 = (loc_sphere_0 - loc_sphere_1)/(sphere_0_len);

          // loop over sweep steps and accumulate distance:
          #pragma unroll
          for (int j = 0; j < sw_steps; j++)
          {
            // jump by current jump distance:

            // when sweep_steps == 0, then we only check at loc_sphere_1.
            // do interpolation from t=1 to t=0 (sweep backward)

            if (curr_jump_distance >= (sphere_0_len / 2))
            {
              break;
            }
            k0 = 1 - (curr_jump_distance / sphere_0_len);
            check_jump_distance(loc_sphere_1, loc_sphere_0,
            k0,
            loc_bounds,
            max_distance,
            delta,
            sph_dist,
            distance,
            inside,
            eta,
            sum_pt,
            curr_jump_distance);
          }
        }

        if (sweep_fwd && (jump_mid_distance < (sphere_2_len / 2)))
        {
          curr_jump_distance = jump_mid_distance;

          #pragma unroll
          for (int j = 0; j < sw_steps; j++)
          {
            if (curr_jump_distance >= (sphere_2_len / 2))
            {
              break;
            }
            k0 = 1 - curr_jump_distance / sphere_2_len;
            check_jump_distance(loc_sphere_1, loc_sphere_2,
            k0,
            loc_bounds,
            max_distance,
            delta,
            sph_dist,
            distance,
            inside,
            eta,
            sum_pt,
            curr_jump_distance);
          }
        }
        if (SUM_COLLISIONS)
        {
        if (sum_pt.w > 0) // max_dist starts at 0
        {
          max_dist += sum_pt.w;

          // transform point back if required:
          if (transform_back)
          {
            //inv_transform_vec_quat(obb_pos, obb_quat, sum_pt, max_grad);
            inv_transform_vec_quat_add(obb_pos, obb_quat, sum_pt, max_grad);

          }

          // break;// break after first obstacle collision
        }
        }
        else
        {

        if (sum_pt.w > max_dist) // max_dist starts at 0
        {
          max_dist = sum_pt.w;

          // transform point back if required:
          if (transform_back)
          {
            inv_transform_vec_quat(obb_pos, obb_quat, sum_pt, max_grad);
            //inv_transform_vec_quat_add(obb_pos, obb_quat, sum_pt, max_grad);

          }

          // break;// break after first obstacle collision
        }
        }
      }


      // sparsity opt:
      if (max_dist == 0)
      {
        if (sparsity_idx[bhs_idx] == 0)
        {
          return;
        }
        sparsity_idx[bhs_idx] = 0;

        if (transform_back)
        {
          *(float3 *)&closest_pt[bhs_idx * 4] = max_grad; // max_grad is all zeros
        }
        out_distance[bhs_idx] = 0.0;
        return;
      }

      // computer speed metric here:
      if (ENABLE_SPEED_METRIC)
      {
        if (sweep_back && sweep_fwd)
        {
          scale_speed_metric(sphere_0_cache, sphere_1_cache, sphere_2_cache, dt,
                             transform_back, max_dist, max_grad);
        }
      }
      max_dist = weight[0] * max_dist;

      if (transform_back)
      {
        *(float3 *)&closest_pt[bhs_idx * 4] = weight[0] * max_grad;
      }
      sparsity_idx[bhs_idx] = 1;

      out_distance[bhs_idx] = max_dist;
    }


    /**
     * @brief Swept Collision checking. Note: This function currently does not
     * implement skipping computation based on distance (which is done in
     * swept_sphere_obb_distance_fn).
     *
     * @tparam scalar_t
     * @param sphere_position
     * @param env_idx
     * @param b_idx
     * @param h_idx
     * @param sph_idx
     * @param out_distance
     * @param weight
     * @param activation_distance
     * @param obb_accel
     * @param obb_bounds
     * @param obb_mat
     * @param obb_enable
     * @param max_nobs
     * @param nboxes
     * @param batch_size
     * @param horizon
     * @param nspheres
     * @param sweep_steps
     * @return __device__
     */
    template<typename scalar_t, typename dist_scalar_t=float>
    __device__ __forceinline__ void swept_sphere_obb_collision_fn(
      const scalar_t *sphere_position,
      const int env_idx, const int b_idx,
      const int h_idx, const int sph_idx, dist_scalar_t *out_distance,
      const float *weight, const float *activation_distance,
      const float *obb_accel, const float *obb_bounds,
      const float *obb_mat, const uint8_t *obb_enable, const int max_nobs,
      const int nboxes, const int batch_size, const int horizon,
      const int nspheres, const int sweep_steps)
    {
      const int   sw_steps    = sweep_steps;
      const float fl_sw_steps = 2 * sw_steps + 1;
      float max_dist          = 0.0;
      const float eta         = activation_distance[0];
      const int   b_addrs     =
        b_idx * horizon * nspheres; // + h_idx * n_spheres + sph_idx;
      const int start_box_idx = max_nobs * env_idx;
      const int bhs_idx       = b_addrs + h_idx * nspheres + sph_idx;
      float4 loc_grad = make_float4(0,0,0,0);
      float3 delta = make_float3(0,0,0);
      bool inside = false;
      float curr_jump_distance = 0.0;
      float sph_dist = 0.0;
      // We read the same obstacles across

      // Load sphere_position input
      // if h_idx == horizon -1, we just read the same index
      float4 sphere_1_cache = *(float4 *)&sphere_position[bhs_idx * 4];

      if (sphere_1_cache.w <= 0.0)
      {
        out_distance[b_addrs + h_idx * nspheres + sph_idx] = 0.0;
        return;
      }
      sphere_1_cache.w += eta;

      float4 sphere_0_cache, sphere_2_cache;

      if (h_idx > 0)
      {
        sphere_0_cache = *(float4 *)&sphere_position[b_addrs * 4 + (h_idx - 1) * nspheres * 4 +
                                                     sph_idx * 4];
        sphere_0_cache.w += eta;
      }

      if (h_idx < horizon - 1)
      {
        sphere_2_cache = *(float4 *)&sphere_position[b_addrs * 4 + (h_idx + 1) * nspheres * 4 +
                                                     sph_idx * 4];
        sphere_2_cache.w += eta;
      }
      float4 loc_sphere_0, loc_sphere_1, loc_sphere_2;
      float4 interpolated_sphere;
      float  k0, k1;
      float  in_obb_mat[7];

      for (int box_idx = 0; box_idx < nboxes; box_idx++)
      {
        // read position and quaternion:
        if (obb_enable[start_box_idx + box_idx] == 0) // disabled obstacle
        {
          continue;
        }

#pragma unroll

        for (int i = 0; i < 7; i++)
        {
          in_obb_mat[i] = obb_mat[(start_box_idx + box_idx) * 7 + i];
        }

        float3 loc_bounds =
          *(float3 *)&obb_bounds[(start_box_idx + box_idx) * 3]; // /2
        loc_bounds = loc_bounds / 2;

        transform_sphere_quat(&in_obb_mat[0], sphere_1_cache, loc_sphere_1);

        max_dist += box_idx;

        if (check_sphere_aabb(loc_bounds, loc_sphere_1, inside, delta, curr_jump_distance, sph_dist))
        {

            max_dist = 1;
            break;
        }


        if (h_idx > 0)
        {
          transform_sphere_quat(&in_obb_mat[0], sphere_0_cache, loc_sphere_0);

          // loop over sweep steps and accumulate distance:
          for (int j = 0; j < sw_steps; j++)
          {
            // when sweep_steps == 0, then we only check at loc_sphere_1.
            // do interpolation from t=1 to t=0 (sweep backward)
            k0                  = (j + 1) / (fl_sw_steps);
            k1                  = 1 - k0;
            interpolated_sphere = k0 * loc_sphere_1 + (k1) * loc_sphere_0;

            if (check_sphere_aabb(loc_bounds, interpolated_sphere, inside, delta, curr_jump_distance, sph_dist))
            {

                max_dist = 1;
                break;

            }
          }
        }

        if (h_idx < horizon - 1)
        {
          transform_sphere_quat(&in_obb_mat[0], sphere_2_cache, loc_sphere_2);

          for (int j = 0; j < sw_steps; j++)
          {
            // do interpolation from t=1 to t=2 (sweep forward):

            k0                  = (j + 1) / (fl_sw_steps);
            k1                  = 1 - k0;
            interpolated_sphere = k0 * loc_sphere_1 + (k1) * loc_sphere_2;
            if (check_sphere_aabb(loc_bounds, interpolated_sphere, inside, delta, curr_jump_distance, sph_dist))
            {
              max_dist = 1;
              break;
            }
          }
        }

        if (max_dist > 0)
        {
          break;
        }
      }
      out_distance[b_addrs + h_idx * nspheres + sph_idx] = weight[0] * max_dist;
    }

    template<typename scalar_t, typename dist_scalar_t=float, bool BATCH_ENV_T=true, bool SCALE_METRIC=true, bool SUM_COLLISIONS=true, bool COMPUTE_ESDF=false>
    __global__ void sphere_obb_distance_kernel(
      const scalar_t *sphere_position,
      dist_scalar_t *out_distance,
      scalar_t *closest_pt, uint8_t *sparsity_idx, const float *weight,
      const float *activation_distance,
      const float *max_distance,
      const float *obb_accel,
      const float *obb_bounds, const float *obb_mat,
      const uint8_t *obb_enable, const int32_t *n_env_obb,
      const int32_t *env_query_idx,
      const int max_nobs,
      const int batch_size, const int horizon, const int nspheres,
      const bool transform_back)
    {
      // spheres_per_block is number of spheres in a thread
      // compute local sphere batch by first dividing threadidx/nboxes
      // const int sph_idx = blockIdx.x * blockDim.x + threadIdx.x;
      const int t_idx   = blockIdx.x * blockDim.x + threadIdx.x;
      const int b_idx   = t_idx / (horizon * nspheres);
      const int h_idx   = (t_idx - b_idx * (horizon * nspheres)) / nspheres;
      const int sph_idx = (t_idx - b_idx * horizon * nspheres - h_idx * nspheres);

      if ((sph_idx >= nspheres) || (b_idx >= batch_size) || (h_idx >= horizon))
      {
        return;
      }
      const int bn_sph_idx =
        b_idx * horizon * nspheres + h_idx * nspheres + sph_idx;

      int env_idx    = 0;

      if (BATCH_ENV_T)
      {
        env_idx =
          env_query_idx[b_idx]; // read env idx from current batch idx

      }
      const int env_nboxes = n_env_obb[env_idx];   // read nboxes in current environment
      if (COMPUTE_ESDF)
      {
          sphere_obb_esdf_fn<scalar_t, dist_scalar_t, SCALE_METRIC, SUM_COLLISIONS>(sphere_position, env_idx, bn_sph_idx, sph_idx, out_distance,
                             closest_pt, sparsity_idx, weight, activation_distance, max_distance,
                             obb_accel, obb_bounds, obb_mat, obb_enable, max_nobs,
                             env_nboxes, transform_back);

      }
      else
      {
        sphere_obb_distance_fn<scalar_t, dist_scalar_t, SCALE_METRIC, SUM_COLLISIONS>(sphere_position, env_idx, bn_sph_idx, sph_idx, out_distance,
                             closest_pt, sparsity_idx, weight, activation_distance,
                             obb_accel, obb_bounds, obb_mat, obb_enable, max_nobs,
                             env_nboxes, transform_back);

      }

      // return the sphere distance here:
      // sync threads and do block level reduction:
    }


    template<typename grid_scalar_t, typename geom_scalar_t=float, typename dist_scalar_t=float, typename grad_scalar_t=float, bool BATCH_ENV_T=false, bool SCALE_METRIC=true, bool COMPUTE_ESDF=false, int NUM_LAYERS=100>
    __global__ void sphere_voxel_distance_kernel(
      const geom_scalar_t *sphere_position,
      dist_scalar_t *out_distance,
      grad_scalar_t *closest_pt,
      uint8_t *sparsity_idx,
      const float *weight,
      const float *activation_distance,
      const float *max_distance,
      const grid_scalar_t *grid_features,
      const float *grid_params, const float *obb_mat,
      const uint8_t *obb_enable, const int32_t *n_env_obb,
      const int32_t *env_query_idx,
      const int max_nobs, const int num_voxels,
      const int batch_size, const int horizon, const int nspheres,
      const bool transform_back)
    {
      // spheres_per_block is number of spheres in a thread
      // compute local sphere batch by first dividing threadidx/nboxes
      // const int sph_idx = blockIdx.x * blockDim.x + threadIdx.x;
      const int t_idx   = blockIdx.x * blockDim.x + threadIdx.x;
      const int b_idx   = t_idx / (horizon * nspheres);
      const int h_idx   = (t_idx - b_idx * (horizon * nspheres)) / nspheres;
      const int sph_idx = (t_idx - b_idx * horizon * nspheres - h_idx * nspheres);

      if ((sph_idx >= nspheres) || (b_idx >= batch_size) || (h_idx >= horizon))
      {
        return;
      }
      const int bn_sph_idx =
        b_idx * horizon * nspheres + h_idx * nspheres + sph_idx;

      int env_idx    = 0;

      if (BATCH_ENV_T)
      {
        env_idx =
          env_query_idx[b_idx]; // read env idx from current batch idx

      }
      if (COMPUTE_ESDF)
      {

        sphere_voxel_esdf_fn<grid_scalar_t, geom_scalar_t, dist_scalar_t, grad_scalar_t, SCALE_METRIC, NUM_LAYERS>(sphere_position, env_idx, bn_sph_idx, sph_idx, out_distance,
                             closest_pt, sparsity_idx, weight, activation_distance, max_distance,
                             grid_features, grid_params, obb_mat, obb_enable, max_nobs, num_voxels,
                             transform_back);

      }
      else
      {
         sphere_voxel_distance_fn<grid_scalar_t, geom_scalar_t, dist_scalar_t, grad_scalar_t, SCALE_METRIC, NUM_LAYERS>(sphere_position, env_idx, bn_sph_idx, sph_idx, out_distance,
                             closest_pt, sparsity_idx, weight, activation_distance, max_distance,
                             grid_features, grid_params, obb_mat, obb_enable, max_nobs, num_voxels,
                             transform_back);
      }

      // return the sphere distance here:
      // sync threads and do block level reduction:
    }


    template<typename scalar_t, typename dist_scalar_t, bool BATCH_ENV_T, bool ENABLE_SPEED_METRIC, bool SUM_COLLISIONS=false, int SWEEP_STEPS=-1>
    __global__ void swept_sphere_obb_distance_jump_kernel(
      const scalar_t *sphere_position,
      dist_scalar_t *out_distance,
      scalar_t *closest_pt, uint8_t *sparsity_idx, const float *weight,
      const float *activation_distance, const float *speed_dt,
      const float *obb_accel, const float *obb_bounds,
      const float *obb_pose, const uint8_t *obb_enable,
      const int32_t *n_env_obb, const int32_t *env_query_idx, const int max_nobs,
      const int batch_size, const int horizon, const int nspheres,
      const int sweep_steps,
      const bool transform_back)
    {
      // This kernel jumps by sdf to only get gradients at collision points.

      // spheres_per_block is number of spheres in a thread
      // compute local sphere batch by first dividing threadidx/nboxes
      const int t_idx = blockIdx.x * blockDim.x + threadIdx.x;
      const int b_idx = t_idx / (horizon * nspheres);

      // const int sph_idx = (t_idx - b_idx * (horizon * nspheres)) / horizon;
      // const int h_idx = (t_idx - b_idx * horizon * nspheres - sph_idx * horizon);
      const int h_idx   = (t_idx - b_idx * (horizon * nspheres)) / nspheres;
      const int sph_idx = (t_idx - b_idx * horizon * nspheres - h_idx * nspheres);

      if ((sph_idx >= nspheres) || (b_idx >= batch_size) || (h_idx >= horizon))
      {
        return;
      }

      int env_idx    = 0;

      if (BATCH_ENV_T)
      {
        env_idx = env_query_idx[b_idx];
      }

      const int env_nboxes = n_env_obb[env_idx];

      swept_sphere_obb_distance_fn<scalar_t, dist_scalar_t, ENABLE_SPEED_METRIC, SUM_COLLISIONS, SWEEP_STEPS>(
        sphere_position,  env_idx, b_idx, h_idx, sph_idx, out_distance, closest_pt,
        sparsity_idx, weight, activation_distance, speed_dt, obb_accel,
        obb_bounds, obb_pose, obb_enable, max_nobs, env_nboxes, batch_size,
        horizon, nspheres, sweep_steps, transform_back);
    }

    template<typename scalar_t, typename dist_scalar_t=float>
    __global__ void swept_sphere_obb_collision_kernel(
      const scalar_t *sphere_position,
      dist_scalar_t *out_distance,
      const float *weight, const float *activation_distance,
      const float *obb_accel, const float *obb_bounds,
      const float *obb_pose, const uint8_t *obb_enable,
      const int32_t *n_env_obb, const int max_nobs, const int batch_size,
      const int horizon, const int nspheres, const int sweep_steps)
    {
      // spheres_per_block is number of spheres in a thread
      // compute local sphere batch by first dividing threadidx/nboxes
      const int t_idx   = blockIdx.x * blockDim.x + threadIdx.x;
      const int b_idx   = t_idx / (horizon * nspheres);
      const int h_idx   = (t_idx - b_idx * (horizon * nspheres)) / nspheres;
      const int sph_idx = (t_idx - b_idx * horizon * nspheres - h_idx * nspheres);

      if ((sph_idx >= nspheres) || (b_idx >= batch_size) || (h_idx >= horizon))
      {
        return;
      }

      const int env_idx    = 0;
      const int env_nboxes = n_env_obb[env_idx];

      swept_sphere_obb_collision_fn(
        sphere_position, env_idx, b_idx, h_idx, sph_idx, out_distance, weight,
        activation_distance, obb_accel, obb_bounds, obb_pose, obb_enable,
        max_nobs, env_nboxes, batch_size, horizon, nspheres, sweep_steps);
    }

    template<typename scalar_t, typename dist_scalar_t=float>
    __global__ void swept_sphere_obb_collision_batch_env_kernel(
      const scalar_t *sphere_position,
      dist_scalar_t *out_distance,
      const float *weight, const float *activation_distance,
      const float *obb_accel, const float *obb_bounds,
      const float *obb_pose, const uint8_t *obb_enable,
      const int32_t *n_env_obb, const int32_t *env_query_idx, const int max_nobs,
      const int batch_size, const int horizon, const int nspheres,
      const int sweep_steps)
    {
      // spheres_per_block is number of spheres in a thread
      // compute local sphere batch by first dividing threadidx/nboxes
      const int t_idx   = blockIdx.x * blockDim.x + threadIdx.x;
      const int b_idx   = t_idx / (horizon * nspheres);
      const int h_idx   = (t_idx - b_idx * (horizon * nspheres)) / nspheres;
      const int sph_idx = (t_idx - b_idx * horizon * nspheres - h_idx * nspheres);

      if ((sph_idx >= nspheres) || (b_idx >= batch_size) || (h_idx >= horizon))
      {
        return;
      }

      const int env_idx    = env_query_idx[b_idx];
      const int env_nboxes = n_env_obb[env_idx];

      swept_sphere_obb_collision_fn(
        sphere_position, env_idx, b_idx, h_idx, sph_idx, out_distance, weight,
        activation_distance, obb_accel, obb_bounds, obb_pose, obb_enable,
        max_nobs, env_nboxes, batch_size, horizon, nspheres, sweep_steps);
    }


    template<typename scalar_t, typename dist_scalar_t, bool BATCH_ENV_T>
    __global__ void sphere_obb_collision_batch_env_kernel(
      const scalar_t *sphere_position,
      dist_scalar_t *out_distance,
      const float *weight, const float *activation_distance,
      const float *obb_accel, const float *obb_bounds,
      const float *obb_mat, const uint8_t *obb_enable,
      const int32_t *n_env_obb, const int32_t *env_query_idx, const int max_nobs,
      const int batch_size, const int horizon, const int nspheres)
    {
      // spheres_per_block is number of spheres in a thread
      // compute local sphere batch by first dividing threadidx/nboxes
      const int t_idx   = blockIdx.x * blockDim.x + threadIdx.x;
      const int b_idx   = t_idx / (horizon * nspheres);
      const int h_idx   = (t_idx - b_idx * (horizon * nspheres)) / nspheres;
      const int sph_idx = (t_idx - b_idx * horizon * nspheres - h_idx * nspheres);

      if ((sph_idx >= nspheres) || (b_idx >= batch_size) || (h_idx >= horizon))
      {
        return;
      }
      int env_idx    = 0;
      if(BATCH_ENV_T)
      {
        env_idx = env_query_idx[b_idx];
      }

      const int env_nboxes = n_env_obb[env_idx];

      const int bn_sph_idx =
        b_idx * horizon * nspheres + h_idx * nspheres + sph_idx;
      sphere_obb_collision_fn(sphere_position, env_idx, bn_sph_idx, sph_idx, out_distance,
                              weight, activation_distance, obb_accel, obb_bounds,
                              obb_mat, obb_enable, max_nobs, env_nboxes);
    }
  } // namespace Geometry
}   // namespace Curobo


std::vector<torch::Tensor>
sphere_obb_clpt(const torch::Tensor sphere_position, // batch_size, 3
                torch::Tensor distance,
                torch::Tensor closest_point,         // batch size, 3
                torch::Tensor sparsity_idx, const torch::Tensor weight,
                const torch::Tensor activation_distance,
                const torch::Tensor max_distance,
                const torch::Tensor obb_accel,       // n_boxes, 4, 4
                const torch::Tensor obb_bounds,      // n_boxes, 3
                const torch::Tensor obb_pose,        // n_boxes, 4, 4
                const torch::Tensor obb_enable,      // n_boxes, 4, 4
                const torch::Tensor n_env_obb,       // n_boxes, 4, 4
                const torch::Tensor env_query_idx,   // n_boxes, 4, 4
                const int max_nobs, const int batch_size, const int horizon,
                const int n_spheres, const bool transform_back,
                const bool compute_distance, const bool use_batch_env,
                const bool sum_collisions,
                const bool compute_esdf)
{
  using namespace Curobo::Geometry;
  cudaStream_t stream      = at::cuda::getCurrentCUDAStream();
  const int    bnh_spheres = n_spheres * batch_size * horizon; //
  const bool scale_metric = true;
  int threadsPerBlock = bnh_spheres;
  const bool sum_collisions_ = true;

  if (threadsPerBlock > 128)
  {
    threadsPerBlock = 128;
  }
  int blocksPerGrid = (bnh_spheres + threadsPerBlock - 1) / threadsPerBlock;

  if (!compute_distance)
  {

  AT_DISPATCH_FLOATING_TYPES(
    distance.scalar_type(), "SphereObb_clpt_collision", ([&]{
    auto collision_kernel = sphere_obb_collision_batch_env_kernel<scalar_t, float, false>;
    auto batch_collision_kernel = sphere_obb_collision_batch_env_kernel<scalar_t,float, true>;
    auto selected_k = collision_kernel;
    if (use_batch_env)
    {
        selected_k = batch_collision_kernel;
    }

      selected_k<< < blocksPerGrid, threadsPerBlock, 0, stream >> > (
          sphere_position.data_ptr<scalar_t>(),
          distance.data_ptr<float>(), weight.data_ptr<float>(),
          activation_distance.data_ptr<float>(),
          obb_accel.data_ptr<float>(),
          obb_bounds.data_ptr<float>(),
          obb_pose.data_ptr<float>(),
          obb_enable.data_ptr<uint8_t>(),
          n_env_obb.data_ptr<int32_t>(),
          env_query_idx.data_ptr<int32_t>(), max_nobs, batch_size,
          horizon, n_spheres);
    }));

  }
  else
  {

  // typename scalar_t, typename dist_scalar_t=float, bool BATCH_ENV_T=true, bool SCALE_METRIC=true, bool SUM_COLLISIONS=true, bool COMPUTE_ESDF=false
  AT_DISPATCH_FLOATING_TYPES_AND2(torch::kBFloat16, FP8_TYPE_MACRO,
      distance.scalar_type(), "SphereObb_clpt", ([&]{
    auto distance_kernel = sphere_obb_distance_kernel<scalar_t, scalar_t, false, scale_metric, sum_collisions_,false>;
    if (use_batch_env)
    {
      if (compute_esdf)
      {
              distance_kernel =  sphere_obb_distance_kernel<scalar_t, scalar_t, true, false, sum_collisions_, true>;

      }
      else
      {

      distance_kernel =  sphere_obb_distance_kernel<scalar_t,scalar_t,  true, scale_metric, sum_collisions_, false>;
      }

    }
    else if (compute_esdf)
    {
      distance_kernel = sphere_obb_distance_kernel<scalar_t, scalar_t, false, false, sum_collisions_, true>;
    }

    distance_kernel<< < blocksPerGrid, threadsPerBlock, 0, stream >> > (
          sphere_position.data_ptr<scalar_t>(),
          distance.data_ptr<scalar_t>(),
          closest_point.data_ptr<scalar_t>(),
          sparsity_idx.data_ptr<uint8_t>(),
          weight.data_ptr<float>(),
          activation_distance.data_ptr<float>(),
          max_distance.data_ptr<float>(),
          obb_accel.data_ptr<float>(),
          obb_bounds.data_ptr<float>(),
          obb_pose.data_ptr<float>(),
          obb_enable.data_ptr<uint8_t>(),
          n_env_obb.data_ptr<int32_t>(),
          env_query_idx.data_ptr<int32_t>(),
          max_nobs, batch_size,
          horizon, n_spheres, transform_back);


  }));
  }

  C10_CUDA_KERNEL_LAUNCH_CHECK();

  return { distance, closest_point, sparsity_idx };
}

std::vector<torch::Tensor>swept_sphere_obb_clpt(
  const torch::Tensor sphere_position, // batch_size, 3

  torch::Tensor       distance,        // batch_size, 1
  torch::Tensor
  closest_point,                       // batch size, 4 -> written out as x,y,z,0 for gradient
  torch::Tensor sparsity_idx, const torch::Tensor weight,
  const torch::Tensor activation_distance,
  const torch::Tensor speed_dt,
  const torch::Tensor obb_accel,       // n_boxes, 4, 4
  const torch::Tensor obb_bounds,      // n_boxes, 3
  const torch::Tensor obb_pose,        // n_boxes, 4, 4
  const torch::Tensor obb_enable,      // n_boxes, 4,
  const torch::Tensor n_env_obb,       // n_boxes, 4, 4
  const torch::Tensor env_query_idx,   // n_boxes, 4, 4
  const int max_nobs, const int batch_size, const int horizon,
  const int n_spheres, const int sweep_steps, const bool enable_speed_metric,
  const bool transform_back, const bool compute_distance,
  const bool use_batch_env,
  const bool sum_collisions)
{
  using namespace Curobo::Geometry;

  // const int max_batches_per_block = 128;

  cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  // const int bh = batch_size * horizon;

  // const int warp_n_spheres = n_spheres + (n_spheres % 32);// make n_spheres a multiple of 32
  // int batches_per_block = (bh * warp_n_spheres) / max_batches_per_block;
  const int bnh_spheres = n_spheres * batch_size * horizon; //
  int threadsPerBlock   = bnh_spheres;

  // This block is for old kernels?
  if (threadsPerBlock > 128)
  {
    threadsPerBlock = 128;
  }
  int blocksPerGrid = (bnh_spheres + threadsPerBlock - 1) / threadsPerBlock;

  if (sum_collisions)
  {
    const bool sum_collisions_ = true;

  if (use_batch_env)
  {
    if (compute_distance)
    {
      if (enable_speed_metric)
      {
        // This is the best kernel for now
        AT_DISPATCH_FLOATING_TYPES(
          distance.scalar_type(), "Swept_SphereObb_clpt", ([&] {
          swept_sphere_obb_distance_jump_kernel<scalar_t,float,  true, true, sum_collisions_>
            << < blocksPerGrid, threadsPerBlock, 0, stream >> > (
            sphere_position.data_ptr<scalar_t>(),
            distance.data_ptr<float>(),
            closest_point.data_ptr<scalar_t>(),
            sparsity_idx.data_ptr<uint8_t>(),
            weight.data_ptr<float>(),
            activation_distance.data_ptr<float>(),
            speed_dt.data_ptr<float>(),
            obb_accel.data_ptr<float>(),
            obb_bounds.data_ptr<float>(),
            obb_pose.data_ptr<float>(),
            obb_enable.data_ptr<uint8_t>(),
            n_env_obb.data_ptr<int32_t>(),
            env_query_idx.data_ptr<int32_t>(), max_nobs, batch_size,
            horizon, n_spheres, sweep_steps,
            transform_back);
        }));
      }
      else
      {
        AT_DISPATCH_FLOATING_TYPES(
          distance.scalar_type(), "Swept_SphereObb_clpt", ([&] {
          swept_sphere_obb_distance_jump_kernel<scalar_t,float,  true, false, sum_collisions_>
            << < blocksPerGrid, threadsPerBlock, 0, stream >> > (
            sphere_position.data_ptr<scalar_t>(),
            distance.data_ptr<float>(),
            closest_point.data_ptr<scalar_t>(),
            sparsity_idx.data_ptr<uint8_t>(),
            weight.data_ptr<float>(),
            activation_distance.data_ptr<float>(),
            speed_dt.data_ptr<float>(),
            obb_accel.data_ptr<float>(),
            obb_bounds.data_ptr<float>(),
            obb_pose.data_ptr<float>(),
            obb_enable.data_ptr<uint8_t>(),
            n_env_obb.data_ptr<int32_t>(),
            env_query_idx.data_ptr<int32_t>(), max_nobs, batch_size,
            horizon, n_spheres, sweep_steps,
            transform_back);
        }));
      }
    }
    else
    {
      // TODO: implement this later

      // TODO: call kernel based on flag:
      AT_DISPATCH_FLOATING_TYPES(
        distance.scalar_type(), "SphereObb_collision", ([&] {
        swept_sphere_obb_collision_batch_env_kernel<scalar_t, float>
          << < blocksPerGrid, threadsPerBlock, 0, stream >> > (
          sphere_position.data_ptr<scalar_t>(),
          distance.data_ptr<float>(), weight.data_ptr<float>(),
          activation_distance.data_ptr<float>(),
          obb_accel.data_ptr<float>(),
          obb_bounds.data_ptr<float>(),
          obb_pose.data_ptr<float>(),
          obb_enable.data_ptr<uint8_t>(),
          n_env_obb.data_ptr<int32_t>(),
          env_query_idx.data_ptr<int32_t>(), max_nobs, batch_size,
          horizon, n_spheres, sweep_steps);
      }));
    }
  }
  else
  {
    if (compute_distance)
    {
      if (enable_speed_metric)
      {
        if (sweep_steps == 4)
        {
        AT_DISPATCH_FLOATING_TYPES(
          distance.scalar_type(), "Swept_SphereObb_clpt", ([&] {
          swept_sphere_obb_distance_jump_kernel<scalar_t, float, false, true, sum_collisions_,4>
            << < blocksPerGrid, threadsPerBlock, 0, stream >> > (
            sphere_position.data_ptr<scalar_t>(),
            distance.data_ptr<float>(),
            closest_point.data_ptr<scalar_t>(),
            sparsity_idx.data_ptr<uint8_t>(),
            weight.data_ptr<float>(),
            activation_distance.data_ptr<float>(),
            speed_dt.data_ptr<float>(),
            obb_accel.data_ptr<float>(),
            obb_bounds.data_ptr<float>(),
            obb_pose.data_ptr<float>(),
            obb_enable.data_ptr<uint8_t>(),
            n_env_obb.data_ptr<int32_t>(),
            env_query_idx.data_ptr<int32_t>(), max_nobs, batch_size,
            horizon, n_spheres, sweep_steps,
            transform_back);
        }));
        }
        else
        {


        AT_DISPATCH_FLOATING_TYPES(
          distance.scalar_type(), "Swept_SphereObb_clpt", ([&] {
          swept_sphere_obb_distance_jump_kernel<scalar_t, float, false, true, sum_collisions_>
            << < blocksPerGrid, threadsPerBlock, 0, stream >> > (
            sphere_position.data_ptr<scalar_t>(),
            distance.data_ptr<float>(),
            closest_point.data_ptr<scalar_t>(),
            sparsity_idx.data_ptr<uint8_t>(),
            weight.data_ptr<float>(),
            activation_distance.data_ptr<float>(),
            speed_dt.data_ptr<float>(),
            obb_accel.data_ptr<float>(),
            obb_bounds.data_ptr<float>(),
            obb_pose.data_ptr<float>(),
            obb_enable.data_ptr<uint8_t>(),
            n_env_obb.data_ptr<int32_t>(),
            env_query_idx.data_ptr<int32_t>(), max_nobs, batch_size,
            horizon, n_spheres, sweep_steps,
            transform_back);
        }));
        }
      }
      else
      {
        // This is the best kernel for now
        AT_DISPATCH_FLOATING_TYPES(
          distance.scalar_type(), "Swept_SphereObb_clpt", ([&] {
          swept_sphere_obb_distance_jump_kernel<scalar_t,float, false, false, sum_collisions_>
            << < blocksPerGrid, threadsPerBlock, 0, stream >> > (
            sphere_position.data_ptr<scalar_t>(),
            distance.data_ptr<float>(),
            closest_point.data_ptr<scalar_t>(),
            sparsity_idx.data_ptr<uint8_t>(),
            weight.data_ptr<float>(),
            activation_distance.data_ptr<float>(),
            speed_dt.data_ptr<float>(),
            obb_accel.data_ptr<float>(),
            obb_bounds.data_ptr<float>(),
            obb_pose.data_ptr<float>(),
            obb_enable.data_ptr<uint8_t>(),
            n_env_obb.data_ptr<int32_t>(),
            env_query_idx.data_ptr<int32_t>(), max_nobs, batch_size,
            horizon, n_spheres, sweep_steps,
            transform_back);
        }));
      }
    }
    else
    {
      AT_DISPATCH_FLOATING_TYPES(
        distance.scalar_type(), "SphereObb_collision", ([&] {
        swept_sphere_obb_collision_kernel<scalar_t, float>
          << < blocksPerGrid, threadsPerBlock, 0, stream >> > (
          sphere_position.data_ptr<scalar_t>(),
          distance.data_ptr<float>(), weight.data_ptr<float>(),
          activation_distance.data_ptr<float>(),
          obb_accel.data_ptr<float>(),
          obb_bounds.data_ptr<float>(),
          obb_pose.data_ptr<float>(),
          obb_enable.data_ptr<uint8_t>(),
          n_env_obb.data_ptr<int32_t>(), max_nobs, batch_size,
          horizon, n_spheres, sweep_steps);
      }));
    }
  }
  }
  else
  {
    const bool sum_collisions_ = true;
  if (use_batch_env)
  {
    if (compute_distance)
    {
      if (enable_speed_metric)
      {
        // This is the best kernel for now
        AT_DISPATCH_FLOATING_TYPES(
          distance.scalar_type(), "Swept_SphereObb_clpt", ([&] {
          swept_sphere_obb_distance_jump_kernel<scalar_t, float, true, true, sum_collisions_>
            << < blocksPerGrid, threadsPerBlock, 0, stream >> > (
            sphere_position.data_ptr<scalar_t>(),
            distance.data_ptr<float>(),
            closest_point.data_ptr<scalar_t>(),
            sparsity_idx.data_ptr<uint8_t>(),
            weight.data_ptr<float>(),
            activation_distance.data_ptr<float>(),
            speed_dt.data_ptr<float>(),
            obb_accel.data_ptr<float>(),
            obb_bounds.data_ptr<float>(),
            obb_pose.data_ptr<float>(),
            obb_enable.data_ptr<uint8_t>(),
            n_env_obb.data_ptr<int32_t>(),
            env_query_idx.data_ptr<int32_t>(), max_nobs, batch_size,
            horizon, n_spheres, sweep_steps,
            transform_back);
        }));
      }
      else
      {
        // This is the best kernel for now
        AT_DISPATCH_FLOATING_TYPES(
          distance.scalar_type(), "Swept_SphereObb_clpt", ([&] {
          swept_sphere_obb_distance_jump_kernel<scalar_t, float, true, false, sum_collisions_>
            << < blocksPerGrid, threadsPerBlock, 0, stream >> > (
            sphere_position.data_ptr<scalar_t>(),
            distance.data_ptr<float>(),
            closest_point.data_ptr<scalar_t>(),
            sparsity_idx.data_ptr<uint8_t>(),
            weight.data_ptr<float>(),
            activation_distance.data_ptr<float>(),
            speed_dt.data_ptr<float>(),
            obb_accel.data_ptr<float>(),
            obb_bounds.data_ptr<float>(),
            obb_pose.data_ptr<float>(),
            obb_enable.data_ptr<uint8_t>(),
            n_env_obb.data_ptr<int32_t>(),
            env_query_idx.data_ptr<int32_t>(), max_nobs, batch_size,
            horizon, n_spheres, sweep_steps,
            transform_back);
        }));
      }
    }
    else
    {
      // TODO: implement this later

      // TODO: call kernel based on flag:
      AT_DISPATCH_FLOATING_TYPES(
        distance.scalar_type(), "SphereObb_collision", ([&] {
        swept_sphere_obb_collision_batch_env_kernel<scalar_t, float>
          << < blocksPerGrid, threadsPerBlock, 0, stream >> > (
          sphere_position.data_ptr<scalar_t>(),
          distance.data_ptr<float>(), weight.data_ptr<float>(),
          activation_distance.data_ptr<float>(),
          obb_accel.data_ptr<float>(),
          obb_bounds.data_ptr<float>(),
          obb_pose.data_ptr<float>(),
          obb_enable.data_ptr<uint8_t>(),
          n_env_obb.data_ptr<int32_t>(),
          env_query_idx.data_ptr<int32_t>(), max_nobs, batch_size,
          horizon, n_spheres, sweep_steps);
      }));
    }
  }
  else
  {
    if (compute_distance)
    {
      if (enable_speed_metric)
      {
        // This is the best kernel for now
        AT_DISPATCH_FLOATING_TYPES(
          distance.scalar_type(), "Swept_SphereObb_clpt", ([&] {
          swept_sphere_obb_distance_jump_kernel<scalar_t,float,  false, true, sum_collisions_>
            << < blocksPerGrid, threadsPerBlock, 0, stream >> > (
            sphere_position.data_ptr<scalar_t>(),
            distance.data_ptr<float>(),
            closest_point.data_ptr<scalar_t>(),
            sparsity_idx.data_ptr<uint8_t>(),
            weight.data_ptr<float>(),
            activation_distance.data_ptr<float>(),
            speed_dt.data_ptr<float>(),
            obb_accel.data_ptr<float>(),
            obb_bounds.data_ptr<float>(),
            obb_pose.data_ptr<float>(),
            obb_enable.data_ptr<uint8_t>(),
            n_env_obb.data_ptr<int32_t>(),
            env_query_idx.data_ptr<int32_t>(), max_nobs, batch_size,
            horizon, n_spheres, sweep_steps,
            transform_back);
        }));
      }
      else
      {
        // This is the best kernel for now
        AT_DISPATCH_FLOATING_TYPES(
          distance.scalar_type(), "Swept_SphereObb_clpt", ([&] {
          swept_sphere_obb_distance_jump_kernel<scalar_t, float, false, false, sum_collisions_>
            << < blocksPerGrid, threadsPerBlock, 0, stream >> > (
            sphere_position.data_ptr<scalar_t>(),
            distance.data_ptr<float>(),
            closest_point.data_ptr<scalar_t>(),
            sparsity_idx.data_ptr<uint8_t>(),
            weight.data_ptr<float>(),
            activation_distance.data_ptr<float>(),
            speed_dt.data_ptr<float>(),
            obb_accel.data_ptr<float>(),
            obb_bounds.data_ptr<float>(),
            obb_pose.data_ptr<float>(),
            obb_enable.data_ptr<uint8_t>(),
            n_env_obb.data_ptr<int32_t>(),
            env_query_idx.data_ptr<int32_t>(), max_nobs, batch_size,
            horizon, n_spheres, sweep_steps,
            transform_back);
        }));
      }
    }
    else
    {
      AT_DISPATCH_FLOATING_TYPES(
        distance.scalar_type(), "SphereObb_collision", ([&] {
        swept_sphere_obb_collision_kernel<scalar_t, float>
          << < blocksPerGrid, threadsPerBlock, 0, stream >> > (
          sphere_position.data_ptr<scalar_t>(),
          distance.data_ptr<float>(), weight.data_ptr<float>(),
          activation_distance.data_ptr<float>(),
          obb_accel.data_ptr<float>(),
          obb_bounds.data_ptr<float>(),
          obb_pose.data_ptr<float>(),
          obb_enable.data_ptr<uint8_t>(),
          n_env_obb.data_ptr<int32_t>(), max_nobs, batch_size,
          horizon, n_spheres, sweep_steps);
      }));
    }
  }

  }

  C10_CUDA_KERNEL_LAUNCH_CHECK();

  return { distance, closest_point, sparsity_idx }; // , debug_data};
}


std::vector<torch::Tensor>
sphere_voxel_clpt(const torch::Tensor sphere_position, // batch_size, 3
                torch::Tensor distance,
                torch::Tensor closest_point,         // batch size, 3
                torch::Tensor sparsity_idx, const torch::Tensor weight,
                const torch::Tensor activation_distance,
                const torch::Tensor max_distance,
                const torch::Tensor grid_features,       // n_boxes, 4, 4
                const torch::Tensor grid_params,      // n_boxes, 3
                const torch::Tensor obb_pose,        // n_boxes, 4, 4
                const torch::Tensor obb_enable,      // n_boxes, 4, 4
                const torch::Tensor n_env_obb,
                const torch::Tensor env_query_idx,   // n_boxes, 4, 4
                const int max_nobs, const int batch_size, const int horizon,
                const int n_spheres, const bool transform_back,
                const bool compute_distance, const bool use_batch_env,
                const bool sum_collisions,
                const bool compute_esdf)
{
  using namespace Curobo::Geometry;
  cudaStream_t stream      = at::cuda::getCurrentCUDAStream();
  const int    bnh_spheres = n_spheres * batch_size * horizon; //
  const bool scale_metric = true;
  const int num_voxels = grid_features.size(-2);
  int threadsPerBlock = bnh_spheres;

  if (threadsPerBlock > 128)
  {
    threadsPerBlock = 128;
  }
  // bfloat16
  //ScalarType::Float8_e4m3fn

  int blocksPerGrid = (bnh_spheres + threadsPerBlock - 1) / threadsPerBlock;


  AT_DISPATCH_FLOATING_TYPES_AND2(torch::kBFloat16, FP8_TYPE_MACRO,
  grid_features.scalar_type(), "SphereVoxel_clpt", ([&]
  {

    auto kernel_esdf = sphere_voxel_distance_kernel<scalar_t, float, float, float, false, false, true, 100>;
    auto kernel_distance_1 =  sphere_voxel_distance_kernel<scalar_t, float, float, float, false, scale_metric, false, 1>;
    auto kernel_distance_2 =  sphere_voxel_distance_kernel<scalar_t, float, float, float, false, scale_metric, false, 2>;
    auto kernel_distance_3 =  sphere_voxel_distance_kernel<scalar_t, float, float, float, false, scale_metric, false, 3>;
    auto kernel_distance_4 =  sphere_voxel_distance_kernel<scalar_t, float, float, float, false, scale_metric, false, 4>;

    auto kernel_distance_n =  sphere_voxel_distance_kernel<scalar_t, float, float, float, false, scale_metric, false, 100>;
    auto selected_kernel = kernel_distance_n;
    if (compute_esdf)
    {
      selected_kernel = kernel_esdf;

    }
    else
    {
      switch (max_nobs){
        case 1:
          selected_kernel = kernel_distance_1;
          break;
        case 2:
          selected_kernel = kernel_distance_2;
          break;
        case 3:
          selected_kernel = kernel_distance_3;
          break;
        case 4:
          selected_kernel = kernel_distance_4;
          break;
        default:
          selected_kernel = kernel_distance_n;
          break;
      }
    }

      selected_kernel
        << < blocksPerGrid, threadsPerBlock, 0, stream >> > (
        sphere_position.data_ptr<float>(),
        distance.data_ptr<float>(),
        closest_point.data_ptr<float>(),
        sparsity_idx.data_ptr<uint8_t>(),
        weight.data_ptr<float>(),
        activation_distance.data_ptr<float>(),
        max_distance.data_ptr<float>(),
        grid_features.data_ptr<scalar_t>(),
        grid_params.data_ptr<float>(),
        obb_pose.data_ptr<float>(),
        obb_enable.data_ptr<uint8_t>(),
        n_env_obb.data_ptr<int32_t>(),
        env_query_idx.data_ptr<int32_t>(),
        max_nobs,
        num_voxels,
        batch_size,
        horizon, n_spheres, transform_back);
    }));


  C10_CUDA_KERNEL_LAUNCH_CHECK();

  return { distance, closest_point, sparsity_idx };
}


std::vector<torch::Tensor>
swept_sphere_voxel_clpt(const torch::Tensor sphere_position, // batch_size, 3
                torch::Tensor distance,
                torch::Tensor closest_point,         // batch size, 3
                torch::Tensor sparsity_idx, const torch::Tensor weight,
                const torch::Tensor activation_distance,
                const torch::Tensor max_distance,
                const torch::Tensor speed_dt,
                const torch::Tensor grid_features,       // n_boxes, 4, 4
                const torch::Tensor grid_params,      // n_boxes, 3
                const torch::Tensor grid_pose,        // n_boxes, 4, 4
                const torch::Tensor grid_enable,      // n_boxes, 4, 4
                const torch::Tensor n_env_grid,
                const torch::Tensor env_query_idx,   // n_boxes, 4, 4
                const int max_nobs,
                const int batch_size,
                const int horizon,
                const int n_spheres,
                const int sweep_steps,
                const bool enable_speed_metric,
                const bool transform_back,
                const bool compute_distance,
                const bool use_batch_env,
                const bool sum_collisions)
{
  using namespace Curobo::Geometry;

  const at::cuda::OptionalCUDAGuard guard(sphere_position.device());
  CHECK_INPUT(sphere_position);
  CHECK_INPUT(distance);
  CHECK_INPUT(closest_point);
  CHECK_INPUT(sparsity_idx);
  CHECK_INPUT(weight);
  CHECK_INPUT(activation_distance);
  CHECK_INPUT(max_distance);
  CHECK_INPUT(speed_dt);
  CHECK_INPUT(grid_features);
  CHECK_INPUT(grid_params);
  CHECK_INPUT(grid_pose);
  CHECK_INPUT(grid_enable);
  CHECK_INPUT(n_env_grid);
  CHECK_INPUT(env_query_idx);
  cudaStream_t stream      = at::cuda::getCurrentCUDAStream();
  const int    bnh_spheres = n_spheres * batch_size * horizon; //
  const bool scale_metric = true;
  const int num_voxels = grid_features.size(-2);
  int threadsPerBlock = bnh_spheres;

  if (threadsPerBlock > 128)
  {
    threadsPerBlock = 128;
  }
  // bfloat16
  //ScalarType::Float8_e4m3fn

  int blocksPerGrid = (bnh_spheres + threadsPerBlock - 1) / threadsPerBlock;

  AT_DISPATCH_FLOATING_TYPES_AND2(torch::kBFloat16, FP8_TYPE_MACRO,
      grid_features.scalar_type(), "SphereVoxel_clpt", ([&] {

      auto collision_kernel_n = swept_sphere_voxel_distance_jump_kernel<scalar_t, float, float, false, scale_metric, true, false, 100>;
      auto collision_kernel_1 = swept_sphere_voxel_distance_jump_kernel<scalar_t, float, float, false, scale_metric, true, false, 1>;
      auto collision_kernel_2 = swept_sphere_voxel_distance_jump_kernel<scalar_t, float, float, false, scale_metric, true, false, 2>;
      auto collision_kernel_3 = swept_sphere_voxel_distance_jump_kernel<scalar_t, float, float, false, scale_metric, true, false, 3>;
      auto collision_kernel_4 = swept_sphere_voxel_distance_jump_kernel<scalar_t, float, float, false, scale_metric, true, false, 4>;
      auto selected_kernel = collision_kernel_n;
      switch (max_nobs){
        case 1:
          selected_kernel = collision_kernel_1;
          break;
        case 2:
          selected_kernel = collision_kernel_2;
          break;
        case 3:
          selected_kernel = collision_kernel_3;
          break;
        case 4:
          selected_kernel = collision_kernel_4;
          break;
        default:
          selected_kernel = collision_kernel_n;
      }

      selected_kernel<< < blocksPerGrid, threadsPerBlock, 0, stream >> > (
        sphere_position.data_ptr<float>(),
        distance.data_ptr<float>(),
        closest_point.data_ptr<float>(),
        sparsity_idx.data_ptr<uint8_t>(),
        weight.data_ptr<float>(),
        activation_distance.data_ptr<float>(),
        max_distance.data_ptr<float>(),
        speed_dt.data_ptr<float>(),
        grid_features.data_ptr<scalar_t>(),
        grid_params.data_ptr<float>(),
        grid_pose.data_ptr<float>(),
        grid_enable.data_ptr<uint8_t>(),
        n_env_grid.data_ptr<int32_t>(),
        env_query_idx.data_ptr<int32_t>(),
        max_nobs,
        num_voxels,
        batch_size,
        horizon, n_spheres, sweep_steps, transform_back);
    }));


  C10_CUDA_KERNEL_LAUNCH_CHECK();

  return { distance, closest_point, sparsity_idx };
}