constrained planning, robot segmentation

2024-02-22 21:45:47 -08:00
parent 88eac64edc
commit bafdf80c05
102 changed files with 12440 additions and 8112 deletions
--- a/src/curobo/init.py
+++ b/src/curobo/init.py
@@ -9,7 +9,22 @@
 # its affiliates is strictly prohibited.
 #

-"""CuRobo package."""
+"""
+cuRobo package is split into several modules:
+
+- :mod:`curobo.opt` contains optimization solvers.
+- :mod:`curobo.cuda_robot_model` contains robot kinematics.
+- :mod:`curobo.curobolib` contains the cuda kernels and python bindings for them.
+- :mod:`curobo.geom` contains geometry processing, collision checking and frame transforms.
+- :mod:`curobo.graph` contains geometric planning with graph search methods.
+- :mod:`curobo.rollout` contains methods that map actions to costs. This class wraps instances of
+  :mod:`curobo.cuda_robot_model` and :mod:`geom` to compute costs given trajectory of actions.
+- :mod:`curobo.util` contains utility methods.
+- :mod:`curobo.wrap` adds the user-level api for task programming. Includes implementation of 
+  collision-free reacher and batched robot world collision checking.
+- :mod:`curobo.types` contains custom dataclasses for common data types in robotics, including 
+  :meth:`types.state.JointState`, :meth:`types.camera.CameraObservation`, :meth:`types.math.Pose`.
+"""


 # NOTE (roflaherty): This is inspired by how matplotlib does creates its version value.
--- a/src/curobo/content/assets/robot/franka_description/franka_panda.urdf
+++ b/src/curobo/content/assets/robot/franka_description/franka_panda.urdf
@@ -172,7 +172,7 @@
    <dynamics damping="10.0"/>
    <limit effort="12" lower="-2.8973" upper="2.8973" velocity="2.6100"/>
  </joint>
-  <!--
+  
  <link name="panda_link8"/>
  <joint name="panda_joint8" type="fixed">
    <origin rpy="0 0 0" xyz="0 0 0.107"/>
@@ -180,18 +180,11 @@
    <child link="panda_link8"/>
    <axis xyz="0 0 0"/>
  </joint>
-  Removing this joint seems to help with some stability things
-  -->
  <joint name="panda_hand_joint" type="fixed">
-    <!--
    <parent link="panda_link8"/>
-    -->
-    <parent link="panda_link7"/>
    <child link="panda_hand"/>
-    <origin rpy="0 0 -0.785398163397" xyz="0 0 0.107"/>
-    <!--
+    
    <origin rpy="0 0 -0.785398163397" xyz="0 0 0"/>
-    -->
  </joint>
  <link name="panda_hand">
    <visual>
--- a/src/curobo/content/configs/robot/franka.yml
+++ b/src/curobo/content/configs/robot/franka.yml
@@ -13,7 +13,7 @@ robot_cfg:
  kinematics:
    use_usd_kinematics: False
    isaac_usd_path: "/Isaac/Robots/Franka/franka.usd"
-    usd_path: "robot/non_shipping/franka/franka_panda_meters.usda"
+    usd_path: "robot/franka/franka_panda.usda"
    usd_robot_root: "/panda"
    usd_flip_joints: ["panda_joint1","panda_joint2","panda_joint3","panda_joint4", "panda_joint5",
      "panda_joint6","panda_joint7","panda_finger_joint1", "panda_finger_joint2"]
@@ -32,7 +32,7 @@ robot_cfg:
    usd_flip_joint_limits: ["panda_finger_joint2"]
    urdf_path: "robot/franka_description/franka_panda.urdf"
    asset_root_path: "robot/franka_description"
-    base_link: "base_link"
+    base_link: "panda_link0"
    ee_link: "panda_hand"
    collision_link_names:
      [
@@ -50,7 +50,7 @@ robot_cfg:
        "attached_object",
      ]
    collision_spheres: "spheres/franka_mesh.yml"
-    collision_sphere_buffer: 0.0025 
+    collision_sphere_buffer: 0.004 # 0.0025
    extra_collision_spheres: {"attached_object": 4}
    use_global_cumul: True
    self_collision_ignore:
--- a/src/curobo/content/configs/robot/franka_mobile.yml
+++ b/src/curobo/content/configs/robot/franka_mobile.yml
@@ -109,7 +109,7 @@ robot_cfg:
        "base_x", "base_y", "base_z",
        "panda_joint1","panda_joint2","panda_joint3","panda_joint4", "panda_joint5",
      "panda_joint6","panda_joint7","panda_finger_joint1", "panda_finger_joint2"]
-      retract_config: [0,0,0,0.0, -1.3, 0.0, -2.5, 0.0, 1.0, 0., 0.04, 0.04]
+      retract_config: [0,0,0,0, -1.3, 0.0, -2.5, 0.0, 1.0, 0., 0.04, 0.04]
      null_space_weight: [1,1,1,1,1,1,1,1,1,1,1,1]
      cspace_distance_weight: [1,1,1,1,1,1,1,1,1,1,1,1]
      max_acceleration: 15.0 #15.0
--- a/src/curobo/content/configs/robot/tm12.yml
+++ b/src/curobo/content/configs/robot/tm12.yml
@@ -12,7 +12,7 @@
 robot_cfg:
  kinematics:
    use_usd_kinematics: False
-    usd_path: "robot/non_shipping/techman/tm12.usd"
+    usd_path: "robot/techman/tm12.usd"
    usd_robot_root: "/tm12"
    usd_flip_joints:
      {
--- a/src/curobo/content/configs/robot/ur5e.yml
+++ b/src/curobo/content/configs/robot/ur5e.yml
@@ -49,7 +49,7 @@ robot_cfg:
        - "center": [-0.055, 0.008, 0.14]
          "radius": 0.07
        - "center": [-0.001, -0.002, 0.143]
-          "radius": 0.076
+          "radius": 0.08
      forearm_link:
        - "center": [-0.01, 0.002, 0.031]
          "radius": 0.072
--- a/src/curobo/content/configs/task/finetune_trajopt.yml
+++ b/src/curobo/content/configs/task/finetune_trajopt.yml
@@ -31,22 +31,23 @@ model:
 cost:
  pose_cfg:
    vec_weight: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0] # orientation, position for all timesteps
-    run_vec_weight: [1.00,1.00,1.00,1.0,1.0,1.0] # running weight orientation, position
-    weight: [2000,500000.0,30,50] #[150.0, 2000.0, 30, 40]
+    run_vec_weight: [0.00,0.00,0.00,0.0,0.0,0.0] # running weight orientation, position
+    weight: [2000,500000.0,30,50] 
    vec_convergence: [0.0,0.0] # orientation, position, orientation metric activation, position metric activation
    terminal: True
-    run_weight: 0.0 #0.05
+    run_weight: 1.0 
    use_metric: True

  link_pose_cfg:
    vec_weight: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0] # orientation, position for all timesteps
-    run_vec_weight: [1.00,1.00,1.00,1.0,1.0,1.0] # running weight orientation, position
+    run_vec_weight: [0.00,0.00,0.00,0.0,0.0,0.0] # running weight orientation, position
    weight: [2000,500000.0,30,50] #[150.0, 2000.0, 30, 40]
    vec_convergence: [0.0,0.0] # orientation, position, orientation metric activation, position metric activation
    terminal: True
-    run_weight: 0.0
+    run_weight: 0.001
    use_metric: True

+  
  cspace_cfg:
    weight: 10000.0
    terminal: True
@@ -54,7 +55,7 @@ cost:

  bound_cfg:
    weight: [5000.0, 50000.0, 50000.0, 50000.0] # needs to be 3 values
-    smooth_weight: [0.0,2000.0, 50.0, 0.0]  # [vel, acc, jerk,] 
+    smooth_weight: [0.0,5000.0, 50.0, 0.0]  # [vel, acc, jerk,] 
    run_weight_velocity: 0.0
    run_weight_acceleration: 1.0
    run_weight_jerk: 1.0
@@ -78,11 +79,11 @@ cost:

 
 lbfgs:
-  n_iters: 400 
+  n_iters: 400 # 400 
  inner_iters: 25 
-  cold_start_n_iters: 200 
+  cold_start_n_iters: null 
  min_iters: 25
-  line_search_scale: [0.01, 0.3,0.7,1.0]   #   #
+  line_search_scale: [0.01, 0.3, 0.7,1.0]   #   #
  fixed_iters: True
  cost_convergence: 0.01
  cost_delta_threshold: 1.0
@@ -90,7 +91,7 @@ lbfgs:
  epsilon: 0.01
  history: 15 #15
  use_cuda_graph: True
-  n_envs: 1
+  n_problems: 1
  store_debug: False
  use_cuda_kernel: True
  stable_mode: True
--- a/src/curobo/content/configs/task/finetune_trajopt_slow.yml
+++ b/src/curobo/content/configs/task/finetune_trajopt_slow.yml
@@ -0,0 +1,109 @@
+
+##
+## Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+##
+## NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
+## property and proprietary rights in and to this material, related
+## documentation and any modifications thereto. Any use, reproduction,
+## disclosure or distribution of this material and related documentation
+## without an express license agreement from NVIDIA CORPORATION or
+## its affiliates is strictly prohibited.
+##
+
+model:
+  horizon: 32
+  state_filter_cfg:
+    filter_coeff:
+      position: 1.0
+      velocity: 1.0
+      acceleration: 1.0
+    enable: False
+  dt_traj_params:
+    base_dt: 0.2
+    base_ratio: 1.0
+    max_dt: 0.2
+  vel_scale: 1.0
+  control_space: 'POSITION'
+  state_finite_difference_mode: "CENTRAL" 
+
+  teleport_mode: False
+  return_full_act_buffer: True
+cost:
+  pose_cfg:
+    vec_weight: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0] # orientation, position for all timesteps
+    run_vec_weight: [1.00,1.00,1.00,1.0,1.0,1.0] # running weight orientation, position
+    weight: [2000,500000.0,30,50] #[150.0, 2000.0, 30, 40]
+    vec_convergence: [0.0,0.0] # orientation, position, orientation metric activation, position metric activation
+    terminal: True
+    run_weight: 0.0 #0.05
+    use_metric: True
+
+  link_pose_cfg:
+    vec_weight: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0] # orientation, position for all timesteps
+    run_vec_weight: [1.00,1.00,1.00,1.0,1.0,1.0] # running weight orientation, position
+    weight: [2000,500000.0,30,50] #[150.0, 2000.0, 30, 40]
+    vec_convergence: [0.0,0.0] # orientation, position, orientation metric activation, position metric activation
+    terminal: True
+    run_weight: 0.001
+    use_metric: True
+
+  
+  cspace_cfg:
+    weight: 10000.0
+    terminal: True
+    run_weight: 0.00 #1
+
+  bound_cfg:
+    weight: [5000.0, 50000.0, 50000.0, 50000.0] # needs to be 3 values
+    
+    smooth_weight: [0.0,50000.0, 500.0, 0.0]  # [vel, acc, jerk,] 
+    
+    run_weight_velocity: 0.0
+    run_weight_acceleration: 1.0
+    run_weight_jerk: 1.0
+    activation_distance: [0.1,0.1,0.1,10.0] # for position, velocity, acceleration and jerk
+    null_space_weight: [0.0]
+
+  primitive_collision_cfg:
+    weight: 1000000.0
+    use_sweep: True
+    sweep_steps: 6
+    classify: False
+    use_sweep_kernel: True
+    use_speed_metric: True
+    speed_dt: 0.01 # used only for speed metric
+    activation_distance: 0.025
+
+  self_collision_cfg:
+    weight: 5000.0
+    classify: False
+  
+
+ 
+lbfgs:
+  n_iters: 400 # 400 
+  inner_iters: 25 
+  cold_start_n_iters: 200 
+  min_iters: 25
+  line_search_scale: [0.01, 0.3, 0.7,1.0]   #   #
+  fixed_iters: True
+  cost_convergence: 0.01
+  cost_delta_threshold: 1.0
+  cost_relative_threshold: 0.999 #0.999
+  epsilon: 0.01
+  history: 15 #15
+  use_cuda_graph: True
+  n_problems: 1
+  store_debug: False
+  use_cuda_kernel: True
+  stable_mode: True
+  line_search_type: "approx_wolfe"
+  use_cuda_line_search_kernel: True
+  use_cuda_update_best_kernel: True
+  use_temporal_smooth: False
+  sync_cuda_time: True
+  step_scale: 1.0
+  last_best: 10
+  use_coo_sparse: True
+  debug_info:
+    visual_traj       : null #'ee_pos_seq'
--- a/src/curobo/content/configs/task/gradient_ik.yml
+++ b/src/curobo/content/configs/task/gradient_ik.yml
@@ -32,6 +32,7 @@ cost:
    vec_convergence: [0.0, 0.00] 
    terminal: False
    use_metric: True
+    run_weight: 1.0
  link_pose_cfg:
    vec_weight: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
    weight: [1000, 20000, 30, 50]
@@ -58,7 +59,7 @@ cost:
 lbfgs:
  n_iters: 100 #60
  inner_iters: 25
-  cold_start_n_iters: 100
+  cold_start_n_iters: null
  min_iters: 20
  line_search_scale:  [0.01, 0.3, 0.7,  1.0]
  fixed_iters: True
@@ -69,7 +70,7 @@ lbfgs:
  history: 6
  horizon: 1
  use_cuda_graph: True
-  n_envs: 1
+  n_problems: 1
  store_debug: False
  use_cuda_kernel: True
  stable_mode: True
--- a/src/curobo/content/configs/task/gradient_mpc.yml
+++ b/src/curobo/content/configs/task/gradient_mpc.yml
@@ -83,7 +83,7 @@ lbfgs:
  epsilon: 0.01
  history: 15 #15
  use_cuda_graph: True
-  n_envs: 1
+  n_problems: 1
  store_debug: False
  use_cuda_kernel: True
  stable_mode: True
--- a/src/curobo/content/configs/task/gradient_trajopt.yml
+++ b/src/curobo/content/configs/task/gradient_trajopt.yml
@@ -31,11 +31,11 @@ model:
 cost:
  pose_cfg:
    vec_weight: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0] # orientation, position for all timesteps
-    run_vec_weight: [1.00,1.00,1.00,1.0,1.0,1.0] # running weight orientation, position
+    run_vec_weight: [0.00,0.00,0.00,0.0,0.0,0.0] # running weight orientation, position
    weight: [2000,50000.0,30,60] #[150.0, 2000.0, 30, 40]
    vec_convergence: [0.0,0.0] # orientation, position, orientation metric activation, position metric activation
    terminal: True
-    run_weight: 0.00
+    run_weight: 1.0
    use_metric: True

  link_pose_cfg:
@@ -44,7 +44,7 @@ cost:
    weight: [2000,50000.0,30,50] #[150.0, 2000.0, 30, 40]
    vec_convergence: [0.0,0.0] # orientation, position, orientation metric activation, position metric activation
    terminal: True
-    run_weight: 0.0
+    run_weight: 0.001
    use_metric: True

  cspace_cfg:
@@ -54,7 +54,9 @@ cost:

  bound_cfg:
    weight: [5000.0, 50000.0, 50000.0,50000.0] # needs to be 3 values
+    #weight: [000.0, 000.0, 000.0,000.0]
    smooth_weight: [0.0,10000.0,10.0, 0.0]  # [vel, acc, jerk, alpha_v-not-used]
+    #smooth_weight: [0.0,0000.0,0.0, 0.0]  # [vel, acc, jerk, alpha_v-not-used]
    run_weight_velocity: 0.00
    run_weight_acceleration: 1.0
    run_weight_jerk: 1.0
@@ -79,9 +81,9 @@ cost:

 
 lbfgs:
-  n_iters: 175 #175
+  n_iters: 125 #175
  inner_iters: 25 
-  cold_start_n_iters: 150 
+  cold_start_n_iters: null 
  min_iters: 25
  line_search_scale: [0.01,0.3,0.7,1.0]  #[0.01,0.2, 0.3,0.5,0.7,0.9, 1.0] #
  fixed_iters: True
@@ -91,7 +93,7 @@ lbfgs:
  epsilon: 0.01
  history: 15
  use_cuda_graph: True
-  n_envs: 1
+  n_problems: 1
  store_debug: False
  use_cuda_kernel: True
  stable_mode: True
--- a/src/curobo/content/configs/task/particle_ik.yml
+++ b/src/curobo/content/configs/task/particle_ik.yml
@@ -31,6 +31,7 @@ cost:
    vec_convergence: [0.00, 0.000] # orientation, position
    terminal: False
    use_metric: True
+    run_weight: 1.0
  link_pose_cfg:
    vec_weight: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
    weight: [30, 50, 10, 10] #[20.0, 100.0]
@@ -56,7 +57,7 @@ mppi:
  init_cov          : 1.0 #0.15 #.5 #.5
  gamma             : 1.0
  n_iters           : 4
-  cold_start_n_iters: 4
+  cold_start_n_iters: null
  step_size_mean    : 0.9
  step_size_cov     : 0.1
  beta              : 0.01
@@ -69,12 +70,12 @@ mppi:
  sample_mode       : 'BEST'
  base_action       : 'REPEAT'
  squash_fn         : 'CLAMP'
-  n_envs            : 1
+  n_problems            : 1
  use_cuda_graph    : True
  seed              : 0
  store_debug       : False
  random_mean       : False
-  sample_per_env    : True
+  sample_per_problem: True
  sync_cuda_time    : False
  use_coo_sparse    : True
  sample_params:
--- a/src/curobo/content/configs/task/particle_mpc.yml
+++ b/src/curobo/content/configs/task/particle_mpc.yml
@@ -89,12 +89,12 @@ mppi:
  sample_mode       : 'BEST'
  base_action       : 'REPEAT'
  squash_fn         : 'CLAMP'
-  n_envs            : 1
+  n_problems        : 1
  use_cuda_graph    : True
  seed              : 0
  store_debug       : False
  random_mean       : True
-  sample_per_env    : False
+  sample_per_problem: False
  sync_cuda_time    : True
  use_coo_sparse    : True
  sample_params:
--- a/src/curobo/content/configs/task/particle_trajopt.yml
+++ b/src/curobo/content/configs/task/particle_trajopt.yml
@@ -30,17 +30,17 @@ model:
 cost:
  pose_cfg:
    vec_weight: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
-    run_vec_weight: [1.0,1.0,1.0,1.0,1.0,1.0] # running weight 
+    run_vec_weight: [0.0,0.0,0.0,0.0,0.0,0.0] # running weight 
    weight: [250.0, 5000.0, 20, 20]
    vec_convergence: [0.0,0.0,1000.0,1000.0] 
    terminal: True
-    run_weight: 0.00 
+    run_weight: 1.0
    use_metric: True
  
  link_pose_cfg:
    vec_weight: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
-    run_vec_weight: [1.0,1.0,1.0,1.0,1.0,1.0] # running weight 
-    weight: [250.0, 5000.0, 40, 40]
+    run_vec_weight: [0.00,0.00,0.00,0.0,0.0,0.0] # running weight orientation, position
+    weight: [0.0, 5000.0, 40, 40]
    vec_convergence: [0.0,0.0,1000.0,1000.0] 
    terminal: True
    run_weight: 0.00 
@@ -54,10 +54,12 @@ cost:
  bound_cfg:
    weight: [0.1, 0.1,0.0,0.0]
    activation_distance: [0.0,0.0,0.0,0.0] #-#0.01
+    #smooth_weight: [0.0,100.0,1.0,0.0]
    smooth_weight: [0.0,20.0,0.0,0.0]
    run_weight_velocity: 0.0
    run_weight_acceleration: 1.0
    run_weight_jerk: 1.0
+    null_space_weight: [0.00]

  primitive_collision_cfg:
    weight: 5000.0
@@ -80,7 +82,7 @@ mppi:
  init_cov          : 0.5
  gamma             : 1.0
  n_iters           : 2
-  cold_start_n_iters: 2
+  cold_start_n_iters: null
  step_size_mean    : 0.9
  step_size_cov     : 0.01
  beta              : 0.01
@@ -93,12 +95,12 @@ mppi:
  sample_mode       : 'BEST'
  base_action       : 'REPEAT'
  squash_fn         : 'CLAMP'
-  n_envs            : 1
+  n_problems            : 1
  use_cuda_graph    : True
  seed              : 0
  store_debug       : False
  random_mean       : True
-  sample_per_env    : True
+  sample_per_problem: True
  sync_cuda_time    : False
  use_coo_sparse    : True
  sample_params:
--- a/src/curobo/content/configs/world/collision_table.yml
+++ b/src/curobo/content/configs/world/collision_table.yml
@@ -11,4 +11,4 @@
 cuboid:
  table:
    dims: [5.0, 5.0, 0.2] # x, y, z
-    pose: [0.0, 0.0, -0.1, 1, 0, 0, 0.0] # x, y, z, qx, qy, qz, qw
+    pose: [0.0, 0.0, -0.1, 1, 0, 0, 0.0] # x, y, z, qx, qy, qz, qw                                                                                                                                                                                                                                                            
--- a/src/curobo/cuda_robot_model/cuda_robot_generator.py
+++ b/src/curobo/cuda_robot_model/cuda_robot_generator.py
@@ -145,6 +145,9 @@ class CudaRobotGeneratorConfig:
    #: cspace config
    cspace: Union[None, CSpaceConfig, Dict[str, List[Any]]] = None

+    #: Enable loading meshes from kinematics parser.
+    load_meshes: bool = False
+
    def __post_init__(self):
        # add root path:
        # Check if an external asset path is provided:
@@ -283,7 +286,10 @@ class CudaRobotGenerator(CudaRobotGeneratorConfig):
            )
        else:
            self._kinematics_parser = UrdfKinematicsParser(
-                self.urdf_path, mesh_root=self.asset_root_path, extra_links=self.extra_links
+                self.urdf_path,
+                mesh_root=self.asset_root_path,
+                extra_links=self.extra_links,
+                load_meshes=self.load_meshes,
            )

        if self.lock_joints is None:
--- a/src/curobo/cuda_robot_model/cuda_robot_model.py
+++ b/src/curobo/cuda_robot_model/cuda_robot_model.py
@@ -30,7 +30,7 @@ from curobo.cuda_robot_model.types import (
    SelfCollisionKinematicsConfig,
 )
 from curobo.curobolib.kinematics import get_cuda_kinematics
-from curobo.geom.types import Sphere
+from curobo.geom.types import Mesh, Sphere
 from curobo.types.base import TensorDeviceType
 from curobo.types.math import Pose
 from curobo.util.logger import log_error
@@ -390,4 +390,8 @@ class CudaRobotModel(CudaRobotModelConfig):
        return self.kinematics_config.base_link

    def update_kinematics_config(self, new_kin_config: KinematicsTensorConfig):
-        self.kinematics_config.copy(new_kin_config)
+        self.kinematics_config.copy_(new_kin_config)
+
+    @property
+    def retract_config(self):
+        return self.kinematics_config.cspace.retract_config
--- a/src/curobo/cuda_robot_model/types.py
+++ b/src/curobo/cuda_robot_model/types.py
@@ -23,7 +23,7 @@ from curobo.types.base import TensorDeviceType
 from curobo.types.math import Pose
 from curobo.types.state import JointState
 from curobo.types.tensor import T_DOF
-from curobo.util.tensor_util import copy_if_not_none
+from curobo.util.tensor_util import clone_if_not_none, copy_if_not_none


 class JointType(Enum):
@@ -75,20 +75,28 @@ class JointLimits:
            self.tensor_args,
        )

+    def copy_(self, new_jl: JointLimits):
+        self.joint_names = new_jl.joint_names.copy()
+        self.position.copy_(new_jl.position)
+        self.velocity.copy_(new_jl.velocity)
+        self.acceleration.copy_(new_jl.acceleration)
+        self.effort = copy_if_not_none(new_jl.effort, self.effort)
+        return self
+

@dataclass
 class CSpaceConfig:
    joint_names: List[str]
    retract_config: Optional[T_DOF] = None
    cspace_distance_weight: Optional[T_DOF] = None
-    null_space_weight: Optional[T_DOF] = None  # List[float]
+    null_space_weight: Optional[T_DOF] = None
    tensor_args: TensorDeviceType = TensorDeviceType()
    max_acceleration: Union[float, List[float]] = 10.0
    max_jerk: Union[float, List[float]] = 500.0
    velocity_scale: Union[float, List[float]] = 1.0
    acceleration_scale: Union[float, List[float]] = 1.0
    jerk_scale: Union[float, List[float]] = 1.0
-    position_limit_clip: Union[float, List[float]] = 0.01
+    position_limit_clip: Union[float, List[float]] = 0.0

    def __post_init__(self):
        if self.retract_config is not None:
@@ -147,12 +155,31 @@ class CSpaceConfig:
        joint_names = [self.joint_names[n] for n in new_index]
        self.joint_names = joint_names

+    def copy_(self, new_config: CSpaceConfig):
+        self.joint_names = new_config.joint_names.copy()
+        self.retract_config = copy_if_not_none(new_config.retract_config, self.retract_config)
+        self.null_space_weight = copy_if_not_none(
+            new_config.null_space_weight, self.null_space_weight
+        )
+        self.cspace_distance_weight = copy_if_not_none(
+            new_config.cspace_distance_weight, self.cspace_distance_weight
+        )
+        self.tensor_args = self.tensor_args
+        self.max_jerk = copy_if_not_none(new_config.max_jerk, self.max_jerk)
+        self.max_acceleration = copy_if_not_none(new_config.max_acceleration, self.max_acceleration)
+        self.velocity_scale = copy_if_not_none(new_config.velocity_scale, self.velocity_scale)
+        self.acceleration_scale = copy_if_not_none(
+            new_config.acceleration_scale, self.acceleration_scale
+        )
+        self.jerk_scale = copy_if_not_none(new_config.jerk_scale, self.jerk_scale)
+        return self
+
    def clone(self) -> CSpaceConfig:
        return CSpaceConfig(
            joint_names=self.joint_names.copy(),
-            retract_config=copy_if_not_none(self.retract_config),
-            null_space_weight=copy_if_not_none(self.null_space_weight),
-            cspace_distance_weight=copy_if_not_none(self.cspace_distance_weight),
+            retract_config=clone_if_not_none(self.retract_config),
+            null_space_weight=clone_if_not_none(self.null_space_weight),
+            cspace_distance_weight=clone_if_not_none(self.cspace_distance_weight),
            tensor_args=self.tensor_args,
            max_jerk=self.max_jerk.clone(),
            max_acceleration=self.max_acceleration.clone(),
@@ -215,12 +242,48 @@ class KinematicsTensorConfig:
    cspace: Optional[CSpaceConfig] = None
    base_link: str = "base_link"
    ee_link: str = "ee_link"
+    reference_link_spheres: Optional[torch.Tensor] = None

    def __post_init__(self):
        if self.cspace is None and self.joint_limits is not None:
            self.load_cspace_cfg_from_kinematics()
        if self.joint_limits is not None and self.cspace is not None:
            self.joint_limits = self.cspace.scale_joint_limits(self.joint_limits)
+        if self.link_spheres is not None and self.reference_link_spheres is None:
+            self.reference_link_spheres = self.link_spheres.clone()
+
+    def copy_(self, new_config: KinematicsTensorConfig):
+        self.fixed_transforms.copy_(new_config.fixed_transforms)
+        self.link_map.copy_(new_config.link_map)
+        self.joint_map.copy_(new_config.joint_map)
+        self.joint_map_type.copy_(new_config.joint_map_type)
+        self.store_link_map.copy_(new_config.store_link_map)
+        self.link_chain_map.copy_(new_config.link_chain_map)
+        self.joint_limits.copy_(new_config.joint_limits)
+        if new_config.link_spheres is not None and self.link_spheres is not None:
+            self.link_spheres.copy_(new_config.link_spheres)
+        if new_config.link_sphere_idx_map is not None and self.link_sphere_idx_map is not None:
+            self.link_sphere_idx_map.copy_(new_config.link_sphere_idx_map)
+        if new_config.link_name_to_idx_map is not None and self.link_name_to_idx_map is not None:
+            self.link_name_to_idx_map = new_config.link_name_to_idx_map.copy()
+        if (
+            new_config.reference_link_spheres is not None
+            and self.reference_link_spheres is not None
+        ):
+            self.reference_link_spheres.copy_(new_config.reference_link_spheres)
+        self.base_link = new_config.base_link
+        self.ee_idx = new_config.ee_idx
+        self.ee_link = new_config.ee_link
+        self.debug = new_config.debug
+        self.cspace.copy_(new_config.cspace)
+        self.n_dof = new_config.n_dof
+        self.non_fixed_joint_names = new_config.non_fixed_joint_names
+        self.joint_names = new_config.joint_names
+        self.link_names = new_config.link_names
+        self.mesh_link_names = new_config.mesh_link_names
+        self.total_spheres = new_config.total_spheres
+
+        return self

    def load_cspace_cfg_from_kinematics(self):
        retract_config = (
@@ -270,6 +333,13 @@ class KinematicsTensorConfig:
        link_sphere_index = self.get_sphere_index_from_link_name(link_name)
        return self.link_spheres[link_sphere_index, :]

+    def get_reference_link_spheres(
+        self,
+        link_name: str,
+    ):
+        link_sphere_index = self.get_sphere_index_from_link_name(link_name)
+        return self.reference_link_spheres[link_sphere_index, :]
+
    def attach_object(
        self,
        sphere_radius: Optional[float] = None,
@@ -332,6 +402,19 @@ class KinematicsTensorConfig:
        """
        return self.get_link_spheres(link_name).shape[0]

+    def disable_link_spheres(self, link_name: str):
+        if link_name not in self.link_name_to_idx_map.keys():
+            raise ValueError(link_name + " not found in spheres")
+        curr_spheres = self.get_link_spheres(link_name)
+        curr_spheres[:, 3] = -100.0
+        self.update_link_spheres(link_name, curr_spheres)
+
+    def enable_link_spheres(self, link_name: str):
+        if link_name not in self.link_name_to_idx_map.keys():
+            raise ValueError(link_name + " not found in spheres")
+        curr_spheres = self.get_reference_link_spheres(link_name)
+        self.update_link_spheres(link_name, curr_spheres)
+

@dataclass
 class SelfCollisionKinematicsConfig:
--- a/src/curobo/cuda_robot_model/urdf_kinematics_parser.py
+++ b/src/curobo/cuda_robot_model/urdf_kinematics_parser.py
@@ -183,6 +183,7 @@ class UrdfKinematicsParser(KinematicsParser):
        else:
            # convert to list:
            mesh_pose = Pose.from_matrix(mesh_pose).to_list()
+
        return CuroboMesh(
            name=link_name,
            pose=mesh_pose,
--- a/src/curobo/cuda_robot_model/usd_kinematics_parser.py
+++ b/src/curobo/cuda_robot_model/usd_kinematics_parser.py
@@ -13,7 +13,6 @@ from typing import Dict, List, Optional, Tuple

 # Third Party
 import numpy as np
-from pxr import Usd, UsdPhysics

 # CuRobo
 from curobo.cuda_robot_model.kinematics_parser import KinematicsParser, LinkParams
@@ -22,6 +21,15 @@ from curobo.types.base import TensorDeviceType
 from curobo.types.math import Pose
 from curobo.util.logger import log_error

+try:
+    # Third Party
+    from pxr import Usd, UsdPhysics
+except ImportError:
+    raise ImportError(
+        "usd-core failed to import, install with pip install usd-core"
+        + " NOTE: Do not install this if using with ISAAC SIM."
+    )
+

 class UsdKinematicsParser(KinematicsParser):
    """An experimental kinematics parser from USD.
--- a/src/curobo/curobolib/cpp/geom_cuda.cpp
+++ b/src/curobo/curobolib/cpp/geom_cuda.cpp
@@ -16,43 +16,55 @@

 // CUDA forward declarations

-std::vector<torch::Tensor> self_collision_distance(
-    torch::Tensor out_distance, torch::Tensor out_vec,
-    torch::Tensor sparse_index,
-    const torch::Tensor robot_spheres,      // batch_size x n_spheres x 4
-    const torch::Tensor collision_offset,   // n_spheres x n_spheres
-    const torch::Tensor weight,
-    const torch::Tensor collision_matrix, // n_spheres x n_spheres
-    const torch::Tensor thread_locations, const int locations_size,
-    const int batch_size, const int nspheres, const bool compute_grad = false,
-    const int ndpt = 8, // Does this need to match template?
-    const bool debug = false);
+std::vector<torch::Tensor>self_collision_distance(
+  torch::Tensor       out_distance,
+  torch::Tensor       out_vec,
+  torch::Tensor       sparse_index,
+  const torch::Tensor robot_spheres,    // batch_size x n_spheres x 4
+  const torch::Tensor collision_offset, // n_spheres x n_spheres
+  const torch::Tensor weight,
+  const torch::Tensor collision_matrix, // n_spheres x n_spheres
+  const torch::Tensor thread_locations,
+  const int           locations_size,
+  const int           batch_size,
+  const int           nspheres,
+  const bool          compute_grad = false,
+  const int           ndpt         = 8, // Does this need to match template?
+  const bool          debug        = false);

 // CUDA forward declarations

-std::vector<torch::Tensor> swept_sphere_obb_clpt(
-    const torch::Tensor sphere_position, // batch_size, 3
-    torch::Tensor distance, // batch_size, 1
-    torch::Tensor
-        closest_point, // batch size, 4 -> written out as x,y,z,0 for gradient
-    torch::Tensor sparsity_idx, const torch::Tensor weight,
-    const torch::Tensor activation_distance, const torch::Tensor speed_dt,
-    const torch::Tensor obb_accel,     // n_boxes, 4, 4
-    const torch::Tensor obb_bounds,    // n_boxes, 3
-    const torch::Tensor obb_pose,      // n_boxes, 4, 4
-    const torch::Tensor obb_enable,    // n_boxes, 4,
-    const torch::Tensor n_env_obb,     // n_boxes, 4, 4
-    const torch::Tensor env_query_idx, // n_boxes, 4, 4
-    const int max_nobs, const int batch_size, const int horizon,
-    const int n_spheres, const int sweep_steps, const bool enable_speed_metric,
-    const bool transform_back, const bool compute_distance,
-    const bool use_batch_env);
+std::vector<torch::Tensor>swept_sphere_obb_clpt(
+  const torch::Tensor sphere_position, // batch_size, 3
+  torch::Tensor       distance,        // batch_size, 1
+  torch::Tensor
+  closest_point,                       // batch size, 4 -> written out as x,y,z,0 for gradient
+  torch::Tensor       sparsity_idx,
+  const torch::Tensor weight,
+  const torch::Tensor activation_distance,
+  const torch::Tensor speed_dt,
+  const torch::Tensor obb_accel,     // n_boxes, 4, 4
+  const torch::Tensor obb_bounds,    // n_boxes, 3
+  const torch::Tensor obb_pose,      // n_boxes, 4, 4
+  const torch::Tensor obb_enable,    // n_boxes, 4,
+  const torch::Tensor n_env_obb,     // n_boxes, 4, 4
+  const torch::Tensor env_query_idx, // n_boxes, 4, 4
+  const int           max_nobs,
+  const int           batch_size,
+  const int           horizon,
+  const int           n_spheres,
+  const int           sweep_steps,
+  const bool          enable_speed_metric,
+  const bool          transform_back,
+  const bool          compute_distance,
+  const bool          use_batch_env);

 std::vector<torch::Tensor>
 sphere_obb_clpt(const torch::Tensor sphere_position, // batch_size, 4
-                torch::Tensor distance,
-                torch::Tensor closest_point, // batch size, 3
-                torch::Tensor sparsity_idx, const torch::Tensor weight,
+                torch::Tensor       distance,
+                torch::Tensor       closest_point,   // batch size, 3
+                torch::Tensor       sparsity_idx,
+                const torch::Tensor weight,
                const torch::Tensor activation_distance,
                const torch::Tensor obb_accel,     // n_boxes, 4, 4
                const torch::Tensor obb_bounds,    // n_boxes, 3
@@ -60,58 +72,75 @@ sphere_obb_clpt(const torch::Tensor sphere_position, // batch_size, 4
                const torch::Tensor obb_enable,    // n_boxes, 4, 4
                const torch::Tensor n_env_obb,     // n_boxes, 4, 4
                const torch::Tensor env_query_idx, // n_boxes, 4, 4
-                const int max_nobs, const int batch_size, const int horizon,
-                const int n_spheres, const bool transform_back,
-                const bool compute_distance, const bool use_batch_env);
+                const int           max_nobs,
+                const int           batch_size,
+                const int           horizon,
+                const int           n_spheres,
+                const bool          transform_back,
+                const bool          compute_distance,
+                const bool          use_batch_env);

-std::vector<torch::Tensor> pose_distance(
-    torch::Tensor out_distance, torch::Tensor out_position_distance,
-    torch::Tensor out_rotation_distance,
-    torch::Tensor distance_p_vector, // batch size, 3
-    torch::Tensor distance_q_vector, // batch size, 4
-    torch::Tensor out_gidx,
-    const torch::Tensor current_position, // batch_size, 3
-    const torch::Tensor goal_position,    // n_boxes, 3
-    const torch::Tensor current_quat, const torch::Tensor goal_quat,
-    const torch::Tensor vec_weight, // n_boxes, 4, 4
-    const torch::Tensor weight,     // n_boxes, 4, 4
-    const torch::Tensor vec_convergence, const torch::Tensor run_weight,
-    const torch::Tensor run_vec_weight, const torch::Tensor batch_pose_idx,
-    const int batch_size, const int horizon, const int mode,
-    const int num_goals = 1, const bool compute_grad = false,
-    const bool write_distance = true, const bool use_metric = false);
+std::vector<torch::Tensor>pose_distance(
+  torch::Tensor       out_distance,
+  torch::Tensor       out_position_distance,
+  torch::Tensor       out_rotation_distance,
+  torch::Tensor       distance_p_vector, // batch size, 3
+  torch::Tensor       distance_q_vector, // batch size, 4
+  torch::Tensor       out_gidx,
+  const torch::Tensor current_position,  // batch_size, 3
+  const torch::Tensor goal_position,     // n_boxes, 3
+  const torch::Tensor current_quat,
+  const torch::Tensor goal_quat,
+  const torch::Tensor vec_weight,        // n_boxes, 4, 4
+  const torch::Tensor weight,            // n_boxes, 4, 4
+  const torch::Tensor vec_convergence,
+  const torch::Tensor run_weight,
+  const torch::Tensor run_vec_weight,
+  const torch::Tensor offset_waypoint,
+  const torch::Tensor offset_tstep_fraction,
+  const torch::Tensor batch_pose_idx,
+  const int           batch_size,
+  const int           horizon,
+  const int           mode,
+  const int           num_goals        = 1,
+  const bool          compute_grad     = false,
+  const bool          write_distance   = true,
+  const bool          use_metric       = false,
+  const bool          project_distance = true);

 std::vector<torch::Tensor>
-backward_pose_distance(torch::Tensor out_grad_p, torch::Tensor out_grad_q,
+backward_pose_distance(torch::Tensor       out_grad_p,
+                       torch::Tensor       out_grad_q,
                       const torch::Tensor grad_distance,   // batch_size, 3
                       const torch::Tensor grad_p_distance, // n_boxes, 3
                       const torch::Tensor grad_q_distance,
                       const torch::Tensor pose_weight,
-                       const torch::Tensor grad_p_vec, // n_boxes, 4, 4
-                       const torch::Tensor grad_q_vec, const int batch_size,
-                       const bool use_distance = false);
+                       const torch::Tensor grad_p_vec,      // n_boxes, 4, 4
+                       const torch::Tensor grad_q_vec,
+                       const int           batch_size,
+                       const bool          use_distance = false);

 // C++ interface

 // NOTE: AT_ASSERT has become AT_CHECK on master after 0.4.
-#define CHECK_CUDA(x) AT_ASSERTM(x.is_cuda(), #x " must be a CUDA tensor")
-#define CHECK_CONTIGUOUS(x)                                                    \
-  AT_ASSERTM(x.is_contiguous(), #x " must be contiguous")
-#define CHECK_INPUT(x)                                                         \
-  CHECK_CUDA(x);                                                               \
+#define CHECK_CUDA(x) AT_ASSERTM(x.is_cuda(), # x " must be a CUDA tensor")
+#define CHECK_CONTIGUOUS(x) \
+  AT_ASSERTM(x.is_contiguous(), # x " must be contiguous")
+#define CHECK_INPUT(x) \
+  CHECK_CUDA(x);       \
  CHECK_CONTIGUOUS(x)

-std::vector<torch::Tensor> self_collision_distance_wrapper(
-    torch::Tensor out_distance, torch::Tensor out_vec,
-    torch::Tensor sparse_index,
-    const torch::Tensor robot_spheres,        // batch_size x n_spheres x 4
-    const torch::Tensor collision_offset,     // n_spheres
-    const torch::Tensor weight,
-    const torch::Tensor collision_matrix, // n_spheres
-    const torch::Tensor thread_locations, const int thread_locations_size,
-    const int batch_size, const int nspheres, const bool compute_grad = false,
-    const int ndpt = 8, const bool debug = false) {
-
+std::vector<torch::Tensor>self_collision_distance_wrapper(
+  torch::Tensor out_distance, torch::Tensor out_vec,
+  torch::Tensor sparse_index,
+  const torch::Tensor robot_spheres,    // batch_size x n_spheres x 4
+  const torch::Tensor collision_offset, // n_spheres
+  const torch::Tensor weight,
+  const torch::Tensor collision_matrix, // n_spheres
+  const torch::Tensor thread_locations, const int thread_locations_size,
+  const int batch_size, const int nspheres, const bool compute_grad = false,
+  const int ndpt = 8, const bool debug = false)
+{
  CHECK_INPUT(out_distance);
  CHECK_INPUT(out_vec);
  CHECK_INPUT(robot_spheres);
@@ -123,29 +152,30 @@ std::vector<torch::Tensor> self_collision_distance_wrapper(
  const at::cuda::OptionalCUDAGuard guard(robot_spheres.device());

  return self_collision_distance(
-      out_distance, out_vec, sparse_index, robot_spheres, 
-      collision_offset, weight, collision_matrix, thread_locations,
-      thread_locations_size, batch_size, nspheres, compute_grad, ndpt, debug);
+    out_distance, out_vec, sparse_index, robot_spheres,
+    collision_offset, weight, collision_matrix, thread_locations,
+    thread_locations_size, batch_size, nspheres, compute_grad, ndpt, debug);
 }

-std::vector<torch::Tensor> sphere_obb_clpt_wrapper(
-    const torch::Tensor sphere_position, // batch_size, 4
-
-    torch::Tensor distance,
-    torch::Tensor closest_point, // batch size, 3
-    torch::Tensor sparsity_idx, const torch::Tensor weight,
-    const torch::Tensor activation_distance,
-    const torch::Tensor obb_accel,     // n_boxes, 4, 4
-    const torch::Tensor obb_bounds,    // n_boxes, 3
-    const torch::Tensor obb_pose,      // n_boxes, 4, 4
-    const torch::Tensor obb_enable,    // n_boxes, 4, 4
-    const torch::Tensor n_env_obb,     // n_boxes, 4, 4
-    const torch::Tensor env_query_idx, // n_boxes, 4, 4
-    const int max_nobs, const int batch_size, const int horizon,
-    const int n_spheres, const bool transform_back, const bool compute_distance,
-    const bool use_batch_env) {
+std::vector<torch::Tensor>sphere_obb_clpt_wrapper(
+  const torch::Tensor sphere_position, // batch_size, 4

+  torch::Tensor distance,
+  torch::Tensor closest_point,         // batch size, 3
+  torch::Tensor sparsity_idx, const torch::Tensor weight,
+  const torch::Tensor activation_distance,
+  const torch::Tensor obb_accel,       // n_boxes, 4, 4
+  const torch::Tensor obb_bounds,      // n_boxes, 3
+  const torch::Tensor obb_pose,        // n_boxes, 4, 4
+  const torch::Tensor obb_enable,      // n_boxes, 4, 4
+  const torch::Tensor n_env_obb,       // n_boxes, 4, 4
+  const torch::Tensor env_query_idx,   // n_boxes, 4, 4
+  const int max_nobs, const int batch_size, const int horizon,
+  const int n_spheres, const bool transform_back, const bool compute_distance,
+  const bool use_batch_env)
+{
  const at::cuda::OptionalCUDAGuard guard(sphere_position.device());
+
  CHECK_INPUT(distance);
  CHECK_INPUT(closest_point);
  CHECK_INPUT(sphere_position);
@@ -154,55 +184,61 @@ std::vector<torch::Tensor> sphere_obb_clpt_wrapper(
  CHECK_INPUT(activation_distance);
  CHECK_INPUT(obb_accel);
  return sphere_obb_clpt(
-      sphere_position, distance, closest_point, sparsity_idx, weight,
-      activation_distance, obb_accel, obb_bounds, obb_pose, obb_enable,
-      n_env_obb, env_query_idx, max_nobs, batch_size, horizon, n_spheres,
-      transform_back, compute_distance, use_batch_env);
+    sphere_position, distance, closest_point, sparsity_idx, weight,
+    activation_distance, obb_accel, obb_bounds, obb_pose, obb_enable,
+    n_env_obb, env_query_idx, max_nobs, batch_size, horizon, n_spheres,
+    transform_back, compute_distance, use_batch_env);
 }
-std::vector<torch::Tensor> swept_sphere_obb_clpt_wrapper(
-    const torch::Tensor sphere_position, // batch_size, 4
-    torch::Tensor distance,
-    torch::Tensor closest_point, // batch size, 3
-    torch::Tensor sparsity_idx, const torch::Tensor weight,
-    const torch::Tensor activation_distance, const torch::Tensor speed_dt,
-    const torch::Tensor obb_accel,     // n_boxes, 4, 4
-    const torch::Tensor obb_bounds,    // n_boxes, 3
-    const torch::Tensor obb_pose,      // n_boxes, 4, 4
-    const torch::Tensor obb_enable,    // n_boxes, 4, 4
-    const torch::Tensor n_env_obb,     // n_boxes, 4, 4
-    const torch::Tensor env_query_idx, // n_boxes, 4, 4
-    const int max_nobs, const int batch_size, const int horizon,
-    const int n_spheres, const int sweep_steps, const bool enable_speed_metric,
-    const bool transform_back, const bool compute_distance,
-    const bool use_batch_env) {
+
+std::vector<torch::Tensor>swept_sphere_obb_clpt_wrapper(
+  const torch::Tensor sphere_position, // batch_size, 4
+  torch::Tensor distance,
+  torch::Tensor closest_point,         // batch size, 3
+  torch::Tensor sparsity_idx, const torch::Tensor weight,
+  const torch::Tensor activation_distance, const torch::Tensor speed_dt,
+  const torch::Tensor obb_accel,       // n_boxes, 4, 4
+  const torch::Tensor obb_bounds,      // n_boxes, 3
+  const torch::Tensor obb_pose,        // n_boxes, 4, 4
+  const torch::Tensor obb_enable,      // n_boxes, 4, 4
+  const torch::Tensor n_env_obb,       // n_boxes, 4, 4
+  const torch::Tensor env_query_idx,   // n_boxes, 4, 4
+  const int max_nobs, const int batch_size, const int horizon,
+  const int n_spheres, const int sweep_steps, const bool enable_speed_metric,
+  const bool transform_back, const bool compute_distance,
+  const bool use_batch_env)
+{
  const at::cuda::OptionalCUDAGuard guard(sphere_position.device());
+
  CHECK_INPUT(distance);
  CHECK_INPUT(closest_point);
  CHECK_INPUT(sphere_position);

  return swept_sphere_obb_clpt(
-      sphere_position,
-      distance, closest_point, sparsity_idx, weight, activation_distance,
-      speed_dt, obb_accel, obb_bounds, obb_pose, obb_enable, n_env_obb,
-      env_query_idx, max_nobs, batch_size, horizon, n_spheres, sweep_steps,
-      enable_speed_metric, transform_back, compute_distance, use_batch_env);
+    sphere_position,
+    distance, closest_point, sparsity_idx, weight, activation_distance,
+    speed_dt, obb_accel, obb_bounds, obb_pose, obb_enable, n_env_obb,
+    env_query_idx, max_nobs, batch_size, horizon, n_spheres, sweep_steps,
+    enable_speed_metric, transform_back, compute_distance, use_batch_env);
 }
-std::vector<torch::Tensor> pose_distance_wrapper(
-    torch::Tensor out_distance, torch::Tensor out_position_distance,
-    torch::Tensor out_rotation_distance,
-    torch::Tensor distance_p_vector, // batch size, 3
-    torch::Tensor distance_q_vector, // batch size, 4
-    torch::Tensor out_gidx,
-    const torch::Tensor current_position, // batch_size, 3
-    const torch::Tensor goal_position,    // n_boxes, 3
-    const torch::Tensor current_quat, const torch::Tensor goal_quat,
-    const torch::Tensor vec_weight, // n_boxes, 4, 4
-    const torch::Tensor weight, const torch::Tensor vec_convergence,
-    const torch::Tensor run_weight, const torch::Tensor run_vec_weight,
-    const torch::Tensor batch_pose_idx, const int batch_size, const int horizon,
-    const int mode, const int num_goals = 1, const bool compute_grad = false,
-    const bool write_distance = false, const bool use_metric = false) {

+std::vector<torch::Tensor>pose_distance_wrapper(
+  torch::Tensor out_distance, torch::Tensor out_position_distance,
+  torch::Tensor out_rotation_distance,
+  torch::Tensor distance_p_vector,      // batch size, 3
+  torch::Tensor distance_q_vector,      // batch size, 4
+  torch::Tensor out_gidx,
+  const torch::Tensor current_position, // batch_size, 3
+  const torch::Tensor goal_position,    // n_boxes, 3
+  const torch::Tensor current_quat, const torch::Tensor goal_quat,
+  const torch::Tensor vec_weight,       // n_boxes, 4, 4
+  const torch::Tensor weight, const torch::Tensor vec_convergence,
+  const torch::Tensor run_weight, const torch::Tensor run_vec_weight,
+  const torch::Tensor offset_waypoint, const torch::Tensor offset_tstep_fraction,
+  const torch::Tensor batch_pose_idx, const int batch_size, const int horizon,
+  const int mode, const int num_goals = 1, const bool compute_grad = false,
+  const bool write_distance = false, const bool use_metric = false,
+  const bool project_distance = true)
+{
  // at::cuda::DeviceGuard guard(angle.device());
  CHECK_INPUT(out_distance);
  CHECK_INPUT(out_position_distance);
@@ -214,24 +250,30 @@ std::vector<torch::Tensor> pose_distance_wrapper(
  CHECK_INPUT(current_quat);
  CHECK_INPUT(goal_quat);
  CHECK_INPUT(batch_pose_idx);
+  CHECK_INPUT(offset_waypoint);
+  CHECK_INPUT(offset_tstep_fraction);
  const at::cuda::OptionalCUDAGuard guard(current_position.device());

  return pose_distance(
-      out_distance, out_position_distance, out_rotation_distance,
-      distance_p_vector, distance_q_vector, out_gidx, current_position,
-      goal_position, current_quat, goal_quat, vec_weight, weight,
-      vec_convergence, run_weight, run_vec_weight, batch_pose_idx, batch_size,
-      horizon, mode, num_goals, compute_grad, write_distance, use_metric);
+    out_distance, out_position_distance, out_rotation_distance,
+    distance_p_vector, distance_q_vector, out_gidx, current_position,
+    goal_position, current_quat, goal_quat, vec_weight, weight,
+    vec_convergence, run_weight, run_vec_weight,
+    offset_waypoint,
+    offset_tstep_fraction,
+    batch_pose_idx, batch_size,
+    horizon, mode, num_goals, compute_grad, write_distance, use_metric, project_distance);
 }

-std::vector<torch::Tensor> backward_pose_distance_wrapper(
-    torch::Tensor out_grad_p, torch::Tensor out_grad_q,
-    const torch::Tensor grad_distance,   // batch_size, 3
-    const torch::Tensor grad_p_distance, // n_boxes, 3
-    const torch::Tensor grad_q_distance, const torch::Tensor pose_weight,
-    const torch::Tensor grad_p_vec, // n_boxes, 4, 4
-    const torch::Tensor grad_q_vec, const int batch_size,
-    const bool use_distance) {
+std::vector<torch::Tensor>backward_pose_distance_wrapper(
+  torch::Tensor out_grad_p, torch::Tensor out_grad_q,
+  const torch::Tensor grad_distance,   // batch_size, 3
+  const torch::Tensor grad_p_distance, // n_boxes, 3
+  const torch::Tensor grad_q_distance, const torch::Tensor pose_weight,
+  const torch::Tensor grad_p_vec,      // n_boxes, 4, 4
+  const torch::Tensor grad_q_vec, const int batch_size,
+  const bool use_distance)
+{
  CHECK_INPUT(out_grad_p);
  CHECK_INPUT(out_grad_q);
  CHECK_INPUT(grad_distance);
@@ -241,18 +283,19 @@ std::vector<torch::Tensor> backward_pose_distance_wrapper(
  const at::cuda::OptionalCUDAGuard guard(grad_distance.device());

  return backward_pose_distance(
-      out_grad_p, out_grad_q, grad_distance, grad_p_distance, grad_q_distance,
-      pose_weight, grad_p_vec, grad_q_vec, batch_size, use_distance);
+    out_grad_p, out_grad_q, grad_distance, grad_p_distance, grad_q_distance,
+    pose_weight, grad_p_vec, grad_q_vec, batch_size, use_distance);
 }

-PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
-  m.def("pose_distance", &pose_distance_wrapper, "Pose Distance (curobolib)");
-  m.def("pose_distance_backward", &backward_pose_distance_wrapper,
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
+{
+  m.def("pose_distance",           &pose_distance_wrapper, "Pose Distance (curobolib)");
+  m.def("pose_distance_backward",  &backward_pose_distance_wrapper,
        "Pose Distance Backward (curobolib)");

-  m.def("closest_point", &sphere_obb_clpt_wrapper,
+  m.def("closest_point",           &sphere_obb_clpt_wrapper,
        "Closest Point OBB(curobolib)");
-  m.def("swept_closest_point", &swept_sphere_obb_clpt_wrapper,
+  m.def("swept_closest_point",     &swept_sphere_obb_clpt_wrapper,
        "Swept Closest Point OBB(curobolib)");

  m.def("self_collision_distance", &self_collision_distance_wrapper,
--- a/src/curobo/curobolib/cpp/kinematics_fused_cuda.cpp
+++ b/src/curobo/curobolib/cpp/kinematics_fused_cuda.cpp
@@ -16,91 +16,112 @@
 // CUDA forward declarations

 std::vector<torch::Tensor>
-matrix_to_quaternion(torch::Tensor out_quat,
+matrix_to_quaternion(torch::Tensor       out_quat,
                     const torch::Tensor in_rot // batch_size, 3
-);
+                     );

-std::vector<torch::Tensor> kin_fused_forward(
-    torch::Tensor link_pos, torch::Tensor link_quat,
-    torch::Tensor batch_robot_spheres, torch::Tensor global_cumul_mat,
-    const torch::Tensor joint_vec, const torch::Tensor fixed_transform,
-    const torch::Tensor robot_spheres, const torch::Tensor link_map,
-    const torch::Tensor joint_map, const torch::Tensor joint_map_type,
-    const torch::Tensor store_link_map, const torch::Tensor link_sphere_map,
-    const int batch_size, const int n_spheres,
-    const bool use_global_cumul = false);
+std::vector<torch::Tensor>kin_fused_forward(
+  torch::Tensor       link_pos,
+  torch::Tensor       link_quat,
+  torch::Tensor       batch_robot_spheres,
+  torch::Tensor       global_cumul_mat,
+  const torch::Tensor joint_vec,
+  const torch::Tensor fixed_transform,
+  const torch::Tensor robot_spheres,
+  const torch::Tensor link_map,
+  const torch::Tensor joint_map,
+  const torch::Tensor joint_map_type,
+  const torch::Tensor store_link_map,
+  const torch::Tensor link_sphere_map,
+  const int           batch_size,
+  const int           n_spheres,
+  const bool          use_global_cumul = false);
+
+std::vector<torch::Tensor>kin_fused_backward_16t(
+  torch::Tensor       grad_out,
+  const torch::Tensor grad_nlinks_pos,
+  const torch::Tensor grad_nlinks_quat,
+  const torch::Tensor grad_spheres,
+  const torch::Tensor global_cumul_mat,
+  const torch::Tensor joint_vec,
+  const torch::Tensor fixed_transform,
+  const torch::Tensor robot_spheres,
+  const torch::Tensor link_map,
+  const torch::Tensor joint_map,
+  const torch::Tensor joint_map_type,
+  const torch::Tensor store_link_map,
+  const torch::Tensor link_sphere_map,
+  const torch::Tensor link_chain_map,
+  const int           batch_size,
+  const int           n_spheres,
+  const bool          sparsity_opt     = true,
+  const bool          use_global_cumul = false);

-std::vector<torch::Tensor> kin_fused_backward_16t(
-    torch::Tensor grad_out, const torch::Tensor grad_nlinks_pos,
-    const torch::Tensor grad_nlinks_quat, const torch::Tensor grad_spheres,
-    const torch::Tensor global_cumul_mat, const torch::Tensor joint_vec,
-    const torch::Tensor fixed_transform, const torch::Tensor robot_spheres,
-    const torch::Tensor link_map, const torch::Tensor joint_map,
-    const torch::Tensor joint_map_type, const torch::Tensor store_link_map,
-    const torch::Tensor link_sphere_map, const torch::Tensor link_chain_map,
-    const int batch_size, const int n_spheres, const bool sparsity_opt = true,
-    const bool use_global_cumul = false);
 // C++ interface

 // NOTE: AT_ASSERT has become AT_CHECK on master after 0.4.
-#define CHECK_CUDA(x) AT_ASSERTM(x.is_cuda(), #x " must be a CUDA tensor")
-#define CHECK_CONTIGUOUS(x)                                                    \
-  AT_ASSERTM(x.is_contiguous(), #x " must be contiguous")
-#define CHECK_INPUT(x)                                                         \
-  CHECK_CUDA(x);                                                               \
+#define CHECK_CUDA(x) AT_ASSERTM(x.is_cuda(), # x " must be a CUDA tensor")
+#define CHECK_CONTIGUOUS(x) \
+  AT_ASSERTM(x.is_contiguous(), # x " must be contiguous")
+#define CHECK_INPUT(x) \
+  CHECK_CUDA(x);       \
  CHECK_CONTIGUOUS(x)

-std::vector<torch::Tensor> kin_forward_wrapper(
-    torch::Tensor link_pos, torch::Tensor link_quat,
-    torch::Tensor batch_robot_spheres, torch::Tensor global_cumul_mat,
-    const torch::Tensor joint_vec, const torch::Tensor fixed_transform,
-    const torch::Tensor robot_spheres, const torch::Tensor link_map,
-    const torch::Tensor joint_map, const torch::Tensor joint_map_type,
-    const torch::Tensor store_link_map, const torch::Tensor link_sphere_map,
-    const int batch_size, const int n_spheres,
-    const bool use_global_cumul = false) {
-
+std::vector<torch::Tensor>kin_forward_wrapper(
+  torch::Tensor link_pos, torch::Tensor link_quat,
+  torch::Tensor batch_robot_spheres, torch::Tensor global_cumul_mat,
+  const torch::Tensor joint_vec, const torch::Tensor fixed_transform,
+  const torch::Tensor robot_spheres, const torch::Tensor link_map,
+  const torch::Tensor joint_map, const torch::Tensor joint_map_type,
+  const torch::Tensor store_link_map, const torch::Tensor link_sphere_map,
+  const int batch_size, const int n_spheres,
+  const bool use_global_cumul = false)
+{
  const at::cuda::OptionalCUDAGuard guard(joint_vec.device());

  // TODO: add check input
  return kin_fused_forward(
-      link_pos, link_quat, batch_robot_spheres, global_cumul_mat, joint_vec,
-      fixed_transform, robot_spheres, link_map, joint_map, joint_map_type,
-      store_link_map, link_sphere_map, batch_size, n_spheres, use_global_cumul);
+    link_pos, link_quat, batch_robot_spheres, global_cumul_mat, joint_vec,
+    fixed_transform, robot_spheres, link_map, joint_map, joint_map_type,
+    store_link_map, link_sphere_map, batch_size, n_spheres, use_global_cumul);
 }

-std::vector<torch::Tensor> kin_backward_wrapper(
-    torch::Tensor grad_out, const torch::Tensor grad_nlinks_pos,
-    const torch::Tensor grad_nlinks_quat, const torch::Tensor grad_spheres,
-    const torch::Tensor global_cumul_mat, const torch::Tensor joint_vec,
-    const torch::Tensor fixed_transform, const torch::Tensor robot_spheres,
-    const torch::Tensor link_map, const torch::Tensor joint_map,
-    const torch::Tensor joint_map_type, const torch::Tensor store_link_map,
-    const torch::Tensor link_sphere_map, const torch::Tensor link_chain_map,
-    const int batch_size, const int n_spheres, const bool sparsity_opt = true,
-    const bool use_global_cumul = false) {
+std::vector<torch::Tensor>kin_backward_wrapper(
+  torch::Tensor grad_out, const torch::Tensor grad_nlinks_pos,
+  const torch::Tensor grad_nlinks_quat, const torch::Tensor grad_spheres,
+  const torch::Tensor global_cumul_mat, const torch::Tensor joint_vec,
+  const torch::Tensor fixed_transform, const torch::Tensor robot_spheres,
+  const torch::Tensor link_map, const torch::Tensor joint_map,
+  const torch::Tensor joint_map_type, const torch::Tensor store_link_map,
+  const torch::Tensor link_sphere_map, const torch::Tensor link_chain_map,
+  const int batch_size, const int n_spheres, const bool sparsity_opt = true,
+  const bool use_global_cumul = false)
+{
  const at::cuda::OptionalCUDAGuard guard(joint_vec.device());

  return kin_fused_backward_16t(
-      grad_out, grad_nlinks_pos, grad_nlinks_quat, grad_spheres,
-      global_cumul_mat, joint_vec, fixed_transform, robot_spheres, link_map,
-      joint_map, joint_map_type, store_link_map, link_sphere_map,
-      link_chain_map, batch_size, n_spheres, sparsity_opt, use_global_cumul);
+    grad_out, grad_nlinks_pos, grad_nlinks_quat, grad_spheres,
+    global_cumul_mat, joint_vec, fixed_transform, robot_spheres, link_map,
+    joint_map, joint_map_type, store_link_map, link_sphere_map,
+    link_chain_map, batch_size, n_spheres, sparsity_opt, use_global_cumul);
 }

 std::vector<torch::Tensor>
-matrix_to_quaternion_wrapper(torch::Tensor out_quat,
+matrix_to_quaternion_wrapper(torch::Tensor       out_quat,
                             const torch::Tensor in_rot // batch_size, 3
-) {
+                             )
+{
  const at::cuda::OptionalCUDAGuard guard(in_rot.device());
+
  CHECK_INPUT(in_rot);
  CHECK_INPUT(out_quat);
  return matrix_to_quaternion(out_quat, in_rot);
 }

-PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
-  m.def("forward", &kin_forward_wrapper, "Kinematics fused forward (CUDA)");
-  m.def("backward", &kin_backward_wrapper, "Kinematics fused backward (CUDA)");
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
+{
+  m.def("forward",              &kin_forward_wrapper,  "Kinematics fused forward (CUDA)");
+  m.def("backward",             &kin_backward_wrapper, "Kinematics fused backward (CUDA)");
  m.def("matrix_to_quaternion", &matrix_to_quaternion_wrapper,
        "Rotation Matrix to Quaternion (CUDA)");
 }
--- a/src/curobo/curobolib/cpp/kinematics_fused_kernel.cu
+++ b/src/curobo/curobolib/cpp/kinematics_fused_kernel.cu
--- a/src/curobo/curobolib/cpp/lbfgs_step_cuda.cpp
+++ b/src/curobo/curobolib/cpp/lbfgs_step_cuda.cpp
@@ -15,45 +15,68 @@
 #include <c10/cuda/CUDAGuard.h>

 // CUDA forward declarations
-std::vector<torch::Tensor> reduce_cuda(torch::Tensor vec, torch::Tensor vec2,
-                                       torch::Tensor rho_buffer,
-                                       torch::Tensor sum, const int batch_size,
-                                       const int v_dim, const int m);
+std::vector<torch::Tensor>reduce_cuda(torch::Tensor vec,
+                                      torch::Tensor vec2,
+                                      torch::Tensor rho_buffer,
+                                      torch::Tensor sum,
+                                      const int     batch_size,
+                                      const int     v_dim,
+                                      const int     m);

 std::vector<torch::Tensor>
-lbfgs_step_cuda(torch::Tensor step_vec, torch::Tensor rho_buffer,
-                torch::Tensor y_buffer, torch::Tensor s_buffer,
-                torch::Tensor grad_q, const float epsilon, const int batch_size,
-                const int m, const int v_dim);
+lbfgs_step_cuda(torch::Tensor step_vec,
+                torch::Tensor rho_buffer,
+                torch::Tensor y_buffer,
+                torch::Tensor s_buffer,
+                torch::Tensor grad_q,
+                const float   epsilon,
+                const int     batch_size,
+                const int     m,
+                const int     v_dim);

 std::vector<torch::Tensor>
-lbfgs_update_cuda(torch::Tensor rho_buffer, torch::Tensor y_buffer,
-                  torch::Tensor s_buffer, torch::Tensor q, torch::Tensor grad_q,
-                  torch::Tensor x_0, torch::Tensor grad_0, const int batch_size,
-                  const int m, const int v_dim);
+lbfgs_update_cuda(torch::Tensor rho_buffer,
+                  torch::Tensor y_buffer,
+                  torch::Tensor s_buffer,
+                  torch::Tensor q,
+                  torch::Tensor grad_q,
+                  torch::Tensor x_0,
+                  torch::Tensor grad_0,
+                  const int     batch_size,
+                  const int     m,
+                  const int     v_dim);

 std::vector<torch::Tensor>
-lbfgs_cuda_fuse(torch::Tensor step_vec, torch::Tensor rho_buffer,
-                torch::Tensor y_buffer, torch::Tensor s_buffer, torch::Tensor q,
-                torch::Tensor grad_q, torch::Tensor x_0, torch::Tensor grad_0,
-                const float epsilon, const int batch_size, const int m,
-                const int v_dim, const bool stable_mode);
+lbfgs_cuda_fuse(torch::Tensor step_vec,
+                torch::Tensor rho_buffer,
+                torch::Tensor y_buffer,
+                torch::Tensor s_buffer,
+                torch::Tensor q,
+                torch::Tensor grad_q,
+                torch::Tensor x_0,
+                torch::Tensor grad_0,
+                const float   epsilon,
+                const int     batch_size,
+                const int     m,
+                const int     v_dim,
+                const bool    stable_mode);
+
 // C++ interface

 // NOTE: AT_ASSERT has become AT_CHECK on master after 0.4.
-#define CHECK_CUDA(x) AT_ASSERTM(x.is_cuda(), #x " must be a CUDA tensor")
-#define CHECK_CONTIGUOUS(x)                                                    \
-  AT_ASSERTM(x.is_contiguous(), #x " must be contiguous")
-#define CHECK_INPUT(x)                                                         \
-  CHECK_CUDA(x);                                                               \
+#define CHECK_CUDA(x) AT_ASSERTM(x.is_cuda(), # x " must be a CUDA tensor")
+#define CHECK_CONTIGUOUS(x) \
+  AT_ASSERTM(x.is_contiguous(), # x " must be contiguous")
+#define CHECK_INPUT(x) \
+  CHECK_CUDA(x);       \
  CHECK_CONTIGUOUS(x)

 std::vector<torch::Tensor>
 lbfgs_step_call(torch::Tensor step_vec, torch::Tensor rho_buffer,
                torch::Tensor y_buffer, torch::Tensor s_buffer,
                torch::Tensor grad_q, const float epsilon, const int batch_size,
-                const int m, const int v_dim) {
-
+                const int m, const int v_dim)
+{
  CHECK_INPUT(step_vec);
  CHECK_INPUT(rho_buffer);
  CHECK_INPUT(y_buffer);
@@ -69,7 +92,8 @@ std::vector<torch::Tensor>
 lbfgs_update_call(torch::Tensor rho_buffer, torch::Tensor y_buffer,
                  torch::Tensor s_buffer, torch::Tensor q, torch::Tensor grad_q,
                  torch::Tensor x_0, torch::Tensor grad_0, const int batch_size,
-                  const int m, const int v_dim) {
+                  const int m, const int v_dim)
+{
  CHECK_INPUT(rho_buffer);
  CHECK_INPUT(y_buffer);
  CHECK_INPUT(s_buffer);
@@ -86,7 +110,8 @@ lbfgs_update_call(torch::Tensor rho_buffer, torch::Tensor y_buffer,
 std::vector<torch::Tensor>
 reduce_cuda_call(torch::Tensor vec, torch::Tensor vec2,
                 torch::Tensor rho_buffer, torch::Tensor sum,
-                 const int batch_size, const int v_dim, const int m) {
+                 const int batch_size, const int v_dim, const int m)
+{
  const at::cuda::OptionalCUDAGuard guard(sum.device());

  return reduce_cuda(vec, vec2, rho_buffer, sum, batch_size, v_dim, m);
@@ -97,7 +122,8 @@ lbfgs_call(torch::Tensor step_vec, torch::Tensor rho_buffer,
           torch::Tensor y_buffer, torch::Tensor s_buffer, torch::Tensor q,
           torch::Tensor grad_q, torch::Tensor x_0, torch::Tensor grad_0,
           const float epsilon, const int batch_size, const int m,
-           const int v_dim, const bool stable_mode) {
+           const int v_dim, const bool stable_mode)
+{
  CHECK_INPUT(step_vec);
  CHECK_INPUT(rho_buffer);
  CHECK_INPUT(y_buffer);
@@ -113,9 +139,10 @@ lbfgs_call(torch::Tensor step_vec, torch::Tensor rho_buffer,
                         stable_mode);
 }

-PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
-  m.def("step", &lbfgs_step_call, "L-BFGS step (CUDA)");
-  m.def("update", &lbfgs_update_call, "L-BFGS Update (CUDA)");
-  m.def("forward", &lbfgs_call, "L-BFGS Update + Step (CUDA)");
-  m.def("debug_reduce", &reduce_cuda_call, "L-BFGS Debug");
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
+{
+  m.def("step",         &lbfgs_step_call,   "L-BFGS step (CUDA)");
+  m.def("update",       &lbfgs_update_call, "L-BFGS Update (CUDA)");
+  m.def("forward",      &lbfgs_call,        "L-BFGS Update + Step (CUDA)");
+  m.def("debug_reduce", &reduce_cuda_call,  "L-BFGS Debug");
 }
--- a/src/curobo/curobolib/cpp/lbfgs_step_kernel.cu
+++ b/src/curobo/curobolib/cpp/lbfgs_step_kernel.cu
--- a/src/curobo/curobolib/cpp/line_search_cuda.cpp
+++ b/src/curobo/curobolib/cpp/line_search_cuda.cpp
@@ -18,49 +18,67 @@
 // CUDA forward declarations

 std::vector<torch::Tensor>
-update_best_cuda(torch::Tensor best_cost, torch::Tensor best_q,
-                 torch::Tensor best_iteration, 
-                 torch::Tensor current_iteration,
+update_best_cuda(torch::Tensor       best_cost,
+                 torch::Tensor       best_q,
+                 torch::Tensor       best_iteration,
+                 torch::Tensor       current_iteration,
                 const torch::Tensor cost,
-                 const torch::Tensor q, const int d_opt, const int cost_s1,
-                 const int cost_s2, const int iteration,
-                 const float delta_threshold,
-                 const float relative_threshold = 0.999);
+                 const torch::Tensor q,
+                 const int           d_opt,
+                 const int           cost_s1,
+                 const int           cost_s2,
+                 const int           iteration,
+                 const float         delta_threshold,
+                 const float         relative_threshold = 0.999);
+
+std::vector<torch::Tensor>line_search_cuda(
+
+  // torch::Tensor m,
+  torch::Tensor       best_x,
+  torch::Tensor       best_c,
+  torch::Tensor       best_grad,
+  const torch::Tensor g_x,
+  const torch::Tensor x_set,
+  const torch::Tensor step_vec,
+  const torch::Tensor c_0,
+  const torch::Tensor alpha_list,
+  const torch::Tensor c_idx,
+  const float         c_1,
+  const float         c_2,
+  const bool          strong_wolfe,
+  const bool          approx_wolfe,
+  const int           l1,
+  const int           l2,
+  const int           batchsize);

-std::vector<torch::Tensor> line_search_cuda(
-    // torch::Tensor m,
-    torch::Tensor best_x, torch::Tensor best_c, torch::Tensor best_grad,
-    const torch::Tensor g_x, const torch::Tensor x_set,
-    const torch::Tensor step_vec, const torch::Tensor c_0,
-    const torch::Tensor alpha_list, const torch::Tensor c_idx, const float c_1,
-    const float c_2, const bool strong_wolfe, const bool approx_wolfe,
-    const int l1, const int l2, const int batchsize);
 // C++ interface

 // NOTE: AT_ASSERT has become AT_CHECK on master after 0.4.
-#define CHECK_CUDA(x) AT_ASSERTM(x.is_cuda(), #x " must be a CUDA tensor")
-#define CHECK_CONTIGUOUS(x)                                                    \
-  AT_ASSERTM(x.is_contiguous(), #x " must be contiguous")
-#define CHECK_INPUT(x)                                                         \
-  CHECK_CUDA(x);                                                               \
+#define CHECK_CUDA(x) AT_ASSERTM(x.is_cuda(), # x " must be a CUDA tensor")
+#define CHECK_CONTIGUOUS(x) \
+  AT_ASSERTM(x.is_contiguous(), # x " must be contiguous")
+#define CHECK_INPUT(x) \
+  CHECK_CUDA(x);       \
  CHECK_CONTIGUOUS(x)


-std::vector<torch::Tensor> line_search_call(
-    // torch::Tensor m,
-    torch::Tensor best_x, torch::Tensor best_c, torch::Tensor best_grad,
-    const torch::Tensor g_x, const torch::Tensor x_set,
-    const torch::Tensor step_vec, const torch::Tensor c_0,
-    const torch::Tensor alpha_list, const torch::Tensor c_idx, const float c_1,
-    const float c_2, const bool strong_wolfe, const bool approx_wolfe,
-    const int l1, const int l2, const int batchsize) {
+std::vector<torch::Tensor>line_search_call(

+  // torch::Tensor m,
+  torch::Tensor best_x, torch::Tensor best_c, torch::Tensor best_grad,
+  const torch::Tensor g_x, const torch::Tensor x_set,
+  const torch::Tensor step_vec, const torch::Tensor c_0,
+  const torch::Tensor alpha_list, const torch::Tensor c_idx, const float c_1,
+  const float c_2, const bool strong_wolfe, const bool approx_wolfe,
+  const int l1, const int l2, const int batchsize)
+{
  CHECK_INPUT(g_x);
  CHECK_INPUT(x_set);
  CHECK_INPUT(step_vec);
  CHECK_INPUT(c_0);
  CHECK_INPUT(alpha_list);
  CHECK_INPUT(c_idx);
+
  // CHECK_INPUT(m);
  CHECK_INPUT(best_x);
  CHECK_INPUT(best_c);
@@ -76,14 +94,14 @@ std::vector<torch::Tensor> line_search_call(

 std::vector<torch::Tensor>
 update_best_call(torch::Tensor best_cost, torch::Tensor best_q,
-                 torch::Tensor best_iteration, 
+                 torch::Tensor best_iteration,
                 torch::Tensor current_iteration,
                 const torch::Tensor cost,
                 const torch::Tensor q, const int d_opt, const int cost_s1,
                 const int cost_s2, const int iteration,
                 const float delta_threshold,
-                 const float relative_threshold=0.999) {
-
+                 const float relative_threshold = 0.999)
+{
  CHECK_INPUT(best_cost);
  CHECK_INPUT(best_q);
  CHECK_INPUT(cost);
@@ -96,8 +114,8 @@ update_best_call(torch::Tensor best_cost, torch::Tensor best_q,
                          cost_s1, cost_s2, iteration, delta_threshold, relative_threshold);
 }

-PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
+{
  m.def("update_best", &update_best_call, "Update Best (CUDA)");
  m.def("line_search", &line_search_call, "Line search (CUDA)");
-
-}
+}
--- a/src/curobo/curobolib/cpp/line_search_kernel.cu
+++ b/src/curobo/curobolib/cpp/line_search_kernel.cu
@@ -37,362 +37,431 @@

 #define FULL_MASK 0xffffffff

-namespace Curobo {
+namespace Curobo
+{
+  namespace Optimization
+  {
+    template<typename scalar_t, typename psum_t>
+    __inline__ __device__ void reduce(scalar_t v, int m, unsigned mask,
+                                      psum_t *data, scalar_t *result)
+    {
+      psum_t val = v;

-namespace Optimization {
+      val += __shfl_down_sync(mask, val, 1);
+      val += __shfl_down_sync(mask, val, 2);
+      val += __shfl_down_sync(mask, val, 4);
+      val += __shfl_down_sync(mask, val, 8);
+      val += __shfl_down_sync(mask, val, 16);

-template <typename scalar_t, typename psum_t>
-__inline__ __device__ void reduce(scalar_t v, int m, unsigned mask,
-                                  psum_t *data, scalar_t *result) {
-  psum_t val = v;
-  val += __shfl_down_sync(mask, val, 1);
-  val += __shfl_down_sync(mask, val, 2);
-  val += __shfl_down_sync(mask, val, 4);
-  val += __shfl_down_sync(mask, val, 8);
-  val += __shfl_down_sync(mask, val, 16);
-  // int leader = __ffs(mask) – 1;    // select a leader lane
-  int leader = 0;
-  if (threadIdx.x % 32 == leader) {
-    if (m <= 32) {
-      result[0] = (scalar_t)val;
-    } else {
-      data[(threadIdx.x + 1) / 32] = val;
-    }
-  }
-  if (m > 32) {
-
-    __syncthreads();
-
-    int elems = (m + 31) / 32;
-    assert(elems <= 32);
-    unsigned mask2 = __ballot_sync(FULL_MASK, threadIdx.x < elems);
-    if (threadIdx.x < elems) { // only the first warp will do this work
-      psum_t val2 = data[threadIdx.x % 32];
-      int shift = 1;
-      for (int i = elems - 1; i > 0; i /= 2) {
-        val2 += __shfl_down_sync(mask2, val2, shift);
-        shift *= 2;
-      }
-      // int leader = __ffs(mask2) – 1;    // select a leader lane
+      // int leader = __ffs(mask) – 1;    // select a leader lane
      int leader = 0;
-      if (threadIdx.x % 32 == leader) {
-        result[0] = (scalar_t)val2;
+
+      if (threadIdx.x % 32 == leader)
+      {
+        if (m <= 32)
+        {
+          result[0] = (scalar_t)val;
+        }
+        else
+        {
+          data[(threadIdx.x + 1) / 32] = val;
+        }
+      }
+
+      if (m > 32)
+      {
+        __syncthreads();
+
+        int elems = (m + 31) / 32;
+        assert(elems <= 32);
+        unsigned mask2 = __ballot_sync(FULL_MASK, threadIdx.x < elems);
+
+        if (threadIdx.x < elems) // only the first warp will do this work
+        {
+          psum_t val2  = data[threadIdx.x % 32];
+          int    shift = 1;
+
+          for (int i = elems - 1; i > 0; i /= 2)
+          {
+            val2  += __shfl_down_sync(mask2, val2, shift);
+            shift *= 2;
+          }
+
+          // int leader = __ffs(mask2) – 1;    // select a leader lane
+          int leader = 0;
+
+          if (threadIdx.x % 32 == leader)
+          {
+            result[0] = (scalar_t)val2;
+          }
+        }
+      }
+      __syncthreads();
+    }
+
+    // Launched with l2 threads/block and batchsize blocks
+    template<typename scalar_t, typename psum_t>
+    __global__ void line_search_kernel(
+
+      // int64_t *m_idx, // 4x1x1
+      scalar_t *best_x,           // 4x280
+      scalar_t *best_c,           // 4x1
+      scalar_t *best_grad,        // 4x280
+      const scalar_t *g_x,        //  4x6x280
+      const scalar_t *x_set,      //  4x6x280
+      const scalar_t *step_vec,   // 4x280x1
+      const scalar_t *c,          // 4x6x1
+      const scalar_t *alpha_list, // 4x6x1
+      const int64_t *c_idx,       // 4x1x1
+      const float c_1, const float c_2, const bool strong_wolfe,
+      const bool approx_wolfe,
+      const int l1,               // 6
+      const int l2,               // 280
+      const int batchsize)        // 4
+    {
+      int batch = blockIdx.x;
+      __shared__ psum_t   data[32];
+      __shared__ scalar_t result[32];
+
+      assert(l1 <= 32);
+      unsigned mask = __ballot_sync(FULL_MASK, threadIdx.x < l2);
+
+      if (threadIdx.x >= l2)
+      {
+        return;
+      }
+
+      scalar_t sv_elem = step_vec[batch * l2 + threadIdx.x];
+
+      // g_step = g0 @ step_vec_T
+      // g_x @ step_vec_T
+      for (int i = 0; i < l1; i++)
+      {
+        reduce(g_x[batch * l1 * l2 + l2 * i + threadIdx.x] * sv_elem, l2, mask,
+               &data[0], &result[i]);
+      }
+
+      __shared__ scalar_t step_success[32];
+      __shared__ scalar_t step_success_w1[32];
+      assert(blockDim.x >= l1);
+      bool wolfe_1   = false;
+      bool wolfe     = false;
+      bool condition = threadIdx.x < l1;
+
+      if (condition)
+      {
+        // scalar_t alpha_list_elem = alpha_list[batch*l1 + threadIdx.x];
+        scalar_t alpha_list_elem = alpha_list[threadIdx.x];
+
+        // condition 1:
+        wolfe_1 = c[batch * l1 + threadIdx.x] <=
+                  (c[batch * l1] + c_1 * alpha_list_elem * result[0]);
+
+        // condition 2:
+        bool wolfe_2;
+
+        if (strong_wolfe)
+        {
+          wolfe_2 = abs(result[threadIdx.x]) <= c_2 *abs(result[0]);
+        }
+        else
+        {
+          wolfe_2 = result[threadIdx.x] >= c_2 * result[0];
+        }
+
+        wolfe = wolfe_1 & wolfe_2;
+
+        step_success[threadIdx.x]    = wolfe * (alpha_list_elem + 0.1);
+        step_success_w1[threadIdx.x] = wolfe_1 * (alpha_list_elem + 0.1);
+      }
+
+      __syncthreads();
+
+      __shared__ int idx_shared;
+
+      if (threadIdx.x == 0)
+      {
+        int m_id      = 0;
+        int m1_id     = 0;
+        scalar_t max1 = step_success[0];
+        scalar_t max2 = step_success_w1[0];
+
+        for (int i = 1; i < l1; i++)
+        {
+          if (max1 < step_success[i])
+          {
+            max1 = step_success[i];
+            m_id = i;
+          }
+
+          if (max2 < step_success_w1[i])
+          {
+            max2  = step_success_w1[i];
+            m1_id = i;
+          }
+        }
+
+        if (!approx_wolfe)
+        {
+          //   m_idx = torch.where(m_idx == 0, m1_idx, m_idx)
+          if (m_id == 0)
+          {
+            m_id = m1_id;
+          }
+
+          // m_idx[m_idx == 0] = 1
+          if (m_id == 0)
+          {
+            m_id = 1;
+          }
+        }
+        idx_shared = m_id + c_idx[batch];
+      }
+
+      ////////////////////////////////////
+      // write outputs using the computed index.
+      // one index per batch is computed
+      ////////////////////////////////////
+      // l2 is d_opt, l1 is line_search n.
+      // idx_shared contains index in l1
+      //
+      __syncthreads();
+
+      if (threadIdx.x < l2)
+      {
+        if (threadIdx.x == 0)
+        {
+          // printf("block: %d, idx_shared: %d\n", batch, idx_shared);
+        }
+        best_x[batch * l2 + threadIdx.x]    = x_set[idx_shared * l2 + threadIdx.x];
+        best_grad[batch * l2 + threadIdx.x] = g_x[idx_shared * l2 + threadIdx.x];
+      }
+
+      if (threadIdx.x == 0)
+      {
+        best_c[batch] = c[idx_shared];
      }
    }
-  }
-  __syncthreads();
-}

-// Launched with l2 threads/block and batchsize blocks
-template <typename scalar_t, typename psum_t>
-__global__ void line_search_kernel(
-    // int64_t *m_idx, // 4x1x1
-    scalar_t *best_x,           // 4x280
-    scalar_t *best_c,           // 4x1
-    scalar_t *best_grad,        // 4x280
-    const scalar_t *g_x,        //  4x6x280
-    const scalar_t *x_set,      //  4x6x280
-    const scalar_t *step_vec,   // 4x280x1
-    const scalar_t *c,          // 4x6x1
-    const scalar_t *alpha_list, // 4x6x1
-    const int64_t *c_idx,       // 4x1x1
-    const float c_1, const float c_2, const bool strong_wolfe,
-    const bool approx_wolfe,
-    const int l1,        // 6
-    const int l2,        // 280
-    const int batchsize) // 4
+    // Launched with l2 threads/block and #blocks = batchsize
+    template<typename scalar_t, typename psum_t>
+    __global__ void line_search_kernel_mask(
+
+      // int64_t *m_idx, // 4x1x1
+      scalar_t *best_x,           // 4x280
+      scalar_t *best_c,           // 4x1
+      scalar_t *best_grad,        // 4x280
+      const scalar_t *g_x,        //  4x6x280
+      const scalar_t *x_set,      //  4x6x280
+      const scalar_t *step_vec,   // 4x280x1
+      const scalar_t *c,          // 4x6x1
+      const scalar_t *alpha_list, // 4x6x1
+      const int64_t *c_idx,       // 4x1x1
+      const float c_1, const float c_2, const bool strong_wolfe,
+      const bool approx_wolfe,
+      const int l1,               // 6
+      const int l2,               // 280
+      const int batchsize)        // 4
+    {
+      int batch = blockIdx.x;
+      __shared__ psum_t   data[32];
+      __shared__ scalar_t result[32];
+
+      assert(l1 <= 32);
+      unsigned mask = __ballot_sync(FULL_MASK, threadIdx.x < l2);
+
+      if (threadIdx.x >= l2)
+      {
+        return;
+      }
+
+      scalar_t sv_elem = step_vec[batch * l2 + threadIdx.x];
+
+      // g_step = g0 @ step_vec_T
+      // g_x @ step_vec_T
+      for (int i = 0; i < l1; i++)
+      {
+        reduce(g_x[batch * l1 * l2 + l2 * i + threadIdx.x] * sv_elem, l2, mask,
+               &data[0], &result[i]);
+      }
+
+      // __shared__ scalar_t step_success[32];
+      // __shared__ scalar_t step_success_w1[32];
+      assert(blockDim.x >= l1);
+      bool wolfe_1   = false;
+      bool wolfe     = false;
+      bool condition = threadIdx.x < l1;
+
+      if (condition)
+      {
+        scalar_t alpha_list_elem = alpha_list[threadIdx.x];
+
+        // scalar_t alpha_list_elem = alpha_list[batch*l1 + threadIdx.x];
+
+        // condition 1:
+        wolfe_1 = c[batch * l1 + threadIdx.x] <=
+                  (c[batch * l1] + c_1 * alpha_list_elem * result[0]);
+
+        // condition 2:
+        bool wolfe_2;
+
+        if (strong_wolfe)
+        {
+          wolfe_2 = abs(result[threadIdx.x]) <= c_2 *abs(result[0]);
+        }
+        else
+        {
+          wolfe_2 = result[threadIdx.x] >= c_2 * result[0];
+        }
+
+        // wolfe = torch.logical_and(wolfe_1, wolfe_2)
+        wolfe = wolfe_1 & wolfe_2;
+
+        // // step_success = wolfe * (self.alpha_list[:, :, 0:1] + 0.1)
+        // // step_success_w1 = wolfe_1 * (self.alpha_list[:, :, 0:1] + 0.1)
+        // step_success[threadIdx.x]    = wolfe   * (alpha_list_elem + 0.1);
+        // step_success_w1[threadIdx.x] = wolfe_1 * (alpha_list_elem + 0.1);
+      }
+      unsigned msk1 = __ballot_sync(FULL_MASK, wolfe_1 & condition);
+      unsigned msk  = __ballot_sync(FULL_MASK, wolfe & condition);
+
+      // get the index of the last occurance of true
+      unsigned msk1_brev = __brev(msk1);
+      unsigned msk_brev  = __brev(msk);
+
+      int id1 = 32 - __ffs(msk1_brev); // position of least signficant bit set to 1
+      int id  = 32 - __ffs(msk_brev);  // position of least signficant bit set to 1
+
+      __syncthreads();
+
+      __shared__ int idx_shared;
+
+      if (threadIdx.x == 0)
+      {
+        if (!approx_wolfe)
+        {
+          if (id == 32) // msk is zero
+          {
+            id = id1;
+          }
+
+          if (id == 0) // bit 0 is set
+          {
+            id = id1;
+          }
+
+          if (id == 32) // msk is zero
+          {
+            id = 1;
+          }
+
+          if (id == 0)
+          {
+            id = 1;
+          }
+        }
+        else
+        {
+          if (id == 32) // msk is zero
+          {
+            id = 0;
+          }
+        }
+
+        // //  _, m_idx = torch.max(step_success, dim=-2)
+        // //  _, m1_idx = torch.max(step_success_w1, dim=-2)
+        // int m_id = 0;
+        // int m1_id = 0;
+        // scalar_t max1 = step_success[0];
+        // scalar_t max2 = step_success_w1[0];
+        // for (int i=1; i<l1; i++) {
+        //   if (max1<step_success[i]) {
+        //     max1 = step_success[i];
+        //     m_id = i;
+        //   }
+        //   if (max2<step_success_w1[i]) {
+        //     max2 = step_success_w1[i];
+        //     m1_id = i;
+        //   }
+        // }
+
+        // //   m_idx = torch.where(m_idx == 0, m1_idx, m_idx)
+        // if (m_id == 0) {
+        //     m_id = m1_id;
+        // }
+
+        // // m_idx[m_idx == 0] = 1
+        // if (m_id == 0) {
+        //     m_id = 1;
+        // }
+
+        // if (id != m_id) {
+        //   printf("id=%d, m_id=%d\n", id, m_id);
+        //   printf("msk1=%x, msk=%x, raw id1=%d, raw id=%d\n", msk1, msk,
+        //   32-__ffs(msk1_brev), 32-__ffs(msk_brev));
+        // }
+
+        // m_idx[batch] = m_id;
+        //  m_idx[batch] = id;
+        idx_shared = id + c_idx[batch];
+      }
+
+      ////////////////////////////////////
+      // write outputs using the computed index.
+      // one index per batch is computed
+      ////////////////////////////////////
+      __syncthreads();
+
+      if (threadIdx.x < l2)
+      {
+        if (threadIdx.x == 0)
+        {
+          // printf("block: %d, idx_shared: %d\n", batch, idx_shared);
+        }
+        best_x[batch * l2 + threadIdx.x]    = x_set[idx_shared * l2 + threadIdx.x];
+        best_grad[batch * l2 + threadIdx.x] = g_x[idx_shared * l2 + threadIdx.x];
+      }
+
+      if (threadIdx.x == 0)
+      {
+        best_c[batch] = c[idx_shared];
+      }
+    }
+  } // namespace Optimization
+}   // namespace Curobo
+std::vector<torch::Tensor>line_search_cuda(
+
+  // torch::Tensor m_idx,
+  torch::Tensor best_x, torch::Tensor best_c, torch::Tensor best_grad,
+  const torch::Tensor g_x, const torch::Tensor x_set,
+  const torch::Tensor step_vec, const torch::Tensor c_0,
+  const torch::Tensor alpha_list, const torch::Tensor c_idx, const float c_1,
+  const float c_2, const bool strong_wolfe, const bool approx_wolfe,
+  const int l1, const int l2, const int batchsize)
 {
-
-  int batch = blockIdx.x;
-  __shared__ psum_t data[32];
-  __shared__ scalar_t result[32];
-  assert(l1 <= 32);
-  unsigned mask = __ballot_sync(FULL_MASK, threadIdx.x < l2);
-
-  if (threadIdx.x >= l2) {
-    return;
-  }
-
-  scalar_t sv_elem = step_vec[batch * l2 + threadIdx.x];
-
-  // g_step = g0 @ step_vec_T
-  // g_x @ step_vec_T
-  for (int i = 0; i < l1; i++) {
-    reduce(g_x[batch * l1 * l2 + l2 * i + threadIdx.x] * sv_elem, l2, mask,
-           &data[0], &result[i]);
-  }
-
-  __shared__ scalar_t step_success[32];
-  __shared__ scalar_t step_success_w1[32];
-  assert(blockDim.x >= l1);
-  bool wolfe_1 = false;
-  bool wolfe = false;
-  bool condition = threadIdx.x < l1;
-  if (condition) {
-    // scalar_t alpha_list_elem = alpha_list[batch*l1 + threadIdx.x];
-    scalar_t alpha_list_elem = alpha_list[threadIdx.x];
-
-    // condition 1:
-    wolfe_1 = c[batch * l1 + threadIdx.x] <=
-              (c[batch * l1] + c_1 * alpha_list_elem * result[0]);
-
-    // condition 2:
-    bool wolfe_2;
-    if (strong_wolfe) {
-      wolfe_2 = abs(result[threadIdx.x]) <= c_2 * abs(result[0]);
-    } else {
-      wolfe_2 = result[threadIdx.x] >= c_2 * result[0];
-    }
-
-    wolfe = wolfe_1 & wolfe_2;
-
-    step_success[threadIdx.x] = wolfe * (alpha_list_elem + 0.1);
-    step_success_w1[threadIdx.x] = wolfe_1 * (alpha_list_elem + 0.1);
-  }
-
-  __syncthreads();
-
-  __shared__ int idx_shared;
-  if (threadIdx.x == 0) {
-    int m_id = 0;
-    int m1_id = 0;
-    scalar_t max1 = step_success[0];
-    scalar_t max2 = step_success_w1[0];
-    for (int i = 1; i < l1; i++) {
-      if (max1 < step_success[i]) {
-        max1 = step_success[i];
-        m_id = i;
-      }
-      if (max2 < step_success_w1[i]) {
-        max2 = step_success_w1[i];
-        m1_id = i;
-      }
-    }
-
-    if (!approx_wolfe) {
-
-      //   m_idx = torch.where(m_idx == 0, m1_idx, m_idx)
-      if (m_id == 0) {
-        m_id = m1_id;
-      }
-      // m_idx[m_idx == 0] = 1
-      if (m_id == 0) {
-        m_id = 1;
-      }
-    }
-    idx_shared = m_id + c_idx[batch];
-  }
-
-  ////////////////////////////////////
-  // write outputs using the computed index.
-  // one index per batch is computed
-  ////////////////////////////////////
-  // l2 is d_opt, l1 is line_search n.
-  // idx_shared contains index in l1
-  //
-  __syncthreads();
-  if (threadIdx.x < l2) {
-    if (threadIdx.x == 0) {
-      // printf("block: %d, idx_shared: %d\n", batch, idx_shared);
-    }
-    best_x[batch * l2 + threadIdx.x] = x_set[idx_shared * l2 + threadIdx.x];
-    best_grad[batch * l2 + threadIdx.x] = g_x[idx_shared * l2 + threadIdx.x];
-  }
-  if (threadIdx.x == 0) {
-    best_c[batch] = c[idx_shared];
-  }
-}
-
-// Launched with l2 threads/block and #blocks = batchsize
-template <typename scalar_t, typename psum_t>
-__global__ void line_search_kernel_mask(
-    // int64_t *m_idx, // 4x1x1
-    scalar_t *best_x,           // 4x280
-    scalar_t *best_c,           // 4x1
-    scalar_t *best_grad,        // 4x280
-    const scalar_t *g_x,        //  4x6x280
-    const scalar_t *x_set,      //  4x6x280
-    const scalar_t *step_vec,   // 4x280x1
-    const scalar_t *c,          // 4x6x1
-    const scalar_t *alpha_list, // 4x6x1
-    const int64_t *c_idx,       // 4x1x1
-    const float c_1, const float c_2, const bool strong_wolfe,
-    const bool approx_wolfe,
-    const int l1,        // 6
-    const int l2,        // 280
-    const int batchsize) // 4
-{
-
-  int batch = blockIdx.x;
-  __shared__ psum_t data[32];
-  __shared__ scalar_t result[32];
-  assert(l1 <= 32);
-  unsigned mask = __ballot_sync(FULL_MASK, threadIdx.x < l2);
-
-  if (threadIdx.x >= l2) {
-    return;
-  }
-
-  scalar_t sv_elem = step_vec[batch * l2 + threadIdx.x];
-
-  // g_step = g0 @ step_vec_T
-  // g_x @ step_vec_T
-  for (int i = 0; i < l1; i++) {
-    reduce(g_x[batch * l1 * l2 + l2 * i + threadIdx.x] * sv_elem, l2, mask,
-           &data[0], &result[i]);
-  }
-
-  // __shared__ scalar_t step_success[32];
-  // __shared__ scalar_t step_success_w1[32];
-  assert(blockDim.x >= l1);
-  bool wolfe_1 = false;
-  bool wolfe = false;
-  bool condition = threadIdx.x < l1;
-  if (condition) {
-    scalar_t alpha_list_elem = alpha_list[threadIdx.x];
-
-    // scalar_t alpha_list_elem = alpha_list[batch*l1 + threadIdx.x];
-
-    // condition 1:
-    wolfe_1 = c[batch * l1 + threadIdx.x] <=
-              (c[batch * l1] + c_1 * alpha_list_elem * result[0]);
-
-    // condition 2:
-    bool wolfe_2;
-    if (strong_wolfe) {
-      wolfe_2 = abs(result[threadIdx.x]) <= c_2 * abs(result[0]);
-    } else {
-      wolfe_2 = result[threadIdx.x] >= c_2 * result[0];
-    }
-
-    // wolfe = torch.logical_and(wolfe_1, wolfe_2)
-    wolfe = wolfe_1 & wolfe_2;
-
-    // // step_success = wolfe * (self.alpha_list[:, :, 0:1] + 0.1)
-    // // step_success_w1 = wolfe_1 * (self.alpha_list[:, :, 0:1] + 0.1)
-    // step_success[threadIdx.x]    = wolfe   * (alpha_list_elem + 0.1);
-    // step_success_w1[threadIdx.x] = wolfe_1 * (alpha_list_elem + 0.1);
-  }
-  unsigned msk1 = __ballot_sync(FULL_MASK, wolfe_1 & condition);
-  unsigned msk = __ballot_sync(FULL_MASK, wolfe & condition);
-
-  // get the index of the last occurance of true
-  unsigned msk1_brev = __brev(msk1);
-  unsigned msk_brev = __brev(msk);
-
-  int id1 = 32 - __ffs(msk1_brev); // position of least signficant bit set to 1
-  int id = 32 - __ffs(msk_brev);   // position of least signficant bit set to 1
-
-  __syncthreads();
-
-  __shared__ int idx_shared;
-  if (threadIdx.x == 0) {
-    if (!approx_wolfe) {
-      if (id == 32) { // msk is zero
-        id = id1;
-      }
-      if (id == 0) { // bit 0 is set
-        id = id1;
-      }
-      if (id == 32) { // msk is zero
-        id = 1;
-      }
-      if (id == 0) {
-        id = 1;
-      }
-    } else {
-      if (id == 32) { // msk is zero
-        id = 0;
-      }
-    }
-
-    // //  _, m_idx = torch.max(step_success, dim=-2)
-    // //  _, m1_idx = torch.max(step_success_w1, dim=-2)
-    // int m_id = 0;
-    // int m1_id = 0;
-    // scalar_t max1 = step_success[0];
-    // scalar_t max2 = step_success_w1[0];
-    // for (int i=1; i<l1; i++) {
-    //   if (max1<step_success[i]) {
-    //     max1 = step_success[i];
-    //     m_id = i;
-    //   }
-    //   if (max2<step_success_w1[i]) {
-    //     max2 = step_success_w1[i];
-    //     m1_id = i;
-    //   }
-    // }
-
-    // //   m_idx = torch.where(m_idx == 0, m1_idx, m_idx)
-    // if (m_id == 0) {
-    //     m_id = m1_id;
-    // }
-
-    // // m_idx[m_idx == 0] = 1
-    // if (m_id == 0) {
-    //     m_id = 1;
-    // }
-
-    // if (id != m_id) {
-    //   printf("id=%d, m_id=%d\n", id, m_id);
-    //   printf("msk1=%x, msk=%x, raw id1=%d, raw id=%d\n", msk1, msk,
-    //   32-__ffs(msk1_brev), 32-__ffs(msk_brev));
-    // }
-
-    // m_idx[batch] = m_id;
-    //  m_idx[batch] = id;
-    idx_shared = id + c_idx[batch];
-  }
-
-  ////////////////////////////////////
-  // write outputs using the computed index.
-  // one index per batch is computed
-  ////////////////////////////////////
-  __syncthreads();
-  if (threadIdx.x < l2) {
-    if (threadIdx.x == 0) {
-      // printf("block: %d, idx_shared: %d\n", batch, idx_shared);
-    }
-    best_x[batch * l2 + threadIdx.x] = x_set[idx_shared * l2 + threadIdx.x];
-    best_grad[batch * l2 + threadIdx.x] = g_x[idx_shared * l2 + threadIdx.x];
-  }
-  if (threadIdx.x == 0) {
-    best_c[batch] = c[idx_shared];
-  }
-}
-
-} // namespace Optimization
-} // namespace Curobo
-std::vector<torch::Tensor> line_search_cuda(
-    // torch::Tensor m_idx,
-    torch::Tensor best_x, torch::Tensor best_c, torch::Tensor best_grad,
-    const torch::Tensor g_x, const torch::Tensor x_set,
-    const torch::Tensor step_vec, const torch::Tensor c_0,
-    const torch::Tensor alpha_list, const torch::Tensor c_idx, const float c_1,
-    const float c_2, const bool strong_wolfe, const bool approx_wolfe,
-    const int l1, const int l2, const int batchsize) {
  using namespace Curobo::Optimization;
  assert(l2 <= 1024);

  // multiple of 32
  const int threadsPerBlock = 32 * ((l2 + 31) / 32); //  l2;
-  const int blocksPerGrid = batchsize;
+  const int blocksPerGrid   = batchsize;

  cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  AT_DISPATCH_FLOATING_TYPES(
-      g_x.scalar_type(), "line_search_cu", ([&] {
-        line_search_kernel_mask<scalar_t, scalar_t>
-            <<<blocksPerGrid, threadsPerBlock, 0, stream>>>(
-                // m_idx.data_ptr<int>(),
-                best_x.data_ptr<scalar_t>(), best_c.data_ptr<scalar_t>(),
-                best_grad.data_ptr<scalar_t>(), g_x.data_ptr<scalar_t>(),
-                x_set.data_ptr<scalar_t>(), step_vec.data_ptr<scalar_t>(),
-                c_0.data_ptr<scalar_t>(), alpha_list.data_ptr<scalar_t>(),
-                c_idx.data_ptr<int64_t>(), c_1, c_2, strong_wolfe, approx_wolfe,
-                l1, l2, batchsize);
-      }));
+    g_x.scalar_type(), "line_search_cu", ([&] {
+    line_search_kernel_mask<scalar_t, scalar_t>
+      << < blocksPerGrid, threadsPerBlock, 0, stream >> > (
+
+      // m_idx.data_ptr<int>(),
+      best_x.data_ptr<scalar_t>(), best_c.data_ptr<scalar_t>(),
+      best_grad.data_ptr<scalar_t>(), g_x.data_ptr<scalar_t>(),
+      x_set.data_ptr<scalar_t>(), step_vec.data_ptr<scalar_t>(),
+      c_0.data_ptr<scalar_t>(), alpha_list.data_ptr<scalar_t>(),
+      c_idx.data_ptr<int64_t>(), c_1, c_2, strong_wolfe, approx_wolfe,
+      l1, l2, batchsize);
+  }));
  C10_CUDA_KERNEL_LAUNCH_CHECK();

-  return {best_x, best_c, best_grad};
-}
+  return { best_x, best_c, best_grad };
+}
--- a/src/curobo/curobolib/cpp/pose_distance_kernel.cu
+++ b/src/curobo/curobolib/cpp/pose_distance_kernel.cu
--- a/src/curobo/curobolib/cpp/self_collision_kernel.cu
+++ b/src/curobo/curobolib/cpp/self_collision_kernel.cu
--- a/src/curobo/curobolib/cpp/sphere_obb_kernel.cu
+++ b/src/curobo/curobolib/cpp/sphere_obb_kernel.cu
--- a/src/curobo/curobolib/cpp/tensor_step_cuda.cpp
+++ b/src/curobo/curobolib/cpp/tensor_step_cuda.cpp
@@ -13,72 +13,120 @@
 #include <c10/cuda/CUDAGuard.h>
 #include <vector>

-std::vector<torch::Tensor> step_position_clique(
-    torch::Tensor out_position, torch::Tensor out_velocity,
-    torch::Tensor out_acceleration, torch::Tensor out_jerk,
-    const torch::Tensor u_position, const torch::Tensor start_position,
-    const torch::Tensor start_velocity, const torch::Tensor start_acceleration,
-    const torch::Tensor traj_dt, const int batch_size, const int horizon,
-    const int dof);
-std::vector<torch::Tensor> step_position_clique2(
-    torch::Tensor out_position, torch::Tensor out_velocity,
-    torch::Tensor out_acceleration, torch::Tensor out_jerk,
-    const torch::Tensor u_position, const torch::Tensor start_position,
-    const torch::Tensor start_velocity, const torch::Tensor start_acceleration,
-    const torch::Tensor traj_dt, const int batch_size, const int horizon,
-    const int dof, const int mode);
-std::vector<torch::Tensor> step_position_clique2_idx(
-    torch::Tensor out_position, torch::Tensor out_velocity,
-    torch::Tensor out_acceleration, torch::Tensor out_jerk,
-    const torch::Tensor u_position, const torch::Tensor start_position,
-    const torch::Tensor start_velocity, const torch::Tensor start_acceleration,
-    const torch::Tensor start_idx, const torch::Tensor traj_dt,
-    const int batch_size, const int horizon, const int dof,  const int mode);
+std::vector<torch::Tensor>step_position_clique(
+  torch::Tensor       out_position,
+  torch::Tensor       out_velocity,
+  torch::Tensor       out_acceleration,
+  torch::Tensor       out_jerk,
+  const torch::Tensor u_position,
+  const torch::Tensor start_position,
+  const torch::Tensor start_velocity,
+  const torch::Tensor start_acceleration,
+  const torch::Tensor traj_dt,
+  const int           batch_size,
+  const int           horizon,
+  const int           dof);
+std::vector<torch::Tensor>step_position_clique2(
+  torch::Tensor       out_position,
+  torch::Tensor       out_velocity,
+  torch::Tensor       out_acceleration,
+  torch::Tensor       out_jerk,
+  const torch::Tensor u_position,
+  const torch::Tensor start_position,
+  const torch::Tensor start_velocity,
+  const torch::Tensor start_acceleration,
+  const torch::Tensor traj_dt,
+  const int           batch_size,
+  const int           horizon,
+  const int           dof,
+  const int           mode);
+std::vector<torch::Tensor>step_position_clique2_idx(
+  torch::Tensor       out_position,
+  torch::Tensor       out_velocity,
+  torch::Tensor       out_acceleration,
+  torch::Tensor       out_jerk,
+  const torch::Tensor u_position,
+  const torch::Tensor start_position,
+  const torch::Tensor start_velocity,
+  const torch::Tensor start_acceleration,
+  const torch::Tensor start_idx,
+  const torch::Tensor traj_dt,
+  const int           batch_size,
+  const int           horizon,
+  const int           dof,
+  const int           mode);

-std::vector<torch::Tensor> backward_step_position_clique(
-    torch::Tensor out_grad_position, const torch::Tensor grad_position,
-    const torch::Tensor grad_velocity, const torch::Tensor grad_acceleration,
-    const torch::Tensor grad_jerk, const torch::Tensor traj_dt,
-    const int batch_size, const int horizon, const int dof);
-std::vector<torch::Tensor> backward_step_position_clique2(
-    torch::Tensor out_grad_position, const torch::Tensor grad_position,
-    const torch::Tensor grad_velocity, const torch::Tensor grad_acceleration,
-    const torch::Tensor grad_jerk, const torch::Tensor traj_dt,
-    const int batch_size, const int horizon, const int dof,  const int mode);
+std::vector<torch::Tensor>backward_step_position_clique(
+  torch::Tensor       out_grad_position,
+  const torch::Tensor grad_position,
+  const torch::Tensor grad_velocity,
+  const torch::Tensor grad_acceleration,
+  const torch::Tensor grad_jerk,
+  const torch::Tensor traj_dt,
+  const int           batch_size,
+  const int           horizon,
+  const int           dof);
+std::vector<torch::Tensor>backward_step_position_clique2(
+  torch::Tensor       out_grad_position,
+  const torch::Tensor grad_position,
+  const torch::Tensor grad_velocity,
+  const torch::Tensor grad_acceleration,
+  const torch::Tensor grad_jerk,
+  const torch::Tensor traj_dt,
+  const int           batch_size,
+  const int           horizon,
+  const int           dof,
+  const int           mode);

 std::vector<torch::Tensor>
-step_acceleration(torch::Tensor out_position, torch::Tensor out_velocity,
-                  torch::Tensor out_acceleration, torch::Tensor out_jerk,
-                  const torch::Tensor u_acc, const torch::Tensor start_position,
+step_acceleration(torch::Tensor       out_position,
+                  torch::Tensor       out_velocity,
+                  torch::Tensor       out_acceleration,
+                  torch::Tensor       out_jerk,
+                  const torch::Tensor u_acc,
+                  const torch::Tensor start_position,
                  const torch::Tensor start_velocity,
                  const torch::Tensor start_acceleration,
-                  const torch::Tensor traj_dt, const int batch_size,
-                  const int horizon, const int dof, const bool use_rk2 = true);
+                  const torch::Tensor traj_dt,
+                  const int           batch_size,
+                  const int           horizon,
+                  const int           dof,
+                  const bool          use_rk2 = true);
+
+std::vector<torch::Tensor>step_acceleration_idx(
+  torch::Tensor       out_position,
+  torch::Tensor       out_velocity,
+  torch::Tensor       out_acceleration,
+  torch::Tensor       out_jerk,
+  const torch::Tensor u_acc,
+  const torch::Tensor start_position,
+  const torch::Tensor start_velocity,
+  const torch::Tensor start_acceleration,
+  const torch::Tensor start_idx,
+  const torch::Tensor traj_dt,
+  const int           batch_size,
+  const int           horizon,
+  const int           dof,
+  const bool          use_rk2 = true);

-std::vector<torch::Tensor> step_acceleration_idx(
-    torch::Tensor out_position, torch::Tensor out_velocity,
-    torch::Tensor out_acceleration, torch::Tensor out_jerk,
-    const torch::Tensor u_acc, const torch::Tensor start_position,
-    const torch::Tensor start_velocity, const torch::Tensor start_acceleration,
-    const torch::Tensor start_idx, const torch::Tensor traj_dt,
-    const int batch_size, const int horizon, const int dof,
-    const bool use_rk2 = true);
 // NOTE: AT_ASSERT has become AT_CHECK on master after 0.4.
-#define CHECK_CUDA(x) AT_ASSERTM(x.is_cuda(), #x " must be a CUDA tensor")
-#define CHECK_CONTIGUOUS(x)                                                    \
-  AT_ASSERTM(x.is_contiguous(), #x " must be contiguous")
-#define CHECK_INPUT(x)                                                         \
-  CHECK_CUDA(x);                                                               \
+#define CHECK_CUDA(x) AT_ASSERTM(x.is_cuda(), # x " must be a CUDA tensor")
+#define CHECK_CONTIGUOUS(x) \
+  AT_ASSERTM(x.is_contiguous(), # x " must be contiguous")
+#define CHECK_INPUT(x) \
+  CHECK_CUDA(x);       \
  CHECK_CONTIGUOUS(x)

-std::vector<torch::Tensor> step_position_clique_wrapper(
-    torch::Tensor out_position, torch::Tensor out_velocity,
-    torch::Tensor out_acceleration, torch::Tensor out_jerk,
-    const torch::Tensor u_position, const torch::Tensor start_position,
-    const torch::Tensor start_velocity, const torch::Tensor start_acceleration,
-    const torch::Tensor traj_dt, const int batch_size, const int horizon,
-    const int dof) {
+std::vector<torch::Tensor>step_position_clique_wrapper(
+  torch::Tensor out_position, torch::Tensor out_velocity,
+  torch::Tensor out_acceleration, torch::Tensor out_jerk,
+  const torch::Tensor u_position, const torch::Tensor start_position,
+  const torch::Tensor start_velocity, const torch::Tensor start_acceleration,
+  const torch::Tensor traj_dt, const int batch_size, const int horizon,
+  const int dof)
+{
  const at::cuda::OptionalCUDAGuard guard(u_position.device());
+
  assert(false); // not supported
  CHECK_INPUT(u_position);
  CHECK_INPUT(out_position);
@@ -96,14 +144,15 @@ std::vector<torch::Tensor> step_position_clique_wrapper(
                              batch_size, horizon, dof);
 }

-std::vector<torch::Tensor> step_position_clique2_wrapper(
-    torch::Tensor out_position, torch::Tensor out_velocity,
-    torch::Tensor out_acceleration, torch::Tensor out_jerk,
-    const torch::Tensor u_position, const torch::Tensor start_position,
-    const torch::Tensor start_velocity, const torch::Tensor start_acceleration,
-    const torch::Tensor traj_dt, const int batch_size, const int horizon,
-    const int dof,
-    const int mode) {
+std::vector<torch::Tensor>step_position_clique2_wrapper(
+  torch::Tensor out_position, torch::Tensor out_velocity,
+  torch::Tensor out_acceleration, torch::Tensor out_jerk,
+  const torch::Tensor u_position, const torch::Tensor start_position,
+  const torch::Tensor start_velocity, const torch::Tensor start_acceleration,
+  const torch::Tensor traj_dt, const int batch_size, const int horizon,
+  const int dof,
+  const int mode)
+{
  const at::cuda::OptionalCUDAGuard guard(u_position.device());

  CHECK_INPUT(u_position);
@@ -122,14 +171,15 @@ std::vector<torch::Tensor> step_position_clique2_wrapper(
                               batch_size, horizon, dof, mode);
 }

-std::vector<torch::Tensor> step_position_clique2_idx_wrapper(
-    torch::Tensor out_position, torch::Tensor out_velocity,
-    torch::Tensor out_acceleration, torch::Tensor out_jerk,
-    const torch::Tensor u_position, const torch::Tensor start_position,
-    const torch::Tensor start_velocity, const torch::Tensor start_acceleration,
-    const torch::Tensor start_idx, const torch::Tensor traj_dt,
-    const int batch_size, const int horizon, const int dof,
-    const int mode) {
+std::vector<torch::Tensor>step_position_clique2_idx_wrapper(
+  torch::Tensor out_position, torch::Tensor out_velocity,
+  torch::Tensor out_acceleration, torch::Tensor out_jerk,
+  const torch::Tensor u_position, const torch::Tensor start_position,
+  const torch::Tensor start_velocity, const torch::Tensor start_acceleration,
+  const torch::Tensor start_idx, const torch::Tensor traj_dt,
+  const int batch_size, const int horizon, const int dof,
+  const int mode)
+{
  const at::cuda::OptionalCUDAGuard guard(u_position.device());

  CHECK_INPUT(u_position);
@@ -144,17 +194,19 @@ std::vector<torch::Tensor> step_position_clique2_idx_wrapper(
  CHECK_INPUT(start_idx);

  return step_position_clique2_idx(
-      out_position, out_velocity, out_acceleration, out_jerk, u_position,
-      start_position, start_velocity, start_acceleration, start_idx, traj_dt,
-      batch_size, horizon, dof, mode);
+    out_position, out_velocity, out_acceleration, out_jerk, u_position,
+    start_position, start_velocity, start_acceleration, start_idx, traj_dt,
+    batch_size, horizon, dof, mode);
 }

-std::vector<torch::Tensor> backward_step_position_clique_wrapper(
-    torch::Tensor out_grad_position, const torch::Tensor grad_position,
-    const torch::Tensor grad_velocity, const torch::Tensor grad_acceleration,
-    const torch::Tensor grad_jerk, const torch::Tensor traj_dt,
-    const int batch_size, const int horizon, const int dof) {
+std::vector<torch::Tensor>backward_step_position_clique_wrapper(
+  torch::Tensor out_grad_position, const torch::Tensor grad_position,
+  const torch::Tensor grad_velocity, const torch::Tensor grad_acceleration,
+  const torch::Tensor grad_jerk, const torch::Tensor traj_dt,
+  const int batch_size, const int horizon, const int dof)
+{
  const at::cuda::OptionalCUDAGuard guard(grad_position.device());
+
  assert(false); // not supported
  CHECK_INPUT(out_grad_position);
  CHECK_INPUT(grad_position);
@@ -164,15 +216,17 @@ std::vector<torch::Tensor> backward_step_position_clique_wrapper(
  CHECK_INPUT(traj_dt);

  return backward_step_position_clique(
-      out_grad_position, grad_position, grad_velocity, grad_acceleration,
-      grad_jerk, traj_dt, batch_size, horizon, dof);
+    out_grad_position, grad_position, grad_velocity, grad_acceleration,
+    grad_jerk, traj_dt, batch_size, horizon, dof);
 }
-std::vector<torch::Tensor> backward_step_position_clique2_wrapper(
-    torch::Tensor out_grad_position, const torch::Tensor grad_position,
-    const torch::Tensor grad_velocity, const torch::Tensor grad_acceleration,
-    const torch::Tensor grad_jerk, const torch::Tensor traj_dt,
-    const int batch_size, const int horizon, const int dof,
-    const int mode) {
+
+std::vector<torch::Tensor>backward_step_position_clique2_wrapper(
+  torch::Tensor out_grad_position, const torch::Tensor grad_position,
+  const torch::Tensor grad_velocity, const torch::Tensor grad_acceleration,
+  const torch::Tensor grad_jerk, const torch::Tensor traj_dt,
+  const int batch_size, const int horizon, const int dof,
+  const int mode)
+{
  const at::cuda::OptionalCUDAGuard guard(grad_position.device());

  CHECK_INPUT(out_grad_position);
@@ -183,17 +237,18 @@ std::vector<torch::Tensor> backward_step_position_clique2_wrapper(
  CHECK_INPUT(traj_dt);

  return backward_step_position_clique2(
-      out_grad_position, grad_position, grad_velocity, grad_acceleration,
-      grad_jerk, traj_dt, batch_size, horizon, dof, mode);
+    out_grad_position, grad_position, grad_velocity, grad_acceleration,
+    grad_jerk, traj_dt, batch_size, horizon, dof, mode);
 }

-std::vector<torch::Tensor> step_acceleration_wrapper(
-    torch::Tensor out_position, torch::Tensor out_velocity,
-    torch::Tensor out_acceleration, torch::Tensor out_jerk,
-    const torch::Tensor u_acc, const torch::Tensor start_position,
-    const torch::Tensor start_velocity, const torch::Tensor start_acceleration,
-    const torch::Tensor traj_dt, const int batch_size, const int horizon,
-    const int dof, const bool use_rk2 = true) {
+std::vector<torch::Tensor>step_acceleration_wrapper(
+  torch::Tensor out_position, torch::Tensor out_velocity,
+  torch::Tensor out_acceleration, torch::Tensor out_jerk,
+  const torch::Tensor u_acc, const torch::Tensor start_position,
+  const torch::Tensor start_velocity, const torch::Tensor start_acceleration,
+  const torch::Tensor traj_dt, const int batch_size, const int horizon,
+  const int dof, const bool use_rk2 = true)
+{
  const at::cuda::OptionalCUDAGuard guard(u_acc.device());

  CHECK_INPUT(u_acc);
@@ -212,14 +267,15 @@ std::vector<torch::Tensor> step_acceleration_wrapper(
                           dof, use_rk2);
 }

-std::vector<torch::Tensor> step_acceleration_idx_wrapper(
-    torch::Tensor out_position, torch::Tensor out_velocity,
-    torch::Tensor out_acceleration, torch::Tensor out_jerk,
-    const torch::Tensor u_acc, const torch::Tensor start_position,
-    const torch::Tensor start_velocity, const torch::Tensor start_acceleration,
-    const torch::Tensor start_idx, const torch::Tensor traj_dt,
-    const int batch_size, const int horizon, const int dof,
-    const bool use_rk2 = true) {
+std::vector<torch::Tensor>step_acceleration_idx_wrapper(
+  torch::Tensor out_position, torch::Tensor out_velocity,
+  torch::Tensor out_acceleration, torch::Tensor out_jerk,
+  const torch::Tensor u_acc, const torch::Tensor start_position,
+  const torch::Tensor start_velocity, const torch::Tensor start_acceleration,
+  const torch::Tensor start_idx, const torch::Tensor traj_dt,
+  const int batch_size, const int horizon, const int dof,
+  const bool use_rk2 = true)
+{
  const at::cuda::OptionalCUDAGuard guard(u_acc.device());

  CHECK_INPUT(u_acc);
@@ -239,19 +295,20 @@ std::vector<torch::Tensor> step_acceleration_idx_wrapper(
                               batch_size, horizon, dof, use_rk2);
 }

-PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
-  m.def("step_position", &step_position_clique_wrapper,
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
+{
+  m.def("step_position",           &step_position_clique_wrapper,
        "Tensor Step Position (curobolib)");
-  m.def("step_position2", &step_position_clique2_wrapper,
+  m.def("step_position2",          &step_position_clique2_wrapper,
        "Tensor Step Position (curobolib)");
-  m.def("step_idx_position2", &step_position_clique2_idx_wrapper,
+  m.def("step_idx_position2",      &step_position_clique2_idx_wrapper,
        "Tensor Step Position (curobolib)");
-  m.def("step_position_backward", &backward_step_position_clique_wrapper,
+  m.def("step_position_backward",  &backward_step_position_clique_wrapper,
        "Tensor Step Position (curobolib)");
  m.def("step_position_backward2", &backward_step_position_clique2_wrapper,
        "Tensor Step Position (curobolib)");
-  m.def("step_acceleration", &step_acceleration_wrapper,
+  m.def("step_acceleration",       &step_acceleration_wrapper,
        "Tensor Step Acceleration (curobolib)");
-  m.def("step_acceleration_idx", &step_acceleration_idx_wrapper,
+  m.def("step_acceleration_idx",   &step_acceleration_idx_wrapper,
        "Tensor Step Acceleration (curobolib)");
-}
+}
--- a/src/curobo/curobolib/cpp/tensor_step_kernel.cu
+++ b/src/curobo/curobolib/cpp/tensor_step_kernel.cu
--- a/src/curobo/curobolib/cpp/update_best_kernel.cu
+++ b/src/curobo/curobolib/cpp/update_best_kernel.cu
@@ -26,101 +26,108 @@
 #include <cub/cub.cuh>
 #include <math.h>

-namespace Curobo {
-
-namespace Optimization {
-
-// We launch with d_opt*cost_s1 threads.
-// We assume that cost_s2 is always 1.
-template <typename scalar_t>
-__global__ void update_best_kernel(scalar_t *best_cost,     // 200x1
-                                   scalar_t *best_q,        // 200x7
-                                   int16_t *best_iteration, // 200 x 1
-                                   int16_t *current_iteration, // 1
-                                   const scalar_t *cost,    // 200x1
-                                   const scalar_t *q,       // 200x7
-                                   const int d_opt,         // 7
-                                   const int cost_s1,       // 200
-                                   const int cost_s2, 
-                                   const int iteration,
-                                   const float delta_threshold,
-                                   const float relative_threshold) // 1
+namespace Curobo
 {
-  int tid = blockIdx.x * blockDim.x + threadIdx.x;
-  int size = cost_s1 * d_opt; // size of best_q
-  if (tid >= size) {
-    return;
-  }
-
-  const int cost_idx = tid / d_opt;
-  const float cost_new = cost[cost_idx];
-  const float best_cost_in = best_cost[cost_idx];
-  const bool change = (best_cost_in - cost_new) > delta_threshold && cost_new < best_cost_in * relative_threshold;
-  if (change) {
-    best_q[tid] = q[tid]; // update best_q
-
-    if (tid % d_opt == 0) {
-      best_cost[cost_idx] = cost_new ; // update best_cost
-      //best_iteration[cost_idx] = curr_iter + iteration; // 
-             // this tensor keeps track of whether the cost reduced by at least
-             // delta_threshold.
-             // here iteration is the last_best parameter. 
-    }
-  }
-
-  if (tid % d_opt == 0)
+  namespace Optimization
  {
-    if (change)
+    // We launch with d_opt*cost_s1 threads.
+    // We assume that cost_s2 is always 1.
+    template<typename scalar_t>
+    __global__ void update_best_kernel(scalar_t       *best_cost,         // 200x1
+                                       scalar_t       *best_q,            // 200x7
+                                       int16_t        *best_iteration,    // 200 x 1
+                                       int16_t        *current_iteration, // 1
+                                       const scalar_t *cost,              // 200x1
+                                       const scalar_t *q,                 // 200x7
+                                       const int       d_opt,             // 7
+                                       const int       cost_s1,           // 200
+                                       const int       cost_s2,
+                                       const int       iteration,
+                                       const float     delta_threshold,
+                                       const float     relative_threshold) // 1
    {
-      best_iteration[cost_idx] = 0;
+      int tid  = blockIdx.x * blockDim.x + threadIdx.x;
+      int size = cost_s1 * d_opt;                                          // size of best_q
+
+      if (tid >= size)
+      {
+        return;
+      }
+
+      const int   cost_idx     = tid / d_opt;
+      const float cost_new     = cost[cost_idx];
+      const float best_cost_in = best_cost[cost_idx];
+      const bool  change       = (best_cost_in - cost_new) > delta_threshold &&
+                                 cost_new < best_cost_in * relative_threshold;
+
+      if (change)
+      {
+        best_q[tid] = q[tid]; // update best_q
+
+        if (tid % d_opt == 0)
+        {
+          best_cost[cost_idx] = cost_new; // update best_cost
+          // best_iteration[cost_idx] = curr_iter + iteration; //
+          // this tensor keeps track of whether the cost reduced by at least
+          // delta_threshold.
+          // here iteration is the last_best parameter.
+        }
+      }
+
+      if (tid % d_opt == 0)
+      {
+        if (change)
+        {
+          best_iteration[cost_idx] = 0;
+        }
+        else
+        {
+          best_iteration[cost_idx] -= 1;
+        }
+      }
+
+      // .if (tid == 0)
+      // {
+      //  curr_iter += 1;
+      //  current_iteration[0] = curr_iter;
+      // }
    }
-    else
-    {
-      best_iteration[cost_idx] -= 1;
-    }
-  }
- 
-  //.if (tid == 0)
-  //{
-  //  curr_iter += 1;
-  //  current_iteration[0] = curr_iter;
-  //}
-  
-} 
-} // namespace Optimization
-} // namespace Curobo
+  } // namespace Optimization
+}   // namespace Curobo

 std::vector<torch::Tensor>
 update_best_cuda(torch::Tensor best_cost, torch::Tensor best_q,
                 torch::Tensor best_iteration,
                 torch::Tensor current_iteration,
-                  const torch::Tensor cost,
+                 const torch::Tensor cost,
                 const torch::Tensor q, const int d_opt, const int cost_s1,
                 const int cost_s2, const int iteration,
                 const float delta_threshold,
-                 const float relative_threshold = 0.999) {
+                 const float relative_threshold = 0.999)
+{
  using namespace Curobo::Optimization;
  const int threadsPerBlock = 128;
-  const int cost_size = cost_s1 * d_opt;
+  const int cost_size       = cost_s1 * d_opt;
  assert(cost_s2 == 1); // assumption
  const int blocksPerGrid = (cost_size + threadsPerBlock - 1) / threadsPerBlock;
+
  // printf("cost_s1=%d, d_opt=%d, blocksPerGrid=%d\n", cost_s1, d_opt,
  // blocksPerGrid);

  cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  AT_DISPATCH_FLOATING_TYPES(
-      cost.scalar_type(), "update_best_cu", ([&] {
-        update_best_kernel<scalar_t>
-            <<<blocksPerGrid, threadsPerBlock, 0, stream>>>(
-                best_cost.data_ptr<scalar_t>(), best_q.data_ptr<scalar_t>(),
-                best_iteration.data_ptr<int16_t>(), 
-                current_iteration.data_ptr<int16_t>(), 
-                cost.data_ptr<scalar_t>(),
-                q.data_ptr<scalar_t>(), d_opt, cost_s1, cost_s2, iteration,
-                delta_threshold, relative_threshold);
-      }));
+    cost.scalar_type(), "update_best_cu", ([&] {
+    update_best_kernel<scalar_t>
+      << < blocksPerGrid, threadsPerBlock, 0, stream >> > (
+      best_cost.data_ptr<scalar_t>(), best_q.data_ptr<scalar_t>(),
+      best_iteration.data_ptr<int16_t>(),
+      current_iteration.data_ptr<int16_t>(),
+      cost.data_ptr<scalar_t>(),
+      q.data_ptr<scalar_t>(), d_opt, cost_s1, cost_s2, iteration,
+      delta_threshold, relative_threshold);
+  }));
  C10_CUDA_KERNEL_LAUNCH_CHECK();

-  return {best_cost, best_q, best_iteration};
-}
+  return { best_cost, best_q, best_iteration };
+}
--- a/src/curobo/curobolib/geom.py
+++ b/src/curobo/curobolib/geom.py
@@ -153,6 +153,8 @@ def get_pose_distance(
    vec_convergence,
    run_weight,
    run_vec_weight,
+    offset_waypoint,
+    offset_tstep_fraction,
    batch_pose_idx,
    batch_size,
    horizon,
@@ -161,6 +163,7 @@ def get_pose_distance(
    write_grad=False,
    write_distance=False,
    use_metric=False,
+    project_distance=True,
 ):
    if batch_pose_idx.shape[0] != batch_size:
        raise ValueError("Index buffer size is different from batch size")
@@ -181,6 +184,8 @@ def get_pose_distance(
        vec_convergence,
        run_weight,
        run_vec_weight,
+        offset_waypoint,
+        offset_tstep_fraction,
        batch_pose_idx,
        batch_size,
        horizon,
@@ -189,6 +194,7 @@ def get_pose_distance(
        write_grad,
        write_distance,
        use_metric,
+        project_distance,
    )

    out_distance = r[0]
@@ -229,6 +235,331 @@ def get_pose_distance_backward(
    return r[0], r[1]


+@torch.jit.script
+def backward_PoseError_jit(grad_g_dist, grad_out_distance, weight, g_vec):
+    grad_vec = grad_g_dist + (grad_out_distance * weight)
+    grad = 1.0 * (grad_vec).unsqueeze(-1) * g_vec
+    return grad
+
+
+# full method:
+@torch.jit.script
+def backward_full_PoseError_jit(
+    grad_out_distance, grad_g_dist, grad_r_err, p_w, q_w, g_vec_p, g_vec_q
+):
+    p_grad = (grad_g_dist + (grad_out_distance * p_w)).unsqueeze(-1) * g_vec_p
+    q_grad = (grad_r_err + (grad_out_distance * q_w)).unsqueeze(-1) * g_vec_q
+    # p_grad = ((grad_out_distance * p_w)).unsqueeze(-1) * g_vec_p
+    # q_grad = ((grad_out_distance * q_w)).unsqueeze(-1) * g_vec_q
+
+    return p_grad, q_grad
+
+
+class PoseErrorDistance(torch.autograd.Function):
+    @staticmethod
+    def forward(
+        ctx,
+        current_position,
+        goal_position,
+        current_quat,
+        goal_quat,
+        vec_weight,
+        weight,
+        vec_convergence,
+        run_weight,
+        run_vec_weight,
+        offset_waypoint,
+        offset_tstep_fraction,
+        batch_pose_idx,
+        out_distance,
+        out_position_distance,
+        out_rotation_distance,
+        out_p_vec,
+        out_r_vec,
+        out_idx,
+        out_p_grad,
+        out_q_grad,
+        batch_size,
+        horizon,
+        mode,  # =PoseErrorType.BATCH_GOAL.value,
+        num_goals,
+        use_metric,  # =False,
+        project_distance,  # =True,
+    ):
+        # out_distance = current_position[..., 0].detach().clone() * 0.0
+        # out_position_distance = out_distance.detach().clone()
+        # out_rotation_distance = out_distance.detach().clone()
+        # out_vec = (
+        #    torch.cat((current_position.detach().clone(), current_quat.detach().clone()), dim=-1)
+        #    * 0.0
+        # )
+        # out_idx = out_distance.clone().to(dtype=torch.long)
+
+        (
+            out_distance,
+            out_position_distance,
+            out_rotation_distance,
+            out_p_vec,
+            out_r_vec,
+            out_idx,
+        ) = get_pose_distance(
+            out_distance,
+            out_position_distance,
+            out_rotation_distance,
+            out_p_vec,
+            out_r_vec,
+            out_idx,
+            current_position.contiguous(),
+            goal_position,
+            current_quat.contiguous(),
+            goal_quat,
+            vec_weight,
+            weight,
+            vec_convergence,
+            run_weight,
+            run_vec_weight,
+            offset_waypoint,
+            offset_tstep_fraction,
+            batch_pose_idx,
+            batch_size,
+            horizon,
+            mode,
+            num_goals,
+            current_position.requires_grad,
+            True,
+            use_metric,
+            project_distance,
+        )
+        ctx.save_for_backward(out_p_vec, out_r_vec, weight, out_p_grad, out_q_grad)
+        return out_distance, out_position_distance, out_rotation_distance, out_idx  # .view(-1,1)
+
+    @staticmethod
+    def backward(ctx, grad_out_distance, grad_g_dist, grad_r_err, grad_out_idx):
+        (g_vec_p, g_vec_q, weight, out_grad_p, out_grad_q) = ctx.saved_tensors
+        pos_grad = None
+        quat_grad = None
+        batch_size = g_vec_p.shape[0] * g_vec_p.shape[1]
+        if ctx.needs_input_grad[0] and ctx.needs_input_grad[2]:
+            pos_grad, quat_grad = get_pose_distance_backward(
+                out_grad_p,
+                out_grad_q,
+                grad_out_distance.contiguous(),
+                grad_g_dist.contiguous(),
+                grad_r_err.contiguous(),
+                weight,
+                g_vec_p,
+                g_vec_q,
+                batch_size,
+                use_distance=True,
+            )
+
+        elif ctx.needs_input_grad[0]:
+            pos_grad = backward_PoseError_jit(grad_g_dist, grad_out_distance, weight[1], g_vec_p)
+
+        elif ctx.needs_input_grad[2]:
+            quat_grad = backward_PoseError_jit(grad_r_err, grad_out_distance, weight[0], g_vec_q)
+
+        return (
+            pos_grad,
+            None,
+            quat_grad,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+        )
+
+
+class PoseError(torch.autograd.Function):
+    @staticmethod
+    def forward(
+        ctx,
+        current_position: torch.Tensor,
+        goal_position: torch.Tensor,
+        current_quat: torch.Tensor,
+        goal_quat,
+        vec_weight,
+        weight,
+        vec_convergence,
+        run_weight,
+        run_vec_weight,
+        offset_waypoint,
+        offset_tstep_fraction,
+        batch_pose_idx,
+        out_distance,
+        out_position_distance,
+        out_rotation_distance,
+        out_p_vec,
+        out_r_vec,
+        out_idx,
+        out_p_grad,
+        out_q_grad,
+        batch_size,
+        horizon,
+        mode,
+        num_goals,
+        use_metric,
+        project_distance,
+        return_loss,
+    ):
+        """Compute error in pose
+
+        _extended_summary_
+
+        Args:
+            ctx: _description_
+            current_position: _description_
+            goal_position: _description_
+            current_quat: _description_
+            goal_quat: _description_
+            vec_weight: _description_
+            weight: _description_
+            vec_convergence: _description_
+            run_weight: _description_
+            run_vec_weight: _description_
+            offset_waypoint: _description_
+            offset_tstep_fraction: _description_
+            batch_pose_idx: _description_
+            out_distance: _description_
+            out_position_distance: _description_
+            out_rotation_distance: _description_
+            out_p_vec: _description_
+            out_r_vec: _description_
+            out_idx: _description_
+            out_p_grad: _description_
+            out_q_grad: _description_
+            batch_size: _description_
+            horizon: _description_
+            mode: _description_
+            num_goals: _description_
+            use_metric: _description_
+            project_distance: _description_
+            return_loss: _description_
+
+        Returns:
+            _description_
+        """
+        # out_distance = current_position[..., 0].detach().clone() * 0.0
+        # out_position_distance = out_distance.detach().clone()
+        # out_rotation_distance = out_distance.detach().clone()
+        # out_vec = (
+        #    torch.cat((current_position.detach().clone(), current_quat.detach().clone()), dim=-1)
+        #    * 0.0
+        # )
+        # out_idx = out_distance.clone().to(dtype=torch.long)
+        ctx.return_loss = return_loss
+        (
+            out_distance,
+            out_position_distance,
+            out_rotation_distance,
+            out_p_vec,
+            out_r_vec,
+            out_idx,
+        ) = get_pose_distance(
+            out_distance,
+            out_position_distance,
+            out_rotation_distance,
+            out_p_vec,
+            out_r_vec,
+            out_idx,
+            current_position.contiguous(),
+            goal_position,
+            current_quat.contiguous(),
+            goal_quat,
+            vec_weight,
+            weight,
+            vec_convergence,
+            run_weight,
+            run_vec_weight,
+            offset_waypoint,
+            offset_tstep_fraction,
+            batch_pose_idx,
+            batch_size,
+            horizon,
+            mode,
+            num_goals,
+            current_position.requires_grad,
+            False,
+            use_metric,
+            project_distance,
+        )
+        ctx.save_for_backward(out_p_vec, out_r_vec)
+        return out_distance
+
+    @staticmethod
+    def backward(ctx, grad_out_distance):  # , grad_g_dist, grad_r_err, grad_out_idx):
+        pos_grad = None
+        quat_grad = None
+        if ctx.needs_input_grad[0] and ctx.needs_input_grad[2]:
+            (g_vec_p, g_vec_q) = ctx.saved_tensors
+            pos_grad = g_vec_p
+            quat_grad = g_vec_q
+            if ctx.return_loss:
+                pos_grad = pos_grad * grad_out_distance.unsqueeze(1)
+                quat_grad = quat_grad * grad_out_distance.unsqueeze(1)
+
+        elif ctx.needs_input_grad[0]:
+            (g_vec_p, g_vec_q) = ctx.saved_tensors
+
+            pos_grad = g_vec_p
+            if ctx.return_loss:
+                pos_grad = pos_grad * grad_out_distance.unsqueeze(1)
+        elif ctx.needs_input_grad[2]:
+            (g_vec_p, g_vec_q) = ctx.saved_tensors
+
+            quat_grad = g_vec_q
+            if ctx.return_loss:
+                quat_grad = quat_grad * grad_out_distance.unsqueeze(1)
+        return (
+            pos_grad,
+            None,
+            quat_grad,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+        )
+
+
 class SdfSphereOBB(torch.autograd.Function):
    @staticmethod
    def forward(
--- a/src/curobo/geom/cv.py
+++ b/src/curobo/geom/cv.py
@@ -0,0 +1,118 @@
+#
+# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
+# property and proprietary rights in and to this material, related
+# documentation and any modifications thereto. Any use, reproduction,
+# disclosure or distribution of this material and related documentation
+# without an express license agreement from NVIDIA CORPORATION or
+# its affiliates is strictly prohibited.
+#
+
+# Third Party
+import torch
+
+
+@torch.jit.script
+def project_depth_to_pointcloud(depth_image: torch.Tensor, intrinsics_matrix: torch.Tensor):
+    """Projects numpy depth image to point cloud.
+
+    Args:
+      np_depth_image: numpy array float, shape (h, w).
+        intrinsics array: numpy array float, 3x3 intrinsics matrix.
+
+    Returns:
+      array of float (h, w, 3)
+    """
+    fx = intrinsics_matrix[0, 0]
+    fy = intrinsics_matrix[1, 1]
+    cx = intrinsics_matrix[0, 2]
+    cy = intrinsics_matrix[1, 2]
+    height = depth_image.shape[0]
+    width = depth_image.shape[1]
+    input_x = torch.arange(width, dtype=torch.float32, device=depth_image.device)
+    input_y = torch.arange(height, dtype=torch.float32, device=depth_image.device)
+    input_x, input_y = torch.meshgrid(input_x, input_y, indexing="ij")
+    input_x, input_y = input_x.T, input_y.T
+
+    input_z = depth_image
+    output_x = (input_x * input_z - cx * input_z) / fx
+    output_y = (input_y * input_z - cy * input_z) / fy
+    raw_pc = torch.stack([output_x, output_y, input_z], -1)
+
+    return raw_pc
+
+
+@torch.jit.script
+def get_projection_rays(height: int, width: int, intrinsics_matrix: torch.Tensor):
+    """Projects numpy depth image to point cloud.
+
+    Args:
+      np_depth_image: numpy array float, shape (h, w).
+        intrinsics array: numpy array float, 3x3 intrinsics matrix.
+
+    Returns:
+      array of float (h, w, 3)
+    """
+    fx = intrinsics_matrix[:, 0, 0]
+    fy = intrinsics_matrix[:, 1, 1]
+    cx = intrinsics_matrix[:, 0, 2]
+    cy = intrinsics_matrix[:, 1, 2]
+
+    input_x = torch.arange(width, dtype=torch.float32, device=intrinsics_matrix.device)
+    input_y = torch.arange(height, dtype=torch.float32, device=intrinsics_matrix.device)
+    input_x, input_y = torch.meshgrid(input_x, input_y, indexing="ij")
+
+    input_x, input_y = input_x.T, input_y.T
+
+    input_x = input_x.unsqueeze(0).repeat(intrinsics_matrix.shape[0], 1, 1)
+    input_y = input_y.unsqueeze(0).repeat(intrinsics_matrix.shape[0], 1, 1)
+
+    input_z = torch.ones(
+        (intrinsics_matrix.shape[0], height, width),
+        device=intrinsics_matrix.device,
+        dtype=torch.float32,
+    )
+
+    output_x = (input_x - cx) / fx
+    output_y = (input_y - cy) / fy
+
+    rays = torch.stack([output_x, output_y, input_z], -1).reshape(
+        intrinsics_matrix.shape[0], width * height, 3
+    )
+    rays = rays * (1.0 / 1000.0)
+    return rays
+
+
+@torch.jit.script
+def project_pointcloud_to_depth(
+    pointcloud: torch.Tensor,
+    output_image: torch.Tensor,
+):
+    """Projects pointcloud to depth image
+
+    Args:
+      np_depth_image: numpy array float, shape (h, w).
+        intrinsics array: numpy array float, 3x3 intrinsics matrix.
+
+    Returns:
+      array of float (h, w)
+    """
+    width, height = output_image.shape
+
+    output_image = output_image.view(-1)
+    output_image[:] = pointcloud[:, 2]
+    output_image = output_image.view(width, height)
+    return output_image
+
+
+@torch.jit.script
+def project_depth_using_rays(
+    depth_image: torch.Tensor, rays: torch.Tensor, filter_origin: bool = False
+):
+    if filter_origin:
+        depth_image = torch.where(depth_image < 0.01, 0, depth_image)
+
+    depth_image = depth_image.view(depth_image.shape[0], -1, 1).contiguous()
+    points = depth_image * rays
+    return points
--- a/src/curobo/geom/sdf/world.py
+++ b/src/curobo/geom/sdf/world.py
@@ -364,8 +364,12 @@ class WorldPrimitiveCollision(WorldCollision):
        if self.cache is not None and "obb" in self.cache and self.cache["obb"] not in [None, 0]:
            self._create_obb_cache(self.cache["obb"])

-    def load_collision_model(self, world_config: WorldConfig, env_idx=0):
-        self._load_collision_model_in_cache(world_config, env_idx)
+    def load_collision_model(
+        self, world_config: WorldConfig, env_idx=0, fix_cache_reference: bool = False
+    ):
+        self._load_collision_model_in_cache(
+            world_config, env_idx, fix_cache_reference=fix_cache_reference
+        )

    def load_batch_collision_model(self, world_config_list: List[WorldConfig]):
        """Load a batch of collision environments from a list of world configs.
@@ -436,7 +440,9 @@ class WorldPrimitiveCollision(WorldCollision):
        )
        self.collision_types["primitive"] = True

-    def _load_collision_model_in_cache(self, world_config: WorldConfig, env_idx: int = 0):
+    def _load_collision_model_in_cache(
+        self, world_config: WorldConfig, env_idx: int = 0, fix_cache_reference: bool = False
+    ):
        cube_objs = world_config.cuboid
        max_obb = len(cube_objs)
        self.world_model = world_config
@@ -444,8 +450,11 @@ class WorldPrimitiveCollision(WorldCollision):
            log_info("No OBB objs")
            return
        if self._cube_tensor_list is None or self._cube_tensor_list[0].shape[1] < max_obb:
-            log_info("Creating Obb cache" + str(max_obb))
-            self._create_obb_cache(max_obb)
+            if not fix_cache_reference:
+                log_info("Creating Obb cache" + str(max_obb))
+                self._create_obb_cache(max_obb)
+            else:
+                log_error("number of OBB is larger than collision cache, create larger cache.")

        # load as a batch:
        pose_batch = [c.pose for c in cube_objs]
@@ -835,7 +844,9 @@ class WorldPrimitiveCollision(WorldCollision):
        return dist

    def clear_cache(self):
-        pass
+        if self._cube_tensor_list is not None:
+            self._cube_tensor_list[2][:] = 1
+            self._env_n_obbs[:] = 0

    def get_voxels_in_bounding_box(
        self,
--- a/src/curobo/geom/sdf/world_blox.py
+++ b/src/curobo/geom/sdf/world_blox.py
@@ -55,7 +55,7 @@ class WorldBloxCollision(WorldMeshCollision):
        self._blox_voxel_sizes = [0.02]
        super().__init__(config)

-    def load_collision_model(self, world_model: WorldConfig):
+    def load_collision_model(self, world_model: WorldConfig, fix_cache_reference: bool = False):
        # load nvblox mesh
        if len(world_model.blox) > 0:
            # check if there is a mapper instance:
@@ -109,15 +109,17 @@ class WorldBloxCollision(WorldMeshCollision):
            self._blox_names = names
            self.collision_types["blox"] = True

-        return super().load_collision_model(world_model)
+        return super().load_collision_model(world_model, fix_cache_reference=fix_cache_reference)

    def clear_cache(self):
        self._blox_mapper.clear()
+        self._blox_mapper.update_hashmaps()
        super().clear_cache()

    def clear_blox_layer(self, layer_name: str):
        index = self._blox_names.index(layer_name)
        self._blox_mapper.clear(index)
+        self._blox_mapper.update_hashmaps()

    def _get_blox_sdf(
        self,
@@ -147,6 +149,7 @@ class WorldBloxCollision(WorldMeshCollision):
        speed_dt,
        sweep_steps,
        enable_speed_metric,
+        return_loss=False,
    ):
        d = self._blox_mapper.query_sphere_trajectory_sdf_cost(
            query_spheres,
@@ -160,6 +163,8 @@ class WorldBloxCollision(WorldMeshCollision):
            self._blox_tensor_list[1],
            sweep_steps,
            enable_speed_metric,
+            return_loss,
+            use_experimental=False,
        )
        return d

@@ -279,6 +284,7 @@ class WorldBloxCollision(WorldMeshCollision):
            speed_dt=speed_dt,
            sweep_steps=sweep_steps,
            enable_speed_metric=enable_speed_metric,
+            return_loss=return_loss,
        )

        if ("primitive" not in self.collision_types or not self.collision_types["primitive"]) and (
@@ -332,6 +338,7 @@ class WorldBloxCollision(WorldMeshCollision):
            speed_dt=speed_dt,
            sweep_steps=sweep_steps,
            enable_speed_metric=enable_speed_metric,
+            return_loss=return_loss,
        )

        if ("primitive" not in self.collision_types or not self.collision_types["primitive"]) and (
--- a/src/curobo/geom/sdf/world_mesh.py
+++ b/src/curobo/geom/sdf/world_mesh.py
@@ -72,7 +72,11 @@ class WorldMeshCollision(WorldPrimitiveCollision):
        return super()._init_cache()

    def load_collision_model(
-        self, world_model: WorldConfig, env_idx: int = 0, load_obb_obs: bool = True
+        self,
+        world_model: WorldConfig,
+        env_idx: int = 0,
+        load_obb_obs: bool = True,
+        fix_cache_reference: bool = False,
    ):
        max_nmesh = len(world_model.mesh)
        if max_nmesh > 0:
@@ -91,14 +95,16 @@ class WorldMeshCollision(WorldPrimitiveCollision):

            self.collision_types["mesh"] = True
        if load_obb_obs:
-            super().load_collision_model(world_model, env_idx)
+            super().load_collision_model(
+                world_model, env_idx, fix_cache_reference=fix_cache_reference
+            )
        else:
            self.world_model = world_model

    def load_batch_collision_model(self, world_config_list: List[WorldConfig]):
        max_nmesh = max([len(x.mesh) for x in world_config_list])
        if self._mesh_tensor_list is None or self._mesh_tensor_list[0].shape[1] < max_nmesh:
-            log_info("Creating new Mesh cache: " + str(max_nmesh))
+            log_warn("Creating new Mesh cache: " + str(max_nmesh))
            self._create_mesh_cache(max_nmesh)

        for env_idx, world_model in enumerate(world_config_list):
--- a/src/curobo/geom/transform.py
+++ b/src/curobo/geom/transform.py
@@ -228,6 +228,61 @@ def pose_inverse(
    return out_position, out_quaternion


+@wp.kernel
+def compute_pose_inverse(
+    position: wp.array(dtype=wp.vec3),
+    quat: wp.array(dtype=wp.vec4),
+    out_position: wp.array(dtype=wp.vec3),
+    out_quat: wp.array(dtype=wp.vec4),
+):  # b pose_1 and b pose_2, compute pose_1 * pose_2
+    b_idx = wp.tid()
+    # read data:
+
+    in_position = position[b_idx]
+    in_quat = quat[b_idx]
+
+    # read point
+    # create a transform from a vector/quaternion:
+    t_1 = wp.transform(in_position, wp.quaternion(in_quat[1], in_quat[2], in_quat[3], in_quat[0]))
+    t_3 = wp.transform_inverse(t_1)
+
+    # write pt:
+    out_q = wp.transform_get_rotation(t_3)
+
+    out_v = wp.vec4()
+    out_v[0] = out_q[3]  # out_q[3]
+    out_v[1] = out_q[0]  # [0]
+    out_v[2] = out_q[1]  # wp.extract(out_q, 1)
+    out_v[3] = out_q[2]  # wp.extract(out_q, 2)
+
+    out_position[b_idx] = wp.transform_get_translation(t_3)
+    out_quat[b_idx] = out_v
+
+
+@wp.kernel
+def compute_matrix_to_quat(
+    in_mat: wp.array(dtype=wp.mat33),
+    out_quat: wp.array(dtype=wp.vec4),
+):
+    # b pose_1 and b pose_2, compute pose_1 * pose_2
+    b_idx = wp.tid()
+    # read data:
+
+    in_m = in_mat[b_idx]
+
+    # read point
+    # create a transform from a vector/quaternion:
+    out_q = wp.quat_from_matrix(in_m)
+
+    out_v = wp.vec4()
+    out_v[0] = out_q[3]  # wp.extract(out_q, 3)
+    out_v[1] = out_q[0]  # wp.extract(out_q, 0)
+    out_v[2] = out_q[1]  # wp.extract(out_q, 1)
+    out_v[3] = out_q[2]  # wp.extract(out_q, 2)
+    # write pt:
+    out_quat[b_idx] = out_v
+
+
@wp.kernel
 def compute_transform_point(
    position: wp.array(dtype=wp.vec3),
@@ -331,37 +386,6 @@ def compute_batch_pose_multipy(
    out_quat[b_idx] = out_v


-@wp.kernel
-def compute_pose_inverse(
-    position: wp.array(dtype=wp.vec3),
-    quat: wp.array(dtype=wp.vec4),
-    out_position: wp.array(dtype=wp.vec3),
-    out_quat: wp.array(dtype=wp.vec4),
-):  # b pose_1 and b pose_2, compute pose_1 * pose_2
-    b_idx = wp.tid()
-    # read data:
-
-    in_position = position[b_idx]
-    in_quat = quat[b_idx]
-
-    # read point
-    # create a transform from a vector/quaternion:
-    t_1 = wp.transform(in_position, wp.quaternion(in_quat[1], in_quat[2], in_quat[3], in_quat[0]))
-    t_3 = wp.transform_inverse(t_1)
-
-    # write pt:
-    out_q = wp.transform_get_rotation(t_3)
-
-    out_v = wp.vec4()
-    out_v[0] = out_q[3]
-    out_v[1] = out_q[0]
-    out_v[2] = out_q[1]
-    out_v[3] = out_q[2]
-
-    out_position[b_idx] = wp.transform_get_translation(t_3)
-    out_quat[b_idx] = out_v
-
-
@wp.kernel
 def compute_quat_to_matrix(
    quat: wp.array(dtype=wp.vec4),
@@ -382,30 +406,6 @@ def compute_quat_to_matrix(
    out_mat[b_idx] = m_1


-@wp.kernel
-def compute_matrix_to_quat(
-    in_mat: wp.array(dtype=wp.mat33),
-    out_quat: wp.array(dtype=wp.vec4),
-):
-    # b pose_1 and b pose_2, compute pose_1 * pose_2
-    b_idx = wp.tid()
-    # read data:
-
-    in_m = in_mat[b_idx]
-
-    # read point
-    # create a transform from a vector/quaternion:
-    out_q = wp.quat_from_matrix(in_m)
-
-    out_v = wp.vec4()
-    out_v[0] = out_q[3]
-    out_v[1] = out_q[0]
-    out_v[2] = out_q[1]
-    out_v[3] = out_q[2]
-    # write pt:
-    out_quat[b_idx] = out_v
-
-
@wp.kernel
 def compute_pose_multipy(
    position: wp.array(dtype=wp.vec3),
@@ -962,6 +962,7 @@ class PoseInverse(torch.autograd.Function):
            adj_quaternion,
        )
        ctx.b = b
+
        wp.launch(
            kernel=compute_pose_inverse,
            dim=b,
@@ -1071,6 +1072,7 @@ class QuatToMatrix(torch.autograd.Function):
            adj_quaternion,
        )
        ctx.b = b
+
        wp.launch(
            kernel=compute_quat_to_matrix,
            dim=b,
@@ -1153,6 +1155,7 @@ class MatrixToQuaternion(torch.autograd.Function):
            adj_mat,
        )
        ctx.b = b
+
        wp.launch(
            kernel=compute_matrix_to_quat,
            dim=b,
--- a/src/curobo/geom/types.py
+++ b/src/curobo/geom/types.py
@@ -12,7 +12,7 @@ from __future__ import annotations

 # Standard Library
 from dataclasses import dataclass, field
-from typing import Any, Dict, List, Optional, Sequence
+from typing import Any, Dict, List, Optional, Sequence, Union

 # Third Party
 import numpy as np
@@ -22,6 +22,7 @@ import trimesh
 # CuRobo
 from curobo.geom.sphere_fit import SphereFitType, fit_spheres_to_mesh
 from curobo.types.base import TensorDeviceType
+from curobo.types.camera import CameraObservation
 from curobo.types.math import Pose
 from curobo.util.logger import log_error, log_warn
 from curobo.util_file import get_assets_path, join_path
@@ -60,7 +61,7 @@ class Obstacle:

    tensor_args: TensorDeviceType = TensorDeviceType()

-    def get_trimesh_mesh(self, process: bool = True) -> trimesh.Trimesh:
+    def get_trimesh_mesh(self, process: bool = True, process_color: bool = True) -> trimesh.Trimesh:
        """Create a trimesh instance from the obstacle representation.

        Args:
@@ -74,8 +75,11 @@ class Obstacle:
        """
        raise NotImplementedError

-    def save_as_mesh(self, file_path: str):
+    def save_as_mesh(self, file_path: str, transform_with_pose: bool = False):
        mesh_scene = self.get_trimesh_mesh()
+        if transform_with_pose:
+            mesh_scene.apply_transform(self.get_transform_matrix())
+
        mesh_scene.export(file_path)

    def get_cuboid(self) -> Cuboid:
@@ -238,7 +242,7 @@ class Cuboid(Obstacle):
        if self.pose is None:
            log_error("Cuboid Obstacle requires Pose")

-    def get_trimesh_mesh(self, process: bool = True):
+    def get_trimesh_mesh(self, process: bool = True, process_color: bool = True):
        m = trimesh.creation.box(extents=self.dims)
        if self.color is not None:
            color_visual = trimesh.visual.color.ColorVisuals(
@@ -254,7 +258,7 @@ class Capsule(Obstacle):
    base: List[float] = field(default_factory=lambda: [0.0, 0.0, 0.0])
    tip: List[float] = field(default_factory=lambda: [0.0, 0.0, 0.0])

-    def get_trimesh_mesh(self, process: bool = True):
+    def get_trimesh_mesh(self, process: bool = True, process_color: bool = True):
        height = self.tip[2] - self.base[2]
        if (
            height < 0
@@ -280,7 +284,7 @@ class Cylinder(Obstacle):
    radius: float = 0.0
    height: float = 0.0

-    def get_trimesh_mesh(self, process: bool = True):
+    def get_trimesh_mesh(self, process: bool = True, process_color: bool = True):
        m = trimesh.creation.cylinder(radius=self.radius, height=self.height)
        if self.color is not None:
            color_visual = trimesh.visual.color.ColorVisuals(
@@ -304,7 +308,7 @@ class Sphere(Obstacle):
        if self.pose is not None:
            self.position = self.pose[:3]

-    def get_trimesh_mesh(self, process: bool = True):
+    def get_trimesh_mesh(self, process: bool = True, process_color: bool = True):
        m = trimesh.creation.icosphere(radius=self.radius)
        if self.color is not None:
            color_visual = trimesh.visual.color.ColorVisuals(
@@ -356,9 +360,9 @@ class Sphere(Obstacle):
@dataclass
 class Mesh(Obstacle):
    file_path: Optional[str] = None
-    file_string: Optional[
-        str
-    ] = None  # storing full mesh as a string, loading from this is not implemented yet.
+    file_string: Optional[str] = (
+        None  # storing full mesh as a string, loading from this is not implemented yet.
+    )
    urdf_path: Optional[str] = None  # useful for visualization in isaac gym.
    vertices: Optional[List[List[float]]] = None
    faces: Optional[List[int]] = None
@@ -375,13 +379,13 @@ class Mesh(Obstacle):
            self.vertices = np.ravel(self.scale) * self.vertices
            self.scale = None

-    def get_trimesh_mesh(self, process: bool = True):
+    def get_trimesh_mesh(self, process: bool = True, process_color: bool = True):
        # load mesh from filepath or verts and faces:
        if self.file_path is not None:
            m = trimesh.load(self.file_path, process=process, force="mesh")
            if isinstance(m, trimesh.Scene):
                m = m.dump(concatenate=True)
-            if isinstance(m.visual, trimesh.visual.texture.TextureVisuals):
+            if process_color and isinstance(m.visual, trimesh.visual.texture.TextureVisuals):
                m.visual = m.visual.to_color()
        else:
            m = trimesh.Trimesh(
@@ -467,7 +471,7 @@ class BloxMap(Obstacle):
                name=self.name + "_mesh", file_path=self.mesh_file_path, pose=self.pose
            )

-    def get_trimesh_mesh(self, process: bool = True):
+    def get_trimesh_mesh(self, process: bool = True, process_color: bool = True):
        if self.mesh is not None:
            return self.mesh.get_trimesh_mesh(process)
        else:
@@ -475,6 +479,91 @@ class BloxMap(Obstacle):
            return None


+@dataclass
+class PointCloud(Obstacle):
+    points: Union[torch.Tensor, np.ndarray, List[List[float]]] = None
+    points_features: Union[torch.Tensor, np.ndarray, List[List[float]], None] = None
+
+    def __post_init__(self):
+        if self.scale is not None and self.points is not None:
+            self.points = np.ravel(self.scale) * self.points
+            self.scale = None
+
+    def get_trimesh_mesh(self, process: bool = True, process_color: bool = True):
+        points = self.points
+        if isinstance(points, torch.Tensor):
+            points = points.view(-1, 3).cpu().numpy()
+        if isinstance(points, list):
+            points = np.ndarray(points)
+
+        mesh = Mesh.from_pointcloud(points, pose=self.pose)
+        return mesh.get_trimesh_mesh()
+
+    def get_mesh_data(self, process: bool = True):
+        verts = faces = None
+        m = self.get_trimesh_mesh(process=process)
+        verts = m.vertices.view(np.ndarray)
+        faces = m.faces
+        return verts, faces
+
+    @staticmethod
+    def from_camera_observation(
+        camera_obs: CameraObservation, name: str = "pc_obstacle", pose: Optional[List[float]] = None
+    ):
+        return PointCloud(name=name, pose=pose, points=camera_obs.get_pointcloud())
+
+    def get_bounding_spheres(
+        self,
+        n_spheres: int = 1,
+        surface_sphere_radius: float = 0.002,
+        fit_type: SphereFitType = SphereFitType.VOXEL_VOLUME_SAMPLE_SURFACE,
+        voxelize_method: str = "ray",
+        pre_transform_pose: Optional[Pose] = None,
+        tensor_args: TensorDeviceType = TensorDeviceType(),
+    ) -> List[Sphere]:
+        """Compute n spheres that fits in the volume of the object.
+
+        Args:
+            n: number of spheres
+        Returns:
+            spheres
+        """
+        # sample points in pointcloud:
+
+        # mesh = self.get_trimesh_mesh()
+        # pts, n_radius = fit_spheres_to_mesh(
+        #     mesh, n_spheres, surface_sphere_radius, fit_type, voxelize_method=voxelize_method
+        # )
+
+        obj_pose = Pose.from_list(self.pose, tensor_args)
+        # transform object:
+
+        # transform points:
+        if pre_transform_pose is not None:
+            obj_pose = pre_transform_pose.multiply(obj_pose)  # convert object pose to another frame
+
+        if pts is None or len(pts) == 0:
+            log_warn("spheres could not be fit!, adding one sphere at origin")
+            pts = np.zeros((1, 3))
+            pts[0, :] = mesh.centroid
+            n_radius = [0.02]
+            obj_pose = Pose.from_list([0, 0, 0, 1, 0, 0, 0], tensor_args)
+
+        points_cuda = tensor_args.to_device(pts)
+        pts = obj_pose.transform_points(points_cuda).cpu().view(-1, 3).numpy()
+
+        new_spheres = [
+            Sphere(
+                name="sph_" + str(i),
+                pose=[pts[i, 0], pts[i, 1], pts[i, 2], 1, 0, 0, 0],
+                radius=n_radius[i],
+            )
+            for i in range(pts.shape[0])
+        ]
+
+        return new_spheres
+
+
@dataclass
 class WorldConfig(Sequence):
    """Representation of World for use in CuRobo."""
@@ -664,23 +753,25 @@ class WorldConfig(Sequence):
        )

    @staticmethod
-    def get_scene_graph(current_world: WorldConfig):
+    def get_scene_graph(current_world: WorldConfig, process_color: bool = True):
        m_world = WorldConfig.create_mesh_world(current_world)
-        mesh_scene = trimesh.scene.scene.Scene(base_frame="world")
+        mesh_scene = trimesh.scene.scene.Scene(base_frame="world_origin")
        mesh_list = m_world
        for m in mesh_list:
            mesh_scene.add_geometry(
-                m.get_trimesh_mesh(),
+                m.get_trimesh_mesh(process_color=process_color),
                geom_name=m.name,
-                parent_node_name="world",
+                parent_node_name="world_origin",
                transform=m.get_transform_matrix(),
            )

        return mesh_scene

    @staticmethod
-    def create_merged_mesh_world(current_world: WorldConfig, process: bool = True):
-        mesh_scene = WorldConfig.get_scene_graph(current_world)
+    def create_merged_mesh_world(
+        current_world: WorldConfig, process: bool = True, process_color: bool = True
+    ):
+        mesh_scene = WorldConfig.get_scene_graph(current_world, process_color=process_color)
        mesh_scene = mesh_scene.dump(concatenate=True)
        new_mesh = Mesh(
            vertices=mesh_scene.vertices.view(np.ndarray),
@@ -702,8 +793,10 @@ class WorldConfig(Sequence):
    def get_collision_check_world(self, mesh_process: bool = False):
        return WorldConfig.create_collision_support_world(self, process=mesh_process)

-    def save_world_as_mesh(self, file_path: str, save_as_scene_graph=False):
-        mesh_scene = WorldConfig.get_scene_graph(self)
+    def save_world_as_mesh(
+        self, file_path: str, save_as_scene_graph=False, process_color: bool = True
+    ):
+        mesh_scene = WorldConfig.get_scene_graph(self, process_color=process_color)
        if save_as_scene_graph:
            mesh_scene = mesh_scene.dump(concatenate=True)

--- a/src/curobo/graph/graph_nx.py
+++ b/src/curobo/graph/graph_nx.py
@@ -10,10 +10,19 @@
 #


+# Standard Library
+import random
+
 # Third Party
 import networkx as nx
+import numpy as np
 import torch

+# This is needed to get deterministic results from networkx.
+# Note: it has to be set in global space.
+np.random.seed(2)
+random.seed(2)
+

 class NetworkxGraph(object):
    def __init__(self):
--- a/src/curobo/opt/init.py
+++ b/src/curobo/opt/init.py
@@ -8,3 +8,15 @@
 # without an express license agreement from NVIDIA CORPORATION or
 # its affiliates is strictly prohibited.
 #
+
+""" Optimization module containing several numerical solvers. 
+
+Base for an opimization solver is at :class:`opt_base.Optimizer`. cuRobo provides two base classes 
+for implementing two popular ways to optimize, (1) using particles 
+with :class:`particle.particle_opt_base.ParticleOptBase` and (2) using Newton/Quasi-Newton solvers
+with :class:`newton.newton_base.NewtonOptBase`. :class:`newton.newton_base.NewtonOptBase` contains 
+implementations of several line search schemes. Note that these line search schemes are approximate
+as cuRobo tries different line search magnitudes in parallel and chooses the largest that satisfies
+line search conditions.
+
+"""
--- a/src/curobo/opt/newton/init.py
+++ b/src/curobo/opt/newton/init.py
@@ -10,5 +10,5 @@
 #

 """
-This module contains code for cuda accelerated kinematics.
+This module contains Newton/Quasi-Newton solvers.
 """
--- a/src/curobo/opt/newton/lbfgs.py
+++ b/src/curobo/opt/newton/lbfgs.py
@@ -95,9 +95,9 @@ class LBFGSOpt(NewtonOptBase, LBFGSOptConfig):
        self.step_q_buffer[:] = 0.0
        return super().reset()

-    def update_nenvs(self, n_envs):
-        self.init_hessian(b=n_envs)
-        return super().update_nenvs(n_envs)
+    def update_nproblems(self, n_problems):
+        self.init_hessian(b=n_problems)
+        return super().update_nproblems(n_problems)

    def init_hessian(self, b=1):
        self.x_0 = torch.zeros(
--- a/src/curobo/opt/newton/newton_base.py
+++ b/src/curobo/opt/newton/newton_base.py
@@ -79,7 +79,7 @@ class NewtonOptBase(Optimizer, NewtonOptConfig):
        self.outer_iters = math.ceil(self.n_iters / self.inner_iters)

        # create line search
-        self.update_nenvs(self.n_envs)
+        self.update_nproblems(self.n_problems)

        self.reset()

@@ -135,7 +135,7 @@ class NewtonOptBase(Optimizer, NewtonOptConfig):
        if self.store_debug:
            self.debug.append(q.view(-1, self.action_horizon, self.d_action).clone())
        with profiler.record_function("newton_base/init_opt"):
-            q = q.view(self.n_envs, self.action_horizon * self.d_action)
+            q = q.view(self.n_problems, self.action_horizon * self.d_action)
            grad_q = q.detach() * 0.0
        # run opt graph
        if not self.cu_opt_init:
@@ -150,7 +150,7 @@ class NewtonOptBase(Optimizer, NewtonOptConfig):
                if check_convergence(self.best_iteration, self.current_iteration, self.last_best):
                    break

-        best_q = best_q.view(self.n_envs, self.action_horizon, self.d_action)
+        best_q = best_q.view(self.n_problems, self.action_horizon, self.d_action)
        return best_q

    def reset(self):
@@ -171,9 +171,6 @@ class NewtonOptBase(Optimizer, NewtonOptConfig):
        if self.store_debug:
            self.debug.append(self.best_q.view(-1, self.action_horizon, self.d_action).clone())
            self.debug_cost.append(self.best_cost.detach().view(-1, 1).clone())
-            # self.debug.append(q.view(-1, self.action_horizon, self.d_action).clone())
-            # self.debug_cost.append(cost_n.detach().view(-1, 1).clone())
-            # print(grad_q)

        return self.best_q.detach(), self.best_cost.detach(), q.detach(), grad_q.detach()

@@ -222,11 +219,11 @@ class NewtonOptBase(Optimizer, NewtonOptConfig):
    def _compute_cost_gradient(self, x):
        x_n = x.detach().requires_grad_(True)
        x_in = x_n.view(
-            self.n_envs * self.num_particles, self.action_horizon, self.rollout_fn.d_action
+            self.n_problems * self.num_particles, self.action_horizon, self.rollout_fn.d_action
        )
        trajectories = self.rollout_fn(x_in)  # x_n = (batch*line_search_scale) x horizon x d_action
        cost = torch.sum(
-            trajectories.costs.view(self.n_envs, self.num_particles, self.horizon),
+            trajectories.costs.view(self.n_problems, self.num_particles, self.horizon),
            dim=-1,
            keepdim=True,
        )
@@ -235,7 +232,7 @@ class NewtonOptBase(Optimizer, NewtonOptConfig):
        return (
            cost,
            g_x,
-        )  # cost: [n_envs, n_particles, 1], g_x: [n_envs, n_particles, horizon*d_action]
+        )  # cost: [n_problems, n_particles, 1], g_x: [n_problems, n_particles, horizon*d_action]

    def _wolfe_line_search(self, x, step_direction):
        # x_set = get_x_set_jit(step_direction, x, self.alpha_list, self.action_lows, self.action_highs)
@@ -455,36 +452,40 @@ class NewtonOptBase(Optimizer, NewtonOptConfig):
            mask_q = mask.unsqueeze(-1).expand(-1, self.d_opt)
            self.best_q.copy_(torch.where(mask_q, q, self.best_q))

-    def update_nenvs(self, n_envs):
+    def update_nproblems(self, n_problems):
        self.l_vec = torch.ones(
-            (n_envs, self.num_particles, 1),
+            (n_problems, self.num_particles, 1),
            device=self.tensor_args.device,
            dtype=self.tensor_args.dtype,
        )
        self.best_cost = (
-            torch.ones((n_envs, 1), device=self.tensor_args.device, dtype=self.tensor_args.dtype)
+            torch.ones(
+                (n_problems, 1), device=self.tensor_args.device, dtype=self.tensor_args.dtype
+            )
            * 5000000.0
        )
        self.best_q = torch.zeros(
-            (n_envs, self.d_opt), device=self.tensor_args.device, dtype=self.tensor_args.dtype
+            (n_problems, self.d_opt), device=self.tensor_args.device, dtype=self.tensor_args.dtype
        )
        self.best_grad_q = torch.zeros(
-            (n_envs, 1, self.d_opt), device=self.tensor_args.device, dtype=self.tensor_args.dtype
+            (n_problems, 1, self.d_opt),
+            device=self.tensor_args.device,
+            dtype=self.tensor_args.dtype,
        )

        # create list:
-        self.alpha_list = self.line_scale.repeat(n_envs, 1, 1)
+        self.alpha_list = self.line_scale.repeat(n_problems, 1, 1)
        self.zero_alpha_list = self.alpha_list[:, :, 0:1].contiguous()
        h = self.alpha_list.shape[1]
        self.c_idx = torch.arange(
-            0, n_envs * h, step=(h), device=self.tensor_args.device, dtype=torch.long
+            0, n_problems * h, step=(h), device=self.tensor_args.device, dtype=torch.long
        )
        self.best_iteration = torch.zeros(
-            (n_envs), device=self.tensor_args.device, dtype=torch.int16
+            (n_problems), device=self.tensor_args.device, dtype=torch.int16
        )
        self.current_iteration = torch.zeros((1), device=self.tensor_args.device, dtype=torch.int16)
        self.cu_opt_init = False
-        super().update_nenvs(n_envs)
+        super().update_nproblems(n_problems)

    def _initialize_opt_iters_graph(self, q, grad_q, shift_steps):
        if self.use_cuda_graph:
--- a/src/curobo/opt/opt_base.py
+++ b/src/curobo/opt/opt_base.py
@@ -8,6 +8,9 @@
 # without an express license agreement from NVIDIA CORPORATION or
 # its affiliates is strictly prohibited.
 #
+""" Base module for Optimization. """
+from __future__ import annotations
+
 # Standard Library
 import time
 from abc import abstractmethod
@@ -28,31 +31,75 @@ from curobo.util.torch_utils import is_cuda_graph_available

@dataclass
 class OptimizerConfig:
+    """Configuration for an :meth:`Optimizer`."""
+
+    #: Number of optimization variables per timestep.
    d_action: int
+
+    #: Lower bound for optimization variables.
    action_lows: List[float]
+
+    #: Higher bound for optimization variables
    action_highs: List[float]
-    horizon: int
-    n_iters: int
-    rollout_fn: RolloutBase
-    tensor_args: TensorDeviceType
-    use_cuda_graph: bool
-    store_debug: bool
-    debug_info: Any
-    cold_start_n_iters: int
-    num_particles: Union[int, None]
-    n_envs: int
-    sync_cuda_time: bool
-    use_coo_sparse: bool
+
+    #:
    action_horizon: int

+    #: Number of timesteps in optimization state, total variables = d_action * horizon
+    horizon: int
+
+    #: Number of iterations to run optimization
+    n_iters: int
+
+    #: Number of iterations to run optimization during the first instance. Setting to None will
+    #: use n_iters. This parameter is useful in MPC like settings where we need to run many
+    #: iterations during initialization (cold start) and then run only few iterations (warm start).
+    cold_start_n_iters: Union[int, None]
+
+    #: Rollout function to use for computing cost, given optimization variables.
+    rollout_fn: RolloutBase
+
+    #: Tensor device to use for optimization.
+    tensor_args: TensorDeviceType
+
+    #: Capture optimization iteration in a cuda graph and replay graph instead of eager execution.
+    #: Enabling this can make optimization 10x faster. But changing control flow, tensor
+    #: shapes, or problem type is not allowed.
+    use_cuda_graph: bool
+
+    #: Record debugging data such as optimization variables, and cost at every iteration. Enabling
+    #: this will disable cuda graph.
+    store_debug: bool
+
+    #: Use this to record additional attributes from rollouts.
+    debug_info: Any
+
+    #: Number of parallel problems to optimize.
+    n_problems: int
+
+    #: Number of particles to use per problem. Common optimization solvers use many particles to
+    #: optimize a single problem. E.g., MPPI rolls out many parallel samples and computes a weighted
+    #: mean. In cuRobo, Quasi-Newton solvers use particles to run many line search magnitudes.
+    #: Total optimization batch size = n_problems * num_particles.
+    num_particles: Union[int, None]
+
+    #: Synchronize device before computing solver time.
+    sync_cuda_time: bool
+
+    #: Matmul with a Sparse tensor is used to create particles for each problem index to save memory
+    #: and compute. Some versions of pytorch do not support coo sparse, specifically during
+    #: torch profile runs. Set this to False to use a standard tensor.
+    use_coo_sparse: bool
+
    def __post_init__(self):
        object.__setattr__(self, "action_highs", self.tensor_args.to_device(self.action_highs))
        object.__setattr__(self, "action_lows", self.tensor_args.to_device(self.action_lows))
-        # check cuda graph version:
        if self.use_cuda_graph:
            self.use_cuda_graph = is_cuda_graph_available()
        if self.num_particles is None:
            self.num_particles = 1
+        if self.cold_start_n_iters is None:
+            self.cold_start_n_iters = self.n_iters

    @staticmethod
    def create_data_dict(
@@ -61,6 +108,17 @@ class OptimizerConfig:
        tensor_args: TensorDeviceType = TensorDeviceType(),
        child_dict: Optional[Dict] = None,
    ):
+        """Helper function to create dictionary from optimizer parameters and rollout class.
+
+        Args:
+            data_dict: optimizer parameters dictionary.
+            rollout_fn: rollout function.
+            tensor_args: tensor cuda device.
+            child_dict: new dictionary where parameters will be stored.
+
+        Returns:
+            Dictionary with parameters to create a :meth:`OptimizerConfig`
+        """
        if child_dict is None:
            child_dict = deepcopy(data_dict)
        child_dict["d_action"] = rollout_fn.d_action
@@ -78,17 +136,33 @@ class OptimizerConfig:

 class Optimizer(OptimizerConfig):
    def __init__(self, config: Optional[OptimizerConfig] = None) -> None:
+        """Base optimization solver class
+
+        Args:
+            config: Initialized with parameters from a dataclass.
+        """
        if config is not None:
            super().__init__(**vars(config))
        self.opt_dt = 0.0
        self.COLD_START = True
-        self.update_nenvs(self.n_envs)
+        self.update_nproblems(self.n_problems)
        self._batch_goal = None
        self._rollout_list = None
        self.debug = []
        self.debug_cost = []

    def optimize(self, opt_tensor: torch.Tensor, shift_steps=0, n_iters=None) -> torch.Tensor:
+        """Find a solution through optimization given the initial values for variables.
+
+        Args:
+            opt_tensor: Initial value of optimization variables.
+                        Shape: [n_problems, action_horizon, d_action]
+            shift_steps: Shift variables along action_horizon. Useful in mpc warm-start setting.
+            n_iters: Override number of iterations to run optimization.
+
+        Returns:
+            Optimized values returned as a tensor of shape [n_problems, action_horizon, d_action].
+        """
        if self.COLD_START:
            n_iters = self.cold_start_n_iters
            self.COLD_START = False
@@ -99,17 +173,12 @@ class Optimizer(OptimizerConfig):
        self.opt_dt = time.time() - st_time
        return out

-    @abstractmethod
-    def _optimize(self, opt_tensor: torch.Tensor, shift_steps=0, n_iters=None) -> torch.Tensor:
-        pass
-
-    def _shift(self, shift_steps=0):
-        """
-        Shift the variables in the solver to hotstart the next timestep
-        """
-        return
-
    def update_params(self, goal: Goal):
+        """Update parameters in the :meth:`curobo.rollout.rollout_base.RolloutBase` instance.
+
+        Args:
+            goal: parameters to update rollout instance.
+        """
        with profiler.record_function("OptBase/batch_goal"):
            if self._batch_goal is not None:
                self._batch_goal.copy_(goal, update_idx_buffers=True)  # why True?
@@ -118,80 +187,37 @@ class Optimizer(OptimizerConfig):
        self.rollout_fn.update_params(self._batch_goal)

    def reset(self):
-        """
-        Reset the controller
-        """
+        """Reset optimizer."""
        self.rollout_fn.reset()
        self.debug = []
        self.debug_cost = []
-        # self.COLD_START = True

-    def update_nenvs(self, n_envs):
-        assert n_envs > 0
-        self._update_env_kernel(n_envs, self.num_particles)
-        self.n_envs = n_envs
+    def update_nproblems(self, n_problems: int):
+        """Update the number of problems that need to be optimized.

-    def _update_env_kernel(self, n_envs, num_particles):
-        log_info(
-            "Updating env kernel [n_envs: "
-            + str(n_envs)
-            + " , num_particles: "
-            + str(num_particles)
-            + " ]"
-        )
+        Args:
+            n_problems: number of problems.
+        """
+        assert n_problems > 0
+        self._update_problem_kernel(n_problems, self.num_particles)
+        self.n_problems = n_problems

-        self.env_col = torch.arange(
-            0, n_envs, step=1, dtype=torch.long, device=self.tensor_args.device
-        )
-        self.n_select_ = torch.tensor(
-            [x * n_envs + x for x in range(n_envs)],
-            device=self.tensor_args.device,
-            dtype=torch.long,
-        )
-
-        # create sparse tensor:
-        sparse_indices = []
-        for i in range(n_envs):
-            sparse_indices.extend([[i * num_particles + x, i] for x in range(num_particles)])
-
-        sparse_values = torch.ones(len(sparse_indices))
-        sparse_indices = torch.tensor(sparse_indices)
-        if self.use_coo_sparse:
-            self.env_kernel_ = torch.sparse_coo_tensor(
-                sparse_indices.t(),
-                sparse_values,
-                device=self.tensor_args.device,
-                dtype=self.tensor_args.dtype,
-            )
-        else:
-            self.env_kernel_ = torch.sparse_coo_tensor(
-                sparse_indices.t(),
-                sparse_values,
-                device="cpu",
-                dtype=self.tensor_args.dtype,
-            )
-            self.env_kernel_ = self.env_kernel_.to_dense().to(device=self.tensor_args.device)
-        self._env_seeds = self.num_particles
-
-    def get_nenv_tensor(self, x):
-        """This function takes an input tensor of shape (n_env,....) and converts it into
-        (n_particles,...)
+    def get_nproblem_tensor(self, x):
+        """This function takes an input tensor of shape (n_problem,....) and converts it into
+        (n_particles,...).
        """

-        # if x.shape[0] != self.n_envs:
-        #    x_env = x.unsqueeze(0).repeat(self.n_envs, 1)
-        # else:
-        #    x_env = x
-
        # create a tensor
-        nx_env = self.env_kernel_ @ x
+        nx_problem = self.problem_kernel_ @ x

-        return nx_env
+        return nx_problem

    def reset_seed(self):
+        """Reset seeds."""
        return True

    def reset_cuda_graph(self):
+        """Reset CUDA Graphs. This does not work, workaround is to create a new instance."""
        if self.use_cuda_graph:
            self.cu_opt_init = False
        else:
@@ -199,7 +225,87 @@ class Optimizer(OptimizerConfig):
        self._batch_goal = None
        self.rollout_fn.reset_cuda_graph()

+    def reset_shape(self):
+        """Reset any flags in rollout class. Useful to reinitialize tensors for a new shape."""
+        self.rollout_fn.reset_shape()
+        self._batch_goal = None
+
    def get_all_rollout_instances(self) -> List[RolloutBase]:
+        """Get all instances of Rollout class in the optimizer."""
        if self._rollout_list is None:
            self._rollout_list = [self.rollout_fn]
        return self._rollout_list
+
+    @abstractmethod
+    def _optimize(self, opt_tensor: torch.Tensor, shift_steps=0, n_iters=None) -> torch.Tensor:
+        """Implement this function in a derived class containing the solver.
+
+        Args:
+            opt_tensor: Initial value of optimization variables.
+                        Shape: [n_problems, action_horizon, d_action]
+            shift_steps: Shift variables along action_horizon. Useful in mpc warm-start setting.
+            n_iters: Override number of iterations to run optimization.
+
+        Returns:
+            Optimized variables in tensor shape [action_horizon, d_action].
+        """
+        return opt_tensor
+
+    @abstractmethod
+    def _shift(self, shift_steps=0):
+        """Shift the variables in the solver to hotstart the next timestep.
+
+        Args:
+            shift_steps: Number of timesteps to shift.
+        """
+        return
+
+    def _update_problem_kernel(self, n_problems: int, num_particles: int):
+        """Update matrix used to map problem index to number of particles.
+
+        Args:
+            n_problems: Number of optimization problems.
+            num_particles: Number of particles per problem.
+        """
+        log_info(
+            "Updating problem kernel [n_problems: "
+            + str(n_problems)
+            + " , num_particles: "
+            + str(num_particles)
+            + " ]"
+        )
+
+        self.problem_col = torch.arange(
+            0, n_problems, step=1, dtype=torch.long, device=self.tensor_args.device
+        )
+        self.n_select_ = torch.tensor(
+            [x * n_problems + x for x in range(n_problems)],
+            device=self.tensor_args.device,
+            dtype=torch.long,
+        )
+
+        # create sparse tensor:
+        sparse_indices = []
+        for i in range(n_problems):
+            sparse_indices.extend([[i * num_particles + x, i] for x in range(num_particles)])
+
+        sparse_values = torch.ones(len(sparse_indices))
+        sparse_indices = torch.tensor(sparse_indices)
+        if self.use_coo_sparse:
+            self.problem_kernel_ = torch.sparse_coo_tensor(
+                sparse_indices.t(),
+                sparse_values,
+                device=self.tensor_args.device,
+                dtype=self.tensor_args.dtype,
+            )
+        else:
+            self.problem_kernel_ = torch.sparse_coo_tensor(
+                sparse_indices.t(),
+                sparse_values,
+                device="cpu",
+                dtype=self.tensor_args.dtype,
+            )
+            self.problem_kernel_ = self.problem_kernel_.to_dense().to(
+                device=self.tensor_args.device
+            )
+        self._problem_seeds = self.num_particles
--- a/src/curobo/opt/particle/parallel_mppi.py
+++ b/src/curobo/opt/particle/parallel_mppi.py
@@ -65,7 +65,7 @@ class ParallelMPPIConfig(ParticleOptConfig):
    alpha: float
    gamma: float
    kappa: float
-    sample_per_env: bool
+    sample_per_problem: bool

    def __post_init__(self):
        self.init_cov = self.tensor_args.to_device(self.init_cov).unsqueeze(0)
@@ -264,7 +264,7 @@ class ParallelMPPI(ParticleOptBase, ParallelMPPIConfig):
        actions = trajectories.actions

        total_costs = self._compute_total_cost(costs)
-        # Let's reshape to n_envs now:
+        # Let's reshape to n_problems now:

        # first find the means before doing exponential utility:
        w = self._exp_util(total_costs)
@@ -272,7 +272,7 @@ class ParallelMPPI(ParticleOptBase, ParallelMPPIConfig):
        # Update best action
        if self.sample_mode == SampleMode.BEST:
            best_idx = torch.argmax(w, dim=-1)
-            self.best_traj.copy_(actions[self.env_col, best_idx])
+            self.best_traj.copy_(actions[self.problem_col, best_idx])

        if self.store_rollouts and self.visual_traj is not None:
            vis_seq = getattr(trajectories.state, self.visual_traj)
@@ -281,7 +281,9 @@ class ParallelMPPI(ParticleOptBase, ParallelMPPIConfig):
            self.top_idx = top_idx
            top_trajs = torch.index_select(vis_seq, 0, top_idx[0])
            for i in range(1, top_idx.shape[0]):
-                trajs = torch.index_select(vis_seq, 0, top_idx[i] + (self.particles_per_env * i))
+                trajs = torch.index_select(
+                    vis_seq, 0, top_idx[i] + (self.particles_per_problem * i)
+                )
                top_trajs = torch.cat((top_trajs, trajs), dim=0)
            if self.top_trajs is None or top_trajs.shape != self.top_trajs:
                self.top_trajs = top_trajs
@@ -317,13 +319,15 @@ class ParallelMPPI(ParticleOptBase, ParallelMPPIConfig):
        act_seq = self.mean_action.unsqueeze(-3) + scaled_delta
        cat_list = [act_seq]

-        if self.neg_per_env > 0:
+        if self.neg_per_problem > 0:
            neg_action = -1.0 * self.mean_action
-            neg_act_seqs = neg_action.unsqueeze(-3).expand(-1, self.neg_per_env, -1, -1)
+            neg_act_seqs = neg_action.unsqueeze(-3).expand(-1, self.neg_per_problem, -1, -1)
            cat_list.append(neg_act_seqs)
-        if self.null_per_env > 0:
+        if self.null_per_problem > 0:
            cat_list.append(
-                self.null_act_seqs[: self.null_per_env].unsqueeze(0).expand(self.n_envs, -1, -1, -1)
+                self.null_act_seqs[: self.null_per_problem]
+                .unsqueeze(0)
+                .expand(self.n_problems, -1, -1, -1)
            )

        act_seq = torch.cat(
@@ -343,8 +347,8 @@ class ParallelMPPI(ParticleOptBase, ParallelMPPIConfig):
    def update_init_mean(self, init_mean):
        # update mean:
        # init_mean = init_mean.clone()
-        if init_mean.shape[0] != self.n_envs:
-            init_mean = init_mean.expand(self.n_envs, -1, -1)
+        if init_mean.shape[0] != self.n_problems:
+            init_mean = init_mean.expand(self.n_problems, -1, -1)
        if not copy_tensor(init_mean, self.mean_action):
            self.mean_action = init_mean.clone()
        if not copy_tensor(init_mean, self.best_traj):
@@ -353,27 +357,27 @@ class ParallelMPPI(ParticleOptBase, ParallelMPPIConfig):
    def reset_mean(self):
        with profiler.record_function("mppi/reset_mean"):
            if self.random_mean:
-                mean = self.mean_lib.get_samples([self.n_envs])
+                mean = self.mean_lib.get_samples([self.n_problems])
                self.update_init_mean(mean)
            else:
                self.update_init_mean(self.init_mean)

    def reset_covariance(self):
        with profiler.record_function("mppi/reset_cov"):
-            # init_cov can either be a single value, or n_envs x 1 or n_envs x d_action
+            # init_cov can either be a single value, or n_problems x 1 or n_problems x d_action

            if self.cov_type == CovType.SIGMA_I:
-                # init_cov can either be a single value, or n_envs x 1
+                # init_cov can either be a single value, or n_problems x 1
                self.cov_action = self.init_cov
-                if self.init_cov.shape[0] != self.n_envs:
-                    self.cov_action = self.init_cov.unsqueeze(0).expand(self.n_envs, -1)
+                if self.init_cov.shape[0] != self.n_problems:
+                    self.cov_action = self.init_cov.unsqueeze(0).expand(self.n_problems, -1)
                self.inv_cov_action = 1.0 / self.cov_action
                a = torch.sqrt(self.cov_action)
                if not copy_tensor(a, self.scale_tril):
                    self.scale_tril = a

            elif self.cov_type == CovType.DIAG_A:
-                # init_cov can either be a single value, or n_envs x 1 or n_envs x 7
+                # init_cov can either be a single value, or n_problems x 1 or n_problems x 7
                init_cov = self.init_cov.clone()

                # if(init_cov.shape[-1] != self.d_action):
@@ -382,8 +386,8 @@ class ParallelMPPI(ParticleOptBase, ParallelMPPIConfig):
                if len(init_cov.shape) == 2 and init_cov.shape[-1] != self.d_action:
                    init_cov = init_cov.expand(-1, self.d_action)
                init_cov = init_cov.unsqueeze(1)
-                if init_cov.shape[0] != self.n_envs:
-                    init_cov = init_cov.expand(self.n_envs, -1, -1)
+                if init_cov.shape[0] != self.n_problems:
+                    init_cov = init_cov.expand(self.n_problems, -1, -1)
                if not copy_tensor(init_cov.clone(), self.cov_action):
                    self.cov_action = init_cov.clone()
                self.inv_cov_action = 1.0 / self.cov_action
@@ -523,16 +527,18 @@ class ParallelMPPI(ParticleOptBase, ParallelMPPIConfig):
                n_iters = 1
            else:
                n_iters = self.n_iters
-            if self.sample_per_env:
+            if self.sample_per_problem:
                s_set = (
                    self.sample_lib.get_samples(
-                        sample_shape=[self.sampled_particles_per_env * self.n_envs * n_iters],
+                        sample_shape=[
+                            self.sampled_particles_per_problem * self.n_problems * n_iters
+                        ],
                        base_seed=self.seed,
                    )
                    .view(
                        n_iters,
-                        self.n_envs,
-                        self.sampled_particles_per_env,
+                        self.n_problems,
+                        self.sampled_particles_per_problem,
                        self.action_horizon,
                        self.d_action,
                    )
@@ -540,13 +546,17 @@ class ParallelMPPI(ParticleOptBase, ParallelMPPIConfig):
                )
            else:
                s_set = self.sample_lib.get_samples(
-                    sample_shape=[n_iters * (self.sampled_particles_per_env)],
+                    sample_shape=[n_iters * (self.sampled_particles_per_problem)],
                    base_seed=self.seed,
                )
                s_set = s_set.view(
-                    n_iters, 1, self.sampled_particles_per_env, self.action_horizon, self.d_action
+                    n_iters,
+                    1,
+                    self.sampled_particles_per_problem,
+                    self.action_horizon,
+                    self.d_action,
                )
-                s_set = s_set.repeat(1, self.n_envs, 1, 1, 1).clone()
+                s_set = s_set.repeat(1, self.n_problems, 1, 1, 1).clone()
            s_set[:, :, -1, :, :] = 0.0
            if not copy_tensor(s_set, self._sample_set):
                log_info("ParallelMPPI: Updating sample set")
@@ -575,7 +585,7 @@ class ParallelMPPI(ParticleOptBase, ParallelMPPIConfig):
        Parameters
        ----------
        state : dict or np.ndarray
-            Initial state to set the simulation env to
+            Initial state to set the simulation problem to
        """

        return super().generate_rollouts(init_act)
--- a/src/curobo/opt/particle/particle_opt_base.py
+++ b/src/curobo/opt/particle/particle_opt_base.py
@@ -1,4 +1,3 @@
-#!/usr/bin/env python
 #
 # Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
@@ -95,7 +94,7 @@ class ParticleOptBase(Optimizer, ParticleOptConfig):
        self.trajectories = None
        self.cu_opt_init = False
        self.info = ParticleOptInfo()
-        self.update_num_particles_per_env(self.num_particles)
+        self.update_num_particles_per_problem(self.num_particles)

    @abstractmethod
    def _get_action_seq(self, mode=SampleMode):
@@ -172,7 +171,7 @@ class ParticleOptBase(Optimizer, ParticleOptConfig):
        Parameters
        ----------
        state : dict or np.ndarray
-            Initial state to set the simulation env to
+            Initial state to set the simulation problem to
        """

        act_seq = self.sample_actions(init_act)
@@ -259,10 +258,10 @@ class ParticleOptBase(Optimizer, ParticleOptConfig):
            # generate random simulated trajectories
            trajectory = self.generate_rollouts()
            trajectory.actions = trajectory.actions.view(
-                self.n_envs, self.particles_per_env, self.action_horizon, self.d_action
+                self.n_problems, self.particles_per_problem, self.action_horizon, self.d_action
            )
            trajectory.costs = trajectory.costs.view(
-                self.n_envs, self.particles_per_env, self.horizon
+                self.n_problems, self.particles_per_problem, self.horizon
            )
            with profiler.record_function("mpc/mppi/update_distribution"):
                self._update_distribution(trajectory)
@@ -275,26 +274,26 @@ class ParticleOptBase(Optimizer, ParticleOptConfig):
        curr_action_seq = self._get_action_seq(mode=self.sample_mode)
        return curr_action_seq

-    def update_nenvs(self, n_envs):
-        assert n_envs > 0
-        self.total_num_particles = n_envs * self.num_particles
+    def update_nproblems(self, n_problems):
+        assert n_problems > 0
+        self.total_num_particles = n_problems * self.num_particles
        self.cu_opt_init = False
-        super().update_nenvs(n_envs)
+        super().update_nproblems(n_problems)

-    def update_num_particles_per_env(self, num_particles_per_env):
-        self.null_per_env = round(int(self.null_act_frac * num_particles_per_env * 0.5))
+    def update_num_particles_per_problem(self, num_particles_per_problem):
+        self.null_per_problem = round(int(self.null_act_frac * num_particles_per_problem * 0.5))

-        self.neg_per_env = (
-            round(int(self.null_act_frac * num_particles_per_env)) - self.null_per_env
+        self.neg_per_problem = (
+            round(int(self.null_act_frac * num_particles_per_problem)) - self.null_per_problem
        )

-        self.sampled_particles_per_env = (
-            num_particles_per_env - self.null_per_env - self.neg_per_env
+        self.sampled_particles_per_problem = (
+            num_particles_per_problem - self.null_per_problem - self.neg_per_problem
        )
-        self.particles_per_env = num_particles_per_env
-        if self.null_per_env > 0:
+        self.particles_per_problem = num_particles_per_problem
+        if self.null_per_problem > 0:
            self.null_act_seqs = torch.zeros(
-                self.null_per_env,
+                self.null_per_problem,
                self.action_horizon,
                self.d_action,
                device=self.tensor_args.device,
--- a/src/curobo/opt/particle/particle_opt_utils.py
+++ b/src/curobo/opt/particle/particle_opt_utils.py
@@ -121,7 +121,7 @@ def get_stomp_cov_jit(
                    else:
                        A[k * horizon + i, k * horizon + index] = fd_array[j + 3]

-    R = torch.matmul(A.transpose(-2, -1), A)
+    R = torch.matmul(A.transpose(-2, -1).clone(), A.clone())
    M = torch.inverse(R)
    scaled_M = (1 / horizon) * M / (torch.max(torch.abs(M), dim=1)[0].unsqueeze(0))
    cov = M / torch.max(torch.abs(M))
@@ -132,7 +132,6 @@ def get_stomp_cov_jit(
        scale_tril = torch.linalg.cholesky(cov)
    else:
        scale_tril = cov
-
    """
    k = 0
    act_cov_matrix = cov[k * horizon:k * horizon + horizon, k * horizon:k * horizon + horizon]
--- a/src/curobo/rollout/arm_base.py
+++ b/src/curobo/rollout/arm_base.py
@@ -39,7 +39,7 @@ from curobo.rollout.rollout_base import Goal, RolloutBase, RolloutConfig, Rollou
 from curobo.types.base import TensorDeviceType
 from curobo.types.robot import CSpaceConfig, RobotConfig
 from curobo.types.state import JointState
-from curobo.util.logger import log_info, log_warn
+from curobo.util.logger import log_error, log_info, log_warn
 from curobo.util.tensor_util import cat_sum


@@ -366,6 +366,7 @@ class ArmBase(RolloutBase, ArmBaseConfig):
                )
                cost_list.append(coll_cost)
        if return_list:
+
            return cost_list
        cost = cat_sum(cost_list)
        return cost
@@ -424,6 +425,7 @@ class ArmBase(RolloutBase, ArmBaseConfig):
        out_metrics = self.constraint_fn(state)
        out_metrics.state = state
        out_metrics = self.convergence_fn(state, out_metrics)
+        out_metrics.cost = self.cost_fn(state)
        return out_metrics

    def get_metrics_cuda_graph(self, state: JointState):
@@ -451,6 +453,8 @@ class ArmBase(RolloutBase, ArmBaseConfig):
            with torch.cuda.graph(self.cu_metrics_graph, stream=s):
                self._cu_out_metrics = self.get_metrics(self._cu_metrics_state_in)
            self._metrics_cuda_graph_init = True
+        if self._cu_metrics_state_in.position.shape != state.position.shape:
+            log_error("cuda graph changed")
        self._cu_metrics_state_in.copy_(state)
        self.cu_metrics_graph.replay()
        out_metrics = self._cu_out_metrics
@@ -462,17 +466,6 @@ class ArmBase(RolloutBase, ArmBaseConfig):
    ):
        if out_metrics is None:
            out_metrics = RolloutMetrics()
-        if (
-            self.convergence_cfg.null_space_cfg is not None
-            and self.null_convergence.enabled
-            and self._goal_buffer.batch_retract_state_idx is not None
-        ):
-            out_metrics.cost = self.null_convergence.forward_target_idx(
-                self._goal_buffer.retract_state,
-                state.state_seq.position,
-                self._goal_buffer.batch_retract_state_idx,
-            )
-
        return out_metrics

    def _get_augmented_state(self, state: JointState) -> KinematicModelState:
@@ -688,9 +681,11 @@ class ArmBase(RolloutBase, ArmBaseConfig):
        act_seq = self.dynamics_model.init_action_mean.unsqueeze(0).repeat(self.batch_size, 1, 1)
        return act_seq

-    def reset_cuda_graph(self):
+    def reset_shape(self):
        self._goal_idx_update = True
+        super().reset_shape()

+    def reset_cuda_graph(self):
        super().reset_cuda_graph()

    def get_action_from_state(self, state: JointState):
--- a/src/curobo/rollout/arm_reacher.py
+++ b/src/curobo/rollout/arm_reacher.py
@@ -20,16 +20,16 @@ import torch.autograd.profiler as profiler
 from curobo.geom.sdf.world import WorldCollision
 from curobo.rollout.cost.cost_base import CostConfig
 from curobo.rollout.cost.dist_cost import DistCost, DistCostConfig
-from curobo.rollout.cost.pose_cost import PoseCost, PoseCostConfig
+from curobo.rollout.cost.pose_cost import PoseCost, PoseCostConfig, PoseCostMetric
 from curobo.rollout.cost.straight_line_cost import StraightLineCost
 from curobo.rollout.cost.zero_cost import ZeroCost
 from curobo.rollout.dynamics_model.kinematic_model import KinematicModelState
 from curobo.rollout.rollout_base import Goal, RolloutMetrics
 from curobo.types.base import TensorDeviceType
 from curobo.types.robot import RobotConfig
-from curobo.types.tensor import T_BValue_float
+from curobo.types.tensor import T_BValue_float, T_BValue_int
 from curobo.util.helpers import list_idx_if_not_none
-from curobo.util.logger import log_info
+from curobo.util.logger import log_error, log_info
 from curobo.util.tensor_util import cat_max, cat_sum

 # Local Folder
@@ -42,6 +42,8 @@ class ArmReacherMetrics(RolloutMetrics):
    position_error: Optional[T_BValue_float] = None
    rotation_error: Optional[T_BValue_float] = None
    pose_error: Optional[T_BValue_float] = None
+    goalset_index: Optional[T_BValue_int] = None
+    null_space_error: Optional[T_BValue_float] = None

    def __getitem__(self, idx):
        d_list = [
@@ -53,6 +55,8 @@ class ArmReacherMetrics(RolloutMetrics):
            self.position_error,
            self.rotation_error,
            self.pose_error,
+            self.goalset_index,
+            self.null_space_error,
        ]
        idx_vals = list_idx_if_not_none(d_list, idx)
        return ArmReacherMetrics(*idx_vals)
@@ -65,10 +69,14 @@ class ArmReacherMetrics(RolloutMetrics):
            constraint=None if self.constraint is None else self.constraint.clone(),
            feasible=None if self.feasible is None else self.feasible.clone(),
            state=None if self.state is None else self.state,
-            cspace_error=None if self.cspace_error is None else self.cspace_error,
-            position_error=None if self.position_error is None else self.position_error,
-            rotation_error=None if self.rotation_error is None else self.rotation_error,
-            pose_error=None if self.pose_error is None else self.pose_error,
+            cspace_error=None if self.cspace_error is None else self.cspace_error.clone(),
+            position_error=None if self.position_error is None else self.position_error.clone(),
+            rotation_error=None if self.rotation_error is None else self.rotation_error.clone(),
+            pose_error=None if self.pose_error is None else self.pose_error.clone(),
+            goalset_index=None if self.goalset_index is None else self.goalset_index.clone(),
+            null_space_error=(
+                None if self.null_space_error is None else self.null_space_error.clone()
+            ),
        )


@@ -254,6 +262,7 @@ class ArmReacher(ArmBase, ArmReacherConfig):
                    goal_cost = self.goal_cost.forward(
                        ee_pos_batch, ee_quat_batch, self._goal_buffer
                    )
+
                cost_list.append(goal_cost)
        with profiler.record_function("cost/link_poses"):
            if self._goal_buffer.links_goal_pose is not None and self.cost_cfg.pose_cfg is not None:
@@ -296,36 +305,21 @@ class ArmReacher(ArmBase, ArmReacherConfig):
                g_dist,
            )

-            # cost += z_acc
            cost_list.append(z_acc)
-        # print(self.cost_cfg.zero_jerk_cfg)
-        if (
-            self.cost_cfg.zero_jerk_cfg is not None
-            and self.zero_jerk_cost.enabled
-            # and g_dist is not None
-        ):
-            # jerk = self.dynamics_model._aux_matrix @ state_batch.acceleration
+        if self.cost_cfg.zero_jerk_cfg is not None and self.zero_jerk_cost.enabled:
            z_jerk = self.zero_jerk_cost.forward(
                state_batch.jerk,
                g_dist,
            )
            cost_list.append(z_jerk)
-            # cost +=  z_jerk

-        if (
-            self.cost_cfg.zero_vel_cfg is not None
-            and self.zero_vel_cost.enabled
-            # and g_dist is not None
-        ):
+        if self.cost_cfg.zero_vel_cfg is not None and self.zero_vel_cost.enabled:
            z_vel = self.zero_vel_cost.forward(
                state_batch.velocity,
                g_dist,
            )
-            # cost += z_vel
-            # print(z_vel.shape)
            cost_list.append(z_vel)
        cost = cat_sum(cost_list)
-        # print(cost[:].T)
        return cost

    def convergence_fn(
@@ -350,6 +344,7 @@ class ArmReacher(ArmBase, ArmReacherConfig):
            ) = self.pose_convergence.forward_out_distance(
                state.ee_pos_seq, state.ee_quat_seq, self._goal_buffer
            )
+            out_metrics.goalset_index = self.pose_convergence.goalset_index_buffer  # .clone()
        if (
            self._goal_buffer.links_goal_pose is not None
            and self.convergence_cfg.pose_cfg is not None
@@ -389,6 +384,17 @@ class ArmReacher(ArmBase, ArmReacherConfig):
                True,
            )

+        if (
+            self.convergence_cfg.null_space_cfg is not None
+            and self.null_convergence.enabled
+            and self._goal_buffer.batch_retract_state_idx is not None
+        ):
+            out_metrics.null_space_error = self.null_convergence.forward_target_idx(
+                self._goal_buffer.retract_state,
+                state.state_seq.position,
+                self._goal_buffer.batch_retract_state_idx,
+            )
+
        return out_metrics

    def update_params(
@@ -420,3 +426,43 @@ class ArmReacher(ArmBase, ArmReacherConfig):
        else:
            self.dist_cost.disable_cost()
            self.cspace_convergence.disable_cost()
+
+    def get_pose_costs(self, include_link_pose: bool = False, include_convergence: bool = True):
+        pose_costs = [self.goal_cost]
+        if include_convergence:
+            pose_costs += [self.pose_convergence]
+        if include_link_pose:
+            log_error("Not implemented yet")
+        return pose_costs
+
+    def update_pose_cost_metric(
+        self,
+        metric: PoseCostMetric,
+    ):
+        pose_costs = self.get_pose_costs()
+        if metric.hold_partial_pose:
+            if metric.hold_vec_weight is None:
+                log_error("hold_vec_weight is required")
+            [x.hold_partial_pose(metric.hold_vec_weight) for x in pose_costs]
+        if metric.release_partial_pose:
+            [x.release_partial_pose() for x in pose_costs]
+        if metric.reach_partial_pose:
+            if metric.reach_vec_weight is None:
+                log_error("reach_vec_weight is required")
+            [x.reach_partial_pose(metric.reach_vec_weight) for x in pose_costs]
+        if metric.reach_full_pose:
+            [x.reach_full_pose() for x in pose_costs]
+
+        pose_costs = self.get_pose_costs(include_convergence=False)
+        if metric.remove_offset_waypoint:
+            [x.remove_offset_waypoint() for x in pose_costs]
+
+        if metric.offset_position is not None or metric.offset_rotation is not None:
+            [
+                x.update_offset_waypoint(
+                    offset_position=metric.offset_position,
+                    offset_rotation=metric.offset_rotation,
+                    offset_tstep_fraction=metric.offset_tstep_fraction,
+                )
+                for x in pose_costs
+            ]
--- a/src/curobo/rollout/cost/bound_cost.py
+++ b/src/curobo/rollout/cost/bound_cost.py
@@ -257,13 +257,13 @@ class BoundCost(CostBase, BoundCostConfig):
        return cost

    def update_dt(self, dt: Union[float, torch.Tensor]):
-        # return super().update_dt(dt)
        if self.cost_type == BoundCostType.BOUNDS_SMOOTH:
            v_scale = dt / self._dt
            a_scale = v_scale**2
            j_scale = v_scale**3
            self.smooth_weight[1] *= a_scale
            self.smooth_weight[2] *= j_scale
+
        return super().update_dt(dt)


--- a/src/curobo/rollout/cost/pose_cost.py
+++ b/src/curobo/rollout/cost/pose_cost.py
@@ -8,19 +8,23 @@
 # without an express license agreement from NVIDIA CORPORATION or
 # its affiliates is strictly prohibited.
 #
+from __future__ import annotations
+
 # Standard Library
+import math
 from dataclasses import dataclass
 from enum import Enum
 from typing import List, Optional

 # Third Party
 import torch
-from torch.autograd import Function

 # CuRobo
-from curobo.curobolib.geom import get_pose_distance, get_pose_distance_backward
+from curobo.curobolib.geom import PoseError, PoseErrorDistance
 from curobo.rollout.rollout_base import Goal
+from curobo.types.base import TensorDeviceType
 from curobo.types.math import OrientationError, Pose
+from curobo.util.logger import log_error

 # Local Folder
 from .cost_base import CostBase, CostConfig
@@ -37,7 +41,11 @@ class PoseErrorType(Enum):
 class PoseCostConfig(CostConfig):
    cost_type: PoseErrorType = PoseErrorType.BATCH_GOAL
    use_metric: bool = False
+    project_distance: bool = True
    run_vec_weight: Optional[List[float]] = None
+    use_projected_distance: bool = True
+    offset_waypoint: List[float] = None
+    offset_tstep_fraction: float = -1.0

    def __post_init__(self):
        if self.run_vec_weight is not None:
@@ -54,392 +62,85 @@ class PoseCostConfig(CostConfig):
            self.vec_convergence = torch.zeros(
                2, device=self.tensor_args.device, dtype=self.tensor_args.dtype
            )
+        if self.offset_waypoint is None:
+            self.offset_waypoint = [0, 0, 0, 0, 0, 0]
+        if self.run_weight is None:
+            self.run_weight = 1
+        self.offset_waypoint = self.tensor_args.to_device(self.offset_waypoint)
+        if isinstance(self.offset_tstep_fraction, float):
+            self.offset_tstep_fraction = self.tensor_args.to_device([self.offset_tstep_fraction])
        return super().__post_init__()


-@torch.jit.script
-def backward_PoseError_jit(grad_g_dist, grad_out_distance, weight, g_vec):
-    grad_vec = grad_g_dist + (grad_out_distance * weight)
-    grad = 1.0 * (grad_vec).unsqueeze(-1) * g_vec
-    return grad
+@dataclass
+class PoseCostMetric:
+    hold_partial_pose: bool = False
+    release_partial_pose: bool = False
+    hold_vec_weight: Optional[torch.Tensor] = None
+    reach_partial_pose: bool = False
+    reach_full_pose: bool = False
+    reach_vec_weight: Optional[torch.Tensor] = None
+    offset_position: Optional[torch.Tensor] = None
+    offset_rotation: Optional[torch.Tensor] = None
+    offset_tstep_fraction: float = -1.0
+    remove_offset_waypoint: bool = False

+    def clone(self):

-# full method:
-@torch.jit.script
-def backward_full_PoseError_jit(
-    grad_out_distance, grad_g_dist, grad_r_err, p_w, q_w, g_vec_p, g_vec_q
-):
-    p_grad = (grad_g_dist + (grad_out_distance * p_w)).unsqueeze(-1) * g_vec_p
-    q_grad = (grad_r_err + (grad_out_distance * q_w)).unsqueeze(-1) * g_vec_q
-    # p_grad = ((grad_out_distance * p_w)).unsqueeze(-1) * g_vec_p
-    # q_grad = ((grad_out_distance * q_w)).unsqueeze(-1) * g_vec_q
-
-    return p_grad, q_grad
-
-
-class PoseErrorDistance(Function):
-    @staticmethod
-    def forward(
-        ctx,
-        current_position,
-        goal_position,
-        current_quat,
-        goal_quat,
-        vec_weight,
-        weight,
-        vec_convergence,
-        run_weight,
-        run_vec_weight,
-        batch_pose_idx,
-        out_distance,
-        out_position_distance,
-        out_rotation_distance,
-        out_p_vec,
-        out_r_vec,
-        out_idx,
-        out_p_grad,
-        out_q_grad,
-        batch_size,
-        horizon,
-        mode=PoseErrorType.BATCH_GOAL.value,
-        num_goals=1,
-        use_metric=False,
-    ):
-        # out_distance = current_position[..., 0].detach().clone() * 0.0
-        # out_position_distance = out_distance.detach().clone()
-        # out_rotation_distance = out_distance.detach().clone()
-        # out_vec = (
-        #    torch.cat((current_position.detach().clone(), current_quat.detach().clone()), dim=-1)
-        #    * 0.0
-        # )
-        # out_idx = out_distance.clone().to(dtype=torch.long)
-
-        (
-            out_distance,
-            out_position_distance,
-            out_rotation_distance,
-            out_p_vec,
-            out_r_vec,
-            out_idx,
-        ) = get_pose_distance(
-            out_distance,
-            out_position_distance,
-            out_rotation_distance,
-            out_p_vec,
-            out_r_vec,
-            out_idx,
-            current_position.contiguous(),
-            goal_position,
-            current_quat.contiguous(),
-            goal_quat,
-            vec_weight,
-            weight,
-            vec_convergence,
-            run_weight,
-            run_vec_weight,
-            batch_pose_idx,
-            batch_size,
-            horizon,
-            mode,
-            num_goals,
-            current_position.requires_grad,
-            True,
-            use_metric,
-        )
-        ctx.save_for_backward(out_p_vec, out_r_vec, weight, out_p_grad, out_q_grad)
-        return out_distance, out_position_distance, out_rotation_distance, out_idx  # .view(-1,1)
-
-    @staticmethod
-    def backward(ctx, grad_out_distance, grad_g_dist, grad_r_err, grad_out_idx):
-        (g_vec_p, g_vec_q, weight, out_grad_p, out_grad_q) = ctx.saved_tensors
-        pos_grad = None
-        quat_grad = None
-        batch_size = g_vec_p.shape[0] * g_vec_p.shape[1]
-        if ctx.needs_input_grad[0] and ctx.needs_input_grad[2]:
-            pos_grad, quat_grad = get_pose_distance_backward(
-                out_grad_p,
-                out_grad_q,
-                grad_out_distance.contiguous(),
-                grad_g_dist.contiguous(),
-                grad_r_err.contiguous(),
-                weight,
-                g_vec_p,
-                g_vec_q,
-                batch_size,
-                use_distance=True,
-            )
-            # pos_grad, quat_grad = backward_full_PoseError_jit(
-            #    grad_out_distance,
-            #    grad_g_dist, grad_r_err, p_w, q_w, g_vec_p, g_vec_q
-            # )
-        elif ctx.needs_input_grad[0]:
-            pos_grad = backward_PoseError_jit(grad_g_dist, grad_out_distance, p_w, g_vec_p)
-            # grad_vec = grad_g_dist + (grad_out_distance * weight[1])
-            # pos_grad = 1.0 * (grad_vec).unsqueeze(-1) * g_vec[..., 4:]
-        elif ctx.needs_input_grad[2]:
-            quat_grad = backward_PoseError_jit(grad_r_err, grad_out_distance, q_w, g_vec_q)
-            # grad_vec = grad_r_err + (grad_out_distance * weight[0])
-            # quat_grad = 1.0 * (grad_vec).unsqueeze(-1) * g_vec[..., :4]
-        return (
-            pos_grad,
-            None,
-            quat_grad,
-            None,
-            None,
-            None,
-            None,
-            None,
-            None,
-            None,
-            None,
-            None,
-            None,
-            None,
-            None,
-            None,
-            None,
-            None,
-            None,
-            None,
-            None,
-            None,
-            None,
+        return PoseCostMetric(
+            hold_partial_pose=self.hold_partial_pose,
+            release_partial_pose=self.release_partial_pose,
+            hold_vec_weight=None if self.hold_vec_weight is None else self.hold_vec_weight.clone(),
+            reach_partial_pose=self.reach_partial_pose,
+            reach_full_pose=self.reach_full_pose,
+            reach_vec_weight=(
+                None if self.reach_vec_weight is None else self.reach_vec_weight.clone()
+            ),
+            offset_position=None if self.offset_position is None else self.offset_position.clone(),
+            offset_rotation=None if self.offset_rotation is None else self.offset_rotation.clone(),
+            offset_tstep_fraction=self.offset_tstep_fraction,
+            remove_offset_waypoint=self.remove_offset_waypoint,
        )

+    @classmethod
+    def create_grasp_approach_metric(
+        cls,
+        offset_position: float = 0.5,
+        linear_axis: int = 2,
+        tstep_fraction: float = 0.6,
+        tensor_args: TensorDeviceType = TensorDeviceType(),
+    ) -> PoseCostMetric:
+        """Enables moving to a pregrasp and then locked orientation movement to final grasp.

-class PoseLoss(Function):
-    @staticmethod
-    def forward(
-        ctx,
-        current_position,
-        goal_position,
-        current_quat,
-        goal_quat,
-        vec_weight,
-        weight,
-        vec_convergence,
-        run_weight,
-        run_vec_weight,
-        batch_pose_idx,
-        out_distance,
-        out_position_distance,
-        out_rotation_distance,
-        out_p_vec,
-        out_r_vec,
-        out_idx,
-        out_p_grad,
-        out_q_grad,
-        batch_size,
-        horizon,
-        mode=PoseErrorType.BATCH_GOAL.value,
-        num_goals=1,
-        use_metric=False,
-    ):
-        # out_distance = current_position[..., 0].detach().clone() * 0.0
-        # out_position_distance = out_distance.detach().clone()
-        # out_rotation_distance = out_distance.detach().clone()
-        # out_vec = (
-        #    torch.cat((current_position.detach().clone(), current_quat.detach().clone()), dim=-1)
-        #    * 0.0
-        # )
-        # out_idx = out_distance.clone().to(dtype=torch.long)
+        Since this is added as a cost, the trajectory will not reach the exact offset, instead it
+        will try to take a blended path to the final grasp without stopping at the offset.

-        (
-            out_distance,
-            out_position_distance,
-            out_rotation_distance,
-            out_p_vec,
-            out_r_vec,
-            out_idx,
-        ) = get_pose_distance(
-            out_distance,
-            out_position_distance,
-            out_rotation_distance,
-            out_p_vec,
-            out_r_vec,
-            out_idx,
-            current_position.contiguous(),
-            goal_position,
-            current_quat.contiguous(),
-            goal_quat,
-            vec_weight,
-            weight,
-            vec_convergence,
-            run_weight,
-            run_vec_weight,
-            batch_pose_idx,
-            batch_size,
-            horizon,
-            mode,
-            num_goals,
-            current_position.requires_grad,
-            False,
-            use_metric,
-        )
-        ctx.save_for_backward(out_p_vec, out_r_vec)
-        return out_distance
+        Args:
+            offset_position: offset in meters.
+            linear_axis: specifies the x y or z axis.
+            tstep_fraction:  specifies the timestep fraction to start activating this constraint.
+            tensor_args: cuda device.

-    @staticmethod
-    def backward(ctx, grad_out_distance):  # , grad_g_dist, grad_r_err, grad_out_idx):
-        pos_grad = None
-        quat_grad = None
-        if ctx.needs_input_grad[0] and ctx.needs_input_grad[2]:
-            (g_vec_p, g_vec_q) = ctx.saved_tensors
-            pos_grad = g_vec_p * grad_out_distance.unsqueeze(1)
-            quat_grad = g_vec_q * grad_out_distance.unsqueeze(1)
-            pos_grad = pos_grad.unsqueeze(-2)
-            quat_grad = quat_grad.unsqueeze(-2)
-        elif ctx.needs_input_grad[0]:
-            (g_vec_p, g_vec_q) = ctx.saved_tensors
-
-            pos_grad = g_vec_p * grad_out_distance.unsqueeze(1)
-            pos_grad = pos_grad.unsqueeze(-2)
-        elif ctx.needs_input_grad[2]:
-            (g_vec_p, g_vec_q) = ctx.saved_tensors
-
-            quat_grad = g_vec_q * grad_out_distance.unsqueeze(1)
-            quat_grad = quat_grad.unsqueeze(-2)
-
-        return (
-            pos_grad,
-            None,
-            quat_grad,
-            None,
-            None,
-            None,
-            None,
-            None,
-            None,
-            None,
-            None,
-            None,
-            None,
-            None,
-            None,
-            None,
-            None,
-            None,
-            None,
-            None,
-            None,
-            None,
-            None,
+        Returns:
+            cost metric.
+        """
+        hold_vec_weight = tensor_args.to_device([1.0, 1.0, 1.0, 1.0, 1.0, 1.0])
+        hold_vec_weight[3 + linear_axis] = 0.0
+        offset_position_vec = tensor_args.to_device([0.0, 0.0, 0.0])
+        offset_position_vec[linear_axis] = offset_position
+        return cls(
+            hold_partial_pose=True,
+            hold_vec_weight=hold_vec_weight,
+            offset_position=offset_position_vec,
+            offset_tstep_fraction=tstep_fraction,
        )

-
-class PoseError(Function):
-    @staticmethod
-    def forward(
-        ctx,
-        current_position,
-        goal_position,
-        current_quat,
-        goal_quat,
-        vec_weight,
-        weight,
-        vec_convergence,
-        run_weight,
-        run_vec_weight,
-        batch_pose_idx,
-        out_distance,
-        out_position_distance,
-        out_rotation_distance,
-        out_p_vec,
-        out_r_vec,
-        out_idx,
-        out_p_grad,
-        out_q_grad,
-        batch_size,
-        horizon,
-        mode=PoseErrorType.BATCH_GOAL.value,
-        num_goals=1,
-        use_metric=False,
-    ):
-        # out_distance = current_position[..., 0].detach().clone() * 0.0
-        # out_position_distance = out_distance.detach().clone()
-        # out_rotation_distance = out_distance.detach().clone()
-        # out_vec = (
-        #    torch.cat((current_position.detach().clone(), current_quat.detach().clone()), dim=-1)
-        #    * 0.0
-        # )
-        # out_idx = out_distance.clone().to(dtype=torch.long)
-
-        (
-            out_distance,
-            out_position_distance,
-            out_rotation_distance,
-            out_p_vec,
-            out_r_vec,
-            out_idx,
-        ) = get_pose_distance(
-            out_distance,
-            out_position_distance,
-            out_rotation_distance,
-            out_p_vec,
-            out_r_vec,
-            out_idx,
-            current_position.contiguous(),
-            goal_position,
-            current_quat.contiguous(),
-            goal_quat,
-            vec_weight,
-            weight,
-            vec_convergence,
-            run_weight,
-            run_vec_weight,
-            batch_pose_idx,
-            batch_size,
-            horizon,
-            mode,
-            num_goals,
-            current_position.requires_grad,
-            False,
-            use_metric,
-        )
-        ctx.save_for_backward(out_p_vec, out_r_vec)
-        return out_distance
-
-    @staticmethod
-    def backward(ctx, grad_out_distance):  # , grad_g_dist, grad_r_err, grad_out_idx):
-        pos_grad = None
-        quat_grad = None
-        if ctx.needs_input_grad[0] and ctx.needs_input_grad[2]:
-            (g_vec_p, g_vec_q) = ctx.saved_tensors
-
-            pos_grad = g_vec_p
-            quat_grad = g_vec_q
-        elif ctx.needs_input_grad[0]:
-            (g_vec_p, g_vec_q) = ctx.saved_tensors
-
-            pos_grad = g_vec_p
-        elif ctx.needs_input_grad[2]:
-            (g_vec_p, g_vec_q) = ctx.saved_tensors
-
-            quat_grad = g_vec_q
-        return (
-            pos_grad,
-            None,
-            quat_grad,
-            None,
-            None,
-            None,
-            None,
-            None,
-            None,
-            None,
-            None,
-            None,
-            None,
-            None,
-            None,
-            None,
-            None,
-            None,
-            None,
-            None,
-            None,
-            None,
-            None,
+    @classmethod
+    def reset_metric(cls) -> PoseCostMetric:
+        return PoseCostMetric(
+            remove_offset_waypoint=True,
+            reach_full_pose=True,
+            release_partial_pose=True,
        )


@@ -449,13 +150,88 @@ class PoseCost(CostBase, PoseCostConfig):
        CostBase.__init__(self)
        self.rot_weight = self.vec_weight[0:3]
        self.pos_weight = self.vec_weight[3:6]
-
        self._vec_convergence = self.tensor_args.to_device(self.vec_convergence)
        self._batch_size = 0
        self._horizon = 0

+    def update_metric(self, metric: PoseCostMetric):
+        if metric.hold_partial_pose:
+            if metric.hold_vec_weight is None:
+                log_error("hold_vec_weight is required")
+            self.hold_partial_pose(metric.hold_vec_weight)
+        if metric.release_partial_pose:
+            self.release_partial_pose()
+        if metric.reach_partial_pose:
+            if metric.reach_vec_weight is None:
+                log_error("reach_vec_weight is required")
+            self.reach_partial_pose(metric.reach_vec_weight)
+        if metric.reach_full_pose:
+            self.reach_full_pose()
+
+        if metric.remove_offset_waypoint:
+            self.remove_offset_waypoint()
+
+        if metric.offset_position is not None or metric.offset_rotation is not None:
+            self.update_offset_waypoint(
+                offset_position=self.offset_position,
+                offset_rotation=self.offset_rotation,
+                offset_tstep_fraction=self.offset_tstep_fraction,
+            )
+
+    def hold_partial_pose(self, run_vec_weight: torch.Tensor):
+
+        self.run_vec_weight.copy_(run_vec_weight)
+
+    def release_partial_pose(self):
+        self.run_vec_weight[:] = 0.0
+
+    def reach_partial_pose(self, vec_weight: torch.Tensor):
+        self.vec_weight[:] = vec_weight
+
+    def reach_full_pose(self):
+        self.vec_weight[:] = 1.0
+
+    def update_offset_waypoint(
+        self,
+        offset_position: Optional[torch.Tensor] = None,
+        offset_rotation: Optional[torch.Tensor] = None,
+        offset_tstep_fraction: float = 0.75,
+    ):
+        if offset_position is not None:
+            self.offset_waypoint[3:].copy_(offset_position)
+        if offset_rotation is not None:
+            self.offset_waypoint[:3].copy_(offset_rotation)
+        self.offset_tstep_fraction[:] = offset_tstep_fraction
+        if self._horizon <= 0:
+            print(self.weight)
+            log_error(
+                "Updating offset waypoint is only possible after initializing motion gen"
+                + " run motion_gen.warmup() before adding offset_waypoint"
+            )
+        self.update_run_weight(run_tstep_fraction=offset_tstep_fraction)
+
+    def remove_offset_waypoint(self):
+        self.offset_tstep_fraction[:] = -1.0
+        self.update_run_weight()
+
+    def update_run_weight(
+        self, run_tstep_fraction: float = 0.0, run_weight: Optional[float] = None
+    ):
+        if self._horizon == 1:
+            return
+
+        if run_weight is None:
+            run_weight = self.run_weight
+
+        active_steps = math.floor(self._horizon * run_tstep_fraction)
+        self._run_weight_vec[:, :active_steps] = 0
+        self._run_weight_vec[:, active_steps:-1] = run_weight
+
    def update_batch_size(self, batch_size, horizon):
        if batch_size != self._batch_size or horizon != self._horizon:
+            # print(self.weight)
+            # print(batch_size, horizon, self._batch_size, self._horizon)
+
            # batch_size = b*h
            self.out_distance = torch.zeros(
                (batch_size, horizon), device=self.tensor_args.device, dtype=self.tensor_args.dtype
@@ -493,12 +269,16 @@ class PoseCost(CostBase, PoseCostConfig):
                self._run_weight_vec = torch.ones(
                    (1, horizon), device=self.tensor_args.device, dtype=self.tensor_args.dtype
                )
-            if self.terminal and self.run_weight is not None:
-                self._run_weight_vec[:, :-1] *= self.run_weight
+            if self.terminal and self.run_weight is not None and horizon > 1:
+                self._run_weight_vec[:, :-1] = self.run_weight

            self._batch_size = batch_size
            self._horizon = horizon

+    @property
+    def goalset_index_buffer(self):
+        return self.out_idx
+
    def _forward_goal_distribution(self, ee_pos_batch, ee_rot_batch, ee_goal_pos, ee_goal_rot):
        ee_goal_pos = ee_goal_pos.unsqueeze(1)
        ee_goal_pos = ee_goal_pos.unsqueeze(1)
@@ -563,13 +343,13 @@ class PoseCost(CostBase, PoseCostConfig):
    def _update_cost_type(self, ee_goal_pos, ee_pos_batch, num_goals):
        d_g = len(ee_goal_pos.shape)
        b_sze = ee_goal_pos.shape[0]
-        if d_g == 2 and b_sze == 1:  # 1, 3
+        if d_g == 2 and b_sze == 1:  # [1, 3]
            self.cost_type = PoseErrorType.SINGLE_GOAL
-        elif d_g == 2 and b_sze == ee_pos_batch.shape[0]:  # b, 3
+        elif d_g == 2 and b_sze > 1:  # [b, 3]
            self.cost_type = PoseErrorType.BATCH_GOAL
-        elif d_g == 3:
+        elif d_g == 3 and b_sze == 1:  # [1, goalset, 3]
            self.cost_type = PoseErrorType.GOALSET
-        elif len(ee_goal_pos.shape) == 4 and b_sze == ee_pos_bath.shape[0]:
+        elif d_g == 3 and b_sze > 1:  # [b, goalset,3]
            self.cost_type = PoseErrorType.BATCH_GOALSET

    def forward_out_distance(
@@ -599,6 +379,8 @@ class PoseCost(CostBase, PoseCostConfig):
            self._vec_convergence,
            self._run_weight_vec,
            self.run_vec_weight,
+            self.offset_waypoint,
+            self.offset_tstep_fraction,
            goal.batch_pose_idx,
            self.out_distance,
            self.out_position_distance,
@@ -613,7 +395,9 @@ class PoseCost(CostBase, PoseCostConfig):
            self.cost_type.value,
            num_goals,
            self.use_metric,
+            self.project_distance,
        )
+        # print(self.out_idx.shape, self.out_idx[:,-1])
        # print(goal.batch_pose_idx.shape)
        cost = distance  # .view(b, h)#.clone()
        r_err = r_err  # .view(b, h)
@@ -632,65 +416,46 @@ class PoseCost(CostBase, PoseCostConfig):
        ee_goal_rot = goal_pose.quaternion
        num_goals = goal_pose.n_goalset
        self._update_cost_type(ee_goal_pos, ee_pos_batch, num_goals)
-
+        # print(self.cost_type)
        b, h, _ = ee_pos_batch.shape
        self.update_batch_size(b, h)
        # return self.out_distance
        # print(b,h, ee_goal_pos.shape)
-        if self.return_loss:
-            distance = PoseLoss.apply(
-                ee_pos_batch,
-                ee_goal_pos,
-                ee_rot_batch,  # .view(-1, 4).contiguous(),
-                ee_goal_rot,
-                self.vec_weight,
-                self.weight,
-                self._vec_convergence,
-                self._run_weight_vec,
-                self.run_vec_weight,
-                goal.batch_pose_idx,
-                self.out_distance,
-                self.out_position_distance,
-                self.out_rotation_distance,
-                self.out_p_vec,
-                self.out_q_vec,
-                self.out_idx,
-                self.out_p_grad,
-                self.out_q_grad,
-                b,
-                h,
-                self.cost_type.value,
-                num_goals,
-                self.use_metric,
-            )
-        else:
-            distance = PoseError.apply(
-                ee_pos_batch,
-                ee_goal_pos,
-                ee_rot_batch,  # .view(-1, 4).contiguous(),
-                ee_goal_rot,
-                self.vec_weight,
-                self.weight,
-                self._vec_convergence,
-                self._run_weight_vec,
-                self.run_vec_weight,
-                goal.batch_pose_idx,
-                self.out_distance,
-                self.out_position_distance,
-                self.out_rotation_distance,
-                self.out_p_vec,
-                self.out_q_vec,
-                self.out_idx,
-                self.out_p_grad,
-                self.out_q_grad,
-                b,
-                h,
-                self.cost_type.value,
-                num_goals,
-                self.use_metric,
-            )
+
+        distance = PoseError.apply(
+            ee_pos_batch,
+            ee_goal_pos,
+            ee_rot_batch,  # .view(-1, 4).contiguous(),
+            ee_goal_rot,
+            self.vec_weight,
+            self.weight,
+            self._vec_convergence,
+            self._run_weight_vec,
+            self.run_vec_weight,
+            self.offset_waypoint,
+            self.offset_tstep_fraction,
+            goal.batch_pose_idx,
+            self.out_distance,
+            self.out_position_distance,
+            self.out_rotation_distance,
+            self.out_p_vec,
+            self.out_q_vec,
+            self.out_idx,
+            self.out_p_grad,
+            self.out_q_grad,
+            b,
+            h,
+            self.cost_type.value,
+            num_goals,
+            self.use_metric,
+            self.project_distance,
+            self.return_loss,
+        )

        cost = distance
+        # if link_name is None and cost.shape[0]==8:
+        #    print(ee_pos_batch[...,-1].squeeze())
+        # print(cost.shape)
        return cost

    def forward_pose(
@@ -708,56 +473,34 @@ class PoseCost(CostBase, PoseCostConfig):
        b = query_pose.position.shape[0]
        h = query_pose.position.shape[1]
        num_goals = 1
-        if self.return_loss:
-            distance = PoseLoss.apply(
-                query_pose.position.unsqueeze(1),
-                ee_goal_pos,
-                query_pose.quaternion.unsqueeze(1),
-                ee_goal_quat,
-                self.vec_weight,
-                self.weight,
-                self._vec_convergence,
-                self._run_weight_vec,
-                self.run_vec_weight,
-                batch_pose_idx,
-                self.out_distance,
-                self.out_position_distance,
-                self.out_rotation_distance,
-                self.out_p_vec,
-                self.out_q_vec,
-                self.out_idx,
-                self.out_p_grad,
-                self.out_q_grad,
-                b,
-                h,
-                self.cost_type.value,
-                num_goals,
-                self.use_metric,
-            )
-        else:
-            distance = PoseError.apply(
-                query_pose.position.unsqueeze(1),
-                ee_goal_pos,
-                query_pose.quaternion.unsqueeze(1),
-                ee_goal_quat,
-                self.vec_weight,
-                self.weight,
-                self._vec_convergence,
-                self._run_weight_vec,
-                self.run_vec_weight,
-                batch_pose_idx,
-                self.out_distance,
-                self.out_position_distance,
-                self.out_rotation_distance,
-                self.out_p_vec,
-                self.out_q_vec,
-                self.out_idx,
-                self.out_p_grad,
-                self.out_q_grad,
-                b,
-                h,
-                self.cost_type.value,
-                num_goals,
-                self.use_metric,
-            )
+
+        distance = PoseError.apply(
+            query_pose.position.unsqueeze(1),
+            ee_goal_pos,
+            query_pose.quaternion.unsqueeze(1),
+            ee_goal_quat,
+            self.vec_weight,
+            self.weight,
+            self._vec_convergence,
+            self._run_weight_vec,
+            self.run_vec_weight,
+            self.offset_waypoint,
+            self.offset_tstep_fraction,
+            batch_pose_idx,
+            self.out_distance,
+            self.out_position_distance,
+            self.out_rotation_distance,
+            self.out_p_vec,
+            self.out_q_vec,
+            self.out_idx,
+            self.out_p_grad,
+            self.out_q_grad,
+            b,
+            h,
+            self.cost_type.value,
+            num_goals,
+            self.use_metric,
+            self.project_distance,
+            self.return_loss,
+        )
        return distance
--- a/src/curobo/rollout/cost/stop_cost.py
+++ b/src/curobo/rollout/cost/stop_cost.py
@@ -68,9 +68,14 @@ class StopCost(CostBase, StopCostConfig):
            self.max_vel = (sum_matrix @ delta_vel).unsqueeze(-1)

    def forward(self, vels):
-        vel_abs = torch.abs(vels)
-        vel_abs = torch.nn.functional.relu(vel_abs - self.max_vel)
-
-        cost = self.weight * (torch.sum(vel_abs**2, dim=-1))
-
+        cost = velocity_cost(vels, self.weight, self.max_vel)
        return cost
+
+
+@torch.jit.script
+def velocity_cost(vels, weight, max_vel):
+    vel_abs = torch.abs(vels)
+    vel_abs = torch.nn.functional.relu(vel_abs - max_vel[: vels.shape[1]])
+    cost = weight * (torch.sum(vel_abs**2, dim=-1))
+
+    return cost
--- a/src/curobo/rollout/rollout_base.py
+++ b/src/curobo/rollout/rollout_base.py
@@ -23,7 +23,7 @@ import torch
 # CuRobo
 from curobo.types.base import TensorDeviceType
 from curobo.types.math import Pose
-from curobo.types.robot import CSpaceConfig, State, JointState
+from curobo.types.robot import CSpaceConfig, State
 from curobo.types.tensor import (
    T_BDOF,
    T_DOF,
@@ -33,6 +33,7 @@ from curobo.types.tensor import (
    T_BValue_float,
 )
 from curobo.util.helpers import list_idx_if_not_none
+from curobo.util.logger import log_info
 from curobo.util.sample_lib import HaltonGenerator
 from curobo.util.tensor_util import copy_tensor

@@ -235,6 +236,7 @@ class Goal(Sequence):
            batch_retract_state_idx=self.batch_retract_state_idx,
            batch_goal_state_idx=self.batch_goal_state_idx,
            links_goal_pose=self.links_goal_pose,
+            n_goalset=self.n_goalset,
        )

    def _tensor_repeat_seeds(self, tensor, num_seeds):
@@ -353,7 +355,7 @@ class Goal(Sequence):

    def _copy_tensor(self, ref_buffer, buffer):
        if buffer is not None:
-            if ref_buffer is not None:
+            if ref_buffer is not None and buffer.shape == ref_buffer.shape:
                if not copy_tensor(buffer, ref_buffer):
                    ref_buffer = buffer.clone()
            else:
@@ -553,6 +555,10 @@ class RolloutBase:
        self._rollout_constraint_cuda_graph_init = False
        if self.cu_rollout_constraint_graph is not None:
            self.cu_rollout_constraint_graph.reset()
+        self.reset_shape()
+
+    def reset_shape(self):
+        pass

    @abstractmethod
    def get_action_from_state(self, state: State):
--- a/src/curobo/types/camera.py
+++ b/src/curobo/types/camera.py
@@ -16,10 +16,15 @@ from typing import List, Optional

 # Third Party
 import torch
+from torch.profiler import record_function

 # CuRobo
+from curobo.geom.cv import (
+    get_projection_rays,
+    project_depth_using_rays,
+    project_pointcloud_to_depth,
+)
 from curobo.types.math import Pose
-from curobo.util.logger import log_error, log_warn


@dataclass
@@ -30,25 +35,82 @@ class CameraObservation:
    depth_image: Optional[torch.Tensor] = None
    image_segmentation: Optional[torch.Tensor] = None
    projection_matrix: Optional[torch.Tensor] = None
+    projection_rays: Optional[torch.Tensor] = None
    resolution: Optional[List[int]] = None
    pose: Optional[Pose] = None
    intrinsics: Optional[torch.Tensor] = None
    timestamp: float = 0.0

+    def filter_depth(self, distance: float = 0.01):
+        self.depth_image = torch.where(self.depth_image < distance, 0, self.depth_image)
+
    @property
    def shape(self):
        return self.rgb_image.shape

+    @record_function("camera/copy_")
    def copy_(self, new_data: CameraObservation):
-        self.rgb_image.copy_(new_data.rgb_image)
-        self.depth_image.copy_(new_data.depth_image)
-        self.image_segmentation.copy_(new_data.image_segmentation)
-        self.projection_matrix.copy_(new_data.projection_matrix)
+        if self.rgb_image is not None:
+            self.rgb_image.copy_(new_data.rgb_image)
+        if self.depth_image is not None:
+            self.depth_image.copy_(new_data.depth_image)
+        if self.image_segmentation is not None:
+            self.image_segmentation.copy_(new_data.image_segmentation)
+        if self.projection_matrix is not None:
+            self.projection_matrix.copy_(new_data.projection_matrix)
+        if self.projection_rays is not None:
+            self.projection_rays.copy_(new_data.projection_rays)
+        if self.pose is not None:
+            self.pose.copy_(new_data.pose)
        self.resolution = new_data.resolution

+    @record_function("camera/clone")
+    def clone(self):
+
+        return CameraObservation(
+            depth_image=self.depth_image.clone() if self.depth_image is not None else None,
+            rgb_image=self.rgb_image.clone() if self.rgb_image is not None else None,
+            intrinsics=self.intrinsics.clone() if self.intrinsics is not None else None,
+            resolution=self.resolution,
+            pose=self.pose.clone() if self.pose is not None else None,
+        )
+
    def to(self, device: torch.device):
        if self.rgb_image is not None:
            self.rgb_image = self.rgb_image.to(device=device)
        if self.depth_image is not None:
            self.depth_image = self.depth_image.to(device=device)
        return self
+
+    def get_pointcloud(self):
+        if self.projection_rays is None:
+            self.update_projection_rays()
+        depth_image = self.depth_image
+        if len(self.depth_image.shape) == 2:
+            depth_image = self.depth_image.unsqueeze(0)
+        point_cloud = project_depth_using_rays(depth_image, self.projection_rays)
+        return point_cloud
+
+    def get_image_from_pointcloud(self, pointcloud, output_image: Optional[torch.Tensor] = None):
+        if output_image is None:
+            output_image = torch.zeros(
+                (self.depth_image.shape[0], self.depth_image.shape[1]),
+                dtype=pointcloud.dtype,
+                device=pointcloud.device,
+            )
+
+        depth_image = project_pointcloud_to_depth(pointcloud, output_image=output_image)
+        return depth_image
+
+    def update_projection_rays(self):
+        intrinsics = self.intrinsics
+        if len(intrinsics.shape) == 2:
+            intrinsics = intrinsics.unsqueeze(0)
+        project_rays = get_projection_rays(
+            self.depth_image.shape[-2], self.depth_image.shape[-1], intrinsics
+        )
+
+        if self.projection_rays is None:
+            self.projection_rays = project_rays
+
+        self.projection_rays.copy_(project_rays)
--- a/src/curobo/types/math.py
+++ b/src/curobo/types/math.py
@@ -19,6 +19,7 @@ import numpy as np
 import torch
 import torch.autograd.profiler as profiler
 from torch.autograd import Function
+from torch.profiler import record_function

 # CuRobo
 from curobo.geom.transform import (
@@ -33,7 +34,7 @@ from curobo.geom.transform import (
 from curobo.types.base import TensorDeviceType
 from curobo.util.helpers import list_idx_if_not_none
 from curobo.util.logger import log_error, log_info, log_warn
-from curobo.util.tensor_util import copy_if_not_none, copy_tensor
+from curobo.util.tensor_util import clone_if_not_none, copy_tensor

 # Local Folder
 from .tensor import T_BPosition, T_BQuaternion, T_BRotation
@@ -256,10 +257,10 @@ class Pose(Sequence):

    def clone(self):
        return Pose(
-            position=copy_if_not_none(self.position),
-            quaternion=copy_if_not_none(self.quaternion),
+            position=clone_if_not_none(self.position),
+            quaternion=clone_if_not_none(self.quaternion),
            normalize_rotation=False,
-            # rotation=copy_if_not_none(self.rotation),
+            # rotation=clone_if_not_none(self.rotation),
        )

    def to(self, tensor_args: TensorDeviceType):
@@ -408,6 +409,7 @@ class Pose(Sequence):
            gpt_out,
        )

+    @record_function("math/pose/transform_points")
    def batch_transform_points(
        self,
        points: torch.Tensor,
@@ -432,6 +434,12 @@ class Pose(Sequence):
    def shape(self):
        return self.position.shape

+    def compute_offset_pose(self, offset: Pose) -> Pose:
+        return self.multiply(offset)
+
+    def compute_local_pose(self, world_pose: Pose) -> Pose:
+        return self.inverse().multiply(world_pose)
+

 def quat_multiply(q1, q2, q_res):
    a_w = q1[..., 0]
--- a/src/curobo/types/state.py
+++ b/src/curobo/types/state.py
@@ -25,7 +25,7 @@ from curobo.types.tensor import T_BDOF, T_DOF
 from curobo.util.logger import log_error, log_info, log_warn
 from curobo.util.tensor_util import (
    check_tensor_shapes,
-    copy_if_not_none,
+    clone_if_not_none,
    copy_tensor,
    fd_tensor,
    tensor_repeat_seeds,
@@ -121,12 +121,14 @@ class JointState(State):
    def repeat_seeds(self, num_seeds: int):
        return JointState(
            position=tensor_repeat_seeds(self.position, num_seeds),
-            velocity=tensor_repeat_seeds(self.velocity, num_seeds)
-            if self.velocity is not None
-            else None,
-            acceleration=tensor_repeat_seeds(self.acceleration, num_seeds)
-            if self.acceleration is not None
-            else None,
+            velocity=(
+                tensor_repeat_seeds(self.velocity, num_seeds) if self.velocity is not None else None
+            ),
+            acceleration=(
+                tensor_repeat_seeds(self.acceleration, num_seeds)
+                if self.acceleration is not None
+                else None
+            ),
            joint_names=self.joint_names,
        )

@@ -153,10 +155,10 @@ class JointState(State):
        if self.joint_names is not None:
            j_names = self.joint_names.copy()
        return JointState(
-            position=copy_if_not_none(self.position),
-            velocity=copy_if_not_none(self.velocity),
-            acceleration=copy_if_not_none(self.acceleration),
-            jerk=copy_if_not_none(self.jerk),
+            position=clone_if_not_none(self.position),
+            velocity=clone_if_not_none(self.velocity),
+            acceleration=clone_if_not_none(self.acceleration),
+            jerk=clone_if_not_none(self.jerk),
            joint_names=j_names,
            tensor_args=self.tensor_args,
        )
--- a/src/curobo/util/metrics.py
+++ b/src/curobo/util/metrics.py
@@ -0,0 +1,64 @@
+#
+# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
+# property and proprietary rights in and to this material, related
+# documentation and any modifications thereto. Any use, reproduction,
+# disclosure or distribution of this material and related documentation
+# without an express license agreement from NVIDIA CORPORATION or
+# its affiliates is strictly prohibited.
+#
+
+# Standard Library
+from dataclasses import dataclass
+from typing import List, Optional
+
+# Third Party
+import numpy as np
+
+try:
+    # Third Party
+    from robometrics.statistics import (
+        Statistic,
+        TrajectoryGroupMetrics,
+        TrajectoryMetrics,
+        percent_true,
+    )
+except ImportError:
+    raise ImportError(
+        "Benchmarking library not found, pip install "
+        + '"robometrics[evaluator] @ git+https://github.com/fishbotics/robometrics.git"'
+    )
+
+
+@dataclass
+class CuroboMetrics(TrajectoryMetrics):
+    time: float = np.inf
+    cspace_path_length: float = 0.0
+    perception_success: bool = False
+    perception_interpolated_success: bool = False
+    jerk: float = np.inf
+
+
+@dataclass
+class CuroboGroupMetrics(TrajectoryGroupMetrics):
+    time: float = np.inf
+    cspace_path_length: Optional[Statistic] = None
+    perception_success: float = 0.0
+    perception_interpolated_success: float = 0.0
+    jerk: float = np.inf
+
+    @classmethod
+    def from_list(cls, group: List[CuroboMetrics]):
+        unskipped = [m for m in group if not m.skip]
+        successes = [m for m in unskipped if m.success]
+        data = super().from_list(group)
+        data.time = Statistic.from_list([m.time for m in successes])
+        data.cspace_path_length = Statistic.from_list([m.cspace_path_length for m in successes])
+        data.perception_success = percent_true([m.perception_success for m in group])
+        data.perception_interpolated_success = percent_true(
+            [m.perception_interpolated_success for m in group]
+        )
+        data.jerk = Statistic.from_list([m.jerk for m in successes])
+
+        return data
--- a/src/curobo/util/sample_lib.py
+++ b/src/curobo/util/sample_lib.py
@@ -111,7 +111,7 @@ class HaltonSampleLib(BaseSampleLib):
        super().__init__(sample_config)
        # create halton generator:
        self.halton_generator = HaltonGenerator(
-            self.ndims, seed=self.seed, tensor_args=self.tensor_args
+            self.d_action, seed=self.seed, tensor_args=self.tensor_args
        )

    def get_samples(self, sample_shape, base_seed=None, filter_smooth=False, **kwargs):
@@ -122,8 +122,10 @@ class HaltonSampleLib(BaseSampleLib):
            seed = self.seed if base_seed is None else base_seed
            self.sample_shape = sample_shape
            self.seed = seed
-            self.samples = self.halton_generator.get_gaussian_samples(sample_shape[0])
-            self.samples = self.samples.view(self.samples.shape[0], self.horizon, self.d_action)
+            self.samples = self.halton_generator.get_gaussian_samples(
+                sample_shape[0] * self.horizon
+            )
+            self.samples = self.samples.view(sample_shape[0], self.horizon, self.d_action)

            if filter_smooth:
                self.samples = self.filter_smooth(self.samples)
@@ -301,7 +303,7 @@ class StompSampleLib(BaseSampleLib):

        self.filter_coeffs = None
        self.halton_generator = HaltonGenerator(
-            self.ndims, seed=self.seed, tensor_args=self.tensor_args
+            self.d_action, seed=self.seed, tensor_args=self.tensor_args
        )

    def get_samples(self, sample_shape, base_seed=None, **kwargs):
@@ -314,7 +316,10 @@ class StompSampleLib(BaseSampleLib):

            # self.seed = seed
            # torch.manual_seed(self.seed)
-            halton_samples = self.halton_generator.get_gaussian_samples(sample_shape[0])
+            halton_samples = self.halton_generator.get_gaussian_samples(
+                sample_shape[0] * self.horizon
+            ).view(sample_shape[0], self.horizon * self.d_action)
+
            halton_samples = (
                self.stomp_scale_tril.unsqueeze(0) @ halton_samples.unsqueeze(-1)
            ).squeeze(-1)
@@ -415,7 +420,7 @@ class HaltonGenerator:
        up_bounds=[1],
        low_bounds=[0],
        seed=123,
-        store_buffer: Optional[int] = 1000,
+        store_buffer: Optional[int] = 2000,
    ):
        self._seed = seed
        self.tensor_args = tensor_args
@@ -499,7 +504,7 @@ class HaltonGenerator:
    @profiler.record_function("halton_generator/gaussian_samples")
    def get_gaussian_samples(self, num_samples, variance=1.0):
        std_dev = np.sqrt(variance)
-        uniform_samples = self.get_samples(num_samples)
+        uniform_samples = self.get_samples(num_samples, False)
        gaussian_halton_samples = gaussian_transform(
            uniform_samples, self.proj_mat, self.i_mat, std_dev
        )
@@ -508,7 +513,7 @@ class HaltonGenerator:

@torch.jit.script
 def gaussian_transform(
-    uniform_samples: torch.Tensor, proj_mat: torch.Tensor, i_mat: torch.Tensor, variance: float
+    uniform_samples: torch.Tensor, proj_mat: torch.Tensor, i_mat: torch.Tensor, std_dev: float
 ):
    """Compute a guassian transform of uniform samples.

@@ -525,7 +530,7 @@ def gaussian_transform(
    # these values.
    changed_samples = 1.99 * uniform_samples - 0.99
    gaussian_halton_samples = proj_mat * torch.erfinv(changed_samples)
-    i_mat = i_mat * variance
+    i_mat = i_mat * std_dev
    gaussian_halton_samples = torch.matmul(gaussian_halton_samples, i_mat)
    return gaussian_halton_samples

--- a/src/curobo/util/tensor_util.py
+++ b/src/curobo/util/tensor_util.py
@@ -31,11 +31,29 @@ def copy_tensor(new_tensor: torch.Tensor, mem_tensor: torch.Tensor):
    return False


-def copy_if_not_none(x):
+def copy_if_not_none(new_tensor, ref_tensor):
    """Clones x if it's not None.
    TODO: Rename this to clone_if_not_none


+    Args:
+        x (torch.Tensor): _description_
+
+    Returns:
+        _type_: _description_
+    """
+    if ref_tensor is not None and new_tensor is not None:
+        ref_tensor.copy_(new_tensor)
+    elif ref_tensor is None and new_tensor is not None:
+        ref_tensor = new_tensor
+
+    return ref_tensor
+
+
+def clone_if_not_none(x):
+    """Clones x if it's not None.
+
+
    Args:
        x (torch.Tensor): _description_

--- a/src/curobo/util/torch_utils.py
+++ b/src/curobo/util/torch_utils.py
@@ -34,3 +34,10 @@ def is_cuda_graph_available():
        log_warn("WARNING: Disabling CUDA Graph as pytorch < 1.10")
        return False
    return True
+
+
+def is_torch_compile_available():
+    if version.parse(torch.__version__) < version.parse("2.0"):
+        log_warn("WARNING: Disabling compile as pytorch < 2.0")
+        return False
+    return True
--- a/src/curobo/util/usd_helper.py
+++ b/src/curobo/util/usd_helper.py
@@ -16,7 +16,6 @@ from typing import Dict, List, Optional, Union
 # Third Party
 import numpy as np
 import torch
-from pxr import Gf, Sdf, Usd, UsdGeom, UsdPhysics, UsdShade
 from tqdm import tqdm

 # CuRobo
@@ -47,6 +46,15 @@ from curobo.util_file import (
 )
 from curobo.wrap.reacher.motion_gen import MotionGenResult

+try:
+    # Third Party
+    from pxr import Gf, Sdf, Usd, UsdGeom, UsdPhysics, UsdShade
+except ImportError:
+    raise ImportError(
+        "usd-core failed to import, install with pip install usd-core"
+        + " NOTE: Do not install this if using with ISAAC SIM."
+    )
+

 def set_prim_translate(prim, translation):
    UsdGeom.Xformable(prim).AddTranslateOp().Set(Gf.Vec3d(translation))
--- a/src/curobo/util/warp.py
+++ b/src/curobo/util/warp.py
@@ -24,5 +24,5 @@ def init_warp(quiet=True, tensor_args: TensorDeviceType = TensorDeviceType()):
    # wp.config.enable_backward = True
    wp.init()

-    wp.force_load(wp.device_from_torch(tensor_args.device))
+    # wp.force_load(wp.device_from_torch(tensor_args.device))
    return True
--- a/src/curobo/wrap/model/robot_segmenter.py
+++ b/src/curobo/wrap/model/robot_segmenter.py
@@ -0,0 +1,204 @@
+#
+# Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
+# property and proprietary rights in and to this material, related
+# documentation and any modifications thereto. Any use, reproduction,
+# disclosure or distribution of this material and related documentation
+# without an express license agreement from NVIDIA CORPORATION or
+# its affiliates is strictly prohibited.
+#
+
+# Standard Library
+from typing import Dict, Optional, Tuple, Union
+
+# Third Party
+import torch
+from torch.profiler import record_function
+
+# CuRobo
+from curobo.geom.cv import (
+    get_projection_rays,
+    project_depth_using_rays,
+)
+from curobo.geom.types import PointCloud
+from curobo.types.base import TensorDeviceType
+from curobo.types.camera import CameraObservation
+from curobo.types.math import Pose
+from curobo.types.robot import RobotConfig
+from curobo.types.state import JointState
+from curobo.util.logger import log_error
+from curobo.util_file import get_robot_configs_path, join_path, load_yaml
+from curobo.wrap.model.robot_world import RobotWorld, RobotWorldConfig
+
+
+class RobotSegmenter:
+    def __init__(
+        self,
+        robot_world: RobotWorld,
+        distance_threshold: float = 0.05,
+        use_cuda_graph: bool = True,
+    ):
+        self._robot_world = robot_world
+        self._projection_rays = None
+        self.ready = False
+        self._out_points_buffer = None
+        self._out_gp = None
+        self._out_gq = None
+        self._out_gpt = None
+        self._cu_graph = None
+        self._use_cuda_graph = use_cuda_graph
+        self.tensor_args = robot_world.tensor_args
+        self.distance_threshold = distance_threshold
+
+    @staticmethod
+    def from_robot_file(
+        robot_file: Union[str, Dict],
+        collision_sphere_buffer: Optional[float],
+        distance_threshold: float = 0.05,
+        use_cuda_graph: bool = True,
+        tensor_args: TensorDeviceType = TensorDeviceType(),
+    ):
+        robot_dict = load_yaml(join_path(get_robot_configs_path(), robot_file))["robot_cfg"]
+        if collision_sphere_buffer is not None:
+            robot_dict["kinematics"]["collision_sphere_buffer"] = collision_sphere_buffer
+
+        robot_cfg = RobotConfig.from_dict(robot_dict, tensor_args=tensor_args)
+
+        config = RobotWorldConfig.load_from_config(
+            robot_cfg,
+            None,
+            collision_activation_distance=0.0,
+            tensor_args=tensor_args,
+        )
+        robot_world = RobotWorld(config)
+
+        return RobotSegmenter(
+            robot_world, distance_threshold=distance_threshold, use_cuda_graph=use_cuda_graph
+        )
+
+    def get_pointcloud_from_depth(self, camera_obs: CameraObservation):
+        if self._projection_rays is None:
+            self.update_camera_projection(camera_obs)
+        depth_image = camera_obs.depth_image
+        if len(depth_image.shape) == 2:
+            depth_image = depth_image.unsqueeze(0)
+        points = project_depth_using_rays(depth_image, self._projection_rays)
+
+        return points
+
+    def update_camera_projection(self, camera_obs: CameraObservation):
+        intrinsics = camera_obs.intrinsics
+        if len(intrinsics.shape) == 2:
+            intrinsics = intrinsics.unsqueeze(0)
+        project_rays = get_projection_rays(
+            camera_obs.depth_image.shape[-2], camera_obs.depth_image.shape[-1], intrinsics
+        )
+
+        if self._projection_rays is None:
+            self._projection_rays = project_rays
+
+        self._projection_rays.copy_(project_rays)
+        self.ready = True
+
+    @record_function("robot_segmenter/get_robot_mask")
+    def get_robot_mask(
+        self,
+        camera_obs: CameraObservation,
+        joint_state: JointState,
+    ):
+        """
+        Assumes 1 robot and batch of depth images, batch of poses
+        """
+        if len(camera_obs.depth_image.shape) != 3:
+            log_error("Send depth image as (batch, height, width)")
+
+        active_js = self._robot_world.get_active_js(joint_state)
+
+        mask, filtered_image = self.get_robot_mask_from_active_js(camera_obs, active_js)
+
+        return mask, filtered_image
+
+    def get_robot_mask_from_active_js(
+        self, camera_obs: CameraObservation, active_joint_state: JointState
+    ):
+        q = active_joint_state.position
+        mask, filtered_image = self._call_op(camera_obs, q)
+
+        return mask, filtered_image
+
+    def _create_cg_graph(self, cam_obs, q):
+        self._cu_cam_obs = cam_obs.clone()
+        self._cu_q = q.clone()
+        s = torch.cuda.Stream(device=self.tensor_args.device)
+        s.wait_stream(torch.cuda.current_stream(device=self.tensor_args.device))
+
+        with torch.cuda.stream(s):
+            for _ in range(3):
+                self._cu_out, self._cu_filtered_out = self._mask_op(self._cu_cam_obs, self._cu_q)
+        torch.cuda.current_stream(device=self.tensor_args.device).wait_stream(s)
+
+        self._cu_graph = torch.cuda.CUDAGraph()
+        with torch.cuda.graph(self._cu_graph, stream=s):
+            self._cu_out, self._cu_filtered_out = self._mask_op(
+                self._cu_cam_obs,
+                self._cu_q,
+            )
+
+    def _call_op(self, cam_obs, q):
+        if self._use_cuda_graph:
+            if self._cu_graph is None:
+                self._create_cg_graph(cam_obs, q)
+            self._cu_cam_obs.copy_(cam_obs)
+            self._cu_q.copy_(q)
+            self._cu_graph.replay()
+            return self._cu_out.clone(), self._cu_filtered_out.clone()
+        return self._mask_op(cam_obs, q)
+
+    @record_function("robot_segmenter/_mask_op")
+    def _mask_op(self, camera_obs, q):
+        if len(q.shape) == 1:
+            q = q.unsqueeze(0)
+        points = self.get_pointcloud_from_depth(camera_obs)
+        camera_to_robot = camera_obs.pose
+
+        if self._out_points_buffer is None:
+            self._out_points_buffer = points.clone()
+        if self._out_gpt is None:
+            self._out_gpt = torch.zeros((points.shape[0], points.shape[1], 3), device=points.device)
+        if self._out_gp is None:
+            self._out_gp = torch.zeros((camera_to_robot.position.shape[0], 3), device=points.device)
+        if self._out_gq is None:
+            self._out_gq = torch.zeros(
+                (camera_to_robot.quaternion.shape[0], 4), device=points.device
+            )
+
+        points_in_robot_frame = camera_to_robot.batch_transform_points(
+            points,
+            out_buffer=self._out_points_buffer,
+            gp_out=self._out_gp,
+            gq_out=self._out_gq,
+            gpt_out=self._out_gpt,
+        )
+
+        out_points = points_in_robot_frame
+
+        dist = self._robot_world.get_point_robot_distance(out_points, q)
+
+        mask, filtered_image = mask_image(camera_obs.depth_image, dist, self.distance_threshold)
+
+        return mask, filtered_image
+
+
+@torch.jit.script
+def mask_image(
+    image: torch.Tensor, distance: torch.Tensor, distance_threshold: float
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    distance = distance.view(
+        image.shape[0],
+        image.shape[1],
+        image.shape[2],
+    )
+    mask = torch.logical_and((image > 0.0), (distance > -distance_threshold))
+    filtered_image = torch.where(mask, 0, image)
+    return mask, filtered_image
--- a/src/curobo/wrap/model/robot_world.py
+++ b/src/curobo/wrap/model/robot_world.py
@@ -34,6 +34,7 @@ from curobo.types.base import TensorDeviceType
 from curobo.types.math import Pose
 from curobo.types.robot import RobotConfig
 from curobo.types.state import JointState
+from curobo.util.logger import log_error
 from curobo.util.sample_lib import HaltonGenerator
 from curobo.util.warp import init_warp
 from curobo.util_file import get_robot_configs_path, get_world_configs_path, join_path, load_yaml
@@ -60,8 +61,8 @@ class RobotWorldConfig:
        world_model: Union[None, str, Dict, WorldConfig, List[WorldConfig], List[str]] = None,
        tensor_args: TensorDeviceType = TensorDeviceType(),
        n_envs: int = 1,
-        n_meshes: int = 10,
-        n_cuboids: int = 10,
+        n_meshes: int = 50,
+        n_cuboids: int = 50,
        collision_activation_distance: float = 0.2,
        self_collision_activation_distance: float = 0.0,
        max_collision_distance: float = 1.0,
@@ -74,6 +75,8 @@ class RobotWorldConfig:
        if isinstance(robot_config, str):
            robot_config = load_yaml(join_path(get_robot_configs_path(), robot_config))["robot_cfg"]
        if isinstance(robot_config, Dict):
+            if "robot_cfg" in robot_config:
+                robot_config = robot_config["robot_cfg"]
            robot_config = RobotConfig.from_dict(robot_config, tensor_args)
        kinematics = CudaRobotModel(robot_config.kinematics)

@@ -178,8 +181,11 @@ class RobotWorld(RobotWorldConfig):
    def __init__(self, config: RobotWorldConfig) -> None:
        RobotWorldConfig.__init__(self, **vars(config))
        self._batch_pose_idx = None
+        self._camera_projection_rays = None

    def get_kinematics(self, q: torch.Tensor) -> CudaRobotModelState:
+        if len(q.shape) == 1:
+            log_error("q should be of shape [b, dof]")
        state = self.kinematics.get_state(q)
        return state

@@ -344,6 +350,37 @@ class RobotWorld(RobotWorldConfig):
        distance = self.pose_cost.forward_pose(x_des, x_current, self._batch_pose_idx)
        return distance

+    def get_point_robot_distance(self, points: torch.Tensor, q: torch.Tensor):
+        """Compute distance from the robot at q joint configuration to points (e.g., pointcloud)
+
+        Args:
+            points: [b,n,3]
+            q: [1, dof]
+
+        Returns:
+            distance: [b,1] Positive is in collision with robot
+        NOTE: This currently does not support batched robot but can be done easily.
+        """
+        if len(q.shape) == 1:
+            log_error("q should be of shape [b, dof]")
+        kin_state = self.get_kinematics(q)
+        b, n = None, None
+        if len(points.shape) == 3:
+            b, n, _ = points.shape
+            points = points.view(b * n, 3)
+
+        pt_distance = point_robot_distance(kin_state.link_spheres_tensor, points)
+
+        if b is not None:
+            pt_distance = pt_distance.view(b, n)
+
+        return pt_distance
+
+    def get_active_js(self, full_js: JointState):
+        active_jnames = self.kinematics.joint_names
+        out_js = full_js.get_ordered_joint_state(active_jnames)
+        return out_js
+

@torch.jit.script
 def sum_mask(d1, d2, d3):
@@ -357,3 +394,15 @@ def mask(d1, d2, d3):
    d_total = d1 + d2 + d3
    d_mask = d_total == 0.0
    return d_mask
+
+
+@torch.jit.script
+def point_robot_distance(link_spheres_tensor, points):
+    robot_spheres = link_spheres_tensor.view(1, -1, 4).contiguous()
+    robot_radius = robot_spheres[:, :, 3]
+    points = points.unsqueeze(1)
+    sph_distance = (
+        torch.linalg.norm(points - robot_spheres[:, :, :3], dim=-1) - robot_radius
+    )  # b, n_spheres
+    pt_distance = torch.max(-1 * sph_distance, dim=-1)[0]
+    return pt_distance
--- a/src/curobo/wrap/reacher/evaluator.py
+++ b/src/curobo/wrap/reacher/evaluator.py
@@ -52,7 +52,8 @@ def smooth_cost(abs_acc, abs_jerk, opt_dt):
    # jerk = torch.max(torch.max(abs_jerk, dim=-1)[0], dim=-1)[0]
    jerk = torch.mean(torch.max(abs_jerk, dim=-1)[0], dim=-1)
    mean_acc = torch.mean(torch.max(abs_acc, dim=-1)[0], dim=-1)  # [0]
-    a = (jerk * 0.001) + opt_dt + (mean_acc * 0.01)
+    a = (jerk * 0.001) + 5.0 * opt_dt + (mean_acc * 0.01)
+
    return a


--- a/src/curobo/wrap/reacher/ik_solver.py
+++ b/src/curobo/wrap/reacher/ik_solver.py
@@ -26,6 +26,7 @@ from curobo.opt.newton.newton_base import NewtonOptBase, NewtonOptConfig
 from curobo.opt.particle.parallel_es import ParallelES, ParallelESConfig
 from curobo.opt.particle.parallel_mppi import ParallelMPPI, ParallelMPPIConfig
 from curobo.rollout.arm_reacher import ArmReacher, ArmReacherConfig
+from curobo.rollout.cost.pose_cost import PoseCostMetric
 from curobo.rollout.rollout_base import Goal, RolloutBase, RolloutMetrics
 from curobo.types.base import TensorDeviceType
 from curobo.types.math import Pose
@@ -56,6 +57,7 @@ class IKSolverConfig:
    world_coll_checker: Optional[WorldCollision] = None
    sample_rejection_ratio: int = 50
    tensor_args: TensorDeviceType = TensorDeviceType()
+    use_cuda_graph: bool = True

    @staticmethod
    @profiler.record_function("ik_solver/load_from_robot_config")
@@ -72,12 +74,12 @@ class IKSolverConfig:
        base_cfg_file: str = "base_cfg.yml",
        particle_file: str = "particle_ik.yml",
        gradient_file: str = "gradient_ik.yml",
-        use_cuda_graph: Optional[bool] = None,
+        use_cuda_graph: bool = True,
        self_collision_check: bool = True,
        self_collision_opt: bool = True,
        grad_iters: Optional[int] = None,
        use_particle_opt: bool = True,
-        collision_checker_type: Optional[CollisionCheckerType] = CollisionCheckerType.PRIMITIVE,
+        collision_checker_type: Optional[CollisionCheckerType] = CollisionCheckerType.MESH,
        sync_cuda_time: Optional[bool] = None,
        use_gradient_descent: bool = False,
        collision_cache: Optional[Dict[str, int]] = None,
@@ -90,6 +92,7 @@ class IKSolverConfig:
        regularization: bool = True,
        collision_activation_distance: Optional[float] = None,
        high_precision: bool = False,
+        project_pose_to_goal_frame: bool = True,
    ):
        if position_threshold <= 0.001:
            high_precision = True
@@ -116,6 +119,9 @@ class IKSolverConfig:
            base_config_data["convergence"]["cspace_cfg"]["weight"] = 0.0
            config_data["cost"]["bound_cfg"]["null_space_weight"] = 0.0
            grad_config_data["cost"]["bound_cfg"]["null_space_weight"] = 0.0
+
+        if isinstance(robot_cfg, str):
+            robot_cfg = load_yaml(join_path(get_robot_configs_path(), robot_cfg))["robot_cfg"]
        if ee_link_name is not None:
            if isinstance(robot_cfg, RobotConfig):
                raise NotImplementedError("ee link cannot be changed after creating RobotConfig")
@@ -123,8 +129,6 @@ class IKSolverConfig:
                robot_cfg.kinematics.ee_link = ee_link_name
            else:
                robot_cfg["kinematics"]["ee_link"] = ee_link_name
-        if isinstance(robot_cfg, str):
-            robot_cfg = load_yaml(join_path(get_robot_configs_path(), robot_cfg))["robot_cfg"]
        if isinstance(robot_cfg, dict):
            robot_cfg = RobotConfig.from_dict(robot_cfg, tensor_args)

@@ -160,8 +164,8 @@ class IKSolverConfig:
            grad_config_data["cost"]["self_collision_cfg"]["weight"] = 0.0
        if grad_iters is not None:
            grad_config_data["lbfgs"]["n_iters"] = grad_iters
-        config_data["mppi"]["n_envs"] = 1
-        grad_config_data["lbfgs"]["n_envs"] = 1
+        config_data["mppi"]["n_problems"] = 1
+        grad_config_data["lbfgs"]["n_problems"] = 1
        grad_cfg = ArmReacherConfig.from_dict(
            robot_cfg,
            grad_config_data["model"],
@@ -241,6 +245,7 @@ class IKSolverConfig:
            world_coll_checker=world_coll_checker,
            rollout_fn=aux_rollout,
            tensor_args=tensor_args,
+            use_cuda_graph=use_cuda_graph,
        )
        return ik_cfg

@@ -259,6 +264,7 @@ class IKResult(Sequence):
    error: T_BValue_float
    solve_time: float
    debug_info: Optional[Any] = None
+    goalset_index: Optional[torch.Tensor] = None

    def __getitem__(self, idx):
        success = self.success[idx]
@@ -272,6 +278,7 @@ class IKResult(Sequence):
            position_error=self.position_error[idx],
            rotation_error=self.rotation_error[idx],
            debug_info=self.debug_info,
+            goalset_index=None if self.goalset_index is None else self.goalset_index[idx],
        )

    def __len__(self):
@@ -317,7 +324,7 @@ class IKSolver(IKSolverConfig):
            self.solver.rollout_fn.retract_state.unsqueeze(0)
        )
        self.dof = self.solver.safety_rollout.d_action
-        self._col = torch.arange(0, 1, device=self.tensor_args.device, dtype=torch.long)
+        self._col = None  # torch.arange(0, 1, device=self.tensor_args.device, dtype=torch.long)

        # self.fixed_seeds = self.solver.safety_rollout.sample_random_actions(100 * 200)
        # create random seeder:
@@ -355,10 +362,14 @@ class IKSolver(IKSolverConfig):
            self._goal_buffer,
            self.tensor_args,
        )
-        # print("Goal:", self._goal_buffer.links_goal_pose)
+
        if update_reference:
-            self.solver.update_nenvs(self._solve_state.get_ik_batch_size())
-            self.reset_cuda_graph()
+            self.reset_shape()
+            if self.use_cuda_graph and self._col is not None:
+                log_error("changing goal type, breaking previous cuda graph.")
+                self.reset_cuda_graph()
+
+            self.solver.update_nproblems(self._solve_state.get_ik_batch_size())
            self._goal_buffer.current_state = self.init_state.repeat_seeds(goal_pose.batch)
            self._col = torch.arange(
                0,
@@ -676,6 +687,8 @@ class IKSolver(IKSolverConfig):
        if newton_iters is not None:
            self.solver.newton_optimizer.outer_iters = self.og_newton_iters
        ik_result = self.get_result(num_seeds, result, goal_buffer.goal_pose, return_seeds)
+        if ik_result.goalset_index is not None:
+            ik_result.goalset_index[ik_result.goalset_index >= goal_pose.n_goalset] = 0

        return ik_result

@@ -684,15 +697,18 @@ class IKSolver(IKSolverConfig):
        self, num_seeds: int, result: WrapResult, goal_pose: Pose, return_seeds: int
    ) -> IKResult:
        success = self.get_success(result.metrics, num_seeds=num_seeds)
-        if result.metrics.cost is not None:
-            result.metrics.pose_error += result.metrics.cost
+        # if result.metrics.cost is not None:
+        #    result.metrics.pose_error += result.metrics.cost * 0.0001
+        if result.metrics.null_space_error is not None:
+            result.metrics.pose_error += result.metrics.null_space_error
        if result.metrics.cspace_error is not None:
            result.metrics.pose_error += result.metrics.cspace_error

-        q_sol, success, position_error, rotation_error, total_error = get_result(
+        q_sol, success, position_error, rotation_error, total_error, goalset_index = get_result(
            result.metrics.pose_error,
            result.metrics.position_error,
            result.metrics.rotation_error,
+            result.metrics.goalset_index,
            success,
            result.action.position,
            self._col,
@@ -717,6 +733,7 @@ class IKSolver(IKSolverConfig):
            solve_time=result.solve_time,
            error=total_error,
            debug_info={"solver": result.debug},
+            goalset_index=goalset_index,
        )
        return ik_result

@@ -959,6 +976,10 @@ class IKSolver(IKSolverConfig):
        self.solver.reset_cuda_graph()
        self.rollout_fn.reset_cuda_graph()

+    def reset_shape(self):
+        self.solver.reset_shape()
+        self.rollout_fn.reset_shape()
+
    def attach_object_to_robot(
        self,
        sphere_radius: float,
@@ -977,6 +998,17 @@ class IKSolver(IKSolverConfig):
    def get_retract_config(self):
        return self.rollout_fn.dynamics_model.retract_config

+    def update_pose_cost_metric(
+        self,
+        metric: PoseCostMetric,
+    ):
+        rollouts = self.get_all_rollout_instances()
+        [
+            rollout.update_pose_cost_metric(metric)
+            for rollout in rollouts
+            if isinstance(rollout, ArmReacher)
+        ]
+

@torch.jit.script
 def get_success(
@@ -1001,6 +1033,7 @@ def get_result(
    pose_error,
    position_error,
    rotation_error,
+    goalset_index: Union[torch.Tensor, None],
    success,
    sol_position,
    col,
@@ -1018,4 +1051,6 @@ def get_result(
    position_error = position_error[idx].view(batch_size, return_seeds)
    rotation_error = rotation_error[idx].view(batch_size, return_seeds)
    total_error = position_error + rotation_error
-    return q_sol, success, position_error, rotation_error, total_error
+    if goalset_index is not None:
+        goalset_index = goalset_index[idx].view(batch_size, return_seeds)
+    return q_sol, success, position_error, rotation_error, total_error, goalset_index
--- a/src/curobo/wrap/reacher/motion_gen.py
+++ b/src/curobo/wrap/reacher/motion_gen.py
--- a/src/curobo/wrap/reacher/mpc.py
+++ b/src/curobo/wrap/reacher/mpc.py
@@ -80,7 +80,7 @@ class MpcSolverConfig:

        task_file = "particle_mpc.yml"
        config_data = load_yaml(join_path(get_task_configs_path(), task_file))
-        config_data["mppi"]["n_envs"] = 1
+        config_data["mppi"]["n_problems"] = 1
        if step_dt is not None:
            config_data["model"]["dt_traj_params"]["base_dt"] = step_dt
        if particle_opt_iters is not None:
@@ -238,7 +238,7 @@ class MpcSolver(MpcSolverConfig):
            self.tensor_args,
        )
        if update_reference:
-            self.solver.update_nenvs(self._solve_state.get_batch_size())
+            self.solver.update_nproblems(self._solve_state.get_batch_size())
            self.reset()
            self.reset_cuda_graph()
            self._col = torch.arange(
--- a/src/curobo/wrap/reacher/trajopt.py
+++ b/src/curobo/wrap/reacher/trajopt.py
@@ -30,6 +30,7 @@ from curobo.opt.newton.newton_base import NewtonOptBase, NewtonOptConfig
 from curobo.opt.particle.parallel_es import ParallelES, ParallelESConfig
 from curobo.opt.particle.parallel_mppi import ParallelMPPI, ParallelMPPIConfig
 from curobo.rollout.arm_reacher import ArmReacher, ArmReacherConfig
+from curobo.rollout.cost.pose_cost import PoseCostMetric
 from curobo.rollout.dynamics_model.integration_utils import (
    action_interpolate_kernel,
    interpolate_kernel,
@@ -39,7 +40,7 @@ from curobo.types.base import TensorDeviceType
 from curobo.types.robot import JointState, RobotConfig
 from curobo.types.tensor import T_BDOF, T_DOF, T_BValue_bool, T_BValue_float
 from curobo.util.helpers import list_idx_if_not_none
-from curobo.util.logger import log_info, log_warn
+from curobo.util.logger import log_error, log_info, log_warn
 from curobo.util.trajectory import (
    InterpolateType,
    calculate_dt_no_clamp,
@@ -78,6 +79,7 @@ class TrajOptSolverConfig:
    trim_steps: Optional[List[int]] = None
    store_debug_in_result: bool = False
    optimize_dt: bool = True
+    use_cuda_graph: bool = True

    @staticmethod
    @profiler.record_function("trajopt_config/load_from_robot_config")
@@ -98,14 +100,14 @@ class TrajOptSolverConfig:
        interpolation_type: InterpolateType = InterpolateType.LINEAR_CUDA,
        interpolation_steps: int = 10000,
        interpolation_dt: float = 0.01,
-        use_cuda_graph: Optional[bool] = None,
+        use_cuda_graph: bool = True,
        self_collision_check: bool = False,
        self_collision_opt: bool = True,
        grad_trajopt_iters: Optional[int] = None,
        num_seeds: int = 2,
        seed_ratio: Dict[str, int] = {"linear": 1.0, "bias": 0.0, "start": 0.0, "end": 0.0},
        use_particle_opt: bool = True,
-        collision_checker_type: Optional[CollisionCheckerType] = CollisionCheckerType.PRIMITIVE,
+        collision_checker_type: Optional[CollisionCheckerType] = CollisionCheckerType.MESH,
        traj_evaluator_config: TrajEvaluatorConfig = TrajEvaluatorConfig(),
        traj_evaluator: Optional[TrajEvaluator] = None,
        minimize_jerk: bool = True,
@@ -128,6 +130,7 @@ class TrajOptSolverConfig:
        state_finite_difference_mode: Optional[str] = None,
        filter_robot_command: bool = False,
        optimize_dt: bool = True,
+        project_pose_to_goal_frame: bool = True,
    ):
        # NOTE: Don't have default optimize_dt, instead read from a configuration file.
        # use default values, disable environment collision checking
@@ -163,6 +166,16 @@ class TrajOptSolverConfig:
        if traj_tsteps is None:
            traj_tsteps = grad_config_data["model"]["horizon"]

+        base_config_data["cost"]["pose_cfg"]["project_distance"] = project_pose_to_goal_frame
+        base_config_data["convergence"]["pose_cfg"]["project_distance"] = project_pose_to_goal_frame
+        config_data["cost"]["pose_cfg"]["project_distance"] = project_pose_to_goal_frame
+        grad_config_data["cost"]["pose_cfg"]["project_distance"] = project_pose_to_goal_frame
+
+        base_config_data["cost"]["pose_cfg"]["project_distance"] = project_pose_to_goal_frame
+        base_config_data["convergence"]["pose_cfg"]["project_distance"] = project_pose_to_goal_frame
+        config_data["cost"]["pose_cfg"]["project_distance"] = project_pose_to_goal_frame
+        grad_config_data["cost"]["pose_cfg"]["project_distance"] = project_pose_to_goal_frame
+
        config_data["model"]["horizon"] = traj_tsteps
        grad_config_data["model"]["horizon"] = traj_tsteps
        if minimize_jerk is not None:
@@ -212,8 +225,8 @@ class TrajOptSolverConfig:
        if not self_collision_opt:
            config_data["cost"]["self_collision_cfg"]["weight"] = 0.0
            grad_config_data["cost"]["self_collision_cfg"]["weight"] = 0.0
-        config_data["mppi"]["n_envs"] = 1
-        grad_config_data["lbfgs"]["n_envs"] = 1
+        config_data["mppi"]["n_problems"] = 1
+        grad_config_data["lbfgs"]["n_problems"] = 1

        if fixed_iters is not None:
            grad_config_data["lbfgs"]["fixed_iters"] = fixed_iters
@@ -256,9 +269,10 @@ class TrajOptSolverConfig:
            world_coll_checker=world_coll_checker,
            tensor_args=tensor_args,
        )
-
+        arm_rollout_mppi = None
        with profiler.record_function("trajopt_config/create_rollouts"):
-            arm_rollout_mppi = ArmReacher(cfg)
+            if use_particle_opt:
+                arm_rollout_mppi = ArmReacher(cfg)
            arm_rollout_grad = ArmReacher(grad_cfg)

            arm_rollout_safety = ArmReacher(safety_cfg)
@@ -266,20 +280,22 @@ class TrajOptSolverConfig:
                aux_rollout = ArmReacher(safety_cfg)
            interpolate_rollout = ArmReacher(safety_cfg)
        if trajopt_dt is not None:
-            arm_rollout_mppi.update_traj_dt(dt=trajopt_dt)
+            if arm_rollout_mppi is not None:
+                arm_rollout_mppi.update_traj_dt(dt=trajopt_dt)
            aux_rollout.update_traj_dt(dt=trajopt_dt)
            arm_rollout_grad.update_traj_dt(dt=trajopt_dt)
            arm_rollout_safety.update_traj_dt(dt=trajopt_dt)
-
-        config_dict = ParallelMPPIConfig.create_data_dict(
-            config_data["mppi"], arm_rollout_mppi, tensor_args
-        )
+        if arm_rollout_mppi is not None:
+            config_dict = ParallelMPPIConfig.create_data_dict(
+                config_data["mppi"], arm_rollout_mppi, tensor_args
+            )
+        parallel_mppi = None
        if use_es is not None and use_es:
            mppi_cfg = ParallelESConfig(**config_dict)
            if es_learning_rate is not None:
                mppi_cfg.learning_rate = es_learning_rate
            parallel_mppi = ParallelES(mppi_cfg)
-        else:
+        elif use_particle_opt:
            mppi_cfg = ParallelMPPIConfig(**config_dict)
            parallel_mppi = ParallelMPPI(mppi_cfg)

@@ -307,7 +323,7 @@ class TrajOptSolverConfig:
        cfg = WrapConfig(
            safety_rollout=arm_rollout_safety,
            optimizers=opt_list,
-            compute_metrics=not evaluate_interpolated_trajectory,
+            compute_metrics=True,  # not evaluate_interpolated_trajectory,
            use_cuda_graph_metrics=grad_config_data["lbfgs"]["use_cuda_graph"],
            sync_cuda_time=sync_cuda_time,
        )
@@ -337,6 +353,7 @@ class TrajOptSolverConfig:
            trim_steps=trim_steps,
            store_debug_in_result=store_debug_in_result,
            optimize_dt=optimize_dt,
+            use_cuda_graph=use_cuda_graph,
        )
        return trajopt_cfg

@@ -360,6 +377,7 @@ class TrajResult(Sequence):
    optimized_dt: Optional[torch.Tensor] = None
    raw_solution: Optional[JointState] = None
    raw_action: Optional[torch.Tensor] = None
+    goalset_index: Optional[torch.Tensor] = None

    def __getitem__(self, idx):
        # position_error = rotation_error = cspace_error = path_buffer_last_tstep = None
@@ -372,6 +390,7 @@ class TrajResult(Sequence):
            self.position_error,
            self.rotation_error,
            self.cspace_error,
+            self.goalset_index,
        ]
        idx_vals = list_idx_if_not_none(d_list, idx)

@@ -388,6 +407,7 @@ class TrajResult(Sequence):
            position_error=idx_vals[3],
            rotation_error=idx_vals[4],
            cspace_error=idx_vals[5],
+            goalset_index=idx_vals[6],
        )

    def __len__(self):
@@ -405,7 +425,7 @@ class TrajOptSolver(TrajOptSolverConfig):
            3, int(self.action_horizon / 2), self.tensor_args
        ).unsqueeze(0)
        assert self.action_horizon / 2 != 0.0
-        self.solver.update_nenvs(self.num_seeds)
+        self.solver.update_nproblems(self.num_seeds)
        self._max_joint_vel = (
            self.solver.safety_rollout.state_bounds.velocity.view(2, self.dof)[1, :].reshape(
                1, 1, self.dof
@@ -469,12 +489,18 @@ class TrajOptSolver(TrajOptSolverConfig):
            self._goal_buffer,
            self.tensor_args,
        )
+
        if update_reference:
-            self.solver.update_nenvs(self._solve_state.get_batch_size())
-            self.reset_cuda_graph()
+            if self.use_cuda_graph and self._col is not None:
+                log_error("changing goal type, breaking previous cuda graph.")
+                self.reset_cuda_graph()
+
+            self.solver.update_nproblems(self._solve_state.get_batch_size())
            self._col = torch.arange(
                0, goal.batch, device=self.tensor_args.device, dtype=torch.long
            )
+            self.reset_shape()
+
        return self._goal_buffer

    def solve_any(
@@ -586,6 +612,8 @@ class TrajOptSolver(TrajOptSolverConfig):
            num_seeds,
            solve_state.batch_mode,
        )
+        if traj_result.goalset_index is not None:
+            traj_result.goalset_index[traj_result.goalset_index >= goal.goal_pose.n_goalset] = 0
        if newton_iters is not None:
            self.solver.newton_optimizer.outer_iters = self._og_newton_iters
            self.solver.newton_optimizer.fixed_iters = self._og_newton_fixed_iters
@@ -839,16 +867,18 @@ class TrajOptSolver(TrajOptSolverConfig):
        if self.evaluate_interpolated_trajectory:
            with profiler.record_function("trajopt/evaluate_interpolated"):
                if self.use_cuda_graph_metrics:
-                    result.metrics = self.interpolate_rollout.get_metrics_cuda_graph(
-                        interpolated_trajs
-                    )
+                    metrics = self.interpolate_rollout.get_metrics_cuda_graph(interpolated_trajs)
                else:
-                    result.metrics = self.interpolate_rollout.get_metrics(interpolated_trajs)
+                    metrics = self.interpolate_rollout.get_metrics(interpolated_trajs)
+                result.metrics.feasible = metrics.feasible
+                result.metrics.position_error = metrics.position_error
+                result.metrics.rotation_error = metrics.rotation_error
+                result.metrics.cspace_error = metrics.cspace_error
+                result.metrics.goalset_index = metrics.goalset_index

        st_time = time.time()
        feasible = torch.all(result.metrics.feasible, dim=-1)
-        # if self.num_seeds == 1:
-        # print(result.metrics)
+
        if result.metrics.position_error is not None:
            converge = torch.logical_and(
                result.metrics.position_error[..., -1] <= self.position_threshold,
@@ -877,10 +907,10 @@ class TrajOptSolver(TrajOptSolverConfig):
                optimized_dt=opt_dt,
                raw_solution=result.action,
                raw_action=result.raw_action,
+                goalset_index=result.metrics.goalset_index,
            )
        else:
            # get path length:
-            # max_vel =
            if self.evaluate_interpolated_trajectory:
                smooth_label, smooth_cost = self.traj_evaluator.evaluate_interpolated_smootheness(
                    interpolated_trajs,
@@ -896,7 +926,6 @@ class TrajOptSolver(TrajOptSolverConfig):
                    self.solver.rollout_fn.dynamics_model.cspace_distance_weight,
                    self._velocity_bounds,
                )
-            # print(smooth_label, success, self._velocity_bounds.shape, self.solver.rollout_fn.dynamics_model.cspace_distance_weight)

            with profiler.record_function("trajopt/best_select"):
                success[~smooth_label] = False
@@ -907,7 +936,8 @@ class TrajOptSolver(TrajOptSolverConfig):
                    convergence_error = result.metrics.cspace_error[..., -1]
                else:
                    raise ValueError("convergence check requires either goal_pose or goal_state")
-                error = convergence_error + smooth_cost
+                running_cost = torch.mean(result.metrics.cost, dim=-1) * 0.0001
+                error = convergence_error + smooth_cost + running_cost
                error[~success] += 10000.0
                if batch_mode:
                    idx = torch.argmin(error.view(goal.batch, num_seeds), dim=-1)
@@ -923,13 +953,16 @@ class TrajOptSolver(TrajOptSolverConfig):
                best_act_seq = result.action[idx]
                best_raw_action = result.raw_action[idx]
                interpolated_traj = interpolated_trajs[idx]
-                position_error = rotation_error = cspace_error = None
+                goalset_index = position_error = rotation_error = cspace_error = None
                if result.metrics.position_error is not None:
                    position_error = result.metrics.position_error[idx, -1]
                if result.metrics.rotation_error is not None:
                    rotation_error = result.metrics.rotation_error[idx, -1]
                if result.metrics.cspace_error is not None:
                    cspace_error = result.metrics.cspace_error[idx, -1]
+                if result.metrics.goalset_index is not None:
+                    goalset_index = result.metrics.goalset_index[idx, -1]
+
                opt_dt = opt_dt[idx]
            if self.sync_cuda_time:
                torch.cuda.synchronize()
@@ -965,6 +998,7 @@ class TrajOptSolver(TrajOptSolverConfig):
                optimized_dt=opt_dt,
                raw_solution=best_act_seq,
                raw_action=best_raw_action,
+                goalset_index=goalset_index,
            )
        return traj_result

@@ -999,7 +1033,6 @@ class TrajOptSolver(TrajOptSolverConfig):
            return self.solve_batch_goalset(
                goal, seed_traj, use_nn_seed, return_all_solutions, num_seeds, seed_success
            )
-        return traj_result

    def get_linear_seed(self, start_state, goal_state):
        start_q = start_state.position.view(-1, 1, self.dof)
@@ -1173,6 +1206,11 @@ class TrajOptSolver(TrajOptSolverConfig):
        self.interpolate_rollout.reset_cuda_graph()
        self.rollout_fn.reset_cuda_graph()

+    def reset_shape(self):
+        self.solver.reset_shape()
+        self.interpolate_rollout.reset_shape()
+        self.rollout_fn.reset_shape()
+
    @property
    def kinematics(self) -> CudaRobotModel:
        return self.rollout_fn.dynamics_model.robot_model
@@ -1205,3 +1243,14 @@ class TrajOptSolver(TrajOptSolverConfig):

    def get_full_js(self, active_js: JointState) -> JointState:
        return self.rollout_fn.get_full_dof_from_solution(active_js)
+
+    def update_pose_cost_metric(
+        self,
+        metric: PoseCostMetric,
+    ):
+        rollouts = self.get_all_rollout_instances()
+        [
+            rollout.update_pose_cost_metric(metric)
+            for rollout in rollouts
+            if isinstance(rollout, ArmReacher)
+        ]
--- a/src/curobo/wrap/reacher/types.py
+++ b/src/curobo/wrap/reacher/types.py
@@ -13,7 +13,7 @@ from __future__ import annotations
 # Standard Library
 from dataclasses import dataclass
 from enum import Enum
-from typing import Dict, List, Optional
+from typing import Dict, List, Optional, Union

 # CuRobo
 from curobo.rollout.rollout_base import Goal
@@ -126,16 +126,26 @@ class ReacherSolveState:
        if (
            current_solve_state is None
            or current_goal_buffer is None
-            or current_solve_state != solve_state
            or (current_goal_buffer.retract_state is None and retract_config is not None)
            or (current_goal_buffer.goal_state is None and goal_state is not None)
            or (current_goal_buffer.links_goal_pose is None and link_poses is not None)
        ):
+            update_reference = True
+
+        elif current_solve_state != solve_state:
+            new_goal_pose = get_padded_goalset(
+                solve_state, current_solve_state, current_goal_buffer, goal_pose
+            )
+            if new_goal_pose is not None:
+                goal_pose = new_goal_pose
+            else:
+                update_reference = True
+
+        if update_reference:
            current_solve_state = solve_state
            current_goal_buffer = solve_state.create_goal_buffer(
                goal_pose, goal_state, retract_config, link_poses, tensor_args
            )
-            update_reference = True
        else:
            current_goal_buffer.goal_pose.copy_(goal_pose)
            if retract_config is not None:
@@ -145,6 +155,7 @@ class ReacherSolveState:
            if link_poses is not None:
                for k in link_poses.keys():
                    current_goal_buffer.links_goal_pose[k].copy_(link_poses[k])
+
        return current_solve_state, current_goal_buffer, update_reference

    def update_goal(
@@ -155,17 +166,26 @@ class ReacherSolveState:
        tensor_args: TensorDeviceType = TensorDeviceType(),
    ):
        solve_state = self
-
        update_reference = False
        if (
            current_solve_state is None
            or current_goal_buffer is None
-            or current_solve_state != solve_state
            or (current_goal_buffer.goal_state is None and goal.goal_state is not None)
            or (current_goal_buffer.goal_state is not None and goal.goal_state is None)
        ):
-            # TODO: Check for change in update idx buffers, currently we assume
-            # that solve_state captures difference in idx buffers
+            update_reference = True
+        elif current_solve_state != solve_state:
+            new_goal_pose = get_padded_goalset(
+                solve_state, current_solve_state, current_goal_buffer, goal.goal_pose
+            )
+            if new_goal_pose is not None:
+                goal = goal.clone()
+                goal.goal_pose = new_goal_pose
+
+            else:
+                update_reference = True
+
+        if update_reference:
            current_solve_state = solve_state
            current_goal_buffer = goal.create_index_buffers(
                solve_state.batch_size,
@@ -174,7 +194,6 @@ class ReacherSolveState:
                solve_state.num_seeds,
                tensor_args,
            )
-            update_reference = True
        else:
            current_goal_buffer.copy_(goal, update_idx_buffers=False)
        return current_solve_state, current_goal_buffer, update_reference
@@ -185,3 +204,92 @@ class MotionGenSolverState:
    solve_type: ReacherSolveType
    ik_solve_state: ReacherSolveState
    trajopt_solve_state: ReacherSolveState
+
+
+def get_padded_goalset(
+    solve_state: ReacherSolveState,
+    current_solve_state: ReacherSolveState,
+    current_goal_buffer: Goal,
+    new_goal_pose: Pose,
+) -> Union[Pose, None]:
+    if (
+        current_solve_state.solve_type == ReacherSolveType.GOALSET
+        and solve_state.solve_type == ReacherSolveType.SINGLE
+    ):
+        # convert single goal to goal set
+        # solve_state.solve_type = ReacherSolveType.GOALSET
+        # solve_state.n_goalset = current_solve_state.n_goalset
+
+        goal_pose = current_goal_buffer.goal_pose.clone()
+        goal_pose.position[:] = new_goal_pose.position
+        goal_pose.quaternion[:] = new_goal_pose.quaternion
+        return goal_pose
+
+    elif (
+        current_solve_state.solve_type == ReacherSolveType.BATCH_GOALSET
+        and solve_state.solve_type == ReacherSolveType.BATCH
+        and new_goal_pose.n_goalset <= current_solve_state.n_goalset
+        and new_goal_pose.batch == current_solve_state.batch_size
+    ):
+        goal_pose = current_goal_buffer.goal_pose.clone()
+        if len(new_goal_pose.position.shape) == 2:
+            new_goal_pose = new_goal_pose.unsqueeze(1)
+        goal_pose.position[..., :, :] = new_goal_pose.position
+        goal_pose.quaternion[..., :, :] = new_goal_pose.quaternion
+        return goal_pose
+    elif (
+        current_solve_state.solve_type == ReacherSolveType.BATCH_ENV_GOALSET
+        and solve_state.solve_type == ReacherSolveType.BATCH_ENV
+        and new_goal_pose.n_goalset <= current_solve_state.n_goalset
+        and new_goal_pose.batch == current_solve_state.batch_size
+    ):
+        goal_pose = current_goal_buffer.goal_pose.clone()
+        if len(new_goal_pose.position.shape) == 2:
+            new_goal_pose = new_goal_pose.unsqueeze(1)
+        goal_pose.position[..., :, :] = new_goal_pose.position
+        goal_pose.quaternion[..., :, :] = new_goal_pose.quaternion
+        return goal_pose
+
+    elif (
+        current_solve_state.solve_type == ReacherSolveType.GOALSET
+        and solve_state.solve_type == ReacherSolveType.GOALSET
+        and new_goal_pose.n_goalset <= current_solve_state.n_goalset
+    ):
+        goal_pose = current_goal_buffer.goal_pose.clone()
+        goal_pose.position[..., : new_goal_pose.n_goalset, :] = new_goal_pose.position
+        goal_pose.quaternion[..., : new_goal_pose.n_goalset, :] = new_goal_pose.quaternion
+        goal_pose.position[..., new_goal_pose.n_goalset :, :] = new_goal_pose.position[..., :1, :]
+        goal_pose.quaternion[..., new_goal_pose.n_goalset :, :] = new_goal_pose.quaternion[
+            ..., :1, :
+        ]
+
+        return goal_pose
+    elif (
+        current_solve_state.solve_type == ReacherSolveType.BATCH_GOALSET
+        and solve_state.solve_type == ReacherSolveType.BATCH_GOALSET
+        and new_goal_pose.n_goalset <= current_solve_state.n_goalset
+        and new_goal_pose.batch == current_solve_state.batch_size
+    ):
+        goal_pose = current_goal_buffer.goal_pose.clone()
+        goal_pose.position[..., : new_goal_pose.n_goalset, :] = new_goal_pose.position
+        goal_pose.quaternion[..., : new_goal_pose.n_goalset, :] = new_goal_pose.quaternion
+        goal_pose.position[..., new_goal_pose.n_goalset :, :] = new_goal_pose.position[..., :1, :]
+        goal_pose.quaternion[..., new_goal_pose.n_goalset :, :] = new_goal_pose.quaternion[
+            ..., :1, :
+        ]
+        return goal_pose
+    elif (
+        current_solve_state.solve_type == ReacherSolveType.BATCH_ENV_GOALSET
+        and solve_state.solve_type == ReacherSolveType.BATCH_ENV_GOALSET
+        and new_goal_pose.n_goalset <= current_solve_state.n_goalset
+        and new_goal_pose.batch == current_solve_state.batch_size
+    ):
+        goal_pose = current_goal_buffer.goal_pose.clone()
+        goal_pose.position[..., : new_goal_pose.n_goalset, :] = new_goal_pose.position
+        goal_pose.quaternion[..., : new_goal_pose.n_goalset, :] = new_goal_pose.quaternion
+        goal_pose.position[..., new_goal_pose.n_goalset :, :] = new_goal_pose.position[..., :1, :]
+        goal_pose.quaternion[..., new_goal_pose.n_goalset :, :] = new_goal_pose.quaternion[
+            ..., :1, :
+        ]
+        return goal_pose
+    return None
--- a/src/curobo/wrap/wrap_base.py
+++ b/src/curobo/wrap/wrap_base.py
@@ -54,7 +54,7 @@ class WrapBase(WrapConfig):
    def __init__(self, config: Optional[WrapConfig] = None):
        if config is not None:
            WrapConfig.__init__(self, **vars(config))
-        self.n_envs = 1
+        self.n_problems = 1
        self.opt_dt = 0
        self._rollout_list = None
        self._opt_rollouts = None
@@ -83,11 +83,11 @@ class WrapBase(WrapConfig):
            debug_list.append(opt.debug_cost)
        return debug_list

-    def update_nenvs(self, n_envs):
-        if n_envs != self.n_envs:
-            self.n_envs = n_envs
+    def update_nproblems(self, n_problems):
+        if n_problems != self.n_problems:
+            self.n_problems = n_problems
            for opt in self.optimizers:
-                opt.update_nenvs(self.n_envs)
+                opt.update_nproblems(self.n_problems)

    def update_params(self, goal: Goal):
        with profiler.record_function("wrap_base/safety/update_params"):
@@ -117,6 +117,12 @@ class WrapBase(WrapConfig):
            opt.reset_cuda_graph()
        self._init_solver = False

+    def reset_shape(self):
+        self.safety_rollout.reset_shape()
+        for opt in self.optimizers:
+            opt.reset_shape()
+        self._init_solver = False
+
    @property
    def rollout_fn(self):
        return self.safety_rollout