improved joint space planning

2024-05-30 14:42:22 -07:00
parent 3bfed9d773
commit 0c51dd2da8
28 changed files with 1135 additions and 213 deletions
--- a/src/curobo/content/configs/robot/jaco7.yml
+++ b/src/curobo/content/configs/robot/jaco7.yml
@@ -10,7 +10,7 @@
 ##

 robot_cfg:
-  
+
  kinematics:

    urdf_path: "robot/jaco/jaco_7s.urdf"
@@ -37,13 +37,13 @@ robot_cfg:
    base_link: "root"
    ee_link: "j2s7s300_end_effector"
    link_names: null
-    
+
    collision_link_names: [
      "j2s7s300_link_base", "j2s7s300_link_1", "j2s7s300_link_2",
      "j2s7s300_link_3", "j2s7s300_link_4", "j2s7s300_link_5", "j2s7s300_link_6", "j2s7s300_link_7",
      "j2s7s300_link_finger_tip_1", "j2s7s300_link_finger_tip_2",
-      "j2s7s300_link_finger_tip_3", 
-      "j2s7s300_link_finger_3", 
+      "j2s7s300_link_finger_tip_3",
+      "j2s7s300_link_finger_3",
      "j2s7s300_link_finger_2",
      "j2s7s300_link_finger_1",
      ]
@@ -151,37 +151,60 @@ robot_cfg:
    collision_sphere_buffer: 0.005
    self_collision_ignore: {
      "j2s7s300_link_base":["j2s7s300_link_1"],
-      "j2s7s300_link_1":["j2s7s300_link_2"], 
+      "j2s7s300_link_1":["j2s7s300_link_2"],
      "j2s7s300_link_2":["j2s7s300_link_3"],
-      "j2s7s300_link_3":["j2s7s300_link_4"], 
-      "j2s7s300_link_4":["j2s7s300_link_5"], 
-      "j2s7s300_link_5":["j2s7s300_link_6"], 
-      "j2s7s300_link_6":["j2s7s300_link_7"], 
+      "j2s7s300_link_3":["j2s7s300_link_4"],
+      "j2s7s300_link_4":["j2s7s300_link_5"],
+      "j2s7s300_link_5":["j2s7s300_link_6",
+      "j2s7s300_link_finger_tip_1",
+      "j2s7s300_link_finger_tip_2",
+      "j2s7s300_link_finger_tip_3",
+      "j2s7s300_link_finger_1",
+      "j2s7s300_link_finger_2",
+      "j2s7s300_link_finger_3"],
+      "j2s7s300_link_6":["j2s7s300_link_7",
+      "j2s7s300_link_finger_tip_1",
+      "j2s7s300_link_finger_tip_2",
+      "j2s7s300_link_finger_tip_3",
+      "j2s7s300_link_finger_1",
+      "j2s7s300_link_finger_2",
+      "j2s7s300_link_finger_3"],
      "j2s7s300_link_7":[
-      "j2s7s300_link_finger_tip_1", 
+      "j2s7s300_link_finger_tip_1",
      "j2s7s300_link_finger_tip_2",
      "j2s7s300_link_finger_tip_3",
      "j2s7s300_link_finger_1",
      "j2s7s300_link_finger_2",
      "j2s7s300_link_finger_3",
      ],
-    
-      "j2s7s300_link_finger_3":["j2s7s300_link_finger_tip_3","j2s7s300_link_finger_2",
-       "j2s7s300_link_finger_1"], 
+
+      "j2s7s300_link_finger_3":
+      ["j2s7s300_link_finger_tip_3","j2s7s300_link_finger_2",
+       "j2s7s300_link_finger_1", "j2s7s300_link_finger_tip_1", "j2s7s300_link_finger_tip_2"],
+
      "j2s7s300_link_finger_2":["j2s7s300_link_finger_tip_2", "j2s7s300_link_finger_1",
-      "j2s7s300_link_finger_3"],
+      "j2s7s300_link_finger_3", "j2s7s300_link_finger_tip_3", "j2s7s300_link_finger_tip_1"],
+
      "j2s7s300_link_finger_1":["j2s7s300_link_finger_tip_1", "j2s7s300_link_finger_2",
-      "j2s7s300_link_finger_3"],
+      "j2s7s300_link_finger_3", "j2s7s300_link_finger_tip_3", "j2s7s300_link_finger_tip_2"],
+
+      "j2s7s300_link_finger_tip_1":["j2s7s300_link_finger_tip_2", "j2s7s300_link_finger_tip_3"],
+
+      "j2s7s300_link_finger_tip_2":["j2s7s300_link_finger_tip_3"],

      } # Dict[str, List[str]]
-    self_collision_buffer: {} # Dict[str, float]
-    
-    mesh_link_names: [ 
+    self_collision_buffer: {
+      #"j2s7s300_link_base": 0.02,
+      #"j2s7s300_link_1": 0.01,
+
+    } # Dict[str, float]
+
+    mesh_link_names: [
      "j2s7s300_link_base", "j2s7s300_link_1", "j2s7s300_link_2",
      "j2s7s300_link_3", "j2s7s300_link_4", "j2s7s300_link_5", "j2s7s300_link_6", "j2s7s300_link_7",
      "j2s7s300_link_finger_tip_1", "j2s7s300_link_finger_tip_2",
-      "j2s7s300_link_finger_tip_3", 
-      "j2s7s300_link_finger_3", 
+      "j2s7s300_link_finger_tip_3",
+      "j2s7s300_link_finger_3",
      "j2s7s300_link_finger_2",
      "j2s7s300_link_finger_1",
    ] # List[str]
@@ -191,18 +214,18 @@ robot_cfg:
    "j2s7s300_joint_finger_tip_1": 0,
    "j2s7s300_joint_finger_tip_2": 0,
    "j2s7s300_joint_finger_tip_3": 0,
-    
+
    }

    cspace:
-      joint_names: 
+      joint_names:
        - j2s7s300_joint_1
        - j2s7s300_joint_2
        - j2s7s300_joint_3
        - j2s7s300_joint_4
        - j2s7s300_joint_5
        - j2s7s300_joint_6
-        - j2s7s300_joint_7 
+        - j2s7s300_joint_7
        - j2s7s300_joint_finger_1
        - j2s7s300_joint_finger_2
        - j2s7s300_joint_finger_3
--- a/src/curobo/content/configs/task/base_cfg.yml
+++ b/src/curobo/content/configs/task/base_cfg.yml
@@ -11,10 +11,10 @@


 world_collision_checker_cfg:
-  cache: null 
+  cache: null
  checker_type: "PRIMITIVE"
  max_distance: 0.1
- 
+

 cost:
  pose_cfg:
@@ -27,8 +27,8 @@ cost:
    weight: [0.0, 0.0]
    vec_convergence: [0.0, 0.00] # orientation, position
    terminal: False
-    
- 
+
+
  bound_cfg:
    weight: 000.0
    activation_distance: [0.0,0.0,0.0,0.0]
@@ -58,14 +58,14 @@ convergence:
    weight: [0.1, 10.0]
    vec_convergence: [0.0, 0.0] # orientation, position
    terminal: False
-    
  cspace_cfg:
    weight: 1.0
    terminal: True
    run_weight: 0.0
+    use_l2_kernel: True
  null_space_cfg:
-    weight: 1.0
+    weight: 0.001
    terminal: True
    run_weight: 1.0
-    
- 
+    use_l2_kernel: True
+
--- a/src/curobo/content/configs/task/finetune_trajopt.yml
+++ b/src/curobo/content/configs/task/finetune_trajopt.yml
@@ -49,7 +49,7 @@ cost:


  cspace_cfg:
-    weight: 10000.0
+    weight: 20000.0
    terminal: True
    run_weight: 0.00 #1

@@ -59,7 +59,7 @@ cost:
    run_weight_velocity: 0.0
    run_weight_acceleration: 1.0
    run_weight_jerk: 1.0
-    activation_distance: [0.05,0.05,0.05,0.05] # for position, velocity, acceleration and jerk
+    activation_distance: [0.05,0.001,0.001,0.001] # for position, velocity, acceleration and jerk
    null_space_weight: [0.0]

  primitive_collision_cfg:
--- a/src/curobo/content/configs/task/gradient_ik.yml
+++ b/src/curobo/content/configs/task/gradient_ik.yml
@@ -40,17 +40,22 @@ cost:
    terminal: False
    use_metric: True
    run_weight: 1.0
+
  cspace_cfg:
    weight: 0.000
+
  bound_cfg:
    weight: 5000.0
    activation_distance: [0.001]
-    null_space_weight: [0.1]
+    null_space_weight: [0.001]
+    use_l2_kernel: True
+
  primitive_collision_cfg:
    weight: 5000.0
    use_sweep: False
    classify: False
    activation_distance: 0.01
+
  self_collision_cfg:
    weight: 5000.0
    classify: False
--- a/src/curobo/content/configs/task/gradient_trajopt.yml
+++ b/src/curobo/content/configs/task/gradient_trajopt.yml
@@ -48,7 +48,7 @@ cost:
    use_metric: True

  cspace_cfg:
-    weight: 10000.0
+    weight: 20000.0
    terminal: True
    run_weight: 0.0

@@ -100,7 +100,7 @@ lbfgs:
  use_cuda_update_best_kernel: True
  use_temporal_smooth: False
  sync_cuda_time: True
-  step_scale: 1.0 #1.0
+  step_scale: 1.0
  last_best: 10
  use_coo_sparse: True
  debug_info:
--- a/src/curobo/content/configs/task/particle_mpc.yml
+++ b/src/curobo/content/configs/task/particle_mpc.yml
@@ -25,7 +25,7 @@ model:
  control_space: 'ACCELERATION'
  teleport_mode: False
  state_finite_difference_mode: "CENTRAL"
- 
+

 cost:
  pose_cfg:
@@ -36,16 +36,16 @@ cost:
    terminal: True
    run_weight: 1.0
    use_metric: True
-    
+
  cspace_cfg:
    weight: 1000.0
    terminal: True
    run_weight: 1.0
-    
+
  bound_cfg:
    weight: [5000.0, 5000.0,5000.0,000.0]
    activation_distance: [0.1,0.1,0.1,0.1] # for position, velocity, acceleration and jerk
-    smooth_weight: [0.0, 50.0, 0.0,0.0] # [vel, acc, jerk, alpha_vel, eta_position, eta_vel, eta_acc] 
+    smooth_weight: [0.0, 50.0, 0.0,0.0] # [vel, acc, jerk, alpha_vel, eta_position, eta_vel, eta_acc]
    run_weight_velocity: 0.0
    run_weight_acceleration: 1.0
    run_weight_jerk: 1.0
@@ -60,7 +60,7 @@ cost:
    use_speed_metric: False
    speed_dt: 0.1 # used only for speed metric
    activation_distance: 0.025
-  
+
  self_collision_cfg:
    weight: 50000.0
    classify: False
@@ -83,13 +83,13 @@ mppi:
  alpha             : 1
  num_particles     : 400 #10000
  update_cov        : True
-  cov_type          : "DIAG_A" # 
+  cov_type          : "DIAG_A" #
  kappa             : 0.0001
  null_act_frac     : 0.05
  sample_mode       : 'BEST'
  base_action       : 'REPEAT'
  squash_fn         : 'CLAMP'
-  n_problems            : 1
+  n_problems        : 1
  use_cuda_graph    : True
  seed              : 0
  store_debug       : False
--- a/src/curobo/geom/sdf/warp_primitives.py
+++ b/src/curobo/geom/sdf/warp_primitives.py
@@ -16,6 +16,16 @@ import warp as wp
 wp.set_module_options({"fast_math": False})


+@wp.func
+def mesh_query_point_fn(
+    idx: wp.uint64,
+    point: wp.vec3,
+    max_distance: float,
+):
+    collide_result = wp.mesh_query_point(idx, point, max_distance)
+    return collide_result
+
+
@wp.kernel
 def get_swept_closest_pt_batch_env(
    pt: wp.array(dtype=wp.vec4),
@@ -74,9 +84,6 @@ def get_swept_closest_pt_batch_env(
    eta = float(0.0)
    dt = float(0.0)
    k0 = float(0.0)
-    face_index = int(0)
-    face_u = float(0.0)
-    face_v = float(0.0)
    sign = float(0.0)
    dist = float(0.0)
    dist_metric = float(0.0)
@@ -143,11 +150,12 @@ def get_swept_closest_pt_batch_env(
            obj_w_pose = wp.transform(obj_position, obj_quat)
            obj_w_pose_t = wp.transform_inverse(obj_w_pose)
            local_pt = wp.transform_point(obj_w_pose, in_pt)
-
-            if wp.mesh_query_point(
-                mesh[i], local_pt, max_dist_buffer, sign, face_index, face_u, face_v
-            ):
-                cl_pt = wp.mesh_eval_position(mesh[i], face_index, face_u, face_v)
+            collide_result = mesh_query_point_fn(mesh[i], local_pt, max_dist_buffer)
+            if collide_result.result:
+                sign = collide_result.sign
+                cl_pt = wp.mesh_eval_position(
+                    mesh[i], collide_result.face, collide_result.u, collide_result.v
+                )
                delta = cl_pt - local_pt
                dis_length = wp.length(delta)
                dist = (-1.0 * dis_length * sign) + in_rad
@@ -185,10 +193,12 @@ def get_swept_closest_pt_batch_env(
                        1.0 - 0.5 * jump_distance / sphere_0_distance
                    )  # dist could be greater than sphere_0_distance here?
                    sphere_int = k0 * local_pt + ((1.0 - k0) * sphere_temp)
-                    if wp.mesh_query_point(
-                        mesh[i], sphere_int, max_dist_buffer, sign, face_index, face_u, face_v
-                    ):
-                        cl_pt = wp.mesh_eval_position(mesh[i], face_index, face_u, face_v)
+                    collide_result = mesh_query_point_fn(mesh[i], sphere_int, max_dist_buffer)
+                    if collide_result.result:
+                        sign = collide_result.sign
+                        cl_pt = wp.mesh_eval_position(
+                            mesh[i], collide_result.face, collide_result.u, collide_result.v
+                        )
                        delta = cl_pt - sphere_int
                        dis_length = wp.length(delta)
                        dist = (-1.0 * dis_length * sign) + in_rad
@@ -229,11 +239,12 @@ def get_swept_closest_pt_batch_env(
                        1.0 - 0.5 * jump_distance / sphere_2_distance
                    )  # dist could be greater than sphere_0_distance here?
                    sphere_int = k0 * local_pt + (1.0 - k0) * sphere_temp
-
-                    if wp.mesh_query_point(
-                        mesh[i], sphere_int, max_dist_buffer, sign, face_index, face_u, face_v
-                    ):
-                        cl_pt = wp.mesh_eval_position(mesh[i], face_index, face_u, face_v)
+                    collide_result = mesh_query_point_fn(mesh[i], sphere_int, max_dist_buffer)
+                    if collide_result.result:
+                        sign = collide_result.sign
+                        cl_pt = wp.mesh_eval_position(
+                            mesh[i], collide_result.face, collide_result.u, collide_result.v
+                        )
                        delta = cl_pt - sphere_int
                        dis_length = wp.length(delta)
                        dist = (-1.0 * dis_length * sign) + in_rad
@@ -375,9 +386,6 @@ def get_closest_pt_batch_env(
    if b_idx >= batch_size or h_idx >= horizon or sph_idx >= nspheres:
        return

-    face_index = int(0)
-    face_u = float(0.0)
-    face_v = float(0.0)
    sign = float(0.0)
    dist = float(0.0)
    grad_vec = wp.vec3(0.0)
@@ -444,11 +452,12 @@ def get_closest_pt_batch_env(
            obj_w_pose = wp.transform(obj_position, obj_quat)

            local_pt = wp.transform_point(obj_w_pose, in_pt)
-
-            if wp.mesh_query_point(
-                mesh[i], local_pt, max_dist_buffer, sign, face_index, face_u, face_v
-            ):
-                cl_pt = wp.mesh_eval_position(mesh[i], face_index, face_u, face_v)
+            collide_result = mesh_query_point_fn(mesh[i], local_pt, max_dist_buffer)
+            if collide_result.result:
+                sign = collide_result.sign
+                cl_pt = wp.mesh_eval_position(
+                    mesh[i], collide_result.face, collide_result.u, collide_result.v
+                )
                delta = cl_pt - local_pt
                dis_length = wp.length(delta)
                dist = -1.0 * dis_length * sign
--- a/src/curobo/geom/sdf/world_mesh.py
+++ b/src/curobo/geom/sdf/world_mesh.py
@@ -126,7 +126,7 @@ class WorldMeshCollision(WorldPrimitiveCollision):
            self._wp_mesh_cache[mesh.name] = self._load_mesh_to_warp(mesh)
            # return self._wp_mesh_cache[mesh.name]
        else:
-            log_info("Object already in warp cache, using existing instance: " + mesh.name)
+            log_warn("Object already in warp cache, using existing instance for: " + mesh.name)
        return self._wp_mesh_cache[mesh.name]

    def _load_batch_mesh_to_warp(self, mesh_list: List[Mesh]):
--- a/src/curobo/opt/particle/particle_opt_utils.py
+++ b/src/curobo/opt/particle/particle_opt_utils.py
@@ -68,10 +68,8 @@ def get_stomp_cov(
    cov, scale_tril, scaled_M = get_stomp_cov_jit(
        horizon, d_action, cov_mode, device=tensor_args.device
    )
-    cov = tensor_args.to_device(cov)
-    scale_tril = tensor_args.to_device(scale_tril)
    if RETURN_M:
-        return cov, scale_tril, tensor_args.to_device(scaled_M)
+        return cov, scale_tril, scaled_M
    return cov, scale_tril


@@ -89,7 +87,7 @@ def get_stomp_cov_jit(
    A = torch.zeros(
        (d_action * horizon, d_action * horizon),
        dtype=torch.float32,
-        device=device,
+        device="cpu",
    )

    if cov_mode == "vel":
@@ -129,10 +127,15 @@ def get_stomp_cov_jit(

    # also compute the cholesky decomposition:
    # scale_tril = torch.zeros((d_action * horizon, d_action * horizon), **tensor_args)
+
    if (cov == cov.T).all() and (torch.linalg.eigvals(cov).real >= 0).all():
        scale_tril = torch.linalg.cholesky(cov)
    else:
        scale_tril = cov
+
+    cov = cov.to(device=device)
+    scale_tril = scale_tril.to(device=device)
+    scaled_M = scaled_M.to(device=device)
    """
    k = 0
    act_cov_matrix = cov[k * horizon:k * horizon + horizon, k * horizon:k * horizon + horizon]
@@ -140,7 +143,7 @@ def get_stomp_cov_jit(
    print(torch.det(act_cov_matrix))
    local_cholesky = matrix_cholesky(act_cov_matrix)
    for k in range(d_action):
-        
+
        scale_tril[k * horizon:k * horizon + horizon,k * horizon:k * horizon + horizon] = local_cholesky
    """

--- a/src/curobo/rollout/arm_base.py
+++ b/src/curobo/rollout/arm_base.py
@@ -261,7 +261,7 @@ class ArmBase(RolloutBase, ArmBaseConfig):
                log_warn(
                    "null space cost is deprecated, use null_space_weight in bound cost instead"
                )
-
+            self.cost_cfg.bound_cfg.dof = self.n_dofs
            self.bound_cost = BoundCost(self.cost_cfg.bound_cfg)

        if self.cost_cfg.manipulability_cfg is not None:
@@ -315,10 +315,12 @@ class ArmBase(RolloutBase, ArmBaseConfig):
        self.cost_cfg.bound_cfg.state_finite_difference_mode = (
            self.dynamics_model.state_finite_difference_mode
        )
-
+        self.cost_cfg.bound_cfg.dof = self.n_dofs
+        self.constraint_cfg.bound_cfg.dof = self.n_dofs
        self.bound_constraint = BoundCost(self.constraint_cfg.bound_cfg)

        if self.convergence_cfg.null_space_cfg is not None:
+            self.convergence_cfg.null_space_cfg.dof = self.n_dofs
            self.null_convergence = DistCost(self.convergence_cfg.null_space_cfg)

        # set start state:
@@ -578,6 +580,7 @@ class ArmBase(RolloutBase, ArmBaseConfig):
        ----------
        action_seq: torch.Tensor [num_particles, horizon, d_act]
        """
+
        # print(act_seq.shape, self._goal_buffer.batch_current_state_idx)
        if self.start_state is None:
            raise ValueError("start_state is not set in rollout")
@@ -585,6 +588,7 @@ class ArmBase(RolloutBase, ArmBaseConfig):
            state = self.dynamics_model.forward(
                self.start_state, act_seq, self._goal_buffer.batch_current_state_idx
            )
+
        with profiler.record_function("cost/all"):
            cost_seq = self.cost_fn(state, act_seq)

--- a/src/curobo/rollout/arm_reacher.py
+++ b/src/curobo/rollout/arm_reacher.py
@@ -174,6 +174,7 @@ class ArmReacher(ArmBase, ArmReacherConfig):
        self._n_goalset = 1

        if self.cost_cfg.cspace_cfg is not None:
+            self.cost_cfg.cspace_cfg.dof = self.d_action
            # self.cost_cfg.cspace_cfg.update_vec_weight(self.dynamics_model.cspace_distance_weight)
            self.dist_cost = DistCost(self.cost_cfg.cspace_cfg)
        if self.cost_cfg.pose_cfg is not None:
@@ -226,6 +227,7 @@ class ArmReacher(ArmBase, ArmReacherConfig):
                if i != self.kinematics.ee_link:
                    self._link_pose_convergence[i] = PoseCost(self.convergence_cfg.link_pose_cfg)
        if self.convergence_cfg.cspace_cfg is not None:
+            self.convergence_cfg.cspace_cfg.dof = self.d_action
            self.cspace_convergence = DistCost(self.convergence_cfg.cspace_cfg)

        # check if g_dist is required in any of the cost terms:
--- a/src/curobo/rollout/cost/bound_cost.py
+++ b/src/curobo/rollout/cost/bound_cost.py
@@ -22,7 +22,7 @@ from curobo.cuda_robot_model.types import JointLimits
 from curobo.types.robot import JointState
 from curobo.types.tensor import T_DOF
 from curobo.util.logger import log_error
-from curobo.util.torch_utils import get_torch_jit_decorator
+from curobo.util.torch_utils import get_cache_fn_decorator, get_torch_jit_decorator
 from curobo.util.warp import init_warp

 # Local Folder
@@ -49,6 +49,7 @@ class BoundCostConfig(CostConfig):
    activation_distance: Union[torch.Tensor, float] = 0.0
    state_finite_difference_mode: str = "BACKWARD"
    null_space_weight: Optional[List[float]] = None
+    use_l2_kernel: bool = False

    def set_bounds(self, bounds: JointLimits, teleport_mode: bool = False):
        self.joint_limits = bounds.clone()
@@ -104,12 +105,21 @@ class BoundCost(CostBase, BoundCostConfig):
            (0, 0, 0), device=self.tensor_args.device, dtype=self.tensor_args.dtype
        )
        self._out_gv_buffer = self._out_ga_buffer = self._out_gj_buffer = empty_buffer
+        if self.use_l2_kernel:
+            if self.cost_type == BoundCostType.POSITION:
+                self._l2_cost = make_bound_pos_kernel(self.dof)
+            if self.cost_type == BoundCostType.BOUNDS_SMOOTH:
+                self._l2_cost = make_bound_pos_smooth_kernel(self.dof)

    def update_batch_size(self, batch, horizon, dof):
        if self._batch_size != batch or self._horizon != horizon or self._dof != dof:
            self._out_c_buffer = torch.zeros(
                (batch, horizon, dof), device=self.tensor_args.device, dtype=self.tensor_args.dtype
            )
+            self._out_c_sum_buffer = torch.zeros(
+                (batch, horizon), device=self.tensor_args.device, dtype=self.tensor_args.dtype
+            )
+
            self._out_gp_buffer = torch.zeros(
                (batch, horizon, dof), device=self.tensor_args.device, dtype=self.tensor_args.dtype
            )
@@ -184,36 +194,61 @@ class BoundCost(CostBase, BoundCostConfig):
            retract_idx = self._retract_cfg_idx

        if self.cost_type == BoundCostType.BOUNDS_SMOOTH:
-            # print(self.joint_limits.jerk.shape, self.joint_limits.position.shape)
-            cost = WarpBoundSmoothFunction.apply(
-                state_batch.position,
-                state_batch.velocity,
-                state_batch.acceleration,
-                state_batch.jerk,
-                retract_config,
-                retract_idx,
-                self.joint_limits.position,
-                self.joint_limits.velocity,
-                self.joint_limits.acceleration,
-                self.joint_limits.jerk,
-                self.weight,
-                self.activation_distance,
-                self.smooth_weight,
-                self.cspace_distance_weight,
-                self.null_space_weight,
-                self.vec_weight,
-                self._run_weight_vel,
-                self._run_weight_acc,
-                self._run_weight_jerk,
-                self._out_c_buffer,
-                self._out_gp_buffer,
-                self._out_gv_buffer,
-                self._out_ga_buffer,
-                self._out_gj_buffer,
-            )
-            # print(self.cspace_distance_weight.shape)
-            # print(cost)
-            # print(self._run_weight_acc)
+            if self.use_l2_kernel:
+                cost = WarpBoundSmoothL2Function.apply(
+                    state_batch.position,
+                    state_batch.velocity,
+                    state_batch.acceleration,
+                    state_batch.jerk,
+                    retract_config,
+                    retract_idx,
+                    self.joint_limits.position,
+                    self.joint_limits.velocity,
+                    self.joint_limits.acceleration,
+                    self.joint_limits.jerk,
+                    self.weight,
+                    self.activation_distance,
+                    self.smooth_weight,
+                    self.cspace_distance_weight,
+                    self.null_space_weight,
+                    self.vec_weight,
+                    self._run_weight_vel,
+                    self._run_weight_acc,
+                    self._run_weight_jerk,
+                    self._out_c_sum_buffer,
+                    self._out_gp_buffer,
+                    self._out_gv_buffer,
+                    self._out_ga_buffer,
+                    self._out_gj_buffer,
+                    self._l2_cost,
+                )
+            else:
+                cost = WarpBoundSmoothFunction.apply(
+                    state_batch.position,
+                    state_batch.velocity,
+                    state_batch.acceleration,
+                    state_batch.jerk,
+                    retract_config,
+                    retract_idx,
+                    self.joint_limits.position,
+                    self.joint_limits.velocity,
+                    self.joint_limits.acceleration,
+                    self.joint_limits.jerk,
+                    self.weight,
+                    self.activation_distance,
+                    self.smooth_weight,
+                    self.cspace_distance_weight,
+                    self.null_space_weight,
+                    self.vec_weight,
+                    self._run_weight_vel,
+                    self._run_weight_acc,
+                    self._run_weight_jerk,
+                    self._out_c_buffer,
+                    self._out_gp_buffer,
+                    self._out_gv_buffer,
+                    self._out_ga_buffer,
+                    self._out_gj_buffer,
+                )
        elif self.cost_type == BoundCostType.BOUNDS:
            cost = WarpBoundFunction.apply(
                state_batch.position,
@@ -237,8 +272,8 @@ class BoundCost(CostBase, BoundCostConfig):
                self._out_gj_buffer,
            )
        elif self.cost_type == BoundCostType.POSITION:
-            if self.return_loss:
-                cost = WarpBoundPosLoss.apply(
+            if self.use_l2_kernel:
+                cost = WarpBoundPosL2Function.apply(
                    state_batch.position,
                    retract_config,
                    retract_idx,
@@ -247,8 +282,10 @@ class BoundCost(CostBase, BoundCostConfig):
                    self.activation_distance,
                    self.null_space_weight,
                    self.vec_weight,
-                    self._out_c_buffer,
+                    self._out_c_sum_buffer,
                    self._out_gp_buffer,
+                    self._l2_cost,
+                    self.return_loss,
                )
            else:
                cost = WarpBoundPosFunction.apply(
@@ -262,6 +299,7 @@ class BoundCost(CostBase, BoundCostConfig):
                    self.vec_weight,
                    self._out_c_buffer,
                    self._out_gp_buffer,
+                    self.return_loss,
                )

        else:
@@ -458,6 +496,132 @@ class WarpBoundSmoothFunction(torch.autograd.Function):
        )


+# create a bound cost tensor:
+class WarpBoundSmoothL2Function(torch.autograd.Function):
+    @staticmethod
+    def forward(
+        ctx,
+        pos,
+        vel,
+        acc,
+        jerk,
+        retract_config,
+        retract_idx,
+        p_b,
+        v_b,
+        a_b,
+        j_b,
+        weight,
+        activation_distance,
+        smooth_weight,
+        cspace_weight,
+        null_space_weight,
+        vec_weight,
+        run_weight_vel,
+        run_weight_acc,
+        run_weight_jerk,
+        out_cost,
+        out_gp,
+        out_gv,
+        out_ga,
+        out_gj,
+        warp_function,
+    ):
+        # scale the weights for smoothness by this dt:
+        wp_device = wp.device_from_torch(vel.device)
+        # assert smooth_weight.shape[0] == 7
+        b, h, dof = vel.shape
+        requires_grad = pos.requires_grad
+
+        wp.launch(
+            kernel=warp_function,
+            dim=b * h,
+            inputs=[
+                wp.from_torch(pos.detach().view(-1), dtype=wp.float32),
+                wp.from_torch(vel.detach().view(-1), dtype=wp.float32),
+                wp.from_torch(acc.detach().view(-1), dtype=wp.float32),
+                wp.from_torch(jerk.detach().view(-1), dtype=wp.float32),
+                wp.from_torch(retract_config.detach().view(-1), dtype=wp.float32),
+                wp.from_torch(retract_idx.detach().view(-1), dtype=wp.int32),
+                wp.from_torch(p_b.view(-1), dtype=wp.float32),
+                wp.from_torch(v_b.view(-1), dtype=wp.float32),
+                wp.from_torch(a_b.view(-1), dtype=wp.float32),
+                wp.from_torch(j_b.view(-1), dtype=wp.float32),
+                wp.from_torch(weight, dtype=wp.float32),
+                wp.from_torch(activation_distance, dtype=wp.float32),
+                wp.from_torch(smooth_weight, dtype=wp.float32),
+                wp.from_torch(cspace_weight, dtype=wp.float32),
+                wp.from_torch(null_space_weight.view(-1), dtype=wp.float32),
+                wp.from_torch(vec_weight.view(-1), dtype=wp.float32),
+                wp.from_torch(run_weight_vel.view(-1), dtype=wp.float32),
+                wp.from_torch(run_weight_acc.view(-1), dtype=wp.float32),
+                wp.from_torch(run_weight_jerk.view(-1), dtype=wp.float32),
+                wp.from_torch(out_cost.view(-1), dtype=wp.float32),
+                wp.from_torch(out_gp.view(-1), dtype=wp.float32),
+                wp.from_torch(out_gv.view(-1), dtype=wp.float32),
+                wp.from_torch(out_ga.view(-1), dtype=wp.float32),
+                wp.from_torch(out_gj.view(-1), dtype=wp.float32),
+                requires_grad,
+                b,
+                h,
+                dof,
+            ],
+            device=wp_device,
+            stream=wp.stream_from_torch(vel.device),
+        )
+        ctx.save_for_backward(out_gp, out_gv, out_ga, out_gj)
+
+        return out_cost
+
+    @staticmethod
+    def backward(ctx, grad_out_cost):
+        (
+            p_grad,
+            v_grad,
+            a_grad,
+            j_grad,
+        ) = ctx.saved_tensors
+        v_g = None
+        a_g = None
+        p_g = None
+        j_g = None
+        if ctx.needs_input_grad[0]:
+            p_g = p_grad  # * grad_out_cost#.unsqueeze(-1)
+        if ctx.needs_input_grad[1]:
+            v_g = v_grad  # * grad_out_cost#.unsqueeze(-1)
+        if ctx.needs_input_grad[2]:
+            a_g = a_grad  # * grad_out_cost#.unsqueeze(-1)
+        if ctx.needs_input_grad[3]:
+            j_g = j_grad  # * grad_out_cost#.unsqueeze(-1)
+        return (
+            p_g,
+            v_g,
+            a_g,
+            j_g,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+        )
+
+
 class WarpBoundFunction(torch.autograd.Function):
    @staticmethod
    def forward(
@@ -581,6 +745,7 @@ class WarpBoundPosFunction(torch.autograd.Function):
        vec_weight,
        out_cost,
        out_gp,
+        return_loss=False,
    ):
        wp_device = wp.device_from_torch(pos.device)
        b, h, dof = pos.shape
@@ -607,10 +772,9 @@ class WarpBoundPosFunction(torch.autograd.Function):
            device=wp_device,
            stream=wp.stream_from_torch(pos.device),
        )
+        ctx.return_loss = return_loss
        ctx.save_for_backward(out_gp)
-        # cost = torch.linalg.norm(out_cost, dim=-1)
        cost = torch.sum(out_cost, dim=-1)
-        # cost = out_cost
        return cost

    @staticmethod
@@ -619,20 +783,71 @@ class WarpBoundPosFunction(torch.autograd.Function):

        p_g = None
        if ctx.needs_input_grad[0]:
-            p_g = p_grad  # * grad_out_cost.unsqueeze(-1)
-        return p_g, None, None, None, None, None, None, None, None, None
+            p_g = p_grad
+            if ctx.return_loss:
+                p_g = p_grad * grad_out_cost.unsqueeze(-1)
+
+        return p_g, None, None, None, None, None, None, None, None, None, None


 # create a bound cost tensor:
-class WarpBoundPosLoss(WarpBoundPosFunction):
+class WarpBoundPosL2Function(torch.autograd.Function):
+    @staticmethod
+    def forward(
+        ctx,
+        pos,
+        retract_config,
+        retract_idx,
+        p_l,
+        weight,
+        activation_distance,
+        null_space_weight,
+        vec_weight,
+        out_cost,
+        out_gp,
+        warp_function,
+        return_loss=False,
+    ):
+        wp_device = wp.device_from_torch(pos.device)
+        b, h, dof = pos.shape
+        requires_grad = pos.requires_grad
+        wp.launch(
+            kernel=warp_function,
+            dim=b * h,
+            inputs=[
+                wp.from_torch(pos.detach().view(-1), dtype=wp.float32),
+                wp.from_torch(retract_config.detach().view(-1), dtype=wp.float32),
+                wp.from_torch(retract_idx.detach().view(-1), dtype=wp.int32),
+                wp.from_torch(p_l.view(-1), dtype=wp.float32),
+                wp.from_torch(weight, dtype=wp.float32),
+                wp.from_torch(activation_distance, dtype=wp.float32),
+                wp.from_torch(null_space_weight.view(-1), dtype=wp.float32),
+                wp.from_torch(vec_weight.view(-1), dtype=wp.float32),
+                wp.from_torch(out_cost.view(-1), dtype=wp.float32),
+                wp.from_torch(out_gp.view(-1), dtype=wp.float32),
+                requires_grad,
+                b,
+                h,
+                dof,
+            ],
+            device=wp_device,
+            stream=wp.stream_from_torch(pos.device),
+        )
+        ctx.return_loss = return_loss
+        ctx.save_for_backward(out_gp)
+        # cost = torch.sum(out_cost, dim=-1)
+        return out_cost
+
    @staticmethod
    def backward(ctx, grad_out_cost):
        (p_grad,) = ctx.saved_tensors

        p_g = None
        if ctx.needs_input_grad[0]:
-            p_g = p_grad * grad_out_cost.unsqueeze(-1)
-        return p_g, None, None, None, None, None, None, None, None, None
+            p_g = p_grad  # * grad_out_cost.unsqueeze(-1)
+            if ctx.return_loss:
+                p_g = p_grad * grad_out_cost.unsqueeze(-1)
+        return p_g, None, None, None, None, None, None, None, None, None, None, None


@wp.kernel
@@ -1122,3 +1337,314 @@ def forward_bound_smooth_warp(
        out_grad_v[b_addrs] = g_v
        out_grad_a[b_addrs] = g_a
        out_grad_j[b_addrs] = g_j
+
+
+@get_cache_fn_decorator()
+def make_bound_pos_smooth_kernel(dof_template: int):
+    def forward_bound_smooth_loop_warp(
+        pos: wp.array(dtype=wp.float32),
+        vel: wp.array(dtype=wp.float32),
+        acc: wp.array(dtype=wp.float32),
+        jerk: wp.array(dtype=wp.float32),
+        retract_config: wp.array(dtype=wp.float32),
+        retract_idx: wp.array(dtype=wp.int32),
+        p_b: wp.array(dtype=wp.float32),
+        v_b: wp.array(dtype=wp.float32),
+        a_b: wp.array(dtype=wp.float32),
+        j_b: wp.array(dtype=wp.float32),
+        weight: wp.array(dtype=wp.float32),
+        activation_distance: wp.array(dtype=wp.float32),
+        smooth_weight: wp.array(dtype=wp.float32),
+        cspace_weight: wp.array(dtype=wp.float32),
+        null_weight: wp.array(dtype=wp.float32),
+        vec_weight: wp.array(dtype=wp.float32),
+        run_weight_vel: wp.array(dtype=wp.float32),
+        run_weight_acc: wp.array(dtype=wp.float32),
+        run_weight_jerk: wp.array(dtype=wp.float32),
+        out_cost: wp.array(dtype=wp.float32),
+        out_grad_p: wp.array(dtype=wp.float32),
+        out_grad_v: wp.array(dtype=wp.float32),
+        out_grad_a: wp.array(dtype=wp.float32),
+        out_grad_j: wp.array(dtype=wp.float32),
+        write_grad: wp.uint8,  # this should be a bool
+        batch_size: wp.int32,
+        horizon: wp.int32,
+        dof: wp.int32,
+    ):
+        tid = wp.tid()
+        # initialize variables:
+        b_id = wp.int32(0)
+        h_id = wp.int32(0)
+        b_addrs = int(0)
+
+        w = wp.float32(0.0)
+        b_wv = float(0.0)
+        b_wa = float(0.0)
+        b_wj = float(0.0)
+
+        r_wa = wp.float32(0.0)
+        r_wj = wp.float32(0.0)
+
+        w_a = wp.float32(0.0)
+        w_j = wp.float32(0.0)
+
+        s_a = wp.float32(0.0)
+        s_j = wp.float32(0.0)
+
+        c_total = wp.float32(0.0)
+
+        # we launch batch * horizon * dof kernels
+        b_id = tid / (horizon)
+        h_id = tid - (b_id * horizon)
+        if b_id >= batch_size or h_id >= horizon:
+            return
+
+        n_w = wp.float32(0.0)
+        n_w = null_weight[0]
+
+        vec_weight_local = wp.vector(dtype=wp.float32, length=dof_template)
+        cspace_weight_local = wp.vector(dtype=wp.float32, length=dof_template)
+        p_l = wp.vector(dtype=wp.float32, length=dof_template)
+        p_u = wp.vector(dtype=wp.float32, length=dof_template)
+        v_l = wp.vector(dtype=wp.float32, length=dof_template)
+        v_u = wp.vector(dtype=wp.float32, length=dof_template)
+        a_l = wp.vector(dtype=wp.float32, length=dof_template)
+        a_u = wp.vector(dtype=wp.float32, length=dof_template)
+        j_l = wp.vector(dtype=wp.float32, length=dof_template)
+        j_u = wp.vector(dtype=wp.float32, length=dof_template)
+        g_p = wp.vector(dtype=wp.float32, length=dof_template)
+        g_v = wp.vector(dtype=wp.float32, length=dof_template)
+        g_a = wp.vector(dtype=wp.float32, length=dof_template)
+        g_j = wp.vector(dtype=wp.float32, length=dof_template)
+        target_id = wp.int32(0.0)
+        target_id = retract_idx[b_id]
+        current_p = wp.vector(dtype=wp.float32, length=dof_template)
+        current_v = wp.vector(dtype=wp.float32, length=dof_template)
+        current_a = wp.vector(dtype=wp.float32, length=dof_template)
+        current_j = wp.vector(dtype=wp.float32, length=dof_template)
+
+        target_p = wp.vector(dtype=wp.float32, length=dof_template)
+
+        b_addrs = b_id * horizon * dof + h_id * dof
+
+        for i in range(dof_template):
+            vec_weight_local[i] = vec_weight[i]
+            target_p[i] = retract_config[target_id * dof + i]
+            current_p[i] = pos[b_addrs + i]
+            current_v[i] = vel[b_addrs + i]
+            current_a[i] = acc[b_addrs + i]
+            current_j[i] = jerk[b_addrs + i]
+            p_l[i] = p_b[i]
+            p_u[i] = p_b[dof + i]
+            v_l[i] = v_b[i]
+            v_u[i] = v_b[dof + i]
+            a_l[i] = a_b[i]
+            a_u[i] = a_b[dof + i]
+            j_l[i] = j_b[i]
+            j_u[i] = j_b[dof + i]
+            cspace_weight_local[i] = cspace_weight[i]
+
+        # read weights:
+        w = weight[0]
+        b_wv = weight[1]
+        b_wa = weight[2]
+        b_wj = weight[3]
+        r_wa = run_weight_acc[h_id]
+        r_wj = run_weight_jerk[h_id]
+
+        w_a = smooth_weight[1]
+        w_j = smooth_weight[2]
+
+        # compute cost:
+
+        # read buffers:
+
+        # if w_j > 0.0:
+        eta_p = activation_distance[0]
+        eta_v = activation_distance[1]
+        eta_a = activation_distance[2]
+        eta_j = activation_distance[3]
+
+        p_range = p_u - p_l
+        p_l = p_l + (p_range) * eta_p
+        p_u = p_u - (p_range) * eta_p
+        v_l = v_l + (v_u - v_l) * eta_v
+        v_u = v_u - (v_u - v_l) * eta_v
+        a_l = a_l + (a_u - a_l) * eta_a
+        a_u = a_u - (a_u - a_l) * eta_a
+        j_l = j_l + (j_u - j_l) * eta_j
+        j_u = j_u - (j_u - j_l) * eta_j
+
+        # position:
+        if n_w > 0.0:
+            error = wp.cw_mul(vec_weight_local, current_p - target_p)
+            error_length = wp.length(error)
+            c_total = n_w * error_length
+            if error_length > 0.0:
+                g_p = n_w * error / error_length
+
+        # bound cost:
+        # bound cost:
+        bound_delta_p = wp.vector(dtype=wp.float32, length=dof_template)
+        bound_delta_v = wp.vector(dtype=wp.float32, length=dof_template)
+        bound_delta_a = wp.vector(dtype=wp.float32, length=dof_template)
+        bound_delta_j = wp.vector(dtype=wp.float32, length=dof_template)
+        for i in range(dof_template):
+            if current_p[i] < p_l[i]:
+                bound_delta_p[i] = current_p[i] - p_l[i]
+            elif current_p[i] > p_u[i]:
+                bound_delta_p[i] = current_p[i] - p_u[i]
+            if current_v[i] < v_l[i]:
+                bound_delta_v[i] = current_v[i] - v_l[i]
+            elif current_v[i] > v_u[i]:
+                bound_delta_v[i] = current_v[i] - v_u[i]
+            if current_a[i] < a_l[i]:
+                bound_delta_a[i] = current_a[i] - a_l[i]
+            elif current_a[i] > a_u[i]:
+                bound_delta_a[i] = current_a[i] - a_u[i]
+            if current_j[i] < j_l[i]:
+                bound_delta_j[i] = current_j[i] - j_l[i]
+            elif current_j[i] > j_u[i]:
+                bound_delta_j[i] = current_j[i] - j_u[i]
+        delta_p = wp.length(bound_delta_p)
+        if delta_p > 0.0:
+            g_p += w * bound_delta_p / delta_p
+            c_total += w * delta_p
+        delta_v = wp.length(bound_delta_v)
+        if delta_v > 0.0:
+            g_v = b_wv * bound_delta_v / delta_v
+            c_total += b_wv * delta_v
+        delta_a = wp.length(bound_delta_a)
+        if delta_a > 0.0:
+            g_a = b_wa * bound_delta_a / delta_a
+            c_total += b_wa * delta_a
+        delta_j = wp.length(bound_delta_j)
+        if delta_j > 0.0:
+            g_j = b_wj * bound_delta_j / delta_j
+            c_total += b_wj * delta_j
+
+        delta_acc = wp.cw_mul(cspace_weight_local, current_a)
+        # delta_acc = wp.cw_div(delta_acc, a_u - a_l)
+        acc_length = wp.length_sq(delta_acc)
+        s_a = w_a * r_wa * acc_length
+        if acc_length > 0.0:
+            g_a += 2.0 * w_a * r_wa * delta_acc  # / acc_length
+
+        delta_jerk = wp.cw_mul(cspace_weight_local, current_j)
+        # delta_jerk = wp.cw_div(delta_jerk, j_u - j_l)
+        jerk_length = wp.length_sq(delta_jerk)
+        s_j = w_j * r_wj * jerk_length
+        if jerk_length > 0.0:
+            g_j += 2.0 * w_j * r_wj * delta_jerk  # / jerk_length
+
+        c_total += s_a + s_j
+
+        out_cost[b_id * horizon + h_id] = c_total
+
+        # compute gradient
+        if write_grad == 1:
+            b_addrs = b_id * horizon * dof + h_id * dof
+            for i in range(dof_template):
+                out_grad_p[b_addrs + i] = g_p[i]
+                out_grad_v[b_addrs + i] = g_v[i]
+                out_grad_a[b_addrs + i] = g_a[i]
+                out_grad_j[b_addrs + i] = g_j[i]
+
+    return wp.Kernel(forward_bound_smooth_loop_warp)
+
+
+@get_cache_fn_decorator()
+def make_bound_pos_kernel(dof_template: int):
+
+    def forward_bound_pos_loop_warp(
+        pos: wp.array(dtype=wp.float32),
+        retract_config: wp.array(dtype=wp.float32),
+        retract_idx: wp.array(dtype=wp.int32),
+        p_b: wp.array(dtype=wp.float32),
+        weight: wp.array(dtype=wp.float32),
+        activation_distance: wp.array(dtype=wp.float32),
+        null_weight: wp.array(dtype=wp.float32),
+        vec_weight: wp.array(dtype=wp.float32),
+        out_cost: wp.array(dtype=wp.float32),
+        out_grad_p: wp.array(dtype=wp.float32),
+        write_grad: wp.uint8,  # this should be a bool
+        batch_size: wp.int32,
+        horizon: wp.int32,
+        dof: wp.int32,
+    ):
+        tid = wp.tid()
+        # initialize variables:
+        b_id = wp.int32(0)
+        h_id = wp.int32(0)
+        b_addrs = int(0)
+
+        w = wp.float32(0.0)
+        c_total = wp.float32(0.0)
+
+        # we launch batch * horizon * dof kernels
+        b_id = tid / (horizon)
+        h_id = tid - (b_id * horizon)
+        if b_id >= batch_size or h_id >= horizon:
+            return
+
+        # read weights:
+        eta_p = activation_distance[0]
+        w = weight[0]
+
+        n_w = wp.float32(0.0)
+        n_w = null_weight[0]
+        target_id = wp.int32(0.0)
+        vec_weight_local = wp.vector(dtype=wp.float32, length=dof_template)
+        p_l = wp.vector(dtype=wp.float32, length=dof_template)
+        p_u = wp.vector(dtype=wp.float32, length=dof_template)
+        bound_delta = wp.vector(dtype=wp.float32, length=dof_template)
+        target_p = wp.vector(dtype=wp.float32, length=dof_template)
+        current_p = wp.vector(dtype=wp.float32, length=dof_template)
+        g_p = wp.vector(dtype=wp.float32, length=dof_template)
+
+        target_id = retract_idx[b_id]
+        b_addrs = b_id * horizon * dof + h_id * dof
+
+        for i in range(dof_template):
+            vec_weight_local[i] = vec_weight[i]
+            target_p[i] = retract_config[target_id * dof + i]
+            current_p[i] = pos[b_addrs + i]
+            p_l[i] = p_b[i]
+            p_u[i] = p_b[dof + i]
+
+        p_range = p_u - p_l
+        eta_percent = eta_p * (p_range)
+        p_l += eta_percent
+        p_u -= eta_percent
+
+        # compute retract cost:
+
+        if n_w > 0.0:
+            error = wp.cw_mul(vec_weight_local, current_p - target_p)
+            error_length = wp.length(error)
+            c_total = n_w * error_length
+            if error_length > 0.0:
+                g_p = n_w * error / error_length
+
+        # bound cost:
+
+        for i in range(dof_template):
+            if current_p[i] < p_l[i]:
+                bound_delta[i] = current_p[i] - p_l[i]
+            elif current_p[i] > p_u[i]:
+                bound_delta[i] = current_p[i] - p_u[i]
+
+        delta = wp.length(bound_delta)
+        if delta > 0.0:
+            g_p += w * bound_delta / delta
+            c_total += w * delta
+
+        out_cost[b_id * horizon + h_id] = c_total
+
+        # compute gradient
+        if write_grad == 1:
+            b_addrs = b_id * horizon * dof + h_id * dof
+            for i in range(dof_template):
+                out_grad_p[b_addrs + i] = g_p[i]
+
+    return wp.Kernel(forward_bound_pos_loop_warp)
--- a/src/curobo/rollout/cost/dist_cost.py
+++ b/src/curobo/rollout/cost/dist_cost.py
@@ -18,7 +18,8 @@ import torch
 import warp as wp

 # CuRobo
-from curobo.util.torch_utils import get_torch_jit_decorator
+from curobo.util.logger import log_error
+from curobo.util.torch_utils import get_cache_fn_decorator, get_torch_jit_decorator
 from curobo.util.warp import init_warp

 # Local Folder
@@ -37,6 +38,7 @@ class DistType(Enum):
 class DistCostConfig(CostConfig):
    dist_type: DistType = DistType.L2
    use_null_space: bool = False
+    use_l2_kernel: bool = True

    def __post_init__(self):
        return super().__post_init__()
@@ -142,6 +144,89 @@ def forward_l2_warp(
        out_grad_p[b_addrs] = g_p


+@get_cache_fn_decorator()
+def make_l2_kernel(dof_template: int):
+    def forward_l2_loop_warp(
+        pos: wp.array(dtype=wp.float32),
+        target: wp.array(dtype=wp.float32),
+        target_idx: wp.array(dtype=wp.int32),
+        weight: wp.array(dtype=wp.float32),
+        run_weight: wp.array(dtype=wp.float32),
+        vec_weight: wp.array(dtype=wp.float32),
+        out_cost: wp.array(dtype=wp.float32),
+        out_grad_p: wp.array(dtype=wp.float32),
+        write_grad: wp.uint8,  # this should be a bool
+        batch_size: wp.int32,
+        horizon: wp.int32,
+        dof: wp.int32,
+    ):
+        tid = wp.tid()
+        # initialize variables:
+        b_id = wp.int32(0)
+        h_id = wp.int32(0)
+        b_addrs = wp.int32(0)
+        target_id = wp.int32(0)
+        w = wp.float32(0.0)
+        r_w = wp.float32(0.0)
+        c_total = wp.float32(0.0)
+
+        # we launch batch * horizon * dof kernels
+        b_id = tid / (horizon)
+        h_id = tid - (b_id * horizon)
+
+        if b_id >= batch_size or h_id >= horizon:
+            return
+
+        # read weights:
+        w = weight[0]
+        r_w = run_weight[h_id]
+        w = r_w * w
+
+        if w == 0.0:
+            return
+        # compute cost:
+        b_addrs = b_id * horizon * dof + h_id * dof
+
+        # read buffers:
+        current_position = wp.vector(dtype=wp.float32, length=dof_template)
+        target_position = wp.vector(dtype=wp.float32, length=dof_template)
+        vec_weight_local = wp.vector(dtype=wp.float32, length=dof_template)
+        target_id = target_idx[b_id]
+        target_id = target_id * dof
+
+        for i in range(dof_template):
+            current_position[i] = pos[b_addrs + i]
+            target_position[i] = target[target_id + i]
+            vec_weight_local[i] = vec_weight[i]
+
+        error = wp.cw_mul(vec_weight_local, (current_position - target_position))
+
+        c_length = wp.length(error)
+        if w > 100.0:
+            p_w_alpha = 70.0
+            c_total = w * wp.log2(wp.cosh(p_w_alpha * c_length))
+            g_p = error * (
+                w
+                * p_w_alpha
+                * wp.sinh(p_w_alpha * c_length)
+                / (c_length * wp.cosh(p_w_alpha * c_length))
+            )
+        else:
+            g_p = w * error
+            if c_length > 0.0:
+                g_p = g_p / c_length
+            c_total = w * c_length
+
+        out_cost[b_id * horizon + h_id] = c_total
+
+        # compute gradient
+        if write_grad == 1:
+            for i in range(dof_template):
+                out_grad_p[b_addrs + i] = g_p[i]
+
+    return wp.Kernel(forward_l2_loop_warp)
+
+
 # create a bound cost tensor:
 class L2DistFunction(torch.autograd.Function):
    @staticmethod
@@ -195,6 +280,58 @@ class L2DistFunction(torch.autograd.Function):
        return p_g, None, None, None, None, None, None, None, None


+# create a bound cost tensor:
+class L2DistLoopFunction(torch.autograd.Function):
+    @staticmethod
+    def forward(
+        ctx,
+        pos,
+        target,
+        target_idx,
+        weight,
+        run_weight,
+        vec_weight,
+        out_cost,
+        out_cost_v,
+        out_gp,
+        l2_dof_kernel,
+    ):
+        wp_device = wp.device_from_torch(pos.device)
+        b, h, dof = pos.shape
+        wp.launch(
+            kernel=l2_dof_kernel,
+            dim=b * h,
+            inputs=[
+                wp.from_torch(pos.detach().view(-1).contiguous(), dtype=wp.float32),
+                wp.from_torch(target.view(-1), dtype=wp.float32),
+                wp.from_torch(target_idx.view(-1), dtype=wp.int32),
+                wp.from_torch(weight, dtype=wp.float32),
+                wp.from_torch(run_weight.view(-1), dtype=wp.float32),
+                wp.from_torch(vec_weight.view(-1), dtype=wp.float32),
+                wp.from_torch(out_cost.view(-1), dtype=wp.float32),
+                wp.from_torch(out_gp.view(-1), dtype=wp.float32),
+                pos.requires_grad,
+                b,
+                h,
+                dof,
+            ],
+            device=wp_device,
+            stream=wp.stream_from_torch(pos.device),
+        )
+
+        ctx.save_for_backward(out_gp)
+        return out_cost
+
+    @staticmethod
+    def backward(ctx, grad_out_cost):
+        (p_grad,) = ctx.saved_tensors
+
+        p_g = None
+        if ctx.needs_input_grad[0]:
+            p_g = p_grad
+        return p_g, None, None, None, None, None, None, None, None, None
+
+
 class DistCost(CostBase, DistCostConfig):
    def __init__(self, config: Optional[DistCostConfig] = None):
        if config is not None:
@@ -202,6 +339,8 @@ class DistCost(CostBase, DistCostConfig):
        CostBase.__init__(self)
        self._init_post_config()
        init_warp()
+        if self.use_l2_kernel:
+            self._l2_dof_kernel = make_l2_kernel(self.dof)

    def _init_post_config(self):
        if self.vec_weight is not None:
@@ -210,13 +349,21 @@ class DistCost(CostBase, DistCostConfig):
                self.vec_weight = self.vec_weight * 0.0 + 1.0

    def update_batch_size(self, batch, horizon, dof):
+        if dof != self.dof:
+            log_error("dof cannot be changed after initializing DistCost")
        if self._batch_size != batch or self._horizon != horizon or self._dof != dof:
-            self._out_cv_buffer = torch.zeros(
-                (batch, horizon, dof), device=self.tensor_args.device, dtype=self.tensor_args.dtype
-            )
-            self._out_c_buffer = torch.zeros(
-                (batch, horizon), device=self.tensor_args.device, dtype=self.tensor_args.dtype
-            )
+            self._out_c_buffer = None
+            self._out_cv_buffer = None
+            if self.use_l2_kernel:
+                self._out_c_buffer = torch.zeros(
+                    (batch, horizon), device=self.tensor_args.device, dtype=self.tensor_args.dtype
+                )
+            else:
+                self._out_cv_buffer = torch.zeros(
+                    (batch, horizon, dof),
+                    device=self.tensor_args.device,
+                    dtype=self.tensor_args.dtype,
+                )

            self._out_g_buffer = torch.zeros(
                (batch, horizon, dof), device=self.tensor_args.device, dtype=self.tensor_args.dtype
@@ -293,24 +440,59 @@ class DistCost(CostBase, DistCostConfig):
        else:
            raise NotImplementedError("terminal flag needs to be set to true")
        if self.dist_type == DistType.L2:
-            # print(goal_idx.shape, goal_vec.shape)
-            cost = L2DistFunction.apply(
-                current_vec,
-                goal_vec,
-                goal_idx,
-                self.weight,
-                self._run_weight_vec,
-                self.vec_weight,
-                self._out_c_buffer,
-                self._out_cv_buffer,
-                self._out_g_buffer,
-            )
+            if self.use_l2_kernel:
+
+                cost = L2DistLoopFunction.apply(
+                    current_vec,
+                    goal_vec,
+                    goal_idx,
+                    self.weight,
+                    self._run_weight_vec,
+                    self.vec_weight,
+                    self._out_c_buffer,
+                    None,
+                    self._out_g_buffer,
+                    self._l2_dof_kernel,
+                )
+            else:
+                cost = L2DistFunction.apply(
+                    current_vec,
+                    goal_vec,
+                    goal_idx,
+                    self.weight,
+                    self._run_weight_vec,
+                    self.vec_weight,
+                    None,
+                    self._out_cv_buffer,
+                    self._out_g_buffer,
+                )

        else:
            raise NotImplementedError()
        if RETURN_GOAL_DIST:
-            dist_scale = torch.nan_to_num(
-                1.0 / torch.sqrt((self.weight * self._run_weight_vec)), 0.0
-            )
-            return cost, cost * dist_scale
+
+            if self.use_l2_kernel:
+                distance = weight_cost_to_l2_jit(cost, self.weight, self._run_weight_vec)
+            else:
+                distance = squared_cost_to_l2_jit(cost, self.weight, self._run_weight_vec)
+            return cost, distance
+
        return cost
+
+
+@get_torch_jit_decorator()
+def squared_cost_to_l2_jit(cost, weight, run_weight_vec):
+    weight_inv = weight * run_weight_vec
+    weight_inv = 1.0 / weight_inv
+    weight_inv = torch.nan_to_num(weight_inv, 0.0)
+    distance = torch.sqrt(cost * weight_inv)
+    return distance
+
+
+@get_torch_jit_decorator()
+def weight_cost_to_l2_jit(cost, weight, run_weight_vec):
+    weight_inv = weight * run_weight_vec
+    weight_inv = 1.0 / weight_inv
+    weight_inv = torch.nan_to_num(weight_inv, 0.0)
+    distance = cost * weight_inv
+    return distance
--- a/src/curobo/rollout/dynamics_model/tensor_step.py
+++ b/src/curobo/rollout/dynamics_model/tensor_step.py
@@ -522,27 +522,20 @@ class TensorStepPositionCliqueKernel(TensorStepBase):
                    self._u_grad,
                )
            if self._filter_velocity:
+                out_state_seq.aux_data["raw_velocity"] = out_state_seq.velocity
                out_state_seq.velocity = self.filter_signal(out_state_seq.velocity)

            if self._filter_acceleration:
+                out_state_seq.aux_data["raw_acceleration"] = out_state_seq.acceleration
                out_state_seq.acceleration = self.filter_signal(out_state_seq.acceleration)

            if self._filter_jerk:
+                out_state_seq.aux_data["raw_jerk"] = out_state_seq.jerk
                out_state_seq.jerk = self.filter_signal(out_state_seq.jerk)
        return out_state_seq

    def filter_signal(self, signal: torch.Tensor):
        return filter_signal_jit(signal, self._sma_kernel)
-        b, h, dof = signal.shape
-        new_signal = (
-            self._sma(
-                signal.transpose(-1, -2).reshape(b * dof, 1, h), self._sma_kernel, padding="same"
-            )
-            .view(b, dof, h)
-            .transpose(-1, -2)
-            .contiguous()
-        )
-        return new_signal


@get_torch_jit_decorator(force_jit=True)
--- a/src/curobo/types/robot.py
+++ b/src/curobo/types/robot.py
@@ -58,7 +58,6 @@ class RobotConfig:
        cuda_robot_model_config = CudaRobotModelConfig.from_basic_urdf(
            urdf_path, base_link, ee_link, tensor_args
        )
-        # load other params:

        return RobotConfig(
            cuda_robot_model_config,
--- a/src/curobo/types/state.py
+++ b/src/curobo/types/state.py
@@ -11,7 +11,7 @@
 from __future__ import annotations

 # Standard Library
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from typing import List, Optional, Sequence, Tuple, Union

 # Third Party
@@ -73,6 +73,7 @@ class JointState(State):
    joint_names: Optional[List[str]] = None
    jerk: Union[List[float], T_DOF, None] = None  # Optional
    tensor_args: TensorDeviceType = TensorDeviceType()
+    aux_data: dict = field(default_factory=lambda: {})

    def __post_init__(self):
        if isinstance(self.position, torch.Tensor):
--- a/src/curobo/util/torch_utils.py
+++ b/src/curobo/util/torch_utils.py
@@ -10,6 +10,8 @@
 #
 # Standard Library
 import os
+from functools import lru_cache
+from typing import Optional

 # Third Party
 import torch
@@ -148,5 +150,20 @@ def get_torch_jit_decorator(
        return empty_decorator


+def is_lru_cache_avaiable():
+    use_lru_cache = os.environ.get("CUROBO_USE_LRU_CACHE")
+    if use_lru_cache is not None:
+        return bool(int(use_lru_cache))
+    log_info("Environment variable for CUROBO_USE_LRU_CACHE is not set, Enabling as default.")
+    return False
+
+
+def get_cache_fn_decorator(maxsize: Optional[int] = None):
+    if is_lru_cache_avaiable():
+        return lru_cache(maxsize=maxsize)
+    else:
+        return empty_decorator
+
+
 def empty_decorator(function):
    return function
--- a/src/curobo/util/trajectory.py
+++ b/src/curobo/util/trajectory.py
@@ -141,10 +141,19 @@ def get_batch_interpolated_trajectory(
    # given the dt required to run trajectory at maximum velocity,
    # we find the number of timesteps required:
    if optimize_dt:
+        traj_vel = raw_traj.velocity
+        traj_acc = raw_traj.acceleration
+        traj_jerk = raw_traj.jerk
+        if "raw_velocity" in raw_traj.aux_data:
+            traj_vel = raw_traj.aux_data["raw_velocity"]
+        if "raw_acceleration" in raw_traj.aux_data:
+            traj_acc = raw_traj.aux_data["raw_acceleration"]
+        if "raw_jerk" in raw_traj.aux_data:
+            traj_jerk = raw_traj.aux_data["raw_jerk"]
        traj_steps, steps_max, opt_dt = calculate_tsteps(
-            raw_traj.velocity,
-            raw_traj.acceleration,
-            raw_traj.jerk,
+            traj_vel,
+            traj_acc,
+            traj_jerk,
            interpolation_dt,
            max_vel,
            max_acc,
@@ -464,7 +473,7 @@ def calculate_dt_fixed(
    raw_dt: torch.Tensor,
    min_dt: float,
    max_dt: float,
-    epsilon: float = 1e-6,
+    epsilon: float = 1e-4,
 ):
    # compute scaled dt:
    max_v_arr = torch.max(torch.abs(vel), dim=-2)[0]  # output is batch, dof
@@ -497,7 +506,7 @@ def calculate_dt(
    max_jerk: torch.Tensor,
    raw_dt: float,
    min_dt: float,
-    epsilon: float = 1e-6,
+    epsilon: float = 1e-4,
 ):
    # compute scaled dt:
    max_v_arr = torch.max(torch.abs(vel), dim=-2)[0]  # output is batch, dof
@@ -529,7 +538,7 @@ def calculate_dt_no_clamp(
    max_vel: torch.Tensor,
    max_acc: torch.Tensor,
    max_jerk: torch.Tensor,
-    epsilon: float = 1e-6,
+    epsilon: float = 1e-4,
 ):
    # compute scaled dt:
    max_v_arr = torch.max(torch.abs(vel), dim=-2)[0]  # output is batch, dof
--- a/src/curobo/wrap/reacher/motion_gen.py
+++ b/src/curobo/wrap/reacher/motion_gen.py
@@ -121,6 +121,9 @@ class MotionGenConfig:
    #: instance of trajectory optimization solver to use for reaching joint space targets.
    js_trajopt_solver: TrajOptSolver

+    #: instance of trajectory optimization solver for final fine tuning for joint space targets.
+    finetune_js_trajopt_solver: TrajOptSolver
+
    #: instance of trajectory optimization solver for final fine tuning.
    finetune_trajopt_solver: TrajOptSolver

@@ -760,6 +763,7 @@ class MotionGenConfig:
            minimize_jerk=minimize_jerk,
            filter_robot_command=filter_robot_command,
            optimize_dt=optimize_dt,
+            num_seeds=num_trajopt_noisy_seeds,
        )
        js_trajopt_solver = TrajOptSolver(js_trajopt_cfg)

@@ -805,6 +809,48 @@ class MotionGenConfig:
        )

        finetune_trajopt_solver = TrajOptSolver(finetune_trajopt_cfg)
+
+        finetune_js_trajopt_cfg = TrajOptSolverConfig.load_from_robot_config(
+            robot_cfg=robot_cfg,
+            world_model=world_model,
+            tensor_args=tensor_args,
+            position_threshold=position_threshold,
+            rotation_threshold=rotation_threshold,
+            world_coll_checker=world_coll_checker,
+            base_cfg_file=base_config_data,
+            particle_file=particle_trajopt_file,
+            gradient_file=finetune_trajopt_file,
+            traj_tsteps=js_trajopt_tsteps,
+            interpolation_type=interpolation_type,
+            interpolation_steps=interpolation_steps,
+            use_cuda_graph=use_cuda_graph,
+            self_collision_check=self_collision_check,
+            self_collision_opt=self_collision_opt,
+            grad_trajopt_iters=grad_trajopt_iters,
+            interpolation_dt=interpolation_dt,
+            use_particle_opt=False,
+            traj_evaluator_config=traj_evaluator_config,
+            traj_evaluator=traj_evaluator,
+            use_gradient_descent=use_gradient_descent,
+            use_es=use_es_trajopt,
+            es_learning_rate=es_trajopt_learning_rate,
+            use_fixed_samples=use_trajopt_fixed_samples,
+            evaluate_interpolated_trajectory=evaluate_interpolated_trajectory,
+            fixed_iters=fixed_iters_trajopt,
+            store_debug=store_trajopt_debug,
+            collision_activation_distance=collision_activation_distance,
+            trajopt_dt=js_trajopt_dt,
+            store_debug_in_result=store_debug_in_result,
+            smooth_weight=smooth_weight,
+            cspace_threshold=cspace_threshold,
+            state_finite_difference_mode=state_finite_difference_mode,
+            minimize_jerk=minimize_jerk,
+            filter_robot_command=filter_robot_command,
+            optimize_dt=optimize_dt,
+            num_seeds=num_trajopt_noisy_seeds,
+        )
+        finetune_js_trajopt_solver = TrajOptSolver(finetune_js_trajopt_cfg)
+
        if graph_trajopt_iters is not None:
            graph_trajopt_iters = math.ceil(
                graph_trajopt_iters / finetune_trajopt_solver.solver.newton_optimizer.inner_iters
@@ -823,6 +869,7 @@ class MotionGenConfig:
            graph_planner,
            trajopt_solver=trajopt_solver,
            js_trajopt_solver=js_trajopt_solver,
+            finetune_js_trajopt_solver=finetune_js_trajopt_solver,
            finetune_trajopt_solver=finetune_trajopt_solver,
            interpolation_type=interpolation_type,
            interpolation_steps=interpolation_steps,
@@ -951,6 +998,9 @@ class MotionGenPlanConfig:
    #: check for joint limits, self-collision, and collision with the world.
    check_start_validity: bool = True

+    #: Finetune dt scale for joint space planning.
+    finetune_js_dt_scale: Optional[float] = 1.1
+
    def __post_init__(self):
        """Post initialization checks."""
        if not self.enable_opt and not self.enable_graph:
@@ -983,6 +1033,7 @@ class MotionGenPlanConfig:
            finetune_dt_scale=self.finetune_dt_scale,
            finetune_attempts=self.finetune_attempts,
            time_dilation_factor=self.time_dilation_factor,
+            finetune_js_dt_scale=self.finetune_js_dt_scale,
        )


@@ -1760,6 +1811,7 @@ class MotionGen(MotionGenConfig):
        self.ik_solver.reset_seed()
        self.graph_planner.reset_seed()
        self.trajopt_solver.reset_seed()
+        self.js_trajopt_solver.reset_seed()

    def get_retract_config(self) -> T_DOF:
        """Returns the retract/home configuration of the robot."""
@@ -1803,7 +1855,12 @@ class MotionGen(MotionGenConfig):
            goal_state = start_state.clone()
            goal_state.position[..., warmup_joint_index] += warmup_joint_delta
            for _ in range(3):
-                self.plan_single_js(start_state, goal_state, MotionGenPlanConfig(max_attempts=1))
+                self.plan_single_js(
+                    start_state.clone(),
+                    goal_state.clone(),
+                    MotionGenPlanConfig(max_attempts=1, enable_finetune_trajopt=True),
+                )
+
        if enable_graph:
            start_state = JointState.from_position(
                self.rollout_fn.dynamics_model.retract_config.view(1, -1).clone(),
@@ -1983,7 +2040,7 @@ class MotionGen(MotionGenConfig):
            "finetune_time": 0,
        }
        result = None
-        goal = Goal(goal_state=goal_state, current_state=start_state)
+        # goal = Goal(goal_state=goal_state, current_state=start_state)
        solve_state = ReacherSolveState(
            ReacherSolveType.SINGLE,
            num_ik_seeds=1,
@@ -2009,7 +2066,7 @@ class MotionGen(MotionGenConfig):
            result = self._plan_js_from_solve_state(
                solve_state, start_state, goal_state, plan_config=plan_config
            )
-            time_dict["trajopt_time"] += result.solve_time
+            time_dict["trajopt_time"] += result.trajopt_time
            time_dict["graph_time"] += result.graph_time
            time_dict["finetune_time"] += result.finetune_time
            time_dict["trajopt_attempts"] = n
@@ -2160,6 +2217,7 @@ class MotionGen(MotionGenConfig):
                + self.trajopt_solver.get_all_rollout_instances()
                + self.finetune_trajopt_solver.get_all_rollout_instances()
                + self.js_trajopt_solver.get_all_rollout_instances()
+                + self.finetune_js_trajopt_solver.get_all_rollout_instances()
            )
        return self._rollout_list

@@ -2171,6 +2229,7 @@ class MotionGen(MotionGenConfig):
                + self.trajopt_solver.solver.get_all_rollout_instances()
                + self.finetune_trajopt_solver.solver.get_all_rollout_instances()
                + self.js_trajopt_solver.solver.get_all_rollout_instances()
+                + self.finetune_js_trajopt_solver.solver.get_all_rollout_instances()
            )
        return self._solver_rollout_list

@@ -2529,6 +2588,11 @@ class MotionGen(MotionGenConfig):
        """Check if the pose cost metric is projected to goal frame."""
        return self.trajopt_solver.rollout_fn.goal_cost.project_distance

+    @property
+    def joint_names(self) -> List[str]:
+        """Get the joint names of the robot."""
+        return self.rollout_fn.joint_names
+
    def update_interpolation_type(
        self,
        interpolation_type: InterpolateType,
@@ -2548,6 +2612,7 @@ class MotionGen(MotionGenConfig):
            self.trajopt_solver.interpolation_type = interpolation_type
            self.finetune_trajopt_solver.interpolation_type = interpolation_type
            self.js_trajopt_solver.interpolation_type = interpolation_type
+            self.finetune_js_trajopt_solver.interpolation_type = interpolation_type

    def update_locked_joints(
        self, lock_joints: Dict[str, float], robot_config_dict: Union[str, Dict[Any]]
@@ -3375,8 +3440,9 @@ class MotionGen(MotionGenConfig):
                        seed_override = solve_state.num_trajopt_seeds * self.noisy_trajopt_seeds

                    finetune_time = 0
+                    newton_iters = None
+
                    for k in range(plan_config.finetune_attempts):
-                        newton_iters = None

                        scaled_dt = torch.clamp(
                            opt_dt
@@ -3396,7 +3462,7 @@ class MotionGen(MotionGenConfig):
                            newton_iters=newton_iters,
                        )
                        finetune_time += traj_result.solve_time
-                        if torch.count_nonzero(traj_result.success) > 0:
+                        if torch.count_nonzero(traj_result.success) > 0 or not self.optimize_dt:
                            break
                        seed_traj = traj_result.optimized_seeds.detach().clone()
                        newton_iters = 4
@@ -3592,7 +3658,7 @@ class MotionGen(MotionGenConfig):
                    solve_state,
                    trajopt_seed_traj,
                    num_seeds_override=solve_state.num_trajopt_seeds,
-                    newton_iters=trajopt_newton_iters + 2,
+                    newton_iters=trajopt_newton_iters,
                    return_all_solutions=plan_config.enable_finetune_trajopt,
                    trajopt_instance=self.js_trajopt_solver,
                )
@@ -3605,29 +3671,42 @@ class MotionGen(MotionGenConfig):
            # run finetune
            if plan_config.enable_finetune_trajopt and torch.count_nonzero(traj_result.success) > 0:
                with profiler.record_function("motion_gen/finetune_trajopt"):
-                    seed_traj = traj_result.raw_action.clone()  # solution.position.clone()
-                    seed_traj = seed_traj.contiguous()
+                    seed_traj = traj_result.raw_action.clone()
                    og_solve_time = traj_result.solve_time
+                    opt_dt = traj_result.optimized_dt
+                    opt_dt = torch.min(opt_dt[traj_result.success])
+                    finetune_time = 0
+                    newton_iters = None
+                    for k in range(plan_config.finetune_attempts):
+                        scaled_dt = torch.clamp(
+                            opt_dt
+                            * plan_config.finetune_js_dt_scale
+                            * (plan_config.finetune_dt_decay ** (k)),
+                            self.js_trajopt_solver.minimum_trajectory_dt,
+                        )

-                    scaled_dt = torch.clamp(
-                        torch.max(traj_result.optimized_dt[traj_result.success]),
-                        self.trajopt_solver.minimum_trajectory_dt,
-                    )
-                    og_dt = self.js_trajopt_solver.solver_dt.clone()
-                    self.js_trajopt_solver.update_solver_dt(scaled_dt.item())
-                    traj_result = self._solve_trajopt_from_solve_state(
-                        goal,
-                        solve_state,
-                        seed_traj,
-                        trajopt_instance=self.js_trajopt_solver,
-                        num_seeds_override=solve_state.num_trajopt_seeds,
-                        newton_iters=trajopt_newton_iters + 4,
-                    )
-                    self.js_trajopt_solver.update_solver_dt(og_dt)
+                        if self.optimize_dt:
+                            self.finetune_js_trajopt_solver.update_solver_dt(scaled_dt.item())
+                        traj_result = self._solve_trajopt_from_solve_state(
+                            goal,
+                            solve_state,
+                            seed_traj,
+                            trajopt_instance=self.finetune_js_trajopt_solver,
+                            num_seeds_override=solve_state.num_trajopt_seeds,
+                            newton_iters=newton_iters,
+                            return_all_solutions=False,
+                        )

-                result.finetune_time = traj_result.solve_time
+                        finetune_time += traj_result.solve_time
+                        if torch.count_nonzero(traj_result.success) > 0 or not self.optimize_dt:
+                            break

-                traj_result.solve_time = og_solve_time
+                        seed_traj = traj_result.optimized_seeds.detach().clone()
+                        newton_iters = 4
+
+                    result.finetune_time = finetune_time
+
+                    traj_result.solve_time = og_solve_time
                if self.store_debug_in_result:
                    result.debug_info["finetune_trajopt_result"] = traj_result
                if torch.count_nonzero(traj_result.success) == 0:
@@ -3638,7 +3717,6 @@ class MotionGen(MotionGenConfig):
            result.trajopt_time = traj_result.solve_time
            result.trajopt_attempts = 1
            result.success = traj_result.success
-
            result.interpolated_plan = traj_result.interpolated_solution.trim_trajectory(
                0, traj_result.path_buffer_last_tstep[0]
            )
--- a/src/curobo/wrap/reacher/trajopt.py
+++ b/src/curobo/wrap/reacher/trajopt.py
@@ -1343,14 +1343,26 @@ class TrajOptSolver(TrajOptSolverConfig):
                goal=goal,
                solution=result.action.scale_by_dt(self.solver_dt_tensor, opt_dt.view(-1, 1, 1)),
                seed=seed_traj,
-                position_error=result.metrics.position_error,
-                rotation_error=result.metrics.rotation_error,
+                position_error=(
+                    result.metrics.position_error[..., -1]
+                    if result.metrics.position_error is not None
+                    else None
+                ),
+                rotation_error=(
+                    result.metrics.rotation_error[..., -1]
+                    if result.metrics.rotation_error is not None
+                    else None
+                ),
                solve_time=result.solve_time,
                metrics=result.metrics,  # TODO: index this also
                interpolated_solution=interpolated_trajs,
                debug_info={"solver": result.debug, "interpolation_time": interpolation_time},
                path_buffer_last_tstep=last_tstep,
-                cspace_error=result.metrics.cspace_error,
+                cspace_error=(
+                    result.metrics.cspace_error[..., -1]
+                    if result.metrics.cspace_error is not None
+                    else None
+                ),
                optimized_dt=opt_dt,
                raw_solution=result.action,
                raw_action=result.raw_action,
@@ -1694,6 +1706,7 @@ class TrajOptSolver(TrajOptSolverConfig):
                (b, self.interpolation_steps, dof), self.tensor_args
            )
            self._interpolated_traj_buffer.joint_names = self.rollout_fn.joint_names
+
        state, last_tstep, opt_dt = get_batch_interpolated_trajectory(
            traj_state,
            self.solver_dt_tensor,