diff --git a/CHANGELOG.md b/CHANGELOG.md
index 36b0b08..b960ac5 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -10,10 +10,19 @@ its affiliates is strictly prohibited.
 -->
 # Changelog
 
-## Latest Commit
+## Version 0.7.3
 
 ### New Features
 - Add start state checks for world collision, self-collision, and joint limits.
+- Add finetune with dt scaling for `motion_gen.plan_single_js` to get more time optimal
+trajectories in joint space planning.
+- Improve joint space planning convergence, now succeeds in more planning problems with higher
+accuracy.
+
+### Changes in default behavior
+- Some warp kernels are now compiled based on runtime parameters (dof), causing a slowdown in load
+time for motion_gen. To avoid this slowdown, add an environment variable `CUROBO_USE_LRU_CACHE=1`
+which will cache the runtime generated kernels.
 
 ### BugFixes & Misc.
 - Fix bug in evaluator to account for dof maximum acceleration and jerk.
@@ -23,6 +32,17 @@ its affiliates is strictly prohibited.
 - Add `g_dim` check for `int` in batched planning.
 - Add `link_poses` for motion_gen.warmup() in batch planning mode.
 - Add `link_poses` as input to `batch_goalset`.
+- Add finetune js trajopt solver.
+- Pass raw velocity, acceleration, and jerk values to dt computation function to prevent
+interpolation errors from causing out of joint limit failures
+- Add `finetune_js_dt_scale` with a default value > 1.0 as joint space trajectories are
+time optimal in sparse obstacle environments.
+- Add note on deterministic behavior to website. Use lbfgs history < 12 for deterministic
+optimization results.
+- Add warning when adding a mesh with the same name as in existing cache.
+- Remove warmup for batch motion gen reacher isaac sim example.
+- Fix python examples in getting started webpage.
+- Refactor warp mesh query kernels to use a `wp.func` for signed distance queries.
 
 ## Version 0.7.2
 
diff --git a/benchmark/ik_benchmark.py b/benchmark/ik_benchmark.py
index 6aaf40c..a649c1d 100644
--- a/benchmark/ik_benchmark.py
+++ b/benchmark/ik_benchmark.py
@@ -37,6 +37,9 @@ from curobo.util_file import (
 )
 from curobo.wrap.reacher.ik_solver import IKSolver, IKSolverConfig
 
+# set seeds
+torch.manual_seed(2)
+
 torch.backends.cudnn.benchmark = True
 torch.backends.cuda.matmul.allow_tf32 = True
 torch.backends.cudnn.allow_tf32 = True
@@ -49,6 +52,7 @@ def run_full_config_collision_free_ik(
     use_cuda_graph=False,
     collision_free=True,
     high_precision=False,
+    num_seeds=12,
 ):
     tensor_args = TensorDeviceType()
     robot_data = load_yaml(join_path(get_robot_configs_path(), robot_file))["robot_cfg"]
@@ -67,7 +71,7 @@ def run_full_config_collision_free_ik(
         robot_cfg,
         world_cfg,
         position_threshold=position_threshold,
-        num_seeds=16,
+        num_seeds=num_seeds,
         self_collision_check=collision_free,
         self_collision_opt=collision_free,
         tensor_args=tensor_args,
@@ -120,10 +124,15 @@ if __name__ == "__main__":
         default="ik",
         help="File name prefix to use to save benchmark results",
     )
-
+    parser.add_argument(
+        "--num_seeds",
+        type=int,
+        default=16,
+        help="Number of seeds to use for IK",
+    )
     args = parser.parse_args()
 
-    b_list = [1, 10, 100, 2000][-1:]
+    b_list = [1, 10, 100, 500, 2000][:]
 
     robot_list = get_motion_gen_robot_list() + get_multi_arm_robot_list()[:2]
     world_file = "collision_test.yml"
@@ -142,6 +151,7 @@ if __name__ == "__main__":
         "Orientation-Error-Collision-Free-IK": [],
     }
     for robot_file in robot_list[:-1]:
+        print("running for robot: ", robot_file)
         # create a sampler with dof:
         for b_size in b_list:
             # sample test configs:
@@ -153,6 +163,7 @@ if __name__ == "__main__":
                 use_cuda_graph=True,
                 collision_free=False,
                 high_precision=args.high_precision,
+                num_seeds=args.num_seeds,
             )
             dt_cu_ik_cfree, success, p_err_c, q_err_c = run_full_config_collision_free_ik(
                 robot_file,
@@ -160,6 +171,7 @@ if __name__ == "__main__":
                 batch_size=b_size,
                 use_cuda_graph=True,
                 collision_free=True,
+                num_seeds=args.num_seeds,
                 # high_precision=args.high_precision,
             )
             # print(dt_cu/b_size, dt_cu_cg/b_size)
diff --git a/examples/isaac_sim/batch_motion_gen_reacher.py b/examples/isaac_sim/batch_motion_gen_reacher.py
index 9785dee..d488f68 100644
--- a/examples/isaac_sim/batch_motion_gen_reacher.py
+++ b/examples/isaac_sim/batch_motion_gen_reacher.py
@@ -156,11 +156,11 @@ def main():
     default_config = robot_cfg["kinematics"]["cspace"]["retract_config"]
 
     print("warming up...")
-    motion_gen.warmup(
-        batch=n_envs,
-        batch_env_mode=True,
-        warmup_js_trajopt=False,
-    )
+    # motion_gen.warmup(
+    #     batch=n_envs,
+    #     batch_env_mode=True,
+    #     warmup_js_trajopt=False,
+    # )
 
     add_extensions(simulation_app, args.headless_mode)
     config = RobotWorldConfig.load_from_config(
diff --git a/examples/motion_gen_example.py b/examples/motion_gen_example.py
index c01774a..0302122 100644
--- a/examples/motion_gen_example.py
+++ b/examples/motion_gen_example.py
@@ -134,7 +134,7 @@ def demo_motion_gen_simple():
     )
 
     result = motion_gen.plan_single(start_state, goal_pose, MotionGenPlanConfig(max_attempts=1))
-    traj = result.get_interpolated_plan()  # result.optimized_dt has the dt between timesteps
+    traj = result.get_interpolated_plan()  # result.interpolation_dt has the dt between timesteps
     print("Trajectory Generated: ", result.success)
 
 
diff --git a/setup.cfg b/setup.cfg
index f93dbfe..41592cb 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -50,12 +50,13 @@ install_requires =
   torch>=1.10
   trimesh
   yourdfpy>=0.0.53
-  warp-lang>=0.9.0
+  warp-lang>=0.11.0
   scipy>=1.7.0
   tqdm
   wheel
   importlib_resources
   scikit-image
+
 packages = find_namespace:
 package_dir =
   = src
@@ -85,10 +86,10 @@ ci =
 
 
 # this is only available in 3.8+
-smooth = 
+smooth =
   trajectory_smoothing @ https://github.com/balakumar-s/trajectory_smoothing/raw/main/dist/trajectory_smoothing-0.3-cp38-cp38-linux_x86_64.whl
 
-usd = 
+usd =
   usd-core
 
 dev =
@@ -101,12 +102,12 @@ dev =
   pytest>6.2.5
   pytest-cov
 
-isaacsim = 
+isaacsim =
   tomli
   wheel
   ninja
 
-doc = 
+doc =
   sphinx
   sphinx_rtd_theme
   graphviz>=0.20.1
diff --git a/src/curobo/content/configs/robot/jaco7.yml b/src/curobo/content/configs/robot/jaco7.yml
index d7e76c8..58fd74c 100644
--- a/src/curobo/content/configs/robot/jaco7.yml
+++ b/src/curobo/content/configs/robot/jaco7.yml
@@ -10,7 +10,7 @@
 ##
 
 robot_cfg:
-  
+
   kinematics:
 
     urdf_path: "robot/jaco/jaco_7s.urdf"
@@ -37,13 +37,13 @@ robot_cfg:
     base_link: "root"
     ee_link: "j2s7s300_end_effector"
     link_names: null
-    
+
     collision_link_names: [
       "j2s7s300_link_base", "j2s7s300_link_1", "j2s7s300_link_2",
       "j2s7s300_link_3", "j2s7s300_link_4", "j2s7s300_link_5", "j2s7s300_link_6", "j2s7s300_link_7",
       "j2s7s300_link_finger_tip_1", "j2s7s300_link_finger_tip_2",
-      "j2s7s300_link_finger_tip_3", 
-      "j2s7s300_link_finger_3", 
+      "j2s7s300_link_finger_tip_3",
+      "j2s7s300_link_finger_3",
       "j2s7s300_link_finger_2",
       "j2s7s300_link_finger_1",
       ]
@@ -151,37 +151,60 @@ robot_cfg:
     collision_sphere_buffer: 0.005
     self_collision_ignore: {
       "j2s7s300_link_base":["j2s7s300_link_1"],
-      "j2s7s300_link_1":["j2s7s300_link_2"], 
+      "j2s7s300_link_1":["j2s7s300_link_2"],
       "j2s7s300_link_2":["j2s7s300_link_3"],
-      "j2s7s300_link_3":["j2s7s300_link_4"], 
-      "j2s7s300_link_4":["j2s7s300_link_5"], 
-      "j2s7s300_link_5":["j2s7s300_link_6"], 
-      "j2s7s300_link_6":["j2s7s300_link_7"], 
+      "j2s7s300_link_3":["j2s7s300_link_4"],
+      "j2s7s300_link_4":["j2s7s300_link_5"],
+      "j2s7s300_link_5":["j2s7s300_link_6",
+      "j2s7s300_link_finger_tip_1",
+      "j2s7s300_link_finger_tip_2",
+      "j2s7s300_link_finger_tip_3",
+      "j2s7s300_link_finger_1",
+      "j2s7s300_link_finger_2",
+      "j2s7s300_link_finger_3"],
+      "j2s7s300_link_6":["j2s7s300_link_7",
+      "j2s7s300_link_finger_tip_1",
+      "j2s7s300_link_finger_tip_2",
+      "j2s7s300_link_finger_tip_3",
+      "j2s7s300_link_finger_1",
+      "j2s7s300_link_finger_2",
+      "j2s7s300_link_finger_3"],
       "j2s7s300_link_7":[
-      "j2s7s300_link_finger_tip_1", 
+      "j2s7s300_link_finger_tip_1",
       "j2s7s300_link_finger_tip_2",
       "j2s7s300_link_finger_tip_3",
       "j2s7s300_link_finger_1",
       "j2s7s300_link_finger_2",
       "j2s7s300_link_finger_3",
       ],
-    
-      "j2s7s300_link_finger_3":["j2s7s300_link_finger_tip_3","j2s7s300_link_finger_2",
-       "j2s7s300_link_finger_1"], 
+
+      "j2s7s300_link_finger_3":
+      ["j2s7s300_link_finger_tip_3","j2s7s300_link_finger_2",
+       "j2s7s300_link_finger_1", "j2s7s300_link_finger_tip_1", "j2s7s300_link_finger_tip_2"],
+
       "j2s7s300_link_finger_2":["j2s7s300_link_finger_tip_2", "j2s7s300_link_finger_1",
-      "j2s7s300_link_finger_3"],
+      "j2s7s300_link_finger_3", "j2s7s300_link_finger_tip_3", "j2s7s300_link_finger_tip_1"],
+
       "j2s7s300_link_finger_1":["j2s7s300_link_finger_tip_1", "j2s7s300_link_finger_2",
-      "j2s7s300_link_finger_3"],
+      "j2s7s300_link_finger_3", "j2s7s300_link_finger_tip_3", "j2s7s300_link_finger_tip_2"],
+
+      "j2s7s300_link_finger_tip_1":["j2s7s300_link_finger_tip_2", "j2s7s300_link_finger_tip_3"],
+
+      "j2s7s300_link_finger_tip_2":["j2s7s300_link_finger_tip_3"],
 
       } # Dict[str, List[str]]
-    self_collision_buffer: {} # Dict[str, float]
-    
-    mesh_link_names: [ 
+    self_collision_buffer: {
+      #"j2s7s300_link_base": 0.02,
+      #"j2s7s300_link_1": 0.01,
+
+    } # Dict[str, float]
+
+    mesh_link_names: [
       "j2s7s300_link_base", "j2s7s300_link_1", "j2s7s300_link_2",
       "j2s7s300_link_3", "j2s7s300_link_4", "j2s7s300_link_5", "j2s7s300_link_6", "j2s7s300_link_7",
       "j2s7s300_link_finger_tip_1", "j2s7s300_link_finger_tip_2",
-      "j2s7s300_link_finger_tip_3", 
-      "j2s7s300_link_finger_3", 
+      "j2s7s300_link_finger_tip_3",
+      "j2s7s300_link_finger_3",
       "j2s7s300_link_finger_2",
       "j2s7s300_link_finger_1",
     ] # List[str]
@@ -191,18 +214,18 @@ robot_cfg:
     "j2s7s300_joint_finger_tip_1": 0,
     "j2s7s300_joint_finger_tip_2": 0,
     "j2s7s300_joint_finger_tip_3": 0,
-    
+
     }
 
     cspace:
-      joint_names: 
+      joint_names:
         - j2s7s300_joint_1
         - j2s7s300_joint_2
         - j2s7s300_joint_3
         - j2s7s300_joint_4
         - j2s7s300_joint_5
         - j2s7s300_joint_6
-        - j2s7s300_joint_7 
+        - j2s7s300_joint_7
         - j2s7s300_joint_finger_1
         - j2s7s300_joint_finger_2
         - j2s7s300_joint_finger_3
diff --git a/src/curobo/content/configs/task/base_cfg.yml b/src/curobo/content/configs/task/base_cfg.yml
index 27b79c2..b4683e0 100644
--- a/src/curobo/content/configs/task/base_cfg.yml
+++ b/src/curobo/content/configs/task/base_cfg.yml
@@ -11,10 +11,10 @@
 
 
 world_collision_checker_cfg:
-  cache: null 
+  cache: null
   checker_type: "PRIMITIVE"
   max_distance: 0.1
- 
+
 
 cost:
   pose_cfg:
@@ -27,8 +27,8 @@ cost:
     weight: [0.0, 0.0]
     vec_convergence: [0.0, 0.00] # orientation, position
     terminal: False
-    
- 
+
+
   bound_cfg:
     weight: 000.0
     activation_distance: [0.0,0.0,0.0,0.0]
@@ -58,14 +58,14 @@ convergence:
     weight: [0.1, 10.0]
     vec_convergence: [0.0, 0.0] # orientation, position
     terminal: False
-    
   cspace_cfg:
     weight: 1.0
     terminal: True
     run_weight: 0.0
+    use_l2_kernel: True
   null_space_cfg:
-    weight: 1.0
+    weight: 0.001
     terminal: True
     run_weight: 1.0
-    
- 
\ No newline at end of file
+    use_l2_kernel: True
+
diff --git a/src/curobo/content/configs/task/finetune_trajopt.yml b/src/curobo/content/configs/task/finetune_trajopt.yml
index 4ae1f61..5ff0949 100644
--- a/src/curobo/content/configs/task/finetune_trajopt.yml
+++ b/src/curobo/content/configs/task/finetune_trajopt.yml
@@ -49,7 +49,7 @@ cost:
 
 
   cspace_cfg:
-    weight: 10000.0
+    weight: 20000.0
     terminal: True
     run_weight: 0.00 #1
 
@@ -59,7 +59,7 @@ cost:
     run_weight_velocity: 0.0
     run_weight_acceleration: 1.0
     run_weight_jerk: 1.0
-    activation_distance: [0.05,0.05,0.05,0.05] # for position, velocity, acceleration and jerk
+    activation_distance: [0.05,0.001,0.001,0.001] # for position, velocity, acceleration and jerk
     null_space_weight: [0.0]
 
   primitive_collision_cfg:
diff --git a/src/curobo/content/configs/task/gradient_ik.yml b/src/curobo/content/configs/task/gradient_ik.yml
index 98354df..bfb9140 100644
--- a/src/curobo/content/configs/task/gradient_ik.yml
+++ b/src/curobo/content/configs/task/gradient_ik.yml
@@ -40,17 +40,22 @@ cost:
     terminal: False
     use_metric: True
     run_weight: 1.0
+
   cspace_cfg:
     weight: 0.000
+
   bound_cfg:
     weight: 5000.0
     activation_distance: [0.001]
-    null_space_weight: [0.1]
+    null_space_weight: [0.001]
+    use_l2_kernel: True
+
   primitive_collision_cfg:
     weight: 5000.0
     use_sweep: False
     classify: False
     activation_distance: 0.01
+
   self_collision_cfg:
     weight: 5000.0
     classify: False
diff --git a/src/curobo/content/configs/task/gradient_trajopt.yml b/src/curobo/content/configs/task/gradient_trajopt.yml
index da82760..baa93ae 100644
--- a/src/curobo/content/configs/task/gradient_trajopt.yml
+++ b/src/curobo/content/configs/task/gradient_trajopt.yml
@@ -48,7 +48,7 @@ cost:
     use_metric: True
 
   cspace_cfg:
-    weight: 10000.0
+    weight: 20000.0
     terminal: True
     run_weight: 0.0
 
@@ -100,7 +100,7 @@ lbfgs:
   use_cuda_update_best_kernel: True
   use_temporal_smooth: False
   sync_cuda_time: True
-  step_scale: 1.0 #1.0
+  step_scale: 1.0
   last_best: 10
   use_coo_sparse: True
   debug_info:
diff --git a/src/curobo/content/configs/task/particle_mpc.yml b/src/curobo/content/configs/task/particle_mpc.yml
index 7939b8f..c46f41b 100644
--- a/src/curobo/content/configs/task/particle_mpc.yml
+++ b/src/curobo/content/configs/task/particle_mpc.yml
@@ -25,7 +25,7 @@ model:
   control_space: 'ACCELERATION'
   teleport_mode: False
   state_finite_difference_mode: "CENTRAL"
- 
+
 
 cost:
   pose_cfg:
@@ -36,16 +36,16 @@ cost:
     terminal: True
     run_weight: 1.0
     use_metric: True
-    
+
   cspace_cfg:
     weight: 1000.0
     terminal: True
     run_weight: 1.0
-    
+
   bound_cfg:
     weight: [5000.0, 5000.0,5000.0,000.0]
     activation_distance: [0.1,0.1,0.1,0.1] # for position, velocity, acceleration and jerk
-    smooth_weight: [0.0, 50.0, 0.0,0.0] # [vel, acc, jerk, alpha_vel, eta_position, eta_vel, eta_acc] 
+    smooth_weight: [0.0, 50.0, 0.0,0.0] # [vel, acc, jerk, alpha_vel, eta_position, eta_vel, eta_acc]
     run_weight_velocity: 0.0
     run_weight_acceleration: 1.0
     run_weight_jerk: 1.0
@@ -60,7 +60,7 @@ cost:
     use_speed_metric: False
     speed_dt: 0.1 # used only for speed metric
     activation_distance: 0.025
-  
+
   self_collision_cfg:
     weight: 50000.0
     classify: False
@@ -83,13 +83,13 @@ mppi:
   alpha             : 1
   num_particles     : 400 #10000
   update_cov        : True
-  cov_type          : "DIAG_A" # 
+  cov_type          : "DIAG_A" #
   kappa             : 0.0001
   null_act_frac     : 0.05
   sample_mode       : 'BEST'
   base_action       : 'REPEAT'
   squash_fn         : 'CLAMP'
-  n_problems            : 1
+  n_problems        : 1
   use_cuda_graph    : True
   seed              : 0
   store_debug       : False
diff --git a/src/curobo/geom/sdf/warp_primitives.py b/src/curobo/geom/sdf/warp_primitives.py
index a5ac772..540a395 100644
--- a/src/curobo/geom/sdf/warp_primitives.py
+++ b/src/curobo/geom/sdf/warp_primitives.py
@@ -16,6 +16,16 @@ import warp as wp
 wp.set_module_options({"fast_math": False})
 
 
+@wp.func
+def mesh_query_point_fn(
+    idx: wp.uint64,
+    point: wp.vec3,
+    max_distance: float,
+):
+    collide_result = wp.mesh_query_point(idx, point, max_distance)
+    return collide_result
+
+
 @wp.kernel
 def get_swept_closest_pt_batch_env(
     pt: wp.array(dtype=wp.vec4),
@@ -74,9 +84,6 @@ def get_swept_closest_pt_batch_env(
     eta = float(0.0)
     dt = float(0.0)
     k0 = float(0.0)
-    face_index = int(0)
-    face_u = float(0.0)
-    face_v = float(0.0)
     sign = float(0.0)
     dist = float(0.0)
     dist_metric = float(0.0)
@@ -143,11 +150,12 @@ def get_swept_closest_pt_batch_env(
             obj_w_pose = wp.transform(obj_position, obj_quat)
             obj_w_pose_t = wp.transform_inverse(obj_w_pose)
             local_pt = wp.transform_point(obj_w_pose, in_pt)
-
-            if wp.mesh_query_point(
-                mesh[i], local_pt, max_dist_buffer, sign, face_index, face_u, face_v
-            ):
-                cl_pt = wp.mesh_eval_position(mesh[i], face_index, face_u, face_v)
+            collide_result = mesh_query_point_fn(mesh[i], local_pt, max_dist_buffer)
+            if collide_result.result:
+                sign = collide_result.sign
+                cl_pt = wp.mesh_eval_position(
+                    mesh[i], collide_result.face, collide_result.u, collide_result.v
+                )
                 delta = cl_pt - local_pt
                 dis_length = wp.length(delta)
                 dist = (-1.0 * dis_length * sign) + in_rad
@@ -185,10 +193,12 @@ def get_swept_closest_pt_batch_env(
                         1.0 - 0.5 * jump_distance / sphere_0_distance
                     )  # dist could be greater than sphere_0_distance here?
                     sphere_int = k0 * local_pt + ((1.0 - k0) * sphere_temp)
-                    if wp.mesh_query_point(
-                        mesh[i], sphere_int, max_dist_buffer, sign, face_index, face_u, face_v
-                    ):
-                        cl_pt = wp.mesh_eval_position(mesh[i], face_index, face_u, face_v)
+                    collide_result = mesh_query_point_fn(mesh[i], sphere_int, max_dist_buffer)
+                    if collide_result.result:
+                        sign = collide_result.sign
+                        cl_pt = wp.mesh_eval_position(
+                            mesh[i], collide_result.face, collide_result.u, collide_result.v
+                        )
                         delta = cl_pt - sphere_int
                         dis_length = wp.length(delta)
                         dist = (-1.0 * dis_length * sign) + in_rad
@@ -229,11 +239,12 @@ def get_swept_closest_pt_batch_env(
                         1.0 - 0.5 * jump_distance / sphere_2_distance
                     )  # dist could be greater than sphere_0_distance here?
                     sphere_int = k0 * local_pt + (1.0 - k0) * sphere_temp
-
-                    if wp.mesh_query_point(
-                        mesh[i], sphere_int, max_dist_buffer, sign, face_index, face_u, face_v
-                    ):
-                        cl_pt = wp.mesh_eval_position(mesh[i], face_index, face_u, face_v)
+                    collide_result = mesh_query_point_fn(mesh[i], sphere_int, max_dist_buffer)
+                    if collide_result.result:
+                        sign = collide_result.sign
+                        cl_pt = wp.mesh_eval_position(
+                            mesh[i], collide_result.face, collide_result.u, collide_result.v
+                        )
                         delta = cl_pt - sphere_int
                         dis_length = wp.length(delta)
                         dist = (-1.0 * dis_length * sign) + in_rad
@@ -375,9 +386,6 @@ def get_closest_pt_batch_env(
     if b_idx >= batch_size or h_idx >= horizon or sph_idx >= nspheres:
         return
 
-    face_index = int(0)
-    face_u = float(0.0)
-    face_v = float(0.0)
     sign = float(0.0)
     dist = float(0.0)
     grad_vec = wp.vec3(0.0)
@@ -444,11 +452,12 @@ def get_closest_pt_batch_env(
             obj_w_pose = wp.transform(obj_position, obj_quat)
 
             local_pt = wp.transform_point(obj_w_pose, in_pt)
-
-            if wp.mesh_query_point(
-                mesh[i], local_pt, max_dist_buffer, sign, face_index, face_u, face_v
-            ):
-                cl_pt = wp.mesh_eval_position(mesh[i], face_index, face_u, face_v)
+            collide_result = mesh_query_point_fn(mesh[i], local_pt, max_dist_buffer)
+            if collide_result.result:
+                sign = collide_result.sign
+                cl_pt = wp.mesh_eval_position(
+                    mesh[i], collide_result.face, collide_result.u, collide_result.v
+                )
                 delta = cl_pt - local_pt
                 dis_length = wp.length(delta)
                 dist = -1.0 * dis_length * sign
diff --git a/src/curobo/geom/sdf/world_mesh.py b/src/curobo/geom/sdf/world_mesh.py
index 0835bf4..963dd89 100644
--- a/src/curobo/geom/sdf/world_mesh.py
+++ b/src/curobo/geom/sdf/world_mesh.py
@@ -126,7 +126,7 @@ class WorldMeshCollision(WorldPrimitiveCollision):
             self._wp_mesh_cache[mesh.name] = self._load_mesh_to_warp(mesh)
             # return self._wp_mesh_cache[mesh.name]
         else:
-            log_info("Object already in warp cache, using existing instance: " + mesh.name)
+            log_warn("Object already in warp cache, using existing instance for: " + mesh.name)
         return self._wp_mesh_cache[mesh.name]
 
     def _load_batch_mesh_to_warp(self, mesh_list: List[Mesh]):
diff --git a/src/curobo/opt/particle/particle_opt_utils.py b/src/curobo/opt/particle/particle_opt_utils.py
index 2c0bf07..c027f07 100644
--- a/src/curobo/opt/particle/particle_opt_utils.py
+++ b/src/curobo/opt/particle/particle_opt_utils.py
@@ -68,10 +68,8 @@ def get_stomp_cov(
     cov, scale_tril, scaled_M = get_stomp_cov_jit(
         horizon, d_action, cov_mode, device=tensor_args.device
     )
-    cov = tensor_args.to_device(cov)
-    scale_tril = tensor_args.to_device(scale_tril)
     if RETURN_M:
-        return cov, scale_tril, tensor_args.to_device(scaled_M)
+        return cov, scale_tril, scaled_M
     return cov, scale_tril
 
 
@@ -89,7 +87,7 @@ def get_stomp_cov_jit(
     A = torch.zeros(
         (d_action * horizon, d_action * horizon),
         dtype=torch.float32,
-        device=device,
+        device="cpu",
     )
 
     if cov_mode == "vel":
@@ -129,10 +127,15 @@ def get_stomp_cov_jit(
 
     # also compute the cholesky decomposition:
     # scale_tril = torch.zeros((d_action * horizon, d_action * horizon), **tensor_args)
+
     if (cov == cov.T).all() and (torch.linalg.eigvals(cov).real >= 0).all():
         scale_tril = torch.linalg.cholesky(cov)
     else:
         scale_tril = cov
+
+    cov = cov.to(device=device)
+    scale_tril = scale_tril.to(device=device)
+    scaled_M = scaled_M.to(device=device)
     """
     k = 0
     act_cov_matrix = cov[k * horizon:k * horizon + horizon, k * horizon:k * horizon + horizon]
@@ -140,7 +143,7 @@ def get_stomp_cov_jit(
     print(torch.det(act_cov_matrix))
     local_cholesky = matrix_cholesky(act_cov_matrix)
     for k in range(d_action):
-        
+
         scale_tril[k * horizon:k * horizon + horizon,k * horizon:k * horizon + horizon] = local_cholesky
     """
 
diff --git a/src/curobo/rollout/arm_base.py b/src/curobo/rollout/arm_base.py
index 8560a42..09cea1a 100644
--- a/src/curobo/rollout/arm_base.py
+++ b/src/curobo/rollout/arm_base.py
@@ -261,7 +261,7 @@ class ArmBase(RolloutBase, ArmBaseConfig):
                 log_warn(
                     "null space cost is deprecated, use null_space_weight in bound cost instead"
                 )
-
+            self.cost_cfg.bound_cfg.dof = self.n_dofs
             self.bound_cost = BoundCost(self.cost_cfg.bound_cfg)
 
         if self.cost_cfg.manipulability_cfg is not None:
@@ -315,10 +315,12 @@ class ArmBase(RolloutBase, ArmBaseConfig):
         self.cost_cfg.bound_cfg.state_finite_difference_mode = (
             self.dynamics_model.state_finite_difference_mode
         )
-
+        self.cost_cfg.bound_cfg.dof = self.n_dofs
+        self.constraint_cfg.bound_cfg.dof = self.n_dofs
         self.bound_constraint = BoundCost(self.constraint_cfg.bound_cfg)
 
         if self.convergence_cfg.null_space_cfg is not None:
+            self.convergence_cfg.null_space_cfg.dof = self.n_dofs
             self.null_convergence = DistCost(self.convergence_cfg.null_space_cfg)
 
         # set start state:
@@ -578,6 +580,7 @@ class ArmBase(RolloutBase, ArmBaseConfig):
         ----------
         action_seq: torch.Tensor [num_particles, horizon, d_act]
         """
+
         # print(act_seq.shape, self._goal_buffer.batch_current_state_idx)
         if self.start_state is None:
             raise ValueError("start_state is not set in rollout")
@@ -585,6 +588,7 @@ class ArmBase(RolloutBase, ArmBaseConfig):
             state = self.dynamics_model.forward(
                 self.start_state, act_seq, self._goal_buffer.batch_current_state_idx
             )
+
         with profiler.record_function("cost/all"):
             cost_seq = self.cost_fn(state, act_seq)
 
diff --git a/src/curobo/rollout/arm_reacher.py b/src/curobo/rollout/arm_reacher.py
index ac2cebc..93b9fab 100644
--- a/src/curobo/rollout/arm_reacher.py
+++ b/src/curobo/rollout/arm_reacher.py
@@ -174,6 +174,7 @@ class ArmReacher(ArmBase, ArmReacherConfig):
         self._n_goalset = 1
 
         if self.cost_cfg.cspace_cfg is not None:
+            self.cost_cfg.cspace_cfg.dof = self.d_action
             # self.cost_cfg.cspace_cfg.update_vec_weight(self.dynamics_model.cspace_distance_weight)
             self.dist_cost = DistCost(self.cost_cfg.cspace_cfg)
         if self.cost_cfg.pose_cfg is not None:
@@ -226,6 +227,7 @@ class ArmReacher(ArmBase, ArmReacherConfig):
                 if i != self.kinematics.ee_link:
                     self._link_pose_convergence[i] = PoseCost(self.convergence_cfg.link_pose_cfg)
         if self.convergence_cfg.cspace_cfg is not None:
+            self.convergence_cfg.cspace_cfg.dof = self.d_action
             self.cspace_convergence = DistCost(self.convergence_cfg.cspace_cfg)
 
         # check if g_dist is required in any of the cost terms:
diff --git a/src/curobo/rollout/cost/bound_cost.py b/src/curobo/rollout/cost/bound_cost.py
index 0c91f75..edb26a1 100644
--- a/src/curobo/rollout/cost/bound_cost.py
+++ b/src/curobo/rollout/cost/bound_cost.py
@@ -22,7 +22,7 @@ from curobo.cuda_robot_model.types import JointLimits
 from curobo.types.robot import JointState
 from curobo.types.tensor import T_DOF
 from curobo.util.logger import log_error
-from curobo.util.torch_utils import get_torch_jit_decorator
+from curobo.util.torch_utils import get_cache_fn_decorator, get_torch_jit_decorator
 from curobo.util.warp import init_warp
 
 # Local Folder
@@ -49,6 +49,7 @@ class BoundCostConfig(CostConfig):
     activation_distance: Union[torch.Tensor, float] = 0.0
     state_finite_difference_mode: str = "BACKWARD"
     null_space_weight: Optional[List[float]] = None
+    use_l2_kernel: bool = False
 
     def set_bounds(self, bounds: JointLimits, teleport_mode: bool = False):
         self.joint_limits = bounds.clone()
@@ -104,12 +105,21 @@ class BoundCost(CostBase, BoundCostConfig):
             (0, 0, 0), device=self.tensor_args.device, dtype=self.tensor_args.dtype
         )
         self._out_gv_buffer = self._out_ga_buffer = self._out_gj_buffer = empty_buffer
+        if self.use_l2_kernel:
+            if self.cost_type == BoundCostType.POSITION:
+                self._l2_cost = make_bound_pos_kernel(self.dof)
+            if self.cost_type == BoundCostType.BOUNDS_SMOOTH:
+                self._l2_cost = make_bound_pos_smooth_kernel(self.dof)
 
     def update_batch_size(self, batch, horizon, dof):
         if self._batch_size != batch or self._horizon != horizon or self._dof != dof:
             self._out_c_buffer = torch.zeros(
                 (batch, horizon, dof), device=self.tensor_args.device, dtype=self.tensor_args.dtype
             )
+            self._out_c_sum_buffer = torch.zeros(
+                (batch, horizon), device=self.tensor_args.device, dtype=self.tensor_args.dtype
+            )
+
             self._out_gp_buffer = torch.zeros(
                 (batch, horizon, dof), device=self.tensor_args.device, dtype=self.tensor_args.dtype
             )
@@ -184,36 +194,61 @@ class BoundCost(CostBase, BoundCostConfig):
             retract_idx = self._retract_cfg_idx
 
         if self.cost_type == BoundCostType.BOUNDS_SMOOTH:
-            # print(self.joint_limits.jerk.shape, self.joint_limits.position.shape)
-            cost = WarpBoundSmoothFunction.apply(
-                state_batch.position,
-                state_batch.velocity,
-                state_batch.acceleration,
-                state_batch.jerk,
-                retract_config,
-                retract_idx,
-                self.joint_limits.position,
-                self.joint_limits.velocity,
-                self.joint_limits.acceleration,
-                self.joint_limits.jerk,
-                self.weight,
-                self.activation_distance,
-                self.smooth_weight,
-                self.cspace_distance_weight,
-                self.null_space_weight,
-                self.vec_weight,
-                self._run_weight_vel,
-                self._run_weight_acc,
-                self._run_weight_jerk,
-                self._out_c_buffer,
-                self._out_gp_buffer,
-                self._out_gv_buffer,
-                self._out_ga_buffer,
-                self._out_gj_buffer,
-            )
-            # print(self.cspace_distance_weight.shape)
-            # print(cost)
-            # print(self._run_weight_acc)
+            if self.use_l2_kernel:
+                cost = WarpBoundSmoothL2Function.apply(
+                    state_batch.position,
+                    state_batch.velocity,
+                    state_batch.acceleration,
+                    state_batch.jerk,
+                    retract_config,
+                    retract_idx,
+                    self.joint_limits.position,
+                    self.joint_limits.velocity,
+                    self.joint_limits.acceleration,
+                    self.joint_limits.jerk,
+                    self.weight,
+                    self.activation_distance,
+                    self.smooth_weight,
+                    self.cspace_distance_weight,
+                    self.null_space_weight,
+                    self.vec_weight,
+                    self._run_weight_vel,
+                    self._run_weight_acc,
+                    self._run_weight_jerk,
+                    self._out_c_sum_buffer,
+                    self._out_gp_buffer,
+                    self._out_gv_buffer,
+                    self._out_ga_buffer,
+                    self._out_gj_buffer,
+                    self._l2_cost,
+                )
+            else:
+                cost = WarpBoundSmoothFunction.apply(
+                    state_batch.position,
+                    state_batch.velocity,
+                    state_batch.acceleration,
+                    state_batch.jerk,
+                    retract_config,
+                    retract_idx,
+                    self.joint_limits.position,
+                    self.joint_limits.velocity,
+                    self.joint_limits.acceleration,
+                    self.joint_limits.jerk,
+                    self.weight,
+                    self.activation_distance,
+                    self.smooth_weight,
+                    self.cspace_distance_weight,
+                    self.null_space_weight,
+                    self.vec_weight,
+                    self._run_weight_vel,
+                    self._run_weight_acc,
+                    self._run_weight_jerk,
+                    self._out_c_buffer,
+                    self._out_gp_buffer,
+                    self._out_gv_buffer,
+                    self._out_ga_buffer,
+                    self._out_gj_buffer,
+                )
         elif self.cost_type == BoundCostType.BOUNDS:
             cost = WarpBoundFunction.apply(
                 state_batch.position,
@@ -237,8 +272,8 @@ class BoundCost(CostBase, BoundCostConfig):
                 self._out_gj_buffer,
             )
         elif self.cost_type == BoundCostType.POSITION:
-            if self.return_loss:
-                cost = WarpBoundPosLoss.apply(
+            if self.use_l2_kernel:
+                cost = WarpBoundPosL2Function.apply(
                     state_batch.position,
                     retract_config,
                     retract_idx,
@@ -247,8 +282,10 @@ class BoundCost(CostBase, BoundCostConfig):
                     self.activation_distance,
                     self.null_space_weight,
                     self.vec_weight,
-                    self._out_c_buffer,
+                    self._out_c_sum_buffer,
                     self._out_gp_buffer,
+                    self._l2_cost,
+                    self.return_loss,
                 )
             else:
                 cost = WarpBoundPosFunction.apply(
@@ -262,6 +299,7 @@ class BoundCost(CostBase, BoundCostConfig):
                     self.vec_weight,
                     self._out_c_buffer,
                     self._out_gp_buffer,
+                    self.return_loss,
                 )
 
         else:
@@ -458,6 +496,132 @@ class WarpBoundSmoothFunction(torch.autograd.Function):
         )
 
 
+# create a bound cost tensor:
+class WarpBoundSmoothL2Function(torch.autograd.Function):
+    @staticmethod
+    def forward(
+        ctx,
+        pos,
+        vel,
+        acc,
+        jerk,
+        retract_config,
+        retract_idx,
+        p_b,
+        v_b,
+        a_b,
+        j_b,
+        weight,
+        activation_distance,
+        smooth_weight,
+        cspace_weight,
+        null_space_weight,
+        vec_weight,
+        run_weight_vel,
+        run_weight_acc,
+        run_weight_jerk,
+        out_cost,
+        out_gp,
+        out_gv,
+        out_ga,
+        out_gj,
+        warp_function,
+    ):
+        # scale the weights for smoothness by this dt:
+        wp_device = wp.device_from_torch(vel.device)
+        # assert smooth_weight.shape[0] == 7
+        b, h, dof = vel.shape
+        requires_grad = pos.requires_grad
+
+        wp.launch(
+            kernel=warp_function,
+            dim=b * h,
+            inputs=[
+                wp.from_torch(pos.detach().view(-1), dtype=wp.float32),
+                wp.from_torch(vel.detach().view(-1), dtype=wp.float32),
+                wp.from_torch(acc.detach().view(-1), dtype=wp.float32),
+                wp.from_torch(jerk.detach().view(-1), dtype=wp.float32),
+                wp.from_torch(retract_config.detach().view(-1), dtype=wp.float32),
+                wp.from_torch(retract_idx.detach().view(-1), dtype=wp.int32),
+                wp.from_torch(p_b.view(-1), dtype=wp.float32),
+                wp.from_torch(v_b.view(-1), dtype=wp.float32),
+                wp.from_torch(a_b.view(-1), dtype=wp.float32),
+                wp.from_torch(j_b.view(-1), dtype=wp.float32),
+                wp.from_torch(weight, dtype=wp.float32),
+                wp.from_torch(activation_distance, dtype=wp.float32),
+                wp.from_torch(smooth_weight, dtype=wp.float32),
+                wp.from_torch(cspace_weight, dtype=wp.float32),
+                wp.from_torch(null_space_weight.view(-1), dtype=wp.float32),
+                wp.from_torch(vec_weight.view(-1), dtype=wp.float32),
+                wp.from_torch(run_weight_vel.view(-1), dtype=wp.float32),
+                wp.from_torch(run_weight_acc.view(-1), dtype=wp.float32),
+                wp.from_torch(run_weight_jerk.view(-1), dtype=wp.float32),
+                wp.from_torch(out_cost.view(-1), dtype=wp.float32),
+                wp.from_torch(out_gp.view(-1), dtype=wp.float32),
+                wp.from_torch(out_gv.view(-1), dtype=wp.float32),
+                wp.from_torch(out_ga.view(-1), dtype=wp.float32),
+                wp.from_torch(out_gj.view(-1), dtype=wp.float32),
+                requires_grad,
+                b,
+                h,
+                dof,
+            ],
+            device=wp_device,
+            stream=wp.stream_from_torch(vel.device),
+        )
+        ctx.save_for_backward(out_gp, out_gv, out_ga, out_gj)
+
+        return out_cost
+
+    @staticmethod
+    def backward(ctx, grad_out_cost):
+        (
+            p_grad,
+            v_grad,
+            a_grad,
+            j_grad,
+        ) = ctx.saved_tensors
+        v_g = None
+        a_g = None
+        p_g = None
+        j_g = None
+        if ctx.needs_input_grad[0]:
+            p_g = p_grad  # * grad_out_cost#.unsqueeze(-1)
+        if ctx.needs_input_grad[1]:
+            v_g = v_grad  # * grad_out_cost#.unsqueeze(-1)
+        if ctx.needs_input_grad[2]:
+            a_g = a_grad  # * grad_out_cost#.unsqueeze(-1)
+        if ctx.needs_input_grad[3]:
+            j_g = j_grad  # * grad_out_cost#.unsqueeze(-1)
+        return (
+            p_g,
+            v_g,
+            a_g,
+            j_g,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+        )
+
+
 class WarpBoundFunction(torch.autograd.Function):
     @staticmethod
     def forward(
@@ -581,6 +745,7 @@ class WarpBoundPosFunction(torch.autograd.Function):
         vec_weight,
         out_cost,
         out_gp,
+        return_loss=False,
     ):
         wp_device = wp.device_from_torch(pos.device)
         b, h, dof = pos.shape
@@ -607,10 +772,9 @@ class WarpBoundPosFunction(torch.autograd.Function):
             device=wp_device,
             stream=wp.stream_from_torch(pos.device),
         )
+        ctx.return_loss = return_loss
         ctx.save_for_backward(out_gp)
-        # cost = torch.linalg.norm(out_cost, dim=-1)
         cost = torch.sum(out_cost, dim=-1)
-        # cost = out_cost
         return cost
 
     @staticmethod
@@ -619,20 +783,71 @@ class WarpBoundPosFunction(torch.autograd.Function):
 
         p_g = None
         if ctx.needs_input_grad[0]:
-            p_g = p_grad  # * grad_out_cost.unsqueeze(-1)
-        return p_g, None, None, None, None, None, None, None, None, None
+            p_g = p_grad
+            if ctx.return_loss:
+                p_g = p_grad * grad_out_cost.unsqueeze(-1)
+
+        return p_g, None, None, None, None, None, None, None, None, None, None
 
 
 # create a bound cost tensor:
-class WarpBoundPosLoss(WarpBoundPosFunction):
+class WarpBoundPosL2Function(torch.autograd.Function):
+    @staticmethod
+    def forward(
+        ctx,
+        pos,
+        retract_config,
+        retract_idx,
+        p_l,
+        weight,
+        activation_distance,
+        null_space_weight,
+        vec_weight,
+        out_cost,
+        out_gp,
+        warp_function,
+        return_loss=False,
+    ):
+        wp_device = wp.device_from_torch(pos.device)
+        b, h, dof = pos.shape
+        requires_grad = pos.requires_grad
+        wp.launch(
+            kernel=warp_function,
+            dim=b * h,
+            inputs=[
+                wp.from_torch(pos.detach().view(-1), dtype=wp.float32),
+                wp.from_torch(retract_config.detach().view(-1), dtype=wp.float32),
+                wp.from_torch(retract_idx.detach().view(-1), dtype=wp.int32),
+                wp.from_torch(p_l.view(-1), dtype=wp.float32),
+                wp.from_torch(weight, dtype=wp.float32),
+                wp.from_torch(activation_distance, dtype=wp.float32),
+                wp.from_torch(null_space_weight.view(-1), dtype=wp.float32),
+                wp.from_torch(vec_weight.view(-1), dtype=wp.float32),
+                wp.from_torch(out_cost.view(-1), dtype=wp.float32),
+                wp.from_torch(out_gp.view(-1), dtype=wp.float32),
+                requires_grad,
+                b,
+                h,
+                dof,
+            ],
+            device=wp_device,
+            stream=wp.stream_from_torch(pos.device),
+        )
+        ctx.return_loss = return_loss
+        ctx.save_for_backward(out_gp)
+        # cost = torch.sum(out_cost, dim=-1)
+        return out_cost
+
     @staticmethod
     def backward(ctx, grad_out_cost):
         (p_grad,) = ctx.saved_tensors
 
         p_g = None
         if ctx.needs_input_grad[0]:
-            p_g = p_grad * grad_out_cost.unsqueeze(-1)
-        return p_g, None, None, None, None, None, None, None, None, None
+            p_g = p_grad  # * grad_out_cost.unsqueeze(-1)
+            if ctx.return_loss:
+                p_g = p_grad * grad_out_cost.unsqueeze(-1)
+        return p_g, None, None, None, None, None, None, None, None, None, None, None
 
 
 @wp.kernel
@@ -1122,3 +1337,314 @@ def forward_bound_smooth_warp(
         out_grad_v[b_addrs] = g_v
         out_grad_a[b_addrs] = g_a
         out_grad_j[b_addrs] = g_j
+
+
+@get_cache_fn_decorator()
+def make_bound_pos_smooth_kernel(dof_template: int):
+    def forward_bound_smooth_loop_warp(
+        pos: wp.array(dtype=wp.float32),
+        vel: wp.array(dtype=wp.float32),
+        acc: wp.array(dtype=wp.float32),
+        jerk: wp.array(dtype=wp.float32),
+        retract_config: wp.array(dtype=wp.float32),
+        retract_idx: wp.array(dtype=wp.int32),
+        p_b: wp.array(dtype=wp.float32),
+        v_b: wp.array(dtype=wp.float32),
+        a_b: wp.array(dtype=wp.float32),
+        j_b: wp.array(dtype=wp.float32),
+        weight: wp.array(dtype=wp.float32),
+        activation_distance: wp.array(dtype=wp.float32),
+        smooth_weight: wp.array(dtype=wp.float32),
+        cspace_weight: wp.array(dtype=wp.float32),
+        null_weight: wp.array(dtype=wp.float32),
+        vec_weight: wp.array(dtype=wp.float32),
+        run_weight_vel: wp.array(dtype=wp.float32),
+        run_weight_acc: wp.array(dtype=wp.float32),
+        run_weight_jerk: wp.array(dtype=wp.float32),
+        out_cost: wp.array(dtype=wp.float32),
+        out_grad_p: wp.array(dtype=wp.float32),
+        out_grad_v: wp.array(dtype=wp.float32),
+        out_grad_a: wp.array(dtype=wp.float32),
+        out_grad_j: wp.array(dtype=wp.float32),
+        write_grad: wp.uint8,  # this should be a bool
+        batch_size: wp.int32,
+        horizon: wp.int32,
+        dof: wp.int32,
+    ):
+        tid = wp.tid()
+        # initialize variables:
+        b_id = wp.int32(0)
+        h_id = wp.int32(0)
+        b_addrs = int(0)
+
+        w = wp.float32(0.0)
+        b_wv = float(0.0)
+        b_wa = float(0.0)
+        b_wj = float(0.0)
+
+        r_wa = wp.float32(0.0)
+        r_wj = wp.float32(0.0)
+
+        w_a = wp.float32(0.0)
+        w_j = wp.float32(0.0)
+
+        s_a = wp.float32(0.0)
+        s_j = wp.float32(0.0)
+
+        c_total = wp.float32(0.0)
+
+        # we launch batch * horizon * dof kernels
+        b_id = tid / (horizon)
+        h_id = tid - (b_id * horizon)
+        if b_id >= batch_size or h_id >= horizon:
+            return
+
+        n_w = wp.float32(0.0)
+        n_w = null_weight[0]
+
+        vec_weight_local = wp.vector(dtype=wp.float32, length=dof_template)
+        cspace_weight_local = wp.vector(dtype=wp.float32, length=dof_template)
+        p_l = wp.vector(dtype=wp.float32, length=dof_template)
+        p_u = wp.vector(dtype=wp.float32, length=dof_template)
+        v_l = wp.vector(dtype=wp.float32, length=dof_template)
+        v_u = wp.vector(dtype=wp.float32, length=dof_template)
+        a_l = wp.vector(dtype=wp.float32, length=dof_template)
+        a_u = wp.vector(dtype=wp.float32, length=dof_template)
+        j_l = wp.vector(dtype=wp.float32, length=dof_template)
+        j_u = wp.vector(dtype=wp.float32, length=dof_template)
+        g_p = wp.vector(dtype=wp.float32, length=dof_template)
+        g_v = wp.vector(dtype=wp.float32, length=dof_template)
+        g_a = wp.vector(dtype=wp.float32, length=dof_template)
+        g_j = wp.vector(dtype=wp.float32, length=dof_template)
+        target_id = wp.int32(0.0)
+        target_id = retract_idx[b_id]
+        current_p = wp.vector(dtype=wp.float32, length=dof_template)
+        current_v = wp.vector(dtype=wp.float32, length=dof_template)
+        current_a = wp.vector(dtype=wp.float32, length=dof_template)
+        current_j = wp.vector(dtype=wp.float32, length=dof_template)
+
+        target_p = wp.vector(dtype=wp.float32, length=dof_template)
+
+        b_addrs = b_id * horizon * dof + h_id * dof
+
+        for i in range(dof_template):
+            vec_weight_local[i] = vec_weight[i]
+            target_p[i] = retract_config[target_id * dof + i]
+            current_p[i] = pos[b_addrs + i]
+            current_v[i] = vel[b_addrs + i]
+            current_a[i] = acc[b_addrs + i]
+            current_j[i] = jerk[b_addrs + i]
+            p_l[i] = p_b[i]
+            p_u[i] = p_b[dof + i]
+            v_l[i] = v_b[i]
+            v_u[i] = v_b[dof + i]
+            a_l[i] = a_b[i]
+            a_u[i] = a_b[dof + i]
+            j_l[i] = j_b[i]
+            j_u[i] = j_b[dof + i]
+            cspace_weight_local[i] = cspace_weight[i]
+
+        # read weights:
+        w = weight[0]
+        b_wv = weight[1]
+        b_wa = weight[2]
+        b_wj = weight[3]
+        r_wa = run_weight_acc[h_id]
+        r_wj = run_weight_jerk[h_id]
+
+        w_a = smooth_weight[1]
+        w_j = smooth_weight[2]
+
+        # compute cost:
+
+        # read buffers:
+
+        # if w_j > 0.0:
+        eta_p = activation_distance[0]
+        eta_v = activation_distance[1]
+        eta_a = activation_distance[2]
+        eta_j = activation_distance[3]
+
+        p_range = p_u - p_l
+        p_l = p_l + (p_range) * eta_p
+        p_u = p_u - (p_range) * eta_p
+        v_l = v_l + (v_u - v_l) * eta_v
+        v_u = v_u - (v_u - v_l) * eta_v
+        a_l = a_l + (a_u - a_l) * eta_a
+        a_u = a_u - (a_u - a_l) * eta_a
+        j_l = j_l + (j_u - j_l) * eta_j
+        j_u = j_u - (j_u - j_l) * eta_j
+
+        # position:
+        if n_w > 0.0:
+            error = wp.cw_mul(vec_weight_local, current_p - target_p)
+            error_length = wp.length(error)
+            c_total = n_w * error_length
+            if error_length > 0.0:
+                g_p = n_w * error / error_length
+
+        # bound cost:
+        # bound cost:
+        bound_delta_p = wp.vector(dtype=wp.float32, length=dof_template)
+        bound_delta_v = wp.vector(dtype=wp.float32, length=dof_template)
+        bound_delta_a = wp.vector(dtype=wp.float32, length=dof_template)
+        bound_delta_j = wp.vector(dtype=wp.float32, length=dof_template)
+        for i in range(dof_template):
+            if current_p[i] < p_l[i]:
+                bound_delta_p[i] = current_p[i] - p_l[i]
+            elif current_p[i] > p_u[i]:
+                bound_delta_p[i] = current_p[i] - p_u[i]
+            if current_v[i] < v_l[i]:
+                bound_delta_v[i] = current_v[i] - v_l[i]
+            elif current_v[i] > v_u[i]:
+                bound_delta_v[i] = current_v[i] - v_u[i]
+            if current_a[i] < a_l[i]:
+                bound_delta_a[i] = current_a[i] - a_l[i]
+            elif current_a[i] > a_u[i]:
+                bound_delta_a[i] = current_a[i] - a_u[i]
+            if current_j[i] < j_l[i]:
+                bound_delta_j[i] = current_j[i] - j_l[i]
+            elif current_j[i] > j_u[i]:
+                bound_delta_j[i] = current_j[i] - j_u[i]
+        delta_p = wp.length(bound_delta_p)
+        if delta_p > 0.0:
+            g_p += w * bound_delta_p / delta_p
+            c_total += w * delta_p
+        delta_v = wp.length(bound_delta_v)
+        if delta_v > 0.0:
+            g_v = b_wv * bound_delta_v / delta_v
+            c_total += b_wv * delta_v
+        delta_a = wp.length(bound_delta_a)
+        if delta_a > 0.0:
+            g_a = b_wa * bound_delta_a / delta_a
+            c_total += b_wa * delta_a
+        delta_j = wp.length(bound_delta_j)
+        if delta_j > 0.0:
+            g_j = b_wj * bound_delta_j / delta_j
+            c_total += b_wj * delta_j
+
+        delta_acc = wp.cw_mul(cspace_weight_local, current_a)
+        # delta_acc = wp.cw_div(delta_acc, a_u - a_l)
+        acc_length = wp.length_sq(delta_acc)
+        s_a = w_a * r_wa * acc_length
+        if acc_length > 0.0:
+            g_a += 2.0 * w_a * r_wa * delta_acc  # / acc_length
+
+        delta_jerk = wp.cw_mul(cspace_weight_local, current_j)
+        # delta_jerk = wp.cw_div(delta_jerk, j_u - j_l)
+        jerk_length = wp.length_sq(delta_jerk)
+        s_j = w_j * r_wj * jerk_length
+        if jerk_length > 0.0:
+            g_j += 2.0 * w_j * r_wj * delta_jerk  # / jerk_length
+
+        c_total += s_a + s_j
+
+        out_cost[b_id * horizon + h_id] = c_total
+
+        # compute gradient
+        if write_grad == 1:
+            b_addrs = b_id * horizon * dof + h_id * dof
+            for i in range(dof_template):
+                out_grad_p[b_addrs + i] = g_p[i]
+                out_grad_v[b_addrs + i] = g_v[i]
+                out_grad_a[b_addrs + i] = g_a[i]
+                out_grad_j[b_addrs + i] = g_j[i]
+
+    return wp.Kernel(forward_bound_smooth_loop_warp)
+
+
+@get_cache_fn_decorator()
+def make_bound_pos_kernel(dof_template: int):
+
+    def forward_bound_pos_loop_warp(
+        pos: wp.array(dtype=wp.float32),
+        retract_config: wp.array(dtype=wp.float32),
+        retract_idx: wp.array(dtype=wp.int32),
+        p_b: wp.array(dtype=wp.float32),
+        weight: wp.array(dtype=wp.float32),
+        activation_distance: wp.array(dtype=wp.float32),
+        null_weight: wp.array(dtype=wp.float32),
+        vec_weight: wp.array(dtype=wp.float32),
+        out_cost: wp.array(dtype=wp.float32),
+        out_grad_p: wp.array(dtype=wp.float32),
+        write_grad: wp.uint8,  # this should be a bool
+        batch_size: wp.int32,
+        horizon: wp.int32,
+        dof: wp.int32,
+    ):
+        tid = wp.tid()
+        # initialize variables:
+        b_id = wp.int32(0)
+        h_id = wp.int32(0)
+        b_addrs = int(0)
+
+        w = wp.float32(0.0)
+        c_total = wp.float32(0.0)
+
+        # we launch batch * horizon * dof kernels
+        b_id = tid / (horizon)
+        h_id = tid - (b_id * horizon)
+        if b_id >= batch_size or h_id >= horizon:
+            return
+
+        # read weights:
+        eta_p = activation_distance[0]
+        w = weight[0]
+
+        n_w = wp.float32(0.0)
+        n_w = null_weight[0]
+        target_id = wp.int32(0.0)
+        vec_weight_local = wp.vector(dtype=wp.float32, length=dof_template)
+        p_l = wp.vector(dtype=wp.float32, length=dof_template)
+        p_u = wp.vector(dtype=wp.float32, length=dof_template)
+        bound_delta = wp.vector(dtype=wp.float32, length=dof_template)
+        target_p = wp.vector(dtype=wp.float32, length=dof_template)
+        current_p = wp.vector(dtype=wp.float32, length=dof_template)
+        g_p = wp.vector(dtype=wp.float32, length=dof_template)
+
+        target_id = retract_idx[b_id]
+        b_addrs = b_id * horizon * dof + h_id * dof
+
+        for i in range(dof_template):
+            vec_weight_local[i] = vec_weight[i]
+            target_p[i] = retract_config[target_id * dof + i]
+            current_p[i] = pos[b_addrs + i]
+            p_l[i] = p_b[i]
+            p_u[i] = p_b[dof + i]
+
+        p_range = p_u - p_l
+        eta_percent = eta_p * (p_range)
+        p_l += eta_percent
+        p_u -= eta_percent
+
+        # compute retract cost:
+
+        if n_w > 0.0:
+            error = wp.cw_mul(vec_weight_local, current_p - target_p)
+            error_length = wp.length(error)
+            c_total = n_w * error_length
+            if error_length > 0.0:
+                g_p = n_w * error / error_length
+
+        # bound cost:
+
+        for i in range(dof_template):
+            if current_p[i] < p_l[i]:
+                bound_delta[i] = current_p[i] - p_l[i]
+            elif current_p[i] > p_u[i]:
+                bound_delta[i] = current_p[i] - p_u[i]
+
+        delta = wp.length(bound_delta)
+        if delta > 0.0:
+            g_p += w * bound_delta / delta
+            c_total += w * delta
+
+        out_cost[b_id * horizon + h_id] = c_total
+
+        # compute gradient
+        if write_grad == 1:
+            b_addrs = b_id * horizon * dof + h_id * dof
+            for i in range(dof_template):
+                out_grad_p[b_addrs + i] = g_p[i]
+
+    return wp.Kernel(forward_bound_pos_loop_warp)
diff --git a/src/curobo/rollout/cost/dist_cost.py b/src/curobo/rollout/cost/dist_cost.py
index 543b772..b4dc94b 100644
--- a/src/curobo/rollout/cost/dist_cost.py
+++ b/src/curobo/rollout/cost/dist_cost.py
@@ -18,7 +18,8 @@ import torch
 import warp as wp
 
 # CuRobo
-from curobo.util.torch_utils import get_torch_jit_decorator
+from curobo.util.logger import log_error
+from curobo.util.torch_utils import get_cache_fn_decorator, get_torch_jit_decorator
 from curobo.util.warp import init_warp
 
 # Local Folder
@@ -37,6 +38,7 @@ class DistType(Enum):
 class DistCostConfig(CostConfig):
     dist_type: DistType = DistType.L2
     use_null_space: bool = False
+    use_l2_kernel: bool = True
 
     def __post_init__(self):
         return super().__post_init__()
@@ -142,6 +144,89 @@ def forward_l2_warp(
         out_grad_p[b_addrs] = g_p
 
 
+@get_cache_fn_decorator()
+def make_l2_kernel(dof_template: int):
+    def forward_l2_loop_warp(
+        pos: wp.array(dtype=wp.float32),
+        target: wp.array(dtype=wp.float32),
+        target_idx: wp.array(dtype=wp.int32),
+        weight: wp.array(dtype=wp.float32),
+        run_weight: wp.array(dtype=wp.float32),
+        vec_weight: wp.array(dtype=wp.float32),
+        out_cost: wp.array(dtype=wp.float32),
+        out_grad_p: wp.array(dtype=wp.float32),
+        write_grad: wp.uint8,  # this should be a bool
+        batch_size: wp.int32,
+        horizon: wp.int32,
+        dof: wp.int32,
+    ):
+        tid = wp.tid()
+        # initialize variables:
+        b_id = wp.int32(0)
+        h_id = wp.int32(0)
+        b_addrs = wp.int32(0)
+        target_id = wp.int32(0)
+        w = wp.float32(0.0)
+        r_w = wp.float32(0.0)
+        c_total = wp.float32(0.0)
+
+        # we launch batch * horizon * dof kernels
+        b_id = tid / (horizon)
+        h_id = tid - (b_id * horizon)
+
+        if b_id >= batch_size or h_id >= horizon:
+            return
+
+        # read weights:
+        w = weight[0]
+        r_w = run_weight[h_id]
+        w = r_w * w
+
+        if w == 0.0:
+            return
+        # compute cost:
+        b_addrs = b_id * horizon * dof + h_id * dof
+
+        # read buffers:
+        current_position = wp.vector(dtype=wp.float32, length=dof_template)
+        target_position = wp.vector(dtype=wp.float32, length=dof_template)
+        vec_weight_local = wp.vector(dtype=wp.float32, length=dof_template)
+        target_id = target_idx[b_id]
+        target_id = target_id * dof
+
+        for i in range(dof_template):
+            current_position[i] = pos[b_addrs + i]
+            target_position[i] = target[target_id + i]
+            vec_weight_local[i] = vec_weight[i]
+
+        error = wp.cw_mul(vec_weight_local, (current_position - target_position))
+
+        c_length = wp.length(error)
+        if w > 100.0:
+            p_w_alpha = 70.0
+            c_total = w * wp.log2(wp.cosh(p_w_alpha * c_length))
+            g_p = error * (
+                w
+                * p_w_alpha
+                * wp.sinh(p_w_alpha * c_length)
+                / (c_length * wp.cosh(p_w_alpha * c_length))
+            )
+        else:
+            g_p = w * error
+            if c_length > 0.0:
+                g_p = g_p / c_length
+            c_total = w * c_length
+
+        out_cost[b_id * horizon + h_id] = c_total
+
+        # compute gradient
+        if write_grad == 1:
+            for i in range(dof_template):
+                out_grad_p[b_addrs + i] = g_p[i]
+
+    return wp.Kernel(forward_l2_loop_warp)
+
+
 # create a bound cost tensor:
 class L2DistFunction(torch.autograd.Function):
     @staticmethod
@@ -195,6 +280,58 @@ class L2DistFunction(torch.autograd.Function):
         return p_g, None, None, None, None, None, None, None, None
 
 
+# create a bound cost tensor:
+class L2DistLoopFunction(torch.autograd.Function):
+    @staticmethod
+    def forward(
+        ctx,
+        pos,
+        target,
+        target_idx,
+        weight,
+        run_weight,
+        vec_weight,
+        out_cost,
+        out_cost_v,
+        out_gp,
+        l2_dof_kernel,
+    ):
+        wp_device = wp.device_from_torch(pos.device)
+        b, h, dof = pos.shape
+        wp.launch(
+            kernel=l2_dof_kernel,
+            dim=b * h,
+            inputs=[
+                wp.from_torch(pos.detach().view(-1).contiguous(), dtype=wp.float32),
+                wp.from_torch(target.view(-1), dtype=wp.float32),
+                wp.from_torch(target_idx.view(-1), dtype=wp.int32),
+                wp.from_torch(weight, dtype=wp.float32),
+                wp.from_torch(run_weight.view(-1), dtype=wp.float32),
+                wp.from_torch(vec_weight.view(-1), dtype=wp.float32),
+                wp.from_torch(out_cost.view(-1), dtype=wp.float32),
+                wp.from_torch(out_gp.view(-1), dtype=wp.float32),
+                pos.requires_grad,
+                b,
+                h,
+                dof,
+            ],
+            device=wp_device,
+            stream=wp.stream_from_torch(pos.device),
+        )
+
+        ctx.save_for_backward(out_gp)
+        return out_cost
+
+    @staticmethod
+    def backward(ctx, grad_out_cost):
+        (p_grad,) = ctx.saved_tensors
+
+        p_g = None
+        if ctx.needs_input_grad[0]:
+            p_g = p_grad
+        return p_g, None, None, None, None, None, None, None, None, None
+
+
 class DistCost(CostBase, DistCostConfig):
     def __init__(self, config: Optional[DistCostConfig] = None):
         if config is not None:
@@ -202,6 +339,8 @@ class DistCost(CostBase, DistCostConfig):
         CostBase.__init__(self)
         self._init_post_config()
         init_warp()
+        if self.use_l2_kernel:
+            self._l2_dof_kernel = make_l2_kernel(self.dof)
 
     def _init_post_config(self):
         if self.vec_weight is not None:
@@ -210,13 +349,21 @@ class DistCost(CostBase, DistCostConfig):
                 self.vec_weight = self.vec_weight * 0.0 + 1.0
 
     def update_batch_size(self, batch, horizon, dof):
+        if dof != self.dof:
+            log_error("dof cannot be changed after initializing DistCost")
         if self._batch_size != batch or self._horizon != horizon or self._dof != dof:
-            self._out_cv_buffer = torch.zeros(
-                (batch, horizon, dof), device=self.tensor_args.device, dtype=self.tensor_args.dtype
-            )
-            self._out_c_buffer = torch.zeros(
-                (batch, horizon), device=self.tensor_args.device, dtype=self.tensor_args.dtype
-            )
+            self._out_c_buffer = None
+            self._out_cv_buffer = None
+            if self.use_l2_kernel:
+                self._out_c_buffer = torch.zeros(
+                    (batch, horizon), device=self.tensor_args.device, dtype=self.tensor_args.dtype
+                )
+            else:
+                self._out_cv_buffer = torch.zeros(
+                    (batch, horizon, dof),
+                    device=self.tensor_args.device,
+                    dtype=self.tensor_args.dtype,
+                )
 
             self._out_g_buffer = torch.zeros(
                 (batch, horizon, dof), device=self.tensor_args.device, dtype=self.tensor_args.dtype
@@ -293,24 +440,59 @@ class DistCost(CostBase, DistCostConfig):
         else:
             raise NotImplementedError("terminal flag needs to be set to true")
         if self.dist_type == DistType.L2:
-            # print(goal_idx.shape, goal_vec.shape)
-            cost = L2DistFunction.apply(
-                current_vec,
-                goal_vec,
-                goal_idx,
-                self.weight,
-                self._run_weight_vec,
-                self.vec_weight,
-                self._out_c_buffer,
-                self._out_cv_buffer,
-                self._out_g_buffer,
-            )
+            if self.use_l2_kernel:
+
+                cost = L2DistLoopFunction.apply(
+                    current_vec,
+                    goal_vec,
+                    goal_idx,
+                    self.weight,
+                    self._run_weight_vec,
+                    self.vec_weight,
+                    self._out_c_buffer,
+                    None,
+                    self._out_g_buffer,
+                    self._l2_dof_kernel,
+                )
+            else:
+                cost = L2DistFunction.apply(
+                    current_vec,
+                    goal_vec,
+                    goal_idx,
+                    self.weight,
+                    self._run_weight_vec,
+                    self.vec_weight,
+                    None,
+                    self._out_cv_buffer,
+                    self._out_g_buffer,
+                )
 
         else:
             raise NotImplementedError()
         if RETURN_GOAL_DIST:
-            dist_scale = torch.nan_to_num(
-                1.0 / torch.sqrt((self.weight * self._run_weight_vec)), 0.0
-            )
-            return cost, cost * dist_scale
+
+            if self.use_l2_kernel:
+                distance = weight_cost_to_l2_jit(cost, self.weight, self._run_weight_vec)
+            else:
+                distance = squared_cost_to_l2_jit(cost, self.weight, self._run_weight_vec)
+            return cost, distance
+
         return cost
+
+
+@get_torch_jit_decorator()
+def squared_cost_to_l2_jit(cost, weight, run_weight_vec):
+    weight_inv = weight * run_weight_vec
+    weight_inv = 1.0 / weight_inv
+    weight_inv = torch.nan_to_num(weight_inv, 0.0)
+    distance = torch.sqrt(cost * weight_inv)
+    return distance
+
+
+@get_torch_jit_decorator()
+def weight_cost_to_l2_jit(cost, weight, run_weight_vec):
+    weight_inv = weight * run_weight_vec
+    weight_inv = 1.0 / weight_inv
+    weight_inv = torch.nan_to_num(weight_inv, 0.0)
+    distance = cost * weight_inv
+    return distance
diff --git a/src/curobo/rollout/dynamics_model/tensor_step.py b/src/curobo/rollout/dynamics_model/tensor_step.py
index c28b06b..f78d393 100644
--- a/src/curobo/rollout/dynamics_model/tensor_step.py
+++ b/src/curobo/rollout/dynamics_model/tensor_step.py
@@ -522,27 +522,20 @@ class TensorStepPositionCliqueKernel(TensorStepBase):
                     self._u_grad,
                 )
             if self._filter_velocity:
+                out_state_seq.aux_data["raw_velocity"] = out_state_seq.velocity
                 out_state_seq.velocity = self.filter_signal(out_state_seq.velocity)
 
             if self._filter_acceleration:
+                out_state_seq.aux_data["raw_acceleration"] = out_state_seq.acceleration
                 out_state_seq.acceleration = self.filter_signal(out_state_seq.acceleration)
 
             if self._filter_jerk:
+                out_state_seq.aux_data["raw_jerk"] = out_state_seq.jerk
                 out_state_seq.jerk = self.filter_signal(out_state_seq.jerk)
         return out_state_seq
 
     def filter_signal(self, signal: torch.Tensor):
         return filter_signal_jit(signal, self._sma_kernel)
-        b, h, dof = signal.shape
-        new_signal = (
-            self._sma(
-                signal.transpose(-1, -2).reshape(b * dof, 1, h), self._sma_kernel, padding="same"
-            )
-            .view(b, dof, h)
-            .transpose(-1, -2)
-            .contiguous()
-        )
-        return new_signal
 
 
 @get_torch_jit_decorator(force_jit=True)
diff --git a/src/curobo/types/robot.py b/src/curobo/types/robot.py
index 8095430..9a15d1e 100644
--- a/src/curobo/types/robot.py
+++ b/src/curobo/types/robot.py
@@ -58,7 +58,6 @@ class RobotConfig:
         cuda_robot_model_config = CudaRobotModelConfig.from_basic_urdf(
             urdf_path, base_link, ee_link, tensor_args
         )
-        # load other params:
 
         return RobotConfig(
             cuda_robot_model_config,
diff --git a/src/curobo/types/state.py b/src/curobo/types/state.py
index 4be430d..6ec7e52 100644
--- a/src/curobo/types/state.py
+++ b/src/curobo/types/state.py
@@ -11,7 +11,7 @@
 from __future__ import annotations
 
 # Standard Library
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from typing import List, Optional, Sequence, Tuple, Union
 
 # Third Party
@@ -73,6 +73,7 @@ class JointState(State):
     joint_names: Optional[List[str]] = None
     jerk: Union[List[float], T_DOF, None] = None  # Optional
     tensor_args: TensorDeviceType = TensorDeviceType()
+    aux_data: dict = field(default_factory=lambda: {})
 
     def __post_init__(self):
         if isinstance(self.position, torch.Tensor):
diff --git a/src/curobo/util/torch_utils.py b/src/curobo/util/torch_utils.py
index 1f1fc1a..7fd6239 100644
--- a/src/curobo/util/torch_utils.py
+++ b/src/curobo/util/torch_utils.py
@@ -10,6 +10,8 @@
 #
 # Standard Library
 import os
+from functools import lru_cache
+from typing import Optional
 
 # Third Party
 import torch
@@ -148,5 +150,20 @@ def get_torch_jit_decorator(
         return empty_decorator
 
 
+def is_lru_cache_avaiable():
+    use_lru_cache = os.environ.get("CUROBO_USE_LRU_CACHE")
+    if use_lru_cache is not None:
+        return bool(int(use_lru_cache))
+    log_info("Environment variable for CUROBO_USE_LRU_CACHE is not set, Enabling as default.")
+    return False
+
+
+def get_cache_fn_decorator(maxsize: Optional[int] = None):
+    if is_lru_cache_avaiable():
+        return lru_cache(maxsize=maxsize)
+    else:
+        return empty_decorator
+
+
 def empty_decorator(function):
     return function
diff --git a/src/curobo/util/trajectory.py b/src/curobo/util/trajectory.py
index cb8f845..8f46d38 100644
--- a/src/curobo/util/trajectory.py
+++ b/src/curobo/util/trajectory.py
@@ -141,10 +141,19 @@ def get_batch_interpolated_trajectory(
     # given the dt required to run trajectory at maximum velocity,
     # we find the number of timesteps required:
     if optimize_dt:
+        traj_vel = raw_traj.velocity
+        traj_acc = raw_traj.acceleration
+        traj_jerk = raw_traj.jerk
+        if "raw_velocity" in raw_traj.aux_data:
+            traj_vel = raw_traj.aux_data["raw_velocity"]
+        if "raw_acceleration" in raw_traj.aux_data:
+            traj_acc = raw_traj.aux_data["raw_acceleration"]
+        if "raw_jerk" in raw_traj.aux_data:
+            traj_jerk = raw_traj.aux_data["raw_jerk"]
         traj_steps, steps_max, opt_dt = calculate_tsteps(
-            raw_traj.velocity,
-            raw_traj.acceleration,
-            raw_traj.jerk,
+            traj_vel,
+            traj_acc,
+            traj_jerk,
             interpolation_dt,
             max_vel,
             max_acc,
@@ -464,7 +473,7 @@ def calculate_dt_fixed(
     raw_dt: torch.Tensor,
     min_dt: float,
     max_dt: float,
-    epsilon: float = 1e-6,
+    epsilon: float = 1e-4,
 ):
     # compute scaled dt:
     max_v_arr = torch.max(torch.abs(vel), dim=-2)[0]  # output is batch, dof
@@ -497,7 +506,7 @@ def calculate_dt(
     max_jerk: torch.Tensor,
     raw_dt: float,
     min_dt: float,
-    epsilon: float = 1e-6,
+    epsilon: float = 1e-4,
 ):
     # compute scaled dt:
     max_v_arr = torch.max(torch.abs(vel), dim=-2)[0]  # output is batch, dof
@@ -529,7 +538,7 @@ def calculate_dt_no_clamp(
     max_vel: torch.Tensor,
     max_acc: torch.Tensor,
     max_jerk: torch.Tensor,
-    epsilon: float = 1e-6,
+    epsilon: float = 1e-4,
 ):
     # compute scaled dt:
     max_v_arr = torch.max(torch.abs(vel), dim=-2)[0]  # output is batch, dof
diff --git a/src/curobo/wrap/reacher/motion_gen.py b/src/curobo/wrap/reacher/motion_gen.py
index 004ec31..543d1f0 100644
--- a/src/curobo/wrap/reacher/motion_gen.py
+++ b/src/curobo/wrap/reacher/motion_gen.py
@@ -121,6 +121,9 @@ class MotionGenConfig:
     #: instance of trajectory optimization solver to use for reaching joint space targets.
     js_trajopt_solver: TrajOptSolver
 
+    #: instance of trajectory optimization solver for final fine tuning for joint space targets.
+    finetune_js_trajopt_solver: TrajOptSolver
+
     #: instance of trajectory optimization solver for final fine tuning.
     finetune_trajopt_solver: TrajOptSolver
 
@@ -760,6 +763,7 @@ class MotionGenConfig:
             minimize_jerk=minimize_jerk,
             filter_robot_command=filter_robot_command,
             optimize_dt=optimize_dt,
+            num_seeds=num_trajopt_noisy_seeds,
         )
         js_trajopt_solver = TrajOptSolver(js_trajopt_cfg)
 
@@ -805,6 +809,48 @@ class MotionGenConfig:
         )
 
         finetune_trajopt_solver = TrajOptSolver(finetune_trajopt_cfg)
+
+        finetune_js_trajopt_cfg = TrajOptSolverConfig.load_from_robot_config(
+            robot_cfg=robot_cfg,
+            world_model=world_model,
+            tensor_args=tensor_args,
+            position_threshold=position_threshold,
+            rotation_threshold=rotation_threshold,
+            world_coll_checker=world_coll_checker,
+            base_cfg_file=base_config_data,
+            particle_file=particle_trajopt_file,
+            gradient_file=finetune_trajopt_file,
+            traj_tsteps=js_trajopt_tsteps,
+            interpolation_type=interpolation_type,
+            interpolation_steps=interpolation_steps,
+            use_cuda_graph=use_cuda_graph,
+            self_collision_check=self_collision_check,
+            self_collision_opt=self_collision_opt,
+            grad_trajopt_iters=grad_trajopt_iters,
+            interpolation_dt=interpolation_dt,
+            use_particle_opt=False,
+            traj_evaluator_config=traj_evaluator_config,
+            traj_evaluator=traj_evaluator,
+            use_gradient_descent=use_gradient_descent,
+            use_es=use_es_trajopt,
+            es_learning_rate=es_trajopt_learning_rate,
+            use_fixed_samples=use_trajopt_fixed_samples,
+            evaluate_interpolated_trajectory=evaluate_interpolated_trajectory,
+            fixed_iters=fixed_iters_trajopt,
+            store_debug=store_trajopt_debug,
+            collision_activation_distance=collision_activation_distance,
+            trajopt_dt=js_trajopt_dt,
+            store_debug_in_result=store_debug_in_result,
+            smooth_weight=smooth_weight,
+            cspace_threshold=cspace_threshold,
+            state_finite_difference_mode=state_finite_difference_mode,
+            minimize_jerk=minimize_jerk,
+            filter_robot_command=filter_robot_command,
+            optimize_dt=optimize_dt,
+            num_seeds=num_trajopt_noisy_seeds,
+        )
+        finetune_js_trajopt_solver = TrajOptSolver(finetune_js_trajopt_cfg)
+
         if graph_trajopt_iters is not None:
             graph_trajopt_iters = math.ceil(
                 graph_trajopt_iters / finetune_trajopt_solver.solver.newton_optimizer.inner_iters
@@ -823,6 +869,7 @@ class MotionGenConfig:
             graph_planner,
             trajopt_solver=trajopt_solver,
             js_trajopt_solver=js_trajopt_solver,
+            finetune_js_trajopt_solver=finetune_js_trajopt_solver,
             finetune_trajopt_solver=finetune_trajopt_solver,
             interpolation_type=interpolation_type,
             interpolation_steps=interpolation_steps,
@@ -951,6 +998,9 @@ class MotionGenPlanConfig:
     #: check for joint limits, self-collision, and collision with the world.
     check_start_validity: bool = True
 
+    #: Finetune dt scale for joint space planning.
+    finetune_js_dt_scale: Optional[float] = 1.1
+
     def __post_init__(self):
         """Post initialization checks."""
         if not self.enable_opt and not self.enable_graph:
@@ -983,6 +1033,7 @@ class MotionGenPlanConfig:
             finetune_dt_scale=self.finetune_dt_scale,
             finetune_attempts=self.finetune_attempts,
             time_dilation_factor=self.time_dilation_factor,
+            finetune_js_dt_scale=self.finetune_js_dt_scale,
         )
 
 
@@ -1760,6 +1811,7 @@ class MotionGen(MotionGenConfig):
         self.ik_solver.reset_seed()
         self.graph_planner.reset_seed()
         self.trajopt_solver.reset_seed()
+        self.js_trajopt_solver.reset_seed()
 
     def get_retract_config(self) -> T_DOF:
         """Returns the retract/home configuration of the robot."""
@@ -1803,7 +1855,12 @@ class MotionGen(MotionGenConfig):
             goal_state = start_state.clone()
             goal_state.position[..., warmup_joint_index] += warmup_joint_delta
             for _ in range(3):
-                self.plan_single_js(start_state, goal_state, MotionGenPlanConfig(max_attempts=1))
+                self.plan_single_js(
+                    start_state.clone(),
+                    goal_state.clone(),
+                    MotionGenPlanConfig(max_attempts=1, enable_finetune_trajopt=True),
+                )
+
         if enable_graph:
             start_state = JointState.from_position(
                 self.rollout_fn.dynamics_model.retract_config.view(1, -1).clone(),
@@ -1983,7 +2040,7 @@ class MotionGen(MotionGenConfig):
             "finetune_time": 0,
         }
         result = None
-        goal = Goal(goal_state=goal_state, current_state=start_state)
+        # goal = Goal(goal_state=goal_state, current_state=start_state)
         solve_state = ReacherSolveState(
             ReacherSolveType.SINGLE,
             num_ik_seeds=1,
@@ -2009,7 +2066,7 @@ class MotionGen(MotionGenConfig):
             result = self._plan_js_from_solve_state(
                 solve_state, start_state, goal_state, plan_config=plan_config
             )
-            time_dict["trajopt_time"] += result.solve_time
+            time_dict["trajopt_time"] += result.trajopt_time
             time_dict["graph_time"] += result.graph_time
             time_dict["finetune_time"] += result.finetune_time
             time_dict["trajopt_attempts"] = n
@@ -2160,6 +2217,7 @@ class MotionGen(MotionGenConfig):
                 + self.trajopt_solver.get_all_rollout_instances()
                 + self.finetune_trajopt_solver.get_all_rollout_instances()
                 + self.js_trajopt_solver.get_all_rollout_instances()
+                + self.finetune_js_trajopt_solver.get_all_rollout_instances()
             )
         return self._rollout_list
 
@@ -2171,6 +2229,7 @@ class MotionGen(MotionGenConfig):
                 + self.trajopt_solver.solver.get_all_rollout_instances()
                 + self.finetune_trajopt_solver.solver.get_all_rollout_instances()
                 + self.js_trajopt_solver.solver.get_all_rollout_instances()
+                + self.finetune_js_trajopt_solver.solver.get_all_rollout_instances()
             )
         return self._solver_rollout_list
 
@@ -2529,6 +2588,11 @@ class MotionGen(MotionGenConfig):
         """Check if the pose cost metric is projected to goal frame."""
         return self.trajopt_solver.rollout_fn.goal_cost.project_distance
 
+    @property
+    def joint_names(self) -> List[str]:
+        """Get the joint names of the robot."""
+        return self.rollout_fn.joint_names
+
     def update_interpolation_type(
         self,
         interpolation_type: InterpolateType,
@@ -2548,6 +2612,7 @@ class MotionGen(MotionGenConfig):
             self.trajopt_solver.interpolation_type = interpolation_type
             self.finetune_trajopt_solver.interpolation_type = interpolation_type
             self.js_trajopt_solver.interpolation_type = interpolation_type
+            self.finetune_js_trajopt_solver.interpolation_type = interpolation_type
 
     def update_locked_joints(
         self, lock_joints: Dict[str, float], robot_config_dict: Union[str, Dict[Any]]
@@ -3375,8 +3440,9 @@ class MotionGen(MotionGenConfig):
                         seed_override = solve_state.num_trajopt_seeds * self.noisy_trajopt_seeds
 
                     finetune_time = 0
+                    newton_iters = None
+
                     for k in range(plan_config.finetune_attempts):
-                        newton_iters = None
 
                         scaled_dt = torch.clamp(
                             opt_dt
@@ -3396,7 +3462,7 @@ class MotionGen(MotionGenConfig):
                             newton_iters=newton_iters,
                         )
                         finetune_time += traj_result.solve_time
-                        if torch.count_nonzero(traj_result.success) > 0:
+                        if torch.count_nonzero(traj_result.success) > 0 or not self.optimize_dt:
                             break
                         seed_traj = traj_result.optimized_seeds.detach().clone()
                         newton_iters = 4
@@ -3592,7 +3658,7 @@ class MotionGen(MotionGenConfig):
                     solve_state,
                     trajopt_seed_traj,
                     num_seeds_override=solve_state.num_trajopt_seeds,
-                    newton_iters=trajopt_newton_iters + 2,
+                    newton_iters=trajopt_newton_iters,
                     return_all_solutions=plan_config.enable_finetune_trajopt,
                     trajopt_instance=self.js_trajopt_solver,
                 )
@@ -3605,29 +3671,42 @@ class MotionGen(MotionGenConfig):
             # run finetune
             if plan_config.enable_finetune_trajopt and torch.count_nonzero(traj_result.success) > 0:
                 with profiler.record_function("motion_gen/finetune_trajopt"):
-                    seed_traj = traj_result.raw_action.clone()  # solution.position.clone()
-                    seed_traj = seed_traj.contiguous()
+                    seed_traj = traj_result.raw_action.clone()
                     og_solve_time = traj_result.solve_time
+                    opt_dt = traj_result.optimized_dt
+                    opt_dt = torch.min(opt_dt[traj_result.success])
+                    finetune_time = 0
+                    newton_iters = None
+                    for k in range(plan_config.finetune_attempts):
+                        scaled_dt = torch.clamp(
+                            opt_dt
+                            * plan_config.finetune_js_dt_scale
+                            * (plan_config.finetune_dt_decay ** (k)),
+                            self.js_trajopt_solver.minimum_trajectory_dt,
+                        )
 
-                    scaled_dt = torch.clamp(
-                        torch.max(traj_result.optimized_dt[traj_result.success]),
-                        self.trajopt_solver.minimum_trajectory_dt,
-                    )
-                    og_dt = self.js_trajopt_solver.solver_dt.clone()
-                    self.js_trajopt_solver.update_solver_dt(scaled_dt.item())
-                    traj_result = self._solve_trajopt_from_solve_state(
-                        goal,
-                        solve_state,
-                        seed_traj,
-                        trajopt_instance=self.js_trajopt_solver,
-                        num_seeds_override=solve_state.num_trajopt_seeds,
-                        newton_iters=trajopt_newton_iters + 4,
-                    )
-                    self.js_trajopt_solver.update_solver_dt(og_dt)
+                        if self.optimize_dt:
+                            self.finetune_js_trajopt_solver.update_solver_dt(scaled_dt.item())
+                        traj_result = self._solve_trajopt_from_solve_state(
+                            goal,
+                            solve_state,
+                            seed_traj,
+                            trajopt_instance=self.finetune_js_trajopt_solver,
+                            num_seeds_override=solve_state.num_trajopt_seeds,
+                            newton_iters=newton_iters,
+                            return_all_solutions=False,
+                        )
 
-                result.finetune_time = traj_result.solve_time
+                        finetune_time += traj_result.solve_time
+                        if torch.count_nonzero(traj_result.success) > 0 or not self.optimize_dt:
+                            break
 
-                traj_result.solve_time = og_solve_time
+                        seed_traj = traj_result.optimized_seeds.detach().clone()
+                        newton_iters = 4
+
+                    result.finetune_time = finetune_time
+
+                    traj_result.solve_time = og_solve_time
                 if self.store_debug_in_result:
                     result.debug_info["finetune_trajopt_result"] = traj_result
                 if torch.count_nonzero(traj_result.success) == 0:
@@ -3638,7 +3717,6 @@ class MotionGen(MotionGenConfig):
             result.trajopt_time = traj_result.solve_time
             result.trajopt_attempts = 1
             result.success = traj_result.success
-
             result.interpolated_plan = traj_result.interpolated_solution.trim_trajectory(
                 0, traj_result.path_buffer_last_tstep[0]
             )
diff --git a/src/curobo/wrap/reacher/trajopt.py b/src/curobo/wrap/reacher/trajopt.py
index 6bcbd60..f9c92a8 100644
--- a/src/curobo/wrap/reacher/trajopt.py
+++ b/src/curobo/wrap/reacher/trajopt.py
@@ -1343,14 +1343,26 @@ class TrajOptSolver(TrajOptSolverConfig):
                 goal=goal,
                 solution=result.action.scale_by_dt(self.solver_dt_tensor, opt_dt.view(-1, 1, 1)),
                 seed=seed_traj,
-                position_error=result.metrics.position_error,
-                rotation_error=result.metrics.rotation_error,
+                position_error=(
+                    result.metrics.position_error[..., -1]
+                    if result.metrics.position_error is not None
+                    else None
+                ),
+                rotation_error=(
+                    result.metrics.rotation_error[..., -1]
+                    if result.metrics.rotation_error is not None
+                    else None
+                ),
                 solve_time=result.solve_time,
                 metrics=result.metrics,  # TODO: index this also
                 interpolated_solution=interpolated_trajs,
                 debug_info={"solver": result.debug, "interpolation_time": interpolation_time},
                 path_buffer_last_tstep=last_tstep,
-                cspace_error=result.metrics.cspace_error,
+                cspace_error=(
+                    result.metrics.cspace_error[..., -1]
+                    if result.metrics.cspace_error is not None
+                    else None
+                ),
                 optimized_dt=opt_dt,
                 raw_solution=result.action,
                 raw_action=result.raw_action,
@@ -1694,6 +1706,7 @@ class TrajOptSolver(TrajOptSolverConfig):
                 (b, self.interpolation_steps, dof), self.tensor_args
             )
             self._interpolated_traj_buffer.joint_names = self.rollout_fn.joint_names
+
         state, last_tstep, opt_dt = get_batch_interpolated_trajectory(
             traj_state,
             self.solver_dt_tensor,
diff --git a/tests/conftest.py b/tests/conftest.py
index e635c76..9778a6d 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -13,3 +13,4 @@
 import os
 
 os.environ["CUROBO_TORCH_COMPILE_DISABLE"] = str(1)
+os.environ["CUROBO_USE_LRU_CACHE"] = str(1)
diff --git a/tests/motion_gen_js_test.py b/tests/motion_gen_js_test.py
index 36a5af8..0a5481b 100644
--- a/tests/motion_gen_js_test.py
+++ b/tests/motion_gen_js_test.py
@@ -11,6 +11,7 @@
 
 
 # Third Party
+import pytest
 import torch
 
 # CuRobo
@@ -18,21 +19,20 @@ from curobo.types.robot import JointState
 from curobo.wrap.reacher.motion_gen import MotionGen, MotionGenConfig, MotionGenPlanConfig
 
 
-def test_motion_gen_plan_js():
+@pytest.fixture(scope="module")
+def motion_gen():
     world_file = "collision_table.yml"
     robot_file = "ur5e.yml"
     motion_gen_config = MotionGenConfig.load_from_robot_config(
         robot_file,
         world_file,
-        trajopt_tsteps=32,
-        use_cuda_graph=False,
-        num_trajopt_seeds=4,
-        fixed_iters_trajopt=True,
-        evaluate_interpolated_trajectory=True,
     )
-    motion_gen = MotionGen(motion_gen_config)
-    motion_gen.warmup(warmup_js_trajopt=True)
+    motion_gen_instance = MotionGen(motion_gen_config)
+    motion_gen_instance.warmup(warmup_js_trajopt=True)
+    return motion_gen_instance
 
+
+def test_motion_gen_plan_js(motion_gen):
     retract_cfg = motion_gen.get_retract_config()
 
     start_state = JointState.from_position(retract_cfg.view(1, -1).clone())
@@ -42,3 +42,27 @@ def test_motion_gen_plan_js():
     )
     result = motion_gen.plan_single_js(start_state, goal_state, MotionGenPlanConfig(max_attempts=1))
     assert result.success.item()
+
+
+@pytest.mark.parametrize(
+    "motion_gen_str, delta",
+    [
+        ("motion_gen", 0.1),
+        ("motion_gen", 0.2),
+        ("motion_gen", 0.3),
+        ("motion_gen", 0.4),
+        ("motion_gen", 0.5),
+    ],
+)
+def test_motion_gen_plan_js_delta(motion_gen_str, delta, request):
+    motion_gen = request.getfixturevalue(motion_gen_str)
+    start_state = JointState.from_position(
+        motion_gen.get_retract_config().view(1, -1).clone(),
+        joint_names=motion_gen.joint_names,
+    )
+    goal_state = JointState.from_position(
+        motion_gen.get_retract_config().view(1, -1).clone() + delta,
+        joint_names=motion_gen.joint_names,
+    )
+    result = motion_gen.plan_single_js(start_state, goal_state, MotionGenPlanConfig(max_attempts=1))
+    assert result.success.item()
diff --git a/tests/voxel_collision_test.py b/tests/voxel_collision_test.py
index e63623e..9cc2269 100644
--- a/tests/voxel_collision_test.py
+++ b/tests/voxel_collision_test.py
@@ -40,8 +40,8 @@ def get_world_model(single_object: bool = False):
         world_model = WorldConfig.from_dict(
             {
                 "cuboid": {
-                    "block": {"dims": [0.5, 0.5, 0.5], "pose": [-0.25, 0, 0, 1, 0, 0, 0]},
-                    "block1": {"dims": [0.1, 0.2, 0.5], "pose": [0.25, 0.1, 0, 1, 0, 0, 0]},
+                    "block2": {"dims": [0.5, 0.5, 0.5], "pose": [-0.25, 0, 0, 1, 0, 0, 0]},
+                    "block3": {"dims": [0.1, 0.2, 0.5], "pose": [0.25, 0.1, 0, 1, 0, 0, 0]},
                 }
             }
         )
@@ -62,7 +62,7 @@ def world_collision(request):
                 if not request.param[0]
                 else CollisionCheckerType.MESH
             ),
-            "max_distance": 5.0,
+            "max_distance": 1.0,
             "n_envs": 1,
         },
         world_model,